Ejemplo n.º 1
0
def fasta2vcf(fasta_file, outfile):
    """Build a merged multi-sample VCF from a multi-FASTA file.

    Every record of ``fasta_file`` is written to its own temporary FASTA and
    passed to ``get_ref_variants`` against the configured reference.  The
    per-sample ``<sample>.vcf.gz`` files are then merged with
    ``bcftools merge`` in batches of 200, the batches are merged again with
    indels excluded, and the result is written to ``outfile``.  Per-sample
    VCFs, their ``.csi`` indexes and the batch VCFs are removed at the end.

    Args:
        fasta_file: Path of the FASTA file holding one record per sample.
        outfile: Path of the bgzipped VCF to create.

    Note:
        ``args`` is not a parameter; ``args.db`` is read from the enclosing
        module scope.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    # NOTE(review): ``refseq`` is never used below; the load is kept in case
    # it validates or indexes the reference - confirm and drop if not.
    refseq = pp.fasta(conf["ref"]).fa_dict
    seqs = pp.fasta(fasta_file)
    samples = list(seqs.fa_dict.keys())

    for sample in samples:
        fname = pp.get_random_file()
        # Close the handle so the record is flushed before it is read back.
        with open(fname, "w") as tmp_fasta:
            tmp_fasta.write(">%s\n%s\n" % (sample, seqs.fa_dict[sample]))
        fasta_obj = pp.fasta(fname)
        # Presumably produces "<sample>.vcf.gz" (and its index) in the working
        # directory, which the merge step below relies on - TODO confirm.
        vcf_obj = pp.vcf(fasta_obj.get_ref_variants(conf["ref"], sample))
        pp.rm_files([fname])

    # Merge in batches of 200 files per bcftools invocation.
    sample_chunks = [samples[i:i + 200] for i in range(0, len(samples), 200)]
    tmp_vcfs = []
    for tmp_samps in sample_chunks:
        tmp_list = pp.get_random_file()
        tmp_vcf = pp.get_random_file()
        with open(tmp_list, "w") as list_file:
            list_file.write("\n".join(["%s.vcf.gz" % x for x in tmp_samps]))
        pp.run_cmd("bcftools merge -0 -l %s -Oz -o %s" % (tmp_list, tmp_vcf))
        pp.run_cmd("bcftools index %s" % tmp_vcf)
        tmp_vcfs.append(tmp_vcf)
        pp.rm_files([tmp_list])

    pp.run_cmd("bcftools merge -0  %s | bcftools view -V indels -Oz -o %s" %
               (" ".join(tmp_vcfs), outfile))

    vcf_files = ["%s.vcf.gz" % s for s in samples]
    vcf_csi_files = ["%s.vcf.gz.csi" % s for s in samples]
    pp.rm_files(vcf_files + vcf_csi_files + tmp_vcfs)
def _swap_ref_genotype(sample_field, alt_number):
    """Exchange allele ``0`` and allele ``alt_number`` in one sample column.

    Only the genotype part (the text before the first ``:``) is rewritten and
    allele indices are compared as whole tokens, so ``10`` is never mistaken
    for a ``1`` followed by a ``0``.
    """
    import re  # local import: the module's import block is not visible here

    genotype, sep, rest = sample_field.partition(":")
    tokens = re.split(r"([/|])", genotype)
    swap = {"0": alt_number, alt_number: "0"}
    return "".join(swap.get(token, token) for token in tokens) + sep + rest


def main(args):
    """Re-orient VCF records read from stdin so REF matches the reference.

    Header lines are copied to stdout; the contig line
    ``##contig=<ID=1,length=29903>`` is renamed to ``NC_045512.2``.  For data
    lines, a CHROM of ``1`` is replaced by ``args.seqname``.  When the REF
    column differs from the base of the first sequence in ``args.ref`` at
    that position, REF is exchanged with the matching ALT allele and the
    genotype codes of every sample column are swapped accordingly.

    Args:
        args: Namespace providing ``ref`` (reference FASTA path) and
            ``seqname`` (replacement chromosome name).

    Raises:
        ValueError: If the reference base is neither REF nor one of the ALT
            alleles of a record.
    """
    refseq = list(pp.fasta(args.ref).fa_dict.values())[0]
    for l in sys.stdin:
        if l[0] == "#":
            if l.strip() == "##contig=<ID=1,length=29903>":
                l = l.replace("ID=1,", "ID=NC_045512.2,")
            sys.stdout.write(l)
            continue
        row = l.strip().split()
        if row[0] == "1":
            row[0] = args.seqname
        possible_ref_allele = row[3]
        true_ref_allele = refseq[int(row[1]) - 1]
        if possible_ref_allele != true_ref_allele:
            alts = row[4].split(",")
            try:
                ref_allele_index = alts.index(true_ref_allele)
            except ValueError:
                raise ValueError(
                    "Reference base %s at %s:%s is not among the alleles %s,%s"
                    % (true_ref_allele, row[0], row[1], row[3],
                       row[4])) from None
            row[3] = true_ref_allele
            alts[ref_allele_index] = possible_ref_allele
            row[4] = ",".join(alts)
            # Genotype codes are 1-based for ALT alleles, 0 for REF.
            alt_number = str(ref_allele_index + 1)
            row[9:] = [_swap_ref_genotype(g, alt_number) for g in row[9:]]
        sys.stdout.write("\t".join(row) + "\n")
Ejemplo n.º 3
0
def main(args):
    """Merge SNPs that fall in the same codon into one multi-base record.

    VCF lines are read from ``bcftools view <args.vcf>`` when ``args.vcf`` is
    set, otherwise from stdin.  Header lines are echoed to stdout.  Records
    whose REF or ALT is longer than one character, and records for which
    ``get_codon_pos`` reports no gene, are passed through unchanged.  The
    remaining records are grouped by ``(gene, codon position)``; each group
    is collapsed into a single record spanning the first to the last variant
    position, with REF taken from ``args.ref`` and ALT being that same span
    with the variant bases substituted.  All records are finally written
    sorted by position.

    Args:
        args: Namespace providing ``ref``, ``gff`` and optionally ``vcf``.
    """
    ref = pp.fasta(args.ref).fa_dict
    cds = gff_load_cds(args.gff)
    final_list = []
    coding = defaultdict(list)
    generator = pp.cmd_out(
        f"bcftools view {args.vcf}") if args.vcf else sys.stdin
    for l in generator:
        row = l.strip().split()
        if not row:
            # Blank line: nothing to parse.
            continue
        if l[0] == "#":
            sys.stdout.write(l.strip() + "\n")
        elif len(row[3]) > 1 or len(row[4]) > 1:
            final_list.append(row)
        else:
            gene, cpos = get_codon_pos(row[0], int(row[1]), cds)
            if gene is None:
                final_list.append(row)
            else:
                coding[(gene, cpos)].append(row)

    for rows in coding.values():
        # Order the group by position so each ALT base lands on its own
        # position and the merged record starts at the lowest coordinate,
        # even when the input was not position-sorted.
        rows = sorted(rows, key=lambda r: int(r[1]))
        chrom = rows[0][0]
        first_pos = int(rows[0][1])
        last_pos = int(rows[-1][1])

        ref_nucs = {
            p: ref[chrom][p - 1]
            for p in range(first_pos, last_pos + 1)
        }
        alt_nucs = ref_nucs.copy()
        for r in rows:
            alt_nucs[int(r[1])] = r[4]
        new_row = rows[0]
        new_row[3] = "".join(ref_nucs.values())
        new_row[4] = "".join(alt_nucs.values())
        final_list.append(new_row)

    for row in sorted(final_list, key=lambda x: int(x[1])):
        sys.stdout.write("\t".join(row) + "\n")
Ejemplo n.º 4
0
def main(args):
    """Mask the regions listed in a BED file with ``N`` in every sequence.

    For each line of ``args.bed`` the indices ``start - 1`` up to (but not
    including) ``end`` are set to ``N`` in all sequences of ``args.fasta``;
    the chromosome column is ignored.  The masked sequences are written to
    ``args.out``.

    Args:
        args: Namespace providing ``fasta``, ``bed`` and ``out`` paths.
    """
    seqs = pp.fasta(args.fasta).fa_dict
    for seq in seqs:
        seqs[seq] = list(seqs[seq])
    with open(args.bed) as bed:
        for l in bed:
            row = l.strip().split()
            if not row:
                continue
            # NOTE(review): "start - 1" treats the start column as 1-based;
            # standard BED starts are 0-based - confirm the expected input.
            # Clamp at 0 so a start of 0 cannot wrap around to index -1 and
            # mask the last base of every sequence.
            first = max(int(row[1]) - 1, 0)
            for i in tqdm(range(first, int(row[2]))):
                for seq in seqs:
                    seqs[seq][i] = "N"

    with open(args.out, "w") as O:
        for seq in seqs:
            O.write(">%s\n%s\n" % (seq, "".join(seqs[seq])))
Ejemplo n.º 5
0
def main(args):
    """Replace every character other than A, C, G, T or N with ``N``.

    Sequences are loaded from ``args.fasta``, masked in memory and written to
    ``args.out``.  The comparison is case-sensitive, so lower-case bases are
    masked as well.  Progress messages go to stderr.
    """
    allowed = {"A", "C", "G", "T", "N"}

    sys.stderr.write("Loading sequences\n")
    seqs = pp.fasta(args.fasta).fa_dict

    sys.stderr.write("Masking sequences\n")
    for name in tqdm(seqs):
        # Only the distinct symbols matter; each disallowed one is replaced
        # across the whole sequence in a single pass.
        for symbol in set(seqs[name]) - allowed:
            seqs[name] = seqs[name].replace(symbol, "N")

    sys.stderr.write("Writing sequences\n")
    with open(args.out, "w") as handle:
        for name, sequence in seqs.items():
            handle.write(">%s\n%s\n" % (name, "".join(sequence)))
Ejemplo n.º 6
0
def main(args):
    """Print per-column allele counts of a multiple sequence alignment.

    For every alignment column a tab-separated line is printed with the
    running reference position (incremented whenever the reference sequence
    has no gap in that column), the 1-based alignment position and the
    case-insensitive counts of A, C, G, T, N and ``-``.

    Args:
        args: Namespace providing ``msa``, the alignment FASTA path.

    Note:
        ``refseq`` (the name of the reference sequence in the alignment) is
        read from the enclosing module scope; it is not defined here.
    """
    seqs = pp.fasta(args.msa).fa_dict
    ref_pos = 0
    print("ref_pos\talignment_pos\tA\tC\tG\tT\tN\tgap")
    for i in range(len(list(seqs.values())[0])):
        alignment_pos = i + 1
        if seqs[refseq][i] != "-":
            ref_pos += 1
        # A Counter returns 0 for symbols that never occurred.
        allele_count = Counter(seqs[s][i].upper() for s in seqs)

        # print() already terminates the line; the previous extra "\n"
        # put an empty line after every data row.
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
              (ref_pos, alignment_pos, allele_count["A"], allele_count["C"],
               allele_count["G"], allele_count["T"], allele_count["N"],
               allele_count["-"]))
Ejemplo n.º 7
0
def primer_evaluation(args):
    """Evaluate a primer/probe set against every sequence of the MSA.

    ``cp.run_fuzznuc`` is run on the configured MSA once each for the forward
    primer, the reverse primer and the probe.  For every sequence in the MSA
    the three results are combined with ``cp.find_amplicon`` and the
    resulting rows are written as CSV to ``args.out``; the columns are the
    keys of the first row.

    Args:
        args: Namespace providing ``db``, ``primerF``, ``primerR``,
            ``probe``, ``mismatch`` and ``out``.

    Raises:
        IndexError: If the MSA contains no sequences (there is no first row
            to take the column names from).
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    msa_obj = pp.fasta(conf["msa"]).fa_dict
    forward_results = cp.run_fuzznuc(conf["msa"],
                                     args.primerF,
                                     pmismatch=args.mismatch)
    reverse_results = cp.run_fuzznuc(conf["msa"],
                                     args.primerR,
                                     pmismatch=args.mismatch)
    probe_results = cp.run_fuzznuc(conf["msa"],
                                   args.probe,
                                   pmismatch=args.mismatch)

    rows = []
    for s in tqdm(msa_obj):
        amplicon = cp.find_amplicon(forward_results[s], reverse_results[s],
                                    probe_results[s])
        rows.append(amplicon)
    # newline="" lets the csv module control line endings itself, as the
    # csv documentation requires for files handed to a writer.
    with open(args.out, "w", newline="") as O:
        writer = csv.DictWriter(O, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
Ejemplo n.º 8
0
def main(args):
    """Write one multi-sample FASTA per target locus from per-sample VCFs.

    Sample names come from ``args.samples`` (one per line) or, when that is
    not given, from the ``*.targets.csq.vcf.gz`` files in ``args.dir``.  The
    configured BED file is turned into a list of regions and a
    region-to-name mapping (column 5); a ``<name>.fasta`` file is opened for
    every region.  For each sample the VCF is filtered, a consensus for all
    regions is built with ``samtools faidx | bcftools consensus`` and every
    locus sequence is appended to the matching FASTA under the sample name.

    Args:
        args: Namespace providing ``db``, ``dir`` and optionally ``samples``.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    vcf_suffix = ".targets.csq.vcf.gz"
    if args.samples:
        with open(args.samples) as sample_file:
            # Skip empty lines so they do not become nameless samples.
            samples = [x.rstrip() for x in sample_file if x.strip()]
    else:
        # Strip the suffix from the end only; replace() would also remove
        # an occurrence in the middle of a file name.
        samples = [
            x[:-len(vcf_suffix)] for x in os.listdir(args.dir)
            if x.endswith(vcf_suffix)
        ]
    params = {
        "tmp_locations": pp.get_random_file(),
        "tmp_mappings": pp.get_random_file(),
        "ref": conf["ref"]
    }
    # tmp_mappings: "<chrom>:<start>-<end>\t<column 5>" per BED line;
    # tmp_locations: just the region strings, used by samtools faidx -r.
    pp.run_cmd("awk '{print $1\":\"$2\"-\"$3\"\\t\"$5}' %s > %s" %
               (conf["bed"], params["tmp_mappings"]))
    pp.run_cmd("cut -f1 %s > %s" %
               (params["tmp_mappings"], params["tmp_locations"]))
    FILES = {}
    try:
        with open(params["tmp_mappings"]) as mappings:
            for l in mappings:
                row = l.rstrip().split()
                FILES[row[0]] = open("%s.fasta" % row[1], "w")
        for s in samples:
            params["vcf"] = "%s/%s.targets.csq.vcf.gz" % (args.dir, s)
            params["tmp_vcf"] = "%s/%s.targets.csq.tmp.vcf.gz" % (args.dir, s)
            params["sample_fa"] = "%s.targets.fa" % (s)
            pp.run_cmd(
                "bcftools filter -e 'sum(AD)=0' -S . %(vcf)s | bcftools view -a | grep -v NON_REF | bcftools view -Oz -o %(tmp_vcf)s"
                % params)
            pp.run_cmd("bcftools index %(tmp_vcf)s" % params)
            pp.run_cmd(
                "samtools faidx -r %(tmp_locations)s %(ref)s | bcftools consensus -H A %(tmp_vcf)s > %(sample_fa)s"
                % params)
            fa_dict = pp.fasta(params["sample_fa"]).fa_dict
            for locus in fa_dict:
                FILES[locus].write(">%s\n%s\n" % (s, fa_dict[locus]))
            pp.rm_files([params["tmp_vcf"]])
    finally:
        # Always flush and release the per-locus outputs, even on failure.
        for handle in FILES.values():
            handle.close()
    pp.rm_files([params["tmp_locations"], params["tmp_mappings"]])
Ejemplo n.º 9
0
def _deletion_span(gene, del_start, del_end, upstream):
    """Return ``(genome_start, genome_end)`` for a deletion in gene coordinates.

    ``del_start``/``del_end`` are the signed gene-relative positions (negative
    for upstream positions).  ``genome_start`` is the base preceding the
    deleted stretch and ``genome_end`` is one past its last base, matching the
    ``refseq[genome_start - 1:genome_end - 1]`` slice used by the caller.
    """
    if upstream:
        if gene.strand == "+":
            # "embA" "c.-29_-28del"
            return gene.start + del_start - 1, gene.start + del_end + 1
        # "alr" "c.-283_-280delCAAT"
        return gene.start - del_end - 1, gene.start - del_start + 1
    if gene.strand == "+":
        # rpoB "c.1282_1290del"
        return gene.start + del_start - 2, gene.start + del_end
    # "ethA" "c.1057_1059del"
    return gene.start - del_end, gene.start - del_start + 2


def _indel_record(refseq, gene_name, genome_start, genome_end, ins_seq=None):
    """Build the genomic pos/ref/alt record for one indel.

    ``ref`` is the slice of the ``Chromosome`` sequence starting at
    ``genome_start``.  A deletion (``ins_seq`` is None) keeps only the first
    reference base as ``alt``; an insertion appends ``ins_seq`` to ``ref``.
    """
    ref = refseq["Chromosome"][genome_start - 1:genome_end - 1]
    alt = ref[0] if ins_seq is None else ref + ins_seq
    return {
        "pos": genome_start,
        "ref": ref,
        "alt": alt,
        "gene": gene_name,
        "type": "nucleotide"
    }


def main(args):
    """Normalise the mutation notation of a gene/mutation CSV.

    Every row of ``args.csv`` (columns ``Gene`` and ``Mutation``) is
    classified by pattern:

    * ``r.<pos><ref>><alt>`` is rewritten as ``n.<pos><REF>><ALT>``;
    * protein changes, upstream substitutions, duplications and the
      ``frameshift`` / ``large_deletion`` / ``any_missense_codon_*`` keywords
      are kept unchanged;
    * deletions and insertions are turned into genomic pos/ref/alt records
      and passed to ``get_ann``, whose result becomes the new notation.

    The rows are then written to ``<args.out>.csv`` with the converted
    notation, and every change is recorded in ``<args.out>.log``.

    Args:
        args: Namespace providing ``gff``, ``ref``, ``csv`` and ``out``.

    Raises:
        IndexError: If a row names a gene that is not in the GFF.
        KeyError: If a mutation matches none of the supported notations.
    """
    genes = load_gff(args.gff)
    refseq = pp.fasta(args.ref).fa_dict

    # Notations that are already in their final form.
    # NOTE(review): the "." after "c"/"r" is unescaped in all patterns of
    # this function and therefore matches any character; kept as it was.
    passthrough_patterns = [
        re.compile(r"p\..+"),
        re.compile(r"c.-[0-9]+[ACGT]>[ACGT]"),
        re.compile(r"c.[0-9]+dup[ACGT]+"),
        re.compile(r"c.[0-9]+_[0-9]+dup[ACGT]+"),
    ]
    passthrough_keywords = ("frameshift", "large_deletion")

    mutations = {}
    converted_mutations = {}
    with open(args.csv) as csv_in:
        for row in csv.DictReader(csv_in):
            gene_name = row["Gene"]
            mutation = row["Mutation"]
            key = (gene_name, mutation)
            gene = [
                g for g in genes
                if g.name == gene_name or g.locus_tag == gene_name
            ][0]

            r = re.search(r"r.([0-9]+)([acgt]+)>([acgt]+)", mutation)
            if r:
                converted_mutations[
                    key] = f"n.{r.group(1)}{r.group(2).upper()}>{r.group(3).upper()}"

            if (any(p.search(mutation) for p in passthrough_patterns)
                    or mutation in passthrough_keywords
                    or mutation.startswith("any_missense_codon_")):
                converted_mutations[key] = mutation

            r = re.search(r"c.([0-9]+)del", mutation)
            if r:
                # "ethA" "c.341del"
                position = int(r.group(1))
                start, end = _deletion_span(gene, position, position, False)
                mutations[key] = _indel_record(refseq, gene_name, start, end)

            r = re.search(r"c.([0-9]+)_([0-9]+)del", mutation)
            if r:
                start, end = _deletion_span(gene, int(r.group(1)),
                                            int(r.group(2)), False)
                mutations[key] = _indel_record(refseq, gene_name, start, end)

            r = re.search(r"c.-([0-9]+)del", mutation)
            if r:
                # The minus sign sits outside the capture group, so restore
                # it: the upstream offsets expect negative positions, as in
                # the range form below.  Without it the deletion was placed
                # inside the gene instead of upstream of it.
                position = -int(r.group(1))
                start, end = _deletion_span(gene, position, position, True)
                mutations[key] = _indel_record(refseq, gene_name, start, end)

            r = re.search(r"c.(-[0-9]+)_(-[0-9]+)del", mutation)
            if r:
                start, end = _deletion_span(gene, int(r.group(1)),
                                            int(r.group(2)), True)
                mutations[key] = _indel_record(refseq, gene_name, start, end)

            r = re.search(r"c.([0-9]+)_([0-9]+)ins([ACGT]+)", mutation)
            if r:
                ins_start = int(r.group(1))
                ins_end = int(r.group(2))
                ins_seq = r.group(3)
                if gene.strand == "+":
                    # "rpoB" "c.1296_1297insTTC"
                    genome_start = gene.start + ins_start - 1
                    genome_end = gene.start + ins_end - 1
                else:
                    # "pncA" "c.521_522insT"
                    ins_seq = pp.revcom(ins_seq)
                    genome_start = gene.start - ins_start
                    genome_end = gene.start - ins_end + 2
                mutations[key] = _indel_record(refseq, gene_name,
                                               genome_start, genome_end,
                                               ins_seq)

    print("Converting %s mutations" % len(mutations))
    mutation_conversion = get_ann(mutations)
    for key in mutation_conversion:
        converted_mutations[key] = mutation_conversion[key]

    with open(args.csv) as csv_in, \
            open(args.out + ".csv", "w", newline="") as O, \
            open(args.out + ".log", "w") as L:
        reader = csv.DictReader(csv_in)
        # Take the columns from the input header instead of relying on a
        # loop variable left over from the first pass.
        writer = csv.DictWriter(O, fieldnames=reader.fieldnames or [])
        writer.writeheader()
        for row in reader:
            key = (row["Gene"], row["Mutation"])
            if row["Mutation"] != converted_mutations[key]:
                L.write(
                    f'Recoded {row["Gene"]} {row["Mutation"]} as {converted_mutations[key]}\n'
                )

            row["Mutation"] = converted_mutations[key]

            writer.writerow(row)
def main(args):
    """Report sites whose state changed on more than one branch of a tree.

    If ``args.fasta`` is not given, a SNP alignment is first produced from
    ``args.vcf`` with ``vcf2fasta.py`` (this requires ``args.ref``).  When no
    ``<fasta>.asr.state`` file exists, ``iqtree -asr`` is run on
    ``args.tree``.  Ancestral states of internal nodes and the aligned
    sequences of the samples are then combined per site, and every branch
    where a node and its parent carry different A/C/G/T states counts as one
    origin.  Sites with more than one origin are written to ``args.out`` as
    ``<second element of the VCF position>\\t<number of origins>``.

    Args:
        args: Namespace providing ``vcf``, ``fasta``, ``ref``, ``model``,
            ``tree`` and ``out``.
    """
    vcf_class = pp.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()

    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            # Exit with a failure status; quit() reported success.
            sys.exit(1)
        pp.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" %
            vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    if pp.nofile("%s.asr.state" % args.fasta):
        pp.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr"
            % vars(args))

    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    # The part after "/" in a node name is dropped here.
    # NOTE(review): the nodes themselves are not renamed, so a name that
    # contains "/" will not be found in ``states`` below - confirm.
    node_names = {tree.name} | {
        n.name.split("/")[0]
        for n in tree.get_descendants()
    }
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    with open(states_file) as state_lines:
        for l in tqdm(state_lines):
            if l[0] == "#": continue
            row = l.strip().split()
            if row[0] == "Node": continue
            site = int(row[1])
            if row[0] not in internal_node_names: continue
            states[site][row[0]] = row[2]

    seqs = pp.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    acgt = {"A", "C", "G", "T", "a", "c", "g", "t"}
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = {states[site][n] for n in node_names}
        if len(nucleotides) == 1: continue

        # Branches on which the state differs from the parent's state.
        origins = []

        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree: continue
            node_state = states[site][n.name]
            # n.up is the direct parent, i.e. get_ancestors()[0].
            parent_state = n.up.state
            if (node_state != parent_state and node_state in acgt
                    and parent_state in acgt):
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))

    with open(args.out, "w") as O:
        for _, vcf_position, origins in convergent_sites:
            O.write("%s\t%s\n" % (vcf_position[1], len(origins)))
Ejemplo n.º 11
0
def main_preprocess(args):
    """Build a merged VCF from ``args.seqs`` and write annotated mutations.

    The visible flow is: load the configuration for ``args.db``, call
    ``fasta2vcf`` to create ``merged.vcf.gz``, annotate it with
    ``get_variant_data``, copy the annotation into each entry of
    ``mutations``, write ``<args.out>.csv`` and call ``quit()``.

    NOTE(review): as written this function cannot complete:

    * ``mutations`` is assigned further down in this same function
      (``mutations = []``), which makes it a local name, so the first read
      in ``range(len(mutations))`` raises ``UnboundLocalError``;
    * everything after ``quit()`` is unreachable.

    The unreachable part loads ``merged.fa`` and the iqtree outputs,
    classifies each variant site as unique / barcoding / convergent and
    writes a rerooted tree, a barcode BED and a mutation summary CSV.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]
    # NOTE(review): ``seqs`` is only bound when ``args.seqs`` is truthy, but
    # it is read unconditionally right below.
    if args.seqs:
        seqs = pp.fasta(args.seqs)

    samples = list(seqs.fa_dict.keys())

    fasta2vcf(args.seqs, "merged.vcf.gz")
    #
    #
    # pp.run_cmd("vcf2fasta.py --vcf merged.vcf.gz --ref %s" % conf["ref"])

    # pp.run_cmd("iqtree -s merged.fa -m GTR+F+R2 -bb 1000 -nt AUTO -asr -czb -redo")
    variant_data = get_variant_data("merged.vcf.gz", conf["ref"], conf["gff"],
                                    conf["proteins"])
    #### variant_data = {29366: {'alts': 'T', 'types': 'missense', 'changes': '365P>365S', 'gene': 'N', 'gene_function': 'Nucleocapsid protein', 'gene_reference': '10.1186/s40779-020-00240-0'}}

    #### mutations  = [{'position': 29868, 'mutation_type': 'convergent', 'origins': 14, 'branches': 'Node927'}]
    # NOTE(review): ``mutations`` has not been assigned at this point (see the
    # docstring); this loop raises UnboundLocalError.
    for i in range(len(mutations)):
        for key in variant_data[mutations[i]["position"]]:
            mutations[i][key] = variant_data[mutations[i]["position"]][key]

    with open(args.out + ".csv", "w") as O:
        tmp = list(mutations[0].keys())
        # Column order: first four keys, then the last six, then the rest.
        fieldnames = tmp[:4] + tmp[-6:] + tmp[4:-6]
        writer = csv.DictWriter(O, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(mutations)
    # NOTE(review): unconditional exit - nothing below this line ever runs.
    quit()
    seqs = pp.fasta("merged.fa").fa_dict

    tree = ete3.Tree("merged.fa.treefile", format=1)
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    # Split "<name>/<support>" node names into name and support value.
    for n in tree.traverse():
        if n.name.split("/")[0] in node_names:
            if "Node" in n.name:
                tmp = n.name.split("/")
                n.name = tmp[0]
                if len(tmp) > 1:
                    n.support = tmp[1]

    states = defaultdict(dict)
    sites = set()
    sys.stderr.write("Loading states\n")
    for l in tqdm(open("merged.fa.state")):
        if l[0] == "#": continue
        row = l.strip().split()
        if row[0] == "Node": continue
        states[int(row[1])][row[0]] = row[2]
        sites.add(int(row[1]))

    sys.stderr.write("Loading alignment sites\n")
    for site in tqdm(sites):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    barcoding_sites = []
    convergent_sites = []
    mutations = []
    for site in tqdm(sites):
        if site not in variant_data:
            continue  ####### Not sure if this is right yet
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1: continue

        # Set up storage objects
        origins = []
        internal_node_change = False

        # Record every node whose state differs from its parent's state.
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree: continue
            node_state = states[site][n.name]
            if node_state != n.get_ancestors()[0].state:
                origins.append(n.name)
                if n.name in internal_node_names:
                    internal_node_change = True
            n.add_feature("state", node_state)

        # One change on an internal node -> "barcoding"; more than one
        # change -> "convergent"; anything else stays "unique".
        type = "unique"
        if internal_node_change and len(origins) == 1:
            type = "barcoding"
            barcoding_sites.append(site)
        if len(origins) > 1:
            type = "convergent"
            convergent_sites.append(site)

        tmp_data = {
            "position": site,
            "mutation_type": type,
            "origins": len(origins),
            "branches": ",".join(origins),
            "gene": variant_data[site]["gene"],
            "gene_function": variant_data[site]["gene_function"],
            "gene_reference": variant_data[site]["gene_reference"],
            "alts": variant_data[site]["alts"],
            "functional_types": variant_data[site]["types"],
            "changes": variant_data[site]["changes"]
        }
        for sample in leaf_names:
            tmp_data[sample] = states[site][sample]
        mutations.append(tmp_data)

    print("Barcoding sites: ", barcoding_sites)
    print("Convergent sites: ", convergent_sites)

    # Reroot tree at S/L types
    outgroup_leaf_names = [s for s in leaf_names if seqs[s][8782 - 1] == "T"]
    tree.set_outgroup(tree.get_common_ancestor(outgroup_leaf_names))

    tree.write(format=1, outfile=args.out + ".tree")

    # One BED line per barcoding site and allele, labelled with the name of
    # the common ancestor of the leaves carrying that allele.
    with open(args.out + ".barcode.bed", "w") as O:
        for pos in barcoding_sites:
            for allele in set(list(states[pos].values())):
                tmp_samps = [x for x in leaf_names if states[pos][x] == allele]
                O.write("%s\t%s\t%s\t%s\t%s\n" %
                        (refseqname, pos - 1, pos, allele,
                         tree.get_common_ancestor(tmp_samps).name))

    with open(args.out + ".mutation_summary.csv", "w") as O:
        writer = csv.DictWriter(
            O,
            fieldnames=[
                "position", "mutation_type", "origins", "branches", "gene",
                "gene_function", "gene_reference", "alts", "functional_types",
                "changes"
            ] + list(leaf_names))
        writer.writeheader()
        for row in mutations:
            writer.writerow(row)
Ejemplo n.º 12
0
def main_profile(args):
    """Profile one sample from an assembly or from sequencing reads.

    With ``args.fasta`` the variants are obtained through
    ``get_ref_variants`` against the configured reference.  Otherwise the
    reads (``args.read1`` and optionally ``args.read2``) are trimmed unless
    ``args.no_trim`` is set, mapped to the reference, variants are called
    and a consensus FASTA is written.  The barcode-derived clade is printed
    to stdout as ``<prefix>\\t<clade>`` and, together with the annotated
    variants, dumped to ``<args.dir>/<args.prefix>.results.json``.

    Args:
        args: Namespace providing ``dir``, ``prefix``, ``db``, ``fasta``,
            ``read1``, ``read2``, ``no_trim``, ``threads``, ``mapper``,
            ``platform`` and ``caller``.

    Exits with status 1 when the input options are inconsistent.
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            # Signal the usage error; quit() reported success.
            sys.exit(1)
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            sys.exit(1)
        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            # Remove the trimmed read files created above.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    # NOTE(review): ``refseq``/``refseqname`` are not used below.
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]

    results = {}
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data

    # Close the handle so the JSON is flushed to disk deterministically.
    with open("%s.results.json" % files_prefix, "w") as results_file:
        json.dump(results, results_file)
Ejemplo n.º 13
0
def main(args):
    """Split a BED file if ``args.bed`` is set, otherwise a FASTA file.

    ``args.size`` and ``args.reformat`` are forwarded unchanged to
    ``pp.split_bed`` or to the ``splitchr`` method of the loaded FASTA.
    """
    if not args.bed:
        pp.fasta(args.fasta).splitchr(args.size, reformat=args.reformat)
        return
    pp.split_bed(args.bed, args.size, reformat=args.reformat)
Ejemplo n.º 14
0
def find_ancestral_mutations(msa_file,
                             tree_file,
                             states_file,
                             variant_sites=None):
    """Classify alignment sites by how often their state changed on a tree.

    Node names of the form ``<name>/<support>`` are split (for names
    containing ``Node``) and the tree is written back to
    ``<tree_file>.reformatted.tree``.  Ancestral states of internal nodes are
    read from ``states_file`` and combined with the aligned sequences of
    ``msa_file``.  For every site that is not constant across all nodes, each
    node whose state differs from its parent's (ignoring ``N`` on either
    side) counts as one origin.

    Args:
        msa_file: Path of the alignment FASTA.
        tree_file: Path of the Newick tree (ete3 ``format=1``).
        states_file: Path of the ancestral state table.
        variant_sites: Optional container of 1-based sites to restrict the
            analysis to; when empty or None all sites are used.

    Returns:
        A list with one dict per variable site holding ``position``,
        ``mutation_type`` (``unique``, ``barcoding`` when the only change is
        on an internal node, ``convergent`` for more than one change),
        ``origins``, ``branches`` and the state of every leaf keyed by name.
    """
    seqs = pp.fasta(msa_file).fa_dict

    tree = ete3.Tree(tree_file, format=1)
    node_names = {tree.name} | {
        n.name.split("/")[0]
        for n in tree.get_descendants()
    }
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    # Move the "/<support>" suffix of internal node names into n.support.
    for n in tree.traverse():
        if n.name.split("/")[0] in node_names:
            if "Node" in n.name:
                tmp = n.name.split("/")
                n.name = tmp[0]
                if len(tmp) > 1:
                    n.support = tmp[1]

    tree.write(format=1, outfile=tree_file + ".reformatted.tree")

    states = defaultdict(dict)
    sites = set()
    sys.stderr.write("Loading states\n")
    with open(states_file) as state_lines:
        for l in tqdm(state_lines):
            if l[0] == "#": continue
            row = l.strip().split()
            if row[0] == "Node": continue
            site = int(row[1])
            if variant_sites and site not in variant_sites: continue
            if row[0] not in internal_node_names: continue
            states[site][row[0]] = row[2]
            sites.add(site)

    sys.stderr.write("Loading alignment sites\n")
    for site in tqdm(sites):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    mutations = []
    for site in tqdm(sites):
        nucleotides = {states[site][n] for n in node_names}
        if len(nucleotides) == 1: continue

        origins = []
        internal_node_change = False

        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree: continue
            node_state = states[site][n.name]
            # n.up is the direct parent, i.e. get_ancestors()[0].
            parent_state = n.up.state
            if (node_state != parent_state and node_state != "N"
                    and parent_state != "N"):
                origins.append(n.name)
                if n.name in internal_node_names:
                    internal_node_change = True
            n.add_feature("state", node_state)

        if len(origins) > 1:
            mutation_type = "convergent"
        elif internal_node_change and len(origins) == 1:
            mutation_type = "barcoding"
        else:
            mutation_type = "unique"

        tmp_data = {
            "position": site,
            "mutation_type": mutation_type,
            "origins": len(origins),
            "branches": ",".join(origins),
        }
        for sample in leaf_names:
            tmp_data[sample] = states[site][sample]
        mutations.append(tmp_data)

    return mutations