コード例 #1
0
def export_sequence_fasta(T, path):
    """Write the sequences stored on the nodes of a tree to a fasta file.

    The output file name is obtained from ``tree_sequence_alignment(path, 'nuc')``.
    The first record is the sequence of ``T.root`` labelled ``'root'``; it is
    followed by one record per clade yielded by ``T.find_clades()``, labelled
    with the clade's own name.

    Parameters
    ----------
    T : tree object exposing ``root`` and ``find_clades()``; every node must
        carry a ``sequence`` attribute that can be joined into a string.
    path : value forwarded to ``tree_sequence_alignment``.
    """
    from Bio import AlignIO
    from Bio.Align import MultipleSeqAlignment
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    def as_record(clade, label):
        # node.sequence is joined into a plain string before wrapping it
        return SeqRecord(Seq(''.join(clade.sequence)), name=label, id=label)

    out_fname = tree_sequence_alignment(path, 'nuc')

    # NOTE(review): find_clades() presumably yields the root as well, so the
    # root sequence would appear twice (once as 'root', once under its own
    # name) -- confirm this is intended.
    records = [as_record(T.root, 'root')]
    records.extend(as_record(clade, clade.name) for clade in T.find_clades())

    AlignIO.write(MultipleSeqAlignment(records), out_fname, 'fasta')


if __name__ == '__main__':
    # Command-line entry point of the tree-building step. The parser is
    # created by generic_argparse and extended with the options below.
    parser = generic_argparse("Build the tree from the prepared sequence data")
    parser.add_argument('--nthreads', type=int, default=2,
                        help='number of threads')
    # The boolean switches below are all off unless given on the command line.
    parser.add_argument('--ancestral', action='store_true', default=False,
                        help='calculate ancestral sequences')
    parser.add_argument('--timetree', action='store_true', default=False,
                       help='infer time stamped phylogeny')
    parser.add_argument('--confidence', action='store_true', default=False,
                       help='estimate confidence intervals for node timing')
    parser.add_argument('--Tc', type=float, default=0.0,
                       help='coalescence time scale measured in substitution rate units')
    parser.add_argument('--keeproot', action='store_true', default=False,
                        help="don't reroot the tree")
    args = parser.parse_args()
    # `path` is not declared here; presumably generic_argparse defines it
    # -- TODO confirm.
    path = args.path
コード例 #2
0

def tree_layout(T):
    """Attach plotting coordinates (``xvalue``, ``yvalue``) to every node of T.

    Nodes are visited in postorder. Terminal nodes receive consecutive integer
    y-values counting down from ``T.count_terminals()``; an internal node is
    placed halfway between the smallest and largest y-value of its children.
    The x-value of every node is copied from ``node.attr['div']``.

    The tree is modified in place; nothing is returned.
    """
    next_leaf_y = T.count_terminals()
    for clade in T.find_clades(order='postorder'):
        if not clade.is_terminal():
            # postorder guarantees the children have been laid out already
            child_ys = [child.yvalue for child in clade]
            clade.yvalue = 0.5 * (np.min(child_ys) + np.max(child_ys))
        else:
            clade.yvalue = next_leaf_y
            next_leaf_y -= 1
        clade.xvalue = clade.attr['div']


if __name__ == '__main__':
    # Command-line entry point of the auspice export step. Both extra
    # options are mandatory.
    parser = generic_argparse("Export precomputed data as auspice jsons")
    parser.add_argument(
        '--prefix',
        required=True,
        help=
        "prefix for json files that are passed on to auspice (e.g., zika.fasta)"
    )
    parser.add_argument(
        '--reference',
        required=True,
        help="reference sequence needed for entropy feature export")

    args = parser.parse_args()
    # `path` is not declared here; presumably generic_argparse defines it
    # -- TODO confirm.
    path = args.path

    # Load the tree from the newick file whose name tree_newick derives
    # from `path`.
    T = Phylo.read(tree_newick(path), 'newick')
コード例 #3
0
        ref_array = np.array(seqs[reference])
        ungapped = ref_array != '-'
        ref_aln_array = np.array(aln)[:, ungapped]
    else:
        print("reference", reference, "not found in alignment")
        return

    out_seqs = []
    for seq, seq_array in zip(aln, ref_aln_array):
        seq.seq = Seq.Seq(''.join(seq_array))
        if keep_reference or seq.name != reference:
            out_seqs.append(seq)

    return out_seqs


if __name__ == '__main__':
    # Command-line entry point: remove every alignment column that is a gap
    # in the reference sequence and write the result to ref_alignment(path).
    parser = generic_argparse(
        "strip out all positions that don't align to the reference")
    parser.add_argument('--reference',
                        required=True,
                        help='the name of the reference sequence')
    parser.add_argument('--keep_reference',
                        action='store_true',
                        default=False,
                        help='keep the reference as part of the alignment')
    args = parser.parse_args()

    # Forward --keep_reference; previously the flag was parsed but never
    # reached strip_non_reference, so it had no effect.
    seqs = strip_non_reference(args.path, args.reference,
                               keep_reference=args.keep_reference)
    if seqs is None:
        # strip_non_reference prints a message and returns None when the
        # reference is not in the alignment; stop instead of handing None
        # to write_fasta.
        raise SystemExit(1)
    write_fasta(seqs, ref_alignment(args.path))
コード例 #4
0
        seqs[seq.name] = seq

    muts = {}
    muts[T.root.name]=''
    for node in T.get_nonterminals():
        pseq = seqs[node.name]
        for c in node:
            cseq = seqs[c.name]
            muts[c.name]=','.join([anc+str(pos+1)+der
                        for pos, (anc, der) in enumerate(zip(pseq, cseq))
                        if anc!=der])

    return muts


if __name__ == '__main__':
    # Command-line entry point: compute amino acid mutations for every
    # translated gene and store them per node in the tree meta data.
    parser = generic_argparse("Assign amino acid mutations to the tree")
    args = parser.parse_args()
    path = args.path

    tree_meta = read_tree_meta_data(path)
    T = Phylo.read(tree_newick(path), 'newick')

    for gene, aln_fname in get_genes_and_alignments(path, tree=True):
        # The nucleotide alignment carries no amino acid mutations. It must
        # be skipped entirely: previously the assignment loop below still ran
        # for 'nuc', using the mutations of the preceding gene (or raising
        # NameError when 'nuc' came first).
        if gene == 'nuc':
            continue

        muts = get_amino_acid_mutations(T, aln_fname)
        mutation_key = gene + '_mutations'
        for node_name in tree_meta:
            tree_meta[node_name][mutation_key] = muts[node_name]
    write_tree_meta_data(path, tree_meta)
コード例 #5
0
    #N were causing problems later. Removing all variance and allowing these
    #regions to be the same as Ref should be the same, anyway.

    #with open(ref_fasta(path), "w") as output_handle:
    #    SeqIO.write(maskedRef_seqRec, output_handle, "fasta")

    return maskRefFile


if __name__ == '__main__':
    # Command-line entry point of the VCF preparation step.
    # TODO: allow passing a plain vcf file instead of a gzvcf file?

    # Record the start time (its use is not visible in this excerpt).
    import time
    start = time.time()

    parser = generic_argparse(
        "parse vcf/vcf.gz file and meta_data to drop samples")
    parser.add_argument("--gzvcf",
                        required=True,
                        type=str,
                        help="file with input sequences as gunzipped vcf")
    parser.add_argument(
        "--ref",
        required=True,
        type=str,
        help="fasta file with reference sequence that vcf is mapped to")
    # Optional: args.strip_loci is None when the option is not given.
    parser.add_argument("--strip_loci",
                        required=False,
                        type=str,
                        help="file that contains loci to strip from analysis")
    args = parser.parse_args()
    # `path` is not declared here; presumably generic_argparse defines it
    # -- TODO confirm.
    path = args.path
コード例 #6
0
ファイル: prepare.py プロジェクト: nextstrain/augurlinos
    dropped_strains = []
    if os.path.isfile(fname):
        with open(fname) as ifile:
            for line in ifile:
                fields = line.strip().split('#')
                if fields[0].strip():
                    dropped_strains.append(fields[0].strip())
    else:
        print("File with dropped strains not found. Looking for", fname)

    return dropped_strains



if __name__ == '__main__':
    # Command-line entry point: split a fasta file into sequences and a meta
    # data table, then discard the strains listed as dropped.
    parser = generic_argparse("parse fasta file and separate meta_data into table")
    parser.add_argument("--sequences", required=True, type=str,
                        help = "file with input sequences as fasta")
    args = parser.parse_args()
    # `path` is not declared here; presumably generic_argparse defines it
    # -- TODO confirm.
    path = args.path

    # Integer index -> meta data field name, passed to parse_fasta.
    # Presumably the index is the position of the field in the fasta header;
    # indices 1, 7 and 9 are not mapped -- TODO confirm against parse_fasta.
    header_fields = {0:'strain', 2:'accession', 3:'date', 4:'region', 5:'country',
                    6:'division', 8:'db', 10:'authors', 11:'url', 12:'title',
                    13: 'journal', 14: 'paper_url'}

    sequences, meta = parse_fasta(args.sequences, header_fields)

    # Remove every dropped strain from both the sequences and the meta data.
    dropped_strains = get_dropped_strains(path)
    sequences = {k:v for k,v in sequences.items() if k not in dropped_strains}
    meta = {k:v for k,v in meta.items() if k not in dropped_strains}
コード例 #7
0
ファイル: find_drm.py プロジェクト: emmahodcroft/augurlinos
        if numResist not in drugMuts["Drug_Resistance"]:
            drugMuts["Drug_Resistance"].append(numResist)

    #for any with no resistance, add a 0 to tree_meta
    for seq, v in tree_meta.iteritems():
        if 'Drug_Resistance' not in tree_meta[seq]:
            tree_meta[seq]["Drug_Resistance"] = '0'

    write_tree_meta_data(path, tree_meta)

    return drugMuts


if __name__ == '__main__':
    # Command-line entry point of the drug resistance mutation search.
    parser = generic_argparse(
        "Find drug resistance mutations according to supplied file. ONLY WORKS FOR VCF FILES."
    )
    # --drm is not marked required, so args.drm is None when it is omitted.
    parser.add_argument('--drm', type=str, help="file of DRMs to find")

    args = parser.parse_args()
    # `path` is not declared here; presumably generic_argparse defines it
    # -- TODO confirm.
    path = args.path

    # Record the start time (its use is not visible in this excerpt).
    import time
    start = time.time()

    # Read the VCF alignment together with its reference fasta; the result
    # is indexed below by the keys 'sequences', 'positions' and 'reference'.
    compress_seq = read_in_vcf(tree_vcf_alignment(path), ref_fasta(path))

    sequences = compress_seq['sequences']
    positions = compress_seq['positions']
    ref = compress_seq['reference']
コード例 #8
0
ファイル: translate.py プロジェクト: emmahodcroft/augurlinos
            for line in ifile:
                fields = line.strip().split('#')
                if fields[0].strip():
                    genes.append(fields[0].strip())
    else:
        print("File with genes not found. Looking for", fname)

    featN = np.array(genes)
    if len(np.unique(featN)) != len(genes):
        print "You have duplicates in your genes file. They are being ignored."

    return genes


if __name__ == '__main__':
    # Command-line entry point of the translation step.
    parser = generic_argparse("Translate the nucleotide alignments")
    parser.add_argument('--reference', required=True,
                        help='genbank file containing the annotation')
    # One or more gene names; args.genes is None when the option is omitted.
    parser.add_argument('--genes', nargs='+', help="genes to translate")
    # EBH 11 Dec 17
    parser.add_argument('--vcf', action='store_true', default=False,
                        help="sequence is in VCF format")
    parser.add_argument('--assignMuts', action='store_true', default=False,
                        help="write amino acid mutations onto the tree")
    args = parser.parse_args()

    # `path` is not declared here; presumably generic_argparse defines it
    # -- TODO confirm.
    path = args.path

    # The original way of doing this called load_features twice!
    if not args.genes:
        genes = None  # if load_features is passed None it loads all
コード例 #9
0
ファイル: mugration.py プロジェクト: emmahodcroft/augurlinos
                    pdis = node.marginal_profile[0]
                    S = -np.sum(pdis*np.log(pdis+TINY))

                    marginal = [(alphabet[tt.gtr.alphabet[i]], pdis[i]) for i in range(len(tt.gtr.alphabet))]
                    marginal.sort(key=lambda x: x[1], reverse=True) # sort on likelihoods
                    marginal = [(a, b) for a, b in marginal if b > 0.01][:4] #only take stuff over 1% and the top 4 elements
                    conf = {a:b for a,b in marginal}
                    node.__setattr__(field + "_entropy", S)
                    node.__setattr__(field + "_confidence", conf)

            return tt, alphabet



if __name__ == '__main__':
    # Command-line entry point of the discrete trait ("mugration")
    # reconstruction.
    parser = generic_argparse("Infer ancestral states for a discrete character")
    # The meta data field to reconstruct; defaults to 'region'.
    parser.add_argument('--field', default='region',
                        help='meta data field to perform discrete reconstruction on')
    parser.add_argument('--confidence',action="store_true",
                        help='record the distribution of subleading mugration states')
    parser.add_argument('--vcf', action='store_true', default=False,
                        help="sequence is in VCF format")

    args = parser.parse_args()
    # `path` is not declared here; presumably generic_argparse defines it
    # -- TODO confirm.
    path = args.path
    # NOTE(review): T holds the return value of tree_newick(path) itself,
    # presumably a file name rather than a parsed tree -- confirm.
    T = tree_newick(path)

    # Record the start time (its use is not visible in this excerpt).
    import time
    start = time.time()

    seq_meta = read_sequence_meta_data(path)
コード例 #10
0
import os
from filenames import sequence_input, raw_alignment
from util import generic_argparse

if __name__ == '__main__':
    # Command-line entry point: align the input sequences with mafft and
    # rewrite the resulting alignment in upper case.
    import subprocess
    import sys

    parser = generic_argparse("Align sequences")
    parser.add_argument('--nthreads', type=int, default=2,
                        help="number of threads used by mafft")
    parser.add_argument('--aligner', default='mafft',
                        help="alignment program to use (only 'mafft' is implemented)")
    args = parser.parse_args()

    in_file = sequence_input(args.path)
    out_file = raw_alignment(args.path)

    if args.aligner != 'mafft':
        # Without an alignment run there is nothing valid to post-process,
        # so stop here instead of reading a missing or stale output file.
        print('not implemented')
        sys.exit(1)

    # Run mafft with an argument list (no shell), so file names containing
    # spaces or shell metacharacters are passed through unchanged. stdout
    # becomes the alignment, stderr goes to 'mafft_stderr' as before.
    with open(out_file, 'w') as aln_out, open('mafft_stderr', 'w') as err_out:
        status = subprocess.call(
            ['mafft', '--anysymbol', '--thread', str(args.nthreads), in_file],
            stdout=aln_out, stderr=err_out)
    if status != 0:
        sys.exit("mafft failed with exit status %d, see mafft_stderr" % status)

    # Normalise all aligned sequences to upper case, in place.
    from Bio import AlignIO
    aln = AlignIO.read(out_file, 'fasta')
    for seq in aln:
        seq.seq = seq.seq.upper()
    AlignIO.write(aln, out_file, 'fasta')