コード例 #1
0
def run(args):
    """Infer ancestral sequences for a tree and write them out as node data.

    Reads the tree named by ``args.tree`` and the alignment named by
    ``args.alignment`` (a FASTA path, or a VCF that additionally requires
    ``args.vcf_reference``), runs ``ancestral_sequence_inference`` and writes
    the collected per-node results plus the reference sequence as JSON.
    When ``args.output_sequences`` is set and the input is FASTA, the node
    sequences are also written as FASTA. For VCF input a VCF is written to
    ``args.output_vcf`` or to a name derived from the alignment path.

    Returns 0 on success and 1 when the tree cannot be read, the VCF
    reference is missing, or the installed TreeTime is older than 0.7.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    # Warn only when every internal node is unnamed.
    if all(n.name is None for n in T.get_nonterminals()):
        print("\n*** WARNING: Tree has no internal node names!")
        print(
            "*** Without internal node names, ancestral sequences can't be linked up to the correct node later."
        )
        print(
            "*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments",
                file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Enforce treetime 0.7 or later
    # NOTE(review): distutils was removed from the standard library in
    # Python 3.12 (PEP 632); this import needs a replacement to run there.
    from distutils.version import StrictVersion
    import treetime
    if StrictVersion(treetime.version) < StrictVersion('0.7.0'):
        print("ERROR: this version of augur requires TreeTime 0.7 or later.",
              file=sys.stderr)
        return 1

    # Infer ambiguous bases if the user has requested that we infer them (either
    # explicitly or by default) and the user has not explicitly requested that
    # we keep them.
    infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs),
                                      infer_tips=infer_ambiguous)

    character_map = {}
    for x in tt.gtr.profile_map:
        if tt.gtr.profile_map[x].sum() == tt.gtr.n_states:
            # TreeTime treats all characters that are not valid IUPAC nucleotide chars as fully ambiguous
            # To clean up auspice output, we map all those to 'N'
            character_map[x] = 'N'
        else:
            character_map[x] = x

    anc_seqs['nodes'] = collect_mutations_and_sequences(
        tt,
        full_sequences=not is_vcf,
        infer_tips=infer_ambiguous,
        character_map=character_map)
    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        anc_seqs['reference'] = {"nuc": ref}
    else:
        anc_seqs['reference'] = {
            "nuc":
            "".join(T.root.sequence) if hasattr(T.root, 'sequence') else ''
        }

    # Alignment path with its last dot-separated component removed; used as
    # the stem of the default output names.
    alignment_stem = '.'.join(args.alignment.split('.')[:-1])

    out_name = get_json_name(args, alignment_stem + '_mutations.json')
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        # Full sequences are only collected for FASTA input (see
        # full_sequences above), so guard on the input type as well as on
        # the --output-vcf flag.
        # NOTE(review): assumes node data for VCF input has no "sequence"
        # entry -- confirm against collect_mutations_and_sequences.
        if is_vcf or args.output_vcf:
            print(
                "WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.",
                file=sys.stderr)
        else:
            records = [
                SeqRecord(Seq(node_data["sequence"]),
                          id=node_name,
                          description="")
                for node_name, node_data in anc_seqs["nodes"].items()
            ]
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to",
                  args.output_sequences,
                  file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    return 0
コード例 #2
0
ファイル: ancestral.py プロジェクト: charesredhat/augur
def run(args):
    """Infer ancestral sequences for a tree and write them out as node data.

    Reads the tree named by ``args.tree`` and the alignment named by
    ``args.alignment`` (a FASTA path, or a VCF that additionally requires
    ``args.vcf_reference``), runs ``ancestral_sequence_inference`` and writes
    the collected per-node results plus the reference sequence as JSON.
    When ``args.output_sequences`` is set and the input is FASTA, the node
    sequences are also written as FASTA. For VCF input a VCF is written to
    ``args.output_vcf`` or to a name derived from the alignment path.

    Returns 0 on success and 1 when the tree cannot be read, the VCF
    reference is missing, or ``--keep-ambiguous`` is requested for FASTA
    input with a TreeTime older than 0.5.6.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    # Warn only when every internal node is unnamed.
    if all(n.name is None for n in T.get_nonterminals()):
        print("\n*** WARNING: Tree has no internal node names!")
        print(
            "*** Without internal node names, ancestral sequences can't be linked up to the correct node later."
        )
        print(
            "*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments",
                file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    # NOTE(review): distutils was removed from the standard library in
    # Python 3.12 (PEP 632); this import needs a replacement to run there.
    from distutils.version import StrictVersion
    import treetime
    if args.keep_ambiguous and not is_vcf and StrictVersion(
            treetime.version) < StrictVersion('0.5.6'):
        print(
            "ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer."
            + "\nYour version is " + treetime.version +
            "\nUpdate TreeTime or run without the --keep-ambiguous flag.",
            file=sys.stderr)
        return 1

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)
    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        anc_seqs['reference'] = {"nuc": ref}
    else:
        anc_seqs['reference'] = {
            "nuc":
            "".join(T.root.sequence) if hasattr(T.root, 'sequence') else ''
        }

    # Alignment path with its last dot-separated component removed; used as
    # the stem of the default output names.
    alignment_stem = '.'.join(args.alignment.split('.')[:-1])

    out_name = get_json_name(args, alignment_stem + '_mutations.json')
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        # The warning below says sequence output is FASTA-only, so guard on
        # the input type as well as on the --output-vcf flag.
        # NOTE(review): assumes node data for VCF input has no "sequence"
        # entry -- confirm against collect_sequences_and_mutations.
        if is_vcf or args.output_vcf:
            print(
                "WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.",
                file=sys.stderr)
        else:
            records = [
                SeqRecord(Seq(node_data["sequence"]),
                          id=node_name,
                          description="")
                for node_name, node_data in anc_seqs["nodes"].items()
            ]
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to",
                  args.output_sequences,
                  file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    return 0
コード例 #3
0
ファイル: wrappers.py プロジェクト: junhuili/treetime
def export_sequences_and_tree(tt, basename, is_vcf=False, zero_based=False,
                              report_ambiguous=False, timetree=False, confidence=False):
    """Write reconstructed sequences, node dates and annotated trees to disk.

    All output names are ``basename`` with a fixed suffix appended:

    * ``ancestral_sequences.vcf`` (``is_vcf``) or ``ancestral_sequences.fasta``
      (when ``tt.aln`` is set),
    * ``dates.tsv``, ``timetree.nexus`` and ``divergence_tree.nexus`` when
      ``timetree`` is true, otherwise ``annotated_tree.nexus``.

    Parameters
    ----------
    tt : tree/ancestral-inference object providing ``tree``, ``aln``, ``gtr``,
        ``seq_len`` and the export helpers called below.
    basename : str prefix for every output file name.
    is_vcf : write the alignment as VCF after restoring ambiguous tip sites.
    zero_based : report mutation positions as stored; otherwise add 1.
    report_ambiguous : keep mutations whose start or end state equals
        ``tt.gtr.ambiguous``; by default they are dropped from the comment.
    timetree : also export node dates and a divergence tree.
    confidence : add the bounds of the 90% max posterior region to the
        dates file.

    The nodes of ``tt.tree`` are modified in place (``confidence``,
    ``comment``, possibly ``name`` and, for time trees, ``branch_length``).
    """
    seq_info = is_vcf or tt.aln
    if is_vcf:
        tt.recover_var_ambigs()
        outaln_name = basename + 'ancestral_sequences.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), outaln_name)
    elif tt.aln:
        outaln_name = basename + 'ancestral_sequences.fasta'
        AlignIO.write(tt.get_reconstructed_alignment(), outaln_name, 'fasta')
    if seq_info:
        print("\n--- alignment including ancestral nodes saved as  \n\t %s\n"%outaln_name)

    # decorate tree with inferred mutations
    terminal_count = 0
    offset = 0 if zero_based else 1
    fh_dates = None
    if timetree:
        dates_fname = basename + 'dates.tsv'
        fh_dates = open(dates_fname, 'w')
    # The dates file is written while walking the tree; the try/finally makes
    # sure the handle is flushed and closed even if a node raises.
    try:
        if timetree:
            if confidence:
                fh_dates.write('#Lower and upper bound delineate the 90% max posterior region\n')
                fh_dates.write('#node\tdate\tnumeric date\tlower bound\tupper bound\n')
            else:
                fh_dates.write('#node\tdate\tnumeric date\n')
        for n in tt.tree.find_clades():
            if timetree:
                if confidence:
                    conf = tt.get_max_posterior_region(n, fraction=0.9)
                    fh_dates.write('%s\t%s\t%f\t%f\t%f\n'%(n.name, n.date, n.numdate,conf[0], conf[1]))
                else:
                    fh_dates.write('%s\t%s\t%f\n'%(n.name, n.date, n.numdate))

            n.confidence=None
            # due to a bug in older versions of biopython that truncated filenames in nexus export
            # we truncate them by hand and make them unique.
            if n.is_terminal() and len(n.name)>40 and bioversion<"1.69":
                n.name = n.name[:35]+'_%03d'%terminal_count
                terminal_count+=1
            n.comment=''
            if seq_info and len(n.mutations):
                if report_ambiguous:
                    n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations])+'"'
                else:
                    n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations
                                                          if tt.gtr.ambiguous not in [a,d]])+'"'
            if timetree:
                n.comment+=(',' if n.comment else '&') + 'date=%1.2f'%n.numdate
    finally:
        if fh_dates is not None:
            fh_dates.close()

    # write tree to file
    fmt_bl = "%1.6f" if tt.seq_len<1e6 else "%1.8e"
    if timetree:
        outtree_name = basename + 'timetree.nexus'
        print("--- saved divergence times in \n\t %s\n"%dates_fname)
        Phylo.write(tt.tree, outtree_name, 'nexus')
    else:
        outtree_name = basename + 'annotated_tree.nexus'
        Phylo.write(tt.tree, outtree_name, 'nexus', format_branch_length=fmt_bl)
    print("--- tree saved in nexus format as  \n\t %s\n"%outtree_name)

    if timetree:
        # Overwrite branch lengths with mutation lengths so the same tree
        # object can be exported a second time as a divergence tree.
        for n in tt.tree.find_clades():
            n.branch_length = n.mutation_length
        outtree_name = basename + 'divergence_tree.nexus'
        Phylo.write(tt.tree, outtree_name, 'nexus', format_branch_length=fmt_bl)
        print("--- divergence tree saved in nexus format as  \n\t %s\n"%outtree_name)
コード例 #4
0
def run(args):
    """Run TreeTime on a tree and write the resulting tree and node data.

    Depending on ``args`` this infers a time tree (``args.timetree``),
    reconstructs ancestral sequences (``args.ancestral`` of ``'joint'`` or
    ``'marginal'``), or only instantiates ``TreeAnc`` to name internal nodes.
    The tree is written as newick to ``args.output`` and the node metadata as
    JSON to ``args.node_data``; when those are not given, names are derived
    from the alignment path (or from the tree path if there is no alignment).
    For VCF input with a reconstruction, a VCF including the ancestral
    sequences is written as well.

    Returns 0 when the tree was written, -1 on any reported error.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']

    def default_name(suffix):
        # Default output names strip the last dot-separated component of the
        # alignment path; without an alignment the tree path is used instead.
        source = args.alignment or args.tree
        return '.'.join(source.split('.')[:-1]) + suffix

    # check if tree is provided and can be read
    T = None  # stays None when neither format parses
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            tree_meta['input_tree'] = args.tree
            break
        except Exception:
            # best effort: try the next format, failure is reported below
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.ancestral or args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(
                SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                    id=n.name,
                                    name=n.name,
                                    description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        sequences = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
        aln = sequences
    else:
        aln = args.alignment

    if args.output:
        tree_fname = args.output
    else:
        tree_fname = default_name('_tt.nwk')

    if args.timetree and T:
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        # if anything but a list of seqs, don't send as a list
        if args.root and len(args.root) == 1:
            args.root = args.root[0]

        tt = timetree(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root or 'best',
            # explicit None check, otherwise Tc can't be set to 0
            Tc=args.coalescent if args.coalescent is not None else 0.01,
            use_marginal=args.time_marginal or False,
            branch_length_mode=args.branch_length_mode or 'auto',
            clock_rate=args.clock_rate,
            n_iqd=args.n_iqd)

        tree_meta['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes.extend([
            'numdate', 'clock_length', 'mutation_length', 'mutations',
            'raw_date', 'date'
        ])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        tt = ancestral_sequence_inference(
            tree=T,
            aln=aln,
            ref=ref,
            marginal=args.ancestral,
            optimize_branch_length=args.branchlengths,
            branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        #TreeTime overwrites ambig sites on tips during ancestral reconst.
        #Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    tree_success = None
    if T:
        import json
        tree_success = Phylo.write(T,
                                   tree_fname,
                                   'newick',
                                   format_branch_length='%1.8f')
        if args.node_data:
            node_data_fname = args.node_data
        else:
            node_data_fname = default_name('.node_data')

        # json.dump returns None and raises on failure, so its result is not
        # a usable success flag.
        with open(node_data_fname, 'w') as ofile:
            json.dump(tree_meta, ofile)

    #If VCF and ancestral reconst. was done, output VCF including new ancestral seqs
    if is_vcf and (args.ancestral or args.timetree):
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = default_name('.vcf')
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

    # Success is reported for every run that wrote the tree, not only for
    # VCF reconstructions.
    return 0 if tree_success else -1
コード例 #5
0
ファイル: wrappers.py プロジェクト: neherlab/treetime
def export_sequences_and_tree(tt, basename, is_vcf=False, zero_based=False,
                              report_ambiguous=False, timetree=False, confidence=False):
    """Write reconstructed sequences, node dates and annotated trees to disk.

    All output names are ``basename`` with a fixed suffix appended:

    * ``ancestral_sequences.vcf`` (``is_vcf``) or ``ancestral_sequences.fasta``
      (when ``tt.aln`` is set),
    * ``dates.tsv``, ``timetree.nexus`` and ``divergence_tree.nexus`` when
      ``timetree`` is true, otherwise ``annotated_tree.nexus``.

    Parameters
    ----------
    tt : tree/ancestral-inference object providing ``tree``, ``aln``, ``gtr``,
        ``seq_len`` and the export helpers called below.
    basename : str prefix for every output file name.
    is_vcf : write the alignment as VCF after restoring ambiguous tip sites.
    zero_based : report mutation positions as stored; otherwise add 1.
    report_ambiguous : keep mutations whose start or end state equals
        ``tt.gtr.ambiguous``; by default they are dropped from the comment.
    timetree : also export node dates and a divergence tree.
    confidence : add the bounds of the 90% max posterior region to the
        dates file.

    The nodes of ``tt.tree`` are modified in place (``confidence``,
    ``comment``, possibly ``name`` and, for time trees, ``branch_length``).
    """
    seq_info = is_vcf or tt.aln
    if is_vcf:
        tt.recover_var_ambigs()
        outaln_name = basename + 'ancestral_sequences.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), outaln_name)
    elif tt.aln:
        outaln_name = basename + 'ancestral_sequences.fasta'
        AlignIO.write(tt.get_reconstructed_alignment(), outaln_name, 'fasta')
    if seq_info:
        print("\n--- alignment including ancestral nodes saved as  \n\t %s\n"%outaln_name)

    # decorate tree with inferred mutations
    terminal_count = 0
    offset = 0 if zero_based else 1
    fh_dates = None
    if timetree:
        dates_fname = basename + 'dates.tsv'
        fh_dates = open(dates_fname, 'w')
    # The dates file is written while walking the tree; the try/finally makes
    # sure the handle is flushed and closed even if a node raises.
    try:
        if timetree:
            if confidence:
                fh_dates.write('#Lower and upper bound delineate the 90% max posterior region\n')
                fh_dates.write('#node\tdate\tnumeric date\tlower bound\tupper bound\n')
            else:
                fh_dates.write('#node\tdate\tnumeric date\n')
        for n in tt.tree.find_clades():
            if timetree:
                if confidence:
                    conf = tt.get_max_posterior_region(n, fraction=0.9)
                    fh_dates.write('%s\t%s\t%f\t%f\t%f\n'%(n.name, n.date, n.numdate,conf[0], conf[1]))
                else:
                    fh_dates.write('%s\t%s\t%f\n'%(n.name, n.date, n.numdate))

            n.confidence=None
            # due to a bug in older versions of biopython that truncated filenames in nexus export
            # we truncate them by hand and make them unique.
            if n.is_terminal() and len(n.name)>40 and bioversion<"1.69":
                n.name = n.name[:35]+'_%03d'%terminal_count
                terminal_count+=1
            n.comment=''
            if seq_info and len(n.mutations):
                if report_ambiguous:
                    n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations])+'"'
                else:
                    n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations
                                                          if tt.gtr.ambiguous not in [a,d]])+'"'
            if timetree:
                n.comment+=(',' if n.comment else '&') + 'date=%1.2f'%n.numdate
    finally:
        if fh_dates is not None:
            fh_dates.close()

    # write tree to file
    fmt_bl = "%1.6f" if tt.seq_len<1e6 else "%1.8e"
    if timetree:
        outtree_name = basename + 'timetree.nexus'
        print("--- saved divergence times in \n\t %s\n"%dates_fname)
        Phylo.write(tt.tree, outtree_name, 'nexus')
    else:
        outtree_name = basename + 'annotated_tree.nexus'
        Phylo.write(tt.tree, outtree_name, 'nexus', format_branch_length=fmt_bl)
    print("--- tree saved in nexus format as  \n\t %s\n"%outtree_name)

    if timetree:
        # Overwrite branch lengths with mutation lengths so the same tree
        # object can be exported a second time as a divergence tree.
        for n in tt.tree.find_clades():
            n.branch_length = n.mutation_length
        outtree_name = basename + 'divergence_tree.nexus'
        Phylo.write(tt.tree, outtree_name, 'nexus', format_branch_length=fmt_bl)
        print("--- divergence tree saved in nexus format as  \n\t %s\n"%outtree_name)
コード例 #6
0
def run(args):
    """Infer ancestral sequences for a tree and write them out as JSON.

    Reads the tree named by ``args.tree`` (newick, then nexus) and the
    alignment named by ``args.alignment`` (a FASTA path, or a VCF that
    additionally requires ``args.vcf_reference``), runs
    ``ancestral_sequence_inference`` and writes the collected per-node
    results to ``args.output`` or to a name derived from the alignment path.
    For VCF input a VCF including the ancestral sequences is written too.

    Returns 0 on success and 1 when the tree cannot be read, the VCF
    reference is missing, or ``--keep-ambiguous`` is requested for FASTA
    input with a TreeTime older than 0.5.6.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}
    # check if tree is provided and can be read
    T = None  # stays None when neither format parses
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            break
        except Exception:
            # best effort: try the next format, failure is reported below
            pass
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree, file=sys.stderr)
        return 1

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments",
                  file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    # NOTE(review): distutils was removed from the standard library in
    # Python 3.12 (PEP 632); this import needs a replacement to run there.
    from distutils.version import StrictVersion
    import treetime
    if args.keep_ambiguous and not is_vcf and StrictVersion(treetime.version) < StrictVersion('0.5.6'):
        print("ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer."+
                "\nYour version is "+treetime.version+
                "\nUpdate TreeTime or run without the --keep-ambiguous flag.",
              file=sys.stderr)
        return 1

    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
                                      fill_overhangs = not(args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    # Alignment path with its last dot-separated component removed; used as
    # the stem of the default output names.
    alignment_stem = '.'.join(args.alignment.split('.')[:-1])

    if args.output:
        anc_seqs_fname = args.output
    else:
        anc_seqs_fname = alignment_stem + '.anc_seqs.json'

    write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to",anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",vcf_fname, file=sys.stdout)

    return 0
コード例 #7
0
def run(args):
    """Infer ancestral sequences for a tree and write them out as JSON.

    Reads the tree named by ``args.tree`` (newick, then nexus) and the
    alignment named by ``args.alignment`` (a FASTA path, or a VCF that
    additionally requires ``args.vcf_reference``), runs
    ``ancestral_sequence_inference`` and writes the collected per-node
    results to ``args.output`` or to a name derived from the alignment path.
    For VCF input a VCF including the ancestral sequences is written too.

    Returns 0 when ``write_json`` reports success, 1 when it does not, and
    -1 when the tree cannot be read or the VCF reference is missing.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}
    # check if tree is provided and can be read
    T = None  # stays None when neither format parses
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            break
        except Exception:
            # best effort: try the next format, failure is reported below
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree,
              file=sys.stderr)
        return -1

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments",
                file=sys.stderr)
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    # Alignment path with its last dot-separated component removed; used as
    # the stem of the default output names.
    alignment_stem = '.'.join(args.alignment.split('.')[:-1])

    if args.output:
        anc_seqs_fname = args.output
    else:
        anc_seqs_fname = alignment_stem + '.anc_seqs.json'

    # NOTE(review): the exit code below relies on write_json returning a
    # truthy value on success -- confirm against its implementation.
    anc_seqs_success = write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to", anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    if anc_seqs_success:
        return 0
    else:
        return 1