Exemple #1
0
def read_if_vcf(params):
    """
    Read the alignment from VCF when ``params.aln`` names a VCF file.

    Parameters
    ----------
    params : namespace
        ``aln`` is read if present. ``vcf_reference`` is read only when
        ``aln`` ends in ``.vcf`` / ``.vcf.gz`` (case-insensitive). ``gtr``
        (optional) and ``aa`` are read only after the VCF has been loaded.

    Returns
    -------
    tuple or int
        ``(aln, ref, fixed_pi)``. For non-VCF input ``aln`` is ``params.aln``
        unchanged (``None`` when the attribute is missing or unset) and the
        other two entries are ``None``. For VCF input ``aln`` and ``ref`` are
        the ``'sequences'`` and ``'reference'`` entries returned by
        ``read_vcf``; ``fixed_pi`` is a list of per-character frequencies of
        the reference when ``params`` has no ``gtr`` or ``gtr == "infer"``,
        otherwise ``None``.
        ``-1`` (not a tuple) is returned when a VCF is given without
        ``params.vcf_reference``.
    """
    # Look the attribute up defensively: reading ``params.aln`` directly would
    # raise AttributeError before any hasattr-style guard could take effect.
    aln = getattr(params, 'aln', None)
    ref = None
    fixed_pi = None

    if aln is None or not aln.lower().endswith(('.vcf', '.vcf.gz')):
        return aln, ref, fixed_pi

    if not params.vcf_reference:
        print("ERROR: a reference Fasta is required with VCF-format alignments")
        return -1

    compress_seq = read_vcf(aln, params.vcf_reference)
    aln = compress_seq['sequences']
    ref = compress_seq['reference']

    if not hasattr(params, 'gtr') or params.gtr == "infer":  # if not specified, set it:
        alpha = alphabets['aa'] if params.aa else alphabets['nuc']
        fixed_pi = [ref.count(base)/len(ref) for base in alpha]
        # When the last alphabet character never occurs in the reference, give
        # it a frequency of 0.05 and lower every entry by 0.01.
        # NOTE(review): presumably avoids a zero frequency for the gap
        # character; the offsets only cancel for a 5-letter alphabet - confirm.
        if fixed_pi[-1] == 0:
            fixed_pi[-1] = 0.05
            fixed_pi = [v-0.01 for v in fixed_pi]

    return aln, ref, fixed_pi
Exemple #2
0
def read_if_vcf(params):
    """
    Read the alignment from VCF when ``params.aln`` names a VCF file.

    Parameters
    ----------
    params : namespace
        ``aln`` is read if present. ``vcf_reference`` is read only when
        ``aln`` ends in ``.vcf`` / ``.vcf.gz`` (case-insensitive). ``gtr``
        (optional) and ``aa`` are read only after the VCF has been loaded.

    Returns
    -------
    tuple or int
        ``(aln, ref, fixed_pi)``. For non-VCF input ``aln`` is ``params.aln``
        unchanged (``None`` when the attribute is missing or unset) and the
        other two entries are ``None``. For VCF input ``aln`` and ``ref`` are
        the ``'sequences'`` and ``'reference'`` entries returned by
        ``read_vcf``; ``fixed_pi`` is a list of per-character frequencies of
        the reference when ``params`` has no ``gtr`` or ``gtr == "infer"``,
        otherwise ``None``.
        ``-1`` (not a tuple) is returned when a VCF is given without
        ``params.vcf_reference``.
    """
    # Look the attribute up defensively: reading ``params.aln`` directly would
    # raise AttributeError before any hasattr-style guard could take effect.
    aln = getattr(params, 'aln', None)
    ref = None
    fixed_pi = None

    if aln is None or not aln.lower().endswith(('.vcf', '.vcf.gz')):
        return aln, ref, fixed_pi

    if not params.vcf_reference:
        print("ERROR: a reference Fasta is required with VCF-format alignments")
        return -1

    compress_seq = read_vcf(aln, params.vcf_reference)
    aln = compress_seq['sequences']
    ref = compress_seq['reference']

    if not hasattr(params, 'gtr') or params.gtr == "infer":  # if not specified, set it:
        alpha = alphabets['aa'] if params.aa else alphabets['nuc']
        fixed_pi = [ref.count(base)/len(ref) for base in alpha]
        # When the last alphabet character never occurs in the reference, give
        # it a frequency of 0.05 and lower every entry by 0.01.
        # NOTE(review): presumably avoids a zero frequency for the gap
        # character; the offsets only cancel for a 5-letter alphabet - confirm.
        if fixed_pi[-1] == 0:
            fixed_pi[-1] = 0.05
            fixed_pi = [v-0.01 for v in fixed_pi]

    return aln, ref, fixed_pi
def run(args):
    '''
    Annotate strains with sequence features and write the result as JSON.

    Only VCF input is handled: at least one of ``args.ancestral_sequences`` /
    ``args.translations`` must end in ``.vcf`` or ``.vcf.gz``
    (case-insensitive). Fasta input is not implemented yet and is rejected
    with an error instead of failing later on an undefined variable.

    Parameters
    ----------
    args : namespace
        Reads ``ancestral_sequences``, ``translations``, ``vcf_reference``,
        ``vcf_translate_reference``, ``features``, ``label``, ``count`` and
        ``output``.

    Returns
    -------
    int or None
        ``1`` when the input is not VCF or a required reference is missing;
        ``None`` after ``args.output`` has been written.
    '''
    print("This method may change in future! Please use 'augur sequence-traits -h' to check the latest options.")

    ## check file format and read in sequences
    vcf_suffixes = ('.vcf', '.vcf.gz')
    nuc_is_vcf = args.ancestral_sequences and args.ancestral_sequences.lower().endswith(vcf_suffixes)
    if not (nuc_is_vcf or (args.translations and args.translations.lower().endswith(vcf_suffixes))):
        # TO-DO fill in fasta-format processing. Until then nothing below can
        # work, because the compressed sequences are only produced from VCF.
        print("ERROR: this feature currently only works with VCF input.")
        return 1

    if ((args.ancestral_sequences and not args.vcf_reference) or
        (args.translations and not args.vcf_translate_reference)):
        print("ERROR: a reference Fasta is required with VCF-format alignments")
        return 1

    compress_seq = defaultdict(dict)
    if args.translations:
        compress_seq = read_in_translate_vcf(args.translations, args.vcf_translate_reference)
    if args.ancestral_sequences:
        compress_seq["nuc"] = read_vcf(args.ancestral_sequences, args.vcf_reference)

    features = read_in_features(args.features)
    annotations = annotate_strains(features, compress_seq)
    #convert the annotations into string label that auspice can display
    seq_features = attach_features(annotations, args.label, args.count)

    #write out json
    with open(args.output, 'w') as results:
        json.dump({"nodes":seq_features}, results, indent=1, sort_keys = True)
Exemple #4
0
def run(args):
    '''
    Annotate strains with sequence features and write them out as JSON.

    Works on VCF input only: when neither ``args.ancestral_sequences`` nor
    ``args.translations`` ends in ``.vcf`` / ``.vcf.gz`` an error is printed
    and 1 is returned. 1 is also returned when a VCF is supplied without its
    matching reference. On success the function returns None after writing
    the file named by ``get_json_name(args)``.
    '''
    print("This method may change in future! Please use 'augur sequence-traits -h' to check the latest options.")
    print("Unfortunately this method currently only works with VCF input.")

    def looks_like_vcf(path):
        # An unset (falsy) path never counts as VCF.
        return bool(path) and path.lower().endswith(('.vcf', '.vcf.gz'))

    ## check file format and read in sequences
    if not (looks_like_vcf(args.ancestral_sequences) or looks_like_vcf(args.translations)):
        # TO-DO fill in fasta-format processing
        print("\nERROR: Unfortunately this feature currently only works with VCF input! It will be expanded to work with Fasta-input soon.")
        return 1

    nuc_reference_missing = args.ancestral_sequences and not args.vcf_reference
    if nuc_reference_missing or (args.translations and not args.vcf_translate_reference):
        print("ERROR: a reference Fasta is required with VCF-format alignments")
        return 1

    # Translations, when given, provide the whole container; the nucleotide
    # VCF is then stored under the "nuc" key.
    if args.translations:
        compress_seq = read_in_translate_vcf(args.translations, args.vcf_translate_reference)
    else:
        compress_seq = defaultdict(dict)
    if args.ancestral_sequences:
        compress_seq["nuc"] = read_vcf(args.ancestral_sequences, args.vcf_reference)

    # convert the annotations into string labels that auspice can display
    seq_features = attach_features(
        annotate_strains(read_in_features(args.features), compress_seq),
        args.label,
        args.count,
    )

    # write out json
    out_name = get_json_name(args)
    write_json({"nodes": seq_features}, out_name)
    print("sequence traits written to", out_name, file=sys.stdout)
Exemple #5
0
def run(args):
    """
    Build a tree from ``args.alignment`` and write it in Newick format.

    A VCF alignment (``.vcf`` / ``.vcf.gz``, case-insensitive) is read with
    ``read_vcf`` and turned into a FASTA file by
    ``write_out_informative_fasta`` before tree building; any other alignment
    path is passed to the tree builder unchanged.

    Parameters
    ----------
    args : namespace
        Reads ``alignment``, ``vcf_reference``, ``output``, ``strip_sites``,
        ``iqmodel``, ``method``, ``nthreads`` and ``keep_vcf_fasta``.

    Returns
    -------
    int or None
        ``-1`` when a VCF is given without ``args.vcf_reference`` or when the
        tree builder returns a falsy result; ``None`` after the tree has been
        written.
    """
    # check alignment type, read in if VCF
    is_vcf = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    if is_vcf:
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)

    start = time.time()

    if args.output:
        tree_fname = args.output
    else:
        # splitext strips only a real extension of the final path component,
        # so an extension-less alignment name no longer collapses to '.nwk'.
        tree_fname = os.path.splitext(args.alignment)[0] + '.nwk'

    # construct reduced alignment if needed
    if is_vcf:
        fasta = write_out_informative_fasta(
            compress_seq, args.alignment, stripFile=args.strip_sites)
    else:
        fasta = args.alignment

    if args.iqmodel and not args.method == 'iqtree':
        print(
            "Cannot specify model unless using IQTree. Model specification ignored."
        )

    if args.method == 'raxml':
        T = build_raxml(fasta, tree_fname, args.nthreads)
    elif args.method == 'iqtree':
        T = build_iqtree(fasta, tree_fname, args.iqmodel, args.nthreads)
    else:  #use fasttree - if add more options, put another check here
        T = build_fasttree(fasta, tree_fname)
    end = time.time()
    print("Building original tree took {} seconds".format(str(end - start)))

    # The FASTA derived from the VCF is removed unless the user asked to keep it.
    if is_vcf and not args.keep_vcf_fasta:
        os.remove(fasta)

    if not T:
        return -1
    Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
Exemple #6
0
def run(args):
    """
    Build a tree from ``args.alignment`` and write it in Newick format.

    Three input cases are handled: a VCF alignment (``.vcf`` / ``.vcf.gz``,
    case-insensitive) is read with ``read_vcf`` and converted to FASTA by
    ``write_out_informative_fasta``; a non-VCF alignment with
    ``args.exclude_sites`` is first passed through
    ``mask_sites_in_multiple_sequence_alignment``; otherwise the alignment
    path is used as is.

    Parameters
    ----------
    args : namespace
        Reads ``alignment``, ``vcf_reference``, ``exclude_sites``, ``output``,
        ``substitution_model``, ``method``, ``nthreads`` and
        ``tree_builder_args``.

    Returns
    -------
    int or None
        ``1`` when a VCF is given without ``args.vcf_reference``, when
        ``args.method`` is not one of ``raxml`` / ``iqtree`` / ``fasttree``,
        or when the builder returns a falsy result; ``None`` after the tree
        has been written.
    """
    import os  # local import: only needed for the default output name

    # check alignment type, read in if VCF
    is_vcf = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    if is_vcf:
        # Prepare a multiple sequence alignment from the given variants VCF and
        # reference FASTA.
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = None  # replaced below by the FASTA derived from the VCF
    elif args.exclude_sites:
        # Mask excluded sites from the given multiple sequence alignment.
        aln = mask_sites_in_multiple_sequence_alignment(args.alignment, args.exclude_sites)
    else:
        # Use the multiple sequence alignment as is.
        aln = args.alignment

    start = time.time()

    if args.output:
        tree_fname = args.output
    else:
        # splitext strips only a real extension of the final path component,
        # so an extension-less alignment name no longer collapses to '.nwk'.
        tree_fname = os.path.splitext(args.alignment)[0] + '.nwk'

    # construct reduced alignment if needed
    if is_vcf:
        fasta = write_out_informative_fasta(compress_seq, args.alignment, stripFile=args.exclude_sites)
    else:
        fasta = aln

    if args.substitution_model and not args.method=='iqtree':
        print("Cannot specify model unless using IQTree. Model specification ignored.")

    if args.method=='raxml':
        T = build_raxml(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=args.tree_builder_args)
    elif args.method=='iqtree':
        T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=args.tree_builder_args)
    elif args.method=='fasttree':
        T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=args.tree_builder_args)
    else:
        print("ERROR: unknown tree builder provided to --method: %s" % args.method, file = sys.stderr)
        return 1

    end = time.time()
    print("\nBuilding original tree took {} seconds".format(str(end-start)))

    if not T:
        return 1
    Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
Exemple #7
0
def run(args):
    """
    Build a tree from ``args.alignment`` and write it in Newick format.

    Three input cases are handled: a VCF alignment (``.vcf`` / ``.vcf.gz``,
    case-insensitive) is read with ``read_vcf`` and converted to FASTA by
    ``write_out_informative_fasta``; a non-VCF alignment with
    ``args.exclude_sites`` is first passed through
    ``mask_sites_in_multiple_sequence_alignment``; otherwise the alignment
    path is used as is.

    Parameters
    ----------
    args : namespace
        Reads ``alignment``, ``vcf_reference``, ``exclude_sites``, ``output``,
        ``substitution_model``, ``method`` and ``nthreads``.

    Returns
    -------
    int or None
        ``1`` when a VCF is given without ``args.vcf_reference``, when
        ``args.method`` is not one of ``raxml`` / ``iqtree`` / ``fasttree``,
        or when the builder returns a falsy result; ``None`` after the tree
        has been written.
    """
    import os  # local import: only needed for the default output name

    # check alignment type, read in if VCF
    is_vcf = args.alignment.lower().endswith(('.vcf', '.vcf.gz'))
    if is_vcf:
        # Prepare a multiple sequence alignment from the given variants VCF and
        # reference FASTA.
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = None  # replaced below by the FASTA derived from the VCF
    elif args.exclude_sites:
        # Mask excluded sites from the given multiple sequence alignment.
        aln = mask_sites_in_multiple_sequence_alignment(args.alignment, args.exclude_sites)
    else:
        # Use the multiple sequence alignment as is.
        aln = args.alignment

    start = time.time()

    if args.output:
        tree_fname = args.output
    else:
        # splitext strips only a real extension of the final path component,
        # so an extension-less alignment name no longer collapses to '.nwk'.
        tree_fname = os.path.splitext(args.alignment)[0] + '.nwk'

    # construct reduced alignment if needed
    if is_vcf:
        fasta = write_out_informative_fasta(compress_seq, args.alignment, stripFile=args.exclude_sites)
    else:
        fasta = aln

    if args.substitution_model and not args.method=='iqtree':
        print("Cannot specify model unless using IQTree. Model specification ignored.")

    if args.method=='raxml':
        T = build_raxml(fasta, tree_fname, nthreads=args.nthreads)
    elif args.method=='iqtree':
        T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads)
    elif args.method=='fasttree':
        T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads)
    else:
        print("ERROR: unknown tree builder provided to --method: %s" % args.method, file = sys.stderr)
        return 1

    end = time.time()
    print("Building original tree took {} seconds".format(str(end-start)))

    if not T:
        return 1
    Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
Exemple #8
0
def run(args):
    """
    Refine a tree with TreeTime and export the tree plus per-node data.

    The tree is read with ``read_tree``. With ``args.timetree`` the tree is
    passed to ``refine`` together with dates taken from ``args.metadata``;
    otherwise a ``TreeAnc`` instance is created (after optional outgroup
    rooting). Branch lengths are optionally converted to mutation counts, the
    tree is written as Newick, and node data is written with ``write_json``.

    Parameters
    ----------
    args : namespace
        Parsed command-line options. ``args.root`` is modified in place: a
        one-element list is unwrapped, and it is reset to ``None`` when
        ``args.keep_root`` is set.

    Returns
    -------
    int
        ``1`` on any reported error, otherwise ``0`` if ``Phylo.write``
        returned a truthy value and ``1`` if not.

    Raises
    ------
    TypeError
        When no timetree is requested and ``args.root`` is one of
        ``'least-squares'``, ``'min_dev'`` or ``'oldest'``.
    """
    import os  # local import: only needed for the default output names

    if args.seed is not None:
        np.random.seed(args.seed)

    # reference sequence, only set for VCF input
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    try:
        T = read_tree(args.tree)
        node_data['input_tree'] = args.tree
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    if not args.alignment:
        if args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference",
                file=sys.stderr)
            return 1

        if args.divergence_units == 'mutations':
            print(
                "ERROR: alignment is required for divergence in units of mutations",
                file=sys.stderr)
            return 1

        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(
                SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                    id=n.name,
                                    name=n.name,
                                    description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments",
                file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
    else:
        aln = args.alignment

    from treetime import version as treetime_version
    print(f"augur refine is using TreeTime version {treetime_version}")

    # if not specified, construct default output file name with suffix _tt.nwk
    # (splitext keeps extension-less input names intact instead of emptying them)
    if args.output_tree:
        tree_fname = args.output_tree
    elif args.alignment:
        tree_fname = os.path.splitext(args.alignment)[0] + '_tt.nwk'
    else:
        tree_fname = os.path.splitext(args.tree)[0] + '_tt.nwk'

    if args.root and len(args.root) == 1:  #if anything but a list of seqs, don't send as a list
        args.root = args.root[0]
    if args.keep_root:  # This flag overrides anything specified by 'root'
        args.root = None

    if args.timetree:
        # load meta data and convert dates to numeric
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction",
                file=sys.stderr)
            return 1
        metadata, _ = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        tt = refine(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root,  # or 'best', # We now have a default in param spec - this just adds confusion.
            Tc=0.01 if args.coalescent is None else args.coalescent,  #use 0.01 as default coalescent time scale
            use_marginal=args.date_inference == 'marginal',
            branch_length_inference=args.branch_length_inference or 'auto',
            precision='auto' if args.precision is None else args.precision,
            clock_rate=args.clock_rate,
            clock_std=args.clock_std_dev,
            clock_filter_iqd=args.clock_filter_iqd,
            covariance=args.covariance,
            resolve_polytomies=(not args.keep_polytomies))

        node_data['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        if args.coalescent == 'skyline':
            try:
                skyline, conf = tt.merger_model.skyline_inferred(
                    gen=args.gen_per_year, confidence=2)
                node_data['skyline'] = [[float(x) for x in skyline.x],
                                        [float(y) for y in conf[0]],
                                        [float(y) for y in skyline.y],
                                        [float(y) for y in conf[1]]]
            except Exception:
                # Deliberately broad, but no longer a bare ``except`` so that
                # KeyboardInterrupt / SystemExit still propagate.
                print("ERROR: skyline optimization by TreeTime has failed.",
                      file=sys.stderr)
                return 1

        attributes.extend(
            ['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        if args.root:
            if args.root == 'best':
                print(
                    "Warning: To root without inferring a timetree, you must specify an explicit outgroup."
                )
                print(
                    "\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n"
                )
            elif args.root in ['least-squares', 'min_dev', 'oldest']:
                raise TypeError(
                    "The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."
                    % args.root)
            else:
                T.root_with_outgroup(args.root)

        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)
    if args.divergence_units == 'mutations-per-site':  #default
        pass
    elif args.divergence_units == 'mutations':
        if not args.timetree:
            tt.infer_ancestral_sequences()
        nuc_map = profile_maps['nuc']

        def are_sequence_states_different(nuc1, nuc2):
            '''
            determine whether two ancestral states should count as mutation for divergence estimates
            while correctly accounting for ambiguous nucleotides
            '''
            if nuc1 in ['-', 'N'] or nuc2 in ['-', 'N']:
                return False
            elif nuc1 in nuc_map and nuc2 in nuc_map:
                # states differ only when their profiles share no entry
                return np.sum(nuc_map[nuc1] * nuc_map[nuc2]) == 0
            else:
                return False

        for node in T.find_clades():
            n_muts = len([
                position for ancestral, position, derived in node.mutations
                if are_sequence_states_different(ancestral, derived)
            ])

            if args.timetree:
                node_data['nodes'][node.name]['mutation_length'] = n_muts

            node_data['nodes'][node.name]['branch_length'] = n_muts
    else:
        print("ERROR: divergence unit",
              args.divergence_units,
              "not supported!",
              file=sys.stderr)
        return 1

    # Export refined tree and node data
    tree_success = Phylo.write(T,
                               tree_fname,
                               'newick',
                               format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    elif args.alignment:
        node_data_fname = os.path.splitext(args.alignment)[0] + '.node_data.json'
    else:
        node_data_fname = os.path.splitext(args.tree)[0] + '.node_data.json'

    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    return 0 if tree_success else 1
Exemple #9
0
def run(args):
    """
    Infer ancestral sequences on a tree and export mutations and sequences.

    The tree is read with ``read_tree`` and the alignment is either read from
    VCF (``.vcf`` / ``.vcf.gz``, case-insensitive) or passed on as a path.
    The result of ``ancestral_sequence_inference`` is written as JSON,
    optionally as FASTA (``args.output_sequences``), and for VCF input also
    as a VCF file.

    Parameters
    ----------
    args : namespace
        Reads ``tree``, ``alignment``, ``vcf_reference``, ``keep_ambiguous``,
        ``inference``, ``keep_overhangs``, ``output_sequences`` and
        ``output_vcf``; ``args`` is also handed to ``get_json_name``.

    Returns
    -------
    int
        ``1`` when the tree cannot be read, a VCF is given without a
        reference, or the installed TreeTime is too old for
        ``--keep-ambiguous`` with Fasta input; ``0`` otherwise.
    """
    import os  # local imports: default output names and version parsing
    import re

    def _version_tuple(version):
        # Leading dotted-numeric part of ``version`` as three ints, padded
        # with zeros; an unparsable string compares as (0, 0, 0).
        leading = re.match(r'\d+(?:\.\d+)*', version)
        numbers = [int(piece) for piece in leading.group(0).split('.')] if leading else []
        return tuple((numbers + [0, 0, 0])[:3])

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    import numpy as np
    missing_internal_node_names = [
        n.name is None for n in T.get_nonterminals()
    ]
    if np.all(missing_internal_node_names):
        print("\n*** WARNING: Tree has no internal node names!")
        print(
            "*** Without internal node names, ancestral sequences can't be linked up to the correct node later."
        )
        print(
            "*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    # (Versions are compared as integer tuples; distutils.version is no longer
    # available on Python 3.12+.)
    import treetime
    if (args.keep_ambiguous and not is_vcf
            and _version_tuple(treetime.version) < (0, 5, 6)):
        print(
            "ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer."
            + "\nYour version is " + treetime.version +
            "\nUpdate TreeTime or run without the --keep-ambiguous flag.")
        return 1

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)
    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        anc_seqs['reference'] = {"nuc": compress_seq['reference']}
    else:
        anc_seqs['reference'] = {
            "nuc":
            "".join(T.root.sequence) if hasattr(T.root, 'sequence') else ''
        }

    # splitext keeps an extension-less alignment name intact instead of
    # reducing it to an empty stem.
    alignment_stem = os.path.splitext(args.alignment)[0]

    out_name = get_json_name(args, alignment_stem + '_mutations.json')
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        if args.output_vcf:
            print(
                "WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.",
                file=sys.stderr)
        else:
            records = [
                SeqRecord(Seq(node_data["sequence"]),
                          id=node_name,
                          description="")
                for node_name, node_data in anc_seqs["nodes"].items()
            ]
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to",
                  args.output_sequences,
                  file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    return 0
Exemple #10
0
def run(args):
    """
    Infer ancestral sequences on a tree and export mutations and sequences.

    The tree is read with ``read_tree`` and the alignment is either read from
    VCF (``.vcf`` / ``.vcf.gz``, case-insensitive) or passed on as a path.
    TreeTime 0.7 or later is required. The result of
    ``ancestral_sequence_inference`` is written as JSON, optionally as FASTA
    (``args.output_sequences``), and for VCF input also as a VCF file.

    Parameters
    ----------
    args : namespace
        Reads ``tree``, ``alignment``, ``vcf_reference``, ``infer_ambiguous``,
        ``keep_ambiguous``, ``inference``, ``keep_overhangs``,
        ``output_sequences`` and ``output_vcf``; ``args`` is also handed to
        ``get_json_name``.

    Returns
    -------
    int
        ``1`` when the tree cannot be read, a VCF is given without a
        reference, or the installed TreeTime is older than 0.7; ``0``
        otherwise.
    """
    import os  # local imports: default output names and version parsing
    import re

    def _version_tuple(version):
        # Leading dotted-numeric part of ``version`` as three ints, padded
        # with zeros; an unparsable string compares as (0, 0, 0).
        leading = re.match(r'\d+(?:\.\d+)*', version)
        numbers = [int(piece) for piece in leading.group(0).split('.')] if leading else []
        return tuple((numbers + [0, 0, 0])[:3])

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    try:
        T = read_tree(args.tree)
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    import numpy as np
    missing_internal_node_names = [
        n.name is None for n in T.get_nonterminals()
    ]
    if np.all(missing_internal_node_names):
        print("\n*** WARNING: Tree has no internal node names!")
        print(
            "*** Without internal node names, ancestral sequences can't be linked up to the correct node later."
        )
        print(
            "*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Enforce treetime 0.7 or later
    # (Versions are compared as integer tuples; distutils.version is no longer
    # available on Python 3.12+.)
    import treetime
    if _version_tuple(treetime.version) < (0, 7, 0):
        print("ERROR: this version of augur requires TreeTime 0.7 or later.")
        return 1

    # Infer ambiguous bases if the user has requested that we infer them (either
    # explicitly or by default) and the user has not explicitly requested that
    # we keep them.
    infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference,
                                      fill_overhangs=not (args.keep_overhangs),
                                      infer_tips=infer_ambiguous)

    character_map = {}
    for x in tt.gtr.profile_map:
        if tt.gtr.profile_map[x].sum() == tt.gtr.n_states:
            # TreeTime treats all characters that are not valid IUPAC nucleotide chars as fully ambiguous
            # To clean up auspice output, we map all those to 'N'
            character_map[x] = 'N'
        else:
            character_map[x] = x

    anc_seqs['nodes'] = collect_mutations_and_sequences(
        tt,
        full_sequences=not is_vcf,
        infer_tips=infer_ambiguous,
        character_map=character_map)
    # add reference sequence to json structure. This is the sequence with
    # respect to which mutations on the tree are defined.
    if is_vcf:
        anc_seqs['reference'] = {"nuc": compress_seq['reference']}
    else:
        anc_seqs['reference'] = {
            "nuc":
            "".join(T.root.sequence) if hasattr(T.root, 'sequence') else ''
        }

    # splitext keeps an extension-less alignment name intact instead of
    # reducing it to an empty stem.
    alignment_stem = os.path.splitext(args.alignment)[0]

    out_name = get_json_name(args, alignment_stem + '_mutations.json')
    write_json(anc_seqs, out_name)
    print("ancestral mutations written to", out_name, file=sys.stdout)

    if args.output_sequences:
        if args.output_vcf:
            print(
                "WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.",
                file=sys.stderr)
        else:
            records = [
                SeqRecord(Seq(node_data["sequence"]),
                          id=node_name,
                          description="")
                for node_name, node_data in anc_seqs["nodes"].items()
            ]
            SeqIO.write(records, args.output_sequences, "fasta")
            print("ancestral sequences FASTA written to",
                  args.output_sequences,
                  file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        if args.output_vcf:
            vcf_fname = args.output_vcf
        else:
            vcf_fname = alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    return 0
def run(args):
    """
    Run TreeTime on a tree and export the resulting tree and node data.

    Depending on the flags in ``args`` this infers a time tree
    (``args.timetree``), reconstructs ancestral sequences (``args.ancestral``
    set to 'joint' or 'marginal'), or only instantiates ``TreeAnc`` to name
    internal nodes. The tree is written as newick and the collected node
    attributes are dumped as JSON. For VCF input with a reconstruction, a VCF
    that includes the ancestral sequences is written as well.

    Parameters
    ----------
    args : namespace
        Parsed command line arguments. Attributes read here: tree, alignment,
        vcf_reference, output, node_data, output_vcf, timetree, ancestral,
        metadata, year_limit, date_fmt, root, date_confidence, coalescent,
        time_marginal, branch_length_mode, clock_rate, n_iqd, branchlengths.

    Returns
    -------
    int
        0 if the tree was written, -1 on any failure.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']

    # check if tree is provided and can be read; T has to exist before the
    # loop, otherwise the `T is None` test below fails when every read fails
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            tree_meta['input_tree'] = args.tree
            break
        except Exception:
            # try the next format; KeyboardInterrupt/SystemExit still propagate
            pass
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    def default_name(suffix):
        # Output names derive from the alignment name. The alignment may be
        # missing when TreeTime is only used to name nodes, so fall back to
        # the tree file name in that case.
        source = args.alignment or args.tree
        return '.'.join(source.split('.')[:-1]) + suffix

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.ancestral or args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = [
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                id=n.name,
                                name=n.name,
                                description='')
            for n in T.get_terminals()
        ]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    tree_fname = args.output or default_name('_tt.nwk')

    if args.timetree and T:
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        # keep the raw date string of each tip for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        # if anything but a list of seqs, don't send as a list
        if args.root and len(args.root) == 1:
            args.root = args.root[0]

        tt = timetree(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root or 'best',
            # explicit None test, otherwise Tc can't be set to 0
            Tc=args.coalescent if args.coalescent is not None else 0.01,
            use_marginal=args.time_marginal or False,
            branch_length_mode=args.branch_length_mode or 'auto',
            clock_rate=args.clock_rate,
            n_iqd=args.n_iqd)

        tree_meta['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes.extend([
            'numdate', 'clock_length', 'mutation_length', 'mutations',
            'raw_date', 'date'
        ])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        tt = ancestral_sequence_inference(
            tree=T,
            aln=aln,
            ref=ref,
            marginal=args.ancestral,
            optimize_branch_length=args.branchlengths,
            branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        #TreeTime overwrites ambig sites on tips during ancestral reconst.
        #Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    tree_success = False
    if T:
        import json
        tree_success = Phylo.write(T,
                                   tree_fname,
                                   'newick',
                                   format_branch_length='%1.8f')
        node_data_fname = args.node_data or default_name('.node_data')

        with open(node_data_fname, 'w') as ofile:
            # json.dump returns None and raises on failure, so its return
            # value cannot be used as a success flag
            json.dump(tree_meta, ofile)

    #If VCF and ancestral reconst. was done, output VCF including new ancestral seqs
    if is_vcf and (args.ancestral or args.timetree):
        vcf_fname = args.output_vcf or default_name('.vcf')
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

    # success no longer depends on the input being a VCF
    return 0 if tree_success else -1
Exemple #12
0
def run(args):
    """
    Reconstruct ancestral sequences on a tree and export them as JSON.

    The tree is read as newick or nexus. The alignment is either passed to
    the inference as a file name or, for ``.vcf``/``.vcf.gz`` input, read
    together with the reference given by ``args.vcf_reference``. For VCF
    input a VCF including the ancestral sequences is written as well.

    Parameters
    ----------
    args : namespace
        Parsed command line arguments. Attributes read here: tree, alignment,
        vcf_reference, keep_ambiguous, keep_overhangs, inference, output,
        output_vcf.

    Returns
    -------
    int
        0 on success, 1 if the tree cannot be read, the VCF reference is
        missing, or the installed TreeTime is too old for --keep-ambiguous.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    anc_seqs = {}

    # check if tree is provided and can be read; T has to exist before the
    # loop, otherwise the `T is None` test below fails when every read fails
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            break
        except Exception:
            # try the next format; KeyboardInterrupt/SystemExit still propagate
            pass
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree)
        return 1

    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # Only allow recovery of ambig sites for Fasta-input if TreeTime is version 0.5.6 or newer
    # Otherwise it returns nonsense.
    from distutils.version import StrictVersion
    import treetime
    if args.keep_ambiguous and not is_vcf and StrictVersion(treetime.version) < StrictVersion('0.5.6'):
        print("ERROR: Keeping ambiguous sites for Fasta-input requires TreeTime version 0.5.6 or newer."+
                "\nYour version is "+treetime.version+
                "\nUpdate TreeTime or run without the --keep-ambiguous flag.")
        return 1

    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
                                      fill_overhangs = not(args.keep_overhangs))

    if is_vcf or args.keep_ambiguous:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    # default output names are derived from the alignment file name
    alignment_stem = '.'.join(args.alignment.split('.')[:-1])

    anc_seqs_fname = args.output or alignment_stem + '.anc_seqs.json'
    write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to",anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        vcf_fname = args.output_vcf or alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",vcf_fname, file=sys.stdout)

    return 0
Exemple #13
0
def run(args):
    """
    Refine a tree, optionally inferring a time tree, and export tree and node data.

    With ``args.timetree`` set, dates are read from ``args.metadata`` and
    ``refine`` is called; otherwise ``TreeAnc`` is instantiated only to name
    internal nodes. The tree is written as newick and the collected node
    attributes as JSON.

    Parameters
    ----------
    args : namespace
        Parsed command line arguments. Attributes read here: tree, alignment,
        vcf_reference, output_tree, output_node_data, timetree, metadata,
        year_bounds, date_format, root, date_confidence, coalescent,
        date_inference, branch_length_inference, clock_rate, clock_std_dev,
        clock_filter_iqd.

    Returns
    -------
    int
        0 if both the tree and the node data were written, 1 otherwise.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    # check if tree is provided and can be read; T has to exist before the
    # loop, otherwise the `T is None` test below fails when every read fails
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            node_data['input_tree'] = args.tree
            break
        except Exception:
            # try the next format; KeyboardInterrupt/SystemExit still propagate
            pass
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree)
        return 1

    def default_name(suffix):
        # Output names derive from the alignment name. The alignment is
        # optional when no time tree is inferred, so fall back to the tree
        # file name in that case.
        source = args.alignment or args.tree
        return '.'.join(source.split('.')[:-1]) + suffix

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
            return 1
        from Bio import SeqRecord, Seq, Align
        seqs = [SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description='')
                for n in T.get_terminals()]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # if not specified, construct default output file name with suffix _tt.nwk
    tree_fname = args.output_tree or default_name('_tt.nwk')

    if args.timetree:
        # load meta data and convert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction")
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1: #if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = refine(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
                    reroot=args.root or 'best',
                    Tc=0.01 if args.coalescent is None else args.coalescent, #use 0.01 as default coalescent time scale
                    use_marginal = args.date_inference == 'marginal',
                    branch_length_inference = args.branch_length_inference or 'auto',
                    clock_rate=args.clock_rate, clock_std=args.clock_std_dev,
                    clock_filter_iqd=args.clock_filter_iqd)

        node_data['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept/tt.date2dist.clock_rate}
        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    # Export refined tree and node data
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to",tree_fname, file=sys.stdout)

    node_data_fname = args.output_node_data or default_name('.node_data.json')

    # NOTE(review): the exit code relies on write_json returning a truthy
    # value on success - confirm against its definition.
    json_success = write_json(node_data, node_data_fname)
    print("node attributes written to",node_data_fname, file=sys.stdout)

    return 0 if (tree_success and json_success) else 1
Exemple #14
0
def run(args):
    """
    Refine a tree, optionally inferring a time tree, and export tree and node data.

    With ``args.timetree`` set, dates are read from ``args.metadata`` and
    ``refine`` is called; otherwise ``TreeAnc`` is instantiated only to name
    internal nodes. The tree is written as newick and the collected node
    attributes as JSON.

    Parameters
    ----------
    args : namespace
        Parsed command line arguments. Attributes read here: tree, alignment,
        vcf_reference, output_tree, output_node_data, timetree, metadata,
        year_bounds, date_format, root, date_confidence, coalescent,
        date_inference, branch_length_inference, clock_rate, clock_filter_iqd.

    Returns
    -------
    int
        0 if both the tree and the node data were written, -1 on input
        errors, 1 if writing the outputs was not reported as successful.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    # check if tree is provided and can be read; T has to exist before the
    # loop, otherwise the `T is None` test below fails when every read fails
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            node_data['input_tree'] = args.tree
            break
        except Exception:
            # try the next format; KeyboardInterrupt/SystemExit still propagate
            pass
    if T is None:
        print("ERROR: reading tree from %s failed."%args.tree)
        return -1

    def default_name(suffix):
        # Output names derive from the alignment name. The alignment is
        # optional when no time tree is inferred, so fall back to the tree
        # file name in that case.
        source = args.alignment or args.tree
        return '.'.join(source.split('.')[:-1]) + suffix

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.timetree:
            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = [SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description='')
                for n in T.get_terminals()]
        aln = Align.MultipleSeqAlignment(seqs)
    elif args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format alignments")
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # if not specified, construct default output file name with suffix _tt.nwk
    tree_fname = args.output_tree or default_name('_tt.nwk')

    if args.timetree:
        # load meta data and convert dates to numeric
        if args.metadata is None:
            print("ERROR: meta data with dates is required for time tree reconstruction")
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata, fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        if args.root and len(args.root) == 1: #if anything but a list of seqs, don't send as a list
            args.root = args.root[0]

        tt = refine(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
                      reroot=args.root or 'best',
                      Tc=0.01 if args.coalescent is None else args.coalescent, #use 0.01 as default coalescent time scale
                      use_marginal = args.date_inference == 'marginal',
                      branch_length_inference = args.branch_length_inference or 'auto',
                      clock_rate=args.clock_rate, clock_filter_iqd=args.clock_filter_iqd)

        node_data['clock'] = {'rate': tt.date2dist.clock_rate,
                              'intercept': tt.date2dist.intercept,
                              'rtt_Tmrca': -tt.date2dist.intercept/tt.date2dist.clock_rate}
        attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    # Export refined tree and node data
    tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
    print("updated tree written to",tree_fname, file=sys.stdout)

    node_data_fname = args.output_node_data or default_name('.node_data.json')

    # NOTE(review): the exit code relies on write_json returning a truthy
    # value on success - confirm against its definition.
    json_success = write_json(node_data, node_data_fname)
    print("node attributes written to",node_data_fname, file=sys.stdout)

    return 0 if (tree_success and json_success) else 1
Exemple #15
0
def run(args):
    """
    Refine a tree and export the refined tree plus per-node attributes.

    If ``args.timetree`` is set, dates are taken from ``args.metadata`` and a
    time tree is inferred via ``refine``. Otherwise the tree is optionally
    re-rooted on an explicit outgroup and ``TreeAnc`` is instantiated only to
    name internal nodes. The tree is written as newick, the node data as JSON.

    Parameters
    ----------
    args : namespace
        Parsed command line arguments. Attributes read here: tree, alignment,
        vcf_reference, output_tree, output_node_data, root, keep_root,
        timetree, metadata, year_bounds, date_format, date_confidence,
        coalescent, date_inference, branch_length_inference, clock_rate,
        clock_std_dev, clock_filter_iqd, covariance, keep_polytomies.

    Returns
    -------
    int
        0 if the tree was written, 1 otherwise.

    Raises
    ------
    TypeError
        If no time tree is inferred but ``args.root`` names a rooting method
        that is only available for time trees.
    """
    # alignment type flags; updated below when a VCF is read
    is_vcf = False
    ref = None

    # dict that gets exported as json
    node_data = {'alignment': args.alignment}
    # names of the node attributes to export; grows for time trees
    attributes = ['branch_length']

    try:
        T = read_tree(args.tree)
        node_data['input_tree'] = args.tree
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    def default_output(suffix):
        # outputs are named after the alignment, or after the tree if no
        # alignment was given
        source = args.alignment if args.alignment else args.tree
        return '.'.join(source.split('.')[:-1]) + suffix

    vcf_extensions = ('.vcf', '.vcf.gz')
    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming nodes...
        if args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return 1
        from Bio import SeqRecord, Seq, Align
        placeholders = [
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                id=leaf.name,
                                name=leaf.name,
                                description='')
            for leaf in T.get_terminals()
        ]
        aln = Align.MultipleSeqAlignment(placeholders)
    elif args.alignment.lower().endswith(vcf_extensions):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return 1

        vcf_data = read_vcf(args.alignment, args.vcf_reference)
        aln = vcf_data['sequences']
        ref = vcf_data['reference']
        is_vcf = True
    else:
        aln = args.alignment

    # if not specified, construct default output file name with suffix _tt.nwk
    tree_fname = args.output_tree if args.output_tree else default_output('_tt.nwk')

    # a single root entry is passed on as a scalar, several entries as a list
    if args.root and len(args.root) == 1:
        args.root = args.root[0]
    # --keep-root overrides anything specified by 'root'
    if args.keep_root:
        args.root = None

    if args.timetree:
        # load meta data and convert dates to numeric
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input date string on each tip for later export
        for leaf in T.get_terminals():
            if leaf.name in metadata and 'date' in metadata[leaf.name]:
                leaf.raw_date = metadata[leaf.name]['date']

        # use 0.01 as default coalescent time scale
        coalescent_scale = 0.01 if args.coalescent is None else args.coalescent

        tt = refine(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root,
            Tc=coalescent_scale,
            use_marginal=args.date_inference == 'marginal',
            branch_length_inference=args.branch_length_inference or 'auto',
            clock_rate=args.clock_rate,
            clock_std=args.clock_std_dev,
            clock_filter_iqd=args.clock_filter_iqd,
            covariance=args.covariance,
            resolve_polytomies=(not args.keep_polytomies))

        node_data['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes += [
            'numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'
        ]
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # rooting methods that need a time tree
        timetree_only_roots = ['least-squares', 'min_dev', 'oldest']
        if args.root == 'best':
            print(
                "Warning: To root without inferring a timetree, you must specify an explicit outgroup."
            )
            print(
                "\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n"
            )
        elif args.root in timetree_only_roots:
            raise TypeError(
                "The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."
                % args.root)
        elif args.root:
            T.root_with_outgroup(args.root)

        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    # Export refined tree and node data
    import json
    tree_success = Phylo.write(T,
                               tree_fname,
                               'newick',
                               format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    else:
        node_data_fname = default_output('.node_data.json')

    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    if tree_success:
        return 0
    return 1
Exemple #16
0
def run(args):
    """
    Translate reference features for all nodes and export amino acid mutations.

    Sequences come either from a VCF (``.vcf``/``.vcf.gz``, which requires
    ``args.vcf_reference``) or from a node-data file. Each feature of
    ``args.reference_sequence`` is translated, amino acid mutations are
    determined for every node, and the result is written as JSON to
    ``args.output``. Translated alignments are written if
    ``args.alignment_output`` is given.

    Parameters
    ----------
    args : namespace
        Parsed command line arguments. Attributes read here: tree, genes,
        ancestral_sequences, vcf_reference, reference_sequence, output,
        alignment_output.

    Returns
    -------
    int or None
        -1 if the VCF reference is missing or the node data or features
        cannot be read; otherwise the function ends without an explicit
        return value.
    """
    ## read tree and data, if reading data fails, return with error code
    tree = Phylo.read(args.tree, 'newick')

    # If genes is a file, read in the genes to translate
    if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
        genes = get_genes_from_file(args.genes[0])
    else:
        genes = args.genes

    ## check file format and read in sequences
    is_vcf = False
    if args.ancestral_sequences.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format input")
            return -1
        compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
        sequences = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        node_data = read_node_data(args.ancestral_sequences, args.tree)
        if node_data is None:
            print("ERROR: could not read node data (incl sequences)")
            return -1
        # extract sequences from node meta data
        sequences = {k: v['sequence']
                     for k, v in node_data['nodes'].items()
                     if 'sequence' in v}

    ## load features; only requested features if genes given
    features = load_features(args.reference_sequence, genes)
    # check for failure before len(), which would raise TypeError on None
    if features is None:
        print("ERROR: could not read features of reference sequence file")
        return -1
    print("Read in {} features from reference sequence file".format(len(features)))

    ### translate every feature - but not 'nuc'!
    translations = {}
    deleted = []
    for fname, feat in features.items():
        if is_vcf:
            trans = translate_vcf_feature(sequences, ref, feat)
            if trans:
                translations[fname] = trans
            else:
                deleted.append(fname)
        else:
            if feat.type != 'source':
                translations[fname] = translate_feature(sequences, feat)

    if len(deleted) != 0:
        print("{} genes had no mutations and so have been be excluded.".format(len(deleted)))

    ## glob the annotations for later auspice export
    annotations = {}
    for fname, feat in features.items():
        increment = 0 if feat.type != 'source' else 1 #'nuc' goes to 0, unsure why - make 1
        annotations[fname] = {'start':int(feat.location.start)+increment,
                              'end':int(feat.location.end),
                              'strand': feat.location.strand}
    if is_vcf: #need to add our own nuc
        annotations['nuc'] = {'start': 0,
                              'end': len(ref),
                              'strand': 1}

    ## determine amino acid mutations for each node
    if is_vcf:
        aa_muts = assign_aa_vcf(tree, translations)
    else:
        aa_muts = {}
        for n in tree.get_nonterminals():
            for c in n:
                aa_muts[c.name]={"aa_muts":{}}
            for fname, aln in translations.items():
                for c in n:
                    if c.name in aln and n.name in aln:
                        # positions are reported 1-based
                        tmp = [construct_mut(a, int(pos+1), d) for pos, (a,d) in
                                enumerate(zip(aln[n.name], aln[c.name])) if a!=d]
                        aa_muts[c.name]["aa_muts"][fname] = tmp
                    else:
                        print("no sequence pair for nodes %s-%s"%(c.name, n.name))

    write_json({'annotations':annotations, 'nodes':aa_muts}, args.output)
    print("amino acid mutations written to",args.output, file=sys.stdout)

    ## write alignments to file is requested
    if args.alignment_output:
        if is_vcf:
            ## write VCF-style output if requested
            # strip one extension, or two for a gzipped file name
            fileEndings = -1
            if args.alignment_output.lower().endswith('.gz'):
                fileEndings = -2
            vcf_out_ref = '.'.join(args.alignment_output.split('.')[:fileEndings]) + '_reference.fasta'
            write_VCF_translation(translations, args.alignment_output, vcf_out_ref)
        else:
            ## write fasta-style output if requested
            if '%GENE' in args.alignment_output:
                for fname, seqs in translations.items():
                    SeqIO.write([SeqRecord.SeqRecord(seq=Seq.Seq(s), id=sname, name=sname, description='')
                                 for sname, s in seqs.items()],
                                 args.alignment_output.replace('%GENE', fname), 'fasta')
            else:
                print("ERROR: alignment output file does not contain '%GENE', so will not be written.")
Exemple #17
0
def run(args):
    """
    Translate reference features for all nodes and export amino acid mutations.

    Sequences come either from a VCF (``.vcf``/``.vcf.gz``, which requires
    ``args.vcf_reference``) or from a node-data file. Each feature of
    ``args.reference_sequence`` is translated, amino acid mutations are
    assigned to the nodes of the tree, and annotations, mutations and
    reference translations are written as JSON. Translated alignments are
    written if ``args.alignment_output`` is given.

    Parameters
    ----------
    args : namespace
        Parsed command line arguments. Attributes read here: tree, genes,
        ancestral_sequences, vcf_reference, reference_sequence,
        alignment_output, vcf_reference_output. ``args`` is also passed to
        ``get_json_name``.

    Returns
    -------
    int or None
        1 if the VCF reference is missing, node data or features cannot be
        read, or node names are missing or mismatched; otherwise the function
        ends without an explicit return value.
    """
    ## read tree and data, if reading data fails, return with error code
    tree = Phylo.read(args.tree, 'newick')

    # If genes is a file, read in the genes to translate
    if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
        genes = get_genes_from_file(args.genes[0])
    else:
        genes = args.genes

    ## check file format and read in sequences
    is_vcf = False
    if args.ancestral_sequences.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print("ERROR: a reference Fasta is required with VCF-format input")
            return 1
        compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
        sequences = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        node_data = read_node_data(args.ancestral_sequences, args.tree)
        if node_data is None:
            print("ERROR: could not read node data (incl sequences)")
            return 1
        # extract sequences from node meta data
        sequences = {
            k: v['sequence']
            for k, v in node_data['nodes'].items() if 'sequence' in v
        }

    ## load features; only requested features if genes given
    features = load_features(args.reference_sequence, genes)
    # check for failure before len(), which would raise TypeError on None
    if features is None:
        print("ERROR: could not read features of reference sequence file")
        return 1
    print("Read in {} features from reference sequence file".format(
        len(features)))

    ### translate every feature - but not 'nuc'!
    translations = {}
    deleted = []
    for fname, feat in features.items():
        if is_vcf:
            trans = translate_vcf_feature(sequences, ref, feat)
            if trans:
                translations[fname] = trans
            else:
                deleted.append(fname)
        else:
            if feat.type != 'source':
                translations[fname] = translate_feature(sequences, feat)

    if len(deleted) != 0:
        print("{} genes had no mutations and so have been be excluded.".format(
            len(deleted)))

    ## glob the annotations for later auspice export
    #
    # Note that BioPython FeatureLocations use
    # "Pythonic" coordinates: [zero-origin, half-open)
    # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
    annotations = {}
    for fname, feat in features.items():
        annotations[fname] = {
            'seqid': args.reference_sequence,
            'type': feat.type,
            'start': int(feat.location.start) + 1,
            'end': int(feat.location.end),
            'strand': '+' if feat.location.strand else '-'
        }
    if is_vcf:  #need to add our own nuc
        # NOTE(review): `feat` is whatever the loop above left behind (the
        # last feature), and is undefined if `features` is empty - confirm
        # that this is the intended type for 'nuc'.
        annotations['nuc'] = {
            'seqid': args.reference_sequence,
            'type': feat.type,
            'start': 1,
            'end': len(ref),
            'strand': '+'
        }

    ## determine amino acid mutations for each node
    try:
        if is_vcf:
            aa_muts = assign_aa_vcf(tree, translations)
        else:
            aa_muts = assign_aa_fasta(tree, translations)
    except MissingNodeError:
        print("\n*** ERROR: Some/all nodes have no node names!")
        print(
            "*** Please check you are providing the tree output by 'augur refine'."
        )
        print(
            "*** If you haven't run 'augur refine', please add node names to your tree by running:"
        )
        print("*** augur refine --tree %s --output-tree <filename>.nwk" %
              (args.tree))
        print(
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'"
        )
        return 1
    except MismatchNodeError:
        print("\n*** ERROR: Mismatch between node names in %s and in %s" %
              (args.tree, args.ancestral_sequences))
        print(
            "*** Ensure you are using the same tree you used to run 'ancestral' as input here."
        )
        print(
            "*** Or, re-run 'ancestral' using %s, then use the new %s as input here."
            % (args.tree, args.ancestral_sequences))
        return 1

    output_data = {'annotations': annotations, 'nodes': aa_muts}
    if is_vcf:
        output_data['reference'] = {
            fname: translations[fname]['reference']
            for fname in translations
        }
    else:
        output_data['reference'] = aa_muts[tree.root.name]['aa_sequences']

    out_name = get_json_name(
        args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
    write_json(output_data, out_name)
    print("amino acid mutations written to", out_name, file=sys.stdout)

    ## write alignments to file is requested
    if args.alignment_output:
        if is_vcf:
            ## write VCF-style output if requested
            # strip one extension, or two for a gzipped file name
            fileEndings = -1
            if args.alignment_output.lower().endswith('.gz'):
                fileEndings = -2
            vcf_out_ref = args.vcf_reference_output or '.'.join(
                args.alignment_output.split('.')
                [:fileEndings]) + '_reference.fasta'
            write_VCF_translation(translations, args.alignment_output,
                                  vcf_out_ref)
        else:
            ## write fasta-style output if requested
            if '%GENE' in args.alignment_output:
                for fname, seqs in translations.items():
                    SeqIO.write([
                        SeqRecord.SeqRecord(seq=Seq.Seq(s),
                                            id=sname,
                                            name=sname,
                                            description='')
                        for sname, s in seqs.items()
                    ], args.alignment_output.replace('%GENE', fname), 'fasta')
            else:
                print(
                    "ERROR: alignment output file does not contain '%GENE', so will not be written."
                )
Exemple #18
0
def run(args):
    """
    Infer ancestral sequences for a tree and write them to JSON (and VCF).

    Attributes read from ``args``:
      * ``tree``          -- path of the tree file; parsed as newick, then nexus.
      * ``alignment``     -- path of the alignment; a name ending in ``.vcf`` or
                             ``.vcf.gz`` (case-insensitive) is treated as VCF.
      * ``vcf_reference`` -- reference Fasta, required for VCF input.
      * ``inference``     -- forwarded as ``marginal`` to
                             ``ancestral_sequence_inference``.
      * ``output``        -- JSON output path; when falsy, derived from the
                             alignment name by replacing its last extension
                             with ``.anc_seqs.json``.
      * ``output_vcf``    -- VCF output path (VCF input only); when falsy,
                             derived from the alignment name.

    Returns:
      -1 if the tree cannot be read or a VCF alignment has no reference,
      0 if ``write_json`` reports success, 1 otherwise.
    """
    is_vcf = False
    ref = None
    anc_seqs = {}

    # T must be bound before the loop: if every parser fails, the check below
    # would otherwise raise UnboundLocalError instead of reporting the problem.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
            break
        except Exception:
            # Best-effort format probing: a failure here only means "try the
            # next format". Unlike a bare ``except``, this does not swallow
            # KeyboardInterrupt / SystemExit.
            continue
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    # check alignment type, set flags, read in if VCF
    if args.alignment.lower().endswith(('.vcf', '.vcf.gz')):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    tt = ancestral_sequence_inference(tree=T,
                                      aln=aln,
                                      ref=ref,
                                      marginal=args.inference)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    anc_seqs['nodes'] = collect_sequences_and_mutations(T, is_vcf)

    # Alignment path with its last extension removed; base for default names.
    alignment_stem = '.'.join(args.alignment.split('.')[:-1])

    anc_seqs_fname = args.output if args.output else alignment_stem + '.anc_seqs.json'

    anc_seqs_success = write_json(anc_seqs, anc_seqs_fname)
    print("ancestral sequences written to", anc_seqs_fname, file=sys.stdout)

    # If VCF, output VCF including new ancestral seqs
    if is_vcf:
        # NOTE(review): for an input named 'x.vcf' this default resolves to
        # 'x.vcf' again, i.e. the input path itself -- confirm that
        # overwriting the input is intended when --output-vcf is not given.
        vcf_fname = args.output_vcf if args.output_vcf else alignment_stem + '.vcf'
        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
        print("ancestral sequences as vcf-file written to",
              vcf_fname,
              file=sys.stdout)

    return 0 if anc_seqs_success else 1