Code example #1
File: reducealign.py  Project: nvt-1009/biotools
def process_alignment(name, path, preserve, bycodon=False):
    # bycodon = args.retaincodons
    align = AlignIO.read(path, 'fasta')
    align = drop_empty_rows(align)
    preskeep = []
    preservealign = []
    step = 3 if bycodon else 1
    if len(preserve) > 0:
        preservealign = [a for a in align if a.name in preserve]
    if len(preservealign) > 0:
        preservealign = Align.MultipleSeqAlignment(preservealign)
        preserved = preserved_columns(preservealign, bycodon)
        preskeep = range(preserved[0], preserved[1] + 1, step)
        checkalign = [a for a in align if a.name not in preserve]
        checkalign = Align.MultipleSeqAlignment(checkalign)
        processorder = itertools.chain(
            preskeep, range(0, preserved[0], step),
            range(preserved[1] + 1, align.get_alignment_length(), step))
    else:
        checkalign = align
        processorder = range(0, align.get_alignment_length(), step)
    del align
    processorder = list(processorder)
    results = []
    allrows = set()
    for i in processorder:
        j = min([i + step, checkalign.get_alignment_length()])
        infset = {
            s
            for k in range(i, j)
            for s in informative_set(checkalign[:, k:k + 1])
        }
        if len(results) == 0:
            results.append([name, i, infset])
            allrows = infset
            if len(allrows) == len(checkalign):
                break
            continue
        # elif within_any(infset, [r[2] for r in results]):
        #     continue
        elif len(allrows.union(infset)) == len(allrows):
            continue
        else:
            results.append([name, i, infset])
            allrows = allrows.union(infset)
            if len(allrows) == len(checkalign):
                break
    #[[r[0], r[1], len(r[2])] for r in results]
    results = result_filter(results, len(checkalign))
    return (results, preskeep)
Code example #2
def pair_align_SeqRecords(seqr_a, seqr_b, ex_aligner = needle_align):
    ''' Pairwise align two SeqRecords using external or internal aligner.

        seqr_a, seqr_b: SeqRecords to align
        ex_aligner: helper function that aligns sequences in two files
                    *If ex_aligner is None, use internal aligner*

        Internal aligner:
            Bio.pairwise2.align.globalds() with Bio.SubsMat.MatrixInfo.Blosum62
            and default gap penalties (gapopen = -10.0, gapextend = -0.5)

        Returns a MultipleSeqAlignment object

    '''

    if ex_aligner is None:
        inaln = align.globalds(ungap_SeqRecord(seqr_a), ungap_SeqRecord(seqr_b),
                               MatrixInfo.blosum62, -10.0, -0.5)
        exaln = Align.MultipleSeqAlignment(inaln[0][:2])
    else:
        tmp_fa = make_tmp_fa(ungap_SeqRecord(seqr_a))
        tmp_ref_fa = make_tmp_fa(ungap_SeqRecord(seqr_b))
        exaln = ex_aligner(tmp_fa.name, tmp_ref_fa.name)
        remove(tmp_fa.name)
        remove(tmp_ref_fa.name)

    return exaln
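A hypothetical usage sketch for the function above. It assumes pair_align_SeqRecords is importable from its module and that the installed Biopython still ships the legacy Bio.pairwise2 and Bio.SubsMat modules needed by the internal-aligner path; the record contents are made up.

# Hypothetical usage sketch (assumes pair_align_SeqRecords is importable and
# that Bio.pairwise2 / Bio.SubsMat are available for the internal aligner).
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

seq_a = SeqRecord(Seq("MKTAYIAKQR"), id="a")
seq_b = SeqRecord(Seq("MKTAIAKQR"), id="b")
msa = pair_align_SeqRecords(seq_a, seq_b, ex_aligner=None)  # use internal aligner
print(msa.get_alignment_length())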
Code example #3
def writelines(lines, gid):
    ali = al.MultipleSeqAlignment([
        sr.SeqRecord(seq.Seq(''.join(line)),
                     '{1}{0}i'.format(i, gid),
                     description='{1}{0}d'.format(i, gid))
        for i, line in enumerate(lines[::50])
    ])
    return ali
Code example #4
    def _save_alignment(self):
        aln = Align.MultipleSeqAlignment([
            SeqRecord.SeqRecord(Seq.Seq(''.join(n.sequence)),
                                id=n.name,
                                name=n.name,
                                description="")
            for n in self.tree.find_clades()
        ])
        AlignIO.write(aln, os.path.join(self._root_dir, out_aln_fasta),
                      "fasta")
Code example #5
def sorted_uncertainty_from_alignment (align, max_freq_n = 0.1): ## IUPAC ambiguity codes
    if max_freq_n < 0: max_freq_n = 0 # this means only columns without N at all will be used
    max_n = int(max_freq_n * len(align))
    if max_n >= len(align): max_n = len(align) - 1 

    summary_align = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(align)) # must be an MSA, not a list
    pssm = summary_align.pos_specific_score_matrix()
    # pssm example: {'-':3, 'A':0, 'T':4.0, 'G':0, 'C':2.0, 'N':1} per column, means 3 seqs have "-", 4 have "T"...
    index = [[i,s["N"] + s["-"]] for i, s in enumerate(pssm)]
    return [x[0] for x in index if x[1] < max_n]
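The snippet above expects an alignment built elsewhere in its project, so here is a minimal, self-contained sketch of the underlying PSSM idiom with toy sequences; the printed values are indicative only.

# Minimal sketch: count 'N' and gap characters per column via a PSSM.
from Bio.Align import AlignInfo, MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

msa = MultipleSeqAlignment([
    SeqRecord(Seq("ACGTN"), id="s1"),
    SeqRecord(Seq("ACG-A"), id="s2"),
    SeqRecord(Seq("ACGTA"), id="s3"),
])
pssm = AlignInfo.SummaryInfo(msa).pos_specific_score_matrix()
uncertainty = [col.get("N", 0) + col.get("-", 0) for col in pssm]
print(uncertainty)  # e.g. [0, 0, 0, 1.0, 1.0]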
Code example #6
def load_aln(infile):

    aln = Align.MultipleSeqAlignment([])
    aln_dict = {}

    with open(infile, 'r') as f:
        for seq_record in SeqIO.parse(f, 'fasta'):
            aln.append(seq_record)
            aln_dict[seq_record.id] = str(seq_record.seq)

    return aln, aln_dict
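A self-contained sketch of the append-to-empty-alignment idiom that load_aln relies on (all appended records must have the same length, otherwise append raises a ValueError):

# Minimal sketch of building an alignment record by record.
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

aln = MultipleSeqAlignment([])
for rec_id, seq in [("a", "ACGT"), ("b", "AC-T")]:
    aln.append(SeqRecord(Seq(seq), id=rec_id))
print(len(aln), aln.get_alignment_length())  # 2 4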
Code example #7
File: infernal.py  Project: bh0085/projects
def alignment(seqs_in, profile, run_id):
    '''Compute an alignment of multiple sequences to a given 
covariance model profile such as constructed by cmbuild
via infernal.profiles.

input:
  seqs:    a list of biopython SeqRecord objects
  profile: the filename of a covariance model profile
  run_id:  a run id to use for naming temporary files to avoid collisions

output:
  ali:     an rfam multiple sequence alignment
  ref:     the profile reference sequence aligned to ali
  struct:  the profile reference structure aligned to ali

'''
    if type(seqs_in[0]) == str:
        raise Exception(
            'Sorry but string lists are not supported. We need ids!')
        #seqs = [Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(s,
        #                                            Bio.Seq.Alphabet.RNAAlphabet),
        #                                'S{0:03}'.format(idx))
        #        for idx, s in enumerate(seqs)]
    else:
        seqs = [
            Bio.SeqRecord.SeqRecord(
                Bio.Seq.Seq(
                    ''.join([let for let in str(ali.seq) if let in 'AUTGC']),
                    Bio.Seq.Alphabet.RNAAlphabet), 'S{0:03}'.format(idx))
            for idx, ali in enumerate(seqs_in)
        ]

    name_maps = dict([('S{0:03}'.format(idx), s.id)
                      for idx, s in enumerate(seqs_in)])

    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(
        run_id, idx))
    outfile = cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(
        run_id, idx))
    Bio.SeqIO.write(seqs, infile, 'fasta')

    cstr = 'cmalign -o {0} {1} {2}'.format(outfile, profile, infile)
    ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
    out = ispc.communicate()[0]
    fopen = open(outfile)
    seqs, ref, struct = rutils.stk_parse(fopen)
    fopen.close()
    ali = ba.MultipleSeqAlignment(seqs)

    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]

    return ali, ref, struct
Code example #8
File: files_management.py  Project: htalibart/ppsuite
def remove_positions_with_gaps_in_first_sequence(input_fasta, output_fasta):
    # removes all positions with gaps in the first sequence
    aln = AlignIO.read(str(input_fasta), 'fasta')
    first_sequence = str(aln[0].seq)
    good_positions = [
        k for k in range(len(first_sequence)) if first_sequence[k] != '-'
    ]
    first_pos = good_positions[0]
    clean_aln = Align.MultipleSeqAlignment(aln[:, first_pos:first_pos + 1])
    for pos in good_positions[1:]:
        clean_aln += aln[:, pos:pos + 1]
    AlignIO.write(clean_aln, str(output_fasta), 'fasta')
    return output_fasta
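A self-contained toy version of the column-slice-and-concatenate idiom used above (aln[:, i:i + 1] extracts a single column as an alignment, and += joins alignments column-wise):

# Minimal sketch: keep only columns where the first sequence has no gap.
from io import StringIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment

aln = AlignIO.read(StringIO(">ref\nA-CG-T\n>other\nATCGAT\n"), "fasta")
keep = [k for k, c in enumerate(str(aln[0].seq)) if c != "-"]
clean = MultipleSeqAlignment(aln[:, keep[0]:keep[0] + 1])
for pos in keep[1:]:
    clean += aln[:, pos:pos + 1]
print(clean[0].seq, clean[1].seq)  # ACGT ACGT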
Code example #9
def load_aln_to_repair(infile, omit):

    aln = Align.MultipleSeqAlignment([])
    aln_dict = {}

    with open(infile, 'r') as f:
        for seq_record in SeqIO.parse(f, 'fasta'):

            aln_dict[seq_record.id] = str(seq_record.seq)

            if seq_record.name not in omit:
                aln.append(seq_record)

    return aln, aln_dict
Code example #10
File: worker.py  Project: neherlab/treetime-cloud
def save_alignment(tt: TreeTime, config: TreetimeConfig):
    records = [
        SeqRecord.SeqRecord(
            Seq.Seq("".join(n.sequence)),
            id=n.name,
            name=n.name,
            description="",
        ) for n in tt.tree.find_clades()
    ]

    aln = Align.MultipleSeqAlignment(records)

    with open(config.output_filenames.FASTA, "w") as ofile:
        AlignIO.write(aln, ofile, "fasta")
Code example #11
def separate_alignments(msa_data,
                        sus_ids,
                        out_dir,
                        filename,
                        patient_zero='NC_045512.2'):
    good_seqs = []
    poor_seqs = []
    for rec in msa_data:
        if rec.id in sus_ids:
            poor_seqs.append(rec)
        elif rec.id == patient_zero:
            good_seqs.append(rec)
            poor_seqs.append(rec)
        else:
            good_seqs.append(rec)
    good_msa = Align.MultipleSeqAlignment(good_seqs)
    good_msa_fn = filename + '_aligned_white.fa'
    good_msa_fp = out_dir / good_msa_fn
    AlignIO.write(good_msa, good_msa_fp, 'fasta')
    poor_msa = Align.MultipleSeqAlignment(poor_seqs)
    poor_msa_fn = filename + '_aligned_inspect.fa'
    poor_msa_fp = out_dir / poor_msa_fn
    AlignIO.write(poor_msa, poor_msa_fp, 'fasta')
    return 0
Code example #12
def pop_row(aln, seqid):
    ''' Pop a row from an alignment by sequence id

        aln: a Bio.Align.MultipleSeqAlignment object
        seqid: id of Bio.SeqRecord.SeqRecord to pop from aln

        Returns a tuple containing the popped SeqRecord and a
        copy of aln without seqid's SeqRecord.

    '''

    aln_d = SeqIO.to_dict(aln)
    seq = aln_d[seqid]
    del aln_d[seqid]
    aln = Align.MultipleSeqAlignment(aln_d.values())

    return seq, aln
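A hypothetical usage sketch, assuming pop_row is importable; SeqIO.to_dict accepts any iterable of SeqRecord objects, so passing an alignment works directly:

# Hypothetical usage sketch for pop_row (toy data).
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

aln = MultipleSeqAlignment([
    SeqRecord(Seq("ACGT"), id="s1"),
    SeqRecord(Seq("AC-T"), id="s2"),
    SeqRecord(Seq("ACTT"), id="s3"),
])
popped, remaining = pop_row(aln, "s2")
print(popped.id, len(remaining))  # s2 2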
Code example #13
def fetch_seqs(seqs_filepath, out_fp, sample_idxs: list, is_aligned=False, is_gzip=False):
    if is_aligned:
        if is_gzip:
            with gzip.open(seqs_filepath, "rt") as handle:
                cns = AlignIO.read(handle, 'fasta')
        else:
            cns = AlignIO.read(seqs_filepath, 'fasta')
        my_cns = Align.MultipleSeqAlignment([rec for rec in cns if rec.id in sample_idxs])
        return AlignIO.write(my_cns, out_fp, 'fasta')
    else:
        if is_gzip:
            with gzip.open(seqs_filepath, "rt") as handle:
                cns = SeqIO.parse(handle, 'fasta')
        else:
            cns = SeqIO.parse(seqs_filepath, 'fasta')
        my_cns = [rec for rec in cns if rec.id in sample_idxs]
        return SeqIO.write(my_cns, out_fp, 'fasta')
Code example #14
def consensus_from_alignment (align): ## IUPAC ambiguity codes
    if ambiguous_dna: ## biopython < 1.78
        xaln = [SeqRecord(Seq.Seq(str(rec.seq).replace("-","N"), ambiguous_dna), id=rec.id, description=rec.description) for rec in align]
    else:
        xaln = [SeqRecord(Seq.Seq(str(rec.seq).replace("-","N")), id=rec.id, description=rec.description) for rec in align]
    summary_align = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(xaln)) # must be an MSA, not a list
    pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=["-"])
    consensus = [];
    # pssm example: {'-':3, 'A':0, 'T':4.0, 'G':0, 'C':2.0, 'N':1} per column, means 3 seqs have "-", 4 have "T"...
    for score in pssm: # we don't care about frequency, only presence
        # base can be "R", then iupac.dna_values[R] = [A,G]
        acgt_list = [x for base, count in score.items() for x in IUPACData.ambiguous_dna_values[base] if count > 0]
        consensus.append(iupac_dna[ ''.join(sorted(set(acgt_list))) ])
    if ambiguous_dna:
        return Seq.Seq(''.join(consensus),ambiguous_dna)
    else:
        return Seq.Seq(''.join(consensus))
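The iupac_dna mapping used above is not defined in this snippet; it is presumably a reverse lookup from a sorted set of observed bases to a single IUPAC ambiguity code. One plausible way to build it from Biopython's own tables:

# Sketch of a reverse IUPAC lookup (assumed to match the undefined iupac_dna above).
from Bio.Data import IUPACData

iupac_dna = {''.join(sorted(v)): k for k, v in IUPACData.ambiguous_dna_values.items()}
print(IUPACData.ambiguous_dna_values["R"])  # AG
print(iupac_dna["AG"])                      # R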
Code example #15
File: dstat.py  Project: mobilegenome/dstats
def main():
    global f_ab, f_extra, bt_positions
    seqs = {}
    records = []
    fname_list = [basename(fpath) for fpath in options.input_files]

    with open(options.output.replace(".abbababa", ".flist"), "w") as fout:
        fout.write("\n".join(options.input_files))

    for fpath in options.input_files:
        fname = basename(fpath)
        seqs[fname] = SeqIO.index(fpath, "fasta")
        records_per_fasta = seqs.get(fname).keys()
        records.extend([record for record in records_per_fasta])
        print fname

    anc = SeqIO.index(options.anc, "fasta")

    print "\n"

    records = set([str(r) for r in records])
    f_ab = open(options.output, "w")
    f_extra = open(options.extra, "w")
    bt_positions = BedToolPositions()
    for record in sorted(records):
        sequences = []
        # min_alignment_length = min([len(seqs.get(seq_key).get(record)) for seq_key in fname_list] +
        #                            [len(anc.get(record))])
        for seq_key in fname_list:
            # print seq_key
            sequences.append(seqs.get(seq_key).get(record))

        min_alignment_length = min([len(sequence) for sequence in sequences] +
                                   [len(anc.get(record))])

        per_chr_alignment = Align.MultipleSeqAlignment(
            [sequence[:min_alignment_length] for sequence in sequences])

        do_abbababa(per_chr_alignment, anc.get(record)[:min_alignment_length])

    bt_positions.write_to_BED(options.bed_out)
    f_ab.close()
    f_extra.close()

    return 1
Code example #16
def clean_alignment(alignment):
    """
    Remove ambiguities from alignment.

    Iterate over sites in the alignment and build a new alignment containing
    either only pure ATGC sites (-c) or sites with up to a specified proportion
    of N's (-c -n FLOAT).
    """
    site_length = len(alignment[:, 0])
    cleaned_alignment = Align.MultipleSeqAlignment(
        [seq[:0] for seq in alignment])

    if args.n_ratio:
        logging.info(f"Removing sites with > {int(args.n_ratio * 100)}% of " +
                     f"N's from '{alignment[0].name}'")
        for pos in range(0, len(alignment[0])):
            site = alignment[:, pos:pos + 1]
            site_nucleotides = alignment[:, pos]
            n_count = site_nucleotides.upper().count('N')
            n_ratio = n_count / site_length
            if n_ratio <= args.n_ratio:
                cleaned_alignment += site

    else:
        logging.info("Removing sites with ambiguities from " +
                     f"'{alignment[0].name}'")
        iupac = ['N', 'Y', 'R', 'K', 'M', 'W', 'S', 'B', 'D', 'H', 'V', '-']
        iupac_length = len(iupac)
        for pos in range(0, len(alignment[0])):
            site = alignment[:, pos:pos + 1]
            site_nucleotides = alignment[:, pos]
            bad_char = False
            if site_length > iupac_length:
                if any(char in site_nucleotides.upper() for char in iupac):
                    # mark the column and fall through to the bad_char check below
                    bad_char = True
            else:
                for char in site:
                    if str(char.seq).upper() in 'NYRKMWSBDHV-':
                        bad_char = True
                        break
            if not bad_char:
                cleaned_alignment += site

    return cleaned_alignment
Code example #17
def _evolve_sequence(tree, L, gtr):
    """
    Produce random sequence of a given length L, evolve it on a given tree
    using the given gtr model.
    """
    if isinstance(tree, str):
        tree = Phylo.read(tree, 'newick')

    root_seq = np.random.choice(gtr.alphabet, p=gtr.Pi, size=L)
    tree.root.ref_seq = root_seq
    print("Started sequence evolution...")

    for node in tree.find_clades():

        for c in node.clades:
            c.up = node

        if hasattr(node, 'ref_seq'):
            continue

        t = node.branch_length
        p = gtr.propagate_profile(
            treetime.seq_utils.seq2prof(node.up.ref_seq, gtr.profile_map), t)
        # normalize profile
        p = (p.T / p.sum(axis=1)).T

        # sample mutations randomly
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([gtr.alphabet[k] for k in ref_seq_idxs])

    records = [
        Align.SeqRecord(Align.Seq("".join(k.ref_seq)), id=k.name, name=k.name)
        for k in tree.get_terminals()
    ]

    aln = Align.MultipleSeqAlignment(records)
    #full_aln = Align.MultipleSeqAlignment(full_records)
    print("Sequence evolution done...")
    return root_seq, aln
Code example #18
def main():
    date = '2020_10_07'
    base = '/mnt/g/Covid-19/' + date + '/'

    in_file = base + 'sequences_filtered_aln2_red.fasta'
    out_file = base + 'trees/sequences_filtered_aln2_samp1.fasta'

    num_samples = 1000
    ref_id = 'NC_045512.2'

    aln = read_alignment_file(in_file)
    seq_ids = random.sample(list(aln.keys()), k=num_samples)
    if not ref_id in seq_ids:
        seq_ids[0] = ref_id

    alignment = Align.MultipleSeqAlignment([])
    for id in seq_ids:
        alignment.append(aln[id])

    AlignIO.write(alignment, open(out_file, 'w'), 'fasta')
Code example #19
def build_alignment(filenames, sequences, scaffold):
    """
    Build scaffold alignment.

    Extract sequences of a given scaffold from each input FASTA and build a
    scaffold alignment containing all subjects in the input FASTA list.
    """
    seqs = []
    for filename in filenames:
        seq_to_add = sequences.get(filename).get(scaffold)
        seq_to_add.id = filename.split('.')[0]
        seqs.append(seq_to_add)

    logging.info(f"Building alignment for '{scaffold}'")

    min_len_alignment = min([len(seq) for seq in seqs])
    alignment = Align.MultipleSeqAlignment(
        [seq[:min_len_alignment] for seq in seqs])

    return alignment
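A self-contained sketch of the truncate-to-shortest idiom used in build_alignment (slicing a SeqRecord returns a new SeqRecord, so all rows end up the same length):

# Minimal sketch: trim records to the shortest one before building the alignment.
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

seqs = [SeqRecord(Seq("ACGTAC"), id="x"), SeqRecord(Seq("ACGT"), id="y")]
min_len = min(len(s) for s in seqs)
aln = MultipleSeqAlignment([s[:min_len] for s in seqs])
print(aln.get_alignment_length())  # 4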
Code example #20
File: io.py  Project: hcdenbakker/treetime
def save_timetree_results(tree, outfile_prefix):
    """
    First, it scans the tree and assigns the namesto every node with no name
    then, it saves the information as the csv table
    """
    import pandas
    df = pandas.DataFrame(
        columns=["Given_date", "Initial_root_dist", "Inferred_date"])
    aln = Align.MultipleSeqAlignment([])

    i = 0

    # save everything
    df.to_csv(outfile_prefix + ".meta.csv")
    #  TODO save variance to the metadata
    Phylo.write(tree.tree, outfile_prefix + ".tree.nwk", "newick")
    AlignIO.write(aln, outfile_prefix + ".aln.fasta", "fasta")

    # save root distribution
    mtp = tree.tree.root.msg_to_parent
    threshold = mtp.y.min() + 1000
    idxs = [mtp.y < threshold]
    mtpy = mtp.y[idxs]
    mtpx = utils.numeric_date() - np.array(
        map(tree.date2dist.get_date, mtp.x[idxs]))
    mtpy[0] = threshold
    mtpy[-1] = threshold

    np.savetxt(outfile_prefix + ".root_dist.csv",
               np.hstack((mtpx[:, None], mtpy[:, None])),
               header="Root date,-log(LH)",
               delimiter=',')

    # zip results to one file
    import zipfile
    outzip = outfile_prefix + ".zip"
    zipf = zipfile.ZipFile(outzip, 'w')
    zipf.write(outfile_prefix + ".meta.csv")
    zipf.write(outfile_prefix + ".aln.fasta")
    zipf.write(outfile_prefix + ".tree.nwk")
    zipf.write(outfile_prefix + ".root_dist.csv")
Code example #21
    def remove_columns_from_msa(alignment_in, cols_to_remove):

        # get 0 based index of all wanted columns
        cols_to_remove_0_base = [(i - 1) for i in cols_to_remove]
        aln_cols_index_all = list(range(alignment_in.get_alignment_length()))
        aln_cols_index_wanted = []
        for i in aln_cols_index_all:
            if i not in cols_to_remove_0_base:
                aln_cols_index_wanted.append(i)

        # get wanted alignment segments
        wanted_segments = list_to_segments(aln_cols_index_wanted)

        # create an empty Alignment object
        alignment_new = Align.MultipleSeqAlignment([])
        for sequence in alignment_in:
            new_seq_object = Seq('')
            new_seq_record = SeqRecord(new_seq_object)
            new_seq_record.id = sequence.id
            new_seq_record.description = sequence.description
            alignment_new.append(new_seq_record)

        # add wanted columns to empty Alignment object
        for segment in wanted_segments:

            # for single column segment
            if segment[0] == segment[1]:
                segment_value = alignment_in[:, segment[0]]

                m = 0
                for each_seq in alignment_new:
                    each_seq.seq = Seq(str(each_seq.seq) + segment_value[m])
                    m += 1

            # for multiple columns segment
            else:
                segment_value = alignment_in[:, (segment[0]):(segment[1] + 1)]
                alignment_new += segment_value

        return alignment_new
Code example #22
def seq_dists(ali,run_id, tree = True):
    import Levenshtein
    n = len(ali)
    dists = zeros((n,n))

    if tree:
        ali_named = align.MultipleSeqAlignment(ali)
        maps = {}
        for idx, a in enumerate(ali_named):
            a.id = 'S{0:05}'.format(idx) 
            maps[a.id] = idx
        tree = phyml.tree(ali_named, run_id = run_id, bionj = True)
        for n1 in tree.get_terminals():
            for n2 in tree.get_terminals():
                dists[maps[n1.name],maps[n2.name]] = \
                    tree.distance(n1,n2)
    else:
        for i in range(n):
            for j in range(i):
                dists[i,j] = Levenshtein.distance(str(ali[i].seq), str(ali[j].seq))
                dists[j,i] = dists[i,j]
    return dists
Code example #23
File: asr.py  Project: OmerRonen/Periscope
def _save_phy_aln(fasta_fname, phy_fname, n_seqs=None):
    if os.path.isfile(phy_fname):
        return
    records = SeqIO.parse(fasta_fname, "fasta")
    records_phy = []
    records_phy_names = []
    i = 0
    for record in records:
        if record.id[0:8] not in records_phy_names:
            record.id = record.id[0:8]
            records_phy.append(record)
            records_phy_names.append(record.id[0:8])
            i += 1
            if i == n_seqs:
                break

    aln = Align.MultipleSeqAlignment(records_phy)

    handle = open(phy_fname, 'w')
    pw = SequentialPhylipWriter(handle)
    pw.write_alignment(aln)
    handle.close()
Code example #24
def fake_alignment(T):
    """
    Fake alignment to appease treetime when only using it for naming nodes...
    This is lifted from refine.py and ideally could be imported

    Parameters
    -------
    T : <class 'Bio.Phylo.BaseTree.Tree'>

    Returns
    -------
    <class 'Bio.Align.MultipleSeqAlignment'>
    """
    from Bio import SeqRecord, Seq, Align
    seqs = []
    for n in T.get_terminals():
        seqs.append(
            SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                id=n.name,
                                name=n.name,
                                description=''))
    aln = Align.MultipleSeqAlignment(seqs)
    return aln
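A hypothetical usage sketch, assuming fake_alignment is importable; the tiny Newick string is made up:

# Hypothetical usage sketch for fake_alignment.
from io import StringIO
from Bio import Phylo

T = Phylo.read(StringIO("(A:1,(B:1,C:1):1);"), "newick")
aln = fake_alignment(T)
print(len(aln), aln.get_alignment_length())  # 3 4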
Code example #25
File: trimalign.py  Project: nvt-1009/biotools
def main():
    args = getcliargs()
    # Read in alignment
    aln = AlignIO.read(sys.stdin, 'fasta')

    if args.removeseqs:
        row_keep = []
        for a in aln:
            if not re.match(args.removeseqs, a.name):
                row_keep.append(a)
        aln = Align.MultipleSeqAlignment(row_keep)

    if args.removegapcols:
        ncol = aln.get_alignment_length()
        col_keep = []
        for i in range(ncol):
            if set(list(aln[:, i])) != {'-'}:
                col_keep.append(i)
        aln_keep = aln[:, col_keep[0]:col_keep[0] + 1]
        for i in col_keep[1:]:
            aln_keep += aln[:, i:i + 1]
        aln = aln_keep

    AlignIO.write(aln, sys.stdout, 'fasta')
Code example #26
                dates[name] = float(date)
            except:
                failed_dates += 1

        if len(dates) < failed_dates:
            print("\n\nDATE PARSING FAILED, ABORTING...")
            import sys
            sys.exit(1)

    ###########################################################################
    ### FAKING ALIGNMENT IF NONE GIVEN
    ###########################################################################
    if params.aln is None:
        from Bio import Seq, SeqRecord, Align
        aln = Align.MultipleSeqAlignment([
            SeqRecord.SeqRecord(Seq.Seq("AAA"), id=node, name=node)
            for node in dates
        ])

    ###########################################################################
    ### ESTIMATE ROOT (if requested) AND DETERMINE TEMPORAL SIGNAL
    ###########################################################################
    base_name = '.'.join(params.tree.split('/')[-1].split('.')[:-1])
    myTree = TreeTime(dates=dates,
                      tree=params.tree,
                      aln=aln,
                      gtr='JC69',
                      verbose=params.verbose)

    if not params.keep_root:
        myTree.reroot('best')
Code example #27
def save_results(tt, state, root):
    print(root)
    if tt is not None:
        #  save files
        treetime.treetime_to_json(tt, os.path.join(root, "out_tree.json"))
        treetime.tips_data_to_json(tt, os.path.join(root, "out_tips.json"))
        treetime.root_lh_to_json(tt, os.path.join(root, "out_root_lh.json"))
        treetime.root_lh_to_csv(tt, os.path.join(root, "out_root_lh.csv"))

        # save full alignment
        aln = Align.MultipleSeqAlignment([
            SeqRecord.SeqRecord(Seq.Seq(''.join(n.sequence)))
            for n in tt.tree.find_clades()
        ])
        AlignIO.write(aln, os.path.join(root, "out_aln.fasta"), "fasta")

        # save newick tree
        Phylo.write(tt.tree, os.path.join(root, "out_newick_tree.nwk"),
                    "newick")

        #save metadata as csv file
        treetime.save_all_nodes_metadata(
            tt, os.path.join(root, "out_metadata.csv"))

        #save molecular clock in normal format
        mclock = np.array([
            (tip.dist2root, tip.numdate_given)
            for tip in tt.tree.get_terminals()
            if hasattr(tip, 'dist2root') and hasattr(tip, 'numdate_given')
        ])
        np.savetxt(os.path.join(root, 'molecular_clock.csv'),
                   mclock,
                   delimiter=',',
                   header='Distance_to_root,Sampling_date')

        # save GTR in csv file
        treetime.save_gtr_to_file(tt.gtr, os.path.join(root, "out_gtr.txt"))

        # zip all results to one file
        with zipfile.ZipFile(os.path.join(root, 'treetime_results.zip'),
                             'w') as out_zip:
            out_zip.write(os.path.join(root, 'out_newick_tree.nwk'),
                          arcname='out_newick_tree.nwk')
            out_zip.write(os.path.join(root, 'out_aln.fasta'),
                          arcname='out_aln.fasta')
            out_zip.write(os.path.join(root, 'out_metadata.csv'),
                          arcname='out_metadata.csv')
            out_zip.write(os.path.join(root, 'out_tree.json'),
                          arcname='out_tree.json')
            out_zip.write(os.path.join(root, 'settings.json'),
                          arcname='settings.json')
            out_zip.write(os.path.join(root, 'molecular_clock.csv'),
                          arcname='molecular_clock.csv')
            out_zip.write(os.path.join(root, 'out_root_lh.csv'),
                          arcname='out_root_lh.csv')
            out_zip.write(os.path.join(root, 'out_gtr.txt'),
                          arcname='out_gtr.txt')

        state['status'] = 'Done'
        return tt, True
    else:
        state['status'] = 'Error'
        return tt, False
Code example #28
File: dstat.py  Project: mobilegenome/dstats
def do_abbababa(alignment, anc_sequence):

    i = 0
    chr = alignment[0].id
    n = len(alignment)
    anc_sequence = Align.MultipleSeqAlignment([anc_sequence])
    print chr
    for i in xrange(0, len(alignment[1]), blocksize):
        block = alignment[:, i:i + blocksize]
        anc_block = anc_sequence[:, i:i + blocksize]
        f_ab.write("%s\t%i\t%i" % (chr, i, i + len(block[0]) - 1))
        f_extra.write("%s\t%i\t%i" % (chr, i, i + len(block[0]) - 1))
        c = 0
        for h3 in xrange(n):
            for h2 in xrange(n):
                if h2 == h3:
                    continue
                for h1 in xrange(n):
                    if (h1 == h3) or (h1 >= h2):
                        continue
                    print "combination %i of %i" % (c + 1, (factorial(n) / 2))
                    bt_positions.taxon_sequence = (h1, h2, h3)
                    c += 1
                    #print h1,h2,h3
                    abba = 0
                    baba = 0
                    bbaa = 0
                    snv = 0

                    for j in range(len(
                            block[1])):  # iterate over sites in alignments

                        s1 = block[h1, j]
                        s2 = block[h2, j]
                        s3 = block[h3, j]
                        s_anc = anc_block[0, j]

                        # print set([h1, h2, h3, h4])

                        #if len(set)
                        if len(set([s1, s2, s3, s_anc])) < 2:
                            continue  # site not biallelic

                        badchar = False
                        for site in [s1, s2, s3,
                                     s_anc]:  # check for N and ambiguities
                            if site in "NYRKMWSBDHV-":
                                badchar = True
                                break

                            # if site in ["N", "Y", "R", "K", "M", "W", "S", "B", "D", "H", "V", "-"]:
                            #     continue
                        if badchar:
                            continue
                        if (s1 == s2
                            ) and s3 == s_anc and s1 != s3 and s2 != s_anc:
                            bbaa += 1
                            snv += 1
                            bt_positions.add_feature(chr, i + j, "BBAA")

                        elif s1 != s2 and s3 != s_anc:
                            if s1 == s3 and s2 == s_anc:
                                baba += 1
                                bt_positions.add_feature(chr, i + j, "BABA")
                            elif s2 == s3 and s1 == s_anc:
                                abba += 1
                                bt_positions.add_feature(chr, i + j, "ABBA")
                        else:
                            snv += 1
                            bt_positions.add_feature(
                                chr, i + j,
                                "SNV")  # add non ABBABABA or BBAA SNV
                    f_ab.write("\t%i\t%i" % (abba, baba))
                    f_extra.write("\t%i\t%i\t%i" % (abba, baba, bbaa))
        f_ab.write("\n")
        f_extra.write("\n")
    return 1
Code example #29
    for entry in tmp_aln:
        # if this alignment has a different size from the rest, something is reaaaaaly wrong!
        if len(entry) != aln_length:
            sys.exit('\t**Error, block "%s" has a different length than the rest of the MSA: %s' %(entry.name, aln))

        if re.match('GC[AF]_', entry.name):
            genome, gene = entry.name.split('|')
        else:
            genome, gene = entry.name.split('_')

        concatenation[genome] += deepcopy(entry.seq)

    partitions.write('LG, %s = %i-%i\n' %(aln.replace('.fasta.aln', ''), current_position, current_position+aln_length-1) )
    current_position += aln_length

    #
    # add gaps for those genomes missing this gene (same size as the expected alignment)
    for genome in genome_union.difference(genomes[aln]):
        concatenation[genome] += Align.Seq( '-' * aln_length, aln_alphabet )
        missing_genes[genome] += aln_length
partitions.close()

#
# remove genomes missing more than 10% of the marker genes
for genome, num_missing_genes in missing_genes.items():
    if num_missing_genes/total_genes > 0.1:
        print '\t\t**%s: excluded from analysis for missing %.2f from concatenated alignment!' %(genome, (num_missing_genes/total_genes)*100)
        concatenation.pop( genome )

AlignIO.write( Align.MultipleSeqAlignment( concatenation.values() ), '%s/concatenated_alignment.aln' %output_folder, 'fasta' )
Code example #30
File: refine.py  Project: tomkinsc/augur
def run(args):
    if args.seed is not None:
        np.random.seed(args.seed)

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    try:
        T = read_tree(args.tree)
        node_data['input_tree'] = args.tree
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    if not args.alignment:
        if args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference",
                file=sys.stderr)
            return 1

        if args.divergence_units == 'mutations':
            print(
                "ERROR: alignment is required for divergence in units of mutations",
                file=sys.stderr)
            return 1

        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(
                SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                    id=n.name,
                                    name=n.name,
                                    description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x)
              for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments",
                file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        aln = args.alignment

    from treetime import version as treetime_version
    print(f"augur refine is using TreeTime version {treetime_version}")

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    elif args.alignment:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'
    else:
        tree_fname = '.'.join(args.tree.split('.')[:-1]) + '_tt.nwk'

    if args.root and len(
            args.root
    ) == 1:  #if anything but a list of seqs, don't send as a list
        args.root = args.root[0]
    if args.keep_root:  # This flag overrides anything specified by 'root'
        args.root = None

    if args.timetree:
        # load meta data and convert dates to numeric
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction",
                file=sys.stderr)
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        tt = refine(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.
            root,  # or 'best', # We now have a default in param spec - this just adds confusion.
            Tc=0.01 if args.coalescent is None else
            args.coalescent,  #use 0.01 as default coalescent time scale
            use_marginal=args.date_inference == 'marginal',
            branch_length_inference=args.branch_length_inference or 'auto',
            precision='auto' if args.precision is None else args.precision,
            clock_rate=args.clock_rate,
            clock_std=args.clock_std_dev,
            clock_filter_iqd=args.clock_filter_iqd,
            covariance=args.covariance,
            resolve_polytomies=(not args.keep_polytomies))

        node_data['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        if args.coalescent == 'skyline':
            try:
                skyline, conf = tt.merger_model.skyline_inferred(
                    gen=args.gen_per_year, confidence=2)
                node_data['skyline'] = [[float(x) for x in skyline.x],
                                        [float(y) for y in conf[0]],
                                        [float(y) for y in skyline.y],
                                        [float(y) for y in conf[1]]]
            except:
                print("ERROR: skyline optimization by TreeTime has failed.",
                      file=sys.stderr)
                return 1

        attributes.extend(
            ['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        if args.root:
            if args.root == 'best':
                print(
                    "Warning: To root without inferring a timetree, you must specify an explicit outgroup."
                )
                print(
                    "\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n"
                )
            elif args.root in ['least-squares', 'min_dev', 'oldest']:
                raise TypeError(
                    "The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."
                    % args.root)
            else:
                T.root_with_outgroup(args.root)

        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)
    if args.divergence_units == 'mutations-per-site':  #default
        pass
    elif args.divergence_units == 'mutations':
        if not args.timetree:
            tt.infer_ancestral_sequences()
        nuc_map = profile_maps['nuc']

        def are_sequence_states_different(nuc1, nuc2):
            '''
            determine whether two ancestral states should count as mutation for divergence estimates
            while correctly accounting for ambiguous nucleotides
            '''
            if nuc1 in ['-', 'N'] or nuc2 in ['-', 'N']:
                return False
            elif nuc1 in nuc_map and nuc2 in nuc_map:
                return np.sum(nuc_map[nuc1] * nuc_map[nuc2]) == 0
            else:
                return False

        for node in T.find_clades():
            n_muts = len([
                position for ancestral, position, derived in node.mutations
                if are_sequence_states_different(ancestral, derived)
            ])

            if args.timetree:
                node_data['nodes'][node.name]['mutation_length'] = n_muts

            node_data['nodes'][node.name]['branch_length'] = n_muts
    else:
        print("ERROR: divergence unit",
              args.divergence_units,
              "not supported!",
              file=sys.stderr)
        return 1

    # Export refined tree and node data
    import json
    tree_success = Phylo.write(T,
                               tree_fname,
                               'newick',
                               format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    elif args.alignment:
        node_data_fname = '.'.join(
            args.alignment.split('.')[:-1]) + '.node_data.json'
    else:
        node_data_fname = '.'.join(
            args.tree.split('.')[:-1]) + '.node_data.json'

    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    return 0 if tree_success else 1