def process_alignment(name, path, preserve, bycodon=False):
    """Greedily pick alignment columns that add new informative rows.

    The FASTA alignment at ``path`` is read and passed through
    ``drop_empty_rows``.  Rows whose ``name`` is in ``preserve`` are set
    aside; the column span returned by ``preserved_columns`` for those rows
    is visited first, followed by the columns before it and then the columns
    after it.  Columns are visited in steps of 3 when ``bycodon`` is true,
    otherwise one at a time.

    Returns a tuple ``(results, preskeep)``: ``results`` is the output of
    ``result_filter`` applied to the ``[name, start_column, row_set]``
    entries collected here, and ``preskeep`` is the range of preserved
    columns (an empty list when nothing was preserved).
    """
    # bycodon = args.retaincodons
    full_aln = drop_empty_rows(AlignIO.read(path, 'fasta'))
    stride = 3 if bycodon else 1
    total_cols = full_aln.get_alignment_length()

    preskeep = []
    kept_rows = []
    if len(preserve) > 0:
        kept_rows = [rec for rec in full_aln if rec.name in preserve]

    if len(kept_rows) > 0:
        span = preserved_columns(Align.MultipleSeqAlignment(kept_rows),
                                 bycodon)
        preskeep = range(span[0], span[1] + 1, stride)
        checkalign = Align.MultipleSeqAlignment(
            [rec for rec in full_aln if rec.name not in preserve])
        # Preserved span first, then the left flank, then the right flank.
        visit_order = list(preskeep)
        visit_order.extend(range(0, span[0], stride))
        visit_order.extend(range(span[1] + 1, total_cols, stride))
    else:
        checkalign = full_aln
        visit_order = list(range(0, total_cols, stride))
    del full_aln

    n_rows = len(checkalign)
    check_cols = checkalign.get_alignment_length()
    results = []
    covered = set()
    for start in visit_order:
        stop = min([start + stride, check_cols])
        infset = set()
        for col in range(start, stop):
            infset.update(informative_set(checkalign[:, col:col + 1]))
        # The very first window is always recorded; later windows are only
        # recorded when they contribute at least one row not yet covered.
        if results and infset <= covered:
            continue
        results.append([name, start, infset])
        covered = covered | infset
        if len(covered) == n_rows:
            break

    results = result_filter(results, n_rows)
    return (results, preskeep)
def pair_align_SeqRecords(seqr_a, seqr_b, ex_aligner = needle_align):
    '''
    Pairwise align two SeqRecords using external or internal aligner.

    seqr_a, seqr_b: SeqRecords to align
    ex_aligner: helper function that aligns sequences in two files
                *If ex_aligner is None, use internal aligner*

    Internal aligner: Bio.pairwise2.align.globalds() with
    Bio.SubsMat.MatrixInfo.Blosum62 and default gap penalties
    (gapopen = -10.0, gapextend = -0.5)

    Both inputs are passed through ungap_SeqRecord() before alignment.
    For the external aligner each ungapped record is written to a
    temporary FASTA file via make_tmp_fa(); those files are removed again
    even when the aligner raises.

    Returns a MultipleSeqAlignment object
    '''
    if ex_aligner is None:
        inaln = align.globalds(ungap_SeqRecord(seqr_a),
                               ungap_SeqRecord(seqr_b),
                               MatrixInfo.blosum62, -10.0, -0.5)
        return Align.MultipleSeqAlignment(inaln[0][:2])

    tmp_files = []
    try:
        for record in (seqr_a, seqr_b):
            tmp_files.append(make_tmp_fa(ungap_SeqRecord(record)))
        return ex_aligner(tmp_files[0].name, tmp_files[1].name)
    finally:
        # Clean up whatever was created, including after a failure.
        for tmp in tmp_files:
            remove(tmp.name)
def writelines(lines, gid):
    """Build an alignment from every 50th entry of ``lines``.

    Each selected entry is joined into a single string and wrapped in a
    SeqRecord whose id is ``<gid><index>i`` and whose description is
    ``<gid><index>d``, where ``index`` counts the selected entries from 0.
    """
    records = []
    for i, line in enumerate(lines[::50]):
        record_id = '{1}{0}i'.format(i, gid)
        record_desc = '{1}{0}d'.format(i, gid)
        records.append(
            sr.SeqRecord(seq.Seq(''.join(line)),
                         record_id,
                         description=record_desc))
    return al.MultipleSeqAlignment(records)
def _save_alignment(self):
    """Write the sequence of every clade in ``self.tree`` to a FASTA file.

    One record is produced per clade returned by ``find_clades()``, using
    the clade name as both id and name and an empty description.  The file
    is ``out_aln_fasta`` inside ``self._root_dir``.
    """
    records = []
    for node in self.tree.find_clades():
        sequence = Seq.Seq(''.join(node.sequence))
        records.append(
            SeqRecord.SeqRecord(sequence,
                                id=node.name,
                                name=node.name,
                                description=""))
    aln = Align.MultipleSeqAlignment(records)
    target = os.path.join(self._root_dir, out_aln_fasta)
    AlignIO.write(aln, target, "fasta")
def sorted_uncertainty_from_alignment (align, max_freq_n = 0.1):
    """Return indices of columns with few enough ``N``/``-`` characters.

    ``align`` is any sequence of SeqRecords that can be wrapped in a
    MultipleSeqAlignment.  ``max_freq_n`` is the fraction of rows used to
    derive the integer cut-off ``max_n = int(max_freq_n * len(align))``,
    capped at ``len(align) - 1``; negative fractions are treated as 0.

    A column is returned when its combined count of ``N`` and ``-`` is
    strictly below ``max_n``.  Indices are returned in column order.

    NOTE(review): because the comparison is strict, ``max_n == 0`` selects
    no columns at all, although the original inline comment said that a
    zero fraction keeps columns without any N.  Confirm whether ``<=`` was
    intended before changing it.
    """
    if max_freq_n < 0:
        max_freq_n = 0
    n_rows = len(align)
    max_n = min(int(max_freq_n * n_rows), n_rows - 1)

    # SummaryInfo needs an MSA, not a plain list.
    summary_align = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(align))
    pssm = summary_align.pos_specific_score_matrix()
    # pssm example: {'-':3, 'A':0, 'T':4.0, 'G':0, 'C':2.0, 'N':1} per
    # column, means 3 seqs have "-", 4 have "T"...
    # The per-column dict only has keys for characters that occur somewhere
    # in the alignment, so a missing "N" or "-" must count as zero.
    return [
        i for i, s in enumerate(pssm)
        if s.get("N", 0) + s.get("-", 0) < max_n
    ]
def load_aln(infile):
    """Read a FASTA alignment and also index its sequences by id.

    Returns ``(aln, aln_dict)``: a MultipleSeqAlignment holding every record
    of ``infile`` in file order, and a dict mapping each record id to its
    sequence as a plain string.
    """
    alignment = Align.MultipleSeqAlignment([])
    seq_by_id = {}
    with open(infile, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            alignment.append(record)
            seq_by_id[record.id] = str(record.seq)
    return alignment, seq_by_id
def alignment(seqs_in, profile, run_id):
    '''Compute an alignment of multiple sequences to a given covariance
    model profile such as constructed by cmbuild via infernal.profiles.

    input:
      seqs_in: a list of biopython SeqRecord objects
      profile: the filename of a covariance model profile
      run_id:  a run id to use for naming temporary files to avoid collisions

    output:
      ali:     an rfam multiple sequence alignment
      ref:     the profile reference sequence aligned to ali
      struct:  the profile reference structure aligned to ali

    Raises Exception when the first element of seqs_in is a plain string,
    because record ids are needed to map the results back.
    '''
    if isinstance(seqs_in[0], str):
        raise Exception(
            'Sorry but string lists are not supported. We need ids!')

    # Records are renamed S000, S001, ... for cmalign and keep only the
    # characters A, U, T, G and C; name_maps restores the original ids.
    seqs = [
        Bio.SeqRecord.SeqRecord(
            Bio.Seq.Seq(
                ''.join([let for let in str(ali.seq) if let in 'AUTGC']),
                Bio.Seq.Alphabet.RNAAlphabet),
            'S{0:03}'.format(idx))
        for idx, ali in enumerate(seqs_in)
    ]
    name_maps = {'S{0:03}'.format(idx): s.id
                 for idx, s in enumerate(seqs_in)}

    # The temp file names embed the index of the last input record.  The
    # original code read the comprehension variable after the loop, which
    # only works on Python 2; compute the same value explicitly.
    last_idx = len(seqs_in) - 1
    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(
        run_id, last_idx))
    outfile = cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(
        run_id, last_idx))
    Bio.SeqIO.write(seqs, infile, 'fasta')

    # Pass an argument list instead of a shell string so that paths with
    # spaces or shell metacharacters are handled safely.
    ispc = spc.Popen(['cmalign', '-o', outfile, profile, infile],
                     stdout=spc.PIPE)
    ispc.communicate()  # wait for cmalign to finish; its stdout is unused

    with open(outfile) as fopen:
        seqs, ref, struct = rutils.stk_parse(fopen)

    ali = ba.MultipleSeqAlignment(seqs)
    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]
    return ali, ref, struct
def remove_positions_with_gaps_in_first_sequence(input_fasta, output_fasta):
    """Drop every column in which the first sequence has a gap.

    Reads the FASTA alignment ``input_fasta``, keeps only the columns where
    the first row is not ``-``, writes the result to ``output_fasta`` in
    FASTA format and returns ``output_fasta``.

    When the first row consists of gaps only, an alignment with zero
    columns is written instead of failing.
    """
    aln = AlignIO.read(str(input_fasta), 'fasta')
    first_sequence = str(aln[0].seq)

    # Collect half-open [start, stop) runs of consecutive non-gap columns so
    # the alignment is stitched together per run rather than per column.
    runs = []
    run_start = None
    for pos, char in enumerate(first_sequence):
        if char != '-':
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            runs.append((run_start, pos))
            run_start = None
    if run_start is not None:
        runs.append((run_start, len(first_sequence)))

    if runs:
        start, stop = runs[0]
        clean_aln = Align.MultipleSeqAlignment(aln[:, start:stop])
        for start, stop in runs[1:]:
            clean_aln += aln[:, start:stop]
    else:
        clean_aln = Align.MultipleSeqAlignment(aln[:, 0:0])

    AlignIO.write(clean_aln, str(output_fasta), 'fasta')
    return output_fasta
def load_aln_to_repair(infile, omit):
    """Read a FASTA alignment, leaving selected records out of the MSA.

    Returns ``(aln, aln_dict)``.  ``aln_dict`` maps the id of every record
    in ``infile`` to its sequence string; ``aln`` contains only the records
    whose ``name`` is not in ``omit``.
    """
    alignment = Align.MultipleSeqAlignment([])
    seq_by_id = {}
    with open(infile, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            seq_by_id[record.id] = str(record.seq)
            if record.name in omit:
                continue
            alignment.append(record)
    return alignment, seq_by_id
def save_alignment(tt: TreeTime, config: TreetimeConfig):
    """Write the sequence of every clade of ``tt.tree`` as FASTA.

    Each clade becomes one record named after the clade, with an empty
    description.  Output goes to ``config.output_filenames.FASTA``.
    """
    records = []
    for node in tt.tree.find_clades():
        sequence = Seq.Seq("".join(node.sequence))
        records.append(
            SeqRecord.SeqRecord(sequence,
                                id=node.name,
                                name=node.name,
                                description=""))
    # Build the alignment before opening the file so that a construction
    # error does not leave an empty output file behind.
    aln = Align.MultipleSeqAlignment(records)
    with open(config.output_filenames.FASTA, "w") as ofile:
        AlignIO.write(aln, ofile, "fasta")
def separate_alignments(msa_data, sus_ids, out_dir, filename, patient_zero='NC_045512.2'):
    """Split an alignment into a clean file and a file to inspect.

    Records whose id is in ``sus_ids`` go only to the "inspect" alignment.
    The record whose id equals ``patient_zero`` goes to both alignments.
    All other records go only to the "white" alignment.

    Files are written in FASTA format to
    ``out_dir / (filename + '_aligned_white.fa')`` and
    ``out_dir / (filename + '_aligned_inspect.fa')``.  Returns 0.
    """
    good_seqs, poor_seqs = [], []
    for rec in msa_data:
        suspicious = rec.id in sus_ids
        if not suspicious:
            good_seqs.append(rec)
        if suspicious or rec.id == patient_zero:
            poor_seqs.append(rec)

    outputs = (
        (good_seqs, '_aligned_white.fa'),
        (poor_seqs, '_aligned_inspect.fa'),
    )
    for records, suffix in outputs:
        msa = Align.MultipleSeqAlignment(records)
        AlignIO.write(msa, out_dir / (filename + suffix), 'fasta')
    return 0
def pop_row(aln, seqid):
    '''
    Pop a row from an alignment by sequence id

    aln: a Bio.Align.MultipleSeqAlignment object
    seqid: id of Bio.SeqRecord.SeqRecord to pop from aln

    Returns a tuple containing the popped SeqRecord and a copy of aln
    without seqid's SeqRecord.  The remaining rows keep their original
    order.

    Raises KeyError when seqid is not present in aln.
    '''
    # to_dict() gives the id lookup; the KeyError for an unknown id comes
    # from indexing the resulting dict.
    aln_d = SeqIO.to_dict(aln)
    seq = aln_d[seqid]
    # Rebuild from the alignment itself rather than from the dict values:
    # this preserves row order and avoids the Python 2-only itervalues().
    remaining = [rec for rec in aln if rec.id != seqid]
    return seq, Align.MultipleSeqAlignment(remaining)
def fetch_seqs(seqs_filepath, out_fp, sample_idxs: list, is_aligned=False, is_gzip=False):
    """Copy the records whose id is in ``sample_idxs`` to a new FASTA file.

    ``seqs_filepath`` is read as FASTA, through ``gzip`` in text mode when
    ``is_gzip`` is true.  With ``is_aligned`` the input is read and written
    as a single alignment via AlignIO; otherwise it is handled as
    independent records via SeqIO.

    Returns whatever the corresponding ``write`` call returns.
    """
    wanted = set(sample_idxs)

    if is_aligned:
        if is_gzip:
            with gzip.open(seqs_filepath, "rt") as handle:
                cns = AlignIO.read(handle, 'fasta')
        else:
            cns = AlignIO.read(seqs_filepath, 'fasta')
        my_cns = Align.MultipleSeqAlignment(
            [rec for rec in cns if rec.id in wanted])
        return AlignIO.write(my_cns, out_fp, 'fasta')

    if is_gzip:
        # SeqIO.parse is lazy, so the records must be collected while the
        # gzip handle is still open.
        with gzip.open(seqs_filepath, "rt") as handle:
            my_cns = [
                rec for rec in SeqIO.parse(handle, 'fasta')
                if rec.id in wanted
            ]
    else:
        my_cns = [
            rec for rec in SeqIO.parse(seqs_filepath, 'fasta')
            if rec.id in wanted
        ]
    return SeqIO.write(my_cns, out_fp, 'fasta')
def consensus_from_alignment (align):
    """Build an IUPAC consensus sequence for ``align``.

    Gaps are replaced by ``N`` before counting.  For every column, all
    characters that occur at least once are expanded through
    ``IUPACData.ambiguous_dna_values``; the sorted set of resulting bases is
    looked up in ``iupac_dna`` to get the consensus character.

    Returns a ``Seq``; it is created with the ``ambiguous_dna`` alphabet
    when that module-level name is truthy (biopython < 1.78).
    """
    def _to_seq(text):
        # biopython < 1.78 still expects an alphabet argument
        if ambiguous_dna:
            return Seq.Seq(text, ambiguous_dna)
        return Seq.Seq(text)

    masked = []
    for rec in align:
        masked.append(
            SeqRecord(_to_seq(str(rec.seq).replace("-", "N")),
                      id=rec.id,
                      description=rec.description))

    # SummaryInfo needs an MSA, not a plain list.
    summary = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(masked))
    pssm = summary.pos_specific_score_matrix(chars_to_ignore=["-"])

    consensus = []
    # pssm example: {'-':3, 'A':0, 'T':4.0, 'G':0, 'C':2.0, 'N':1} per
    # column, means 3 seqs have "-", 4 have "T"...
    for score in pssm:
        # Only presence matters, not frequency.  A base such as "R" expands
        # to its concrete bases, e.g. A and G.
        present = set()
        for base, count in score.items():
            expansion = IUPACData.ambiguous_dna_values[base]
            if count > 0:
                present.update(expansion)
        consensus.append(iupac_dna[''.join(sorted(present))])

    return _to_seq(''.join(consensus))
def main(): global f_ab, f_extra, bt_positions seqs = {} records = [] fname_list = [basename(fpath) for fpath in options.input_files] with open(options.output.replace(".abbababa", ".flist"), "w") as fout: fout.write("\n".join(options.input_files)) for fpath in options.input_files: fname = basename(fpath) seqs[fname] = SeqIO.index(fpath, "fasta") records_per_fasta = seqs.get(fname).keys() records.extend([record for record in records_per_fasta]) print fname anc = SeqIO.index(options.anc, "fasta") print "\n" records = set([str(r) for r in records]) f_ab = open(options.output, "w") f_extra = open(options.extra, "w") bt_positions = BedToolPositions() for record in sorted(records): sequences = [] # min_alignment_length = min([len(seqs.get(seq_key).get(record)) for seq_key in fname_list] + # [len(anc.get(record))]) for seq_key in fname_list: # print seq_key sequences.append(seqs.get(seq_key).get(record)) min_alignment_length = min([len(sequence) for sequence in sequences] + [len(anc.get(record))]) per_chr_alignment = Align.MultipleSeqAlignment( [sequence[:min_alignment_length] for sequence in sequences]) do_abbababa(per_chr_alignment, anc.get(record)[:min_alignment_length]) bt_positions.write_to_BED(options.bed_out) f_ab.close() f_extra.close() return 1
def clean_alignment(alignment):
    """
    Remove ambiguities from alignment.

    Iterate over sites in the alignment and build a new alignment containing
    either only pure ATGC sites (-c) or sites with up to a specified
    proportion of N's (-c -n FLOAT).

    The mode is chosen from the module-level ``args.n_ratio``: when it is
    truthy, a site is kept if its fraction of ``N`` does not exceed it;
    otherwise a site is kept only if it contains none of the IUPAC
    ambiguity codes or ``-``.  Comparison is case-insensitive.
    """
    site_length = len(alignment[:, 0])
    n_sites = len(alignment[0])
    cleaned_alignment = Align.MultipleSeqAlignment(
        [seq[:0] for seq in alignment])

    if args.n_ratio:
        logging.info(f"Removing sites with > {int(args.n_ratio * 100)}% of " +
                     f"N's from '{alignment[0].name}'")
        for pos in range(n_sites):
            n_count = alignment[:, pos].upper().count('N')
            if n_count / site_length <= args.n_ratio:
                cleaned_alignment += alignment[:, pos:pos + 1]
    else:
        logging.info("Removing sites with ambiguities from " +
                     f"'{alignment[0].name}'")
        ambiguous = set('NYRKMWSBDHV-')
        for pos in range(n_sites):
            # Skip only this site.  The previous version broke out of the
            # whole scan at the first ambiguous site of a large alignment,
            # silently dropping every later column.
            if ambiguous.isdisjoint(alignment[:, pos].upper()):
                cleaned_alignment += alignment[:, pos:pos + 1]

    return cleaned_alignment
def _evolve_sequence(tree, L, gtr):
    """
    Produce random sequence of a given length L, evolve it on a given tree
    using the given gtr model.

    ``tree`` may be a Phylo tree or a path to a newick file.  The root
    sequence is drawn from ``gtr.alphabet`` with probabilities ``gtr.Pi``.
    Every other node gets a sequence sampled site by site from the profile
    obtained by propagating its parent's sequence along the node's branch
    length.  Each node gains ``ref_seq`` and its children gain ``up``.

    Returns ``(root_seq, aln)`` where ``aln`` holds the terminal sequences.
    """
    if isinstance(tree, str):
        tree = Phylo.read(tree, 'newick')

    # Honour the requested length (it used to be hard-coded to 1000).
    root_seq = np.random.choice(gtr.alphabet, p=gtr.Pi, size=L)
    tree.root.ref_seq = root_seq

    print("Started sequence evolution...")
    for node in tree.find_clades():
        # Link children to their parent so they can reach its sequence.
        for c in node.clades:
            c.up = node
        if hasattr(node, 'ref_seq'):
            continue  # the root already has its sequence
        t = node.branch_length
        p = gtr.propagate_profile(
            treetime.seq_utils.seq2prof(node.up.ref_seq, gtr.profile_map), t)
        # normalize profile so every row sums to one
        p = (p.T / p.sum(axis=1)).T
        # sample mutations randomly
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([gtr.alphabet[k] for k in ref_seq_idxs])

    records = [
        Align.SeqRecord(Align.Seq("".join(k.ref_seq)), id=k.name, name=k.name)
        for k in tree.get_terminals()
    ]
    aln = Align.MultipleSeqAlignment(records)
    print("Sequence evolution done...")
    return root_seq, aln
def main():
    """Write a random subsample of an alignment to a new FASTA file.

    Draws ``num_samples`` ids at random from the alignment returned by
    ``read_alignment_file``.  If the reference id is not among them, it
    replaces the first sampled id.  Input and output paths are derived from
    the hard-coded date and base directory below.
    """
    date = '2020_10_07'
    base = '/mnt/g/Covid-19/' + date + '/'
    in_file = base + 'sequences_filtered_aln2_red.fasta'
    out_file = base + 'trees/sequences_filtered_aln2_samp1.fasta'
    num_samples = 1000
    ref_id = 'NC_045512.2'

    aln = read_alignment_file(in_file)
    seq_ids = random.sample(list(aln.keys()), k=num_samples)
    if ref_id not in seq_ids:
        seq_ids[0] = ref_id

    alignment = Align.MultipleSeqAlignment([])
    for seq_id in seq_ids:
        alignment.append(aln[seq_id])

    # Use a context manager so the output is flushed and closed.
    with open(out_file, 'w') as handle:
        AlignIO.write(alignment, handle, 'fasta')
def build_alignment(filenames, sequences, scaffold):
    """
    Build scaffold alignment.

    For every name in ``filenames`` the record for ``scaffold`` is fetched
    from ``sequences[filename]`` and its id is overwritten with the part of
    the file name before the first dot.  All records are then cut to the
    length of the shortest one and combined into one alignment.
    """
    records = []
    for fname in filenames:
        record = sequences.get(fname).get(scaffold)
        record.id = fname.partition('.')[0]
        records.append(record)

    logging.info(f"Building alignment for '{scaffold}'")
    shortest = min(map(len, records))
    trimmed = [record[:shortest] for record in records]
    return Align.MultipleSeqAlignment(trimmed)
def save_timetree_results(tree, outfile_prefix):
    """Write the timetree outputs and bundle them into one zip archive.

    Files created, all starting with ``outfile_prefix``:
      .meta.csv       a table with only the header columns (no rows yet)
      .tree.nwk       ``tree.tree`` in newick format
      .aln.fasta      an empty alignment
      .root_dist.csv  root date against -log(LH), taken from the root's
                      ``msg_to_parent`` and restricted to values within
                      1000 of the minimum
      .zip            archive containing the four files above
    """
    import pandas
    import zipfile

    df = pandas.DataFrame(
        columns=["Given_date", "Initial_root_dist", "Inferred_date"])
    aln = Align.MultipleSeqAlignment([])

    # save everything
    df.to_csv(outfile_prefix + ".meta.csv")
    # TODO save variance to the metadata
    Phylo.write(tree.tree, outfile_prefix + ".tree.nwk", "newick")
    AlignIO.write(aln, outfile_prefix + ".aln.fasta", "fasta")

    # save root distribution
    mtp = tree.tree.root.msg_to_parent
    threshold = mtp.y.min() + 1000
    # Index with the boolean mask directly; wrapping it in a list is
    # rejected or reinterpreted by current numpy.
    keep = mtp.y < threshold
    mtpy = mtp.y[keep]
    # Materialize the dates explicitly: on Python 3 map() is lazy and
    # np.array(map(...)) would yield a useless 0-d object array.
    mtpx = utils.numeric_date() - np.array(
        [tree.date2dist.get_date(x) for x in mtp.x[keep]])
    mtpy[0] = threshold
    mtpy[-1] = threshold
    np.savetxt(outfile_prefix + ".root_dist.csv",
               np.hstack((mtpx[:, None], mtpy[:, None])),
               header="Root date,-log(LH)",
               delimiter=',')

    # zip results to one file; the context manager finalizes the archive
    outzip = outfile_prefix + ".zip"
    with zipfile.ZipFile(outzip, 'w') as zipf:
        for suffix in (".meta.csv", ".aln.fasta", ".tree.nwk",
                       ".root_dist.csv"):
            zipf.write(outfile_prefix + suffix)
def remove_columns_from_msa(alignment_in, cols_to_remove):
    """Return a copy of ``alignment_in`` without the given columns.

    ``cols_to_remove`` holds 1-based column numbers.  The returned alignment
    has one record per input row, carrying over each row's id and
    description, and contains the remaining columns in their original
    order.
    """
    # 0-based indices of the columns to drop; a set keeps the membership
    # test cheap for wide alignments
    unwanted = {col - 1 for col in cols_to_remove}
    aln_cols_index_wanted = [
        i for i in range(alignment_in.get_alignment_length())
        if i not in unwanted
    ]

    # get wanted alignment segments as (first, last) inclusive index pairs
    wanted_segments = list_to_segments(aln_cols_index_wanted)

    # start from empty records that carry over id and description
    alignment_new = Align.MultipleSeqAlignment([])
    for sequence in alignment_in:
        new_seq_record = SeqRecord(Seq(''))
        new_seq_record.id = sequence.id
        new_seq_record.description = sequence.description
        alignment_new.append(new_seq_record)

    # add wanted columns to the new alignment
    for segment in wanted_segments:
        if segment[0] == segment[1]:
            # single column: indexing yields one character per row
            column = alignment_in[:, segment[0]]
            for each_seq, residue in zip(alignment_new, column):
                each_seq.seq = Seq(str(each_seq.seq) + residue)
        else:
            # several columns: concatenate the sub-alignment
            alignment_new += alignment_in[:, segment[0]:segment[1] + 1]

    return alignment_new
def seq_dists(ali,run_id, tree = True):
    """Return an n x n matrix of pairwise distances between rows of ``ali``.

    With ``tree`` true, a tree is built by ``phyml.tree`` (bionj) and the
    distances are tree distances between terminals.  Otherwise the
    Levenshtein distance between the sequence strings is used.

    NOTE(review): in tree mode the record ids are overwritten with
    ``S00000``-style names; if the MSA wrapper shares the record objects
    with ``ali``, the caller's records are renamed too - confirm.
    """
    import Levenshtein

    n = len(ali)
    dists = zeros((n,n))

    if not tree:
        # Fill the lower triangle and mirror it.
        for row in range(n):
            for col in range(row):
                d = Levenshtein.distance(str(ali[row].seq),
                                         str(ali[col].seq))
                dists[row,col] = d
                dists[col,row] = dists[row,col]
        return dists

    ali_named = align.MultipleSeqAlignment(ali)
    index_of = {}
    for idx, rec in enumerate(ali_named):
        rec.id = 'S{0:05}'.format(idx)
        index_of[rec.id] = idx

    phylo = phyml.tree(ali_named, run_id = run_id, bionj = True)
    for tip_a in phylo.get_terminals():
        for tip_b in phylo.get_terminals():
            dists[index_of[tip_a.name],index_of[tip_b.name]] = \
                phylo.distance(tip_a,tip_b)
    return dists
def _save_phy_aln(fasta_fname, phy_fname, n_seqs=None):
    """Convert a FASTA file to sequential PHYLIP with 8-character ids.

    Does nothing when ``phy_fname`` already exists.  Record ids are cut to
    their first 8 characters; records whose shortened id was already seen
    are skipped.  At most ``n_seqs`` records are written when it is given.
    """
    if os.path.isfile(phy_fname):
        return

    records_phy = []
    seen_ids = set()
    for record in SeqIO.parse(fasta_fname, "fasta"):
        short_id = record.id[0:8]
        if short_id in seen_ids:
            continue
        record.id = short_id
        records_phy.append(record)
        seen_ids.add(short_id)
        if len(records_phy) == n_seqs:
            break

    aln = Align.MultipleSeqAlignment(records_phy)
    # The context manager closes the file even if the writer fails.
    with open(phy_fname, 'w') as handle:
        SequentialPhylipWriter(handle).write_alignment(aln)
def fake_alignment(T):
    """
    Fake alignment to appease treetime when only using it for naming nodes...

    This is lifted from refine.py and ideally could be imported

    Every terminal of the tree gets the placeholder sequence ``ACGT``.

    Parameters
    -------
    T : <class 'Bio.Phylo.BaseTree.Tree'>

    Returns
    -------
    <class 'Bio.Align.MultipleSeqAlignment'>
    """
    from Bio import SeqRecord, Seq, Align

    placeholders = [
        SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                            id=tip.name,
                            name=tip.name,
                            description='')
        for tip in T.get_terminals()
    ]
    return Align.MultipleSeqAlignment(placeholders)
def main():
    """Filter a FASTA alignment from stdin and write it to stdout.

    With ``args.removeseqs`` set, rows whose name matches that regular
    expression (anchored at the start, as with ``re.match``) are dropped.
    With ``args.removegapcols`` set, columns consisting only of ``-`` are
    dropped; if no column survives, a zero-column alignment is written.
    """
    args = getcliargs()

    # Read in alignment
    aln = AlignIO.read(sys.stdin, 'fasta')

    if args.removeseqs:
        pattern = re.compile(args.removeseqs)
        aln = Align.MultipleSeqAlignment(
            [rec for rec in aln if not pattern.match(rec.name)])

    if args.removegapcols:
        col_keep = [
            i for i in range(aln.get_alignment_length())
            if set(aln[:, i]) != {'-'}
        ]
        if col_keep:
            aln_keep = aln[:, col_keep[0]:col_keep[0] + 1]
            for i in col_keep[1:]:
                aln_keep += aln[:, i:i + 1]
        else:
            # every column is gap-only (or there are none): keep zero columns
            aln_keep = aln[:, 0:0]
        aln = aln_keep

    AlignIO.write(aln, sys.stdout, 'fasta')
dates[name] = float(date) except: failed_dates += 1 if len(dates) < failed_dates: print("\n\nDATE PARSING FAILED, ABORTING...") import sys sys.exit(1) ########################################################################### ### FAKING ALIGMENT IF NONE GIVEN ########################################################################### if params.aln is None: from Bio import Seq, SeqRecord, Align aln = Align.MultipleSeqAlignment([ SeqRecord.SeqRecord(Seq.Seq("AAA"), id=node, name=node) for node in dates ]) ########################################################################### ### ESTIMATE ROOT (if requested) AND DETERMINE TEMPORAL SIGNAL ########################################################################### base_name = '.'.join(params.tree.split('/')[-1].split('.')[:-1]) myTree = TreeTime(dates=dates, tree=params.tree, aln=aln, gtr='JC69', verbose=params.verbose) if not params.keep_root: myTree.reroot('best')
def save_results(tt, state, root):
    """Write all result files for ``tt`` into ``root`` and zip them.

    When ``tt`` is None nothing is written, ``state['status']`` is set to
    ``'Error'`` and ``(tt, False)`` is returned.  Otherwise the JSON, CSV,
    FASTA, newick and GTR outputs are written, a subset of them plus the
    already existing ``settings.json`` is packed into
    ``treetime_results.zip``, ``state['status']`` is set to ``'Done'`` and
    ``(tt, True)`` is returned.
    """
    print(root)
    if tt is None:
        state['status'] = 'Error'
        return tt, False

    def in_root(fname):
        # absolute location of an output file inside the result directory
        return os.path.join(root, fname)

    # save files
    treetime.treetime_to_json(tt, in_root("out_tree.json"))
    treetime.tips_data_to_json(tt, in_root("out_tips.json"))
    treetime.root_lh_to_json(tt, in_root("out_root_lh.json"))
    treetime.root_lh_to_csv(tt, in_root("out_root_lh.csv"))

    # save full alignment
    records = []
    for node in tt.tree.find_clades():
        records.append(SeqRecord.SeqRecord(Seq.Seq(''.join(node.sequence))))
    aln = Align.MultipleSeqAlignment(records)
    AlignIO.write(aln, in_root("out_aln.fasta"), "fasta")

    # save newick tree
    Phylo.write(tt.tree, in_root("out_newick_tree.nwk"), "newick")

    # save metadata as csv file
    treetime.save_all_nodes_metadata(tt, in_root("out_metadata.csv"))

    # save molecular clock: one row per tip that has both attributes
    clock_rows = []
    for tip in tt.tree.get_terminals():
        if hasattr(tip, 'dist2root') and hasattr(tip, 'numdate_given'):
            clock_rows.append((tip.dist2root, tip.numdate_given))
    mclock = np.array(clock_rows)
    np.savetxt(in_root('molecular_clock.csv'),
               mclock,
               delimiter=',',
               header='Distance_to_root,Sampling_date')

    # save GTR in csv file
    treetime.save_gtr_to_file(tt.gtr, in_root("out_gtr.txt"))

    # zip all results to one file, stored under their bare file names
    archive_members = (
        'out_newick_tree.nwk',
        'out_aln.fasta',
        'out_metadata.csv',
        'out_tree.json',
        'settings.json',
        'molecular_clock.csv',
        'out_root_lh.csv',
        'out_gtr.txt',
    )
    with zipfile.ZipFile(in_root('treetime_results.zip'), 'w') as out_zip:
        for member in archive_members:
            out_zip.write(in_root(member), arcname=member)

    state['status'] = 'Done'
    return tt, True
def do_abbababa(alignment, anc_sequence):
    """Count ABBA, BABA and BBAA site patterns per block (Python 2).

    The alignment is processed in windows of ``blocksize`` columns.  For
    every window one line is written to the module-level handles ``f_ab``
    and ``f_extra``: the id of the first row, the window start and end,
    followed by one group of counts per taxon combination (h1, h2, h3)
    with h1 < h2 and h3 different from both.  ``f_ab`` receives ABBA and
    BABA counts, ``f_extra`` additionally BBAA.  Classified sites are also
    registered on the module-level ``bt_positions``.

    alignment:    MultipleSeqAlignment of the taxa
    anc_sequence: SeqRecord of the ancestral sequence, same coordinates

    Returns 1.
    """
    i = 0
    chr = alignment[0].id
    n = len(alignment)
    # wrap the ancestor so it can be sliced by column like the alignment
    anc_sequence = Align.MultipleSeqAlignment([anc_sequence])
    print chr
    for i in xrange(0, len(alignment[1]), blocksize):
        block = alignment[:, i:i + blocksize]
        anc_block = anc_sequence[:, i:i + blocksize]
        f_ab.write("%s\t%i\t%i" % (chr, i, i + len(block[0]) - 1))
        f_extra.write("%s\t%i\t%i" % (chr, i, i + len(block[0]) - 1))
        c = 0
        for h3 in xrange(n):
            for h2 in xrange(n):
                if h2 == h3:
                    continue
                for h1 in xrange(n):
                    # h1 < h2 so each unordered (h1, h2) pair is used once
                    if (h1 == h3) or (h1 >= h2):
                        continue
                    print "combination %i of %i" % (c + 1, (factorial(n) / 2))
                    bt_positions.taxon_sequence = (h1, h2, h3)
                    c += 1
                    #print h1,h2,h3
                    abba = 0
                    baba = 0
                    bbaa = 0
                    snv = 0
                    for j in range(len(
                            block[1])):  # iterate over sites in alignments
                        s1 = block[h1, j]
                        s2 = block[h2, j]
                        s3 = block[h3, j]
                        s_anc = anc_block[0, j]
                        # print set([h1, h2, h3, h4])
                        #if len(set)
                        # fewer than two distinct states: all four are equal
                        if len(set([s1, s2, s3, s_anc])) < 2:
                            continue  # site not biallelic
                        badchar = False
                        for site in [s1, s2, s3,
                                     s_anc]:  # check for N and ambiguities
                            if site in "NYRKMWSBDHV-":
                                badchar = True
                                break
                        # if site in ["N", "Y", "R", "K", "M", "W", "S", "B", "D", "H", "V", "-"]:
                        #     continue
                        if badchar:
                            continue
                        if (s1 == s2
                            ) and s3 == s_anc and s1 != s3 and s2 != s_anc:
                            bbaa += 1
                            snv += 1
                            bt_positions.add_feature(chr, i + j, "BBAA")
                        elif s1 != s2 and s3 != s_anc:
                            if s1 == s3 and s2 == s_anc:
                                baba += 1
                                bt_positions.add_feature(chr, i + j, "BABA")
                            elif s2 == s3 and s1 == s_anc:
                                abba += 1
                                bt_positions.add_feature(chr, i + j, "ABBA")
                        # NOTE(review): this else is paired with the outer
                        # if/elif chain here; confirm against the upstream
                        # script that it was not meant for the inner one.
                        else:
                            snv += 1
                            bt_positions.add_feature(
                                chr, i + j,
                                "SNV")  # add non ABBABABA or BBAA SNV
                    f_ab.write("\t%i\t%i" % (abba, baba))
                    f_extra.write("\t%i\t%i\t%i" % (abba, baba, bbaa))
        f_ab.write("\n")
        f_extra.write("\n")
    return 1
for entry in tmp_aln: # if this alignment has a different size from the rest, something is reaaaaaly wrong! if len(entry) != aln_length: sys.exit('\t**Error, block "%s" has a different length than the rest of the MSA: %s' %(entry.name, aln)) if re.match('GC[AF]_', entry.name): genome, gene = entry.name.split('|') else: genome, gene = entry.name.split('_') concatenation[genome] += deepcopy(entry.seq) partitions.write('LG, %s = %i-%i\n' %(aln.replace('.fasta.aln', ''), current_position, current_position+aln_length-1) ) current_position += aln_length # # add gaps for those genomes missing this gene (same size as the expected alignment) for genome in genome_union.difference(genomes[aln]): concatenation[genome] += Align.Seq( '-' * aln_length, aln_alphabet ) missing_genes[genome] += aln_length partitions.close() # # remove genomes missing more than 20% of the marker genes for genome, num_missing_genes in missing_genes.items(): if num_missing_genes/total_genes > 0.1: print '\t\t**%s: excluded from analysis for missing %.2f from concatenated alignment!' %(genome, (num_missing_genes/total_genes)*100) concatenation.pop( genome ) AlignIO.write( Align.MultipleSeqAlignment( concatenation.values() ), '%s/concatenated_alignment.aln' %output_folder, 'fasta' )
def run(args):
    """Entry point of the refine command.

    Reads the tree, prepares the alignment input (a placeholder alignment
    when none is given, compressed sequences for VCF input, otherwise the
    path itself), then either infers a time tree via ``refine`` or only
    instantiates ``TreeAnc`` after optional outgroup rooting.  Branch
    lengths are optionally converted to mutation counts.  The tree is
    written as newick and the collected node data as JSON.

    Returns 0 on success and 1 on any reported error.  Raises TypeError
    when a rooting method that needs a time tree is requested without
    ``args.timetree``.
    """
    if args.seed is not None:
        np.random.seed(args.seed)

    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None

    # node data is the dict that will be exported as json
    node_data = {'alignment': args.alignment}
    # list of node attributes that are to be exported, will grow
    attributes = ['branch_length']

    try:
        T = read_tree(args.tree)
        node_data['input_tree'] = args.tree
    except (FileNotFoundError, InvalidTreeError) as error:
        print("ERROR: %s" % error, file=sys.stderr)
        return 1

    if not args.alignment:
        if args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference",
                file=sys.stderr)
            return 1

        if args.divergence_units == 'mutations':
            print(
                "ERROR: alignment is required for divergence in units of mutations",
                file=sys.stderr)
            return 1

        # fake alignment to appease treetime when only using it for naming nodes...
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(
                SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                    id=n.name,
                                    name=n.name,
                                    description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x)
              for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments",
                file=sys.stderr)
            return 1

        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        aln = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
    else:
        # plain alignment file: pass the path through unchanged
        aln = args.alignment

    from treetime import version as treetime_version
    print(f"augur refine is using TreeTime version {treetime_version}")

    # if not specified, construct default output file name with suffix _tt.nwk
    if args.output_tree:
        tree_fname = args.output_tree
    elif args.alignment:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'
    else:
        tree_fname = '.'.join(args.tree.split('.')[:-1]) + '_tt.nwk'

    if args.root and len(
            args.root
    ) == 1:  #if anything but a list of seqs, don't send as a list
        args.root = args.root[0]
    if args.keep_root:  # This flag overrides anything specified by 'root'
        args.root = None

    if args.timetree:
        # load meta data and convert dates to numeric
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction",
                file=sys.stderr)
            return 1
        metadata, columns = read_metadata(args.metadata)
        if args.year_bounds:
            args.year_bounds.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_format,
                                    min_max_year=args.year_bounds)

        # save input state string for later export
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        tt = refine(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.
            root,  # or 'best', # We now have a default in param spec - this just adds confusion.
            Tc=0.01 if args.coalescent is None else
            args.coalescent,  #use 0.01 as default coalescent time scale
            use_marginal=args.date_inference == 'marginal',
            branch_length_inference=args.branch_length_inference or 'auto',
            precision='auto' if args.precision is None else args.precision,
            clock_rate=args.clock_rate,
            clock_std=args.clock_std_dev,
            clock_filter_iqd=args.clock_filter_iqd,
            covariance=args.covariance,
            resolve_polytomies=(not args.keep_polytomies))

        node_data['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        if args.coalescent == 'skyline':
            try:
                skyline, conf = tt.merger_model.skyline_inferred(
                    gen=args.gen_per_year, confidence=2)
                node_data['skyline'] = [[float(x) for x in skyline.x],
                                        [float(y) for y in conf[0]],
                                        [float(y) for y in skyline.y],
                                        [float(y) for y in conf[1]]]
            # NOTE(review): the bare except reports every failure, including
            # programming errors, as a skyline optimization failure.
            except:
                print("ERROR: skyline optimization by TreeTime has failed.",
                      file=sys.stderr)
                return 1

        attributes.extend(
            ['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        if args.root:
            if args.root == 'best':
                print(
                    "Warning: To root without inferring a timetree, you must specify an explicit outgroup."
                )
                print(
                    "\tProceeding without re-rooting. To suppress this message, use '--keep-root'.\n"
                )
            elif args.root in ['least-squares', 'min_dev', 'oldest']:
                raise TypeError(
                    "The rooting option '%s' is only available when inferring a timetree. Please specify an explicit outgroup."
                    % args.root)
            else:
                T.root_with_outgroup(args.root)

        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    node_data['nodes'] = collect_node_data(T, attributes)

    if args.divergence_units == 'mutations-per-site':  #default
        pass
    elif args.divergence_units == 'mutations':
        # without a time tree no ancestral sequences were inferred above
        if not args.timetree:
            tt.infer_ancestral_sequences()

        nuc_map = profile_maps['nuc']

        def are_sequence_states_different(nuc1, nuc2):
            '''
            determine whether two ancestral states should count as mutation for divergence estimates
            while correctly accounting for ambiguous nucleotides
            '''
            if nuc1 in ['-', 'N'] or nuc2 in ['-', 'N']:
                return False
            elif nuc1 in nuc_map and nuc2 in nuc_map:
                # different only when the two profiles share no state
                return np.sum(nuc_map[nuc1] * nuc_map[nuc2]) == 0
            else:
                return False

        for node in T.find_clades():
            n_muts = len([
                position for ancestral, position, derived in node.mutations
                if are_sequence_states_different(ancestral, derived)
            ])

            if args.timetree:
                node_data['nodes'][node.name]['mutation_length'] = n_muts

            node_data['nodes'][node.name]['branch_length'] = n_muts
    else:
        print("ERROR: divergence unit",
              args.divergence_units,
              "not supported!",
              file=sys.stderr)
        return 1

    # Export refined tree and node data
    import json
    tree_success = Phylo.write(T,
                               tree_fname,
                               'newick',
                               format_branch_length='%1.8f')
    print("updated tree written to", tree_fname, file=sys.stdout)

    if args.output_node_data:
        node_data_fname = args.output_node_data
    elif args.alignment:
        node_data_fname = '.'.join(
            args.alignment.split('.')[:-1]) + '.node_data.json'
    else:
        node_data_fname = '.'.join(
            args.tree.split('.')[:-1]) + '.node_data.json'

    write_json(node_data, node_data_fname)
    print("node attributes written to", node_data_fname, file=sys.stdout)

    return 0 if tree_success else 1