def retrieve_alignment(tre, alnpath, taxonset=range(0, 101), delimiter='_'):
    """
    Build the MSA that corresponds to the input tree, padded to the full
    taxon set.

    Parameters
    ----------------
    tre : single-copy treeswift tree generated from James's code
    alnpath : path to the phylip formatted alignment of the genes. The row
        labels should be a superset of the leafset of 'tre'
    taxonset : iterable, the taxon set of the entire dataset. Entries are
        compared with the row labels through their string form, so both
        integer ids (such as the default range) and string ids are accepted.
    delimiter : separator used inside the row labels; only the part of a
        label before the first delimiter is kept as the record id

    Returns the MSA that corresponds to the input tree. Rows whose label is
    in `tre.labels(True, False)` are kept at full length (all columns of the
    input alignment), every taxon of `taxonset` that received no row gets an
    all-gap ("-") row, and the result is sorted with
    `MultipleSeqAlignment.sort()`.

    Raises KeyError when a kept row maps to a taxon id that is not in
    `taxonset`, or that was already consumed by an earlier row.
    """
    # Use a context manager so the alignment file handle is always closed.
    with open(alnpath) as handle:
        aln = AlignIO.read(handle, "phylip")
    seqlen = len(aln[0].seq)
    blank = "-" * seqlen
    whitelist = set(tre.labels(True, False))
    # Record ids are strings, so track the not-yet-seen taxa as strings too;
    # otherwise an integer taxon set (the default) can never match a row id.
    rest = set(str(taxon) for taxon in taxonset)
    res = MultipleSeqAlignment([])
    for r in aln:
        if r.id in whitelist:
            rid = r.id.split(delimiter)[0]
            res.append(SeqRecord(r.seq, id=rid))
            rest.remove(rid)
    # Pad every taxon that has no sequence in this gene with gaps.
    for rst in rest:
        res.append(SeqRecord(Seq(blank), id=rst))
    res.sort()
    return res
def dict_to_bioalignment(d, alphabet='generic_alphabet', sorted=True):
    """
    Turn a dict of id -> sequence pairs into a BioPython
    MultipleSeqAlignment.

    `alphabet` is the name of an attribute of `Bio.Alphabet`; the object it
    resolves to is attached to every sequence. When `sorted` is true the
    alignment's `sort()` method is called before it is returned.
    """
    seq_alphabet = getattr(Bio.Alphabet, alphabet)
    msa = MultipleSeqAlignment([])
    for name, sequence in d.items():
        msa.append(SeqRecord(Seq(sequence, alphabet=seq_alphabet), id=name))
    if not sorted:
        return msa
    msa.sort()
    return msa
def impute_ancestors_dnapars(seqs, gl_seq, scratch_dir, gl_name='germline', verbose=True):
    """
    Compute ancestral states via maximum parsimony

    Writes a phylip alignment and a dnapars option file into `scratch_dir`,
    runs the external `dnapars` program there, and reads the first
    parsimony tree back from its `outfile`.

    @param seqs: list of sequences
    @param gl_seq: germline sequence
    @param scratch_dir: where to write intermediate dnapars files
    @param gl_name: name of germline (must be less than 10 characters long)
    @param verbose: if true, print the equivalent shell command before
        running dnapars

    @return genes_line: information needed to output imputed germline data;
        a list of unique [parent name, parent sequence] pairs
    @return seqs_line: information needed to output imputed sequence data;
        a list of [parent name, node name, node sequence] triples
    """
    from gctree.bin.phylip_parse import parse_outfile

    assert (len(gl_name) < 10)

    infile, config, outfile = [
        os.path.join(scratch_dir, fname) for fname in [
            'infile',
            'dnapars.cfg',
            'outfile',
        ]
    ]

    aln = MultipleSeqAlignment([SeqRecord(Seq(gl_seq), id=gl_name)])

    # sequence ID must be less than ten characters, but also dnapars sets internal node
    # names to 1, 2, 3, ..., so name them numbers descending from 100 million, hoping
    # we won't ever have a clone that big...
    for idx, seq in enumerate(seqs):
        aln.append(SeqRecord(Seq(seq), id=str(99999999 - idx)))

    # dnapars uses the name "infile" as default input phylip file
    phylip_text = aln.format('phylip')
    with open(infile, 'w') as phylip_file:
        phylip_file.write(phylip_text)

    # and we need to tell it the line where the root sequence occurs.
    # The germline is the first record, so stop at the first match: a later
    # numeric id that happens to start with gl_name must not override it.
    naive_idx = None
    for lineno, line in enumerate(phylip_text.splitlines()):
        if line.startswith(gl_name):
            naive_idx = str(lineno)
            break
    if naive_idx is None:
        raise ValueError(
            'germline name %r not found in phylip file %s' % (gl_name, infile))

    # arcane user options for dnapars
    # 'O', naive_idx: the location of the outgroup root
    # 'S', 'Y': less thorough search; runs much faster but output is less exhaustive
    # 'J', 13, 10: randomize input ("jumble") using seed 13 and jumbling 10 times
    # 4: print out steps in each site (to get all nucleotide info)
    # 5: print sequences in at all nodes (to get ancestors)
    # '.': use dot-differencing for display
    # 'Y': accept these options
    with open(config, 'w') as cfg_file:
        cfg_file.write('\n'.join(
            ['O', naive_idx, 'S', 'Y', 'J', '13', '10', '4', '5', '.', 'Y']))

    # defer to dnapars to construct parsimony trees and ancestral states
    # dnapars has weird behavior if outfile and outtree already exist o_O
    for stale_name in ('outfile', 'outtree'):
        stale_path = os.path.join(scratch_dir, stale_name)
        if os.path.exists(stale_path):
            os.remove(stale_path)

    if verbose:
        # Same text the shell-based invocation used to print; built as one
        # string so the output is identical under Python 2 and Python 3.
        print("Calling: " + " ".join([
            'cd', scratch_dir,
            '&& rm -f outfile outtree && dnapars <',
            os.path.basename(config),
            '> dnapars.log',
        ]))

    # Run dnapars directly (no shell) so that a scratch_dir containing
    # spaces or shell metacharacters cannot break or alter the command.
    # NOTE(review): the exit status is ignored, as before; a failed run
    # surfaces when the outfile is parsed below.
    log_path = os.path.join(scratch_dir, 'dnapars.log')
    with open(config, 'r') as cfg_in, open(log_path, 'w') as log_out:
        subprocess.call(
            ['dnapars'], cwd=scratch_dir, stdin=cfg_in, stdout=log_out)

    # phew, finally got some trees
    trees = parse_outfile(outfile, countfile=None, naive=gl_name)

    # take first parsimony tree
    genes_line = []
    seq_line = []
    # Set of (name, sequence) pairs already emitted, so the uniqueness check
    # does not rescan genes_line for every node.
    seen_parents = set()
    for descendant in trees[0].traverse('preorder'):
        if descendant.is_root():
            descendant.name = gl_name
            continue

        parent = descendant.up
        # use dummy name for internal node sequences; preorder traversal
        # guarantees the parent was already renamed
        descendant.name = '-'.join([parent.name, descendant.name])
        parent_seq = parent.sequence.lower()
        parent_key = (parent.name, parent_seq)
        if parent_key not in seen_parents:
            seen_parents.add(parent_key)
            genes_line.append([parent.name, parent_seq])
        seq_line.append(
            [parent.name, descendant.name, descendant.sequence.lower()])

    return genes_line, seq_line