def filter_phyloxml(tree, species_list):
    """Prune *tree* down to the leaves whose genome is in *species_list*.

    A terminal node is kept when one of its properties has the reference
    ``'Compara:genome_db_name'`` and a value contained in *species_list*;
    every other terminal is pruned from *tree* in place.  The single
    sequence of each kept leaf is converted to a SeqRecord (id set to the
    node name, description blanked) and collected into an alignment.

    Returns a 3-tuple: the matched species names as a tuple, the collected
    alignment after it has been passed through ``remove_gap_columns``, and
    the pruned *tree* itself.
    """
    kept_species = set()
    alignment = MultipleSeqAlignment([])
    for leaf in tree.get_terminals():
        for prop in leaf.properties:
            if prop.ref == 'Compara:genome_db_name' and prop.value in species_list:
                kept_species.add(prop.value)
                break
        else:
            # no property named a wanted genome: drop this leaf from the tree
            tree.prune(leaf)
            continue
        assert len(leaf.sequences) == 1
        record = leaf.sequences[0].to_seqrecord()
        record.id = leaf.name
        record.description = ""
        alignment.append(record)
    return tuple(kept_species), remove_gap_columns(alignment), tree
def stage_one_trimming(self, alignment, window_size, threshold, proportion):
    """First stage (of 3) of alignment trimming: trim the alignment edges.

    Calls ``self.running_average`` to determine one start and one end
    column for the entire alignment block and slices every row to that
    window.

    Returns the trimmed MultipleSeqAlignment, or ``None`` as soon as the
    trim positions are unusable (``start`` negative or ``end`` falsy) or a
    trimmed row consists solely of ``-`` or solely of ``?``.
    """
    # get the trim positions that we determine begin and end "good"
    # alignments
    start, end = self.running_average(alignment, window_size, threshold, proportion)
    # create a new alignment object to hold our alignment
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-"))
    for sequence in alignment:
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        letters = set(trim)
        # ensure we don't just add a taxon with only gaps/missing data.
        # Both tests must compare set with set: a set never equals a list,
        # so the earlier ``!= (['?'])`` check was always true and let
        # all-'?' rows through.
        if letters == set(['-']) or letters == set(['?']):
            return None
        s1_trimmed.append(trim)
    return s1_trimmed
def clean_seqs(gene):
    '''Keep only the records of *gene* that are low in N and - characters.

    A record is retained when its combined count of "N" and "-" is strictly
    below 10% of its sequence length.  The retained records are returned,
    unmodified, in a new MultipleSeqAlignment.
    '''
    kept = MultipleSeqAlignment([])
    for record in gene:
        sequence = record.seq
        n_masked = sequence.count("N") + sequence.count("-")
        if n_masked >= 0.1 * len(sequence):
            continue
        kept.append(record)
    return kept
def maskalignment(self, arg, percent, filetype):
    """Drop gap-rich columns from a Guidance alignment and write it as FASTA.

    Reads ``<PathtoOutput>/Guidance/GuidanceOutput/<arg>`` in format
    *filetype*, discards every column whose number of ``-`` characters
    exceeds ``numRows - int(percent * numRows / 100)`` and writes the
    remaining columns to
    ``<PathtoOutput>/Guidance/GuidanceOutput/<arg[0:10]>_masked_<percent>.fas``.

    Returns the masked alignment as a MultipleSeqAlignment.  (It used to be
    built and then discarded; callers that ignore the result are unaffected.)
    """
    guidance_dir = self.PathtoOutput + '/Guidance/GuidanceOutput/'
    maskFileName = guidance_dir + arg[0:10] + '_masked_' + str(percent) + '.fas'
    alignment = AlignIO.read(guidance_dir + arg, filetype)

    numRows = len(alignment)
    # number of gapped sequences allowed at a given site
    numGap = numRows - int(float(percent) * float(numRows) / 100.0)

    colToKeep = []
    for i in range(alignment.get_alignment_length()):
        if alignment[:, i].count('-') > numGap:
            print("get rid of column %i" % i)
        else:
            colToKeep.append(i)

    trimAlign = MultipleSeqAlignment([])
    # The output is opened only after the input has been read, and inside a
    # context manager, so the handle is always closed and a failed read no
    # longer leaves an empty output file behind.
    with open(maskFileName, 'w') as outFile:
        for record in alignment:
            newseq = "".join(record[i] for i in colToKeep)
            trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
            outFile.write('>' + record.id + '\n' + newseq + '\n')
    return trimAlign
def writing(seqs, seq_descs, seq_ids, filename):
    """Write protein sequences as FASTA to ``<argv[3]>/<argv[1]>/<filename>.output``.

    Args:
        seqs: sequence strings, one per output record.
        seq_descs: record descriptions, indexed in parallel with *seqs*.
        seq_ids: record ids, indexed in parallel with *seqs*.
        filename: base name of the output file (``.output`` is appended).

    The output directory is read from ``sys.argv[3]`` and the sub-folder
    name from ``sys.argv[1]``; both directories are created when missing.
    """
    # Directories are created with os.makedirs instead of shelling out to
    # ``mkdir``: command-line values are never interpreted by a shell, and
    # names containing spaces work.
    outdir = sys.argv[3]  # Output directory
    if os.path.isdir(outdir):
        print("Directory exists. New directory not created")
    else:
        os.makedirs(outdir)
    # outpath defines path of the subfolder we want to store results in
    outpath = outdir + '/' + sys.argv[1]
    if not os.path.isdir(outpath):
        os.makedirs(outpath)
    # write the result to output
    align = MultipleSeqAlignment([])
    for i, seq in enumerate(seqs):
        align.append(SeqRecord(Seq(seq, generic_protein),
                               id=seq_ids[i],
                               description=seq_descs[i]))
    output_file = outpath + '/' + filename + '.' + 'output'
    AlignIO.write(align, output_file, "fasta")
def _concatenate(self, alignments):
    """Return single alignment from list of alignments for multiple genes.

    Rows are matched across genes by record id.  A taxon that is absent
    from a gene is padded with ``-`` for that gene's full alignment length.
    Rows of the result are ordered by sorted id, so the output is
    reproducible from run to run.

    A one-element list is returned as its sole alignment, unchanged.
    """
    if len(alignments) == 1:
        return alignments[0]
    # Map each gene's record ids to row numbers.  setdefault keeps the first
    # row when an id is repeated, which is what list.index() used to return.
    rows_by_gene = []
    for gene in alignments:
        rows = {}
        for index, rec in enumerate(gene):
            rows.setdefault(rec.id, index)
        rows_by_gene.append(rows)
    all_ids = sorted(set().union(*rows_by_gene))
    # concatenate
    alignment = MultipleSeqAlignment([])
    for txid in all_ids:
        sequence = ""
        for gene, rows in zip(alignments, rows_by_gene):
            if txid in rows:
                sequence += gene[rows[txid]].seq
            else:
                sequence += "-" * gene.get_alignment_length()
        alignment.append(SeqRecord(sequence, id=txid,
                                   description="multigene sequence"))
    return alignment
def main():
    """Strip the locus-name prefix from taxon names in each nexus alignment.

    For every file returned by ``get_files(args.input)``, each record name
    has a leading ``<file stem>`` (plus any underscores that follow it)
    removed; the renamed records are written in nexus format to
    ``args.output`` under the input's file name.  After each file the
    running set of taxon names must contain exactly ``args.taxa`` entries.
    """
    args = get_args()
    files = get_files(args.input)
    all_taxa = set([])
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        # The stem is matched literally, so escape it: characters such as
        # '.' or '+' in a file name must not act as regex syntax.
        fname = os.path.splitext(os.path.basename(f))[0]
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in list(align):
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # context manager so the output handle is flushed and closed
            with open(outf, 'w') as handle:
                AlignIO.write(new_align, handle, 'nexus')
        except ValueError:
            # NOTE(review): interactive debugging hook kept from the
            # original; consider reporting the file and re-raising instead.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
def _prepResults(self):
    """Convert the collected hits into alignments appended to ``self.outList``.

    When there are no hits and ``self.state['eof']`` is false, a single
    empty MultipleSeqAlignment is appended.  In addition, one two-row
    ``HMMERAlign`` (query row, target row) is appended per domain of every
    hit.  The domain's entry in ``self.hitRecord`` is reused as the
    alignment's ``_annotations`` and gains the keys ``seqName`` and
    ``hmmName``.
    """
    if not len(self.hitInfo) and not self.state['eof']:
        self.outList.append(MultipleSeqAlignment([], self.alphabet))

    for hit in self.hitInfo:
        for domain in self.hitRecord[hit]:
            columns = self.alignMap[hit][domain]
            queryStr = "".join(columns['query'])
            targetStr = "".join(columns['target'])
            rows = [
                SeqRecord(Seq(queryStr, self.alphabet),
                          id=self.state['queryName'],
                          description=self.state.get('desc', ""),
                          annotations={}),
                SeqRecord(Seq(targetStr, self.alphabet),
                          id=hit,
                          annotations={}),
            ]
            alignment = HMMERAlign(rows, self.alphabet)
            # the hit record itself becomes the annotation dict (no copy)
            alignment._annotations = self.hitRecord[hit][domain]
            alignment._annotations['seqName'] = self.state['queryName']
            alignment._annotations['hmmName'] = hit
            self.outList.append(alignment)
def pad_nucleotide_sequences(aln_aa, seq_nuc):
    '''
    Thread unaligned nucleotide sequences onto an amino acid alignment.

    Every gap in an aligned amino acid row becomes a gap of 3 (---) and
    every residue takes the next three nucleotides of the matching sequence.

    Parameters:
    - aln_aa: amino acid alignment
    - seq_nuc: unaligned nucleotide sequences, looked up by record id;
      rows whose id is not found are reported and skipped.

    Returns:
    - aligned nucleotide sequences with all gaps length 3
    '''
    from Bio.Align import MultipleSeqAlignment
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    codon_aln = MultipleSeqAlignment([])
    for aa_record in aln_aa:
        try:
            nucleotides = str(seq_nuc[aa_record.id].seq)
        except KeyError:
            print(aa_record.id)
            print('Key not found, continue with next sequence')
            continue
        pieces = []
        cursor = 0
        for residue in aa_record:
            if residue == '-':
                pieces.append('---')
            else:
                pieces.append(nucleotides[cursor:cursor + 3])
                cursor += 3
        codon_aln.append(SeqRecord(seq=Seq(''.join(pieces)), id=aa_record.id))
    return codon_aln
def maskResiduesNOMAP(refMSA_file, numseq, alnlen, scores, x, formatout, final_file, seqType):
    '''Masks poorly aligned residues whose score is <x. Will NOT mask gaps.

    Reads the FASTA alignment *refMSA_file* and, for the first *numseq*
    rows and *alnlen* columns, replaces every non-gap residue whose
    ``scores[row][column]`` is below *x* with "?".  The masked rows are
    written to *final_file* in format *formatout*, with record ids
    "1" .. "numseq".

    seqType selects the alphabet and must be 'protein' or 'dna'; any other
    value raises ValueError (previously it surfaced as an unbound-variable
    error, or silently reused the previous record).
    '''
    mask_char = '?'
    alphabets = {'protein': generic_protein, 'dna': generic_dna}
    threshold = float(x)
    parsed = AlignIO.read(refMSA_file, 'fasta')
    maskedMSA = MultipleSeqAlignment([])
    for row in range(numseq):
        residues = parsed[row].seq
        masked = []
        for position in range(alnlen):
            residue = str(residues[position])
            # gaps are always kept; residues are masked below the threshold
            if residue != '-' and float(scores[row][position]) < threshold:
                masked.append(mask_char)
            else:
                masked.append(residue)
        try:
            alphabet = alphabets[str(seqType)]
        except KeyError:
            raise ValueError("seqType must be 'protein' or 'dna', got %r" % (seqType,))
        maskedMSA.append(SeqRecord(Seq(''.join(masked), alphabet),
                                   id=str(row + 1), description=''))
    # context manager: the handle is closed even if formatting fails
    with open(final_file, 'w') as outhandle:
        outhandle.write(maskedMSA.format(str(formatout)))
def NexusIterator(handle, seq_count=None):
    """Yield the single MultipleSeqAlignment parsed from a Nexus file.

    This uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one MultipleSeqAlignment.
    Nothing is yielded when the file holds no matrix.

    Raises ValueError if *seq_count* is given and differs from the number
    of sequences found.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        # No alignment found.  A plain return ends the generator; raising
        # StopIteration here is turned into RuntimeError by PEP 479.
        return
    # records come first, the alphabet second
    alignment = MultipleSeqAlignment([], n.alphabet)

    # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    # The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i"
                         % (len(n.unaltered_taxlabels), seq_count))

    for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels):
        assert new_name.startswith(old_name)
        seq = n.matrix[new_name]  # already a Seq object with the alphabet set
        # ToDo - Can we extract any annotation too?
        alignment.append(SeqRecord(seq, id=new_name, name=old_name,
                                   description=""))
    # All done
    yield alignment
def main():
    """Filter, summarise and write every alignment named in the parameters.

    For each entry of ``params['filenames']`` the alignment is filtered
    with ``filterBlocks`` and summarised with ``calculateMetadata``.  When
    ``params['H']`` is set, an alignment consisting of two annotation rows
    (valid blocks, heterozygosity score) followed by the filtered records is
    turned into the initial JSON for HTML output.  With ``params['debug']``
    the alignment is printed.  Results go to ``writeAlign`` under the
    input's base name, minus its last extension, plus ``-out``.
    """
    params = parseArguments(version)
    for filename in params['filenames']:
        alignment, info = filterBlocks(filename, params)
        metadata = calculateMetadata(alignment, info)

        if params['H']:
            # Generate JSON for HTML output, add the "valid blocks" sequence
            # to the initial one.
            annotation_rows = [
                SeqRecord(seq=Seq(metadata['validString']),
                          id='Valid blocks', name='Valid Blocks'),
                SeqRecord(seq=Seq(metadata['scoreString']),
                          id='Score', name='Heterozygosity Score'),
            ]
            jsonAlignment = MultipleSeqAlignment(annotation_rows)
            for record in alignment:
                jsonAlignment.append(record)
            initialJson = getInitialJson(jsonAlignment)
        else:
            initialJson = ''

        if params['debug']:
            printAlign(alignment, info)

        # not used below; the lookup is retained from the original
        blocks = metadata['blocks']
        stem = '.'.join(filename.split('/')[-1].split('.')[:-1])
        writeAlign(alignment, metadata, stem + '-out', initialJson, info, params)
def stage_one_trimming(self, alignment, window_size, proportion, threshold, min_len, replace_ends=False):
    """First stage (of 3) of alignment trimming: trim the alignment edges.

    Calls ``self.running_average`` to determine one start and one end
    column for the entire alignment block and slices every row to that
    window.  With *replace_ends* set, each trimmed row is additionally
    passed through ``self._replace_ends`` and rebuilt with
    ``self._record_formatter``.

    Returns the trimmed MultipleSeqAlignment, or ``None`` as soon as the
    trim positions are unusable (``start`` negative or ``end`` falsy) or a
    trimmed row is shorter than *min_len*, consists solely of ``-`` or
    consists solely of ``?``.

    Side effect: the alphabet of every visited input sequence is reset to
    IUPAC ambiguous DNA.
    """
    # get the trim positions that we determine begin and end "good"
    # alignments
    start, end = self.running_average(alignment, window_size, proportion, threshold)
    # create a new alignment object to hold our alignment
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        # ensure correct sequence alphabet or we'll get a conflict when
        # we try to generate a consensus
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            return None
        trim = sequence[start:end]
        letters = set(trim)
        # ensure we don't just add a taxon with only gaps/missing data and
        # that alignments are >= min_len.  Both character tests must compare
        # set with set: a set never equals a list, so the earlier
        # ``!= (['?'])`` check was always true and let all-'?' rows through.
        if letters == set(['-']) or letters == set(['?']) or len(trim) < min_len:
            return None
        if not replace_ends:
            s1_trimmed.append(trim)
        else:
            # replace end gaps with missing data character ?
            # called on third iteration of trimming
            repl = self._replace_ends(str(trim.seq))
            s1_trimmed.append(self._record_formatter(repl, sequence.id))
    return s1_trimmed
def add_gaps_to_align(aln, organisms, check_missing, missing, verbatim=False, min_taxa=3):
    """Pad an alignment with all-'?' rows for the organisms it lacks.

    Args:
        aln: input alignment; its records are renamed in place when they
            carry the ``_R_`` reversal prefix.
        organisms: names of every organism expected in the output.  Each
            derived taxon name must be present (it is removed from a private
            copy; ``list.remove`` raises ValueError otherwise).
        check_missing: when true and *missing* is non-empty, assert that the
            locus is listed for every organism being padded.
        missing: mapping from organism (or ``'<organism>*'``) to loci.
        verbatim: when false, record names are split on ``_`` into locus
            (first field) and taxon (the rest); when true, the lower-cased
            record name is the taxon and the record name is the locus.
        min_taxa: minimum number of rows required in *aln*.

    Returns:
        A new MultipleSeqAlignment with one row per organism, or ``None``
        when *aln* has fewer than *min_taxa* rows.
    """
    if len(aln) < min_taxa:
        return None
    local_organisms = copy.deepcopy(organisms)
    new_align = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    overall_length = len(aln[0])
    for seq in aln:
        # strip the reversal marker added by mafft.  str.lstrip('_R_')
        # removes any leading run of the characters '_' and 'R', so it also
        # ate the start of names such as 'RAG1_...'; drop only the literal
        # prefix.
        if seq.name.startswith('_R_'):
            seq.name = seq.name[len('_R_'):]
        if not verbatim:
            new_seq_name = '_'.join(seq.name.split('_')[1:])
        else:
            new_seq_name = seq.name.lower()
        new_align.append(record_formatter(str(seq.seq), new_seq_name))
        local_organisms.remove(new_seq_name)
    if local_organisms:
        # the locus name is taken from the last aligned record and is the
        # same for every organism being padded
        loc = seq.name if verbatim else seq.name.split('_')[0]
        missing_string = '?' * overall_length
        for org in local_organisms:
            if check_missing and missing:
                try:
                    assert loc in missing[org], "Locus missing"
                except (AssertionError, KeyError):
                    # fall back to the starred spelling of the organism
                    assert loc in missing['{}*'.format(org)], "Locus missing"
            new_align.append(record_formatter(missing_string, org))
    return new_align
def propmat(alignment, num_imp, num_changes, transitions, probs):
    """Propose *num_changes* random substitutions in the imputed rows.

    The last *num_imp* rows of *alignment* are treated as imputed.  Each
    target is a pair ``(imputed row index, column)``: the row is drawn
    uniformly and the column comes from ``wl_one(probs)``.  For every
    target the current character is replaced by
    ``weightselect(transitions[old])`` in a copy of ``alignment.distarray``
    and a copy of ``alignment.pd`` is adjusted along that row and column;
    the diagonal of the copy is zeroed afterwards.
    NOTE(review): ``pd`` looks like a pairwise difference matrix -- confirm.

    Returns ``(number of changes applied, new alignment, targets)``; the new
    alignment carries the updated ``pd`` and ``distarray`` attributes and
    rebuilt records for every touched imputed row.
    """
    num_changes = int(num_changes)
    n_orig = len(alignment) - num_imp
    pd_new = copy(alignment.pd)
    dist_new = copy(alignment.distarray)
    orig_records = list(alignment[:n_orig])
    imp_records = list(alignment[n_orig:])
    targets = [(random.randint(0, len(imp_records) - 1), wl_one(probs))
               for i in xrange(num_changes)]
    applied = 0
    for imp_row, col in targets:
        row = n_orig + imp_row
        old = dist_new[row, col]
        new = weightselect(transitions[old])
        dist_new[row, col] = new
        column = dist_new[:, col]
        # per row: 1 where it still shows the old character, minus 1 where
        # it shows the new one
        changes = (column == old).astype(int) - (column == new).astype(int)
        pd_new[row] += changes
        pd_new[:, row] += changes
        applied += 1
    np.fill_diagonal(pd_new, 0)
    # rebuild the record of every imputed row that was touched at least once
    for imp_row in Counter([t[0] for t in targets]).keys():
        previous = imp_records[imp_row]
        imp_records[imp_row] = SeqRecord(
            Seq(''.join(dist_new[imp_row + n_orig])),
            id=previous.id,
            name=previous.name,
            description=previous.description,
            annotations=previous.annotations)
    newalign = MultipleSeqAlignment(orig_records + imp_records)
    newalign.pd = pd_new
    newalign.distarray = dist_new
    return applied, newalign, targets
def replace_gaps(aln):
    """Rebuild *aln* with every sequence run through the end-gap replacer.

    Each row's sequence is passed to ``replace_gaps_at_start_and_ends``
    (presumably rewriting leading and trailing gaps, judging by its name)
    and wrapped in a new SeqRecord that keeps the original id, name and
    description.  The rows are returned in a new generic-DNA alignment.
    """
    rebuilt = MultipleSeqAlignment([], generic_dna)
    for taxon in aln:
        record = SeqRecord(
            replace_gaps_at_start_and_ends(taxon.seq),
            id=taxon.id,
            name=taxon.name,
            description=taxon.description,
        )
        rebuilt.append(record)
    return rebuilt
def to_generic(self, alphabet):
    """Retrieve generic alignment object for the given alignment.

    Instead of the tuples, this returns a MultipleSeqAlignment object
    from Bio.Align, through which you can manipulate and query
    the object.

    alphabet is the specified alphabet for the sequences in the code (for
    example IUPAC.IUPACProtein).

    ``self.alignment`` is read as ``(name, start, seq, end)`` tuples laid
    out block after block.  Rows of the first block define the names and
    row order; the sequence pieces of each later block are appended to
    those rows by position.

    Thanks to James Casbon for the code.
    """
    names = []
    chunks = []
    block_count = 0
    row = 0
    for name, _start, seq, _end in self.alignment:
        if name == 'QUERY':
            # QUERY is the first in each alignment block
            block_count += 1
            row = 0
        if block_count == 1:
            # create on first block, append on all others
            chunks.append(seq)
            names.append(name)
        else:
            chunks[row] += seq
            row += 1

    generic = MultipleSeqAlignment([], alphabet)
    for row_name, row_seq in zip(names, chunks):
        generic.append(SeqRecord(Seq(row_seq, alphabet), row_name))
    return generic
def json_to_Bio_alignment(seq_json):
    """Build a MultipleSeqAlignment from dicts with 'strain' and 'seq' keys.

    Each entry of *seq_json* becomes one row: the ``'seq'`` value is the
    sequence and the ``'strain'`` value is used as both record id and name.
    """
    from Bio.Align import MultipleSeqAlignment
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    alignment = MultipleSeqAlignment([])
    for entry in seq_json:
        strain = entry['strain']
        alignment.append(SeqRecord(Seq(entry['seq']), id=strain, name=strain))
    return alignment
def gap_span(reads, bases):
    """Return an MSA with rows=reads and columns=bases, composed of gaps only.

    *reads* supplies the record ids, one row each; every row is *bases*
    ``-`` characters long.  Sequences use the module-level ``alphabet``.
    """
    # MultipleSeqAlignment takes the records first and the alphabet second.
    # Passing the alphabet as the only positional argument relied on a
    # deprecated backwards-compatibility mode of the constructor.
    spal = MultipleSeqAlignment([], alphabet)
    span = "-" * bases
    for r in reads:
        spal.append(Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(span, alphabet), id=r))
    return spal
def stage_two_trimming(self, s1_trimmed, window_size=5):
    """Alignment row-by-row trimming.

    After stage one trimming, iterate over rows of alignment to find
    differences between the alignment consensus and the row of data.  Trim
    those ends coming before (or after at 3' end) a block of 5 contiguous
    highly conserved positions.  Goes to third round of filtering to remove
    edges that end up with only '----' characters to start or end alignment
    block.

    Returns the trimmed MultipleSeqAlignment, or ``None`` as soon as a row
    has no such conserved block at all, or is left consisting solely of
    ``-`` or solely of ``?``.
    """
    # length of the run of conserved positions that anchors each end
    anchor_len = 5
    # create new alignment object to hold trimmed alignment
    s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    # get consensus of alignment in array form
    consensus_array = numpy.array(list(self._alignment_consensus(s1_trimmed)))
    # averaging kernel; identical for every row
    weight = numpy.repeat(1.0, window_size) / window_size
    # iterate over each alignment sequence
    for sequence in s1_trimmed:
        start, end = self._get_ends(sequence)
        # convert sequence to array
        orig_seq_array = numpy.array(list(sequence))
        # trim down edge gaps so they do not exert undue influence
        # on the running average
        seq_array = orig_seq_array[start:end]
        compare = (seq_array == consensus_array[start:end])
        # compute running average across window size
        running_average = numpy.convolve(compare, weight, 'same')
        # get first 5' and 3' positions where quality > 1 over
        # 5 positions ([True, True, True, True, True]).  This helps
        # us find the ends of the alignment where there are likely
        # problems)
        gm = (running_average > 0.99)
        bad_start = None
        for i in range(gm.size):
            if numpy.all(gm[i:i + anchor_len]):
                bad_start = i
                break
        bad_end = None
        reversed_gm = gm[::-1]
        for i in range(reversed_gm.size):
            if numpy.all(reversed_gm[i:i + anchor_len]):
                bad_end = reversed_gm.size - i
                break
        if bad_start is None or bad_end is None:
            # No conserved block anchors this row.  Previously the bounds
            # were then undefined (first row) or silently reused from the
            # preceding row; reject the alignment like any other row that
            # holds no usable data.
            return None
        orig_seq_array[:start + bad_start] = '-'
        orig_seq_array[start + bad_end:] = '-'
        trim = ''.join(orig_seq_array)
        letters = set(trim)
        # compare set with set: a set never equals a list, so the earlier
        # ``!= (['?'])`` check was always true
        if letters == set(['-']) or letters == set(['?']):
            return None
        s2_trimmed.append(self._record_formatter(trim, sequence.id))
    return s2_trimmed
def __init__(self, records="", name=None, alphabet=default_codon_alphabet):
    """Initialize the alignment and validate that it is made of codons.

    Args:
        records: SeqRecord objects whose ``seq`` must be CodonSeq instances.
        name: accepted for interface compatibility; not used here.
        alphabet: alphabet passed on to MultipleSeqAlignment.

    Raises:
        TypeError: if any record's sequence is not a CodonSeq.
        AssertionError: if the alignment length is not a multiple of three.
    """
    MultipleSeqAlignment.__init__(self, records, alphabet=alphabet)

    # check the type of the alignment to be nucleotide
    for rec in self:
        if not isinstance(rec.seq, CodonSeq):
            raise TypeError("CodonSeq objects are expected in each "
                            "SeqRecord in CodonAlignment")

    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type is unchanged for callers.
    if self.get_alignment_length() % 3 != 0:
        raise AssertionError("Alignment length is not a triple number")
def test_proteins(self):
    """Check SummaryInfo results for a small gapped protein alignment.

    Builds three 32-column protein rows containing gaps and stop symbols,
    then verifies the dumb consensus, the gap consensus, the
    position-specific score matrix computed against the gap consensus, and
    the information content.
    """
    # protein alphabet extended with the '-' gap and the '*' stop symbol
    alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
    a = MultipleSeqAlignment([
        SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
        SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
        SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")])
    self.assertEqual(32, a.get_alignment_length())

    s = SummaryInfo(a)

    # the two consensus flavours, with "X" as the ambiguity symbol
    c = s.dumb_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

    c = s.gap_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

    # PSSM with one row per gap-consensus position; gaps and stops are not
    # counted.  The expected text is compared verbatim, whitespace included,
    # so the literal below must not be reflowed.
    m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
    self.assertEqual(str(m), """ A D E F G H I K L M N P Q R S W Y M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 """)

    # information content, again leaving gaps and stops out
    ic = s.information_content(chars_to_ignore=['-', '*'])
    self.assertAlmostEqual(ic, 133.061475107, places=6)
def bam2Alignment(sam_name, chrom=None, start=None, stop=None, minlen=1):
    """Read alignment from samfile and return Alignment object.

    Fetches the reads of *sam_name* for ``chrom``, ``start``, ``stop`` and
    keeps those that pass both length checks against *minlen*; each kept
    read is converted with ``getSeqRecord`` and appended to the result,
    which uses the module-level ``alphabet``.

    NOTE(review): *start* and *stop* default to ``None`` but take part in
    arithmetic for every fetched read, so callers presumably always pass
    integers -- confirm before relying on the defaults.
    """
    it = sam_name.fetch(chrom, start, stop)
    # MultipleSeqAlignment takes the records first and the alphabet second.
    # Passing the alphabet as the only positional argument relied on a
    # deprecated backwards-compatibility mode of the constructor.
    aln = MultipleSeqAlignment([], alphabet)
    for read in it:
        if read.rlen - start + read.pos + 1 > minlen and stop - read.pos + 1 >= minlen:
            aln.append(getSeqRecord(read, start=start, stop=stop))
    return aln
def refactor_title_allmsa(msa):
    """Reorder the ``|``-separated id fields of each record for histoneDB seeds.

    Every record id is searched for ``<a>|<digits>|<c>`` and rewritten in
    place as ``<c>|<a>|<digits>``; each record's description is printed
    along the way.  Returns a new MultipleSeqAlignment holding the same
    (now renamed) record objects.
    """
    id_fields = re.compile(r"(\S+)\|(\d+)\|(\S+)")
    renamed = MultipleSeqAlignment([])
    for record in msa:
        print(record.description)
        first, number, last = id_fields.search(record.id).groups()
        record.id = "|".join((last, first, number))
        renamed.append(record)
    return renamed
def __init__(self, records='', name=None, alphabet=default_codon_alphabet):
    """Initialize the class.

    Delegates to MultipleSeqAlignment, then validates the result: raises
    TypeError unless every record's ``seq`` is a CodonSeq, and ValueError
    unless the alignment length is a multiple of three.
    """
    MultipleSeqAlignment.__init__(self, records, alphabet=alphabet)

    # check the type of the alignment to be nucleotide
    if not all(isinstance(rec.seq, CodonSeq) for rec in self):
        raise TypeError("CodonSeq objects are expected in each "
                        "SeqRecord in CodonAlignment")

    if self.get_alignment_length() % 3:
        raise ValueError("Alignment length is not a multiple of "
                         "three (i.e. a whole number of codons)")
def aln_undup(alignment):
    """Return a new alignment without records whose sequence repeats.

    Sequences are compared through their ``seguid`` checksum.  The first
    record with a given checksum is kept; later ones are reported on
    standard output and skipped.
    """
    unique = MultipleSeqAlignment([])
    seen = set()
    for record in alignment:
        digest = seguid(record.seq)
        if digest not in seen:
            seen.add(digest)
            unique.append(record)
        else:
            print("Ignoring %s" % record.id)
    return unique
def remove_tribolium(fname):
    """Read a FASTA alignment and drop every tribolium_castaneum record.

    The species is parsed from the second whitespace-separated field of
    each record description, which is expected to look like ``key=value``.
    Returns the remaining records in a new MultipleSeqAlignment.
    """
    # context manager so the input handle is closed once the file is read
    with open(fname) as handle:
        msa = AlignIO.read(handle, "fasta")
    newmsa = MultipleSeqAlignment([])
    for record in msa:
        species = record.description.split()[1].split('=')[1]
        if species != 'tribolium_castaneum':
            newmsa.append(record)
    return newmsa
def roundTwo(self):
    """Run the leave-one-out placement round for every query sequence.

    Calls ``self.roundOne()`` first and resets ``self.jfileMinus``.  Then,
    for every FASTA file ``j`` in ``self.query_name`` and every record in
    it:

    * unless ``testing/alignment<number>.phy`` already exists, the row whose
      id equals the query's leading ``_``-separated field is removed from
      ``testing/alignment.phy``, the reduced alignment is written as PHYLIP,
      converted to FASTA and used to build a RAxML reference tree;
    * the query file is added to that FASTA alignment with ``mafft --add``;
    * ``pplacer`` is run on the result unless its ``.jplace`` output already
      exists under ``pplacer/``; the output name is recorded in
      ``self.jfileMinus``.

    Returns the tuple ``(self.jfile, self.jfileMinus)``.
    NOTE(review): ``self.jfile`` is not assigned here -- presumably set by
    ``roundOne``; confirm.
    """
    self.roundOne()
    self.jfileMinus = []
    pattern = re.compile(r'_')
    for j in self.query_name:
        # third '_'-separated field of the query file path
        clust_id = pattern.split(j)[2]
        for query in SeqIO.parse(j, 'fasta'):
            # first '_'-separated field of the record id; also the id of the
            # row that is left out of the reference alignment
            seq = pattern.split(query.id)[0]
            #Create special identifier for each round of files
            number = 'minus%s' %seq
            id_jfile = '%s_minus%s' %(clust_id,seq)
            rax_name = 'reversatest%s' % number
            fasta_name = 'testing/align%s.fasta' % number
            if not os.path.isfile('testing/alignment%s.phy' % number):
                edited = MultipleSeqAlignment([])
                # NOTE(review): this handle is never closed explicitly
                openPhy = open('testing/alignment.phy')
                record = AlignIO.read(openPhy, 'phylip')
                for i in record:
                    if i.id != seq:
                        edited.append(i)
                #write the alignment minus a sequence
                phy_name = 'testing/alignment%s.phy' % number
                out = open(phy_name , 'w')
                AlignIO.write(edited, out, 'phylip')
                out.close()
                #convert the PHYLIP file just written to FASTA format
                SeqIO.convert(phy_name, 'phylip', fasta_name, 'fasta', )
                #Create reference tree
                raxml_line = RaxmlCommandline(sequences=phy_name, model='GTRGAMMA', name=rax_name, working_dir=self.cwPath)
                raxml_line()
            #Add query sequences to the previous alignment
            multiali_name = 'testing/multiple_ali%s.fasta' %id_jfile
            # NOTE(review): the path tested here is not written anywhere in
            # this method, so this guard presumably always passes -- confirm
            # whether multiali_name was meant.
            if not os.path.isfile('testing/alignment%s.phy' %id_jfile):
                os.system('mafft --add %s --quiet --reorder %s >%s'% (j, fasta_name, multiali_name))
            jason_name = 'multiple_ali%s.jplace' %id_jfile
            #wrap pplacer
            if not os.path.isfile('pplacer/%s' %jason_name):
                self.jfileMinus.append(jason_name)
                os.system('pplacer --out-dir pplacer -p -t testing/RAxML_result.%s -s testing/RAxML_info.%s %s' % (rax_name, rax_name, multiali_name))
    print self.jfile
    print self.jfileMinus
    return self.jfile, self.jfileMinus
def split_msa(fname):
    """Split a FASTA alignment into blocks at repeated tribolium records.

    Records are read in file order.  The species is parsed from the second
    whitespace-separated field of each description (``key=value``).  Every
    tribolium_castaneum record after the first one starts a new block.

    Returns the list of non-empty blocks as MultipleSeqAlignment objects.
    """
    # context manager so the input handle is closed once the file is read
    with open(fname) as handle:
        msa = AlignIO.read(handle, "fasta")
    msalist = []
    newmsa = MultipleSeqAlignment([])
    # a flag replaces the former ever-growing species list, which was only
    # ever tested for containing tribolium_castaneum
    seen_tribolium = False
    for record in msa:
        species = record.description.split()[1].split('=')[1]
        is_tribolium = species == 'tribolium_castaneum'
        if is_tribolium and seen_tribolium:
            msalist.append(newmsa)
            newmsa = MultipleSeqAlignment([])
        newmsa.append(record)
        seen_tribolium = seen_tribolium or is_tribolium
    if newmsa:
        msalist.append(newmsa)
    return msalist
def split_alignment(clc, alignment, genelimit):
    """Split a multiple sequence alignment into per-gene column slices.

    Args:
        clc: unused.
        alignment: the alignment to split; a dict is first converted with
            ``MSA(alignment.values())``.
        genelimit: iterable of ``(gene, start, end)`` triples giving each
            gene's column range.

    Returns:
        dict mapping each gene to ``alignment[:, start:end]``.

    Raises:
        ValueError: if the slice lengths do not add up to the alignment
            length.
    """
    if isinstance(alignment, dict):
        alignment = MSA(alignment.values())
    remaining = alignment.get_alignment_length()
    sequences = {}
    for gene, start, end in genelimit:
        piece = alignment[:, start:end]
        sequences[gene] = piece
        remaining -= piece.get_alignment_length()
    if remaining:
        raise ValueError("Could not split alignment, wrong gene delimiter")
    return sequences
def get_spliced(self, starts, ends, strand=1):
    """Return a multiple alignment of the exact sequence range provided.

    Accepts two lists of start and end positions on target_seqname,
    representing exons to be spliced in silico.  Returns a
    *MultipleSeqAlignment* of the desired sequences spliced together.

    *starts* should be a list of 0-based start coordinates of segments in
    the reference.  *ends* should be the list of the corresponding segment
    ends (in the half-open UCSC convention:
    http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).

    To ask for the alignment portion corresponding to the first 100
    nucleotides of the reference sequence, you would use
    ``search([0], [100])``

    *strand* must be 1 or -1.  When it differs from the strand annotated on
    the target sequence in the fetched alignments, every returned sequence
    is reverse-complemented.

    When no alignment overlaps the request, a single-row alignment of ``N``
    of the requested length is returned for the target sequence.

    Raises ValueError if *strand* is invalid, if the target sequence is
    missing from a fetched alignment or lacks strand information, if its
    strand is inconsistent between alignments, or if the assembled
    sequences fail the length checks.
    """
    # validate strand
    if strand not in (1, -1):
        raise ValueError("Strand must be 1 or -1, got %s" % str(strand))

    # pull all alignments that span the desired intervals
    fetched = list(self.search(starts, ends))

    # keep track of the expected letter count
    # (sum of lengths of [start, end) segments,
    # where [start, end) half-open)
    expected_letters = sum(end - start for start, end in zip(starts, ends))

    # if there's no alignment, return filler for the assembly of the length given
    if len(fetched) == 0:
        return MultipleSeqAlignment(
            [SeqRecord(Seq("N" * expected_letters), id=self._target_seqname)]
        )

    # find the union of all IDs in these alignments
    all_seqnames = {sequence.id for multiseq in fetched for sequence in multiseq}

    # split every record by base position
    # key: sequence name
    # value: dictionary
    #        key: position in the reference sequence
    #        value: letter(s) (including letters
    #               aligned to the "-" preceding the letter
    #               at the position in the reference, if any)
    split_by_position = {seq_name: {} for seq_name in all_seqnames}

    # keep track of what the total number of (unspliced) letters should be
    total_rec_length = 0

    # track first strand encountered on the target seqname
    ref_first_strand = None

    for multiseq in fetched:
        # find the target_seqname in this MultipleSeqAlignment and use it to
        # set the parameters for the rest of this iteration
        for seqrec in multiseq:
            if seqrec.id == self._target_seqname:
                try:
                    if ref_first_strand is None:
                        ref_first_strand = seqrec.annotations["strand"]

                        if ref_first_strand not in (1, -1):
                            raise ValueError("Strand must be 1 or -1")
                    elif ref_first_strand != seqrec.annotations["strand"]:
                        raise ValueError(
                            "Encountered strand='%s' on target seqname, "
                            "expected '%s'"
                            % (seqrec.annotations["strand"], ref_first_strand)
                        )
                except KeyError:
                    raise ValueError(
                        "No strand information for target seqname (%s)"
                        % self._target_seqname
                    ) from None
                # length including gaps (i.e. alignment length)
                rec_length = len(seqrec)
                rec_start = seqrec.annotations["start"]
                ungapped_length = seqrec.annotations["size"]
                # inclusive end in zero-based coordinates of the reference
                rec_end = rec_start + ungapped_length - 1
                # This is length in terms of actual letters in the reference
                total_rec_length += ungapped_length

                # blank out these positions for every seqname
                # (this inner loop rebinds seqrec; that is harmless because
                # the outer loop is left by the break right after it)
                for seqrec in multiseq:
                    for pos in range(rec_start, rec_end + 1):
                        split_by_position[seqrec.id][pos] = ""

                break
        # the else clause of the for loop runs only when the loop finished
        # without break, i.e. the target seqname was not found
        # http://psung.blogspot.fr/2007/12/for-else-in-python.html
        # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops
        else:
            raise ValueError(
                "Did not find %s in alignment bundle" % (self._target_seqname,)
            )

        # the true, chromosome/contig/etc position in the target seqname
        real_pos = rec_start

        # loop over the alignment to fill split_by_position
        for gapped_pos in range(0, rec_length):
            for seqrec in multiseq:
                # keep track of this position's value for the target seqname
                if seqrec.id == self._target_seqname:
                    track_val = seqrec.seq[gapped_pos]

                # Here, a real_pos that corresponds to just after a series of "-"
                # in the reference will "accumulate" the letters found in other sequences
                # in front of the "-"s
                split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos]

            # increment the real_pos counter only when non-gaps are found in
            # the target_seqname, and we haven't reached the end of the record
            if track_val != "-" and real_pos < rec_end:
                real_pos += 1

    # make sure the number of bp entries equals the sum of the record lengths
    if len(split_by_position[self._target_seqname]) != total_rec_length:
        raise ValueError(
            "Target seqname (%s) has %s records, expected %s"
            % (
                self._target_seqname,
                len(split_by_position[self._target_seqname]),
                total_rec_length,
            )
        )

    # translates a position in the target_seqname sequence to its gapped length
    # (only positions holding more than one character are recorded)
    realpos_to_len = {
        pos: len(gapped_fragment)
        for pos, gapped_fragment in split_by_position[self._target_seqname].items()
        if len(gapped_fragment) > 1
    }

    # splice together the exons
    subseq = {}

    for seqid in all_seqnames:
        seq_split = split_by_position[seqid]
        seq_splice = []

        filler_char = "N" if seqid == self._target_seqname else "-"

        # iterate from start to end, taking bases from split_by_position when
        # they exist, using N or - for gaps when there is no alignment.
        append = seq_splice.append

        for exonstart, exonend in zip(starts, ends):
            # exonend is exclusive
            for real_pos in range(exonstart, exonend):
                # if this seqname has this position, add it
                if real_pos in seq_split:
                    append(seq_split[real_pos])
                # if not, but it's in the target_seqname, add length-matched filler
                elif real_pos in realpos_to_len:
                    append(filler_char * realpos_to_len[real_pos])
                # it's not in either, so add a single filler character
                else:
                    append(filler_char)

        subseq[seqid] = "".join(seq_splice)

    # make sure we're returning the right number of letters
    if len(subseq[self._target_seqname].replace("-", "")) != expected_letters:
        raise ValueError(
            "Returning %s letters for target seqname (%s), expected %s"
            % (
                len(subseq[self._target_seqname].replace("-", "")),
                self._target_seqname,
                expected_letters,
            )
        )

    # check to make sure all sequences are the same length as the target seqname
    ref_subseq_len = len(subseq[self._target_seqname])

    for seqid, seq in subseq.items():
        if len(seq) != ref_subseq_len:
            raise ValueError(
                "Returning length %s for %s, expected %s"
                % (len(seq), seqid, ref_subseq_len)
            )

    # finally, build a MultipleSeqAlignment object for our final sequences
    result_multiseq = []

    for seqid, seq in subseq.items():
        seq = Seq(seq)

        # flip to the requested strand when it differs from the stored one
        seq = seq if strand == ref_first_strand else seq.reverse_complement()

        result_multiseq.append(SeqRecord(seq, id=seqid, name=seqid, description=""))

    return MultipleSeqAlignment(result_multiseq)
# Get protein sequence ppb = Bio.PDB.PPBuilder() polypeptides = ppb.build_peptides(structure) seq1 = polypeptides[0].get_sequence() seq2 = polypeptides[1].get_sequence() matrix = matlist.blosum62 gap_open = -10 gap_extend = -0.5 alns = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend) top_aln = alns[0] alignment = MultipleSeqAlignment( [SeqRecord(Seq(top_aln[0])), SeqRecord(Seq(top_aln[1]))]) structure_alignment = Bio.PDB.StructureAlignment(alignment, structure[0]['A'], structure[0]['B']) sup = Bio.PDB.Superimposer() ref_atoms = [] mov_atoms = [] for duo in structure_alignment.duos: res1 = duo[0] res2 = duo[1] if res1 and res2: ref_atoms.append(res1['CA']) mov_atoms.append(res2['CA']) sup.set_atoms(ref_atoms, mov_atoms)
def subset_viruses_nextstrain_build(virus, subtype, gene, window, min_seqs,
                                    year_max, year_min):
    """Group aligned isolates into time windows and build an outgroup consensus.

    Arguments:
     - virus, subtype, gene - used to look up the virus config and to fill
       the ``reference_file`` / ``alignment_file`` / ``meta_file`` templates.
     - window - ``'all'`` for a single window spanning every year, otherwise
       the window width in years (windows slide forward one year at a time
       and cover ``start <= year < start + window``).
     - min_seqs - windows with fewer strains than this are dropped.
     - year_max, year_min - optional (falsy to disable) bounds on the
       isolate year.

    Returns a tuple ``(virus_time_subset, alignment_time_subset,
    outgroup_seq, outgroup_seq_aa, year_windows, seqs_in_window)`` where the
    first two are dicts keyed by ``"<start>-<end>"`` window labels (strain
    names and gene sequences respectively), ``outgroup_seq`` is the gap
    consensus of the first window's sequences and ``outgroup_seq_aa`` its
    translation.

    The alignment file is parsed once and reused for every window; the
    first window's records are collected exactly once.
    """
    configs = readin_virus_config(virus)
    standard_gene = standardize_gene_name(virus, gene)
    gene_config = configs[standard_gene]
    has_specified_location = 'specify_location' in gene_config

    # Find reference, alignment and meta files (some sub-genic regions may
    # use files from a gene or a whole genome).
    if has_specified_location:
        file_gene = gene_config['specify_location']['parent_gene']
    else:
        file_gene = gene
    reference_file = configs['reference_file'].format(
        virus=virus, subtype=subtype, gene=file_gene)
    alignment_file = configs['alignment_file'].format(
        virus=virus, subtype=subtype, gene=file_gene)
    meta_file = configs['meta_file'].format(
        virus=virus, subtype=subtype, gene=file_gene)
    # some are comma-separated, some are tab-separated
    metafile_sep = configs['metafile_sep']

    # Find gene location, if domain is sub-genic or reference file contains
    # multiple genes. Stays False when nothing is found, in which case the
    # full aligned sequence is used.
    gene_location = False
    if has_specified_location:
        # Sub-genic domain: its position (within genome or parent gene) comes
        # from the config file.
        if subtype is None:
            gene_location_key = "location"
        else:
            gene_location_key = "location_" + str(subtype)
        gene_location_list = ast.literal_eval(
            gene_config['specify_location'][gene_location_key])
        # Need to deal with domains that are not contiguous
        if len(gene_location_list) == 1:
            gene_location = SeqFeature(
                FeatureLocation(gene_location_list[0][0],
                                gene_location_list[0][1]))
        else:
            gene_location = CompoundLocation(
                [FeatureLocation(location[0], location[1])
                 for location in gene_location_list])
    else:
        # Find gene location from the reference file: match on the 'gene'
        # qualifier, falling back to 'product' for CDS features without one.
        wanted_gene = gene.lower()
        for seq_record in SeqIO.parse(reference_file, "genbank"):
            for feature in seq_record.features:
                if feature.type != 'CDS':
                    continue
                qualifiers = feature.qualifiers
                if 'gene' in qualifiers:
                    if qualifiers['gene'][0].lower() == wanted_gene:
                        gene_location = feature.location
                else:
                    # A CDS with neither qualifier is skipped rather than
                    # raising KeyError.
                    product = qualifiers.get('product')
                    if product and product[0].lower() == wanted_gene:
                        gene_location = feature.location

    def gene_sequence(full_seq):
        """Return the gene region of full_seq, or full_seq if no location."""
        if gene_location:
            return gene_location.extract(full_seq)
        return full_seq

    # Subset data based on time windows
    meta = pd.read_csv(meta_file, sep=metafile_sep)
    meta.drop(meta[meta['date'] == '?'].index, inplace=True)
    meta.dropna(subset=['date'], inplace=True)
    meta['year'] = meta['date'].str[:4].astype('int')
    if year_max:
        meta.drop(meta[meta['year'] > year_max].index, inplace=True)
    if year_min:
        meta.drop(meta[meta['year'] < year_min].index, inplace=True)

    # Remove egg- and cell-passaged strains
    meta.drop(meta[meta['strain'].str[-4:] == '-egg'].index, inplace=True)
    meta.drop(meta[meta['strain'].str[-5:] == '-cell'].index, inplace=True)

    # Read the alignment once; (id, sequence) pairs keep the file order.
    with open(alignment_file, "r") as aligned_handle:
        aligned_records = [(isolate.id, isolate.seq)
                           for isolate in SeqIO.parse(aligned_handle, "fasta")]

    # Limit meta data to only strains in alignment file
    aligned_isolates_df = pd.DataFrame(
        [isolate_id for isolate_id, _ in aligned_records], columns=['strain'])
    meta = meta.merge(aligned_isolates_df, on='strain', how='inner')

    # Group viruses by time windows
    virus_time_subset = {}
    first_year = meta['year'].min()
    last_year = meta['year'].max()
    if window == 'all':
        years = str(first_year) + '-' + str(last_year)
        virus_time_subset[years] = meta['strain'].tolist()
    else:
        date_window_start = first_year
        date_window_end = first_year + window
        while date_window_end <= last_year:
            years = str(date_window_start) + '-' + str(date_window_end)
            in_window = ((meta['year'] >= date_window_start) &
                         (meta['year'] < date_window_end))
            virus_time_subset[years] = meta[in_window]['strain'].tolist()
            # sliding window
            date_window_end += 1
            date_window_start += 1

    # Only use time points with enough data:
    virus_time_subset = {
        k: v for k, v in virus_time_subset.items() if len(v) >= min_seqs
    }

    year_windows = []
    seqs_in_window = []
    alignment_time_subset = {}
    # Strains of the first retained window; the outgroup consensus is made
    # from their sequences.
    first_window_strains = None
    for years, subset_viruses in virus_time_subset.items():
        year_windows.append(years)
        seqs_in_window.append(len(subset_viruses))
        wanted_strains = set(subset_viruses)
        if first_window_strains is None:
            first_window_strains = wanted_strains
        alignment_time_subset[years] = [
            gene_sequence(seq)
            for isolate_id, seq in aligned_records
            if isolate_id in wanted_strains
        ]

    if first_window_strains is None:
        first_window_strains = set()
    first_window_sequences = [
        SeqRecord(seq=gene_sequence(seq), id=isolate_id, description=gene)
        for isolate_id, seq in aligned_records
        if isolate_id in first_window_strains
    ]

    first_window_alignment = MultipleSeqAlignment(first_window_sequences)
    outgroup_seq = AlignInfo.SummaryInfo(first_window_alignment).gap_consensus(
        ambiguous='N')
    outgroup_seq_aa = outgroup_seq.translate()

    return (virus_time_subset, alignment_time_subset, outgroup_seq,
            outgroup_seq_aa, year_windows, seqs_in_window)
def __next__(self):
    """Parse the next EMBOSS pairwise/simple alignment from the handle.

    Skips forward to the next ``#=======`` header block, reads the
    ``Aligned_sequences`` and ``Length`` header fields, then accumulates the
    sequence lines of each record until the next ``#-------`` / ``#=======``
    divider (which is stashed in ``self._header`` for the following call).

    Returns a MultipleSeqAlignment. Raises StopIteration when no further
    header block is found, and ValueError for missing header fields, an
    unexpected record count, an unrecognised line or inconsistent lengths.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        raise StopIteration

    while line.rstrip() != "#=======================================":
        line = handle.readline()
        if not line:
            raise StopIteration

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    # startswith() rather than line[0] so that hitting end of file inside the
    # header falls through to the "missing" errors below instead of raising
    # IndexError on an empty string.
    while line.startswith("#"):
        # Read in the rest of this alignment header,
        # try and discover the number of records expected
        # and their length
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        if key == "length":
            length_of_seqs = int(parts[1].strip())

        # And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (number_of_seqs, self.records_per_alignment))

    seqs = [""] * len(ids)
    seq_starts = []
    index = 0

    # Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                # identifier, seq start position, seq, seq end position
                # (an aligned seq is broken up into multiple lines)
                seq_id, start = id_start
                seq, end = seq_end
                if start == end:
                    # Special case, either a single letter is present,
                    # or no letters at all.
                    if seq.replace("-", "") == "":
                        start = int(start)
                        end = int(end)
                    else:
                        start = int(start) - 1
                        end = int(end)
                else:
                    assert seq.replace("-", "") != "", repr(line)
                    start = int(start) - 1  # python counting
                    end = int(end)

                # The identifier is truncated...
                assert 0 <= index and index < number_of_seqs, \
                    "Expected index %i in range [0,%i)" \
                    % (index, number_of_seqs)
                assert seq_id == ids[index] \
                    or seq_id == ids[index][:len(seq_id)]

                if len(seq_starts) == index:
                    # Record the start
                    seq_starts.append(start)

                # Check the start...
                if start == end:
                    assert seq.replace("-", "") == "", line
                else:
                    assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                        % (len(seqs[index].replace("-", "")), index, seq_id,
                           repr(seqs[index]), start, line)

                seqs[index] += seq

                # Check the end ...
                assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                    "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                    % (len(seqs[index].replace("-", "")), index, seq_id,
                       repr(seqs[index]), seq_starts[index], end, line)

                index += 1
                if index >= number_of_seqs:
                    index = 0
            else:
                # just a start value, this is just alignment annotation (?)
                pass
        elif line.strip() == "":
            # Just a spacer?
            pass
        else:
            # Report malformed input as a parse error; the previous
            # print + "assert False" vanished under "python -O".
            raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line)

        line = handle.readline()
        if line.rstrip() == "#---------------------------------------" \
                or line.rstrip() == "#=======================================":
            # End of alignment
            self._header = line
            break

    assert index == 0

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    records = []
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            # EMBOSS 2.9.0 is known to use spaces instead of minus signs
            # for leading gaps, and thus fails to parse. This old version
            # is still used as of Dec 2008 behind the EBI SOAP webservice:
            # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        records.append(
            SeqRecord(Seq(seq, self.alphabet), id=seq_id, description=seq_id))
    return MultipleSeqAlignment(records, self.alphabet)
def __next__(self):
    """Parse the next interlaced PHYLIP alignment from the handle.

    The first line must hold two integers (record count, column count).
    The first block carries the identifiers (split off by ``_split_id``);
    later blocks carry sequence only. A line that looks like another header
    ends this alignment and is kept in ``self._header`` for the next call.

    Returns a MultipleSeqAlignment; raises StopIteration at end of input and
    ValueError for a bad header, dots in the sequence, an unexpected record
    count or a block cut short by end of file.
    """
    stream = self.handle

    if self._header is not None:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None
    else:
        line = stream.readline()

    if not line:
        raise StopIteration

    line = line.strip()
    fields = line.split()
    if len(fields) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(fields[0])
        length_of_seqs = int(fields[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(line)

    expected = self.records_per_alignment
    if expected is not None and expected != number_of_seqs:
        raise ValueError("Found %i records in this alignment, "
                         "told to expect %i"
                         % (number_of_seqs, expected))

    ids = []
    seqs = []

    # By default, expects STRICT truncation / padding to 10 characters.
    # Does not require any whitespace between name and seq.
    for _ in range(number_of_seqs):
        line = stream.readline().rstrip()
        sequence_id, chunk = self._split_id(line)
        ids.append(sequence_id)
        if "." in chunk:
            raise ValueError(_NO_DOTS)
        seqs.append([chunk])

    # Look for further blocks
    line = ""
    while True:
        # Skip any blank lines between blocks...
        while not line.strip():
            line = stream.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if self._is_header(line):
            # Looks like the start of a concatenated alignment
            self._header = line
            break

        # One more block: one sequence line per record, in order.
        for row in range(number_of_seqs):
            chunk = line.strip().replace(" ", "")
            if "." in chunk:
                raise ValueError(_NO_DOTS)
            seqs[row].append(chunk)
            line = stream.readline()
            if not line and row + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line:
            break  # end of file

    records = [
        SeqRecord(Seq("".join(pieces), self.alphabet),
                  id=name, name=name, description=name)
        for name, pieces in zip(ids, seqs)
    ]
    return MultipleSeqAlignment(records, self.alphabet)
def createAlignment(sequences, alphabet):
    """Build an alignment from raw sequence strings.

    Each string becomes a SeqRecord whose id is "sequence1", "sequence2", ...
    in input order; `alphabet` is applied to every Seq and to the alignment.
    """
    numbered_records = (
        SeqRecord(Seq(text, alphabet), id="sequence%i" % number)
        for number, text in enumerate(sequences, start=1)
    )
    return MultipleSeqAlignment(numbered_records, alphabet)
def _thetaEK(records, optimize_dist):
    """Divergence-time scaled per-column scores for an amino acid alignment.

    @records - alignment records (anything MultipleSeqAlignment accepts)
    @optimize_dist - mapping from "<id_m>-<id_n>" (record ids in alignment
                     order, m before n) to the divergence time of that pair

    Returns a dict with keys "B62", "KMAT" and "K1".."K10". Each value maps a
    column index to a list with one entry per record pair (in
    ``combinations`` order): the Blossum / KMAT score, or the difference of
    the j-th KIDERA factor, divided by the pair's divergence time.
    Columns in which every record has the same character are omitted.
    A "?" is scored as a gap ("-"), and residues are upper-cased.

    Alignments with fewer than two records yield empty score dicts instead
    of failing on ``msaObj[1]``.
    """
    score_names = ["B62", "KMAT"] + ["K" + str(j + 1) for j in range(10)]
    posScore = {name: dict() for name in score_names}

    msaObj = MultipleSeqAlignment(records)
    KIDERA = kidera()
    ids = [str(rec.id) for rec in msaObj]

    for i in range(msaObj.get_alignment_length()):
        # Column i as a plain string, one character per record.
        column = msaObj[:, i]
        if len(set(column)) == 1:
            continue

        # Normalise once per column rather than once per pair and score.
        residues = [("-" if char == "?" else char).upper() for char in column]
        for name in score_names:
            posScore[name][i] = list()

        # N*(N-1)/2 unordered pairs for N sequences; this is the hot loop.
        for m, n in combinations(range(len(residues)), 2):
            res_m = residues[m]
            res_n = residues[n]
            dist = optimize_dist[ids[m] + "-" + ids[n]]
            posScore["B62"][i].append(float(Blossum(res_m, res_n)) / dist)
            posScore["KMAT"][i].append(float(KMAT(res_m, res_n)) / dist)
            kidera_m = KIDERA[res_m]
            kidera_n = KIDERA[res_n]
            for j in range(10):
                posScore["K" + str(j + 1)][i].append(
                    (kidera_m[j] - kidera_n[j]) / dist)
    return posScore
def mugration_inference(tree=None, seq_meta=None, field='country',
                        confidence=True, infer_gtr=True, root_state=None,
                        missing='?'):
    """Infer a discrete trait (`field`) on the tree as a one-column alignment.

    Every distinct value of `field` among tips present in the tree (plus
    `root_state`, if given) is mapped to a single letter starting at 'A'; one
    extra letter stands for missing data. A one-character pseudo sequence per
    tip is given to treetime's TreeAnc with a flat custom GTR, and the
    inferred state is attached to each clade as attribute `field`. With
    `confidence`, `<field>_entropy` and `<field>_confidence` are attached too.

    Returns ``(tt, alphabet)``, or ``(None, None)`` when there are more than
    180, exactly one, or zero distinct values.
    """
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    nodes = {tip.name: tip for tip in T.get_terminals()}

    # Determine alphabet only counting tips in the tree
    places = {meta[field]
              for name, meta in seq_meta.items()
              if field in meta and name in nodes}
    if root_state is not None:
        places.add(root_state)

    # construct GTR (flat for now).
    places = sorted(places)
    nc = len(places)

    # Bail out early on state counts that cannot be modelled.
    if nc > 180:
        print("ERROR: geo_inference: can't have more than 180 places!")
        return None, None
    if nc == 1:
        print(
            "WARNING: geo_inference: only one place found -- set every internal node to %s!"
            % places[0])
        return None, None
    if nc == 0:
        print("ERROR: geo_inference: list of places is empty!")
        return None, None

    # set up model: letter -> place, with the model alphabet taken before the
    # missing-data letter is added
    alphabet = {chr(65 + idx): place for idx, place in enumerate(places)}
    model = GTR.custom(pi=np.ones(nc, dtype=float) / nc,
                       W=np.ones((nc, nc)),
                       alphabet=np.array(sorted(alphabet.keys())))

    missing_char = chr(65 + nc)
    alphabet[missing_char] = missing
    model.profile_map[missing_char] = np.ones(nc)
    model.ambiguous = missing_char
    alphabet_rev = {place: letter for letter, place in alphabet.items()}

    # construct pseudo alignment
    pseudo_seqs = [
        SeqRecord(
            Seq(alphabet_rev[meta[field]] if field in meta else missing_char),
            name=name, id=name)
        for name, meta in seq_meta.items()
        if name in nodes
    ]
    aln = MultipleSeqAlignment(pseudo_seqs)

    # set up treetime and infer
    from treetime import TreeAnc
    tt = TreeAnc(tree=tree, aln=aln, gtr=model, convert_upper=False,
                 verbose=0)
    tt.use_mutation_length = False
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr,
                                 store_compressed=False,
                                 pc=5.0,
                                 marginal=True,
                                 normalized_rate=False)

    # attach inferred states as e.g. node.region = 'africa'
    for node in tt.tree.find_clades():
        setattr(node, field, alphabet[node.sequence[0]])

    # if desired, attach entropy and confidence as e.g. node.region_entropy = 0.03
    if confidence:
        entropy_attr = field + "_entropy"
        confidence_attr = field + "_confidence"
        for node in tt.tree.find_clades():
            pdis = node.marginal_profile[0]
            S = -np.sum(pdis * np.log(pdis + TINY))

            ranked = [(alphabet[tt.gtr.alphabet[k]], pdis[k])
                      for k in range(len(tt.gtr.alphabet))]
            ranked.sort(key=lambda pair: pair[1],
                        reverse=True)  # sort on likelihoods
            # only take stuff over .1% and the top 4 elements
            top = [(state, prob) for state, prob in ranked
                   if prob > 0.001][:4]
            setattr(node, entropy_attr, S)
            setattr(node, confidence_attr, dict(top))

    return tt, alphabet
# Tutorial script: read, inspect, build and write alignments with Bio.AlignIO.
# All output goes through single-argument print(...) calls, which behave the
# same on Python 2 and Python 3 (the former bare print statements are a
# SyntaxError on Python 3).

# Peek at the Clustal file, then load it as a single alignment.
os.system("head data/muscle-patato_pep.clw")
from Bio import AlignIO
aln_patato = AlignIO.read("data/muscle-patato_pep.clw", "clustal")
print(aln_patato)
for record in aln_patato:
    print("%s - %s" % (record.seq[1:60], record.id))

# The PHYLIP file holds several alignments: iterate over them lazily...
os.system("head data/dummy_aln.phy")
aln_dummy = AlignIO.parse("data/dummy_aln.phy", "phylip")
for alignment in aln_dummy:
    print(alignment)
    print("")

# ...or load them all into a list to pick one by position.
alignments = list(AlignIO.parse("data/dummy_aln.phy", "phylip"))
second_aln = alignments[1]
print(second_aln)

# Build a small alignment by hand from SeqRecord objects.
from Bio.Alphabet import generic_dna
from Bio.Align import MultipleSeqAlignment
align1 = MultipleSeqAlignment([
    SeqRecord(Seq("ACTGCTAGCTAG", generic_dna), id="toto"),
    SeqRecord(Seq("ACT-CTAGCTAG", generic_dna), id="titi"),
    SeqRecord(Seq("ACTGCTAGDTAG", generic_dna), id="tata"),
])
print(align1)

# Write both alignments into one PHYLIP file.
my_alignments = [align1, aln_patato]
AlignIO.write(my_alignments, "mixed.phy", "phylip")
def __next__(self):
    """Parse the next alignment from the handle.

    Reads one GCG MSF alignment: a known first-line header, the
    ``MSF: <int> Type: <letter> ... Check: <int> ..`` line, the ``Name:``
    lines up to ``//``, then interleaved blocks of 50 columns. ``~`` and
    ``.`` are mapped to ``-``; sequences whose declared ``Len:`` is shorter
    than the alignment are gap padded (with a BiopythonParserWarning).
    If another known header follows, it is kept in ``self._header`` for the
    next call.

    Returns a MultipleSeqAlignment whose records carry a "weight" annotation.
    Raises StopIteration at end of input and ValueError for malformed input
    (NotImplementedError for identifiers containing a space).
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration

    # Whitelisted headers we know about.
    known_headers = [
        "!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"
    ]
    # Examples in "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
    # would often start as follows:
    #
    # !!AA_MUTIPLE_ALIGNMENT 1.0
    # PileUp of: @/usr/users2/culhane/...
    #
    # etc with other seemingly free format text before getting to the
    # MSF/Type/Check line and the following Name: lines block and // line.
    #
    # MUSCLE just has a line "PileUp", while other sources just use the line
    # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
    # (nucleotide).
    first_words = line.split()
    # A whitespace-only first line has no words; report it as an unknown
    # header rather than failing with IndexError.
    first_word = first_words[0] if first_words else ""
    if first_word not in known_headers:
        raise ValueError(
            "%s is not a known GCG MSF header: %s"
            % (first_word, ", ".join(known_headers)))

    while line and " MSF: " not in line:
        line = handle.readline()

    if not line:
        raise ValueError(
            "Reached end of file without MSF/Type/Check header line")

    # Quoting from "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
    # Page 31:
    #
    # "Header information is before a .. (double dot) in a GCG format file.
    # The file will also have a checksum specific for that file."
    #
    # This was followed by a single non-aligned sequence, but this convention
    # appears to also be used in the GCG MSF files. Quoting other examples in
    # this reference, page 31:
    #
    # localpileup_17.msf MSF: 195 Type: P January 6, 2000 15:41 Check: 4365 ..
    #
    # Except from page 148:
    #
    # localpileup_106.msf MSF: 457 Type: P November 28, 2000 16:09 Check: 2396 ..
    #
    # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
    #
    #   MSF: 689 Type: N Check: 0000 ..
    #
    # By observation, the MSF value is the column count, type is N (nucleotide)
    # or P (protein / amino acid).
    #
    # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
    #
    # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
    # !!NA_MULTIPLE_ALIGNMENT 1.0
    #
    #   stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
    #
    #   Name: G26680 Len: 633 Check: 4334 Weight: 1.00
    #   Name: G26685 Len: 633 Check: 3818 Weight: 1.00
    #   Name: G29385 Len: 633 Check: 391 Weight: 1.00
    #
    # //
    #
    parts = line.strip("\n").split()
    offset = parts.index("MSF:")
    # The length guard turns a truncated header line into the ValueError
    # below instead of an IndexError.
    if (len(parts) < offset + 4
            or parts[offset + 2] != "Type:"
            or parts[-3] not in ("Check:", "CompCheck:")
            or parts[-1] != ".."):
        raise ValueError(
            "GCG MSF header line should be "
            "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
            " not: %r" % line)
    try:
        aln_length = int(parts[offset + 1])
    except ValueError:
        aln_length = -1
    if aln_length < 0:
        raise ValueError(
            "GCG MSF header line should have MSF: <int> for column count, not %r"
            % parts[offset + 1])
    seq_type = parts[offset + 3]
    if seq_type not in ["P", "N"]:
        raise ValueError(
            "GCG MSF header line should have 'Type: P' (protein) "
            "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type)

    # There should be a blank line after that header line, then the Name: lines
    #
    # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
    #
    # PileUp
    #
    #
    #
    #    MSF: 628 Type: P Check: 147 ..
    #
    #  Name: AK1H_ECOLI/1-378 oo Len: 628 Check: 3643 Weight: 1.000
    #  Name: AKH_HAEIN/1-382 oo Len: 628 Check: 6504 Weight: 1.000
    #
    # //
    ids = []
    lengths = []
    checks = []
    weights = []
    line = handle.readline()
    while line and line.strip() != "//":
        line = handle.readline()
        if line.strip().startswith("Name: "):
            if " Len: " in line and " Check: " in line and " Weight: " in line:
                rest = line[line.index("Name: ") + 6:].strip()
                name, rest = rest.split(" Len: ")
                length, rest = rest.split(" Check: ")
                check, weight = rest.split(" Weight: ")
                name = name.strip()
                if name.endswith(" oo"):
                    # T-COFFEE oddity, ignore this
                    name = name[:-3]
                if name in ids:
                    raise ValueError("Duplicated ID of %r" % name)
                if " " in name:
                    raise NotImplementedError("Space in ID %r" % name)
                ids.append(name)
                # Expect aln_length <= int(length.strip()), see below
                lengths.append(int(length.strip()))
                checks.append(int(check.strip()))
                weights.append(float(weight.strip()))
            else:
                raise ValueError("Malformed GCG MSF name line: %r" % line)
    if not line:
        raise ValueError(
            "End of file while looking for end of header // line.")

    if not lengths:
        # max() below would otherwise fail with an unhelpful
        # "max() arg is an empty sequence" ValueError.
        raise ValueError(
            "GCG MSF header has no Name: lines before the // line.")
    max_length = max(lengths)
    if aln_length != max_length:
        # In broken examples from IMGTHLA was possible to continue
        # https://github.com/ANHIG/IMGTHLA/issues/201
        max_count = sum(1 for value in lengths if value == max_length)
        raise ValueError(
            "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
            % (aln_length, max_count, len(ids), max_length))

    line = handle.readline()
    if not line:
        raise ValueError("End of file after // line, expected sequences.")
    if line.strip():
        raise ValueError(
            "After // line, expected blank line before sequences.")

    # Now load the sequences
    seqs = [[] for _ in ids]  # list of empty lists
    completed_length = 0
    while completed_length < aln_length:
        # Last column expected in this block (blocks are 50 columns wide).
        block_end = min(completed_length + 50, aln_length)
        # Note might have a coordinate header line (seems to be optional)
        for idx, name in enumerate(ids):
            line = handle.readline()
            if idx == 0 and not line.strip():
                # T-COFFEE uses two blank lines between blocks, rather than one
                while line and not line.strip():
                    line = handle.readline()
            if not line:
                raise ValueError(
                    "End of file where expecting sequence data.")
            words = line.strip().split()
            # Should we use column numbers, rather than assuming no spaces in names?
            if idx == 0 and words and words[0] != name:
                # Hopefully this is a coordinate header before the first seq
                try:
                    i = int(words[0])
                except ValueError:
                    i = -1
                if i != completed_length + 1:
                    raise ValueError(
                        "Expected GCG MSF coordinate line starting %i, got: %r"
                        % (completed_length + 1, line))
                if len(words) > 1:
                    # Final block usually not full 50 chars, so expect start only.
                    if len(words) != 2:
                        i = -1
                    else:
                        try:
                            i = int(words[1])
                        except ValueError:
                            i = -1
                    if i != block_end:
                        raise ValueError(
                            "Expected GCG MSF coordinate line %i to %i, got: %r"
                            % (completed_length + 1, block_end, line))
                line = handle.readline()
                words = line.strip().split()
            # Dealt with any coordinate header line, should now be sequence
            if not words:
                # Should be sequence here, but perhaps its a short one?
                if (lengths[idx] < aln_length
                        and len("".join(seqs[idx])) == lengths[idx]):
                    # Is this actually allowed in the format? Personally I would
                    # expect a line with name and a block of trailing ~ here.
                    pass
                else:
                    raise ValueError("Expected sequence for %s, got: %r"
                                     % (name, line))
            elif words[0] == name:
                assert len(words) > 1, line
                seqs[idx].extend(words[1:])
            else:
                raise ValueError("Expected sequence for %r, got: %r"
                                 % (name, line))
        # TODO - check the sequence lengths thus far are consistent
        # with blocks of 50?
        completed_length += 50
        line = handle.readline()
        if line.strip():
            raise ValueError("Expected blank line, got: %r" % line)

    # Skip over any whitespace at the end...
    while True:
        line = handle.readline()
        if not line:
            # End of file, no more alignments
            break
        elif not line.strip():
            # Blank line, ignore
            pass
        elif line.strip().split()[0] in known_headers:
            # Looks like the start of another alignment:
            self._header = line
            break
        else:
            raise ValueError(
                "Unexpected line after GCG MSF alignment: %r" % line)

    # Combine list of strings into single string, remap gaps
    seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]

    # Apply any trailing padding for short sequences
    padded = False
    for idx, (length, s) in enumerate(zip(lengths, seqs)):
        if len(s) < aln_length and len(s) == length:
            padded = True
            seqs[idx] = s + "-" * (aln_length - len(s))
    if padded:
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "One or more alignment sequences were truncated and have been gap padded",
            BiopythonParserWarning,
        )

    records = (SeqRecord(
        Seq(s),
        id=name,
        name=name,
        description=name,
        annotations={"weight": w},
    ) for (name, s, w) in zip(ids, seqs, weights))

    # This will check alignment lengths are self-consistent:
    align = MultipleSeqAlignment(records)
    # Check matches the header:
    if align.get_alignment_length() != aln_length:
        raise ValueError(
            "GCG MSF headers said alignment length %i, but have %i"
            % (aln_length, align.get_alignment_length()))
    return align
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or (if using
       Biopython 1.54 or later) a single SeqRecord.
     - handle - File handle object to write to, or filename as string
       (note older versions of Biopython only took a handle).
     - format - lower case string describing the file format to write.

    Note if providing a file handle, your code should close the handle
    after calling this function (to ensure the data gets flushed to disk).

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Argument validation first, with messages aimed at common mix-ups.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        # e.g. list of SeqRecord objects
        raise TypeError("Check arguments, handle should NOT be a list")

    if isinstance(sequences, SeqRecord):
        # This raised an exception in older versions of Biopython
        sequences = [sequences]

    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as fp:
        # Dispatch on the format: per-record formatter, SeqIO writer class,
        # or (as a fallback) an AlignIO writer fed one alignment.
        if format in _FormatToString:
            to_string = _FormatToString[format]
            count = 0
            for count, record in enumerate(sequences, 1):
                fp.write(to_string(record))
        elif format in _FormatToWriter:
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Try and turn all the records into a single alignment,
            # and write that using Bio.AlignIO
            single_alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([single_alignment], fp, format)
            assert written == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %r" % written
            count = len(single_alignment)
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), "Internal error - the underlying %s " \
            "writer should have returned the record count, not %r" \
            % (format, count)

    return count
def __next__(self):
    """Parse the next Stockholm alignment from the handle.

    Expects a ``# STOCKHOLM 1.0`` header, then collects sequence lines plus
    ``#=GF``, ``#=GS`` and ``#=GR`` annotation until end of file or the next
    header (which is remembered in ``self._header``). ``.`` in sequences is
    converted to ``-``.

    Returns a MultipleSeqAlignment with the per-column ``#=GR`` markup stored
    on ``alignment._annotations``. Raises StopIteration on empty input or
    when no sequences were found, and ValueError for a missing header,
    unsplittable sequence lines, an unexpected record count or unequal
    sequence lengths.
    """
    stockholm_header = '# STOCKHOLM 1.0'

    try:
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        # Empty file - just give up.
        raise StopIteration
    if line.strip() != stockholm_header:
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == stockholm_header:
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [piece.strip() for piece in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError("Could not split line into identifier " +
                                 "and sequence:\n" + line)
            seq_id, seq = parts
            if seq_id not in ids:
                ids.append(seq_id)
            seqs[seq_id] = seqs.get(seq_id, '') + seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            tag = line[:5]
            if tag == "#=GF ":
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                gf.setdefault(feature, []).append(text)
            elif tag == '#=GC ':
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif tag == '#=GS ':
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
            elif tag == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                markup = gr.setdefault(seq_id, {})
                # append to any previous entry
                markup[feature] = markup.get(feature, "") + text.strip()
                # TODO - Should we check the length matches the alignment length?
                # For iterlaced sequences the GR data can be split over
                # multiple lines
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        raise StopIteration

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (len(ids), self.records_per_alignment))

    alignment_length = len(next(iter(seqs.values())))
    records = []  # Alignment obj will put them all in a list anyway
    for seq_id in ids:
        seq = seqs[seq_id]
        if alignment_length != len(seq):
            raise ValueError("Sequences have different lengths, or repeated identifier")
        name, start, end = self._identifier_split(seq_id)
        # Accession will be overridden by _populate_meta_data if an explicit
        # accession is provided:
        record = SeqRecord(Seq(seq, self.alphabet),
                           id=seq_id, name=name, description=seq_id,
                           annotations={"accession": name})
        if start is not None:
            record.annotations["start"] = start
        if end is not None:
            record.annotations["end"] = end

        self._populate_meta_data(seq_id, record)
        records.append(record)
    alignment = MultipleSeqAlignment(records, self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = gr

    return alignment
def _outgroup_column_is_suspect(column_bases, iupac_bases):
    """Return True when one alignment column counts as a suspect polymorphic site.

    ``column_bases`` is an iterable with the characters of a single column and
    ``iupac_bases`` a container of IUPAC ambiguity codes.  The rules are:

    * one distinct character: not polymorphic;
    * two distinct characters: suspect only when neither is a gap nor an
      ambiguity code;
    * three distinct characters: tolerated only for gap + exactly one
      ambiguity code + one other character;
    * four or more distinct characters: always suspect.
    """
    counter = Counter(column_bases)
    n_states = len(counter)
    if n_states == 1:
        return False
    # A gap anywhere in the column counts, whatever its frequency rank.
    has_gap = "-" in counter
    n_iupac = sum(1 for base in counter if base in iupac_bases)
    if n_states == 2:
        return not has_gap and n_iupac == 0
    if n_states == 3:
        return not (has_gap and n_iupac == 1)
    return True


def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Mask highly polymorphic windows in outgroup sequences with gaps.

    For every FASTA alignment found in ``<s7 dir>/s1_rm_polymorphism_sites/``
    the alignment is scanned in windows of ``window_size`` columns (as produced
    by ``window``).  When a window holds more than ``Max_p_sites_o`` suspect
    columns, those columns are replaced by ``-`` in every record whose id is
    in the outgroup collection returned by ``input_outgroup(outgroup_path)``.
    The result is written as FASTA to the sibling directory
    ``s2_rm_polymorphism_in_outgroups/`` under the same file name.

    Parameters
    ----------
    seq_directory : str
        Path containing ``s1_Gene/``; it is rewritten to ``s7_well_trimal/``.
    outgroup_path : str
        Passed to ``input_outgroup`` to obtain the outgroup ids.
    window_size : int
        Number of columns per window.
    Max_p_sites_o : int or float
        A window is masked when it has strictly more suspect columns than this.
    """
    ### define iupac
    iupac_bases = frozenset(['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K',
                             "v", "h", "d", "b", "V", "H", "D", "B"])

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/",
                                                    "/s2_rm_polymorphism_in_outgroups/")
    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file in os.listdir(output_directory_1):
        if file == ".DS_Store":
            continue
        output_directory_file = output_directory_2 + file
        fasta_name = output_directory_1 + file

        ### read each alignment sequences
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)
            alignment = AlignIO.read(sequence, 'fasta')

            ### change alignment to an array (rows = records, columns = sites).
            align_array_outgroup = np.array([list(rec) for rec in alignment])
            total_length = alignment.get_alignment_length()

            total_wrong_poly_sites_outgroup = []
            for each in window(range(total_length), window_size):
                column_position_outgroup = [
                    column for column in each
                    if _outgroup_column_is_suspect(align_array_outgroup[:, column], iupac_bases)
                ]
                ### keep the window's sites only when it is too polymorphic
                if len(column_position_outgroup) > float(Max_p_sites_o):
                    print(column_position_outgroup)
                    total_wrong_poly_sites_outgroup.extend(column_position_outgroup)

            unique_wrong_sites_ougroup = list(np.unique(total_wrong_poly_sites_outgroup))
            print(unique_wrong_sites_ougroup)
            print("outgroup")
            # Set of plain ints: O(1) membership test for every base below.
            masked_columns = set(int(site) for site in unique_wrong_sites_ougroup)

            align_2 = MultipleSeqAlignment([])
            for record in alignment:
                if record.id in outgroups:
                    print(record.seq)
                    new_seq = "".join(
                        "-" if position in masked_columns else base
                        for position, base in enumerate(str(record.seq))
                    )
                    align_2.append(SeqRecord(Seq(new_seq), id=str(record.id)))
                else:
                    align_2.append(SeqRecord(Seq(str(record.seq)), id=str(record.id)))

            print(align_2)
            AlignIO.write(align_2, output_directory_file, "fasta")
def __next__(self):
    """Parse and return the next alignment block from the XMFA handle.

    Leading ``#`` comment lines are skipped.  Each ``>`` header is matched
    against ``XMFA_HEADER_REGEX_BIOPYTHON`` and then ``XMFA_HEADER_REGEX``;
    the following lines are appended to that entry's sequence until a line
    starting with ``=`` (end of block) or end of file.

    Every id seen so far by this iterator (``self._ids``) that has a header
    in this block yields one record; ids without sequence data in the block
    are filled with gaps.

    Returns a MultipleSeqAlignment.  Raises StopIteration when the handle is
    exhausted or nothing was parsed, and ValueError for a malformed header,
    sequence data before any header, or inconsistent sequence lengths.
    """
    handle = self.handle
    line = handle.readline()

    if not line:
        raise StopIteration

    # Strip out header comments
    while line and line.strip().startswith('#'):
        line = handle.readline()

    seqs = {}
    seq_regions = {}
    latest_id = None
    while line:  # an empty string from readline() means end of file
        line = line.strip()

        if line.startswith('='):
            # There may be more data, but we've reached the end of this
            # alignment
            break
        elif line.startswith('>'):
            m = XMFA_HEADER_REGEX_BIOPYTHON.match(line)
            if not m:
                m = XMFA_HEADER_REGEX.match(line)
                if not m:
                    raise ValueError("Malformed header line: %s" % line)

            parsed_id = m.group('id')
            parsed_data = {}
            for key in ('start', 'end', 'id', 'strand', 'name', 'realname'):
                try:
                    value = m.group(key)
                except IndexError:
                    # This will occur if we're asking for a group that
                    # doesn't exist. It's fine.
                    continue
                if key == 'start':
                    value = int(value)
                    # Convert to zero based counting
                    if value > 0:
                        value -= 1
                elif key == 'end':
                    value = int(value)
                parsed_data[key] = value
            seq_regions[parsed_id] = parsed_data

            if parsed_id not in self._ids:
                self._ids.append(parsed_id)

            seqs.setdefault(parsed_id, '')
            latest_id = parsed_id
        else:
            if latest_id is None:
                raise ValueError("Saw sequence before definition line")
            seqs[latest_id] += line
        line = handle.readline()

    assert len(seqs) <= len(self._ids)

    self.ids = self._ids
    self.sequences = seqs

    if not (self._ids and seqs):
        raise StopIteration

    alignment_length = max(map(len, seqs.values()))
    records = []
    for seq_id in self._ids:
        # Ids missing from this block (or with no data) become all-gap rows.
        seq = seqs.get(seq_id) or '-' * alignment_length

        if alignment_length != len(seq):
            raise ValueError(
                "Sequences have different lengths, or repeated identifier"
            )

        # Sometimes we don't see a particular sequence in the
        # alignment, so we skip that record since it isn't present in
        # that LCB/alignment
        if seq_id not in seq_regions:
            continue
        region = seq_regions[seq_id]

        has_coordinates = region['start'] != 0 or region['end'] != 0
        if 'realname' in region:
            corrected_id = region['realname']
        else:
            corrected_id = region['name']
        if has_coordinates:
            suffix = '/{start}-{end}'.format(**region)
            # Avoid appending the coordinate suffix twice.
            if corrected_id.count(suffix) == 0:
                corrected_id += suffix

        record = SeqRecord(Seq(seq, self.alphabet), id=corrected_id, name=seq_id)
        record.annotations["start"] = region['start']
        record.annotations["end"] = region['end']
        record.annotations["strand"] = 1 if region['strand'] == '+' else -1
        records.append(record)
    return MultipleSeqAlignment(records, self.alphabet)
try: print(next(SeqIO.parse(h, t_format, given_alpha))) h.close() assert False, "Forcing wrong alphabet, %s, should fail (%s)" \ % (repr(given_alpha), t_filename) except ValueError: #Good - should fail pass h.close() del good, bad, given_alpha, base_alpha if t_alignment: print("Testing reading %s format file %s as an alignment" \ % (t_format, t_filename)) alignment = MultipleSeqAlignment( SeqIO.parse(handle=t_filename, format=t_format)) assert len(alignment) == t_count alignment_len = alignment.get_alignment_length() #Check the record order agrees, and double check the #sequence lengths all agree too. for i in range(t_count): assert compare_record(records[i], alignment[i]) assert len(records[i].seq) == alignment_len print(alignment_summary(alignment)) #Some alignment file formats have magic characters which mean #use the letter in this position in the first sequence. #They should all have been converted by the parser, but if
def write_alignments(alignments, outfile=None, shading_modes=["similar"], logo=True, hideseqs=False, splitN=20, secondary_structure=True, save_dir=""):
    """Render several alignments into one PDF through LaTeX/TeXshade.

    A ``<outfile>.tex`` document is written into ``save_dir``; each alignment
    is emitted with ``write_alignment``.  The ``pdflatex`` binary located next
    to the running Python executable is then invoked, and the intermediate
    ``.tex``/``.aux``/``.log``/``.out`` files plus the ``<name>_*.fasta``
    parts of every alignment are removed.

    Parameters
    ----------
    alignments : iterable
        Each item is either a path to a FASTA file or a
        MultipleSeqAlignment object.
    outfile : str, optional
        Base name (no extension); a random ``alignment_<uuid4>`` by default.
    shading_modes, logo, hideseqs, splitN, secondary_structure :
        Forwarded unchanged to ``write_alignment``.
    save_dir : str
        Directory for the generated files.

    Returns
    -------
    str
        Path of the expected ``<outfile>.pdf``.

    Raises
    ------
    RuntimeError
        If an item is neither a path string nor a MultipleSeqAlignment.
    """
    if outfile is None:
        n2 = str(uuid.uuid4())
        outfile = "alignment_{}".format(n2)

    tex_path = os.path.join(save_dir, "{}.tex".format(outfile))
    # Names of all alignments written, so every temporary FASTA part can be
    # cleaned up afterwards (not only those of the last alignment).
    written_names = []

    # file.write() instead of the Python-2-only ``print >> tex`` statement.
    with open(tex_path, "w") as tex:
        tex.write("\\documentclass[11pt,landscape]{article}\n")
        tex.write("\\usepackage{hyperref}\n")
        tex.write("\\usepackage[paperwidth={}in, paperheight=18in]{{geometry}}\n".format(
            22 / 200. * 200 + 2.5))
        tex.write("\\usepackage{texshade}\n\n")
        tex.write("\\begin{document}\n")
        for aln in alignments:
            if isinstance(aln, str):
                name = os.path.basename(aln)
                msa = MultipleSeqAlignment(list(SeqIO.parse(aln, "fasta")))
            elif isinstance(aln, MultipleSeqAlignment):
                msa = aln
                name = aln.annotations.get("name", "HistoneDB")
            else:
                raise RuntimeError(
                    "Invalid alignments: Must be a path to a FASTA format or a BioPython MultipleSequenceAlignment object."
                )
            written_names.append(name)
            # Pass a copy so the shared default list can never be mutated.
            write_alignment(tex,
                            msa,
                            name,
                            shading_modes=list(shading_modes),
                            logo=logo,
                            hideseqs=hideseqs,
                            splitN=splitN,
                            secondary_structure=secondary_structure,
                            save_dir=save_dir)
        tex.write("\\end{document}\n")

    # Turn latex into pdf
    pdflatex = os.path.join(os.path.dirname(sys.executable), "pdflatex")
    process = Popen([
        pdflatex, "--file-line-error", "--synctex=1",
        "-output-directory={}".format(save_dir), "--save-size=10000",
        tex_path
    ])
    process.communicate()

    for extension in ("tex", "aux", "log", "out"):
        os.remove(os.path.join(save_dir, "{}.{}".format(outfile, extension)))
    for name in written_names:
        for fasta_part in glob.glob(
                os.path.join(save_dir, "{}_*.fasta".format(name))):
            os.remove(fasta_part)

    return os.path.join(save_dir, "{}.pdf".format(outfile))
def __next__(self):
    """Parse and return the next alignment from a sequential PHYLIP handle.

    The header line (two integers: number of sequences and their length) is
    taken from ``self._header`` when a previous call saved one, otherwise
    read from the handle.  Each record starts with a line split by
    ``self._split_id`` into id and sequence; further lines are appended
    (spaces removed) until the declared length is reached.  After the last
    record the handle is advanced to the next header line, which is stored
    in ``self._header`` for the following call.

    Returns a MultipleSeqAlignment.  Raises StopIteration at end of input
    and ValueError for a bad header, an unexpected record count, an
    over-long record, or a sequence containing ``.``.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration
    line = line.strip()
    parts = [x for x in line.split() if x]
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(line)

    if self.records_per_alignment is not None and \
            self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, "
                         "told to expect %i"
                         % (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []

    # By default, expects STRICT truncation / padding to 10 characters.
    # Does not require any whitespace between name and seq.
    for _ in range(number_of_seqs):
        line = handle.readline().rstrip()
        sequence_id, s = self._split_id(line)
        ids.append(sequence_id)
        while len(s) < length_of_seqs:
            # The sequence may be split into multiple lines
            raw_line = handle.readline()
            if not raw_line:
                break  # genuine end of file
            line = raw_line.strip()
            if not line:
                # Blank line inside a record: skip it.  Testing the raw line
                # first is what lets us tell this apart from end of file.
                continue
            s = "".join([s, line.replace(" ", "")])
            if len(s) > length_of_seqs:
                raise ValueError("Found a record of length %i, "
                                 "should be %i"
                                 % (len(s), length_of_seqs))
        if "." in s:
            raise ValueError(_NO_DOTS)
        seqs.append(s)

    while True:
        # Find other alignments in the file
        line = handle.readline()
        if not line:
            break
        if self._is_header(line):
            self._header = line
            break

    records = (SeqRecord(Seq(seq, self.alphabet),
                         id=name, name=name, description=name)
               for (name, seq) in zip(ids, seqs))
    return MultipleSeqAlignment(records, self.alphabet)
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None, weights=None, verbose=0, iterations=5):
    """take a set of discrete states associated with tips of a tree
    and reconstruct their ancestral states along with a GTR model that
    approximately maximizes the likelihood of the states on the tree.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to traits
    missing_data : str, optional
        string indicating missing data; tips carrying this value (or absent
        from `traits`) are encoded with a dedicated "missing" character and
        the value is never treated as a state of its own
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str, optional
        name of file with equilibrium frequencies (tab separated when the
        name ends in 'tsv', comma separated otherwise); any non-string value
        is ignored
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        tuple of treeanc object, forward and reverse alphabets;
        ``(None, None, None)`` when fewer than 2 or more than 180 states
        are found

    Raises
    ------
    TreeTimeError
        raise error if ancestral reconstruction errors out
    """
    # The missing-data marker is not a real state: keeping it would add a
    # phantom, never-observed state to the GTR model.
    unique_states = sorted(s for s in set(traits.values()) if s != missing_data)
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return None, None, None
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return None, None, None

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i in range(nc)]
    missing_char = chr(65+nc)
    letter_to_state = dict(zip(alphabet, unique_states))
    letter_to_state[missing_char]=missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if isinstance(weights, str):
        tmp_weights = pd.read_csv(weights, sep='\t' if weights[-3:]=='tsv' else ',',
                                  skipinitialspace=True)
        # first column: state name, second column: weight
        weights = {row.iloc[0]:row.iloc[1] for ri,row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        # states without an entry in the file get the mean weight
        weights = np.array([weights.get(c, mean_weight) for c in unique_states], dtype=float)
        weights/=weights.sum()
    else:
        weights = None

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous=missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                   seq=Seq(reverse_alphabet[traits[n.name]]
                           if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights, reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise

    for i in range(iterations):
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                 marginal=True, normalized_rate=False,
                                 reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short-branch length assumption. "
               "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
               "along which multiple changes accumulated. This is expected to affect estimates of the "
               "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet
def goANI_dnds_calculation(fna1, faa1, fna2, faa2, gedb, debug=False):
    '''
    Threadable worker computing raw dn/ds counts between two genomes.

    Arguments:
        fna1 : .fna file of genome1
        faa1 : .faa file of genome1
        fna2 : .fna file of genome2
        faa2 : .faa file of genome2
        gedb : datatable with one row per gene pair; the columns 'qry_id'
               and 'sbj_id' name the genes to align
        debug : when true, stop after 10 successfully processed pairs

    Returns:
        dndb : data-table with columns qry_id, sbj_id, S_changed, S_sites,
               N_changed, N_sites; pairs that raised an exception are
               reported with all four counts set to 0
    '''
    columns = ('qry_id', 'sbj_id', 'S_changed', 'S_sites', 'N_changed', 'N_sites')
    table = defaultdict(list)

    def add_row(*values):
        # one value per output column, in the fixed column order
        for column, value in zip(columns, values):
            table[column].append(value)

    def without_stop(protein):
        # drop a trailing '*' (stop codon symbol) if present
        if protein[-1] == '*':
            return protein[:-1]
        return protein

    # load .fasta files
    g1n = SeqIO.to_dict(
        SeqIO.parse(fna1, 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA()))
    g1a = SeqIO.to_dict(SeqIO.parse(faa1, 'fasta', alphabet=IUPAC.protein))
    g2n = SeqIO.to_dict(
        SeqIO.parse(fna2, 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA()))
    g2a = SeqIO.to_dict(SeqIO.parse(faa2, 'fasta', alphabet=IUPAC.protein))

    # set up aligner
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -12
    aligner.extend_gap_score = -3

    # for every gene-pair to align
    n_done = 0
    for _, row in gedb.iterrows():
        try:
            # get the sequences
            protein1 = without_stop(g1a[row['qry_id']])
            protein2 = without_stop(g2a[row['sbj_id']])
            nucl1 = g1n[row['qry_id']]
            nucl2 = g2n[row['sbj_id']]

            # align them
            alignments = aligner.align(protein1.seq, protein2.seq)

            # Arbitrary cutoff to make sure this doesn't bug out
            if len(alignments) > 10000:
                print("Ahhh! {0} vs {1} has {2} alignments".format(
                    row['qry_id'], row['sbj_id'], len(alignments)))
                raise Exception('Too many alignments exception')

            # convert to multi-sequence alignment: first and second-to-last
            # lines of the printed alignment are the two aligned sequences
            printed = str(min(alignments)).split('\n')
            msa = MultipleSeqAlignment([
                SeqRecord(Seq(text, alphabet=IUPAC.protein))
                for text in (printed[0], printed[-2])
            ])

            # convert to codon alignment
            codon_aln = Bio.codonalign.build(msa, [nucl1, nucl2])

            # calculate dn/ds on the codon alignment
            dS, S, dN, N = custom_dn_ds(codon_aln._records[0],
                                        codon_aln._records[1])

            # save
            add_row(row['qry_id'], row['sbj_id'], dS, S, dN, N)

            n_done += 1
            if debug and n_done >= 10:
                break

        except Exception as e:
            print("Alignment exception- {0}".format(e))
            add_row(row['qry_id'], row['sbj_id'], 0, 0, 0, 0)

    dnDb = pd.DataFrame(table)
    return dnDb
MSA0 = JAlign.readlines()

# Next turn each line into sequence objects:
#   MSA     - the full aligned line as a SeqRecord with id "1", "2", ...
#   MSAList - the same line restricted to characters that are not '-'
#             and are equal to their own upper-case form
MSA = []
MSAList = []
count = 1
for seq in MSA0:
    # Strip only the line terminator.  Slicing off the last character
    # (seq[:-1]) would lose a residue when the final line has no newline
    # and would keep the '\r' of CRLF files.
    Seqlist = list(seq.rstrip('\r\n'))
    Seq1 = Seq(''.join(Seqlist))
    Seqlist = [char for char in Seqlist if char != '-' and char.upper() == char]
    SEQ = Seq(''.join(Seqlist))
    MSA.append(SeqRecord(Seq1, id=str(count)))
    MSAList.append(SEQ)
    count += 1
MSA = MultipleSeqAlignment(MSA)


# indices for the maps
def AlignIndices(MSAline):
    """Return the positions in MSAline whose character is kept.

    A position is kept when its character is not '-' and equals its own
    upper-case form (the same filter used to build MSAList).
    """
    Indices = [position for position, char in enumerate(MSAline)
               if char != '-' and char.upper() == char]
    return(Indices)


def TreeDistMat(AlignObject):
    """Return the 'identity' distance matrix computed by DistanceCalculator."""
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(AlignObject)
    return(dm)
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or a single
       SeqRecord.
     - handle    - File handle object to write to, or filename as string.
     - format    - lower case string describing the file format to write.

    Note if providing a file handle, your code should close the handle
    after calling this function (to ensure the data gets flushed to disk).

    Three kinds of writers are tried in turn: a per-record string
    formatter, a sequence writer class, and finally an alignment writer
    from Bio.AlignIO (all records are then written as one alignment).

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Argument sanity checks, meant to give helpful error messages.
    if not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if not format.islower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        # e.g. list of SeqRecord objects
        raise TypeError("Check arguments, handle should NOT be a list")

    if isinstance(sequences, SeqRecord):
        # This raised an exception in older versions of Biopython
        sequences = [sequences]

    # 1) Formats with a simple record -> string function.
    to_string = _FormatToString.get(format)
    if to_string is not None:
        count = 0
        with as_handle(handle, "w") as stream:
            for count, record in enumerate(sequences, start=1):
                stream.write(to_string(record))
        return count

    # 2) Formats with a dedicated sequence writer class.
    writer_cls = _FormatToWriter.get(format)
    if writer_cls is not None:
        count = writer_cls(handle).write_file(sequences)
        if isinstance(count, int):
            return count
        raise RuntimeError(
            "Internal error - the underlying %s writer "
            "should have returned the record count, not %r" % (format, count))

    # 3) Alignment-only formats: turn all the records into a single
    #    alignment and write that using Bio.AlignIO.
    if format in AlignIO._FormatToWriter:
        alignment = MultipleSeqAlignment(sequences)
        alignment_count = AlignIO.write([alignment], handle, format)
        if alignment_count != 1:
            raise RuntimeError("Internal error - the underlying writer "
                               "should have returned 1, not %r" % alignment_count)
        return len(alignment)

    # Nothing can write this format; say whether it is at least readable.
    readable = format in _FormatToIterator or format in AlignIO._FormatToIterator
    if readable:
        raise ValueError("Reading format '%s' is supported, but not writing" % format)
    raise ValueError("Unknown format '%s'" % format)
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None, weights=None, verbose=0):
    """Reconstruct ancestral discrete states on a tree together with a GTR model.

    Every distinct value in `traits` (other than `missing_data`) is mapped to
    a single letter, tips are turned into one-letter pseudo sequences and
    ancestral states are inferred by marginal maximum likelihood.

    Parameters
    ----------
    tree : passed unchanged to TreeAnc
    traits : dict
        maps tip names to their discrete state
    missing_data : str, optional
        marker for unknown states; tips with this value (or absent from
        `traits`) get a dedicated "missing" character and the marker is
        never treated as a state of its own
    pc : float, optional
        pseudo-counts forwarded to the ancestral inference
    sampling_bias_correction : float, optional
        if truthy, the inferred rate `gtr.mu` is multiplied by this factor
        and ancestral states are inferred again
    weights : str, optional
        file with state weights (tab separated when the name ends in 'tsv',
        comma separated otherwise); any non-string value is ignored
    verbose : int, optional
        forwarded to TreeAnc

    Returns
    -------
    tuple (treeanc, letter_to_state, reverse_alphabet) on success, or the
    integer 1 when fewer than 2 / more than 180 states are found or the
    reconstruction reports `ttconf.ERROR`.
    """
    # The missing-data marker is not a real state: keeping it would add a
    # phantom, never-observed state to the GTR model.
    unique_states = sorted(s for s in set(traits.values()) if s != missing_data)
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return 1
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return 1

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i in range(nc)]
    missing_char = chr(65+nc)
    letter_to_state = dict(zip(alphabet, unique_states))
    letter_to_state[missing_char]=missing_data
    reverse_alphabet = {v:k for k,v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if isinstance(weights, str):
        tmp_weights = pd.read_csv(weights, sep='\t' if weights[-3:]=='tsv' else ',',
                                  skipinitialspace=True)
        # first column: state name, second column: weight
        weights = {row.iloc[0]:row.iloc[1] for ri,row in tmp_weights.iterrows()}
        mean_weight = np.mean(list(weights.values()))
        # states without an entry in the file get the mean weight
        weights = np.array([weights.get(c, mean_weight) for c in unique_states], dtype=float)
        weights/=weights.sum()
    else:
        weights = None

    # set up dummy matrix
    W = np.ones((nc,nc), dtype=float)

    mugration_GTR = GTR.custom(pi = weights, W=W, alphabet = np.array(alphabet))
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous=missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False
    pseudo_seqs = [SeqRecord(id=n.name,name=n.name,
                   seq=Seq(reverse_alphabet[traits[n.name]]
                           if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    ndiff = treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
            store_compressed=False, pc=pc, marginal=True, normalized_rate=False,
            fixed_pi=weights)

    if ndiff==ttconf.ERROR: # if reconstruction failed, exit
        return 1

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction
        treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                     marginal=True, normalized_rate=False)

    return treeanc, letter_to_state, reverse_alphabet
return features #prof=cons_prof(alignment) #pylab.plot(prof) if __name__ == '__main__': human_h2a_z_core = Seq( 'SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLI-KATIAGGGVIPHIHKSLIG' ) xenopus_h2a_core = Seq( 'TRSSRAGLQFPVGRVHRLLRKGNYAE-RVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLP' ) # human_h2a_z_core=Seq('SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLIKATIAGGGVIPHIHKSLIG') msa = MultipleSeqAlignment( [SeqRecord(xenopus_h2a_core, id='H2A', name='H2A')]) features = get_hist_ss_in_aln_for_shade(msa, below=True) # features=[{'style':'fill:$\uparrow$','sel':[5,10],'text':'test'}] print(features) shade_aln2png(msa, filename='default', shading_modes=['charge_functional'], legend=False, features=features, title='', logo=False, hideseqs=False, splitN=20, setends=[], ruler=False, show_seq_names=False,
def build_hsp():
    """Build the two-row query/match alignment for the current hit.

    Works on names from the enclosing scope (query_seq, query_tags,
    match_seq, match_tags, header_tags, align_tags, global_tags, alphabet,
    handle, ...).  The aligned regions are cut out of both sequences, wrapped
    in SeqRecord objects and appended to a new MultipleSeqAlignment whose
    private ``_annotations`` dict holds the header tags overlaid with the
    alignment tags.

    Raises ValueError when no tag data is available at all or when the two
    extracted regions differ in length.
    """
    if not query_tags and not match_tags:
        raise ValueError("No data for query %r, match %r"
                         % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    evalue = align_tags.get("fa_expect")  # looked up but not used below
    tool = global_tags.get("tool", "").upper()

    q = _extract_alignment_region(query_seq, query_tags)
    if tool in ["TFASTX"] and len(match_seq) == len(q):
        # Quick hack until I can work out how -, * and / characters
        # and the apparent mix of aa and bp coordinates works.
        m = match_seq
    else:
        m = _extract_alignment_region(match_seq, match_tags)
    if len(q) != len(m):
        message = """Darn... amino acids vs nucleotide coordinates?
            tool: {0}
            query_seq: {1}
            query_tags: {2}
            {3} length: {4}
            match_seq: {5}
            match_tags: {6}
            {7} length: {8}
            handle.name: {9}
            """.format(tool, query_seq, query_tags, q, len(q), match_seq, match_tags, m, len(m), handle.name)
        raise ValueError(message)

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # See also Bio/AlignIO/MafIO.py for same requirement.
    # For now, store the annotation a new private property.
    # Both the query header tags and the alignment tags are recorded;
    # alignment tags win on a key clash because they are applied last.
    alignment._annotations = {}
    alignment._annotations.update(header_tags.items())
    alignment._annotations.update(align_tags.items())

    def add_row(region, tags, row_id, row_name, row_descr):
        # Create one row, append it, and only afterwards refine its alphabet
        # (the record is appended while it still carries `alphabet`).
        row = SeqRecord(
            Seq(region, alphabet),
            id=row_id,
            name=row_name,
            description=row_descr,
            annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        row._al_start = int(tags["al_start"])
        row._al_stop = int(tags["al_stop"])
        alignment.append(row)

        # This is still a very crude way of dealing with the alphabet:
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            sq_type = tags["sq_type"]
            if sq_type == "D":
                row.seq.alphabet = generic_dna
            elif sq_type == "p":
                row.seq.alphabet = generic_protein
        if "-" in region and not hasattr(row.seq.alphabet, "gap_char"):
            row.seq.alphabet = Gapped(row.seq.alphabet, "-")

    # Query
    # =====
    add_row(q, query_tags, query_id, "query", query_descr)

    # Match
    # =====
    add_row(m, match_tags, match_id, "match", match_descr)

    return alignment
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library sequences
    as an MultipleSeqAlignment object containing two rows.

    One hit is consumed per call, in this fixed order: the ">>" match
    description line, the alignment tag section, the query block (a
    ">... .." identifier line, its tags, then sequence lines), the match
    block (same layout) and an optional "; al_cons:" consensus section.
    The line that ends the hit is saved in ``self._header`` so the following
    call can resume from it.

    Raises:
        StopIteration: at end of file, or when the ">>><<<" line is reached.
        ValueError: when an expected ">>" / ">... .." line is missing, when
            the extracted query and match regions differ in length, or when
            "sw_overlap" disagrees with the extracted length.
        AssertionError: on the structural sanity checks made with ``assert``.

    NOTE(review): ``dict.iteritems()`` is used below, which only exists on
    Python 2 dictionaries.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        # No saved header, so read a fresh line from the handle.
        line = handle.readline()
    if not line:
        raise StopIteration

    if line.startswith("#"):
        # Skip the file header before the alignments.  e.g.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
        # Now should be some alignments, but if not we move onto the next query
    if not line:
        # End of file
        raise StopIteration
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer...
        raise StopIteration

    # Should start >>... and not >>>...
    assert line[0:2] == ">>" and not line[2] == ">", line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match ID line, then more tags.
    # e.g.
    """
    >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    ; fa_frame: f
    ; fa_initn: 52
    ; fa_init1: 52
    ; fa_opt: 70
    ; fa_z-score: 105.5
    ; fa_bits: 27.5
    ; fa_expect: 0.082
    ; sw_score: 70
    ; sw_ident: 0.279
    ; sw_sim: 0.651
    ; sw_overlap: 43
    """
    if (not line[0:2] == ">>") or line[0:3] == ">>>":
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data, e.g.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    # The tag parser is expected to hand back the first non-tag line.
    assert not line[0:2] == "; "

    # Then we have the alignment numbers and sequence for the query
    """
    >gi|10955265| ..
    ; sq_len: 346
    ; sq_offset: 1
    ; sq_type: p
    ; al_start: 197
    ; al_stop: 238
    ; al_display_start: 167
    DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    GEYFTENKPKYIIREIHQET
    """
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    # The identifier on this line must be a prefix of the query description.
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned query sequence (with leading flanking region)
    while not line[0] == ">":
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data
    """
    >gi|152973545|ref|YP_001338596.1| ..
    ; sq_len: 242
    ; sq_type: p
    ; al_start: 52
    ; al_stop: 94
    ; al_display_start: 22
    IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    QDFAFTRKMRREARQVEQSW
    """
    # Match identifier
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError(
            "Expected line starting '>' and ending '..', got '%s'" %
            repr(line))
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned query sequence with flanking region...
    # but before that, since FASTA 35.4.1 there can be an consensus here,
    """
    ; al_cons:
    .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
    etc
    """
    while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; ":
        # The only tag accepted at this point is the consensus marker.
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        # If we do anything with this in future, must remove any flanking region.
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line[0:2] == "; "
    else:
        align_consensus = None
    assert (line[0] == ">" or ">>>" in line)
    # Save the line that ended this hit; the next call starts from it.
    self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths, apparently
    # because in the m10 format leading gaps are added but not trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(
        query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(
        match_seq, match_annotation)
    # How can we do this for the (optional) consensus?

    # The "sq_offset" values can be specified with the -X command line option.
    # They appear to just shift the origin used in the calculation of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError(
            "Problem parsing the alignment sequence coordinates, "
            "following should be the same length but are not:\n"
            "%s - len %i\n%s - len %i" %
            (query_align_seq, len(query_align_seq), match_align_seq,
             len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    # Alignment tags are copied second, so they overwrite header tags that
    # share a key.
    for key, value in self._query_header_annotation.iteritems():
        alignment._annotations[key] = value
    for key, value in alignment_annotation.iteritems():
        alignment._annotations[key] = value

    # Query
    # =====
    record = SeqRecord(
        Seq(query_align_seq, alphabet),
        id=self._query_descr.split(None, 1)[0].strip(","),
        name="query",
        description=self._query_descr,
        annotations={"original_length": int(query_annotation["sq_len"])})
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(query_annotation["al_start"])
    record._al_stop = int(query_annotation["al_stop"])
    alignment.append(record)

    # TODO - What if a specific alphabet has been requested?
    # TODO - Use an IUPAC alphabet?
    # TODO - Can FASTA output RNA?
    # The record's alphabet is only refined after it has been appended.
    if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
        if query_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif query_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in query_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Match
    # =====
    record = SeqRecord(
        Seq(match_align_seq, alphabet),
        id=match_descr.split(None, 1)[0].strip(","),
        name="match",
        description=match_descr,
        annotations={"original_length": int(match_annotation["sq_len"])})
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(match_annotation["al_start"])
    record._al_stop = int(match_annotation["al_stop"])
    alignment.append(record)

    # This is still a very crude way of dealing with the alphabet:
    if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
        if match_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif match_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in match_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
def setUp(self):
    """Create three nucleotide/protein fixture sets on the test instance.

    Each set stores a protein MultipleSeqAlignment (``self.alnN``) and the
    list of nucleotide records with matching ids (``self.seqlistN``).  Set 3
    additionally stores codon table 3 as ``self.codontable3``.
    """

    def dna(letters, ident):
        # A fresh alphabet instance per record, as in the literal form.
        return SeqRecord(
            Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()), id=ident)

    def protein(letters, ident):
        return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=ident)

    # Test set 1
    self.aln1 = MultipleSeqAlignment([
        protein("SGTARTKLLLLLAALCAAGGALE", "pro1"),
        protein("SGTSRTKRLLLLAALGAAGGALE", "pro2"),
    ])
    self.seqlist1 = [
        dna("TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG",
            "pro1"),
        dna("TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG",
            "pro2"),
    ]

    # Test set 2
    # M K K H E L(F)L C Q G T S N K L T Q(L)L G T F E D H F L S L Q R M F N N C E V V
    # The second and third nucleotide records are the first one with a few
    # bases removed (presumably to exercise frameshift handling -- confirm
    # against the tests that use this fixture).
    self.aln2 = MultipleSeqAlignment([
        protein(
            "MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
            "pro1"),
        protein(
            "MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
            "pro2"),
        protein(
            "MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
            "pro3"),
    ])
    self.seqlist2 = [
        dna("ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro1"),
        dna("ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro2"),
        dna("ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro3"),
    ]

    # Test set 3
    # use Yeast mitochondrial codon table
    self.aln3 = MultipleSeqAlignment([
        protein("MARDHPVGHWYDRVYLQSSNTSFTKTIQ", "pro6"),
        protein("MARHHPVEHWYDRVYLQSSNVSTTKTIQ", "pro7"),
        protein("MAGDHPVGHWYDRVYTQSSNHSFTMTIQ", "pro8"),
    ])
    self.seqlist3 = [
        dna("ATGGCAAGGGACCACCCAGTTGGGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACCTTTCTTTTCTCAAGACCATCCAG",
            "pro6"),
        dna("ATGGCAAGGCACCATCCAGTTGAGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACGTGTCTCTGCTCAAGACCATCCAG",
            "pro7"),
        dna("ATGGCAGGGGACCACCCAGTTGGGCACTGATATGATCGTGTGTATCTGCAGAGTAGTAACCACTCTTTTCTCATGACCATCCAG",
            "pro8"),
    ]
    self.codontable3 = CodonTable.unambiguous_dna_by_id[3]
def MafIterator(handle, seq_count=None):
    """Iterate over a MAF file handle as MultipleSeqAlignment objects.

    Lines are pulled from ``handle`` one at a time.  An "a" line opens a
    block, every following "s" line adds one SeqRecord to it, and a blank
    line (or the end of the input) closes the block and yields it.

    Args:
        handle: iterator producing the lines of a MAF file.
        seq_count: if not None, the number of records every block is
            asserted to contain.

    Yields:
        One MultipleSeqAlignment per block, with the key/value pairs of its
        "a" line stored in the private ``_annotations`` attribute.

    Raises:
        ValueError: for a malformed "a" or "s" line, a dot in the first
            sequence of a block, or an unrecognised line inside a block.
    """
    strand_codes = {"+": 1, "-": -1}
    in_a_bundle = False
    annotations = []
    records = []
    while True:
        # An exhausted handle reads as "", which lets the blank-line branch
        # below flush a final block that has no trailing blank line.
        line = next(handle, "")

        if not in_a_bundle:
            if line.startswith("a"):
                # start a bundle of records
                in_a_bundle = True
                annot_strings = line.strip().split()[1:]
                if len(annot_strings) != line.count("="):
                    raise ValueError("Error parsing alignment - invalid key in 'a' line")
                annotations = dict(
                    a_string.split("=") for a_string in annot_strings)
            elif line.startswith("#"):
                # ignore comments
                pass
            elif not line:
                break
            continue

        if line.startswith("s"):
            # add a SeqRecord to the bundle
            # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
            line_split = line.strip().split()
            if len(line_split) != 7:
                raise ValueError(
                    "Error parsing alignment - 's' line must have 7 fields"
                )

            # convert MAF-style +/- strand to biopython-type 1/-1
            # TODO: issue warning, set to 0?  (anything else maps to 1)
            strand = strand_codes.get(line_split[4], 1)

            anno = {
                "start": int(line_split[2]),
                "size": int(line_split[3]),
                "strand": strand,
                "srcSize": int(line_split[5]),
            }

            sequence = line_split[6]
            # interpret a dot/period to mean the same as the first sequence
            if "." in sequence:
                if not records:
                    raise ValueError(
                        "Found dot/period in first sequence of alignment"
                    )
                ref = str(records[0].seq)
                sequence = "".join(
                    ref_letter if letter == "." else letter
                    for (letter, ref_letter) in zip(sequence, ref)
                )

            records.append(
                SeqRecord(
                    Seq(sequence),
                    id=line_split[1],
                    name=line_split[1],
                    description="",
                    annotations=anno,
                )
            )
        elif line.startswith(("i", "e", "q", "#")):
            # "i": TODO - information about what is in the aligned species
            #      DNA before and after the immediately preceding "s" line
            # "e": TODO - information about the size of the gap between the
            #      alignments that span the current block
            # "q": TODO - quality of each aligned base for the species.
            #      Need to find documentation on this, looks like ASCII 0-9
            #      or gap?  Can then store in each SeqRecord's
            #      .letter_annotations dictionary, perhaps as the raw string
            #      or turned into integers / None for gap?
            # "#": ignore comments (not sure whether comments are in the
            #      maf specification, though)
            pass
        elif not line.strip():
            # end a bundle of records
            if seq_count is not None:
                assert len(records) == seq_count
            alignment = MultipleSeqAlignment(records)
            # TODO - Introduce an annotated alignment class?
            # See also Bio/AlignIO/FastaIO.py for same requirement.
            # For now, store the annotation a new private property:
            alignment._annotations = annotations
            yield alignment

            in_a_bundle = False
            annotations = []
            records = []
        else:
            raise ValueError(
                "Error parsing alignment - unexpected line:\n%s" % (line,)
            )
def _is_suspect_column(counts, iupac_bases):
    """Return True when one alignment column counts as a polymorphic site.

    ``counts`` maps each character seen in the column to its frequency and
    ``iupac_bases`` is the collection of ambiguity codes.
    """
    n_states = len(counts)
    if n_states == 1:
        # Invariant column.
        return False
    has_gap = "-" in counts
    n_iupac = sum(1 for base in counts if base in iupac_bases)
    if n_states == 2:
        # Two states only count when neither is a gap nor an ambiguity code.
        return not has_gap and n_iupac == 0
    if n_states == 3:
        # "gap + base + ambiguity code" is tolerated; everything else counts.
        return not (has_gap and n_iupac == 1)
    return True


def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size=20, Max_p_sites=4):
    """Drop columns that sit in windows with too many polymorphic sites.

    For every alignment file in the "s6_trimal/" sibling of ``seq_directory``
    the non-outgroup rows are scanned in windows of ``window_size`` columns.
    When a window holds more than ``Max_p_sites`` polymorphic columns, those
    columns are collected.  The first and last 10 columns are always
    collected as well.  The collected columns are removed by running
    ``trimal -selectcols`` and the result is written below
    "s7_well_trimal//s1_rm_polymorphism_sites/".

    Changes relative to the previous implementation:

    * A column is treated as gapped whenever it contains "-".  Previously the
      flag was overwritten on every loop iteration, so a gap was only noticed
      when it was the last (least frequent) entry of ``most_common()``.
    * Column indices are converted to plain ``int`` before being formatted,
      so the trimal argument no longer depends on how NumPy prints scalars.
    * The leading/trailing 10-column ranges are clamped to the alignment
      length, so short alignments no longer produce negative or
      out-of-range indices.
    * File paths are shell-quoted before being placed in the command line.
    * The escape sequences ``"\\{"`` / ``"\\}"`` are written explicitly.

    Args:
        seq_directory: path containing "s1_Gene/"; the input and output
            directories are derived from it by string replacement.
        outgroup_path: passed to ``input_outgroup`` to obtain the record ids
            that are excluded from the polymorphism scan.
        window_size: number of columns per window.
        Max_p_sites: a window is flagged when it has more polymorphic
            columns than this.

    Returns:
        None.  The function prints progress and writes files as a side
        effect; the exit status of the shell commands is not checked.
    """
    import shlex  # local import: the file-level import block is not visible here

    ### define iupac
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")
    edge_columns = 10

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")
    ### mkdir output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue
        output_directory_file = output_directory + file_name
        fasta_name = genes_result_s6 + file_name

        ### read each alignment sequences
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)
            alignment = AlignIO.read(sequence, 'fasta')

            ### generate a new alignment sequences without outgroups.
            align = MultipleSeqAlignment([])
            for record in alignment:
                if record.id not in outgroups:
                    align.extend(
                        [SeqRecord(Seq(str(record.seq)), id=str(record.id))])
            print(align)

            ### change alignment to an array.
            align_array = np.array([list(rec) for rec in align])
            ### calculate the whole length of the alignment
            total_length = align.get_alignment_length()

            total_wrong_poly_sites = []
            ### using sliding windows; for each one collect its polymorphic columns.
            for each in window(range(total_length), window_size):
                column_position = [
                    column for column in each
                    if _is_suspect_column(
                        Counter(align_array[:, column]), iupac_bases)
                ]
                ### if there are more than Max_p_sites polymorphic sites in the
                ### window, select those sites positions.
                if len(column_position) > float(Max_p_sites):
                    print(column_position)
                    total_wrong_poly_sites.extend(column_position)

            ### always drop the alignment edges; clamp for short alignments.
            edge = min(edge_columns, total_length)
            total_wrong_poly_sites.extend(range(edge))
            total_wrong_poly_sites.extend(
                range(total_length - edge, total_length))

            ### generate the unique positions (sorted, plain ints)
            unique_wrong_sites = sorted(
                set(int(site) for site in total_wrong_poly_sites))
            print(len(unique_wrong_sites))

            ### operating: if any column was selected, use trimal to remove
            ### those sites.  otherwise, copy the gene to the new folder.
            if len(unique_wrong_sites) > 0:
                # The shell turns "\{ 0,1,2 \}" into the three arguments
                # "{", "0,1,2" and "}".
                cmd_selected_col = (
                    "\\{ "
                    + ",".join(str(site) for site in unique_wrong_sites)
                    + " \\}"
                )
                print(cmd_selected_col)
                cmd = ("trimal -in " + shlex.quote(fasta_name)
                       + " -out " + shlex.quote(output_directory_file)
                       + " -selectcols " + cmd_selected_col)
                print(cmd)
                os.system(cmd)
            else:
                cmd_2 = ("cp " + shlex.quote(fasta_name) + " "
                         + shlex.quote(output_directory_file))
                print(cmd_2)
                os.system(cmd_2)
def __next__(self):
    """Parse the next alignment from the handle.

    Reads one Stockholm record: the "# STOCKHOLM 1.0" header, then sequence
    lines and "#=GF" / "#=GC" / "#=GS" / "#=GR" markup lines, until either
    the end of the file or the header of the following record (which is
    saved in ``self._header`` for the next call).

    Side effects: sets ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` and ``self.seq_col_annotation`` from the parsed
    data before the records are built.

    Changes relative to the previous implementation:

    * A "#=GF" line whose free text is empty is stored with ``""`` as its
      text instead of failing on tuple unpacking, matching what the "#=GS"
      branch already does.
    * "#=GC" lines are split into exactly two parts (``maxsplit=1``), which
      is what the two-name unpacking requires.
    * The "accession" annotation is set once (in the SeqRecord constructor)
      rather than twice with the same value.

    Returns:
        A MultipleSeqAlignment.  Per-column "#=GC" data is stored in
        ``column_annotations`` and the "#=GR" data in the private
        ``_annotations`` attribute.

    Raises:
        StopIteration: if the handle is exhausted, or the record holds no
            sequences.
        ValueError: for a missing header, an unsplittable sequence line, a
            record count that differs from ``self.records_per_alignment``,
            sequences of unequal length, or a "#=GC" entry whose length is
            not the alignment length.
        AssertionError: if a sequence line follows the "//" terminator.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        # Empty file - just give up.
        raise StopIteration
    if line.strip() != "# STOCKHOLM 1.0":
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.

    seqs = {}
    ids = {}  # Really only need an OrderedSet, but python lacks this
    gs = {}
    gr = {}
    gf = {}  # collected while parsing, but not used further below
    gc = {}
    passed_end_alignment = False
    while True:
        line = handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == "# STOCKHOLM 1.0":
            # Start of the next record: keep it for the following call.
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError(
                    "Could not split line into identifier and sequence:\n"
                    + line)
            seq_id, seq = parts
            if seq_id not in ids:
                ids[seq_id] = True
            seqs.setdefault(seq_id, "")
            # Interleaved blocks append to the same entry; "." means gap.
            seqs[seq_id] += seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            if line[:5] == "#=GF ":
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                # The free text may be empty, in which case only the
                # feature name is present on the line.
                fields = line[5:].strip().split(None, 1)
                feature = fields[0]
                text = fields[1] if len(fields) > 1 else ""
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                gf.setdefault(feature, []).append(text)
            elif line[:5] == "#=GC ":
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                feature, text = line[5:].strip().split(None, 1)
                # append to any previous entry
                # Might be interleaved blocks, so can't check length yet
                gc[feature] = gc.get(feature, "") + text.strip()
            elif line[:5] == "#=GS ":
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                try:
                    seq_id, feature, text = line[5:].strip().split(None, 2)
                except ValueError:
                    # Free text can sometimes be empty, which a one line split throws an error for.
                    # See https://github.com/biopython/biopython/issues/2982 for more details
                    seq_id, feature = line[5:].strip().split(None, 1)
                    text = ""
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
            elif line[:5] == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                per_seq = gr.setdefault(seq_id, {})
                # append to any previous entry
                # Might be interleaved blocks, so can't check length yet
                per_seq[feature] = per_seq.get(feature, "") + text.strip()
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids.keys()
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        raise StopIteration

    if (self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (len(ids), self.records_per_alignment))

    alignment_length = len(list(seqs.values())[0])
    records = []  # Alignment obj will put them all in a list anyway
    for seq_id in ids:
        seq = seqs[seq_id]
        if alignment_length != len(seq):
            raise ValueError(
                "Sequences have different lengths, or repeated identifier"
            )
        name, start, end = self._identifier_split(seq_id)
        # Accession will be overridden by _populate_meta_data if an explicit
        # accession is provided.
        record = SeqRecord(
            Seq(seq),
            id=seq_id,
            name=name,
            description=seq_id,
            annotations={"accession": name},
        )

        if start is not None:
            record.annotations["start"] = start
        if end is not None:
            record.annotations["end"] = end

        self._populate_meta_data(seq_id, record)
        records.append(record)

    # Every per-column annotation must span the whole alignment.
    for k, v in gc.items():
        if len(v) != alignment_length:
            raise ValueError("%s length %i, expected %i" %
                             (k, len(v), alignment_length))
    alignment = MultipleSeqAlignment(records)

    for k, v in sorted(gc.items()):
        if k in self.pfam_gc_mapping:
            alignment.column_annotations[self.pfam_gc_mapping[k]] = v
        elif k.endswith("_cons") and k[:-5] in self.pfam_gr_mapping:
            alignment.column_annotations[self.pfam_gr_mapping[k[:-5]]] = v
        else:
            # Ignore it?
            alignment.column_annotations["GC:" + k] = v

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = gr

    return alignment
def next(self):
    """Parse and return the next Clustal-style alignment from the handle.

    The header line must start with one of the whitelisted program names
    (CLUSTAL, PROBCONS, MUSCLE, MSAPROBS).  The first block of sequence
    lines supplies the identifiers; every later block must repeat them in
    the same order and its sequence text is appended.  If the first block
    has a consensus line, one is expected after every block.

    A header line belonging to a following, concatenated alignment is saved
    in ``self._header`` for the next call.

    Returns:
        A MultipleSeqAlignment.  A version found in the header is stored as
        the private ``_version`` attribute and a consensus as ``_star_info``.

    Raises:
        StopIteration: if no line can be read, or no sequence data is found.
        ValueError: for an unknown header, a line that cannot be parsed,
            identifiers that are out of order, or a record count that
            differs from ``self.records_per_alignment``.
        AssertionError: on the structural sanity checks made with ``assert``.
    """
    handle = self.handle
    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        # No saved header, so read a fresh line from the handle.
        line = handle.readline()
    if not line:
        raise StopIteration

    # Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
    if line.strip().split()[0] not in known_headers:
        raise ValueError(
            "%s is not a known CLUSTAL header: %s" %
            (line.strip().split()[0], ", ".join(known_headers)))

    # find the clustal version in the header line
    # (the first word that starts with a digit, once any surrounding
    # parentheses have been removed)
    version = None
    for word in line.split():
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
        if word[0] in '0123456789':
            version = word
            break

    # There should be two blank lines after the header line
    line = handle.readline()
    while line.strip() == "":
        line = handle.readline()

    # If the alignment contains entries with the same sequence
    # identifier (not a good idea - but seems possible), then this
    # dictionary based parser will merge their sequences.  Fix this?
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  # Used to extract the consensus

    # Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            # Sequences identifier...
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            # Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            # Every row of the block must occupy the same columns.
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" %
                        line)
                # The count is compared against the non-gap letters only.
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        elif line[0] == " ":
            # Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            # Nothing but whitespace may surround the consensus columns.
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            # No consensus
            break
        line = handle.readline()
        if not line:
            break  # end of file

    assert line.strip() == ""
    assert seq_cols is not None

    # Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    # Loop over any remaining blocks...
    done = False
    while not done:
        # There should be a blank line between each block.
        # Also want to ignore any consensus line from the
        # previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            # Found concatenated alignment.
            done = True
            self._header = line
            break

        for i in range(len(ids)):
            assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != ids[i]:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'" %
                    (fields[0], ids[i]))

            if fields[1] != line[seq_cols]:
                # The sequence text is narrower or wider than in the
                # previous block (it must still start in the same column),
                # so recompute the column slice.
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                    seq_cols, start)
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            # Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" %
                        line)
                # Here the count is compared against the letters
                # accumulated so far, not just this block.
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            # Read in the next line
            line = handle.readline()
        # There should now be a consensus line
        if consensus:
            assert line[0] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        raise StopIteration

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (len(ids), self.records_per_alignment))

    # A generator expression: the records are built when the alignment
    # consumes it.
    records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
               for (i, s) in zip(ids, seqs))
    alignment = MultipleSeqAlignment(records, self.alphabet)
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        alignment_length = len(seqs[0])
        assert len(consensus) == alignment_length, \
            "Alignment length is %i, consensus length is %i, '%s'" \
            % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment