def translate_alignment(self):
    '''
    Translate the alignment and calculate the amino acid consensus.

    Reads ``self.aln`` and the ``self.cds`` dict (keys ``'begin'``, ``'end'``
    and ``'pad'``). Sets ``self.aa_aln``, ``self.aa_summary_info`` and
    ``self.aa_consensus``, then calls
    ``self.calculate_aa_allele_frequencies()``.

    A negative ``cds['end']`` is resolved relative to the alignment length
    (``-1`` selects up to and including the last column).
    '''
    begin = self.cds['begin']
    pad = self.cds['pad']
    self.aa_aln = MultipleSeqAlignment([])
    if self.cds['end'] >= 0:
        last_base = self.cds['end']
    else:
        last_base = self.aln.get_alignment_length() + self.cds['end'] + 1

    # translate, add cds['pad'] Xs at the beginning
    # TODO: make translation gap-tolerant
    for seq in self.aln:
        try:
            tmp_seq = 'X' * pad + seq.seq[begin:last_base].translate()
        except Exception:
            # Translation failed: substitute an all-X placeholder of the
            # length a successful translation would have had. The length is
            # derived from last_base (not the raw, possibly negative
            # cds['end']) so every row keeps the same width, and floor
            # division keeps the repeat count an int on Python 3.
            tmp_seq = Seq.Seq('X' * (pad + (last_base - begin) // 3),
                              generic_protein)
            print(self.cds)
        if self.cds['end'] - begin == 0:
            # Empty coding region: represent it by a single unknown residue.
            tmp_seq = Seq.Seq('X', generic_protein)
        self.aa_aln.append(
            SeqRecord.SeqRecord(seq=tmp_seq, name=seq.name, id=seq.id))

    # process amino acid alignment
    self.aa_summary_info = AlignInfo.SummaryInfo(self.aa_aln)
    self.aa_consensus = self.aa_summary_info.dumb_consensus()
    self.calculate_aa_allele_frequencies()
def get_consensus(seqs, counts,
                  clustalo='/home/mchan/software/clustalo-1.2.3-Ubuntu-x86_64',
                  threads=4):
    """Align count-weighted reads with Clustal Omega and return their consensus.

    Args:
        seqs: read sequences (strings).
        counts: number of times each read in ``seqs`` was observed; each read
            is repeated that many times in the alignment input.
        clustalo: path to the Clustal Omega executable. Defaults to the
            location that was previously hard-coded.
        threads: value passed to clustalo's ``--threads`` option.

    Returns:
        str: the gap consensus (threshold 0.6, ambiguous positions as ``N``)
        with gap characters removed and trailing ``N`` runs trimmed.
    """
    # followed advice from here:
    # http://stackoverflow.com/questions/18860962/run-clustalw2-without-input-fasta-file

    # Expand reads so that the alignment reflects the observed counts.
    # list.extend keeps this linear; ``s1 = s1 + t1`` re-copied the list.
    expanded = []
    for seq, read_count in zip(seqs, counts):
        expanded.extend([seq] * read_count)
    fasta = ''.join('>seq%d\n%s\n' % (i, read)
                    for i, read in enumerate(expanded))

    # Multiple sequence alignment: FASTA in on stdin, aligned FASTA on stdout.
    # NOTE(review): stderr is merged into stdout as before, so any clustalo
    # diagnostics end up in the text handed to the FASTA parser.
    proc = Popen([clustalo, '--infile=-', '--threads=%d' % threads],
                 stdout=PIPE, stdin=PIPE, stderr=STDOUT)
    stdout = proc.communicate(input=fasta.encode())[0]
    alignment = Bio.AlignIO.read(StringIO(stdout.decode()), 'fasta')

    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.gap_consensus(threshold=0.6, ambiguous='N')
    consensus = str(consensus).replace("-", "")
    return re.sub('N+$', '', consensus)  # trim trailing Ns
def loadAlignment(self, path):
    """Load the alignment stored at ``path`` (a file in .aln format).

    Stores the alignment's sequences in ``self.allseq``, an
    ``AlignInfo.SummaryInfo`` wrapper in ``self.summary`` and the alignment
    length in ``self.l``, then calls ``self.insertLoadedBioAlignment()``.
    """
    loaded = Clustalw.parse_file(path)

    self.allseq = loaded.get_all_seqs()
    self.summary = AlignInfo.SummaryInfo(loaded)
    self.l = loaded.get_alignment_length()

    self.insertLoadedBioAlignment()
def process(key):
    """Collapse the records stored under ``key`` into a single SeqRecord.

    Uses the ``records`` mapping and the ``method`` setting from the
    enclosing scope. Every record's id is overwritten with ``record_<n>``
    (1-based) before processing.

    * ``method == 'consensus'``: with more than one record the sequences are
      aligned via ``sequtils.clustalw`` in a temporary directory and the
      alignment's ``dumb_consensus`` is used; a single record is used as is.
      ``tagmatch.character_strip(peptide, 'X')`` is applied to the result.
    * ``method == 'concatenated'``: the distinct sequences are joined with
      ``'-'`` (in set iteration order).

    Raises:
        ValueError: if ``method`` is neither of the two supported values
            (previously this surfaced as an UnboundLocalError).
    """
    options = records[key]
    for count, option in enumerate(options, 1):
        option.id = 'record_' + str(count)
    seqs = [str(option.seq) for option in options]

    if method == 'consensus':
        if len(seqs) > 1:
            temp = tempfile.mkdtemp()
            try:
                aln, tree = sequtils.clustalw(temp + '/temp.fasta', options)
                summary_align = AlignInfo.SummaryInfo(aln)
                peptide = str(summary_align.dumb_consensus())
            finally:
                # Remove the scratch directory even when alignment fails.
                shutil.rmtree(temp)
        else:
            peptide = seqs[0]
        peptide = tagmatch.character_strip(peptide, 'X')
    elif method == 'concatenated':
        peptide = '-'.join(set(seqs))
    else:
        raise ValueError('unsupported method: %r' % (method,))

    return SeqRecord(id=key, seq=Seq(peptide))
def get_consensus(msa, threshold=0.5):
    """
    Return the consensus sequence of the MSA as a string.

    Each position is taken from ``dumb_consensus`` computed with the given
    `threshold`. If no residue reaches the `threshold` in a column, an `X`
    marks the ambiguity. Columns for which ``gap_consensus`` (fixed threshold
    of 0.5) reports a gap are written in lowercase.

    >>> from Bio.Alphabet import IUPAC
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> seq_1 = SeqRecord(Seq("SEEEEACCC", IUPAC.protein), id="I")
    >>> seq_2 = SeqRecord(Seq("SSEEEGCC-", IUPAC.protein), id="II")
    >>> seq_3 = SeqRecord(Seq("SSSEEKC--", IUPAC.protein), id="III")
    >>> seq_4 = SeqRecord(Seq("SSSSEH---", IUPAC.protein), id="IIII")
    >>> msa = MultipleSeqAlignment([seq_1, seq_2, seq_3, seq_4])
    >>> get_consensus(msa)
    'SSXEEXCCc'
    """
    protein = IUPAC.protein
    info = AlignInfo.SummaryInfo(msa)
    residues = info.dumb_consensus(consensus_alpha=protein,
                                   threshold=threshold)
    gap_aware = info.gap_consensus(consensus_alpha=protein, threshold=0.5)

    letters = []
    for residue, marker in zip(residues, gap_aware):
        if marker == '-':
            # Gap-dominated column: keep the residue but lowercase it.
            letters.append(residue.lower())
        else:
            letters.append(residue)
    return ''.join(letters)
def summary_aln(aln_params):
    '''
    Take an alignment and coordinates (unpacked from aln_params).
    Take a sub-alignment bounded by the coordinates.
    Calculate a pssm for the sub-alignment.
    Find the variable sites using the pssm.
    Take a sub-sub-alignment for each variable site.
    Bind all the sub-sub-alignments to return a sub-alignment of just
    variable sites (SNVs).

    aln_params is a tuple ``(aln, i, j)``; columns ``i:j`` of ``aln`` are
    examined. A site counts as variable when more than one of A, C, T, G has
    a positive count in its PSSM row.

    When the window holds no variable site, a zero-column slice of the
    window is returned. (Previously the complete input alignment was
    returned in that case, which contradicts the contract above.)
    '''
    (aln, i, j) = aln_params
    alignment = aln[:, i:j]
    summary_align = AlignInfo.SummaryInfo(alignment)
    first_seq = alignment[0].seq
    my_pssm = summary_align.pos_specific_score_matrix(first_seq)

    dna_bases = ('A', 'C', 'T', 'G')
    variant_sites = []
    for pos, base_dict in enumerate(my_pssm):
        # .get(): a base that never occurs may be missing from the row.
        observed = [b for b in dna_bases if base_dict.get(b, 0) > 0]
        if len(observed) > 1:
            variant_sites.append(pos)

    if not variant_sites:
        return alignment[:, 0:0]

    first = variant_sites[0]
    snv_aln = MultipleSeqAlignment(alignment[:, first:first + 1])
    for site in variant_sites[1:]:
        # Adding alignments joins them column-wise.
        snv_aln += alignment[:, site:site + 1]
    return snv_aln
def out():
    """Write the consensus of a Clustal alignment to a file and display it.

    Reads the input path from entry widget ``e1`` and the output path from
    ``e2`` (both taken from the enclosing scope, as is ``win``). The
    consensus is printed, written to the output file, read back and shown in
    a new Tk window with a vertical scrollbar. Blocks in ``mainloop()``.
    """
    win.filename = e1.get()
    alignment = AlignIO.read(win.filename, "clustal")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()
    print(consensus)

    win.filename = e2.get()
    filename = win.filename
    out_dir = os.path.dirname(filename)
    # A bare file name has an empty directory part; os.makedirs('') raises,
    # so only create the directory when there is one to create.
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(filename, "w") as f:
        f.write(str(consensus))

    with open(e2.get(), 'r') as f:
        r = f.read()

    root = Tk()
    S = Scrollbar(root)
    T = Text(root, height=50, width=500)
    S.pack(side=RIGHT, fill=Y)
    T.pack(side=LEFT, fill=Y)
    # The scrollbar is packed along the Y axis, so tie it to the vertical
    # view only. It used to be re-bound to xview right after, which left
    # vertical scrolling broken.
    S.config(command=T.yview)
    T.config(yscrollcommand=S.set)
    T.insert(END, r, 'color')
    mainloop()
def Find_SNP_locations(alignment, startinglocations):
    """Return the columns in ``startinglocations`` that hold more than one base.

    Args:
        alignment: iterable of records whose ``seq`` supports integer
            indexing; must also provide ``get_alignment_length()`` (used only
            for the progress display).
        startinglocations: iterable of column indices to examine.

    Returns:
        list: the examined indices, in input order, at which at least two
        different non-gap characters occur (case-insensitively).

    Progress is reported on stdout after every 10000 examined columns, as a
    percentage of the alignment length.
    """
    SNPlocations = []

    print("Identifying SNP locations")
    sys.stdout.flush()

    count = 0
    total = 0.0
    for x in startinglocations:
        count += 1
        if count == 10000:
            total += count
            count = 0
            # "\r" rewinds to the start of the line so the next progress
            # message overwrites this one.
            sys.stdout.write(
                "%.2f%% complete\r"
                % (100 * (total / alignment.get_alignment_length())))
            sys.stdout.flush()

        foundbases = set()
        for record in alignment:
            base = record.seq[x].upper()
            if base != '-':
                foundbases.add(base)
            if len(foundbases) > 1:
                # A second distinct base settles it; skip remaining records.
                SNPlocations.append(x)
                break

    print("100.00% complete\n")
    sys.stdout.flush()
    return SNPlocations
def phylip_writer(handle_in, handle_out, nloci, full_list, loci_list, sample, read_len):
    """Write one sequence per locus in ``full_list`` to ``handle_out``.

    ``handle_in`` is read as header/sequence line pairs. The second
    ``_``-separated field of each ``>`` header is expected to look like
    ``locus<N>``; only loci with ``N < nloci`` are kept. ``handle_in`` is
    closed once it has been read.

    For every locus ``i`` in ``full_list``:

    * absent from the input, or not listed in ``loci_list[sample]``:
      ``read_len`` gap characters are written;
    * listed once: the stored reads for the locus are joined and written;
    * listed more than once: that many reads are written to ``temp.phylip``
      (current directory), read back as a relaxed-PHYLIP alignment, and the
      ``dumb_consensus`` is written.

    Returns:
        int: total number of ``X`` characters in the consensus sequences.
    """
    N = 0
    alignment = {}
    line = handle_in.readline()
    while line:
        if line.startswith('>'):
            # lstrip removes the leading letters of "locus", leaving digits.
            locus = int(line.split('_')[1].lstrip('locus'))
            if locus < nloci:
                seq = handle_in.readline().rstrip()
                alignment.setdefault(locus, []).append(seq)
        line = handle_in.readline()
    handle_in.close()

    for i in full_list:
        # Coverage is only meaningful for loci that were actually read.
        cov = loci_list[sample].count(i) if i in alignment else 0
        if cov > 1:
            with open('temp.phylip', 'w') as phylip:
                phylip.write('%d\t%d\n' % (cov, read_len))
                for j in range(cov):
                    phylip.write('Sample%d\t%s\n' % (j, alignment[i][j]))
            with open('temp.phylip') as phylip:
                alignments = AlignIO.read(phylip, 'phylip-relaxed')
            summary_align = AlignInfo.SummaryInfo(alignments)
            seq = str(summary_align.dumb_consensus())
            N += seq.count('X')
        elif cov == 1:
            seq = ''.join(alignment[i])
        else:
            # No usable read: always emit gaps rather than reusing whatever
            # `seq` was left over from an earlier locus.
            seq = '-' * read_len
        handle_out.write(seq)
    return N
def genebank_seq(self):
    """Fetch GenBank records, align them with Muscle and return the consensus.

    Reads ``self.data['sequence-type']`` (used as the Entrez database name)
    and ``self.data['genebank-seq']`` (newline-separated record ids). Blank
    lines are skipped and surrounding whitespace (e.g. a trailing ``\\r``) is
    stripped from each id.

    Returns:
        str: a FASTA-style entry whose header reports the consensus length
        and whose body is the gap consensus (threshold 0.55, ``N`` for
        ambiguous positions).
    """
    Entrez.email = "*****@*****.**"
    type_of_seq = self.data['sequence-type']
    lines = self.data['genebank-seq'].split('\n')
    seq_id_list = [line.strip() for line in lines if line.strip()]

    seq_content_list = []
    for seq_id in seq_id_list:
        with Entrez.efetch(db=type_of_seq, rettype="gb", retmode="text",
                           id=seq_id) as handle:
            seq_record = SeqIO.read(handle, "gb")
        seq_content_list.append(str(seq_record.seq))

    file_content = "".join(
        f">{seq_name}\n{seq_content}\n"
        for seq_name, seq_content in zip(seq_id_list, seq_content_list)
    )

    filename = create_seq_file(file_content)
    try:
        muscle = MuscleCommandline(input=filename)
        stdout, stderr = muscle()
    finally:
        # The input file is only needed while Muscle runs; remove it even
        # when the aligner fails.
        remove_temp_file(filename)

    align = AlignIO.read(StringIO(stdout), "fasta")
    summary_align = AlignInfo.SummaryInfo(align)
    consensus = summary_align.gap_consensus(threshold=0.55, ambiguous='N')
    return f'>consensus sequence {len(consensus)} bp\n' + str(consensus)
def design_primers(source_dir, target_dir, settings, logfile):
    """Run PriFi primer design on every readable alignment in ``source_dir``.

    ``target_dir`` is purged first. Each ``*.fasta`` file in ``source_dir``
    that can be read as a FASTA alignment is copied into ``target_dir``;
    files that fail are reported to ``logfile`` and skipped. Primer pairs
    are then searched for in every copied file and written out through
    ``prifipy``. All messages go to ``logfile``.
    """
    print("\nDesigning primers using PriFi...\n", file=logfile)

    # get rid of previous files
    utils.purge_dir(target_dir)

    print("\tChecking for empty alignments...", file=logfile)
    for source_path in glob(os.path.join(source_dir, '*.fasta')):
        try:
            # Reading is only a validity check; the result is not needed.
            AlignIO.read(source_path, 'fasta')
            destination = os.path.join(target_dir,
                                       os.path.basename(source_path))
            shutil.copyfile(source_path, destination)
        except Exception:
            print("[WARNING] Empty alignment file?! (%s)" % source_path,
                  file=logfile)

    # call PriFi for actual primer design
    for aln_path in glob(os.path.join(target_dir, '*.fasta')):
        aln = AlignIO.read(aln_path, 'fasta')
        summary = AlignInfo.SummaryInfo(aln)
        aln_length = aln.get_alignment_length()
        primerpairs = prifipy.findprimers(0, list(aln), summary, aln_length,
                                          settings, logfile)
        if primerpairs:
            print(
                '%s: Found %d primer pair suggestions. Writing primer files:'
                % (aln_path, len(primerpairs)),
                file=logfile)
            prifipy.writePrimersToFiles(aln_path, primerpairs, 1, logfile)
        else:
            print("%s: No valid primer pair found" % aln_path, file=logfile)
def main():
    """Check per-gene FASTA files against germline references.

    Germline records are loaded from ``fasta/full_ref/*.fasta`` and keyed by
    the second ``|``-separated field of their id (``*`` replaced by ``-``).
    For each ``fasta/IG*.fasta`` file the consensus of its sequences is
    aligned against the matching reference, truncated to the consensus
    length. References whose first alignment scores at least
    ``length - 1`` are written to ``fasta/references.fasta``; the others are
    reported on stdout.
    """
    # load germline sequences
    ref_seqs = {}
    for ref_path in glob("fasta/full_ref/*.fasta"):
        for record in SeqIO.parse(ref_path, "fasta"):
            allele = record.id.split("|")[1].replace("*", "-")
            record.id = record.name = record.description = allele
            ref_seqs[record.id] = record

    # check and collect references
    references = []
    for gene_path in glob("fasta/IG*.fasta"):
        # strip the directory part and everything from the first dot on
        name = re.sub("^.*/|[.].*$", "", gene_path)
        ref = ref_seqs[name]
        seqs = list(SeqIO.parse(gene_path, "fasta"))

        # get consensus sequence from file
        summary = AlignInfo.SummaryInfo(MultipleSeqAlignment(seqs))
        consensus = summary.dumb_consensus()

        # cut reference seq to match other seqs length
        seq_len = len(consensus)
        ref = ref[:seq_len].upper()
        alignments = pairwise2.align.globalxs(ref.seq, consensus, -5, -2)
        best = alignments[0]

        # check score... left 1 base for errors with consensus
        if best[-3] >= seq_len - 1:
            references.append(ref)
        else:
            print("Hey, check {}".format(gene_path))
            print(pairwise2.format_alignment(*best))

    SeqIO.write(references, "fasta/references.fasta", "fasta-2line")
def sequence_consensus(sequence_list, vir_typer_pk):
    """Return a consensus for the sequences in ``sequence_list``.

    ``sequence_list`` is a list of ``{seq_code: sequence}`` dicts.

    * Two or more dicts: every sequence is written to
      ``olc_webportalv2/media/vir_typer/<vir_typer_pk>/alignment.fasta``,
      read back as a FASTA alignment, and its ``dumb_consensus`` is
      returned.
    * One dict: the last sequence found in it is returned unchanged.
    * Empty list: an empty list is returned.
    """
    from Bio.Align import AlignInfo
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC
    from Bio import AlignIO, SeqIO
    from Bio.Seq import Seq

    consensus = list()
    records = list()

    if len(sequence_list) < 2:
        for sequence_dict in sequence_list:
            for seq_code, sequence in sequence_dict.items():
                consensus = sequence
        return consensus

    for sequence_dict in sequence_list:
        for seq_code, sequence in sequence_dict.items():
            records.append(
                SeqRecord(Seq(sequence, IUPAC.unambiguous_dna),
                          id=seq_code, description=''))

    file_path = 'olc_webportalv2/media/vir_typer/{pk}/'.format(pk=vir_typer_pk)
    try:
        os.makedirs(file_path)
    except FileExistsError:
        pass

    fasta_path = file_path + 'alignment.fasta'
    with open(fasta_path, 'w') as alignment_file:
        SeqIO.write(records, alignment_file, 'fasta')
    with open(fasta_path, 'r') as alignment_file:
        alignment = AlignIO.read(alignment_file, 'fasta')

    consensus = AlignInfo.SummaryInfo(alignment).dumb_consensus()
    return consensus
def parse_alleles_file(inpath):
    """Build one consensus SeqRecord per block of the alleles file ``inpath``.

    Lines are accumulated until a line containing ``//`` is reached. The
    accumulated block is then given a ``<ntaxa>\\t<nchar>`` header, parsed as
    a relaxed-PHYLIP alignment and reduced to its ``dumb_consensus``
    (threshold 0.5, ``N`` for ambiguous positions). Records are numbered by
    their position in the returned list. Lines collected after the last
    ``//`` line are not emitted.
    """
    records = []
    with open(inpath, 'r') as allfile:
        block = ''
        for line in allfile:
            if '//' not in line and line.lstrip('\n'):
                block += line
                continue
            if not line.rstrip('\n '):
                # sometimes there are blank lines...
                continue

            # Separator line: the pending block is complete.
            nchar = len(block.split('\n')[0].split()[-1])
            ntaxa = block.count('\n')
            header = str(ntaxa) + '\t' + str(nchar) + '\n'
            phylip = StringIO.StringIO(header + block)
            alignment = AlignIO.read(phylip, "phylip-relaxed",
                                     alphabet=IUPACAmbiguousDNA())
            consensus = AlignInfo.SummaryInfo(alignment).dumb_consensus(
                threshold=0.5, ambiguous='N')
            records.append(
                SeqRecord(consensus, id=str(len(records)), description=''))
            phylip.close()
            block = ''  # start collecting the next block
    return records
def ConsensusSequences(clusters):
    """Build a Muscle-based consensus read for every cluster.

    Args:
        clusters: iterable of clusters, each a sequence of read strings.

    Returns:
        list: one ``[consensus, cluster_size]`` entry per cluster, where
        ``consensus`` is the alignment's ``gap_consensus(ambiguous='')`` and
        ``cluster_size`` is the size of the full cluster (before
        subsampling).

    Clusters larger than ``CdrExtractionOptions.MAX_READS_FOR_CONSENSUS``
    are randomly subsampled to that size before alignment.
    """
    reads = []
    max_reads = CdrExtractionOptions.MAX_READS_FOR_CONSENSUS

    # go through each cluster of sequencing reads provided
    for g in clusters:
        # temporary files: Muscle input (opened in text mode, since str is
        # written to it) and Muscle output (only the name is needed)
        tmp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        tmp_file2 = tempfile.NamedTemporaryFile(delete=False)
        tmp_file2.close()
        try:
            if len(g) > max_reads:
                subsample = random.sample(g, max_reads)
            else:
                subsample = g  # use all available reads

            # store cluster in temporary file
            with tmp_file:
                for i, read in enumerate(subsample):
                    tmp_file.write('>' + str(i) + '\n')
                    tmp_file.write(read + '\n')

            # generate consensus read using muscle
            muscle_cline = MuscleCommandline(input=tmp_file.name,
                                             out=tmp_file2.name, maxiters=2)
            muscle_cline()
            align = AlignIO.read(tmp_file2.name, 'fasta')
            summary_align = AlignInfo.SummaryInfo(align)

            # store consensus read in list
            reads.append([summary_align.gap_consensus(ambiguous=''), len(g)])
        finally:
            # remove tmp files, also when alignment or parsing failed
            tmp_file.close()
            os.unlink(tmp_file.name)
            os.unlink(tmp_file2.name)
    return reads
def get_ic(path, pdb):
    """
    Process the Stockholm MSA stored at ``path``.

    Returns a tuple ``(consensus, info_content, aligned_pdb_seq)``:
    the ``dumb_consensus`` of the alignment, a list with the information
    content of every single column (using the frequencies from
    ``count_freqs`` as expected frequencies), and the alignment record whose
    id is mapped to ``pdb`` by ``get_pdb2alignmentid`` (``None`` when no
    record matches).
    """
    alignment = AlignIO.read(path, "stockholm")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()

    alignid = get_pdb2alignmentid(path)[pdb]
    aligned_pdb_seq = next(
        (record for record in alignment if record.id == alignid), None)

    freqs = count_freqs([str(record.seq) for record in alignment])

    # One single-column window per consensus position.
    info_content = [
        summary_align.information_content(
            start=pos,
            end=pos + 1,
            e_freq_table=FreqTable.FreqTable(freqs,
                                             dict_type=FreqTable.FREQ))
        for pos in range(len(consensus))
    ]
    return consensus, info_content, aligned_pdb_seq
def fetch_additional_data(MSA):
    """Print and return the consensus and the PSSM of the alignment ``MSA``.

    Returns:
        tuple: ``(consensus, PSSM)`` as produced by ``dumb_consensus()`` and
        ``pos_specific_score_matrix()`` of ``AlignInfo.SummaryInfo``.
    """
    info = AlignInfo.SummaryInfo(MSA)

    consensus = info.dumb_consensus()
    print(consensus)

    PSSM = info.pos_specific_score_matrix()
    print(PSSM)

    return consensus, PSSM
def createScoreMatrixFromAlignment(filename, output):
    """
    Generate substitution matrix from a file containing alignment

    The replacement counts of the alignment (ignoring ``*``) are turned into
    an accepted-replacement matrix, every entry is incremented by one
    pseudocount, and the log-odds matrix derived from it is pickled to
    ``output``.

    Args:
        filename (str): path to a fasta file containing aligned sequences
        output (str): path to a pickle file for the matrix to be stored
    Returns:
        my_lom (array): log odds matrix
    """
    c_align = AlignIO.read(filename, "fasta")
    summary_align = AlignInfo.SummaryInfo(c_align)
    replace_info = summary_align.replacement_dictionary(["*"])
    my_arm = SubsMat.SeqMat(replace_info)

    # Add pseudocounts
    for m in my_arm:
        my_arm[m] += 1

    my_lom = SubsMat.make_log_odds_matrix(my_arm)

    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(output, "wb") as handle:
        pickle.dump(my_lom, handle)
    return my_lom
def consensusFromSequencesCnf(sequences, threshold, ambiguous,
                              require_multiple):
    """
    Compute the naive consensus sequence from an alignment of sequences.

    The sequences are exported to ``temp/forconsensus.fasta``, read back as
    a FASTA alignment and passed to ``SummaryInfo.dumb_consensus``.

    Args:
        sequences (array): the array of aligned sequences
        threshold (float): the column agreement required for a single letter
            to be assigned to the column
        ambiguous (char): the letter assigned to a column whose agreement is
            lower than the threshold
        require_multiple: see
            biopython.org/DIST/docs/api/Bio.Align.AlignInfo.SummaryInfo-class.html
    Return:
        consensus: the consensus sequence, as returned by ``dumb_consensus``
    """
    fasta_path = "temp/forconsensus.fasta"
    io.exportGroupOfSequencesToFASTA(sequences, fasta_path)

    info = AlignInfo.SummaryInfo(AlignIO.read(fasta_path, "fasta"))
    return info.dumb_consensus(
        threshold=threshold,
        ambiguous=ambiguous,
        consensus_alpha=None,
        require_multiple=require_multiple,
    )
def test_read_fasta(self):
    """Read Quality/example.fasta as an alignment and check its contents."""
    path = os.path.join(os.curdir, "Quality", "example.fasta")
    alignment = AlignIO.read(path, "fasta",
                             alphabet=Alphabet.Gapped(IUPAC.ambiguous_dna))
    self.assertEqual(len(alignment), 3)

    # (description, sequence) expected for each row, in file order
    expected_rows = (
        ("EAS54_6_R1_2_1_413_324", "CCCTTCTTGTCTTCAGCGTTTCTCC"),
        ("EAS54_6_R1_2_1_540_792", "TTGGCAGGCCAAGGCCGATGGATCA"),
        ("EAS54_6_R1_2_1_443_348", "GTTGCTTCTGGCGTGGGTGGGGGGG"),
    )
    for row, (description, sequence) in enumerate(expected_rows):
        seq_record = alignment[row]
        self.assertEqual(seq_record.description, description)
        self.assertEqual(seq_record.seq, sequence)

    self.assertEqual(alignment.get_alignment_length(), 25)

    align_info = AlignInfo.SummaryInfo(alignment)
    consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
    self.assertIsInstance(consensus, Seq)
    self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")

    self.assertEqual(
        str(alignment),
        """\
Gapped(IUPACAmbiguousDNA(), '-') alignment with 3 rows and 25 columns
CCCTTCTTGTCTTCAGCGTTTCTCC EAS54_6_R1_2_1_413_324
TTGGCAGGCCAAGGCCGATGGATCA EAS54_6_R1_2_1_540_792
GTTGCTTCTGGCGTGGGTGGGGGGG EAS54_6_R1_2_1_443_348""")
def printAlignmentInfo(alignment, alphabet):
    """Print a summary of ``alignment`` to stdout.

    The report shows the alignment itself, the first record's description
    and sequence, the alignment length, the normalized count matrix and
    consensus from ``motifs``, and the PSSM and consensus from
    ``AlignInfo.SummaryInfo``. The PSSM is computed along the motifs
    consensus, ignoring ``N``. Returns ``None``.
    """
    seqlist = [record.seq for record in alignment]
    motif = motifs.create(seqlist, alphabet)
    pwm = motif.counts.normalize()
    consensus = pwm.consensus

    info = AlignInfo.SummaryInfo(alignment)
    consensus2 = info.dumb_consensus()
    my_pssm = info.pos_specific_score_matrix(consensus,
                                             chars_to_ignore=['N'])

    first = alignment[0]
    print(alignment)
    print('first description: %s' % first.description)
    print('first sequence: %s' % first.seq)
    print('length %i' % alignment.get_alignment_length())
    print('matrix pwm %s' % pwm)
    print('consensus (motifs) %s' % consensus)
    print('matrix pssm %s' % my_pssm)
    print('consensus (AlignInfo.SummaryInfo) %s' % consensus2)
def get_consensus_seq(fasta_file, q_id):
    """Return the gap consensus of the FASTA alignment as a SeqRecord.

    The record's id is ``q_id`` and its description is empty. A
    confirmation line naming ``q_id`` and ``fasta_file`` is printed.
    """
    alignment = AlignIO.read(fasta_file, 'fasta')
    consensus = AlignInfo.SummaryInfo(alignment).gap_consensus()
    record = SeqRecord(consensus, id=q_id, description='')

    message = ("'" + q_id + "' consensus sequence generated (original: "
               + fasta_file + ')')
    print(message)
    return record
def obtain_consensus(pair):
    """Obtain a consensus from the multiple sequence alignment.

    Reads the Clustal-format alignment in ``mafft_temp.out``, computes its
    ``dumb_consensus`` (threshold 0.51, ``N`` for ambiguous positions) and
    appends it to ``consensus_sequences.fasta`` under the header
    ``><pair[0]>,<pair[1]>``.
    """
    align = AlignIO.read("mafft_temp.out", "clustal")
    summary_align = AlignInfo.SummaryInfo(align)
    consensus = summary_align.dumb_consensus(threshold=0.51, ambiguous='N')
    new_sequence_name = ">{0},{1}\n{2}\n".format(pair[0], pair[1], consensus)

    # The context manager flushes and closes the output file; it was
    # previously left open after every call.
    with open("consensus_sequences.fasta", "a") as consensus_sequences:
        consensus_sequences.write(new_sequence_name)
def build_cons_seq(infile):
    """Return the ``dumb_consensus`` of the FASTA alignment in ``infile``."""
    # https://www.biostars.org/p/14026/
    from Bio import AlignIO
    from Bio.Align import AlignInfo

    # Close the input handle as soon as the alignment has been parsed.
    with open(infile) as handle:
        alignment = AlignIO.read(handle, "fasta")
    summary_align = AlignInfo.SummaryInfo(alignment)
    return summary_align.dumb_consensus()
def closestToConsensus(linIt):
    """Pick, for every lineage, the member sequence closest to the consensus.

    Args:
        linIt: iterable of lineage dicts with a ``'name'`` (base name of the
            lineage FASTA file in ``prj_tree.lineage``) and a ``'desc'``
            mapping from sequence id to its original description.

    Returns:
        list: one record per lineage. Singleton lineages return their only
        sequence as read from disk. Otherwise the lineage is clustered with
        vsearch, the first cluster in the MSA output is scored against its
        own position-specific score matrix, and the best-scoring member is
        returned ungapped, with the vsearch annotations removed from its id
        and its original description restored.
    """
    results = []
    print( "Starting vsearch on a new chunk..." )

    for lineage in linIt:
        base = "%s/%s" % (prj_tree.lineage, lineage['name'])

        # save time on singletons (if they weren't excluded by minSeq)
        if len(lineage['desc']) == 1:
            with open(base + ".fa", "r") as handle:
                results.append( SeqIO.read(handle, 'fasta') )
            continue

        # cluster and rapid align with vsearch; don't clutter up output with
        # tons of vsearch messages. The devnull handle is closed once the
        # call returns instead of being leaked for every lineage.
        with open(os.devnull, 'w') as FNULL:
            subprocess.call([vsearch, "-cluster_size", base + ".fa",
                             "-id", "0.97", "-sizein", "-sizeout",
                             "-msaout", base + "_msa.fa",
                             "-clusterout_sort"],
                            stdout=FNULL, stderr=subprocess.STDOUT)

        # extract biggest cluster: skip the first line, then copy lines up
        # to the first one that mentions "consensus"
        with open(base + "_msa.fa", "r") as allClusters:
            with open(base + "_msaBiggest.fa", "w") as biggestOnly:
                next( allClusters )
                for line in allClusters:
                    if "consensus" in line:
                        break
                    biggestOnly.write(line)

        # open the msa
        with open(base + "_msaBiggest.fa", "r") as handle:
            aln = AlignIO.read(handle, "fasta")

        # add derep size to alignment as weighting
        for rec in aln:
            rec.annotations['weight'] = int( rec.id.split(";")[1].split("=")[1] )

        summary_align = AlignInfo.SummaryInfo(aln)
        pssm = summary_align.pos_specific_score_matrix()

        # score each sequence and save the best one
        scores = dict()
        for record in aln:
            scores[record.id] = sum(pssm[i][l] for i, l in enumerate(record))

        # max() returns the first top-scoring record, exactly like taking
        # element 0 of a stable descending sort
        best = max(aln, key=lambda rec: scores[rec.id])
        best.seq = best.seq.ungap("-")             # remove gaps
        best.id = best.id.split(";")[0]            # remove vsearch size annotation
        best.id = re.sub(r"^\*", "", best.id)      # get rid of possible annotation from vsearch
        best.description = lineage['desc'][best.id]  # restore original info
        results.append( best )

    return results
def find_sqce_consensus( list_of_sequences, sqce_type=Constants.SEQUENCE_TYPE_DNA,
                         threshold=Constants.DEFAULT_SQCE_CONSENSUS_AMBIG_THRESHOLD,
                         fasta_end_name = '' ):
    """Align the sequences with Muscle and return their consensus as a string.

    Args:
        list_of_sequences: the sequences (strings) to align.
        sqce_type: ``Constants.SEQUENCE_TYPE_DNA`` or
            ``Constants.SEQUENCE_TYPE_PROT``; selects the alphabet and the
            character used for ambiguous positions.
        threshold: threshold passed to ``gap_consensus``.
        fasta_end_name: suffix appended to the names of the temporary FASTA
            files.

    Returns:
        str: the gap consensus of the alignment.

    Raises:
        DenCellORFException: if ``sqce_type`` is not one of the two
            supported types.

    The temporary input and output FASTA files are written to
    ``DefaultTemporaryFolder.TEMPORARY_FOLDER`` and removed afterwards, also
    when Muscle or the parsing of its output fails.
    """
    if (sqce_type == Constants.SEQUENCE_TYPE_DNA):
        alphabet = generic_dna
        ambiguous = Constants.SEQUENCE_AMBIGUOUS_DNA_BASE
    elif (sqce_type == Constants.SEQUENCE_TYPE_PROT):
        alphabet = generic_protein
        ambiguous = Constants.SEQUENCE_AMBIGUOUS_PROT_AA
    else:
        raise DenCellORFException(
            'MergeStrategy.find_sqce_consensus(): The type of sequence provided'
            + ' has to be ' + Constants.SEQUENCE_TYPE_DNA + ' or '
            + Constants.SEQUENCE_TYPE_PROT
            + ' (provided type: ' + str(sqce_type) + ').')

    # Store the input sequences in a fasta file in order to run Muscle
    input_sequences = (SeqRecord(Seq(s, alphabet)) for s in list_of_sequences)
    if (not os.path.exists(DefaultTemporaryFolder.TEMPORARY_FOLDER)):
        os.makedirs(DefaultTemporaryFolder.TEMPORARY_FOLDER)
    input_sequences_file = os.path.join(
        DefaultTemporaryFolder.TEMPORARY_FOLDER,
        'input_sequences' + fasta_end_name + '.fasta')
    aligned_sequences_file = os.path.join(
        DefaultTemporaryFolder.TEMPORARY_FOLDER,
        'aligned_sequences' + fasta_end_name + '.fasta')

    try:
        SeqIO.write(input_sequences, input_sequences_file, 'fasta')

        # Perform the multiple sequences alignment and
        # store the output in a fasta file
        muscle_cline = MuscleCommandline(cmd='/bin/muscle',
                                         input=input_sequences_file,
                                         out=aligned_sequences_file)
        (stdout, stderr) = muscle_cline()

        # Read the fasta file containing aligned sequences
        align = AlignIO.read(aligned_sequences_file, 'fasta')
        summary_align = AlignInfo.SummaryInfo(align)

        # Compute the consensus
        consensus = summary_align.gap_consensus(threshold=threshold,
                                                ambiguous=ambiguous)
    finally:
        # Remove the temporary fasta files. Either may be missing when a
        # step above failed, so check before removing to avoid masking the
        # original error.
        for temporary_file in (input_sequences_file, aligned_sequences_file):
            if os.path.exists(temporary_file):
                os.remove(temporary_file)

    return str(consensus)
def substitution_matrices(self):
    """Return log-odds substitution matrices for the kinase and peptide MSAs.

    ``self.kinase_msa`` and ``self.peptide_msa`` are each read in ``"tab"``
    format; the replacement dictionary of each alignment is converted into a
    log-odds matrix.

    Returns:
        tuple: ``(kinase_matrix, peptide_matrix)``.
    """
    log_odds = []
    for msa_source in (self.kinase_msa, self.peptide_msa):
        info = AlignInfo.SummaryInfo(AlignIO.read(msa_source, "tab"))
        accepted = SubsMat.SeqMat(info.replacement_dictionary())
        log_odds.append(SubsMat.make_log_odds_matrix(accepted))

    kinase_matrix, peptide_matrix = log_odds
    return kinase_matrix, peptide_matrix
def compute_ic_content(alignment):
    """Return the per-column information content of ``alignment`` as a list.

    ``information_content()`` is called for its side effect of filling
    ``ic_vector`` on the summary object. Some Biopython versions store that
    vector as a dict keyed by column index; in that case it is converted to
    a dense array first.
    """
    info = AlignInfo.SummaryInfo(alignment)
    info.information_content()
    raw_vector = info.ic_vector

    if not isinstance(raw_vector, dict):
        return list(raw_vector)

    # biopython wrong version hack: dict -> positional array
    dense = np.zeros(len(raw_vector))
    for column, value in raw_vector.items():
        dense[column] = value
    return list(dense)
def get_consensus_sequence(input_fasta, output_fasta):
    """Write the consensus of a FASTA alignment to ``output_fasta``.

    The ``dumb_consensus`` of the alignment in ``input_fasta`` is saved as a
    single FASTA record with id ``consensus`` and an empty description.
    """
    summary = AlignInfo.SummaryInfo(AlignIO.read(input_fasta, 'fasta'))
    record = SeqRecord(summary.dumb_consensus(), id='consensus',
                       description='')
    SeqIO.write(record, output_fasta, 'fasta')
def get_consensus(filename):
    '''
    (str) -> str
    Read the aligned fasta file ``filename`` and return its consensus.

    The consensus comes from ``AlignInfo.SummaryInfo.dumb_consensus`` with a
    threshold of 70% and ``N`` for ambiguous positions.
    '''
    # https://www.biostars.org/p/284637/
    summary = AlignInfo.SummaryInfo(AlignIO.read(filename, 'fasta'))
    return str(summary.dumb_consensus(threshold=0.7, ambiguous='N'))