def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format
    @type out_seq_fname: str
    """
    if filecmp.cmp(gtf_file, fas_file):
        exit("Error: the annotation and sequence files appear to be identical. Please check the inputs!")

    ## read the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## read the genome sequence file
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    cds_idx = []  # drop entries with empty CDS exons
    for idp, feat in enumerate(anno_db):
        if not feat['cds_exons'][0].any():  # TSkim annotation expects only a single transcript per region
            cds_idx.append(idp)
    anno_db = np.delete(anno_db, cds_idx)
    genes_with_cds = len(anno_db)

    fasFH = helper.open_file(fas_file)
    out_seq_fh = open(out_seq_fname, "w")
    for rec in SeqIO.parse(fasFH, "fasta"):
        for idx, feature in enumerate(anno_db):
            if rec.id == feature['chr']:
                ## iterate over cds_exons
                cds_seq = ''
                for ex in feature['cds_exons'][0]:  ## single transcript by TSkim
                    cds_seq += rec.seq[ex[0]-1:ex[1]]
                if feature['strand'] == '-':
                    cds_seq = cds_seq.reverse_complement()
                ## fasta output
                if cds_seq:
                    prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'],
                                        description='protein sequence')
                    out_seq_fh.write(prt_seq.format("fasta"))
        # FIXME need an efficient way to translate multiple genes
        # iterate over chromosomes
    fasFH.close()
    out_seq_fh.close()
    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
def fastq_rev_transform(input_file, output_name): with open(input_file) as myfile: with open(output_name, 'w') as outfile: for rec in SeqIO.parse(myfile, 'fastq'): new = SeqRecord(seq=rec.seq.reverse_complement(), id=rec.id, name=rec.name, description="reverse complement") outfile.write(new.format('fasta'))
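# A minimal usage sketch for fastq_rev_transform; the file names are
# hypothetical. Note the output is written in FASTA, not FASTQ, because
# format('fasta') drops the per-base qualities of the reverse-complemented
# reads.
def _demo_fastq_rev_transform():
    fastq_rev_transform('reads.fastq', 'reads_revcomp.fasta')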
def _translate(self, seq_records, trans_table=1, force_trans_table=False):
    '''Translate given sequences using trans_table. If some sequence
    feature contains a translation table number, use it instead. If
    force_trans_table is True, always use trans_table.'''
    trans_seqs = list()
    for record in seq_records:
        # set the alphabet to DNA
        try:
            record.seq.alphabet = IUPAC.ambiguous_dna
        except Exception as e:
            # provided sequences SHOULD be DNA ones
            raise ValueError('AlignmentUtils.translate: unable to set alphabet of the sequence:\n%s\n%s' \
                             % (record.format('fasta'), e))
        # determine translation table
        translation_table = -1
        if force_trans_table:
            # force a translation table
            translation_table = trans_table
        else:
            # see if a translation table is defined in qualifiers
            for feature in record.features:
                try:
                    translation_table = int(feature.qualifiers['transl_table'][0])
                    break
                except (KeyError, IndexError, ValueError):
                    pass
        if translation_table < 0:
            translation_table = trans_table
        # do a translation
        trans_seq = record.seq.translate(table=translation_table, stop_symbol="X")
        trans_seq_rec = SeqRecord(trans_seq, id=record.id)
        trans_seq_rec.name = record.name
        trans_seq_rec.description = record.description
        trans_seqs.append(trans_seq_rec)
    return trans_seqs
def getdropboxsequence(request, name):
    fileName, fileExtension = os.path.splitext(name)
    try:
        client = dropbox.client.DropboxClient(request.session['access_token'])
        with client.get_file(name) as f:
            s = f.read()
    except Exception as e:
        print(str(e), file=sys.stderr)
        raise e
    if fileExtension in ('.gbk', '.gb'):
        try:
            seq = SeqIO.read(io.StringIO(s.decode("utf-8")), "genbank")
        except Exception as e:
            print(str(e), file=sys.stderr)
            raise e
        return seq
    elif fileExtension == '.seq':
        simple_seq = Seq("".join(s.decode("utf-8").split()))
        seq = SeqRecord(simple_seq)
        seq.id = fileName
        return seq
    else:
        raise ValueError('unsupported file extension: %s' % fileExtension)
def t_write_from_recs(self): """Write out GFF3 from SeqRecord inputs. """ seq = Seq("GATCGATCGATCGATCGATC") rec = SeqRecord(seq, "ID1") qualifiers = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"], "ID": "gene1"} sub_qualifiers = {"source": "prediction"} top_feature = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1, qualifiers=qualifiers) top_feature.sub_features = [SeqFeature(FeatureLocation(0, 5), type="exon", strand=1, qualifiers=sub_qualifiers), SeqFeature(FeatureLocation(15, 20), type="exon", strand=1, qualifiers=sub_qualifiers)] rec.features = [top_feature] out_handle = StringIO.StringIO() GFF.write([rec], out_handle) wrote_info = out_handle.getvalue().split("\n") assert wrote_info[0] == "##gff-version 3" assert wrote_info[1] == "##sequence-region ID1 1 20" assert wrote_info[2].split("\t") == ['ID1', 'prediction', 'gene', '1', '20', '10.0', '+', '.', 'other=Some,annotations;ID=gene1'] assert wrote_info[3].split("\t") == ['ID1', 'prediction', 'exon', '1', '5', '.', '+', '.', 'Parent=gene1']
def get_interregions(genbank_path, intergene_length=1):
    seq_record = next(SeqIO.parse(open(genbank_path), "genbank"))
    cds_list_plus = []
    cds_list_minus = []
    intergenic_records = []
    intergenic_features = []
    # Loop over the genome file, get the CDS features on each of the strands
    for feature in seq_record.features:
        if feature.type == 'CDS':
            mystart = feature.location.start.position
            myend = feature.location.end.position
            if feature.strand == -1:
                cds_list_minus.append((mystart, myend, -1))
            elif feature.strand == 1:
                cds_list_plus.append((mystart, myend, 1))
            else:
                sys.stderr.write("No strand indicated %d-%d. Assuming +\n" %
                                 (mystart, myend))
                cds_list_plus.append((mystart, myend, 1))
    for i, pospair in enumerate(cds_list_plus[1:]):
        # Compare current start position to previous end position
        last_end = cds_list_plus[i][1]
        this_start = pospair[0]
        strand = pospair[2]
        if this_start - last_end >= intergene_length:
            intergene_seq = seq_record.seq[last_end:this_start]
            featurelocation = SeqFeature.FeatureLocation(last_end, this_start,
                                                         strand=+1)
            strand_string = "+"
            sequencerecord = SeqRecord(
                intergene_seq,
                id="%s-ign-%d" % (seq_record.name, i),
                description="%s %d-%d %s" % (seq_record.name, last_end + 1,
                                             this_start, strand_string))
            sequencerecord.features = [featurelocation]
            intergenic_features.append(featurelocation)
            intergenic_records.append(sequencerecord)
    for i, pospair in enumerate(cds_list_minus[1:]):
        last_end = cds_list_minus[i][1]
        this_start = pospair[0]
        strand = pospair[2]
        if this_start - last_end >= intergene_length:
            intergene_seq = seq_record.seq[last_end:this_start]
            featurelocation = SeqFeature.FeatureLocation(last_end, this_start,
                                                         strand=-1)
            strand_string = "-"
            sequencerecord = SeqRecord(
                intergene_seq,
                id="%s-ign-%d" % (seq_record.name, i),
                description="%s %d-%d %s" % (seq_record.name, last_end + 1,
                                             this_start, strand_string))
            sequencerecord.features = [featurelocation]
            intergenic_features.append(featurelocation)
            intergenic_records.append(sequencerecord)
    return intergenic_features
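# Usage sketch for get_interregions, assuming SeqIO and SeqFeature are
# imported at module level as the function expects; 'example.gbk' is a
# hypothetical GenBank file. The returned FeatureLocation objects use
# zero-based, half-open coordinates.
def _demo_get_interregions():
    for loc in get_interregions('example.gbk', intergene_length=50):
        print('intergenic region %d..%d (strand %s)' % (loc.start, loc.end, loc.strand))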
def gene_to_fasta(gene_str,gene_id,gene_desc): '''Convert a gene string to a fasta formatted string''' gene_seq = Seq(gene_str) gene_seq_r = SeqRecord(gene_seq,id=gene_id,description=gene_desc) fasta_str = gene_seq_r.format('fasta') return fasta_str
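# Tiny demonstration of gene_to_fasta with made-up values; requires the
# Bio.Seq.Seq and Bio.SeqRecord.SeqRecord imports used above.
def _demo_gene_to_fasta():
    print(gene_to_fasta('ATGGCCTAA', 'gene1', 'toy example'))
    # expected output:
    # >gene1 toy example
    # ATGGCCTAA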
def __init__(self, seq, id="<unknown id>", name="<unknown name>", description="<unknown description>", dbxrefs=None, features=None, annotations=None, letter_annotations=None): self.id = id self.tempseq = seq temp = self.id.split('|') self.GeneID = temp[0] self.TranscriptID = temp[1] self.GeneName = temp[2] self.ExonRank = temp[3] if temp[4] == '': self.ConstExon = False else: self.ConstExon = True try: self.FPUTRend = int(temp[5]) except: self.FPUTRend = '' try: self.TPUTRstart = int(temp[6]) except: self.TPUTRstart = '' try: self.exonStart = int(temp[7]) except: self.exonStart = '' try: self.exonEnd = int(temp[8]) except: self.exonEnd = '' if self.FPUTRend != '': sequence = str(self.tempseq)[(self.FPUTRend-self.exonStart):(self.exonEnd-self.exonStart)] elif self.TPUTRstart != '': sequence = str(self.tempseq)[0:(self.TPUTRstart-self.exonStart)] else: sequence = str(self.tempseq) SeqRecord.__init__(self, Seq(sequence, IUPAC.unambiguous_dna), id, name, description, dbxrefs=None, features=None, annotations=None, letter_annotations=None)
def _record_formatter(self, temp, sequence):
    """return a Biopython SeqRecord built from temp, copying the
    id/name/description metadata of the given source record"""
    # the source record ('sequence') is passed in explicitly, since it is
    # not otherwise defined in this scope
    temp_record = SeqRecord(temp)
    temp_record.id = sequence.id
    temp_record.name = sequence.name
    temp_record.description = sequence.description
    return temp_record
def IgIterator(handle, alphabet = single_letter_alphabet): """Iterate over IntelliGenetics records (as SeqRecord objects). handle - input file alphabet - optional alphabet The optional free format file header lines (which start with two semi-colons) are ignored. The free format commentary lines at the start of each record (which start with a semi-colon) are recorded as a single string with embedded new line characters in the SeqRecord's annotations dictionary under the key 'comment'. """ #Skip any file header text before the first record (;; lines) while True: line = handle.readline() if not line : break #Premature end of file, or just empty? if not line.startswith(";;") : break while line: #Now iterate over the records if line[0]!=";": raise ValueError( \ "Records should start with ';' and not:\n%s" % repr(line)) #Try and agree with SeqRecord convention from the GenBank parser, #(and followed in the SwissProt parser) which stores the comments #as a long string with newlines under annotations key 'comment'. #Note some examples use "; ..." and others ";..." comment_lines = [] while line.startswith(";"): #TODO - Extract identifier from lines like "LOCUS\tB_SF2"? comment_lines.append(line[1:].strip()) line = handle.readline() title = line.rstrip() seq_lines = [] while True: line = handle.readline() if not line : break if line[0] == ";": break #Remove trailing whitespace, and any internal spaces seq_lines.append(line.rstrip().replace(" ","")) seq_str = "".join(seq_lines) if seq_str.endswith("1"): #Remove the optional terminator (digit one) seq_str = seq_str[:-1] if "1" in seq_str: raise ValueError("Potential terminator digit one found within sequence.") #Return the record and then continue... record= SeqRecord(Seq(seq_str, alphabet), id = title, name = title) record.annotations['comment'] = "\n".join(comment_lines) yield record #We should be at the end of the file now assert not line
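# Usage sketch for IgIterator, following the Bio.SeqIO iterator convention
# it implements; 'example.ig' is a hypothetical IntelliGenetics file.
def _demo_ig_iterator():
    with open('example.ig') as handle:
        for record in IgIterator(handle):
            print(record.id, len(record))
            print(record.annotations['comment'])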
def PhdIterator(handle): """Returns SeqRecord objects from a PHD file. This uses the Bio.Sequencing.Phd module to do the hard work. """ phd_records = Phd.parse(handle) for phd_record in phd_records: # Convert the PHY record into a SeqRecord... # The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1' # from unit test example file phd_solexa. # This will cause problems if used as the record identifier # (e.g. output for FASTQ format). name = phd_record.file_name.split(None, 1)[0] seq_record = SeqRecord(phd_record.seq, id=name, name=name, description=phd_record.file_name) # Just re-use the comments dictionary as the SeqRecord's annotations seq_record.annotations = phd_record.comments # And store the qualities and peak locations as per-letter-annotation seq_record.letter_annotations["phred_quality"] = \ [int(site[1]) for site in phd_record.sites] try: seq_record.letter_annotations["peak_location"] = \ [int(site[2]) for site in phd_record.sites] except IndexError: # peak locations are not always there according to # David Gordon (the Consed author) pass yield seq_record
def sequence2SeqRecord(seq_obj): """-> loads a SeqRecord object array from a Sequence object array """ seqRecord_obj = SeqRecord(seq_obj.get_SEQUENCE(), id = seq_obj.get_ID()) seqRecord_obj.letter_annotations['phred_quality'] = seq_obj.get_QUALITY() return seqRecord_obj
def test():
    # Object #
    checker = QualityChecker(None, None)
    checker.window_size = 10
    checker.threshold = 21
    checker.min_length = 5
    checker.discard_N = True
    # Dummy test sequence #
    scores = "10 11 13 22 23 24 10 10 9 8 7 9 8 9 5 2 5 8 9 8 9 30 33 30 31 32 33 31 33 33 31 33 32 33 32 32 33 32 2 3 2 3 2 1 3 2 1 23 23 23 10 10 9 9"
    seq = "A T C G T T G A C G G A G T G T A A C T C G A T G A C T T G T C A A C T G G T A G G G T C A A C T G A T C A"
    scores = list(map(int, scores.split()))  # list() so len() also works on Python 3
    seq = ''.join(seq.split())
    assert len(seq) == len(scores)
    # Make into biopython object #
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    read = SeqRecord(Seq(seq), id="test", name="test", description="test")
    read.letter_annotations["phred_quality"] = scores
    # Trim it #
    trimmed = checker.trim_read(read)
    # Check result: the kept window has qualities
    # 30 33 30 31 32 33 31 33 33 31 33 32 33 32 32 #
    correct = "G A T G A C T T G T C A A C T"
    correct = ''.join(correct.split())
    assert str(trimmed.seq) == correct
def test_translate(self): s = SeqRecord(Seq("ATGGTGTAA"), id="TestID", name="TestName", description="TestDescription", dbxrefs=["TestDbxrefs"], features=[SeqFeature(FeatureLocation(0, 3), type="Site")], annotations={'organism': 'bombyx'}, letter_annotations={'test': 'abcdefghi'}) t = s.translate() self.assertEqual(t.seq, "MV*") self.assertEqual(t.id, "<unknown id>") self.assertEqual(t.name, "<unknown name>") self.assertEqual(t.description, "<unknown description>") self.assertFalse(t.dbxrefs) self.assertFalse(t.features) self.assertFalse(t.annotations) self.assertFalse(t.letter_annotations) t = s.translate(cds=True, id=True, name=True, description=True, dbxrefs=True, annotations=True) self.assertEqual(t.seq, "MV") self.assertEqual(t.id, "TestID") self.assertEqual(t.name, "TestName") self.assertEqual(t.description, "TestDescription") self.assertEqual(t.dbxrefs, ["TestDbxrefs"]) self.assertFalse(t.features) self.assertEqual(t.annotations, {'organism': 'bombyx'}) self.assertFalse(t.letter_annotations)
def modify_seq(self, fasta_folder, mod_table_file, output_folder): datas = self._import_data(mod_table_file) for data in datas: seq = "" if (data["ref_id"] + ".fa") in os.listdir(fasta_folder): filename = os.path.join(fasta_folder, data["ref_id"] + ".fa") with open(filename, "r") as fasta: for line in fasta: line = line.strip() if line[0] != ">": seq = seq + line seq_modifier = SeqModifier(seq) for change in data["datas"]: if change["ref_nt"] == "-": seq_modifier.insert( int(change["position"]), change["tar_nt"]) elif change["tar_nt"] == "-": seq_modifier.remove(int(change["position"]), len(change["ref_nt"])) else: seq_modifier.replace( int(change["position"]), change["tar_nt"]) record = SeqRecord(Seq(seq_modifier.seq())) record.id = data["target_id"] record.description = "" SeqIO.write(record, os.path.join( output_folder, record.id + ".fa"), "fasta")
def fetchGene(GeneName):
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    template = service.get_template('Gene_GenomicDNA')
    rows = template.rows(E={"op": "LOOKUP", "value": GeneName,
                            "extra_value": "S. cerevisiae"})
    # this service seems to return multiple similar genes but we want the
    # first one only, so count the rows and keep the first
    count = 0
    for row in rows:
        count = count + 1
        if count == 1:
            descr = row["description"]
            GeneSeq = Seq(row["sequence.residues"])
            GeneSysName = row["secondaryIdentifier"]
            # let's create a record for the old gene
            GeneRecord = SeqRecord(GeneSeq, id=GeneSysName)
            # now let's add some more information to make it useful;
            # .features must hold SeqFeature objects, so keep the
            # systematic name in annotations instead
            GeneRecord.name = GeneName
            GeneRecord.annotations["systematic_name"] = GeneSysName
            GeneRecord.description = descr
            return GeneRecord
def test_genbank_date_list(self): """Check if date lists are handled correctly""" sequence_object = Seq("ATGC", generic_dna) record = SeqRecord(sequence_object, id='123456789', name='UnitTest', description='Test case for date parsing') record.annotations["date"] = ["24-DEC-2015"] handle = StringIO() SeqIO.write(record, handle, 'genbank') handle.seek(0) gb = SeqIO.read(handle, "gb") self.assertEqual(gb.annotations["date"], "24-DEC-2015") record = SeqRecord(sequence_object, id='123456789', name='UnitTest', description='Test case for date parsing') record.annotations["date"] = ["24-DEC-2015", "25-JAN-2016"] handle = StringIO() SeqIO.write(record, handle, 'genbank') handle.seek(0) gb = SeqIO.read(handle, "gb") self.assertEqual(gb.annotations["date"], "01-JAN-1980")
def to_gff(self, filename):
    """
    Export to GFF format, saving to the specified filename.
    """
    records = []
    for fragment in self.__genome.fragments.all():
        fragment = fragment.indexed_fragment()
        seq = Seq(fragment.sequence)
        rec = SeqRecord(seq, "%s" % (fragment.name,))
        features = []
        for annotation in fragment.annotations():
            # annotation coordinates are 1-based inclusive, while
            # FeatureLocation is 0-based half-open, so shift the start by -1
            loc = FeatureLocation(annotation.base_first - 1, annotation.base_last)
            qualifiers = {'name': annotation.feature.name}
            feature = SeqFeature(loc, type=annotation.feature.type,
                                 strand=1, qualifiers=qualifiers)
            features.append(feature)
        rec.features = features
        records.append(rec)
    with open(filename, "w") as out_handle:
        GFF.write(records, out_handle, include_fasta=True)
def translate_and_write_DNA_frames(seq, directionsToConsider="forward", translationTable=1):
    """
    directionsToConsider:
        forward - normal DNA direction
        reverse - reverse complement
        both    - both of the above
    """
    allPossibilities = []
    if directionsToConsider in ("forward", "both"):
        # start translation from the 1st, 2nd and 3rd nucleotide
        for frame in range(3):
            trans = str(seq[frame:].translate(translationTable))
            allPossibilities.append(trans)
    if directionsToConsider in ("reverse", "both"):
        # consider the reverse complement DNA sequence as well;
        # start translation from the 1st, 2nd and 3rd nucleotide
        for frame in range(3):
            trans = str(seq.reverse_complement()[frame:].translate(translationTable))
            allPossibilities.append(trans)
    i = 0
    for currentFrame in allPossibilities:
        i = i + 1
        currentProtein = Seq(currentFrame, alphabet=ProteinAlphabet)
        # seq_record, output_handle and minimum_peptide_length are expected
        # to be defined at module level by the caller
        if len(currentProtein) >= minimum_peptide_length:
            currentProteinRecord = SeqRecord(currentProtein, seq_record.name)
            currentProteinRecord.id = currentProteinRecord.id + "." + str(i)
            currentProteinRecord.description = seq_record.description + "; frame " + str(i)
            SeqIO.write(currentProteinRecord, output_handle, "fasta")
    return
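# Hedged driver sketch for translate_and_write_DNA_frames: the function reads
# the module-level names seq_record, output_handle, minimum_peptide_length
# and ProteinAlphabet, so a caller has to define them first. File names are
# hypothetical; SeqIO and ProteinAlphabet are assumed to be imported already.
def _demo_six_frame_translation():
    global seq_record, output_handle, minimum_peptide_length
    minimum_peptide_length = 20
    output_handle = open('peptides.fasta', 'w')
    for seq_record in SeqIO.parse('genome.fasta', 'fasta'):
        translate_and_write_DNA_frames(seq_record.seq, 'both')
    output_handle.close()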
def test_trimming(self): 'The sequences are trimmed according to the recommendations.' seq1 = 'gggtctcatcatcaggg'.upper() seq = SeqRecord(Seq(seq1), annotations={TRIMMING_RECOMMENDATIONS: {}}) trim_rec = seq.annotations[TRIMMING_RECOMMENDATIONS] seq_trimmer = TrimOrMask() trim_rec['vector'] = [(0, 3), (8, 13)] seq.annotations[TRIMMING_RECOMMENDATIONS] = trim_rec seqs2 = seq_trimmer([seq]) assert str(seqs2[0].seq) == 'CTCA' trim_rec['vector'] = [(0, 0), (8, 13)] seq.annotations[TRIMMING_RECOMMENDATIONS] = trim_rec seqs2 = seq_trimmer([seq]) assert str(seqs2[0].seq) == 'GGTCTCA' trim_rec['vector'] = [(0, 1), (8, 12)] trim_rec['quality'] = [(1, 8), (13, 17)] seq.annotations[TRIMMING_RECOMMENDATIONS] = trim_rec seqs2 = seq_trimmer([seq]) assert not seqs2 trim_rec['vector'] = [(0, 0), (8, 13)] trim_rec['quality'] = [] seq.annotations[TRIMMING_RECOMMENDATIONS] = trim_rec seqs2 = seq_trimmer([seq]) assert str(seqs2[0].seq) == 'GGTCTCA' assert TRIMMING_RECOMMENDATIONS not in seqs2[0].annotations
def getPrimerRegions(primers, options, alignment):
    consensusSequences = []
    # Pull out all the (start,end) tuples from primers
    starts = []
    for primer in primers:
        starts.append(primer[0][0])
    coordinates = []
    # Go through the starts. Check if the next number is i+1. If not, output a slice
    sliceStart = 0
    primerLength = options.maxprimer
    for i in range(0, len(starts)):
        start = starts[i]
        try:
            nextStart = starts[i + 1]
        except IndexError:  # last start reached
            break
        if nextStart != start + 1:
            coordinates.append((sliceStart, start + primerLength))
            sliceStart = nextStart
    # Create the slices
    for pair in coordinates:
        slicedAlignment = alignment[:, pair[0]:pair[1]]
        consensus = Seq(calculateConsensusSequence(slicedAlignment, options.degeneracy))
        seq = SeqRecord(consensus)
        seq.id = str(pair[0]) + "-" + str(pair[1])
        consensusSequences.append(seq)
    return consensusSequences
def bioseq_to_bwa_seq(self): nseq = 100 for fmt in "fastq-sanger", "fastq-illumina", "fastq-solexa": qkey = "solexa_quality" if fmt == "fastq-solexa" else "phred_quality" for unknown in False, True: g = u.random_reads_generator(nseq, fmt=fmt, unknown=unknown) for i, (seq, q) in enumerate(g): name = "foo-%d" % i seq_str = "".join(seq) bioseq = SeqRecord(Seq(seq_str, single_letter_alphabet), id=name, name=name, description=name) bioseq.letter_annotations[qkey] = q n, m = len(bioseq), len(bioseq.name) bwseq = bwa.alloc_seq(1, n, m)[0] bwa.bioseq_to_bwa_seq(bioseq, bwseq, n, m, fmt) self.assertEqual(bioseq.name, bwseq.get_name()) self.assertEqual(bioseq.seq.data[::-1], bwseq.get_seq()) self.assertEqual(reverse_complement(bioseq.seq.data), bwseq.get_rseq()) # check that quality has been converted to sanger if fmt == "fastq-solexa": exp_q = [int(round(x+10*math.log10(1+10**(-x/10.)))) for x in q] else: exp_q = q exp_qstr = "".join(chr(x+sg.Q_OFFSET["fastq-sanger"]) for x in exp_q) self.assertEqual(bwseq.get_qual(), exp_qstr)
def main(fastq_name): MAX_Q = 100 logQs = [10 ** (float(-Q) / 10) for Q in range(MAX_Q + 1)] for record in SeqIO.parse(fastq_name, "fastq"): #print _get_phred_quality(record) #for header, seq in fasta_iter(fasta_name): seq = list(record.seq) Q = record.letter_annotations["phred_quality"] P = [10 ** (float(-x) / 10) for x in Q] for i, s in enumerate(seq): val = random() mutation_freq = P[i] if val < mutation_freq: #choose a random nucleotide that's different. seq[i] = choice([x for x in "ACTG" if x != s.upper()]) #record.seq = "".join(seq) #print record.format("fastq") #print "@" + str(record.ide) + "\n" + "".join(seq) + "\n" + "+" + "\n" + Q mutated = SeqRecord( Seq("".join(seq), generic_dna), id = record.id, description = "" ) mutated.letter_annotations["phred_quality"] = Q print mutated.format("fastq"),
def _init_with_SeqRecord(self,record): # Initialize self using existing SeqRecord object SeqRecord.__init__(self, seq=record.seq, id=record.id, name=record.name, description=record.description, dbxrefs=record.dbxrefs, features=record.features, annotations=record.annotations, letter_annotations=record.letter_annotations)
def __init__( self, seq, id=UNKNOWN_ID, name=UNKNOWN_NAME, description=UNKNOWN_DESCRIPTION, dbxrefs=None, features=None, annotations=None, letter_annotations=None, qual=None, ): if id == UNKNOWN_ID and name != UNKNOWN_NAME: id = name # We don't want a Biopython Seq, we need our repr if not isinstance(seq, Seq) and not isinstance(seq, UnknownSeq): raise ValueError("seq should be a franklin Seq") SeqRecord.__init__( self, seq, id=id, name=name, description=description, dbxrefs=dbxrefs, features=features, annotations=annotations, letter_annotations=letter_annotations, ) if qual is not None: self.qual = qual
def main(fastq): for record in SeqIO.parse(fastq, "fastq"): Q = record.letter_annotations["phred_quality"] upperseq = SeqRecord( Seq(str(record.seq).upper()), id = record.id, description = "" ) upperseq.letter_annotations["phred_quality"] = Q print upperseq.format("fastq")
def CpGIslandsToGFF(island_location):
    # Output methylation regions (CpG Islands, namely) to a GFF3 compliant
    # file; 'base' and 'cur_record' are expected at module level
    out_file = os.getcwd() \
        + '/' \
        + os.path.splitext(base)[0] \
        + '.gff'
    seq = cur_record.seq
    rec = SeqRecord(seq, "ID1")
    qualifiers = {"source": "bssimulation", "score": '.', "ID": cur_record.name}
    sub_qualifiers = {"source": "bssimulation"}
    top_feature = SeqFeature(FeatureLocation(0, len(cur_record)),
                             type="region", strand=0,
                             qualifiers=qualifiers)
    top_feature.sub_features = []  # older Biopython did not pre-create this list
    for i in island_location:
        begin = int(i[0] - i[1] / 2)
        end = int(i[0] + i[1] / 2)
        top_feature.sub_features.append(SeqFeature(FeatureLocation(begin, end),
                                                   type="CpG_island",
                                                   strand=0,
                                                   qualifiers=sub_qualifiers))
    rec.features = [top_feature]
    with open(out_file, "w") as out_handle:
        GFF.write([rec], out_handle)
def clip_seq_record( seqrecord ): ''' Correctly trims sff Bio.SeqRecord.SeqRecord @seqrecord - Bio.SeqRecord.SeqRecord object to trim phred_quality and sequence based on the annotations @returns a new trimmed seqrecord ''' from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord lefti = seqrecord.annotations['clip_qual_left'] righti = seqrecord.annotations['clip_qual_right'] if lefti > righti and righti == 0: righti = len(seqrecord.seq._data) newseq = Seq( seqrecord.seq._data[lefti:righti], seqrecord.seq.alphabet ) newrec = SeqRecord( seq = newseq, id = seqrecord.id, description = seqrecord.description ) quals = seqrecord._per_letter_annotations['phred_quality'] trimquals = quals[lefti:righti] newrec._per_letter_annotations['phred_quality'] = trimquals return newrec
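# Usage sketch for clip_seq_record, assuming the legacy Biopython API (Seq
# objects with .alphabet and ._data) that the function relies on. The record
# and clip points below are made up.
def _demo_clip_seq_record():
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    rec = SeqRecord(Seq('TTTTACGTACGTTTTT'), id='read1', description='')
    rec.letter_annotations['phred_quality'] = [20] * 16
    rec.annotations['clip_qual_left'] = 4
    rec.annotations['clip_qual_right'] = 12
    trimmed = clip_seq_record(rec)
    assert str(trimmed.seq) == 'ACGTACGT'
    assert trimmed.letter_annotations['phred_quality'] == [20] * 8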
def write_seqrecord(self, outh, fmt='fasta'): # Just make up quality scores for now if FASTQ for i in range(self.count): record = SeqRecord(Seq(self.get_sequence(), IUPACAmbiguousDNA())) if fmt.startswith('fastq'): record.letter_annotations = {'phred_quality': [10]*len(record)} outh.write(record.format(fmt))
def proteins(cursor, experiment=None, filter_experiments=True, sequence_key=None): """ Return the selected proteins as SeqRecord objects """ query = """SELECT s.id,s.sequence, e.id, e.short_name, e.taxonomy_id from hpf.experiment e join bddb.protein p on e.id=p.experiment_key join ddbCommon.sequence s on p.sequence_key=s.id """ assert experiment!= None or sequence_key != None if experiment != None or filter_experiments==True or sequence_key != None: query += " where " if experiment: if not hasattr(experiment, "__iter__"): experiment = [experiment] query += " e.id in (%s)" % (",".join([str(key) for key in experiment])) if filter_experiments: t = " e.taxonomy_id!=0" query += " and "+t if experiment else t if sequence_key: t = " s.id in (%s)" % (",".join([str(key) for key in sequence_key])) query += " and "+t if experiment or filter_experiments else t runtime().debug(query) cursor.execute(query) runtime().debug("Fetching") for id, sequence, e_id, e_name, taxonomy_id in cursor.fetchall(): record = SeqRecord(Seq(sequence), str(id), description=e_name) record.annotations = {"taxonomy_id":taxonomy_id, "experiment_key":e_id, "organism":e_name} yield record
class SeqRecordMethods(unittest.TestCase): """Test SeqRecord methods.""" def setUp(self): f0 = SeqFeature(FeatureLocation(0, 26), type="source", qualifiers={"mol_type": ["fake protein"]}) f1 = SeqFeature(FeatureLocation(0, ExactPosition(10))) f2 = SeqFeature( FeatureLocation(WithinPosition(12, left=12, right=15), BeforePosition(22))) f3 = SeqFeature( FeatureLocation( AfterPosition(16), OneOfPosition( 26, [ExactPosition(25), AfterPosition(26)]))) self.record = SeqRecord(Seq("ABCDEFGHIJKLMNOPQRSTUVWZYX", generic_protein), id="TestID", name="TestName", description="TestDescr", dbxrefs=["TestXRef"], annotations={"k": "v"}, letter_annotations={"fake": "X" * 26}, features=[f0, f1, f2, f3]) def test_iter(self): for amino in self.record: self.assertEqual("A", amino) break def test_contains(self): self.assertIn(Seq("ABC", generic_protein), self.record) def test_str(self): expected = """ ID: TestID Name: TestName Description: TestDescr Database cross-references: TestXRef Number of features: 4 /k=v Per letter annotation for: fake Seq('ABCDEFGHIJKLMNOPQRSTUVWZYX', ProteinAlphabet())""" self.assertEqual(expected.lstrip(), str(self.record)) def test_repr(self): expected = "SeqRecord(seq=Seq('ABCDEFGHIJKLMNOPQRSTUVWZYX', ProteinAlphabet()), " \ "id='TestID', name='TestName', description='TestDescr', dbxrefs=['TestXRef'])" self.assertEqual(expected, repr(self.record)) def test_format(self): expected = ">TestID TestDescr\nABCDEFGHIJKLMNOPQRSTUVWZYX\n" self.assertEqual(expected, self.record.format("fasta")) def test_format_str(self): expected = ">TestID TestDescr\nABCDEFGHIJKLMNOPQRSTUVWZYX\n" self.assertEqual(expected, "{:fasta}".format(self.record)) if sys.version_info[0] >= 3: def test_format_str_binary(self): with self.assertRaisesRegex( ValueError, "Binary format sff cannot be used with SeqRecord format method" ): "{:sff}".format(self.record) def test_format_spaces(self): rec = SeqRecord(Seq("ABCDEFGHIJKLMNOPQRSTUVWZYX", generic_protein), id="TestID", name="TestName", description="TestDescr") rec.description = "TestDescr with5spaces" expected = ">TestID TestDescr with5spaces\nABCDEFGHIJKLMNOPQRSTUVWZYX\n" self.assertEqual(expected, rec.format("fasta")) def test_upper(self): self.assertEqual("ABCDEFGHIJKLMNOPQRSTUVWZYX", str(self.record.lower().upper().seq)) def test_lower(self): self.assertEqual("abcdefghijklmnopqrstuvwzyx", str(self.record.lower().seq)) def test_slicing(self): self.assertEqual("B", self.record[1]) self.assertEqual("BC", self.record[1:3].seq) with self.assertRaises(ValueError): c = self.record["a"].seq def test_slice_variants(self): """Simple slices using different start/end values.""" for start in list(range(-30, 30)) + [None]: for end in list(range(-30, 30)) + [None]: if start is None and end is None: continue rec = self.record[start:end] seq = self.record.seq[start:end] seq_str = str(self.record.seq)[start:end] self.assertEqual(seq_str, str(seq)) self.assertEqual(seq_str, str(rec.seq)) self.assertEqual("X" * len(seq_str), rec.letter_annotations["fake"]) def test_slice_simple(self): """Simple slice.""" rec = self.record self.assertEqual(len(rec), 26) left = rec[:10] self.assertEqual(str(left.seq), str(rec.seq[:10])) right = rec[-10:] self.assertEqual(str(right.seq), str(rec.seq[-10:])) mid = rec[12:22] self.assertEqual(str(mid.seq), str(rec.seq[12:22])) for sub in [left, right, mid]: self.assertEqual(len(sub), 10) self.assertEqual(sub.id, "TestID") self.assertEqual(sub.name, "TestName") self.assertEqual(sub.description, "TestDescr") self.assertEqual(sub.letter_annotations, {"fake": 
"X" * 10}) self.assertEqual(sub.dbxrefs, []) # May change this... self.assertEqual(sub.annotations, {}) # May change this... self.assertEqual(len(sub.features), 1) # By construction, each feature matches the full sliced region: self.assertEqual(str(sub.features[0].extract(sub.seq)), str(sub.seq)) self.assertEqual(sub.features[0].extract(str(sub.seq)), str(sub.seq)) def test_slice_zero(self): """Zero slice.""" rec = self.record self.assertEqual(len(rec), 26) self.assertEqual(len(rec[2:-2]), 22) self.assertEqual(len(rec[5:2]), 0) self.assertEqual(len(rec[5:2][2:-2]), 0) def test_add_simple(self): """Simple addition.""" rec = self.record + self.record self.assertEqual(len(rec), 52) self.assertEqual(rec.id, "TestID") self.assertEqual(rec.name, "TestName") self.assertEqual(rec.description, "TestDescr") self.assertEqual(rec.dbxrefs, ["TestXRef"]) self.assertEqual(rec.annotations, {"k": "v"}) self.assertEqual(rec.letter_annotations, {"fake": "X" * 52}) self.assertEqual(len(rec.features), 2 * len(self.record.features)) def test_add_seq(self): """Simple addition of Seq or string.""" for other in [Seq("BIO"), "BIO"]: rec = self.record + other # will use SeqRecord's __add__ method self.assertEqual(len(rec), 26 + 3) self.assertEqual(str(rec.seq), str(self.record.seq) + "BIO") self.assertEqual(rec.id, "TestID") self.assertEqual(rec.name, "TestName") self.assertEqual(rec.description, "TestDescr") self.assertEqual(rec.dbxrefs, ["TestXRef"]) self.assertEqual(rec.annotations, {"k": "v"}) self.assertEqual(rec.letter_annotations, {}) self.assertEqual(len(rec.features), len(self.record.features)) self.assertEqual(rec.features[0].type, "source") self.assertEqual(rec.features[0].location.nofuzzy_start, 0) self.assertEqual(rec.features[0].location.nofuzzy_end, 26) # not +3 def test_add_seqrecord(self): """Simple left addition of SeqRecord from genbank file.""" other = SeqIO.read("GenBank/dbsource_wrap.gb", "gb") other.dbxrefs = ["dummy"] rec = self.record + other self.assertEqual(len(rec), len(self.record) + len(other)) self.assertEqual(str(rec.seq), str(self.record.seq) + str(other.seq)) self.assertEqual(rec.id, "<unknown id>") self.assertEqual(rec.name, "<unknown name>") self.assertEqual(rec.description, "<unknown description>") self.assertEqual(rec.dbxrefs, ["TestXRef", "dummy"]) self.assertEqual(len(rec.annotations), 0) self.assertEqual(len(rec.letter_annotations), 0) self.assertEqual(len(rec.features), len(self.record.features) + len(other.features)) self.assertEqual(rec.features[0].type, "source") self.assertEqual(rec.features[0].location.nofuzzy_start, 0) self.assertEqual(rec.features[0].location.nofuzzy_end, len(self.record)) # not +3 i = len(self.record.features) self.assertEqual(rec.features[i].type, "source") self.assertEqual(rec.features[i].location.nofuzzy_start, len(self.record)) self.assertEqual(rec.features[i].location.nofuzzy_end, len(rec)) def test_add_seq_left(self): """Simple left addition of Seq or string.""" for other in [Seq("BIO"), "BIO"]: rec = other + self.record # will use SeqRecord's __radd__ method self.assertEqual(len(rec), 26 + 3) self.assertEqual(str(rec.seq), "BIO" + str(self.record.seq)) self.assertEqual(rec.id, "TestID") self.assertEqual(rec.name, "TestName") self.assertEqual(rec.description, "TestDescr") self.assertEqual(rec.dbxrefs, ["TestXRef"]) self.assertEqual(rec.annotations, {"k": "v"}) self.assertEqual(rec.letter_annotations, {}) self.assertEqual(len(rec.features), len(self.record.features)) self.assertEqual(rec.features[0].type, "source") 
self.assertEqual(rec.features[0].location.nofuzzy_start, 3) self.assertEqual(rec.features[0].location.nofuzzy_end, 26 + 3) def test_slice_add_simple(self): """Simple slice and add.""" for cut in range(27): rec = self.record[:cut] + self.record[cut:] self.assertEqual(str(rec.seq), str(self.record.seq)) self.assertEqual(len(rec), 26) self.assertEqual(rec.id, "TestID") self.assertEqual(rec.name, "TestName") self.assertEqual(rec.description, "TestDescr") self.assertEqual(rec.dbxrefs, []) # May change this... self.assertEqual(rec.annotations, {}) # May change this... self.assertEqual(rec.letter_annotations, {"fake": "X" * 26}) self.assertTrue(len(rec.features) <= len(self.record.features)) def test_slice_add_shift(self): """Simple slice and add to shift.""" for cut in range(27): rec = self.record[cut:] + self.record[:cut] self.assertEqual( str(rec.seq), str(self.record.seq[cut:] + self.record.seq[:cut])) self.assertEqual(len(rec), 26) self.assertEqual(rec.id, "TestID") self.assertEqual(rec.name, "TestName") self.assertEqual(rec.description, "TestDescr") self.assertEqual(rec.dbxrefs, []) # May change this... self.assertEqual(rec.annotations, {}) # May change this... self.assertEqual(rec.letter_annotations, {"fake": "X" * 26}) self.assertTrue(len(rec.features) <= len(self.record.features))
def gt(): SeqRecord(Seq("A")) > SeqRecord(Seq("A"))
def notequality(): SeqRecord(Seq("A")) != SeqRecord(Seq("A"))
def equality(): SeqRecord(Seq("A")) == SeqRecord(Seq("A"))
def le(): SeqRecord(Seq("A")) <= SeqRecord(Seq("A"))
def lt(): SeqRecord(Seq("A")) < SeqRecord(Seq("A"))
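# The five helpers above (gt, notequality, equality, le, lt) exist to be fed
# to assertRaises: Biopython deliberately leaves SeqRecord rich comparison
# unimplemented, raising NotImplementedError so that records are compared by
# explicit attributes rather than silently by object identity. A sketch of
# how a TestCase would use them:
def _demo_comparisons_raise(testcase):
    for op in (gt, notequality, equality, le, lt):
        testcase.assertRaises(NotImplementedError, op)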
def test_reverse_complement_mutable_seq(self): s = SeqRecord(MutableSeq("ACTG")) self.assertEqual("CAGT", str(s.reverse_complement().seq))
def test_valid_annotations(self): with self.assertRaises(TypeError): SeqRecord(Seq("ACGT", generic_dna), annotations=[])
def test_valid_description(self): with self.assertRaises(TypeError): SeqRecord(Seq("ACGT", generic_dna), description={})
def test_valid_features(self): with self.assertRaises(TypeError): SeqRecord(Seq("ACGT", generic_dna), features={})
def write_out_informative_fasta(compress_seq, alignment, stripFile=None): from Bio import SeqIO from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq sequences = compress_seq['sequences'] ref = compress_seq['reference'] positions = compress_seq['positions'] #If want to exclude sites from initial treebuild, read in here strip_pos = load_mask_sites(stripFile) if stripFile else [] #Get sequence names seqNames = list(sequences.keys()) #Check non-ref sites to see if informative printPositionMap = False #If true, prints file mapping Fasta position to real position sites = [] pos = [] for key in positions: if key not in strip_pos: pattern = [] for k in sequences.keys(): #looping try/except is faster than list comprehension try: pattern.append(sequences[k][key]) except KeyError: pattern.append(ref[key]) origPattern = list(pattern) if '-' in pattern or 'N' in pattern: #remove gaps/Ns to see if otherwise informative pattern = [ value for value in origPattern if value != '-' and value != 'N' ] un = np.unique(pattern, return_counts=True) #If not all - or N, not all same base, and >1 differing base, append if len(un[0]) != 0 and len( un[0]) != 1 and not (len(un[0]) == 2 and min(un[1]) == 1): sites.append(origPattern) pos.append("\t".join([str(len(pos) + 1), str(key)])) #Rotate and convert to SeqRecord sites = np.asarray(sites) align = np.rot90(sites) seqNamesCorr = list(reversed(seqNames)) toFasta = [ SeqRecord(id=seqNamesCorr[i], seq=Seq("".join(align[i])), description='') for i in range(len(sequences.keys())) ] fasta_file = os.path.join(os.path.dirname(alignment), 'informative_sites.fasta') #now output this as fasta to read into raxml or iqtree SeqIO.write(toFasta, fasta_file, 'fasta') #If want a position map, print: if printPositionMap: with open(fasta_file + ".positions.txt", 'w', encoding='utf-8') as the_file: the_file.write("\n".join(pos)) return fasta_file
def test_valid_dbxrefs(self): with self.assertRaises(TypeError): SeqRecord(Seq("ACGT", generic_dna), dbxrefs={})
def run(ifile, cleand_sequence_file, agp_file = 'scaffold.agp', locus_tag = 'XYX', sequence_description = ''): # read input scaffold file """ Reviewer suggestion: Seq1 [organism=Glarea lozoyensis 74030] [strain=74030] """ #sequence_description = '[organism=Glarea lozoyensis 74030] [strain=74030]' if agp_file != False: agp_file = open(agp_file, 'w+') new_sequence_array = [] scaffold_counter = 1 scaffold_name = locus_tag + '_scaffold%(scaffold_counter)s' agp_contig_string = scaffold_name + '\t%(start)s\t%(end)s\t%(linecount)s\tW\t%(sequence_iid)s\t1\t%(end_part_sequence)s\t+\n' agp_gap_string = scaffold_name + '\t%(start)s\t%(end)s\t%(linecount)s\tN\t%(gap_len)s\tfragment\tyes\t\n' sequence_counter = 0 for record in SeqIO.parse(open(ifile), 'fasta'): sequence_start = 0 sequence_end = 0 linecount = 0 # if the whole sequence has a length under 200, we can skip it if len(record.seq) < 200: continue """ Filter out all contigs from a scaffold that are shorter than 200 nucleotides. Two steps, at first replace all short contigs in the sequence and afterwards a special case for the beginning of the line. """ record_seq = str(record.seq) # Check if an arbitrary contig is shorter than 200 nucleotides # replace them with gaps def match_func( match ): return 'N'*len(match.group()) record_seq = re.sub(r'(?=([nN][ACGTactg]{1,199}[nN]))', match_func, record_seq) # Check if the sequence starts with a contig shorter than 200 # replace it with N's and remove it record_seq = re.sub(r'^[ACGTactg]{1,199}[nN]', match_func, record_seq) def match_func( match ): if len(match.group()) < 200: return 'N'*len(match.group()) return match.group() record_seq = re.sub(r'[ACGTactg]+', match_func, record_seq) # remove first occurences of N's record_seq = re.sub('^[nN]+', '', record_seq) # remove last occurences of N's record_seq = re.sub('[nN]+$', '', record_seq) if len(record_seq) < 200: continue # find all N-runs with at least two N's # a single N letter will not be touched for match in re.finditer('[nN]{2,}', record_seq): seq_before_match = match.string[sequence_start : match.start()] linecount += 1 sequence_counter += 1 gap_len = match.end() - match.start() sequence_end += len(seq_before_match) data = {'start': sequence_start + 1, # start relative to the scaffold 'end': sequence_end, # end relative to the scaffold 'linecount': linecount, # linecount in the scaffold 'sequence_iid': 'Seq%s' % sequence_counter, # unique contig iid 'scaffold_counter': scaffold_counter, # counting the scaffold 'end_part_sequence': len(seq_before_match), # end position and len of the contig 'gap_len': gap_len, # Gap length } if agp_file != False: agp_file.write(agp_contig_string % data) linecount += 1 sequence_start += len(seq_before_match) sequence_end += gap_len data.update({'linecount': linecount, 'start': sequence_start + 1, 'end': sequence_end, }) if len(record_seq) < sequence_end: # GAP is at the end of the scaffold, do not track that, in theory that should never happen continue # write to AGP file if agp_file != False: agp_file.write(agp_gap_string % data) sequence_start = sequence_end seq_obj = Seq(seq_before_match, IUPAC.IUPACUnambiguousDNA) temp = SeqRecord(seq_obj, id = 'Seq%s' % sequence_counter, description=sequence_description) assert len( temp.seq ) > 199 new_sequence_array.append(temp) """ sequence after the last gap """ if len(record_seq) > sequence_start: seq_after_last_match = record_seq[sequence_start : len(record_seq)] if len(seq_after_last_match) >= 200: linecount += 1 sequence_end = len(record_seq) sequence_counter += 1 
data = {'linecount': linecount, 'start': sequence_start + 1, 'end': sequence_end, 'sequence_iid': 'Seq%s' % sequence_counter, 'scaffold_counter': scaffold_counter, # counting the scaffold 'end_part_sequence': sequence_end - sequence_start, } if agp_file != False: agp_file.write(agp_contig_string % data) seq_obj = Seq(seq_after_last_match, IUPAC.IUPACUnambiguousDNA) temp = SeqRecord(seq_obj, id = 'Seq%s' % sequence_counter, name=record.name, description=sequence_description) assert len( temp.seq ) > 199 new_sequence_array.append(temp) scaffold_counter += 1 output_file = open(cleand_sequence_file,"w+") SeqIO.write(new_sequence_array, output_file, "fasta") output_file.close() # Test if all contigs bigger than 199 for record in SeqIO.parse(open(cleand_sequence_file), 'fasta'): assert len( record.seq ) > 199
def test_valid_name(self): with self.assertRaises(TypeError): SeqRecord(Seq("ACGT", generic_dna), name={})
def PirIterator(handle):
    """Iterate over PIR/NBRF records as SeqRecord objects.

    handle - input file

    Each record starts with a '>XX;' title line, where XX is the sequence
    type code; that code is stored in the record's annotations dictionary
    under the key 'PIR-type'. The following line holds the description.

    Examples
    --------
    >>> with open("NBRF/DMB_prot.pir") as handle:
    ...     for record in PirIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    HLA:HLA00489 length 263
    HLA:HLA00490 length 94
    HLA:HLA00491 length 94
    HLA:HLA00492 length 80
    HLA:HLA00493 length 175
    HLA:HLA01083 length 188
    """
    # Skip any text before the first record (e.g. blank lines, comments)
    while True:
        line = handle.readline()
        if line == "":
            return  # Premature end of file, or just empty?
        if line[0] == ">":
            break
    while True:
        if line[0] != ">":
            raise ValueError(
                "Records in PIR files should start with '>' character")
        pir_type = line[1:3]
        if pir_type not in _pir_alphabets or line[3] != ";":
            raise ValueError("Records should start with '>XX;' "
                             "where XX is a valid sequence type")
        identifier = line[4:].strip()
        description = handle.readline().strip()
        lines = []
        line = handle.readline()
        while True:
            if not line:
                break
            if line[0] == ">":
                break
            # Remove trailing whitespace, and any internal spaces
            lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()
        seq = "".join(lines)
        if seq[-1] != "*":
            # Note the * terminator is present on nucleotide sequences too,
            # it is not a stop codon!
            raise ValueError(
                "Sequences in PIR files should include a * terminator!")
        # Return the record and then continue...
        record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
                           id=identifier, name=identifier,
                           description=description)
        record.annotations["PIR-type"] = pir_type
        yield record
        if not line:
            return  # StopIteration
    assert False, "Should not reach this line"
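# Usage sketch for PirIterator; the path matches the doctest above, but any
# PIR/NBRF file works. The sequence type parsed from the '>XX;' header is
# available via annotations['PIR-type'].
def _demo_pir_iterator():
    with open('NBRF/DMB_prot.pir') as handle:
        for record in PirIterator(handle):
            print(record.id, record.annotations['PIR-type'], len(record))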
def ConSeqMaker(SeqDict, SeqList): AlignTemp = [] #find the sequences for that individual for Contig in SeqList: SeqTemp = SeqRecord(seq=(Seq(SeqDict[Contig])), id=Contig), AlignTemp += SeqTemp #put them in an alignment AlignTemp = MultipleSeqAlignment(AlignTemp) NSeqs = len(AlignTemp) #make the consensus of the alignment #dumb_consensus works well as long as there are no ambiguities. If I want to be able to notice/count them, I need something more sophisticated. #AlignTempInfo = AlignInfo.SummaryInfo(AlignTemp) #ConSeq = AlignTempInfo.dumb_consensus(ambiguous='-', consensus_alpha=IUPAC) ConSeq = "" AmbigNucs = 0 Overlap = 0 AmbigNucList = [] for SeqPos in range(0, len(SeqDict[Contig])): PosNucs = [] NumNucs = 0 for record in AlignTemp: if record[SeqPos] != '-': PosNucs += record[SeqPos] NumNucs += 1 if NumNucs > 1: Overlap += 1 PosNucs = list(set(PosNucs)) if len(PosNucs) == 1: ConSeq += PosNucs[0] elif len(PosNucs) > 2: ConSeq += 'n' AmbigNucs += 1 AmbigNucList.append(SeqPos) elif len(PosNucs) == 2: if 'n' in PosNucs: ConSeq += 'n' elif 'm' in PosNucs: if 'a' in PosNucs: ConSeq += 'm' elif 'c' in PosNucs: ConSeq += 'm' else: ConSeq += 'n' elif 'r' in PosNucs: if 'a' in PosNucs: ConSeq += 'r' elif 'g' in PosNucs: ConSeq += 'r' else: ConSeq += 'n' elif 'w' in PosNucs: if 'a' in PosNucs: ConSeq += 'w' elif 't' in PosNucs: ConSeq += 'w' else: ConSeq += 'n' elif 's' in PosNucs: if 'c' in PosNucs: ConSeq += 's' elif 'g' in PosNucs: ConSeq += 's' else: ConSeq += 'n' elif 'y' in PosNucs: if 'c' in PosNucs: ConSeq += 'y' elif 't' in PosNucs: ConSeq += 'y' else: ConSeq += 'n' elif 'k' in PosNucs: if 'g' in PosNucs: ConSeq += 'k' elif 't' in PosNucs: ConSeq += 'k' else: ConSeq += 'n' elif 'a' in PosNucs: if 'c' in PosNucs: ConSeq += 'm' elif 'g' in PosNucs: ConSeq += 'r' elif 't' in PosNucs: ConSeq += 'w' else: print( "ERROR!!! We have strange bases in our sequence: %s\n" % (" ".join(PosNucs))) ConSeq += 'n' elif 'c' in PosNucs: if 'g' in PosNucs: ConSeq += 's' elif 't' in PosNucs: ConSeq += 'y' else: print( "ERROR!!! We have strange bases in our sequence: %s\n" % (" ".join(PosNucs))) ConSeq += 'n' elif 'g' in PosNucs: if 't' in PosNucs: ConSeq += 'k' else: print( "ERROR!!! We have strange bases in our sequence: %s\n" % (" ".join(PosNucs))) ConSeq += 'n' AmbigNucs += 1 AmbigNucList.append(SeqPos) return (ConSeq, AmbigNucs, AmbigNucList, NSeqs, Overlap)
def __next__(self):
    """Parse the next alignment from the handle."""
    handle = self.handle
    line = handle.readline()
    if not line:
        raise StopIteration
    # Strip out header comments
    while line and line.strip().startswith("#"):
        line = handle.readline()
    seqs = {}
    seq_regions = {}
    passed_end_alignment = False
    latest_id = None
    while True:
        if not line:
            break  # end of file
        line = line.strip()
        if line.startswith("="):
            # There may be more data, but we've reached the end of this
            # alignment
            break
        elif line.startswith(">"):
            m = XMFA_HEADER_REGEX_BIOPYTHON.match(line)
            if not m:
                m = XMFA_HEADER_REGEX.match(line)
                if not m:
                    raise ValueError("Malformed header line: %s" % line)
            parsed_id = m.group("id")
            parsed_data = {}
            for key in ("start", "end", "id", "strand", "name", "realname"):
                try:
                    value = m.group(key)
                    if key == "start":
                        value = int(value)
                        # Convert to zero based counting
                        if value > 0:
                            value -= 1
                    if key == "end":
                        value = int(value)
                    parsed_data[key] = value
                except IndexError:
                    # This will occur if we're asking for a group that
                    # doesn't exist. It's fine.
                    pass
            seq_regions[parsed_id] = parsed_data
            if parsed_id not in self._ids:
                self._ids.append(parsed_id)
            seqs.setdefault(parsed_id, "")
            latest_id = parsed_id
        else:
            assert not passed_end_alignment
            if latest_id is None:
                raise ValueError("Saw sequence before definition line")
            seqs[latest_id] += line
        line = handle.readline()
    assert len(seqs) <= len(self._ids)
    self.ids = self._ids
    self.sequences = seqs
    if self._ids and seqs:
        alignment_length = max(map(len, list(seqs.values())))
        records = []
        for id in self._ids:
            if id not in seqs or len(seqs[id]) == 0:
                seq = "-" * alignment_length
            else:
                seq = seqs[id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            # Sometimes we don't see a particular sequence in the
            # alignment, so we skip that record since it isn't present in
            # that LCB/alignment
            if id not in seq_regions:
                continue
            if seq_regions[id]["start"] != 0 or seq_regions[id]["end"] != 0:
                suffix = "/{start}-{end}".format(**seq_regions[id])
                if "realname" in seq_regions[id]:
                    corrected_id = seq_regions[id]["realname"]
                else:
                    corrected_id = seq_regions[id]["name"]
                if corrected_id.count(suffix) == 0:
                    corrected_id += suffix
            else:
                if "realname" in seq_regions[id]:
                    corrected_id = seq_regions[id]["realname"]
                else:
                    corrected_id = seq_regions[id]["name"]
            record = SeqRecord(Seq(seq), id=corrected_id, name=id)
            record.annotations["start"] = seq_regions[id]["start"]
            record.annotations["end"] = seq_regions[id]["end"]
            record.annotations["strand"] = (1 if seq_regions[id]["strand"] == "+"
                                            else -1)
            records.append(record)
        return MultipleSeqAlignment(records)
    else:
        raise StopIteration
def add_str(self, seq, name=None, size=1, desc=""): """Use this method to add a sequence as a string to this fasta.""" self.add_seq( SeqRecord(Seq(seq), id=name + ';size=%i;' % size, description=desc))
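# Usage sketch for add_str, assuming 'fasta' is an instance of the
# surrounding class. The id is given the USEARCH-style ';size=N;' suffix,
# e.g. 'otu1;size=42;'; note that the default name=None would fail, since
# the method concatenates name with a string.
def _demo_add_str(fasta):
    fasta.add_str('ACGTACGT', name='otu1', size=42, desc='toy read')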
def mugration_inference(tree=None, seq_meta=None, field='country', confidence=True, infer_gtr=True, root_state=None, missing='?'): from treetime import GTR from Bio.Align import MultipleSeqAlignment from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq # Determine alphabet places = set() for meta in seq_meta.values(): if field in meta: places.add(meta[field]) if root_state is not None: places.add(root_state) # construct GTR (flat for now). The missing DATA symbol is a '-' (ord('-')==45) places = sorted(places) nc = len(places) if nc>180: print("geo_inference: can't have more than 180 places!") return None elif nc==1: print("geo_inference: only one place found -- set every internal node to %s!"%places[0]) return None elif nc==0: print("geo_inference: list of places is empty!") return None else: alphabet = {chr(65+i):place for i,place in enumerate(places)} myGeoGTR = GTR.custom(pi = np.ones(nc, dtype=float)/nc, W=np.ones((nc,nc)), alphabet = np.array(sorted(alphabet.keys()))) missing_char = chr(65+nc) alphabet[missing_char]=missing myGeoGTR.profile_map[missing_char] = np.ones(nc) alphabet_rev = {v:k for k,v in alphabet.iteritems()} pseudo_seqs = [] for name, meta in seq_meta.items(): s=alphabet_rev[meta[field]] if field in meta else missing_char pseudo_seqs.append(SeqRecord(Seq(s), name=name, id=name)) aln = MultipleSeqAlignment(pseudo_seqs) from treetime import TreeAnc tt = TreeAnc(tree=tree, aln=aln, gtr=myGeoGTR, convert_upper=False) tt.use_mutation_length=False tt.infer_ancestral_sequences(infer_gtr=infer_gtr, store_compressed=False, pc=5.0, marginal=True, normalized_rate=False) for node in tt.tree.find_clades(): node.__setattr__(field, alphabet[node.sequence[0]]) if confidence: for node in tt.tree.find_clades(): pdis = node.marginal_profile[0] S = -np.sum(pdis*np.log(pdis+TINY)) marginal = [(alphabet[tt.gtr.alphabet[i]], pdis[i]) for i in range(len(tt.gtr.alphabet))] marginal.sort(key=lambda x: x[1], reverse=True) # sort on likelihoods marginal = [(a, b) for a, b in marginal if b > 0.01][:4] #only take stuff over 1% and the top 4 elements conf = {a:b for a,b in marginal} node.__setattr__(field + "_entropy", S) node.__setattr__(field + "_confidence", conf) return tt, alphabet
def fullContigs(prot, sequence_dict, assembly_dict, protein_dict, prefix):
    """Generates a contig from all hits to a protein.
    If more than one hit, conduct a second exonerate search with the original
    contigs stitched together."""
    logger = logging.getLogger("pipeline")
    numHits = len(prot["assemblyHits"])
    sequence_list = []
    contigHits = []
    logger.debug("All hits:")
    logger.debug(prot["assemblyHits"])
    write_exonerate_stats(prot["assemblyHits"], prefix)
    if numHits == 1:
        # If only one hit to this protein, return it directly.
        return str(sequence_dict[prot["assemblyHits"][0]].seq)
    else:
        for hit in range(len(prot["assemblyHits"])):
            assembly_seq_name = prot["assemblyHits"][hit].split(",")[0]
            logger.debug(
                "Protein hit {} from {} to {} with {}% id on strand {}".format(
                    assembly_seq_name, prot["hit_start"][hit],
                    prot["hit_end"][hit], prot["percentid"][hit],
                    prot["hit_strand"][hit]))
            if assembly_seq_name not in contigHits:
                # Only add each contig once.
                if prot["hit_strand"][hit] == "+":
                    sequence_list.append(assembly_dict[assembly_seq_name])
                else:
                    sequence_list.append(
                        assembly_dict[assembly_seq_name].reverse_complement())
                contigHits.append(assembly_seq_name)
        supercontig = SeqRecord(Seq("".join(str(b.seq) for b in sequence_list)),
                                id=prot["name"])
        # Need to remove contigs if they have the same basename
        supercontig_cds = supercontig_exonerate(supercontig,
                                                protein_dict[prot["name"]],
                                                prefix)
        # Sort the supercontigs by hit location to the protein.
        joined_supercontig_cds = [b for b in supercontig_cds]
        joined_supercontig_cds.sort(key=sort_byhitloc, reverse=True)
        # Get rid of supercontig sequences that are subsumed by longer
        # sequences on the same stretch.
        joined_supercontig_cds = subsume_supercontigs(joined_supercontig_cds)
        SeqIO.write(joined_supercontig_cds,
                    '%s/supercontig_exonerate.fasta' % prefix, 'fasta')
        if len(joined_supercontig_cds) == 1:
            logger.debug("One sequence remaining")
            return str(joined_supercontig_cds[0].seq)
        # One more Exonerate, just to be sure.
        superdupercontig = SeqRecord(
            Seq("".join(str(b.seq) for b in joined_supercontig_cds)),
            id=prot["name"])
        final_supercontig = [
            x for x in supercontig_exonerate(superdupercontig,
                                             protein_dict[prot["name"]], prefix)
        ]
        final_supercontig.sort(key=sort_byhitloc, reverse=True)
        final_supercontig = subsume_supercontigs(final_supercontig)
        return "".join(str(b.seq) for b in final_supercontig)
def Task(self): return SeqRecord(Seq(self["SEQ"], alphabet=MAP_ALPHABET[self["TYPE"]]), name=self["NAME"])
list_of_dictionary.append(mydict) df = dataframe(sort_tuple_list) dataframe_list.append(df) count = count + 1 fname = input('Enter filename__') dedup_records = defaultdict(list) for record in SeqIO.parse(fname, "fasta"): # Use the sequence as the key and then have a list of id's as the value dedup_records[str(record.seq)].append(record.id) # this creates a generator; if you need a physical list, replace the outer "(", ")" by "[" and "]", respectively final_seq = (SeqRecord(Seq(seqi, IUPAC.protein), id="|".join(gi), name='', description='') for seqi, gi in dedup_records.items()) # write file SeqIO.write(final_seq, 'fuzzy.fasta', 'fasta') #parsing the file with open('fuzzy.fasta', 'r') as fasta_file: # Will close handle cleanly identifiers = [] lengths = [] sequences = [] A = [] C = [] D = [] E = [] F = []
genome2summary[genome] = genome + '\t' + str(scaffold_nb) + '\t' + str( cluster_nb) + '\t' + str(nb) + '\t' + str( best) + '\t' + contamination + '\t' + result ########################################################## # extracting the fasta sequences and performing the MSAs # ########################################################## rp2seq = defaultdict(list) for genome in genome2scaffold: scaffold, cluster = genome2scaffold[genome] for orf in genome2scaffold2cluster2orf[genome][scaffold][cluster]: pfam = orf2hmm[orf][0] rp = pfam2rp[pfam] rp2seq[rp].append( SeqRecord(seq=orf2seq[orf].seq, id=genome, description="")) print('performing MSA...') for rp, seqList in rp2seq.items(): print(rp + '\t' + str(len(seqList))) output_filename = folder + '/' + rp + '.fa' SeqIO.write(seqList, output_filename, 'fasta') mafft_filename = output_filename.replace('.fa', '.mafft') cmd = 'mafft --auto --thread ' + str( cpu ) + ' ' + output_filename + ' > ' + mafft_filename + ' 2>/dev/null' print(cmd) os.system(cmd) trimal_filename = mafft_filename.replace('.mafft', '.trimal')
def test_unit_pipeline_default(tmpdir, mocker): tmpdir = str(tmpdir) mocker.patch('os.mkdir') mocker.patch('deepbgc.command.pipeline.logging.FileHandler') mock_seqio = mocker.patch( 'deepbgc.command.pipeline.deepbgc.util.SequenceParser') record1 = SeqRecord('ABC') record2 = SeqRecord('DEF') mock_seqio_instance = mock_seqio.return_value mock_seqio_instance.__enter__.return_value = mock_seqio_instance mock_seqio_instance.parse.return_value = [record1, record2] mock_annotator = mocker.patch('deepbgc.command.pipeline.DeepBGCAnnotator') mock_classifier = mocker.patch( 'deepbgc.command.pipeline.DeepBGCClassifier') mock_detector = mocker.patch('deepbgc.command.pipeline.DeepBGCDetector') writer_paths = [ 'deepbgc.command.pipeline.BGCRegionPlotWriter', 'deepbgc.command.pipeline.ClusterTSVWriter', 'deepbgc.command.pipeline.PfamScorePlotWriter', 'deepbgc.command.pipeline.PfamTSVWriter', 'deepbgc.command.pipeline.GenbankWriter', 'deepbgc.command.pipeline.BGCGenbankWriter', 'deepbgc.command.pipeline.ReadmeWriter' # Note: We are mocking classes imported in deepbgc.command.pipeline, not at their original location! ] writers = [mocker.patch(path) for path in writer_paths] report_dir = os.path.join(tmpdir, 'report') report_tmp_dir = os.path.join(report_dir, 'tmp') run([ 'pipeline', '--output', report_dir, '--detector', 'mydetector', '--label', 'mylabel', '--score', '0.1', '--merge-max-protein-gap', '8', '--merge-max-nucl-gap', '9', '--min-nucl', '10', '--min-proteins', '20', '--min-domains', '30', '--min-bio-domains', '40', '--classifier', 'myclassifier1', '--classifier', 'myclassifier2', '--classifier-score', '0.2', 'mySequence.gbk' ]) os.mkdir.assert_any_call(report_dir) os.mkdir.assert_any_call(report_tmp_dir) mock_annotator.assert_called_with(tmp_dir_path=report_tmp_dir) mock_classifier.assert_any_call(classifier='myclassifier1', score_threshold=0.2) mock_classifier.assert_any_call(classifier='myclassifier2', score_threshold=0.2) mock_detector.assert_called_with(detector='mydetector', label='mylabel', score_threshold=0.1, merge_max_protein_gap=8, merge_max_nucl_gap=9, min_nucl=10, min_proteins=20, min_domains=30, min_bio_domains=40) assert mock_annotator.return_value.run.call_count == 2 # Two records assert mock_detector.return_value.run.call_count == 2 # Two records assert mock_classifier.return_value.run.call_count == 4 # Two records for each of the two classifiers mock_annotator.return_value.print_summary.assert_called_once_with() mock_detector.return_value.print_summary.assert_called_once_with() assert mock_classifier.return_value.print_summary.call_count == 2 # For each of the two classifiers for writer in writers: assert writer.return_value.write.call_count == 2 # Two records writer.return_value.close.assert_called_once_with() # Remove logging handlers to avoid affecting other tests logger = logging.getLogger('') for handler in logger.handlers[:]: logger.removeHandler(handler)
adj_str = ";".join(protein2adj[protein]) merged.write(acco + "_" + protein + "\t" + protein2acc[protein] + "\t" + hit + "\t" + protein2dups[item] + "\t" + str(protein2length[protein]) + "\t" + str(protein2score[protein]) + "\t" + str(protein2align_length[item]) + "\t" + str(num_proteins[item]) + "\t" + prot_str + "\t" + loc_str + "\t" + adj_str + "\n") #print sorted_prot_list if len(sorted_prot_list) > 1: tally = tally + len(sorted_prot_list) newrecord = SeqRecord(Seq("", IUPAC.protein), id=acco + "_" + protein + "_JOINED", name=acco + "_" + protein + "_JOINED", description=protein2acc[protein]) for fragment in sorted_prot_list: #print fragment subrecord = seq_dict[fragment] subseq = subrecord.seq subseq = re.sub("\*", "", str(subseq)) #print subseq #print record.seq #print type(subseq) newrecord.seq = newrecord.seq + "" + subseq #if len(newrecord.seq) > 900 :# and protein2score[protein] > 500: final_proteins.append(newrecord) else:
for isolate in taxa:
    for position in region:
        recomb_regions[isolate].append(position)

# mask indices/positions of recombinant regions identified by gubbins
print('Masking recombinant positions in whole genome alignment.')
sample_masked_indices = defaultdict(list)
new_aln = list()
for record in aln:
    seq_str = list(str(record.seq))
    masked_indices = recomb_regions.get(record.id, [])
    for index in masked_indices:
        seq_str[index] = 'N'
    seq_str = ''.join(seq_str)
    new_record = SeqRecord(Seq(seq_str), id=record.id, description='')
    sample_masked_indices[record.id] = masked_indices
    new_aln.append(new_record)

# write new FASTA file with recombinant regions masked
# (raw strings keep '\.' a valid regex escape)
fasta_outfile = outdir + '/' + re.split(r'/|\.', aln_path)[-2] + \
    '_gubbins_masked.fa'
text_outfile = outdir + '/' + re.split(r'/|\.', aln_path)[-2] + \
    '_masked_recomb_positions.txt'
var_site_outfile = outdir + '/' + re.split(r'/|\.', aln_path)[-2] + \
    '_gubbins_masked_var_sites.fa'

print('Writing', fasta_outfile)
with open(fasta_outfile, 'w') as handle:
    SeqIO.write(new_aln, handle, 'fasta')
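# Distilled, runnable sketch of the masking step above (names hypothetical):
# replace a set of 0-based alignment positions in one record with 'N'.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def mask_positions(record, positions):
    """Return a copy of record with the given 0-based positions set to 'N'."""
    chars = list(str(record.seq))
    for i in positions:
        chars[i] = 'N'
    return SeqRecord(Seq(''.join(chars)), id=record.id, description='')

masked = mask_positions(SeqRecord(Seq('ACGTACGT'), id='s1'), [2, 5])
assert str(masked.seq) == 'ACNTANGT'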
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def cutSequence(sequence, motif, cut_mark=""):
    """
    Takes a Biopython sequence and a motif and returns the sequences cut at
    this motif.
    Takes
        sequence: the sequence as a string, a Seq or a SeqRecord
        motif: a string containing a '-' at the cut site;
               e.g. 'g-aattc' for EcoRI
        cut_mark: mark appended to the cutting site
    Returns a list of Biopython SeqRecord objects, with names built from the
    input sequence, the motif and the location in the input sequence.

    NOTE: the cutSequence function proceeds by reading the sequence and
    cutting the next occurrence of the restriction site. In a case such as a
    "CCC-CC" restriction site and an "ACCCCCCG" sequence, which could in
    reality produce both "ACCC CCCG" and "ACCCC CCG" cuts, only the first
    one will be produced by the function.
    """
    # clean the cutting motif
    pattern = motif.replace('-', '').upper()
    cutSite = motif.find('-')
    if cutSite == -1:
        cutSite = len(motif)  # no '-' found: the cut is at the end of the motif

    # extract the sequence string (str(...) also replaces the removed
    # Seq.tostring() method for Seq inputs)
    try:
        sequenceString = str(sequence.seq).upper()  # sequence is a SeqRecord
        name = sequence.name + '_' + motif + '_'
    except AttributeError:
        sequenceString = str(sequence).upper()  # sequence is a Seq or a string
        name = motif + '_'

    # find the first occurrence
    fragments = []
    nextSite = sequenceString.find(pattern)
    lastCut = 1
    lastPosition = len(sequenceString)

    # loop while a site is found
    while nextSite > 0:
        newFragment = SeqRecord(Seq(sequenceString[:nextSite + cutSite] + cut_mark))
        newFragment.name = (name + str(lastCut) + '_' +
                            str(lastCut + nextSite + cutSite - 1))
        lastCut = lastCut + nextSite + cutSite
        sequenceString = cut_mark + sequenceString[nextSite + cutSite:]
        fragments.append(newFragment)
        nextSite = sequenceString.find(pattern)

    # add the remaining sequence
    if sequenceString != '':
        name = name + str(lastCut) + '_' + str(lastPosition)
        lastFragment = SeqRecord(Seq(sequenceString))
        lastFragment.name = name
        fragments.append(lastFragment)

    return fragments
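# Hypothetical usage of cutSequence, following the EcoRI example from its
# docstring ('g-aattc' cuts between G and AATTC):
for frag in cutSequence('ttgaattcaa', 'g-aattc'):
    print(frag.name, frag.seq)
# expected output:
#   g-aattc_1_3 TTG
#   g-aattc_4_10 AATTCAA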
def make_record(string, trio, parent):
    """Return a new SeqRecord wrapping the given (already reverse-complemented)
    sequence string, with an id built from the trio and parent names."""
    return SeqRecord(seq=Seq(string),
                     id=trio + "_" + parent,
                     description="")
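# Hypothetical usage of make_record: the caller passes a string that has
# already been reverse-complemented, e.g. via Bio.Seq.
from Bio.Seq import Seq

rc = str(Seq('ACGTT').reverse_complement())  # 'AACGT'
rec = make_record(rc, 'trio42', 'mother')
assert rec.id == 'trio42_mother'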
def next(self):
    try:
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        # Empty file - just give up.
        raise StopIteration
    if not line.strip() == '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.

    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False
    while 1:
        line = self.handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == '# STOCKHOLM 1.0':
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError("Could not split line into identifier "
                                 "and sequence:\n" + line)
            id, seq = parts
            if id not in ids:
                ids.append(id)
            seqs.setdefault(id, '')
            seqs[id] += seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            if line[:5] == "#=GF ":
                # Generic per-File annotation, free text
                # Format: "#=GF <feature> <free text>"
                feature, text = line[5:].strip().split(None, 1)
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                if feature not in gf:
                    gf[feature] = [text]
                else:
                    gf[feature].append(text)
            elif line[:5] == '#=GC ':
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif line[:5] == '#=GS ':
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                id, feature, text = line[5:].strip().split(None, 2)
                if id not in gs:
                    gs[id] = {}
                if feature not in gs[id]:
                    gs[id][feature] = [text]
                else:
                    gs[id][feature].append(text)
            elif line[:5] == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                id, feature, text = line[5:].strip().split(None, 2)
                if id not in gr:
                    gr[id] = {}
                if feature not in gr[id]:
                    gr[id][feature] = ""
                gr[id][feature] += text.strip()  # append to any previous entry
                # TODO - Should we check the length matches the alignment
                #        length? For interlaced sequences the GR data can be
                #        split over multiple lines
        # Next line...
    assert len(seqs) <= len(ids)
    #assert len(gs) <= len(ids)
    #assert len(gr) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if ids and seqs:
        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        alignment_length = len(seqs.values()[0])
        records = []  # Alignment obj will put them all in a list anyway
        for id in ids:
            seq = seqs[id]
            if alignment_length != len(seq):
                raise ValueError("Sequences have different lengths, or repeated identifier")
            name, start, end = self._identifier_split(id)
            record = SeqRecord(Seq(seq, self.alphabet),
                               id=id, name=name, description=id,
                               annotations={"accession": name})
            # Accession will be overridden by _populate_meta_data if an
            # explicit accession is provided:
            record.annotations["accession"] = name
            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end
            self._populate_meta_data(id, record)
            records.append(record)
        alignment = MultipleSeqAlignment(records, self.alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation in a new private property:
        alignment._annotations = gr
        return alignment
    else:
        raise StopIteration
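# The method above is an older internal Stockholm iterator; as a hedged
# cross-check, the same format parses through Biopython's public AlignIO API:
from io import StringIO
from Bio import AlignIO

stockholm_text = """\
# STOCKHOLM 1.0
seq_one/1-6  ACDEFG
seq_two/1-6  ACDEYG
//
"""
alignment = AlignIO.read(StringIO(stockholm_text), 'stockholm')
for record in alignment:
    print(record.id, record.seq)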
def makeSeqRecord(id):
    return SeqRecord(Seq("AAA", Reduced.Alphabet),
                     id=id,
                     name="",
                     description="",
                     letter_annotations={"phred_quality": [32, 32, 32]})
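# Hypothetical usage: because makeSeqRecord attaches per-letter phred
# qualities, its records can be written as FASTQ as well as FASTA.
# ('Reduced.Alphabet' is project-specific; current Biopython releases have
# dropped sequence alphabets, so Seq("AAA") alone would be used there.)
from Bio import SeqIO

records = [makeSeqRecord('read_%d' % i) for i in range(3)]
SeqIO.write(records, 'reads.fastq', 'fastq')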