def countMotifs( infile, motifs ):
    '''Locate regular expression *motifs* in the sequences of a fasta formatted *infile*.

    *motifs* is an iterable of ``(name, pattern)`` pairs, where *pattern*
    provides ``finditer``.  Each sequence is scanned twice: once as read
    (reported with strand "+") and once after ``Genomics.complement``
    (reported with strand "-").  Matches on the second scan are stored as
    ``(length - match_end, length - match_start)``.

    Returns a list with one ``(title, hits)`` tuple per sequence; *hits* is
    a list of ``(name, strand, start, end)`` tuples.
    '''
    iterator = FastaIterator.FastaIterator( infile )
    result = []

    while True:
        try:
            record = iterator.next()
        except StopIteration:
            break
        # a false-ish record also terminates the scan
        if not record:
            break

        reverse = Genomics.complement( record.sequence )
        length = len( record.sequence )

        hits = []
        for name, regex in motifs:
            hits.extend( ( name, "+", match.start(), match.end() )
                         for match in regex.finditer( record.sequence ) )
            hits.extend( ( name, "-", length - match.end(), length - match.start() )
                         for match in regex.finditer( reverse ) )

        result.append( ( record.title, hits ) )

    return result
def FilterJunk(orthologs):
    """Remove assignments to junk contigs.

    *orthologs* maps an identifier to a collection of entries that each
    carry a ``contig`` attribute.  Entries for which
    ``Genomics.IsJunk(entry.contig)`` is true are dropped.  Identifiers that
    end up without any entry are deleted from the dictionary.

    The dictionary is modified in place; nothing is returned.
    """
    # Loop over a snapshot: keys are deleted inside the loop, which is not
    # permitted while iterating a live dictionary view.
    for key, entries in list(orthologs.items()):
        # A real list (not a lazy filter object) so emptiness can be tested.
        kept = [entry for entry in entries
                if not Genomics.IsJunk(entry.contig)]
        if kept:
            orthologs[key] = kept
        else:
            del orthologs[key]
def Align(self, s1, s2, result):
    """Align *s1* and *s2* with the wrapped executable and fill *result*.

    Both sequences are written as fasta records ``s1`` and ``s2`` to a
    temporary file, and the shell statement
    ``( mEnvironment mExecutable mOptions tmpfile )`` is executed.  Standard
    output is searched for the line ``Alignment (FASTA format):``; the lines
    starting two lines below it are parsed with ``Genomics.ParseFasta2Hash``.

    The two aligned strings are walked column by column with 1-based residue
    counters.  Every column in which both characters are upper-case ASCII
    letters is added via ``result.addPairExplicit(x1, x2, 0)``.

    :param s1: first sequence.
    :param s2: second sequence.
    :param result: alignment container; it is cleared first.
    :return: *result*, or ``None`` if no alignment was found in the output.
    :raises ValueError: if the statement exits with a non-zero status.
    """
    result.clear()

    handle_tmpfile, filename_tmpfile = tempfile.mkstemp()
    try:
        with os.fdopen(handle_tmpfile, "w") as outfile:
            outfile.write(">s1\n%s\n" % (s1))
            outfile.write(">s2\n%s\n" % (s2))

        statement = " ".join(("(", self.mEnvironment, self.mExecutable,
                              self.mOptions, filename_tmpfile, ")"))

        p = subprocess.Popen(statement,
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True,
                             universal_newlines=True)

        # communicate() closes stdin and drains stdout and stderr together,
        # so a chatty child cannot block on a full pipe.
        data_stdout, data_stderr = p.communicate()

        # The exit status lives on the process object; closing a pipe always
        # returns None and therefore never signalled a failure.
        if p.returncode:
            raise ValueError(
                "Error while executing statement %s" % statement)
    finally:
        # the input file is removed on every path, including errors
        if os.path.exists(filename_tmpfile):
            os.remove(filename_tmpfile)

    lines = data_stdout.splitlines(True)

    r = None
    for index, line in enumerate(lines):
        if re.search(r"Alignment \(FASTA format\):", line):
            r = Genomics.ParseFasta2Hash(lines[index + 2:])
            break

    if not r:
        return None

    a1 = r['s1']
    a2 = r['s2']

    x1 = 1
    x2 = 1
    for c1, c2 in zip(a1, a2):
        if c1 in string.ascii_uppercase and c2 in string.ascii_uppercase:
            result.addPairExplicit(x1, x2, 0)
            x1 += 1
            x2 += 1
            continue

        # any character other than a gap consumes a residue of its sequence
        if c1 != "-":
            x1 += 1
        if c2 != "-":
            x2 += 1

    return result
def loadSequence(self, sequence):
    """Load sequence properties and codon counts from *sequence*.

    The base class loader is called first; the upper-cased sequence is then
    passed to ``Genomics.CountCodons`` and the result is stored in
    ``self.mCodonCounts``.

    :raises ValueError: if the length of *sequence* is not a multiple of 3.
    """
    if len(sequence) % 3:
        # The former message referred to a record that does not exist in
        # this scope; only the length is known here.
        raise ValueError(
            "sequence is not multiples of 3: length=%i!" % len(sequence))

    SequenceProperties.loadSequence(self, sequence)

    # uppercase all letters and count codons
    self.mCodonCounts = Genomics.CountCodons(sequence.upper())
def Expand(self):
    """Switch the entry to its expanded state.

    Sets ``mExpand``.  If ``mMapPeptide2Translation`` has a non-zero length,
    the alignment strings and the query/sbjct coordinates are refreshed from
    it.  ``mMapPeptide2Genome`` is rebuilt from ``mAlignmentString``.
    """
    self.mExpand = True

    peptide2translation = self.mMapPeptide2Translation
    if peptide2translation.getLength() > 0:
        emissions = alignlib_lite.py_AlignmentFormatEmissions(
            peptide2translation)
        self.mQueryAli, self.mSbjctAli = (emissions.mRowAlignment,
                                          emissions.mColAlignment)

        # rows of the map belong to the query, columns to the sbjct
        self.mQueryFrom = peptide2translation.getRowFrom()
        self.mQueryTo = peptide2translation.getRowTo()
        self.mSbjctFrom = peptide2translation.getColFrom()
        self.mSbjctTo = peptide2translation.getColTo()

    self.mMapPeptide2Genome = Genomics.String2Alignment(
        self.mAlignmentString)
def loadSequence(self, sequence):
    """Load sequence properties and amino acid counts from *sequence*.

    After the base class loader has run, *sequence* is cut into consecutive
    chunks of three characters.  Each chunk is mapped with
    ``Genomics.MapCodon2AA`` and counted in ``self.mCountsAA``, which starts
    with a zero for every letter of the IUPAC extended protein alphabet.
    """
    SequenceProperties.loadSequence(self, sequence)

    # counts of amino acids, one zeroed slot per known letter
    counts = dict.fromkeys(Bio.Alphabet.IUPAC.extended_protein.letters, 0)
    self.mCountsAA = counts

    for start in range(0, len(sequence), 3):
        residue = Genomics.MapCodon2AA(sequence[start:start + 3])
        counts[residue] += 1
def getKL(self, usage):
    """Return the Kullback-Leibler divergence (relative entropy) of the
    reference codon *usage* with respect to the codon frequencies of this
    sequence.

    The frequencies are obtained from ``self.mCodonCounts`` and
    ``self.mPseudoCounts`` via
    ``Genomics.CalculateCodonFrequenciesFromCounts``.  The result is the sum
    over all counted codons of ``usage * log(usage / frequency)``.

    :param usage: mapping of codon to reference frequency.
    :return: the summed divergence.
    """
    freqs = Genomics.CalculateCodonFrequenciesFromCounts(
        self.mCodonCounts, self.mPseudoCounts)

    e = 0
    for codon in self.mCodonCounts:
        reference = usage[codon]
        # p * log(p / q) tends to 0 as p -> 0; skip the term so that
        # math.log(0) does not raise a domain error.
        if reference == 0:
            continue
        e += reference * math.log(reference / freqs[codon])
    return e
def getCopy(self):
    """Return a new ``Prediction`` carrying the same values as this entry.

    All scalar attributes are copied by reference.  If the entry is
    expanded, ``mMapPeptide2Translation`` is duplicated into a fresh
    alignment vector and ``mMapPeptide2Genome`` is rebuilt from the
    alignment string.  Otherwise both maps are set to ``None`` on the copy.

    This entry is left untouched.
    """
    copied_attributes = (
        "mExpand",
        "mPredictionId",
        "mQueryToken", "mQueryFrom", "mQueryTo",
        "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
        "mRank", "score",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mAlignmentString",
        "mQueryAli", "mSbjctAli",
    )

    new_entry = Prediction()
    for name in copied_attributes:
        setattr(new_entry, name, getattr(self, name))

    if self.mExpand:
        new_entry.mMapPeptide2Translation = \
            alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_copyAlignment(new_entry.mMapPeptide2Translation,
                                       self.mMapPeptide2Translation)
        new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
            new_entry.mAlignmentString)
    else:
        # Only the copy is reset.  A chained assignment here used to clear
        # the maps on the source object as well.
        new_entry.mMapPeptide2Translation = None
        new_entry.mMapPeptide2Genome = None

    return new_entry
def fillFromTable(self, table_row):
    """Fill this entry from a database/table row.

    The first 25 fields are assigned to the prediction attributes in the
    order listed below.  If a 26th field is present it is stored in
    ``mNAssembled``; any further fields are ignored.

    If ``mExpand`` is set, ``mMapPeptide2Translation`` is rebuilt from the
    query/sbjct alignment strings (when both are non-empty) and
    ``mMapPeptide2Genome`` is rebuilt from ``mAlignmentString``.

    :param table_row: sequence with at least 25 fields.
    :raises ValueError: if *table_row* has fewer than 25 fields.
    """
    fields = (
        "mPredictionId",
        "mQueryToken", "mSbjctToken", "mSbjctStrand",
        "mRank", "score",
        "mQueryFrom", "mQueryTo", "mQueryAli",
        "mSbjctFrom", "mSbjctTo", "mSbjctAli",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mAlignmentString",
        # optional 26th field
        "mNAssembled",
    )

    nfields = len(table_row)
    if nfields < len(fields) - 1:
        raise ValueError("unknown format: %i fields" % nfields)

    # zip stops at the shorter input: 25 fields leave mNAssembled untouched,
    # surplus fields beyond the 26th are dropped.
    for name, value in zip(fields, table_row):
        setattr(self, name, value)

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatEmissions(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def GetBlocks( self, s1, s2 ):
    """Run the wrapped executable on two aligned strings and return the
    selected blocks.

    The strings have to be already aligned!!!

    Both strings are written as fasta records ``s1`` and ``s2`` to a
    temporary file, whose name is substituted into ``self.mExecutable``.
    The result is read from ``<tmpfile>-gb`` and parsed with
    ``Genomics.ParseFasta2Hash``.

    The temporary input file, the ``-gb`` file and the ``-gb.htm`` file are
    removed on every path.

    :return: tuple with the two processed strings, or ``("", "")`` if no
        output file was written or it held no sequences.
    """
    handle_tmpfile, filename_tmpfile = tempfile.mkstemp()
    filename_blocks = filename_tmpfile + "-gb"

    try:
        with os.fdopen( handle_tmpfile, "w" ) as outfile:
            outfile.write( ">s1\n%s\n" % (s1) )
            outfile.write( ">s2\n%s\n" % (s2) )

        # str.join takes ONE iterable; passing the parts as separate
        # arguments raises a TypeError.
        statement = " ".join( ( "(",
                                self.mEnvironment,
                                self.mExecutable % filename_tmpfile,
                                self.mOptions,
                                ")" ) )

        p = subprocess.Popen( statement,
                              shell=True,
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              close_fds=True,
                              universal_newlines=True )

        # drain both pipes; the result is read from the output file instead
        p.communicate()

        # NOTE(review): the exit status is deliberately not enforced.  The
        # former check used the return value of file.close(), which is
        # always None, so it never fired; presence of the output file is
        # what decides success.  Confirm that the wrapped tool exits with 0
        # on success before turning p.returncode into a hard error.

        if not os.path.exists( filename_blocks ):
            return "", ""

        with open( filename_blocks ) as infile:
            lines = infile.readlines()
    finally:
        for filename in ( filename_tmpfile,
                          filename_blocks,
                          filename_blocks + ".htm" ):
            if os.path.exists( filename ):
                os.remove( filename )

    r = Genomics.ParseFasta2Hash( lines )

    if not r:
        return "", ""

    return r['s1'], r['s2']
def getEntropy(self, usage=None):
    """Return the entropy of a source in terms of a reference *usage*.

    Also called conditional entropy or encoding cost.  The value is the
    negated sum over all counted codons of ``frequency * log(usage)``.  The
    frequencies come from ``Genomics.CalculateCodonFrequenciesFromCounts``
    applied to ``self.mCodonCounts`` and ``self.mPseudoCounts``.

    Note from the original author: the sum runs over 20 entropies, one for
    each amino acid.

    :param usage: mapping of codon to reference frequency.  If not given,
        the frequencies of the sequence itself are used, i.e. the plain
        entropy is calculated.
    :return: the entropy.
    """
    freqs = Genomics.CalculateCodonFrequenciesFromCounts(
        self.mCodonCounts, self.mPseudoCounts)

    if usage is None:
        usage = freqs

    e = 0
    for codon in self.mCodonCounts:
        frequency = freqs[codon]
        # A codon that never occurs contributes 0 whatever its usage is;
        # skipping it avoids math.log(0) when the usage is 0 as well.
        if frequency == 0:
            continue
        e -= frequency * math.log(usage[codon])
    return e
id_peptides = None, id_genomes = None, output_options = [] ) (options, args) = E.Start( parser ) if options.range_genome: options.range_genome = map(int, options.range_genome.split(",")) if options.range_peptide: options.range_peptide = map(int, options.range_peptide.split(",")) wrapper = Exonerate( options=options.options, output_options=options.output_options ) wrapper.mLogLevel = options.loglevel if options.loglevel >= 2: print "# reading peptide sequence." peptide_sequences = Genomics.ReadPeptideSequences( open(options.input_filename_peptide, "r") ) if options.loglevel >= 2: print "# reading genome sequence." genome_sequences = Genomics.ReadGenomicSequences( open(options.input_filename_genome, "r"), do_reverse = 0 ) if not options.id_peptides: options.id_peptides= peptide_sequences.keys() if not options.id_genomes: options.id_genomes= genome_sequences.keys() for x in options.id_peptides: ps = peptide_sequences[x] if options.range_peptide: ps = ps[options.range_peptide[0]:options.range_peptide[1]] for y in options.id_genomes:
def loadSequence(self, sequence):
    """Load sequence properties and per-position nucleotide counts.

    The upper-cased *sequence* is processed in consecutive chunks of three
    characters (codons).  For each codon:

    * the nucleotide at each of the three positions is counted in
      ``self.mCounts[position]``;
    * if ``Genomics.IsStopCodon`` is true, ``self.mNStopCodons`` is
      increased and the codon is not processed further;
    * otherwise the degeneracy of each position is looked up with
      ``Genomics.GetDegeneracy`` and the nucleotide is counted in
      ``self.mCountsDegeneracy[position][degeneracy]``.  A ``KeyError``
      raised during this step is ignored.
    """
    SequenceProperties.loadSequence(self, sequence)

    # uppercase all letters
    sequence = sequence.upper()

    self.mNStopCodons = 0

    # Nucleotide counts for each codon position.  These are not the sum of
    # the per-degeneracy counts below, as a codon may be ambiguous
    # (e.g. GNN) and then fails the degeneracy lookup.
    self.mCounts = [dict.fromkeys("ACGTXN", 0) for _ in range(3)]

    # nucleotide counts for each codon position per degeneracy (0..4)
    dna_letters = Bio.Alphabet.IUPAC.extended_dna.letters
    self.mCountsDegeneracy = [
        [dict.fromkeys(dna_letters, 0) for _ in range(5)]
        for _ in range(3)
    ]

    for start in range(0, len(sequence), 3):
        codon = sequence[start:start + 3]

        for position in (0, 1, 2):
            self.mCounts[position][codon[position]] += 1

        if Genomics.IsStopCodon(codon):
            self.mNStopCodons += 1
            continue

        try:
            _aa, first, second, third = Genomics.GetDegeneracy(codon)
            for position, degree in enumerate((first, second, third)):
                self.mCountsDegeneracy[position][degree][codon[position]] += 1
        except KeyError:
            # unknown codon or nucleotide: leave the degeneracy counts as is
            pass
def updateVariants(variants, lcontig, strand, phased=True):
    '''Convert *variants* to the coordinate system (and strand) of the
    transcript.

    Every variant is turned into an ``ExtendedVariant`` built from
    ``(start, end, reference, action, has_wildtype, variantseqs)``.  The
    action is "=" for single-character genotypes, "-" for deletions, "+"
    for insertions, and ">" / "<" for genotypes that contain both an
    insertion and a deletion (">" if the genotype starts with "+").

    On the negative strand, the reference and the variant sequences are
    passed through ``Genomics.complement`` and the interval is mirrored on
    *lcontig*.

    If *phased* is false, empty variant sequences are dropped.

    The original note says this "fixes 1-ness of variants", but the
    corresponding ``pos -= 1`` is commented out, so positions are used as
    given.
    '''
    forward = Genomics.IsPositiveStrand(strand)
    converted = []

    for variant in variants:
        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants (disabled)
        # pos -= 1

        if len(genotype) == 1:
            # single character genotype: substitution at pos
            alleles = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in alleles
            action = "="
            start, end = pos, pos + 1
        else:
            # indel genotype: strip the leading marker of each allele
            alleles = [allele[1:] for allele in genotype.split("/")]
            # the longest allele is determined BEFORE empty ones are dropped
            longest = max(len(allele) for allele in alleles)
            if not phased:
                alleles = [allele for allele in alleles if allele]
            has_wildtype = "*" in genotype

            has_insertion = "+" in genotype
            has_deletion = "-" in genotype

            # NOTE(review): if neither marker is present, action/start/end
            # keep whatever an earlier variant left behind (or are unbound
            # for the first variant) -- confirm such genotypes cannot occur.
            if has_insertion and has_deletion:
                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action, padded = ">", 1
                else:
                    action, padded = "<", 0
                alleles[padded] += "-" * (longest - len(alleles[padded]))
                start, end = pos + 1, pos + longest + 1

            elif has_deletion:
                action = "-"
                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + longest + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                alleles = [allele.ljust(longest, "-") for allele in alleles]

            elif has_insertion:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking bases so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not forward:
            reference = Genomics.complement(reference)
            alleles = [Genomics.complement(allele.upper())
                       for allele in alleles]
            start, end = lcontig - end, lcontig - start

        converted.append(ExtendedVariant._make(
            (start, end, reference.upper(), action, has_wildtype, alleles)))

    return converted
def setTranslation(self, genomic_sequence):
    """Set the translation from *genomic_sequence*.

    Calls ``Genomics.Alignment2PeptideAlignment`` with the peptide-to-genome
    map, the query start, the genomic sbjct start and *genomic_sequence*.
    The two returned values are stored in ``mMapPeptide2Translation`` and
    ``mTranslation``.
    """
    peptide_map, translation = Genomics.Alignment2PeptideAlignment(
        self.mMapPeptide2Genome,
        self.mQueryFrom,
        self.mSbjctGenomeFrom,
        genomic_sequence)
    self.mMapPeptide2Translation = peptide_map
    self.mTranslation = translation
def Align( self, method, anchor = 0, loglevel = 1 ):
    """Align the pair of sequences held by this object.

    Original author's note: get rid of this and use a method class instead
    in the future.

    Both sequences are padded with *anchor* "A" characters on each side
    before alignment.  The padding is removed from the alignment again
    afterwards.

    :param method: one of "dialign", "blastz", "dialignlgs", "dba", "nw" or
        "sw".  Any other value is treated as a callable and invoked as
        ``method(s1, s2, map_a2b)``.  "clustal" is recognized but not
        implemented.
    :param anchor: number of padding residues added on both sides.
    :param loglevel: not used by this method.
    :raises NotImplementedError: for method "clustal".
    :raises AlignmentError: if the resulting alignment is empty.

    On success the alignment, the aligned strings, the coordinates and the
    gap/length statistics are stored on the object.  ``SetPercentIdentity``
    and ``SetBlockSizes`` are called last.  With "blastz", if the wrapper
    reports a reverse complement hit, ``strand`` becomes "-" and
    ``mSequence2`` is replaced by ``Genomics.complement`` of itself.
    """
    map_a2b = alignlib_lite.py_makeAlignmentVector()

    padding = "A" * anchor
    s1 = padding + self.mSequence1 + padding
    s2 = padding + self.mSequence2 + padding

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign( self.mOptionsDialign )
        dialign.Align( s1, s2, map_a2b )
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
        blastz.Align( s1, s2, map_a2b )
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement( self.mSequence2 )
    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
        dialignlgs.Align( s1, s2, map_a2b )
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align( s1, s2, map_a2b )
    elif method == "clustal":
        # the wrapper calls that used to follow this raise were unreachable
        raise NotImplementedError( "clustal wrapper needs to be updated")
    elif method == "nw":
        seq1 = alignlib_lite.py_makeSequence( s1 )
        seq2 = alignlib_lite.py_makeSequence( s2 )
        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_GLOBAL, gop=-12.0, gep=-2.0 )
        alignator.align( map_a2b, seq1, seq2 )
    elif method == "sw":
        seq1 = alignlib_lite.py_makeSequence( s1 )
        seq2 = alignlib_lite.py_makeSequence( s2 )
        # NOTE(review): alignator_sw and min_score_sw are not defined in
        # this method; presumably module level names -- verify.
        alignlib_lite.py_performIterativeAlignment(
            map_a2b, seq1, seq2, alignator_sw, min_score_sw )
    else:
        # use callback function
        method( s1, s2, map_a2b )

    if map_a2b.getLength() == 0:
        raise AlignmentError( "empty alignment" )

    if anchor:
        # cut away the padded regions, then shift back to the coordinates
        # of the unpadded sequences
        map_a2b.removeRowRegion(
            anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
        map_a2b.removeRowRegion( 1, anchor )
        map_a2b.removeColRegion(
            anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )
        map_a2b.removeColRegion( 1, anchor )
        map_a2b.moveAlignment( -anchor, -anchor )

    f = alignlib_lite.py_AlignmentFormatExplicit(
        map_a2b,
        alignlib_lite.py_makeSequence( self.mSequence1 ),
        alignlib_lite.py_makeSequence( self.mSequence2 ) )

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = \
        f.mRowAlignment, f.mColAlignment

    f = alignlib_lite.py_AlignmentFormatEmissions( map_a2b )
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()
    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
def updateProperties(self):
    """Update derived properties.

    Runs the ``SequencePropertiesCodons`` update first, then stores the
    result of ``Genomics.CalculateCodonFrequenciesFromCounts`` for the
    current codon counts in ``self.mCodonFrequencies``.
    """
    SequencePropertiesCodons.updateProperties(self)

    codon_counts = self.mCodonCounts
    self.mCodonFrequencies = \
        Genomics.CalculateCodonFrequenciesFromCounts(codon_counts)
def updateVariants( variants, lcontig, strand, phased = True ):
    '''Convert *variants* to the coordinate system (and strand) of the
    transcript.

    Every variant is turned into an ``ExtendedVariant`` built from
    ``(start, end, reference, action, has_wildtype, variantseqs)``.  The
    action is "=" for single-character genotypes, "-" for deletions, "+"
    for insertions, and ">" / "<" for genotypes that contain both an
    insertion and a deletion (">" if the genotype starts with "+").

    On the negative strand, the reference and the variant sequences are
    passed through ``Genomics.complement`` and the interval is mirrored on
    *lcontig*.

    If *phased* is false, empty variant sequences are dropped.

    The original note says this "fixes 1-ness of variants", but the
    corresponding ``pos -= 1`` is commented out, so positions are used as
    given.
    '''
    forward = Genomics.IsPositiveStrand( strand )
    converted = []

    for variant in variants:
        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants (disabled)
        # pos -= 1

        if len(genotype) == 1:
            # single character genotype: substitution at pos
            alleles = list(Genomics.decodeGenotype( genotype ))
            has_wildtype = reference in alleles
            action = "="
            start, end = pos, pos + 1
        else:
            # indel genotype: strip the leading marker of each allele
            alleles = [ allele[1:] for allele in genotype.split("/") ]
            # the longest allele is determined BEFORE empty ones are dropped
            longest = max( len(allele) for allele in alleles )
            if not phased:
                alleles = [ allele for allele in alleles if allele ]
            has_wildtype = "*" in genotype

            has_insertion = "+" in genotype
            has_deletion = "-" in genotype

            # NOTE(review): if neither marker is present, action/start/end
            # keep whatever an earlier variant left behind (or are unbound
            # for the first variant) -- confirm such genotypes cannot occur.
            if has_insertion and has_deletion:
                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action, padded = ">", 1
                else:
                    action, padded = "<", 0
                alleles[padded] += "-" * (longest - len(alleles[padded]))
                start, end = pos + 1, pos + longest + 1

            elif has_deletion:
                action = "-"
                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + longest + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                alleles = [ allele.ljust( longest, "-" ) for allele in alleles ]

            elif has_insertion:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking bases so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not forward:
            reference = Genomics.complement( reference )
            alleles = [ Genomics.complement( allele.upper() )
                        for allele in alleles ]
            start, end = lcontig - end, lcontig - start

        converted.append( ExtendedVariant._make(
            ( start, end, reference.upper(), action, has_wildtype, alleles ) ) )

    return converted
"--options", dest="options", type="string", help="BlastZ options.") parser.set_defaults( \ input_filename_seq1 = None, input_filename_seq2 = None, options = "B=0 C=2") (options, args) = E.Start(parser) wrapper = BlastZ(options.options) import alignlib_lite seqs1 = Genomics.ReadPeptideSequences( open(options.input_filename_seq1, "r")) seqs2 = Genomics.ReadPeptideSequences( open(options.input_filename_seq2, "r")) seq1 = seqs1[seqs1.keys()[0]] seq2 = seqs2[seqs2.keys()[0]] result = alignlib_lite.py_makeAlignmentVector() wrapper.Align(seq1, seq2, result) print str( alignlib_lite.py_AlignmentFormatExplicit( result, alignlib_lite.py_makeSequence(seq1), alignlib_lite.py_makeSequence(seq2))) E.Stop()