def amplitude_from_abif(filename):
    '''
    Return the amplitude recorded for every base of a trace.

    filename : sanger data in abif format

    Returns a list of (base, amplitude) tuples, one per base of the trace
    sequence and in sequence order. Amplitudes are read from the "P1AM1"
    tag of the file.

    Raises IndexError if the "P1AM1" tag holds fewer values than the
    sequence has bases.
    '''
    abif = Trace(filename)
    # Fetch the amplitude tag once; it does not change between iterations.
    ampl = abif.get_data("P1AM1")
    # Index explicitly (instead of zip) so that a tag shorter than the
    # sequence still fails loudly rather than silently truncating the result.
    return [(base, ampl[index]) for index, base in enumerate(abif.seq)]
def amplitude_from_abif(filename):
    '''
    Return the amplitude recorded for every base of a trace.

    filename : sanger data in abif format

    Returns a list of (base, amplitude) tuples, one per base of the trace
    sequence and in sequence order. Amplitudes are read from the "P1AM1"
    tag of the file.

    Raises IndexError if the "P1AM1" tag holds fewer values than the
    sequence has bases.
    '''
    abif = Trace(filename)
    # Fetch the amplitude tag once; it does not change between iterations.
    ampl = abif.get_data("P1AM1")
    # Index explicitly (instead of zip) so that a tag shorter than the
    # sequence still fails loudly rather than silently truncating the result.
    return [(base, ampl[index]) for index, base in enumerate(abif.seq)]
def __init__(self, filename, ratio, q_set):
    '''
    Load a trace file, cache its tags on the instance and run base calling.

    filename : path handed to Trace
    ratio    : stored on the instance as a float and passed to makebasecall
    q_set    : stored on the instance as an int and passed to makebasecall

    The optional tags P1AM1, P2AM1 and P2BA1 are read inside a
    try/except KeyError; when that lookup fails, makebasecall is invoked
    with correction=False and the corresponding attributes stay unset.
    '''
    trace = Trace(filename)

    self.ratio = float(ratio)
    self.q_set = int(q_set)

    # Placeholders, overwritten by the makebasecall() result at the end.
    self.primarySeq = []
    self.secondarySeq = []
    self.trim5 = 0
    self.trim3 = 0

    # The order of the lightwave of nucleotides.
    self.order = list(trace.get_data('FWO_1'))
    # Bases called by the basecaller of the machine.
    self.primary_basecalls = trace.get_data('PBAS2')
    # Base quality assigned by the basecaller.
    self.primary_quality = trace.get_data('PCON2')
    # Primary peak locations called by the basecaller.
    self.primary_basecallPos = np.array(trace.get_data('PLOC2'))

    # Unprocessed RAW DATA -- analyzed color data, one row per DATA tag.
    channel_tags = ('DATA9', 'DATA10', 'DATA11', 'DATA12')
    self.traceMatrix = np.array([trace.get_data(tag) for tag in channel_tags])
    #self.peakPosMatrix , self.peakAmpMatrix = self.getPeakPosAndAmp()

    try:
        # Primary peak amplitudes called by the basecaller.
        self.primary_peakAmp = np.array(trace.get_data('P1AM1'))
        self.secondary_peakAmp = np.array(trace.get_data('P2AM1'))
        self.secondary_peakBase = trace.get_data('P2BA1')
    except KeyError:
        correction = False
    else:
        correction = True

    self.background_value = None

    called = self.makebasecall(self.ratio, self.q_set, correction)
    self.primarySeq, self.secondarySeq, self.trim5, self.trim3 = called
def main(trace, seq, gene2locus):
    '''
    Genotype the loci of a reference gene from a Sanger read.

    trace      : trace file opened with Trace
    seq        : query passed to ``blastn -query``; make sure trace and seq
                 correspond to each other correctly
    gene2locus : dict, subject (gene) id -> {locus id: coordinate on that
                 subject}

    Returns gft2base, a dict mapping locus id -> two-letter string. For a
    locus that falls on a detected heterozygous site the string is the
    called base followed by the alternative base; otherwise it is the
    aligned query base repeated twice. Loci outside the hit range are
    omitted.

    Raises ValueError when the blastn output does not split into exactly
    the nine requested fields (no hit, or more than one hit line), and
    AssertionError when the subject id of the hit is not in gene2locus.

    Relies on the module-level names SKIP_BASE_NUM, TRACE_THRESHOLD,
    SIMILTY_THRESHOLD, calSimilty, getSecondLarge, baseOrderDict, args and
    refSeq.
    '''
    heteroSite = {}
    gft2base = {}

    st = Trace(trace)
    startBase = SKIP_BASE_NUM
    endBase = len(st.seq) - SKIP_BASE_NUM
    basePositions = st.get_data('PLOC1')
    basecalls = st.seq
    # NOTE(review): the channels are taken to be in G, A, T, C order; this is
    # hard-coded here and not read from the file -- confirm for your data.
    channels = (
        st.get_data('DATA9'),
        st.get_data('DATA10'),
        st.get_data('DATA11'),
        st.get_data('DATA12'),
    )

    # detect hetero base
    # heteroPos is the 1-based position of the base within the read, so the
    # base itself lives at basecalls[heteroPos - 1].
    for heteroPos, basePos in enumerate(basePositions[startBase:endBase],
                                        startBase + 1):
        g, a, t, c = (channel[basePos] for channel in channels)
        tracePos = [g, a, t, c]
        if max(tracePos) < TRACE_THRESHOLD:
            continue
        # similarity of the two largest trace values, by formula: std / mean
        stdMn = calSimilty(sorted(tracePos, reverse=True)[:2])
        if stdMn > SIMILTY_THRESHOLD:
            continue
        altBase = baseOrderDict[getSecondLarge(tracePos)]
        base = basecalls[heteroPos - 1]
        heteroSite[heteroPos] = [base, altBase]
        if args.debug.lower() == 'y':
            print(
                '{orgbasePos}\t{pos}\t{base}\t{G}\t{A}\t{T}\t{C}\t{stdMn}\t{altBase}'
                .format(base=base,
                        G=g,
                        A=a,
                        T=t,
                        C=c,
                        stdMn=stdMn,
                        pos=heteroPos,
                        orgbasePos=basePos,
                        altBase=altBase))

    # run blastn
    # NOTE(review): the command is a shell string; seq and refSeq must not
    # contain shell metacharacters.
    cmd = 'blastn -query "{0}" -db {1} -evalue 0.01 -outfmt "6 qseqid sseqid qstart qend sstart send qseq sseq btop" -num_threads 8 -task megablast -max_target_seqs 1'.format(
        seq, refSeq)
    logging.debug(cmd)
    # Close the pipe deterministically instead of leaking it.
    with os.popen(cmd) as pipe:
        blastRes = pipe.read()

    # extract base content
    fields = blastRes.rstrip().split('\t')
    if len(fields) != 9:
        # Same exception type the tuple unpacking used to raise, but with a
        # message that says what actually went wrong.
        raise ValueError(
            'expected exactly one blastn hit line with 9 fields for {0}, '
            'got {1} field(s)'.format(seq, len(fields)))
    qseqid, sseqid, qstart, qend, sstart, send, qseq, sseq, btop = fields

    # Explicit raise (not an assert statement) so the check survives -O.
    if sseqid not in gene2locus:
        raise AssertionError('{0} not valid gene id'.format(sseqid))

    qstart, sstart, send = int(qstart), int(sstart), int(send)
    for gftid, cord in gene2locus[sseqid].items():
        cord = int(cord)
        if not sstart <= cord <= send:
            continue  # hit W/O range
        # 0-based alignment column of the locus; the same offset yields the
        # 1-based query position used as key in heteroSite.
        # NOTE(review): assumes no gaps in the alignment before this column.
        offset = cord - sstart
        pos = qstart + offset
        if pos in heteroSite:
            # encounter a hetero site
            gft2base[gftid] = ''.join(heteroSite[pos])
        else:
            # qseq[offset] is the query base at `pos`; the previous
            # `offset - 1` pointed one base too early and wrapped around to
            # the last base when cord == sstart.
            gft2base[gftid] = '{0}{0}'.format(qseq[offset])
    return gft2base