def amplitude_from_abif(filename):
    """Return the P1AM1 amplitude paired with each called base.

    filename : path to a Sanger trace file in ABIF format.

    Returns a list of ``(base, amplitude)`` tuples, one per base of the
    trace's sequence, in sequence order. The amplitude for the base at
    index ``i`` is element ``i`` of the ``P1AM1`` tag.

    Raises IndexError if the ``P1AM1`` tag holds fewer values than the
    sequence has bases.
    """
    abif = Trace(filename)
    # Fetch the tag once, outside the loop, instead of once per base.
    amplitudes = abif.get_data("P1AM1")
    # Index explicitly (rather than zip) so a tag shorter than the sequence
    # raises IndexError instead of silently truncating the result.
    return [(base, amplitudes[index]) for index, base in enumerate(abif.seq)]
def amplitude_from_abif(filename):
    """Return the P1AM1 amplitude paired with each called base.

    filename : path to a Sanger trace file in ABIF format.

    Returns a list of ``(base, amplitude)`` tuples, one per base of the
    trace's sequence, in sequence order. The amplitude for the base at
    index ``i`` is element ``i`` of the ``P1AM1`` tag.

    Raises IndexError if the ``P1AM1`` tag holds fewer values than the
    sequence has bases.
    """
    abif = Trace(filename)
    # Fetch the tag once, outside the loop, instead of once per base.
    amplitudes = abif.get_data("P1AM1")
    # Index explicitly (rather than zip) so a tag shorter than the sequence
    # raises IndexError instead of silently truncating the result.
    return [(base, amplitudes[index]) for index, base in enumerate(abif.seq)]
def __init__(self, filename, ratio, q_set):
    """Load an ABIF trace and run base calling on it.

    filename : path handed to ``Trace``.
    ratio    : stored as ``float(ratio)`` and forwarded to ``makebasecall``.
    q_set    : stored as ``int(q_set)`` and forwarded to ``makebasecall``.

    If any of the ``P1AM1``, ``P2AM1`` or ``P2BA1`` tags is missing
    (``KeyError``), ``makebasecall`` is invoked with its third argument set
    to False instead of True.
    """
    trace = Trace(filename)

    self.ratio = float(ratio)
    self.q_set = int(q_set)

    # Placeholders; replaced by the makebasecall() result at the end.
    self.primarySeq = []
    self.secondarySeq = []
    self.trim5 = 0
    self.trim3 = 0

    # The order of the lightwave of nucleotides.
    self.order = list(trace.get_data('FWO_1'))
    # Bases called by the basecaller of the machine.
    self.primary_basecalls = trace.get_data('PBAS2')
    # Base quality by basecaller.
    self.primary_quality = trace.get_data('PCON2')
    # Primary peak location called by the basecaller.
    self.primary_basecallPos = np.array(trace.get_data('PLOC2'))

    # Unprocessed RAW DATA
    # Analyzed color data: one row per channel tag, in tag order.
    channel_tags = ('DATA9', 'DATA10', 'DATA11', 'DATA12')
    self.traceMatrix = np.array([trace.get_data(tag) for tag in channel_tags])
    # self.peakPosMatrix, self.peakAmpMatrix = self.getPeakPosAndAmp()

    try:
        # Primary peak amplitude called by the basecaller.
        self.primary_peakAmp = np.array(trace.get_data('P1AM1'))
        self.secondary_peakAmp = np.array(trace.get_data('P2AM1'))
        self.secondary_peakBase = trace.get_data('P2BA1')
    except KeyError:
        correction = False
    else:
        correction = True

    self.background_value = None

    called = self.makebasecall(self.ratio, self.q_set, correction)
    self.primarySeq, self.secondarySeq, self.trim5, self.trim3 = called
def main(args):
    """Write the sequence of every selected ABI trace to one FASTA-style file.

    Options are parsed from ``sys.argv`` by ``argparse``; the ``args``
    parameter is kept for caller compatibility and is rebound to the parsed
    namespace.

    Input selection: ``--indir`` takes precedence and picks every regular
    file in that directory whose name contains ``'ab1'``; otherwise the
    files listed with ``--infiles`` are used. The ``--delimiter`` option is
    accepted but not used here.

    Exits through ``sys.exit`` with a message when no command-line argument
    or no input option was given.
    """
    # https://pypi.python.org/pypi/abifpy/0.9
    parser = argparse.ArgumentParser(
        description='extracts sequence and various other data from Applied Biosystems, Inc. format (ABI) file.'
    )
    parser.add_argument('-i', '--indir')  # process all ab1 files in the directory
    parser.add_argument('-o', '--outfile', default="sanger_entire_seq_out.fa")  # output filename
    parser.add_argument('-f', '--infiles', nargs='*')  # creates a list of files to be processed
    parser.add_argument('-d', '--delimiter', default='\t')  # delimiter for file
    args = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit('\natleast one argument required\n')

    if args.indir is not None:
        mypath = args.indir
        infiles = [
            join(mypath, f)
            for f in listdir(mypath)
            if isfile(join(mypath, f)) and 'ab1' in f
        ]
    elif args.infiles is not None:
        infiles = args.infiles
    else:
        parser.print_help()
        sys.exit('\natleast one input required\n')

    # The context manager closes the output even if a trace fails to load.
    with open(args.outfile, 'w') as out:
        for f in infiles:
            ab1file = Trace(f)
            out.write('>Sample-%s:File-%s\n%s\n' % (ab1file.name, ab1file.id, ab1file.seq))
def getAbi(fileName):
    """Open an AB1 file and return a dictionary of the fields we use.

    The result holds ``BaseOrder``, ``NumValues`` (length of ``raw1``),
    ``NumBases``, ``Sequence`` and ``TracePeaks``, plus one list per raw
    channel stored under ``RawA``/``RawC``/``RawG``/``RawT``. The channel
    key is chosen from the base found at the matching position of the
    trace's base order.
    """
    trace = Trace(fileName)
    base_order = trace.data["baseorder"]

    abi = {
        "BaseOrder": base_order,
        "NumValues": len(trace.data["raw1"]),
        "NumBases": len(trace.seq),
        "Sequence": trace.seq,
        "TracePeaks": trace.data["tracepeaks"],
    }

    key_for_base = {'A': "RawA", 'C': "RawC", 'G': "RawG"}
    for position, raw_name in enumerate(("raw1", "raw2", "raw3", "raw4")):
        # Any base other than A, C or G is filed under RawT.
        target_key = key_for_base.get(base_order[position], "RawT")
        abi[target_key] = list(trace.data[raw_name])

    return abi
#! /usr/bin/python
# Convert one AB1 trace to FASTQ: trim it, replace ambiguous base codes,
# and export "<trace name>.fastq".
#
# Usage: python ab1_to_fastq.py <inputfile.ab1> [error_probability_cutoff]
__author__ = 'sefel'

from abifpy import Trace  # python module for reading ab1 files (https://github.com/bow/abifpy)
import sys  # for file input, output operations

q_score_cutoff = 20  # cutoff for filtering reads based on PHRED quality score
f_seq = ''  # f - filtered
f_qual = ''
f_qual_val = []

if len(sys.argv) <= 1:
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("No inputfile provided.")
    print("Usage: python ab1_to_fastq.py <inputfile.ab1>")
    sys.exit()

inputfile = sys.argv[1]

if len(sys.argv) == 3:
    # sys.argv entries are strings; the cutoff is an error probability
    # (the default is 0.05), so hand it to trim() as a float.
    q_prob_cutoff = float(sys.argv[2])
    my_seq = Trace(inputfile)
    # begin trimming using modified Richard Mott's algorithm with custom
    # error probability cutoff
    my_seq.seq = my_seq.trim(my_seq.seq, q_prob_cutoff)
    my_seq.qual = my_seq.trim(my_seq.qual, q_prob_cutoff)
    my_seq.qual_val = my_seq.trim(my_seq.qual_val, q_prob_cutoff)
else:
    # trim sequence using modified Richard Mott's algorithm with default
    # error probability cutoff (0.05); the trace is parsed only once here.
    my_seq = Trace(inputfile, trimming=True)

# replace ambiguous code with Ns
my_seq.seq = my_seq.seq_remove_ambig(my_seq.seq)
my_seq.export(my_seq.name + ".fastq", 'fastq')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Load one AB1 trace from a fixed path and print its sequence, quality
# string, quality values, id and name, one per line.
from abifpy import Trace

yummy = Trace('/Users/mac/Documents/data/ab1/HBV_B.ab1')

# A parenthesised single value prints identically on Python 2 and Python 3.
for field in ('seq', 'qual', 'qual_val', 'id', 'name'):
    print(getattr(yummy, field))
pruned_data['CH2'] = data['CH2'][first_nuc - 1:last_nuc] pruned_data['CH3'] = data['CH3'][first_nuc - 1:last_nuc] pruned_data['CH4'] = data['CH4'][first_nuc - 1:last_nuc] return pruned_data if __name__ == '__main__': if not len(sys.argv) == 3: sys.exit('Usage\tTrace file\tOutput file name') trace_file = sys.argv[1] output_file_name = sys.argv[2] #Load trace trace = Trace(trace_file) #Extract analysed trace data data = { 'SEQ': trace.seq, 'CH1': trace.tags['DATA9'].tag_data, 'CH2': trace.tags['DATA10'].tag_data, 'CH3': trace.tags['DATA11'].tag_data, 'CH4': trace.tags['DATA12'].tag_data, 'PLOC2': trace.tags['PLOC2'].tag_data, 'OFFSET': 0 } generate_files(data, output_file_name, '_full_trace') pruned_data = auto_prune(data)
def main(trace, seq, gene2locus):
    '''
    Detect heterozygous sites in a trace and report the bases at known loci.

    trace      : trace file, opened with Trace.
    seq        : passed to blastn as the -query argument; it must correspond
                 to the same read as ``trace``.
    gene2locus : mapping of subject sequence id -> {gftid: coordinate}.

    Returns gft2base, a dict with one entry per gftid whose coordinate falls
    inside the BLAST hit (sstart <= coordinate <= send). The value is the
    called base plus the alternative base when the matching query position
    is a detected hetero site, otherwise a single base from the aligned
    query sequence repeated twice.

    Raises ValueError when the blastn output is not exactly one line of nine
    tab-separated fields, and AssertionError when the hit's subject id is
    not a key of gene2locus.
    '''
    heteroSite = {}
    gft2base = {}
    st = Trace(trace)
    startBase = SKIP_BASE_NUM
    endBase = len(st.seq) - SKIP_BASE_NUM
    basepos = st.get_data('PLOC1')
    basecalls = st.seq
    tracesamps = {
        'G': st.get_data('DATA9'),
        'A': st.get_data('DATA10'),
        'T': st.get_data('DATA11'),
        'C': st.get_data('DATA12'),
    }

    # detect hetero base
    # i counts from 1, so startBase + i is the 1-based position of the base
    # and startBase + i - 1 is its index in basecalls.
    for i, basePos in enumerate(basepos[startBase:endBase], 1):
        # signal at the peak position, in G, A, T, C order
        tracePos = [tracesamps[base][basePos] for base in ('G', 'A', 'T', 'C')]
        if max(tracePos) < TRACE_THRESHOLD:
            continue
        # similarity of the two largest trace values: std / mean
        stdMn = calSimilty(sorted(tracePos, reverse=True)[:2])
        if stdMn > SIMILTY_THRESHOLD:
            continue
        altBase = baseOrderDict[getSecondLarge(tracePos)]
        heteroPos = startBase + i
        calledBase = basecalls[heteroPos - 1]
        # heteroPos is unique per iteration, so this never overwrites.
        heteroSite[heteroPos] = [calledBase, altBase]
        if args.debug.lower() == 'y':
            print(
                '{orgbasePos}\t{pos}\t{base}\t{G}\t{A}\t{T}\t{C}\t{stdMn}\t{altBase}'
                .format(base=calledBase,
                        G=tracesamps['G'][basePos],
                        A=tracesamps['A'][basePos],
                        T=tracesamps['T'][basePos],
                        C=tracesamps['C'][basePos],
                        stdMn=stdMn,
                        pos=heteroPos,
                        orgbasePos=basePos,
                        altBase=altBase))

    # run blastn
    cmd = (
        'blastn -query "{0}" -db {1} -evalue 0.01 '
        '-outfmt "6 qseqid sseqid qstart qend sstart send qseq sseq btop" '
        '-num_threads 8 -task megablast -max_target_seqs 1'
    ).format(seq, refSeq)
    logging.debug(cmd)
    # Close the pipe deterministically instead of leaving it to the GC.
    with os.popen(cmd) as pipe:
        blastRes = pipe.read()

    # extract base content
    fields = blastRes.rstrip().split('\t')
    if len(fields) != 9:
        # No hit or more than one hit line: fail with a readable message
        # rather than an opaque tuple-unpacking error.
        raise ValueError(
            'expected exactly one blastn hit line with 9 fields for {0}, got: {1!r}'
            .format(seq, blastRes))
    qseqid, sseqid, qstart, qend, sstart, send, qseq, sseq, btop = fields
    assert sseqid in gene2locus, '{0} not valid gene id'.format(sseqid)

    qstart, sstart, send = int(qstart), int(sstart), int(send)
    for gftid, cord in gene2locus[sseqid].items():
        cord = int(cord)
        if not (sstart <= cord <= send):
            continue  # locus lies outside the hit
        pos = qstart + cord - sstart
        if pos in heteroSite:
            # encounter a hetero site
            gft2base[gftid] = ''.join(heteroSite[pos])
        else:
            # NOTE(review): this index is one less than the offset used for
            # pos above and becomes -1 (last character of qseq) when
            # cord == sstart; it also ignores alignment gaps. Confirm the
            # intended coordinate convention before changing it.
            gft2base[gftid] = '{0}{0}'.format(qseq[cord - sstart - 1])
    return gft2base
# Reference data location; the reference FASTA must already exist.
DATABASE = './database'
refSeq = os.path.join(DATABASE, 'tumor_ref.fa')
#gene2locus = os.path.join(DATABASE, 'genelocus.json')
assert os.path.exists(refSeq)
#assert os.path.exists(gene2locus)

# Channel index -> base: 0 G, 1 A, 2 T, 3 C.
baseOrderDict = dict(enumerate('GATC'))

assert os.path.exists(args.rawseq)
name = os.path.basename(os.path.realpath(args.rawseq))

# Every .ab1 file of the raw-sequence directory, sorted by file name.
traceFiles = [
    os.path.join(args.rawseq, f)
    for f in sorted(os.listdir(args.rawseq))
    if f.endswith('.ab1')
]

# Export each trace as FASTA next to it, swapping '.ab1' for '.seq'.
seqFiles = []
for tracef in traceFiles:
    seqFileName = os.path.basename(tracef).replace('.ab1', '.seq')
    gstrace = Trace(tracef)
    gstrace.export(out_file=os.path.join(args.rawseq, seqFileName),
                   fmt='fasta')
    seqFiles.append(seqFileName)
seqFiles = [os.path.join(args.rawseq, f) for f in sorted(seqFiles)]

logging.debug(seqFiles)
logging.debug(traceFiles)


def calSimilty(numList):
    """Return the standard deviation of numList divided by its mean.

    The standard deviation is computed by ``np.std`` with float64
    accumulation; the mean by ``np.mean``.
    """
    import numpy as np
    spread = np.std(numList, dtype=np.float64)
    return spread / np.mean(numList)