Example #1
0
def amplitude_from_abif(filename):
    """Return the "P1AM1" amplitude paired with each called base.

    filename : path of a Sanger trace in ABIF format, handed to ``Trace``.

    Returns a list of ``(base, amplitude)`` tuples, one per base of
    ``abif.seq``; the amplitude is the "P1AM1" entry at the same index.
    Raises IndexError if the "P1AM1" data is shorter than the sequence.
    """
    abif = Trace(filename)
    # Read the amplitude data once instead of once per base.
    ampl = abif.get_data("P1AM1")

    # Index explicitly (rather than zip) so that amplitude data shorter than
    # the sequence still fails loudly instead of being silently truncated.
    return [(base, ampl[index]) for index, base in enumerate(abif.seq)]
Example #2
0
def amplitude_from_abif(filename):
    """Pair every called base of a trace with its "P1AM1" amplitude.

    filename : path of a Sanger trace in ABIF format, handed to ``Trace``.

    Returns a list of ``(base, amplitude)`` tuples in sequence order, where
    the amplitude is the "P1AM1" value stored at the base's index.
    Raises IndexError if the "P1AM1" data is shorter than the sequence.
    """
    abif = Trace(filename)
    # Look the tag up a single time; it does not change between bases.
    ampl = abif.get_data("P1AM1")

    output = []
    for index, base in enumerate(abif.seq):
        # Explicit indexing keeps the IndexError when amplitudes run short.
        output.append((base, ampl[index]))
    return output
Example #3
0
    def __init__(self, filename, ratio, q_set):
        """Load an ABIF trace and run the base-calling step on it.

        filename : path handed to ``Trace``.
        ratio    : converted with ``float`` and stored as ``self.ratio``.
        q_set    : converted with ``int`` and stored as ``self.q_set``.

        The peak tags P1AM1, P2AM1 and P2BA1 are treated as optional: if
        reading any of them raises ``KeyError``, ``makebasecall`` is called
        with its third argument set to False instead of True.
        """
        trace = Trace(filename)
        self.ratio = float(ratio)
        self.q_set = int(q_set)

        # Placeholders; replaced by the makebasecall() result at the end.
        self.primarySeq = []
        self.secondarySeq = []
        self.trim5 = 0
        self.trim3 = 0

        # The order of the lightwave of nucleotides
        self.order = list(trace.get_data('FWO_1'))

        # Values produced by the machine's basecaller
        self.primary_basecalls = trace.get_data('PBAS2')  # bases called
        self.primary_quality = trace.get_data('PCON2')  # base quality
        # Primary peak locations called by the basecaller
        self.primary_basecallPos = np.array(trace.get_data('PLOC2'))

        # Unprocessed RAW DATA
        # Analyzed color data: one row per tag, DATA9 through DATA12
        channel_tags = ('DATA9', 'DATA10', 'DATA11', 'DATA12')
        self.traceMatrix = np.array(
            [trace.get_data(tag) for tag in channel_tags])

        #self.peakPosMatrix , self.peakAmpMatrix = self.getPeakPosAndAmp()

        try:
            # Primary peak amplitude called by the basecaller
            self.primary_peakAmp = np.array(trace.get_data('P1AM1'))
            self.secondary_peakAmp = np.array(trace.get_data('P2AM1'))
            self.secondary_peakBase = trace.get_data('P2BA1')
        except KeyError:
            correction = False
        else:
            correction = True

        self.background_value = None
        (self.primarySeq, self.secondarySeq,
         self.trim5, self.trim3) = self.makebasecall(
             self.ratio, self.q_set, correction)
Example #4
0
def main(args):
    """Write the sequences of ABI (.ab1) trace files to one FASTA-style file.

    The ``args`` parameter is kept for interface compatibility only: it is
    immediately replaced by the namespace ``argparse`` builds from
    ``sys.argv``.

    Input files come from ``--indir`` (every regular file in the directory
    whose name contains 'ab1') or, failing that, from ``--infiles``.  One
    record per input is written to ``--outfile``.  Exits through
    ``sys.exit`` with a message when no argument or no input is given.
    """
    # https://pypi.python.org/pypi/abifpy/0.9
    parser = argparse.ArgumentParser(
        description=
        'extracts sequence and various other data from Applied Biosystems, Inc. format (ABI) file.'
    )
    parser.add_argument('-i',
                        '--indir')  # process all ab1 files in the directory
    parser.add_argument('-o', '--outfile',
                        default="sanger_entire_seq_out.fa")  # output filename
    parser.add_argument('-f', '--infiles',
                        nargs='*')  # creates a list of files to be processed
    # Accepted for command-line compatibility; not used by this function.
    parser.add_argument('-d', '--delimiter',
                        default='\t')  # delimiter for file

    args = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit('\natleast one argument required\n')

    if args.indir is not None:
        mypath = args.indir
        infiles = [join(mypath, f) for f in listdir(mypath)
                   if isfile(join(mypath, f)) and 'ab1' in f]
    elif args.infiles is not None:
        infiles = args.infiles
    else:
        parser.print_help()
        sys.exit('\natleast one input required\n')

    # The context manager closes the output even if reading a trace fails.
    with open(args.outfile, 'w') as out:
        for f in infiles:
            ab1file = Trace(f)
            out.write('>Sample-%s:File-%s\n%s\n' %
                      (ab1file.name, ab1file.id, ab1file.seq))
Example #5
0
def getAbi(fileName):
    """Open an AB1 file and return a dictionary of the fields we use.

    Keys: "BaseOrder", "NumValues" (length of the "raw1" data), "NumBases",
    "Sequence", "TracePeaks", plus one "Raw<base>" list per raw channel.
    The i-th raw channel is filed under the i-th letter of the base order;
    any letter other than A, C or G is filed under "RawT".
    """
    trace = Trace(fileName)
    base_order = trace.data["baseorder"]
    abi = {
        "BaseOrder": base_order,
        "NumValues": len(trace.data["raw1"]),
        "NumBases": len(trace.seq),
        "Sequence": trace.seq,
        "TracePeaks": trace.data["tracepeaks"],
    }

    for position, raw_key in enumerate(("raw1", "raw2", "raw3", "raw4")):
        letter = base_order[position]
        if letter in ('A', 'C', 'G'):
            target = "Raw" + letter
        else:
            target = "RawT"
        abi[target] = list(trace.data[raw_key])

    return abi
Example #6
0
#! /usr/bin/python
__author__ = 'sefel'

from abifpy import Trace  # python module for reading ab1 files (https://github.com/bow/abifpy)
import sys  # for file input, output operations

# Convert one .ab1 trace to FASTQ, trimming low-quality ends first.
# Usage: <script> <inputfile.ab1> [error-probability cutoff]
q_score_cutoff = 20  # cutoff for filtering reads based on PHRED quality score
f_seq = ''  # f - filtered
f_qual = ''
f_qual_val = []

if len(sys.argv) <= 1:
    # Single-argument print() is valid on both Python 2 and Python 3.
    print("No inputfile provided.")
    print("Usage: python ab1_to_fastq.py <inputfile.ab1>")
    sys.exit()
else:
    inputfile = sys.argv[1]
    if len(sys.argv) == 3:
        # Command-line values are strings; convert so trim() receives a
        # numeric error-probability cutoff like its 0.05 default.
        q_prob_cutoff = float(sys.argv[2])
        my_seq = Trace(inputfile)
        # begin trimming using modified Richard Mott's algorithm with custom error probability cutoff
        my_seq.seq = my_seq.trim(my_seq.seq, q_prob_cutoff)
        my_seq.qual = my_seq.trim(my_seq.qual, q_prob_cutoff)
        my_seq.qual_val = my_seq.trim(my_seq.qual_val, q_prob_cutoff)
    else:
        # trim sequence using modified Richard Mott's algorithm with default error probability cutoff (0.05)
        # The file is parsed once here rather than untrimmed first and again.
        my_seq = Trace(inputfile, trimming=True)
    # replace ambiguous code with Ns
    my_seq.seq = my_seq.seq_remove_ambig(my_seq.seq)
    my_seq.export(my_seq.name + ".fastq", 'fastq')
Example #7
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from abifpy import Trace

# Load one trace and dump its basic attributes to stdout.
yummy = Trace('/Users/mac/Documents/data/ab1/HBV_B.ab1')

# print(x) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare "print x" statement is a syntax error on Python 3.
print(yummy.seq)

print(yummy.qual)

print(yummy.qual_val)

print(yummy.id)

print(yummy.name)
Example #8
0
    pruned_data['CH2'] = data['CH2'][first_nuc - 1:last_nuc]
    pruned_data['CH3'] = data['CH3'][first_nuc - 1:last_nuc]
    pruned_data['CH4'] = data['CH4'][first_nuc - 1:last_nuc]

    return pruned_data


if __name__ == '__main__':
    # Exactly two command-line arguments are required: the trace file and
    # the output file name.
    if len(sys.argv) != 3:
        sys.exit('Usage\tTrace file\tOutput file name')

    trace_file, output_file_name = sys.argv[1:]

    # Load trace
    trace = Trace(trace_file)

    # Extract analysed trace data: the sequence, then the tag data stored
    # under each (result key, ABIF tag) pair, then a zero offset.
    data = {'SEQ': trace.seq}
    data.update(
        (key, trace.tags[tag].tag_data)
        for key, tag in (
            ('CH1', 'DATA9'),
            ('CH2', 'DATA10'),
            ('CH3', 'DATA11'),
            ('CH4', 'DATA12'),
            ('PLOC2', 'PLOC2'),
        )
    )
    data['OFFSET'] = 0

    generate_files(data, output_file_name, '_full_trace')

    pruned_data = auto_prune(data)
Example #9
0
def main(trace, seq, gene2locus):
    '''
    Detect heterozygous sites in a trace and report the bases at gene loci.

    trace      : path of the trace file, opened with Trace.
    seq        : path of the sequence file used as the blastn query; make
                 sure trace and seq correspond to each other correctly.
    gene2locus : mapping of blast subject id -> {gftid: coordinate}.

    Returns a dict gftid -> two-character string: the called base joined
    with the alternative base when the locus falls on a detected hetero
    site, otherwise the aligned query base repeated twice.  Loci outside
    the blast hit's subject range are left out.

    Raises ValueError when blastn produces no hit (or a malformed line) and
    AssertionError when the hit's subject id is not a key of gene2locus.
    '''
    import subprocess  # local import: only needed for the blastn call

    heteroSite = {}
    gft2base = {}
    st = Trace(trace)
    startBase = SKIP_BASE_NUM
    endBase = len(st.seq) - SKIP_BASE_NUM
    st.basepos = st.get_data('PLOC1')
    st.basecalls = st.seq
    st.tracesamps = {
        'G': st.get_data('DATA9'),
        'A': st.get_data('DATA10'),
        'T': st.get_data('DATA11'),
        'C': st.get_data('DATA12'),
    }

    # detect hetero base
    # i is the 1-based offset inside the [startBase:endBase] window, so
    # startBase + i is the 1-based position of the base in the read.
    for i, basePos in enumerate(st.basepos[startBase:endBase], 1):
        # signal of each channel at this peak position, in G, A, T, C order
        tracePos = [st.tracesamps[base][basePos] for base in 'GATC']
        if max(tracePos) < TRACE_THRESHOLD:
            continue
        # similarity of the two largest trace values, by formula: std / mean
        stdMn = calSimilty(sorted(tracePos, reverse=True)[:2])
        if stdMn > SIMILTY_THRESHOLD:
            continue
        altBase = baseOrderDict[getSecondLarge(tracePos)]
        heteroPos = startBase + i
        calledBase = st.basecalls[heteroPos - 1]
        # every heteroPos is visited once, so this never overwrites an entry
        heteroSite[heteroPos] = [calledBase, altBase]
        if args.debug.lower() == 'y':
            print(
                '{orgbasePos}\t{pos}\t{base}\t{G}\t{A}\t{T}\t{C}\t{stdMn}\t{altBase}'
                .format(base=calledBase,
                        G=st.tracesamps['G'][basePos],
                        A=st.tracesamps['A'][basePos],
                        T=st.tracesamps['T'][basePos],
                        C=st.tracesamps['C'][basePos],
                        stdMn=stdMn,
                        pos=heteroPos,
                        orgbasePos=basePos,
                        altBase=altBase))

    # run blastn
    # An argument list (no shell) keeps file names containing spaces, quotes
    # or shell metacharacters from being re-interpreted.
    cmd = [
        'blastn', '-query', seq, '-db', refSeq, '-evalue', '0.01',
        '-outfmt', '6 qseqid sseqid qstart qend sstart send qseq sseq btop',
        '-num_threads', '8', '-task', 'megablast', '-max_target_seqs', '1',
    ]
    logging.debug(cmd)
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            universal_newlines=True)
    blastRes = proc.communicate()[0]

    # extract base content
    # blastn may print several lines (one per alignment); only the first is
    # used -- presumably the best-scoring one, confirm against BLAST docs.
    hits = [line for line in blastRes.splitlines() if line.strip()]
    if not hits:
        raise ValueError('blastn returned no hit for {0}'.format(seq))
    (qseqid, sseqid, qstart, qend, sstart, send, qseq, sseq,
     btop) = hits[0].split('\t')
    assert sseqid in gene2locus, '{0} not valid gene id'.format(sseqid)

    qstart, sstart, send = int(qstart), int(sstart), int(send)
    for gftid, cord in gene2locus[sseqid].items():
        cord = int(cord)
        if not sstart <= cord <= send:
            # hit W/O range
            continue
        # hit within range
        pos = qstart + cord - sstart
        if pos in heteroSite:  # encounter a hetero site
            gft2base[gftid] = ''.join(heteroSite[pos])
        else:
            # NOTE(review): this index is one less than the offset used for
            # `pos` above and becomes -1 (the last character of qseq) when
            # cord == sstart -- confirm the intended coordinate convention.
            gft2base[gftid] = '{0}{0}'.format(qseq[cord - sstart - 1])
    return gft2base
Example #10
0
# Reference database location; the reference FASTA must already exist.
DATABASE = './database'
refSeq = os.path.join(DATABASE, 'tumor_ref.fa')
#gene2locus = os.path.join(DATABASE, 'genelocus.json')
assert os.path.exists(refSeq)
#assert os.path.exists(gene2locus)

# Index -> base letter, in G, A, T, C order.
baseOrderDict = dict(enumerate('GATC'))

# args.rawseq is the directory holding the .ab1 trace files.
assert os.path.exists(args.rawseq)
name = os.path.basename(os.path.realpath(args.rawseq))
traceFiles = [
    os.path.join(args.rawseq, fname)
    for fname in sorted(os.listdir(args.rawseq))
    if fname.endswith('.ab1')
]

# Export every trace as FASTA next to it, under the same name with '.ab1'
# replaced by '.seq', and remember the exported file names.
seqFiles = []
for tracef in traceFiles:
    seqFileName = os.path.basename(tracef).replace('.ab1', '.seq')
    gstrace = Trace(tracef)
    gstrace.export(out_file=os.path.join(args.rawseq, seqFileName),
                   fmt='fasta')
    seqFiles.append(seqFileName)
seqFiles = [os.path.join(args.rawseq, fname) for fname in sorted(seqFiles)]

logging.debug(seqFiles)
logging.debug(traceFiles)


def calSimilty(numList):
    """Return the standard deviation of numList divided by its mean.

    The standard deviation is computed by ``numpy.std`` with float64
    precision; the mean by ``numpy.mean``.
    """
    import numpy as np

    spread = np.std(numList, dtype=np.float64)
    centre = np.mean(numList)
    return spread / centre