def getReadIDs(in1,in2,title_split=' ',clusterID_position=0):
    """
    Collect the read IDs shared by two FASTQ files that are read in lockstep.

    Each title is split on `title_split` and the field at index
    `clusterID_position` is taken as the read ID.

    Parameters:
        in1, in2: paths of the two FASTQ files.
        title_split: separator used to split each title.
        clusterID_position: index of the ID field within the split title.

    Returns:
        A list of IDs, one per record pair, in file order.

    Raises:
        IOError: if the IDs of a record pair differ.

    Note: the records are paired with zip(), so iteration stops at the end
    of the shorter file without reporting the surplus records.
    """
    def extract_id(title):
        return title.split(title_split)[clusterID_position]

    collected = []
    with open(in1) as inhandle1, open(in2) as inhandle2:
        record_pairs = zip(
            reptools.FASTQparser(inhandle1),
            reptools.FASTQparser(inhandle2)
        )
        for (title1, _seq1, _qual1), (title2, _seq2, _qual2) in record_pairs:
            first_id = extract_id(title1)
            if first_id != extract_id(title2):
                raise IOError('Sequence titles do not match between files:\n{}\n{}\n'.format(title1,title2))
            collected.append(first_id)
    return collected
def countgenehits_hitsfile(fastqFile,
                           hitsFile,
                           type,
                           evalue=False,
                           mincols=False,
                           title_split=' ',
                           verbose=True):
    """
    Tally how many reads in a FASTQ file have one, several or no top hits.

    The hits are loaded with reptools.retrieve_tophits(hitsFile, type, ...)
    and looked up by the first `title_split`-separated field of each title.

    Parameters:
        fastqFile: path of the FASTQ file whose reads are classified.
        hitsFile, type, evalue, mincols: passed through unchanged to
            reptools.retrieve_tophits.
        title_split: separator used to trim titles (also passed to
            retrieve_tophits as title_split_char).
        verbose: if true, print the three counts.

    Returns:
        dict with the keys 'hit' (exactly one hit), 'ambiguous' (more than
        one hit) and 'fail' (read absent from the hits, or no hits).
    """
    import reptools
    import io

    results = {'hit': 0, 'ambiguous': 0, 'fail': 0}
    reptoolsdict = reptools.retrieve_tophits(hitsFile,
                                             type,
                                             evalue=evalue,
                                             mincols=mincols,
                                             title_split_char=title_split)
    with io.open(fastqFile) as fastq_handle:
        for title, _seq, _qual in reptools.FASTQparser(fastq_handle):
            read_id = title.split(title_split)[0]
            # Reads that are absent from the hits count as having zero hits.
            n_hits = len(reptoolsdict[read_id]) if read_id in reptoolsdict else 0
            if n_hits == 1:
                outcome = 'hit'
            elif n_hits > 1:
                outcome = 'ambiguous'
            else:
                outcome = 'fail'
            results[outcome] += 1

    if verbose:
        report = (
            ('hit = %s', 'hit'),
            ('fail = %s', 'fail'),
            ('ambiguous = %s', 'ambiguous'),
        )
        for template, key in report:
            print(template % results[key])
    return results
def fastqcounter(infile):
    """
    Count the sequences represented by a FASTQ file.

    If reptools.DerepCheck(infile) is true, the result is the sum of
    reptools.DerepCount(title) over all records; otherwise every record
    counts as one.

    NOTE(review): the previous docstring said "number of unique sequences",
    but for dereplicated input the code sums a per-title count (presumably
    the cluster size) rather than counting records - confirm the intended
    meaning with callers.
    """
    # DerepCheck() decides whether the titles carry dereplication counts.
    is_derep = reptools.DerepCheck(infile)
    total = 0
    with open(infile) as handle:
        for title, _seq, _qual in reptools.FASTQparser(handle):
            total += reptools.DerepCount(title) if is_derep else 1
    return total
def checkCDR3_fastq(mock_dict,
                    fastqfile,
                    hits_out=False,
                    misses_out=False,
                    failures_out=False,
                    title_split=' ',
                    verbose=True):
    """
    Compares the sliced CDR3 in a fastq file with those simulated by MiXCR
    (and given in a .descr file)

    Parameters:
        mock_dict: mapping of read id -> mapping that holds the expected
            sequence under the key 'CDR3'.
        fastqfile: path of the FASTQ file holding the sliced CDR3 sequences.
            The read id is the text before the first ';' of the title, cut
            at the first `title_split`.
        hits_out, misses_out, failures_out: optional output paths; each
            receives the matching reads as FASTA records. When a path is
            false, reptools.dummy_context_mgr() is used instead.
        title_split: separator used to trim the titles.
        verbose: if true, print the counts and percentages.

    Returns:
        dict with the keys 'hit' (case-insensitive match), 'miss'
        (mismatch) and 'fail' (sequence is 'n', or the read or its 'CDR3'
        entry is missing).
    """
    import reptools

    def _sink(path):
        # Text mode: the records are written as str, which a binary-mode
        # handle rejects on Python 3.
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    reptoolsdict = {}
    with open(fastqfile) as infile:
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            reptoolsdict[read_id] = seq

    results = {'hit': 0, 'miss': 0, 'fail': 0}
    with _sink(hits_out) as hits_handle, \
            _sink(misses_out) as misses_handle, \
            _sink(failures_out) as failures_handle:
        for read_id in mock_dict:
            try:
                observed = reptoolsdict[read_id]
                expected = mock_dict[read_id]['CDR3']
            except KeyError:
                results['fail'] += 1
                # Printed regardless of `verbose`, as before.
                print(read_id)
                failures_handle.write('>%s\n%s\n' % (read_id, ''))
                continue

            if observed.lower() == expected.lower():
                results['hit'] += 1
                hits_handle.write('>%s\n%s\n' % (read_id, observed))
            elif observed.lower() == 'n':
                results['fail'] += 1
                failures_handle.write('>%s\n%s\n' % (read_id, observed))
            else:
                results['miss'] += 1
                misses_handle.write('>%s\n%s\n' % (read_id, observed))

    if verbose:
        total = float(sum(results.values()))

        def pct(count):
            # An empty mock_dict gives a zero total; report 0.0 rather than
            # raising ZeroDivisionError.
            return count / total * 100 if total else 0.0

        print(('hit = %s' % results['hit']))
        print(('miss = %s' % results['miss']))
        print(('failed = %s' % results['fail']))
        print(('pct hit = %s' % pct(results['hit'])))
        print(('pct miss = %s' % pct(results['miss'])))
    return results
def DerepCheck(fn):
    """
    Report whether the first record of a sequence file has a dereplicated
    title.

    The file is first read with reptools.FASTAparser; if that raises
    ValueError it is reopened and read with reptools.FASTQparser. Only the
    first record is inspected.

    Returns:
        The result of is_derepFas() on the first title, or None when the
        parser yields no records.
    """
    try:
        with open(fn) as fasta_handle:
            for header, _sequence in reptools.FASTAparser(fasta_handle):
                return is_derepFas(header)
    except ValueError:
        # Not readable as FASTA: retry as FASTQ.
        with open(fn) as fastq_handle:
            for header, _sequence, _quality in reptools.FASTQparser(fastq_handle):
                return is_derepFas(header)
    return None
def fastq2fasta(infile, outfile="", trimstart=0, overwrite=False):
    """
    Convert a FASTQ file to FASTA.

    Parameters:
        infile: name of the FASTQ file to read.
        outfile: name of the FASTA file to write. When empty, it is derived
            from `infile`: a trailing 'fastq' is replaced by 'fas',
            otherwise '.fas' is appended.
        trimstart: number of leading characters dropped from each sequence.
        overwrite: if true, an existing output file is replaced.

    Returns:
        The name of the output file.

    Raises:
        IOError: if the output file already exists and `overwrite` is false.

    The output is opened once in write mode, so `overwrite=True` really
    replaces the old contents (it used to append to them), and an input
    without records produces an empty output file.
    """
    if outfile == "":
        if infile.endswith('fastq'):
            outfile = infile[:-len('fastq')] + 'fas'
        else:
            outfile = infile + '.fas'
    if os.path.isfile(outfile) and not overwrite:
        raise IOError('Output file (%s) already exists' % (outfile))
    # Both handles are managed by the with-statement so that neither leaks.
    with open(infile, 'r') as in_handle, open(outfile, 'w') as out_handle:
        for title, seq, qual in reptools.FASTQparser(in_handle):
            out_handle.write('>%s\n%s\n' % (title, seq[trimstart:]))
    return outfile
def EEfilter_file(infile, FASTAout=False, FASTQout=False, maxee=1):
    """
    Copy the reads whose expected error does not exceed `maxee`.

    Every record of `infile` whose reptools.calculate_EE(qual) is <= maxee
    is written to the FASTA output and to the FASTQ output. An output that
    was not requested is replaced by reptools.dummy_context_mgr().

    Parameters:
        infile: path of the FASTQ file to filter.
        FASTAout: path of the FASTA output, or a false value to skip it.
        FASTQout: path of the FASTQ output, or a false value to skip it.
        maxee: highest expected error a read may have and still be kept.

    Returns:
        The result of reptools.removeemptyfile(FASTQout); per the original
        comment this is None if the file was empty (and has been removed),
        else the file name.

    Raises:
        ValueError: if neither FASTAout nor FASTQout is supplied.

    NOTE(review): removeemptyfile() is also called when FASTQout is False
    (FASTA-only use) - confirm that the helper copes with that.
    """
    from reptools import dummy_context_mgr as dummy

    if not (FASTAout or FASTQout):
        raise ValueError(
            'Please supply one or both of FASTAout and FASTQout to EEfilter()')

    with open(infile) as inhandle, \
            (open(FASTAout, 'w') if FASTAout else dummy()) as outfasta_handle, \
            (open(FASTQout, 'w') if FASTQout else dummy()) as outfastq_handle:
        for title, seq, qual in reptools.FASTQparser(inhandle):
            if not (reptools.calculate_EE(qual) <= maxee):
                continue
            outfasta_handle.write('>{}\n{}\n'.format(title, seq))
            outfastq_handle.write('@{}\n{}\n+\n{}\n'.format(title, seq, qual))
    return reptools.removeemptyfile(FASTQout)
def derep_FASTQ(fn, clust_file):
    """
    Dereplicate the reads of a FASTQ file, grouping by sequence and gene
    annotation.

    Titles are stripped of surrounding whitespace and ';', then split on
    ';': the first field is the read id and the remaining fields form the
    gene annotation. Reads containing any character other than A/C/G/T (in
    either case) are skipped. Reads with the same sequence string and the
    same annotation are merged: the count is incremented and the per-base
    error probabilities (10 ** (-Q / 10), with Q = ord(char) - 33) are
    multiplied position by position.

    Parameters:
        fn: path of the FASTQ file.
        clust_file: path of a tab-separated file to write, with one line per
            merged-into read (its id followed by the ids merged into it);
            nothing is written when this is false.

    Returns:
        (seqs, gene_ids), both plain dicts keyed by sequence length:
        seqs[length][(seq, gene_index)] = [count, first read id, probs];
        gene_ids[length] = list of annotations, indexed by gene_index.
    """
    unambiguous = ('A', 'T', 'G', 'C', 'a', 't', 'g', 'c')
    min_prob = 0.000000000001

    seqs = collections.defaultdict(dict)
    # stored in a list to avoid repetition - slower, but saves memory
    gene_ids = collections.defaultdict(list)
    changes = collections.defaultdict(list)

    with open(fn) as in_handle:
        for title, seq, qual in reptools.FASTQparser(in_handle):
            if any([nt not in unambiguous for nt in seq.strip()]):
                continue  #omit ambiguous sequences

            title_list = title.strip().strip(';').split(';')
            annotation = title_list[1:]
            seqlen = len(seq)
            #seqprobs = [-float(Q)/10 for Q in [ord(c)-33 for c in qual]] #for logs
            seqprobs = [
                10**(-float(Q) / 10) for Q in [ord(c) - 33 for c in qual]
            ]

            known_annotations = gene_ids[seqlen]
            if annotation not in known_annotations:
                #if the geneid is new, the sequence is new (by definition)
                known_annotations.append(annotation)
                seqs[seqlen][(seq, len(known_annotations) - 1)] = [
                    1,  #1, because this is the first time this sequence has been found
                    title_list[0],  #the first title found is stored for output
                    seqprobs
                ]
                continue

            same_length = seqs[seqlen]
            key = (seq, known_annotations.index(annotation))
            if key not in same_length:
                #if the sequence string is new, but not geneid, the geneid code can be reused
                same_length[key] = [1, title_list[0], seqprobs]
                continue

            #not a unique sequence: record the merge and fold in the new probabilities
            entry = same_length[key]
            changes[entry[1]].append(title_list[0])
            merged_probs = [old * new for old, new in zip(entry[2], seqprobs)]
            entry[0] += 1
            entry[2] = merged_probs

    #very high counts can result in probs of zero (float underrun), which is invalid; so change them to Phred=120
    #TODO = change the probability handling to working with the log probabilities, which will also save
    #memory, as I can then use float16 - N.B. I can't, because numba doesn't support float16 (yet)
    for same_length in seqs.values():
        for entry in same_length.values():
            if min(entry[2]) < min_prob:
                entry[2] = [
                    prob if prob >= min_prob else min_prob
                    for prob in entry[2]
                ]

    if clust_file:
        with open(clust_file, 'w') as clust_handle:
            for recipient, merged_ids in changes.items():
                clust_handle.write('{}\t{}\n'.format(
                    recipient, '\t'.join(merged_ids)))

    return (dict(seqs), dict(gene_ids))
def checkCDR3_prod(fastqfile,
                   minlen=3 * 5,
                   maxlen=3 * 30,
                   startchars='C',
                   endchars='FWH',
                   hits_out=False,
                   failures_out=False,
                   frameshift_out=False,
                   long_out=False,
                   short_out=False,
                   stop_out=False,
                   bad_out=False,
                   title_split=' ',
                   verbose=True):
    """
    This for use where no reference file is available.
    Reports % of CDR3 which are productive or start with C and end with
    F/W/H, and are within a sensible length range

    Each read is put into the first category that applies, in this order:
        'fail'        the sequence is just 'n' (no CDR3)
        'long'        longer than `maxlen`
        'short'       shorter than `minlen`
        'bad'         the translated first or last codon is not found in
                      `startchars` / `endchars` (case-insensitive)
        'stop'        the translation contains '*'
        'frameshift'  the length is not a multiple of 3
        'good'        none of the above

    Parameters:
        fastqfile: path of the FASTQ file holding the CDR3 sequences.
        minlen, maxlen: accepted length range, in nucleotides.
        startchars, endchars: accepted first / last residues.
        hits_out, failures_out, frameshift_out, long_out, short_out,
        stop_out, bad_out: optional output paths; each receives the reads
            of its category as FASTA records. When a path is false,
            reptools.dummy_context_mgr() is used instead.
        title_split: separator used to trim the titles.
        verbose: if true, print the count and percentage of each category.

    Returns:
        dict mapping each category name to its read count.
    """
    import reptools

    def _sink(path):
        # Text mode: the records are written as str, which a binary-mode
        # handle rejects on Python 3.
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    results = {
        'good': 0,
        'frameshift': 0,
        'stop': 0,
        'bad': 0,
        'long': 0,
        'short': 0,
        'fail': 0
    }
    valid_starts = startchars.lower()
    valid_ends = endchars.lower()

    with open(fastqfile) as infile, \
            _sink(hits_out) as hits_handle, \
            _sink(failures_out) as failures_handle, \
            _sink(frameshift_out) as shift_handle, \
            _sink(long_out) as long_handle, \
            _sink(short_out) as short_handle, \
            _sink(stop_out) as stop_handle, \
            _sink(bad_out) as bad_handle:
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            seq = seq.strip()
            if seq.lower() == 'n':
                category, handle = 'fail', failures_handle
            elif len(seq) > maxlen:
                category, handle = 'long', long_handle
            elif len(seq) < minlen:
                category, handle = 'short', short_handle
            elif (reptools.trans(seq[0:3]).lower() not in valid_starts
                  or reptools.trans(seq[-3:]).lower() not in valid_ends):
                category, handle = 'bad', bad_handle
            elif '*' in reptools.trans(seq):
                category, handle = 'stop', stop_handle
            elif len(seq) % 3 != 0:
                category, handle = 'frameshift', shift_handle
            else:
                category, handle = 'good', hits_handle
            results[category] += 1
            handle.write('>%s\n%s\n' % (read_id, seq))

    if verbose:
        totalreads = float(sum(results.values()))

        def pct(count):
            # An input without reads gives a zero total; report 0.0 rather
            # than raising ZeroDivisionError.
            return count / totalreads * 100 if totalreads else 0.0

        report = (
            ('over length = %s (%s pct)', 'long'),
            ('under length = %s (%s pct)', 'short'),
            ('bad start/end = %s (%s pct)', 'bad'),
            ('stop codon = %s (%s pct)', 'stop'),
            ('frameshift = %s (%s pct)', 'frameshift'),
            ('no CDR3 = %s (%s pct)', 'fail'),
            ('good CDR3 = %s (%s pct)', 'good'),
        )
        for template, key in report:
            print((template % (results[key], pct(results[key]))))
    return results
def checkgeneID_fastq(mock_dict,
                      gene,
                      fastqfile,
                      title_split=' ',
                      hits_out=False,
                      misses_out=False,
                      failures_out=False,
                      ambiguous_out=False,
                      verbose=True):
    """
    Compare the gene calls stored in FASTQ titles with the expected genes.

    Titles are split on ';'. The read id is the first field, cut at the
    first `title_split`. The gene calls come from the first field of the
    form '<gene>=<call>[+<call>...]'; each call is cut at the first
    'gamma', 'alpha' and '_'. The expected gene is mock_dict[id][gene],
    cut at the first '*'.

    Parameters:
        mock_dict: mapping of read id -> mapping of gene name -> expected
            gene.
        gene: name of the title field / mock_dict key to compare.
        fastqfile: path of the annotated FASTQ file.
        title_split: separator used to trim the titles.
        hits_out, misses_out, failures_out, ambiguous_out: optional output
            paths; each receives the reads of its category. When a path is
            false, reptools.dummy_context_mgr() is used instead.
        verbose: if true, print the counts and percentages.

    Returns:
        dict with the keys
        'hit'        the single call equals the expected gene
        'miss'       the call(s) do not include the expected gene
        'ambiguous'  several calls, one of which is the expected gene
        'fail'       the read is missing from the FASTQ, has no calls or
                     the single call 'none', or mock_dict has no entry for
                     `gene`

    Fixes relative to the previous version:
        * a read missing from the FASTQ is counted as 'fail' instead of
          raising KeyError from inside the KeyError handler;
        * the 'none' test looks at the call itself (a list was compared with
          a string, which is never equal);
        * a title without the gene field gives an empty call list ('fail')
          instead of IndexError;
        * the outputs are opened in text mode, as str is written to them.
    """
    #TODO: add transtable option to replace the [s.split('gamma')[0].split('alpha')[0].split('_')[0] for s in gene_strings] line
    #or, process the mock_dict first, to match
    import reptools

    def _sink(path):
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    reptoolsdict = {}
    with open(fastqfile) as infile:
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            fields = [part.split('=') for part in title.split(';')]
            annotated = [kv[1] for kv in fields if kv[0] == gene and len(kv) > 1]
            if annotated:
                calls = [
                    s.split('gamma')[0].split('alpha')[0].split('_')[0]
                    for s in annotated[0].split('+')
                ]
            else:
                calls = []
            reptoolsdict[read_id] = calls

    results = {'hit': 0, 'miss': 0, 'ambiguous': 0, 'fail': 0}
    with _sink(hits_out) as hits_handle, \
            _sink(misses_out) as misses_handle, \
            _sink(failures_out) as failures_handle, \
            _sink(ambiguous_out) as ambiguous_handle:
        for read_id in mock_dict:
            calls = reptoolsdict.get(read_id)
            if calls is None:
                # The read never made it into the FASTQ file.
                results['fail'] += 1
                failures_handle.write('>%s\n%s\n' % (read_id, ''))
                continue
            if not calls:
                results['fail'] += 1
                failures_handle.write('>%s\n%s\n' % (read_id, calls))
                continue
            try:
                expected = mock_dict[read_id][gene].split('*')[0]
            except KeyError:
                results['fail'] += 1
                failures_handle.write('>%s\n%s\n' % (read_id, calls))
                continue

            if len(calls) == 1:
                if calls[0] == expected:
                    results['hit'] += 1
                    if hits_out:
                        hits_handle.write('>%s\n%s\n' % (read_id, calls))
                elif calls[0] == 'none':
                    results['fail'] += 1
                    failures_handle.write('>%s\n%s\n' % (read_id, calls))
                else:
                    results['miss'] += 1
                    misses_handle.write('>%s\n%s\n' % (read_id, calls))
            elif expected in calls:
                results['ambiguous'] += 1
                ambiguous_handle.write('>%s\n%s\n' % (read_id, calls))
            else:
                results['miss'] += 1
                misses_handle.write('>%s\n%s\n' % (read_id, calls))

    if verbose:
        total = float(sum(results.values()))

        def pct(count):
            # An empty mock_dict gives a zero total; report 0.0 rather than
            # raising ZeroDivisionError.
            return count / total * 100 if total else 0.0

        print(('hit = %s' % results['hit']))
        print(('miss = %s' % results['miss']))
        print(('ambiguous (including hit) = %s' % results['ambiguous']))
        print(('failed = %s' % results['fail']))
        print(('pct hit = %s' % pct(results['hit'])))
        print(('pct hit (including ambiguous hit) = %s' %
               pct(results['hit'] + results['ambiguous'])))
    return results