def checkCDR3_fastq(mock_dict, fastqfile, hits_out=False, misses_out=False,
                    failures_out=False, title_split=' ', verbose=True):
    """
    Compare the sliced CDR3 in a fastq file with those simulated by MiXCR
    (and given in a .descr file).

    Input:
    mock_dict    = dictionary keyed by read id; each value must provide the
                   expected sequence under the key 'CDR3'.
    fastqfile    = path of the FASTQ file holding the sliced CDR3 sequences.
                   The read id is the text before the first ';' and before
                   the first occurrence of title_split in the title line.
    hits_out, misses_out, failures_out
                 = optional paths; when given, the matching / mismatching /
                   failed records are written there in FASTA format.
    title_split  = separator used to trim the read id.
    verbose      = if True, print counts and percentages.

    A read is a 'hit' when the sequences agree (case-insensitively), a 'fail'
    when the sliced sequence is 'n' or the read (or its 'CDR3' entry) is
    missing, and a 'miss' otherwise.

    Output: dictionary with the keys 'hit', 'miss' and 'fail'.
    """
    import contextlib

    import reptools

    reptoolsdict = {}
    with open(fastqfile) as infile:
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            reptoolsdict[read_id] = seq

    results = {'hit': 0, 'miss': 0, 'fail': 0}

    def open_sink(path):
        # Text mode: the records written below are str, which a binary-mode
        # handle rejects with TypeError on Python 3.
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    with contextlib.ExitStack() as stack:
        hits_handle = stack.enter_context(open_sink(hits_out))
        misses_handle = stack.enter_context(open_sink(misses_out))
        failures_handle = stack.enter_context(open_sink(failures_out))

        for read_id in mock_dict:
            try:
                observed = reptoolsdict[read_id]
                expected = mock_dict[read_id]['CDR3']
            except KeyError:
                # read absent from the FASTQ (or no expected CDR3 recorded)
                results['fail'] += 1
                print(read_id)
                failures_handle.write('>%s\n%s\n' % (read_id, ''))
                continue

            if observed.lower() == expected.lower():
                results['hit'] += 1
                hits_handle.write('>%s\n%s\n' % (read_id, observed))
            elif observed.lower() == 'n':
                results['fail'] += 1
                failures_handle.write('>%s\n%s\n' % (read_id, observed))
            else:
                results['miss'] += 1
                misses_handle.write('>%s\n%s\n' % (read_id, observed))

    if verbose:
        total = float(sum(results.values()))

        def percent(count):
            # an empty mock_dict would otherwise divide by zero
            return count / total * 100 if total else 0.0

        print(('hit = %s' % results['hit']))
        print(('miss = %s' % results['miss']))
        print(('failed = %s' % results['fail']))
        print(('pct hit = %s' % percent(results['hit'])))
        print(('pct miss = %s' % percent(results['miss'])))
    return results
def saveFASTX(seqs_np, gene_ids_sets, FASTAout, FASTQout, gene_labels):
    """
    Write all sequences with a non-zero read count to FASTA and/or FASTQ,
    most numerous first.

    Input:
    seqs_np       = a dictionary of lists of numpy arrays, one entry per
                    sequence length; indices used here are 0 (sequences, as
                    character codes), 1 (gene id codes), 2 (per-base
                    probabilities), 3 (read counts) and 4 (sequence names).
    gene_ids_sets = passed to make_gene_strings() together with gene_labels
                    to build the gene annotation string for each gene id code.
    FASTAout      = FASTA output path, or a false value to skip FASTA output.
    FASTQout      = FASTQ output path, or a false value to skip FASTQ output.
    gene_labels   = passed to make_gene_strings().

    Records are emitted in descending order of read count; within one count
    they follow the iteration order of seqs_np and then the row order.
    Titles have the form '<name>;<gene string>;size=<count>'.
    Quality characters are chr(round(-10*log10(p)) + 33), capped at '~'.

    Output: None. Empty input produces empty output files.
    """
    #make gene_strings
    gene_ids = make_gene_strings(gene_ids_sets, gene_labels)

    #collect the read counts that actually occur, so that the loop below does
    #not have to visit every integer between the maximum count and 1
    observed_counts = set()
    for seqlen in seqs_np:
        observed_counts.update(seqs_np[seqlen][3].tolist())
    #don't want to write where count=0
    descending_counts = sorted(
        (count for count in observed_counts if count > 0), reverse=True)

    with (open(FASTAout, 'w')
          if FASTAout else reptools.dummy_context_mgr()) as fasta_handle:
        with (open(FASTQout, 'w')
              if FASTQout else reptools.dummy_context_mgr()) as fastq_handle:
            for count in descending_counts:
                for seqlen in seqs_np:
                    record = seqs_np[seqlen]
                    towrite = np.where(record[3] == count)[0]
                    for x in towrite:
                        title = '{};{};size={}'.format(
                            record[4][x],
                            gene_ids[seqlen][record[1][x]],
                            record[3][x])
                        seq = ''.join([chr(c) for c in record[0][x]])
                        try:
                            #TODO - change for logs
                            phred = [
                                np.around(
                                    np.multiply(np.log10(prob), -10),
                                    decimals=0) for prob in record[2][x]
                            ]
                            #scores above 93 (including inf, from prob=0)
                            #are capped at chr(126)
                            qual = ''.join([
                                chr(int(q + 33)) if q <= 93 else chr(126)
                                for q in phred
                            ])
                        except Exception:
                            #report which record failed, then re-raise
                            print(seqlen)
                            print(x)
                            print((record[2][x]))
                            raise
                        fasta_handle.write('>{}\n{}\n'.format(title, seq))
                        fastq_handle.write('@{}\n{}\n+\n{}\n'.format(
                            title, seq, qual))
def simplify_genes(seqs_np, gene_ids_sets, clust_file, seqlens=False):
    """
    Remove unnecessary ambiguities in gene segment IDs.

    Where a sequence is identical to another, the rarer sequence is combined
    with the more common one, IF the two sequences have at least one gene
    segment ID in common for each gene in the id line.
    The daughter record keeps the gene segment IDs of the more numerous
    parent, and the rarer parent has its read count set to zero.
    If the two records have an equal read count, the simpler (i.e. shorter)
    set of possible gene segments is taken for each gene (so the output
    selection may take some genes from one sequence and some from the other).

    TODO: daughter qual score is the posterior probability of the input qual
    scores (weighted by read count)

    Input:
    seqs_np = a dictionary of lists of numpy arrays, one entry per sequence
              length, with the value being a list of numpy arrays for:
              sequences, gene id codes, probabilities, read counts,
              sequence names
    gene_ids_sets = a dictionary of dictionaries, one entry per sequence
              length, with the value being a dictionary where keys match the
              integers in the gene id codes numpy array, and the values are a
              list of sets, giving the gene segments identified for each gene
              (one set per gene).
    clust_file = path that receives the merge record via
              reptools.save_changes(), or a false value for no file.
    seqlens = sequence lengths to process; a false value means all.

    Output:
    gene_ids_sets. Modified gene set dictionary.
    Numpy arrays are modified in place.

    Raises ValueError if a new gene id code would exceed 65535.
    """
    if not seqlens:
        seqlens = list(seqs_np.keys())
    #single nt sequences break the array creation code, and are not useful
    seqlens = [length for length in seqlens if length > 1]

    cluster_sink = (open(clust_file, 'w')
                    if clust_file else reptools.dummy_context_mgr())
    with cluster_sink as clust_handle:
        for seqlen in seqlens:
            merged = {}
            (seq_array, gene_id_array, prob_array, counts_array,
             seq_names) = seqs_np[seqlen]
            reptools.sort_by_freq(seq_array, gene_id_array, prob_array,
                                  counts_array, seq_names)

            #walk from the rarest row upwards; row 0 has nothing before it
            for rare in range(len(counts_array) - 1, 0, -1):
                #earlier rows (at least as common) with the same sequence
                duplicates = np.where(
                    identical_rows(seq_array[:rare, ],
                                   seq_array[rare, :]))[0]

                for common in duplicates:
                    id_table = gene_ids_sets[seqlen]
                    rare_genes = id_table[gene_id_array[rare]]
                    common_genes = id_table[gene_id_array[common]]

                    #every gene must share at least one segment
                    if any(
                            q.isdisjoint(t)
                            for q, t in zip(rare_genes, common_genes)):
                        continue

                    #equal read counts (should never be less): keep the
                    #smaller candidate set for each gene
                    if counts_array[common] == counts_array[rare]:
                        simplest = [
                            q if len(q) < len(common_genes[i])
                            else common_genes[i]
                            for i, q in enumerate(rare_genes)
                        ]
                        #reuse the code of an existing identical entry,
                        #otherwise register a new one
                        code = next(
                            (key for key, sets in id_table.items()
                             if sets == simplest), None)
                        if code is None:
                            code = max(id_table) + 1
                            if code > 65535:
                                raise ValueError(
                                    'Too many unique gene id combinations (>65535)'
                                )
                            id_table[code] = simplest
                        gene_id_array[common] = code

                    #move the reads to the more common row
                    counts_array[common] += counts_array[rare]
                    counts_array[rare] = 0
                    merged.setdefault(seq_names[common],
                                      []).append(seq_names[rare])
                    #one merge per row: go on to the next rarest row
                    break

            reptools.save_changes(clust_handle, merged, seq_names,
                                  counts_array)
    return gene_ids_sets
def denoise_indelonly(seqs_np, gene_ids_sets, threshold, clust_file, seqlens=False):
    """
    Denoise reads of differing length, removing indels only (and no more than
    one indel).

    Where a sequence is threshold-fold less numerous than another and differs
    from it by a single indel, its reads are added to the more common
    sequence IF the two sequences have at least one gene segment ID in common
    for each gene in the id line. The rarer parent has its read count set to
    zero. Only sequence lengths that differ by exactly one are compared, and
    reads may move in either direction (shorter into longer, or longer into
    shorter), whichever ratio is larger.

    TODO: Is there a way to modify qual scores with indels? Nothing obvious
    (perhaps with insertions?)

    Input:
    seqs_np = a dictionary of lists of numpy arrays, one entry per sequence
              length, with the value being a list of numpy arrays for:
              sequences, gene id codes, probabilities, read counts,
              sequence names
    gene_ids_sets = a dictionary of dictionaries, one entry per sequence
              length, with the value being a dictionary where keys match the
              integers in the gene id codes numpy array, and the values are a
              list of sets, giving the gene segments identified for each gene
              (one set per gene).
    threshold = fold difference in read count required before reads are moved.
    clust_file = path passed (as an open handle) to reptools.save_changes(),
              or a false value for no file.
    seqlens = sequence lengths to process; a false value means all.

    Output:
    None. Numpy arrays are modified in place.
    If fewer than two usable sequence lengths are present, a message is
    printed and the function returns without doing anything.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed copy of this file - confirm the indentation
    # against the upstream source.
    if not seqlens:
        seqlens = list(seqs_np.keys())
    seqlens = [
        ln for ln in seqlens if ln > 1
    ]  #single nt sequences break the array creation code, and are not useful
    seqlens = sorted(seqlens)  #ascending, so "previous" is always the shorter length
    if len(seqlens) < 2:
        print(
            'indel denoising cannot be performed when all sequences are the same length'
        )
        return
    previous_seq_array = None  #None marks the first (shortest) length
    with open(clust_file, 'w') if clust_file else reptools.dummy_context_mgr(
    ) as clust_handle:
        for seqlen in seqlens:
            seq_array, gene_id_array, prob_array, counts_array, seq_names = seqs_np[
                seqlen]
            changes = {}  #target sequence name -> names of the sequences merged into it
            if previous_seq_array is not None:
                if seqlen - previous_seqlen == 1:  #only look for indels when seqlength delta==1
                    #the previous seq_array is one base shorter than the present one
                    #so want to mask each position in the current one in turn, and get the hamming distance
                    changes_made = True
                    while changes_made:  #iterate until no more improvements
                        changes_made = False
                        reptools.sort_by_freq(seq_array, gene_id_array,
                                              prob_array, counts_array,
                                              seq_names)
                        #rows from firstzero onwards have no reads left and are skipped
                        if np.any(counts_array == 0):
                            firstzero = np.where(counts_array == 0)[0][0]
                        else:
                            firstzero = seq_array.shape[0]
                        for n in range(
                                0, firstzero
                        ):  #iterate from most to least common (but non-zero)
                            #one flag per row of the shorter array
                            indels = np.full(previous_seq_array.shape[0],
                                             False)
                            b = np.full(seq_array.shape[1], True)
                            for x in range(
                                    seq_array.shape[1]
                            ):  #loop through the columns, sliding the missing column across
                                c = np.copy(b)
                                c[x] = False  #set missing column
                                #look for perfect matches (with the missing column excluded)
                                indels = find_indels(indels,
                                                     previous_seq_array,
                                                     seq_array[n, c])
                            # NOTE(review): placed after the column loop in this
                            # reconstruction; it may originally have been inside it - confirm
                            indels = np.logical_and(
                                indels, previous_counts_array >
                                0)  #exclude read zero seqs
                            if np.any(indels):
                                indels_where = np.where(indels)[0]
                                #>=1 when the shorter sequence has at least
                                #threshold times the reads of row n
                                cur_into_prev_ratios = previous_counts_array[
                                    indels_where] / (
                                        counts_array[n] * threshold
                                    )  #rewrite to avoid division?
                                #>=1 when row n has at least threshold times
                                #the reads of the shorter sequence
                                prev_into_cur_ratios = counts_array[n] / (
                                    previous_counts_array[indels_where] *
                                    threshold)
                                #these two arrays will index indels_where
                                #a value of >1 meets the threshold
                                #check genes for all where >1
                                gene_matches = [False] * len(indels_where)
                                for p1, p2 in enumerate(indels_where):
                                    qry_genes = gene_ids_sets[seqlen][
                                        gene_id_array[n]]
                                    targ_genes = gene_ids_sets[
                                        previous_seqlen][
                                            previous_gene_id_array[p2]]
                                    #a gene counts as mismatched when the two sets are disjoint;
                                    #a sum of 0 means every gene shares at least one segment
                                    if sum([
                                            len(_q.union(_t)) == len(_q) +
                                            len(_t) for _q, _t in zip(
                                                qry_genes, targ_genes)
                                    ]) == 0:
                                        gene_matches[
                                            p1] = True  #gene_matches will reference indels_where, and also the ratios
                                if np.any(gene_matches):
                                    cur_into_prev_ratios = cur_into_prev_ratios * gene_matches  #set ratios to 0 where no match
                                    prev_into_cur_ratios = prev_into_cur_ratios * gene_matches
                                    cur_into_prev_max_idx = np.argmax(
                                        cur_into_prev_ratios)  #get best ratio
                                    prev_into_cur_max_idx = np.argmax(
                                        prev_into_cur_ratios)
                                    #which direction do we prefer to move the reads?
                                    if (cur_into_prev_ratios[
                                            cur_into_prev_max_idx] >
                                            prev_into_cur_ratios[
                                                prev_into_cur_max_idx]):
                                        if cur_into_prev_ratios[
                                                cur_into_prev_max_idx] >= 1:
                                            #move the reads of row n into the shorter sequence
                                            targetrow = indels_where[
                                                cur_into_prev_max_idx]
                                            previous_counts_array[
                                                targetrow] += counts_array[n]
                                            counts_array[n] = 0
                                            try:
                                                changes[previous_seq_names[
                                                    targetrow]].append(
                                                        seq_names[n])
                                            except KeyError:
                                                changes[previous_seq_names[
                                                    targetrow]] = [
                                                        seq_names[n]
                                                    ]
                                            changes_made = True
                                    else:
                                        if prev_into_cur_ratios[
                                                prev_into_cur_max_idx] >= 1:
                                            #move the reads of the shorter sequence into row n
                                            targetrow = indels_where[
                                                prev_into_cur_max_idx]
                                            counts_array[
                                                n] += previous_counts_array[
                                                    targetrow]
                                            previous_counts_array[
                                                targetrow] = 0
                                            try:
                                                changes[seq_names[n]].append(
                                                    previous_seq_names[
                                                        targetrow])
                                            except KeyError:
                                                changes[seq_names[n]] = [
                                                    previous_seq_names[
                                                        targetrow]
                                                ]
                                            changes_made = True
                # NOTE(review): assumed to run for every length after the first,
                # not only when the length delta is 1 - confirm
                reptools.save_changes(clust_handle, previous_changes,
                                      previous_seq_names,
                                      previous_counts_array)
            reptools.sort_by_freq(  #resort, to allow omission of zeros
                seq_array, gene_id_array, prob_array, counts_array, seq_names)
            if np.any(counts_array == 0):
                firstzero = np.where(counts_array == 0)[0][0]
            else:
                firstzero = seq_array.shape[0]
            previous_seqlen = seqlen
            #the next four lines previously used np.copy. I don't think this is necessary, and is not desirable for modify
            #in place
            previous_seq_array = seq_array[0:firstzero, :]
            previous_gene_id_array = gene_id_array[0:firstzero]
            previous_counts_array = counts_array[0:firstzero]
            previous_seq_names = seq_names[0:firstzero]
            previous_changes = changes
        #output: save current changes (because they will never be saved as the previous changes)
        reptools.save_changes(clust_handle, changes, seq_names, counts_array)
    return  #modification should have occurred in place
def denoise_substitutions(seqs_np, gene_ids_sets, threshold, clust_file, weight_by_qual=True, seqlens=False):
    """
    Denoise reads of the same length.

    Where a sequence is (hamming distance)*threshold less numerous than
    another, its reads are added to the more common sequence IF the two
    sequences have at least one gene segment ID in common for each gene in
    the id line. The daughter record keeps the gene segment IDs of the more
    numerous parent, and the rarer parent has its read count set to zero.
    If weight_by_qual=True (the default), the hamming distance is adjusted
    according to qual scores.

    TODO: daughter qual score is the posterior probability of the input qual
    scores (weighted by read count)

    Input:
    seqs_np = a dictionary of lists of numpy arrays, one entry per sequence
              length, with the value being a list of numpy arrays for:
              sequences, gene id codes, probabilities, read counts,
              sequence names
    gene_ids_sets = a dictionary of dictionaries, one entry per sequence
              length, with the value being a dictionary where keys match the
              integers in the gene id codes numpy array, and the values are a
              list of sets, giving the gene segments identified for each gene
              (one set per gene).
    threshold = base of the fold difference required; the required fold
              difference is threshold**distance.
    clust_file = path passed (as an open handle) to reptools.save_changes(),
              or a false value for no file.
    weight_by_qual = weight mismatches with generic_chance_of_miss().
    seqlens = sequence lengths to process; a false value means all.

    Output:
    None. Numpy arrays are modified in place.
    """
    if not seqlens:  #if False, process all
        seqlens = list(seqs_np.keys())
    #single nt sequences break the array creation code, and are not useful
    seqlens = [ln for ln in seqlens if ln > 1]
    with (open(clust_file, 'w')
          if clust_file else reptools.dummy_context_mgr()) as clust_handle:
        for seqlen in seqlens:
            changes = {}
            (seq_array, gene_id_array, prob_array, counts_array,
             seq_names) = seqs_np[seqlen]
            changes_made = True
            while changes_made:  #iterate until no more improvements
                changes_made = False
                reptools.sort_by_freq(seq_array, gene_id_array, prob_array,
                                      counts_array, seq_names)
                #rows from firstzero onwards have no reads left and are skipped
                if np.any(counts_array == 0):
                    firstzero = np.where(counts_array == 0)[0][0]
                else:
                    firstzero = seq_array.shape[0]
                #np.power() below may overflow; silence that for this loop only.
                #errstate restores the caller's setting afterwards, including
                #when an exception escapes the loop.
                with np.errstate(over='ignore'):
                    #iterate from most to least common (but non-zero)
                    for n in range(0, firstzero):
                        #compare this row with all subsequent rows
                        matching = np.equal(seq_array[n + 1:, :],
                                            seq_array[n, :])
                        if weight_by_qual:
                            #weight by probabilities
                            #TODO - change generic_chance_of_miss() for logs
                            weights = generic_chance_of_miss(
                                prob_array[n, :], prob_array[n + 1:, :],
                                matching[:, :])
                            #hamming distances weighted by qual scores
                            dists = np.sum(weights, axis=1)
                        else:
                            dists = np.sum(np.invert(matching), axis=1)
                        #find reads which are "close enough"
                        #the next line may cause overflows, but if the value is too high it will be
                        #insanely large, and set to Inf, so the comparison should still work
                        hits = np.where(counts_array[n + 1:] >= np.power(
                            threshold, dists, dtype=np.float32) *
                                        counts_array[n])
                        #loop through the hits, checking that the genes match,
                        #and continue until a hit is found where they do
                        for hit in hits[0]:
                            target = n + 1 + hit  #hits index the rows after n
                            query_genes = gene_ids_sets[seqlen][
                                gene_id_array[n]]
                            target_genes = gene_ids_sets[seqlen][
                                gene_id_array[target]]
                            if sum([
                                    len(_q.union(_t)) == len(_q) + len(_t)
                                    for _q, _t in zip(query_genes,
                                                      target_genes)
                            ]) == 0:
                                #all genes have at least one allele in common
                                #v0.14.1: modify qual score of target
                                # calculate prob of base in source ACTUALLY being base in target
                                # (1-prob_array[n])/3 because a 1/3 chance of each uncalled base
                                #TODO - change for logs
                                source_prob_misread = (
                                    (1 - prob_array[n]) / 3)**counts_array[n]
                                #multiply target prob by the calculated source misread prob
                                prob_array[target] = prob_array[
                                    target] * source_prob_misread
                                #add counts from query to target
                                counts_array[target] += counts_array[n]
                                #set query counts to zero
                                counts_array[n] = 0
                                #record changes
                                try:
                                    changes[seq_names[target]].append(
                                        seq_names[n])
                                except KeyError:
                                    changes[seq_names[target]] = [
                                        seq_names[n]
                                    ]
                                changes_made = True
                                break
            reptools.save_changes(clust_handle, changes, seq_names,
                                  counts_array)
    return  #numpy arrays will have been modified in-place, and gene_ids,gene_ids_sets have not been modified
def checkCDR3_prod(fastqfile, minlen=3 * 5, maxlen=3 * 30, startchars='C',
                   endchars='FWH', hits_out=False, failures_out=False,
                   frameshift_out=False, long_out=False, short_out=False,
                   stop_out=False, bad_out=False, title_split=' ',
                   verbose=True):
    """
    This is for use where no reference file is available.

    Reports % of CDR3 which are productive, or start with C and end with
    F/W/H, and are within a sensible length range.

    Each read is assigned to the first category that applies:
      'fail'       - the sequence is 'n' (no CDR3)
      'long'       - longer than maxlen
      'short'      - shorter than minlen
      'bad'        - translated first/last codon not in startchars/endchars
      'stop'       - translation contains '*'
      'frameshift' - length is not a multiple of 3
      'good'       - none of the above

    Input:
    fastqfile   = path of the FASTQ file to check. The read id is the text
                  before the first ';' and before the first occurrence of
                  title_split in the title line.
    minlen, maxlen = accepted length range, in nucleotides (inclusive).
    startchars, endchars = accepted first / last residues (case-insensitive).
    hits_out, failures_out, frameshift_out, long_out, short_out, stop_out,
    bad_out     = optional paths; when given, the reads of that category are
                  written there in FASTA format.
    verbose     = if True, print counts and percentages.

    Output: dictionary of counts, keyed by the category names above.
    """
    import contextlib

    import reptools

    results = {
        'good': 0,
        'frameshift': 0,
        'stop': 0,
        'bad': 0,
        'long': 0,
        'short': 0,
        'fail': 0
    }
    #category -> requested output path
    destinations = (
        ('good', hits_out),
        ('fail', failures_out),
        ('frameshift', frameshift_out),
        ('long', long_out),
        ('short', short_out),
        ('stop', stop_out),
        ('bad', bad_out),
    )
    allowed_start = startchars.lower()
    allowed_end = endchars.lower()

    with contextlib.ExitStack() as stack:
        infile = stack.enter_context(open(fastqfile))
        #text mode: the records written below are str, which a binary-mode
        #handle rejects with TypeError on Python 3
        handles = {
            category: stack.enter_context(
                open(path, 'w') if path else reptools.dummy_context_mgr())
            for category, path in destinations
        }
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            seq = seq.strip()
            if seq.lower() == 'n':
                category = 'fail'
            elif len(seq) > maxlen:
                category = 'long'
            elif len(seq) < minlen:
                category = 'short'
            elif (reptools.trans(seq[0:3]).lower() not in allowed_start
                  or reptools.trans(seq[-3:]).lower() not in allowed_end):
                category = 'bad'
            elif '*' in reptools.trans(seq):
                category = 'stop'
            elif len(seq) % 3 != 0:
                category = 'frameshift'
            else:
                category = 'good'
            results[category] += 1
            handles[category].write('>%s\n%s\n' % (read_id, seq))

    if verbose:
        totalreads = float(sum(results.values()))
        report = (
            ('over length', 'long'),
            ('under length', 'short'),
            ('bad start/end', 'bad'),
            ('stop codon', 'stop'),
            ('frameshift', 'frameshift'),
            ('no CDR3', 'fail'),
            ('good CDR3', 'good'),
        )
        for label, category in report:
            count = results[category]
            #an empty FASTQ would otherwise divide by zero
            pct = count / totalreads * 100 if totalreads else 0.0
            print(('%s = %s (%s pct)' % (label, count, pct)))
    return results
def checkgeneID_fastq(mock_dict, gene, fastqfile, title_split=' ',
                      hits_out=False, misses_out=False, failures_out=False,
                      ambiguous_out=False, verbose=True):
    """
    Compare the gene segment calls in a fastq file with the expected calls.

    The calls for each read are taken from the '<gene>=<call>+<call>...'
    field of the ';'-separated title line. Each call is trimmed at the first
    'gamma', 'alpha' or '_'. The expected call is mock_dict[id][gene], with
    any allele suffix ('*...') removed.

    Each read id in mock_dict is assigned to one category:
      'hit'       - exactly one call, equal to the expected call
      'ambiguous' - several calls, one of which is the expected call
      'miss'      - the expected call is not among the calls
      'fail'      - the single call is 'none', the title has no field for
                    this gene, or the read (or its expected call) is missing

    Input:
    mock_dict   = dictionary keyed by read id; each value must provide the
                  expected call under the key given by gene.
    gene        = name of the title field / mock_dict key to compare.
    fastqfile   = path of the FASTQ file to check. The read id is the text
                  before the first ';' and before the first occurrence of
                  title_split in the title line.
    hits_out, misses_out, failures_out, ambiguous_out
                = optional paths; when given, the reads of that category are
                  written there as '>id' followed by the list of calls.
    verbose     = if True, print counts and percentages.

    Output: dictionary with the keys 'hit', 'miss', 'ambiguous' and 'fail'.
    """
    #TODO: add transtable option to replace the
    #[s.split('gamma')[0].split('alpha')[0].split('_')[0] for s in gene_strings] line
    #or, process the mock_dict first, to match
    import contextlib

    import reptools

    reptoolsdict = {}
    with open(fastqfile) as infile:
        for title, seq, qual in reptools.FASTQparser(infile):
            fields = title.split(';')
            read_id = fields[0].split(title_split)[0]
            gene_strings = [
                s.split('=')[1] for s in fields
                if '=' in s and s.split('=')[0] == gene
            ]
            if gene_strings:
                reptoolsdict[read_id] = [
                    s.split('gamma')[0].split('alpha')[0].split('_')[0]
                    for s in gene_strings[0].split('+')
                ]
            else:
                #no field for this gene: record no calls (scored as a
                #failure below) instead of raising IndexError
                reptoolsdict[read_id] = []

    results = {'hit': 0, 'miss': 0, 'ambiguous': 0, 'fail': 0}

    def open_sink(path):
        # Text mode: the records written below are str, which a binary-mode
        # handle rejects with TypeError on Python 3.
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    with contextlib.ExitStack() as stack:
        hits_handle = stack.enter_context(open_sink(hits_out))
        misses_handle = stack.enter_context(open_sink(misses_out))
        failures_handle = stack.enter_context(open_sink(failures_out))
        ambiguous_handle = stack.enter_context(open_sink(ambiguous_out))

        for read_id in mock_dict:
            try:
                calls = reptoolsdict[read_id]
                expected = mock_dict[read_id][gene].split('*')[0]
            except KeyError:
                results['fail'] += 1
                #the read itself may be the missing key, so it must not be
                #looked up with [] again here
                failures_handle.write(
                    '>%s\n%s\n' % (read_id, reptoolsdict.get(read_id, '')))
                continue

            record = '>%s\n%s\n' % (read_id, calls)
            if len(calls) == 1:
                if calls[0] == expected:
                    results['hit'] += 1
                    if hits_out:
                        hits_handle.write(record)
                elif calls[0] == 'none':
                    #compare the call itself: the list of calls never
                    #equals the string 'none'
                    results['fail'] += 1
                    failures_handle.write(record)
                else:
                    results['miss'] += 1
                    misses_handle.write(record)
            elif not calls:
                results['fail'] += 1
                failures_handle.write(record)
            elif expected in calls:
                results['ambiguous'] += 1
                ambiguous_handle.write(record)
            else:
                results['miss'] += 1
                misses_handle.write(record)

    if verbose:
        total = float(sum(results.values()))

        def percent(count):
            # an empty mock_dict would otherwise divide by zero
            return count / total * 100 if total else 0.0

        print(('hit = %s' % results['hit']))
        print(('miss = %s' % results['miss']))
        print(('ambiguous (including hit) = %s' % results['ambiguous']))
        print(('failed = %s' % results['fail']))
        print(('pct hit = %s' % percent(results['hit'])))
        print(('pct hit (including ambiguous hit) = %s' %
               percent(results['hit'] + results['ambiguous'])))
    return results