def __init__( self, seqfname, joinfnames, datadir ): # <seqfname>: input to joinsolver, <joinfname> output from joinsolver (I only need both because they don't seem to put the full query seq in the output)
    """Read the simulation input that was fed to joinsolver, parse each joinsolver
    output file against it, and plot annotation performance.

    seqfname -- csv of the true (simulated) events that were given to joinsolver
    joinfnames -- list of joinsolver output files to parse (via self.parse_file)
    datadir -- directory containing germline sequence files for utils.read_germlines
    """
    self.debug = 0
    self.n_max_queries = -1  # <=0 means no limit on the number of input lines read
    self.queries = []  # if non-empty, restrict to these unique_ids (empty here, so no restriction)
    self.germline_seqs = utils.read_germlines(datadir, remove_N_nukes=False)
    # plots are written under the www dir, so it has to exist up front
    assert os.path.exists(os.getenv('www'))
    self.perfplotter = PerformancePlotter( self.germline_seqs, os.getenv('www') + '/partis/joinsolver_performance', 'js')

    # get info that was passed to joinsolver
    self.seqinfo = {}  # maps unique_id --> the true csv line for that query
    with opener('r')(seqfname) as seqfile:
        reader = csv.DictReader(seqfile)
        iline = 0
        for line in reader:
            if len(self.queries ) > 0 and line['unique_id'] not in self.queries:
                continue
            self.seqinfo[line['unique_id']] = line
            iline += 1
            if self.n_max_queries > 0 and iline >= self.n_max_queries:
                break

    self.n_failed, self.n_total = 0, 0  # updated by parse_file()
    for joinfname in joinfnames:
        self.parse_file(joinfname)

    self.perfplotter.plot()
    # NOTE(review): raises ZeroDivisionError if parse_file() never incremented
    # n_total (i.e. empty input) -- confirm that's acceptable for this script
    print 'failed: %d / %d = %f' % (self.n_failed, self.n_total, float(self.n_failed) / self.n_total)
def read_output(self, base_outfname, plot_performance=False): perfplotter = None if plot_performance: assert self.args.plotdir != None assert not self.args.is_data from performanceplotter import PerformancePlotter perfplotter = PerformancePlotter( self.germline_seqs, self.args.plotdir + '/sw/performance', 'sw') n_processed = 0 for iproc in range(self.args.n_procs): workdir = self.args.workdir if self.args.n_procs > 1: workdir += '/sw-' + str(iproc) outfname = workdir + '/' + base_outfname with contextlib.closing(pysam.Samfile(outfname)) as bam: grouped = itertools.groupby(iter(bam), operator.attrgetter('qname')) for _, reads in grouped: # loop over query sequences self.n_total += 1 self.process_query(bam, list(reads), perfplotter) n_processed += 1 if not self.args.no_clean: os.remove(outfname) if self.args.n_procs > 1: # still need the top-level workdir os.rmdir(workdir) print ' processed %d queries' % n_processed if perfplotter != None: perfplotter.plot()
def __init__(self, args):
    """Read simulated truth, then parse ihhhmmm *.fostream output files against it
    and plot/print annotation performance.

    args -- parsed command-line options; this reads args.datadir, args.plotdir,
            args.simfname, args.queries, args.n_queries, and args.indir
    """
    self.args = args
    self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
    self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'ihhhmmm')
    self.details = OrderedDict()  # per-query parsed ihhhmmm results (filled by parse_file)
    self.failtails = {}  # queries for which we only recovered partial info
    self.n_partially_failed = 0

    # get sequence info that was passed to ihhhmmm
    self.siminfo = OrderedDict()  # unique_id --> true simulation csv line
    self.sim_need = [] # list of queries that we still need to find
    with opener('r')(self.args.simfname) as seqfile:
        reader = csv.DictReader(seqfile)
        iline = 0
        for line in reader:
            if self.args.queries != None and line[ 'unique_id'] not in self.args.queries:
                continue
            self.siminfo[line['unique_id']] = line
            self.sim_need.append(line['unique_id'])
            iline += 1
            if args.n_queries > 0 and iline >= args.n_queries:
                break

    fostream_names = glob.glob(self.args.indir + '/*.fostream')
    if len(fostream_names) == 0:
        raise Exception('no fostreams found in %s' % args.indir)
    fostream_names.sort()  # maybe already sorted?
    for infname in fostream_names:
        if len(self.sim_need) == 0:  # stop as soon as every wanted query has been found
            break
        # try to get whatever you can for the failures
        unique_ids = self.find_partial_failures( infname)  # returns list of unique ids in this file
        with opener('r')(infname) as infile:
            self.parse_file(infile, unique_ids)

    # now check that we got results for all the queries we wanted
    n_failed = 0
    for unique_id in self.siminfo:
        if unique_id not in self.details and unique_id not in self.failtails:
            print '%-20s no info' % unique_id
            self.perfplotter.add_fail()
            n_failed += 1

    print ''
    # NOTE(review): both ratios divide by len(self.siminfo) -- ZeroDivisionError
    # if the sim file matched no queries; confirm inputs are always non-empty
    print 'partially failed: %d / %d = %.2f' % ( self.n_partially_failed, len(self.siminfo), float(self.n_partially_failed) / len(self.siminfo))
    print 'failed: %d / %d = %.2f' % (n_failed, len( self.siminfo), float(n_failed) / len(self.siminfo))
    print ''
    self.perfplotter.plot()
def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False):
    """Set up a smith-waterman aligner run: counters, optional parameter/truth
    counters and performance plotter, bookkeeping dicts, and the conserved-codon
    position tables read from datadir.

    args -- parsed command-line options
    input_info -- dict of query sequences to align (keys are query names)
    reco_info -- true rearrangement events (None/unused meaningfully for data)
    germline_seqs -- germline sequence dict from utils.read_germlines
    parameter_dir -- directory with existing parameters (gene choice probs etc.)
    write_parameters -- if True, count parameters from the alignments for writing
    """
    self.parameter_dir = parameter_dir
    self.args = args
    # sw-specific debug level overrides the global one when set
    self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug
    self.input_info = input_info
    self.remaining_queries = [ query for query in self.input_info.keys() ] # we remove queries from this list when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
    self.new_indels = 0 # number of new indels that were kicked up this time through
    self.reco_info = reco_info
    self.germline_seqs = germline_seqs
    self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
    if write_parameters:
        self.pcounter = ParameterCounter(self.germline_seqs)
        if not self.args.is_data:  # can only count true parameters for simulation
            self.true_pcounter = ParameterCounter(self.germline_seqs)
    if self.args.plot_performance:
        self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw')

    # results of the sw run, keyed by category
    self.info = {}
    self.info['queries'] = []
    self.info['all_best_matches'] = set( ) # set of all the matches we found (for *all* queries)
    self.info['skipped_unproductive_queries'] = [ ] # list of unproductive queries
    # self.info['skipped_indel_queries'] = [] # list of queries that had indels
    self.info['skipped_unknown_queries'] = []
    self.info['indels'] = {}

    if self.args.apply_choice_probs_in_sw:
        if self.debug:
            print ' reading gene choice probs from', parameter_dir
        self.gene_choice_probs = utils.read_overall_gene_probs( parameter_dir)

    with opener('r')( self.args.datadir + '/v-meta.json' ) as json_file: # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')( self.args.datadir + '/j_tryp.csv' ) as csv_file: # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = { row[0]: row[1] for row in tryp_reader } # WARNING: this doesn't filter out the header line

    self.outfile = None
    if self.args.outfname is not None:
        # NOTE(review): opened in append mode and apparently closed elsewhere -- confirm
        self.outfile = open(self.args.outfname, 'a')

    self.n_unproductive = 0
    self.n_total = 0

    print 'smith-waterman'
#---------------------------- #Get user input germlineDirectory = raw_input( 'Enter the path of the germline sequences): ') or 'data/imgt' originalInputFile = raw_input( 'Enter the path of the original input file into mixcr): ' ) or 'simu-10-leaves-1-mutate.csv' mixcrOutput = raw_input( 'Enter the path of the output from mixcr: ') or 'edited_output_file.txt' mixcrPlotDir = 'mixcrPlotDir' #---------------------------- #hardcoded default germline sequences germline_seqs = utils.read_germlines(germlineDirectory) #create an instance of the performance plotter class perfplotter = PerformancePlotter(germline_seqs, 'mixcr') #The true dictionary contains the correct locations taken from the original simulated data file #The inferred dictionary (iDictionary) will contain the inferences of those locations from Mixcr trueDictionary = {} iDictionary = {} with open(originalInputFile) as inFile1: with open(mixcrOutput) as inFile2: reader1 = csv.DictReader(inFile1) reader2 = csv.DictReader(inFile2, delimiter='\t') for row1, row2 in zip(reader1, reader2): unique_id = row1['unique_id'] #print unique_id trueDictionary[unique_id] = {} trueDictionary[unique_id]['v_gene'] = row1['v_gene'] trueDictionary[unique_id]['d_gene'] = row1['d_gene']
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir) perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line[ 'unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace( re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[:self.args.indir.rfind( '/' )] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. 
Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len( re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall( 'No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position + 1) pgraph = full_text[position:full_text. find('\n\n', position + 1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches( line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, 
float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
def __init__(self, args):
    """Read simulated truth, then scan an igblast html output file query-by-query,
    handing each query's text lines to self.process_query() for evaluation.

    args -- parsed options; reads args.datadir, args.plotdir, args.simfname,
            args.n_max_queries, args.queries, args.infname, args.debug
    """
    self.args = args
    self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
    self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'igblast')
    self.n_total, self.n_partially_failed = 0, 0

    # get sequence info that was passed to igblast
    self.seqinfo = {}  # int(unique_id) --> true simulation csv line
    with opener('r')(self.args.simfname) as simfile:
        reader = csv.DictReader(simfile)
        iline = 0
        for line in reader:
            if self.args.n_max_queries > 0 and iline >= self.args.n_max_queries:
                break
            # NOTE(review): iline is bumped before the queries filter, so lines
            # skipped by --queries still count against n_max_queries -- confirm intended
            iline += 1
            if self.args.queries != None and int( line['unique_id']) not in self.args.queries:
                continue
            # strip functionality suffixes (_F/_P) from j gene names so they match our germline names
            if len(re.findall('_[FP]', line['j_gene'])) > 0:
                line['j_gene'] = line['j_gene'].replace( re.findall('_[FP]', line['j_gene'])[0], '')
            self.seqinfo[int(line['unique_id'])] = line

    paragraphs = None
    print 'reading', self.args.infname
    info = {}  # query_name --> partially-filled annotation dict (filled by process_query)
    with opener('r')(self.args.infname) as infile:
        line = infile.readline()
        # first find the start of the next query's section
        while line.find('<b>Query=') != 0:
            line = infile.readline()
        # then keep going till eof
        iquery = 0
        while line != '':
            if self.args.n_max_queries > 0 and iquery >= self.args.n_max_queries:
                break
            # first find the query name
            query_name = int(line.split()[1])
            # and collect the lines for this query
            query_lines = []
            line = infile.readline()
            while line.find('<b>Query=') != 0:  # everything up to the next query header belongs to this query
                query_lines.append(line.strip())
                line = infile.readline()
                if line == '':
                    break
            iquery += 1
            # then see if we want this query
            if self.args.queries != None and query_name not in self.args.queries:
                continue
            if query_name not in self.seqinfo:
                print 'ERROR %d not in reco info' % query_name
                sys.exit()
            if self.args.debug:
                print query_name
            # and finally add the query to <info[query_name]>
            info[query_name] = {'unique_id': query_name}
            self.n_total += 1
            self.process_query(info[query_name], query_name, query_lines)

    self.perfplotter.plot()
    print 'partially failed: %d / %d = %f' % ( self.n_partially_failed, self.n_total, float(self.n_partially_failed) / self.n_total)
def read_hmm_output(self, algorithm, hmm_csv_outfname, make_clusters=True, count_parameters=False, parameter_out_dir=None, plotdir=None):
    """Read the csv written by the hmm subprocesses and post-process it.

    For 'viterbi' the best annotation per query (pair) is collected into the
    returned list, optionally incrementing parameter counters and the performance
    plotter; for 'forward' the pairwise scores are collected for the clusterer.

    algorithm -- 'viterbi' or 'forward'
    hmm_csv_outfname -- path to the hmm output csv
    make_clusters -- forward mode: suppresses per-pair score printing when True
    count_parameters -- if True, count (true and inferred) parameters for writing
    parameter_out_dir -- where to write counted parameters (required if counting)
    plotdir -- where to write plots (required if counting)

    Returns: list of dicts (annotations for viterbi, {id_a, id_b, score} for forward).
    """
    print ' read output'
    if count_parameters:
        assert parameter_out_dir is not None
        assert plotdir is not None
    pcounter = ParameterCounter( self.germline_seqs) if count_parameters else None
    # true counters only make sense for simulation, where reco_info holds the truth
    true_pcounter = ParameterCounter(self.germline_seqs) if ( count_parameters and not self.args.is_data) else None
    perfplotter = PerformancePlotter( self.germline_seqs, plotdir + '/hmm/performance', 'hmm') if self.args.plot_performance else None

    n_processed = 0
    hmminfo = []
    with opener('r')(hmm_csv_outfname) as hmm_csv_outfile:
        reader = csv.DictReader(hmm_csv_outfile)
        last_key = None  # key of the previous row, to detect the first (best) row of each query set
        boundary_error_queries = []
        for line in reader:
            utils.intify(line, splitargs=('unique_ids', 'seqs'))
            ids = line['unique_ids']
            this_key = utils.get_key(ids)
            same_event = from_same_event(self.args.is_data, True, self.reco_info, ids)
            id_str = ''.join(['%20s ' % i for i in ids])

            # check for errors
            if last_key != this_key: # if this is the first line for this set of ids (i.e. the best viterbi path or only forward score)
                if line['errors'] != None and 'boundary' in line[ 'errors'].split(':'):
                    boundary_error_queries.append(':'.join( [str(uid) for uid in ids]))
                else:
                    assert len(line['errors']) == 0

            if algorithm == 'viterbi':
                line['seq'] = line['seqs'][ 0] # add info for the best match as 'seq'
                line['unique_id'] = ids[0]
                utils.add_match_info(self.germline_seqs, line, self.cyst_positions, self.tryp_positions, debug=(self.args.debug > 0))

                if last_key != this_key or self.args.plot_all_best_events: # if this is the first line (i.e. the best viterbi path) for this query (or query pair), print the true event
                    n_processed += 1
                    if self.args.debug:
                        print '%s %d' % (id_str, same_event)
                    if line['cdr3_length'] != -1 or not self.args.skip_unproductive: # if it's productive, or if we're not skipping unproductive rearrangements
                        hmminfo.append( dict([ ('unique_id', line['unique_ids'][0]), ] + line.items()))
                        if pcounter is not None: # increment counters (but only for the best [first] match)
                            pcounter.increment(line)
                        if true_pcounter is not None: # increment true counters
                            true_pcounter.increment(self.reco_info[ids[0]])
                        if perfplotter is not None:
                            perfplotter.evaluate(self.reco_info[ids[0]], line)

                    if self.args.debug:
                        self.print_hmm_output( line, print_true=(last_key != this_key), perfplotter=perfplotter)
                # clear the single-query keys so later rows don't inherit stale values
                line['seq'] = None
                line['unique_id'] = None
            else: # for forward, write the pair scores to file to be read by the clusterer
                if not make_clusters: # self.args.debug or
                    print '%3d %10.3f %s' % ( same_event, float(line['score']), id_str)
                if line['score'] == '-nan':
                    print ' WARNING encountered -nan, setting to -999999.0'
                    score = -999999.0
                else:
                    score = float(line['score'])
                if len(ids) == 2:
                    hmminfo.append({ 'id_a': line['unique_ids'][0], 'id_b': line['unique_ids'][1], 'score': score })
                n_processed += 1

            last_key = utils.get_key(ids)

    if pcounter is not None:
        pcounter.write(parameter_out_dir)
        if not self.args.no_plot:
            pcounter.plot(plotdir, subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
    if true_pcounter is not None:
        true_pcounter.write(parameter_out_dir + '/true')
        if not self.args.no_plot:
            true_pcounter.plot(plotdir + '/true', subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
    if perfplotter is not None:
        perfplotter.plot()

    print ' processed %d queries' % n_processed
    if len(boundary_error_queries) > 0:
        print ' %d boundary errors (%s)' % ( len(boundary_error_queries), ', '.join(boundary_error_queries))

    return hmminfo