def __init__(self, seqfname, joinfnames, datadir):
    """Read the joinsolver input, parse each joinsolver output file, then plot and print a summary.

    <seqfname>: input to joinsolver, read as csv with a 'unique_id' column
    <joinfnames>: output files from joinsolver, each one is handed to self.parse_file()
                  (both input and output are needed because joinsolver doesn't seem to put
                  the full query seq in its output)
    <datadir>: passed to utils.read_glfo() to get the germline sequences

    Raises Exception if the 'www' environment variable is unset or doesn't point to an existing path.
    """
    self.debug = 0
    self.n_max_queries = -1
    self.queries = []

    self.germline_seqs = utils.read_glfo(datadir, remove_N_nukes=False)['seqs']

    # plots go below $www, so check it explicitly (an assert would vanish under -O, and a missing variable would give an obscure TypeError)
    wwwdir = os.getenv('www')
    if wwwdir is None or not os.path.exists(wwwdir):
        raise Exception('environment variable \'www\' is not set to an existing path (got %s)' % wwwdir)
    self.perfplotter = PerformancePlotter(self.germline_seqs, wwwdir + '/partis/joinsolver_performance', 'js')

    # get info that was passed to joinsolver
    # NOTE <self.queries> and <self.n_max_queries> are hard coded just above, so as written neither restriction ever kicks in
    self.seqinfo = {}
    with opener('r')(seqfname) as seqfile:
        reader = csv.DictReader(seqfile)
        iline = 0
        for line in reader:
            if len(self.queries) > 0 and line['unique_id'] not in self.queries:
                continue
            self.seqinfo[line['unique_id']] = line
            iline += 1
            if self.n_max_queries > 0 and iline >= self.n_max_queries:
                break

    # counters start at zero here (presumably parse_file() is what updates them -- it isn't visible from here)
    self.n_failed, self.n_total = 0, 0
    for joinfname in joinfnames:
        self.parse_file(joinfname)

    self.perfplotter.plot()

    # don't divide by zero if nothing got counted
    failed_fraction = float(self.n_failed) / self.n_total if self.n_total > 0 else 0.
    print('failed: %d / %d = %f' % (self.n_failed, self.n_total, failed_fraction))
def __init__(self, args):
    """Read the simulation info, parse every ihhhmmm '.fostream' file in <args.indir>, then summarize and plot.

    <args>: options object, from which this reads <datadir>, <plotdir>, <simfname>, <queries>
            (None for no restriction), <n_queries> (non-positive for no limit) and <indir>

    Raises Exception if no '*.fostream' files are found in <args.indir>.
    """
    self.args = args

    self.germline_seqs = utils.read_glfo(self.args.datadir, remove_N_nukes=True)['seqs']
    self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'ihhhmmm')

    self.details = OrderedDict()
    self.failtails = {}
    self.n_partially_failed = 0

    # get sequence info that was passed to ihhhmmm
    self.siminfo = OrderedDict()
    self.sim_need = []  # list of queries that we still need to find
    with opener('r')(self.args.simfname) as seqfile:
        reader = csv.DictReader(seqfile)
        iline = 0
        for line in reader:
            if self.args.queries is not None and line['unique_id'] not in self.args.queries:
                continue
            self.siminfo[line['unique_id']] = line
            self.sim_need.append(line['unique_id'])
            iline += 1  # only lines that pass the <queries> restriction count towards <n_queries>
            if self.args.n_queries > 0 and iline >= self.args.n_queries:
                break

    fostream_names = sorted(glob.glob(self.args.indir + '/*.fostream'))  # glob doesn't promise any particular order
    if len(fostream_names) == 0:
        raise Exception('no fostreams found in %s' % self.args.indir)
    for infname in fostream_names:
        if len(self.sim_need) == 0:  # stop as soon as nothing is left on the list of queries we still need
            break

        # try to get whatever you can for the failures
        unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

        with opener('r')(infname) as infile:
            self.parse_file(infile, unique_ids)

    # now check that we got results for all the queries we wanted
    n_failed = 0
    for unique_id in self.siminfo:
        if unique_id not in self.details and unique_id not in self.failtails:
            print('%-20s no info' % unique_id)
            self.perfplotter.add_fail()
            n_failed += 1

    # don't divide by zero if the simulation file was empty, or no line passed the <queries> restriction
    n_queries_read = len(self.siminfo)

    def fraction(count):
        return float(count) / n_queries_read if n_queries_read > 0 else 0.

    print('')
    print('partially failed: %d / %d = %.2f' % (self.n_partially_failed, n_queries_read, fraction(self.n_partially_failed)))
    print('failed: %d / %d = %.2f' % (n_failed, n_queries_read, fraction(n_failed)))
    print('')

    self.perfplotter.plot()
def __init__(self, args): self.args = args self.germline_seqs = utils.read_glfo(self.args.datadir, remove_N_nukes=True)['seqs'] self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'igblast') self.n_total, self.n_partially_failed, self.n_skipped = 0, 0, 0 # get sequence info that was passed to igblast self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.n_queries > 0 and iline >= self.args.n_queries: break iline += 1 if self.args.queries != None and int(line['unique_id']) not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[int(line['unique_id'])] = line print 'reading', self.args.infname get_genes_to_skip(self.args.infname, self.germline_seqs, method='igblast', debug=False) paragraphs = None info = {} with opener('r')(self.args.infname) as infile: line = infile.readline() # first find the start of the next query's section while line.find('<b>Query=') != 0: line = infile.readline() # then keep going till eof iquery = 0 while line != '': if self.args.n_queries > 0 and iquery >= self.args.n_queries: break # first find the query name query_name = int(line.split()[1]) # and collect the lines for this query query_lines = [] line = infile.readline() while line.find('<b>Query=') != 0: query_lines.append(line.strip()) line = infile.readline() if line == '': break iquery += 1 # then see if we want this query if self.args.queries != None and query_name not in self.args.queries: continue if query_name not in self.seqinfo: print 'ERROR %d not in reco info' % query_name sys.exit() if self.args.debug: print query_name # and finally add the query to <info[query_name]> info[query_name] = {'unique_id':query_name} self.n_total += 1 self.process_query(info[query_name], query_name, query_lines) self.perfplotter.plot() print 'partially failed: %d / 
%d = %f' % (self.n_partially_failed, self.n_total, float(self.n_partially_failed) / self.n_total) print 'skipped: %d / %d = %f' % (self.n_skipped, self.n_total, float(self.n_skipped) / self.n_total) for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g))
def __init__(self, args): self.args = args self.germline_seqs = utils.read_glfo(self.args.datadir)['seqs'] perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line['unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[ : self.args.indir.rfind('/')] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. 
Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall('No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position+1) pgraph = full_text[position : full_text.find('\n\n', position+1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, 
float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))