def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir) perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line['unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[ : self.args.indir.rfind('/')] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall('No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position+1) pgraph = full_text[position : full_text.find('\n\n', position+1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
def process_query(self, qr_info, query_name, query_lines): # split query_lines up into blocks blocks = [] for line in query_lines: if line.find('Query_') == 0: blocks.append([]) if len(line) == 0: continue if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0: continue if len(blocks) == 0: print 'wtf? %s' % query_name # it's probably kicking a reverse match self.perfplotter.add_partial_fail( self.seqinfo[query_name], qr_info) # NOTE that's really a total failure self.n_partially_failed += 1 return blocks[-1].append(line) # then process each block for block in blocks: self.process_single_block(block, query_name, qr_info) if 'fail' in qr_info: self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return for region in utils.regions: if region + '_gene' not in qr_info: print ' ERROR no %s match for %d' % (region, query_name) self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return # expand v match to left end and j match to right end qr_info['v_5p_del'] = 0 qr_info['fv_insertion'] = '' if qr_info['match_start'] > 0: if self.args.debug: print ' add to v left:', self.seqinfo[query_name][ 'seq'][:qr_info['match_start']] qr_info['seq'] = self.seqinfo[query_name][ 'seq'][:qr_info['match_start']] + qr_info['seq'] qr_info['j_3p_del'] = 0 qr_info['jf_insertion'] = '' if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']: if self.args.debug: print ' add to j right:', self.seqinfo[query_name][ 'seq'][qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):] qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):] for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start:end] for region in utils.regions: start = qr_info[region + '_qr_bounds'][0] end = qr_info[region + '_qr_bounds'][1] qr_info[region + '_qr_seq'] = qr_info['seq'][start:end] try: resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs) except AssertionError: print 'ERROR apportionment failed on %s' % query_name self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return if self.args.debug: print ' query seq:', qr_info['seq'] for region in utils.regions: print ' %s %3d %3d %s %s' % ( region, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(qr_info[region + '_gene']), qr_info[region + '_gl_seq']) for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start:end] if self.args.debug: print ' ', boundary, qr_info[boundary + '_insertion'] self.perfplotter.evaluate(self.seqinfo[query_name], qr_info) # for key, val in qr_info.items(): # print key, val if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir) perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line[ 'unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace( re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[:self.args.indir.rfind( '/' )] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len( re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall( 'No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position + 1) pgraph = full_text[position:full_text. find('\n\n', position + 1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches( line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
def process_query(self, qr_info, query_name, query_lines): # split query_lines up into blocks blocks = [] for line in query_lines: if line.find('Query_') == 0: blocks.append([]) if len(line) == 0: continue if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0: continue if len(blocks) == 0: print 'wtf? %s' % query_name # it's probably kicking a reverse match self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) # NOTE that's really a total failure self.n_partially_failed += 1 return blocks[-1].append(line) # then process each block for block in blocks: self.process_single_block(block, query_name, qr_info) if 'skip_gene' in qr_info: self.n_skipped += 1 return if 'fail' in qr_info: self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return for region in utils.regions: if region + '_gene' not in qr_info: print ' %d: no %s match' % (query_name, region) self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return # expand v match to left end and j match to right end qr_info['v_5p_del'] = 0 qr_info['fv_insertion'] = '' if qr_info['match_start'] > 0: if self.args.debug: print ' add to v left:', self.seqinfo[query_name]['seq'][ : qr_info['match_start']] qr_info['seq'] = self.seqinfo[query_name]['seq'][ : qr_info['match_start']] + qr_info['seq'] qr_info['j_3p_del'] = 0 qr_info['jf_insertion'] = '' if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']: if self.args.debug: print ' add to j right:', self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ] qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ] for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start : end] for region in utils.regions: start = qr_info[region + '_qr_bounds'][0] end = qr_info[region + '_qr_bounds'][1] qr_info[region + '_qr_seq'] = qr_info['seq'][start : end] try: resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs) except AssertionError: print ' %s: apportionment failed' % query_name self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return if self.args.debug: print ' query seq:', qr_info['seq'] for region in utils.regions: true_gene = self.seqinfo[query_name][region + '_gene'] infer_gene = qr_info[region + '_gene'] if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '' #'(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '') # print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr) print ' %s %3d %3d %s %s %s' % (regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace(region, ''), truestr, qr_info[region + '_gl_seq']) for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start : end] if self.args.debug: print ' ', boundary, qr_info[boundary + '_insertion'] self.perfplotter.evaluate(self.seqinfo[query_name], qr_info) # for key, val in qr_info.items(): # print key, val if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')