コード例 #1
0
ファイル: imgtparser.py プロジェクト: stevenweaver/partis
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directoy up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position+1)
                        pgraph = full_text[position : full_text.find('\n\n', position+1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            ir -= 1
                            continue
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
                utils.print_reco_event(self.germline_seqs, line, label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
コード例 #2
0
    def process_query(self, qr_info, query_name, query_lines):
        # split query_lines up into blocks
        blocks = []
        for line in query_lines:
            if line.find('Query_') == 0:
                blocks.append([])
            if len(line) == 0:
                continue
            if len(re.findall('<a name=#_[0-9][0-9]*_IGH',
                              line)) == 0 and line.find('Query_') != 0:
                continue
            if len(blocks) == 0:
                print 'wtf? %s' % query_name  # it's probably kicking a reverse match
                self.perfplotter.add_partial_fail(
                    self.seqinfo[query_name],
                    qr_info)  # NOTE that's really a total failure
                self.n_partially_failed += 1
                return
            blocks[-1].append(line)

        # then process each block
        for block in blocks:
            self.process_single_block(block, query_name, qr_info)
            if 'fail' in qr_info:
                self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                                  qr_info)
                self.n_partially_failed += 1
                return

        for region in utils.regions:
            if region + '_gene' not in qr_info:
                print '  ERROR no %s match for %d' % (region, query_name)
                self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                                  qr_info)
                self.n_partially_failed += 1
                return

        # expand v match to left end and j match to right end
        qr_info['v_5p_del'] = 0
        qr_info['fv_insertion'] = ''
        if qr_info['match_start'] > 0:
            if self.args.debug:
                print '    add to v left:', self.seqinfo[query_name][
                    'seq'][:qr_info['match_start']]
            qr_info['seq'] = self.seqinfo[query_name][
                'seq'][:qr_info['match_start']] + qr_info['seq']

        qr_info['j_3p_del'] = 0
        qr_info['jf_insertion'] = ''
        if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']:
            if self.args.debug:
                print '    add to j right:', self.seqinfo[query_name][
                    'seq'][qr_info['match_end'] -
                           len(self.seqinfo[query_name]['seq']):]
            qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][
                qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):]

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start:end]

        for region in utils.regions:
            start = qr_info[region + '_qr_bounds'][0]
            end = qr_info[region + '_qr_bounds'][1]
            qr_info[region + '_qr_seq'] = qr_info['seq'][start:end]

        try:
            resolve_overlapping_matches(qr_info, self.args.debug,
                                        self.germline_seqs)
        except AssertionError:
            print 'ERROR apportionment failed on %s' % query_name
            self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                              qr_info)
            self.n_partially_failed += 1
            return

        if self.args.debug:
            print '  query seq:', qr_info['seq']
            for region in utils.regions:
                print '    %s %3d %3d %s %s' % (
                    region, qr_info[region + '_qr_bounds'][0],
                    qr_info[region + '_qr_bounds'][1],
                    utils.color_gene(qr_info[region + '_gene']),
                    qr_info[region + '_gl_seq'])
        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start:end]
            if self.args.debug:
                print '   ', boundary, qr_info[boundary + '_insertion']

        self.perfplotter.evaluate(self.seqinfo[query_name], qr_info)
        # for key, val in qr_info.items():
        #     print key, val
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs,
                                   self.seqinfo[query_name],
                                   label='true:',
                                   extra_str='  ')
            utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
コード例 #3
0
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir,
                                         'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line[
                        'unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(
                        re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[:self.args.indir.rfind(
            '/'
        )]  # one directoy up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(
                            re.findall('[123]. Alignment for [VDJ]-GENE',
                                       full_text)) < 3:
                        failregions = re.findall(
                            'No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position + 1)
                        pgraph = full_text[position:full_text.
                                           find('\n\n', position + 1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            ir -= 1
                            continue
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(
                    line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs,
                                       self.seqinfo[unique_id],
                                       label='true:')
                utils.print_reco_event(self.germline_seqs,
                                       line,
                                       label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total,
                                        float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total,
                                         float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found +
                                                 n_found, n_not_found /
                                                 float(n_not_found + n_found))
コード例 #4
0
    def process_query(self, qr_info, query_name, query_lines):
        # split query_lines up into blocks
        blocks = []
        for line in query_lines:
            if line.find('Query_') == 0:
                blocks.append([])
            if len(line) == 0:
                continue
            if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0:
                continue
            if len(blocks) == 0:
                print 'wtf? %s' % query_name  # it's probably kicking a reverse match
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)  # NOTE that's really a total failure
                self.n_partially_failed += 1
                return
            blocks[-1].append(line)

        # then process each block
        for block in blocks:
            self.process_single_block(block, query_name, qr_info)
            if 'skip_gene' in qr_info:
                self.n_skipped += 1
                return
            if 'fail' in qr_info:
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        for region in utils.regions:
            if region + '_gene' not in qr_info:
                print '    %d: no %s match' % (query_name, region)
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        # expand v match to left end and j match to right end
        qr_info['v_5p_del'] = 0
        qr_info['fv_insertion'] = ''
        if qr_info['match_start'] > 0:
            if self.args.debug:
                print '    add to v left:', self.seqinfo[query_name]['seq'][ : qr_info['match_start']]
            qr_info['seq'] = self.seqinfo[query_name]['seq'][ : qr_info['match_start']] + qr_info['seq']

        qr_info['j_3p_del'] = 0
        qr_info['jf_insertion'] = ''
        if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']:
            if self.args.debug:
                print '    add to j right:', self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]
            qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]

        for region in utils.regions:
            start = qr_info[region + '_qr_bounds'][0]
            end = qr_info[region + '_qr_bounds'][1]
            qr_info[region + '_qr_seq'] = qr_info['seq'][start : end]

        try:
            resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs)
        except AssertionError:
            print '    %s: apportionment failed' % query_name
            self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
            self.n_partially_failed += 1
            return

        if self.args.debug:
            print '  query seq:', qr_info['seq']
            for region in utils.regions:
                true_gene = self.seqinfo[query_name][region + '_gene']
                infer_gene = qr_info[region + '_gene']
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold', utils.color('blue', region))
                    truestr = ''  #'(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '')
                # print '  %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr)

                print '    %s %3d %3d %s %s %s' % (regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace(region, ''), truestr, qr_info[region + '_gl_seq'])
        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]
            if self.args.debug:
                print '   ', boundary, qr_info[boundary + '_insertion']

        self.perfplotter.evaluate(self.seqinfo[query_name], qr_info)
        # for key, val in qr_info.items():
        #     print key, val
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str='  ')
            utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')