    def __init__(self, seqfname, joinfnames, datadir):
        # <seqfname>: input to joinsolver; <joinfnames>: outputs from joinsolver
        # (we need both because joinsolver doesn't seem to put the full query seq in its output)
        self.debug = 0
        self.n_max_queries = -1
        self.queries = []

        self.germline_seqs = utils.read_germlines(datadir,
                                                  remove_N_nukes=False)
        assert os.path.exists(os.getenv('www'))
        self.perfplotter = PerformancePlotter(
            self.germline_seqs,
            os.getenv('www') + '/partis/joinsolver_performance', 'js')

        # get info that was passed to joinsolver
        self.seqinfo = {}
        with opener('r')(seqfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if len(self.queries) > 0 and line['unique_id'] not in self.queries:
                    continue
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.n_max_queries > 0 and iline >= self.n_max_queries:
                    break

        self.n_failed, self.n_total = 0, 0
        for joinfname in joinfnames:
            self.parse_file(joinfname)

        self.perfplotter.plot()
        print 'failed: %d / %d = %f' % (self.n_failed, self.n_total,
                                        float(self.n_failed) / self.n_total)
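
The csv-reading pattern above (optional query whitelist plus a cap on the number of queries) recurs throughout these examples; here is a minimal standalone sketch of it, where read_seqinfo is our own name rather than anything from partis:

import csv

def read_seqinfo(seqfname, queries=None, n_max_queries=-1):
    # key each row by its 'unique_id' column, honoring an optional whitelist
    # and an optional cap on the number of rows kept
    seqinfo = {}
    with open(seqfname) as seqfile:
        for line in csv.DictReader(seqfile):
            if queries is not None and line['unique_id'] not in queries:
                continue
            seqinfo[line['unique_id']] = line
            if n_max_queries > 0 and len(seqinfo) >= n_max_queries:
                break
    return seqinfo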
#----------------------------
# Example 2
    def read_output(self, base_outfname, plot_performance=False):
        perfplotter = None
        if plot_performance:
            assert self.args.plotdir is not None
            assert not self.args.is_data
            from performanceplotter import PerformancePlotter
            perfplotter = PerformancePlotter(
                self.germline_seqs, self.args.plotdir + '/sw/performance',
                'sw')

        n_processed = 0
        for iproc in range(self.args.n_procs):
            workdir = self.args.workdir
            if self.args.n_procs > 1:
                workdir += '/sw-' + str(iproc)
            outfname = workdir + '/' + base_outfname
            with contextlib.closing(pysam.Samfile(outfname)) as bam:
                grouped = itertools.groupby(iter(bam),
                                            operator.attrgetter('qname'))
                for _, reads in grouped:  # loop over query sequences
                    self.n_total += 1
                    self.process_query(bam, list(reads), perfplotter)
                    n_processed += 1

            if not self.args.no_clean:
                os.remove(outfname)
                if self.args.n_procs > 1:  # still need the top-level workdir
                    os.rmdir(workdir)

        print '  processed %d queries' % n_processed

        if perfplotter is not None:
            perfplotter.plot()
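
The groupby call above merges only adjacent records, which is why it works on aligner output that lists all of a query's matches consecutively. A self-contained sketch of the same pattern, assuming pysam is installed (reads_by_query is a hypothetical helper name):

import contextlib
import itertools
import operator
import pysam

def reads_by_query(bamfname):
    # yield (qname, [reads]) for each query; itertools.groupby only merges
    # *adjacent* records, so the bam must list each query's alignments consecutively
    with contextlib.closing(pysam.Samfile(bamfname)) as bam:
        for qname, reads in itertools.groupby(iter(bam), operator.attrgetter('qname')):
            yield qname, list(reads)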
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir,
                                                  remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs,
                                              self.args.plotdir, 'ihhhmmm')

        self.details = OrderedDict()
        self.failtails = {}
        self.n_partially_failed = 0

        # get sequence info that was passed to ihhhmmm
        self.siminfo = OrderedDict()
        self.sim_need = []  # list of queries that we still need to find
        with opener('r')(self.args.simfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if self.args.queries is not None and line['unique_id'] not in self.args.queries:
                    continue
                self.siminfo[line['unique_id']] = line
                self.sim_need.append(line['unique_id'])
                iline += 1
                if args.n_queries > 0 and iline >= args.n_queries:
                    break

        fostream_names = glob.glob(self.args.indir + '/*.fostream')
        if len(fostream_names) == 0:
            raise Exception('no fostreams found in %s' % args.indir)
        fostream_names.sort()  # maybe already sorted?
        for infname in fostream_names:
            if len(self.sim_need) == 0:
                break

            # try to get whatever you can for the failures
            unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

            with opener('r')(infname) as infile:
                self.parse_file(infile, unique_ids)

        # now check that we got results for all the queries we wanted
        n_failed = 0
        for unique_id in self.siminfo:
            if unique_id not in self.details and unique_id not in self.failtails:
                print '%-20s  no info' % unique_id
                self.perfplotter.add_fail()
                n_failed += 1

        print ''
        print 'partially failed: %d / %d = %.2f' % (self.n_partially_failed, len(self.siminfo),
                                                    float(self.n_partially_failed) / len(self.siminfo))
        print 'failed:           %d / %d = %.2f' % (n_failed, len(self.siminfo),
                                                    float(n_failed) / len(self.siminfo))
        print ''

        self.perfplotter.plot()
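
Note that the summary prints above divide by len(self.siminfo) and would raise ZeroDivisionError on an empty simfile; a guarded formatting helper might look like this (frac_str is our name, not partis'):

def frac_str(n, total):
    # format 'n / total = fraction' without risking division by zero
    frac = float(n) / total if total > 0 else 0.
    return '%d / %d = %.2f' % (n, total, frac)

# e.g.: print 'failed: %s' % frac_str(n_failed, len(self.siminfo))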
#----------------------------
# Example 4
    def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False):
        self.parameter_dir = parameter_dir
        self.args = args
        self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug

        self.input_info = input_info
        # we remove queries from this list when we're satisfied with the current output
        # (in general we may have to rerun some queries with different match/mismatch scores)
        self.remaining_queries = list(self.input_info.keys())
        self.new_indels = 0  # number of new indels that were kicked up this time through

        self.reco_info = reco_info
        self.germline_seqs = germline_seqs
        self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
        if write_parameters:
            self.pcounter = ParameterCounter(self.germline_seqs)
            if not self.args.is_data:
                self.true_pcounter = ParameterCounter(self.germline_seqs)
        if self.args.plot_performance:
            self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw')
        self.info = {}
        self.info['queries'] = []
        self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
        self.info['skipped_unproductive_queries'] = []  # list of unproductive queries
        # self.info['skipped_indel_queries'] = []  # list of queries that had indels
        self.info['skipped_unknown_queries'] = []
        self.info['indels'] = {}
        if self.args.apply_choice_probs_in_sw:
            if self.debug:
                print '  reading gene choice probs from', parameter_dir
            self.gene_choice_probs = utils.read_overall_gene_probs(
                parameter_dir)

        # get location of <begin> cysteine in each v region
        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:
            self.cyst_positions = json.load(json_file)
        # get location of <end> tryptophan in each j region (TGG)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:
            tryp_reader = csv.reader(csv_file)
            # WARNING: this doesn't filter out the header line
            self.tryp_positions = {row[0]: row[1] for row in tryp_reader}

        self.outfile = None
        if self.args.outfname is not None:
            self.outfile = open(self.args.outfname, 'a')

        self.n_unproductive = 0
        self.n_total = 0

        print 'smith-waterman'
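
The j_tryp.csv read above carries a WARNING that the header line isn't filtered; a hedged variant that skips a header row, assuming the header's first column is a non-numeric label such as 'gene' (our assumption, not a confirmed file format):

import csv

def read_tryp_positions(fname):
    # like the j_tryp.csv read above, but skip a header row if present;
    # 'gene' is an assumed header label, not confirmed against partis' data files
    positions = {}
    with open(fname) as csv_file:
        for row in csv.reader(csv_file):
            if row[0] == 'gene':
                continue
            positions[row[0]] = row[1]
    return positions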
#----------------------------
#Get user input
germlineDirectory = raw_input(
    'Enter the path of the germline sequences: ') or 'data/imgt'
originalInputFile = raw_input(
    'Enter the path of the original input file into mixcr: '
) or 'simu-10-leaves-1-mutate.csv'
mixcrOutput = raw_input(
    'Enter the path of the output from mixcr: ') or 'edited_output_file.txt'
mixcrPlotDir = 'mixcrPlotDir'
#----------------------------
#hardcoded default germline sequences
germline_seqs = utils.read_germlines(germlineDirectory)

#create an instance of the performance plotter class
perfplotter = PerformancePlotter(germline_seqs, 'mixcr')

#The true dictionary contains the correct locations taken from the original simulated data file
#The inferred dictionary (iDictionary) will contain the inferences of those locations from Mixcr
trueDictionary = {}
iDictionary = {}
with open(originalInputFile) as inFile1:
    with open(mixcrOutput) as inFile2:
        reader1 = csv.DictReader(inFile1)
        reader2 = csv.DictReader(inFile2, delimiter='\t')
        for row1, row2 in zip(reader1, reader2):
            unique_id = row1['unique_id']
            #print unique_id
            trueDictionary[unique_id] = {}
            trueDictionary[unique_id]['v_gene'] = row1['v_gene']
            trueDictionary[unique_id]['d_gene'] = row1['d_gene']
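
The snippet above cuts off before iDictionary is filled or compared against the truth; a hypothetical next step, once both dictionaries hold gene calls keyed the same way (count_gene_matches and the iDictionary field names are our guesses, not mixcr output):

def count_gene_matches(trueDictionary, iDictionary, region='v_gene'):
    # count queries whose inferred gene call agrees with the simulated truth
    n_right = 0
    for uid, true_line in trueDictionary.items():
        inferred = iDictionary.get(uid, {})
        if inferred.get(region) == true_line[region]:
            n_right += 1
    return n_right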
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir,
                                         'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries is not None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname is not None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        # one directory up from <indir>, which has the detailed per-sequence files
        summarydir = self.args.indir[:self.args.indir.rfind('/')]
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname is not None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname is None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # collect the four paragraphs we want (the ones that start with <unique_id>)
                    position = full_text.find(unique_id)  # skip the first occurrence (the query header)
                    n_collected = 0
                    while n_collected < 4:
                        position = full_text.find(unique_id, position + 1)
                        if position < 0:  # ran out of occurrences
                            break
                        pgraph = full_text[position:full_text.find('\n\n', position + 1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            continue  # doesn't count toward the four paragraphs we want
                        imgtinfo.append(pgraph)  # query seq paragraph
                        n_collected += 1

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(
                    line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs,
                                       self.seqinfo[unique_id],
                                       label='true:')
                utils.print_reco_event(self.germline_seqs,
                                       line,
                                       label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total,
                                        float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total,
                                         float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found,
                                                 float(n_not_found) / (n_not_found + n_found))
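
The two-step suffix stripping above (re.findall followed by str.replace) can be collapsed into one substitution; a sketch of the equivalent call for gene names carrying a single functionality suffix (strip_functionality and the example name are ours):

import re

def strip_functionality(gene_name):
    # remove the first IMGT functionality suffix ('_F' or '_P'), if any;
    # names without a suffix pass through unchanged, so no length check is needed
    return re.sub('_[FP]', '', gene_name, count=1)

# e.g.: strip_functionality('IGHJ4*02_F') -> 'IGHJ4*02'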
#----------------------------
# Example 7
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir,
                                                  remove_N_nukes=True)

        self.perfplotter = PerformancePlotter(self.germline_seqs,
                                              self.args.plotdir, 'igblast')
        self.n_total, self.n_partially_failed = 0, 0

        # get sequence info that was passed to igblast
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.n_max_queries > 0 and iline >= self.args.n_max_queries:
                    break
                iline += 1
                if self.args.queries is not None and int(line['unique_id']) not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[int(line['unique_id'])] = line

        paragraphs = None
        print 'reading', self.args.infname
        info = {}
        with opener('r')(self.args.infname) as infile:
            line = infile.readline()
            # first find the start of the next query's section
            while line.find('<b>Query=') != 0:
                line = infile.readline()
            # then keep going till eof
            iquery = 0
            while line != '':
                if self.args.n_max_queries > 0 and iquery >= self.args.n_max_queries:
                    break
                # first find the query name
                query_name = int(line.split()[1])
                # and collect the lines for this query
                query_lines = []
                line = infile.readline()
                while line.find('<b>Query=') != 0:
                    query_lines.append(line.strip())
                    line = infile.readline()
                    if line == '':
                        break
                iquery += 1
                # then see if we want this query
                if self.args.queries is not None and query_name not in self.args.queries:
                    continue
                if query_name not in self.seqinfo:
                    print 'ERROR %d not in reco info' % query_name
                    sys.exit()
                if self.args.debug:
                    print query_name
                # and finally add the query to <info[query_name]>
                info[query_name] = {'unique_id': query_name}
                self.n_total += 1
                self.process_query(info[query_name], query_name, query_lines)

        self.perfplotter.plot()
        print 'partially failed: %d / %d = %f' % (
            self.n_partially_failed, self.n_total,
            float(self.n_partially_failed) / self.n_total)
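
A standalone sketch of the scanning loop above, which splits an igblast html file into per-query blocks delimited by '<b>Query=' lines (iter_query_sections is our name for it):

def iter_query_sections(infname, marker='<b>Query='):
    # yield (header_line, section_lines) for each block starting with <marker>;
    # mirrors the readline loop above, but as a reusable generator
    with open(infname) as infile:
        line = infile.readline()
        while line != '' and not line.startswith(marker):  # skip any preamble
            line = infile.readline()
        while line != '':
            header = line
            section = []
            line = infile.readline()
            while line != '' and not line.startswith(marker):
                section.append(line.strip())
                line = infile.readline()
            yield header, section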
    def read_hmm_output(self, algorithm, hmm_csv_outfname, make_clusters=True,
                        count_parameters=False, parameter_out_dir=None, plotdir=None):
        print '    read output'
        if count_parameters:
            assert parameter_out_dir is not None
            assert plotdir is not None
        pcounter, true_pcounter = None, None
        if count_parameters:
            pcounter = ParameterCounter(self.germline_seqs)
            if not self.args.is_data:
                true_pcounter = ParameterCounter(self.germline_seqs)
        perfplotter = None
        if self.args.plot_performance:
            perfplotter = PerformancePlotter(self.germline_seqs, plotdir + '/hmm/performance', 'hmm')

        n_processed = 0
        hmminfo = []
        with opener('r')(hmm_csv_outfname) as hmm_csv_outfile:
            reader = csv.DictReader(hmm_csv_outfile)
            last_key = None
            boundary_error_queries = []
            for line in reader:
                utils.intify(line, splitargs=('unique_ids', 'seqs'))
                ids = line['unique_ids']
                this_key = utils.get_key(ids)
                same_event = from_same_event(self.args.is_data, True,
                                             self.reco_info, ids)
                id_str = ''.join(['%20s ' % i for i in ids])

                # check for errors
                if last_key != this_key:  # if this is the first line for this set of ids (i.e. the best viterbi path or only forward score)
                    if line['errors'] is not None and 'boundary' in line['errors'].split(':'):
                        boundary_error_queries.append(':'.join([str(uid) for uid in ids]))
                    else:
                        assert len(line['errors']) == 0

                if algorithm == 'viterbi':
                    line['seq'] = line['seqs'][0]  # add info for the best match as 'seq'
                    line['unique_id'] = ids[0]
                    utils.add_match_info(self.germline_seqs,
                                         line,
                                         self.cyst_positions,
                                         self.tryp_positions,
                                         debug=(self.args.debug > 0))

                    if last_key != this_key or self.args.plot_all_best_events:  # if this is the first line (i.e. the best viterbi path) for this query (or query pair), print the true event
                        n_processed += 1
                        if self.args.debug:
                            print '%s   %d' % (id_str, same_event)
                        if line['cdr3_length'] != -1 or not self.args.skip_unproductive:  # if it's productive, or if we're not skipping unproductive rearrangements
                            hmminfo.append(dict([('unique_id', line['unique_ids'][0])] + line.items()))
                            if pcounter is not None:  # increment counters (but only for the best [first] match)
                                pcounter.increment(line)
                            if true_pcounter is not None:  # increment true counters
                                true_pcounter.increment(self.reco_info[ids[0]])
                            if perfplotter is not None:
                                perfplotter.evaluate(self.reco_info[ids[0]],
                                                     line)

                    if self.args.debug:
                        self.print_hmm_output(
                            line,
                            print_true=(last_key != this_key),
                            perfplotter=perfplotter)
                    line['seq'] = None
                    line['unique_id'] = None

                else:  # for forward, write the pair scores to file to be read by the clusterer
                    if not make_clusters:  # self.args.debug or
                        print '%3d %10.3f    %s' % (
                            same_event, float(line['score']), id_str)
                    if line['score'] == '-nan':
                        print '    WARNING encountered -nan, setting to -999999.0'
                        score = -999999.0
                    else:
                        score = float(line['score'])
                    if len(ids) == 2:
                        hmminfo.append({
                            'id_a': line['unique_ids'][0],
                            'id_b': line['unique_ids'][1],
                            'score': score
                        })
                    n_processed += 1

                last_key = utils.get_key(ids)

        if pcounter is not None:
            pcounter.write(parameter_out_dir)
            if not self.args.no_plot:
                pcounter.plot(plotdir,
                              subset_by_gene=True,
                              cyst_positions=self.cyst_positions,
                              tryp_positions=self.tryp_positions)
        if true_pcounter is not None:
            true_pcounter.write(parameter_out_dir + '/true')
            if not self.args.no_plot:
                true_pcounter.plot(plotdir + '/true',
                                   subset_by_gene=True,
                                   cyst_positions=self.cyst_positions,
                                   tryp_positions=self.tryp_positions)
        if perfplotter is not None:
            perfplotter.plot()

        print '  processed %d queries' % n_processed
        if len(boundary_error_queries) > 0:
            print '    %d boundary errors (%s)' % (
                len(boundary_error_queries), ', '.join(boundary_error_queries))

        return hmminfo
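
The '-nan' guard above matches only that exact string; a slightly more general parse that maps any NaN spelling to the failure value (parse_score is our helper, not part of partis):

def parse_score(score_str, fail_value=-999999.0):
    # NaN is the only float that compares unequal to itself, so the
    # self-comparison below catches 'nan', '-nan', 'NaN', etc.
    try:
        score = float(score_str)
    except ValueError:
        return fail_value
    return fail_value if score != score else score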