Ejemplo n.º 1
0
def get_seqfile_info(fname, is_data, germline_seqs=None, cyst_positions=None, tryp_positions=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ return list of sequence info from files of several types """
    if not is_data:
        assert germline_seqs is not None
        assert cyst_positions is not None
        assert tryp_positions is not None

    if '.csv' in fname:
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.tsv' in fname:
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        ftype = 'fasta' if ('.fasta' in fname or '.fa' in fname) else 'fastq'
        for seq_record in SeqIO.parse(fname, ftype):
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break
    else:
        print 'ERROR unrecognized file format %s' % fname
        assert False

    input_info, reco_info = OrderedDict(), OrderedDict()
    n_queries = 0
    for line in reader:
        utils.intify(line)
        # if command line specified query or reco ids, skip other ones
        if queries is not None and line[name_column] not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[line[name_column]] = {'unique_id':line[name_column], 'seq':line[seq_column]}
        if not is_data:
            reco_info[line['unique_id']] = line
            utils.add_match_info(germline_seqs, line, cyst_positions, tryp_positions)
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        print 'ERROR didn\'t end up pulling any input info out of %s' % fname
        assert False
    return (input_info, reco_info)
Ejemplo n.º 2
0
def get_seqfile_info(fname,
                     is_data,
                     germline_seqs=None,
                     cyst_positions=None,
                     tryp_positions=None,
                     n_max_queries=-1,
                     queries=None,
                     reco_ids=None):
    """ return list of sequence info from files of several types """
    if not is_data:
        assert germline_seqs is not None
        assert cyst_positions is not None
        assert tryp_positions is not None

    if '.csv' in fname:
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.tsv' in fname:
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        ftype = 'fasta' if ('.fasta' in fname or '.fa' in fname) else 'fastq'
        for seq_record in SeqIO.parse(fname, ftype):
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break
    else:
        print 'ERROR unrecognized file format %s' % fname
        assert False

    input_info, reco_info = OrderedDict(), OrderedDict()
    n_queries = 0
    for line in reader:
        utils.intify(line)
        # if command line specified query or reco ids, skip other ones
        if queries is not None and line[name_column] not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[line[name_column]] = {
            'unique_id': line[name_column],
            'seq': line[seq_column]
        }
        if not is_data:
            reco_info[line['unique_id']] = line
            utils.add_match_info(germline_seqs, line, cyst_positions,
                                 tryp_positions)
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        print 'ERROR didn\'t end up pulling any input info out of %s' % fname
        assert False
    return (input_info, reco_info)
    def read_hmm_output(self,
                        algorithm,
                        hmm_csv_outfname,
                        make_clusters=True,
                        count_parameters=False,
                        parameter_out_dir=None,
                        plotdir=None):
        print '    read output'
        if count_parameters:
            assert parameter_out_dir is not None
            assert plotdir is not None
        pcounter = ParameterCounter(
            self.germline_seqs) if count_parameters else None
        true_pcounter = ParameterCounter(self.germline_seqs) if (
            count_parameters and not self.args.is_data) else None
        perfplotter = PerformancePlotter(
            self.germline_seqs, plotdir +
            '/hmm/performance', 'hmm') if self.args.plot_performance else None

        n_processed = 0
        hmminfo = []
        with opener('r')(hmm_csv_outfname) as hmm_csv_outfile:
            reader = csv.DictReader(hmm_csv_outfile)
            last_key = None
            boundary_error_queries = []
            for line in reader:
                utils.intify(line, splitargs=('unique_ids', 'seqs'))
                ids = line['unique_ids']
                this_key = utils.get_key(ids)
                same_event = from_same_event(self.args.is_data, True,
                                             self.reco_info, ids)
                id_str = ''.join(['%20s ' % i for i in ids])

                # check for errors
                if last_key != this_key:  # if this is the first line for this set of ids (i.e. the best viterbi path or only forward score)
                    if line['errors'] != None and 'boundary' in line[
                            'errors'].split(':'):
                        boundary_error_queries.append(':'.join(
                            [str(uid) for uid in ids]))
                    else:
                        assert len(line['errors']) == 0

                if algorithm == 'viterbi':
                    line['seq'] = line['seqs'][
                        0]  # add info for the best match as 'seq'
                    line['unique_id'] = ids[0]
                    utils.add_match_info(self.germline_seqs,
                                         line,
                                         self.cyst_positions,
                                         self.tryp_positions,
                                         debug=(self.args.debug > 0))

                    if last_key != this_key or self.args.plot_all_best_events:  # if this is the first line (i.e. the best viterbi path) for this query (or query pair), print the true event
                        n_processed += 1
                        if self.args.debug:
                            print '%s   %d' % (id_str, same_event)
                        if line['cdr3_length'] != -1 or not self.args.skip_unproductive:  # if it's productive, or if we're not skipping unproductive rearrangements
                            hmminfo.append(
                                dict([
                                    ('unique_id', line['unique_ids'][0]),
                                ] + line.items()))
                            if pcounter is not None:  # increment counters (but only for the best [first] match)
                                pcounter.increment(line)
                            if true_pcounter is not None:  # increment true counters
                                true_pcounter.increment(self.reco_info[ids[0]])
                            if perfplotter is not None:
                                perfplotter.evaluate(self.reco_info[ids[0]],
                                                     line)

                    if self.args.debug:
                        self.print_hmm_output(
                            line,
                            print_true=(last_key != this_key),
                            perfplotter=perfplotter)
                    line['seq'] = None
                    line['unique_id'] = None

                else:  # for forward, write the pair scores to file to be read by the clusterer
                    if not make_clusters:  # self.args.debug or
                        print '%3d %10.3f    %s' % (
                            same_event, float(line['score']), id_str)
                    if line['score'] == '-nan':
                        print '    WARNING encountered -nan, setting to -999999.0'
                        score = -999999.0
                    else:
                        score = float(line['score'])
                    if len(ids) == 2:
                        hmminfo.append({
                            'id_a': line['unique_ids'][0],
                            'id_b': line['unique_ids'][1],
                            'score': score
                        })
                    n_processed += 1

                last_key = utils.get_key(ids)

        if pcounter is not None:
            pcounter.write(parameter_out_dir)
            if not self.args.no_plot:
                pcounter.plot(plotdir,
                              subset_by_gene=True,
                              cyst_positions=self.cyst_positions,
                              tryp_positions=self.tryp_positions)
        if true_pcounter is not None:
            true_pcounter.write(parameter_out_dir + '/true')
            if not self.args.no_plot:
                true_pcounter.plot(plotdir + '/true',
                                   subset_by_gene=True,
                                   cyst_positions=self.cyst_positions,
                                   tryp_positions=self.tryp_positions)
        if perfplotter is not None:
            perfplotter.plot()

        print '  processed %d queries' % n_processed
        if len(boundary_error_queries) > 0:
            print '    %d boundary errors (%s)' % (
                len(boundary_error_queries), ', '.join(boundary_error_queries))

        return hmminfo
Ejemplo n.º 4
0
    def read_hmm_output(self, algorithm, hmm_csv_outfname, make_clusters=True, count_parameters=False, parameter_out_dir=None, plotdir=None):
        print '    read output'
        if count_parameters:
            assert parameter_out_dir is not None
            assert plotdir is not None
        pcounter = ParameterCounter(self.germline_seqs) if count_parameters else None
        true_pcounter = ParameterCounter(self.germline_seqs) if (count_parameters and not self.args.is_data) else None
        perfplotter = PerformancePlotter(self.germline_seqs, plotdir + '/hmm/performance', 'hmm') if self.args.plot_performance else None

        n_processed = 0
        hmminfo = []
        with opener('r')(hmm_csv_outfname) as hmm_csv_outfile:
            reader = csv.DictReader(hmm_csv_outfile)
            last_key = None
            boundary_error_queries = []
            for line in reader:
                utils.intify(line, splitargs=('unique_ids', 'seqs'))
                ids = line['unique_ids']
                this_key = utils.get_key(ids)
                same_event = from_same_event(self.args.is_data, True, self.reco_info, ids)
                id_str = ''.join(['%20s ' % i for i in ids])

                # check for errors
                if last_key != this_key:  # if this is the first line for this set of ids (i.e. the best viterbi path or only forward score)
                    if line['errors'] != None and 'boundary' in line['errors'].split(':'):
                        boundary_error_queries.append(':'.join([str(uid) for uid in ids]))
                    else:
                        assert len(line['errors']) == 0

                if algorithm == 'viterbi':
                    line['seq'] = line['seqs'][0]  # add info for the best match as 'seq'
                    line['unique_id'] = ids[0]
                    utils.add_match_info(self.germline_seqs, line, self.cyst_positions, self.tryp_positions, debug=(self.args.debug > 0))

                    if last_key != this_key or self.args.plot_all_best_events:  # if this is the first line (i.e. the best viterbi path) for this query (or query pair), print the true event
                        n_processed += 1
                        if self.args.debug:
                            print '%s   %d' % (id_str, same_event)
                        if line['cdr3_length'] != -1 or not self.args.skip_unproductive:  # if it's productive, or if we're not skipping unproductive rearrangements
                            hmminfo.append(dict([('unique_id', line['unique_ids'][0]), ] + line.items()))
                            if pcounter is not None:  # increment counters (but only for the best [first] match)
                                pcounter.increment(line)
                            if true_pcounter is not None:  # increment true counters
                                true_pcounter.increment(self.reco_info[ids[0]])
                            if perfplotter is not None:
                                perfplotter.evaluate(self.reco_info[ids[0]], line)

                    if self.args.debug:
                        self.print_hmm_output(line, print_true=(last_key != this_key), perfplotter=perfplotter)
                    line['seq'] = None
                    line['unique_id'] = None

                else:  # for forward, write the pair scores to file to be read by the clusterer
                    if not make_clusters:  # self.args.debug or 
                        print '%3d %10.3f    %s' % (same_event, float(line['score']), id_str)
                    if line['score'] == '-nan':
                        print '    WARNING encountered -nan, setting to -999999.0'
                        score = -999999.0
                    else:
                        score = float(line['score'])
                    if len(ids) == 2:
                        hmminfo.append({'id_a':line['unique_ids'][0], 'id_b':line['unique_ids'][1], 'score':score})
                    n_processed += 1

                last_key = utils.get_key(ids)

        if pcounter is not None:
            pcounter.write(parameter_out_dir)
            if not self.args.no_plot:
                pcounter.plot(plotdir, subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
        if true_pcounter is not None:
            true_pcounter.write(parameter_out_dir + '/true')
            if not self.args.no_plot:
                true_pcounter.plot(plotdir + '/true', subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
        if perfplotter is not None:
            perfplotter.plot()

        print '  processed %d queries' % n_processed
        if len(boundary_error_queries) > 0:
            print '    %d boundary errors (%s)' % (len(boundary_error_queries), ', '.join(boundary_error_queries))

        return hmminfo
Ejemplo n.º 5
0
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ return list of sequence info from files of several types """

    if '.csv' in fname:
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.tsv' in fname:
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        ftype = 'fasta' if ('.fasta' in fname or '.fa' in fname) else 'fastq'
        for seq_record in SeqIO.parse(fname, ftype):
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break
    else:
        raise Exception('unrecognized file format %s' % fname)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    for line in reader:
        if '.csv' in fname and name_column not in line:  # hackey hackey hackey
            name_column = 'name'
            seq_column = 'nucleotide'
        utils.process_input_line(line, int_columns=('v_5p_del', 'd_5p_del', 'cdr3_length', 'j_5p_del', 'j_3p_del', 'd_3p_del', 'v_3p_del'), literal_columns=('indels'))
        unique_id = line[name_column]
        # if command line specified query or reco ids, skip other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[seq_column]}
        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
            reco_info[unique_id] = dict(line)
            if 'indels' in line and line['indels']['reversed_seq'] != '':  # TODO unhackify this
                reco_info[unique_id]['seq'] = line['indels']['reversed_seq']
            if 'indels' not in line:  # TODO unhackify this
                reco_info[unique_id]['indels'] = None
            if glfo is not None:
                utils.add_match_info(glfo, reco_info[unique_id])
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    
    return (input_info, reco_info)