Exemple #1
0
def _get_annotated_counts_by_chrom(chrom_to_do):
    """Accumulate counts from annotated CDSs into a metagene profile. Only the longest CDS in each transcript family will be included, and only if it
    meets the minimum number-of-reads requirement. Reads are normalized by gene, so every gene included contributes equally to the final metagene."""
    found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                            where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
                                  % (chrom_to_do, -startnt[0], min_AAlen),
                            columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
        .sort_values('AAlen', ascending=False).drop_duplicates('tfam')  # use the longest annotated CDS in each transcript family
    num_cds_incl = 0  # number of CDSs included from this chromosome
    startprof = np.zeros((len(rdlens), startlen))
    cdsprof = np.zeros((len(rdlens), 3))
    stopprof = np.zeros((len(rdlens), stoplen))
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

    for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
        curr_trans = SegmentChain.from_bed(bedlinedict[tid])
        tlen = curr_trans.get_length()
        if tlen >= tstop + stopnt[1]:  # need to guarantee that the 3' UTR is sufficiently long
            curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
            cdslen = tstop+stopnt[1]-tcoord-startnt[0]  # cds length, plus the extra bases...
            curr_counts = np.zeros((len(rdlens), cdslen))
            for (i, rdlen) in enumerate(rdlens):
                for nmis in range(opts.max5mis+1):
                    curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord+startnt[0]:tstop+stopnt[1]]
                    # curr_counts is limited to the CDS plus any extra requested nucleotides on either side
            if curr_counts.sum() >= opts.mincdsreads:
                curr_counts /= curr_counts.mean()  # normalize by mean of counts across all readlengths and positions within the CDS
                startprof += curr_counts[:, :startlen]
                cdsprof += curr_counts[:, startlen:cdslen-stoplen].reshape((len(rdlens), -1, 3)).mean(1)
                stopprof += curr_counts[:, cdslen-stoplen:cdslen]
                num_cds_incl += 1

    for inbam in inbams:
        inbam.close()

    return startprof, cdsprof, stopprof, num_cds_incl
Exemple #2
0
def _get_annotated_counts_by_chrom(chrom_to_do):
    """Accumulate counts from annotated CDSs into a metagene profile. Only the longest CDS in each transcript family will be included, and only if it
    meets the minimum number-of-reads requirement. Reads are normalized by gene, so every gene included contributes equally to the final metagene."""
    found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                            where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
                                  % (chrom_to_do, -startnt[0], min_AAlen),
                            columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
        .sort_values('AAlen', ascending=False).drop_duplicates('tfam')  # use the longest annotated CDS in each transcript family
    num_cds_incl = 0  # number of CDSs included from this chromosome
    startprof = np.zeros((len(rdlens), startlen))
    cdsprof = np.zeros((len(rdlens), 3))
    stopprof = np.zeros((len(rdlens), stoplen))
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

    for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
        curr_trans = SegmentChain.from_bed(bedlinedict[tid])
        tlen = curr_trans.get_length()
        if tlen >= tstop + stopnt[1]:  # need to guarantee that the 3' UTR is sufficiently long
            curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
            cdslen = tstop+stopnt[1]-tcoord-startnt[0]  # cds length, plus the extra bases...
            curr_counts = np.zeros((len(rdlens), cdslen))
            for (i, rdlen) in enumerate(rdlens):
                for nmis in range(opts.max5mis+1):
                    curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord+startnt[0]:tstop+stopnt[1]]
                    # curr_counts is limited to the CDS plus any extra requested nucleotides on either side
            if curr_counts.sum() >= opts.mincdsreads:
                curr_counts /= curr_counts.mean()  # normalize by mean of counts across all readlengths and positions within the CDS
                startprof += curr_counts[:, :startlen]
                cdsprof += curr_counts[:, startlen:cdslen-stoplen].reshape((len(rdlens), -1, 3)).mean(1)
                stopprof += curr_counts[:, cdslen-stoplen:cdslen]
                num_cds_incl += 1

    for inbam in inbams:
        inbam.close()

    return startprof, cdsprof, stopprof, num_cds_incl
Exemple #3
0
def _regress_tfam(orf_set, gnd):
    """Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles constructed via _orf_profile()
    Also calculates Wald statistics for each orf and start codon, and for each stop codon if opts.startonly is False"""
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFS for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
    #    try:
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount) for start_idxes in
                           [tid_indices[tid][tcoord-1:tcoord+2] for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        indices.append(np.concatenate([nnt*i+curr_indices for i in xrange(len(rdlens))]))
        # need to tile the indices for each read length
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))
    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs] # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums]) for (gcoord, rownums) in gcoord_grps.indices.iteritems()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                                .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
Exemple #4
0
def _regress_tfam(orf_set, gnd):
    """Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles constructed via _orf_profile()
    Also calculates Wald statistics for each orf and start codon, and for each stop codon if opts.startonly is False"""
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFS for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
    #    try:
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount) for start_idxes in
                           [tid_indices[tid][tcoord-1:tcoord+2] for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        indices.append(np.concatenate([nnt*i+curr_indices for i in xrange(len(rdlens))]))
        # need to tile the indices for each read length
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))
    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums]) for (gcoord, rownums) in gcoord_grps.indices.iteritems()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                                .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df