Beispiel #1
0
def _quantify_tfam(orf_set, gnds):
    """Quantify every ORF of one transcript family by non-negative least squares.

    Each ORF is modelled with a simplified profile: the module-level ``cdsprof``
    tiled along the ORF. All read lengths are treated identically. Positions
    around start and stop codons are masked according to the module-level
    ``startmask`` and ``stopmask``.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        ORFs of a single transcript family. The columns 'strand', 'chrom',
        'tid', 'tcoord', 'tstop' and 'AAlen' are read; strand and chromosome
        are taken from the first row only.
    gnds : iterable
        Read-count sources, paired positionally with the module-level
        ``colnames``; each one is passed to ``SegmentChain.get_counts``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``orf_set`` with an added 'nts_quantified' column and one
        column per entry of ``colnames`` holding the fitted coefficients
        (all zeros when no position is usable).
    """
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # Genomic positions covered by each transcript, and by the family as a whole
    tid_genpos = {}
    all_tfam_genpos = set()
    for tid in tids:
        curr_pos_set = SegmentChain.from_bed(bedlinedict[tid]).get_position_set()
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]  # reverse so that index order follows the minus strand
    nnt = len(all_tfam_genpos)

    # For each transcript: row indices (into all_tfam_genpos) of its positions, in transcript order
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.items()}

    # One column per ORF, one row per family position
    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    for (orf_num, (tid, tcoord, tstop, AAlen)) in enumerate(orf_set[['tid', 'tcoord', 'tstop', 'AAlen']].itertuples(False)):
        # presumably tcoord:tstop spans AAlen codons plus the stop codon, hence AAlen + 1 tiles - confirm
        orf_matrix[tid_indices[tid][tcoord:tstop], orf_num] = np.tile(cdsprof, AAlen + 1)
        # max(..., 0) keeps a negative lower bound from wrapping around to the end of the transcript
        ignore_coords.append(tid_indices[tid][max(tcoord+startmask[0], 0):tcoord+startmask[1]])
        ignore_coords.append(tid_indices[tid][max(tstop+stopmask[0], 0):tstop+stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    orf_matrix[ignore_coords, :] = 0  # mask out all positions within the mask region around starts and stops

    # Require at least one valid position, and if >1 ORFs are identical, only include the first of them
    valid_orfs = np.array([(orf_matrix[:, i] > 0).any() and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
                           for i in range(len(orf_set))])
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these ORFs
    valid_nts = (orf_matrix > 0).any(1)  # only bother checking nucleotides where there is a valid ORF

    orf_res = orf_set.copy()
    if not valid_nts.any():
        # Nothing to fit: report zeros everywhere
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
        return orf_res

    orf_matrix = orf_matrix[valid_nts, :]
    valid_nt_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos[valid_nts])))
    orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)  # the number of nucleotides included in the quantification
    for colname, gnd in zip(colnames, gnds):
        # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(),
        # which will collapse all read lengths to a single array
        orf_res[colname] = nnls(orf_matrix, valid_nt_segs.get_counts(gnd))[0]
    return orf_res
Beispiel #2
0
# gene_name_lookup = pd.read_csv(opts.genenames,sep='\t',header=None,names=['tid','tfam']).set_index('tid')['tfam'].to_dict()
else:
    gene_name_lookup = {}

# Re-key every transcript family by a gene name, de-duplicating repeated names
new_tfams = {}
multi_names = defaultdict(lambda: 1)  # number of families seen so far for each repeated gene name
for tfam_val in tfams.values():
    geneset = {gene_name_lookup[tid] for tid in tfam_val[0] if tid in gene_name_lookup}
    if not geneset:
        geneset = set(tfam_val[0])  # if no gene names available, just use the tids themselves
    genename = _choose_name(geneset)
    if genename in new_tfams:
        # Name already taken: append a running counter (_2, _3, ...) to keep keys unique
        multi_names[genename] += 1
        genename = '%s_%d' % (genename, multi_names[genename])
    new_tfams[genename] = tfam_val
for (genename, num_appearances) in multi_names.items():
    sys.stderr.write('WARNING: Gene name %s appears %d independent times\n' % (genename, num_appearances))

if opts.verbose:
    logprint('Saving results')

# One BED line per family, plus a tab-separated tid -> family mapping
with open(outbedname, 'w') as outbed, open(outtxtname, 'w') as outtxt:
    for tfam, (tids, (chrom, strand), genpos) in new_tfams.items():
        outbed.write(SegmentChain(*positionlist_to_segments(chrom, strand, list(genpos)), ID=tfam).as_bed())
        for tid in tids:
            outtxt.write('%s\t%s\n' % (tid, tfam))

if opts.verbose:
    logprint('Tasks complete')
Beispiel #3
0
def _quantify_tfam(orf_set, gnds):
    """Quantify every ORF of one transcript family by non-negative least squares.

    Each ORF is modelled with a simplified profile: the module-level ``cdsprof``
    tiled along the ORF. All read lengths are treated identically. Positions
    around start and stop codons are masked according to the module-level
    ``startmask`` and ``stopmask``.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        ORFs of a single transcript family. The columns 'strand', 'chrom',
        'tid', 'tcoord', 'tstop' and 'AAlen' are read; strand and chromosome
        are taken from the first row only.
    gnds : iterable
        Read-count sources, paired positionally with the module-level
        ``colnames``; each one is passed to ``SegmentChain.get_counts``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``orf_set`` with an added 'nts_quantified' column and one
        column per entry of ``colnames`` holding the fitted coefficients
        (all zeros when no position is usable).
    """
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # Genomic positions covered by each transcript, and by the family as a whole
    tid_genpos = {}
    all_tfam_genpos = set()
    for tid in tids:
        curr_pos_set = SegmentChain.from_bed(
            bedlinedict[tid]).get_position_set()
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        # reverse so that index order follows the minus strand
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)

    # For each transcript: row indices (into all_tfam_genpos) of its
    # positions, in transcript order
    tid_indices = {
        tid: np.flatnonzero(
            np.in1d(all_tfam_genpos, list(curr_tid_genpos),
                    assume_unique=True))
        for (tid, curr_tid_genpos) in tid_genpos.items()
    }

    # One column per ORF, one row per family position
    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    orf_fields = orf_set[['tid', 'tcoord', 'tstop', 'AAlen']]
    for (orf_num, (tid, tcoord, tstop,
                   AAlen)) in enumerate(orf_fields.itertuples(False)):
        curr_indices = tid_indices[tid]
        # presumably tcoord:tstop spans AAlen codons plus the stop codon,
        # hence AAlen + 1 tiles - confirm
        orf_matrix[curr_indices[tcoord:tstop],
                   orf_num] = np.tile(cdsprof, AAlen + 1)
        # max(..., 0) keeps a negative lower bound from wrapping around to
        # the end of the transcript
        ignore_coords.append(
            curr_indices[max(tcoord + startmask[0], 0):tcoord + startmask[1]])
        ignore_coords.append(
            curr_indices[max(tstop + stopmask[0], 0):tstop + stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    # mask out all positions within the mask region around starts and stops
    orf_matrix[ignore_coords, :] = 0

    # require at least one valid position, and if >1 ORFs are identical,
    # only include the first of them
    valid_orfs = np.array([
        (orf_matrix[:, i] > 0).any()
        and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
        for i in range(len(orf_set))
    ])
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these ORFs
    # only bother checking nucleotides where there is a valid ORF
    valid_nts = (orf_matrix > 0).any(1)

    orf_res = orf_set.copy()
    if not valid_nts.any():
        # Nothing to fit: report zeros everywhere
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
        return orf_res

    orf_matrix = orf_matrix[valid_nts, :]
    valid_nt_segs = SegmentChain(*positionlist_to_segments(
        chrom, strand, list(all_tfam_genpos[valid_nts])))
    # the number of nucleotides included in the quantification
    orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)
    for colname, gnd in zip(colnames, gnds):
        # gnd is a HashedReadBAMGenomeArray, but it still works with
        # get_counts(), which will collapse all read lengths to a single array
        orf_res[colname] = nnls(orf_matrix,
                                valid_nt_segs.get_counts(gnd))[0]
    return orf_res
Beispiel #4
0
        for tid in tfam_val[0] if tid in gene_name_lookup
    }
    if not geneset:
        geneset = set(
            tfam_val[0]
        )  # if no gene names available, just use the tids themselves
    genename = _choose_name(geneset)
    if genename in new_tfams:
        multi_names[genename] += 1
        genename = '%s_%d' % (genename, multi_names[genename])
    new_tfams[genename] = tfam_val
# Warn about every gene name that had to be shared between several families
for (genename, num_appearances) in multi_names.items():
    sys.stderr.write('WARNING: Gene name %s appears %d independent times\n' %
                     (genename, num_appearances))

if opts.verbose:
    logprint('Saving results')

# One BED line per family, plus a tab-separated tid -> family mapping
with open(outbedname, 'w') as outbed, open(outtxtname, 'w') as outtxt:
    for tfam, (tids, (chrom, strand), genpos) in new_tfams.items():
        outbed.write(
            SegmentChain(*positionlist_to_segments(chrom, strand,
                                                   list(genpos)),
                         ID=tfam).as_bed())
        for tid in tids:
            outtxt.write('%s\t%s\n' % (tid, tfam))

if opts.verbose:
    logprint('Tasks complete')
Beispiel #5
0
def _regress_tfam(orf_set, gnd):
    """Fit all ORFs of one transcript family by non-negative least squares.

    ORF profiles come from ``_orf_profile()``. In addition to every candidate
    ORF, an "abortive initiation" model (first codon only) is added per start
    position and, unless ``opts.startonly`` is set, a "histop" model (last six
    nucleotides before the stop) per stop position. Wald statistics are
    computed for each ORF and start codon, and for each stop codon if
    ``opts.startonly`` is False.

    Reads the module-level names ``bedlinedict``, ``rdlens``, ``opts``,
    ``startnt``, ``stopnt``, ``stopprof`` and ``failure_return``.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        Candidate ORFs of a single transcript family. The columns 'tfam',
        'strand', 'chrom', 'tid', 'tcoord', 'tstop', 'gcoord', 'gstop',
        'orfname' and 'codon' are read.
    gnd : object
        Read-count source handed to ``get_hashed_counts``.

    Returns
    -------
    ``failure_return`` when nothing can be fitted; otherwise
    ``(orf_strength_df, start_strength_df)`` if ``opts.startonly`` is set, or
    ``(orf_strength_df, start_strength_df, stop_strength_df)`` if it is not.

    Raises
    ------
    AssertionError
        If a profile and its index vector differ in length.
    """
    # Function-scope import: ordered columns on every Python version
    from collections import OrderedDict

    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for tid in tids:
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]  # reverse so that index order follows the minus strand
    nnt = len(all_tfam_genpos)
    # For each transcript: indices (into all_tfam_genpos) of its positions, in transcript order
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.items()}

    # Observed counts: one row per read length (summed over allowed 5' mismatches), flattened to one vector
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFs for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
        # NOTE(review): a start with fewer than three positions in tcoord-1:tcoord+2 (e.g. tcoord == 0)
        # would not broadcast against offsetmat - confirm upstream filtering prevents this
        enough_reads = []
        for (tid, tcoord) in orf_set[['tid', 'tcoord']].itertuples(False):
            start_idxes = tid_indices[tid][tcoord-1:tcoord+2]
            enough_reads.append(counts[start_idxes.repeat(len(rdlens))+offsetmat].sum() >= opts.startcount)
        orf_set = orf_set[enough_reads]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
    # Abortive initiation model: one pseudo-ORF per start position, covering only the first codon
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    # Build one sparse column (profile values + row indices) per model
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        # need to tile the indices for each read length
        indices.append(np.concatenate([nnt*i+curr_indices for i in range(len(rdlens))]))
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')
    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(col_indices) for col_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))

    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    # .copy() so that adding columns below writes to an independent frame
    orf_strength_df = orf_strength_df[usable_orfs].copy()
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    # Homoscedastic covariance estimate (assume equal variance at all positions).
    # A heteroscedastic (Eicker-Huber-White) estimator would be the robust alternative.
    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    # abinit and histop rows were both flagged above with gstop == gcoord
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        start_rows = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
    else:
        start_rows = elongating_orfs
    if not start_rows.any():
        return failure_return  # no need to keep going if there weren't any useful starts
    gcoord_grps = orf_strength_df[start_rows].groupby('gcoord')
    covmat_starts = covmat[np.ix_(start_rows.values, start_rows.values)]
    orf_strs_starts = orf_strs[start_rows.values]
    start_strength_df = pd.DataFrame(OrderedDict([('tfam', tfam),
                                                  ('chrom', orf_set['chrom'].iloc[0]),
                                                  ('strand', orf_set['strand'].iloc[0]),
                                                  ('codon', gcoord_grps['codon'].first()),
                                                  ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))]))
    # groupby .indices are row positions within the filtered frame, matching covmat_starts / orf_strs_starts
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums]) for (gcoord, rownums) in gcoord_grps.indices.items()})

    if opts.startonly:
        return orf_strength_df, start_strength_df

    # count histop towards the stop codon - but still exclude abinit
    include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
    gstop_grps = orf_strength_df[include_stops].groupby('gstop')
    covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
    orf_strs_stops = orf_strs[include_stops.values]
    stop_strength_df = pd.DataFrame(OrderedDict([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))]))
    stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                            .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.items()})

    return orf_strength_df, start_strength_df, stop_strength_df
Beispiel #6
0
def _regress_tfam(orf_set, gnd):
    """Fit all ORFs of one transcript family by non-negative least squares.

    ORF profiles come from ``_orf_profile()``. In addition to every candidate
    ORF, an "abortive initiation" model (first codon only) is added per start
    position and, unless ``opts.startonly`` is set, a "histop" model (last six
    nucleotides before the stop) per stop position. Wald statistics are
    computed for each ORF and start codon, and for each stop codon if
    ``opts.startonly`` is False.

    Reads the module-level names ``bedlinedict``, ``rdlens``, ``opts``,
    ``startnt``, ``stopnt``, ``stopprof`` and ``failure_return``.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        Candidate ORFs of a single transcript family. The columns 'tfam',
        'strand', 'chrom', 'tid', 'tcoord', 'tstop', 'gcoord', 'gstop',
        'orfname' and 'codon' are read.
    gnd : object
        Read-count source handed to ``get_hashed_counts``.

    Returns
    -------
    ``failure_return`` when nothing can be fitted; otherwise
    ``(orf_strength_df, start_strength_df)`` if ``opts.startonly`` is set, or
    ``(orf_strength_df, start_strength_df, stop_strength_df)`` if it is not.

    Raises
    ------
    AssertionError
        If a profile and its index vector differ in length.
    """
    # Function-scope import: ordered columns on every Python version
    from collections import OrderedDict

    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for tid in tids:
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]  # reverse so that index order follows the minus strand
    nnt = len(all_tfam_genpos)
    # For each transcript: indices (into all_tfam_genpos) of its positions, in transcript order
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.items()}

    # Observed counts: one row per read length (summed over allowed 5' mismatches), flattened to one vector
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFs for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
        # NOTE(review): a start with fewer than three positions in tcoord-1:tcoord+2 (e.g. tcoord == 0)
        # would not broadcast against offsetmat - confirm upstream filtering prevents this
        enough_reads = []
        for (tid, tcoord) in orf_set[['tid', 'tcoord']].itertuples(False):
            start_idxes = tid_indices[tid][tcoord-1:tcoord+2]
            enough_reads.append(counts[start_idxes.repeat(len(rdlens))+offsetmat].sum() >= opts.startcount)
        orf_set = orf_set[enough_reads]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
    # Abortive initiation model: one pseudo-ORF per start position, covering only the first codon
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    # Build one sparse column (profile values + row indices) per model
    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        # need to tile the indices for each read length
        indices.append(np.concatenate([nnt*i+curr_indices for i in range(len(rdlens))]))
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')
    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(col_indices) for col_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))

    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    # .copy() so that adding columns below writes to an independent frame
    orf_strength_df = orf_strength_df[usable_orfs].copy()
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    # Homoscedastic covariance estimate (assume equal variance at all positions).
    # A heteroscedastic (Eicker-Huber-White) estimator would be the robust alternative.
    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    # abinit and histop rows were both flagged above with gstop == gcoord
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        start_rows = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
    else:
        start_rows = elongating_orfs
    if not start_rows.any():
        return failure_return  # no need to keep going if there weren't any useful starts
    gcoord_grps = orf_strength_df[start_rows].groupby('gcoord')
    covmat_starts = covmat[np.ix_(start_rows.values, start_rows.values)]
    orf_strs_starts = orf_strs[start_rows.values]
    start_strength_df = pd.DataFrame(OrderedDict([('tfam', tfam),
                                                  ('chrom', orf_set['chrom'].iloc[0]),
                                                  ('strand', orf_set['strand'].iloc[0]),
                                                  ('codon', gcoord_grps['codon'].first()),
                                                  ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))]))
    # groupby .indices are row positions within the filtered frame, matching covmat_starts / orf_strs_starts
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums]) for (gcoord, rownums) in gcoord_grps.indices.items()})

    if opts.startonly:
        return orf_strength_df, start_strength_df

    # count histop towards the stop codon - but still exclude abinit
    include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
    gstop_grps = orf_strength_df[include_stops].groupby('gstop')
    covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
    orf_strs_stops = orf_strs[include_stops.values]
    stop_strength_df = pd.DataFrame(OrderedDict([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))]))
    stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                            .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.items()})

    return orf_strength_df, start_strength_df, stop_strength_df