Example #1
0
def scorekey_elution(score, elut, recalc_id2inds):
    """
    Build the score matrix for one elution under the scoring key `score`.

    Returns a 3-tuple (score_mat, new_id2inds, new_prots).  The last two are
    None unless `score` is 'pq_euc', 'pq_unfilt_euc' or 'mq_euc'.  Those keys
    load a sibling quantitation file (same path stem as elut.filename, with a
    key-specific ending) and score that instead, so:
      - new_prots is the protein list of the file that was loaded;
      - new_id2inds is recalc_id2inds(those prots), or None when
        recalc_id2inds is None.

    Every key not handled explicitly is taken to name a precalculated score
    stored at elut.filename plus a key-specific suffix.  An unrecognized key
    has no suffix, and the resulting str + int concatenation raises TypeError
    on purpose.
    """
    # Scores computed directly from the elution that was passed in.
    if score == 'apex':
        return ApexScores(elut), None, None
    if score == 'cosine_old':
        return CosineLazyScores(elut), None, None
    if score == 'cosine':
        return CosineLazyNew(elut), None, None
    if score == 'euclidean':
        dists = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                metric='euclidean')
        return dists, None, None

    # Scores computed from a pepquant/maxquant specific elution file.
    quant_endings = {
            'pq_euc': '_pqmsb_filtmsb.tab',
            'pq_unfilt_euc': '_pqmsb.tab',
            'mq_euc': '.mq_Intensity.tab',
            }
    if score in quant_endings:
        stem = os.path.splitext(elut.filename)[0]
        quant_elut = el.load_elution(stem + quant_endings[score])
        remapped = None
        if recalc_id2inds is not None:
            remapped = recalc_id2inds(quant_elut.prots) #cv framework (arrfeats)
        quant_prots = quant_elut.prots
        dists = pdist_score(quant_elut.mat, norm_rows=True, norm_cols=True,
                metric='euclidean')
        return dists, remapped, quant_prots

    # Precalculated scores stored next to the elution file.
    precalc_suffixes = {
            'poisson': '.corr_poisson',
            'wcc': '.T.wcc_width1',
            'euc_poisson': '.corr_euclidean',
            'standard': '.standard', # eg elution/testms1
            }
    # no score: exception since string and int don't add
    suffix = precalc_suffixes.get(score, 0)
    fscore = elut.filename + suffix
    return precalc_scores(fscore), None, None
Example #2
0
def scorekey_elution(score, elut, recalc_id2inds):
    """
    Build the score matrix for one elution under the scoring key `score`.

    Returns a 3-tuple (score_mat, new_id2inds, new_prots).  The last two are
    None unless `score` is 'pq_euc', 'pq_unfilt_euc' or 'mq_euc'.  Those keys
    load a sibling quantitation file (same path stem as elut.filename, with a
    key-specific ending) and score that instead, so:
      - new_prots is the protein list of the file that was loaded;
      - new_id2inds is recalc_id2inds(those prots), or None when
        recalc_id2inds is None.

    Every key not handled explicitly is taken to name a precalculated score
    stored at elut.filename plus a key-specific suffix.  An unrecognized key
    has no suffix, and the resulting str + int concatenation raises TypeError
    on purpose.
    """
    # Scores computed directly from the elution that was passed in.
    if score == 'apex':
        return ApexScores(elut), None, None
    if score == 'cosine_old':
        return CosineLazyScores(elut), None, None
    if score == 'cosine':
        return CosineLazyNew(elut), None, None
    if score == 'euclidean':
        dists = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                metric='euclidean')
        return dists, None, None

    # Scores computed from a pepquant/maxquant specific elution file.
    quant_endings = {
            'pq_euc': '_pqmsb_filtmsb.tab',
            'pq_unfilt_euc': '_pqmsb.tab',
            'mq_euc': '.mq_Intensity.tab',
            }
    if score in quant_endings:
        stem = os.path.splitext(elut.filename)[0]
        quant_elut = el.load_elution(stem + quant_endings[score])
        remapped = None
        if recalc_id2inds is not None:
            remapped = recalc_id2inds(quant_elut.prots) #cv framework (arrfeats)
        quant_prots = quant_elut.prots
        dists = pdist_score(quant_elut.mat, norm_rows=True, norm_cols=True,
                metric='euclidean')
        return dists, remapped, quant_prots

    # Precalculated scores stored next to the elution file.
    precalc_suffixes = {
            'poisson': '.corr_poisson',
            'wcc': '.T.wcc_width1',
            'euc_poisson': '.corr_euclidean',
            'standard': '.standard', # eg elution/testms1
            }
    # no score: exception since string and int don't add
    suffix = precalc_suffixes.get(score, 0)
    fscore = elut.filename + suffix
    return precalc_scores(fscore), None, None
Example #3
0
def plot_sums(fs, shape=None):
    """
    Draw one subplot per elution file in fs showing the column sums of its
    matrix (np.sum over axis 0).

    shape supplies the first two arguments of pl.subplot; when it is falsy
    it is taken from ut.sqrt_shape(len(fs)).  Each subplot is titled with
    ut.shortname of its file.
    """
    import plotting as pl
    if not shape:
        shape = ut.sqrt_shape(len(fs))
    for panel, fname in enumerate(fs, 1):
        elut = el.load_elution(fname)
        pl.subplot(shape[0], shape[1], panel)
        pl.title(ut.shortname(fname))
        col_sums = np.sum(elut.mat, axis=0)
        # col_sums is indexed as 2-D below, so elut.mat is presumably an
        # np.matrix (1 x ncols result) -- TODO confirm against load_elution.
        xs = range(col_sums.shape[1])
        pl.plot(xs, col_sums[0,:].T)
Example #4
0
def elut_gene_maxes(elutfs, geneids):
    """
    For every elution file in elutfs, record the maximum matrix value of each
    gene id from geneids that is present in that file.

    Returns a nested dict {filename: {geneid: max_value}}.  A file in which
    none of the gene ids occur gets no entry at all.
    """
    maxes = {}
    for fname in elutfs:
        elut = el.load_elution(fname)
        # Lookup built by ut.list_inv_to_dict from the protein list; its
        # values are used to index elut.mat.
        row_of = ut.list_inv_to_dict(elut.prots)
        for gid in geneids:
            if gid not in row_of:
                continue
            peak = np.max(elut.mat[row_of[gid]])
            maxes.setdefault(fname, {})[gid] = peak
    return maxes
def check(fasta, protq, do_convert):
    """
    Print how many ids in the elution file protq are keys of the
    protein->gene mapping built from fasta, and how many are keys of its
    inverse (i.e. are already gene ids).

    When do_convert is true and not every id was found among the genes,
    seqs.elut_p2g(protq, p2g) is called to convert the file.
    """
    p2g = seqs.prots2genes(fasta)
    g2p = ut.dict_inverse(p2g)
    fprots = el.load_elution(protq).prots
    print("checking %s" % (ut.shortname(protq),))
    nprotsfound = sum(1 for p in fprots if p in p2g)
    print("proteins: %s of %s" % (nprotsfound, len(fprots)))
    ngenesfound = sum(1 for p in fprots if p in g2p)
    print("genes: %s of %s" % (ngenesfound, len(fprots)))
    if not do_convert:
        return
    if ngenesfound < len(fprots):
        print("converting prots to genes: %s" % (protq,))
        seqs.elut_p2g(protq, p2g)
def msb_filter(proj_dir, msb_out_dir, pq_path):
    """
    Filter the pepquant elution at pq_path against the msblender elution for
    the same project and write the result to a new file.

    The msblender file is expected at
    msb_out_dir/<shortname of proj_dir> + MSB_EXT; its absence trips an
    assertion.  The filtered matrix comes from el.filter_matching_elution
    and is written to pq_path with PQ_CLEAN replaced by PQ_FILT.

    Returns the path of the filtered file.
    """
    msb_name = ut.shortname(proj_dir) + MSB_EXT
    msb_quant_file = os.path.join(msb_out_dir, msb_name)
    assert os.path.exists(msb_quant_file), "No filter elution found: %s" % msb_quant_file
    # Load order matches the original: pepquant first, then msblender.
    pq_elut = el.load_elution(pq_path)
    msb_elut = el.load_elution(msb_quant_file)
    pq_elut.mat = el.filter_matching_elution(pq_elut, msb_elut)
    pq_filt_path = pq_path.replace(PQ_CLEAN, PQ_FILT)
    el.write_elution(pq_elut, pq_filt_path)
    return pq_filt_path
def merge(proj_dir, dirnames, pq_new_path):
    """
    Combine pepquant quantitation from project_1 (etc) PQ_FILE into
    pq_new_path.

    - proj_dir: created if it does not exist yet.
    - dirnames: directories expected to hold a PQ_FILE each; they are ordered
      with sort_numbered before merging.  A directory without a PQ_FILE is
      reported on stdout and left out of the merge.
    - pq_new_path: output file; must not exist already (assertion).

    Raises ValueError when none of the directories contains a PQ_FILE, since
    there is then nothing to merge.
    """
    # reduce is a builtin only on Python 2; functools has it on 2.6+ and 3.
    from functools import reduce
    if not os.path.exists(proj_dir):
        os.mkdir(proj_dir)
    assert not os.path.exists(pq_new_path), "%s exists. Exiting." % pq_new_path
    dirnames = ut.i0(sort_numbered(dirnames))
    # Check each candidate exactly once so the warning and the merge agree
    # on which files are present.
    found = []
    for d in dirnames:
        f = os.path.join(d, PQ_FILE)
        if os.path.exists(f):
            found.append(f)
        else:
            print("No Elution File: %s" % (f,))
    if not found:
        raise ValueError("No elution files found to merge into %s"
                % (pq_new_path,))
    # Generator: only the running merge and one new elution are loaded at a time.
    eluts = (el.load_elution(f) for f in found)
    merged = reduce(el.combine_elutions, eluts)
    el.write_elution(merged, pq_new_path)
def prot_counts(fs, min_count=2):
    """
    Accumulate normalized spectral counts per protein over a set of
    fractionations.

    For each fractionation file, a protein's counts are summed over its row
    and divided by that file's total counts and by the number of files, so
    every fractionation with a nonzero total contributes equally.  A
    fractionation whose total is zero is skipped, since it cannot be
    normalized.

    Only proteins returned by el.all_prots(fs, min_count=min_count) are kept
    (per the original note: proteins with at least min_count counts in one
    fraction of one of the fractionations).

    Return a dict: {prot1:count1, prot2:count2, ...}
    """
    # Set membership keeps the inner loop O(1) whatever all_prots returns.
    allprots = set(el.all_prots(fs, min_count=min_count))
    pcounts = collections.defaultdict(float)
    nfracs = len(fs)
    for f in fs:
        e = el.load_elution(f)
        psums = np.sum(np.array(e.mat),axis=1)
        frac_sum = float(np.sum(psums))
        if frac_sum == 0:
            # Nothing measured here; dividing would give inf/nan or raise.
            continue
        # 1.0 forces true division: with integer counts, Python 2's "1 / n"
        # truncates to 0 and would zero out every contribution.
        norm_term = 1.0 / (frac_sum * nfracs)
        for p,psum in zip(e.prots,psums):
            if p in allprots:
                pcounts[p] += (psum * norm_term)
    return pcounts
Example #9
0
def score_array_multi(arr, sp_base, elut_fs, scores, cutoff, verbose=False,
        remove_multi_base=False, gidscheme=None, allow_singles=True):
    """
    Run score_array on arr for every (elution file, score key) combination.

    - elut_fs: elution filenames; the first two characters of each file's
      shortname are used as the target species code.
    - scores: score keys handed one at a time to score_array.
    - cutoff: passed through to score_array.
    - verbose: print each score key and file as it is processed.
    - remove_multi_base: This is not the method currently used to filter scores
      in cases of orthogroup fan-outs--this is a stricter earlier version. That
      filter is feature.py: filter_multi_orths(), applied after scoring.
    - gidscheme: not implemented; must be left empty (None or '').
    - allow_singles: when false, prots_singles(e) is passed to score_array as
      the set of singles; otherwise an empty set is passed.
    """
    # The declared default is None, so accept any empty value rather than
    # only the empty string.
    assert not gidscheme, "Gidscheme not implemented in scoring."
    current_sp = ''
    if remove_multi_base:
        print("Filtering orths: only single base gene in orthogroups.")
    for f in elut_fs:
        # Load one elution at a time instead of holding all of them at once.
        e = el.load_elution(f)
        sp_target = ut.shortname(f)[:2]
        if sp_target != current_sp: # Just for status output
            print("Starting first %s file: %s" % (sp_target, ut.shortname(f)))
            current_sp = sp_target
        baseid2inds = orth_indices(sp_base, sp_target, e.prots,
                remove_multi_base)
        # singles based on original spec counts
        singles = set([]) if allow_singles else prots_singles(e)
        # Callback handed to score_array so it can rebuild the index mapping
        # for a different protein list.
        recalc_id2inds = lambda prots: orth_indices(sp_base, sp_target, prots,
                remove_multi_base)
        for score in scores:
            if verbose:
                print("%s %s" % (score, f))
            score_array(arr, e, f, score, cutoff, baseid2inds, singles,
                    recalc_id2inds)
Example #10
0
def score_array_multi(arr, sp_base, elut_fs, scores, cutoff, verbose=False,
        remove_multi_base=False, gidscheme=None, allow_singles=True):
    """
    Run score_array on arr for every (elution file, score key) combination.

    - elut_fs: elution filenames; the first two characters of each file's
      shortname are used as the target species code.
    - scores: score keys handed one at a time to score_array.
    - cutoff: passed through to score_array.
    - verbose: print each score key and file as it is processed.
    - remove_multi_base: This is not the method currently used to filter scores
      in cases of orthogroup fan-outs--this is a stricter earlier version. That
      filter is feature.py: filter_multi_orths(), applied after scoring.
    - gidscheme: not implemented; must be left empty (None or '').
    - allow_singles: when false, prots_singles(e) is passed to score_array as
      the set of singles; otherwise an empty set is passed.
    """
    # The declared default is None, so accept any empty value rather than
    # only the empty string.
    assert not gidscheme, "Gidscheme not implemented in scoring."
    current_sp = ''
    if remove_multi_base:
        print("Filtering orths: only single base gene in orthogroups.")
    for f in elut_fs:
        # Load one elution at a time instead of holding all of them at once.
        e = el.load_elution(f)
        sp_target = ut.shortname(f)[:2]
        if sp_target != current_sp: # Just for status output
            print("Starting first %s file: %s" % (sp_target, ut.shortname(f)))
            current_sp = sp_target
        baseid2inds = orth_indices(sp_base, sp_target, e.prots,
                remove_multi_base)
        # singles based on original spec counts
        singles = set([]) if allow_singles else prots_singles(e)
        # Callback handed to score_array so it can rebuild the index mapping
        # for a different protein list.
        recalc_id2inds = lambda prots: orth_indices(sp_base, sp_target, prots,
                remove_multi_base)
        for score in scores:
            if verbose:
                print("%s %s" % (score, f))
            score_array(arr, e, f, score, cutoff, baseid2inds, singles,
                    recalc_id2inds)
Example #11
0
        score_mat, _, new_prots = scorekey_elution(skey, elut, None)
        if new_prots is not None:
            arr_prots = np.array(new_prots)
        rows, cols = np.where(score_mat > thresh)
        p1s, p2s = [arr_prots[ids] for ids in rows, cols]
        pairs =  ut.zip_exact(p1s, p2s)
    return pairs

if __name__ == '__main__':
    # Command line entry point: score one elution file with one method.
    #   argv[1] elution filename
    #   argv[2] scoring method
    #   argv[3] optional integer argument, used as the repeat count by the
    #           poisson-based methods
    nargs = len(sys.argv)
    if nargs < 3:
        sys.exit("usage: python score.py filename method(poisson|dotproduct|corrcoef|cov) [argument]") 
    fname = sys.argv[1]
    method = sys.argv[2]
    methodarg = None if nargs < 4 else int(sys.argv[3])
    elut = el.load_elution(fname)
    if method == 'poisson':
        # A missing (or zero) repeat argument leaves traver_corr's own default.
        corr = traver_corr(elut.mat, repeat=methodarg) if methodarg else \
            traver_corr(elut.mat)
    elif method in ['cosine_poisson','euclidean_poisson']:
        # The metric is the part before '_poisson' whether or not a repeat
        # count was given; previously the no-repeat call passed the full
        # method name as the metric.
        metric = method.split('_')[0]
        corr = poisson_repeat(elut.mat, metric=metric,
                repeat=methodarg) if methodarg else poisson_repeat(elut.mat,
                        metric=metric)
    elif method in ['euclidean']:
        corr = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                metric=method)
    elif method in ['apex']:
        corr = apex_scores_toarray_fast(ApexScores(elut))
    #elif method == 'dotproduct':
        #corr = elut.mat * elut.mat.T
    #elif method == 'corrcoef':
def elut_clean_prots(fin,fout):
    """
    Copy the elution file fin to fout with every '>' character stripped
    from both ends of each protein id.
    """
    elut = el.load_elution(fin)
    cleaned = []
    for name in elut.prots:
        cleaned.append(name.strip('>'))
    elut.prots = cleaned
    el.write_elution(elut, fout)
Example #13
0
        score_mat, _, new_prots = scorekey_elution(skey, elut, None)
        if new_prots is not None:
            arr_prots = np.array(new_prots)
        rows, cols = np.where(score_mat > thresh)
        p1s, p2s = [arr_prots[ids] for ids in rows, cols]
        pairs =  ut.zip_exact(p1s, p2s)
    return pairs

if __name__ == '__main__':
    nargs = len(sys.argv)
    if nargs < 3:
        sys.exit("usage: python score.py filename method(poisson|dotproduct|corrcoef|cov) [argument]") 
    fname = sys.argv[1]
    method = sys.argv[2]
    methodarg = None if nargs < 4 else int(sys.argv[3])
    elut = el.load_elution(fname)
    if method == 'poisson':
        corr = traver_corr(elut.mat, repeat=methodarg) if methodarg else \
            traver_corr(elut.mat)
    elif method in ['cosine_poisson','euclidean_poisson']:
        corr = poisson_repeat(elut.mat, metric=method.split('_')[0],
                repeat=methodarg) if methodarg else poisson_repeat(elut.mat,
                        metric=method)
    elif method in ['euclidean']:
        corr = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                metric=method)
    #elif method == 'dotproduct':
        #corr = elut.mat * elut.mat.T
    #elif method == 'corrcoef':
        #corr = np.corrcoef(elut.mat)
    #elif method == 'cov':