コード例 #1
0
def test_pathway_classifier(numIter=10):
    '''
    Exercise ifr.Pathway_Classifier end to end and print its ten best pathway pairs.
    The first data tuple comes from ifr.load_flu_mat() and the second from
    ifr.load_H1N1_mat() (presumably train and test data respectively -- confirm
    against the Pathway_Classifier constructor).
    @param numIter: Forwarded both to the Pathway_Classifier constructor and to
    computePathwayClassifiers.
    @return: The Pathway_Classifier instance after both the single-pathway and the
    pathway-pair classifiers have been computed.
    '''
    flu_data, flu_labels, _, _ = ifr.load_flu_mat()
    h1n1_data, h1n1_labels, _, _ = ifr.load_H1N1_mat()
    gene_names = ifr.load_gene_ids(short_name_only=True)

    classifier = ifr.Pathway_Classifier((flu_data, flu_labels),
                                        (h1n1_data, h1n1_labels),
                                        feature_names=gene_names,
                                        numIter=numIter)
    classifier.initAnalysis()
    # The return values of the two compute calls are not needed here; the
    # results are read back through getTopPathwayPairs below.
    classifier.computePathwayClassifiers(numIter=numIter)
    classifier.computePathwayPairsClassifiers()

    print("Top 10 pathway pairs:")
    print(classifier.getTopPathwayPairs(10))

    return classifier
コード例 #2
0
def all_pathway_classification():
    '''
    Run pathway_classification once per entry of flu_genelists.PATHWAYS, visiting
    the pathway keys in sorted order.
    @return: The tuple (accListF, accListS, errors) where accListF holds rc[0] of
    each pathway's classifier result, accListS holds the same values formatted as
    percentage strings ("%3.1f"), and errors maps each pathway key to the set of
    misclassified test indexes returned by pathway_classification.
    '''
    train_data, train_labels, _, _ = ifr.load_flu_mat()
    test_data, test_labels, _, _ = ifr.load_H1N1_mat()
    gene_names = ifr.load_gene_ids(short_name_only=True)

    accuracies = []
    accuracy_strings = []
    errors_by_pathway = {}

    # Load the data once and hand it to every call so each pathway is scored
    # against the same matrices.
    for pathway_key in sorted(flu_genelists.PATHWAYS.keys()):
        _title, pathway_genes = flu_genelists.PATHWAYS[pathway_key]
        result, wrong_idxs = pathway_classification(pathway_genes,
                                                    feature_names=gene_names,
                                                    traindat=(train_data, train_labels),
                                                    testdat=(test_data, test_labels),
                                                    return_err_idxs=True)
        test_accuracy = result[0]
        accuracies.append(test_accuracy)
        accuracy_strings.append("%3.1f" % (100.0 * test_accuracy))
        errors_by_pathway[pathway_key] = wrong_idxs

    return accuracies, accuracy_strings, errors_by_pathway
コード例 #3
0
def test_ifr(max_iter=600, plot=True):
    '''
    test ifr iterations using a fixed test data set
    H3N2 data where half of all subjects (not rows) are partitioned
    into the test set.
    @param max_iter: The maximum number of iterations to process. At least one
    iteration is always processed, because the features of the first iteration
    are printed afterwards.
    @param plot: If true, ifrx.plotResults is called once the iterations are done.
    @return: The IFR object after iterating.
    '''
    (D,T,_,S) = ifr.load_flu_mat()
    (Dtrain, Ttrain, Dtest, Ttest, test_ids) = ifr.subject_partition_proportional(D, T, S, frac=0.50)
    print("Test ids: %s, with labels: %s"%(str(test_ids),str(Ttest)))
    genes = ifr.load_gene_ids(short_name_only=True)
    ifrx = IFR( (Dtrain, Ttrain), (Dtest,Ttest), feature_names=genes )
    for i,x in enumerate(ifrx):  #IFR is an iterator, each iteration returns the features at that iteration
        print("Iteration %d: %d Features Selected. %d Remaining."%((i+1),len(x),ifrx.getRemainingCount()))
        # i is 0-based and this iteration has already been computed, so compare
        # the 1-based count; testing i>=max_iter would run max_iter+1 iterations.
        if (i+1)>=max_iter: break
    
    print("Features from first iteration:")
    print(ifrx[0])  #note IFR class supports __getitem__, so results for any iteration processed can be returned
                    # including slices like ifr[0:10] for the first 10 iterations, list of lists returned.
    
    if plot: ifrx.plotResults(titlestr="IFR: H3N2 Fixed 50 Pct Subject Partition")
    return ifrx
コード例 #4
0
def all_pathway_pairs_classification():
    '''
    Classify with the combined gene list of every unordered pair of pathways
    (ids 1 through 12) from flu_genelists.PATHWAYS.
    @return: A dict keyed by the pathway id pair (i,j), i<j. Each value is the
    tuple (combined_accuracy, "title_i + title_j", overlap_frac, max_acc) where
    combined_accuracy is rc[0] of the combined classifier, overlap_frac is the
    number of test errors shared by the two single-pathway classifiers divided
    by 57, and max_acc is the better of the two single-pathway accuracies.
    '''
    pair_results = {}
    (train_data, train_labels, _, _) = ifr.load_flu_mat()
    (test_data, test_labels, _, _) = ifr.load_H1N1_mat()
    gene_names = ifr.load_gene_ids(short_name_only=True)
    pathways = flu_genelists.PATHWAYS

    # accuracy and error set of each pathway used on its own
    (single_acc, _, single_errors) = all_pathway_classification()

    # NOTE(review): 13 hard-codes pathway ids 1..12 -- confirm it matches PATHWAYS.
    for first in range(1, 13):
        for second in range(first + 1, 13):
            # mistakes both single-pathway classifiers make
            shared_errors = single_errors[first].intersection(single_errors[second])
            # NOTE(review): 57 is presumably the number of test samples -- confirm.
            overlap_frac = float(len(shared_errors)) / 57

            # single_acc is ordered by sorted pathway key, hence the -1 offset
            best_single = max(single_acc[first - 1], single_acc[second - 1])

            (title_a, genes_a) = pathways[first]
            (title_b, genes_b) = pathways[second]
            combined = pathway_classification(genes_a + genes_b,
                                              feature_names=gene_names,
                                              traindat=(train_data, train_labels),
                                              testdat=(test_data, test_labels))

            pair_results[(first, second)] = (combined[0],
                                             "%s + %s" % (title_a, title_b),
                                             overlap_frac,
                                             best_single)

    return pair_results
コード例 #5
0
def pathway_classification(genelist, feature_names=None, traindat=None, testdat=None, return_err_idxs=False, 
                           common_scale=False, verbose=True ):
    '''
    Given a genelist from the flu data set, such as a collection of genes belonging to a specific
    pathway, this function computes the accuracy of a standard (L2) SVM classifier
    @param genelist: The list of genes forming the pathway
    @param feature_names: The list of feature (gene) names associated with the columns of test/train data.
    The feature_names must contain the genes in genelist, or a lookup will be performed to find the matching alias.
    If None, then feature names will be the genes from the Duke Influenza data.
    @param traindat: If None, then H3N2 data will be used. Else, specify the tuple (D,L) where D is the
    data matrix (samples in rows) and L is the label vector
    @param testdat: If None, then H1N1 data will be used. Else, specify (D2,L2) tuple for test data.
    @param return_err_idxs: If true, then the indexes in the test set where the classifier is wrong will
    be returned.
    @param common_scale: If true, then the test data will be scaled using the training data mean/std, else
    it will be scaled using its own mean/std.
    @param verbose: If true, more output will be displayed. All diagnostic messages about alias
    substitution and skipped genes are shown only when this is true.
    @return: Either returns rc or rc, err_set, where rc is the return from the svm engine, which is the tuple
    (test_accuracy, factors, clf, train_accuracy). See L{ifr.svm_engine}.
    NOTE(review): when return_err_idxs is true the code compares rc[4] against the test labels, so the
    svm engine evidently returns a fifth element (presumably the test predictions) -- confirm.
    @note: Genes found to have no usable name are appended to the module-level PC_KNOWN_BAD, and
    discovered aliases are stored in the module-level PC_KNOWN_ALIASES, so later calls can skip the lookup.
    '''
    global PC_KNOWN_BAD
    global PC_KNOWN_ALIASES
    
    if traindat is None:
        (D,L,_,_) = ifr.load_flu_mat()
    else:
        (D,L) = traindat
        
    if testdat is None:
        (D2,L2,_,_) = ifr.load_H1N1_mat()
    else:
        (D2,L2) = testdat
        
    G = feature_names if (feature_names is not None) else ( ifr.load_gene_ids(short_name_only=True) )
    idxs = []  #column indexes in G of the genes that could be resolved

    for x in genelist:
        while x in PC_KNOWN_ALIASES:
            if verbose: print("%s is known alias to %s."%(x, PC_KNOWN_ALIASES[x]))
            x = PC_KNOWN_ALIASES[x]
            #alias could be key to another alias...so go down until we stop finding substitutions
        
        if x in PC_KNOWN_BAD:
            if verbose: print("%s is known bad, skipping gene."%x)
            continue       
        
        try:
            idx = G.index(x)
        except ValueError: 
            #x is probably an alias to a gene name in G
            if verbose:
                print("Gene %s is not a known feature name. Querying for official name..."%x)
            xn = ifr.get_official_name(x, entrezgene=True)
            if (xn is None) or (xn==x):
                if verbose: print("No other official name found, gene will be skipped.")
                PC_KNOWN_BAD.append(x)  #remember the failure so the query is not repeated
                idx = None
            else:
                #gated on verbose like every other diagnostic in this function
                if verbose: print("Substituting %s for %s."%(xn,x))
                PC_KNOWN_ALIASES[x] = xn
                try:
                    idx = G.index(xn)
                except ValueError:
                    idx = None  #happens when official name is found, but still not in list
            
        if idx is not None: idxs.append(idx)

    #train/test using only the columns selected for this gene list
    rc = ifr.svm_engine((D,L), (D2,L2), perm=idxs, common_scale=common_scale, verbose=False,
                        no_normalization=False, loss="L2", penalty="L2", C=1.0)
    
    if return_err_idxs:
        IX = sp.array( range(len(L2)))
        errs = IX[ rc[4] != L2]  #indexes where prediction is not correct
        err_set = set(errs)
        return rc, err_set
    else:
        return rc