def all_GO_annotation_pairs_classification(d, verbose=True):
    '''
    Using the results of all_GO_annotations_classification, this function computes the
    accuracies of all pairs of the pathway classifiers.
    @param d: The dictionary { pathway_name:gene_list, ... }
    @param verbose: If True, progress information is printed for each pair.
    @return: A results dictionary keyed by (i,j), the combination of pathways indexed by
    i and j, where the values are the tuple:
    (accuracy, pathway_name_i + pathway_name_j, number_of_genes)
    '''
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()
    N = len(d)
    keys = d.keys()
    acc = {}
    for i in range(N):
        for j in range(i+1, N):
            p1 = keys[i]
            p2 = keys[j]
            if verbose:
                print (i, j), "%s + %s" % (p1, p2)
            gl1 = d[p1]
            gl2 = d[p2]
            gl = gl1 + gl2
            rc = pathway_classification(gl, traindat=(D, L), testdat=(D2, L2))
            #store the results
            acc[(i, j)] = (rc[0], "%s + %s" % (p1, p2), len(gl1) + len(gl2))
    return acc

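#Usage sketch (illustration only): ranking the GO annotation pairs by combined accuracy.
#Assumes the ifr data files are available locally; the helper name below is hypothetical
#and not part of the original analysis code.
def _demo_rank_GO_annotation_pairs(depth=5, numIter=30, topk=10):
    _acc_dict, d2 = all_GO_annotations_classification(depth=depth, numIter=numIter)
    pair_acc = all_GO_annotation_pairs_classification(d2, verbose=False)
    #sort the (accuracy, "name1 + name2", total_genes) tuples by accuracy, best first
    ranked = sorted(pair_acc.values(), reverse=True)
    for (a, names, ngenes) in ranked[:topk]:
        print "%3.2f %s (%d genes)" % (a, names, ngenes)
    return ranked
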
def all_GO_annotations_classification(ifra=None, depth=5, numIter=30):
    '''
    Compute the classification accuracies for the sets of genes corresponding to GO
    annotations at a given depth in the ontology.
    @param ifra: An instance of an IFR_Analysis_GO object that has already computed the
    annotations per iteration at a given ontology depth. Set to None to have this function
    create a new one.
    @param depth: Ignored if ifra is not None, in which case ifra.depth is the ontology
    depth. If ifra is None, this is the ontology depth used when creating the new
    analysis object.
    @param numIter: How many iterations of the IFR gene sets should be used in constructing
    the 'pathways' used to build the pathway classifiers.
    @return: (acc_dict, d2), where acc_dict is a dictionary keyed by annotation (pathway)
    name with the classification accuracy as the value; d2 is a dictionary keyed by
    annotation name with values being the list of genes included in that annotation from
    the GO analysis. Only annotations with at least 5 member genes are kept.
    '''
    if ifra is None:
        ifra = ifr.IFR_Analysis_GO(IFR_INFLUENZA_FILE_GENES, ontology_depth=depth)
        ifra.compute_annotations_per_iteration(saveas=None, bf_thresh=0.0,
                                               num_iterations=numIter, homologs=False)
    d = ifra.annotationMembership(numIter=numIter)
    d2 = {k: v for (k, v) in d.items() if len(v) >= 5}
    print "There are %d annotations with at least 5 members." % len(d2)
    acc_dict = {}
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()
    for (k, gl) in d2.items():
        rc = pathway_classification(gl, traindat=(D, L), testdat=(D2, L2))
        acc_dict[k] = rc[0]
        print "%s %3.2f" % (k, rc[0])
    return acc_dict, d2

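#Usage sketch (illustration only): listing the best-scoring individual GO annotations.
#Assumes the ifr data files are available locally; the helper name is hypothetical.
def _demo_top_GO_annotations(depth=5, numIter=30, topk=10):
    acc_dict, _d2 = all_GO_annotations_classification(depth=depth, numIter=numIter)
    #sort annotation names by their test accuracy, best first
    ranked = sorted(acc_dict.items(), key=lambda kv: kv[1], reverse=True)
    for (name, a) in ranked[:topk]:
        print "%3.2f %s" % (a, name)
    return ranked
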
def test_pathway_classifier(numIter=10):
    '''
    Demonstrates the Pathway_Classifier workflow using the H3N2 data for training and the
    H1N1 data for testing, then prints the top 10 pathway pairs.
    '''
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()
    G = ifr.load_gene_ids(short_name_only=True)
    pc = ifr.Pathway_Classifier((D, L), (D2, L2), feature_names=G, numIter=numIter)
    pc.initAnalysis()
    _rc = pc.computePathwayClassifiers(numIter=numIter)
    _rc2 = pc.computePathwayPairsClassifiers()
    print "Top 10 pathway pairs:"
    print pc.getTopPathwayPairs(10)
    return pc

def all_IFR_pairs_classification(ifr_obj, numIter=20):
    '''
    Computes the classification accuracy using all pairs of IFR iterations from the
    flu_genelist SSVM IFR.
    @param ifr_obj: An IFR analysis object assumed to expose get_removed_features(),
    returning the per-iteration lists of removed (selected) genes. Named ifr_obj so it
    does not shadow the ifr module used below.
    @param numIter: The number of IFR iterations to consider when forming pairs.
    @return: (res, test_acc_list), where res is the list of per-pair tuples
    (accuracy, description, acc_i, acc_j, max(acc_i, acc_j)) sorted by accuracy in
    descending order, and test_acc_list holds the single-iteration test accuracies.
    '''
    acc = {}  #the classifier accuracy combining iteration i with j
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()
    removed = ifr_obj.get_removed_features()
    #compute L2 SVM test accuracies for all iterations up to numIter
    print "Computing L2 SVM test accuracies for each IFR iteration."
    test_acc_list = []
    for x in range(numIter):
        glx = removed[x]
        tmp = pathway_classification(glx, traindat=(D, L), testdat=(D2, L2))
        test_acc_list.append(tmp[0])
    print "Computing L2 SVM test accuracies for all pairs of IFR iterations."
    cur = 0
    total = (numIter * (numIter - 1)) / 2  #number of distinct (i,j) pairs
    for i in range(numIter):
        for j in range((i + 1), numIter):
            ifr.print_progress(cur, total)
            cur += 1
            gl1 = removed[i]
            gl2 = removed[j]
            gl = gl1 + gl2
            a1 = test_acc_list[i]
            a2 = test_acc_list[j]
            max_acc = max(a1, a2)
            #compute combined accuracy
            rc = pathway_classification(gl, traindat=(D, L), testdat=(D2, L2))
            #store the results
            acc[(i, j)] = (rc[0], "IFR %d + IFR %d" % (i, j), a1, a2, max_acc)
    res = sorted(acc.values(), reverse=True)
    return res, test_acc_list

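#Usage sketch (illustration only): checking whether combining two IFR iterations beats the
#better of the two on its own. The ifr_obj argument is assumed to be an IFR analysis object
#exposing get_removed_features(); the helper name is hypothetical.
def _demo_IFR_pair_gains(ifr_obj, numIter=20):
    res, _single = all_IFR_pairs_classification(ifr_obj, numIter=numIter)
    for (a, desc, _a1, _a2, max_acc) in res[:10]:
        gain = a - max_acc  #improvement of the pair over the better single iteration
        print "%s: combined %3.2f, best single %3.2f, gain %+.2f" % (desc, a, max_acc, gain)
    return res
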
def all_pathway_classification():
    '''
    Compute classification rates for all pathways in flu_genelists.
    @return: (accListF, accListS, errors), where accListF is the list of test accuracies
    (floats) in sorted-pathway-key order, accListS is the same list formatted as percentage
    strings, and errors is a dictionary mapping each pathway key to the set of test sample
    indexes misclassified by that pathway's classifier.
    '''
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()
    G = ifr.load_gene_ids(short_name_only=True)
    accListF = []
    accListS = []
    errors = {}
    #for each pathway, get the gene list and compute classification power
    for p in sorted(flu_genelists.PATHWAYS.keys()):
        (_pathway_title, genelist) = flu_genelists.PATHWAYS[p]
        (rc, err_set) = pathway_classification(genelist, feature_names=G, traindat=(D, L),
                                               testdat=(D2, L2), return_err_idxs=True)
        accListF.append(rc[0])
        accListS.append("%3.1f" % (100.0 * rc[0]))
        errors[p] = err_set
        #print _pathway_title, rc[0]
    return accListF, accListS, errors

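#Usage sketch (illustration only): printing a small per-pathway accuracy table.
#Assumes flu_genelists.PATHWAYS maps pathway keys to (title, genelist) tuples as used above;
#the helper name is hypothetical.
def _demo_print_pathway_accuracies():
    accListF, accListS, errors = all_pathway_classification()
    for (k, pct) in zip(sorted(flu_genelists.PATHWAYS.keys()), accListS):
        title = flu_genelists.PATHWAYS[k][0]
        print "%-40s %s%% (%d test errors)" % (title, pct, len(errors[k]))
    return accListF
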
def test_ifr(max_iter=600, plot=True):
    '''
    Test IFR iterations using a fixed test data set: H3N2 data where half of all subjects
    (not rows) are partitioned into the test set.
    @param max_iter: The maximum number of iterations to process.
    @param plot: If True, plot the IFR results when finished.
    '''
    (D, T, _, S) = ifr.load_flu_mat()
    (Dtrain, Ttrain, Dtest, Ttest, test_ids) = ifr.subject_partition_proportional(D, T, S, frac=0.50)
    print "Test ids: %s, with labels: %s" % (str(test_ids), str(Ttest))
    genes = ifr.load_gene_ids(short_name_only=True)
    ifrx = IFR((Dtrain, Ttrain), (Dtest, Ttest), feature_names=genes)
    for i, x in enumerate(ifrx):
        #IFR is an iterator; each iteration returns the features selected at that iteration
        print "Iteration %d: %d Features Selected. %d Remaining." % ((i + 1), len(x), ifrx.getRemainingCount())
        if i >= max_iter:
            break
    print "Features from first iteration:"
    print ifrx[0]
    #note: the IFR class supports __getitem__, so results for any processed iteration can be
    # retrieved, including slices like ifrx[0:10] for the first 10 iterations (a list of lists).
    if plot:
        ifrx.plotResults(titlestr="IFR: H3N2 Fixed 50 Pct Subject Partition")
    return ifrx

def all_pathway_pairs_classification():
    '''
    Computes the classification accuracy using all pairs of pathways from the flu_genelists
    SSVM IFR.
    @return: A dictionary keyed by (i,j) pathway index pairs, with values being the tuple
    (combined_accuracy, "pathway_i + pathway_j", error_overlap_fraction,
    max(single_accuracy_i, single_accuracy_j)).
    '''
    acc = {}  #the classifier accuracy combining pathway i with j
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()
    G = ifr.load_gene_ids(short_name_only=True)
    pw_dict = flu_genelists.PATHWAYS
    #get the mistakes that each individual pathway classifier makes
    (accList, _, errors) = all_pathway_classification()
    for i in range(1, 13):  #pathway keys 1..12
        for j in range((i + 1), 13):
            #find error overlap, as a fraction of the test set (hard-coded size of 57)
            e1 = errors[i]
            e2 = errors[j]
            f1 = len(e1.intersection(e2))
            #f2 = len( e1.union(e2) )
            overlap_frac = float(f1) / 57
            #find accuracy of the two single pathway classifiers
            a1 = accList[i - 1]
            a2 = accList[j - 1]
            max_acc = max(a1, a2)
            #compute combined accuracy
            (p1, gl1) = pw_dict[i]
            (p2, gl2) = pw_dict[j]
            gl = gl1 + gl2
            rc = pathway_classification(gl, feature_names=G, traindat=(D, L), testdat=(D2, L2))
            #store the results
            acc[(i, j)] = (rc[0], "%s + %s" % (p1, p2), overlap_frac, max_acc)
    return acc

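#Usage sketch (illustration only): ranking pathway pairs and flagging those whose individual
#classifiers make mostly disjoint errors (low overlap fraction). The helper name and the
#0.1 overlap threshold are hypothetical choices.
def _demo_rank_pathway_pairs(max_overlap=0.1):
    acc = all_pathway_pairs_classification()
    ranked = sorted(acc.values(), reverse=True)
    for (a, names, overlap_frac, max_single) in ranked:
        if overlap_frac <= max_overlap:
            print "%3.2f %s (error overlap %0.2f, best single %3.2f)" % (a, names, overlap_frac, max_single)
    return ranked
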
def pathway_classification(genelist, feature_names=None, traindat=None, testdat=None,
                           return_err_idxs=False, common_scale=False, verbose=True):
    '''
    Given a gene list from the flu data set, such as a collection of genes belonging to a
    specific pathway, this function computes the accuracy of a standard (L2) SVM classifier
    trained on those genes.
    @param genelist: The list of genes forming the pathway.
    @param feature_names: The list of feature (gene) names associated with the columns of the
    test/train data. The feature_names must contain the genes in genelist, or a lookup will be
    performed to find the matching alias. If None, the feature names will be the genes from
    the Duke Influenza data.
    @param traindat: If None, the H3N2 data will be used. Else, specify the tuple (D,L) where
    D is the data matrix (samples in rows) and L is the label vector.
    @param testdat: If None, the H1N1 data will be used. Else, specify the (D2,L2) tuple for
    the test data.
    @param return_err_idxs: If True, the indexes in the test set where the classifier is wrong
    will also be returned.
    @param common_scale: If True, the test data will be scaled using the training data
    mean/std, else it will be scaled using its own mean/std.
    @param verbose: If True, more output will be displayed.
    @return: Either rc or (rc, err_set), where rc is the return from the svm engine, which
    includes (test_accuracy, factors, clf, train_accuracy) and, as used below, the vector of
    test-set predictions at index 4. See L{ifr.svm_engine}.
    '''
    global PC_KNOWN_BAD
    global PC_KNOWN_ALIASES
    if traindat is None:
        (D, L, _, _) = ifr.load_flu_mat()
    else:
        (D, L) = traindat
    if testdat is None:
        (D2, L2, _, _) = ifr.load_H1N1_mat()
    else:
        (D2, L2) = testdat
    G = feature_names if (feature_names is not None) else ifr.load_gene_ids(short_name_only=True)
    idxs = []
    for x in genelist:
        while x in PC_KNOWN_ALIASES:
            if verbose:
                print "%s is known alias to %s." % (x, PC_KNOWN_ALIASES[x])
            #alias could be the key to another alias...follow until no more substitutions are found
            x = PC_KNOWN_ALIASES[x]
        if x in PC_KNOWN_BAD:
            if verbose:
                print "%s is known bad, skipping gene." % x
            continue
        try:
            idx = G.index(x)
        except ValueError:
            #x is probably an alias to a gene name in G
            if verbose:
                print "Gene %s is not a known feature name. Querying for official name..." % x
            xn = ifr.get_official_name(x, entrezgene=True)
            if (xn is None) or (xn == x):
                if verbose:
                    print "No other official name found, gene will be skipped."
                xn = None
                PC_KNOWN_BAD.append(x)
            else:
                print "Substituting %s for %s." % (xn, x)
                PC_KNOWN_ALIASES[x] = xn
            try:
                idx = G.index(xn) if (xn is not None) else None
            except ValueError:
                idx = None  #happens when an official name is found, but it is still not in the list
        if idx is not None:
            idxs.append(idx)
    rc = ifr.svm_engine((D, L), (D2, L2), perm=idxs, common_scale=common_scale, verbose=False,
                        no_normalization=False, loss="L2", penalty="L2", C=1.0)
    if return_err_idxs:
        IX = sp.array(range(len(L2)))
        errs = IX[rc[4] != L2]  #indexes where the prediction is not correct
        err_set = set(errs)
        return rc, err_set
    else:
        return rc

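#Usage sketch (illustration only): scoring an ad-hoc gene list with the default H3N2 training
#data and H1N1 test data. The example gene symbols are chosen purely for illustration and are
#not guaranteed to appear in the feature list; unknown names are resolved or skipped by
#pathway_classification. The helper name is hypothetical.
def _demo_score_gene_list():
    example_genes = ['RSAD2', 'IFI44L', 'OAS1']  #illustrative gene symbols
    rc, err_set = pathway_classification(example_genes, return_err_idxs=True)
    print "Test accuracy: %3.2f with %d misclassified test samples." % (rc[0], len(err_set))
    return rc
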