def svm_engine(Train, Test, perm=None, common_scale=True, verbose=False, no_normalization=False,
               loss='l2', penalty='l1', dual=False, C=0.5, tol=0.0001, **kwargs):
    '''
    SVM randomized feature extraction engine.
    Internal function that performs the "engine" of the sparse linear SVC fitting of the data.
    @param Train: A tuple (D,L) providing the data and labels for the training data, for example,
    as loaded by the ifr.load_flu_mat function.
    @param Test: A tuple (D,L) for the test data.
    @param perm: The random factor/column permutation of the data. None means no permutation.
    Otherwise, this is a permuted list of column indexes. This parameter can also be used to
    perform non-permuted subset selection: just provide a list of column indexes to include.
    @param common_scale: If True, the test data is normalized using the centers/scales of the
    training data (i.e., a "common scale"). This is standard practice in machine learning --
    project the test samples onto the normalized coordinates of the training data before applying
    the model. It breaks down, however, when training on H3N2 microarray data and testing on H1N1,
    because the collection procedures produce expression data on very different scales. In that
    case it is better to normalize each set separately and use the self-normalized test vectors
    with the trained model.
    @param verbose: If True, incremental output is printed to the console. Otherwise, the results
    are returned silently.
    @param no_normalization: If True, the built-in normalization performed by this function is
    skipped. This is appropriate if the data being provided is already normalized.
    @return: A tuple (test_accuracy, factors, clf, train_accuracy, preds) where test_accuracy is
    the accuracy on the test data, factors are the columns with non-zero coefficients in the
    trained classifier, clf is the trained classifier, train_accuracy is the accuracy on the
    training data, and preds are the predicted labels for the test samples.
    @note: The remaining parameters (loss, penalty, dual, C, tol) are passed through to the
    scikit-learn LinearSVC classifier. Please see the LinearSVC documentation for their
    definitions.
    '''
    (D, L) = Train
    (D2, L2) = Test

    if perm is not None:
        D = D[:, perm]
        D2 = D2[:, perm]

    if no_normalization:
        Xtrn = D
        Xtst = D2
    else:
        #normalize the training data: mean-center, unit standard deviation
        (Xtrn, means, stdvs) = ifr.normalize_data(D)
        if common_scale:
            #use the training normalization means/stdvs to normalize the testing data
            (Xtst, _, _) = ifr.normalize_data(D2, means, stdvs)
        else:
            #use the test statistics to normalize the test data separately from the training statistics
            (Xtst, _, _) = ifr.normalize_data(D2)

    #parameter C=1.0 determined by a parameter search on the training data
    # using leave-one-subject-out...see the svm_find_best_param function.
    # It seems to work well even for very different numbers of columns in the data.
    clf = svm.LinearSVC(loss=loss, penalty=penalty, dual=dual, C=C, tol=tol, **kwargs)  #, class_weight='auto')
    clf.fit(Xtrn, L)
    #print "Non-zero coefficients have values:"
    #print clf.raw_coef_[ sp.nonzero(clf.raw_coef_) ]

    if verbose: print "Predicting training values to measure training error..."
    s1 = clf.score(Xtrn, L)
    if verbose:
        print "%d correct predictions on training samples"%( s1*len(L) )
        print "%.3f fraction correct of %d samples"%( s1, len(L) )

    if verbose: print "Predicting test values"
    s2 = clf.score(Xtst, L2)
    preds = clf.predict(Xtst)
    if verbose:
        print "%d correct predictions on testing samples"%( s2*len(L2) )
        print "%.3f fraction correct of %d samples"%( s2, len(L2) )

    #the factors (columns/genes) used by the model are those with non-zero coefficients
    x = list(clf.coef_[0,:])
    factors = sp.nonzero(x)[0]
    if perm is not None:
        if type(perm) is list:
            perm = sp.array(perm)  #make a scipy array
        factors = perm[factors]  #map back to the original (un-permuted) column indexes

    if verbose: print "%d gene indexes used in model:"%len(factors)
    if verbose: print factors

    return (s2, factors, clf, s1, preds)
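
# The following is a minimal usage sketch for svm_engine and is not part of the original
# module. It uses synthetic data purely for illustration; the names Dtrn, Ltrn, Dtst, Ltst
# and the helper _example_svm_engine are hypothetical. It assumes the module-level imports
# (svm, sp, ifr) used by svm_engine are already in place.
def _example_svm_engine():
    import numpy as np
    rng = np.random.RandomState(0)
    Dtrn = rng.randn(40, 200)            #40 training samples, 200 genes/features (synthetic)
    Ltrn = np.array([1]*20 + [-1]*20)    #two-class labels
    Dtst = rng.randn(20, 200)
    Ltst = np.array([1]*10 + [-1]*10)

    #optional column permutation (or subset selection) passed via perm
    perm = list(rng.permutation(200))

    (acc_tst, factors, clf, acc_trn, preds) = svm_engine((Dtrn, Ltrn), (Dtst, Ltst),
                                                         perm=perm, verbose=False)
    print "Train accuracy %.3f, test accuracy %.3f, %d non-zero factors"%(acc_trn, acc_tst, len(factors))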
def gene_expression_boxplots(D, L, genelist, idxlist, zscore=False,
                             titlestr='Distribution of Expression Levels for Selected Genes',
                             class_labels=['Class 1','Class 2']):
    '''
    Function to show the univariate separability of a set of genes. For each gene in the list,
    two box plots are drawn, one per class. The box plots show the distribution of expression
    levels of the gene for each class, which gives an indication of the univariate separability.
    @param D: The data matrix, rows=samples, cols=genes/features
    @param L: Labels, a list or vector indicating class membership. Two-class analysis only.
    @param genelist: The text names of the genes
    @param idxlist: The column indexes into D associated with each gene in genelist
    @param zscore: If True, the graph will use normalized values (mean-centered, scaled by std),
    also known as "z-scores".
    @param titlestr: The title for the resulting graph
    @param class_labels: The text labels for the two classes, used in the legend.
    '''
    #check to make sure labels are valid...exactly two classes, e.g. integers -1 / +1
    assert len(set(L)) == 2
    binary_labels = list(set(L))
    #print binary_labels

    #get ttest scores
    _, TT = ttest_features(D, L, return_mat=True)

    if zscore:
        D,_,_ = ifr.normalize_data(D)

    N = len(idxlist)

    class1_data = D[:,idxlist][L==binary_labels[1],:]
    odds = range(1, 2*N, 2)         #class1 boxes will be at the odd ticks
    class2_data = D[:,idxlist][L==binary_labels[0],:]
    evens = range(2, (2*N)+1, 2)    #class2 boxes will be at the even ticks

    #the following list comprehension handles gene names that are too long to conveniently
    # display, such as AFFX-HUMISGF3A/M97935_3_at, in which case "M97935_3_at" will be
    # displayed. Simple names like "OAS1" are unaffected.
    genelist2 = [ gg.split("/")[1][:12] if "/" in gg else gg[:12] for gg in genelist]

    pl.hold(True)
    #draw boxplots for class1 at the odd ticks
    rc = pl.boxplot( class1_data, sym="b+", patch_artist=True, positions=odds)  #the values for class -1
    for p in rc['boxes']: p.set_color("b")

    #draw boxplots for class2 at the even ticks
    rc2 = pl.boxplot( class2_data, sym="rx", patch_artist=True, positions=evens)  #the values for class +1
    for p in rc2['boxes']: p.set_color("r")

    #draw light vertical lines to group the box plot pairs
    ax = pl.gca()
    yrange = ax.yaxis.get_majorticklocs()
    pl.vlines(sp.array([0]+evens)+0.5, yrange.min(), yrange.max(), color='lightgray',
              linestyles='solid', alpha=0.5)

    #draw the ttest-measure score for each feature at the top
    x_pos_list = sp.array(range(1, 2*N+1, 2)) + 0.5
    y_pos = 0.9*yrange.max()
    for i, idx in enumerate(idxlist):
        tscore = TT[idx,0]
        #pval = TT[idx,1]
        pl.text( x_pos_list[i], y_pos, "%2.1f"%tscore, color='blue', ha='center')

    #labels, titles, and tick marks
    pl.xlabel("Gene")
    pl.ylabel("Expression Level")
    pl.title(titlestr)
    pl.xticks( x_pos_list, genelist2, rotation='vertical')
    pl.xlim([0, 2*N+1])

    #legend
    r1 = pl.Rectangle((0,0), 1, 1, fc='b')
    r2 = pl.Rectangle((0,0), 1, 1, fc='r')
    pl.legend( [r1,r2], class_labels, loc=4)

    pl.subplots_adjust(bottom=0.2, left=0.05, right=0.95)
    pl.draw()
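
# A minimal usage sketch for gene_expression_boxplots, not part of the original module.
# The synthetic data and gene names below are hypothetical, as is the helper function name;
# it assumes the module-level imports (pl, sp, ifr) and the ttest_features function used by
# the plotting routine are already in place.
def _example_gene_expression_boxplots():
    import numpy as np
    rng = np.random.RandomState(1)
    D = rng.randn(60, 50)               #60 samples, 50 genes (synthetic)
    L = np.array([1]*30 + [-1]*30)      #two-class labels
    D[L == 1, :5] += 1.5                #give the first five genes some class separation

    genelist = ['GENE%d'%i for i in range(5)]   #hypothetical gene names
    idxlist = range(5)                          #columns of D corresponding to genelist

    gene_expression_boxplots(D, L, genelist, idxlist, zscore=True)
    pl.show()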