def svm_engine(Train, Test, perm=None, common_scale=True, verbose=False, no_normalization=False,
               loss='l2', penalty='l1', dual=False, C=0.5, tol=0.0001, **kwargs):
    '''
    SVM randomized feature extraction engine.
    Internal function that performs the core sparse linear SVC fitting of
    the data.
    @param Train: A tuple (D,L) providing the data and labels for the training
    data, for example, as loaded by the ifr.load_flu_mat function.
    @param Test: A tuple (D,L) for the test data.
    @param perm: The random factor/column permutation of the data. None means
    no permutation. Otherwise this is a permuted list of column indexes. You
    can also use this parameter to perform non-permuted subset selection. Just
    provide a list of column indexes to include.
    @param common_scale: If True, then the test data will be normalized using the centers/scales
    of the training data (i.e. "common scale"). This is common practice in machine learning:
    project the test samples onto the normalized coordinates of the training data before applying
    the model. This breaks, however, when training on H3N2 microarray data and testing on H1N1,
    because the collection procedures produce expression data on very different scales. In that
    case it is better to normalize each set separately and use the self-normalized test vectors
    with the trained model.
    @param verbose: If True, incremental output to the console will be generated. Otherwise,
    the results will be returned silently.
    @param no_normalization: If True, then the built-in normalizing done by this function will
    be skipped. This is appropriate if the data being provided is already normalized.
    @return: A tuple (test_accuracy, factors, clf, train_accuracy, preds) where test_accuracy is
    the accuracy on the test data, factors are the columns with non-zero coefficients used
    in the classifier, clf is the trained classifier, train_accuracy is the accuracy
    on the training data, and preds are the predicted labels for the test samples.
    
    @note: The other parameters (loss, penalty, dual, C, tol) are passed through to the
    scikit-learn LinearSVC classifier. Please see the LinearSVC documentation for definitions.
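    
    @note: Example usage (a minimal sketch; the variable names and the 500-column
    subset shown here are illustrative assumptions):
        Train = (D_trn, L_trn)   #training data matrix and labels
        Test  = (D_tst, L_tst)   #test data matrix and labels
        (acc, factors, clf, trn_acc, preds) = svm_engine(Train, Test, verbose=True)
        #fit using only the first 500 columns via the perm parameter
        (acc, factors, clf, trn_acc, preds) = svm_engine(Train, Test, perm=range(500))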
    '''
    (D,L) = Train
    (D2, L2) = Test
    
    if perm is not None:
        D = D[:,perm]
        D2 = D2[:,perm]
    
    if no_normalization:
        Xtrn = D
        Xtst = D2
    else:
        #normalize the training data, mean-center unit-deviation
        (Xtrn,means,stdvs) = ifr.normalize_data(D)
        
        if common_scale:
            #use the training normalization means/stdvs to apply to testing data
            (Xtst,_,_) = ifr.normalize_data(D2, means, stdvs)
        else:
            #use the test statistics to normalize the test data separately from the training statistics
            (Xtst,_,_) = ifr.normalize_data(D2)
       
    
    #the default value of C was determined by a parameter search on the training data
    # using leave-one-subject-out...see the svm_find_best_param function
    # it seems to work well even for very different numbers of columns in the data
    clf = svm.LinearSVC(loss=loss, penalty=penalty, dual=dual, C=C, tol=tol, **kwargs) #, class_weight='auto')   
    clf.fit(Xtrn,L)
    
    #print "Non-zero coefficients have values:"
    #print clf.raw_coef_[ sp.nonzero(clf.raw_coef_)]
    
    if verbose: print "Predicting training values to measure training error..."
    s1 = clf.score(Xtrn,L)
    
    if verbose:
        print "%d correct predictions on training samples"%( s1*len(L) )
        print "%.3f fraction correct of %d samples"%( s1, len(L) )
    
    if verbose: print "Predicting test values"
    s2 = clf.score(Xtst, L2)
    preds = clf.predict(Xtst)
    if verbose:
        print "%d correct predictions on testing samples"%(s2*len(L2))
        print "%.3f fraction correct of %d samples"%( s2, len(L2))
    
    x = list(clf.coef_[0,:])
    factors = sp.nonzero(x)[0]
    if perm is not None:
        if isinstance(perm, list):
            perm = sp.array(perm) #make a scipy array
        factors = perm[factors]
    if verbose: print "%d gene indexes used in model:"%len(factors)
    if verbose: print factors
    
    return (s2, factors, clf, s1, preds)
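

#A minimal sketch of driving svm_engine for randomized feature extraction: fit on many
# random column subsets and count how often each column ends up with a non-zero coefficient.
# The helper name and the n_iters/subset_size defaults are illustrative assumptions.
def _random_subset_feature_counts(Train, Test, n_iters=10, subset_size=500, **svm_kwargs):
    import collections
    import numpy as np
    counts = collections.Counter()
    n_cols = Train[0].shape[1]
    for _ in range(n_iters):
        #random subset of columns, passed through svm_engine's perm parameter
        perm = list(np.random.permutation(n_cols)[:subset_size])
        (_, factors, _, _, _) = svm_engine(Train, Test, perm=perm, **svm_kwargs)
        counts.update(int(f) for f in factors)
    return counts
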
def gene_expression_boxplots(D, L, genelist, idxlist, zscore=False,
                    titlestr='Distribution of Expression Levels for Selected Genes',
                    class_labels=['Class 1','Class 2']):
    '''
    Function to show the univariate separability of a set of genes. For each gene in the list,
    two box plots are drawn, one per class. The box plots show the distribution of expression
    levels of the gene for each class, which gives an indicator of the univariate separability.
    @param D: The data matrix, rows=samples, cols=genes/features
    @param L: Labels, list or vector indicating class membership. Two-class analysis only.
    @param genelist: The text names of the genes
    @param idxlist: The column indexes into D associated with each gene in genelist
    @param zscore: If true, the graph will use normalized values (mean-centered, scaled by std),
    also known as "z-scores".
    @param titlestr: The title for the resulting graph
    @param class_labels: Text labels for the two classes, used for the plot legend.
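    
    @note: Example usage (a minimal sketch; the gene names, column indexes, and class
    labels below are illustrative assumptions):
        pl.figure()
        gene_expression_boxplots(D, L, ['OAS1','IFI27'], [17, 243], zscore=True,
                                 class_labels=['Case','Control'])
        pl.show()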
    '''
    #check to make sure labels are valid...integers for 2 classes, -1 / +1
    assert len(set(L)) == 2
    binary_labels = list(set(L))
    L = sp.asarray(L)  #ensure element-wise label comparisons below also work for plain lists
    #print binary_labels
    
    #get ttest scores
    _, TT = ttest_features(D, L, return_mat=True)
        
    if zscore:
        D,_,_ = ifr.normalize_data(D)
    
    N = len(idxlist)
    #the genelist2 list comprehension below is a complex nesting of commands to handle the small number
    # of genes with names that are too long to conveniently display, such as AFFX-HUMISGF3A/M97935_3_at,
    # in which case "M97935_3_at" will be displayed. Simple names like "OAS1" should be unaffected.
    class1_data = D[:,idxlist][L==binary_labels[1],:]
    odds = range(1,2*N,2)  #class1 boxes will be at the odd ticks
    class2_data = D[:,idxlist][L==binary_labels[0],:]
    evens = range(2, (2*N)+1,2) #class2 boxes will be at the even ticks
    
    genelist2 = [ gg.split("/")[1][:12] if "/" in gg else gg[:12] for gg in genelist]
    pl.hold(True)
    
    #draw boxplots for class1 at the odd ticks
    rc= pl.boxplot( class1_data, sym="b+",patch_artist=True,
                    positions=odds)  #the values for the class binary_labels[1]
    for p in rc['boxes']: p.set_color("b")
    
    #draw boxplots for class2 at the even ticks
    rc2=pl.boxplot( class2_data, sym="rx",patch_artist=True,
                    positions=evens)  #the values for the class binary_labels[0]
    for p in rc2['boxes']: p.set_color("r")
        
    #draw light vertical lines to group the box plot pairs
    ax = pl.gca()
    yrange = ax.yaxis.get_majorticklocs()
    pl.vlines(sp.array([0]+evens)+0.5, yrange.min(), yrange.max(), color='lightgray', linestyles='solid',
              alpha=0.5)

    #draw the ttest-measure score for each feature at the top
    x_pos_list = sp.array(range(1,2*N+1,2))+0.5
    y_pos = 0.9*yrange.max()
    for i,idx in enumerate(idxlist):
        tscore = TT[idx,0]
        #pval = TT[idx,1]
        pl.text( x_pos_list[i], y_pos, "%2.1f"%tscore, color='blue', ha='center')
    
    #labels, titles, and tick marks
    pl.xlabel("Gene")
    pl.ylabel("Expression Level")
    pl.title(titlestr)
    pl.xticks( x_pos_list, genelist2, rotation='vertical')
    pl.xlim([0,2*N+1])

    #legend
    r1 = pl.Rectangle((0,0),1,1,fc='b')
    r2 = pl.Rectangle((0,0),1,1,fc='r')    
    pl.legend( [r1,r2],class_labels, loc=4)

    pl.subplots_adjust(bottom=0.2, left=0.05, right=0.95)
    pl.draw()