Ejemplo n.º 1
0
#number of trigram types

# Build one combined feature vector per sentence: the unigram features,
# followed by the first lBTypes bigram features, followed by the first
# lTTypes trigram features.
# NOTE: the unigram row object itself is appended (no copy is made), so the
# extend() calls below also grow that row in place.
# NOTE(review): indexing XFeatures[i] assumes XFeatures is empty when this
# loop starts -- confirm against the code above this section.
for i in range(Ndata):
    XFeatures.append(XFeaturesUnigrams[i])
    XFeatures[i].extend([XFeaturesBigrams[i][j] for j in range(lBTypes)])
    XFeatures[i].extend([XFeaturesTrigrams[i][k] for k in range(lTTypes)])
# combining of feature vectors finished

#print "Feature Vector of size ", len(Xfeatures), " extracted";

labelIdx = 2
import handleClassLabels
print "Class Label Vector Y Extraction Started"
YLabels = handleClassLabels.extractClassLabels(filename, labelIdx)
print "Class Label Vector Y of size ", len(YLabels), " extracted"

#Setting up scaler for standardisation
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

# Training SVM
from sklearn import svm
from sklearn import linear_model
print "Declaring SVM"
#clf = svm.LinearSVC(); # linearsvc1
clf = svm.LinearSVC(C=2000.0, class_weight='auto', penalty='l1', dual=0)
# linearsvc2
#clf = svm.SVC(cache_size = 1000, class_weight='auto', kernel = 'poly'); # Predicts all as POSITIVE :((
#clf = linear_model.SGDClassifier();  # not tried yet
# Combine the unigram and trigram feature vectors into one vector per
# sentence (unigram features first, then the trigram features).
#
# Python identifiers are case-sensitive. This section previously mixed
# `Xfeatures`/`XFeatures`, `Ndata`/`NData` and `XfeaturesUnigrams`/
# `XFeaturesUnigrams`, which raised NameError as soon as the loop ran.
# All names now use one consistent spelling.
# NOTE(review): assumes the upstream feature lists are named
# XFeaturesUnigrams / XFeaturesTrigrams -- confirm against the extraction
# code above this section.
XFeatures = []
Ndata = len(XFeaturesUnigrams)  # number of sentences in training data
lUTypes = len(XFeaturesUnigrams[0])  # number of unigram types
lTTypes = len(XFeaturesTrigrams[0])  # number of trigram types
for i in range(Ndata):
    # The unigram row itself is appended (no copy), so extending it below
    # also grows XFeaturesUnigrams[i] in place.
    XFeatures.append(XFeaturesUnigrams[i])
    XFeatures[i].extend([XFeaturesTrigrams[i][j] for j in range(lTTypes)])
# combining of feature vectors finished

#print "Feature Vector of size ", len(Xfeatures), " extracted";

labelIdx = 2;
import handleClassLabels;
print "Class Label Vector Y Extraction Started";
YLabels = handleClassLabels.extractClassLabels(filename, labelIdx);
print "Class Label Vector Y of size ", len(YLabels), " extracted";

# Scaler for standardising the features (only created here, not yet fitted).
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

import numpy as np
#from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

# Candidate values for the regularisation parameter C: powers of ten from
# 10^-3 up to 10^3 (seven values), as a float array.
C_range = np.power(10.0, np.arange(-3, 4))

# Parameter grid keyed by the estimator argument name.
param_grid = {'C': C_range}

# Training SVM