Ejemplo n.º 1
0
# Sweep the chi2 feature-selection percentile (1, 6, ..., 96) and collect the
# mean cross-validated precision obtained for each percentile in `results`.
results = []

for perc in range(1, 100, 5):

    # Per-fold precision scores for the current percentile.
    p = np.empty([numFolds])
    ch2 = SelectPercentile(chi2, percentile=perc)

    # Perform cross-validation over the numFolds folds.
    for i in range(0, numFolds):

        # data_txt preprocessing - TF-IDF with the project tokenizer, then
        # keeping the top `perc` percent of features ranked by chi2.
        vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
        X_train_features = vectorizer.fit_transform(x_train_folds[i])
        # fit_transform() above already fitted the vectorizer, so read the
        # vocabulary directly instead of fitting on the same fold again.
        X_train_features_names = vectorizer.vocabulary_

        X_train_features = ch2.fit_transform(X_train_features,
                                             y_train_folds[i])
        selected_features_names = np.asarray(
            vectorizer.get_feature_names())[ch2.get_support()]

        # The pipeline is fitted on the raw fold text, so the vectorizer and
        # selector are fitted again as part of classifier.fit().
        classifier = Pipeline([('tfidf', vectorizer), ('chi2', ch2),
                               ('clf', OneVsRestClassifier(LinearSVC()))])

        classifier.fit(x_train_folds[i], y_train_folds[i])

        predicted = classifier.predict(x_test_folds[i])

        # Score the fold once and reuse the value for printing and storing.
        fold_precision = metrics.precision_score(y_test_folds[i], predicted)
        # Single-argument print(...) outputs the same text on Python 2 and 3.
        print(fold_precision)
        p[i] = fold_precision
    print(p)
    results = np.append(results, p.mean())
    print("Results")
Ejemplo n.º 2
0
from sklearn.externals import joblib


# Get the data_txt from the DB.
numDimensions = 22
numFolds = 5

X_train = uux_data.getUUXSentences(numDimensions)
y_train = uux_data.getUUXSentenceDimension(numDimensions)
# Binarize the labels into a multi-label indicator matrix.
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)

target_names = uux_data.getUUXDimensions(numDimensions)


# data_txt preprocessing - TF-IDF with the project tokenizer, then keeping the
# top 16 percent of features ranked by chi2.
vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
X_train_features = vectorizer.fit_transform(X_train)
# fit_transform() above already fitted the vectorizer, so read the vocabulary
# directly instead of fitting on the same data a second time.
X_train_features_names = vectorizer.vocabulary_

ch2 = SelectPercentile(chi2, percentile=16)
X_train_features = ch2.fit_transform(X_train_features, y_train_binary)
selected_features_names = np.asarray(vectorizer.get_feature_names())[ch2.get_support()]
# Report how many features survived the selection; the single-argument
# print(...) form outputs the same text on Python 2 and Python 3.
print(len(selected_features_names))

# The full pipeline is fitted on the raw text so the persisted model can be
# applied directly to unprocessed sentences.
classifier = Pipeline([
    ('tfidf', vectorizer),
    ('chi2', ch2),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, y_train_binary)
# NOTE(review): presumably the 'classifier/' directory must already exist
# before dumping - confirm against the deployment layout.
joblib.dump(classifier, 'classifier/uux_classifier.pkl')
    #Concatenate the 4 folds for training
    for j in range(0, numFolds):
        if (i != j):
            fold_x_train = np.concatenate((fold_x_train, folds_X_train[j]))
            fold_y_train = np.concatenate((fold_y_train, folds_y_train[j]))

    fold_x_test = folds_X_train[i]
    fold_y_test = folds_y_train[i]

    #data_txt preprocessing - tokenization, selecting the top 16% of the features by chi2
    vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
    X_train_features = vectorizer.fit_transform(fold_x_train)
    X_train_features_names = vectorizer.fit(fold_x_train).vocabulary_

    ch2 = SelectPercentile(chi2, percentile=16)
    X_train_features = ch2.fit_transform(X_train_features, fold_y_train)
    selected_features_names = np.asarray(
        vectorizer.get_feature_names())[ch2.get_support()]
    print str(len(selected_features_names))

    classifier = Pipeline([('tfidf', vectorizer), ('chi2', ch2),
                           ('clf', OneVsRestClassifier(LinearSVC()))])

    classifier.fit(fold_x_train, fold_y_train)

    predicted = classifier.predict(fold_x_test)

    print classification_report(fold_y_test,
                                predicted,
                                target_names=target_names)
    p[i], r[i], f1[i], s[i] = metrics.precision_recall_fscore_support(