Esempio n. 1
0
# Split the test corpus into texts and labels: row[0] is appended to
# test_data and row[1] to test_labels.
# NOTE(review): test_data is not created in this chunk; it is assumed to be
# initialised earlier in the script -- confirm.
test_labels = []
for row in test_corpus:
    test_data.append(row[0])
    test_labels.append(row[1])

# Create the TF-IDF vectorizer.
# Assuming scikit-learn's TfidfVectorizer: min_df = 4 drops terms found in
# fewer than 4 documents, max_df = 0.9 drops terms found in more than 90% of
# the documents.
vectorizer = TfidfVectorizer(min_df = 4, max_df = 0.9)
# Fit the vectorizer on the training texts and turn them into feature vectors
train_vectors = vectorizer.fit_transform(train_data)
# Vectorize the test texts with the already-fitted vectorizer (transform only,
# no refit, so the test data does not influence the learned features)
test_vectors = vectorizer.transform(test_data)

# Perform classification with SVM, kernel = linear
model = svm.SVC(kernel = 'linear')
model.fit(train_vectors, train_labels)
prediction = model.predict(test_vectors)

# Report how the predictions compare with the true test labels
print(classification_report(test_labels, prediction))

# Text Matching / Similarity
# Levenshtein Distance

def levenshtein(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for index2, char2 in enumerate(s2):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            if char1 == char2:
                newDistances.append(distances[index1])
Esempio n. 2
0
# Random Forest:
# Instead of running vectorization, feature selection and classification one
# at a time, chain them in a pipeline so they are fitted and applied together.
randomforest = Pipeline([('vect', vectorizer),
                         ('chi', SelectKBest(chi2, k=1200)),
                         ('clf', RandomForestClassifier(random_state=42))])

# Fit the pipeline and save it in a pickle file for later use.
model = randomforest.fit(X_train, y_train)
with open('RandomForest.pickle', 'wb') as f:
    pickle.dump(model, f)
ytest = np.array(y_test)

# Predict once and reuse the result: running the full pipeline over X_test is
# the expensive step, and both confusion-matrix objects need the same output.
y_pred = model.predict(X_test)

# Evaluating Results:
# Confusion Matrix:
# NOTE(review): cm, ConfusionMatrix and the metric helpers below are defined
# or imported outside this chunk; con_mat feeds the metric helpers, while
# confusion_matrix is the object that is printed and plotted.
con_mat = cm(ytest, y_pred)

confusion_matrix = ConfusionMatrix(ytest, y_pred)
print("Confusion matrix for Random Forest:\n%s" % confusion_matrix)

# Accuracy is needed for both the printed summary and the plot title, so it is
# computed a single time here.
acc = accuracy(con_mat)

print(
    'For Random Forest model: \nPrecision: {0:.3f} \nRecall: {1:.3f} \nf1score: {2:.3f} \nAccuracy: {3:.3f}'
    .format(precision_average(con_mat), recall_average(con_mat),
            f1score(con_mat), acc))

# Confusion matrix plot:
confusion_matrix.plot(normalized=True)
plt.title('Random Forest \nAccuracy:{0:.3f}'.format(acc))
plt.show()