# Example #1
# 0
# Quick sanity check: print the variance of each of the two feature columns
# for the negative and positive example sets.  Assumes `negative` and
# `positive` are sequences of (at least) 2-element rows -- presumably
# 2-D PCA projections; TODO confirm against the code that builds them.
print np.var([i[0] for i in negative])
print np.var([i[1] for i in negative])

print np.var([i[0] for i in positive])
print np.var([i[1] for i in positive])

print "----"

# Cluster the training data into 4 groups with k-means.  n_init=12 restarts
# from 12 random centroid initialisations and keeps the best fit;
# n_jobs=-2 is the sklearn convention for "all CPUs but one".
clustering = KMeans(n_clusters=4, n_jobs=-2, n_init=12, init='random')
clustering.fit(training_data)

# Project the test data into the same PCA space the model was trained in,
# then assign each test point to its nearest cluster centroid.
test_data = pca.transform(test_data)
a = clustering.predict(test_data)

# Score the cluster assignments against the true test categories and print.
scores = gf.precision_recall_etc(a, test_category)
print scores
# NOTE(review): the triple-quote below opens a string literal that is never
# closed within this view -- the lines after it look like commented-out
# debugging code (per-cluster counts for the first/last 150 predictions).
# Left byte-identical to avoid changing the literal's content.
"""
a_positive = a[:150]
a_negative = a[150:]

print len([i for i in a_positive if i==0])
print len([i for i in a_positive if i==1])
print len([i for i in a_positive if i==2])
print len([i for i in a_positive if i==3])
#print len([i for i in a_positive if i==4])
print "-"
print len([i for i in a_negative if i==0])
print len([i for i in a_negative if i==1])
print len([i for i in a_negative if i==2])
print len([i for i in a_negative if i==3])
# Sweep over candidate RFECV step sizes to see whether "fine tuning" the
# elimination step size actually matters for classification performance.
for step_size in steps_to_try:
    # Recursive feature elimination with 8-fold cross-validation,
    # removing `step_size` features per iteration.
    selector = RFECV(estimator_to_use, step=step_size, cv=8, n_jobs=3, verbose=1)

    # Chain feature selection into the classifier, then fit on training data.
    model = Pipeline([('select', selector), ('berno', classifier)])
    model.fit(bagofwords_training, sentiment_training)

    # Classify the held-out test documents with the fitted pipeline.
    predicted = model.predict(bagofwords_test)

    # Record the surviving feature count plus every performance metric
    # for this step size, so they can be compared/plotted afterwards.
    num_features.append(model.named_steps['select'].n_features_)
    metrics = general_f.precision_recall_etc(predicted, sentiment_test)
    for store, key in ((precision, 'precision'), (recall, 'recall'),
                       (specificity, 'specificity'), (NPV, 'NPV'),
                       (f1, 'f1')):
        store.append(metrics[key])

# Plot out performance as a function of number of features
print "precision max: " + str(np.argmax(precision))
plt.plot(num_features, precision, label='precision')
print "recall max: " + str(np.argmax(recall))
plt.plot(num_features, recall, label='recall')
print "spec. max: " + str(np.argmax(specificity))
plt.plot(num_features, specificity, label='spec.')
print "NPV max: " + str(np.argmax(NPV))
plt.plot(num_features, NPV, label='NPV')
# Example #3
# 0
sentiment_training = np.loadtxt('output/out' + ignore + '_classes_' + number +
                                '.txt',
                                unpack=True)

# Read in the bag of words representation of the data
bagofwords_training = general_f.read_bagofwords_dat('output/out' + ignore +
                                                    '_bag_of_words_' + number +
                                                    '.csv')

# The same for the test data
sentiment_test = np.loadtxt('output/test' + ignore + '_classes_' + number +
                            '.txt',
                            unpack=True)
bagofwords_test = general_f.read_bagofwords_dat('output/test' + ignore +
                                                '_bag_of_words_' + number +
                                                '.csv')

# Define and train the classifier
classifier = RFC(n_estimators=150, criterion='entropy', n_jobs=3,
                 verbose=0).fit(bagofwords_training, sentiment_training)

# Predict sentiment of the test data
predict_test = classifier.predict(bagofwords_test)

# Calculate accuracy and print
print ""
test_percentage = general_f.accuracy_percentage(predict_test, sentiment_test)
print "RFC Test percentage for " + number + " :     " + str(test_percentage)
print 'other stuff of test data: '
print general_f.precision_recall_etc(predict_test, sentiment_test)