print np.var([i[0] for i in negative]) print np.var([i[1] for i in negative]) print np.var([i[0] for i in positive]) print np.var([i[1] for i in positive]) print "----" clustering = KMeans(n_clusters=4, n_jobs=-2, n_init=12, init='random') clustering.fit(training_data) test_data = pca.transform(test_data) a = clustering.predict(test_data) scores = gf.precision_recall_etc(a, test_category) print scores """ a_positive = a[:150] a_negative = a[150:] print len([i for i in a_positive if i==0]) print len([i for i in a_positive if i==1]) print len([i for i in a_positive if i==2]) print len([i for i in a_positive if i==3]) #print len([i for i in a_positive if i==4]) print "-" print len([i for i in a_negative if i==0]) print len([i for i in a_negative if i==1]) print len([i for i in a_negative if i==2]) print len([i for i in a_negative if i==3])
# Loop over different step sizes to see if "fine tuning" the step size matters
for val in steps_to_try:
    # Define a recursive-feature-elimination selector (8-fold CV) for this step size
    feature_sel = RFECV(estimator_to_use, step=val, cv=8, n_jobs=3, verbose=1)
    # Define a pipeline to feature select and classify the data, then train
    pipeline = Pipeline([('select', feature_sel), ('berno', classifier)])
    pipeline.fit(bagofwords_training, sentiment_training)
    # Predict sentiment of test data
    predict_test = pipeline.predict(bagofwords_test)
    # Calculate performance and number of selected features, then store them
    num_features.append(pipeline.named_steps['select'].n_features_)
    output = general_f.precision_recall_etc(predict_test, sentiment_test)
    precision.append(output['precision'])
    recall.append(output['recall'])
    specificity.append(output['specificity'])
    NPV.append(output['NPV'])
    f1.append(output['f1'])

# Plot each metric as a function of the number of selected features,
# printing the index of the best-scoring run for each metric.
print("precision max: " + str(np.argmax(precision)))
plt.plot(num_features, precision, label='precision')
print("recall max: " + str(np.argmax(recall)))
plt.plot(num_features, recall, label='recall')
print("spec. max: " + str(np.argmax(specificity)))
plt.plot(num_features, specificity, label='spec.')
print("NPV max: " + str(np.argmax(NPV)))
plt.plot(num_features, NPV, label='NPV')
# Read in the training class labels for this data variant
sentiment_training = np.loadtxt('output/out' + ignore + '_classes_' + number + '.txt', unpack=True)
# Read in the bag of words representation of the data
bagofwords_training = general_f.read_bagofwords_dat('output/out' + ignore + '_bag_of_words_' + number + '.csv')
# The same for the test data
sentiment_test = np.loadtxt('output/test' + ignore + '_classes_' + number + '.txt', unpack=True)
bagofwords_test = general_f.read_bagofwords_dat('output/test' + ignore + '_bag_of_words_' + number + '.csv')
# Define and train the random-forest classifier
classifier = RFC(n_estimators=150, criterion='entropy', n_jobs=3, verbose=0).fit(bagofwords_training, sentiment_training)
# Predict sentiment of the test data
predict_test = classifier.predict(bagofwords_test)
# Calculate accuracy and print, along with precision/recall/etc.
print("")
test_percentage = general_f.accuracy_percentage(predict_test, sentiment_test)
print("RFC Test percentage for " + number + " : " + str(test_percentage))
print('other stuff of test data: ')
print(general_f.precision_recall_etc(predict_test, sentiment_test))