Example #1
0
def main():
    # Assumes `pipelines`, `score_classifiers`, and `output_predictions` come from
    # the surrounding project; `read_input_file` is defined in generate_dataset.
    # List the available pipeline combinations.
    for name, count, p in pipelines:
        print("{}. {}".format(count, name))
    print("\n\nSelect a combination or a comma-separated list of combinations. 'a' will select every option")
    l = input().strip()
    if l == "a":
        selections = list(range(len(pipelines)))
    else:
        selections = [int(x.strip()) for x in l.split(",")]
    print("\nEnter the location of the test set file.\nIf empty, no predictions will be output")
    print("[d=data/ml_dataset_test_in.csv]")
    file_location = input().strip()
    test_set = bool(file_location)
    if file_location == "d":
        file_location = "data/ml_dataset_test_in.csv"
    if test_set:
        test_lines, _ = read_input_file(quick_n_dirty=False, file_name=file_location, test_set=test_set)
    lines, targets = read_input_file(quick_n_dirty=False)
    # Each entry of `scores` appears to be a tuple of
    # (score, fitted grid search, ..., parameter grid, name); sort best-first.
    scores = score_classifiers(lines, targets, selections)
    scores.sort(key=lambda x: -1 * x[0])
    best_combo = scores[0]
    print("Best combination is " + str(best_combo[4]))
    print("With parameters: ")
    for param in best_combo[3]:
        print("\t" + param + ": " + str(best_combo[1].best_estimator_.get_params()[param]))
    if test_set:
        output_predictions(estimator=best_combo[1].best_estimator_, train_lines=lines,
                           train_targets=targets, test_lines=test_lines)
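For reference, the menu loop above unpacks each entry of pipelines as (name, index, pipeline). A minimal hypothetical sketch of that structure, with illustrative names and steps rather than the project's actual configuration:

# Hypothetical shape of `pipelines`, inferred from the unpacking above.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

pipelines = [
    ("counts + multinomial NB", 0,
     Pipeline([("vect", CountVectorizer()), ("clf", MultinomialNB())])),
    ("counts + Bernoulli NB", 1,
     Pipeline([("vect", CountVectorizer()), ("clf", BernoulliNB())])),
]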
Example #2
0
from classifiers.naive_bayes import NaiveBayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import generate_dataset as gd
import unittest
import numpy as np
import random
import math

random.seed(1234)

lines, targets = gd.read_input_file(quick_n_dirty=True)
N = len(lines)
# Shuffle lines and targets together so their pairing is preserved.
lines_and_targets = list(zip(lines, targets))
random.shuffle(lines_and_targets)
lines, targets = zip(*lines_and_targets)
targets = np.vstack(targets)

# 80/20 train/test split (integer division keeps the indices whole).
train_lines = lines[:8 * N // 10]
train_targets = targets[:8 * N // 10].ravel()

test_lines = lines[8 * N // 10:]
test_targets = targets[8 * N // 10:].ravel()
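# The manual shuffle-and-slice above could equivalently use scikit-learn's
# train_test_split (an alternative, not what this file does):
#   from sklearn.model_selection import train_test_split
#   train_lines, test_lines, train_targets, test_targets = train_test_split(
#       lines, targets.ravel(), test_size=0.2, random_state=1234)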

# Bag-of-words count features, fit on the training split only.
count_vect = CountVectorizer()
X_count = count_vect.fit_transform(train_lines)
X_count_test = count_vect.transform(test_lines)

# TF-IDF features, likewise fit on the training split only.
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(train_lines)
X_tfidf_test = tfidf_vect.transform(test_lines)
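The visible part of this script stops after feature extraction; fitting one of the imported classifiers on these matrices is the presumable next step. A minimal sketch, assuming the split and features above (MultinomialNB accepts the sparse count matrix directly):

clf = MultinomialNB().fit(X_count, train_targets)
print("count-feature NB accuracy: {:.3f}".format(clf.score(X_count_test, test_targets)))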
Example #3
0
from generate_dataset import read_input_file
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import json
import numpy as np

lines, targets = read_input_file(False)
targets = np.asarray(targets)  # ensure the per-class boolean masks below work
data = {}

# Size of the full unigram vocabulary.
u = CountVectorizer()
X = u.fit_transform(lines)
data['num_unigrams'] = X.shape[1]

# Vocabulary restricted to document frequency >= 2; the difference counts
# unigrams that occur in exactly one document.
u = CountVectorizer(min_df=2)
X = u.fit_transform(lines)
data['num_unigrams_df_1'] = data['num_unigrams'] - X.shape[1]

# Corpus-wide count for each unigram, most frequent first.
# (In scikit-learn < 1.0 use get_feature_names() instead.)
word_counts = list(zip(u.get_feature_names_out(), X.sum(0).view(np.ndarray).ravel()))
word_counts.sort(key=lambda x: -x[1])
data['top_100_unigrams'] = word_counts[:100]
word_c_dict = dict(word_counts)

# Top 100 unigrams for each of the four classes, and the union across classes.
s = set()
for i in range(4):
    f = X[(targets == i).ravel(), :].sum(0).view(np.ndarray)
    wc = list(zip(u.get_feature_names_out(), f.ravel()))
    wc.sort(key=lambda x: -x[1])
    wc = [k[0] for k in wc]
    data["top_100_unigrams_for_class_" + str(i)] = wc[:100]
    s = s | set(wc[:100])
# Intersect the union with each class's top 100, leaving the unigrams
# that rank in the top 100 for every class.
for i in range(4):
    s = s & set(data['top_100_unigrams_for_class_' + str(i)])
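json is imported and data accumulates all of the statistics, so the script presumably ends by writing them out; a hedged sketch (the filename and the key used for the intersection set are guesses):

# Hypothetical closing step: persist the statistics. numpy integers are not
# JSON-serializable, so coerce them via default=int.
data['unigrams_in_every_class_top_100'] = sorted(s)
with open('dataset_stats.json', 'w') as f:
    json.dump(data, f, indent=2, default=int)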