def main(): test_set = False for name, count, p in pipelines: print "{}. {}".format(count, name) print "\n\n Select a combination or a comma seperated list of combinations. 'a' will select every option" l = raw_input().strip() if l == "a": selections = list(range(len(pipelines))) else: selections = [int(x.strip()) for x in l.split(",")] print "\nEnter the location of the test set file,\nIf empty, no predictions will be output" print "[d=data/ml_dataset_test_in.csv]" file_location = raw_input().strip() test_set = bool(file_location) if file_location == "d": file_location = "data/ml_dataset_test_in.csv" if test_set: test_lines, _ = read_input_file(quick_n_dirty=False, file_name=file_location, test_set=test_set) lines, targets = read_input_file(quick_n_dirty=False) scores = score_classifiers(lines, targets, selections) scores.sort(key=lambda x: -1*x[0]) best_combo = scores[0] print "Best combination is "+ str(best_combo[4]) print "With parameters: " for param in best_combo[3].keys(): print "\t" + param + ": " + str(best_combo[1].best_estimator_.get_params()[param]) if test_set: output_predictions(estimator=best_combo[1].best_estimator_, train_lines=lines, train_targets=targets, test_lines=test_lines)
# Build a shuffled 80/20 train/test split of the quick-and-dirty dataset and
# fit bag-of-words (count and tf-idf) representations on the training half.
from classifiers.naive_bayes import NaiveBayes
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import generate_dataset as gd
import pdb, unittest
import numpy as np
import random
import math

# Fixed seed keeps the shuffled split reproducible across runs.
random.seed(1234)

lines, targets = gd.read_input_file(quick_n_dirty=True)
N = len(lines)

# Shuffle lines and targets in lockstep so pairs stay aligned.
lines_and_targets = zip(lines, targets)
random.shuffle(lines_and_targets)
lines, targets = zip(*lines_and_targets)
targets = np.vstack(targets)

# 80/20 split point (Python 2 integer division).
cut = 8 * N / 10
train_lines = lines[:cut]
train_targets = targets[:cut].ravel()
test_lines = lines[cut:]
test_targets = targets[cut:].ravel()

# Raw term counts: vocabulary fitted on train only, then applied to test.
count_vect = CountVectorizer()
X_count = count_vect.fit_transform(train_lines)
X_count_test = count_vect.transform(test_lines)

# Tf-idf weighted counts, same fit-on-train / transform-test discipline.
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(train_lines)
X_tfidf_test = tfidf_vect.transform(test_lines)
# Collect unigram statistics over the dataset: vocabulary sizes, the globally
# most frequent unigrams, and the top-100 unigrams for each of the 4 classes.
from generate_dataset import read_input_file
from sklearn.feature_extraction.text import CountVectorizer
import pdb
from collections import Counter
import json
import numpy as np

lines, targets = read_input_file(False)

data = {}

# Vocabulary size with no document-frequency cutoff.
u = CountVectorizer()
X = u.fit_transform(lines)
data['num_unigrams'] = X.shape[1]

# Refit requiring document frequency >= 2; the difference counts the
# unigrams that occur in exactly one document.
u = CountVectorizer(min_df=2)
X = u.fit_transform(lines)
data['num_unigrams_df_1'] = data['num_unigrams'] - X.shape[1]

# (word, total count) pairs over the min_df=2 vocabulary, most frequent first.
word_counts = sorted(zip(u.get_feature_names(), X.sum(0).view(np.ndarray).ravel()),
                     key=lambda pair: pair[1], reverse=True)
data['top_100_unigrams'] = word_counts[:100]
word_c_dict = dict(word_counts)

s = set()
for i in range(4):
    # Sum counts over only the documents labelled with class i.
    f = X[(targets == i).ravel(), :].sum(0).view(np.ndarray)
    ranked = sorted(zip(u.get_feature_names(), f.ravel()),
                    key=lambda pair: pair[1], reverse=True)
    top = [word for word, _ in ranked][:100]
    data["top_100_unigrams_for_class_"+str(i)] = top
    s = s | set(top)

# Narrow s down to the unigrams common to every class's top-100 list.
for i in range(4):
    s = s & set(data['top_100_unigrams_for_class_'+str(i)])