def findUnigrams():
    """Cross-validate a random forest on per-user unigram counts.

    Every file in ``dataPath`` is read line by line; each line is parsed as
    a JSON record. Records whose ``metadata.condition`` is ``"ptsd"`` are
    skipped. For every other record the ``allTokensLemmatized`` tokens are
    passed through ``removeStopWords`` and joined into one space-separated
    document, labelled 1 for ``"control"`` and 0 for anything else.

    The documents are turned into a bag-of-words matrix (at most 5000
    features) and a 100-tree random forest is scored with 10-fold
    cross-validation. The scores are printed; nothing is returned.
    """
    unigrams = []
    conditions = []
    i = 0  # number of files visited, reported at the end

    # iterate through each user's tweets
    for fileName in os.listdir(dataPath):
        i += 1
        # The files are only read, so open them read-only ('r+' would also
        # demand write permission) and let os.path.join supply the separator.
        with open(os.path.join(dataPath, fileName), 'r') as f:
            print("Started Parsing: " + fileName)
            for line in f:
                dataPayload = json.loads(line)
                condition = dataPayload['metadata']['condition']
                if condition == "ptsd":
                    print("PTSD")
                    continue

                # removing stop words from the entire corpus
                userCorpus = removeStopWords(dataPayload['allTokensLemmatized'])
                unigrams.append(" ".join(userCorpus))
                conditions.append(1 if condition == "control" else 0)

    forest = RandomForestClassifier(n_estimators=100)
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=5000,
    )

    train_data_features = vectorizer.fit_transform(unigrams)
    train_data_features = train_data_features.toarray()

    score = cvs(forest, train_data_features, conditions, cv=10)
    print(score)

    # Nothing is written to disk here; the message text is kept unchanged so
    # existing log output stays the same.
    print('done writing unigrams ' + str(i))
def build(args):
    """
    Train, report on and pickle one ridge pipeline per target ('Y1', 'Y2').

    For each target the function:
        1. cross-validates a RidgeCV over a log-spaced alpha grid (12 folds),
        2. refits that RidgeCV on the whole data set to select an alpha,
        3. fits an Imputer -> Ridge pipeline using that alpha on the whole
           data set,
        4. pickles the pipeline to HEAT_MODEL ('Y1') or COLD_MODEL ('Y2'),
        5. prints a short summary.

    ``args`` is currently unused. In a real application it would probably
    carry arguments such as:
        - fixtures (where the training data is)
        - model_dir (where to write the models out to)
        - kfolds (number of cross validation folds)

    Returns a string stating how long the build took.
    """
    t0 = time.time()

    # Load data and set up the alpha search grid
    dataset = load_energy()
    alpha_grid = np.logspace(-10, -2, 200)

    # Destination pickle for each target
    model_paths = {
        'Y1': HEAT_MODEL,
        'Y2': COLD_MODEL,
    }

    summary = (
        "%s trained on %i instances using a %s model\n"
        " average R2 score of %0.3f using an alpha of %0.5f\n"
        " model has been dumped to %s\n"
    )

    scores = {}
    for target_name in ('Y1', 'Y2'):
        # Cross validation only; imputation is deliberately left out here
        ridge_cv = linear_model.RidgeCV(alphas=alpha_grid)
        scores[target_name] = cvs(
            ridge_cv, dataset.data, dataset.target(target_name), cv=12
        )

        # Fit on the entire data set so that alpha_ is available and the
        # pickled model is a plain ridge with that alpha
        ridge_cv.fit(dataset.data, dataset.target(target_name))

        # Final estimator: mean imputation followed by the ridge
        model = linear_model.Ridge(alpha=ridge_cv.alpha_)
        estimator = Pipeline([
            ("imputer", Imputer(missing_values="NaN", strategy="mean", axis=0)),
            ("ridge", model),
        ])
        estimator.fit(dataset.data, dataset.target(target_name))

        # Dump the model
        out_path = model_paths[target_name]
        with open(out_path, 'wb') as f:
            pickle.dump(estimator, f, protocol=pickle.HIGHEST_PROTOCOL)

        print(summary % (
            target_name,
            len(dataset.data),
            type(model).__name__,
            scores[target_name].mean(),
            ridge_cv.alpha_,
            out_path,
        ))

    return "Build took %0.3f seconds" % (time.time() - t0)
def build(args):
    """
    Build and persist the two ridge regression models.

    ``args`` is not used yet. In a real application it would probably hold
    arguments such as:
        - fixtures (where the training data is)
        - model_dir (where to write the models out to)
        - kfolds (number of cross validation folds)

    For now the fitted pipelines are simply pickled to HEAT_MODEL (target
    'Y1') and COLD_MODEL (target 'Y2'), and a summary is printed for each.

    Returns a string with the elapsed build time in seconds.
    """
    began = time.time()

    # Which pickle file each target is written to
    destinations = {'Y1': HEAT_MODEL, 'Y2': COLD_MODEL}

    # Load data and candidate regularisation strengths
    dataset = load_energy()
    candidate_alphas = np.logspace(-10, -2, 200)
    scores = {}

    for label in ('Y1', 'Y2'):
        # 12-fold cross validation of the alpha search; no imputation here
        searcher = linear_model.RidgeCV(alphas=candidate_alphas)
        fold_scores = cvs(searcher, dataset.data, dataset.target(label), cv=12)
        scores[label] = fold_scores

        # Fitting on the full data set exposes the selected alpha, which is
        # what the pickled ridge below is built with
        searcher.fit(dataset.data, dataset.target(label))
        chosen_alpha = searcher.alpha_

        # Impute missing values with the column mean, then apply the ridge
        model = linear_model.Ridge(alpha=chosen_alpha)
        imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
        steps = [("imputer", imputer), ("ridge", model)]
        estimator = Pipeline(steps)
        estimator.fit(dataset.data, dataset.target(label))

        # Dump the model
        with open(destinations[label], 'wb') as f:
            pickle.dump(estimator, f, protocol=pickle.HIGHEST_PROTOCOL)

        msg = (
            "%s trained on %i instances using a %s model\n"
            " average R2 score of %0.3f using an alpha of %0.5f\n"
            " model has been dumped to %s\n"
        )
        details = (
            label,
            len(dataset.data),
            model.__class__.__name__,
            scores[label].mean(),
            searcher.alpha_,
            destinations[label],
        )
        print(msg % details)

    elapsed = time.time() - began
    return "Build took %0.3f seconds" % elapsed
# KNN
knn = knc(p=2)  # specify Euclidean distance

# Odd neighbour counts 1, 3, ..., 29; the same grid serves all three searches.
param_grid = dict(n_neighbors=range(1, 30, 2))
kn_accuracy = gscv(knn, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

# Naive Bayes
nb = mnb()
# Score the same `nb` estimator under every metric so the three results
# describe one model (f1 and roc_auc previously referenced `nbclass`,
# a name this script never assigns).
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
nb_auc = cvs(nb, df_exp, df_res, cv=10, scoring='roc_auc')

# Decision Tree
dtree = tr.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_features=None,
    max_depth=None,  # overridden by the grid search below
    min_samples_split=2,
    min_samples_leaf=2,
    max_leaf_nodes=None,
)
# Candidate tree depths 1 through 22.
param_grid = dict(max_depth=range(1, 23))
dt_accuracy = gscv(dtree, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
# KNN
knn = knc(p=2)  # specify Euclidean distance

# Odd neighbour counts 1, 3, ..., 29; the same grid serves all three searches.
param_grid = dict(n_neighbors=range(1, 30, 2))
kn_accuracy = gscv(knn, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

# Naive Bayes
nb = mnb()
# Score the same `nb` estimator under every metric so the three results
# describe one model (f1 and roc_auc previously referenced `nbclass`,
# a name this script never assigns).
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
nb_auc = cvs(nb, df_exp, df_res, cv=10, scoring='roc_auc')

# Decision Tree
dtree = tr.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_features=None,
    max_depth=None,  # overridden by the grid searches below
    min_samples_split=2,
    min_samples_leaf=2,
    max_leaf_nodes=None,
)
# Candidate tree depths 1 through 22, shared by all three searches.
param_grid = dict(max_depth=range(1, 23))
dt_accuracy = gscv(dtree, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
dt_f1 = gscv(dtree, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)
dt_auc = gscv(dtree, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)
import SVM as CLF
import numpy as np
import matplotlib.pyplot as plt
try:
    from sklearn.cross_validation import cross_val_score as cvs
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # function is provided by sklearn.model_selection.
    from sklearn.model_selection import cross_val_score as cvs
from sklearn.ensemble import AdaBoostClassifier as ABC

# Cleaned feature table, target column and key list from the project's SVM
# helper module.
df, salary, keys = CLF.clean(CLF.get_data())

# Sweep 1: mean cross-validation score for each ensemble size.
estimators = [10, 20, 30, 40, 50, 100, 200, 400]
estimator_scores = []
for estimator in estimators:
    clf = ABC(n_estimators=estimator)
    estimator_scores.append(cvs(clf, df, salary).mean())

# Sweep 2: with the best-scoring ensemble size fixed (first one on ties),
# mean cross-validation score for each learning rate.
learning_rates = [1, 10, 20, 30, 40, 50, 100, 200]
learning_scores = []
best_estimator = estimators[estimator_scores.index(max(estimator_scores))]
for rate in learning_rates:
    clf = ABC(n_estimators=best_estimator, learning_rate=rate)
    learning_scores.append(cvs(clf, df, salary).mean())

n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
learning_rate = 1.

fig = plt.figure()
ax = fig.add_subplot(111)
# NOTE(review): the next three statements are the tail of a cross-validation
# loop whose header is outside this excerpt; they are reproduced in their
# original order — restore their nesting under that loop when merging.
x += pegasos_svm_test(train_set[test], w)
y_axis.append(x / 5)
print(i)

# Plot the cross-validation error collected above against lambda.
plt.title("Cross Validation Error")
plt.xlabel("Lambda")
plt.ylabel("Error")
plt.plot(x_axis, y_axis)

# Train on the full training set with lambda = 2**-3 and report the test error.
w = pegasos_svm_train(train_set, 2**-3)
print(pegasos_svm_test(test_set, w))

### Question 2d###
# Test error of a default one-vs-rest SVC: fraction of mismatched predictions.
clf = OneVsRestClassifier(SVC()).fit(train_set_x, train_set_y)
y = clf.predict(test_set_x)
misclassified = test_num - np.sum(np.equal(y, test_set_y))
print(misclassified / test_num)


def _cv_error(svc):
    """Return 1 minus the mean 10-fold CV score of a one-vs-rest wrapper around *svc*."""
    fold_scores = cvs(OneVsRestClassifier(svc), train_set_x, y=train_set_y, cv=10)
    return 1 - np.mean(fold_scores)


### Question 2e ###
print(_cv_error(SVC()))

### Question 2f ###
# Same measurement for a range of gamma / C settings.
print(_cv_error(SVC()))
print(_cv_error(SVC(gamma=1/len(train_set)**2, C=100)))
print(_cv_error(SVC(gamma=1/len(train_set)**2, C=1)))
print(_cv_error(SVC(gamma=0, C=.0001)))
print(_cv_error(SVC(C=1.5)))
print(_cv_error(SVC(C=10000)))
print(_cv_error(SVC(gamma=.001, C=1)))
print(_cv_error(SVC(gamma=0.01, C=1000)))
print(_cv_error(SVC(gamma=1/len(train_set), C=100)))
bnb.fit(df_train, out_train)
bnb.score(df_test, out_test)


# ### Better, but still not close to 90%

# #### So the Naive Bayes classifier also does not do a good job in this task.

# #### Now I will try and experiment with tree-based classifiers.

# ## Decision Trees

# In[ ]:

try:
    from sklearn.cross_validation import cross_val_score as cvs
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # function is provided by sklearn.model_selection.
    from sklearn.model_selection import cross_val_score as cvs
from sklearn.tree import DecisionTreeClassifier as dtree

# Baseline: 10-fold cross-validation scores of a tree with default settings.
tr = dtree()
cvs(tr, df_train, out_train, cv=10)


# ### Playing around with the tree parameters
#
# #### Effect of changing the max depth

# In[ ]:

# Mean 10-fold cross-validation score for each max_depth from 2 to 19.
for i in range(2, 20):
    tr = dtree(max_depth=i)
    print("Max Depth = " + str(i) + "\t Score: ")
    print(np.mean(cvs(tr, df_train, out_train, cv=10)))
    print("\n")


# ### Visualization of effects of max depth