Example #1
import os
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score as cvs

# dataPath and removeStopWords() are assumed to be defined elsewhere in the project.

def findUnigrams():

    unigrams = []
    conditions = []

    i = 0

    # iterate through each user's tweets
    for fileName in os.listdir(dataPath):

        i += 1

        with open(os.path.join(dataPath, fileName), 'r') as f:
            print("Started Parsing: " + fileName)

            for line in f:

                dataPayload = json.loads(line)

                condition = dataPayload['metadata']['condition']

                # skip PTSD users; only control vs. the remaining condition is classified
                if condition == "ptsd":
                    print("PTSD")
                    continue

                userCorpus = dataPayload['allTokensLemmatized']

                # removing stop words from the entire corpus
                userCorpus = removeStopWords(userCorpus)

                unigrams.append(" ".join(userCorpus))

                if condition == "control":
                    conditions.append(1)
                else:
                    conditions.append(0)

    forest = RandomForestClassifier(n_estimators=100)

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    # build the document-term matrix of unigram counts
    train_data_features = vectorizer.fit_transform(unigrams)
    train_data_features = train_data_features.toarray()

    # 10-fold cross-validation of the random forest on the unigram features
    score = cvs(forest, train_data_features, conditions, cv=10)
    print(score)

    # with open('../outputs/unigrams.json', 'w') as outfile:
    #     json.dump(featureDict, outfile)
    print('done writing unigrams ' + str(i))
Example #2
import time
import pickle

import numpy as np

from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score as cvs

# load_energy(), HEAT_MODEL, and COLD_MODEL are assumed to come from the
# surrounding project (the dataset loader and the output pickle paths).

def build(args):
    """
    Builds the models from the arguments.
    In a real application, this would probably take arguments:

        - fixtures (where the training data is)
        - model_dir (where to write the models out to)
        - kfolds (number of cross validation folds)

    For now, just write out the pickles to HEAT_MODEL and COLD_MODEL.
    """
    start = time.time()

    # Load data and estimator
    dataset = load_energy()
    alphas = np.logspace(-10, -2, 200)

    scores = {}
    for y in ('Y1', 'Y2'):
        # Perform cross validation, don't worry about Imputation here
        clf = linear_model.RidgeCV(alphas=alphas)
        scores[y] = cvs(clf, dataset.data, dataset.target(y), cv=12)

        # Get the alpha from the ridge by fitting the entire data set.
        # There are a couple of reasons for this, but mostly to ensure that
        # we get the desired result pickled (e.g. a ridge with alpha)
        clf.fit(dataset.data, dataset.target(y))

        # Build the model on the entire dataset, including the Imputer pipeline
        model = linear_model.Ridge(alpha=clf.alpha_)
        imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
        estimator = Pipeline([("imputer", imputer), ("ridge", model)])
        estimator.fit(dataset.data, dataset.target(y))

        # Dump the model
        jump = {
            'Y1': HEAT_MODEL,
            'Y2': COLD_MODEL,
        }

        with open(jump[y], 'wb') as f:
            pickle.dump(estimator, f, protocol=pickle.HIGHEST_PROTOCOL)

        msg = ("%s trained on %i instances using a %s model\n"
               "     average R2 score of %0.3f using an alpha of %0.5f\n"
               "     model has been dumped to %s\n")

        print(msg % (
            y,
            len(dataset.data),
            model.__class__.__name__,
            scores[y].mean(),
            clf.alpha_,
            jump[y],
        ))

    build_time = time.time() - start
    return "Build took %0.3f seconds" % build_time
Example #4
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.naive_bayes import MultinomialNB as mnb
from sklearn import tree as tr
from sklearn.grid_search import GridSearchCV as gscv
from sklearn.cross_validation import cross_val_score as cvs

# df_exp (feature matrix) and df_res (labels) are assumed to be built earlier

#KNN
knn = knc(p=2)  #specify Euclidean distance

param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_accuracy = gscv(knn, param_grid, cv=10,
                   scoring='accuracy').fit(df_exp, df_res)

param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)

param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

#Naive Bayes
nb = mnb()
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
nb_auc = cvs(nb, df_exp, df_res, cv=10, scoring='roc_auc')

#Decision Tree
dtree = tr.DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_features=None,
                                  max_depth=None,
                                  min_samples_split=2,
                                  min_samples_leaf=2,
                                  max_leaf_nodes=None)

param_grid = dict(max_depth=range(1, 23))
dt_accuracy = gscv(dtree, param_grid, cv=10,
                   scoring='accuracy').fit(df_exp, df_res)

param_grid = dict(max_depth=range(1, 23))
dt_f1 = gscv(dtree, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)

param_grid = dict(max_depth=range(1, 23))
dt_auc = gscv(dtree, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)
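
None of the fitted grids above are ever inspected. Assuming the old-style sklearn.grid_search GridSearchCV used here, the tuned settings can be read back from the standard best_params_ and best_score_ attributes after fit():

# read back the tuned hyperparameters and their cross-validated scores
print(kn_accuracy.best_params_, kn_accuracy.best_score_)
print(dt_auc.best_params_, dt_auc.best_score_)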
Example #6
# SVM is a local helper module assumed to provide get_data() and clean()
import SVM as CLF
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import cross_val_score as cvs
from sklearn.ensemble import AdaBoostClassifier as ABC

df, salary, keys = CLF.clean(CLF.get_data())
# cross-validate over the number of boosting rounds
estimators = [10, 20, 30, 40, 50, 100, 200, 400]
estimator_scores = []
for estimator in estimators:
    clf = ABC(n_estimators=estimator)
    estimator_scores.append(cvs(clf, df, salary).mean())

# then cross-validate the learning rate at the best round count
learning_rates = [1, 10, 20, 30, 40, 50, 100, 200]
learning_scores = []
best_estimator = estimators[estimator_scores.index(max(estimator_scores))]
for rate in learning_rates:
    clf = ABC(n_estimators=best_estimator, learning_rate=rate)
    learning_scores.append(cvs(clf, df, salary).mean())

# plot the cross-validated scores; the plotting calls below are an assumed
# completion, since the original snippet created the figure but never drew on
# it (learning_scores can be plotted against learning_rates the same way)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(estimators, estimator_scores)
ax.set_xlabel("n_estimators")
ax.set_ylabel("mean CV score")
plt.show()

Example #7
        # (this snippet begins mid-way through a 5-fold cross-validation loop
        #  over the lambda values in x_axis)
        x += pegasos_svm_test(train_set[test], w)

    y_axis.append(x / 5)
    print(i)

plt.title("Cross Validation Error")
plt.xlabel("Lambda")
plt.ylabel("Error")
plt.plot(x_axis, y_axis)

w = pegasos_svm_train(train_set, 2**-3)
print(pegasos_svm_test(test_set, w))

### Question 2d ###
clf = OneVsRestClassifier(SVC()).fit(train_set_x, train_set_y)
y = clf.predict(test_set_x)
print((test_num - np.sum(np.equal(y, test_set_y))) / test_num)

### Question 2e ###
print(1 - np.mean(cvs(OneVsRestClassifier(SVC()), train_set_x, y=train_set_y, cv=10)))

### Question 2f ###
print(1 - np.mean(cvs(OneVsRestClassifier(SVC()), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(gamma=1 / len(train_set)**2, C=100)), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(gamma=1 / len(train_set)**2, C=1)), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(gamma=0, C=.0001)), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(C=1.5)), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(C=10000)), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(gamma=.001, C=1)), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(gamma=0.01, C=1000)), train_set_x, y=train_set_y, cv=10)))
print(1 - np.mean(cvs(OneVsRestClassifier(SVC(gamma=1 / len(train_set), C=100)), train_set_x, y=train_set_y, cv=10)))
# bnb is assumed to be a Bernoulli Naive Bayes classifier from an earlier cell
bnb.fit(df_train, out_train)
bnb.score(df_test, out_test)

# ### Better, but still not close to 90%

# #### So the Naive Bayes Classifier also does not do a good job on this task.
# #### Now I will try and experiment with tree-based classifiers.

# ## Decision Trees

# In[ ]:

import numpy as np  # np is used below but not imported in the visible cells
from sklearn.cross_validation import cross_val_score as cvs
from sklearn.tree import DecisionTreeClassifier as dtree
tr = dtree()
cvs(tr, df_train, out_train, cv=10)

# ### Playing around with the tree parameters
#
# #### Effect of changing the max depth

# In[ ]:

for i in range(2, 20):
    tr = dtree(max_depth=i)
    print("Max Depth = " + str(i) + "\t Score: ")
    print(np.mean(cvs(tr, df_train, out_train, cv=10)))
    print("\n")

# ### Visualization of effects of max depth
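
# #### The original visualization cell is not included here; the sketch below is
# #### an assumed completion that plots the mean 10-fold CV score against max
# #### depth, reusing df_train, out_train, cvs, and dtree from the cells above.

# In[ ]:

import matplotlib.pyplot as plt

depths = range(2, 20)
mean_scores = [np.mean(cvs(dtree(max_depth=d), df_train, out_train, cv=10))
               for d in depths]

plt.plot(depths, mean_scores)
plt.xlabel("max_depth")
plt.ylabel("mean 10-fold CV score")
plt.title("Effect of max depth on decision tree accuracy")
plt.show()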