t_val = df_val['target'] x_val = df_val.drop('target',axis=1) from imblearn.over_sampling import SMOTE, ADASYN from imblearn.combine import SMOTEENN,SMOTETomek from imblearn.under_sampling import ClusterCentroids #测试抽样 from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier from sklearn.model_selection import cross_validate from sklearn import linear_model from sklearn.neural_network import MLPClassifier LogisticRegression = linear_model.LogisticRegression(solver='lbfgs',max_iter=500,tol=1e-3) SGD = linear_model.SGDClassifier(loss="hinge", penalty="l2", max_iter=100,tol=1e-3) RF = RandomForestClassifier(10) GBD = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=10) nn = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100,50,50,20, 2),tol=1e-3) nn2 = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(10, 2),tol=1e-3) mds = [LogisticRegression,SGD,RF,GBD,nn,nn2] mds_name = ['LogisticRegression','SGD','RF','GBD','nn','nn2'] sample_name = ['ClusterCentroids','SMOTEENN','SMOTETomek','SMOTE','SMOTE borderline1','SMOTE borderline2','ADASYN'] sample_methods = [ClusterCentroids(),SMOTEENN(),SMOTETomek(),SMOTE(),SMOTE(kind='borderline1'),SMOTE(kind='borderline2'),ADASYN()] import time sample_roc = [] i=0 for s in sample_methods:
import features.zip_codes
import evaluation

if __name__ == '__main__':
    # Column 0 of the file holds the label, the remaining columns the
    # features; columns with fewer than two non-missing values are dropped.
    raw_frame = pd.read_csv("../data/zip.train", header=None, sep=" ")
    cleaned_train_data = raw_frame.dropna(axis=1, thresh=2)
    targets = cleaned_train_data[0].values
    input_data = cleaned_train_data.iloc[:, 1:].values
    input_data2 = features.zip_codes.multires(input_data)

    # Same logistic-regression-by-SGD setup on both feature sets:
    # first the simple one, then the one produced by multires().
    experiments = (
        ("Evaluating simple feature set", input_data),
        ("Evaluating modified feature set", input_data2),
    )
    for position, (banner, feature_matrix) in enumerate(experiments):
        if position:
            # blank line between consecutive reports
            print("")
        print(banner)
        model = lm.SGDClassifier(n_jobs=1, loss="log", max_iter=50)
        classifier.fit(model, feature_matrix, targets)
        # Errors are measured on the same data the model was trained on.
        pred, pred_proba = classifier.predict(model, feature_matrix)
        evaluation.print_errors(targets, pred)
import matplotlib.pyplot as plt
from sklearn import svm, linear_model, linear_model

# Two small square clusters: 10000 points around (4, 4) labelled 0 and
# 100 points around (-4, -4) labelled 1.
x_pos = np.random.uniform(3.8, 4.2, (10000, 2))
x_neg = np.random.uniform(-4.2, -3.8, (100, 2))
y_pos = np.zeros(10000, dtype=int)
y_neg = np.ones(100, dtype=int)
x = np.vstack((x_pos, x_neg))
y = np.hstack((y_pos, y_neg))

#svc = svm.SVC(kernel='linear', C=10000).fit(x, y)
# Both linear models share every setting except the loss function.
shared_sgd_options = dict(penalty="l2", shuffle=True, average=10,
                          alpha=0.00001)
hinglesgd = linear_model.SGDClassifier(loss="hinge", **shared_sgd_options)
hinglesgd.fit(x, y)
logsgd = linear_model.SGDClassifier(loss="log", **shared_sgd_options)
logsgd.fit(x, y)

# create a mesh to plot in
h = .02  # step size in the mesh
#x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
#y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
x_min = -6
x_max = 6
y_min = -6
y_max = 6
mesh_columns = np.arange(x_min, x_max, h)
mesh_rows = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(mesh_columns, mesh_rows)
r'\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b', ))])), ('features_tweets', TweetsToFeatures()), ]) clasificador_usado = Pipeline([ ('features', feature_union), ('clf', naive_bayes.MultinomialNB(alpha=0.01)), ]) elif args.clasificador == "LB2": clasificador_usado = Mayoria() elif args.clasificador == "MNB": clasificador_usado = naive_bayes.MultinomialNB() parameters_grid_search = parameters_mnb elif args.clasificador == "SGD": clasificador_usado = linear_model.SGDClassifier(shuffle=True) else: # "SVM" clasificador_usado = svm.SVC() parameters_grid_search = parameters_svm if args.grid_search: grid_search = GridSearchCV(clasificador_usado, parameters_grid_search, cv=5, verbose=2, n_jobs=8) grid_search.fit(features, clases) print("Mejores parámetros encontrados para " + args.clasificador + ":") for nombre_parametro, valor_parametro in clasificador_usado.get_params(
# Drop training rows with missing values, then split off the label column.
train.dropna(inplace=True)
ans = train.pop('target')
# Row count of the cleaned training frame; it is the only split point that
# keeps the feature rows aligned with `ans` when the frames are separated
# again below.
n_train = train.shape[0]

# process test: missing feature values are replaced by 0
test.fillna(value=0, inplace=True)

# Stack train on top of test so both are scaled with the same min/max.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
total = pd.concat([train, test])

# scaling: the scaler has to be applied, not only fitted -- previously the
# fitted scaler was never used and the model was trained on raw values.
scaler = MinMaxScaler()
total = pd.DataFrame(scaler.fit_transform(total),
                     index=total.index,
                     columns=total.columns)

# separate the scaled frame back into train and test by row position
train = total.iloc[:n_train]
test = total.iloc[n_train:]

# can modify variable
clf = linear_model.SGDClassifier(n_jobs=-1, verbose=1)
clf.fit(train, ans.astype('int'))
result = clf.predict(test)

# output result: ids are the 0-based row positions written as strings
result = pd.DataFrame(
    {
        'id': [str(i) for i in range(len(result))],
        'target': result,
    },
    columns=['id', 'target'])
# quoting=2 is csv.QUOTE_NONNUMERIC: every non-numeric field is quoted
result.to_csv('result.csv', index=False, quoting=2)
print(i) for j in range(ch): spike_interval = np.int32(steps / (train_data[i, j] * 50 + 0.0001)) jitter = np.random.randint(20) spikes = np.zeros(steps - 20) spikes[jitter::spike_interval] = 60 train_in_spikes[j, :-20] = spikes reservoir_network.add_input(train_in_spikes) rate_coding = reservoir_network.simulate() #X[i,:] = rate_coding X[i, :] = rate_coding / (np.max(rate_coding) + 0.0001) #maxX = (np.max(X) + 0.0001) #X = X/maxX print("training linear model") clf = linear_model.SGDClassifier(max_iter=100000, tol=1e-3) clf.fit(X, train_labels) X_test = np.zeros((test_labels.shape[0], reservoir_network.n_nodes)) test_in_spikes = np.zeros((ch, steps)) initial_activities = np.zeros(test_labels.shape[0]) extra_activities = np.zeros(test_labels.shape[0]) for i in range(test_labels.shape[0]): print(i) for j in range(ch): spike_interval = np.int32(steps / (test_data[i, j] * 50 + 0.0001)) jitter = np.random.randint(20) spikes = np.zeros(steps - 20) spikes[jitter::spike_interval] = 60 test_in_spikes[j, :-20] = spikes reservoir_network.add_input(test_in_spikes)
# Again with important features
# Keep only the columns whose importance in the previously fitted `clf`
# exceeds 0.01; the same selection is used for training and prediction.
important_columns = data.columns[clf.feature_importances_ > 0.01]
clf2 = RandomForestClassifier(n_estimators=500, max_depth=30, random_state=0)
clf2.fit(data[important_columns], Y_train)

# Predictions
results = pd.DataFrame({'outcome': clf2.predict(test[important_columns])})

# Creating submission file (numeric predictions are mapped to "no"/"yes")
outcome = {0: "no", 1: "yes"}
sub = pd.read_csv('sample_submission.csv', header=0)
sub['outcome'] = results["outcome"].map(outcome)
sub.to_csv('random_for_imp.csv')
#Score: 75.6083

# SGDClassifier
from sklearn import linear_model

clf = linear_model.SGDClassifier(max_iter=1000)
clf.fit(data, Y_train)
results = pd.DataFrame({'outcome': clf.predict(test)})

# Creating submission file
outcome = {0: "no", 1: "yes"}
sub = pd.read_csv('sample_submission.csv', header=0)
sub['outcome'] = results["outcome"].map(outcome)
sub.to_csv('sgdc.csv')
#Score: 76.2646

# LinearSVC with feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# An L1-penalised linear SVM is fitted first; SelectFromModel then reuses
# the already fitted estimator (prefit=True) to reduce `data`.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
lsvc.fit(data, Y_train)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(data)
def train_classifiers(train_x, train_y, test_x, test_y,
                      articulatory=False,
                      dataset_name='',
                      classifiers=('lda',),
                      nframes_mfcc=1):
    """Train baseline classifiers on the features and report their scores.

    Args:
        train_x: training features; indexed as a 2-D array (``[:, k]``).
        train_y: training labels.
        test_x: held-out features, same column layout as ``train_x``.
        test_y: held-out labels.
        articulatory: when true, the LDA is run twice -- on the first
            ``39 * nframes_mfcc`` columns ('mfcc') and on the remaining
            columns ('arti') -- instead of once on all columns ('both').
        dataset_name: prefix of the pickled LDA file names.
        classifiers: collection of stage names to run; the recognised
            names are 'sgd', 'rf', 'lda' and 'featselec'.
        nframes_mfcc: multiplier of 39 giving the width of the 'mfcc' part.

    Returns:
        None. Results are printed; the 'sgd' and 'lda' stages also pickle
        their fitted models, and the 'lda' stage writes confusion matrices
        to ``cm_test<feats>.txt`` / ``cm_valid<feats>.txt`` and changes the
        global numpy print threshold.
    """
    print("size of input layer (== dimension of the features space) %d"
          % train_x.shape[1])
    # Stays None unless the 'rf' stage ran, so 'featselec' can tell whether
    # a fitted forest is available.
    clf2 = None

    ### Training a SVM to compare results TODO

    if 'sgd' in classifiers:
        ### Training a linear model (elasticnet) to compare results
        print("*** training a linear model with SGD ***")
        from sklearn import linear_model
        clf = linear_model.SGDClassifier(
            loss='modified_huber',
            penalty='elasticnet')  # TODO change and CV params
        clf.fit(train_x, train_y)
        print("score linear classifier (elasticnet, SGD trained) %s"
              % (clf.score(test_x, test_y),))
        # Pickle data is binary: text mode corrupts it on some platforms.
        with open('linear_elasticnet_classif.pickle', 'wb') as w_f:
            cPickle.dump(clf, w_f)

    if 'rf' in classifiers:
        ### Training a random forest to compare results
        print("*** training a random forest ***")
        from sklearn.ensemble import RandomForestClassifier
        clf2 = RandomForestClassifier(n_jobs=-1,
                                      max_features='log2',
                                      min_samples_split=3)
        clf2.fit(train_x, train_y)
        print("score random forest %s" % (clf2.score(test_x, test_y),))

    if 'lda' in classifiers:
        print("*** training a linear discriminant classifier ***")
        import sys
        from sklearn.lda import LDA
        from sklearn.metrics import confusion_matrix
        from sklearn import cross_validation

        def lda_on(train_x, train_y, test_x, test_y,
                   feats_name='all_features'):
            """ Linear Discriminant Analysis """
            lda = LDA()
            lda.fit(train_x, train_y, store_covariance=True)
            print("%s (train): %s"
                  % (feats_name, lda.score(train_x, train_y)))
            print("%s (test): %s" % (feats_name, lda.score(test_x, test_y)))
            with open(dataset_name + '_lda_classif_' + feats_name
                      + '.pickle', 'wb') as w_f:
                cPickle.dump(lda, w_f)
            y_pred = lda.predict(test_x)
            # Refit on 80% of the training data and score on the other 20%.
            X_train, X_validate, y_train, y_validate = cross_validation\
                .train_test_split(train_x, train_y,
                                  test_size=0.2, random_state=0)
            lda.fit(X_train, y_train)
            print("%s (validation): %s"
                  % (feats_name, lda.score(X_validate, y_validate)))
            y_pred_valid = lda.predict(X_validate)
            cm_test = confusion_matrix(test_y, y_pred)
            cm_valid = confusion_matrix(y_validate, y_pred_valid)
            # Never abbreviate the matrices with "..." when written out;
            # sys.maxsize is the documented way to ask for the full repr.
            np.set_printoptions(threshold=sys.maxsize)
            with open("cm_test" + feats_name + ".txt", 'w') as w_f:
                w_f.write(str(cm_test) + "\n")
            with open("cm_valid" + feats_name + ".txt", 'w') as w_f:
                w_f.write(str(cm_valid) + "\n")

        if articulatory:
            split_at = 39 * nframes_mfcc
            lda_on(train_x[:, :split_at], train_y,
                   test_x[:, :split_at], test_y,
                   feats_name='mfcc')
            lda_on(train_x[:, split_at:], train_y,
                   test_x[:, split_at:], test_y,
                   feats_name='arti')
        else:
            lda_on(train_x, train_y, test_x, test_y, feats_name='both')

    if 'featselec' in classifiers:
        ### Feature selection
        print("*** feature selection now: ***")
        print(" - Feature importances for the random forest classifier")
        if clf2 is not None:
            print(clf2.feature_importances_)
        else:
            # The forest only exists when the 'rf' stage was requested too.
            print("   (skipped: 'rf' was not among the trained classifiers)")
        from sklearn.feature_selection import SelectPercentile, f_classif
        # SelectKBest TODO?
        selector = SelectPercentile(f_classif, percentile=10)  # ANOVA
        selector.fit(train_x, train_y)
        print(selector.pvalues_)
        scores = -np.log10(selector.pvalues_)
        scores /= scores.max()
        print(" - ANOVA scoring (order of the MFCC)")
        print(scores)
        from sklearn.feature_selection import RFECV
        # Imported here as well so this stage works without the 'lda' stage.
        from sklearn.lda import LDA
        print(" - Recursive feature elimination with cross-validation w/ LDA")
        lda = LDA()
        rfecv = RFECV(estimator=lda, step=1, scoring='accuracy')
        rfecv.fit(train_x, train_y)
        print("Optimal number of features : %d" % rfecv.n_features_)
        print("Ranking (order of the MFCC):")
        print(rfecv.ranking_)
def sgd_classifiers():
    """Return a one-vs-rest classifier wrapping a default SGDClassifier."""
    base_estimator = linear_model.SGDClassifier()
    return OneVsRestClassifier(base_estimator)
def main():
    """Leave-one-location-out evaluation of precipitation-day classifiers.

    For every entry of the module-level ``files`` list, the per-day row
    counts of all *other* location CSVs become the feature vectors, five
    classifiers are fitted and 5-fold cross-validated on them, and three
    averaged accuracies per held-out location are passed to
    ``clusterLocations``.
    """
    path = "../../../Herts/"
    extension = ".csv"

    # trace: "3/4/2016", "3/10/2016", "2/29/2016", "2/26/2016", "2/22/2016", "2/19/2016", "2/13/2016", "2/11/2016", "2/9/2016", "2/1/2016", "1/27/2016", "1/26/2016", "1/15/2016", "1/14/2016", "1/13/2016", "12/26/2015", "12/10/2015", "12/8/2015", "12/3/2015", "11/23/2015", "11/13/2015", "11/6/2015", "11/5/2015", "11/1/2015"
    setOfAllPrecipitationDays = [
        "3/2/2016", "2/25/2016", "2/24/2016", "2/23/2016", "2/20/2016",
        "2/16/2016", "2/15/2016", "2/10/2016", "2/8/2016", "2/5/2016",
        "2/4/2016", "2/3/2016", "1/23/2016", "1/18/2016", "1/17/2016",
        "1/16/2016", "1/12/2016", "1/10/2016", "1/4/2016", "12/31/2015",
        "12/30/2015", "12/29/2015", "12/27/2015", "12/24/2015", "12/23/2015",
        "12/22/2015", "12/18/2015", "12/17/2015", "12/15/2015", "12/14/2015",
        "12/2/2015", "12/1/2015", "11/28/2015", "11/22/2015", "11/20/2015",
        "11/19/2015", "11/12/2015", "11/11/2015", "11/10/2015"
    ]

    numberLocations = len(files)
    locationList6 = []
    locationList3 = []
    locationList1 = []

    for i in range(numberLocations):
        toHoldOut = files[i]

        # "month/day/year" string -> {location name -> rows seen that day}
        data = {}
        for dataFile in files:
            if dataFile == toHoldOut:
                continue
            with open(path + dataFile + extension, 'rb') as f:
                for rowNumber, row in enumerate(csv.reader(f)):
                    if rowNumber == 0:
                        # the first row of every file is not counted
                        continue
                    date = datetime.datetime.fromtimestamp(float(row[0]))
                    dateString = "%s/%s/%s" % (date.month, date.day,
                                               date.year)
                    locationCountMap = data.setdefault(dateString, {})
                    locationCountMap[dataFile] = \
                        locationCountMap.get(dataFile, 0) + 1

        # x is a list of (list of counts); each position in an inner list
        # stands for one entry of `files` (absent locations count as 0).
        x = [[data[day].get(location, 0) for location in files]
             for day in data]
        # 1 marks a precipitation day, 0 a non-precipitation day.
        y = np.array([1 if day in setOfAllPrecipitationDays else 0
                      for day in data])

        # Order: k nearest neighbours, SGD, SVC, Bernoulli naive Bayes,
        # decision tree. Each one is fitted on the full data first, then
        # all of them are cross-validated.
        estimators = [
            KNeighborsClassifier(),
            linear_model.SGDClassifier(),
            SVC(),
            BernoulliNB(),
            tree.DecisionTreeClassifier(),
        ]
        for estimator in estimators:
            estimator.fit(x, y)
        foldScores = [cross_validation.cross_val_score(estimator, x, y, cv=5)
                      for estimator in estimators]
        meanScores = [np.mean(scores) for scores in foldScores]
        knnMean, sgdMean, svcMean, nbMean, treeMean = meanScores

        # every classifier / the best three (svc, sgd, tree) / svc only
        averageAccuracy6 = np.mean(meanScores)
        averageAccuracy3 = np.mean([svcMean, sgdMean, treeMean])
        averageAccuracy1 = np.mean([svcMean])

        locationList6.append((averageAccuracy6, i))
        locationList3.append((averageAccuracy3, i))
        locationList1.append((averageAccuracy1, i))

    reports = (
        ("Average of all 6 classifier methods:", locationList6),
        ("Average of best 3 classifier methods:", locationList3),
        ("Using only best classifier method:", locationList1),
    )
    for heading, locationList in reports:
        print("*****************************")
        print(heading)
        clusterLocations(locationList)
import numpy
import pandas as pd
import joblib
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Labelled data set; the first CSV column is used as the index.
DATA_URL = "https://raw.githubusercontent.com/tyler-martin-12/alexa_check_flag_skill/master/df_final.csv"
df = pd.read_csv(DATA_URL, index_col=0)
print(df.head())

# 'label' is the target, every other column is a feature.
y = df['label']
x = df.drop('label', axis=1)

# 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.3, random_state=1)

# Logistic-loss linear model trained by SGD, persisted to model.pkl.
lm = linear_model.SGDClassifier(alpha=.1, loss='log')
lm.fit(x_train, y_train)
joblib.dump(lm, 'model.pkl')
scaler = preprocessing.StandardScaler() scaler.fit(X) scaler.mean_ scaler.scale_ X_scaled = scaler.transform(X) # According to the scikit-learn documentation, the following is a good guess for # the number of iterations required to achieve convergence n_iter = np.ceil(10**6 / X.shape[0]) # As usual, the regularisation parameter 'alpha' can be tuned using # `grid_search.GridSearchCV` gs = grid_search.GridSearchCV( estimator=lm.SGDClassifier(loss='log', penalty='l2', n_iter=n_iter), param_grid={'alpha': 10.0**-np.arange(1, 7)}, scoring='roc_auc', cv=kf ) gs.fit(X_scaled, y) gs.best_estimator_ # Before using this model to predict, we'd need to call `scaler.transform` on # the new data # We can also put everything together in a pipeline… sgd_pipeline = Pipeline([ ('scale', preprocessing.StandardScaler()),
perceptron.fit(X_train, Y_train) Y_pred = perceptron.predict(X_test) acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2) linear_svc = LinearSVC() linear_svc.fit(X_train, Y_train) Y_pred = linear_svc.predict(X_test) acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2) decision_tree = DecisionTreeClassifier() decision_tree.fit(X_train, Y_train) Y_pred = decision_tree.predict(X_test) acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2) sgd = linear_model.SGDClassifier(max_iter=5, tol=None) sgd.fit(X_train, Y_train) Y_pred = sgd.predict(X_test) sgd.score(X_train, Y_train) acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2) print(acc_sgd) random_forest = RandomForestClassifier(n_estimators=100) random_forest.fit(X_train, Y_train) Y_prediction = random_forest.predict(X_test) print(Y_prediction) import csv with open("output_result.csv", 'w', newline='') as myfile:
plt.legend(loc='best')
# `plt.show` without parentheses only looked the function up, so the figure
# was never displayed; it has to be called.
plt.show()

# 4: Shape the data ------------------------------------------------------
X_std = X_xor
z = y_xor

# Explanation 5: apply the kernel approximation --------------------------
# RBFSampler maps the inputs to 100 random features so that the linear SGD
# classifier below can be trained on the transformed data.
rbf_feature = RBFSampler(gamma=1, n_components=100, random_state=1)
X_std = rbf_feature.fit_transform(X_std)
print("X_stdの大きさ ", pd.DataFrame(X_std).shape)
#pd.DataFrame(X_std).to_clipboard()  # keeps the data on the clipboard so it can be pasted into Excel

# 6: Classify with machine learning --------------------------------------
clf_result = linear_model.SGDClassifier(
    loss="hinge")  # loss="hinge", loss="log"

# 7: Evaluate performance with K-fold cross validation --------------------
scores = cross_validation.cross_val_score(clf_result, X_std, z, cv=10)
print("平均正解率 = ", scores.mean())
print("正解率の標準偏差 = ", scores.std())

# 8: Run once more with a separate training / test split ------------------
X_train, X_test, train_label, test_label = cross_validation.train_test_split(
    X_std, z, test_size=0.1, random_state=1)
clf_result.fit(X_train, train_label)

# compute the accuracy on the held-out 10%
pre = clf_result.predict(X_test)
ac_score = metrics.accuracy_score(test_label, pre)
print("正答率 = ", ac_score)
start = time.time() estimator.fit(X_train, y_train) fit_time = time.time() - start n_iter = estimator.n_iter_ train_score = estimator.score(X_train, y_train) test_score = estimator.score(X_test, y_test) return fit_time, n_iter, train_score, test_score # Define the estimators to compare estimator_dict = { 'No stopping criterion': linear_model.SGDClassifier(tol=1e-3, n_iter_no_change=3), 'Training loss': linear_model.SGDClassifier(early_stopping=False, n_iter_no_change=3, tol=0.1), 'Validation score': linear_model.SGDClassifier(early_stopping=True, n_iter_no_change=3, tol=0.0001, validation_fraction=0.2) } # Load the dataset X, y = load_mnist(n_samples=10000) X_train, X_test, y_train, y_test = train_test_split(X, y,
def predict(dir, name, validate):
    """Train an SGD tweet classifier, then validate it or write predictions.

    Args:
        dir: directory part of the data-file prefix.
        name: second part of the prefix; files are looked up as
            ``<dir>/<name>train_neg<suffix>.txt``,
            ``<dir>/<name>train_pos<suffix>.txt`` and
            ``<dir>/<name>test_data.txt``. It is also the prefix of the
            submission file name.
        validate: when true, 25% of the labelled tweets are held out and a
            classification report plus F1 score are produced; otherwise the
            model is trained on all labelled tweets and its predictions for
            the test tweets are written to ``<name>submission.csv``.

    Returns:
        ``(setting, score, featureCount)`` where ``setting`` is the
        module-level value of that name, ``score`` is the F1 score (0 when
        not validating) and ``featureCount`` is the vectoriser's vocabulary
        size (0 when ``useVectoriser`` is false).
    """
    base = "%s/%s" % (dir, name)
    negpath = "%strain_neg%s.txt" % (base, suffix)
    pospath = "%strain_pos%s.txt" % (base, suffix)
    testpath = "%stest_data.txt" % (base)

    if validate:
        printer("Validating " + base)
    else:
        printer("Predicting " + base)

    # Rows are [ind, tweet, label]; negative tweets get label -1, positive
    # tweets 1 and the unlabelled test tweets 0.
    printer("Reading train tweets...")
    negtweets = [[0, t, -1] for t in p.read_tweets(negpath)]
    postweets = [[0, t, 1] for t in p.read_tweets(pospath)]
    printer("Reading test tweets...")
    testweets = [[0, t, 0] for t in p.read_tweets(testpath)]
    #testtweets = pd.DataFrame.from_records(p.process_testdata(testpath), columns=["ind","tweet"])
    #testtweets["label"] = 0

    # The test tweets are placed first, so rows [0, n_test) are the test
    # set and everything after them is labelled training data. Using the
    # real count instead of a fixed 10000 keeps the split right for test
    # files of any size.
    n_test = len(testweets)

    printer("Processing data...")
    data = pd.DataFrame(testweets + negtweets + postweets,
                        columns=["ind", "tweet", "label"])

    printer("Vectorising data...")
    featureCount = 0
    if useVectoriser:
        count_vectorizer = CountVectorizer(
            preprocessor=preprocessor,
            ngram_range=(1, 3),
            min_df=3,
            lowercase=True,
            binary=False,
            token_pattern=r'(?u)(?<=\s)\S+(?=\s)')
        # Fitted on train and test tweets together (shared vocabulary).
        vec_data = count_vectorizer.fit_transform(data["tweet"])
        features = count_vectorizer.get_feature_names()
        featureCount = len(features)
        printer(features[:60])
        printer("Found " + str(featureCount) + " features")
    else:
        data["norm"] = [line.split(' ') for line in data["tweet"]]
        vec_data = bf.vectorise(data["norm"])

    printer("Vectorized, learning...")
    if validate:
        vec_train, vec_test, labels_train, labels_test = train_test_split(
            vec_data[n_test:],
            data["label"][n_test:],
            test_size=0.25,
            random_state=1)
    else:
        vec_train = vec_data[n_test:]
        labels_train = data["label"][n_test:]
        vec_test = vec_data[:n_test]

    # printer("Start MLP\n")
    # clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 64), random_state=1)
    printer("Start SGD\n")
    clf = linear_model.SGDClassifier(shuffle=True,
                                     max_iter=10000,
                                     tol=0.0001,
                                     loss='hinge',
                                     penalty='l2',
                                     alpha=0.0001)
    printer("Predicting...")
    clf.fit(vec_train, labels_train)

    # predict data
    ans = clf.predict(vec_test)

    if validate:
        printer(metrics.classification_report(labels_test, ans))
        score = metrics.f1_score(labels_test, ans)
        return (setting, score, featureCount)

    # save data: ids in the submission file start at 1
    res = pd.DataFrame(ans)
    res.index += 1
    res.to_csv(path_or_buf="%ssubmission.csv" % name,
               index=True,
               index_label="Id",
               header=["Prediction"])
    return (setting, 0, featureCount)
# Choose model
# from sklearn import gaussian_process
# Gaussian = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
# GaussianProcessRegressor
# from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import tree

# Kernel SVM with a linear kernel, the dedicated linear SVM, and a CART tree.
svm = SVC(kernel='linear')
svmLinear = LinearSVC()
cartTree = tree.DecisionTreeClassifier()

# The three squared-loss SGD models differ only in their penalty term
# (none / l2 with alpha=0.5 / l1).
squared_loss_options = dict(loss='squared_loss',
                            max_iter=maxIter,
                            tol=tolerance)
linear_square = lm.SGDClassifier(penalty='none', **squared_loss_options)
ridge = lm.SGDClassifier(penalty='l2', alpha=0.5, **squared_loss_options)
# ridgel1 = lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=maxIter, tol=tolerance)
lasso = lm.SGDClassifier(penalty='l1', **squared_loss_options)

logisitc = lm.LogisticRegression()
# bayes = lm.BayesianRidge()

# Bagging
def setSGD(self):
    """Make a logistic-loss SGDClassifier this object's classifier."""
    sgd_model = linear_model.SGDClassifier(loss="log")
    self.classifier = sgd_model
    print("Using SGD classifier")
def main():
    """Cross-validate five classifiers on per-day, per-location row counts.

    Every CSV named in the module-level ``files`` list is read, rows are
    counted per calendar day and location, each day is labelled by
    membership in the precipitation-day list, and the mean 5-fold accuracy
    of each classifier is printed.
    """
    #November 1st 2015 to March 10th 2016
    #1447306774
    #Thu, 12 Nov 2015 05:39:34 GMT
    path = "../../../Herts/"
    extension = ".csv"
    # numberLocations = len(files)
    # locationList6 = []
    # locationList3 = []
    # locationList1 = []
    # for i in range(0, numberLocations):
    # toHoldOut = files[i]

    # "month/day/year" string -> {location name -> rows seen that day}
    data = {}
    #earliestDay = float("inf")
    #latestDay = float("-inf")
    for dataFile in files:
        # if dataFile != toHoldOut:
        with open(path + dataFile + extension, 'rb') as f:
            for rowNumber, row in enumerate(csv.reader(f)):
                if rowNumber == 0:
                    # the first row of every file is not counted
                    continue
                date = datetime.datetime.fromtimestamp(float(row[0]))
                dateString = "%s/%s/%s" % (date.month, date.day, date.year)
                #thisDay = date.year * 10000 + date.month * 100 + date.day
                #if(thisDay < earliestDay):
                #    earliestDay = thisDay
                #if(thisDay > latestDay):
                #    latestDay = thisDay
                locationCountMap = data.setdefault(dateString, {})
                locationCountMap[dataFile] = \
                    locationCountMap.get(dataFile, 0) + 1
    #print earliestDay
    #print latestDay

    #setOfAllSchoolDays = ["11/2/2015", "11/3/2015", "11/4/2015", "11/5/2015", "11/6/2015", "11/9/2015", "11/10/2015", "11/11/2015", "11/12/2015", "11/13/2015", "11/16/2015", "11/17/2015", "11/18/2015", "11/19/2015", "11/20/2015", "11/23/2015", "11/24/2015", "11/30/2015", "12/1/2015", "12/2/2015", "12/3/2015", "12/4/2015", "12/7/2015", "12/8/2015", "12/9/2015", "12/10/2015", "12/11/2015", "1/27/2016", "1/28/2016", "1/29/2016", "2/1/2016", "2/2/2016", "2/3/2016", "2/4/2016", "2/5/2016", "2/9/2016", "2/10/2016", "2/11/2016", "2/12/2016", "2/15/2016", "2/16/2016", "2/17/2016", "2/18/2016", "2/19/2016", "2/24/2016", "2/25/2016", "2/26/2016", "2/29/2016", "3/1/2016", "3/2/2016", "3/3/2016", "3/4/2016", "3/7/2016", "3/8/2016", "3/9/2016", "3/10/2016"]
    setOfAllPrecipitationDays = [
        "3/4/2016", "3/10/2016", "2/29/2016", "2/26/2016", "2/22/2016",
        "2/19/2016", "2/13/2016", "2/11/2016", "2/9/2016", "2/1/2016",
        "1/27/2016", "1/26/2016", "1/15/2016", "1/14/2016", "1/13/2016",
        "12/26/2015", "12/10/2015", "12/8/2015", "12/3/2015", "11/23/2015",
        "11/13/2015", "11/6/2015", "11/5/2015", "11/1/2015", "3/2/2016",
        "2/25/2016", "2/24/2016", "2/23/2016", "2/20/2016", "2/16/2016",
        "2/15/2016", "2/10/2016", "2/8/2016", "2/5/2016", "2/4/2016",
        "2/3/2016", "1/23/2016", "1/18/2016", "1/17/2016", "1/16/2016",
        "1/12/2016", "1/10/2016", "1/4/2016", "12/31/2015", "12/30/2015",
        "12/29/2015", "12/27/2015", "12/24/2015", "12/23/2015", "12/22/2015",
        "12/18/2015", "12/17/2015", "12/15/2015", "12/14/2015", "12/2/2015",
        "12/1/2015", "11/28/2015", "11/22/2015", "11/20/2015", "11/19/2015",
        "11/12/2015", "11/11/2015", "11/10/2015"
    ]

    # x is a list of (list of counts); each position in an inner list
    # stands for one entry of `files` (absent locations count as 0).
    x = [[data[day].get(location, 0) for location in files] for day in data]
    # 1 marks a day found in setOfAllPrecipitationDays, 0 any other day.
    y = np.array([1 if day in setOfAllPrecipitationDays else 0
                  for day in data])

    # Each classifier is fitted on the full data first; afterwards all of
    # them are cross-validated, in the same order.
    labelled_estimators = [
        ("Neighbors:", KNeighborsClassifier()),
        ("SGD:", linear_model.SGDClassifier()),
        ("SVM:", SVC()),
        ("NB:", BernoulliNB()),
        ("Tree:", tree.DecisionTreeClassifier()),
    ]
    for _, estimator in labelled_estimators:
        estimator.fit(x, y)
    foldScores = [cross_validation.cross_val_score(estimator, x, y, cv=5)
                  for _, estimator in labelled_estimators]

    for (heading, _), scores in zip(labelled_estimators, foldScores):
        print(heading)
        print(np.mean(scores))
import numpy as np
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import svm

# Ready-to-fit estimators keyed by the name used to select them.
MODELS = {
    "SVC": svm.SVC(kernel="linear"),
    # Exactly 5 epochs with the tolerance-based stopping criterion switched
    # off. tol=None is the supported way to disable it; the former
    # tol=-np.infty relied on an alias that NumPy 2.0 removed and on a
    # negative tolerance that current scikit-learn rejects.
    "SGDClassifier": linear_model.SGDClassifier(max_iter=5,
                                                tol=None,
                                                random_state=42),
}
#!/usr/bin/env python
# uses GridSearchCV to optimize SVM
import numpy as np
import sklearn.linear_model as lm
import sklearn.grid_search as gs
import lib.loader as ld
import sklearn.feature_extraction.text as tfidf

# training data, re-weighted with TF-IDF
trainx, trainy = ld.loadtrain('data/trainingdata.txt')
trainx2 = tfidf.TfidfTransformer().fit_transform(trainx)

# Two searches over alpha = 10**e with e stepping by 0.2: first over
# [-5, -2), then over [-3, 0). Both try the hinge and the log loss with
# 5-fold cross-validation.
for low_exponent, high_exponent in ((-5, -2), (-3, 0)):
    parameters = {
        'alpha': [10**i for i in np.arange(low_exponent, high_exponent, 0.2)],
        'loss': ['hinge', 'log'],
    }
    mdl = lm.SGDClassifier()
    clf = gs.GridSearchCV(mdl, parameters, n_jobs=-1, cv=5)
    clf.fit(trainx2.toarray(), trainy)

    # print results
    print(clf.best_score_)  # best score
    print(clf.best_params_)  # best params
# y_XXX_all_labels stores the label rank results of each review X_train, X_test, y_train_all_labels, y_test_all_labels = train_test_split( X, y_array, test_size=0.2, random_state=42) #now we only take the first label(real label) to do classification y_train = np.array(y_train_all_labels)[:, 0].tolist() y_test = np.array(y_test_all_labels)[:, 0].tolist() #---------------------------Start Training------------------------------------ if (classifier_type == 'svm'): # classifier = OneVsRestClassifier(svm.SVC(kernel='linear', C=1, probability=True, random_state=0)) classifier = OneVsRestClassifier( linear_model.SGDClassifier(max_iter=500, tol=1e-3, random_state=21, warm_start=warm_start_set)) if (classifier_type == 'mlp'): classifier = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=500, alpha=0.0001, solver='adam', verbose=0, random_state=21) # warm_start=warm_start_set) sys.exit(0) print("Strat training...") classifier.fit(X_train, y_train) #sys.exit(0)
def main():
    """Grid-search SGDClassifier settings for predicting the age class from likes.

    Reads the pre-merged data set from 'merged.csv', converts the 'age'
    column with ``convert_age_to_class`` and writes the result to
    'age_classified.csv'. It then splits the data 80/20, turns the 'likes'
    text into bag-of-words counts and runs a grid search over ``max_iter``
    and ``shuffle`` for ``linear_model.SGDClassifier``, printing
    ``cv_results_``.

    The previous revision carried the experiments that led here (building
    'merged.csv' from the raw relation/profile CSVs, spot checks of several
    classifiers, a voting ensemble) as commented-out code; that dead code
    has been removed.
    """
    merge_df = pd.read_csv('merged.csv', sep=',')

    # 4. Evaluate Algorithms ##################################################
    # a) Split-out validation dataset
    X = merge_df['likes']

    # Target: the age column mapped to classes. The previous code stored this
    # in `y_age` but then used an unassigned `y`, which raised NameError.
    # NOTE(review): merge_df['gender'] was also read but never used; switch
    # the column here if gender is the intended target.
    y = merge_df['age'].apply(convert_age_to_class)
    y.to_csv("age_classified.csv", sep=',')

    # general algorithm
    valida_size = 0.20
    seed = 7
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, test_size=valida_size, random_state=seed)

    # b) Test options and evaluation metric
    # The vocabulary is learned from the training split only.
    count_vect1 = CountVectorizer()
    X_train = count_vect1.fit_transform(X_train)  # sparse matrix
    # Reuse the fitted vectorizer so the validation matrix has the same columns.
    # It is prepared for evaluation, but nothing below consumes it yet.
    X_validation = count_vect1.transform(X_validation)

    # 'learning_rate': ('optimal', 'constant', 'invscaling') was left out of
    # the grid by the original author.
    parameters = {'max_iter': (1, 5, 10, 20), 'shuffle': (True, False)}
    sgd = linear_model.SGDClassifier()
    clf = GridSearchCV(sgd, parameters)
    clf.fit(X_train, y_train)
    print(clf.cv_results_)

    print("Done")
# Clean every text: keep letters only, lower-case, and drop every token that
# appears in `content` (presumably the stop-word collection -- confirm).
# Rows are collected in a list and the frame is built once; the previous
# row-by-row DataFrame.append was quadratic and no longer exists in pandas 2.
rows = []
for i in range(a):
    letters_only = re.sub("[^a-zA-Z]", " ", df["text"][i])
    kept_words = [w for w in letters_only.lower().split() if w not in content]
    rows.append([i, " ".join(kept_words), df["polarity"][i]])
df1 = pd.DataFrame(rows, columns=['row_number', 'text', 'polarity'])

# Cleaned documents, in row order, as input for the vectorizers.
voc = df1["text"].tolist()

# Unigram counts -> linear model trained with hinge loss and L1 penalty.
vectorizer1 = CountVectorizer(max_features=5000)
SGD1 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
unigram = vectorizer1.fit_transform(voc)
unigram_model = SGD1.fit(unigram.toarray(), df1['polarity'])

# Unigram + bigram counts.
vectorizer2 = CountVectorizer(ngram_range=(1, 2), max_features=5000)
SGD2 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
bigram = vectorizer2.fit_transform(voc)
bigram_model = SGD2.fit(bigram.toarray(), df1['polarity'])

# TF-IDF re-weighting of the unigram counts.
tfidf1 = TfidfTransformer(smooth_idf=False)
SGD3 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
unigram_tfidf = tfidf1.fit_transform(unigram.toarray())
unigramtfidf_model = SGD3.fit(unigram_tfidf.toarray(), df1['polarity'])

# Transformer and classifier for the bigram TF-IDF variant.
tfidf2 = TfidfTransformer(smooth_idf=False)
SGD4 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
# Machine-learning algorithms to compare, each with its default settings
# unless an argument is given explicitly.
MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian processes
    gaussian_process.GaussianProcessClassifier(),

    # Generalised linear models
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Support vector machines
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
]
L = mat['L'][0][0]

# The first L columns of D are taken as labels, the remaining ones as features.
y = D[:, :L]
X = D[:, L:]
print(X.shape)
print(y.shape)

n_instances, n_features = X.shape[:2]
n_labels = y.shape[1]

# One binary SGD classifier per label, each initialised on the first instance.
classifiers = []
for j in range(n_labels):
    classifier = linear_model.SGDClassifier(loss='hinge', tol=1e-3, max_iter=1)
    first_sample = [X[0, :]]
    first_target = [y[0, j]]
    classifier.partial_fit(first_sample, first_target, classes=[0, 1])
    classifiers.append(classifier)

# Result buffers cover every instance except the first one used above.
result_shape = (n_instances - 1, n_labels)
predictions = np.zeros(result_shape)
probabilities = np.zeros(result_shape)
truth = y[1:, :]
# truth = np.array(y[1:, :].todense())

# Initialize adwin detector
adwin = AdWin()

# Start online learning
for i in range(1, n_instances):
    cur_probs = np.zeros(n_labels)
regression(light_reg.SAGARegressor(random_state=RANDOM_SEED)), regression(light_reg.SAGRegressor(random_state=RANDOM_SEED)), regression(light_reg.SDCARegressor(random_state=RANDOM_SEED)), # Sklearn Linear Classifiers classification( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification( linear_model.PassiveAggressiveClassifier( random_state=RANDOM_SEED)), classification(linear_model.Perceptron(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifierCV()), classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification_binary( linear_model.PassiveAggressiveClassifier( random_state=RANDOM_SEED)), classification_binary( linear_model.Perceptron(random_state=RANDOM_SEED)), classification_binary( linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification_binary(linear_model.RidgeClassifierCV()), classification_binary( linear_model.SGDClassifier(random_state=RANDOM_SEED)),
def generate_model(custid, last_post_time):
    """Update (or create) the stored rating model of one customer.

    Fetches the customer's rating documents whose ``seen_unix_time`` is
    greater than ``last_post_time`` and turns each one into a text sample
    (mapped category followed by its keywords). The samples are hashed into
    features and fed to the model stored at ``../models/<custid>``: first an
    incremental ``partial_fit``; if that fails for any reason (for example
    no model has been stored yet), a fresh logistic-regression SGD model is
    fitted, provided the ratings contain more than one class. Finally the
    customer's document in the active-users collection is marked as updated.

    Args:
        custid: Customer id; used in the Mongo queries and as model file name.
        last_post_time: Only documents seen after this time are used.
    """
    docs = mdb[user_rating_collection].find(
        {
            "custid": custid,
            "seen_unix_time": {'$gt': last_post_time},
        },
        {"category": 1, "keywords": 1, "rating": 1, "seen_unix_time": 1})

    X = []
    y = []
    new_last_post_time = 0
    for doc in docs:
        s = ''
        if "category" in doc and doc["category"]:
            category = doc["category"]
            # Use the mapped category name when there is one, else the raw value.
            s = s + cat[category] if category in cat else category
        if "keywords" in doc and doc["keywords"]:
            s = s + " " + " ".join(doc["keywords"])
        X.append(s)
        y.append(doc["rating"])
        # Ends up holding the time of the last document the cursor returned.
        new_last_post_time = doc['seen_unix_time']

    if X:
        model_path = "../models/" + str(custid)
        # Only transform() is called on the hashing vectorizer, so it is
        # simply re-created on every run instead of being stored.
        vectorizer = HashingVectorizer()
        vect_X = vectorizer.transform(X)
        try:
            model = joblib.load(model_path)
            model.partial_fit(vect_X, y)
            joblib.dump(model, model_path)
            print(str(custid) + " partial_fit")
        except Exception:
            # Deliberate best effort: any failure to load or update the stored
            # model falls back to a fresh fit. `Exception` (rather than a bare
            # except) keeps KeyboardInterrupt/SystemExit from being swallowed.
            if len(set(y)) > 1:
                print(str(custid) + " normal fit")
                # `n_iter` was removed from SGDClassifier; max_iter with
                # tol=None runs the same fixed 500 epochs. The logistic loss
                # is spelled 'log_loss' from scikit-learn 1.1 on ('log' was
                # removed in 1.3).
                model = linear_model.SGDClassifier(loss='log_loss',
                                                   penalty="l2",
                                                   max_iter=500,
                                                   tol=None,
                                                   n_jobs=-1)
                model.fit(vect_X, y)
                joblib.dump(model, model_path)
            else:
                print(
                    str(custid) +
                    " The number of classes has to be greater than one; got 1 class"
                )
        # str() on both values: concatenating the numeric timestamp (and a
        # numeric custid) directly to a string raised TypeError.
        print(str(custid) + " last post time" + str(new_last_post_time))
        mdb[active_users_collection].update_one({"_id": custid}, {
            '$set': {
                'updated': 1,
                'up_time': time.time(),
                'last_post_time': new_last_post_time
            }
        })
    else:
        print(str(custid) + " don't have atleast 1 flips")
        print(str(custid) + " last post time" + str(new_last_post_time))
        # No new documents: mark the user as processed but leave the stored
        # last_post_time untouched.
        mdb[active_users_collection].update_one(
            {"_id": custid},
            {'$set': {
                'updated': 1,
                'up_time': time.time()
            }})
inplace=False, kind='quicksort', na_position='last') final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator] preprocessed_reviews = [] for sentence in final['Text'].values: preprocessed_reviews.append(clean_text(sentence)) count_vect = CountVectorizer() count_vect.fit(preprocessed_reviews) joblib.dump(count_vect, 'count_vect.pkl') X = count_vect.transform(preprocessed_reviews) print(X.shape) Y = final['Score'].values clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, eta0=0.1, alpha=0.001) clf.fit(X, Y) joblib.dump(clf, 'model.pkl') print( predict( 'Have been having this since years. Much better option than Bru.Nescafe still managing to do well in market with ' 'all the competitors breathing down it\'s neck. Good one!')) print( predict( 'The Magic behind that massal I found is nothing but Artificial Food colour. Dear ITC , please provide Natural and safe food to your beloved Indians. Even though the food colouring is approved but try to avoid such artificial food even in small traces and make the young Indian more stronger.' ))
stopword=True, more_stopwords=None, spellcheck=False, stemming=True, remove_numbers=True, deasciify=True, remove_punkt=True, lowercase=True, wordngramrange=(1, 2), charngramrange=(2, 2), nmaxfeature=None, norm="l2", use_idf=True, classifier=sklinear.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42), ) def tr_sentiment_analysis(lang, weights, stopword, more_stopwords, spellcheck, stemming, remove_numbers, deasciify, remove_punkt, lowercase, wordngramrange, charngramrange, nmaxfeature, norm, use_idf, classifier, train_data_folder, train_data_fname, text_col, cat_col, csvsep, shuffle_dataset, cross_val_performance, modelfolder, modelname): conf_sentiment = prepconfig.FeatureChoice( lang, weights, stopword, more_stopwords, spellcheck, stemming,