def main():
    """Preprocess raw text data with Spark and persist the result as Parquet.

    Command-line arguments:
        sys.argv[1]: output type -- "rf" (random forest); the header comment
                     also mentions logistic regression, but only "rf" is
                     handled here.
        sys.argv[2]: path to the raw input file.
        sys.argv[3]: output path prefix for the Parquet file.
    """
    output_type = sys.argv[1]
    raw_file_path = sys.argv[2]
    path_to_output = sys.argv[3]

    # Read the raw file with a minimum of 32 partitions and normalise each
    # line to UTF-8 bytes before tab replacement.
    raw_input_rdd = sc.textFile(
        raw_file_path, minPartitions=32).map(lambda line: line.encode("utf-8"))
    process_data = raw_input_rdd.map(replacetab)
    df_for_pp = create_df(process_data)

    pp = PreProcess(df_for_pp)
    preprocessed_data = pp.preprocess_data()

    if output_type == "rf":
        data = prep_rf(preprocessed_data)
    else:
        # Bug fix: the original left `data` as None for any other output
        # type and then crashed on `data.persist(...)` with an opaque
        # AttributeError. Fail fast with a clear message instead.
        raise ValueError(
            "Unsupported output type: %r (expected 'rf')" % output_type)

    # Cache (memory + disk, deserialized, replication 1) before writing.
    data.persist(StorageLevel(True, True, False, False, 1))
    data.write.parquet(
        path_to_output + "final_" + output_type + "_data.parquet")
def update_recommendations(self):
    """Recompute the per-metric recommendation lists from the similarity matrix.

    For each metric, both frames are filtered to that metric's columns, the
    euclidean similarity matrix is rebuilt, and the indices of the
    `num_matches` highest-scoring entries for the seed row are stored.

    Returns:
        dict: metric name -> list of the `num_matches` indices with the
        largest similarity scores in row 0 of `self.sim_mat`, ordered by
        ascending score (np.argsort order).
    """
    # Only start from an empty dict before the first pair has been served;
    # afterwards existing metric entries are overwritten in place.
    if self.pairs_served < 1:
        self.recommendations = {}
    for metric in self.metrics:
        # Narrow both frames to the columns relevant to this metric before
        # computing similarity.
        self.df1 = PreProcess(self.df1).filter_df(metric)
        self.df2 = PreProcess(self.df2).filter_df(metric)
        self.update_similarity_matrix('euclidean')
        # reset the dfs to their original version for the next iteration
        # (note: source attributes are spelled `orginal_*` -- defined
        # elsewhere with that spelling, so it must be kept here)
        self.df1 = self.orginal_df1
        self.df2 = self.orginal_df2
        # Indices of the num_matches largest similarity scores for the
        # seed row (row 0 of the similarity matrix).
        self.recommendations[metric] = np.argsort(
            self.sim_mat[0])[-(self.num_matches):].tolist()
    return self.recommendations
# Vocabulary-size sweep for a Multinomial Naive Bayes text classifier:
# re-vectorize the corpus with an increasing max_features cap and fit an
# NB model at each step.
from pre_processing import PreProcess
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn import tree
from sklearn.metrics import accuracy_score
import math
import matplotlib.pyplot as plt  # NOTE(review): duplicate pyplot import
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()

max_features = 1000
feat_list = []  # vocabulary sizes tried (presumably filled later in the loop)
acc_list = []   # accuracy per vocabulary size (presumably filled later)

while (max_features < 41000):
    # Bag-of-words counts limited to the current vocabulary size,
    # English stop words removed.
    count_vect = CountVectorizer(stop_words='english',
                                 max_features=max_features)
    X_train_fit = count_vect.fit(preprocess.training_data)
    X_train_counts = X_train_fit.transform(preprocess.training_data)
    # TF-IDF with sublinear term frequency and L2 normalisation, fitted
    # on the training counts only.
    tfIdfFit = TfidfTransformer(use_idf=True, norm='l2',
                                sublinear_tf=True).fit(X_train_counts)
    preprocess.traintfIdf = tfIdfFit.transform(X_train_counts)
    # Transform the test set with the vocabulary/idf learned on training.
    X_test_counts = X_train_fit.transform(preprocess.test_data)
    preprocess.testtfIdf = tfIdfFit.transform(X_test_counts)
    nb_clf = MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)
    nb_clf.fit(preprocess.traintfIdf, preprocess.train_target)
    # NOTE(review): the loop body appears to continue beyond this chunk --
    # max_features is presumably incremented there; as shown here the loop
    # would never terminate. Confirm against the full file.
# NOTE(review): this chunk begins mid-function -- everything up to the
# `return` below belongs to an enclosing `def` whose header is outside
# this view.
    print("-------------------------------------\n")
    # Evaluate a 5-nearest-neighbour classifier on the held-out split.
    KNN_5 = NearestNeighbors(train, 5)
    KNN_5.train(train)
    res_k5 = KNN_5.test(test)
    return res_k5


if __name__ == '__main__':
    start_time = time.process_time()
    print("PRE-PROCESSING DATA...")
    print("-------------------------------------\n")
    pre_process = PreProcess()
    folder = 'data/'
    data = pre_process.pre_process(folder)
    # Train fractions to evaluate; the remainder of each is the test split.
    splits = [.9, .8, .7, .6]
    '''
    The following lists will contain lists, each with a result, in the format:
    [total, correct, accuracy, precision, recall, f1_score]
    '''
    nb_results = []
    knn1_results = []
    knn5_results = []
    for split in splits:
        print("----------------------")
        print("|| SPLIT = {}/{} ||".format(split, round(1 - split, 1)))
        # NOTE(review): the loop body continues beyond this chunk.
# AdaBoost over boosted depth-2 decision trees on the TF-IDF features:
# report 3-fold cross-validated training accuracy, then fit on the full
# training set and compute train/test predictions.
#
# Bug fix: `sklearn.grid_search` and `sklearn.cross_validation` were
# removed in scikit-learn 0.20; both now live in
# `sklearn.model_selection` (which this project already uses elsewhere).
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from pre_processing import PreProcess
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()
preprocess.getTfIdf()

# 500 boosted depth-2 trees, learning rate 1.
ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                            n_estimators=500,
                            learning_rate=1)

# 3-fold cross-validated accuracy on the training TF-IDF matrix.
scores = cross_val_score(ab_clf, preprocess.traintfIdf,
                         preprocess.train_target, cv=3)
print("the cross validated accuracy on training is " + str(scores))
print(
    "the cross validated accuracy(standard deviation) on training is: %0.4f (+/- %0.4f)"
    % (scores.mean(), scores.std() * 2))

ab_clf.fit(preprocess.traintfIdf, preprocess.train_target)
# finding the training and test predictions
train_pred_ab = ab_clf.predict(preprocess.traintfIdf)
test_pred_ab = ab_clf.predict(preprocess.testtfIdf)
ab_train_accuracy = metrics.accuracy_score(preprocess.train_target,
                                           train_pred_ab)
# Optional analysis kept for reference: Venn diagram of the examples
# misclassified by SVM / Softmax / Naive Bayes.
# ftest = open("data/svm_wrong.dat", 'r')
# svm = np.loadtxt(ftest, delimiter=',')
#
# ftest = open("data/softmax_wrong.dat", 'r')
# sm = np.loadtxt(ftest, delimiter=',')
#
#
# ftest = open("data/nb_wrong.dat", 'r')
# nb = np.loadtxt(ftest, delimiter=',')
#
# svm = set(svm)
# sm = set(sm)
# nb = set(nb)
# venn3([svm, sm, nb], ('SVM', 'Softmax', 'Naive Bayes'))
# plt.show()

preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()
preprocess.getTfIdf()
preprocess.polarity_POS_features()

# Project the (densified) TF-IDF training matrix onto its first two
# principal components for a 2-D class-separability plot.
pca = PCA(n_components=2)
X_r = pca.fit(preprocess.traintfIdf.toarray()).transform(
    preprocess.traintfIdf.toarray())
# Bug fix: the original used a Python 2 print statement here, which is a
# SyntaxError on Python 3 (the rest of the project targets Python 3).
print("The number of features " + str(pca.n_components_))

target_names = ['Bad', 'Neutral', 'Good']
plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2
# One scatter series per class, coloured and labelled for the legend.
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_r[preprocess.train_target == i, 0],
                X_r[preprocess.train_target == i, 1],
                color=color, alpha=.8, lw=lw, label=target_name)
# Build normalized Seattle / San Francisco listing frames and seed a
# preference learner.
import numpy as np
import pandas as pd  # bug fix: `pd` was used below but never imported
from sklearn.metrics.pairwise import pairwise_distances
from pre_processing import PreProcess
from learn_preferences import LearnPreferences

df_sea = pd.read_csv('data/metrics/seattle_test.csv', index_col=0)
# Bug fix: path was 'data/metricssanfran_test.csv' -- the directory
# separator was missing (compare the seattle path above).
df_sf = pd.read_csv('data/metrics/sanfran_test.csv', index_col=0)

# Human-readable reference columns, kept aside before preprocessing so
# recommendations can be displayed with their original values.
ref_cols = ['city', 'state', 'street', 'finishedsqft', 'bedrooms',
            'bathrooms', 'trans_score', 'walkscore_score']
sea_ref = df_sea[ref_cols]
sf_ref = df_sf[ref_cols]

# create the PreProcess objects
prep_sf = PreProcess(df_sf)
prep_sea = PreProcess(df_sea)

# drop the unnecessary columns, clean up NA's and normalize for use in
# the recommender
prep_sf.drop_columns()
sf = prep_sf.preprocess_df()
sf = prep_sf.normalize_columns()
prep_sea.drop_columns()
sea = prep_sea.preprocess_df()
sea = prep_sea.normalize_columns()

# specify the metrics to use for the similarity matrix
metrics = ['walk_distance', 'space_distance']

# init a LearnPreferences object with seed house of SanFran index 3, and
# use 50 listings
lp = LearnPreferences(sf, sea, sf_ref, sea_ref, metrics, 3, 50)
# Train a linear SVM on image features extracted from two source
# directories and predict on a held-out 20% split.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from pre_processing import PreProcess

# define img-processing parameters
# (low/high thresholds and a max length -- presumably edge-detection
# parameters; confirm semantics against PreProcess.process_image)
low_t = 10
high_t = 100
max_length = 5

SRC_DIR = ''
data_process = PreProcess(SRC_DIR)
input_data_1 = data_process.process_image(low_t, high_t, max_length)

SRC_DIR = ''
data_process = PreProcess(SRC_DIR)
input_data_2 = data_process.process_image(low_t, high_t, max_length)

# Bug fix: DataFrame.append returns a NEW frame, and the original
# discarded that return value, silently dropping the second batch.
# pd.concat also survives pandas >= 2.0, where .append was removed.
imgdata = pd.concat([input_data_1, input_data_2])

x = imgdata.drop('Class', axis=1)
y = imgdata['Class']
# Bug fix: the original passed undefined name `X`; features are in `x`.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
# Bug fix: the original called undefined name `svclassifier`.
y_pred = svm_classifier.predict(X_test)