def merge_data(): # print train_data_dir + "/train_pair*" train_pairs = glob.glob(train_data_dir + "/*train_pairs*") print list(zip(train_pairs, list(xrange(0, 4)))) for i, train_pair in enumerate(train_pairs): dir_name = ntpath.dirname(train_pair) pref = ntpath.basename(train_pair).split("train_pairs")[0] suffix = ntpath.basename(train_pair).split("train_pairs")[-1] # print pref, suffix info = dir_name + "/" + pref + "train_publicinfo" + suffix target = dir_name + "/" + pref + "train_target" + suffix print info, pref, suffix X = data_io.read_train_pairs(train_pair) y = data_io.read_train_target(target) inf_data = data_io.read_train_info(info) X, y, inf_data = process_indices(X, y, inf_data, i) if "X_merged" not in locals(): X_merged = X y_merged = y info_merged = inf_data else: print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape X_merged = X_merged.append(X) y_merged = y_merged.append(y) info_merged = info_merged.append(inf_data) print "Shape thus far", X_merged.shape, y_merged.shape return X_merged, y_merged, info_merged
def merge_data(): #print train_data_dir + "/train_pair*" train_pairs = glob.glob(train_data_dir + "/*train_pairs*") print list(zip(train_pairs, list(xrange(0, 4)))) for i, train_pair in enumerate(train_pairs): dir_name = ntpath.dirname(train_pair) pref = ntpath.basename(train_pair).split("train_pairs")[0] suffix = ntpath.basename(train_pair).split("train_pairs")[-1] #print pref, suffix info = dir_name + "/" + pref + "train_publicinfo" + suffix target = dir_name + "/" + pref + "train_target" + suffix print info, pref, suffix X = data_io.read_train_pairs(train_pair) y = data_io.read_train_target(target) inf_data = data_io.read_train_info(info) X, y, inf_data = process_indices(X, y, inf_data, i) if 'X_merged' not in locals(): X_merged = X y_merged = y info_merged = inf_data else: print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape X_merged = X_merged.append(X) y_merged = y_merged.append(y) info_merged = info_merged.append(inf_data) print "Shape thus far", X_merged.shape, y_merged.shape return X_merged, y_merged, info_merged
def extract_train_features():
    """Run the project's feature extractor over the full training set and
    persist the resulting feature matrix.

    Returns:
        pandas DataFrame of extracted features, indexed like the raw pairs
        and with one named column per feature.
    """
    start = time.time()
    features = feature_extractor()
    # Each entry of features.features is (name, ...); collect the names.
    header = [feat[0] for feat in features.features]
    print("Reading in the training data")
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    print("Extracting features: " + str(X.shape))
    extracted = features.fit_transform(X, y, type_map=data_io.read_train_info())
    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")
    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)
    return X
def extract_train_features():
    """Run the project's feature extractor over the full training set and
    persist the resulting feature matrix.

    Returns:
        pandas DataFrame of extracted features, indexed like the raw pairs
        and with one named column per feature.
    """
    start = time.time()
    features = feature_extractor()
    # Collect the feature names: each entry of features.features is
    # (name, ...).
    header = []
    for h in features.features:
        header.append(h[0])
    print("Reading in the training data")
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    # Debug aid for quick runs on a tiny slice:
    #X = X.iloc[1:7]
    #y = y.iloc[1:7]
    print("Extracting features: " + str(X.shape))
    extracted = features.fit_transform(X, y, type_map=data_io.read_train_info())
    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")
    print("Saving features")
    # Rebuild a labelled DataFrame from the raw extracted array, keeping the
    # original sample index so rows stay aligned with the targets.
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)
    return X
def selectAllMixed(X, y):
    """Restrict X and y to the "mixed" pairs: rows where at least one of the
    two variables is non-Numerical (Categorical or Binary).

    Parameters:
        X, y -- pandas objects indexed by the same sample labels as the
                training info file.

    Returns:
        (X, y) filtered to the mixed-type rows, with y realigned to X.
    """
    info = data_io.read_train_info()
    # Keep a pair unless BOTH sides are Numerical.
    info = info[(info['A type'] != "Numerical") | (info['B type'] != "Numerical")]
    # .loc replaces the deprecated (and since removed) .ix indexer; the
    # lookups here are purely label-based (info.index carries X's labels),
    # so behavior is unchanged.
    X = X.loc[info.index]
    y = y.loc[X.index]
    return X, y
def getTrainingDataset(self): print "Reading in the training data" train = data_io.read_train_pairs() print "Reading the information about the training data" train2 = data_io.read_train_info() train["A type"] = train2["A type"] train["B type"] = train2["B type"] return train
def getDataset(self):
    """Return the train or validation pairs (chosen by self.getTrain) with
    the 'A type' and 'B type' info columns merged in."""
    if self.getTrain:
        pairs = data_io.read_train_pairs()
        info = data_io.read_train_info()
    else:
        pairs = data_io.read_valid_pairs()
        info = data_io.read_valid_info()
    pairs["A type"] = info["A type"]
    pairs["B type"] = info["B type"]
    return pairs
def main(): t1 = time() print("Reading in the training data") train = data_io.read_train_pairs() train_info = data_io.read_train_info() train = combine_types(train, train_info) #make function later train = get_types(train) target = data_io.read_train_target() print train print("Extracting features and training model") classifier = get_pipeline() classifier.fit(train, target.Target) features = [x[0] for x in classifier.steps[0][1].features ] csv_fea = csv.writer(open('features.csv','wb')) imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True) for fea in imp: print fea[0], fea[1] csv_fea.writerow([fea[0],fea[1]]) oob_score = classifier.steps[1][1].oob_score_ print "oob score:", oob_score logger = open("run_log.txt","a") if len(oob_score) == 1: logger.write("\n" +str( oob_score) + "\n") else:logger.write("\n" + str(oob_score[0]) + "\n") print("Saving the classifier") data_io.save_model(classifier) print("Predicting the train set") train_predict = classifier.predict(train) trian_predict = train_predict.flatten() data_io.write_submission(train_predict, 'train_set', run = 'train') t2 = time() t_diff = t2 - t1 print "Time Taken (min):", round(t_diff/60,1)
def main(): X = data_io.read_train_pairs() y = data_io.read_train_target() info = data_io.read_train_info() X, y, info = exploit_symmetries(X, y, info) print X.shape, y.shape print "-1", len(y[y['Target'] == -1]) print "0", len(y[y['Target'] == 0]) print "1", len(y[y['Target'] == 1]) # X = X.iloc[:10] # y = y.iloc[:10] # info = info.iloc[:10] data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv") data_io.save(y, "./Competition/CEfinal_train_target-sym.csv") data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv") print "finished"
def main():
    """Augment the CE training set with its symmetric counterpart via
    exploit_symmetries() and save the resulting pairs/target/publicinfo
    files under ./Competition/."""
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()
    X,y, info = exploit_symmetries(X,y, info)
    print X.shape, y.shape
    # Report the class balance after augmentation.
    print "-1", len(y[y['Target']==-1])
    print "0", len(y[y['Target']==0])
    print "1", len(y[y['Target']==1])
    # Debug aid for quick runs on a tiny slice:
    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]
    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
probb = np.bincount(b.astype('int32')).astype('float64') / len(b) proba_nz = proba[np.nonzero(proba)] probb_nz = probb[np.nonzero(probb)] jointp = np.outer(proba_nz, probb_nz) hpos = np.sum(np.log(jointp) * jointp) return -hpos if __name__ == '__main__': print 'Reading in {} data...'.format(DATA) if DATA == 'train': info = data_io.read_train_info() train = data_io.read_train_pairs() elif DATA == 'valid': info = data_io.read_valid_info() train = data_io.read_valid_pairs() else: raise ValueError print 'Saving coded info matrix...' codes = np.zeros(info.values.shape) lookup = {'Numerical': 1, 'Categorical': 2, 'Binary': 3} for i, t in enumerate(info.values): a, b = t codes[i, :] = [lookup[a], lookup[b]] savemat('matlab/{}info.mat'.format(DATA), {'codes': codes},
def get_pipeline():
    """Build the learning pipeline: project feature extraction followed by a
    random-forest regressor.

    NOTE(review): compute_importances was deprecated/removed in later
    scikit-learn releases; this code targets an old sklearn version --
    confirm before upgrading.
    """
    features = feature_extractor()
    steps = [("extract_features", features),
             ("classify", RandomForestRegressor(n_estimators=50,
                                                verbose=2,
                                                n_jobs=2,
                                                min_samples_split=10,
                                                random_state=1,
                                                compute_importances=True))]
    return Pipeline(steps)


if __name__=="__main__":
    # Load pairs/target/info and join the info columns (plus a running
    # integer index) onto the raw pairs frame.
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    # 4050 rows assumed -- presumably the CE training-set size; TODO confirm.
    info['iindex'] = range(4050)
    train = train_raw.join(info)
    classifier = get_pipeline()
    ### FOLDS CODE
    # folds = cval.KFold(len(train), n_folds=2, indices=False)
    #
    # results = []
    # for i, fold in enumerate(folds):
    #     print("Extracting features and training model for fold " + str(i))
    #     traincv, testcv = fold
    #     classifier.fit(train[traincv], target[traincv])
    #     results.append(classifier.score(train[testcv], target[testcv]))
def main():
    """Train on the CE data plus the three SUP supplementary sets, score on
    a held-out split, and persist the features, model, and per-feature
    importances."""
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info) #make function later
    train = get_types(train)
    target = data_io.read_train_target()
    # Append the three supplementary (SUP) datasets to the training frame.
    print "Reading SUP data..."
    for i in range(1,4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train = train.append(sup)
        target = target.append(sup_target)
    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    # Replace NaN/inf produced by feature extraction with finite numbers.
    orig_train = numpy.nan_to_num(orig_train)
    print("Train-test split")
    # Fixed random_state so the held-out split is reproducible.
    trainX, testX, trainY, testY = train_test_split(orig_train, target.Target, random_state = 1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)
    print("Saving features")
    data_io.save_features(orig_train)
    classifier.fit(trainX, trainY)
    print("Saving the classifier")
    data_io.save_model(classifier)
    testX = numpy.nan_to_num(testX)
    print "Score on held-out test data ->", classifier.score(testX, testY)
    # Earlier pipeline-based importance dump, kept for reference:
    #features = [x[0] for x in classifier.steps[0][1].features ]
    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])
    # Append one importance value per line to feature_importance.csv.
    feature_importrance = classifier.feature_importances_
    logger = open("feature_importance.csv","a")
    for fi in feature_importrance:
        logger.write(str(fi))
        logger.write("\n")
    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
def main():
    """Command-line training driver.

    Options:
        -n ROWS   limit to ROWS training rows (clamped to 1..4050)
        -c FOLDS  run FOLDS-fold cross-validation instead of training
        -m KEY    classifier key from the cf registry, or "all"
        -h        print usage and exit
    """
    global cf
    start = time.clock()
    numRows = None
    cv = False
    nfold = 10
    # Default classifier registry key; presumably "random forest" variant --
    # confirm against the cf module.
    clf_keys = ["rfg"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:c:m:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)
    # Clamp n into [minn, maxn].
    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            # 4050 assumed to be the full training-set size -- TODO confirm.
            numRows = clamp(int(a), 1, 4050)
        elif o == "-c":
            cv = True
            nfold = int(a)
        elif o == "-m":
            if a == "all":
                clf_keys = []
                for clf_key in cf.get_all_keys():
                    clf_keys.append(clf_key)
            elif cf.is_valid_key(a):
                clf_keys = [a]
            else:
                print "ERROR: wrong classifier name: " + a
        elif o == "-h":
            print 'options:'
            print "\t -n [number of rows]"
            print "\t -c [number of folds]"
            print "\t -m [classifier key | all]"
            sys.exit(0)
        else:
            print "try help: python train.py -h"
            sys.exit(2)
    print("Reading in the training data")
    train = data_io.read_train_pairs(numRows)
    trainInfo = data_io.read_train_info(numRows)
    # Merge the variable-type columns onto the pairs frame.
    train['A type'] = trainInfo['A type']
    train['B type'] = trainInfo['B type']
    target = data_io.read_train_target(numRows)
    if cv:
        # Cross-validate each requested classifier instead of saving a model.
        data = {}
        data['train'] = train
        data['target'] = target
        for clf_key in clf_keys:
            print "Initiating " + str(nfold) + " fold cross validation with classifier " + cf.get_classifier_name(clf_key)
            crossvalidate(data, nfold, clf_key)
    else:
        # Train and persist one model per requested classifier key.
        for clf_key in clf_keys:
            start_train = time.clock()
            print("Extracting features and training model")
            classifier = get_pipeline(clf_key)
            classifier.fit(train, target.Target)
            print("Saving the classifier")
            data_io.save_model(classifier, clf_key)
            end_train = time.clock()
            print 'time taken:', end_train - start_train, 'seconds'
    end = time.clock()
    print 'Execution time:', round(end - start, 2)
def main():
    """Train a category classifier on the CE data augmented with the three
    SUP supplementary sets and the old (previous-release) training set;
    save the extracted features and the fitted model."""
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info) #make function later
    train = get_types(train)
    target = data_io.read_train_target()
    # Append the three supplementary (SUP) datasets.
    print "Reading SUP data..."
    for i in range(1,4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)
    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()
    train = train.append(old_train)
    target = target.append(old_target)
    # End old train
    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    # Replace NaN/inf produced by feature extraction with finite numbers.
    orig_train = numpy.nan_to_num(orig_train)
    classifier = classify_catagory(orig_train, target.Target)
    # Alternative per-type classifier split, kept for reference:
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)
    print("Saving features")
    data_io.save_features(orig_train)
    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)
    #features = [x[0] for x in classifier.steps[0][1].features ]
    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])
    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
def main():
    """Train a category classifier on the CE data augmented with the three
    SUP supplementary sets and the old (previous-release) training set;
    save the extracted features and the fitted model."""
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info) #make function later
    train = get_types(train)
    target = data_io.read_train_target()
    # Append the three supplementary (SUP) datasets.
    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)
    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()
    train = train.append(old_train)
    target = target.append(old_target)
    # End old train
    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    # Replace NaN/inf produced by feature extraction with finite numbers.
    orig_train = numpy.nan_to_num(orig_train)
    classifier = classify_catagory(orig_train, target.Target)
    # Alternative per-type classifier split, kept for reference:
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)
    print("Saving features")
    data_io.save_features(orig_train)
    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)
    #features = [x[0] for x in classifier.steps[0][1].features ]
    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])
    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)