def main():
    # The file contains information about the features
    # Format -> Feature name:Values it can take (separated by commas)
    with open("feature.info") as f:
        data_info = f.readlines()

    # Create feature nodes
    features = feature_info(data_info)

    # Cross Validation
    print("Running Cross Validation For depths...")
    for depth in depths:
        cross_validation(depth, features)

    # Transform the data
    with open(f_train) as f:
        data = [line.rstrip() for line in f]
    data_train = featurization.featurize(data)

    with open(f_test) as f:
        tdata = [line.rstrip() for line in f]
    data_test = featurization.featurize(tdata)

    # Train an unbounded-depth tree and report train/test accuracy
    tree = build_tree(data_train, features, -1)
    print("Accuracy on Train ", test(tree, data_train, False))
    print("Accuracy on Test ", test(tree, data_test, False))

    # Label the evaluation set
    with open(f_eval) as f:
        tdata = [line.rstrip() for line in f]
    eval_data = featurization.featurize(tdata)
    test(tree, eval_data, True)
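# A hypothetical illustration of the feature-description file format mentioned
# above, one feature per line (the names and values are made up, not from the data):
#
#   Outlook:Sunny,Overcast,Rain
#   Humidity:High,Normal
#   Wind:Weak,Strong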
def cross_validation(depth, features):
    with open(f_train) as f:
        examples = f.readlines()

    # Split the training examples into five equal folds
    total = len(examples)
    fifth = int(total / 5)
    cv1 = examples[:fifth]
    cv2 = examples[fifth:2 * fifth]
    cv3 = examples[2 * fifth:3 * fifth]
    cv4 = examples[3 * fifth:4 * fifth]
    cv5 = examples[4 * fifth:]
    CROSS_VALIDATION_FILES = [cv1, cv2, cv3, cv4, cv5]

    accs = []
    for i in range(5):
        # Hold out fold i, train on the remaining four
        test_examples = CROSS_VALIDATION_FILES[i]
        train_examples_arr = [
            exs for exs in CROSS_VALIDATION_FILES if exs != test_examples
        ]
        train_examples = []
        for arr in train_examples_arr:
            train_examples.extend(arr)

        train_data = featurization.featurize(train_examples)
        test_data = featurization.featurize(test_examples)
        tree = build_tree(train_data, features, depth)
        accs.append(test(tree, test_data, False))

    print("Depth ", depth, "Avg. Accuracy ", np.mean(accs))
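# For comparison, a minimal sketch of the same 5-fold evaluation using
# scikit-learn's KFold for the index bookkeeping. It assumes the same
# featurization.featurize, build_tree, and test helpers used above; KFold and
# numpy are the only outside imports.
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_kfold(depth, features, examples):
    accs = []
    for train_idx, test_idx in KFold(n_splits=5).split(examples):
        train_data = featurization.featurize([examples[i] for i in train_idx])
        test_data = featurization.featurize([examples[i] for i in test_idx])
        tree = build_tree(train_data, features, depth)
        accs.append(test(tree, test_data, False))
    print("Depth ", depth, "Avg. Accuracy ", np.mean(accs))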
def cross_validation(depth, features):
    accs = []
    for i in range(4):
        # Fold i is the held-out file; the other three are training files
        indexs = [j for j in range(4)]
        indexs.remove(i)

        f_test = f_cross.replace(".", '0' + str(i) + '.')
        data_t = featurization.featurize(f_test)

        data = []
        for index in indexs:
            data += featurization.featurize(
                f_cross.replace(".", '0' + str(index) + '.'))

        accs.append(test(data, data_t, features, depth))

    print("Depth ", depth, "Avg. Accuracy ", np.mean(accs),
          "Std. Deviation ", np.std(accs))
def main():
    # The file contains information about the features
    # Format -> Feature name:Values it can take (separated by commas)
    with open("info.txt") as f:
        data_info = f.readlines()

    # Transform the data
    data_train = featurization.featurize(f_train)
    data_test = featurization.featurize(f_test)

    # Create feature nodes
    features = feature_info(data_info)

    print("Accuracy on Train ", test(data_train, data_train, features, -1))
    print("Accuracy on Test ", test(data_train, data_test, features, -1))

    for depth in depths:
        cross_validation(depth, features)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
        default=None,
        type=str,
        required=True,
        help="Path to dataframe",
    )
    parser.add_argument(
        "--glove_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained glove embeddings",
    )
    parser.add_argument(
        "--word_dir",
        default=None,
        type=str,
        required=True,
        help="Path to directory containing all special hand-crafted features",
    )
    parser.add_argument(
        "--featurization_type",
        default=0,
        type=int,
        required=False,
        help="Type of featurization 0-->Average Glove, 1-->TF-IDF Glove, 2-->USE",
    )
    parser.add_argument(
        "--run_implementation",
        default=0,
        type=int,
        required=True,
        help="run implementation number? (1/2/3)",
    )
    args = parser.parse_args()

    # Load the dataset and encode the labels
    final_df = pd.read_pickle(args.dataset)
    le = LabelEncoder()
    final_df.labels = le.fit_transform(final_df.labels)

    traindf, testdf, xTrain, xTest, yTrain, yTest = split_dataframe(
        final_df, 'Requirements', 'Requirements_clean', 'Requirements_clean')

    glove = Magnitude(args.glove_path)
    tfidf = TfidfVectorizer()
    tfidf.fit(xTrain)
    idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

    if args.run_implementation == 1:
        # Featurize with the requested embedding scheme
        if args.featurization_type == 1:
            train_features, test_features, feature_names = featurization.featurize(
                args, traindf, testdf, featurization.tfidf_glove, glove, idf_dict)
        elif args.featurization_type == 2:
            train_features, test_features, feature_names = featurization.featurize_USE(
                args, traindf, testdf, featurization.USE_Vec)
        else:
            train_features, test_features, feature_names = featurization.featurize(
                args, traindf, testdf, featurization.Glove_Vec, glove, None)

        y_train = np.array(traindf.labels.values)
        y_test = np.array(testdf.labels.values)

        # Tune, evaluate, and persist each classical model
        file_names = [('rf.sav', 'RF'), ('lr.sav', 'LR'), ('bdt.sav', 'BDT'),
                      ('svc.sav', 'SVC'), ('xgb.sav', 'XGB'), ('ann.sav', 'ANN'),
                      ('knn.sav', 'KNN')]
        for filename, name in file_names:
            est = tuning(name, train_features, y_train)
            print(name + ": ")
            print_model_metrics(y_test=y_test,
                                y_test_pred=np.array(est.predict(test_features)),
                                verbose=False, return_metrics=True)
            print("\n")
            pickle.dump(est, open(filename, 'wb'))

    if args.run_implementation == 2:
        xTrain_pad, xTest_pad, yTrain_enc, yTest_enc, embed_matrix, tok = preprocessing_for_lstm(
            xTrain, xTest, yTrain, yTest, glove, 20000, 100, 128)
        y_train = np.array(traindf.labels.values)
        y_test = np.array(testdf.labels.values)

        input_lstm = {
            'vocabulary_size': len(tok.word_index) + 1,
            'embed_size': 100,
            'embedding_matrix': embed_matrix,
            'hidden_dim': 16,
            'output_size': 5,
            'seq_length': xTrain_pad.shape[1]
        }
        model_lstm = LSTM_tf(**input_lstm)
        num_epochs = 5
        history = model_lstm.fit(xTrain_pad, yTrain_enc, epochs=num_epochs,
                                 validation_data=(xTest_pad, yTest_enc))
        print_model_metrics(y_test=np.array(yTest),
                            y_test_pred=np.array(model_lstm.predict_classes(xTest_pad)),
                            verbose=False, return_metrics=True)

        input_CNN = {
            'vocabulary_size': len(tok.word_index) + 1,
            'embed_size': 100,
            'embedding_matrix': None,
            'filters': 8,
            'output_size': 5,
            'pool_size': 3,
            'kernel_size': 3,
            'seq_length': xTrain_pad.shape[1]
        }
        model_cnn = CNN_tf(**input_CNN)
        history = model_cnn.fit(xTrain_pad, yTrain_enc, epochs=num_epochs)
        print(print_model_metrics(y_test=np.array(yTest),
                                  y_test_pred=np.array(model_cnn.predict_classes(xTest_pad)),
                                  verbose=False, return_metrics=True))
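# Example invocation (script name and file paths below are placeholders, not
# taken from the source; the flags match the argparse definitions above):
#
#   python main.py --dataset data/requirements.pkl \
#       --glove_path glove.6B.100d.magnitude \
#       --word_dir hand_crafted_features/ \
#       --featurization_type 1 \
#       --run_implementation 1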
def extract_features(all_inputs):
    """
    Yields (label, feature) tuples. Images labeled by directory name,
    featurized by featurization.py.
    """
    shuffle(all_inputs)
    for png in all_inputs:
        yield png.split('/')[-2], featurize(png)
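# A small usage sketch for extract_features. The glob pattern and directory
# layout ("images/<label>/<image>.png") are assumptions for illustration only.
from glob import glob

all_pngs = glob('images/*/*.png')
pairs = list(extract_features(all_pngs))   # [(label, feature_vector), ...]
labels = [label for label, _ in pairs]
vectors = [vec for _, vec in pairs]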
    return v + v2
'''

# Read the training CSV and build a column-name -> index map from its header
fp = open('../input/train.v02.csv')
h1 = fp.readline().replace("\"", "").rstrip().split(",")
header_1 = {x[0]: x[1] for x in zip(h1, range(len(h1)))}

train_raw = []
train = []
train_label = []
for line in csv.reader(fp):
    train_raw.append(line)
    my_dict = {}
    my_dict['month'] = line[header_1['Date']].split("-")[1]
    train.append(featurize(line, header_1, my_dict))
    train_label.append(int(line[header_1['WnvPresent']]))
fp.close()

# Same for the test CSV
fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}

test_raw = []
test = []
test_label = []
for line in csv.reader(fp):
    test_raw.append(line)
    my_dict = {}
    my_dict['month'] = line[header_2['Date']].split("-")[1]
    test.append(featurize(line, header_2, my_dict))
    test_label.append(int(line[header_2['WnvPresent']]))
    for line in f_matrix:
        if l[h[line[0]]] == line[1]:
            v[line[2]] = 1.0
    return v
'''

# Read the training CSV and build a column-name -> index map from its header
fp = open('../input/train.v02.csv')
h1 = fp.readline().replace("\"", "").rstrip().split(",")
header_1 = {x[0]: x[1] for x in zip(h1, range(len(h1)))}

train_raw = []
train = []
train_label = []
for line in csv.reader(fp):
    train_raw.append(line)
    train.append(featurize(line, header_1))
    train_label.append(int(line[header_1['WnvPresent']]))
fp.close()

# Same for the test CSV
fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}

test_raw = []
test = []
test_label = []
for line in csv.reader(fp):
    test_raw.append(line)
    test.append(featurize(line, header_2))
    test_label.append(int(line[header_2['WnvPresent']]))
fp.close()
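# A hedged reconstruction of the commented-out one-hot helper above: f_matrix
# is assumed to be a list of (column_name, value, output_index) triples, and
# the output vector length is assumed to be passed in. The name
# featurize_onehot and the size parameter are hypothetical; they do not appear
# in the original.
def featurize_onehot(l, h, f_matrix, size):
    v = [0.0] * size
    for line in f_matrix:
        if l[h[line[0]]] == line[1]:
            v[line[2]] = 1.0
    return v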
# Build a KD-tree over the training coordinates to find neighbors within a radius
tree = spatial.KDTree(coords)
for line in train_raw:
    v = tree.query_ball_point([
        float(line[header_1['Latitude']]) / 0.9335,
        float(line[header_1['Longitude']])
    ], 0.0058)
    if len(v) > 0:
        density = float(sum([train_label[x] for x in v])) / len(v)
    else:
        density = 0.0
    my_dict = {}
    my_dict['month'] = line[header_1['Date']].split("-")[1]
    train.append(featurize(line, header_1, my_dict) + [density])

fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}

test_raw = []
test = []
test_label = []
for line in csv.reader(fp):
    test_raw.append(line)
    v = tree.query_ball_point([
        float(line[header_1['Latitude']]) / 0.9335,
        float(line[header_1['Longitude']])
    ], 0.0058)
    if len(v) > 0:
fp.close()

# Construct the KD Tree to help us find neighbors within a radius
tree = spatial.KDTree(coords)
for line in train_raw:
    v = tree.query_ball_point([
        float(line[header_1['Latitude']]) / 0.9335,
        float(line[header_1['Longitude']])
    ], 0.0058)
    if len(v) > 0:
        # Density = fraction of positive (WnvPresent) neighbors within the radius
        density = float(sum([train_label[x] for x in v])) / len(v)
    else:
        density = 0.0
    train.append(featurize(line, header_1, {}) + [density])

fp = open('../input/test.v02.csv')
h2 = fp.readline().replace("\"", "").rstrip().split(",")
header_2 = {x[0]: x[1] for x in zip(h2, range(len(h2)))}

test_raw = []
test = []
test_label = []
for line in csv.reader(fp):
    test_raw.append(line)
    v = tree.query_ball_point([
        float(line[header_1['Latitude']]) / 0.9335,
        float(line[header_1['Longitude']])
    ], 0.0058)
    if len(v) > 0:
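# A self-contained toy illustration of the neighborhood-density feature above:
# query_ball_point returns the indices of stored points within the given
# radius, and density is the fraction of positive labels among them. The
# coordinates and labels here are made up; the 0.9335 latitude scaling and the
# 0.0058 radius mirror the constants used in the script.
from scipy import spatial

toy_coords = [[41.950 / 0.9335, -87.650],
              [41.953 / 0.9335, -87.652],
              [42.300 / 0.9335, -88.000]]
toy_labels = [1, 0, 0]
toy_tree = spatial.KDTree(toy_coords)

neighbors = toy_tree.query_ball_point([41.950 / 0.9335, -87.650], 0.0058)
toy_density = (float(sum(toy_labels[i] for i in neighbors)) / len(neighbors)
               if neighbors else 0.0)
print(neighbors, toy_density)   # e.g. [0, 1] and 0.5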