def main(args):
    """Train one model per label and save each one in the data directory.

    Reads the multitask liblinear data found in ``args[0]``, reshapes the
    feature matrix to ``(num_examples, 11, dimension // 11)`` and, for every
    label column of ``Y``, fits a model obtained from
    ``nn_models.get_cnn_model`` on that label's slice of the flattened
    outputs.  For label ``i`` the architecture is written to
    ``model_i.json`` and the weights to ``model_i.h5`` inside ``args[0]``.

    ``nb_epoch`` and ``batch_size`` are read from module scope.

    Args:
        args: Command-line arguments; ``args[0]`` is the data directory
            (e.g. ``data_testing/multitask_assertion/train_and_test``).

    Exits the process with status -1 when the directory argument is missing,
    and with status 0 after training.
    """
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    Y, X = ctk_io.read_multitask_liblinear(working_dir)
    stopper = nn_models.get_early_stopper()

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    # Floor division keeps the shape entry an int: with true division
    # (Python 3) ``dimension / 11`` is a float and np.reshape rejects it.
    # The 11 is hard-coded; presumably the per-example sequence length --
    # TODO confirm against the feature extractor.
    X = np.reshape(X, (num_examples, 11, dimension // 11))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    for label_ind in range(num_labels):
        # indices[i]..indices[i + 1] delimits the columns of Y_adj that
        # belong to label i.
        num_outputs = indices[label_ind + 1] - indices[label_ind]
        model = nn_models.get_cnn_model(X.shape, num_outputs)

        train_y = Y_adj[:, int(indices[label_ind]):int(indices[label_ind + 1])]

        model.fit(X, train_y,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2,
                  callbacks=[stopper])

        model.summary()

        json_string = model.to_json()
        # Context manager guarantees the JSON is flushed and the handle
        # closed before the next model is trained.
        with open(os.path.join(working_dir, 'model_%d.json' % label_ind), 'w') as json_file:
            json_file.write(json_string)
        model.save_weights(os.path.join(working_dir, 'model_%d.h5' % label_ind), overwrite=True)

    # NOTE(review): the source was whitespace-collapsed; the exit is placed
    # after the loop so that every label gets trained -- confirm.
    sys.exit(0)
def main(args):
    """Fit and persist one model per label for the data in ``args[0]``.

    Steps:
      1. Load ``Y, X`` with ``ctk_io.read_multitask_liblinear``.
      2. Reshape ``X`` to ``(num_examples, 11, dimension // 11)``.
      3. Flatten the outputs with ``ctk_io.flatten_outputs`` and, for each
         label column, fit a model from ``nn_models.get_cnn_model`` on the
         matching column slice, holding out 20% for validation and using the
         early stopper from ``nn_models.get_early_stopper``.
      4. Write ``model_<label>.json`` (architecture) and ``model_<label>.h5``
         (weights) into the data directory.

    ``nb_epoch`` and ``batch_size`` come from module scope.

    Args:
        args: Command-line arguments; ``args[0]`` is the data directory
            (e.g. ``data_testing/multitask_assertion/train_and_test``).

    Exits the process with status -1 if no directory is given, otherwise
    with status 0 when training is done.
    """
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    Y, X = ctk_io.read_multitask_liblinear(working_dir)
    stopper = nn_models.get_early_stopper()

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    # Use floor division: np.reshape needs integer dimensions and plain "/"
    # yields a float on Python 3.  The constant 11 is hard-coded in the
    # original; presumably the sequence length -- TODO confirm.
    X = np.reshape(X, (num_examples, 11, dimension // 11))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    for label_ind in range(num_labels):
        # Columns [indices[i], indices[i + 1]) of Y_adj hold label i.
        num_outputs = indices[label_ind + 1] - indices[label_ind]
        model = nn_models.get_cnn_model(X.shape, num_outputs)

        train_y = Y_adj[:, int(indices[label_ind]):int(indices[label_ind + 1])]

        model.fit(X, train_y,
                  nb_epoch=nb_epoch,
                  batch_size=batch_size,
                  verbose=1,
                  validation_split=0.2,
                  callbacks=[stopper])

        model.summary()

        json_path = os.path.join(working_dir, 'model_%d.json' % label_ind)
        weights_path = os.path.join(working_dir, 'model_%d.h5' % label_ind)

        # "with" closes the file deterministically instead of relying on
        # garbage collection of the anonymous handle.
        with open(json_path, 'w') as json_file:
            json_file.write(model.to_json())
        model.save_weights(weights_path, overwrite=True)

    # NOTE(review): indentation was lost in the source; exiting after the
    # loop (not inside it) is assumed so all labels are trained -- confirm.
    sys.exit(0)
def main(args):
    """Cross-validate one MLP per label and print the per-label scores.

    Loads ``Y, X`` from the directory in ``args[0]``, flattens the outputs
    with ``ctk_io.flatten_outputs`` and, for every label column, runs a
    ``num_folds``-fold cross-validation with a fresh model from
    ``nn_models.get_mlp_model`` per fold.

    * Labels with a single output column are scored by precision / recall /
      F (via ``get_f``) computed from true-positive, false-positive and
      false-negative counts pooled over all folds.
    * Labels with several output columns are scored by the mean over folds
      of ``model.evaluate(...)[1]``.

    ``num_folds``, ``nb_epoch`` and ``batch_size`` are read from module
    scope.

    Args:
        args: Command-line arguments; ``args[0]`` is the data directory.
    """
    working_dir = args[0]

    print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(working_dir)

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    print("Data has %d examples and dimension %d" % (num_examples, dimension))
    print("Output has %d dimensions" % (num_labels))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    print("%d labels mapped to %d outputs based on category numbers" % (Y.shape[1], Y_adj.shape[1]))

    # One (metric name, value) pair per label.  The name is stored with the
    # value so the final report does not depend on whatever ``num_outputs``
    # happened to be for the last label.
    label_scores = []

    for label_ind in range(0, Y.shape[1]):
        num_outputs = indices[label_ind + 1] - indices[label_ind]
        # Columns [col_start, col_end) of Y_adj belong to this label.
        col_start = int(indices[label_ind])
        col_end = int(indices[label_ind + 1])

        print("Starting to train for label %d with %d outputs" % (label_ind, num_outputs))

        folds = sk.cross_validation.KFold(num_examples, n_folds=num_folds)

        total_tp = 0
        total_fp = 0
        total_fn = 0
        total_score = 0

        for fold_ind, (train_indices, test_indices) in enumerate(folds):
            print("Starting fold %d" % fold_ind)
            train_x = X[train_indices]
            train_y = Y_adj[train_indices, col_start:col_end]
            test_x = X[test_indices]
            test_y = Y_adj[test_indices, col_start:col_end]

            model = nn_models.get_mlp_model(dimension, num_outputs)
            model.fit(train_x, train_y, nb_epoch=nb_epoch, batch_size=batch_size)

            if num_outputs == 1:
                labels = test_y
                predictions = model.predict_classes(test_x, batch_size=batch_size)

                # True positives: prediction = label = 1, i.e. their sum is 2.
                tp = len(np.where((predictions + labels) == 2)[0])
                total_tp += tp

                # False positives: prediction - label = 1.
                fp = len(np.where((predictions - labels) == 1)[0])
                total_fp += fp

                # False negatives: label - prediction = 1.
                fn = len(np.where((labels - predictions) == 1)[0])
                total_fn += fn

                print("tp=%d, fp=%d, fn=%d" % (tp, fp, fn))

                recall = tp / float(tp + fn) if tp > 0 else 0
                precision = tp / float(tp + fp) if tp > 0 else 1
                f1 = get_f(recall, precision)
                print("P=%f, R=%f, F1=%f" % (precision, recall, f1))
            else:
                score = model.evaluate(test_x, test_y, batch_size=batch_size)
                print("score=%s" % (score))
                total_score += score[1]

        if num_outputs == 1:
            # Same zero-true-positive convention as the per-fold numbers;
            # without the guard a label with no positives divides by zero.
            recall = total_tp / float(total_tp + total_fn) if total_tp > 0 else 0
            precision = total_tp / float(total_tp + total_fp) if total_tp > 0 else 1
            f1 = get_f(recall, precision)
            # Arguments follow the P, R, F order named in the format string.
            print("Overall total: P=%f, R=%f, F=%f" % (precision, recall, f1))
            label_scores.append(("Fscore", f1))
        else:
            total_score /= num_folds
            print("Overall accuracy = %f" % (total_score))
            label_scores.append(("Accuracy", total_score))

    for ind, (metric, val) in enumerate(label_scores):
        print("%s of label %d is %f" % (metric, ind, val))
def get_data():
    """Load the multitask assertion data from its hard-coded directory.

    Returns:
        The result of ``ctk_io.read_multitask_liblinear`` for
        ``data_testing/multitask_assertion/train_and_test``, passed through
        unchanged (``main`` in this file unpacks that reader's result as
        ``Y, X``).
    """
    # An earlier variant (kept in the history as commented-out code) loaded
    # "polarity.liblinear" through load_svmlight_file instead.
    data_dir = "data_testing/multitask_assertion/train_and_test"
    return ctk_io.read_multitask_liblinear(data_dir)
def main(args):
    """Run k-fold cross-validation of an MLP for every label and report scores.

    The data directory ``args[0]`` is read with
    ``ctk_io.read_multitask_liblinear`` into ``Y, X``; ``Y`` is flattened
    with ``ctk_io.flatten_outputs``.  Each label is then evaluated with
    ``num_folds`` folds, training a new ``nn_models.get_mlp_model`` model in
    every fold:

    * single-output labels: tp / fp / fn counts are pooled over the folds
      and turned into precision, recall and F (``get_f``);
    * multi-output labels: ``model.evaluate(...)[1]`` is averaged over the
      folds.

    ``num_folds``, ``nb_epoch`` and ``batch_size`` are module-level names.

    Args:
        args: Command-line arguments; ``args[0]`` is the data directory.
    """
    working_dir = args[0]

    print("Reading data...")
    Y, X = ctk_io.read_multitask_liblinear(working_dir)

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    print("Data has %d examples and dimension %d" % (num_examples, dimension))
    print("Output has %d dimensions" % (num_labels))

    Y_adj, indices = ctk_io.flatten_outputs(Y)

    print("%d labels mapped to %d outputs based on category numbers" % (Y.shape[1], Y_adj.shape[1]))

    # Parallel lists: which metric was computed for each label, and its
    # value.  Recording the metric here avoids labelling the report from the
    # last label's ``num_outputs``.
    score_names = []
    score_values = []

    for label_ind in range(0, Y.shape[1]):
        num_outputs = indices[label_ind + 1] - indices[label_ind]
        # This label's columns inside the flattened output matrix.
        label_cols = slice(int(indices[label_ind]), int(indices[label_ind + 1]))

        print("Starting to train for label %d with %d outputs" % (label_ind, num_outputs))

        folds = sk.cross_validation.KFold(num_examples, n_folds=num_folds)

        total_tp = 0
        total_fp = 0
        total_fn = 0
        total_score = 0

        for fold_ind, (train_indices, test_indices) in enumerate(folds):
            print("Starting fold %d" % fold_ind)
            train_x = X[train_indices]
            train_y = Y_adj[train_indices, label_cols]
            test_x = X[test_indices]
            test_y = Y_adj[test_indices, label_cols]

            model = nn_models.get_mlp_model(dimension, num_outputs)
            model.fit(train_x, train_y, nb_epoch=nb_epoch, batch_size=batch_size)

            if num_outputs == 1:
                labels = test_y
                predictions = model.predict_classes(test_x, batch_size=batch_size)

                # prediction + label == 2  ->  both are 1 (true positive)
                tp = len(np.where((predictions + labels) == 2)[0])
                # prediction - label == 1  ->  predicted 1, label 0 (false positive)
                fp = len(np.where((predictions - labels) == 1)[0])
                # label - prediction == 1  ->  predicted 0, label 1 (false negative)
                fn = len(np.where((labels - predictions) == 1)[0])

                total_tp += tp
                total_fp += fp
                total_fn += fn

                print("tp=%d, fp=%d, fn=%d" % (tp, fp, fn))

                recall = tp / float(tp + fn) if tp > 0 else 0
                precision = tp / float(tp + fp) if tp > 0 else 1
                f1 = get_f(recall, precision)
                print("P=%f, R=%f, F1=%f" % (precision, recall, f1))
            else:
                score = model.evaluate(test_x, test_y, batch_size=batch_size)
                print("score=%s" % (score))
                total_score += score[1]

        if num_outputs == 1:
            # Guard the pooled ratios exactly like the per-fold ones so a
            # label without any true positives cannot divide by zero.
            recall = total_tp / float(total_tp + total_fn) if total_tp > 0 else 0
            precision = total_tp / float(total_tp + total_fp) if total_tp > 0 else 1
            f1 = get_f(recall, precision)
            # Precision first, then recall, matching the "P=, R=" labels.
            print("Overall total: P=%f, R=%f, F=%f" % (precision, recall, f1))
            score_names.append("Fscore")
            score_values.append(f1)
        else:
            total_score /= num_folds
            print("Overall accuracy = %f" % (total_score))
            score_names.append("Accuracy")
            score_values.append(total_score)

    for ind, (metric, val) in enumerate(zip(score_names, score_values)):
        print("%s of label %d is %f" % (metric, ind, val))
def get_data():
    """Read the hard-coded multitask assertion train-and-test directory.

    Returns:
        Whatever ``ctk_io.read_multitask_liblinear`` returns for
        ``data_testing/multitask_assertion/train_and_test``; the value is
        handed back as-is (``main`` in this file unpacks that reader's
        result as ``Y, X``).
    """
    # Previously this read "polarity.liblinear" with load_svmlight_file;
    # that variant only survives as commented-out code in the history.
    train_and_test_dir = 'data_testing/multitask_assertion/train_and_test'
    return ctk_io.read_multitask_liblinear(train_and_test_dir)