def export():
    """ Processes features and exports the dataset to a CSV file """
    dset = dataset.read('conflicts', 10000)
    with open(config.CONFLICTS_RESULTS_PATH.format('export', '0'), 'w') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow([
            'path_similarity', 'qos', 'time', 'negation', 'synonyms',
            'hierarchy_targets', 'hierarchy_traffics', 'endpoints',
            'num_endpoints', 'services', 'num_services', 'groups',
            'num_groups', 'traffics', 'num_traffics', 'protocols',
            'num_protocols', 'middleboxes', 'num_middleboxes', 'conflict'
        ])
        for case in dset['content']:
            (path_similarity, qos, time, negation, synonyms,
             hierarchy_targets, hierarchy_traffics, endpoints, num_endpoints,
             services, num_services, groups, num_groups, traffics,
             num_traffics, protocols, num_protocols, middleboxes,
             num_middleboxes) = get_features(case['sentence'],
                                             case['hypothesis'])
            csv_writer.writerow([
                path_similarity, qos, time, negation, synonyms,
                hierarchy_targets, hierarchy_traffics, endpoints,
                num_endpoints, services, num_services, groups, num_groups,
                traffics, num_traffics, protocols, num_protocols, middleboxes,
                num_middleboxes, case['conflict']
            ])

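# `get_features` is defined elsewhere in this project; the export above assumes
# it returns the 19 feature values in the same order as the CSV header, e.g.
# (a hypothetical check, not part of the original pipeline):
#
#   features = get_features(sentence_nile, hypothesis_nile)
#   assert len(features) == 19  # one value per feature column
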
def learning_curve(dataset_size, model_type):
    """ runs cross validation to plot learning curve """
    print("LEARNING CURVE", dataset_size, model_type)
    dset = dataset.read('conflicts', dataset_size)
    data, targets = [], []
    for case in dset['content']:
        data.append(get_features(case['sentence'], case['hypothesis']))
        targets.append(case['conflict'])

    model = ClassificationModel(model_type)
    train_sizes, train_scores, test_scores = model.learning_curve(data, targets)
    with open(config.LEARNING_CURVE_PATH.format(dataset_size, model_type), 'w') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow(
            ['model', 'dataset_size', 'train_size', 'train_mse', 'test_mse'])
        for (train_size, train_score, test_score) in zip(
                train_sizes, train_scores, test_scores):
            csv_writer.writerow([
                model_type, dataset_size, train_size,
                ','.join(np.char.mod('%f', train_score)),
                ','.join(np.char.mod('%f', test_score))
            ])

    plot = plotter.learning_curve(train_sizes, train_scores, test_scores)
    plot.savefig("../res/plot/learning_{}_{}.pdf".format(
        dataset_size, model_type))

def train(dataset_size, model_type):
    """ opens fit dataset and trains SVM/LogReg/Forest model with it """
    print("MODEL TRAIN", dataset_size, model_type)
    dset = dataset.read('conflicts', dataset_size)
    data, targets = [], []
    for case in dset['content']:
        data.append(case)
        targets.append(case['conflict'])

    feature_vector = []
    with open('../res/training.csv', 'w') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow(['sentence', 'hypothesis', 'features', 'conflict'])
        for idx, case in enumerate(data):
            features = get_features(case['sentence'], case['hypothesis'])
            feature_vector.append(features)
            csv_writer.writerow(
                [case['sentence'], case['hypothesis'], features, targets[idx]])

    model = ClassificationModel(model_type)
    model.train(feature_vector, targets, dataset_size)
    model.save(dataset_size)

def run(dtype):
    """ opens specific dataset, splits 75-25 percent for train-test and runs extraction """
    print("DATASET ", dtype)
    global END_TRAINING
    data = dataset.read('extraction', dtype)['intents']
    intents = []
    for case in data:
        intent = []
        for part in case['parts']:
            if 'entity_type' in part and 'alias' not in part:
                part['alias'] = part['entity_type'][1:]
            intent.append(part)
        intents.append(intent)
    print("DATASET CASES #", len(intents))

    n_samples = int(ceil(len(intents) * 0.75))
    training = sample(intents, n_samples)
    validation = sample(intents, len(intents) - n_samples)

    diag = Dialogflow(SESSION_ID)
    diag.update_intent(INTENT_ID, training, False)
    training_begin = diag.train_agent(training_callback)
    time_elapsed = None
    while True:
        if END_TRAINING:
            time_elapsed = (END_TRAINING - training_begin)
            print("Training time: ", time_elapsed)
            break
        # time.sleep(50)

    print("Testing...")
    results = diag.detect_intent_texts(validation)
    with open(config.EXTRACTION_RESULTS_PATH.format(dtype), 'w') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow([
            "text", "recognized_entities", "training_time", "recall",
            "precision", "f1_score"
        ])
        for result in results:
            rec = recall(result)
            prec = precision(result)
            f1_sc = f1_score(result)
            print(result['text'])
            print('recall: ', rec)
            print('precision: ', prec)
            print('f1_score: ', f1_sc)
            csv_writer.writerow([
                result['text'], result['recognized_entities'], time_elapsed,
                rec, prec, f1_sc
            ])

def test(dataset_size, model_type):
    """ opens fit dataset and trains SVM/LogReg/Forest model with it, then tests it """
    print("MODEL TEST", dataset_size, model_type)
    dset = dataset.read('conflicts', dataset_size)
    data, targets = [], []
    for case in dset['content']:
        data.append(case)
        targets.append(case['conflict'])

    fit_data, test_data = [], []
    fit_cases, test_cases, fit_target, test_target = train_test_split(
        data, targets, test_size=0.25, shuffle=True, random_state=0)
    for idx, fit_case in enumerate(fit_cases):
        fit_data.append(
            get_features(fit_case['sentence'], fit_case['hypothesis']))
        print(fit_data[idx], fit_target[idx])
    for test_case in test_cases:
        test_data.append(
            get_features(test_case['sentence'], test_case['hypothesis']))

    model = ClassificationModel(model_type)
    start_time = time.time()
    model.train(fit_data, fit_target, dataset_size)
    elapsed_time = time.time() - start_time
    test_results = model.test(test_data)
    with open(config.CONFLICTS_RESULTS_PATH.format(dataset_size, model_type), 'w') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow([
            'hypothesis', 'sentence', 'type', 'conflict', 'features',
            'prediction'
        ])
        for (test_case, result, features) in zip(test_cases, test_results,
                                                 test_data):
            csv_writer.writerow([
                test_case['hypothesis'], test_case['sentence'],
                test_case['type'], test_case['conflict'], features, result
            ])

    precision = metrics.precision_score(test_target, test_results)
    recall = metrics.recall_score(test_target, test_results)
    f1_score = metrics.f1_score(test_target, test_results)
    print("FIT TIME", elapsed_time)
    print("PRECISION", precision)
    print("RECALL", recall)
    print("F1 SCORE", f1_score)
    model.save(dataset_size)

def roc_curve(dataset_size):
    """ runs cross validation to plot a ROC curve for each model type """
    print("ROC CURVE", dataset_size)
    dset = dataset.read('conflicts', dataset_size)
    data, targets = [], []
    for case in dset['content']:
        data.append(get_features(case['sentence'], case['hypothesis']))
        targets.append(case['conflict'])

    for mtype in ['svm', 'log', 'forest']:
        model = ClassificationModel(mtype)
        plot = plotter.plot_roc_curve(dataset_size, mtype, model, data, targets)
        plot.savefig("../res/plot/roc_{}_{}.pdf".format(dataset_size, mtype),
                     bbox_inches='tight')

def train(dtype):
    """ opens specific dataset and trains agent with it """
    print("DATASET ", dtype)
    data = dataset.read('extraction', dtype)['intents']
    intents = []
    for case in data:
        intent = []
        for part in case['parts']:
            if 'entity_type' in part and 'alias' not in part:
                part['alias'] = part['entity_type'][1:]
            intent.append(part)
        intents.append(intent)
    print("DATASET CASES #", len(intents))

    diag = Dialogflow(evaluation=True)
    diag.update_intent(INTENT_ID, intents, False)
    diag.train_agent(training_callback)

def analyze_campus_policies():
    """ runs tests with the trained Random Forest model, with each pair of intents in the campi dataset """
    print("MODEL TEST USING CAMPI")
    dset = dataset.read('contradictions', 'campi')
    intents = []
    for case in dset['intents']:
        intents.append((case['university'], case['text'], case['nile']))

    model = ClassificationModel('forest')
    results = []
    if model.load_model(10000):
        for i in range(len(intents)):
            (uni_stn, text_stn, sentence) = intents[i]
            for j in range(i + 1, len(intents)):
                (uni_hyp, text_hyp, hypothesis) = intents[j]
                if sentence != hypothesis:
                    results.append(
                        (uni_stn, uni_hyp, text_stn, text_hyp, sentence,
                         hypothesis,
                         model.predict([get_features(sentence, hypothesis)])))

        with open(config.CONTRADICTIONS_RESULTS_PATH.format('summary', 'campi'),
                  'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow([
                'university stn', 'university hyp', 'text stn', 'text hyp',
                'sentence', 'hypothesis', 'prediction'
            ])
            for (uni_stn, uni_hyp, text_stn, text_hyp, sentence, hypothesis,
                 prediction) in results:
                csv_writer.writerow([
                    uni_stn, uni_hyp, text_stn, text_hyp, sentence,
                    hypothesis, prediction[0]
                ])
    else:
        print("Problem loading model")

def validate(dataset_size, model_type):
    """ runs cross validation on the classification model """
    print("MODEL VALIDATION", dataset_size, model_type)
    dset = dataset.read('contradictions', dataset_size)
    data, targets = [], []
    for case in dset['content']:
        data.append(get_features(case['sentence'], case['hypothesis']))
        targets.append(case['contradiction'])

    model = ClassificationModel(model_type)
    scores = model.cross_validate(data, targets)
    print("scores", scores)
    print("FIT TIME", scores['fit_time'])
    print("VALIDATION TIME", scores['score_time'])
    print("PRECISION", scores['test_precision_macro'])
    print("RECALL", scores['test_recall_macro'])
    print("F1 SCORE", scores['test_f1_macro'])
    return (scores['fit_time'], scores['score_time'],
            scores['test_precision_macro'], scores['test_recall_macro'],
            scores['test_f1_macro'])

def validate(dataset_size, model_type):
    """ runs cross validation on the classification model """
    print("MODEL VALIDATION", dataset_size, model_type)
    dset = dataset.read('conflicts', dataset_size)
    data, targets = [], []
    for case in dset['content']:
        data.append(get_features(case['sentence'], case['hypothesis']))
        targets.append(case['conflict'])
    print("DATASET LOADED")

    model = ClassificationModel(model_type)
    scores = model.cross_validate(data, targets)
    print("scores", scores)
    print("MEAN FIT TIME", np.mean(scores['fit_time']))
    print("MEAN VALIDATION TIME", np.mean(scores['score_time']))
    print("MEAN PRECISION", np.mean(scores['test_precision_macro']))
    print("MEAN RECALL", np.mean(scores['test_recall_macro']))
    print("MEAN F1 SCORE", np.mean(scores['test_f1_macro']))
    return (scores['fit_time'], scores['score_time'],
            scores['test_precision_macro'], scores['test_recall_macro'],
            scores['test_f1_macro'])

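# `ClassificationModel.cross_validate` is assumed to wrap
# sklearn.model_selection.cross_validate with macro-averaged scorers, which
# would produce exactly the 'fit_time', 'score_time' and 'test_*_macro' keys
# read above. A minimal sketch under that assumption (hypothetical, not the
# actual method):
from sklearn.model_selection import cross_validate as sk_cross_validate

def cross_validate_sketch(estimator, data, targets):
    # Returns a dict with 'fit_time', 'score_time', 'test_precision_macro',
    # 'test_recall_macro' and 'test_f1_macro' arrays, one entry per fold.
    return sk_cross_validate(
        estimator, data, targets, cv=10,
        scoring=('precision_macro', 'recall_macro', 'f1_macro'))
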
def analyze_campus_policies(model_size):
    """ runs tests with the trained Random Forest model, with each pair of intents in the campi dataset """
    print("MODEL TEST USING CAMPI ALL")
    campi_by_uni_dset = dataset.read('conflicts', 'campi', 'all')

    results = []
    summary = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0,
               'precision': 0, 'recall': 0, 'f1': 0}
    summary_by_type = {
        ctype: {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0,
                'precision': 0, 'recall': 0, 'f1': 0}
        for ctype in ['qos', 'negation', 'path', 'time', 'synonym', 'hierarchy']
    }

    model = ClassificationModel('forest')
    if model.load(model_size):
        for case in campi_by_uni_dset:
            features_vector = get_features(case['sentence']['nile'],
                                           case['hypothesis']['nile'])
            prediction = model.predict([features_vector])[0]
            if prediction == case['conflict']:
                summary['tp' if prediction == 1 else 'tn'] += 1
                summary_by_type[case['type']]['tp' if prediction == 1 else 'tn'] += 1
            else:
                print(case['sentence']['nile'], case['hypothesis']['nile'])
                summary['fp' if prediction == 1 else 'fn'] += 1
                summary_by_type[case['type']]['fp' if prediction == 1 else 'fn'] += 1
                print(features_vector, prediction, case['conflict'])
            results.append(
                (case['sentence']['university'],
                 case['hypothesis']['university'], case['sentence']['text'],
                 case['hypothesis']['text'], case['sentence']['nile'],
                 case['hypothesis']['nile'], case['type'], case['conflict'],
                 features_vector, prediction))

        with open(config.CONFLICTS_RESULTS_PATH.format('campi', 'all'), 'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow([
                'sentence university', 'hypothesis university',
                'sentence text', 'hypothesis text', 'sentence nile',
                'hypothesis nile', 'type', 'conflict', 'features', 'prediction'
            ])
            for (stn_uni, hyp_uni, stn_text, hyp_text, stn_nile, hyp_nile,
                 conflict_type, conflict, features, prediction) in results:
                csv_writer.writerow([
                    stn_uni, hyp_uni, stn_text, hyp_text, stn_nile, hyp_nile,
                    conflict_type, conflict, features, prediction
                ])

        summary['precision'] = metrics.precision(summary['tp'], summary['fp'])
        summary['recall'] = metrics.recall(summary['tp'], summary['fn'])
        summary['f1'] = metrics.f1_score(summary['precision'], summary['recall'])
        with open(config.CONFLICTS_RESULTS_PATH.format('campi', 'all_summary'), 'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow(
                ['type', 'tp', 'tn', 'fp', 'fn', 'precision', 'recall', 'f1'])
            for conflict_type, result in summary_by_type.items():
                result['precision'] = metrics.precision(result['tp'], result['fp'])
                result['recall'] = metrics.recall(result['tp'], result['fn'])
                result['f1'] = metrics.f1_score(result['precision'],
                                                result['recall'])
                csv_writer.writerow([
                    conflict_type, result['tp'], result['tn'], result['fp'],
                    result['fn'], result['precision'], result['recall'],
                    result['f1']
                ])
            csv_writer.writerow([
                'total', summary['tp'], summary['tn'], summary['fp'],
                summary['fn'], summary['precision'], summary['recall'],
                summary['f1']
            ])
        print(summary)
    else:
        print("Problem loading model")

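# The `metrics.precision(tp, fp)`, `metrics.recall(tp, fn)` and
# `metrics.f1_score(prec, rec)` calls above take raw counts and rates, so they
# appear to be local helpers rather than sklearn's label-vector scorers. A
# minimal sketch of the assumed behavior, with hypothetical names to avoid
# clashing with the real module:
def precision_from_counts(tp, fp):
    # Fraction of predicted positives that were correct.
    return tp / (tp + fp) if tp + fp > 0 else 0.0

def recall_from_counts(tp, fn):
    # Fraction of actual positives that were recovered.
    return tp / (tp + fn) if tp + fn > 0 else 0.0

def f1_from_rates(prec, rec):
    # Harmonic mean of precision and recall.
    return 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
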
def feedback():
    """ opens the combined dataset and splits it 25-75 percent for train-feedback """
    print("FEEDBACK")
    global END_TRAINING
    diag = Dialogflow(evaluation=True)

    all_data = dataset.read('extraction', 'both')['intents']
    all_intents = []
    for case in all_data:
        intent = []
        for part in case['parts']:
            intent.append(part)
        all_intents.append(intent)

    num_repeats = 1
    with open(config.EXTRACTION_RESULTS_PATH.format('feedback', 'single'), 'w') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow([
            "repeat", "feedback_round", "text", "recognized_entities",
            "expected_entities", "tp", "fp", "fn", "recall", "precision",
            "f1_score"
        ])
        for repeat in range(num_repeats):
            n_samples = int(floor(len(all_intents) * 0.25))
            training = sample(all_intents, n_samples)
            feedback = sample(all_intents, len(all_intents) - n_samples)
            print("DATASET CASES TRAIN #", len(training))
            print("DATASET CASES FEEDBACK #", len(feedback))

            diag.update_intent(INTENT_ID, training, False)
            training_begin = diag.train_agent(training_callback)
            time_elapsed = None
            while True:
                if END_TRAINING:
                    time_elapsed = (END_TRAINING - training_begin)
                    print("Training time: ", time_elapsed)
                    break
                time.sleep(60)

            print("Testing...")
            shuffle(feedback)
            for idx, feedback_case in enumerate(feedback):
                print("intent", idx)
                result = diag.detect_intent_texts([feedback_case])[0]
                rec = metrics.recall(result['tp'], result['fn'])
                prec = metrics.precision(result['tp'], result['fp'])
                f1_sc = metrics.f1_score(prec, rec)
                print(result['text'])
                print('recall: ', rec)
                print('precision: ', prec)
                print('f1_score: ', f1_sc)
                csv_writer.writerow([
                    repeat, idx, result['text'],
                    result['recognized_entities'],
                    result['expected_entities'], result['tp'], result['fp'],
                    result['fn'], rec, prec, f1_sc
                ])
                if result['fp'] != 0 or result['fn'] != 0:
                    training.append(feedback_case)
                    print("DATASET CASES TRAIN #", len(training))
                    diag.update_intent(INTENT_ID, training, False)
                    END_TRAINING = None
                    training_begin = diag.train_agent(training_callback)
                    time_elapsed = None
                    while True:
                        if END_TRAINING:
                            time_elapsed = (END_TRAINING - training_begin)
                            print("Training time: ", time_elapsed)
                            break
                        time.sleep(60)
            csv_writer.writerow(["DATASET CASES TRAIN #", len(training)])

def run(dtype):
    """ opens specific dataset, splits 75-25 percent for train-test and runs extraction """
    print("DATASET ", dtype)
    global END_TRAINING
    data = dataset.read('extraction', dtype)['intents']
    intents = []
    for case in data:
        intent = []
        for part in case['parts']:
            intent.append(part)
        intents.append(intent)
    print("DATASET CASES #", len(intents))

    highest_precision = 0
    highest_recall = 0
    highest_f1 = 0
    highest_try = 0
    num_tries = 0
    while num_tries < 30:
        num_tries += 1
        END_TRAINING = None
        n_samples = int(ceil(len(intents) * 0.75))
        training = sample(intents, n_samples)
        validation = sample(intents, len(intents) - n_samples)

        diag = Dialogflow(evaluation=True)
        diag.update_intent(INTENT_ID, training, False)
        training_begin = diag.train_agent(training_callback)
        time_elapsed = None
        while True:
            if END_TRAINING:
                time_elapsed = (END_TRAINING - training_begin)
                print("Training time: ", time_elapsed)
                break
            # time.sleep(50)

        print("Testing...")
        results = diag.detect_intent_texts(validation)
        with open(config.EXTRACTION_RESULTS_PATH.format(dtype, num_tries), 'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow([
                "text", "recognized_entities", "expected_entities",
                "training_time", "recall", "precision", "f1_score"
            ])
            mean_precision = 0
            mean_recall = 0
            num_entries = len(results)
            for result in results:
                rec = metrics.recall(result['tp'], result['fn'])
                prec = metrics.precision(result['tp'], result['fp'])
                f1_sc = metrics.f1_score(prec, rec)
                mean_precision += prec
                mean_recall += rec
                print(result['text'])
                print('recall: ', rec)
                print('precision: ', prec)
                print('f1_score: ', f1_sc)
                csv_writer.writerow([
                    result['text'], result['recognized_entities'],
                    result['expected_entities'], time_elapsed, rec, prec,
                    f1_sc
                ])
            mean_precision /= num_entries
            mean_recall /= num_entries
            mean_f1 = metrics.f1_score(mean_precision, mean_recall)
            csv_writer.writerow(["Mean Precision", mean_precision])
            csv_writer.writerow(["Mean Recall", mean_recall])
            csv_writer.writerow(["Mean F1", mean_f1])
            print("Mean Precision", mean_precision)
            print("Mean Recall", mean_recall)
            print("Mean F1", mean_f1)
            if mean_f1 > highest_f1:
                highest_f1 = mean_f1
                highest_precision = mean_precision
                highest_recall = mean_recall
                highest_try = num_tries

    print("Highest Precision", highest_precision)
    print("Highest Recall", highest_recall)
    print("Highest F1", highest_f1)
    print("Highest Try", highest_try)

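# The END_TRAINING global and training_callback polled by the busy-wait loops
# above are assumed to follow this pattern (a sketch, not the actual
# implementation): Dialogflow trains the agent asynchronously, train_agent
# returns the start timestamp, and the callback records the completion
# timestamp that the loops subtract from it.
END_TRAINING = None

def training_callback(operation):
    """ Hypothetical completion callback: records when agent training ended """
    global END_TRAINING
    END_TRAINING = time.time()
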
def optimize_adagrad(g, g_history, eps=1e-8):
    """ Scales gradients in place with AdaGrad, accumulating squared gradients in `g_history` """
    if g and isinstance(g[0], list):  # 2 dimensional
        for i in range(len(g)):
            for j in range(len(g[i])):
                g_history[i][j] += g[i][j] ** 2
                g[i][j] /= eps + math.sqrt(g_history[i][j])
    else:  # 1 dimensional
        for i in range(len(g)):
            g_history[i] += g[i] ** 2
            g[i] /= eps + math.sqrt(g_history[i])
    return g, g_history


if __name__ == '__main__':
    # Train an Autoencoder model with AdaGrad
    N, D, xs = dataset.read('data/dataset.dat')
    mean = 0
    stddev = math.sqrt(1 / D)
    n_hidden_units = 5
    n_epochs = 20
    # The initial learning rate should be much higher (e.g. 1.0) than in plain
    # SGD without AdaGrad, since the adaptation takes care of the scaling.
    initial_learning_rate = 1
    model = AutoencoderAdaGrad(n_in=D,
                               n_units=n_hidden_units,
                               lr=initial_learning_rate,
                               mean=mean,
                               stddev=stddev)

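# Minimal usage sketch of optimize_adagrad with hypothetical values: repeated
# calls accumulate squared gradients in the history, so the effective step
# shrinks for coordinates that keep receiving large gradients.
#
#   g, hist = optimize_adagrad([0.5, -0.2], [0.0, 0.0])
#   g, hist = optimize_adagrad([0.5, -0.2], hist)  # smaller scaled step now
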
    def update(self, y, gW1, gb1, gW2, gb2):
        """ Updates the parameters with SGD, given the computed gradients.

        Args:
            y (list): Activations of the output layer.
            gW1 (list): Computed gradients of `W1`.
            gb1 (list): Computed gradients of `b1`.
            gW2 (list): Computed gradients of `W2`.
            gb2 (list): Computed gradients of `b2`.
        """
        self.W2 = optimize_sgd(self.W2, gW2, self.lr)
        self.b2 = optimize_sgd(self.b2, gb2, self.lr)
        self.W1 = optimize_sgd(self.W1, gW1, self.lr)
        self.b1 = optimize_sgd(self.b1, gb1, self.lr)


if __name__ == '__main__':
    # Train an Autoencoder model
    N, D, xs = dataset.read('data/dataset.dat')
    # Parameters for initializing the random parameters. The standard deviation
    # is set with respect to the dimensions of the inputs.
    # http://docs.chainer.org/en/stable/reference/links.html#linear
    mean = 0
    stddev = math.sqrt(1 / D)
    n_hidden_units = 5
    n_epochs = 20
    initial_learning_rate = 0.001
    model = Autoencoder(n_in=D,
                        n_units=n_hidden_units,
                        lr=initial_learning_rate,
                        mean=mean,
                        stddev=stddev)

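# `optimize_sgd` is assumed to apply the plain SGD rule
# param <- param - lr * grad element-wise to the list-based parameters used
# above. A minimal sketch under that assumption (hypothetical name, not the
# original helper):
def optimize_sgd_sketch(param, grad, lr):
    if param and isinstance(param[0], list):  # 2 dimensional, e.g. W1, W2
        return [[p - lr * g for p, g in zip(p_row, g_row)]
                for p_row, g_row in zip(param, grad)]
    return [p - lr * g for p, g in zip(param, grad)]  # 1 dimensional, e.g. b1
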
def feedback():
    """ trains on the alpha dataset, then uses the campi dataset as feedback """
    print("FEEDBACK")
    global END_TRAINING
    diag = Dialogflow(SESSION_ID)

    alpha_data = dataset.read('extraction', 'alpha')['intents']
    campi_data = dataset.read('extraction', 'campi')['intents']
    alpha_intents = []
    for case in alpha_data:
        intent = []
        for part in case['parts']:
            if 'entity_type' in part and 'alias' not in part:
                part['alias'] = part['entity_type'][1:]
            intent.append(part)
        alpha_intents.append(intent)
    campi_intents = []
    for case in campi_data:
        intent = []
        for part in case['parts']:
            if 'entity_type' in part and 'alias' not in part:
                part['alias'] = part['entity_type'][1:]
            intent.append(part)
        campi_intents.append(intent)
    print("DATASET CASES ALPHA #", len(alpha_intents))
    print("DATASET CASES CAMPI #", len(campi_intents))

    diag.update_intent(INTENT_ID, alpha_intents, False)
    training_begin = diag.train_agent(training_callback)
    time_elapsed = None
    while True:
        if END_TRAINING:
            time_elapsed = (END_TRAINING - training_begin)
            print("Training time: ", time_elapsed)
            break
        time.sleep(60)

    print("Testing...")
    results = []
    shuffle(campi_intents)
    for idx, feedback_case in enumerate(campi_intents):
        print("intent", idx)
        result = diag.detect_intent_texts([feedback_case])[0]
        rec = recall(result)
        prec = precision(result)
        f1_sc = f1_score(result)
        print(result['text'])
        print('recall: ', rec)
        print('precision: ', prec)
        print('f1_score: ', f1_sc)
        results.append(
            (idx, result['text'], result['recognized_entities'],
             result['tp'], result['tn'], result['fp'], result['fn'], rec,
             prec, f1_sc))
        if result['fp'] != 0 or result['fn'] != 0:
            alpha_intents.append(feedback_case)
            print("DATASET CASES ALPHA #", len(alpha_intents))
            diag.update_intent(INTENT_ID, alpha_intents, False)
            END_TRAINING = None
            training_begin = diag.train_agent(training_callback)
            time_elapsed = None
            while True:
                if END_TRAINING:
                    time_elapsed = (END_TRAINING - training_begin)
                    print("Training time: ", time_elapsed)
                    break

    with open(config.EXTRACTION_RESULTS_PATH.format('feedback'), 'w') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        csv_writer.writerow([
            "feedback_round", "text", "recognized_entities", "training_time",
            "tp", "tn", "fp", "fn", "recall", "precision", "f1_score"
        ])
        for (idx, intent, rec_entities, tp, tn, fp, fn, rec, prec,
             f1_sc) in results:
            csv_writer.writerow([
                idx, intent, rec_entities, time_elapsed, tp, tn, fp, fn, rec,
                prec, f1_sc
            ])