def save_necessities(self, file_name):
    dump((self.course_id_lookup_dict, self.class_number_lookup_dict,
          self.course_cluster_probs_dict, self.k, self.vectorizer,
          self.tfidf_mat, self.word_dict, self.course_doc_dict,
          self.__get_course_id_list(), self.course_info_dict,
          self.course_association_dictionary, self.pagerank_dict),
         open(file_name, 'wb'))
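# Hypothetical counterpart for restoring the state above (not part of the original
# source, shown only to document the layout of the pickle): the file holds a single
# 12-element tuple, unpacked here in the same order it was dumped. cloudpickle writes
# a standard pickle stream, so plain pickle.load is assumed to be sufficient.
def load_necessities(self, file_name):
    import pickle
    with open(file_name, 'rb') as f:
        (self.course_id_lookup_dict, self.class_number_lookup_dict,
         self.course_cluster_probs_dict, self.k, self.vectorizer,
         self.tfidf_mat, self.word_dict, self.course_doc_dict,
         course_id_list,  # produced by __get_course_id_list() at save time
         self.course_info_dict, self.course_association_dictionary,
         self.pagerank_dict) = pickle.load(f)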
def auto_phase_match(cfg, samples=30, restore=False, meter_base_names=None):
    data = []
    if 'PCKL_FILE' in os.environ:
        data = pickle.load(open(os.environ['PCKL_FILE'], "rb"))
    else:
        backupfile = "%s.conf.%d.backup.json" % (cfg.devurl.hostname, int(time.time()))
        print "Making a backup config for later restore"
        cfg.getregisters(ofile=backupfile)
        for i in range(3):
            data.append(measure_and_rotate(cfg, samples, meter_base_names=meter_base_names))
        try:
            from cloud.serialization.cloudpickle import dump
            dump(data, open("{}/tests/{}T{}.pckl".format(egauge_config.THISDIR,
                                                         cfg.devurl.netloc,
                                                         int(time.time())), "wb"))
        except ImportError as ex:
            print "unable to save pckl file", ex
    team, updates = phase_match(data)
    for tt in team:
        print tt
    channels = data[0][0][0]
    totals = data[0][0][2]
    # from IPython.core.debugger import Pdb; Pdb().set_trace()
    if 'PCKL_FILE' not in os.environ:
        if restore is True:
            print "Restoring to original configuration on request"
            cfg.setregisters(ifile=backupfile, skip_backup=True)
            cfg.wait()
            cfg.reboot()
        else:
            if updates > 0:
                body = cfg.get_installation_POST(channels, team, totals)
                uri = "/cgi-bin/protected/egauge-cfg"
                resp, cont = cfg.request(uri, method="POST", body=body)
                cfg.wait()
                cfg.reboot()
            else:
                print "No recommended change to the original config"
    else:
        if updates > 0:
            obj = cfg._to_json(channels, team, totals)
            obj_str = cfg._format_json(obj)
            ofile = "%s.conf.%d.phase_checked.json" % (cfg.devurl.hostname, int(time.time()))
            with open(ofile, "wt") as of:
                print >> of, obj_str
            print "saved config to ", ofile
    return (channels, team, totals, data)
def _save_results(self, output_path):
    """
    Arguments:
    - `output_path`: path the pickled results are written to
    """
    if os.path.exists(output_path):
        os.remove(output_path)
    # binary mode is required for pickling
    with open(output_path, "wb") as fpout:
        cloudpickle.dump(self.results, fpout)
functions_dict['phi0'] = lambdify(syms, term)
# first order
functions_dict['ddx'] = lambdify(syms, term.diff(x))
functions_dict['ddy'] = lambdify(syms, term.diff(y))
functions_dict['ddz'] = lambdify(syms, term.diff(z))
# second order
functions_dict['d2dx2'] = lambdify(syms, term.diff(x, 2))
functions_dict['d2dy2'] = lambdify(syms, term.diff(y, 2))
functions_dict['d2dz2'] = lambdify(syms, term.diff(z, 2))
functions_dict['d2dxdy'] = lambdify(syms, term.diff(x, y))
functions_dict['d2dxdz'] = lambdify(syms, term.diff(x, z))
functions_dict['d2dydz'] = lambdify(syms, term.diff(y, z))
# third order
functions_dict['d3dz3'] = lambdify(syms, term.diff(z, 3))
functions_dict['d3dxdz2'] = lambdify(syms, term.diff(x, z, 2))
functions_dict['d3dydz2'] = lambdify(syms, term.diff(y, z, 2))
# fourth order
functions_dict['d4dz4'] = lambdify(syms, term.diff(z, 4))
functions_dict['d4dx2dz2'] = lambdify(syms, term.diff(x, 2, z, 2))
functions_dict['d4dy2dz2'] = lambdify(syms, term.diff(y, 2, z, 2))

dump(functions_dict, open('ad_0.pickle', 'wb'))
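# A minimal usage sketch (assumed, not from the original source): reload the pickled
# dict of lambdified derivatives and evaluate one of them numerically. `syms` is
# assumed to be the tuple (x, y, z), so every entry takes three scalar arguments;
# cloudpickle output is a standard pickle stream, so pickle.load can read it back.
import pickle

with open('ad_0.pickle', 'rb') as f:
    fns = pickle.load(f)

val = fns['d2dxdy'](1.0, 2.0, 3.0)  # d^2(term)/(dx dy) evaluated at (x, y, z) = (1, 2, 3)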
def main():
    start = time.time()

    ### READ ###########################################################################################################
    print '\n------------'
    print 'Reading data'
    print '------------\n'

    all_train_sentences = Paragraphs("Dataset/Train/").all_sentences()

    ###
    read_end = time.time()
    print 'Reading time:', read_end - start, 's'
    ####################################################################################################################

    ### PREPROCESS #####################################################################################################
    print '\n------------------'
    print 'Preprocessing data'
    print '------------------\n'

    used_fraction = 0.005
    train_fraction = 0.8
    none_fraction = 0.05

    print 'Fraction of data used:', used_fraction
    print 'Fraction of data for training:', train_fraction
    print 'Fraction of None-labelled samples used:', none_fraction

    (used_sentences, _) = all_train_sentences.split_randomly(used_fraction)
    (train_sentences, test_sentences) = used_sentences.split_randomly(train_fraction)

    all_train_tokens = train_sentences.tokens()
    subsampled_tokens = subsample_none(all_train_tokens, none_fraction)
    print 'Number of training tokens:', len(subsampled_tokens)

    class_dict = get_class_dict(subsampled_tokens)
    stem_dict = get_stem_dict(subsampled_tokens)
    word_dict = get_word_dict(subsampled_tokens)
    ngram_order = 2
    char_ngram_dict = get_char_ngram_dict(subsampled_tokens, ngram_order)
    ngram_dict = get_ngram_dict(all_train_tokens, ngram_order)
    trigger_dict = get_trigger_dict(subsampled_tokens)
    arg_word_dict = get_arg_word_dict(subsampled_tokens)

    feature_strings = ['word_template_feature',
                       'word_class_template_feature',
                       'capital_letter_feature',
                       'token_in_trigger_dict_feature',
                       'number_in_token_feature',
                       'token_in_protein_feature',
                       'token_is_after_dash_feature',
                       'pos_class_feature',
                       'character_ngram_feature']

    phi = partial(set_of_features, stem_dict, word_dict, class_dict, trigger_dict,
                  ngram_order, char_ngram_dict, ngram_dict, feature_strings)
    print 'Used features:', feature_strings

    ###
    preprocess_end = time.time()
    print 'Preprocessing time:', preprocess_end - read_end, 's'
    ####################################################################################################################

    ### TRAIN ##########################################################################################################
    print '\n-------------'
    print 'Training data'
    print '-------------\n'

    alpha = 0.2
    max_iterations = 10

    print 'Alpha =', alpha
    print 'Max iterations =', max_iterations

    classifier = LoglinearModel(lambda t: t.event_candidate, phi, class_dict.keys(), alpha, max_iterations)\
        .train(subsampled_tokens)

    ###
    train_end = time.time()
    print 'Training time:', train_end - read_end, 's'
    ####################################################################################################################

    #### TEST ##########################################################################################################
    print '\n-------'
    print 'Testing'
    print '-------\n'

    all_test_tokens = test_sentences.tokens()
    subsampled_test_tokens = all_test_tokens
    print 'Number of test tokens:', len(subsampled_test_tokens)

    predictions = classifier.predict_all(subsampled_test_tokens)

    ###
    predict_end = time.time()
    print 'Predict time:', predict_end - train_end, 's'
    ####################################################################################################################

    ### ERROR ANALYSIS #################################################################################################
    print '\n-----------------'
    print 'Analysing results'
    print '-----------------\n'

    true_labels = []
    for token in all_test_tokens:
        true_labels.append(token.event_candidate)

    test_keys = class_dict.keys()
    for label in test_keys:
        print 'Analyzing label: ', label
        precision_recall_f1(true_labels, predictions, label)

    y_test = map(lambda t: t.event_candidate, all_test_tokens)
    y_pred = predictions

    # Compute our confusion matrix
    cm2 = confusion_matrix(class_dict, y_test, y_pred)
    pprint(cm2)
    print cm2

    none_index = class_dict['None']
    classes = class_dict.keys()
    for i in range(len(class_dict)):
        print '\nCLASS: ', classes[i]
        print 'Recall: ', label_recall(cm2, i)
        print 'Precision: ', label_precision(cm2, i)
        print 'F1: ', label_f1(cm2, i)

    print '\n'
    print 'Precision micro:', precision_micro(cm2, none_index)
    print 'Recall micro:', recall_micro(cm2, none_index)
    print 'F1 micro:', f1_micro(cm2, none_index)
    print '\n'
    print 'Precision macro:', precision_macro(cm2, none_index)
    print 'Recall macro:', recall_macro(cm2, none_index)
    print 'F1 macro:', f1_macro(cm2, none_index)

    ###
    analysis_end = time.time()
    print '\nAnalysis time:', analysis_end - predict_end, 's'
    # ####################################################################################################################

    # cp.dump(classifier, open('classifier_' + time.strftime("%Y%m%d-%H%M%S") + '.p', 'wb'))
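# The micro-averaged scores above exclude the 'None' class via none_index. Below is a
# hedged sketch of how precision_micro/recall_micro could be computed from a confusion
# matrix; it is an assumed reconstruction, not the project's own implementation, and it
# assumes the matrix is indexed [predicted, true], matching how the confusion matrix is
# filled in the structured script further below.
import numpy as np

def precision_micro(cm, none_index):
    cm = np.asarray(cm)
    keep = [i for i in range(cm.shape[0]) if i != none_index]
    tp = sum(cm[i, i] for i in keep)      # correct predictions outside 'None'
    predicted = cm[keep, :].sum()         # everything predicted as a non-None class
    return tp / float(predicted) if predicted else 0.0

def recall_micro(cm, none_index):
    cm = np.asarray(cm)
    keep = [i for i in range(cm.shape[0]) if i != none_index]
    tp = sum(cm[i, i] for i in keep)      # correct predictions outside 'None'
    actual = cm[:, keep].sum()            # every token whose true class is non-None
    return tp / float(actual) if actual else 0.0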
def save_model(self, fname):
    dump(self.model, open(fname, "wb"))
def save(self, filename):
    # binary mode is important for pickling
    with open(filename, 'wb') as f:
        dump(self, f)
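# Hedged counterpart for reading the object back (not in the original source).
# cloudpickle writes a standard pickle stream, so plain pickle.load is sufficient.
@staticmethod
def load(filename):
    import pickle
    with open(filename, 'rb') as f:  # binary mode, matching the save above
        return pickle.load(f)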
def main():
    start = time.time()

    ### READ ###########################################################################################################
    print '\n------------'
    print 'Reading data'
    print '------------\n'

    all_train_sentences = Paragraphs("Dataset/Train/").all_sentences()

    ###
    read_end = time.time()
    print 'Reading time:', read_end - start, 's'
    ####################################################################################################################

    ### PREPROCESS #####################################################################################################
    print '\n------------------'
    print 'Preprocessing data'
    print '------------------\n'

    used_fraction = 1
    train_fraction = 0.8
    none_fraction = 0.10

    print 'Fraction of data used:', used_fraction
    print 'Fraction of data for training:', train_fraction
    print 'Fraction of None-labelled samples used:', none_fraction

    (used_sentences, _) = all_train_sentences.split_randomly(used_fraction)
    (train_sentences, test_sentences) = used_sentences.split_randomly(train_fraction)

    all_train_tokens = train_sentences.tokens()
    subsampled_tokens = subsample_none(all_train_tokens, none_fraction)
    print 'Number of training tokens:', len(subsampled_tokens)

    class_dict = get_class_dict(subsampled_tokens)
    arg_dict = {'None': 0, 'Theme': 1, 'Cause': 2}
    stem_dict = get_stem_dict(subsampled_tokens)
    word_dict = get_word_dict(subsampled_tokens)
    ngram_order = 2
    char_ngram_dict = get_char_ngram_dict(subsampled_tokens, ngram_order)
    ngram_dict = get_ngram_dict(all_train_tokens, ngram_order)
    trigger_dict = get_trigger_dict(subsampled_tokens)
    arg_word_dict = get_arg_word_dict(subsampled_tokens)

    classes = dict(map(lambda c: (c, 0), class_dict.keys()))
    for token in subsampled_tokens:
        classes[token.event_candidate] += 1
    print classes

    feature_strings = [  # 'word_template_feature',
                       'word_class_template_feature',
                       'capital_letter_feature',
                       # 'token_in_trigger_dict_feature',
                       'number_in_token_feature',
                       'token_in_protein_feature',
                       # 'token_is_after_dash_feature',
                       'pos_class_feature']
                       # 'character_ngram_feature']

    phi = partial(set_of_features_structured, stem_dict, word_dict, arg_dict, class_dict,
                  arg_word_dict, ngram_order, char_ngram_dict, ngram_dict, feature_strings)
    print 'Used features:', feature_strings

    ###
    preprocess_end = time.time()
    print 'Preprocessing time:', preprocess_end - read_end, 's'
    ####################################################################################################################

    ### TRAIN ##########################################################################################################
    print '\n-------------'
    print 'Training data'
    print '-------------\n'

    alpha = 0.2
    max_iterations = 15
    arg_none_subsampling = 0.05

    def gold(trigger):
        args = [u'None'] * len(trigger.tokens_in_sentence)
        for (i, arg) in trigger.event_candidate_args:
            args[i] = arg
        return args

    print 'Alpha =', alpha
    print 'Max iterations =', max_iterations

    # classifier = SearchStructuredLoglinearModel(gold, phi, arg_dict.keys(), alpha, max_iterations)\
    #     .train(subsampled_tokens, average=True)
    classifier = StructuredLoglinearModel(gold, phi, arg_dict.keys(), alpha, arg_none_subsampling, max_iterations)\
        .train(subsampled_tokens, average=True)

    ###
    train_end = time.time()
    print 'Training time:', train_end - read_end, 's'
    ####################################################################################################################

    #### TEST ##########################################################################################################
    print '\n-------'
    print 'Testing'
    print '-------\n'

    all_test_tokens = test_sentences.tokens()
    subsampled_test_tokens = subsample_none(all_test_tokens, 0)
    print 'Number of test tokens:', len(subsampled_test_tokens)

    predictions = classifier.predict_all(subsampled_test_tokens)

    predict_end = time.time()
    print 'Predict time:', predict_end - train_end, 's'
    ####################################################################################################################

    ### ERROR ANALYSIS #################################################################################################
    print '\n-----------------'
    print 'Analysing results'
    print '-----------------\n'

    n_args = len(arg_dict)
    confusion = mat(zeros((n_args, n_args)))

    hits = 0
    misses = 0
    for i in range(0, len(predictions)):
        truth = gold(subsampled_test_tokens[i])
        if truth == predictions[i]:
            hits += 1
        else:
            misses += 1
        for j in range(0, len(predictions[i])):
            confusion[arg_dict[predictions[i][j]], arg_dict[truth[j]]] += 1

    np.set_printoptions(suppress=True)
    print confusion

    print 'precision micro:', precision_micro(confusion, 0)
    print 'recall micro:', recall_micro(confusion, 0)
    print 'f1 micro:', f1_micro(confusion, 0)
    print 'precision macro:', precision_macro(confusion, 0)
    print 'recall macro:', recall_macro(confusion, 0)
    print 'f1 macro:', f1_macro(confusion, 0)

    ###
    analysis_end = time.time()
    print '\nAnalysis time:', analysis_end - predict_end, 's'
    # ####################################################################################################################

    # cp.dump(classifier, open('classifier_' + time.strftime("%Y%m%d-%H%M%S") + '.p', 'wb'))
def to_pickle(self, filepath):
    from cloud.serialization.cloudpickle import dump
    # binary mode is required for pickling
    with open(filepath, 'wb') as fh:
        dump(self, fh)