def test(model_props=None, model_name=None, weights_file='best_weights',
         dataset_name='test', save_output=True, save_scores=False):
    if model_props is None:
        model_props = model_properties.MentionRankingProps(
            name=model_name, load_weights_from=model_name,
            weights_file=weights_file)

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    dataset = datasets.DocumentBatchedDataset(dataset_name, model_props, with_ids=True)
    docs = utils.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl')
    stats = {}

    print "Building model"
    model, _ = pairwise_models.get_model(dataset, vectors, model_props)

    print "Evaluating model on", dataset_name
    evaluate_model(dataset, docs, model, model_props, stats,
                   save_output=save_output, save_scores=save_scores)
    timer.clear()
    utils.write_pickle(stats, model_props.path + dataset_name + "_results.pkl")
def write_action_spaces(dataset_name, action_space_path, model_path, ltr=False):
    output_file = action_space_path + dataset_name + "_action_space.pkl"
    print "Writing candidate actions to " + output_file
    scores = utils.load_pickle(model_path + dataset_name + "_scores.pkl")
    write_probable_pairs(dataset_name, action_space_path, scores)
    probable_pairs = utils.load_pickle(
        action_space_path + dataset_name + '_probable_pairs.pkl')

    possible_pairs_total = 0
    action_spaces = []
    for did in scores:
        if did in probable_pairs:
            # Group candidate antecedents m1 by anaphor m2
            actions = defaultdict(list)
            for (m1, m2) in probable_pairs[did]:
                actions[m2].append(m1)
            if ltr:
                # Order anaphors left-to-right by document position
                actions = sorted(actions.items(),
                                 cmp=lambda (ana1, ants1), (ana2, ants2):
                                 -1 if (ana1, ana2) in scores[did] else 1)
                for i in range(len(actions) - 1):
                    assert (actions[i][0], actions[i + 1][0]) in scores[did]
            else:
                # Order anaphors by the margin of their best antecedent score
                # over the new-entity score
                actions = sorted(actions.items(),
                                 key=lambda (ana, ants): max(
                                     scores[did][(ant, ana)] -
                                     scores[did][(-1, ana)] for ant in ants))
            possible_pairs = get_possible_pairs(probable_pairs[did])
            possible_pairs_total += len(possible_pairs)
            action_spaces.append(ActionSpace(did, actions, possible_pairs))
    utils.write_pickle(action_spaces, output_file)
def get_asp_valid_path(model_str, player_img, answer_set_filename, save=True):
    # Initialize start and goal fact variables
    start_nodes = []
    goal_nodes = []
    is_start_idx = State.prolog_state_contents_is_start_index()
    goal_reached_idx = State.prolog_state_contents_goal_reached_index()

    # Create new graph for model
    graph = nx.Graph()

    # Add nodes from reachable facts
    reachable_facts = Solver.get_facts_as_list(model_str, fact_name='reachable')
    for reachable_fact in reachable_facts:
        reachable_contents = Solver.get_fact_contents_as_list(reachable_fact)
        reachable_node = str(reachable_contents)
        graph.add_node(reachable_node)
        if reachable_contents[is_start_idx] == '1':
            start_nodes.append(reachable_node)
        if reachable_contents[goal_reached_idx] == '1':
            goal_nodes.append(reachable_node)

    # Check that reachable start and goal states exist
    if len(start_nodes) == 0:
        error_exit('No reachable start states found in model str')
    if len(goal_nodes) == 0:
        error_exit('No reachable goal states found in model str')

    # Add edges from link facts
    link_facts = Solver.get_facts_as_list(model_str, fact_name='link')
    for link_fact in link_facts:
        link_contents = Solver.get_fact_contents_as_list(link_fact)
        src_node = str(link_contents[:len(link_contents) // 2])
        dest_node = str(link_contents[len(link_contents) // 2:])
        graph.add_edge(src_node, dest_node)

    # Check if a valid path exists from a start state to a goal state
    for start_node in start_nodes:
        for goal_node in goal_nodes:
            valid_path_exists = nx.has_path(graph, source=start_node, target=goal_node)
            if valid_path_exists:
                valid_path = nx.dijkstra_path(graph, source=start_node, target=goal_node)
                if save:
                    valid_path_str = " => \n".join(valid_path)
                    valid_path_file = get_filepath(
                        "level_saved_files_%s/generated_level_paths" % player_img,
                        "%s.pickle" % answer_set_filename)
                    write_pickle(valid_path_file, valid_path_str)
                return valid_path
    return None
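# A minimal, self-contained sketch of the reachability check used above:
# nodes come from 'reachable' facts, edges from 'link' facts, and the level
# is valid if any start node can reach any goal node. The toy node names
# below are illustrative, not real solver output.
import networkx as nx

toy = nx.Graph()
toy.add_edge("start_state", "mid_state")
toy.add_edge("mid_state", "goal_state")
if nx.has_path(toy, source="start_state", target="goal_state"):
    # On an unweighted graph, dijkstra_path returns a shortest path
    print(nx.dijkstra_path(toy, source="start_state", target="goal_state"))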
def main(save_filename, unique_metatiles_file, player_img):
    print("Creating id maps from unique metatiles file %s..." % unique_metatiles_file)
    start_time = datetime.now()

    metatile_count = 0
    id_metatile_map = {}
    metatile_id_map = {}
    unique_metatiles = read_pickle(unique_metatiles_file)

    for metatile in unique_metatiles:
        metatile_count += 1
        metatile_id = "t%d" % metatile_count
        metatile_str = metatile.to_str()
        id_metatile_map[metatile_id] = metatile_str
        metatile_id_map[metatile_str] = metatile_id

    level_saved_files_dir = "level_saved_files_%s/" % player_img
    outfile = "%s.pickle" % save_filename
    id_metatile_map_file = get_filepath(level_saved_files_dir + "id_metatile_maps", outfile)
    metatile_id_map_file = get_filepath(level_saved_files_dir + "metatile_id_maps", outfile)

    write_pickle(id_metatile_map_file, id_metatile_map)
    write_pickle(metatile_id_map_file, metatile_id_map)

    end_time = datetime.now()
    runtime = str(end_time - start_time)
    print("Runtime: %s" % runtime)
    return id_metatile_map_file, metatile_id_map_file, runtime
def main(save_filename, unique_metatiles_file, player_img, print_stats):
    print("Calculating states per metatile stats for the given unique_metatiles_file: %s"
          % unique_metatiles_file)
    start_time = datetime.now()

    save_directory = "level_saved_files_%s/metatile_num_states_dicts/" % player_img
    save_file = "%s.pickle" % save_filename
    metatile_num_states_dict_file = get_filepath(save_directory, save_file)

    unique_metatiles = read_pickle(unique_metatiles_file)
    metatile_num_states_dict = {}

    for metatile in unique_metatiles:
        metatile_str = metatile.to_str()
        metatile_graph = nx.DiGraph(metatile.graph_as_dict)
        num_states = len(metatile_graph.nodes())
        metatile_num_states_dict[metatile_str] = num_states

    write_pickle(metatile_num_states_dict_file, metatile_num_states_dict)
    end_time = datetime.now()
    runtime = str(end_time - start_time)

    if print_stats:
        print(get_metatile_num_states_stats(metatile_num_states_dict))

    print("Runtime: %s\n" % runtime)
    return runtime
def preprocess_names(config):
    word_counts = utils.load_pickle(config.word_counts_raw)
    capitalized_counts = utils.load_pickle(config.capitalized_counts)
    # Common words that should not be treated as person names
    remove = ["english", "french", "german", "august", "president", "colonel",
              "lord", "june", "major", "states", "sunday", "christmas",
              "america", "paris", "france", "florence", "roman", "israel",
              "ireland", "bible", "england"]

    name_stats = []
    names, wordlike_names = set(), set()
    for name_file in [config.first_names_txt, config.last_names_txt]:
        with open(name_file) as f:
            for line in f:
                split = line.split()
                name = split[0].lower()
                freq = float(split[1])
                count_ratio = word_counts[name] / max(1, float(capitalized_counts[name]))
                if name in remove and freq < 0.002:
                    print name
                if name not in remove and freq > 0.002:
                    name_stats.append((name, word_counts[name],
                                       capitalized_counts[name], count_ratio))
                    if count_ratio < 0.75:
                        names.add(name)
                    if count_ratio > 0.2:
                        wordlike_names.add(name)

    for w, wc, cc, cr in sorted(name_stats, key=itemgetter(-1), reverse=True):
        if cc > 500:
            print w, wc, cc, cr

    utils.write_pickle(names, config.names)
    utils.write_pickle(wordlike_names, config.wordlike_names)
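# Worked example of the capitalization-ratio filter above, with toy counts
# (not taken from the real corpus). count_ratio compares how often a name
# occurs as an ordinary lowercase word vs. capitalized mid-sentence: low
# ratios look name-like, high ratios look word-like. The two sets overlap
# when 0.2 < count_ratio < 0.75, since a token can be both.
word_counts = {"smith": 50, "hope": 900}
capitalized_counts = {"smith": 400, "hope": 300}
for name in ["smith", "hope"]:
    count_ratio = word_counts[name] / max(1, float(capitalized_counts[name]))
    # smith -> 0.125 (goes into names); hope -> 3.0 (goes into wordlike_names)
    print name, count_ratio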
def run_evaluation(self):
    train_scores, train_loss, dev_pairs = evaluate(
        self, self.dev_docs, self.dev_data, "Evaluating on train")
    test_scores, test_loss, test_pairs = evaluate(
        self, self.test_docs, self.test_data, "Evaluating on test")
    epoch_stats = {
        "epoch": self.epoch,
        "n": self.n,
        "train_loss": train_loss,
        "test_loss": test_loss
    }
    epoch_stats.update({"train " + k: v for k, v in train_scores.iteritems()})
    epoch_stats.update({"test " + k: v for k, v in test_scores.iteritems()})
    self.history.append(epoch_stats)
    utils.write_pickle(self.history, self.model_props.path + 'history.pkl')
    timer.print_totals()

    test_conll = epoch_stats["test conll"]
    if self.epoch % self.write_every == 0:
        self.best_conll_window = 0
    if test_conll > self.best_conll:
        self.best_conll = test_conll
        print "New best CoNLL, saving model"
        self.save_progress(dev_pairs, test_pairs, "best")
    if test_conll > self.best_conll_window:
        self.best_conll_window = test_conll
        print "New best CoNLL in window, saving model"
        self.save_progress(dev_pairs, test_pairs,
                           str(self.write_every * int(self.epoch / self.write_every)))
    self.model.save_weights(self.model_props.path + "weights.hdf5", overwrite=True)
def eval(self):
    # self.logger.info('--------------------Evaluation: mAP@50-------------------')
    self.ImgNet.eval().cuda()
    self.TxtNet.eval().cuda()

    re_BI, re_BT, re_LT, qu_BI, qu_BT, qu_LT, indexes = generate_hashes_from_dataloader(
        self.database_loader, self.test_loader, self.ImgNet, self.TxtNet,
        self.cfg.LABEL_DIM)

    qu_BI = self.get_each_5th_element(qu_BI)
    re_BI = self.get_each_5th_element(re_BI)
    # Image-side labels are taken from the text labels (paired samples share labels)
    qu_LI = self.get_each_5th_element(qu_LT)
    re_LI = self.get_each_5th_element(re_LT)

    indexes = list(indexes)
    indexes[0] = self.get_each_5th_element(indexes[0])
    indexes[2] = self.get_each_5th_element(indexes[2])

    self.visualize_retrieval(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT, re_LI,
                             re_LT, indexes, 'DJSRH')

    MAP_I2T, MAP_T2I, MAP_I2I, MAP_T2T, MAP_AVG = self.calc_maps_k(
        qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT, re_LI, re_LT, self.cfg.MAP_K)

    maps5 = (MAP_I2T, MAP_T2I, MAP_I2I, MAP_T2T, MAP_AVG)
    maps10 = self.calc_maps_k(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT, re_LI, re_LT, 10)
    maps20 = self.calc_maps_k(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT, re_LI, re_LT, 20)
    mapshr = self.calc_maps_rad(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT, re_LI,
                                re_LT, [0, 1, 2, 3, 4, 5])

    top_k_hists(qu_BI, qu_BT, re_BI, re_BT, model='DJSRH')
    hr_hists(qu_BI, qu_BT, re_BI, re_BT, model='DJSRH')
    build_binary_hists(qu_BI, qu_BT, re_BI, re_BT, 'DJSRH', [i[0] for i in mapshr])

    maps_eval = (maps5, maps10, maps20, mapshr)

    if (self.best_it + self.best_ti + self.best_ii + self.best_tt) < \
            (MAP_I2T + MAP_T2I + MAP_I2I + MAP_T2T):
        self.best_it = MAP_I2T
        self.best_ti = MAP_T2I
        self.best_ii = MAP_I2I
        self.best_tt = MAP_T2T
        if not self.cfg.TEST:
            self.save_checkpoints('best.pth')

    if not self.cfg.TEST:
        self.save_checkpoints('last.pth')

    write_pickle(osp.join(self.cfg.MODEL_DIR, self.path, 'maps_eval.pkl'), maps_eval)
def export(self):
    path = '../yelp_dataset/fm_res/'
    V_filename = 'FM_V.pickle'
    W_filename = 'FM_W.pickle'
    V = self.V.detach()
    W = [self.W.detach(), self.w0.detach()]
    write_pickle(path + V_filename, V)
    write_pickle(path + W_filename, W)
def write_feature_names():
    utils.write_pickle(
        {f: i for i, f in enumerate(
            next(utils.load_json_lines(directories.RAW + 'train'))["pair_feature_names"])},
        directories.MISC + 'pair_feature_names.pkl')
def write_docs(dataset_name):
    gold, mention_to_gold = load_gold(dataset_name)
    mentions = load_mentions(dataset_name)
    docs = []
    for did in gold:
        docs.append(Document(did, mentions[did], gold[did], mention_to_gold[did]))
    utils.write_pickle(docs, directories.DOCUMENTS + dataset_name + '_docs.pkl')
def write_genres():
    sources = set()
    for dataset_name in ["train"]:
        print "Adding sources from", dataset_name
        for d in docs(dataset_name):
            sources.add(d["document_features"]["source"])
    print sources
    utils.write_pickle({source: i for i, source in enumerate(sorted(sources))},
                       directories.MISC + 'genres.pkl')
def export(self, filepath, metapath):
    r"""Export the embeddings to files."""
    user_factors = self.user_factors.detach().cpu()
    item_factors = self.item_factors.detach().cpu()
    user_file = filepath + metapath + '_user' + '.pickle'
    item_file = filepath + metapath + '_item' + '.pickle'
    write_pickle(user_file, user_factors)
    write_pickle(item_file, item_factors)
def prep_data(config):
    vocab = vocabulary.Vocabulary(config)
    for era in ["historic", "modern"]:
        sequences = []
        for sentence in sentences(config, True,
                                  include_historic=(era == "historic"),
                                  include_modern=(era == "modern")):
            print " ".join([vocab[vocab[w]] for w in sentence])
            sequences.append([vocab[w] for w in sentence])
        utils.write_pickle(sequences, config.all_sequences[era])
def process_answer_set(self, model_str):
    player_img, prolog_filename = Solver.parse_prolog_filepath(self.prolog_file)
    answer_set_filename = self.get_cur_answer_set_filename(prolog_filename)

    # Create assignments dictionary {(tile_x, tile_y): tile_id}
    assignments_dict = Solver.create_assignments_dict(model_str)

    # Create and save structural txt file for the generated level
    level_structural_txt = ""
    for row in range(self.level_h):
        for col in range(self.level_w):
            tile_xy = (col, row)
            tile_id = assignments_dict.get(tile_xy)
            tile_char = self.get_tile_char(tile_id)
            level_structural_txt += tile_char
        level_structural_txt += "\n"

    if self.save:
        generated_level_txt_dir = "level_structural_layers/generated/"
        level_structural_txt_file = get_filepath(generated_level_txt_dir,
                                                 "%s.txt" % answer_set_filename)
        write_file(level_structural_txt_file, level_structural_txt)

        generated_level_assignments_dir = \
            "level_saved_files_%s/generated_level_assignments_dicts/" % player_img
        level_assignments_file = get_filepath(generated_level_assignments_dir,
                                              "%s.pickle" % answer_set_filename)
        write_pickle(level_assignments_file, assignments_dict)

        generated_level_model_str_dir = \
            "level_saved_files_%s/generated_level_model_strs/" % player_img
        level_model_str_file = get_filepath(generated_level_model_str_dir,
                                            "%s.txt" % answer_set_filename)
        write_pickle(level_model_str_file, model_str)

    if self.print_level:
        print(level_structural_txt)

    if self.validate:
        asp_valid = Solver.asp_is_valid(
            check_path=True,
            check_onground=self.require_all_platforms_reachable,
            check_bonus=self.require_all_bonus_tiles_reachable,
            model_str=model_str,
            player_img=player_img,
            answer_set_filename=answer_set_filename,
            tile_ids=self.tile_ids.copy(),
            save=self.save)
        self.asp_valid_levels_count += 1 if asp_valid else 0
        # state_graph_valid_path = Solver.get_state_graph_valid_path(
        #     assignments_dict, player_img, prolog_filename, answer_set_filename,
        #     save=self.save)
        # self.state_graph_valid_levels_count += 1 if state_graph_valid_path is not None else 0

    self.increment_answer_set_count()
def main(data_path):
    # if os.path.isfile(data_path):
    #     df_loans = read_pickle("{}preprocessed_loans.pickle".format(data_path))
    # else:
    contract_addresses = get_contract_addresses()
    loans = get_all_loans(contract_addresses)
    write_pickle(loans, "{}loans.pickle".format(data_path))
    # loans = read_pickle("{}loans.pickle".format(data_path))
    df_loans = preprocess(loans)
    write_pickle(df_loans, "{}preprocessed_loans.pickle".format(data_path))
def write_words():
    words = Counter()
    for dataset_name in ["train", "dev", "test"]:
        # Only words from the training set contribute to the counts
        inc = 1 if dataset_name == "train" else 0
        print "Adding words from", dataset_name
        for d in docs(dataset_name):
            for mention in d["mentions"].values():
                for w in mention["sentence"]:
                    words[word_vectors.normalize(w)] += inc
                words[word_vectors.normalize(mention["dep_relation"])] += 1
    utils.write_pickle(words, directories.MISC + 'word_counts.pkl')
def build_data():
    training_data = {}
    testing_data = {}
    train_size = 120
    for i in range(10):
        imgs = get_imgs_by_number(i)
        random.shuffle(imgs)
        training_data[i] = [img[1] for img in imgs[:train_size]]
        testing_data[i] = [img[1] for img in imgs[train_size:]]
    utils.write_pickle(training_data, 'train_data.pkl')
    utils.write_pickle(testing_data, 'test_data.pkl')
    print('training and testing data are shuffled and written')
def count_words(config, normalize):
    word_counts = Counter()
    capitalized_counts = Counter()
    for sentence in sentences(config, normalize):
        for i in range(len(sentence)):
            w = sentence[i]
            word_counts[w] += 1
            # Only count capitalization when it is not forced by sentence position
            if w[0].isupper() and not first_word(sentence, i):
                capitalized_counts[w.lower()] += 1
    utils.write_pickle(word_counts,
                       config.word_counts if normalize else config.word_counts_raw)
    if not normalize:
        utils.write_pickle(capitalized_counts, config.capitalized_counts)
def main(save_filename, metatile_id_map_file, id_metatile_map_file,
         metatile_coords_dict_files, player_img):
    print("Constructing tile_id constraints dictionary...")
    start_time = datetime.now()

    # Create save file path
    metatile_constraints_dir = "level_saved_files_%s/metatile_constraints" % player_img
    metatile_constraints_file = get_filepath(metatile_constraints_dir,
                                             "%s.pickle" % save_filename)

    # Load in files
    metatile_id_map = read_pickle(metatile_id_map_file)
    id_metatile_map = read_pickle(id_metatile_map_file)
    metatile_coords_dicts = [read_pickle(file) for file in metatile_coords_dict_files]

    coord_metatiles_dict = get_coord_metatiles_dict(metatile_coords_dicts)
    coord_tile_ids_map = get_coord_tile_ids_map(metatile_id_map, coord_metatiles_dict)

    tile_id_constraints_dict = {}
    for tile_id, metatile_str in id_metatile_map.items():
        metatile = Metatile.from_str(metatile_str)
        tile_id_constraints_dict[tile_id] = {
            "type": metatile.type,
            "graph": metatile.graph_as_dict,
            "games": metatile.games,
            "levels": metatile.levels,
            "adjacent": {
                TOP: [], BOTTOM: [], LEFT: [], RIGHT: [],
                TOP_LEFT: [], BOTTOM_LEFT: [], TOP_RIGHT: [], BOTTOM_RIGHT: []
            }
        }

    tile_id_constraints_dict = populate_tile_id_constraints_adjacencies(
        tile_id_constraints_dict, coord_tile_ids_map)

    end_time = datetime.now()
    runtime = str(end_time - start_time)
    write_pickle(metatile_constraints_file, tile_id_constraints_dict)
    print("Runtime: %s\n" % runtime)
    return metatile_constraints_file, runtime
def main(config, train_size=0.8, min_length=6, max_length=30):
    for era in ["historic", "modern"]:
        seqs = utils.load_pickle(config.all_sequences[era])
        n = len(seqs)
        seqs = [s for s in seqs if min_length <= len(s) <= max_length]
        print len(seqs), n
        lengths = [len(s) for s in seqs]
        # sns.distplot(lengths)
        # plt.show()
        random.shuffle(seqs)
        # Use a separate variable here: overwriting train_size would corrupt
        # the split ratio on the next era iteration.
        n_train = int(train_size * len(seqs))
        utils.write_pickle(seqs[:n_train], config.train_sequences[era])
        utils.write_pickle(seqs[n_train:], config.dev_sequences[era])
def stemWords():
    from operator import add
    import utils
    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    D2 = utils.read_pickle('../all-word-series')
    D3 = {}
    for k, v in D2.iteritems():
        sk = stemmer.stem(k)
        if sk not in D3:
            D3[sk] = v
        else:
            # Merge counts of words that share a stem, elementwise
            D3[sk] = map(add, D3[sk], v)
    utils.write_pickle(D3, '../stemmed-all-unigrams')
def save_process_runtimes(process_key, process_runtimes):
    all_levels_process_info_file = utils.get_filepath("", "all_levels_process_info.pickle")
    if os.path.exists(all_levels_process_info_file):
        all_levels_process_info = utils.read_pickle(all_levels_process_info_file)
    else:
        all_levels_process_info = {}
    if all_levels_process_info.get(process_key) is None:
        all_levels_process_info[process_key] = {}
    for process_step, runtime_str in process_runtimes:
        all_levels_process_info[process_key][process_step] = runtime_str
    utils.write_pickle(all_levels_process_info_file, all_levels_process_info)
def write_probable_pairs(dataset_name, action_space_path, scores):
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in utils.logged_loop(scores):
        doc_scores = scores[did]
        # Sort coreferent pairs (pair[0] == -1 marks "new entity") by how much
        # they beat the new-entity baseline -1 - 0.3 * s(NA, anaphor)
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] -
                       (-1 - 0.3 * doc_scores[(-1, pr[1])]),
                       reverse=True)
        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair],
                                          -1 - 0.3 * doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]], doc_scores[pair])
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [p for p in probable_pairs[did]
                               if doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print "num docs:", len(scores)
    print "avg size without filter: {:.1f}".format(total_pairs / float(len(scores)))
    print "avg size: {:.1f}".format(total_size / float(len(scores)))
    print "margin removals size: {:.1f}".format(margin_removals / float(len(scores)))
    utils.write_pickle(probable_pairs,
                       action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
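# Toy illustration of the first filter above. All numbers, including the
# SCORE_THRESHOLD value, are made up for the example. A pair survives when
# its score beats the new-entity baseline -1 - 0.3 * s(NA, anaphor):
SCORE_THRESHOLD = 0.0
s_pair = 0.4                 # s(antecedent, anaphor)
s_na = -2.0                  # s(NA, anaphor): score for starting a new entity
baseline = -1 - 0.3 * s_na   # = -0.4
print s_pair - baseline      # 0.8 >= SCORE_THRESHOLD, so the pair is kept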
def write_feature_names():
    raw_train = directories.RAW + 'train'
    try:
        utils.write_pickle(
            {f: i for i, f in enumerate(
                next(utils.load_json_lines(raw_train))["pair_feature_names"])},
            directories.MISC + 'pair_feature_names.pkl')
    except FileNotFoundError as e:
        if e.filename == raw_train:
            raise FileNotFoundError(
                'Raw training data not found. Perhaps you need to copy the '
                'original dataset first: %s' % e.filename) from e
        else:
            raise
def eval_acc(data):
    query_path = './queries.pkl'
    rewards_path = './rewards.txt'
    write_pickle(query_path, data)
    # Block until the external evaluator writes the rewards file
    while not os.path.isfile(rewards_path):
        sleep(0.1)
    with open(rewards_path, 'r') as rewards_file:
        acc = rewards_file.read()
    os.remove(rewards_path)
    try:
        acc = float(acc)
    except Exception:
        acc = 0.0
    return acc
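# Hypothetical counterpart for the file-based handshake above (not in the
# source): an evaluator process that polls for queries.pkl, scores it, and
# writes rewards.txt for eval_acc() to pick up. The file names match the
# source; compute_accuracy is a stand-in for whatever actually scores the
# queries.
def serve_rewards(compute_accuracy):
    while True:
        if os.path.isfile('./queries.pkl'):
            data = read_pickle('./queries.pkl')
            os.remove('./queries.pkl')
            with open('./rewards.txt', 'w') as f:
                f.write(str(compute_accuracy(data)))
        sleep(0.1)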
def write_document_vectors():
    vectors = word_vectors.WordVectors(load=True)
    for dataset_name in ["train", "dev", "test"]:
        print "Building document vectors for", dataset_name
        doc_vectors = {}
        for d in docs(dataset_name):
            sentences = {}
            did = None
            for mention_num in sorted(d["mentions"].keys(), key=int):
                m = d["mentions"][mention_num]
                did = m["doc_id"]
                if m['sent_num'] not in sentences:
                    sentences[m['sent_num']] = m['sentence']
            # Document vector = mean embedding over all mention-sentence tokens
            v = np.zeros(vectors.vectors[0].size)
            n = 0
            for s in sentences.values():
                for w in s:
                    v += vectors.vectors[vectors[w]]
                    n += 1
            doc_vectors[did] = v / n
        utils.write_pickle(doc_vectors,
                           directories.MISC + dataset_name + "_document_vectors.pkl")
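# Minimal sketch of the averaging scheme above: a document vector is the
# elementwise mean of the embeddings of every token in the document's
# mention sentences. The 3-d embeddings below are toy values, not real
# word vectors.
import numpy as np

embeddings = {"the": np.array([0.1, 0.0, 0.2]),
              "cat": np.array([0.4, 0.2, 0.0])}
tokens = ["the", "cat", "the"]
v = np.zeros(3)
for w in tokens:
    v += embeddings[w]
print v / len(tokens)  # mean over all tokens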
def evaluate_model(dataset, docs, model, model_props, stats,
                   save_output=False, save_scores=False, print_table=False):
    prog = utils.Progbar(dataset.n_batches)
    mt = RankingMetricsTracker(dataset.name, model_props=model_props) \
        if model_props.ranking else ClassificationMetricsTracker(dataset.name)
    mta = ClassificationMetricsTracker(dataset.name + " anaphoricity", anaphoricity=True)

    docs_by_id = {doc.did: doc for doc in docs} if model_props.ranking else {}
    saved_links, saved_scores = (defaultdict(list) if save_output else None,
                                 defaultdict(dict) if save_scores else None)
    for i, X in enumerate(dataset):
        if X['y'].size == 0:
            continue
        progress = []
        scores = model.predict_on_batch(X)
        if model_props.ranking:
            update_doc(docs_by_id[X['did']], X, scores,
                       saved_links=saved_links, saved_scores=saved_scores)
        if model_props.anaphoricity and not model_props.ranking:
            progress.append(("anaphoricity loss", mta.update(X, scores[0][:, 0])))
        if not model_props.anaphoricity_only:
            progress.append(("loss", mt.update(
                X, scores if model_props.ranking else
                scores[1 if model_props.anaphoricity else 0][:, 0])))
        prog.update(i + 1, exact=progress)

    if save_scores:
        print "Writing scores"
        utils.write_pickle(saved_scores, model_props.path + dataset.name + '_scores.pkl')
    if save_output:
        print "Writing output"
        utils.write_pickle(saved_links, model_props.path + dataset.name + '_links.pkl')
        utils.write_pickle(docs, model_props.path + dataset.name + '_processed_docs.pkl')

    timer.start("metrics")
    if model_props.ranking:
        stats.update(compute_metrics(docs, dataset.name))
    stats["validate time"] = time.time() - prog.start
    if model_props.anaphoricity and not model_props.ranking:
        mta.finish(stats)
    if not model_props.anaphoricity_only:
        mt.finish(stats)
    timer.stop("metrics")

    if print_table:
        print " & ".join(map(lambda x: "{:.2f}".format(x * 100), [
            stats[dataset.name + " muc precision"],
            stats[dataset.name + " muc recall"],
            stats[dataset.name + " muc"],
            stats[dataset.name + " b3 precision"],
            stats[dataset.name + " b3 recall"],
            stats[dataset.name + " b3"],
            stats[dataset.name + " ceafe precision"],
            stats[dataset.name + " ceafe recall"],
            stats[dataset.name + " ceafe"],
            stats[dataset.name + " conll"],
        ]))
def test_model_actively(self):
    """Starts the active learning process"""
    logging.info(f'Starting validation for {self.amine}')

    # Run forward pass on the validation data
    logging.debug(f'Weights for loss function: {self.weights}')
    iters, all_data, all_labels, x_t, y_t, x_v, y_v = self.setup_active_learning()

    for i in range(iters):
        logging.debug(f'Doing active learning with {len(x_t)} examples. Iteration: {i}')
        # Update available datapoints in the pool and evaluate current
        # model performance
        x_t, y_t, x_v, y_v = self.active_learning(all_data, all_labels,
                                                  x_t, y_t, x_v, y_v)

    # Save this dictionary in case we need it later
    write_pickle(self.dst_folder / Path('cv_statistics.pkl'), self.cv_statistics)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--attn-data-file", required=True,
                        help="Pickle file containing extracted attention maps.")
    parser.add_argument("--outfile", required=True,
                        help="Where to write out the distances between heads.")
    args = parser.parse_args()

    print("Loading attention data")
    data = utils.load_pickle(args.attn_data_file)

    print("Computing head distances")
    js_distances = np.zeros([144, 144])
    for doc in utils.logged_loop(data, n_steps=None):
        if "attns" not in doc:
            continue
        tokens, attns = doc["tokens"], np.array(doc["attns"])
        attns_flat = attns.reshape([144, attns.shape[2], attns.shape[3]])
        for head in range(144):
            head_attns = np.expand_dims(attns_flat[head], 0)
            # Smooth the attention distributions to avoid log(0)
            head_attns_smoothed = (0.001 / head_attns.shape[1]) + (head_attns * 0.999)
            attns_flat_smoothed = (0.001 / attns_flat.shape[1]) + (attns_flat * 0.999)
            # Jensen-Shannon divergence between this head and every head
            m = (head_attns_smoothed + attns_flat_smoothed) / 2
            js = -head_attns_smoothed * np.log(m / head_attns_smoothed)
            js += -attns_flat_smoothed * np.log(m / attns_flat_smoothed)
            js /= 2
            js = js.sum(-1).sum(-1)
            js_distances[head] += js

    utils.write_pickle(js_distances, args.outfile)
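# Sanity-check sketch of the Jensen-Shannon divergence computed above with
# toy distributions (144 above corresponds to the 12 layers x 12 heads of a
# BERT-base model). JS(p, q) is symmetric and zero iff p == q; the training
# code sums it over token positions.
import numpy as np

def js_divergence(p, q):
    m = (p + q) / 2
    return ((-p * np.log(m / p)).sum() + (-q * np.log(m / q)).sum()) / 2

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.2, 0.7])
print(js_divergence(p, p))  # 0.0
print(js_divergence(p, q))  # positive, and equal to js_divergence(q, p)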
def main(game, level, metatile_coords_dict_file, metatile_id_map_file, player_img):
    start_time = datetime.now()
    print("\nCreating {(tile id, extra_info): coords} map ...")

    metatile_coords_dict = read_pickle(metatile_coords_dict_file)
    metatile_id_map = read_pickle(metatile_id_map_file)

    tile_id_extra_info_coords_map = {}
    for metatile_str, coords in metatile_coords_dict.items():
        metatile = Metatile.from_str(metatile_str)
        metatile_id = metatile_id_map.get(metatile_str)
        if metatile_id is None:
            error_exit("metatile_str not found in metatile_id_map")
        has_graph = bool(metatile.graph_as_dict)
        extra_info = ""
        if not has_graph:
            extra_info += "E"  # metatile graph is empty
        if len(coords) == 1:
            extra_info += "S"  # metatile was only used for one tile
        tile_id_extra_info_coords_map[(metatile_id, extra_info)] = coords

    save_directory = "level_saved_files_%s/tile_id_coords_maps/%s/" % (player_img, game)
    tile_id_extra_info_coords_map_file = get_filepath(save_directory, "%s.pickle" % level)
    write_pickle(tile_id_extra_info_coords_map_file, tile_id_extra_info_coords_map)

    end_time = datetime.now()
    runtime = str(end_time - start_time)
    print("Runtime: %s\n" % runtime)
    return tile_id_extra_info_coords_map_file, runtime
def write(self, path):
    utils.write_pickle(self.__dict__, path)
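# Hypothetical inverse of write() above, not present in the source: since
# write() pickles the instance's __dict__, loading amounts to restoring those
# attributes onto an existing instance. Assumes utils.load_pickle mirrors
# utils.write_pickle, as it does elsewhere in this code.
def read(self, path):
    self.__dict__.update(utils.load_pickle(path))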
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'):
    print "Training", model_props.path
    pprint(model_props.__dict__)

    model_props.write(model_props.path + 'model_props.pkl')
    # Snapshot the source files alongside the model for reproducibility
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)
    if model_props.ranking or model_props.top_pairs:
        write_start = 0
        write_every = 10
    else:
        write_start = 80
        write_every = 20

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    train = datasets.DocumentBatchedDataset("train_reduced" if reduced else "train",
                                            model_props, with_ids=True)
    dev = datasets.DocumentBatchedDataset(dev_set_name + "_reduced" if reduced
                                          else dev_set_name, model_props, with_ids=True)

    print "Building model"
    model, _ = pairwise_models.get_model(dev, vectors, model_props)
    json_string = model.to_json()
    open(model_props.path + 'architecture.json', 'w').write(json_string)

    best_val_score = 1000
    best_val_score_in_window = 1000
    history = []
    for epoch in range(n_epochs):
        timer.start("train")

        print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path)

        epoch_stats = {}
        model_weights = model.get_weights()
        train_docs = utils.load_pickle(directories.DOCUMENTS + 'train_docs.pkl')
        dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name + '_docs.pkl')
        if reduced:
            dev_docs = dev_docs[:3]

        if model_props.ranking:
            print "Running over training set"
            run_model_over_docs(train, train_docs, model)
            epoch_stats.update(compute_metrics(train_docs, "train"))
            if model_props.use_rewards:
                print "Setting costs"
                set_costs(train, train_docs)

        print "Training"
        prog = utils.Progbar(train.n_batches)
        train.shuffle()
        loss_sum, n_examples = 0, 0
        for i, X in enumerate(train):
            if X['y'].size == 0:
                continue
            batch_loss = model.train_on_batch(X)
            loss_sum += batch_loss * train.scale_factor
            n_examples += X['y'].size
            prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)])
        epoch_stats["train time"] = time.time() - prog.start
        for k in prog.unique_values:
            epoch_stats[k] = prog.sum_values[k][0] / max(1, prog.sum_values[k][1])

        epoch_stats["weight diffs"] = [
            (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
            for new_weight, old_weight in zip(model.get_weights(), model_weights)]
        summed = np.sum(map(np.array, epoch_stats["weight diffs"][1:]), axis=0)
        epoch_stats["total weight diff"] = tuple(summed)

        print "Testing on dev set"
        evaluate_model(dev, dev_docs, model, model_props, epoch_stats)

        history.append(epoch_stats)
        utils.write_pickle(history, model_props.path + 'history.pkl')

        score = -epoch_stats["dev conll"] if model_props.ranking else \
            (epoch_stats["dev loss"] if not model_props.anaphoricity_only else
             epoch_stats["dev anaphoricity loss"])
        if score < best_val_score:
            best_val_score = score
            print "New best {:}, saving model".format(
                "CoNLL F1" if model_props.ranking else "validation loss")
            model.save_weights(model_props.path + "best_weights.hdf5", overwrite=True)
        if score < best_val_score_in_window and epoch > write_start:
            print "Best in last {:}, saved to weights_{:}".format(
                write_every, write_every * (epoch / write_every))
            best_val_score_in_window = score
            model.save_weights(model_props.path + "weights_{:}.hdf5".format(
                write_every * (epoch / write_every)), overwrite=True)
        if epoch + write_every >= n_epochs:
            model.save_weights(model_props.path + "final_weights.hdf5", overwrite=True)
        if epoch % write_every == 0:
            best_val_score_in_window = 1000

        timer.stop("train")
        timer.print_totals()
        print
    timer.clear()