Example #1
def test(model_props=None,
         model_name=None,
         weights_file='best_weights',
         dataset_name='test',
         save_output=True,
         save_scores=False):
    if model_props is None:
        model_props = model_properties.MentionRankingProps(
            name=model_name,
            load_weights_from=model_name,
            weights_file=weights_file)

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    dataset = datasets.DocumentBatchedDataset(dataset_name,
                                              model_props,
                                              with_ids=True)
    docs = utils.load_pickle(directories.DOCUMENTS + dataset_name +
                             '_docs.pkl')
    stats = {}

    print "Building model"
    model, _ = pairwise_models.get_model(dataset, vectors, model_props)

    print "Evaluating model on", dataset_name
    evaluate_model(dataset,
                   docs,
                   model,
                   model_props,
                   stats,
                   save_output=save_output,
                   save_scores=save_scores)
    timer.clear()
    utils.write_pickle(stats, model_props.path + dataset_name + "_results.pkl")
Example #2
def write_action_spaces(dataset_name,
                        action_space_path,
                        model_path,
                        ltr=False):
    import functools  # stdlib; used for the cmp-style sort when ltr=True

    output_file = action_space_path + dataset_name + "_action_space.pkl"
    print "Writing candidate actions to " + output_file
    scores = utils.load_pickle(model_path + dataset_name + "_scores.pkl")
    write_probable_pairs(dataset_name, action_space_path, scores)
    probable_pairs = utils.load_pickle(action_space_path + dataset_name +
                                       '_probable_pairs.pkl')

    possible_pairs_total = 0
    action_spaces = []
    for did in scores:
        if did in probable_pairs:
            actions = defaultdict(list)
            for (m1, m2) in probable_pairs[did]:
                actions[m2].append(m1)
            if ltr:
                # cmp-style comparator: a1 precedes a2 iff the pair of their
                # anaphors appears in scores[did]
                actions = sorted(actions.items(),
                                 key=functools.cmp_to_key(
                                     lambda a1, a2: -1
                                     if (a1[0], a2[0]) in scores[did] else 1))
                for i in range(len(actions) - 1):
                    assert (actions[i][0], actions[i + 1][0]) in scores[did]
            else:
                actions = sorted(actions.items(),
                                 key=lambda item: max(
                                     scores[did][(ant, item[0])] -
                                     scores[did][(-1, item[0])]
                                     for ant in item[1]))
            possible_pairs = get_possible_pairs(probable_pairs[did])
            possible_pairs_total += len(possible_pairs)
            action_spaces.append(ActionSpace(did, actions, possible_pairs))
    utils.write_pickle(action_spaces, output_file)
Example #3
    @staticmethod  # defined without self, so assumed to be a static helper
    def get_asp_valid_path(model_str,
                           player_img,
                           answer_set_filename,
                           save=True):
        # Initialize start and goal fact variables
        start_nodes = []
        goal_nodes = []
        is_start_idx = State.prolog_state_contents_is_start_index()
        goal_reached_idx = State.prolog_state_contents_goal_reached_index()

        # Create new graph for model
        graph = nx.Graph()

        # Add nodes from reachable facts
        reachable_facts = Solver.get_facts_as_list(model_str,
                                                   fact_name='reachable')

        for reachable_fact in reachable_facts:
            reachable_contents = Solver.get_fact_contents_as_list(
                reachable_fact)
            reachable_node = str(reachable_contents)
            graph.add_node(reachable_node)
            if reachable_contents[is_start_idx] == '1':
                start_nodes.append(reachable_node)
            if reachable_contents[goal_reached_idx] == '1':
                goal_nodes.append(reachable_node)

        # Check that reachable start and goal states exist
        if len(start_nodes) == 0:
            error_exit('No reachable start states found in model str')
        if len(goal_nodes) == 0:
            error_exit('No reachable goal states found in model str')

        # Add edges from link facts
        link_facts = Solver.get_facts_as_list(model_str, fact_name='link')
        for link_fact in link_facts:
            link_contents = Solver.get_fact_contents_as_list(link_fact)
            src_node = str(link_contents[:len(link_contents) // 2])
            dest_node = str(link_contents[len(link_contents) // 2:])
            graph.add_edge(src_node, dest_node)

        # Check if valid path exists from start to goal
        for start_node in start_nodes:
            for goal_node in goal_nodes:
                valid_path_exists = nx.has_path(graph,
                                                source=start_node,
                                                target=goal_node)
                if valid_path_exists:
                    valid_path = nx.dijkstra_path(graph,
                                                  source=start_node,
                                                  target=goal_node)
                    if save:
                        valid_path_str = " => \n".join(valid_path)
                        valid_path_file = get_filepath(
                            "level_saved_files_%s/generated_level_paths" %
                            player_img, "%s.pickle" % answer_set_filename)
                        write_pickle(valid_path_file, valid_path_str)
                    return valid_path

        return None
Example #4
def main(save_filename, unique_metatiles_file, player_img):

    print("Creating id maps from unique metatiles file %s..." %
          unique_metatiles_file)
    start_time = datetime.now()

    metatile_count = 0
    id_metatile_map = {}
    metatile_id_map = {}

    unique_metatiles = read_pickle(unique_metatiles_file)

    for metatile in unique_metatiles:
        metatile_count += 1
        metatile_id = "t%d" % metatile_count
        metatile_str = metatile.to_str()
        id_metatile_map[metatile_id] = metatile_str
        metatile_id_map[metatile_str] = metatile_id

    level_saved_files_dir = "level_saved_files_%s/" % player_img
    outfile = "%s.pickle" % save_filename

    id_metatile_map_file = get_filepath(
        level_saved_files_dir + "id_metatile_maps", outfile)
    metatile_id_map_file = get_filepath(
        level_saved_files_dir + "metatile_id_maps", outfile)

    write_pickle(id_metatile_map_file, id_metatile_map)
    write_pickle(metatile_id_map_file, metatile_id_map)

    end_time = datetime.now()
    runtime = str(end_time - start_time)
    print("Runtime: %s" % runtime)

    return id_metatile_map_file, metatile_id_map_file, runtime
Example #5
def main(save_filename, unique_metatiles_file, player_img, print_stats):

    print("Calculating states per metatile stats for the given unique_metatiles_file: %s" % unique_metatiles_file)
    start_time = datetime.now()

    save_directory = "level_saved_files_%s/metatile_num_states_dicts/" % player_img
    save_file = "%s.pickle" % save_filename
    metatile_num_states_dict_file = get_filepath(save_directory, save_file)

    unique_metatiles = read_pickle(unique_metatiles_file)
    metatile_num_states_dict = {}

    for metatile in unique_metatiles:
        metatile_str = metatile.to_str()
        metatile_graph = nx.DiGraph(metatile.graph_as_dict)
        num_states = len(metatile_graph.nodes())
        metatile_num_states_dict[metatile_str] = num_states

    write_pickle(metatile_num_states_dict_file, metatile_num_states_dict)

    end_time = datetime.now()
    runtime = str(end_time-start_time)

    if print_stats:
        print(get_metatile_num_states_stats(metatile_num_states_dict))

    print("Runtime: %s\n" % runtime)

    return runtime
Example #6
def preprocess_names(config):
    word_counts = utils.load_pickle(config.word_counts_raw)
    capitalized_counts = utils.load_pickle(config.capitalized_counts)
    
    remove = ["english", "french", "german", "august", "president", "colonel", "lord", "june",
              "major", "states", "august", "sunday", "christmas", "america", "paris", "france",
              "florence", "roman", "israel", "ireland", "bible", "france", "england"]

    name_stats = []
    names, wordlike_names = set(), set()
    for name_file in [config.first_names_txt, config.last_names_txt]:
        with open(name_file) as f:
            for line in f:
                split = line.split()
                name = split[0].lower()
                freq = float(split[1])

                count_ratio = word_counts[name] / max(1, float(capitalized_counts[name]))
                if name in remove and freq < 0.002:
                    print(name)
                if name not in remove and freq > 0.002:
                    name_stats.append((name, word_counts[name], capitalized_counts[name],
                                       count_ratio))
                    if count_ratio < 0.75:
                        names.add(name)
                        if count_ratio > 0.2:
                            wordlike_names.add(name)

    for w, wc, cc, cr in sorted(name_stats, key=itemgetter(-1), reverse=True):
        if cc > 500:
            print(w, wc, cc, cr)

    utils.write_pickle(names, config.names)
    utils.write_pickle(wordlike_names, config.wordlike_names)
Example #7
    def run_evaluation(self):
        train_scores, train_loss, dev_pairs = evaluate(self, self.dev_docs, self.dev_data,
                                                       "Evaluating on train")
        test_scores, test_loss, test_pairs = evaluate(self, self.test_docs, self.test_data,
                                                      "Evaluating on test")
        epoch_stats = {
            "epoch": self.epoch,
            "n": self.n,
            "train_loss": train_loss,
            "test_loss": test_loss
        }
        epoch_stats.update({"train " + k: v for k, v in train_scores.iteritems()})
        epoch_stats.update({"test " + k: v for k, v in test_scores.iteritems()})
        self.history.append(epoch_stats)
        utils.write_pickle(self.history, self.model_props.path + 'history.pkl')
        timer.print_totals()

        test_conll = epoch_stats["test conll"]
        if self.epoch % self.write_every == 0:
            self.best_conll_window = 0
        if test_conll > self.best_conll:
            self.best_conll = test_conll
            print "New best CoNLL, saving model"
            self.save_progress(dev_pairs, test_pairs, "best")
        if test_conll > self.best_conll_window:
            self.best_conll_window = test_conll
            print "New best CoNLL in window, saving model"
            self.save_progress(dev_pairs, test_pairs,
                               str(self.write_every * int(self.epoch / self.write_every)))
        self.model.save_weights(self.model_props.path + "weights.hdf5", overwrite=True)
Example #8
    def eval(self):

        # self.logger.info('--------------------Evaluation: mAP@50-------------------')

        self.ImgNet.eval().cuda()
        self.TxtNet.eval().cuda()

        re_BI, re_BT, re_LT, qu_BI, qu_BT, qu_LT, indexes = generate_hashes_from_dataloader(
            self.database_loader, self.test_loader, self.ImgNet, self.TxtNet,
            self.cfg.LABEL_DIM)
        qu_BI = self.get_each_5th_element(qu_BI)
        re_BI = self.get_each_5th_element(re_BI)
        qu_LI = self.get_each_5th_element(qu_LT)
        re_LI = self.get_each_5th_element(re_LT)

        indexes = list(indexes)
        indexes[0] = self.get_each_5th_element(indexes[0])
        indexes[2] = self.get_each_5th_element(indexes[2])

        self.visualize_retrieval(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT,
                                 re_LI, re_LT, indexes, 'DJSRH')

        MAP_I2T, MAP_T2I, MAP_I2I, MAP_T2T, MAP_AVG = self.calc_maps_k(
            qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT, re_LI, re_LT,
            self.cfg.MAP_K)

        MAPS = (MAP_I2T, MAP_T2I, MAP_I2I, MAP_T2T)

        maps5 = (MAP_I2T, MAP_T2I, MAP_I2I, MAP_T2T, MAP_AVG)
        maps10 = self.calc_maps_k(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT,
                                  re_LI, re_LT, 10)
        maps20 = self.calc_maps_k(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT,
                                  re_LI, re_LT, 20)
        mapshr = self.calc_maps_rad(qu_BI, qu_BT, re_BI, re_BT, qu_LI, qu_LT,
                                    re_LI, re_LT, [0, 1, 2, 3, 4, 5])

        top_k_hists(qu_BI, qu_BT, re_BI, re_BT, model='DJSRH')
        hr_hists(qu_BI, qu_BT, re_BI, re_BT, model='DJSRH')

        build_binary_hists(qu_BI, qu_BT, re_BI, re_BT, 'DJSRH',
                           [i[0] for i in mapshr])

        maps_eval = (maps5, maps10, maps20, mapshr)

        if (self.best_it + self.best_ti + self.best_ii +
                self.best_tt) < (MAP_I2T + MAP_T2I + MAP_I2I + MAP_T2T):
            self.best_it = MAP_I2T
            self.best_ti = MAP_T2I
            self.best_ii = MAP_I2I
            self.best_tt = MAP_T2T

            if not self.cfg.TEST:
                self.save_checkpoints('best.pth')

        if not self.cfg.TEST:
            self.save_checkpoints('last.pth')
            write_pickle(
                osp.join(self.cfg.MODEL_DIR, self.path, 'maps_eval.pkl'),
                maps_eval)
Example #9
    def export(self):
        path = '../yelp_dataset/fm_res/'
        V_filename = 'FM_V.pickle'
        W_filename = 'FM_W.pickle'
        V = self.V.detach()
        W = [self.W.detach(), self.w0.detach()]
        write_pickle(path + V_filename, V)
        write_pickle(path + W_filename, W)
Example #10
def write_feature_names():
    utils.write_pickle(
        {
            f: i
            for i, f in enumerate(
                next(utils.load_json_lines(directories.RAW +
                                           'train'))["pair_feature_names"])
        }, directories.MISC + 'pair_feature_names.pkl')
Example #11
def write_docs(dataset_name):
    gold, mention_to_gold = load_gold(dataset_name)
    mentions = load_mentions(dataset_name)
    docs = []
    for did in gold:
        docs.append(Document(did, mentions[did],
                             gold[did], mention_to_gold[did]))
    utils.write_pickle(docs, directories.DOCUMENTS + dataset_name + '_docs.pkl')
Example #13
def write_genres():
    sources = set()
    for dataset_name in ["train"]:
        print "Adding sources from", dataset_name
        for d in docs(dataset_name):
            sources.add(d["document_features"]["source"])
    print(sources)
    utils.write_pickle({source: i for i, source in enumerate(sorted(sources))},
                       directories.MISC + 'genres.pkl')
Example #14
    def export(self, filepath, metapath):
        r"""
        export the embeddings to files
        """
        user_factors = self.user_factors.detach().cpu()
        item_factors = self.item_factors.detach().cpu()
        user_file = filepath + metapath + '_user' + '.pickle'
        item_file = filepath + metapath + '_item' + '.pickle'
        write_pickle(user_file, user_factors)
        write_pickle(item_file, item_factors)
Example #15
def prep_data(config):
    vocab = vocabulary.Vocabulary(config)
    for era in ["historic", "modern"]:
        sequences = []
        for sentence in sentences(config, True,
                                  include_historic=(era == "historic"),
                                  include_modern=(era == "modern")):
            print " ".join([vocab[vocab[w]] for w in sentence])
            sequences.append([vocab[w] for w in sentence])
        utils.write_pickle(sequences, config.all_sequences[era])
Example #17
    def process_answer_set(self, model_str):
        player_img, prolog_filename = Solver.parse_prolog_filepath(
            self.prolog_file)
        answer_set_filename = self.get_cur_answer_set_filename(prolog_filename)

        # Create assignments dictionary {(tile_x, tile_y): tile_id}
        assignments_dict = Solver.create_assignments_dict(model_str)

        # Create and save structural txt file for the generated level
        level_structural_txt = ""
        for row in range(self.level_h):
            for col in range(self.level_w):
                tile_xy = (col, row)
                tile_id = assignments_dict.get(tile_xy)
                tile_char = self.get_tile_char(tile_id)
                level_structural_txt += tile_char
            level_structural_txt += "\n"

        if self.save:
            generated_level_txt_dir = "level_structural_layers/generated/"
            level_structural_txt_file = get_filepath(
                generated_level_txt_dir, "%s.txt" % answer_set_filename)
            write_file(level_structural_txt_file, level_structural_txt)

            generated_level_assignments_dir = "level_saved_files_%s/generated_level_assignments_dicts/" % player_img
            level_assignments_file = get_filepath(
                generated_level_assignments_dir,
                "%s.pickle" % answer_set_filename)
            write_pickle(level_assignments_file, assignments_dict)

            generated_level_model_str_dir = "level_saved_files_%s/generated_level_model_strs/" % player_img
            level_model_str_file = get_filepath(generated_level_model_str_dir,
                                                "%s.txt" % answer_set_filename)
            write_pickle(level_model_str_file, model_str)

        if self.print_level:
            print(level_structural_txt)

        if self.validate:
            asp_valid = Solver.asp_is_valid(
                check_path=True,
                check_onground=self.require_all_platforms_reachable,
                check_bonus=self.require_all_bonus_tiles_reachable,
                model_str=model_str,
                player_img=player_img,
                answer_set_filename=answer_set_filename,
                tile_ids=self.tile_ids.copy(),
                save=self.save)
            self.asp_valid_levels_count += 1 if asp_valid else 0

            # state_graph_valid_path = Solver.get_state_graph_valid_path(assignments_dict, player_img, prolog_filename,
            #                                                            answer_set_filename, save=self.save)
            # self.state_graph_valid_levels_count += 1 if state_graph_valid_path is not None else 0

        self.increment_answer_set_count()
Example #18
def main(data_path):
    # if os.path.isfile(data_path):
    #     df_loans = read_pickle("{}preprocessed_loans.pickle".format(data_path))
    # else:
    contract_addresses = get_contract_addresses()
    loans = get_all_loans(contract_addresses)
    write_pickle(loans, "{}loans.pickle".format(data_path))
    # loans = read_pickle("{}loans.pickle".format(data_path))
    df_loans = preprocess(loans)
    write_pickle(df_loans, "{}preprocessed_loans.pickle".format(data_path))
Example #19
def write_words():
    words = Counter()
    for dataset_name in ["train", "dev", "test"]:
        inc = 1 if dataset_name == "train" else 0
        print "Adding words from", dataset_name
        for d in docs(dataset_name):
            for mention in d["mentions"].values():
                for w in mention["sentence"]:
                    words[word_vectors.normalize(w)] += inc
                words[word_vectors.normalize(mention["dep_relation"])] += 1
    utils.write_pickle(words, directories.MISC + 'word_counts.pkl')
Example #21
File: skl.py Project: lansiz/neuron
def build_data():
    training_data = {}
    testing_data = {}
    train_size = 120
    for i in range(10):
        imgs = get_imgs_by_number(i)
        random.shuffle(imgs)
        training_data[i] = [img[1] for img in imgs[:train_size]]
        testing_data[i] = [img[1] for img in imgs[train_size:]]
    utils.write_pickle(training_data, 'train_data.pkl')
    utils.write_pickle(testing_data, 'test_data.pkl')
    print('training and testing data is shuffled and written')
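Reading the split back is the mirror operation; a minimal sketch, assuming the same utils module also exposes read_pickle (as it does in Example #25):

def load_data():
    # Inverse of build_data(): each dict maps digit -> list of images
    training_data = utils.read_pickle('train_data.pkl')
    testing_data = utils.read_pickle('test_data.pkl')
    return training_data, testing_data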
Example #22
def count_words(config, normalize):
    word_counts = Counter()
    capitalized_counts = Counter()
    for sentence in sentences(config, normalize):
        for i in range(len(sentence)):
            w = sentence[i]
            word_counts[w] += 1
            if w[0].isupper() and not first_word(sentence, i):
                capitalized_counts[w.lower()] += 1

    utils.write_pickle(word_counts, config.word_counts if normalize else config.word_counts_raw)
    if not normalize:
        utils.write_pickle(capitalized_counts, config.capitalized_counts)
Example #23
def main(save_filename, metatile_id_map_file, id_metatile_map_file,
         metatile_coords_dict_files, player_img):

    print("Constructing tile_id constraints dictionary...")
    start_time = datetime.now()

    # Create save file path
    metatile_constraints_dir = "level_saved_files_%s/metatile_constraints" % player_img
    metatile_constraints_file = get_filepath(metatile_constraints_dir,
                                             "%s.pickle" % save_filename)

    # Load in files
    metatile_id_map = read_pickle(metatile_id_map_file)
    id_metatile_map = read_pickle(id_metatile_map_file)
    metatile_coords_dicts = [
        read_pickle(file) for file in metatile_coords_dict_files
    ]

    coord_metatiles_dict = get_coord_metatiles_dict(metatile_coords_dicts)
    coord_tile_ids_map = get_coord_tile_ids_map(metatile_id_map,
                                                coord_metatiles_dict)

    tile_id_constraints_dict = {}
    for tile_id, metatile_str in id_metatile_map.items():
        metatile = Metatile.from_str(metatile_str)
        tile_id_constraints_dict[tile_id] = {
            "type": metatile.type,
            "graph": metatile.graph_as_dict,
            "games": metatile.games,
            "levels": metatile.levels,
            "adjacent": {
                TOP: [],
                BOTTOM: [],
                LEFT: [],
                RIGHT: [],
                TOP_LEFT: [],
                BOTTOM_LEFT: [],
                TOP_RIGHT: [],
                BOTTOM_RIGHT: []
            }
        }
    tile_id_constraints_dict = populate_tile_id_constraints_adjacencies(
        tile_id_constraints_dict, coord_tile_ids_map)

    end_time = datetime.now()
    runtime = str(end_time - start_time)

    write_pickle(metatile_constraints_file, tile_id_constraints_dict)
    print("Runtime: %s\n" % runtime)

    return metatile_constraints_file, runtime
Example #24
def main(config, train_size=0.8, min_length=6, max_length=30):
    for era in ["historic", "modern"]:
        seqs = utils.load_pickle(config.all_sequences[era])
        n = len(seqs)
        seqs = [s for s in seqs if min_length <= len(s) <= max_length]
        print(len(seqs), n)

        lengths = [len(s) for s in seqs]
        #sns.distplot(lengths)
        #plt.show()

        random.shuffle(seqs)
        train_size = int(train_size * len(seqs))
        utils.write_pickle(seqs[:train_size], config.train_sequences[era])
        utils.write_pickle(seqs[train_size:], config.dev_sequences[era])
Example #25
def stemWords():
    from operator import add
    import utils
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    D2 = utils.read_pickle('../all-word-series')
    D3 = {}
    for k, v in D2.items():
        sk = stemmer.stem(k)
        if sk not in D3:
            D3[sk] = v
        else:
            # list(...) so repeated merges work (map is lazy in Python 3)
            D3[sk] = list(map(add, D3[sk], v))
        
    utils.write_pickle(D3, '../stemmed-all-unigrams')
Example #26
def save_process_runtimes(process_key, process_runtimes):
    all_levels_process_info_file = utils.get_filepath(
        "", "all_levels_process_info.pickle")
    if os.path.exists(all_levels_process_info_file):
        all_levels_process_info = utils.read_pickle(
            all_levels_process_info_file)
    else:
        all_levels_process_info = {}

    if all_levels_process_info.get(process_key) is None:
        all_levels_process_info[process_key] = {}

    for process_step, runtime_str in process_runtimes:
        all_levels_process_info[process_key][process_step] = runtime_str

    utils.write_pickle(all_levels_process_info_file, all_levels_process_info)
Example #27
def write_probable_pairs(dataset_name, action_space_path, scores):
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in utils.logged_loop(scores):
        doc_scores = scores[did]
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] - (-1 - 0.3 * doc_scores[
                           (-1, pr[1])]),
                       reverse=True)

        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair],
                                          -1 - 0.3 * doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]],
                                          doc_scores[pair])
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [
            p for p in probable_pairs[did]
            if doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD
        ]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print "num docs:", len(scores)
    print "avg size without filter: {:.1f}".format(total_pairs /
                                                   float(len(scores)))
    print "avg size: {:.1f}".format(total_size / float(len(scores)))
    print "margin removals size: {:.1f}".format(margin_removals /
                                                float(len(scores)))
    utils.write_pickle(
        probable_pairs,
        action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
Example #28
def write_feature_names():
    raw_train = directories.RAW + 'train'
    try:
        utils.write_pickle(
            {
                f: i
                for i, f in enumerate(
                    next(utils.load_json_lines(raw_train))
                    ["pair_feature_names"])
            }, directories.MISC + 'pair_feature_names.pkl')
    except FileNotFoundError as e:
        if e.filename == raw_train:
            raise FileNotFoundError(
                'Raw training data not found.  Perhaps you need to copy the original dataset first: %s'
                % e.filename) from e
        else:
            raise
Example #29
def eval_acc(data):
    query_path = './queries.pkl'
    rewards_path = './rewards.txt'

    write_pickle(query_path, data)
    while not os.path.isfile(rewards_path):
        sleep(0.1)

    with open(rewards_path, 'r') as rewards_file:
        acc = rewards_file.read()
    os.remove(rewards_path)
    try:
        acc = float(acc)
    except Exception:
        acc = 0.0

    return acc
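eval_acc above is one half of a file-based handshake: it drops queries.pkl and blocks until some external evaluator answers by creating rewards.txt. A minimal sketch of that counterpart process, assuming a read_pickle helper symmetric to write_pickle and a user-supplied compute_acc function (both hypothetical here):

import os
from time import sleep

def serve_rewards(compute_acc, query_path='./queries.pkl',
                  rewards_path='./rewards.txt'):
    # Poll for a query file, score it, and answer by writing rewards.txt,
    # which eval_acc then reads and deletes.
    while True:
        if os.path.isfile(query_path):
            data = read_pickle(query_path)  # assumed counterpart of write_pickle
            os.remove(query_path)
            with open(rewards_path, 'w') as f:
                f.write(str(compute_acc(data)))
        sleep(0.1)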
Example #31
def write_document_vectors():
    vectors = word_vectors.WordVectors(load=True)
    for dataset_name in ["train", "dev", "test"]:
        print "Building document vectors for", dataset_name
        doc_vectors = {}
        for d in docs(dataset_name):
            sentences = {}
            did = None
            for mention_num in sorted(d["mentions"].keys(), key=int):
                m = d["mentions"][mention_num]
                did = m["doc_id"]
                if m['sent_num'] not in sentences:
                    sentences[m['sent_num']] = m['sentence']

            v = np.zeros(vectors.vectors[0].size)
            n = 0
            for s in sentences.values():
                for w in s:
                    v += vectors.vectors[vectors[w]]
                    n += 1
            doc_vectors[did] = v / n
        utils.write_pickle(doc_vectors, directories.MISC + dataset_name + "_document_vectors.pkl")
Example #32
def evaluate_model(dataset, docs, model, model_props, stats, save_output=False, save_scores=False,
                   print_table=False):
    prog = utils.Progbar(dataset.n_batches)
    mt = RankingMetricsTracker(dataset.name, model_props=model_props) \
        if model_props.ranking else ClassificationMetricsTracker(dataset.name)
    mta = ClassificationMetricsTracker(dataset.name + " anaphoricity", anaphoricity=True)

    docs_by_id = {doc.did: doc for doc in docs} if model_props.ranking else {}
    saved_links, saved_scores = (defaultdict(list) if save_output else None,
                                 defaultdict(dict) if save_scores else None)
    for i, X in enumerate(dataset):
        if X['y'].size == 0:
            continue
        progress = []
        scores = model.predict_on_batch(X)
        if model_props.ranking:
            update_doc(docs_by_id[X['did']], X, scores,
                       saved_links=saved_links, saved_scores=saved_scores)
        if model_props.anaphoricity and not model_props.ranking:
            progress.append(("anaphoricity loss", mta.update(X, scores[0][:, 0])))
        if not model_props.anaphoricity_only:
            progress.append(("loss", mt.update(
                X, scores if model_props.ranking else
                scores[1 if model_props.anaphoricity else 0][:, 0])))
        prog.update(i + 1, exact=progress)

    if save_scores:
        print "Writing scores"
        utils.write_pickle(saved_scores, model_props.path + dataset.name + '_scores.pkl')
    if save_output:
        print "Writing output"
        utils.write_pickle(saved_links, model_props.path + dataset.name + '_links.pkl')
        utils.write_pickle(docs, model_props.path + dataset.name + '_processed_docs.pkl')

    timer.start("metrics")
    if model_props.ranking:
        stats.update(compute_metrics(docs, dataset.name))
    stats["validate time"] = time.time() - prog.start
    if model_props.anaphoricity and not model_props.ranking:
        mta.finish(stats)
    if not model_props.anaphoricity_only:
        mt.finish(stats)

    timer.stop("metrics")

    if print_table:
        print " & ".join(map(lambda x: "{:.2f}".format(x * 100), [
            stats[dataset.name + " muc precision"],
            stats[dataset.name + " muc recall"],
            stats[dataset.name + " muc"],
            stats[dataset.name + " b3 precision"],
            stats[dataset.name + " b3 recall"],
            stats[dataset.name + " b3"],
            stats[dataset.name + " ceafe precision"],
            stats[dataset.name + " ceafe recall"],
            stats[dataset.name + " ceafe"],
            stats[dataset.name + " conll"],
        ]))
Example #34
    def test_model_actively(self):
        """
            Starts the active learning process
        """
        logging.info(f'Starting validation for {self.amine}')
        # Run forward pass on the validation data
        logging.debug(f'Weights for loss function: {self.weights}')

        iters, all_data, all_labels, x_t, y_t, x_v, y_v = \
            self.setup_active_learning()

        for i in range(iters):
            logging.debug(
                f'Doing active learning with {len(x_t)} examples. Iteration: {i}'
            )
            # Update available datapoints in the pool and evaluate current
            # model performance
            x_t, y_t, x_v, y_v = self.active_learning(all_data, all_labels,
                                                      x_t, y_t, x_v, y_v)

        # Save this dictionary in case we need it later
        write_pickle(self.dst_folder / Path('cv_statistics.pkl'),
                     self.cv_statistics)
Example #35
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--attn-data-file",
        required=True,
        help="Pickle file containing extracted attention maps.")
    parser.add_argument("--outfile",
                        required=True,
                        help="Where to write out the distances between heads.")
    args = parser.parse_args()

    print("Loading attention data")
    data = utils.load_pickle(args.attn_data_file)

    print("Computing head distances")
    js_distances = np.zeros([144, 144])
    for doc in utils.logged_loop(data, n_steps=None):
        if "attns" not in doc:
            continue
        tokens, attns = doc["tokens"], np.array(doc["attns"])

        attns_flat = attns.reshape([144, attns.shape[2], attns.shape[3]])
        for head in range(144):
            head_attns = np.expand_dims(attns_flat[head], 0)
            head_attns_smoothed = (0.001 / head_attns.shape[1]) + (head_attns *
                                                                   0.999)
            attns_flat_smoothed = (0.001 / attns_flat.shape[1]) + (attns_flat *
                                                                   0.999)
            m = (head_attns_smoothed + attns_flat_smoothed) / 2
            js = -head_attns_smoothed * np.log(m / head_attns_smoothed)
            js += -attns_flat_smoothed * np.log(m / attns_flat_smoothed)
            js /= 2
            js = js.sum(-1).sum(-1)
            js_distances[head] += js

        utils.write_pickle(js_distances, args.outfile)
Example #36
def main(game, level, metatile_coords_dict_file, metatile_id_map_file,
         player_img):

    start_time = datetime.now()
    print("\nCreating {(tile id, extra_info): coords} map ...")

    metatile_coords_dict = read_pickle(metatile_coords_dict_file)
    metatile_id_map = read_pickle(metatile_id_map_file)
    tile_id_extra_info_coords_map = {}

    for metatile_str, coords in metatile_coords_dict.items():
        metatile = Metatile.from_str(metatile_str)
        metatile_id = metatile_id_map.get(metatile_str)
        if metatile_id is None:
            error_exit("metatile_str not found in metatile_id_map")
        has_graph = bool(metatile.graph_as_dict)
        extra_info = ""
        if not has_graph:
            extra_info += "E"  # metatile graph is empty
        if len(coords) == 1:
            extra_info += "S"  # metatile was only used for one tile

        tile_id_extra_info_coords_map[(metatile_id, extra_info)] = coords

    save_directory = "level_saved_files_%s/tile_id_coords_maps/%s/" % (
        player_img, game)
    tile_id_extra_info_coords_map_file = get_filepath(save_directory,
                                                      "%s.pickle" % level)
    write_pickle(tile_id_extra_info_coords_map_file,
                 tile_id_extra_info_coords_map)

    end_time = datetime.now()
    runtime = str(end_time - start_time)
    print("Runtime: %s\n" % runtime)

    return tile_id_extra_info_coords_map_file, runtime
Example #38
    def write(self, path):
        utils.write_pickle(self.__dict__, path)
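A matching loader for this one-liner would rebuild the object from the pickled attribute dict; a sketch, assuming utils.load_pickle (used throughout these examples) and that no __init__ logic needs to re-run:

    @classmethod
    def load(cls, path):
        # Bypass __init__ and restore the attributes saved by write()
        obj = cls.__new__(cls)
        obj.__dict__.update(utils.load_pickle(path))
        return obj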
Example #39
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'):
    print "Training", model_props.path
    pprint(model_props.__dict__)

    model_props.write(model_props.path + 'model_props.pkl')
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)
    if model_props.ranking or \
            model_props.top_pairs:
        write_start = 0
        write_every = 10
    else:
        write_start = 80
        write_every = 20

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    train = datasets.DocumentBatchedDataset("train_reduced" if reduced else "train",
                                            model_props, with_ids=True)
    dev = datasets.DocumentBatchedDataset(dev_set_name + "_reduced" if reduced else dev_set_name,
                                          model_props, with_ids=True)

    print "Building model"
    model, _ = pairwise_models.get_model(dev, vectors, model_props)
    json_string = model.to_json()
    with open(model_props.path + 'architecture.json', 'w') as f:
        f.write(json_string)

    best_val_score = 1000
    best_val_score_in_window = 1000
    history = []
    for epoch in range(n_epochs):
        timer.start("train")
        print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path)

        epoch_stats = {}
        model_weights = model.get_weights()
        train_docs = utils.load_pickle(directories.DOCUMENTS + 'train_docs.pkl')
        dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name + '_docs.pkl')
        if reduced:
            dev_docs = dev_docs[:3]

        if model_props.ranking:
            print "Running over training set"
            run_model_over_docs(train, train_docs, model)
            epoch_stats.update(compute_metrics(train_docs, "train"))
            if model_props.use_rewards:
                print "Setting costs"
                set_costs(train, train_docs)

        print "Training"
        prog = utils.Progbar(train.n_batches)
        train.shuffle()
        loss_sum, n_examples = 0, 0
        for i, X in enumerate(train):
            if X['y'].size == 0:
                continue
            batch_loss = model.train_on_batch(X)
            loss_sum += batch_loss * train.scale_factor
            n_examples += X['y'].size
            prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)])
        epoch_stats["train time"] = time.time() - prog.start
        for k in prog.unique_values:
            epoch_stats[k] = prog.sum_values[k][0] / max(1, prog.sum_values[k][1])

        epoch_stats["weight diffs"] = [
            (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
            for new_weight, old_weight in zip(model.get_weights(), model_weights)]
        summed = np.sum([np.array(d) for d in epoch_stats["weight diffs"][1:]],
                        axis=0)
        epoch_stats["total weight diff"] = tuple(summed)

        print "Testing on dev set"
        evaluate_model(dev, dev_docs, model, model_props, epoch_stats)

        history.append(epoch_stats)
        utils.write_pickle(history, model_props.path + 'history.pkl')
        score = -epoch_stats["dev conll"] if model_props.ranking else \
            (epoch_stats["dev loss"] if not model_props.anaphoricity_only else
             epoch_stats["dev anaphoricity loss"])
        if score < best_val_score:
            best_val_score = score
            print "New best {:}, saving model".format(
                "CoNLL F1" if model_props.ranking else "validation loss")
            model.save_weights(model_props.path + "best_weights.hdf5", overwrite=True)
        if score < best_val_score_in_window and epoch > write_start:
            print "Best in last {:}, saved to weights_{:}".format(
                write_every, write_every * (epoch / write_every))
            best_val_score_in_window = score
            model.save_weights(model_props.path + "weights_{:}.hdf5".format(
                write_every * (epoch / write_every)), overwrite=True)
            if epoch + write_every >= n_epochs:
                model.save_weights(model_props.path + "final_weights.hdf5", overwrite=True)
        if epoch % write_every == 0:
            best_val_score_in_window = 1000

        timer.stop("train")
        timer.print_totals()
        print()

    timer.clear()