コード例 #1
0
    def build(cls, instances):
        """Build a reader holding the CoreNLP data of each file in instances.

        For every distinct ``pred_pointer.fileid`` found in ``instances``,
        read the index mapping from ``<corenlp_root>/idx/``, parse the
        bz2-compressed document from ``<corenlp_root>/parsed/``, and derive a
        ``Script`` and a ``RichScript`` from that document.

        ``corenlp_root`` and ``prep_vocab_list_file`` are not parameters of
        this method; they are resolved from the enclosing scope.

        :param instances: iterable of objects with a ``pred_pointer``
            attribute that provides ``fileid`` and ``get_path()``
        :return: ``cls(corenlp_dict)``, where ``corenlp_dict`` maps each
            fileid to ``(idx_mapping, doc, script, rich_script)``
        """
        prep_vocab_list = read_vocab_list(prep_vocab_list_file)

        # single-argument parenthesised form prints the same text whether or
        # not print is a function
        print('\nBuilding CoreNLP Reader from {}'.format(corenlp_root))
        corenlp_dict = {}

        for instance in instances:
            pred_pointer = instance.pred_pointer
            fileid = pred_pointer.fileid
            # several instances may point into the same file; load it once
            if fileid in corenlp_dict:
                continue

            # one list of ints per line of the index file
            idx_path = join(corenlp_root, 'idx', pred_pointer.get_path())
            with open(idx_path, 'r') as fin:
                idx_mapping = [[int(i) for i in line.split()] for line in fin]

            doc_path = join(
                corenlp_root, 'parsed', pred_pointer.get_path('.xml.bz2'))
            # Close the compressed file as soon as the document is parsed
            # instead of leaving the handle to the garbage collector.
            # NOTE(review): assumes read_doc_from_corenlp finishes reading
            # the stream before it returns -- confirm.
            with BZ2File(doc_path, 'r') as fin:
                doc = read_doc_from_corenlp(fin)

            script = Script.from_doc(doc)

            rich_script = RichScript.build(
                script,
                prep_vocab_list=prep_vocab_list,
                use_lemma=True,
                filter_stop_events=False)

            corenlp_dict[fileid] = (idx_mapping, doc, script, rich_script)

        return cls(corenlp_dict)
コード例 #2
0
    def evaluate(self, all_scripts, **kwargs):
        """Evaluate every eligible script in all_scripts, then print stats.

        The keyword arguments are forwarded to ``self.set_config``, and
        ``self.eval_stats`` is reset before any script is processed. A script
        is skipped when it has fewer than 2 events, fewer than 2 entities, or
        fewer than 2 indexed events once its rich script has been indexed
        with ``self.embedding_model``. Every remaining script's indexed
        event list is passed to ``self.evaluate_event_list``.

        :param all_scripts: iterable of ``Script`` instances
        :param kwargs: configuration overrides passed to ``set_config``
        :raises AssertionError: if an element of ``all_scripts`` is not a
            ``Script``
        """
        self.set_config(**kwargs)
        self.log_evaluator_info()
        self.eval_stats.reset()

        # The preposition vocabulary does not depend on the script, so it is
        # read at most once (on first use) instead of once per script.
        prep_vocab_list = None

        for script in tqdm(all_scripts, desc='Processed', ncols=100):
            assert isinstance(script, Script), \
                'every script in all_scripts must be a {} instance'.format(
                    get_class_name(Script))

            # ignore script where there is less than 2 events
            # (i.e., no context events to be compared to)
            if len(script.events) < 2:
                continue
            # ignore script where there is less than 2 entities
            # (i.e., only one candidate to select from)
            if len(script.entities) < 2:
                continue

            self.logger.debug('Processing script {}'.format(script.doc_name))

            # load prep_vocab_list the first time a script needs it
            if prep_vocab_list is None:
                prep_vocab_list = read_vocab_list(
                    consts.PREP_VOCAB_LIST_FILE)

            # build the rich_script from script
            rich_script = RichScript.build(
                script,
                prep_vocab_list=prep_vocab_list,
                use_lemma=self.use_lemma,
                filter_stop_events=self.filter_stop_events)
            # index the rich_script with the embedding model
            rich_script.get_index(self.embedding_model,
                                  include_type=self.include_type,
                                  use_unk=True)

            # get the list of indexed events in the script
            rich_event_list = rich_script.get_indexed_events()
            # ignore rich_script where there is less than 2 indexed events
            # (i.e., no context events to be compared to)
            if len(rich_event_list) < 2:
                continue

            self.evaluate_event_list(rich_event_list)

        self.print_stats()
コード例 #3
0
# Read the preposition vocabulary from args.prep_vocab when it is set,
# otherwise from the default list file under cur_dir_path.
if args.prep_vocab:
    prep_vocab_list = read_vocab_list(args.prep_vocab)
else:
    prep_vocab_list = read_vocab_list(
        join(cur_dir_path, consts.PREP_VOCAB_LIST_FILE))

# Predicate counts are only loaded when subsampling is requested; otherwise
# None is passed through to get_index below.
pred_count_dict = None
if args.subsampling:
    with open(join(cur_dir_path, consts.PRED_VOCAB_COUNT_FILE)) as fin:
        pred_count_dict = read_counter(fin)

# Make sure the output file is closed even if one of the inputs fails.
try:
    for input_f in input_files:
        # The archive is read in full here, so it can be closed before the
        # scripts it contains are processed.
        with BZ2File(input_f, 'r') as fin:
            script_corpus = ScriptCorpus.from_text(fin.read())

        for script in script_corpus.scripts:
            rich_script = RichScript.build(
                script,
                prep_vocab_list=prep_vocab_list,
                use_lemma=args.use_lemma,
                filter_stop_events=False)
            rich_script.get_index(
                model,
                include_type=True,
                use_unk=True,
                pred_count_dict=pred_count_dict)
            pair_tuning_inputs = rich_script.get_pair_tuning_input_list(
                neg_sample_type=args.neg_sample_type)
            # write one input per line; scripts yielding no input add nothing
            if pair_tuning_inputs:
                fout.write('\n'.join(map(str, pair_tuning_inputs)) + '\n')
finally:
    fout.close()