def build(cls, instances):
    """Build a reader from the CoreNLP data of each document in *instances*.

    For every distinct ``pred_pointer.fileid`` found in ``instances``, the
    token index mapping and the parsed CoreNLP document are loaded from
    ``corenlp_root`` and a ``Script`` and a ``RichScript`` are derived from
    the document.

    :param instances: iterable of objects exposing a ``pred_pointer``
        attribute, which in turn provides ``fileid`` and ``get_path()``.
    :return: ``cls(corenlp_dict)``, where ``corenlp_dict`` maps each file id
        to the tuple ``(idx_mapping, doc, script, rich_script)``.
    """
    prep_vocab_list = read_vocab_list(prep_vocab_list_file)

    # A parenthesized single argument prints identically whether or not
    # print is a statement.
    print('\nBuilding CoreNLP Reader from {}'.format(corenlp_root))

    corenlp_dict = {}

    for instance in instances:
        pred_pointer = instance.pred_pointer

        # Each document is loaded only once, however many instances
        # point into it.
        if pred_pointer.fileid in corenlp_dict:
            continue

        # One list of integers per line of the index file.
        idx_path = join(corenlp_root, 'idx', pred_pointer.get_path())
        with open(idx_path, 'r') as fin:
            idx_mapping = [[int(i) for i in line.split()] for line in fin]

        # Close the compressed file as soon as the document has been read;
        # the original left the handle open for every document.
        # NOTE(review): assumes read_doc_from_corenlp consumes the stream
        # fully before returning -- confirm.
        parsed_path = join(
            corenlp_root, 'parsed', pred_pointer.get_path('.xml.bz2'))
        with BZ2File(parsed_path, 'r') as fin:
            doc = read_doc_from_corenlp(fin)

        script = Script.from_doc(doc)

        rich_script = RichScript.build(
            script,
            prep_vocab_list=prep_vocab_list,
            use_lemma=True,
            filter_stop_events=False)

        corenlp_dict[pred_pointer.fileid] = \
            (idx_mapping, doc, script, rich_script)

    return cls(corenlp_dict)
def evaluate(self, all_scripts, **kwargs):
    """Evaluate every usable script in *all_scripts* and print the stats.

    The evaluator is first configured from ``kwargs`` and its statistics
    are reset. A script is skipped when it has fewer than 2 events, fewer
    than 2 entities, or fewer than 2 indexed events once it has been
    converted to a rich script and indexed with ``self.embedding_model``.

    :param all_scripts: iterable of ``Script`` instances.
    :param kwargs: forwarded unchanged to ``self.set_config``.
    :raises AssertionError: if an element of ``all_scripts`` is not a
        ``Script`` instance.
    """
    self.set_config(**kwargs)
    self.log_evaluator_info()
    self.eval_stats.reset()

    # The preposition vocabulary is the same for every script, so it is
    # read at most once. It is loaded lazily so that no file access happens
    # when no script survives the filters below.
    prep_vocab_list = None

    for script in tqdm(all_scripts, desc='Processed', ncols=100):
        assert isinstance(script, Script), \
            'every script in all_scripts must be a {} instance'.format(
                get_class_name(Script))

        # ignore script where there is less than 2 events
        # (i.e., no context events to be compared to)
        if len(script.events) < 2:
            continue

        # ignore script where there is less than 2 entities
        # (i.e., only one candidate to select from)
        if len(script.entities) < 2:
            continue

        self.logger.debug('Processing script {}'.format(script.doc_name))

        # load prep_vocab_list on first use only
        if prep_vocab_list is None:
            prep_vocab_list = read_vocab_list(consts.PREP_VOCAB_LIST_FILE)

        # build the rich_script from script
        rich_script = RichScript.build(
            script,
            prep_vocab_list=prep_vocab_list,
            use_lemma=self.use_lemma,
            filter_stop_events=self.filter_stop_events)

        # index the rich_script with the embedding model
        rich_script.get_index(
            self.embedding_model,
            include_type=self.include_type,
            use_unk=True)

        # get the list of indexed events in the script
        rich_event_list = rich_script.get_indexed_events()

        # ignore rich_script where there is less than 2 indexed events
        # (i.e., no context events to be compared to)
        if len(rich_event_list) < 2:
            continue

        self.evaluate_event_list(rich_event_list)

    self.print_stats()
# Use the vocabulary path given on the command line when there is one,
# otherwise the default file under cur_dir_path.
prep_vocab_path = args.prep_vocab or join(
    cur_dir_path, consts.PREP_VOCAB_LIST_FILE)
prep_vocab_list = read_vocab_list(prep_vocab_path)

# The predicate counter is read only when subsampling is requested;
# otherwise None is passed through to get_index.
pred_count_dict = None
if args.subsampling:
    pred_count_path = join(cur_dir_path, consts.PRED_VOCAB_COUNT_FILE)
    with open(pred_count_path) as fin:
        pred_count_dict = read_counter(fin)

for input_f in input_files:
    with BZ2File(input_f, 'r') as fin:
        script_corpus = ScriptCorpus.from_text(fin.read())

        for script in script_corpus.scripts:
            rich_script = RichScript.build(
                script,
                prep_vocab_list=prep_vocab_list,
                use_lemma=args.use_lemma,
                filter_stop_events=False)

            rich_script.get_index(
                model,
                include_type=True,
                use_unk=True,
                pred_count_dict=pred_count_dict)

            pair_tuning_inputs = rich_script.get_pair_tuning_input_list(
                neg_sample_type=args.neg_sample_type)

            # Nothing to write for a script that yields no inputs.
            if len(pair_tuning_inputs) == 0:
                continue

            # One input per line, with a trailing newline after the last.
            lines = '\n'.join(str(item) for item in pair_tuning_inputs)
            fout.write(lines + '\n')

fout.close()