def build_predicates(self):
    """Build one Predicate per loaded instance, then validate and parse args.

    Preconditions (asserted): instances are loaded and the treebank,
    nombank, predicate-mapping, and CoreNLP resources are all set.
    Side effect: repopulates self.all_predicates from scratch.
    """
    # Fail fast if the pipeline was not initialized in the right order.
    assert len(self.all_instances) > 0
    assert self.treebank_reader is not None
    assert self.nombank_reader is not None
    assert self.predicate_mapping is not None
    assert self.corenlp_reader is not None

    # A previous build is discarded, not appended to.
    if len(self.all_predicates) > 0:
        log.warning('Overriding existing predicates')
        self.all_predicates = []

    log.info('Building predicates')
    for instance in self.all_instances:
        pred = Predicate.build(instance)
        key = str(pred.pred_pointer)
        pred.set_pred(self.predicate_mapping[key])
        self.all_predicates.append(pred)

    log.info('Checking explicit arguments with Nombank instances')
    for pred in self.all_predicates:
        nombank_instance = self.nombank_reader.search_by_pointer(
            pred.pred_pointer)
        # Verify-only pass: neither add missing args nor drop
        # conflicting implicit ones.
        pred.check_exp_args(
            nombank_instance,
            add_missing_args=False,
            remove_conflict_imp_args=False,
            verbose=False)

    log.info('Parsing all implicit and explicit arguments')
    for pred in self.all_predicates:
        pred.parse_args(
            self.treebank_reader,
            self.corenlp_reader,
            include_non_head_entity=self.include_non_head_entity)

    log.info('Done')
def main(): all_instances = read_imp_arg_dataset(sys.argv[1]) # print_imp_arg_dataset(all_instances, 'test_annotations.txt') print '\nLoading predicate dict from {}'.format(predicate_dict_path) predicate_dict = pkl.load(open(predicate_dict_path, 'r')) # print_imp_arg_dataset_by_pred(all_instances, predicate_dict, 'ia_by_pred') treebank_reader = TreebankReader() propbank_reader = PropbankReader() propbank_reader.build_index() nombank_reader = NombankReader() nombank_reader.build_index() if not exists(corenlp_dict_path): print '\nNo existing CoreNLP dict found' corenlp_reader = CoreNLPReader.build(all_instances) corenlp_reader.save(corenlp_dict_path) else: corenlp_reader = CoreNLPReader.load(corenlp_dict_path) if not exists(all_predicates_path): all_predicates = [] for instance in all_instances: predicate = Predicate.build(instance) predicate.set_pred(predicate_dict[str(predicate.pred_pointer)]) all_predicates.append(predicate) print '\nChecking explicit arguments with Nombank instances' for predicate in all_predicates: nombank_instance = nombank_reader.search_by_pointer( predicate.pred_pointer) predicate.check_exp_args(nombank_instance, add_missing_args=False, remove_conflict_imp_args=False, verbose=False) print '\nParsing all implicit and explicit arguments' for predicate in tqdm(all_predicates, desc='Processed', ncols=100): predicate.parse_args(treebank_reader, corenlp_reader) print '\nSaving all parsed predicates to {}'.format( all_predicates_path) pkl.dump(all_predicates, open(all_predicates_path, 'w')) else: print '\nLoading all parsed predicates from {}'.format( all_predicates_path) start_time = timeit.default_timer() all_predicates = pkl.load(open(all_predicates_path, 'r')) elapsed = timeit.default_timer() - start_time print '\tDone in {:.3f} seconds'.format(elapsed) # check_multi_pobj(all_predicates) if not exists(candidate_dict_path): print '\nBuilding candidate dict from Propbank and Nombank' candidate_dict = CandidateDict(propbank_reader=propbank_reader, 
nombank_reader=nombank_reader, corenlp_reader=corenlp_reader, max_dist=max_candidate_dist) for predicate in tqdm(all_predicates, desc='Processed', ncols=100): candidate_dict.add_candidates(predicate.pred_pointer) candidate_dict.save(candidate_dict_path) else: candidate_dict = CandidateDict.load(candidate_dict_path, propbank_reader=propbank_reader, nombank_reader=nombank_reader, corenlp_reader=corenlp_reader, max_dist=max_candidate_dist) # candidate_dict.print_all_candidates('all_candidates.txt') print '\nAdding candidates to predicates' for predicate in all_predicates: for candidate in candidate_dict.get_candidates(predicate.pred_pointer): predicate.candidates.append(candidate) # print_all_predicate(all_predicates, 'all_predicates.txt', verbose=True, # include_candidates=True, include_dice_scores=True, # corenlp_reader=corenlp_reader) if not exists(all_rich_predicates_path): all_rich_predicates = [] for predicate in all_predicates: rich_predicate = RichPredicate.build(predicate, corenlp_reader, use_lemma=True, use_entity=True, use_corenlp_tokens=True) all_rich_predicates.append(rich_predicate) print '\nSaving all rich predicates to {}'.format( all_rich_predicates_path) pkl.dump(all_rich_predicates, open(all_rich_predicates_path, 'w')) print '\nPrinting statistics' print_stats(all_predicates)