Example #1
    def build_predicates(self):
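        """Build a Predicate object for every loaded instance.

        Assumes the instances, treebank/nombank readers, predicate mapping,
        and CoreNLP reader have all been initialized beforehand.
        """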
        assert len(self.all_instances) > 0
        assert self.treebank_reader is not None
        assert self.nombank_reader is not None
        assert self.predicate_mapping is not None
        assert self.corenlp_reader is not None

        if len(self.all_predicates) > 0:
            log.warning('Overriding existing predicates')
            self.all_predicates = []

        log.info('Building predicates')
        for instance in self.all_instances:
            predicate = Predicate.build(instance)
            predicate.set_pred(self.predicate_mapping[str(
                predicate.pred_pointer)])
            self.all_predicates.append(predicate)

        log.info('Checking explicit arguments with Nombank instances')
        for predicate in self.all_predicates:
            nombank_instance = self.nombank_reader.search_by_pointer(
                predicate.pred_pointer)
            predicate.check_exp_args(nombank_instance,
                                     add_missing_args=False,
                                     remove_conflict_imp_args=False,
                                     verbose=False)

        log.info('Parsing all implicit and explicit arguments')
        for predicate in self.all_predicates:
            predicate.parse_args(
                self.treebank_reader,
                self.corenlp_reader,
                include_non_head_entity=self.include_non_head_entity)
        log.info('Done')
Example #2
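# Standard-library and third-party imports used by this script.
import sys
import timeit
import pickle as pkl

from os.path import exists

from tqdm import tqdm

# NOTE: the project-specific helpers used below (read_imp_arg_dataset,
# TreebankReader, PropbankReader, NombankReader, CoreNLPReader, Predicate,
# CandidateDict, RichPredicate, print_stats) and the module-level constants
# (predicate_dict_path, corenlp_dict_path, all_predicates_path,
# candidate_dict_path, max_candidate_dist, all_rich_predicates_path) are
# assumed to come from the surrounding package; their module paths are not
# shown in this snippet.

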
def main():
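    """Build, parse, and cache predicates for the implicit argument dataset.

    Expects the path to the dataset file as the first command-line argument.
    """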

    all_instances = read_imp_arg_dataset(sys.argv[1])

    # print_imp_arg_dataset(all_instances, 'test_annotations.txt')

    print('\nLoading predicate dict from {}'.format(predicate_dict_path))
    predicate_dict = pkl.load(open(predicate_dict_path, 'rb'))

    # print_imp_arg_dataset_by_pred(all_instances, predicate_dict, 'ia_by_pred')

    treebank_reader = TreebankReader()

    propbank_reader = PropbankReader()
    propbank_reader.build_index()

    nombank_reader = NombankReader()
    nombank_reader.build_index()

    if not exists(corenlp_dict_path):
        print('\nNo existing CoreNLP dict found')
        corenlp_reader = CoreNLPReader.build(all_instances)
        corenlp_reader.save(corenlp_dict_path)
    else:
        corenlp_reader = CoreNLPReader.load(corenlp_dict_path)

    if not exists(all_predicates_path):
        all_predicates = []
        for instance in all_instances:
            predicate = Predicate.build(instance)
            predicate.set_pred(predicate_dict[str(predicate.pred_pointer)])
            all_predicates.append(predicate)

        print('\nChecking explicit arguments with Nombank instances')
        for predicate in all_predicates:
            nombank_instance = nombank_reader.search_by_pointer(
                predicate.pred_pointer)
            predicate.check_exp_args(nombank_instance,
                                     add_missing_args=False,
                                     remove_conflict_imp_args=False,
                                     verbose=False)

        print('\nParsing all implicit and explicit arguments')
        for predicate in tqdm(all_predicates, desc='Processed', ncols=100):
            predicate.parse_args(treebank_reader, corenlp_reader)

        print('\nSaving all parsed predicates to {}'.format(
            all_predicates_path))
        pkl.dump(all_predicates, open(all_predicates_path, 'wb'))

    else:
        print('\nLoading all parsed predicates from {}'.format(
            all_predicates_path))

        start_time = timeit.default_timer()
        all_predicates = pkl.load(open(all_predicates_path, 'rb'))
        elapsed = timeit.default_timer() - start_time
        print('\tDone in {:.3f} seconds'.format(elapsed))

    # check_multi_pobj(all_predicates)

    if not exists(candidate_dict_path):
        print('\nBuilding candidate dict from Propbank and Nombank')
        candidate_dict = CandidateDict(propbank_reader=propbank_reader,
                                       nombank_reader=nombank_reader,
                                       corenlp_reader=corenlp_reader,
                                       max_dist=max_candidate_dist)

        for predicate in tqdm(all_predicates, desc='Processed', ncols=100):
            candidate_dict.add_candidates(predicate.pred_pointer)

        candidate_dict.save(candidate_dict_path)

    else:
        candidate_dict = CandidateDict.load(candidate_dict_path,
                                            propbank_reader=propbank_reader,
                                            nombank_reader=nombank_reader,
                                            corenlp_reader=corenlp_reader,
                                            max_dist=max_candidate_dist)

    # candidate_dict.print_all_candidates('all_candidates.txt')

    print('\nAdding candidates to predicates')
    for predicate in all_predicates:
        for candidate in candidate_dict.get_candidates(predicate.pred_pointer):
            predicate.candidates.append(candidate)

    # print_all_predicate(all_predicates, 'all_predicates.txt', verbose=True,
    #                     include_candidates=True, include_dice_scores=True,
    #                     corenlp_reader=corenlp_reader)

    if not exists(all_rich_predicates_path):
        all_rich_predicates = []

        for predicate in all_predicates:
            rich_predicate = RichPredicate.build(predicate,
                                                 corenlp_reader,
                                                 use_lemma=True,
                                                 use_entity=True,
                                                 use_corenlp_tokens=True)
            all_rich_predicates.append(rich_predicate)

        print('\nSaving all rich predicates to {}'.format(
            all_rich_predicates_path))
        pkl.dump(all_rich_predicates, open(all_rich_predicates_path, 'wb'))

    print('\nPrinting statistics')
    print_stats(all_predicates)
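
# Standard entry point; the script expects the dataset path as sys.argv[1].
if __name__ == '__main__':
    main()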