Example #1
    def count_predicates(self):
        verbs_count = 0
        noms_count = 0

        # Create the verb and nominalization sets from the NOMLEX pairs
        nomlex_pairs = self._get_nomlex_pairs(
            get_lexicon_path(config.NOMLEX_PLUS_NAME, "json"))
        verbs = {pair[0] for pair in nomlex_pairs}
        noms = {pair[1] for pair in nomlex_pairs}

        # Count the verb and nominalization appearances in a random dataset
        limited_n_sentences = 10 ** 5  # 10 ** 6
        with open(config.WIKI_SENTENCES_PATH) as sentences_file:
            pbar = tqdm(enumerate(sentences_file), leave=False)
            for i, sentence in pbar:
                doc = get_dependency_tree(sentence, disable=['ner', 'parser'])

                for word in doc:
                    if word.pos_ == UPOS_VERB and word.lemma_ in verbs:
                        verbs_count += 1
                    elif word.pos_ == UPOS_NOUN and word.lemma_ in noms:
                        noms_count += 1

                pbar.set_description(f"VERBS={verbs_count}, NOMS={noms_count}")

                if i > limited_n_sentences:
                    break

        print(f"Found {verbs_count} verbs")
        print(f"Found {noms_count} noms")
    def extract_arguments(self,
                          sentence,
                          return_dependency_tree=False,
                          min_arguments=0,
                          using_default=False,
                          transer_args_predictor=None,
                          context_args_predictor=None,
                          specify_none=False,
                          trim_arguments=True,
                          verb_noun_matcher=None,
                          limited_verbs=None,
                          predicate_indexes=None,
                          return_single=False):
        """
		Extracts arguments of nominalizations and verbs in the given sentence, using NOMLEX lexicon
		:param sentence: a string text or a dependency tree parsing of a sentence
		:param return_dependency_tree: whether to return the depenency tree of the given sentence as a third parameter (optional)
		:param min_arguments: the minimum number of arguments for any founed extraction (0 is deafult)
		:param using_default: whether to use the default entry in the lexicon all of the time, otherwise only whenever it is needed
		:param arguments_predictor: the model-based extractor object to determine the argument type of a span (optional)
		:param specify_none: whether to specify in the resulted extractions about the unused arguments
		:param trim_arguments: whether to trim the argument spans in the resulted extractions
		:param verb_noun_matcher: a object that matches a verb with nouns whitin its word family using CATVAR (optional)
		:param limited_verbs: a list of limited verbs, which limits the predicates that their arguments will be extracted (optional)
		:param predicate_indexes: a list of specific indexes of the predicated that should be extracted
		:param return_single: whether to return only a single dictionary of the extracted arguments, together
		:return: Two dictionaries (and an optional dependency tree):
				 - The founded extractions for each relevant verbs in the given sentence ({verb_Token: [extraction_Span]})
				 - The founded extractions for each relevant nominalizations in the given sentence ({nom_Token: [extraction_Span]})
				 The two dictionaries may return as a single dictionary
		"""

        dependency_tree = get_dependency_tree(sentence)

        # Extract arguments based on the verbal lexicon (currently disabled)
        # extractions_per_verb = self.verb_lexicon.extract_arguments(
        #     dependency_tree, min_arguments, using_default, transer_args_predictor,
        #     context_args_predictor, specify_none, trim_arguments, verb_noun_matcher,
        #     limited_verbs, predicate_indexes)
        extractions_per_verb = {}

        # Extract arguments based on the nominal lexicon
        extractions_per_nom = self.nom_lexicon.extract_arguments(
            dependency_tree, min_arguments, using_default,
            transer_args_predictor, context_args_predictor, specify_none,
            trim_arguments, verb_noun_matcher, limited_verbs,
            predicate_indexes)

        if return_single:
            extractions_per_word = extractions_per_verb
            extractions_per_word.update(extractions_per_nom)

            if return_dependency_tree:
                return extractions_per_word, dependency_tree
            else:
                return extractions_per_word

        if return_dependency_tree:
            return extractions_per_verb, extractions_per_nom, dependency_tree
        else:
            return extractions_per_verb, extractions_per_nom
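A minimal usage sketch (illustrative only; the example sentence is made up, and the extractor is constructed with config.LEXICON_FILE_NAME as in main() below):

    # Hypothetical usage of ArgumentsExtractor.extract_arguments
    extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
    extractions_per_verb, extractions_per_nom = extractor.extract_arguments(
        "The appointment of the chairman by the board was expected")
    separate_line_print(extractions_per_verb)  # verbal predicates (currently empty, see above)
    separate_line_print(extractions_per_nom)   # nominal predicates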
    def new_extract_arguments(self, sentence):
        dependency_tree = get_dependency_tree(sentence)

        for token in dependency_tree:
            # Look up the lexicon entry for this token (assumed signature; the method is still a stub)
            word_entry = self.search_word(token)

            if not word_entry:
                continue
    def search_matching_extractions(self,
                                    searched_args,
                                    sentences,
                                    extractor_function,
                                    limited_results=None):
        limited_words, limited_verbs = set(), set()
        for suitable_verb in searched_args.keys():
            limited_words.update(
                self.verb_noun_matcher.get_all_forms(suitable_verb))
            limited_verbs.add(suitable_verb)

        if not limited_words:
            return {}

        # Extract the same arguments for every nominalization or verb in the given list of sentences
        count_sents = 0
        matching_extractions = defaultdict(list)
        for sentence in tqdm(sentences):
            doc = get_dependency_tree(sentence,
                                      disable=['ner', 'parser', 'tagger'])
            lemmas = [w.lemma_ for w in doc]

            # Does any of the searched words (verbs and noms) appear in the sentence?
            if limited_words.isdisjoint(lemmas):
                continue

            # Get the extractions of the sentence
            extractions_per_word = extractor_function(
                self,
                sentence,
                limited_verbs=list(limited_verbs),
                return_single=True)

            # Replace arguments with the searched arguments names
            trans_extractions_per_word = self._translate_extractions(
                extractions_per_word, searched_args)
            matching_extractions.update(trans_extractions_per_word)

            if len(trans_extractions_per_word) >= 1:
                count_sents += 1

            if limited_results and count_sents >= limited_results:
                break

        return matching_extractions
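For reference, a hedged sketch of how this search is typically driven (it mirrors the "-search" branch of main() below; the seed sentence is made up):

    # Hypothetical usage of search_matching_extractions
    extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
    searched_args = extractor.get_searched_args(
        "The appointment of the chairman by the board",
        ArgumentsExtractor.rule_based_extraction)
    with open(config.EXAMPLE_SENTENCES_PATH) as sentences_file:
        matches = extractor.search_matching_extractions(
            searched_args, sentences_file,
            ArgumentsExtractor.rule_based_extraction, limited_results=5)
    separate_line_print(matches)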
Example #5
    def _count_verbs_appearances(verbs: set):
        verbs_appearances = defaultdict(int)

        # Count the verb appearances in a random dataset
        limited_n_sentences = 10 ** 3  # 10 ** 6
        with open(config.WIKI_SENTENCES_PATH) as sentences_file:
            for i, sentence in tqdm(enumerate(sentences_file), leave=False):
                doc = get_dependency_tree(sentence, disable=['ner', 'parser'])
                verbs_lemmas = [w.lemma_ for w in doc if w.pos_ == UPOS_VERB]

                for verb in verbs_lemmas:
                    if verb in verbs:
                        verbs_appearances[verb] += 1

                if i > limited_n_sentences:
                    break

        print(f"Found {len(verbs_appearances)} verbs out of {len(verbs)}")

        return verbs_appearances
    def parse_sentence(self, sent):
        if isinstance(sent, str):
            sent = self.clean_sentence(sent)
            n_words = len(sent.split(" "))

            # Ignore too short or too long sentences
            if sent == "" or n_words >= self.SENT_LEN_MAX or n_words <= self.SENT_LEN_MIN:
                return None

            if not self.is_english(sent):
                return None

            doc = get_dependency_tree(sent)

        else:  # the sentence is actually the dependency tree
            doc = sent

        # The current line must include only one sentence
        if len(list(doc.sents)) > 1:
            return None

        return doc
    def extract_arguments(self, sentence, word_condition):
        doc = get_dependency_tree(sentence)
        extractions_per_word = {}
        none_predicates = []

        for token in doc:
            if not word_condition(token):
                continue

            if is_noun(token):
                word_extractor = self.noun_args_extractor
            elif is_verb(token):
                word_extractor = self.verb_args_extractor
            else:
                continue

            if word_extractor.is_predicate(token):
                predicate = Predicate(token)
                extractions_per_word[predicate] = \
                    word_extractor.extract_arguments(predicate)
            else:
                none_predicates.append(token)

        return extractions_per_word, none_predicates
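An illustrative call (the extractor object, sentence, and condition are hypothetical), showing how the word_condition callback narrows which tokens are considered as candidate predicates:

    # Hypothetical usage: only consider tokens whose lemma is "acquisition"
    extractions_per_word, none_predicates = extractor.extract_arguments(
        "The acquisition of the company was announced yesterday",
        word_condition=lambda token: token.lemma_ == "acquisition")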
def main():
    # Generation of lexicons and datasets can be forced
    if "-f" in sys.argv:
        config.LOAD_DATASET = False
        config.LOAD_LEXICON = False
        config.REWRITE_TEST = True
        config.IGNORE_PROCESSED_DATASET = False

    # DEBUG mode
    if "-debug" in sys.argv:
        config.DEBUG = True

    if "-lispToJson" in sys.argv:
        if not config.LOAD_LEXICON:
            ArgumentsExtractor(config.LEXICON_FILE_NAME)
        else:
            lisp_to_json(config.LEXICON_FILE_NAME)

    if "-rule" in sys.argv:
        extractor_func = ArgumentsExtractor.rule_based_extraction
    elif "-model" in sys.argv:
        extractor_func = ArgumentsExtractor.model_based_extraction
    elif "-hybrid" in sys.argv:
        extractor_func = ArgumentsExtractor.hybrid_based_extraction
    else:  # default is rule-based
        extractor_func = ArgumentsExtractor.rule_based_extraction

    if "-extract" in sys.argv:
        sentence = sys.argv[-1]
        test_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)

        dependency_tree = get_dependency_tree(sentence)
        extractor_function = timeit(extractor_func)
        extractions_per_verb, extractions_per_nom = extractor_function(
            test_extractor, dependency_tree)

        print("--------------------------------\nVERB:")
        separate_line_print(extractions_per_verb)

        print("--------------------------------\nNOM:")
        separate_line_print(extractions_per_nom)

        arguments_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
        dataset_creator = DatasetCreator(arguments_extractor)
        noun_types = ["SUBJECT", "OBJECT", "NONE", "NOT-NOM", "VERB-NOM", "PP", "IND-OBJ"]
        x = dataset_creator.get_nouns_samples(dependency_tree,
                                              {t: 0 for t in noun_types},
                                              None)
        print(x)

    if "-search" in sys.argv:
        sentence = sys.argv[-1]
        test_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)

        sentences_file = open(config.EXAMPLE_SENTENCES_PATH)
        searched_args = test_extractor.get_searched_args(
            sentence, extractor_func)

        search_function = timeit(test_extractor.search_matching_extractions)
        matching_extractions = search_function(searched_args,
                                               sentences_file,
                                               extractor_func,
                                               limited_results=5)
        separate_line_print(matching_extractions)

    if "-test" in sys.argv:
        test_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
        test(test_extractor, extractor_func)

    if "-datasets" in sys.argv:
        config.LOAD_DATASET = True
        config.LOAD_LEXICON = True
        arguments_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
        dataset_creator = DatasetCreator(arguments_extractor)
        in_path = sys.argv[-1]

        if "-sentences" in sys.argv:
            dataset_creator.create_sentences_dataset(in_path)
        elif "-parse" in sys.argv:
            dataset_creator.create_parsed_dataset(in_path)
        elif "-example" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_examples_dataset(in_path)
        elif "-combined" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_combined_dataset(in_path)
        elif "-args" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_args_dataset(in_path)
        elif "-nouns" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_nouns_dataset(in_path)

    if "-train" in sys.argv:
        #arguments_predictor = TypesPredictor({"SUBJECT", "OBJECT", "NONE"})
        arguments_predictor = TypesPredictor({
            "SUBJECT", "OBJECT", "NONE", "NOT-NOM", "VERB-NOM", "PP", "IND-OBJ"
        })
        arguments_predictor.train()
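Assuming the module is meant to be run as a script (as the command-line flags above suggest), the standard entry point would be appended; it is not shown in the original:

if __name__ == "__main__":
    main()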