def count_predicates(self):
    verbs_count = 0
    noms_count = 0

    # Collect the verb and nominalization lemmas from the NOMLEX pairs (sets, for fast membership checks)
    nomlex_pairs = self._get_nomlex_pairs(get_lexicon_path(config.NOMLEX_PLUS_NAME, "json"))
    verbs = {pair[0] for pair in nomlex_pairs}
    noms = {pair[1] for pair in nomlex_pairs}

    # Count the number of appearances of the verbs and noms in a random dataset
    limited_n_sentences = 10 ** 5  # 10 ** 6
    sentences_file = open(config.WIKI_SENTENCES_PATH)

    pbar = tqdm(enumerate(sentences_file), leave=False)
    for i, sentence in pbar:
        doc = get_dependency_tree(sentence, disable=['ner', 'parser'])

        for word in doc:
            if word.pos_ == UPOS_VERB and word.lemma_ in verbs:
                verbs_count += 1
            elif word.pos_ == UPOS_NOUN and word.lemma_ in noms:
                noms_count += 1

        pbar.set_description(f"VERBS={verbs_count}, NOMS={noms_count}")

        if i > limited_n_sentences:
            break

    print(f"Found {verbs_count} verbs")
    print(f"Found {noms_count} noms")
def extract_arguments(self, sentence, return_dependency_tree=False, min_arguments=0, using_default=False,
                      transer_args_predictor=None, context_args_predictor=None, specify_none=False,
                      trim_arguments=True, verb_noun_matcher=None, limited_verbs=None,
                      predicate_indexes=None, return_single=False):
    """
    Extracts the arguments of the nominalizations and verbs in the given sentence, using the NOMLEX lexicon
    :param sentence: a string text or a dependency tree parsing of a sentence
    :param return_dependency_tree: whether to also return the dependency tree of the given sentence as a third value (optional)
    :param min_arguments: the minimum number of arguments for any found extraction (0 by default)
    :param using_default: whether to always use the default entry in the lexicon, otherwise only whenever it is needed
    :param transer_args_predictor: a model-based predictor that determines the argument type of a span (optional)
    :param context_args_predictor: a second model-based predictor of argument types (optional)
    :param specify_none: whether the resulting extractions should specify the unused arguments
    :param trim_arguments: whether to trim the argument spans in the resulting extractions
    :param verb_noun_matcher: an object that matches a verb with the nouns within its word family, using CATVAR (optional)
    :param limited_verbs: a list of verbs that limits the predicates whose arguments will be extracted (optional)
    :param predicate_indexes: a list of specific indexes of the predicates that should be extracted (optional)
    :param return_single: whether to return only a single dictionary of the extracted arguments, for verbs and noms together
    :return: two dictionaries (and an optional dependency tree):
             - the found extractions for each relevant verb in the given sentence ({verb_Token: [extraction_Span]})
             - the found extractions for each relevant nominalization in the given sentence ({nom_Token: [extraction_Span]})
             The two dictionaries may be returned as a single dictionary
    """
    dependency_tree = get_dependency_tree(sentence)

    # Extract arguments based on the verbal lexicon
    # extractions_per_verb = self.verb_lexicon.extract_arguments(dependency_tree, min_arguments, using_default,
    #                                                            transer_args_predictor, context_args_predictor,
    #                                                            specify_none, trim_arguments, verb_noun_matcher,
    #                                                            limited_verbs, predicate_indexes)
    extractions_per_verb = {}

    # Extract arguments based on the nominal lexicon
    extractions_per_nom = self.nom_lexicon.extract_arguments(dependency_tree, min_arguments, using_default,
                                                             transer_args_predictor, context_args_predictor,
                                                             specify_none, trim_arguments, verb_noun_matcher,
                                                             limited_verbs, predicate_indexes)

    if return_single:
        extractions_per_word = extractions_per_verb
        extractions_per_word.update(extractions_per_nom)

        if return_dependency_tree:
            return extractions_per_word, dependency_tree
        else:
            return extractions_per_word

    if return_dependency_tree:
        return extractions_per_verb, extractions_per_nom, dependency_tree
    else:
        return extractions_per_verb, extractions_per_nom
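# A minimal usage sketch (an assumption, not part of the original module), mirroring the way
# ArgumentsExtractor is constructed and the extraction results are printed in main() below;
# the sample sentence is only a placeholder:
#
#   extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
#   extractions_per_verb, extractions_per_nom = extractor.extract_arguments(
#       "The destruction of the city by the enemy was rapid.")
#   separate_line_print(extractions_per_verb)
#   separate_line_print(extractions_per_nom)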
def new_extract_arguments(self, sentence):
    dependency_tree = get_dependency_tree(sentence)

    for token in dependency_tree:
        word_entry = self.search_word()

        if not word_entry:
            continue
def search_matching_extractions(self, searched_args, sentences, extractor_function, limited_results=None):
    limited_words, limited_verbs = set(), set()
    for suitable_verb in searched_args.keys():
        limited_words.update(self.verb_noun_matcher.get_all_forms(suitable_verb))
        limited_verbs.add(suitable_verb)

    if not limited_words:
        return {}

    # Extract the same arguments for every nominalization or verb in the given list of sentences
    count_sents = 0
    matching_extractions = defaultdict(list)
    for sentence in tqdm(sentences):
        doc = get_dependency_tree(sentence, disable=['ner', 'parser', 'tagger'])
        lemmas = [w.lemma_ for w in doc]

        # Does any of the searched words (verbs and noms) appear in the sentence?
        if limited_words.isdisjoint(lemmas):
            continue

        # Get the extractions of the sentence
        extractions_per_word = extractor_function(self, sentence, limited_verbs=list(limited_verbs), return_single=True)

        # Replace the extracted argument names with the searched argument names
        trans_extractions_per_word = self._translate_extractions(extractions_per_word, searched_args)
        matching_extractions.update(trans_extractions_per_word)

        if len(trans_extractions_per_word) >= 1:
            count_sents += 1

        if limited_results and count_sents >= limited_results:
            break

    return matching_extractions
def _count_verbs_appearances(verbs: set):
    verbs_appearances = defaultdict(int)

    # Count the number of appearances of each verb in a random dataset
    limited_n_sentences = 10 ** 3  # 10 ** 6
    sentences_file = open(config.WIKI_SENTENCES_PATH)

    for i, sentence in tqdm(enumerate(sentences_file), leave=False):
        doc = get_dependency_tree(sentence, disable=['ner', 'parser'])
        verbs_lemmas = [w.lemma_ for w in doc if w.pos_ == UPOS_VERB]

        for verb in verbs_lemmas:
            if verb in verbs:
                verbs_appearances[verb] += 1

        if i > limited_n_sentences:
            break

    print(f"Found {len(verbs_appearances.keys())} verbs out of {len(verbs)}")
    return verbs_appearances
def parse_sentence(self, sent):
    if isinstance(sent, str):
        sent = self.clean_sentence(sent)
        n_words = len(sent.split(" "))

        # Ignore sentences that are too short or too long
        if sent == "" or n_words >= self.SENT_LEN_MAX or n_words <= self.SENT_LEN_MIN:
            return None

        if not self.is_english(sent):
            return None

        doc = get_dependency_tree(sent)
    else:  # the sentence is actually a dependency tree
        doc = sent

    # The current line must include only one sentence
    if len(list(doc.sents)) > 1:
        return None

    return doc
def extract_arguments(self, sentence, word_condition):
    doc = get_dependency_tree(sentence)

    extractions_per_word = {}
    none_predicates = []

    for token in doc:
        if not word_condition(token):
            continue

        # Choose the appropriate extractor based on the token's part of speech
        if is_noun(token):
            word_extractor = self.noun_args_extractor
        elif is_verb(token):
            word_extractor = self.verb_args_extractor
        else:
            continue

        if word_extractor.is_predicate(token):
            predicate = Predicate(token)
            extractions_per_word[predicate] = word_extractor.extract_arguments(predicate)
        else:
            none_predicates.append(token)

    return extractions_per_word, none_predicates
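# A hypothetical illustration (not from the original code) of the word_condition parameter: it is
# expected to be a callable that receives a token of the dependency tree (spaCy-style, with
# .lemma_ and .pos_ attributes) and returns whether that token's arguments should be extracted.
# For example, limiting extraction to a specific set of lemmas:
#
#   target_lemmas = {"acquisition", "acquire"}
#   extractions_per_word, none_predicates = extractor.extract_arguments(
#       sentence, word_condition=lambda token: token.lemma_ in target_lemmas)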
def main():
    # Generation of lexicons and datasets can be forced
    if "-f" in sys.argv:
        config.LOAD_DATASET = False
        config.LOAD_LEXICON = False
        config.REWRITE_TEST = True
        config.IGNORE_PROCESSED_DATASET = False

    # DEBUG mode
    if "-debug" in sys.argv:
        config.DEBUG = True

    if "-lispToJson" in sys.argv:
        if not config.LOAD_LEXICON:
            ArgumentsExtractor(config.LEXICON_FILE_NAME)
        else:
            lisp_to_json(config.LEXICON_FILE_NAME)

    if "-rule" in sys.argv:
        extractor_func = ArgumentsExtractor.rule_based_extraction
    elif "-model" in sys.argv:
        extractor_func = ArgumentsExtractor.model_based_extraction
    elif "-hybrid" in sys.argv:
        extractor_func = ArgumentsExtractor.hybrid_based_extraction
    else:  # the default is rule-based extraction
        extractor_func = ArgumentsExtractor.rule_based_extraction

    if "-extract" in sys.argv:
        sentence = sys.argv[-1]
        test_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)

        dependency_tree = get_dependency_tree(sentence)
        extractor_function = timeit(extractor_func)
        extractions_per_verb, extractions_per_nom = extractor_function(test_extractor, dependency_tree)

        print("--------------------------------\nVERB:")
        separate_line_print(extractions_per_verb)
        print("--------------------------------\nNOM:")
        separate_line_print(extractions_per_nom)

        arguments_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
        dataset_creator = DatasetCreator(arguments_extractor)
        x = dataset_creator.get_nouns_samples(
            dependency_tree,
            {l: 0 for l in {"SUBJECT", "OBJECT", "NONE", "NOT-NOM", "VERB-NOM", "PP", "IND-OBJ"}},
            None)
        print(x)

    if "-search" in sys.argv:
        sentence = sys.argv[-1]
        test_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
        sentences_file = open(config.EXAMPLE_SENTENCES_PATH)

        searched_args = test_extractor.get_searched_args(sentence, extractor_func)
        search_function = timeit(test_extractor.search_matching_extractions)
        matching_extractions = search_function(searched_args, sentences_file, extractor_func, limited_results=5)
        separate_line_print(matching_extractions)

    if "-test" in sys.argv:
        test_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
        test(test_extractor, extractor_func)

    if "-datasets" in sys.argv:
        config.LOAD_DATASET = True
        config.LOAD_LEXICON = True
        arguments_extractor = ArgumentsExtractor(config.LEXICON_FILE_NAME)
        dataset_creator = DatasetCreator(arguments_extractor)
        in_path = sys.argv[-1]

        if "-sentences" in sys.argv:
            dataset_creator.create_sentences_dataset(in_path)
        elif "-parse" in sys.argv:
            dataset_creator.create_parsed_dataset(in_path)
        elif "-example" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_examples_dataset(in_path)
        elif "-combined" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_combined_dataset(in_path)
        elif "-args" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_args_dataset(in_path)
        elif "-nouns" in sys.argv:
            dataset_creator.dataset_size = 100000
            dataset_creator.create_nouns_dataset(in_path)

    if "-train" in sys.argv:
        # arguments_predictor = TypesPredictor({"SUBJECT", "OBJECT", "NONE"})
        arguments_predictor = TypesPredictor({"SUBJECT", "OBJECT", "NONE", "NOT-NOM", "VERB-NOM", "PP", "IND-OBJ"})
        arguments_predictor.train()