def __init__(self, config_path):
    print("Initializing configuration for relations generator...")

    # Read the config file
    self.config = configparser.RawConfigParser()
    try:
        self.config.read(config_path)
    except OSError:
        print("Unable to read config, aborting.")
        return

    # Create a mapping from each possible POS value in the config to the corresponding rwn component
    mapping = {"verb": rwn.Synset.Pos.VERB,
               "noun": rwn.Synset.Pos.NOUN,
               "adverb": rwn.Synset.Pos.ADVERB,
               "adjective": rwn.Synset.Pos.ADJECTIVE}

    # Load and split the POS parameter into a list (same bracketed, comma-separated format as the
    # counterfitting settings), then keep in the map of PoS : Rwn.Pos only the specified parts of speech
    pos_names = self.config.get("settings", "POS").replace("[", "").replace("]", "").replace(" ", "").split(",")
    self.pos = {name: mapping[name] for name in pos_names if name in mapping}

    # Load the root of the folders containing constraints
    self.constraints_root_path = self.config.get("paths", "CONSTRAINTS_ROOT_PATH")

    # Load the vocabulary as a set of unique words
    vocab_path = self.config.get("paths", "VOCAB_PATH")
    self.vocabulary = list()
    with open(file=vocab_path, mode="r", encoding="utf-8") as vocab_file:
        for line in vocab_file:
            self.vocabulary.append(line.strip())
    self.vocabulary = set(self.vocabulary)
    print(f"Loaded {len(self.vocabulary)} words from {vocab_path}")

    print(f"Finished initializing config for relations generator. "
          f"Will output results to {self.constraints_root_path}")
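
# A minimal sketch of the config file this initializer expects. The section and
# key names mirror the reads above; the values are placeholder assumptions.
#
#   [paths]
#   CONSTRAINTS_ROOT_PATH = constraints
#   VOCAB_PATH = vocab/vocab.txt
#
#   [settings]
#   POS = [verb, noun]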
def augment_antonym_verbs(lemmas: list, times: list) -> list:
    """
    Computes the list of valid antonym pairs.

    :param lemmas: A list of verb lemmas.
    :param times: A list of verbal tenses as (tense, mood, person) tuples.
    :return: The list of antonym verb pairs generated from the provided lemmas and tenses.
    """
    wordnet = rwn.RoWordNet()
    conjugator = mlconjug.Conjugator(language='ro')

    # Enforce lemmas vs. using a language model to lemmatize. The first choice should suffice for now.
    # Todo: Compare advantages/disadvantages, maybe implement the second.
    global_antonym_pairs = list()
    for lemma in lemmas:
        global_antonym_pairs.extend(generate_conjugated_pairs(lemma, wordnet, conjugator, times, 'ant'))

    # Keep only the pairs where both members were successfully conjugated
    valid_antonym_pairs = unique([pair for pair in global_antonym_pairs
                                  if pair[0] is not None and pair[1] is not None])
    return valid_antonym_pairs
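
# A minimal usage sketch (not part of the original module). The lemmas and the
# (tense, mood, person) tuple below are hypothetical placeholders; the tuple
# layout must match whatever conjugate() expects.
def _demo_augment_antonym_verbs():
    lemmas = ["intra", "urca"]                          # hypothetical verb lemmas
    times = [("Indicativ", "Indicativ prezent", "1s")]  # hypothetical tense descriptor
    pairs = augment_antonym_verbs(lemmas, times)
    print(f"Generated {len(pairs)} conjugated antonym pairs")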
def extract_verb_lemmas(base_language_model: str, sentences: list) -> tuple:
    """
    Extracts the verb lemmas from a list of sentences, together with the
    (tense, mood, person) combinations under which those verbs actually appear.

    :param base_language_model: Name of the spaCy language model used for tagging.
    :param sentences: A list of sentences to scan for verbs.
    :return: A tuple (verb_lemmas, verb_times) of unique lemmas and conjugation descriptors.
    """
    verb_lemmas = list()
    verb_times = list()
    conjugator = mlconjug.Conjugator(language='ro')

    # Manually exclude the auxiliary verbs. If the to-do below is solved, this is no longer required.
    auxiliary_verb_lemmas = ['vrea', 'avea', 'fi']
    excluded_verb_forms = ['Vmp--sf', 'Vmp--sm', 'Vmp--pm', 'Vmp--pf']

    # Load the spaCy language model
    lang_model = spacy.load(base_language_model)

    for sentence in sentences:
        doc = lang_model(sentence)
        for token in doc:
            # For now this takes all the verbs regardless of their conjugation.
            # Todo: Use https://universaldependencies.org/tagset-conversion/ro-multext-uposf.html
            if valid_verb(token, auxiliary_verb_lemmas, excluded_verb_forms):
                if token.lemma_ not in verb_lemmas:
                    verb_lemmas.append(token.lemma_)

                # Find the conjugations of the lemma whose surface form matches the token
                try:
                    target_conjugations = conjugator.conjugate(token.lemma_)
                    # For each conjugation of the target word
                    for conjugation in target_conjugations.iterate():
                        # Unpack the tense, mood, person and value from the tuple
                        time, mood, person, value = conjugation
                        if value == token.text:
                            # Found a possible candidate; append the tense/mood/person to the list
                            verb_times.append((time, mood, person))
                except ValueError:
                    print(f"Unable to conjugate, possibly mistagged verb {token.text}")

    verb_lemmas = unique(verb_lemmas)
    verb_times = unique(verb_times)
    print("Verbs extracted:", verb_lemmas)
    print("Verb tenses extracted:", verb_times)
    return verb_lemmas, verb_times
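
# A usage sketch, assuming a Romanian spaCy pipeline such as "ro_core_news_sm"
# is installed; the model name and the sentences are illustrative assumptions.
def _demo_extract_verb_lemmas():
    sentences = ["El urcă scările.", "Ea coboară treptele."]
    lemmas, times = extract_verb_lemmas("ro_core_news_sm", sentences)
    print(lemmas, times)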
def generate_antonym_pairs(config: SettingConfig) -> dict:
    """
    Generates antonym pairs from RoWordNet.

    :param config: Configuration of the current run.
    :return: A dictionary where keys are strings representing parts of speech and values are
        lists of antonym pairs from that category.
    """
    print(f"Generating initial antonym pairs from RoWordNet @ {datetime.now()}")
    wn = rwn.RoWordNet()

    # Create the output dictionary of type dict(str : set(pair(str, str))), where the key is
    # the PoS and the value is a set of pairs of words of the PoS specified by the key
    pairs = dict()

    # Iterate over the selected parts of speech
    for part_of_speech in config.pos.values():
        pos_pairs = list()

        # Return all synsets corresponding to the PoS
        synset_ids = wn.synsets(pos=part_of_speech)

        # Iterate all the synsets for the current PoS
        for synset_id in synset_ids:
            # Get the synset object specified by synset_id
            synset = wn.synset(synset_id)

            # Get the outbound relations of type antonym
            outbound_relations = filter(lambda x: x[1] == 'near_antonym', wn.outbound_relations(synset_id))

            # Iterate the outbound relations
            for relation in outbound_relations:
                # Get the synset corresponding to the target of the outbound relation
                target_synset = wn.synset(relation[0])

                # Get all the pairs, sorted by first word to keep set entries unique
                current_iteration_pairs = get_cross_synset_pairs(synset, target_synset)

                # Add the current set of pairs
                pos_pairs.extend(current_iteration_pairs)

        # Find the corresponding key in the PoS dictionary and add the pairs to the result
        for key, value in config.pos.items():
            if value == part_of_speech:
                pairs[key] = unique(pos_pairs)

    # Return the whole dictionary
    print(f"Successfully generated antonym pairs @ {datetime.now()}")
    return pairs
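
# Illustrative only: assumes a SettingConfig instance initialized as above.
def _demo_generate_antonym_pairs(config: SettingConfig):
    pairs = generate_antonym_pairs(config)
    for pos, pos_pairs in pairs.items():
        print(f"{pos}: {len(pos_pairs)} antonym pairs")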
def generate_conjugated_pairs(lemma: str, wordnet: RoWordNet, conjugator: mlconjug.Conjugator,
                              valid_verbal_times: list, mode: str) -> list:
    """
    Generates the conjugated pairs for a verb.

    :param lemma: The lemma (infinitive form) of the target word.
    :param wordnet: An instance of the RoWordNet lexicon.
    :param conjugator: An instance of the mlconjug conjugator.
    :param valid_verbal_times: A list of valid verb tenses.
    :param mode: Whether to return antonym pairs ('ant') or synonym pairs ('syn').
    :return: A list of conjugated pairs, according to the constraint specified by mode.
    """
    # Check the mode
    if mode == 'syn':
        relationships = generate_rwn_synonyms(wordnet, lemma)
    elif mode == 'ant':
        relationships = generate_rwn_antonyms(wordnet, lemma)
    else:
        raise ValueError('Invalid Mode')

    pairs = list()
    for relationship in relationships:
        for pair in relationship:
            pairs.append(pair)

    pairs = unique(pairs)
    pairs = list(filter(None, map(process_pair, pairs)))

    conjugated_pairs = list()
    for pair in pairs:
        for time in valid_verbal_times:
            try:
                conjugated_first = conjugate(pair[0], conjugator, time)
                conjugated_second = conjugate(pair[1], conjugator, time)
                if not (conjugated_first is None and conjugated_second is None):
                    conjugated_pairs.append((conjugated_first, conjugated_second))
            except ValueError:
                # No big deal if we are unable to conjugate a pair. This is probably the right
                # place to handle intruders: simply move on to the next conjugation step and
                # keep only the valid elements in the list.
                print(f"Value Error when conjugating {pair}")

    return conjugated_pairs
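
# A sketch of calling the generator in both modes for a single lemma; the lemma
# and the tense descriptor are hypothetical.
def _demo_generate_conjugated_pairs():
    wordnet = rwn.RoWordNet()
    conjugator = mlconjug.Conjugator(language='ro')
    times = [("Indicativ", "Indicativ prezent", "1s")]  # hypothetical descriptor
    synonyms = generate_conjugated_pairs("urca", wordnet, conjugator, times, 'syn')
    antonyms = generate_conjugated_pairs("urca", wordnet, conjugator, times, 'ant')
    print(f"{len(synonyms)} synonym pairs, {len(antonyms)} antonym pairs")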
def generate_synonym_pairs(config: SettingConfig) -> dict:
    """
    Generates synonym pairs from RoWordNet.

    :param config: Configuration of the current run.
    :return: A dictionary where keys are strings representing parts of speech and values are
        lists of synonym pairs from that category.
    """
    wn = rwn.RoWordNet()

    # Create the output dictionary of type dict(str : set(pair(str, str))), where the key is
    # the PoS and the value is a set of pairs of words of the PoS specified by the key
    pairs = dict()

    # Iterate over the selected parts of speech
    for part_of_speech in config.pos.values():
        pos_pairs = list()

        # Return all synsets corresponding to the PoS
        synset_ids = wn.synsets(pos=part_of_speech)

        # Iterate all the synsets for the current PoS
        for synset_id in synset_ids:
            # Get the synset object specified by synset_id
            synset = wn.synset(synset_id)

            # Get all the pairs, sorted by first word to keep set entries unique
            current_iteration_pairs = get_synset_pairs(synset)

            # Append all pairs from the current synset to the PoS-level list
            pos_pairs.extend(current_iteration_pairs)

        # Find the corresponding key in the PoS dictionary and add the pairs to the result
        for key, value in config.pos.items():
            if value == part_of_speech:
                pairs[key] = unique(pos_pairs)

    return pairs
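
# A sketch of persisting the generated pairs in the per-PoS layout the
# counterfitting settings below expect (<CONSTRAINTS_ROOT_PATH>/<pos>/synonyms.txt);
# the one-pair-per-line, space-separated file format is an assumption.
def _demo_write_synonym_pairs(config: SettingConfig):
    pairs = generate_synonym_pairs(config)
    for pos, pos_pairs in pairs.items():
        pos_dir = os.path.join(config.constraints_root_path, pos)
        os.makedirs(pos_dir, exist_ok=True)
        with open(os.path.join(pos_dir, "synonyms.txt"), mode="w", encoding="utf-8") as out:
            for w1, w2 in pos_pairs:
                out.write(f"{w1} {w2}\n")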
def postprocess_pairs(raw_pairs: list, config: SettingConfig) -> list:
    """
    Processes a list of pairs to remove reflexive forms, pairs with duplicate elements, proper nouns, etc.

    :param raw_pairs: List of tuples representing the initial pairs.
    :param config: Configuration of the current run.
    :return: The filtered list of pairs from the initial pairs.
    """
    processed_pairs = list()
    for raw_pair in raw_pairs:
        # Preprocess each pair
        processed_line = process_pair(raw_pair)

        # If the processed pair is not empty (meaning we have 2 different words)
        if processed_line:
            # Split the words
            w1, w2 = processed_line

            # Check that both are in the vocabulary
            if w1 in config.vocabulary and w2 in config.vocabulary:
                processed_pairs.append((w1, w2))
    return unique(processed_pairs)
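
# Illustrative input/output sketch; the raw pairs are made up. Per the docstring,
# the pair with duplicate elements should be filtered out.
def _demo_postprocess_pairs(config: SettingConfig):
    raw_pairs = [("urca", "coborî"), ("merge", "merge")]  # hypothetical
    print(postprocess_pairs(raw_pairs, config))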
def __init__(self, config_path, language_model_name):
    # Read the config file
    self.config = configparser.RawConfigParser()
    try:
        self.config.read(config_path)
    except OSError:
        print("Unable to read config, aborting.")
        return

    # Read the word vectors path from the config
    self.input_vectors_path = self.config.get("paths", "VEC_PATH")

    # Read the vocabulary mode (all words or just dataset words) from the config
    self.vocab_mode = self.config.get("settings", "VOCABULARY")
    if self.vocab_mode == 'all':
        self.diacritics = 'True'
        vocab_path = self.config.get("paths", "VOCAB_PATH")
    elif self.vocab_mode == 'small':
        self.diacritics = self.config.get("settings", "DIACRITICS")
        if self.diacritics == 'True':
            vocab_path = self.config.get("paths", "VOCAB_PATH_DATASET_DIAC")
        else:
            vocab_path = self.config.get("paths", "VOCAB_PATH_DATASET_NODIAC")
    else:
        print('Wrong value for parameter VOCABULARY in config. Exiting.')
        return

    synonym_paths = list()
    antonym_paths = list()

    # Read the root path of the constraints files
    constraints_root_path = self.config.get("paths", "CONSTRAINTS_ROOT_PATH")

    # Read the PoS variable
    self.parts_of_speech = self.config.get("settings", "POS").replace("[", "").replace("]", "").replace(" ", "").split(",")

    print("Loading constraints...")
    # Append antonyms and synonyms of each selected PoS from their respective folder
    for part_of_speech in self.parts_of_speech:
        antonym_paths.append(os.path.join(constraints_root_path, part_of_speech, "antonyms.txt"))
        synonym_paths.append(os.path.join(constraints_root_path, part_of_speech, "synonyms.txt"))

    self.synonyms = to.load_multiple_constraints(synonym_paths)
    self.antonyms = to.load_multiple_constraints(antonym_paths)

    vsp_path = self.config.get("paths", "VSP_PAIRS_PATH")
    self.vsp_pairs = to.load_vsp_pairs(vsp_path)
    print("Loaded constraints.")

    # Read and parse the mode (whether to include synonyms, antonyms or VSP pairs in the current run)
    mode = self.config.get("settings", "MODE").replace("[", "").replace("]", "").replace(" ", "").split(",")

    vocab = list()
    print("Loading vocabulary...")
    with open(file=vocab_path, mode="r", encoding="utf-8") as vocab_file:
        for line in vocab_file:
            vocab.append(line.strip())

    # Small optimization trick for O(1) lookup
    vocab_set = set(vocab)

    # Add augmented words from the synonym list to the vocab
    for pair in self.synonyms:
        if pair[0] in vocab_set or pair[1] in vocab_set:
            vocab.append(pair[0])
            vocab.append(pair[1])

    # Add augmented words from the antonym list to the vocab
    for pair in self.antonyms:
        if pair[0] in vocab_set or pair[1] in vocab_set:
            vocab.append(pair[0])
            vocab.append(pair[1])

    vocab = to.unique(vocab)
    print("Loaded vocabulary.")

    # Load the word vectors
    print("Loading word vectors...")
    dimensions, self.vectors = to.load_vectors(self.input_vectors_path, set(vocab))

    # Return if the vectors were not successfully loaded
    if not self.vectors:
        print("Unable to load initial vectors.")
        return
    print("Loaded word vectors.")

    if language_model_name:
        self.output_vectors_path = f"{self.config.get('paths', 'VEC_ROOT_PATH')}/{language_model_name}.vec"
    else:
        self.output_vectors_path = (self.config.get("paths", "CF_VEC_PATH").split(".")[0]
                                    + f"_{str(datetime.timestamp(datetime.now())).split('.')[0]}.vec")

    # The vocabulary contains the keys of the vectors successfully loaded: words in the
    # initial vocabulary with no corresponding vector are skipped
    self.vocabulary = to.unique(self.vectors.keys())
    self.dimensions = f"{len(self.vocabulary)} {dimensions.split(' ')[1]}"

    # Store the parsed run mode (synonyms / antonyms / VSP pairs)
    self.mode = mode

    # Read the hyperparameters of the run
    self.hyper_k1 = self.config.getfloat("hyperparameters", "hyper_k1")
    self.hyper_k2 = self.config.getfloat("hyperparameters", "hyper_k2")
    self.hyper_k3 = self.config.getfloat("hyperparameters", "hyper_k3")
    self.sgd_iters = self.config.getint("hyperparameters", "sgd_iters")
    self.delta = self.config.getfloat("hyperparameters", "delta")
    self.gamma = self.config.getfloat("hyperparameters", "gamma")
    self.rho = self.config.getfloat("hyperparameters", "rho")

    print(f"Initialized counterfitting settings. Vocab path: {vocab_path}, PoS paths: {self.parts_of_speech},"
          f" Mode: {self.mode}, diacritics: {self.diacritics}."
          f" Hyperparameters: {self.hyperparams_tostring()}")
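
# A sketch of the config file consumed by this initializer. The section and key
# names mirror the reads above; every value (including the MODE tokens) is a
# placeholder assumption.
#
#   [paths]
#   VEC_PATH = vectors/initial.vec
#   VEC_ROOT_PATH = vectors
#   CF_VEC_PATH = vectors/counterfit.vec
#   VOCAB_PATH = vocab/vocab.txt
#   VOCAB_PATH_DATASET_DIAC = vocab/dataset_diac.txt
#   VOCAB_PATH_DATASET_NODIAC = vocab/dataset_nodiac.txt
#   CONSTRAINTS_ROOT_PATH = constraints
#   VSP_PAIRS_PATH = constraints/vsp_pairs.txt
#
#   [settings]
#   VOCABULARY = small
#   DIACRITICS = True
#   POS = [verb, noun]
#   MODE = [syn, ant, vsp]
#
#   [hyperparameters]
#   hyper_k1 = 0.1
#   hyper_k2 = 0.1
#   hyper_k3 = 0.1
#   sgd_iters = 20
#   delta = 1.0
#   gamma = 0.0
#   rho = 0.2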