def __init__(self, config_path):
        """Initialize the relations-generator configuration.

        Reads the config file at *config_path*, builds the PoS-name ->
        RoWordNet-PoS mapping, and loads the vocabulary into a set for
        O(1) membership tests.

        :param config_path: Path to the .ini configuration file.
        """
        print("Initializing configuration for relations generator...")
        # Read the config file; on I/O failure bail out early, leaving the
        # object only partially initialized.
        self.config = configparser.RawConfigParser()
        try:
            self.config.read(config_path)
        except OSError:
            print("Unable to read config, aborting.")
            return

        # Map each PoS name accepted in the config to its RoWordNet component.
        # NOTE(review): an earlier comment claimed only the configured parts
        # of speech were kept, but the full mapping is used unchanged.
        self.pos = {"verb": rwn.Synset.Pos.VERB,
                    "noun": rwn.Synset.Pos.NOUN,
                    "adverb": rwn.Synset.Pos.ADVERB,
                    "adjective": rwn.Synset.Pos.ADJECTIVE}

        # Load the root of the folders containing constraints
        self.constraints_root_path = self.config.get("paths", "CONSTRAINTS_ROOT_PATH")

        vocab_path = self.config.get("paths", "VOCAB_PATH")

        # One word per line; a set already removes duplicates, so the former
        # extra unique() pass was redundant.
        with open(file=vocab_path, mode="r", encoding="utf-8") as vocab_file:
            self.vocabulary = {line.strip() for line in vocab_file}
        print(f"Loaded {len(self.vocabulary)} words from {vocab_path}")
        print(
            f"Finished initializing config for relations generator. Will output results to {self.constraints_root_path}")
def augment_antonym_verbs(lemmas: list, times: list) -> list:
    """
    Computes the list of valid antonym pairs.

    :param lemmas: A list of verb lemmas.
    :param times: A list of verbal times.
    :return: The list of antonym verb pairs generated from the provided
        lemmas and verbal times, with partial (None-containing) pairs removed.
    """
    wordnet = rwn.RoWordNet()
    conjugator = mlconjug.Conjugator(language='ro')

    # Enforce lemmas vs use language model to lemmatize. First choice should suffice for now.
    # Todo: Compare advantages/disadvantages, maybe implement second
    global_antonym_pairs = list()
    for lemma in lemmas:
        global_antonym_pairs.extend(
            generate_conjugated_pairs(lemma, wordnet, conjugator, times,
                                      'ant'))

    # Drop pairs where either member failed to conjugate; the redundant
    # list(...) wrapper around the comprehension was removed.
    valid_antonym_pairs = unique([
        pair for pair in global_antonym_pairs
        if pair[0] is not None and pair[1] is not None
    ])

    return valid_antonym_pairs
def extract_verb_lemmas(base_langauge_model: str, sentences: list) -> tuple:
    """
    Extracts the verb lemmas occurring in *sentences*, together with the
    (time, mood, person) triples under which those verbs appear.

    :param base_langauge_model: Name of the spaCy language model to load.
        (Name kept as-is, including the historical typo, to preserve the
        public interface for keyword callers.)
    :param sentences: List of sentences to analyse.
    :return: Tuple (verb_lemmas, verb_times) of de-duplicated lists.
    """
    verb_lemmas = list()
    seen_lemmas = set()  # O(1) membership instead of scanning the list each time
    verb_times = list()
    conjugator = mlconjug.Conjugator(language='ro')

    # Manually exclude the auxiliary verbs. If the to-do below is solved, this is no longer required
    auxilliary_verb_lemmas = ['vrea', 'avea', 'fi']
    excluded_verb_forms = ['Vmp--sf', 'Vmp--sm', 'Vmp--pm', 'Vmp--pf']

    # Load the SpaCy Language Model
    lang_model = spacy.load(base_langauge_model)
    for sentence in sentences:
        doc = lang_model(sentence)
        for token in doc:
            # For now this takes all the verbs regardless of their conjugation.
            # Todo: Use https://universaldependencies.org/tagset-conversion/ro-multext-uposf.html
            if valid_verb(token, auxilliary_verb_lemmas, excluded_verb_forms) \
                    and token.lemma_ not in seen_lemmas:
                seen_lemmas.add(token.lemma_)
                verb_lemmas.append(token.lemma_)

                # Figure the possible conjugations of the lemma which are equal to the token
                try:
                    target_conjugations = conjugator.conjugate(token.lemma_)

                    # Unpack the time, mood, person and value of each
                    # conjugation directly in the loop header
                    for time, mood, person, value in target_conjugations.iterate():
                        if value == token.text:
                            # Found a possible candidate, append the time/mood/person to the list
                            verb_times.append((time, mood, person))
                except ValueError:
                    print(
                        f"Unable to conjugate, possibly mistagged verb {token.text}"
                    )

    verb_lemmas = unique(verb_lemmas)
    verb_times = unique(verb_times)
    print("Verbs extracted", verb_lemmas)
    print("Verb times extracted:", verb_times)
    return verb_lemmas, verb_times
def generate_antonym_pairs(config: SettingConfig) -> dict:
    """
    Generates antonym pairs from RoWordNet.

    :param config: Configuration of the current run.
    :return: A dictionary where keys are strings representing parts of speech
        and values are unique lists of antonym pairs from that category.
    """
    print(f"Generating initial antonym pairs from RoWordNet @ {datetime.now()}")
    wn = rwn.RoWordNet()

    # Output dict of type dict(str : list(pair(str, str))) where the key is
    # the PoS name and the value holds the unique pairs of that PoS.
    pairs = dict()

    # Iterate (name, rwn PoS) together instead of reverse-searching the
    # mapping for the matching key afterwards.
    for pos_name, part_of_speech in config.pos.items():

        pos_pairs = list()

        # Iterate all the synsets for the current PoS
        for synset_id in wn.synsets(pos=part_of_speech):

            # Get the synset object specified by synset_id
            synset = wn.synset(synset_id)

            # Keep only the outbound relations of type antonym
            outbound_relations = filter(lambda x: x[1] == 'near_antonym', wn.outbound_relations(synset_id))

            for relation in outbound_relations:
                # Get the synset corresponding to the target of the outbound relation
                target_synset = wn.synset(relation[0])

                # Get all the pairs, sort them by first word to keep set entries unique
                pos_pairs.extend(get_cross_synset_pairs(synset, target_synset))

        pairs[pos_name] = unique(pos_pairs)

    # Fixed typo in the log message ("paris" -> "pairs")
    print(f"Successfully generated antonym pairs @ {datetime.now()}")
    return pairs
def generate_conjugated_pairs(lemma: str, wordnet: RoWordNet,
                              conjugator: mlconjug.Conjugator,
                              valid_verbal_times: list, mode: str) -> list:
    """
    Generates the conjugated pairs for a verb.

    :param lemma: The lemma (infinitive form) of the target word.
    :param wordnet: An instance of the RoWordNet lexicon.
    :param conjugator: An instance of the mlConjug conjugator.
    :param valid_verbal_times: A list of valid verb times.
    :param mode: Specifies whether the function returns the list of antonym
        pairs ('ant') or synonym pairs ('syn').
    :return: A list of conjugated pairs, according to the constraint specified by mode.
    :raises ValueError: If mode is neither 'syn' nor 'ant'.
    """
    # Check the mode
    if mode == 'syn':
        relationships = generate_rwn_synonyms(wordnet, lemma)
    elif mode == 'ant':
        relationships = generate_rwn_antonyms(wordnet, lemma)
    else:
        raise ValueError('Invalid Mode')

    # Flatten the relationships into a unique list of pairs
    pairs = unique([pair for relationship in relationships
                    for pair in relationship])

    # Normalize each pair and drop the rejected (falsy) ones; the redundant
    # lambda and double list() wrappers were removed.
    pairs = [p for p in map(process_pair, pairs) if p]

    conjugated_pairs = list()

    for pair in pairs:
        for time in valid_verbal_times:
            try:
                conjugated_first = conjugate(pair[0], conjugator, time)
                conjugated_second = conjugate(pair[1], conjugator, time)
                # Keep the pair unless *both* sides failed to conjugate;
                # half-None pairs are filtered out by downstream callers.
                if not (conjugated_first is None
                        and conjugated_second is None):
                    conjugated_pairs.append(
                        (conjugated_first, conjugated_second))
            except ValueError:
                # Well, no big deal if we are unable to conjugate the pairs:
                # log it and move on to the next conjugation step, so only
                # valid elements end up in the list.
                print(f"Value Error when conjugating {pair}")

    return conjugated_pairs
def generate_synonym_pairs(config: SettingConfig) -> dict:
    """
    Generates synonym pairs from RoWordNet.

    :param config: Configuration of the current run.
    :return: A dictionary where keys are strings representing parts of speech
        and values are unique lists of synonym pairs from that category.
    """
    wn = rwn.RoWordNet()

    # Output dict of type dict(str : list(pair(str, str))) where the key is
    # the PoS name and the value holds the unique pairs of that PoS.
    pairs = dict()

    # Iterate (name, rwn PoS) together instead of reverse-searching the
    # mapping for the matching key afterwards (consistent with the antonym
    # generator).
    for pos_name, part_of_speech in config.pos.items():

        pos_pairs = list()

        # Iterate all the synsets for the current PoS
        for synset_id in wn.synsets(pos=part_of_speech):
            # Get the synset object specified by synset_id
            synset = wn.synset(synset_id)

            # Get all the pairs, sort them by first word to keep set entries
            # unique, and append them to the global list for this PoS
            pos_pairs.extend(get_synset_pairs(synset))

        pairs[pos_name] = unique(pos_pairs)

    return pairs
def postprocess_pairs(raw_pairs: list, config: SettingConfig) -> list:
    """
    Processes a list of pairs to remove reflexive forms, pairs with duplicate
    elements, proper nouns, etc.

    :param raw_pairs: List of tuples representing the initial pairs.
    :param config: Configuration of the current run.
    :return: The filtered, de-duplicated list of pairs from the initial pairs.
    """
    vocabulary = config.vocabulary
    kept = []

    for candidate in raw_pairs:
        # Normalize the candidate pair; a falsy result means it was rejected
        cleaned = process_pair(candidate)
        if not cleaned:
            continue

        # Keep the pair only when both of its words are in the vocabulary
        first, second = cleaned
        if first in vocabulary and second in vocabulary:
            kept.append((first, second))

    return unique(kept)
# --- Example #8 ---
    def __init__(self, config_path, language_model_name):
        """Initialize the counterfitting settings.

        Reads paths, vocabulary mode, constraint files (synonyms, antonyms,
        VSP pairs), word vectors and hyperparameters from the config file.
        On an unreadable config, a bad VOCABULARY value, or failed vector
        loading, the constructor returns early and leaves the object only
        partially initialized.

        :param config_path: Path to the .ini configuration file.
        :param language_model_name: Optional name used to build the output
            vectors path; when falsy, a timestamped variant of CF_VEC_PATH
            is used instead.
        """
        # Read the config file
        self.config = configparser.RawConfigParser()
        try:
            self.config.read(config_path)
        except OSError:
            print("Unable to read config, aborting.")
            return

        # Read the word vectors path from the config
        self.input_vectors_path = self.config.get("paths", "VEC_PATH")

        # Read the vocabulary mode (all words or just dataset words) from the config.
        # 'all' forces diacritics on; 'small' picks the vocab file by the
        # DIACRITICS flag; anything else aborts initialization.
        self.vocab_mode = self.config.get("settings", "VOCABULARY")
        if self.vocab_mode == 'all':
            self.diacritics = 'True'
            vocab_path = self.config.get("paths", "VOCAB_PATH")
        elif self.vocab_mode == 'small':
            self.diacritics = self.config.get("settings", "DIACRITICS")
            if self.diacritics == 'True':
                vocab_path = self.config.get("paths",
                                             "VOCAB_PATH_DATASET_DIAC")
            else:
                vocab_path = self.config.get("paths",
                                             "VOCAB_PATH_DATASET_NODIAC")
        else:
            print('Wrong value for parameter VOCABULARY in config. Exiting')
            return

        synonym_paths = list()
        antonym_paths = list()

        # Read the root path of the constraints files
        constraints_root_path = self.config.get("paths",
                                                "CONSTRAINTS_ROOT_PATH")

        # Read the PoS variable, e.g. "[noun, verb]" -> ["noun", "verb"]
        self.parts_of_speech = self.config.get("settings", "POS").replace(
            "[", "").replace("]", "").replace(" ", "").split(",")

        print("Loading constraints...")
        # Append antonyms and synonyms of each selected PoS from their respective folder
        for part_of_speech in self.parts_of_speech:
            antonym_paths.append(
                os.path.join(constraints_root_path, part_of_speech,
                             "antonyms.txt"))
            synonym_paths.append(
                os.path.join(constraints_root_path, part_of_speech,
                             "synonyms.txt"))

        self.synonyms = to.load_multiple_constraints(synonym_paths)
        self.antonyms = to.load_multiple_constraints(antonym_paths)

        # Path of the VSP pairs file (presumably "vector space preservation"
        # pairs — confirm against the counterfitting code)
        vsp_path = self.config.get("paths", "VSP_PAIRS_PATH")

        self.vsp_pairs = to.load_vsp_pairs(vsp_path)

        print("Loaded constraints.")
        # Read and parse the mode (whether to include synonyms, antonyms or VSP pairs in the current run)
        mode = self.config.get("settings", "MODE").replace("[", "").replace(
            "]", "").replace(" ", "").split(",")

        vocab = list()

        print("Loading vocabulary...")
        # One vocabulary word per line
        with open(file=vocab_path, mode="r", encoding="utf-8") as vocab_file:
            for line in vocab_file:
                vocab.append(line.strip())

        # Add augmented words from synonyms list to the vocab
        # Small optimization trick for O(1) lookup:
        # NOTE(review): vocab_set is built once and never refreshed, so both
        # loops below test membership against the *original* vocabulary only
        # — presumably intentional; confirm.
        vocab_set = set(vocab)
        for pair in self.synonyms:
            if pair[0] in vocab_set or pair[1] in vocab_set:
                vocab.append(pair[0])
                vocab.append(pair[1])

        # Add augmented words from antonym lists to the vocab
        for pair in self.antonyms:
            if pair[0] in vocab_set or pair[1] in vocab_set:
                vocab.append(pair[0])
                vocab.append(pair[1])

        vocab = to.unique(vocab)
        print("Loaded vocabulary.")

        # Load the word vectors
        print("Loading word vectors...")
        dimensions, self.vectors = to.load_vectors(self.input_vectors_path,
                                                   set(vocab))

        # Return if vectors were not successfully loaded
        if not self.vectors:
            print("Unable to load initial vectors")
            return

        print("Loaded word vectors ")
        # Output path: derived from the language model name when given,
        # otherwise a timestamped variant of CF_VEC_PATH.
        if language_model_name:
            self.output_vectors_path = f"{self.config.get('paths', 'VEC_ROOT_PATH')}/{language_model_name}.vec"
        else:
            self.output_vectors_path = self.config.get(
                "paths", "CF_VEC_PATH"
            ).split(
                "."
            )[0] + f"_{str(datetime.timestamp(datetime.now())).split('.')[0]}.vec"

        # The vocabulary contains the keys of vectors successfully loaded by the initial vocabulary: Words in the
        # initial vocabulary with no corresponding vector are skipped
        self.vocabulary = to.unique(self.vectors.keys())

        # Looks like the "<word count> <dimension>" header of a .vec file —
        # verify against to.load_vectors' return format
        self.dimensions = f"{len(self.vocabulary)} {dimensions.split(' ')[1]}"

        # Load synonym and antonym pairs from the paths specified
        self.mode = mode

        # Read the hyperparameters of our run
        self.hyper_k1 = self.config.getfloat("hyperparameters", "hyper_k1")
        self.hyper_k2 = self.config.getfloat("hyperparameters", "hyper_k2")
        self.hyper_k3 = self.config.getfloat("hyperparameters", "hyper_k3")
        self.sgd_iters = self.config.getint("hyperparameters", "sgd_iters")

        self.delta = self.config.getfloat("hyperparameters", "delta")
        self.gamma = self.config.getfloat("hyperparameters", "gamma")
        self.rho = self.config.getfloat("hyperparameters", "rho")
        print(
            f"Initialized counterfitting settings. Vocab path: {vocab_path}, PoS paths: {self.parts_of_speech},"
            f" Mode: {self.mode}, diacritics: {self.diacritics}."
            f" Hyperpameters: {self.hyperparams_tostring()}")