punct_1=punct_en, punct_2=punct_de,
                                                                     stop_words_1=stop_words_en,
                                                                     stop_words_2=stop_words_de, test_mode=True)

tokenized_corpus_test_en, tokenized_corpus_test_de = tokenize_corpuses(P=P,
                                                                       corpus_1=corpus_test_en, corpus_2=corpus_test_de,
                                                                       punct_1=punct_en, punct_2=punct_de,
                                                                       stop_words_1=stop_words_en,
                                                                       stop_words_2=stop_words_de, test_mode=True)

# Build vocabularies
count_dict_en = P.dictionary_frequencies(tokenized_corpus_en)
vocab_en = Vocabulary(count_dictionary=count_dict_en, min_count=0)
count_dict_de = P.dictionary_frequencies(tokenized_corpus_de)
vocab_de = Vocabulary(count_dictionary=count_dict_de, min_count=0)
vocab_en.build_from_token(tokenized_corpus_en)
vocab_de.build_from_token(tokenized_corpus_de)

# Build datasets
X_train = P.get_tensor_set_for_regression(vocab_en, vocab_de, tokenized_corpus_en, tokenized_corpus_de).numpy()
X_val = P.get_tensor_set_for_regression(vocab_en, vocab_de, tokenized_corpus_val_en, tokenized_corpus_val_de).numpy()
X_test = P.get_tensor_set_for_regression(vocab_en, vocab_de, tokenized_corpus_test_en, tokenized_corpus_test_de).numpy()

print("Training shape: ", X_train.shape)
print("Validation shape: ", X_val.shape)

# Apply linear regression
lr = LinearRegression().fit(X_train, y_train)

# Validation
predictions = lr.predict(X_val)
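
# A hedged follow-up sketch (not in the original example): score the
# validation predictions. It assumes y_val holds the validation labels and
# that scipy and scikit-learn are importable here; RMSE and Pearson r are
# illustrative metric choices, not taken from the original code.
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_val, predictions) ** 0.5
pearson_r, _ = pearsonr(y_val, predictions)
print("Validation RMSE: ", rmse)
print("Validation Pearson r: ", pearson_r)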

# Example 2

class BagOfWords:
    def __init__(self,
                 P,
                 corpus_1,
                 corpus_2,
                 y_train,
                 stop_words_1,
                 stop_words_2,
                 corpus_val_1,
                 corpus_val_2,
                 y_val,
                 n_gram='unigram',
                 frequency=False,
                 min_count=2,
                 min_count_gram=1):
        """
        BagOfWords Constructor

        :param P: Preprocessing instance used on the training sets
        :param corpus_1: Training corpus for language 1
        :param corpus_2: Training corpus for language 2
        :param y_train: Training labels
        :param stop_words_1: Stop words for language 1
        :param stop_words_2: Stop words for language 2
        :param corpus_val_1: Validation corpus for language 1
        :param corpus_val_2: Validation corpus for language 2
        :param y_val: Validation labels
        :param n_gram: Option for building the vocabulary (see README.md for details)
        :param frequency: True: normalize the bag of words by each word's frequency in the sentence.
                          False: 1 if the word occurs in the sentence, 0 otherwise.
        :param min_count: Exclude words that do not appear more than min_count times from the vocabulary.
        :param min_count_gram: Exclude n-grams that do not appear more than min_count_gram times from the vocabulary.
        """

        # General parameters
        self.y_train = y_train
        self.y_val = y_val
        self.model = None
        self.device = None
        self.optimizer = None
        self.frequency = frequency
        self.n_gram = n_gram
        self.P = P
        self.pred = None
        self.X_test = None
        self.predictions = None

        # Get punctuation
        self.punct_1 = P.get_regex_special_characters(corpus_1)
        self.punct_2 = P.get_regex_special_characters(corpus_2)
        # Stop words
        self.stop_words_1 = stop_words_1
        self.stop_words_2 = stop_words_2

        # Tokenize corpus
        tokenized_corpus_1, tokenized_corpus_2 = tokenize_corpuses(
            P=P,
            corpus_1=corpus_1,
            corpus_2=corpus_2,
            punct_1=self.punct_1,
            punct_2=self.punct_2,
            stop_words_1=stop_words_1,
            stop_words_2=stop_words_2)

        # Build vocabularies
        count_dict_1 = P.dictionary_frequencies(tokenized_corpus_1)
        vocab_1 = Vocabulary(count_dictionary=count_dict_1,
                             min_count=min_count)
        count_dict_2 = P.dictionary_frequencies(tokenized_corpus_2)
        vocab_2 = Vocabulary(count_dictionary=count_dict_2,
                             min_count=min_count)
        vocab_1.build_from_token(tokenized_corpus_1)
        vocab_2.build_from_token(tokenized_corpus_2)

        # In bigram mode, build the bi-gram vocabulary as well
        if n_gram == 'bigram':
            t1 = P.tokenize_bigram(tokenized_corpus_1)
            count_dict_1 = P.dictionary_frequencies(t1)
            vocab_1.build_from_token(corpus=t1,
                                     min_count=min_count_gram,
                                     dictionary=count_dict_1)
            t2 = P.tokenize_bigram(tokenized_corpus_2)
            count_dict_2 = P.dictionary_frequencies(t2)
            vocab_2.build_from_token(corpus=t2,
                                     min_count=min_count_gram,
                                     dictionary=count_dict_2)
            for i in range(len(tokenized_corpus_1)):
                tokenized_corpus_1[i] += t1[i]
                tokenized_corpus_2[i] += t2[i]

        # In trigram mode, build the bi-gram and tri-gram vocabularies as well
        if n_gram == 'trigram':
            # Bigram
            t1 = P.tokenize_bigram(tokenized_corpus_1)
            count_dict_1 = P.dictionary_frequencies(t1)
            vocab_1.build_from_token(corpus=t1,
                                     min_count=min_count_gram,
                                     dictionary=count_dict_1)
            t2 = P.tokenize_bigram(tokenized_corpus_2)
            count_dict_2 = P.dictionary_frequencies(t2)
            vocab_2.build_from_token(corpus=t2,
                                     min_count=min_count_gram,
                                     dictionary=count_dict_2)
            # Trigram
            t3 = P.tokenize_trigram(tokenized_corpus_1)
            count_dict_1 = P.dictionary_frequencies(t3)
            vocab_1.build_from_token(corpus=t3,
                                     min_count=min_count_gram,
                                     dictionary=count_dict_1)
            t4 = P.tokenize_trigram(tokenized_corpus_2)
            count_dict_2 = P.dictionary_frequencies(t4)
            vocab_2.build_from_token(corpus=t4,
                                     min_count=min_count_gram,
                                     dictionary=count_dict_2)
            for i in range(len(tokenized_corpus_1)):
                tokenized_corpus_1[i] += t1[i] + t3[i]
                tokenized_corpus_2[i] += t2[i] + t4[i]

        # Build X_train
        self.X_train = get_training_bag_of_words_appearance(
            _vocab1=vocab_1,
            _vocab2=vocab_2,
            tokenized_corpus_1=tokenized_corpus_1,
            tokenized_corpus_2=tokenized_corpus_2,
            frequency=frequency)
        # Tokenize validation sets
        tokenized_val_corpus_1, tokenized_val_corpus_2 = tokenize_corpuses(
            P=P,
            corpus_1=corpus_val_1,
            corpus_2=corpus_val_2,
            punct_1=self.punct_1,
            punct_2=self.punct_2,
            stop_words_1=stop_words_1,
            stop_words_2=stop_words_2)
        # In bigram mode, tokenize for bi-grams as well
        if n_gram == 'bigram':
            t1 = P.tokenize_bigram(tokenized_val_corpus_1)
            t2 = P.tokenize_bigram(tokenized_val_corpus_2)
            for i in range(len(tokenized_val_corpus_1)):
                tokenized_val_corpus_1[i] += t1[i]
                tokenized_val_corpus_2[i] += t2[i]

        # In trigram mode, tokenize for bi-grams and tri-grams as well
        if n_gram == 'trigram':
            t1 = P.tokenize_bigram(tokenized_val_corpus_1)
            t2 = P.tokenize_bigram(tokenized_val_corpus_2)
            t3 = P.tokenize_trigram(tokenized_val_corpus_1)
            t4 = P.tokenize_trigram(tokenized_val_corpus_2)
            for i in range(len(tokenized_val_corpus_1)):
                tokenized_val_corpus_1[i] += t1[i] + t3[i]
                tokenized_val_corpus_2[i] += t2[i] + t4[i]

        # Build X_val
        self.X_val = get_training_bag_of_words_appearance(
            _vocab1=vocab_1,
            _vocab2=vocab_2,
            tokenized_corpus_1=tokenized_val_corpus_1,
            tokenized_corpus_2=tokenized_val_corpus_2,
            frequency=frequency)
        # Vocabularies
        self.vocab_1 = vocab_1
        self.vocab_2 = vocab_2

        # Information after creation of the instance
        print(n_gram + " model:")
        print("Shape training set: ", self.X_train.shape)
        print("Shape validation set: ", self.X_val.shape)
        print("Shape training scores: ", y_train.shape)
        print("Shape validation scores: ", y_val.shape)
        print()
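
    # A minimal usage sketch (assumption, not from the original source): the
    # class is driven much like the free-standing example above. Variable names
    # such as corpus_en/corpus_de are hypothetical stand-ins for the caller's
    # data. Since the constructor already builds X_train and X_val, a regressor
    # can be fitted directly on the instance attributes:
    #
    #     bow = BagOfWords(P=P,
    #                      corpus_1=corpus_en, corpus_2=corpus_de,
    #                      y_train=y_train,
    #                      stop_words_1=stop_words_en, stop_words_2=stop_words_de,
    #                      corpus_val_1=corpus_val_en, corpus_val_2=corpus_val_de,
    #                      y_val=y_val,
    #                      n_gram='bigram', frequency=True)
    #     lr = LinearRegression().fit(bow.X_train, bow.y_train)
    #     predictions = lr.predict(bow.X_val)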