Example #1
# Imports assumed by the class methods below (the snippets omit the enclosing
# class); Vectorizer, Embeddings and SentenceLabelEncoder are project-local
# helpers not shown here.
import numpy as np
from keras.models import Sequential, Model
from keras.layers import (Input, Embedding, Conv1D, MaxPool1D, Dense,
                          Dropout, Flatten, concatenate)
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import class_weight

    def choose_features(self, samples, retrain=False):
        if self.feature_type in ['tf-idf', 'bow']:
            return Vectorizer(self.name, self.feature_type).vectors(samples, retrain).todense()
        elif self.feature_type == 'word-embeddings':
            return Embeddings(self.name, 100).encode_samples(samples)
        else:
            return samples  # no change/manipulation

    def train_experimental_CNN(self, features, labels, trainable_embeddings):
        embedding_size = 100

        if self.feature_type == 'word-embeddings':
            x_train = pad_sequences(features, maxlen=self.max_len, padding='post')
            pretrained_embeddings = Embeddings(self.name, embedding_size).vectors()
            vocab_size = pretrained_embeddings.shape[0]
            embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                                input_length=self.max_len, trainable=trainable_embeddings, weights=[pretrained_embeddings])
        elif self.feature_type in ['tf-idf', 'bow']:
            x_train = features
            vocab_size = x_train.shape[1]
            # NOTE: Embedding casts its inputs to integer indices, so feeding
            # tf-idf/bow weights through it is experimental at best.
            embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                                input_length=vocab_size, trainable=trainable_embeddings)
        else:
            raise ValueError('Please select a valid feature type')

        model = Sequential()
        model.add(embedding_layer)
        model.add(Conv1D(filters=256, kernel_size=5, activation='relu'))  # sequence length becomes max_len - 4
        model.add(MaxPool1D(pool_size=5))  # downsamples the sequence dimension by a factor of 5
        model.add(Dropout(rate=0.3))
        model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
        model.add(MaxPool1D(pool_size=15))  # collapses what remains (15 steps for max_len=100) to length 1
        model.add(Flatten())
        model.add(Dense(units=64, activation='relu'))
        model.add(Dropout(rate=0.5))
        model.add(Dense(units=32, activation='relu'))
        model.add(Dropout(rate=0.5))
        model.add(Dense(units=self.num_classes, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
        model.summary()  # summary() prints directly and returns None
        self.model = model

        numeric_labels = SentenceLabelEncoder().encode_numerical(labels)
        # Keras expects class_weight as a dict mapping class index -> weight.
        class_weights = dict(enumerate(class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(numeric_labels), y=numeric_labels)))
        y_train = SentenceLabelEncoder().encode_categorical(labels)

        self.model.fit(x_train, y_train, validation_split=0.2, epochs=20, batch_size=128,
                        verbose=2, shuffle=True, class_weight=class_weights)
        print('##########################\n\n\t Hold-out validation run completed \n\n\t##########################')

        self.model.fit(x_train, y_train, epochs=20, batch_size=128,
                        verbose=2, shuffle=True, class_weight=class_weights)
        loss, accuracy = self.model.evaluate(x_train, y_train)
        print('loss, accuracy:', loss, accuracy)

        self.labels_pred = SentenceLabelEncoder().decode(np.argmax(self.model.predict(x_train), axis=1))

    def train_common_baseline_CNN(self, features, labels, trainable_embeddings):
        embedding_size = 100

        x_train = pad_sequences(features, maxlen=self.max_len, padding='post')
        pretrained_embeddings = Embeddings(self.name, embedding_size).vectors()
        vocab_size = pretrained_embeddings.shape[0]
        embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                            input_length=self.max_len, trainable=trainable_embeddings, weights=[pretrained_embeddings])

        main_input = Input(shape=(self.max_len, ))
        x = embedding_layer(main_input)
        x3 = Conv1D(filters=100, kernel_size=2, activation='relu')(x) # generalization of bigrams
        x4 = Conv1D(filters=100, kernel_size=4, activation='relu')(x) # generalization of ngrams, n=4
        x5 = Conv1D(filters=100, kernel_size=5, activation='relu')(x) # generalization of ngrams, n=5
        # Max-pool over the full conv output (global max-over-time pooling):
        # output lengths are max_len-1, max_len-3 and max_len-4, i.e. 99, 97
        # and 96 for max_len=100.
        x3 = MaxPool1D(pool_size=99)(x3)
        x4 = MaxPool1D(pool_size=97)(x4)
        x5 = MaxPool1D(pool_size=96)(x5)
        out = concatenate([x3, x4, x5])
        out = Dropout(rate=0.5)(out)
        out = Flatten()(out)  # TODO: check whether a Dense (fully connected) layer should go here
        main_output = Dense(self.num_classes, activation='softmax')(out)
        model = Model(inputs=main_input, outputs=main_output)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
        model.summary()  # summary() prints directly and returns None
        self.model = model

        numeric_labels = SentenceLabelEncoder().encode_numerical(labels)
        # Keras expects class_weight as a dict mapping class index -> weight.
        class_weights = dict(enumerate(class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(numeric_labels), y=numeric_labels)))
        y_train = SentenceLabelEncoder().encode_categorical(labels)

        self.model.fit(x_train, y_train, validation_split=0.2, epochs=30, batch_size=128,
                        verbose=2, shuffle=True, class_weight=class_weights)
        print('##########################\n\n\t Hold-out validation run completed \n\n\t##########################')

        self.model.fit(x_train, y_train, epochs=30, batch_size=128,
                        verbose=2, shuffle=True, class_weight=class_weights)
        loss, accuracy = self.model.evaluate(x_train, y_train)
        print('loss, accuracy:', loss, accuracy)

        self.labels_pred = SentenceLabelEncoder().decode(np.argmax(self.model.predict(x_train), axis=1))

    def train_vanilla_NN(self, features, labels, trainable_embeddings):
        model = Sequential()
        embedding_size = 100
        if self.feature_type == 'word-embeddings':
            x_train = pad_sequences(features,
                                    maxlen=self.max_len,
                                    padding='post')
            pretrained_embeddings = Embeddings(self.name,
                                               embedding_size).vectors()
            vocab_size = pretrained_embeddings.shape[0]
            model.add(
                Embedding(input_dim=vocab_size,
                          output_dim=embedding_size,
                          input_length=self.max_len,
                          trainable=trainable_embeddings,
                          weights=[pretrained_embeddings]))
            model.add(Flatten())
            model.add(Dense(units=4 * embedding_size, activation='relu'))
        elif self.feature_type in ['tf-idf', 'bow']:
            x_train = features
            vocab_size = x_train.shape[1]
            model.add(
                Dense(units=4 * embedding_size,
                      activation='relu',
                      input_dim=vocab_size))
        else:
            raise ValueError('Please select a valid feature type')

        model.add(Dropout(rate=0.5))
        model.add(Dense(units=self.num_classes, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model.summary()  # summary() prints directly and returns None
        self.model = model

        numeric_labels = SentenceLabelEncoder().encode_numerical(labels)
        # Keras expects class_weight as a dict mapping class index -> weight.
        class_weights = dict(enumerate(class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(numeric_labels),
            y=numeric_labels)))
        y_train = SentenceLabelEncoder().encode_categorical(labels)

        self.model.fit(x_train,
                       y_train,
                       validation_split=0.2,
                       epochs=10,
                       batch_size=32,
                       verbose=2,
                       shuffle=True,
                       class_weight=class_weights)
        print(
            '##########################\n\n\t Hold-out validation run completed \n\n\t##########################'
        )

        self.model.fit(x_train,
                       y_train,
                       epochs=10,
                       batch_size=32,
                       verbose=2,
                       shuffle=True,
                       class_weight=class_weights)
        loss, accuracy = self.model.evaluate(x_train, y_train)
        print('loss, accuracy:', loss, accuracy)

        self.labels_pred = SentenceLabelEncoder().decode(
            np.argmax(self.model.predict(x_train), axis=1))
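A minimal usage sketch for the methods above, assuming a hypothetical Classifier class that carries the attributes they reference (self.name, self.feature_type, self.max_len, self.num_classes); the constructor and variable names are illustrative only:

    # Hypothetical driver for the snippets in this example.
    clf = Classifier(name='my-model', feature_type='word-embeddings',
                     max_len=100, num_classes=5)
    features = clf.choose_features(samples)  # integer token ids for 'word-embeddings'
    clf.train_common_baseline_CNN(features, labels, trainable_embeddings=False)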
Example #5
    def generate_embeddings_coordinates(self, model_name, dimension,
                                        reduced_dimension):
        embeddings = Embeddings(model_name, dimension)
        embeddings.words_coordinates(reduced_dimension)
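A hedged usage sketch: projecting trained 100-dimensional vectors down to 2-D coordinates (e.g. for plotting); the owning instance name is hypothetical:

    visualizer.generate_embeddings_coordinates('my-model', 100, 2)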
Example #6
    def train_embeddings(self, model_name, dimension, sents):
        embeddings = Embeddings(model_name, dimension)
        embeddings.train(sents)
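A usage sketch, assuming sents is an iterable of tokenized sentences (lists of tokens), as word-embedding trainers typically expect; the owning instance name is hypothetical:

    sents = [['the', 'cat', 'sat'], ['dogs', 'bark']]
    trainer.train_embeddings('my-model', 100, sents)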