import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, FastText  # gensim < 4.0 API (uses wv.vocab)


class Vectorizer:
    # Each record in fit_data / tst_data is expected to be a triple
    # (id, tokens, label); only the token list (d[1]) is vectorized.
    def __init__(self, fit_data, tst_data, vec_type='count', params=None):
        params = params or {}  # avoid sharing a mutable default dict
        self.fit_data = fit_data
        self.tst_data = tst_data
        if vec_type == 'count':
            # Bag-of-words counts; the vocabulary is fit on the training split only.
            self.vectorizer = CountVectorizer(**params)
            self.vectorizer.fit([' '.join(d[1]) for d in fit_data])
            self.tst_vecs = self.vectorizer.transform(
                [' '.join(d[1]) for d in self.tst_data]).toarray()
            self.fit_vecs = self.vectorizer.transform(
                [' '.join(d[1]) for d in self.fit_data]).toarray()

        elif vec_type == 'tfidf':
            # Same pipeline as 'count', but with TF-IDF weighting.
            self.vectorizer = TfidfVectorizer(**params)
            self.vectorizer.fit([' '.join(d[1]) for d in fit_data])
            self.tst_vecs = self.vectorizer.transform(
                [' '.join(d[1]) for d in self.tst_data]).toarray()
            self.fit_vecs = self.vectorizer.transform(
                [' '.join(d[1]) for d in self.fit_data]).toarray()

        elif vec_type == 'wordembedd':
            # Train Word2Vec on the training sentences, then represent each
            # sentence as the concatenation of its in-vocabulary word vectors.
            fit_sentences = [d[1] for d in fit_data]
            tst_sentences = [d[1] for d in tst_data]
            self.vectorizer = Word2Vec(fit_sentences, **params)
            self.vectorizer = self.vectorizer.wv  # keep only the KeyedVectors
            self.fit_vecs = [
                np.array([
                    self.vectorizer[word] for word in sentence
                    if word in self.vectorizer.vocab
                ]).flatten() for sentence in fit_sentences
            ]

            self.tst_vecs = [
                np.array([
                    self.vectorizer[word] for word in sentence
                    if word in self.vectorizer.vocab
                ]).flatten() for sentence in tst_sentences
            ]

            # Concatenated vectors differ in length across sentences, so
            # truncate every vector to the shortest length in either split.
            self.min = min(np.min([len(vector) for vector in self.fit_vecs]),
                           np.min([len(vector) for vector in self.tst_vecs]))

            self.fit_vecs = [fit_vec[:self.min] for fit_vec in self.fit_vecs]
            self.tst_vecs = [tst_vec[:self.min] for tst_vec in self.tst_vecs]

        elif vec_type == 'fasttext':
            # Same scheme as 'wordembedd', but with FastText embeddings.
            fit_sentences = [d[1] for d in fit_data]
            tst_sentences = [d[1] for d in tst_data]
            self.vectorizer = FastText(fit_sentences, **params)
            self.vectorizer = self.vectorizer.wv
            self.fit_vecs = [
                np.array([
                    self.vectorizer[word] for word in sentence
                    if word in self.vectorizer.vocab
                ]).flatten() for sentence in fit_sentences
            ]

            self.tst_vecs = [
                np.array([
                    self.vectorizer[word] for word in sentence
                    if word in self.vectorizer.vocab
                ]).flatten() for sentence in tst_sentences
            ]

            self.min = min(np.min([len(vector) for vector in self.fit_vecs]),
                           np.min([len(vector) for vector in self.tst_vecs]))

            # Truncate each vector (not the list of vectors) to the common length.
            self.fit_vecs = [fit_vec[:self.min] for fit_vec in self.fit_vecs]
            self.tst_vecs = [tst_vec[:self.min] for tst_vec in self.tst_vecs]

    def vectorize(self):
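        # Re-attach ids and labels: each record becomes [id, vector, label].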
        return ([[d[0], self.fit_vecs[i], d[2]]
                 for i, d in enumerate(self.fit_data)],
                [[d[0], self.tst_vecs[i], d[2]]
                 for i, d in enumerate(self.tst_data)])

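A minimal usage sketch (the records and the `min_df` parameter here are hypothetical, illustrating the (id, tokens, label) layout the class expects):

train = [(0, ['good', 'product'], 1), (1, ['bad', 'quality'], 0)]
test = [(2, ['good', 'quality'], 1)]

vec = Vectorizer(train, test, vec_type='tfidf', params={'min_df': 1})
train_vecs, test_vecs = vec.vectorize()
print(train_vecs[0][1].shape)  # one TF-IDF row per training document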
Example No. 2
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Dense
from keras.optimizers import Adam

# The earlier layers of `model`, plus x_train, y_train, x_test, y_test,
# batch_size and no_epochs, are defined above and elided in this excerpt.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001, decay=1e-6),  # Keras 2.x argument names
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='logs/',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)
model.summary()

# Training; EarlyStopping halts once val_loss stops improving by min_delta.
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          shuffle=True,
          epochs=no_epochs,
          validation_data=(x_test, y_test),
          callbacks=[tensorboard,
                     EarlyStopping(min_delta=0.001, patience=3)])

# Save the model
model.save('amazon-sentiment-fasttext.model')
model_final = model
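
To reload the saved model in a later session (a sketch; assumes the same Keras version that produced the file):

from keras.models import load_model

model_final = load_model('amazon-sentiment-fasttext.model')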

# Evaluate the model
print(model_final.metrics_names)
print(model_final.evaluate(x=x_test, y=y_test, batch_size=32, verbose=1))

# Compare predicted sentiment scores to the original labels
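A minimal sketch of that comparison (assumes the x_test / y_test arrays used above; y_test is one-hot, as implied by categorical_crossentropy):

import numpy as np

pred_scores = model_final.predict(x_test, batch_size=32)
pred_labels = np.argmax(pred_scores, axis=1)
true_labels = np.argmax(y_test, axis=1)

# Fraction of test examples where the prediction matches the original label.
print('agreement:', np.mean(pred_labels == true_labels))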