def preprocess_data(self, dataset, y_dataset):
     """Convert documents into the representation produced by ``self.model``.

     The input is run through ``process_dataset``, every resulting item is
     joined into one space-separated string, the strings are vectorized with
     ``self.count_vectorizer.transform`` and the result is passed to
     ``self.model.transform``.

     Args:
         dataset: Documents to transform; forwarded to ``process_dataset``.
             NOTE(review): presumably a pandas Series of texts -- confirm
             against the caller.
         y_dataset: Not used by this method.

     Returns:
         The value returned by ``self.model.transform`` for the vectorized
         documents.
     """
     logging.info("Transform data on " + self.__class__.__name__)
     token_series = process_dataset(dataset)
     # Each item is an iterable of strings; rebuild one string per document.
     joined_docs = token_series.map(lambda tokens: ' '.join(tokens))
     unicode_docs = joined_docs.values.astype('U')
     term_counts = self.count_vectorizer.transform(unicode_docs)
     return self.model.transform(term_counts)
Ejemplo n.º 2
0
 def preprocess_data(self, dataset, y_dataset):
     """Convert documents into the representation produced by ``self.model``.

     The input is run through ``process_dataset``, every resulting item is
     joined into one space-separated string, the strings are vectorized with
     ``self.tfidf_vectorizer.transform`` and the result is passed to
     ``self.model.transform``.

     Args:
         dataset: Documents to transform; forwarded to ``process_dataset``.
             NOTE(review): presumably a pandas Series of texts -- confirm
             against the caller.
         y_dataset: Not used by this method.

     Returns:
         The value returned by ``self.model.transform`` for the vectorized
         documents.
     """
     logging.info("Transforming data on " + self.__class__.__name__)
     token_series = process_dataset(dataset)
     # Each item is an iterable of strings; rebuild one string per document.
     joined_docs = token_series.map(lambda tokens: ' '.join(tokens))
     unicode_docs = joined_docs.values.astype('U')
     weighted_terms = self.tfidf_vectorizer.transform(unicode_docs)
     return self.model.transform(weighted_terms)
 def preprocess_data(
     self,
     dataset,
     y_dataset
 ):
     """Infer one vector per document with ``self.model.infer_vector``.

     The input is run through ``process_dataset`` and converted to a plain
     list; each list item is then handed to ``self.model.infer_vector``.

     Args:
         dataset: Documents to transform; forwarded to ``process_dataset``.
         y_dataset: Not used by this method.

     Returns:
         A list with the result of ``self.model.infer_vector`` for every
         processed document, in input order.
     """
     logging.info("Transforming data on " + self.__class__.__name__)
     documents = process_dataset(dataset).tolist()
     # Iterate the documents directly instead of indexing via range(len(...)).
     return [self.model.infer_vector(document) for document in documents]
 def train(self, x, y=None):
     """Fit the count vectorizer and the model on the given documents.

     The input is run through ``process_dataset``, every resulting item is
     joined into one space-separated string, ``self.count_vectorizer`` is
     fitted on those strings and ``self.model`` is fitted on the resulting
     matrix. The elapsed wall-clock time is logged.

     Args:
         x: Training documents; forwarded to ``process_dataset``.
         y: Not used by this method.
     """
     logging.info("Building vocabulary on " + self.__class__.__name__)
     started_at = time.time()
     token_series = process_dataset(x)
     # Each item is an iterable of strings; rebuild one string per document.
     joined_docs = token_series.map(lambda tokens: ' '.join(tokens))
     unicode_docs = joined_docs.values.astype('U')
     term_counts = self.count_vectorizer.fit_transform(unicode_docs)
     self.model.fit(term_counts)
     duration = time.time() - started_at
     logging.info("Done in %.3fsec" % duration)
Ejemplo n.º 5
0
 def train(self, x, y=None):
     """Fit the TF-IDF vectorizer and the model on the given documents.

     The input is run through ``process_dataset``, every resulting item is
     joined into one space-separated string, ``self.tfidf_vectorizer`` is
     fitted on those strings and ``self.model`` is fitted on the resulting
     matrix. The elapsed wall-clock time is logged.

     Args:
         x: Training documents; forwarded to ``process_dataset``.
         y: Not used by this method.
     """
     logging.info("Building vectorizer on " + self.__class__.__name__)
     started_at = time.time()
     token_series = process_dataset(x)
     # Each item is an iterable of strings; rebuild one string per document.
     joined_docs = token_series.map(lambda tokens: ' '.join(tokens))
     unicode_docs = joined_docs.values.astype('U')
     weighted_terms = self.tfidf_vectorizer.fit_transform(unicode_docs)
     self.model.fit(weighted_terms)
     duration = time.time() - started_at
     logging.info("Done in %.3fsec" % duration)
 def train(
     self,
     x,
     y
 ):
     """Build the vocabulary and train ``self.model`` on tagged documents.

     The input is run through ``process_dataset``; each processed document
     is paired with its entry from ``y`` and wrapped in a ``TaggedDocument``
     whose tag list holds that single entry. The model's vocabulary is built
     from these documents and the model is then trained on them. The elapsed
     wall-clock time is logged.

     Args:
         x: Training documents; forwarded to ``process_dataset``.
         y: One tag per document, paired positionally with the processed
             documents via ``zip``.
     """
     logging.info("Training " + self.__class__.__name__)
     started_at = time.time()
     token_docs = process_dataset(x)
     tagged_docs = [
         TaggedDocument(tokens, [label])
         for tokens, label in zip(token_docs, y)
     ]
     self.model.build_vocab(tagged_docs)
     # corpus_count is read after build_vocab, exactly as in the original order.
     self.model.train(
         tagged_docs,
         total_examples=self.model.corpus_count,
         epochs=self.model.epochs,
     )
     duration = time.time() - started_at
     logging.info("Done in %.3fsec" % duration)