import warnings

from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer


class MultihotEncoder(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a pipeline-safe transformer.

    Args:
        sparse_output (bool): convert output to a sparse matrix
    """

    def __init__(self, sparse_output=False):
        self.transformer = MultiLabelBinarizer()
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Fit the underlying MultiLabelBinarizer."""
        self.transformer.fit(X)
        return self

    def transform(self, X, y=None):
        """Transform X with the fitted MultiLabelBinarizer."""
        # Silence the warning sklearn emits for labels unseen during fit
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_t = self.transformer.transform(X)
        if self.sparse_output:
            return sparse.csr_matrix(X_t)
        return X_t
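
# A minimal usage sketch of the encoder above. The tag lists here are made-up
# illustration data, not from the original snippet. It shows the property the
# class exists for: unlike a bare MultiLabelBinarizer, transforming data with
# labels unseen during fit does not spam warnings; the unknown label is dropped.
train_tags = [['python', 'ml'], ['ml'], ['nlp']]   # hypothetical training tags
test_tags = [['python', 'rust']]                   # 'rust' was never seen in fit

encoder = MultihotEncoder(sparse_output=False)
encoder.fit(train_tags)
print(encoder.transform(train_tags))   # dense multi-hot matrix, shape (3, 3)
print(encoder.transform(test_tags))    # unseen 'rust' is silently ignored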

from sklearn.preprocessing import MultiLabelBinarizer as SKLModel


class MultiLabelBinarizerImpl:
    """Thin wrapper that defers construction of the sklearn model to fit time."""

    def __init__(self, classes=None, sparse_output=False):
        self._hyperparams = {
            'classes': classes,
            'sparse_output': sparse_output,
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        # MultiLabelBinarizer.fit accepts only the iterable of label sets,
        # so any y argument is ignored rather than forwarded.
        self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
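
# A short demo of the wrapper above; the label lists are invented for
# illustration. Because the sklearn model is only constructed inside fit(),
# hyperparameters such as sparse_output take effect at fit time rather than
# at __init__ time.
binarizer = MultiLabelBinarizerImpl(sparse_output=True)
binarizer.fit([['sci-fi', 'thriller'], ['comedy']])
matrix = binarizer.transform([['comedy', 'thriller']])
print(matrix)            # a scipy CSR matrix, because sparse_output=True
print(matrix.toarray())  # densify for inspection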

from random import shuffle

import numpy
from gensim.models.doc2vec import Doc2Vec
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from nltk import word_tokenize
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer

doc2vec = Doc2Vec.load(doc2vec_model_location)  # path to a trained Doc2Vec model
doc2vec_dimensions = doc2vec.vector_size        # dimensionality of inferred vectors

# Binarize the categories into multi-hot vectors
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([reuters.categories(fileId) for fileId in reuters.fileids()])

# Load the articles together with their corresponding categories
train_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                  for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                 for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)

# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in train_articles]
test_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in test_articles]
train_labels = labelBinarizer.transform([article['categories'] for article in train_articles])
test_labels = labelBinarizer.transform([article['categories'] for article in test_articles])
train_data, test_data = numpy.asarray(train_data), numpy.asarray(test_data)
train_labels, test_labels = numpy.asarray(train_labels), numpy.asarray(test_labels)

# Build the neural network (Keras 2 API: `units` replaces the removed `output_dim`)
model = Sequential()
model.add(Dense(500, input_dim=doc2vec_dimensions, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1200, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(400, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(600, activation='relu'))
model.add(Dropout(0.3))
# Sigmoid output + binary cross-entropy: one independent probability per label,
# which is what multi-label classification requires (softmax would force the
# labels to compete with each other)
model.add(Dense(train_labels.shape[1], activation='sigmoid'))
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
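
# A plausible training-and-evaluation sketch for the model above. The epoch
# count, batch size, and the 0.5 decision threshold are assumptions, not
# values from the original snippet.
model.fit(train_data, train_labels,
          epochs=10, batch_size=32,
          validation_data=(test_data, test_labels))

# For multi-label prediction, threshold each sigmoid output independently,
# then map the resulting multi-hot rows back to category names.
predictions = (model.predict(test_data) > 0.5).astype(int)
predicted_categories = labelBinarizer.inverse_transform(predictions)
print(predicted_categories[0])  # e.g. a tuple of Reuters category names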