# NOTE(review): the statements up to the `return` are the tail of a function
# whose header lies outside this chunk -- presumably `preprocess_data`, since
# the script below unpacks the same four values from it. Confirm against the
# full file.
    bags_test = count_vec.transform(docs_test)

    # Fit the TF-IDF transformer on the training counts only, then apply the
    # already-fitted transformer to the test counts.
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    tf_idf_train = tfidf.fit_transform(bags_train)
    tf_idf_test = tfidf.transform(bags_test)

    # Convert both TF-IDF matrices to arrays and wrap them in DataFrames.
    X_train = pd.DataFrame(tf_idf_train.toarray())
    X_test = pd.DataFrame(tf_idf_test.toarray())

    # Returns (X_train, X_test, y_train, y_test); both feature frames get a
    # fresh index.
    return X_train.reset_index(drop=True), X_test.reset_index(
        drop=True), y_train, y_test


if __name__ == '__main__':
    # Load a shuffled 1000-row sample and turn it into train/test features.
    df = load_pandas_df(nrows=1000, shuffle=True)
    X_train, X_test, y_train, y_test = preprocess_data(df)

    # Register a per-run log file and record the feature matrix shapes.
    # NOTE(review): `logger.add(..., colorize=..., format=...)` looks like the
    # loguru API -- confirm against the file's imports.
    RUN_NAME = 'logistic_regression'
    logger.add(f'data/{RUN_NAME}/result.log',
               colorize=True,
               format='<green>{time}</green> {message}')
    logger.info(f'{X_train.shape}, {X_test.shape}')

    y_preds = []
    # presumably the number of target classes -- verify against the dataset
    NUM_CLASS = 9
    # One row per training sample, one column per class; presumably filled
    # with out-of-fold predictions inside the loop -- TODO confirm.
    oof_train = np.zeros((len(X_train), NUM_CLASS))
    # 5 stratified, shuffled folds with a fixed seed.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, y_train))):
        # NOTE(review): the loop body continues past the end of this chunk.
if self.use_idf: check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') expected_n_features = self._idf_diag.shape[0] if n_features != expected_n_features: raise ValueError("Input has n_features=%d while the model" " has been trained with n_features=%d" % ( n_features, expected_n_features)) X = X * self._idf_diag return X if __name__ == '__main__': df = load_pandas_df(nrows=10) # Normalization df['text'] = df['text'].apply(neologdn.normalize) tokenizer = WordTokenizer('MeCab') docs = np.array([ ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] ]) print(docs.shape) # (10,) count_vec = CountVectorizer(min_df=2, max_features=20000, ngram_range=(1, 3)) bags = count_vec.fit_transform(docs)
import matplotlib.pyplot as plt
import japanize_matplotlib

from utils_nlp.dataset.livedoor import load_pandas_df

if __name__ == '__main__':
    # Plot the 10 most frequent first characters of the article texts and
    # save the bar chart as a PNG.
    df = load_pandas_df()
    df['first_char'] = df['text'].str[0]

    # Keep the counts as a Series (index = character, values = frequency)
    # rather than calling reset_index(): the column names that reset_index()
    # produces after value_counts() differ between pandas versions, so
    # looking them up by name ('index' / 'first_char') breaks on newer pandas.
    top_chars = df['first_char'].value_counts().head(10)

    # Enable Japanese-capable fonts so the character labels render.
    japanize_matplotlib.japanize()
    plt.figure(figsize=(15, 8))
    plt.bar(top_chars.index, top_chars.values)
    plt.savefig('examples/visualization/japanize.png')