from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf_features(train_data, test_data):
    # TF-IDF (weighted bag-of-words) representation
    tf_idf_vectorize = TfidfVectorizer(max_df=0.8,
                                       strip_accents='unicode',
                                       lowercase=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       stop_words='english')
    # TF-IDF features for the training data
    tf_idf_train = tf_idf_vectorize.fit_transform(train_data.data)
    # Maps each feature index to the word it represents
    feature_names = tf_idf_vectorize.get_feature_names_out()
    tf_idf_test = tf_idf_vectorize.transform(test_data.data)
    shape = tf_idf_train.shape
    print('{} train data points.'.format(shape[0]))
    print('{} feature dimension.'.format(shape[1]))
    # "Most common" here means the word with the largest total TF-IDF weight
    print('Most common word in training set is "{}"'.format(
        feature_names[tf_idf_train.sum(axis=0).argmax()]))
    return tf_idf_train, tf_idf_test, feature_names
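# Usage sketch (an addition, not part of the original code): tf_idf_features
# expects corpus objects that expose a .data list of raw documents, such as
# the Bunch returned by sklearn's fetch_20newsgroups. The dataset used here
# is an assumption chosen purely for illustration.
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
train_tfidf, test_tfidf, words = tf_idf_features(newsgroups_train,
                                                 newsgroups_test)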
import numpy as np
from pandas import DataFrame
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

try:
    # The original loading step is not shown; load_files('wikidata') is an
    # assumption inferred from the error message below.
    dataset = load_files('wikidata')
except OSError as ex:
    print(ex)
    print("Couldn't import the data, did you unzip the wikidata.zip folder?")
    exit(-1)

docs = dataset['data']
target = dataset['target']
docs_train, docs_test, y_train, y_test = train_test_split(
    docs, target, test_size=.2, random_state=0)

# Character n-grams of length 1 to 5; character features are robust to
# out-of-vocabulary words and misspellings
vec = TfidfVectorizer(ngram_range=(1, 5), analyzer='char', use_idf=True)
mlp = MLPClassifier()
model = make_pipeline(vec, mlp)
model.fit(docs_train, y_train)
y_predicted = model.predict(docs_test)

target_names = [dataset.target_names[i] for i in np.unique(y_train)]
print(classification_report(y_test, y_predicted, target_names=target_names))

cm = confusion_matrix(y_test, y_predicted)
predicted_names = ['p_' + s for s in target_names]
dfcm = DataFrame(cm, columns=predicted_names, index=target_names)
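# Follow-up sketch (an addition, not part of the original script): dividing
# each row of the confusion matrix by its row sum converts raw counts into
# per-class recall, which makes classes of different sizes easier to compare.
dfcm_normalized = dfcm.div(dfcm.sum(axis=1), axis=0)
print(dfcm_normalized.round(2))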