Example 1
import sys

import numpy as np
from pandas import DataFrame
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline


def tf_idf_features(train_data, test_data):
    # TF-IDF representation: word unigrams, English stop words removed,
    # terms appearing in more than 80% of documents dropped
    tf_idf_vectorize = TfidfVectorizer(max_df=0.8,
                                       strip_accents='unicode',
                                       lowercase=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       stop_words='english')
    # TF-IDF features for the training data
    tf_idf_train = tf_idf_vectorize.fit_transform(train_data.data)
    # Maps each feature index to the term it represents
    # (get_feature_names() was removed in scikit-learn 1.2)
    feature_names = tf_idf_vectorize.get_feature_names_out()
    tf_idf_test = tf_idf_vectorize.transform(test_data.data)
    shape = tf_idf_train.shape
    print('{} train data points.'.format(shape[0]))
    print('{} feature dimension.'.format(shape[1]))
    print('Term with the highest total TF-IDF weight in the training set is "{}"'.format(
        feature_names[tf_idf_train.sum(axis=0).argmax()]))
    return tf_idf_train, tf_idf_test, feature_names
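
A minimal usage sketch for tf_idf_features, assuming sklearn-style bunches that expose a .data list of raw documents (fetch_20newsgroups here is only an illustration, not part of the original script):

from sklearn.datasets import fetch_20newsgroups

train_bunch = fetch_20newsgroups(subset='train')  # any object with a .data list of strings works
test_bunch = fetch_20newsgroups(subset='test')
tf_idf_train, tf_idf_test, feature_names = tf_idf_features(train_bunch, test_bunch)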
# Sketch of the missing try block: the dataset is assumed to be loaded with
# sklearn's load_files from the unzipped wikidata folder (call and path are assumptions)
try:
    dataset = load_files('wikidata', encoding='utf-8')
except OSError as ex:
    print(ex)
    print("Couldn't import the data, did you unzip the wikidata.zip folder?")
    sys.exit(-1)

docs = dataset['data']
target = dataset['target']

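# Hold out 20% of the documents for evaluation; random_state=0 fixes the split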
docs_train, docs_test, y_train, y_test = train_test_split(docs,
                                                          target,
                                                          test_size=.2,
                                                          random_state=0)

# Character n-grams from 1 to 5 characters; fitting happens inside the
# pipeline below, so no separate vectorization call is needed here
vec = TfidfVectorizer(ngram_range=(1, 5), analyzer='char', use_idf=True)

mlp = MLPClassifier()

# Chain vectorizer and classifier so that fit() and predict() take raw documents
model = make_pipeline(vec, mlp)

model.fit(docs_train, y_train)

y_predicted = model.predict(docs_test)

# Keep only the class names actually present in the training labels
target_names = [dataset.target_names[i] for i in np.unique(y_train)]
print(classification_report(y_test, y_predicted, target_names=target_names))

cm = confusion_matrix(y_test, y_predicted)
# Label the matrix: rows are true classes, columns ('p_' prefix) are predictions
predicted_names = ['p_' + s for s in target_names]
dfcm = DataFrame(cm, columns=predicted_names, index=target_names)
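
For a quick per-class error check, the labelled matrix can simply be printed:

print(dfcm)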