from pprint import pprint

import pandas as pd

# The remaining helpers (load_dataset, TweetPreprocessor, TweetTokenizer,
# train_dev_test_split, NeuralNetClassifier, TextCNN, get_score_func,
# scratch_path) are assumed to come from the surrounding tklearn project.
from tklearn.text.word_vec import load_word2vec


def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    word2vec = None
    if hyperparameters['model'] != 'rand':
        # Pretrained vectors are only needed when not using random init.
        word2vec = load_word2vec()
    # # Load Dataset
    df = load_dataset(dataset[0], **dataset[1])
    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)
    # # Train
    clf = NeuralNetClassifier(module=TextCNN,
                              corpus=df.tokens,
                              word_vectors=word2vec,
                              metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))
    # # Predict
    y_pred = clf.predict(X_test)
    # # Evaluate
    pprint(
        dict(dataset=dataset,
             hyperparameters=hyperparameters,
             scores={
                 scorer: get_score_func(scorer)(y_test, y_pred)
                 for scorer in metrics
             }))
    # # Save to file
    # X_test is a Series of token lists; collect the predictions alongside it
    # in a DataFrame so they are written out as a proper column.
    predictions = pd.DataFrame({'tokens': X_test, 'pred': y_pred})
    predictions.to_excel(scratch_path('predictions_%s.xlsx' % fname))
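A minimal invocation sketch; the dataset spec and hyperparameter values below are illustrative assumptions, not the project's actual configuration:

run(dataset=('hatespeech', {}),          # hypothetical (name, loader-kwargs) pair
    hyperparameters={'model': 'rand'},   # 'rand' skips loading word2vec
    metrics=['accuracy', 'f1'],
    fname='textcnn_rand')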
Example #2
from tklearn.text.word_vec import load_word2vec


def run_all(**kwargs):
    word2vec = load_word2vec()
    print("Individual Feature Comparison")
    for dataset in kwargs['DATASETS']:
        for features in kwargs['FEATURES']:
            run(
                dataset=dataset,
                features=features,
                metrics=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
                word2vec=word2vec,
            )
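A hedged usage sketch for run_all; the DATASETS and FEATURES values are placeholders for illustration:

run_all(
    DATASETS=[('hatespeech', {})],            # hypothetical dataset specs
    FEATURES=[['average_offensiveness']],     # hypothetical feature groups
)

Example #3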
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import CCA

from tklearn.feature_extraction.hatespeech import load_hatebase, HatebaseVectorizer
from tklearn.text.word_vec import load_word2vec

# Load the Hatebase lexicon and the pretrained word2vec vectors.
hatebase = load_hatebase()
word2vec = load_word2vec()

# Collect word2vec embeddings for the Hatebase terms that are in-vocabulary.
embeddings = {}
for w in hatebase.keys():
    if w in word2vec.vocab:
        embeddings[w] = word2vec.word_vec(w)

# Build offensiveness feature vectors for the Hatebase terms.
hv = HatebaseVectorizer(features=['average_offensiveness'])
hv.fit(None)

# C1: word2vec embeddings; C2: the corresponding Hatebase feature vectors,
# row-aligned by term.
C1 = np.array(list(embeddings.values()))
print(C1.shape)

C2 = hv.feature_vectors.loc[[hv.index[w] for w in embeddings.keys()]]
print(C2.shape)

# Fit CCA to find maximally correlated projections of the two spaces.
cca = CCA(n_components=25)
cca.fit(C2, C1)

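Once fitted, sklearn's CCA.transform projects both matrices into the shared canonical space; a minimal sketch using the variables above:

# Canonical projections of the Hatebase features and the embeddings.
C2_c, C1_c = cca.transform(C2, C1)
print(C2_c.shape, C1_c.shape)  # both (n_terms, 25)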