def run(dataset, hyperparameters, metrics, fname=None):
    """Train a TextCNN tweet classifier, report scores, and save predictions.

    Parameters
    ----------
    dataset : tuple
        ``(name, kwargs)`` pair forwarded to ``load_dataset(name, **kwargs)``.
    hyperparameters : dict
        Keyword arguments for ``NeuralNetClassifier``. When
        ``hyperparameters['model'] == 'rand'`` the pretrained word2vec
        embeddings are not loaded (random initialization).
    metrics : list of str
        Scorer names resolvable via ``get_score_func``.
    fname : str, optional
        Suffix for the saved predictions workbook. Defaults to the dataset
        name (previously a ``None`` default produced the literal file name
        ``predictions_None.xlsx``).
    """
    # -- Load resources: pretrained embeddings unless a random init is requested
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()
    # -- Load dataset
    df = load_dataset(dataset[0], **dataset[1])
    # -- Preprocess: strip/normalize links and mentions, then tokenize
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)
    # -- Train
    clf = NeuralNetClassifier(module=TextCNN,
                              corpus=df.tokens,
                              word_vectors=word2vec,
                              metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))
    # -- Predict
    y_pred = clf.predict(X_test)
    # -- Evaluate: one score per requested metric
    pprint(
        dict(dataset=dataset,
             hyperparameters=hyperparameters,
             scores={
                 scorer: get_score_func(scorer)(y_test, y_pred)
                 for scorer in metrics
             }))
    # -- Save predictions to file.
    # BUG FIX: X_test is a slice of the Series ``df.tokens`` (assuming
    # train_dev_test_split returns pandas objects — TODO confirm), so
    # ``X_test['pred'] = y_pred`` would not add a column; convert to a
    # DataFrame first so the workbook holds tokens alongside predictions.
    if fname is None:
        fname = dataset[0]  # avoid writing 'predictions_None.xlsx'
    out = X_test.to_frame(name='tokens')
    out['pred'] = y_pred
    out.to_excel(scratch_path('predictions_%s.xlsx' % fname))
def run_all(**kwargs):
    """Run ``run`` for every (dataset, feature-set) combination.

    Expects ``kwargs['DATASETS']`` (iterable of dataset specs) and
    ``kwargs['FEATURES']`` (iterable of feature/hyperparameter configs).
    """
    # NOTE(review): the original passed ``features=`` and ``word2vec=``
    # keywords that run() does not accept — its signature is
    # (dataset, hyperparameters, metrics, fname=None) — so every call raised
    # TypeError. The feature configuration is now forwarded as
    # ``hyperparameters``; the shared up-front load_word2vec() call is
    # dropped because run() loads the embeddings itself when needed.
    # Confirm against the intended experiment design.
    print("Individual Feature Comparison")
    for dataset in kwargs['DATASETS']:
        for features in kwargs['FEATURES']:
            run(
                dataset=dataset,
                hyperparameters=features,
                metrics=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
            )
"""Align Hatebase lexicon features with word2vec embeddings via CCA.

Builds two row-aligned matrices — word2vec vectors (C1) and Hatebase
offensiveness feature vectors (C2) for the terms present in both
resources — and fits a 25-component canonical correlation analysis.
"""
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import CCA

from tklearn.feature_extraction.hatespeech import load_hatebase, HatebaseVectorizer
from tklearn.text.word_vec import load_word2vec

hatebase = load_hatebase()
word2vec = load_word2vec()

# Keep only Hatebase terms that have a word2vec embedding; each value is a
# one-element list holding the vector (shape preserved from the original).
embeddings = {
    w: [word2vec.word_vec(w)]
    for w in hatebase
    if w in word2vec.vocab
}

hv = HatebaseVectorizer(features=['average_offensiveness'])
# NOTE(review): fit(None) — presumably the vectorizer needs no training
# corpus and only builds its internal lexicon tables; confirm.
hv.fit(None)

# C1: word2vec vectors, one row per retained term.
C1 = np.array([v[0] for v in embeddings.values()])
# C2: Hatebase feature vectors for the same terms, in the same order.
C2 = hv.feature_vectors.loc[[hv.index[w] for w in embeddings]]
# (Removed: bare ``C1.shape`` / ``C2.shape`` expressions — notebook residue
# that is a no-op in a script.)

cca = CCA(n_components=25)
cca.fit(C2, C1)

# NOTE(review): the original ends by rebinding ``embeddings = {}`` — likely
# to release memory (or leftover next-cell residue); kept for parity.
embeddings = {}