def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    # Pre-trained embeddings are only needed when the TextCNN is not randomly initialised.
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()
    # # Load Dataset
    # `dataset` is a (name, kwargs) pair passed straight to the loader.
    df = load_dataset(dataset[0], **dataset[1])
    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)
    # # Train
    clf = NeuralNetClassifier(module=TextCNN, corpus=df.tokens,
                              word_vectors=word2vec, metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))
    # # Predict
    y_pred = clf.predict(X_test)
    # # Evaluate
    pprint(dict(
        dataset=dataset,
        hyperparameters=hyperparameters,
        scores={scorer: get_score_func(scorer)(y_test, y_pred)
                for scorer in metrics},
    ))
    # # Save to file
    # `X_test` is a Series of token lists; convert it to a DataFrame before
    # attaching the predictions as a column.
    X_test = X_test.to_frame('tokens')
    X_test['pred'] = y_pred
    X_test.to_excel(scratch_path('predictions_%s.xlsx' % fname))
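
# Illustrative invocation of the runner above, a minimal usage sketch: the
# hyperparameter values mirror the main script further below, while the dataset
# name, loader kwargs, metric names, and fname are assumptions for illustration.
if __name__ == '__main__':
    run(dataset=('fdcl18', {'num_classes': 2}),
        hyperparameters={'model': 'multichannel', 'epoch': 100,
                         'learning_rate': 0.01, 'max_sent_len': 50,
                         'batch_size': 50},
        metrics=['accuracy', 'f1'],   # assumed scorer names
        fname='textcnn_fdcl18')       # hypothetical file tag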
def run(dataset, features, word_embedding, metrics, fname):
    # # Load Dataset
    if dataset.lower().startswith('f'):
        df = load_fdcl18()
    else:
        df = load_dwmw17()
    # # Preprocess
    tqdm.pandas(desc='Preprocessing Progress: ')
    df['clean_tweet'] = df.tweet.progress_apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df['tokens'] = df.clean_tweet.progress_apply(TweetTokenizer().tokenize)
    # # Feature Extraction
    ff = []
    # tfidf_pipeline
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(tokenizer=TweetTokenizer().tokenize,
                            stop_words=stopwords,
                            min_df=.0025, max_df=0.25, ngram_range=(1, 3))
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs), 'clean_tweet')]
    # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd',
                         TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline',
                Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word_embedding), 'tokens')]
    # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer',
                HatebaseVectorizer(features=features['hatebase_vectorizer']),
                'clean_tweet')]
    # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df.tokens
        hyper_params['word_vectors'] = word_embedding
        args = [NeuralNetClassifier, hyper_params,
                ['conv_%i' % i for i in range(3)], False]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # # Estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)),
                         ('clf', LinearSVC())])
    # # Evaluation (Cross Validation)
    # Cross-validate and save predictions
    cv = CrossValidator(pipeline, n_splits=5, scoring=metrics)
    df['predictions'], cv_results = cv.cross_val_predict(df, df.label,
                                                         return_scores=True)
    # # Print Scores
    pprint({'dataset': dataset, 'features': features})
    pprint(cv_results)
    scores = {}
    for scorer in metrics:
        scores[scorer] = ['%.2f' % (np.average(cv_results[scorer]) * 100) + ',']
    pprint(scores, type='table')
    # # Save Predictions
    # df.to_excel(scratch_path('predictions_%s_%s.xlsx' % (dataset, fname)))
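
# Example invocation of the cross-validation runner above, a sketch only: the
# feature keys mirror the branches handled in `run`, while the concrete metric
# names, the Hatebase feature list, and the fname are assumptions.
if __name__ == '__main__':
    run(dataset='fdcl18',
        features={'tfidf_vectorizer': True,
                  'mean_embedding': True,
                  'hatebase_vectorizer': ['unigrams']},  # hypothetical feature list
        word_embedding=load_word2vec(),
        metrics=['accuracy', 'f1'],  # assumed scorer names
        fname='svm_cv')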
from nltk.tokenize import TweetTokenizer  # assumed source of TweetTokenizer

from scripts.utils import scratch_path
from tklearn.datasets import load_fdcl18, load_dwmw17
from tklearn.model_selection import CrossValidator
from tklearn.neural_network import NeuralNetClassifier
from tklearn.neural_network.model import TextCNN
from tklearn.preprocessing.tweet import TweetPreprocessor
from tklearn.text.word_vec import load_word2vec
from tklearn.utils import pprint

DATASET = 'FDCL18'

if __name__ == '__main__':
    # Load Dataset and Extract Features
    if DATASET.lower().startswith('f'):
        df = load_fdcl18(num_classes=2)
        pprint({'dataset': 'FDCL18(num_classes=2)'})
    else:
        df = load_dwmw17(num_classes=2)
        pprint({'dataset': 'DWMW17(num_classes=2)'})
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    # Load Resources
    word2vec = load_word2vec()
    # Hyperparameters
    kwargs = {
        'model': 'multichannel',
        'epoch': 100,
        'learning_rate': 0.01,
        'max_sent_len': 50,
        'batch_size': 50,
        # 'word_dim': 300,
def run(dataset, features, word2vec, metrics, fname=None):
    # # Load Datasets
    # Train on `df1` and evaluate on `df2`; the DWMW17 annotation columns are
    # dropped so both frames share the same schema.
    if dataset == 'fdcl18':
        df1 = load_fdcl18(num_classes=2)
        df2 = load_dwmw17(num_classes=2)
        df2 = df2.drop(['count', 'hate_speech', 'offensive_language', 'neither'],
                       axis=1)
    else:
        df1 = load_dwmw17(num_classes=2)
        df2 = load_fdcl18(num_classes=2)
        df1 = df1.drop(['count', 'hate_speech', 'offensive_language', 'neither'],
                       axis=1)
    # # Preprocessing
    preprocess = TweetPreprocessor(normalize=['link', 'mention']).preprocess
    tokenize = TweetTokenizer().tokenize
    # # # DF 1 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df1['clean_tweet'] = df1.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df1['tokens'] = df1.clean_tweet.progress_apply(tokenize)
    # # # DF 2 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df2['clean_tweet'] = df2.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df2['tokens'] = df2.clean_tweet.progress_apply(tokenize)
    # # Feature Extraction
    ff = []
    # # # tfidf_pipeline
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(tokenizer=TweetTokenizer().tokenize,
                            stop_words=stopwords,
                            min_df=.0025, max_df=0.25, ngram_range=(1, 3))
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs), 'clean_tweet')]
    # # # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd',
                         TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline',
                Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # # # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word2vec), 'tokens')]
    # # # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer',
                HatebaseVectorizer(features=features['hatebase_vectorizer']),
                'clean_tweet')]
    # # # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df1.tokens
        hyper_params['word_vectors'] = word2vec
        args = [NeuralNetClassifier, hyper_params,
                ['conv_%i' % i for i in range(3)], False]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # # # estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)),
                         ('clf', LinearSVC())])
    # # Grid Search
    # param_grid = [
    #     {'clf__C': [0.1, 1, 10, 50], 'classifier': linear_svc},
    #     # {'classifier': sgd_classifier},
    # ]
    # gs = GridSearchCV(pipeline, param_grid, cv=5)
    # result = gs.fit(df, df.label).predict(df)
    # # Evaluation
    pipeline.fit(df1, df1.label)
    y_true, y_pred = df2.label, pipeline.predict(df2)
    # df2['predictions'] = y_pred
    # # Print Scores
    pprint({'dataset': dataset, 'features': features})
    scores = {}
    for scorer in metrics:
        scores[scorer] = [get_score_func(scorer)(y_true, y_pred)]
    pprint(scores, type='table')
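
# Example cross-dataset invocation of the runner above, a sketch only: it trains
# the pipeline on FDCL18 and evaluates on DWMW17. The feature selection, metric
# names, and fname are assumptions rather than values from the original experiments.
if __name__ == '__main__':
    run(dataset='fdcl18',
        features={'tfidf_vectorizer': True, 'mean_embedding': True},
        word2vec=load_word2vec(),
        metrics=['accuracy', 'f1'],  # assumed scorer names
        fname='cross_dataset')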