def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()

    # # Load Dataset
    df = load_dataset(dataset[0], **dataset[1])

    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)

    # # Train
    clf = NeuralNetClassifier(module=TextCNN, corpus=df.tokens,
                              word_vectors=word2vec, metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))

    # # Predict
    y_pred = clf.predict(X_test)

    # # Evaluate
    pprint(dict(dataset=dataset,
                hyperparameters=hyperparameters,
                scores={scorer: get_score_func(scorer)(y_test, y_pred)
                        for scorer in metrics}))

    # # Save to file
    X_test['pred'] = y_pred
    X_test.to_excel(scratch_path('predictions_%s.xlsx' % fname))
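
# A hedged usage sketch for the run() above; the dataset tuple, hyperparameter
# values, metric names, and fname below are illustrative assumptions, not the
# settings used in the original experiments.
run(dataset=('dwmw17', {}),                 # assumed (loader name, loader kwargs) shape
    hyperparameters={'model': 'rand',       # 'rand' skips loading word2vec above
                     'epoch': 10,
                     'batch_size': 50},
    metrics=['accuracy', 'f1'],
    fname='textcnn_baseline')               # hypothetical output-file suffix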
def run(dataset, features, word_embedding, metrics, fname):
    # # Load Dataset
    if dataset.lower().startswith('f'):
        df = load_fdcl18()
    else:
        df = load_dwmw17()

    # # Preprocess
    tqdm.pandas(desc='Preprocessing Progress: ')
    df['clean_tweet'] = df.tweet.progress_apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df['tokens'] = df.clean_tweet.progress_apply(TweetTokenizer().tokenize)

    # # Feature Extraction
    ff = []

    # tfidf_vectorizer
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(tokenizer=TweetTokenizer().tokenize,
                            stop_words=stopwords,
                            min_df=.0025, max_df=0.25, ngram_range=(1, 3))
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs), 'clean_tweet')]

    # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd',
                         TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline', Pipeline([count_vectorizer, truncated_svd]), 'framenet')]

    # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word_embedding), 'tokens')]

    # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer',
                HatebaseVectorizer(features=features['hatebase_vectorizer']),
                'clean_tweet')]

    # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df.tokens
        hyper_params['word_vectors'] = word_embedding
        args = [NeuralNetClassifier, hyper_params,
                ['conv_%i' % i for i in range(3)], False]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]

    # # Estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)),
                         ('clf', LinearSVC())])

    # # Evaluation (Cross Validation)
    cv = CrossValidator(pipeline, n_splits=5, scoring=metrics)
    df['predictions'], cv_results = cv.cross_val_predict(df, df.label,
                                                         return_scores=True)

    # # Print Scores
    pprint({'dataset': dataset, 'features': features})
    pprint(cv_results)
    scores = {}
    for scorer in metrics:
        scores[scorer] = ['%.2f' % (np.average(cv_results[scorer]) * 100) + ',']
    pprint(scores, type='table')

    # # Save Predictions
    # df.to_excel(scratch_path('predictions_%s_%s.xlsx' % (dataset, fname)))
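
# A minimal sketch, assuming mean_embedding(word_embedding) returns a transformer
# of roughly this shape: each tweet's token list is mapped to the average of its
# word vectors, producing one dense feature vector per row for the
# ColumnTransformer above. Illustrative only, not the project's implementation;
# the dict-of-vectors input format and the 300-dim default are assumptions.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MeanEmbeddingSketch(BaseEstimator, TransformerMixin):
    def __init__(self, word_vectors, dim=300):
        self.word_vectors = word_vectors      # assumed: dict-like, token -> vector
        self.dim = dim

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        rows = []
        for tokens in X:                      # X: iterable of token lists ('tokens' column)
            vecs = [self.word_vectors[t] for t in tokens if t in self.word_vectors]
            rows.append(np.mean(vecs, axis=0) if vecs else np.zeros(self.dim))
        return np.vstack(rows)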
def run_all(datasets, features, metrics):
    # Load static objects
    df = pd.read_csv(scratch_path('words_full.csv'))
    words = df.word
    embedding = df.drop(['Unnamed: 0', 'word'], axis=1).to_dict('list')
    d_vec = {words[int(i)]: v for i, v in embedding.items()}
    hate_vectors = load_embedding(d_vec)

    for dataset in datasets:
        feature_dict = {}
        for fid0, feature_map in enumerate(features):
            for fid1, args in enumerate(feature_map[1]):
                feature_dict.update({feature_map[0]: args})
                run(dataset=dataset,
                    features=feature_dict,
                    word_embedding=hate_vectors,  # keyword matches run()'s signature
                    metrics=metrics,
                    fname=str(fid0) + '_' + str(fid1))
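
# Hedged usage sketch for run_all(): the loops above expect `features` to be a
# sequence of (feature_name, [config, config, ...]) pairs. The dataset names,
# feature names, and configs below are placeholders, not the original grid.
run_all(
    datasets=['fdcl18', 'dwmw17'],
    features=[
        ('hatebase_vectorizer', [['unigram_probability'], ['average_offensiveness']]),  # hypothetical configs
        ('mean_embedding', [True]),
    ],
    metrics=['accuracy', 'precision', 'recall', 'f1'])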
# TextCNN hyperparameters
kwargs = {
    'model': 'multichannel',
    'epoch': 100,
    'learning_rate': 0.01,
    'max_sent_len': 50,
    'batch_size': 50,
    # 'word_dim': 300,
    'filters': [3, 4, 5],
    'filter_num': [100, 100, 100],
    'dropout_prob': 0.5,
    'norm_limit': 3,
}
pprint(kwargs)

# Additional Parameters
kwargs['module'] = TextCNN
kwargs['corpus'] = df.tokens
kwargs['word_vectors'] = word2vec

# Cross-validate and save predictions
scorers = ['accuracy', 'precision', 'recall', 'f1']
estimator = NeuralNetClassifier(**kwargs)
cv = CrossValidator(NeuralNetClassifier, kwargs, n_splits=5, scoring=scorers)
df['predictions'], cv_results = cv.cross_val_predict(df.tokens, df.label,
                                                     return_scores=True)

# Print Scores
pprint(cv_results)
scores = {}
for scorer in scorers:
    scores[scorer] = ['%.2f' % (np.average(cv_results[scorer]) * 100) + ',']
pprint(scores, type='table')

# Save Predictions
# df.to_excel(scratch_path('cnn_predictions.xlsx'))
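
# A hedged sketch of the cross-validation pattern CrossValidator is assumed to
# implement here: stratified k-fold splits, out-of-fold predictions for every
# row, and per-fold scores. It uses plain scikit-learn and a generic estimator
# factory; illustrative only, not the project's CrossValidator.
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

def cross_val_predict_with_scores(make_estimator, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    preds = np.empty(len(y), dtype=object)
    scores = {'accuracy': [], 'f1': []}
    for train_idx, test_idx in skf.split(np.zeros(len(y)), y):
        est = make_estimator()                            # fresh estimator per fold
        est.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_pred = est.predict(X.iloc[test_idx])
        preds[test_idx] = fold_pred                       # out-of-fold predictions
        scores['accuracy'].append(accuracy_score(y.iloc[test_idx], fold_pred))
        scores['f1'].append(f1_score(y.iloc[test_idx], fold_pred, average='macro'))
    return preds, scores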