# NOTE: these examples depend on project-local helpers (load_word2vec,
# load_dataset, load_fdcl18, load_dwmw17, load_embedding, mean_embedding,
# TweetPreprocessor, train_dev_test_split, NeuralNetClassifier, TextCNN,
# HatebaseVectorizer, TransferVectorizer, CrossValidator, scratch_path,
# pprint, stopwords) assumed importable from the surrounding project; only
# the third-party imports are spelled out here.
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer  # assuming NLTK's tweet tokenizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tqdm import tqdm


def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()
    # # Load Dataset
    df = load_dataset(dataset[0], **dataset[1])
    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)
    # # Train
    clf = NeuralNetClassifier(module=TextCNN,
                              corpus=df.tokens,
                              word_vectors=word2vec,
                              metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))
    # # Predict
    y_pred = clf.predict(X_test)
    # # Evaluate
    pprint(
        dict(dataset=dataset,
             hyperparameters=hyperparameters,
             scores={
                 scorer: get_score_func(scorer)(y_test, y_pred)
                 for scorer in metrics
             }))
    # # Save to file
    predictions = X_test.to_frame()
    predictions['pred'] = y_pred
    predictions.to_excel(scratch_path('predictions_%s.xlsx' % fname))
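
# A minimal usage sketch for the example above. The dataset spec and the
# hyperparameter values are illustrative assumptions; `run` reads `dataset`
# as a (loader_name, loader_kwargs) pair and forwards `hyperparameters` to
# NeuralNetClassifier:
#
#   run(dataset=('dwmw17', {}),
#       hyperparameters={'model': 'rand', 'epoch': 10, 'batch_size': 50},
#       metrics=['accuracy', 'f1'],
#       fname='textcnn_baseline')

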
def run(dataset, features, word_embedding, metrics, fname):
    # A dataset name starting with 'f' selects FDCL18; anything else falls
    # back to DWMW17
    if dataset.lower().startswith('f'):
        df = load_fdcl18()
    else:
        df = load_dwmw17()
    tqdm.pandas(desc='Preprocessing Progress: ')
    df['clean_tweet'] = df.tweet.progress_apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df['tokens'] = df.clean_tweet.progress_apply(TweetTokenizer().tokenize)
    # # Feature Extraction
    # tfidf_vectorizer
    ff = []  # (name, transformer, column) triples for the ColumnTransformer below
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(
            tokenizer=TweetTokenizer().tokenize,
            stop_words=stopwords,  # stop-word list assumed defined elsewhere in the project
            min_df=0.0025,
            max_df=0.25,
            ngram_range=(1, 3)
        )
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs), 'clean_tweet')]
    # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd', TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline', Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word_embedding), 'tokens')]
    # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer', HatebaseVectorizer(features=features['hatebase_vectorizer']), 'clean_tweet')]
    # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df.tokens
        hyper_params['word_vectors'] = word_embedding
        # """ # Cross-validate and save predictions
        args = [NeuralNetClassifier, hyper_params, ['conv_%i' % i for i in range(3)], False]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # # Estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)), ('clf', LinearSVC())])
    # # Evaluation (Cross Validation)
    # """ # Cross-validate and save predictions
    cv = CrossValidator(pipeline, n_splits=5, scoring=metrics)
    df['predictions'], cv_results = cv.cross_val_predict(df, df.label, return_scores=True)
    # """ Print Scores
    pprint({'dataset': dataset, 'features': features})
    pprint(cv_results)
    scores = {}
    for scorer in metrics:
        scores[scorer] = ['%.2f' % (np.average(cv_results[scorer]) * 100) + ',']
    pprint(scores, type='table')
    # """ Save Predictions #
    df.to_excel(scratch_path('predictions_%s_%s.xlsx' % (dataset, fname)))
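
# A usage sketch for the variant above; the feature arguments are
# illustrative assumptions. `features` maps each feature name to its
# arguments (only the hatebase/transfer values are read; the other keys
# merely need to be present):
#
#   run(dataset='fdcl18',
#       features={'tfidf_vectorizer': None,
#                 'hatebase_vectorizer': ['unigrams']},
#       word_embedding=load_word2vec(),
#       metrics=['accuracy', 'f1'],
#       fname='tfidf_hatebase')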

# Example #3
def run_all(datasets, features, metrics):
    # Load static objects
    df = pd.read_csv(scratch_path('words_full.csv'))
    # Build a word -> vector mapping, one row per word
    vectors = df.drop(['Unnamed: 0', 'word'], axis=1).values.tolist()
    d_vec = dict(zip(df.word, vectors))
    hate_vectors = load_embedding(d_vec)
    for dataset in datasets:
        feature_dict = {}
        # feature_dict accumulates across the sweep: each run adds the next
        # feature (or replaces its previous argument set) on top of those
        # already selected
        for fid0, feature_map in enumerate(features):
            for fid1, args in enumerate(feature_map[1]):
                feature_dict[feature_map[0]] = args
                run(dataset=dataset,
                    features=feature_dict,
                    word_embedding=hate_vectors,
                    metrics=metrics,
                    fname='%d_%d' % (fid0, fid1))
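
# A hedged sketch of the inputs run_all expects (values illustrative):
# `features` is a list of (feature_name, [argument_set, ...]) pairs, so the
# nested loop above sweeps one argument set at a time:
#
#   run_all(datasets=['fdcl18', 'dwmw17'],
#           features=[('tfidf_vectorizer', [None]),
#                     ('hatebase_vectorizer', [['unigrams'],
#                                              ['unigrams', 'bigrams']])],
#           metrics=['accuracy', 'f1'])
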

def run(dataset):
    # Setup mirrors the first example above: load word2vec, load the dataset,
    # then preprocess and tokenize the tweets
    word2vec = load_word2vec()
    df = load_dataset(dataset[0], **dataset[1])
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    # # Hyperparameters
    kwargs = {
        'model': 'multichannel',
        'epoch': 100,
        'learning_rate': 0.01,
        'max_sent_len': 50,
        'batch_size': 50,
        # 'word_dim': 300,
        'filters': [3, 4, 5],
        'filter_num': [100, 100, 100],
        'dropout_prob': 0.5,
        'norm_limit': 3,
    }
    pprint(kwargs)
    # """ # Additional Parameters
    kwargs['module'] = TextCNN
    kwargs['corpus'] = df.tokens
    kwargs['word_vectors'] = word2vec
    # """ # Cross-validate and save predictions
    scorers = ['accuracy', 'precision', 'recall', 'f1']
    cv = CrossValidator(NeuralNetClassifier, kwargs, n_splits=5, scoring=scorers)
    df['predictions'], cv_results = cv.cross_val_predict(df.tokens, df.label, return_scores=True)
    # """ Print Scores
    pprint(cv_results)
    scores = {}
    for scorer in scorers:
        scores[scorer] = ['%.2f' % (np.average(cv_results[scorer]) * 100) + ',']
    pprint(scores, type='table')
    # """ Save Predictions #
    df.to_excel(scratch_path('cnn_predictions.xlsx'))
    # """ #