def run_text():
    """Train a ``TextRNN`` on the dataset produced by ``generate_data``.

    The dataset is requested with ``split_point=15000``, ``emb_size=50`` and
    ``maxlen=100``. The model is built on the returned embedding matrix with
    ``trainable_embeddings=False`` and passed to ``train`` together with an
    ``Adam`` optimizer and a ``BCEWithLogitsLoss`` criterion.
    """
    dataset = generate_data(split_point=15000, emb_size=50, maxlen=100)
    embeddings, train_set, test_set = (
        dataset[key] for key in ('emb_matrix', 'train_batches', 'test_batches')
    )

    net = TextRNN(embeddings, trainable_embeddings=False)
    # 0.001 is Adam's second positional argument (presumably the learning
    # rate -- confirm against the optimizer's signature).
    adam = Adam(net.params, 0.001)
    loss_fn = BCEWithLogitsLoss()

    # NOTE(review): the trailing 50 and 5 are positional; their meaning is
    # defined by ``train`` -- confirm before changing them.
    train(net, train_set, test_set, adam, loss_fn, 50, 5)
# Example #2
# 0
def run_text():
    """Train a single-layer ``ProjectedAttentionTextRNN``.

    Data comes from ``generate_data(emb_size=50, max_len=100)``; the model is
    created with ``stacked_layers=1`` and trained through ``train`` with an
    ``Adam`` optimizer and a ``BCEWithLogitsLoss`` criterion.
    """
    dataset = generate_data(emb_size=50, max_len=100)
    embeddings, train_set, test_set = (
        dataset[key] for key in ('emb_matrix', 'train_batches', 'test_batches')
    )

    net = ProjectedAttentionTextRNN(embeddings, stacked_layers=1)
    adam = Adam(net.params, 0.001)
    loss_fn = BCEWithLogitsLoss()

    # NOTE(review): 50 and 5 are positional arguments whose meaning is
    # defined by ``train`` -- confirm before changing them.
    train(net, train_set, test_set, adam, loss_fn, 50, 5)
def run_text():
    """Train a ``ProjectedAttentionTextRNN`` with its default settings.

    ``generate_data`` is called with ``size=200000``, ``split_point=160000``,
    ``emb_size=50`` and ``max_len=25``. The resulting embedding matrix and
    batches are handed to the model and to ``train`` respectively.
    """
    data_options = {
        'size': 200000,
        'split_point': 160000,
        'emb_size': 50,
        'max_len': 25,
    }
    dataset = generate_data(**data_options)
    embeddings, train_set, test_set = (
        dataset[key] for key in ('emb_matrix', 'train_batches', 'test_batches')
    )

    net = ProjectedAttentionTextRNN(embeddings)
    adam = Adam(net.params, 0.001)
    loss_fn = BCEWithLogitsLoss()

    # NOTE(review): 50 and 5 are positional arguments whose meaning is
    # defined by ``train`` -- confirm before changing them.
    train(net, train_set, test_set, adam, loss_fn, 50, 5)
def run_text_author():
    """Train a ``TextAuthorRNN`` on text plus author data.

    ``generate_data`` is called with ``size=1000000``, ``split_point=960000``,
    ``emb_size=25`` and ``max_len=25``. The model receives the embedding
    matrix and ``len(a2i)``, where ``a2i`` is read from the generated data
    (presumably an author-to-index mapping -- confirm against
    ``generate_data``).
    """
    data_options = {
        'size': 1000000,
        'split_point': 960000,
        'emb_size': 25,
        'max_len': 25,
    }
    dataset = generate_data(**data_options)
    author_index, embeddings, train_set, test_set = (
        dataset[key]
        for key in ('a2i', 'emb_matrix', 'train_batches', 'test_batches')
    )

    net = TextAuthorRNN(embeddings, len(author_index))
    adam = Adam(net.params, 0.001)
    loss_fn = BCEWithLogitsLoss()

    # NOTE(review): 50 and 5 are positional arguments whose meaning is
    # defined by ``train`` -- confirm before changing them.
    train(net, train_set, test_set, adam, loss_fn, 50, 5)
def run_text_title_author():
    """Train a ``ProjectedAttentionTextTitleAuthorRNN``.

    Data comes from ``generate_data(split_point=15000, emb_size=50,
    maxlen=100)``. The model is configured with the keyword settings
    collected in ``model_options`` below; ``author_embeddings_input_size`` is
    set to ``len(a2i)`` taken from the generated data.
    """
    dataset = generate_data(split_point=15000, emb_size=50, maxlen=100)
    author_index, embeddings, train_set, test_set = (
        dataset[key]
        for key in ('a2i', 'emb_matrix', 'train_batches', 'test_batches')
    )

    # All model hyper-parameters in one place, passed as keywords.
    model_options = {
        'author_embeddings_input_size': len(author_index),
        'embeddings_dropout': 0.5,
        'top_mlp_dropout': 0.5,
        'text_stacked_layers': 1,
        'text_cell_hidden_size': 128,
        'title_cell_hidden_size': 32,
        'top_mlp_outer_activation': None,
        'top_mlp_layers': 2,
    }
    net = ProjectedAttentionTextTitleAuthorRNN(embeddings, **model_options)
    adam = Adam(net.params, 0.001)
    loss_fn = BCEWithLogitsLoss()

    # NOTE(review): 50 and 5 are positional arguments whose meaning is
    # defined by ``train`` -- confirm before changing them.
    train(net, train_set, test_set, adam, loss_fn, 50, 5)
from utils_deepwalk import deepwalk

sys.path.append("../")
from preprocess import get_train_data, import_texts, generate_data, clean_host_texts

# Locations of the input files (all relative to this script).
data = "../data/"
train_file = "{}train_noduplicates.csv".format(data)
texts_path = "../text/text"

# Load the de-duplicated training hosts with their labels, the raw texts,
# and the list of test hosts (one host per line of test.csv).
train_hosts, y_train = get_train_data(train_file)
texts = import_texts(texts_path)

with open("{}test.csv".format(data), "r") as f:
    test_hosts = f.read().splitlines()

# Pair each host list with the imported texts.
train_data = generate_data(train_hosts, texts)
test_data = generate_data(test_hosts, texts)

# Text-cleaning resources: tokenizer, punctuation set (ASCII punctuation plus
# a few extra typographic characters) and French + English stop words.
tokenizer = TweetTokenizer()
punctuation = string.punctuation + "’“”.»«…°"
stpwords_fr = stopwords.words("french")
stpwords_en = stopwords.words("english")


def _clean(host_texts):
    """Run ``clean_host_texts`` on *host_texts* with the shared settings."""
    return clean_host_texts(
        data=host_texts,
        tok=tokenizer,
        stpwds=stpwords_fr + stpwords_en,
        punct=punctuation,
    )


cleaned_train_data = _clean(train_data)
cleaned_test_data = _clean(test_data)
# Example #7
# 0
            'sexual_precision': sexual_precision_scores,
            'physical_f1_score': physical_f1_scores,
            'physical_recall': physical_recall_scores,
            'physical_precision': physical_precision_scores
        }
        df = pd.DataFrame.from_dict(results_dict)
        if "results.csv" in os.listdir(RESULTS_DIR):
            df_old = pd.read_csv(RESULTS_DIR + "results.csv")
            df = pd.concat([df_old, df])
        df.to_csv(RESULTS_DIR + "results.csv", index=False)


if __name__ == "__main__":
    # Make sure the output directories exist before any model is run.
    for directory in (MODELS_DIR, RESULTS_DIR):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Build the dataset once and reuse it for every model variant.
    data = generate_data(
        embs_path=GLOVE_EMBEDDINGS_PATH,
        maxlen=CONFIG['maxlen'],
        batch_size=CONFIG['batch_size'],
    )

    # Model variants, executed in this exact order; each call receives the
    # shared data and ``False`` as its third positional argument.
    model_names = (
        "vanilla_last",
        "vanilla_projected_last",
        "vanilla_avg",
        "vanilla_projected_avg",
        "multi_attention",
        "multi_projected_attention",
        "projected_attention",
        "attention",
    )
    for model_name in model_names:
        run_model(model_name, data, False)