def test_load_dictionaries():
    """Both gensim dictionaries load with the expected sizes and token mappings."""
    primary, secondary = load_dictionaries()

    for loaded in (primary, secondary):
        assert isinstance(loaded, Dictionary)

    assert len(primary) == 224011
    assert len(secondary) == 219691

    # spot-check a single token in the primary dictionary via every lookup path
    assert "remain" in primary.values()
    assert primary[124] == "remain"
    assert primary.id2token[124] == "remain"
    assert primary.token2id["remain"] == 124
# Example #2 (0)
def test_load_dictionaries():
    """Dictionary sizes and the ids of two sample tokens match expectations."""
    vocab, vocab_s = load_dictionaries()

    assert isinstance(vocab, Dictionary)
    assert isinstance(vocab_s, Dictionary)

    assert len(vocab) == 418347
    assert len(vocab_s) == 404278

    # token present in both dictionaries; id lookups checked three ways on the first
    assert "antiracism" in vocab.values()
    assert "antiracism" in vocab_s.values()
    assert vocab[124813] == "antiracism"
    assert vocab.id2token[124813] == "antiracism"
    assert vocab.token2id["antiracism"] == 124813

    # the same token maps to different ids in the two dictionaries
    assert vocab.token2id["pizzagate"] == 16052
    assert vocab_s.token2id["pizzagate"] == 63088
# Example #3 (0)
def unweighted_model():
    """Build the dual-input CNN classifier (64-dim embeddings, uncompiled).

    Returns a Keras ``Model`` taking two integer sequences of length 20
    (one per dictionary) and producing a 2-class softmax output.

    Fix: the two input branches were copy-pasted; they are now built by a
    single helper, removing the duplication and the dead commented-out
    128-dim Embedding lines.
    """
    dictionary, dictionary_s = load_dictionaries()
    dictionary_size, dictionary_size_s = len(dictionary), len(dictionary_s)

    print("CONSTRUCTING THE MODEL...")
    seq_len = 20

    def _branch(vocab_size):
        """One branch: embed -> conv -> dropout -> pool -> flatten."""
        inputs = Input(shape=(seq_len, ))
        # embedding dim is 64 (not 128) to match the saved weights file
        embedding = Embedding(vocab_size + 1, 64)(inputs)
        conv = Conv1D(filters=32,
                      kernel_size=3,
                      activation='relu',
                      padding='valid')(embedding)
        drop = Dropout(0.2)(conv)
        pool = MaxPooling1D(pool_size=2)(drop)
        return inputs, Flatten()(pool)

    inputs1, flat1 = _branch(dictionary_size)
    inputs2, flat2 = _branch(dictionary_size_s)

    # merge the branches, then classify through two dense layers
    merged = concatenate([flat1, flat2])
    dense1 = Dense(64, activation='relu')(merged)
    dense2 = Dense(32, activation='relu')(dense1)
    outputs = Dense(2, activation='softmax')(dense2)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    return model
# Example #4 (0)
def unweighted_model():
    """Construct and return the two-input convolutional classifier (uncompiled).

    Each input is an integer sequence of length 20; the two branches share
    an identical topology and differ only in vocabulary size, so they are
    built in a loop. Output is a 2-class softmax.
    """
    dictionary, dictionary_s = load_dictionaries()
    dictionary_size, dictionary_size_s = len(dictionary), len(dictionary_s)

    seq_len = 20

    branch_inputs = []
    branch_flats = []
    for vocab in (dictionary_size, dictionary_size_s):
        # branch: embed -> conv -> dropout -> pool -> flatten
        layer_in = Input(shape=(seq_len, ))
        x = Embedding(vocab + 1, 128)(layer_in)
        x = Conv1D(filters=32,
                   kernel_size=3,
                   activation='relu',
                   padding='valid')(x)
        x = Dropout(0.2)(x)
        x = MaxPooling1D(pool_size=2)(x)
        branch_inputs.append(layer_in)
        branch_flats.append(Flatten()(x))

    # merge via concatenation, then two dense layers before the softmax head
    merged = concatenate(branch_flats)
    hidden = Dense(64, activation='relu')(merged)
    hidden = Dense(32, activation='relu')(hidden)
    outputs = Dense(2, activation='softmax')(hidden)

    return Model(inputs=branch_inputs, outputs=outputs)
# Example #5 (0)
# for text processing and formatting: hashtag segmentation and tweet cleaning

import wordsegment as ws
import re
import string
from nltk.corpus import stopwords  # FYI: need to run nltk.download() or nltk.download('stopwords') on your machine for this to work
from keras.preprocessing import sequence

from app.dictionaries import load_dictionaries

# Module-level side effects: loading the wordsegment corpus and both gensim
# dictionaries happens once, at import time.
ws.load()
dictionary, dictionary_s = load_dictionaries(
)  # NOTE(review): consider moving into a function, or caching as an attribute upon init of a class


#segmentation
def my_replace(match):
    """re.sub callback: split a matched hashtag into space-joined words."""
    return ' '.join(ws.segment(match.group()))


def process(twt):
    """Expand every hashtag in *twt* into its component words.

    Returns the processed string, or None when substitution fails
    (e.g. *twt* is not a string — None/NaN tweets).

    Fixes: raw string for the regex (avoids the invalid-escape-sequence
    SyntaxWarning for ``\\w`` on modern Python) and removal of the unused
    exception binding; the broad best-effort except is kept deliberately.
    """
    try:
        return re.sub(r'#\w+', my_replace, twt)
    except Exception:
        # best-effort contract: any failure yields None rather than raising
        return None


def clean(twt):
    #remove punctuation
    try: