def test_load_dictionaries():
    """Both token dictionaries load with the expected sizes and mappings."""
    general, specific = load_dictionaries()

    for loaded in (general, specific):
        assert isinstance(loaded, Dictionary)
    assert len(general) == 224011
    assert len(specific) == 219691

    # spot-check one token: "remain" lives at id 124 in the first dictionary,
    # and the id<->token maps agree with each other
    assert "remain" in general.values()
    assert general[124] == "remain"
    assert general.id2token[124] == "remain"
    assert general.token2id["remain"] == 124
def test_load_dictionaries():
    """Both token dictionaries load with the expected sizes and mappings."""
    general, specific = load_dictionaries()

    for loaded in (general, specific):
        assert isinstance(loaded, Dictionary)
    assert len(general) == 418_347
    assert len(specific) == 404_278

    # "antiracism" appears in both dictionaries; in the first it sits at
    # id 124813 and the id<->token maps agree with each other
    assert "antiracism" in general.values()
    assert "antiracism" in specific.values()
    assert general[124813] == "antiracism"
    assert general.id2token[124813] == "antiracism"
    assert general.token2id["antiracism"] == 124813

    # the same token maps to different ids in the two dictionaries
    assert general.token2id["pizzagate"] == 16052
    assert specific.token2id["pizzagate"] == 63088
def unweighted_model():
    """Build the unweighted two-branch CNN classifier.

    Each branch embeds a 20-token sequence (one branch per dictionary),
    runs it through a Conv1D -> Dropout -> MaxPool -> Flatten stack, then
    the branches are concatenated and fed to dense layers ending in a
    2-way softmax.

    Returns:
        An uncompiled ``keras.Model`` taking two integer-sequence inputs.
    """
    dictionary, dictionary_s = load_dictionaries()
    print("CONSTRUCTING THE MODEL...")

    seq_len = 20  # fixed input sequence length (tokens)

    def _branch(vocab_size):
        # One convolutional branch: tokens -> embedding -> conv -> pool -> flat.
        # Nested so the two branches share one definition instead of the
        # previous copy-pasted duplicate blocks.
        inputs = Input(shape=(seq_len, ))
        # embedding dim changed from 128 to 64 to match saved weights file;
        # +1 reserves an index for padding/OOV
        x = Embedding(vocab_size + 1, 64)(inputs)
        x = Conv1D(filters=32, kernel_size=3, activation='relu',
                   padding='valid')(x)
        x = Dropout(0.2)(x)
        x = MaxPooling1D(pool_size=2)(x)
        return inputs, Flatten()(x)

    inputs1, flat1 = _branch(len(dictionary))
    inputs2, flat2 = _branch(len(dictionary_s))

    # merge the two branches via concatenation, then classify
    merged = concatenate([flat1, flat2])
    dense1 = Dense(64, activation='relu')(merged)
    dense2 = Dense(32, activation='relu')(dense1)
    outputs = Dense(2, activation='softmax')(dense2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    #print(model.summary())
    return model
def unweighted_model():
    """Build the unweighted two-branch CNN classifier (128-dim embeddings).

    Each branch embeds a 20-token sequence (one branch per dictionary),
    runs it through a Conv1D -> Dropout -> MaxPool -> Flatten stack, then
    the branches are concatenated and fed to dense layers ending in a
    2-way softmax.

    Returns:
        An uncompiled ``keras.Model`` taking two integer-sequence inputs.
    """
    dictionary, dictionary_s = load_dictionaries()
    #print("DICTIONARY SIZES:", len(dictionary), len(dictionary_s)) #> 224011 219691

    seq_len = 20  # fixed input sequence length (tokens)

    def _make_branch(vocab_size):
        # One convolutional branch: tokens -> embedding -> conv -> pool -> flat.
        # Nested so both branches share one definition instead of the
        # previous copy-pasted duplicate blocks.
        inputs = Input(shape=(seq_len, ))
        # +1 reserves an index for padding/OOV
        x = Embedding(vocab_size + 1, 128)(inputs)
        x = Conv1D(filters=32, kernel_size=3, activation='relu',
                   padding='valid')(x)
        x = Dropout(0.2)(x)
        x = MaxPooling1D(pool_size=2)(x)
        return inputs, Flatten()(x)

    inputs1, flat1 = _make_branch(len(dictionary))
    inputs2, flat2 = _make_branch(len(dictionary_s))

    # merge the two branches via concatenation, then classify
    merged = concatenate([flat1, flat2])
    dense1 = Dense(64, activation='relu')(merged)
    dense2 = Dense(32, activation='relu')(dense1)
    outputs = Dense(2, activation='softmax')(dense2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    #print(model.summary())
    return model
# for text processing and formatting
import wordsegment as ws
import re
import string
from nltk.corpus import stopwords  # FYI: need to run nltk.download() or nltk.download('stopwords') on your machine for this to work
from keras.preprocessing import sequence

from app.dictionaries import load_dictionaries

ws.load()  # loads wordsegment's unigram/bigram data (module-level, once)

dictionary, dictionary_s = load_dictionaries()  # consider moving into a function, or caching as an attribute upon init of a class

# Compiled once at import time instead of on every process() call.
# Raw string fixes the invalid escape sequence in the former '#\w+' literal.
_HASHTAG_PATTERN = re.compile(r"#\w+")


#segmentation
def my_replace(match):
    """Expand a hashtag regex match into its segmented words.

    e.g. a match over "#blacklivesmatter" becomes "black lives matter".
    """
    match = match.group()
    return ' '.join(ws.segment(match))


def process(twt):
    """Replace each "#hashtag" in *twt* with its segmented words.

    Returns None on any failure (best-effort by design, e.g. when *twt*
    is not a string), preserving the original silent-fallback behavior.
    """
    try:
        return _HASHTAG_PATTERN.sub(my_replace, twt)
    except Exception:
        return None


# NOTE(review): `clean(twt)` continues beyond this chunk; only its opening
# lines are visible here, so it is left as-is (stubbed out below) rather
# than guessed at:
# def clean(twt):
#     #remove punctuation
#     try: