def tokenize_cleaned_tweets(tweets: Dict[str, List[str]], create_tokenizer=False):
    """Encode and pad the cleaned tweets of every hashtag and store the result.

    When ``create_tokenizer`` is anything other than ``False``, a tokenizer is
    first fitted on the tweets of all hashtags together. That tokenizer and
    the row length of the resulting padded encoding are persisted via
    ``save_tokenizer`` and ``'max_tweet_word_count.pickle'``. The encoding
    step afterwards always reloads both artefacts from storage.

    Args:
        tweets: Mapping of hashtag to the list of cleaned tweet texts.
        create_tokenizer: ``False`` (default) to reuse the stored tokenizer;
            any other value to build and store a new one first.

    Returns:
        Dict mapping each hashtag to the padded encoding returned by
        ``encode_embed_docs``. As a side effect every encoding is also written
        with ``numpy.save`` under the name ``'hashtag_<hashtag>'``.
    """
    # Identity comparison kept deliberately: historically every value except
    # the literal ``False`` (including ``None`` or ``0``) triggered a rebuild.
    if create_tokenizer is not False:
        all_texts = [text for texts in tweets.values() for text in texts]
        new_tok = tokenize(all_texts, verbose=False)
        # Encode once without a length limit to learn the padded row length.
        padded_all, _ = encode_embed_docs(all_texts, new_tok)
        save_pickle(len(padded_all[0]), 'max_tweet_word_count.pickle')
        save_tokenizer(new_tok)

    # Always work from the persisted artefacts so that both modes encode with
    # exactly the same tokenizer and length.
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
    tok = load_tokenizer()

    encoded = {}
    for hashtag, hashtag_tweets in tweets.items():
        padded_x, _ = encode_embed_docs(hashtag_tweets, tok,
                                        max_tweets=max_tweet_word_count)
        encoded[hashtag] = padded_x
        # numpy.save appends the '.npy' extension to the file name itself.
        numpy.save('hashtag_' + hashtag, padded_x)
    return encoded
def load_training_sentiment_data_small():
    """Load the stored "small" training artefacts.

    Returns:
        Tuple ``(tokenizer, y, padded_x, unpadded_x, max_tweet_word_count,
        vocab_size)``: the ``'learn'`` tokenizer followed by the contents of
        the corresponding pickle files below ``tokenized/learn/``.
    """
    # The tuple elements are evaluated left to right, i.e. the tokenizer is
    # loaded first and the pickles afterwards in the order listed.
    return (
        load_tokenizer('learn'),
        load_pickle('tokenized/learn/small_y.pickle'),
        load_pickle('tokenized/learn/small_padded_x.pickle'),
        load_pickle('tokenized/learn/small_unpadded_x.pickle'),
        load_pickle('tokenized/learn/max_tweet_word_count.pickle'),
        load_pickle('tokenized/learn/vocab_size.pickle'),
    )
def load_training_sentiment_data():
    """Load the stored full training artefacts.

    Returns:
        Tuple ``(tokenizer, y, padded_x, unpadded_x, max_tweet_word_count,
        vocab_size)``. The labels come from a CSV file, the encodings and the
        maximum tweet length from pickles, and the vocabulary size is the
        tokenizer's ``num_words`` attribute.
    """
    tokenizer = load_tokenizer()
    labels = load_csv('tokenized/learn/lables.csv')
    padded = load_pickle('tokenized/learn/padded_x.pickle')
    unpadded = load_pickle('tokenized/learn/unpadded_x.pickle')
    max_len = load_pickle('tokenized/learn/max_tweet_word_count.pickle')
    return tokenizer, labels, padded, unpadded, max_len, tokenizer.num_words
def learn_embedding_word(x, y):
    """Learn the ``EmbeddingWord`` embedding and persist its weights.

    Args:
        x: Padded, encoded tweets handed to the embedding as training input.
        y: Labels handed to the embedding.

    Returns:
        The weights obtained from the embedding; they are also stored via
        ``save_model_mat`` under the name ``'embedding_word'``.
    """
    tok = load_tokenizer()
    max_len = load_pickle('max_tweet_word_count.pickle')

    print('learning word...')
    # The third positional argument is passed as None, exactly as before.
    embedding = EmbeddingWord(tok, x, None, max_len, tok.num_words, y)
    embedding.create_embedding()

    learned = embedding.get_weights()
    save_model_mat(learned, 'embedding_word')
    return learned
def learn_embedding_skip_gram(x, y, texts):
    """Learn the ``EmbeddingGensimSkipGram`` embedding and persist its weights.

    Args:
        x: Padded, encoded tweets handed to the embedding as training input.
        y: Labels handed to the embedding.
        texts: Passed through unchanged as the last constructor argument of
            ``EmbeddingGensimSkipGram``.

    Returns:
        The weights obtained from the embedding; they are also stored via
        ``save_model_mat`` under the name ``'embedding_skip_gram'``.
    """
    tok = load_tokenizer()
    max_len = load_pickle('max_tweet_word_count.pickle')

    print('learning skip_gram...')
    # The third positional argument is passed as None, exactly as before.
    embedding = EmbeddingGensimSkipGram(tok, x, None, max_len,
                                        tok.num_words, y, texts)
    embedding.create_embedding()

    learned = embedding.get_weights()
    save_model_mat(learned, 'embedding_skip_gram')
    return learned
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import (
    load_csv,
    encode_embed_docs,
    save_pickle,
    load_pickle,
    save_csv,
)
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer


def _export_small_training_set():
    """Create and store the reduced ("small") training set.

    Loads the ``'learn'`` tokenizer, the tweets, the labels and the stored
    maximum tweet length, keeps 0.5 % of the rows, encodes the kept tweets
    and writes the encodings, the labels and the kept tweets back to disk.
    """
    print('loading saved state')
    tok = load_tokenizer('learn')
    all_tweets = load_csv('learn/tweets.csv')
    all_labels = load_csv('learn/lables.csv')
    max_len = load_pickle('tokenized/learn/max_tweet_word_count.pickle')

    print('reducing learning data')
    # test_size=0.995 discards 99.5 % of the rows; the fixed seed keeps the
    # selection reproducible. Original author's note: "16k are more than
    # enough".
    kept_tweets, _, kept_labels, _ = train_test_split(
        all_tweets, all_labels, test_size=0.995, random_state=4)

    print('encoding data')
    padded, unpadded = encode_embed_docs(kept_tweets, tok, max_len)

    print('saving')
    save_pickle(padded, 'tokenized/learn/small_padded_x.pickle')
    save_pickle(unpadded, 'tokenized/learn/small_unpadded_x.pickle')
    save_pickle(kept_labels, 'tokenized/learn/small_y.pickle')
    save_csv('learn/tweets_learn.csv', kept_tweets)


if __name__ == '__main__':
    _export_small_training_set()
def train_model_convolutional(x, y, embedding_mats):
    """Train, evaluate and save the convolutional sentiment classifier.

    The input is fed through one frozen ``Embedding`` layer per entry of
    ``embedding_mats``. The embedding outputs are concatenated and passed
    through a stack of 1-D convolutions, pooling and dropout into a softmax
    layer. After training, the accuracy on a held-out 20 % split is printed
    and the model is written to ``'sentiment_conv_ep100.h5'``.

    Args:
        x: Padded, encoded tweets used as network input (presumably padded to
            the stored ``max_tweet_word_count`` -- verify against the caller).
        y: Class labels; one-hot encoded with ``np_utils.to_categorical``.
        embedding_mats: Iterable of embedding weight lists, one per embedding
            layer. ``mat[0].shape[1]`` is used as the embedding dimension and
            ``mat`` itself is passed as the layer's ``weights``.

    Returns:
        None.
    """
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
    vocab_size = load_tokenizer().num_words

    enc_y = np_utils.to_categorical(y)
    x_train, x_test, y_train, y_test = train_test_split(
        x, enc_y, test_size=0.2, random_state=5)

    # Named `inputs` (not `input`) so the builtin is not shadowed.
    inputs = Input(shape=(max_tweet_word_count,))
    embedded = [
        Embedding(vocab_size, mat[0].shape[1],
                  input_length=max_tweet_word_count,
                  weights=mat, trainable=False)(inputs)
        for mat in embedding_mats
    ]
    combined = concatenate(embedded)

    # Author's design notes: conv -> pooling -> flatten -> dense, possibly
    # with a residual convolution and an intermediate Dense(10, relu) layer.
    z = Conv1D(100, 5, activation='relu')(combined)
    z = Conv1D(100, 5, activation='relu')(z)
    z = MaxPooling1D()(z)
    z = Conv1D(160, 5, activation='relu')(z)
    z = Conv1D(160, 5, activation='relu')(z)
    z = GlobalMaxPooling1D()(z)
    z = Dropout(0.5)(z)

    # One output unit per one-hot encoded class.
    num_classes = len(y_test[0])
    z = Dense(num_classes, activation="softmax")(z)

    # Author's note: roughly 20 million parameters in total.
    model = Model(inputs=[inputs], outputs=z)

    # Ideas noted for later experiments: optimizer (adam vs. rmsprop) and
    # activation function (leaky ReLU, ELU).
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['acc'])

    # summary() prints by itself and returns None, so it must not be wrapped
    # in print() (that would emit a stray "None" line).
    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss')
    # TODO: use the History object returned by fit().
    model.fit([x_train], y_train, epochs=50, verbose=1, batch_size=8000,
              validation_split=0.1, callbacks=[early_stopping])

    _loss, accuracy = model.evaluate([x_test], y_test, verbose=1,
                                     batch_size=8000)
    print('Accuracy: %f' % (accuracy * 100))
    # Accuracies recorded by the original author:
    #   small data set: 93.041664
    #   all data:       79.458750

    # NOTE(review): the file name says "ep100" although training is capped at
    # 50 epochs; the name is kept because other code may load this path.
    model.save('sentiment_conv_ep100.h5')
    print('finished')