Example #1
import numpy
from typing import Dict, List


# Helper functions (load_pickle, load_tokenizer, encode_embed_docs,
# save_pickle, save_tokenizer, tokenize) are bix.twitter helpers
# (cf. the imports in Example #6).
def tokenize_cleaned_tweets(tweets: Dict[str, List[str]],
                            create_tokenizer=False):
    if not create_tokenizer:
        # Reuse the previously fitted tokenizer and the stored padding length.
        max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
        tok = load_tokenizer()
        ret = {}
        for hashtag, hashtag_tweets in tweets.items():  # don't shadow the `tweets` dict
            padded_x, _ = encode_embed_docs(hashtag_tweets,
                                            tok,
                                            max_tweets=max_tweet_word_count)
            ret[hashtag] = padded_x
            dest = 'hashtag_' + hashtag
            numpy.save(dest, padded_x)
        return ret
    else:
        # Fit a fresh tokenizer on all tweets (flattened across hashtags),
        # persist it together with the padding length, then re-run in load mode.
        all_tweets = tweets.values()
        flat_list = [item for sublist in all_tweets for item in sublist]
        tok = tokenize(flat_list, verbose=False)
        temp_padded_x, _ = encode_embed_docs(flat_list, tok)
        max_tweet_word_count = len(temp_padded_x[0])
        save_pickle(max_tweet_word_count, 'max_tweet_word_count.pickle')
        save_tokenizer(tok)

        return tokenize_cleaned_tweets(tweets)
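For orientation, here is a minimal, self-contained sketch of the encode-and-pad step that encode_embed_docs presumably performs (its source is not shown in these examples, so the exact behaviour is an assumption); it uses the standard Keras Tokenizer and pad_sequences on a few made-up tweets.

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

toy_tweets = ['good morning world', 'so happy today', 'bad bad day']

tok = Tokenizer(num_words=10000)         # roughly what tokenize() would build
tok.fit_on_texts(toy_tweets)

unpadded_x = tok.texts_to_sequences(toy_tweets)        # lists of word indices
max_tweet_word_count = max(len(s) for s in unpadded_x)
padded_x = pad_sequences(unpadded_x, maxlen=max_tweet_word_count)

print(padded_x.shape)                    # (3, max_tweet_word_count)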
Example #2
File: utils.py Project: fmschleif/bix
def load_training_sentiment_data_small():
    # Load the reduced ("small") tokenized training set written by the
    # encoding script in Example #6.
    t = load_tokenizer('learn')
    y = load_pickle('tokenized/learn/small_y.pickle')
    padded_x = load_pickle('tokenized/learn/small_padded_x.pickle')
    unpadded_x = load_pickle('tokenized/learn/small_unpadded_x.pickle')
    max_tweet_word_count = load_pickle(
        'tokenized/learn/max_tweet_word_count.pickle')
    vocab_size = load_pickle('tokenized/learn/vocab_size.pickle')
    return t, y, padded_x, unpadded_x, max_tweet_word_count, vocab_size
Example #3
File: utils.py Project: fmschleif/bix
def load_training_sentiment_data():
    # Load the full tokenized training set; note that vocab_size is taken
    # from the tokenizer itself rather than from a pickle.
    t = load_tokenizer()
    y = load_csv('tokenized/learn/lables.csv')
    padded_x = load_pickle('tokenized/learn/padded_x.pickle')
    unpadded_x = load_pickle('tokenized/learn/unpadded_x.pickle')
    max_tweet_word_count = load_pickle(
        'tokenized/learn/max_tweet_word_count.pickle')
    vocab_size = t.num_words
    return t, y, padded_x, unpadded_x, max_tweet_word_count, vocab_size
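A hedged usage sketch for the two loaders above, assuming they live in bix.twitter.base.utils like the other helpers imported in Example #6 (that module path is an assumption here).

from bix.twitter.base.utils import load_training_sentiment_data_small

t, y, padded_x, unpadded_x, max_tweet_word_count, vocab_size = \
    load_training_sentiment_data_small()

print(len(padded_x), len(y))          # one label per encoded tweet
print(max_tweet_word_count)           # padding length shared by all tweets
print(vocab_size, t.num_words)        # vocabulary size used by the embeddings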
Example #4
def learn_embedding_word(x, y):
    # Train the EmbeddingWord model on the padded tweets and persist the
    # learned embedding matrix via save_model_mat.
    padded_x = x
    tokenizer = load_tokenizer()
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')

    print('learning word...')
    e = EmbeddingWord(tokenizer, padded_x, None, max_tweet_word_count,
                      tokenizer.num_words, y)
    e.create_embedding()
    weights = e.get_weights()
    save_model_mat(weights, 'embedding_word')
    return weights
Example #5
def learn_embedding_skip_gram(x, y, texts):
    # Train the gensim-based skip-gram embedding on the raw texts and persist
    # the learned weight matrix via save_model_mat.
    padded_x = x
    tokenizer = load_tokenizer()
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')

    print('learning skip_gram...')
    e = EmbeddingGensimSkipGram(tokenizer, padded_x, None,
                                max_tweet_word_count, tokenizer.num_words, y,
                                texts)
    e.create_embedding()
    weights = e.get_weights()
    save_model_mat(weights, 'embedding_skip_gram')
    return weights
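Examples #4 and #5 delegate the actual training to EmbeddingWord and EmbeddingGensimSkipGram, whose internals are not shown here. The hedged sketch below illustrates the general pattern such a skip-gram class likely follows: train a gensim Word2Vec model in skip-gram mode and assemble a weight matrix aligned with a Keras tokenizer's word_index (the toy texts and word_index values are hypothetical).

import numpy as np
from gensim.models import Word2Vec

texts = [['good', 'morning', 'world'], ['so', 'happy', 'today']]
dim = 100

# sg=1 selects the skip-gram objective (gensim >= 4 uses vector_size;
# older gensim versions call this parameter `size`)
w2v = Word2Vec(sentences=texts, vector_size=dim, sg=1, min_count=1)

# word_index as produced by a fitted Keras Tokenizer (hypothetical values)
word_index = {'good': 1, 'morning': 2, 'world': 3,
              'so': 4, 'happy': 5, 'today': 6}

weights = np.zeros((len(word_index) + 1, dim))   # row 0 is left as the padding vector
for word, idx in word_index.items():
    if word in w2v.wv:
        weights[idx] = w2v.wv[word]

# `weights` can then be passed to a Keras Embedding layer as weights=[weights],
# which matches how embedding_mats is consumed in Example #7.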
Example #6
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import load_csv, encode_embed_docs, save_pickle, load_pickle, save_csv
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer

if __name__ == '__main__':
    print('loading saved state')

    tokenizer = load_tokenizer('learn')
    x = load_csv('learn/tweets.csv')
    y = load_csv('learn/lables.csv')
    max_tweet_word_count = load_pickle(
        'tokenized/learn/max_tweet_word_count.pickle')

    print('reducing learning data')
    x_learn, _, y_learn, _ = train_test_split(
        x, y, test_size=0.995, random_state=4)  # 16k are more than enough

    print('encoding data')

    padded_x, unpadded_x = encode_embed_docs(x_learn, tokenizer,
                                             max_tweet_word_count)

    print('saving')

    save_pickle(padded_x, 'tokenized/learn/small_padded_x.pickle')
    save_pickle(unpadded_x, 'tokenized/learn/small_unpadded_x.pickle')
    save_pickle(y_learn, 'tokenized/learn/small_y.pickle')
    save_csv('learn/tweets_learn.csv', x_learn)
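A quick sketch of the reduction step above: with test_size=0.995 only 0.5 % of the rows end up in the returned "learn" split, and the fixed random_state makes that subset reproducible (the data here are stand-ins, not project data).

from sklearn.model_selection import train_test_split

x = list(range(10000))               # stand-ins for 10 000 tweets
y = [i % 2 for i in x]               # stand-in labels

x_learn, _, y_learn, _ = train_test_split(x, y, test_size=0.995,
                                           random_state=4)
print(len(x_learn))                  # 50 -> 0.5 % of the rows are kept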
Example #7
# Imports assumed for this snippet (standalone Keras 2.x API; bix helper
# paths as in Example #6):
from keras.callbacks import EarlyStopping
from keras.layers import (Conv1D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D, Input, MaxPooling1D,
                          concatenate)
from keras.models import Model
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import load_pickle
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer


def train_model_convolutional(x, y, embedding_mats):
    #    print(device_lib.list_local_devices())
    #    exit(0)
    padded_x = x

    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
    tok = load_tokenizer()
    vocab_size = tok.num_words

    #if 'test_all_data' in args:
    #    padded_x = load_pickle('tokenized/learn/padded_x.pickle')
    #    y = load_csv('learn/lables.csv')

    enc_y = np_utils.to_categorical(y)

    x_train, x_test, y_train, y_test = train_test_split(padded_x,
                                                        enc_y,
                                                        test_size=0.2,
                                                        random_state=5)

    #model_mat_word = load_model_mat('embedding_word')
    #print(f'model_mat_word.shape: {model_mat_word[0].shape}')
    #model_mat_glove = load_model_mat('embedding_glove')
    #print(f'model_mat_glove.shape: {model_mat_glove[0].shape}')
    #model_mat_skip_gram = load_model_mat('embedding_skip_gram')
    #print(f'model_mat_skip_gram.shape: {model_mat_skip_gram[0].shape}')
    # model_mat_skip_gram = load_model_mat('embedding_skip_gram')

    # vocab_size = len(tok.word_index) + 1

    # model = Sequential()
    # model.add(Embedding(vocab_size, model_mat_word[0].shape[1], input_length=max_tweet_word_count,
    #                    weights=model_mat_word))
    # model.add(Flatten())

    # model.add(Dense(1, activation='sigmoid'))

    # one shared input of word indices, fed through every (frozen) embedding
    inp = Input(shape=(max_tweet_word_count, ))  # renamed from `input` to avoid shadowing the builtin
    xs = []
    for mat in embedding_mats:
        xs.append(
            Embedding(vocab_size,
                      mat[0].shape[1],
                      input_length=max_tweet_word_count,
                      weights=mat,
                      trainable=False)(inp))

    combined = concatenate(xs)
    z = Conv1D(100, 5, activation='relu')(combined)
    z = Conv1D(100, 5, activation='relu')(z)
    z = MaxPooling1D()(z)

    z = Conv1D(160, 5, activation='relu')(z)
    z = Conv1D(160, 5, activation='relu')(z)
    z = GlobalMaxPooling1D()(z)
    z = Dropout(0.5)(z)

    #f = Flatten()(z)
    # planned layout: conv -> pooling -> flatten -> dense
    # (possibly a residual conv block)
    #z = Dense(10, activation="relu")(z)
    z = Dense(len(y_test[0]), activation="softmax")(z)
    # ca. 20M total params
    model = Model(inputs=[inp], outputs=z)

    # run_opts = tensorflow.RunOptions(report_tensor_allocations_upon_oom=True)
    # compile the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['acc'])  # , options=run_opts)
    # experiment with the optimizer (adam vs rmsprop)
    # experiment with the activation function (leaky_relu, elu)
    # summarize the model
    print(model.summary())
    # fit the model

    es = EarlyStopping(monitor='val_loss')
    model.fit([x_train],
              y_train,
              epochs=50,
              verbose=1,
              batch_size=8000,
              validation_split=0.1,
              callbacks=[es])
    # todo: use return value
    # evaluate the model
    loss, accuracy = model.evaluate([x_test],
                                    y_test,
                                    verbose=1,
                                    batch_size=8000)
    print('Accuracy: %f' % (accuracy * 100))
    # small: Accuracy: 93.041664
    # all data - Accuracy: 79.458750
    # 3 embedding layers:

    model.save('sentiment_conv_ep100.h5')

    print('finished')
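As a follow-up, a hedged sketch of how the saved sentiment_conv_ep100.h5 model could be used for inference. The bix helper import paths are taken from Example #6, treating the result of load_tokenizer() as a Keras Tokenizer (with texts_to_sequences) is an assumption based on the tok.num_words usage above, and the new tweets are made up.

import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

from bix.twitter.base.utils import load_pickle
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer

model = load_model('sentiment_conv_ep100.h5')
tok = load_tokenizer()
max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')

new_tweets = ['what a great day', 'this is terrible']
seqs = tok.texts_to_sequences(new_tweets)
padded = pad_sequences(seqs, maxlen=max_tweet_word_count)

probs = model.predict(padded)        # one softmax row per tweet
print(np.argmax(probs, axis=1))      # predicted class indices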