Esempio n. 1
0
def preprocess_data():
    """Load the StockTwits SPY sentiment file and return encoded data splits.

    The gzipped CSV is read with pandas, every message is cleaned with
    ``utl.preprocess_ST_message``, vocabulary lookup tables are built from
    the whitespace-separated tokens of the cleaned messages, a few length
    statistics are printed, empty messages are dropped, messages and labels
    are encoded, messages are zero-padded with ``seq_len=244`` and the
    result is split with ``split_frac=0.80``.

    :return: ``(train_x, val_x, test_x, train_y, val_y, test_y,
        vocab_to_int)``
    """
    frame = pd.read_csv(
        "data/StockTwits_SPY_Sentiment_2017.gz",
        encoding="utf-8",
        compression="gzip",
        index_col=0,
    )

    # Raw message texts and their sentiment labels.
    raw_messages = frame.message.values
    labels = frame.sentiment.values

    cleaned = np.array([utl.preprocess_ST_message(m) for m in raw_messages])

    # The vocabulary is built from every token of the cleaned corpus.
    lexicon = " ".join(cleaned).split()
    vocab_to_int, _ = utl.create_lookup_tables(lexicon)

    # Length statistics, computed once and reused for all three reports.
    lengths = [len(text) for text in cleaned]
    length_histogram = Counter(lengths)
    print("Zero-length messages: {}".format(length_histogram[0]))
    print("Maximum message length: {}".format(max(length_histogram)))
    print("Average message length: {}".format(np.mean(lengths)))

    cleaned, labels = utl.drop_empty_messages(cleaned, labels)

    encoded = utl.encode_ST_messages(cleaned, vocab_to_int)
    labels = utl.encode_ST_labels(labels)

    padded = utl.zero_pad_messages(encoded, seq_len=244)

    splits = utl.train_val_test_split(padded, labels, split_frac=0.80)
    train_x, val_x, test_x, train_y, val_y, test_y = splits
    return train_x, val_x, test_x, train_y, val_y, test_y, vocab_to_int
Esempio n. 2
0
    def preprocess_data(self):
        """Build the vocabulary tables and convert the training data to arrays.

        Populates ``self.vocab_to_int`` and ``self.int_to_vocab`` from
        ``self.counter``, replaces every entry of ``self.X_train`` in place
        with ``self.string_to_vocab(entry, self.max_sentence_length)`` and
        finally rebinds ``self.X_train`` and ``self.Y_train`` to float NumPy
        arrays.
        """
        self.vocab_to_int, self.int_to_vocab = create_lookup_tables(
            self.counter)

        # Entries are rewritten by index so the existing container is
        # updated in place before it is converted to an array.
        for row in range(len(self.X_train)):
            self.X_train[row] = self.string_to_vocab(self.X_train[row],
                                                     self.max_sentence_length)

        # ``np.float`` was only an alias of the builtin ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so the builtin is
        # used directly (same resulting dtype).
        self.X_train = np.array(self.X_train, dtype=float)
        self.Y_train = np.array(self.Y_train, dtype=float)
def preprocess_data(text):
    """
    Tokenize raw text and map every word to an integer id.

    Each key returned by ``token_lookup()`` is replaced in the text by its
    token surrounded by spaces; the result is lower-cased and split on
    whitespace. The lookup tables are built from those words plus the
    values of ``SPECIAL_WORDS``.

    :param text: raw text
    :return: ``(int_text, vocab_to_int, int_to_vocab, token_dict)``
    """
    token_dict = token_lookup()
    for symbol, replacement in token_dict.items():
        padded_token = ' {} '.format(replacement)
        text = text.replace(symbol, padded_token)

    words = text.lower().split()

    special_words = list(SPECIAL_WORDS.values())
    vocab_to_int, int_to_vocab = create_lookup_tables(words + special_words)

    int_text = [vocab_to_int[token] for token in words]
    return int_text, vocab_to_int, int_to_vocab, token_dict
Esempio n. 4
0
def _test_lookup_tables():
    """Check that ``create_lookup_tables`` returns mutually inverse mappings.

    Builds the tables from a three-sentence toy ``pd.Series`` and asserts
    that, for every ``(word, index)`` pair in ``vocab_to_int``,
    ``int_to_vocab[index]`` gives back the same word.
    """
    toy_corpus = pd.Series([
        "this is a toy", "I mean not really a toy", "I mean a toy vocabulary"
    ])
    vocab_to_int, int_to_vocab = create_lookup_tables(toy_corpus)

    # Collect every entry whose round trip through both tables fails.
    mismatches = []
    for word, index in vocab_to_int.items():
        recovered = int_to_vocab[index]
        if recovered != word:
            mismatches.append((word, index, index, recovered))

    # The message is only formatted when the assertion fails, so indexing
    # the first mismatch is safe.
    assert not mismatches, (
        'Found {} missmatche(s). First missmatch: vocab_to_int[{}] = {} and int_to_vocab[{}] = {}'
        .format(len(mismatches), *mismatches[0]))
Esempio n. 5
0
def read_data_from_file(data_path):
    """Read a text corpus and return it as a list of word ids.

    Calls ``maybe_download()``, reads ``data_path``, preprocesses the text
    with ``utils.preprocess`` and maps the resulting words to integers via
    ``utils.create_lookup_tables``. A subsampled word list is also computed
    and reported, but the full list is what gets returned.

    :param data_path: path of the text file to read
    :return: ``(int_words, int_to_vocab, vocab_to_int, n_vocab)``
    """
    maybe_download()
    with open(data_path) as corpus_file:
        text = corpus_file.read()

    # --- Preprocessing ---------------------------------------------------
    # Per the original notes, utils.preprocess tokenizes punctuation
    # (e.g. period -> <PERIOD>) and removes words that show up five times
    # or fewer.
    words = utils.preprocess(text)

    print('First 30 words:', words[:30])
    print('Total words:', len(words))
    print('Total unique words:', len(set(words)))

    # Two dictionaries: word -> id and id -> word.
    vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
    n_vocab = len(int_to_vocab)

    int_words = [vocab_to_int[token] for token in words]

    # --- Subsampling -----------------------------------------------------
    # Each word is discarded with probability
    #     P(w) = 1 - sqrt(threshold / frequency(w))
    # so that very frequent words are thinned out.
    each_word_count = Counter(int_words)
    total_count = len(int_words)
    threshold = 1e-5  # FLAGS.drop_word_threshold

    drop_prob = {}
    for word, count in each_word_count.items():
        frequency = count / total_count
        drop_prob[word] = 1 - np.sqrt(threshold / frequency)

    train_words = []
    for word in int_words:
        if random.random() < (1 - drop_prob[word]):
            train_words.append(word)

    print('After subsampling, first 30 words:', train_words[:30])
    print('After subsampling, total words:', len(train_words))

    # The subsampled list is intentionally NOT returned: per the original
    # author, subsampling hurts because it eliminates contextual info.
    return int_words, int_to_vocab, vocab_to_int, n_vocab
Esempio n. 6
0
def read_data_from_file(data_path: str) -> tuple:
    """
    Read the corpus at ``data_path`` and return the subsampled training words.

    :param data_path: path of the text file to read
    :return: ``(train_words, int_to_vocab, vocab_to_int, n_vocab)`` where
        ``train_words`` is the list of word ids that survived subsampling
    """
    maybe_download()
    with open(data_path) as handle:
        text = handle.read()

    # Per the original notes, utils.preprocess replaces special punctuation
    # in the text with designated tokens.
    words = utils.preprocess(text)
    print('First 30 words:', words[:30])
    print('Total words:', len(words))
    print('Total unique words:', len(set(words)))

    # Build the word -> id and id -> word dictionaries. (Original note:
    # words are ordered by descending frequency and low-frequency words,
    # count < 5, are filtered out; that logic is not visible here.)
    vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
    n_vocab = len(int_to_vocab)

    # Turn the word sequence into the corresponding id sequence.
    int_words = [vocab_to_int[token] for token in words]

    # --- Subsampling -----------------------------------------------------
    # Each word is discarded with probability
    #     P(w) = 1 - sqrt(threshold / frequency(w))
    each_word_count = Counter(int_words)
    total_count = len(int_words)
    threshold = FLAGS.drop_word_threshold

    def drop_probability(count):
        # Relative frequency of the word, then the discard probability.
        frequency = count / total_count
        return 1 - np.sqrt(threshold / frequency)

    prob_s = {
        word: drop_probability(count)
        for word, count in each_word_count.items()
    }

    train_words = []
    for word in int_words:
        if random.random() < (1 - prob_s[word]):
            train_words.append(word)

    print('After subsampling, first 30 words:', train_words[:30])
    print('After subsampling, total words:', len(train_words))

    return train_words, int_to_vocab, vocab_to_int, n_vocab
def create_neural_network():
    """Prepare the data, then build, compile and fit the ``RNN()`` model.

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab`` from
    ``create_lookup_tables(counter)``, calls ``preprocess_data()``, prints
    the shapes of the train/test arrays and the vocabulary size, and trains
    the model with binary cross-entropy, RMSprop and early stopping on the
    validation loss.
    """
    global vocab_to_int, int_to_vocab, counter
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()

    def report_shape(label, array):
        # Small local helper so the four shape reports read uniformly.
        print(label, array.shape)

    report_shape('X_train', X_train)
    report_shape('Y_train', Y_train)
    report_shape('X_test', X_test)
    report_shape('Y_test', Y_test)
    print('size of vocabulary', len(vocab_to_int))

    model = RNN()
    model.summary()

    optimizer = RMSprop()
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )

    # 20% of the training data is held out for validation; training stops
    # early based on the monitored validation loss.
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001)
    model.fit(
        X_train,
        Y_train,
        batch_size=128,
        epochs=10,
        validation_split=0.2,
        callbacks=[early_stopping],
    )
Esempio n. 8
0
def preprocess(text):
    """Convert raw text to subsampled word ids plus the lookup data.

    The text is split into words by ``utils.preprocess``, mapped to
    integers through ``utils.create_lookup_tables`` and then subsampled:
    every word is dropped with probability
    ``1 - sqrt(threshold / frequency(word))`` with ``threshold = 1e-5``.

    :param text: raw text
    :return: dict with keys ``'train_words'``, ``'vocab_to_int'``,
        ``'int_to_vocab'`` and ``'freqs'``
    """
    words = utils.preprocess(text)

    vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
    int_words = [vocab_to_int[token] for token in words]

    # Subsampling
    threshold = 1e-5
    word_counts = Counter(int_words)
    total_count = len(int_words)

    # Relative frequency and drop probability of every distinct word id.
    freqs = {}
    p_drop = {}
    for word, count in word_counts.items():
        freqs[word] = count / total_count
        p_drop[word] = 1 - np.sqrt(threshold / freqs[word])

    # Keep a word only when the random draw falls below its keep probability.
    train_words = []
    for word in int_words:
        if random.random() < (1 - p_drop[word]):
            train_words.append(word)

    return {
        'train_words': train_words,
        'vocab_to_int': vocab_to_int,
        'int_to_vocab': int_to_vocab,
        'freqs': freqs,
    }
Esempio n. 9
0
    df["clean_tweet"] = tweets  # Get cleaned tweets
    df["word_count"] = df.clean_tweet.apply(
        lambda x: len(x.split()))  # Get their word count
    # Remove outliers
    old_tweet = df.loc[df.word_count == df.word_count.max(), ].tweet.values[0]
    new_tweet = old_tweet[:old_tweet.find("\r")]
    df.loc[df.word_count == df.word_count.max(), "tweet"] = new_tweet
    df.loc[df.word_count == df.word_count.max(),
           "clean_tweet"] = preprocess(new_tweet)
    df.loc[df.word_count == df.word_count.max(),
           "word_count"] = len(preprocess(new_tweet).split())

    print("Testing create lookup table function...\n")
    _test_lookup_tables()

    vocab_to_int, int_to_vocab = create_lookup_tables(tweets)

    print("Testing padding function...\n")
    _test_pad_tweets()

    MAX_LENGTH = df.word_count.max()
    pad_tweets = create_pad_fn(MAX_LENGTH)
    df["padded_tweets"] = df.clean_tweet.map(pad_tweets)

    print("Testing hate classification function...\n")
    _test_hate_classification()

    print("Testing change hate labels function...\n")
    _test_hate_labels(tweets, raw_labels)

    tweets_ints = np.array([[vocab_to_int[word] for word in tweet.split()]
Esempio n. 10
0
def word_mapping(words):
    """Build the lookup tables for ``words`` and encode the words as ids.

    :param words: sequence of words passed to ``utils.create_lookup_tables``
    :return: ``(vocab_to_int, int_to_vocab, int_words)`` where ``int_words``
        holds ``vocab_to_int[word]`` for every word, in the original order
    """
    lookup_tables = utils.create_lookup_tables(words)
    vocab_to_int, int_to_vocab = lookup_tables

    int_words = []
    for word in words:
        int_words.append(vocab_to_int[word])

    return vocab_to_int, int_to_vocab, int_words
                    pbar.hook)

# Extract the dataset archive only when the target folder does not exist yet.
if not isdir(dataset_folder_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(dataset_folder_path)  # unzip

import os
with open(os.path.join(dataset_folder_path, 'text8')) as f:
    text = f.read()

# `words` is the sequence of all words in the text.
words = utils.preprocess(text)  # original note: lowercases, replaces symbols, drops low-frequency words
print(words[:30])
print("Total words: {}".format(len(words)))  # 16680599 in the original author's run
print("Unique words: {}".format(len(set(words))))  # 63641 in the original author's run
vocab_to_int, int_to_vocab = utils.create_lookup_tables(
    words)  # word -> index, index -> word
int_words = [vocab_to_int[word] for word in words]

from collections import Counter
import random

# Compute each word's drop probability, which grows with its frequency:
#     p_drop(w) = 1 - sqrt(threshold / freq(w))
# Note: words are dropped over the whole text, not per context window.
threshold = 1e-5
word_counts = Counter(int_words)
total_count = len(int_words)
freqs = {word: count / total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold / freqs[word]) for word in word_counts}
# Keep a word when the random draw falls below its keep probability.
train_words = [
    word for word in int_words if random.random() < (1 - p_drop[word])
]
Esempio n. 12
0
import pickle


# Load the text (decoded as UTF-8, undecodable bytes replaced) and convert
# it to lowercase.
# NOTE(review): the handle returned by codecs.open is never closed
# explicitly; consider a `with` block.
filename = "wonderland.txt"
raw_text = codecs.open(filename, encoding = "utf8", errors ='replace').read()
raw_text = raw_text.lower()
# print(raw_text)

# Split the text into words and sentences, then map the unique words to
# integers.
words, sentences = utils.preprocess(raw_text)

# Sorted list of the distinct words.
unique_words = sorted(list(set(words)))

# print(words)
word_to_int, int_to_word = utils.create_lookup_tables(unique_words)
# print(words_to_int)
# print(int_to_words)


# Vocabulary size = number of distinct words.
n_vocab = len(unique_words)
# print ("Total Vocab: ", n_vocab)

# Prepare the dataset of input-to-output pairs encoded as integers.
seq_length = 3
dataX = []
dataY = []

for sentence in sentences:
	sentence_words = sentence.split()
	if(len(sentence_words)>seq_length):
Esempio n. 13
0
def create_neural_network():
    """Build a TF1 LSTM graph over one-hot style input and run a training loop.

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab``, calls
    ``preprocess_data()``, prints data shapes, builds placeholders, an
    ``LSTMCell`` driven by ``tf.nn.dynamic_rnn``, output/loss/optimizer ops
    via ``build_output`` / ``build_loss`` / ``build_optimizer`` and then
    iterates over batches.

    NOTE(review): this function looks unfinished and cannot run as written;
    see the review notes inline. ``max_sentence_length``, ``batch_size``,
    ``num_steps``, ``learning_rate``, ``grad_clip``, ``epochs``, ``encoded``
    and ``model`` are not defined inside this function, so they must come
    from module scope (not visible here).
    """
    global vocab_to_int, int_to_vocab
    # NOTE(review): `counter` is assigned further down in this function
    # (`counter = 0`) and is not listed in the `global` statement above, so
    # Python treats it as a local name and this read raises
    # UnboundLocalError.
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()
    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)
    print('size of vocabulary', len(vocab_to_int))  # 124188 per the original author's note

    sequence_length = max_sentence_length
    # The "embedding" width is simply the vocabulary size.
    embedding_length = len(vocab_to_int)
    num_classes = 2

    print('sequence_length', sequence_length)
    print('embedding_length', embedding_length)

    # Float input of shape [batch, sequence_length, embedding_length]; this is
    # the tensor that is actually fed to dynamic_rnn below.
    input_data = tf.placeholder(tf.float32,
                                [None, sequence_length, embedding_length])

    # NOTE(review): `inputs` and `keep_prob` are created but not used by any
    # graph op built in this function.
    inputs = tf.placeholder(tf.int32, [batch_size, num_steps], name='inputs')
    targets = tf.placeholder(tf.int32, [batch_size, num_steps], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    hidden_vector_size = 100

    rnn_cell = tf.contrib.rnn.LSTMCell(hidden_vector_size)

    # All-zero tensor of shape [batch, hidden_vector_size]: a zeros tensor
    # shaped like the input is averaged over axis 2 and multiplied with a
    # zeros matrix, so the batch dimension follows the fed input.
    initial_zero_h = tf.matmul(tf.reduce_mean(tf.zeros_like(input_data), 2),
                               tf.zeros([sequence_length, hidden_vector_size]))

    # The same zero tensor is used for both elements of the LSTM state tuple.
    initial_state = tf.contrib.rnn.LSTMStateTuple(initial_zero_h,
                                                  initial_zero_h)

    outputs, state = tf.nn.dynamic_rnn(rnn_cell,
                                       input_data,
                                       initial_state=initial_state,
                                       dtype=tf.float32)

    prediction, logits = build_output(outputs, hidden_vector_size, num_classes)

    loss = build_loss(logits, targets, hidden_vector_size, num_classes)
    optimizer = build_optimizer(loss, learning_rate, grad_clip)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

    # NOTE(review): everything below is dedented out of the `with` block
    # above, so `sess` has already been closed when the loop uses it.
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network
        # NOTE(review): this stores the result of running the variable
        # initializer as the "state" and re-initializes all variables every
        # epoch -- presumably `initial_state` was intended; confirm.
        new_state = sess.run(tf.global_variables_initializer())
        # NOTE(review): rebinds `loss`, which held the loss op built above.
        loss = 0
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            # NOTE(review): `model` is not defined in this function, and
            # `keep_prob` here is the placeholder created above rather than
            # a numeric keep probability.
            feed = {
                input_data: x,
                model.targets: y,
                model.keep_prob: keep_prob,
                model.initial_state: new_state
            }
            batch_loss, new_state, _ = sess.run(
                [model.loss, model.final_state, model.optimizer],
                feed_dict=feed)

            end = time.time()
            print('Epoch: {}/{}... '.format(e + 1, epochs),
                  'Training Step: {}... '.format(counter),
                  'Training loss: {:.4f}... '.format(batch_loss),
                  '{:.4f} sec/batch'.format((end - start)))
Esempio n. 14
0
def create_neural_network():
    """Prepare the data and train a single-layer LSTM classifier (TF1 graph API).

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab``, calls
    ``preprocess_data()``, prints data shapes, builds a ``BasicLSTMCell``
    network with a linear output layer, softmax cross-entropy loss and an
    Adam optimizer, then trains on batches from ``batch_features_labels``.
    Accuracy and loss on the current batch are printed every 10th step.
    """
    global vocab_to_int, int_to_vocab, counter
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()
    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)
    print('size of vocabulary', len(vocab_to_int))

    # Hyper-parameters.
    time_steps = 128  # not referenced by the code below
    num_units = 128  # hidden LSTM units
    n_input = 500  # width of one input row
    learning_rate = 0.001  # learning rate for Adam
    n_classes = 2  # number of output classes
    batch_size = 128  # examples per batch

    tf.reset_default_graph()

    # Output projection.
    # NOTE(review): the weight matrix has n_input rows while the LSTM emits
    # num_units features -- confirm the intended shape.
    out_weights = tf.Variable(tf.random_normal([n_input, n_classes]))
    out_bias = tf.Variable(tf.random_normal([n_classes]))

    x = tf.placeholder("float", [None, n_input])
    y = tf.placeholder("float", [None, n_classes])

    lstm_layer = BasicLSTMCell(num_units, forget_bias=1)
    outputs, _ = rnn.rnn(lstm_layer, x, dtype=tf.float32)
    # Only the last output is projected to class scores.
    prediction = tf.matmul(outputs[-1], out_weights) + out_bias

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=prediction, labels=y)
    loss = tf.reduce_mean(cross_entropy)
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    # Model evaluation: fraction of rows whose arg-max class matches.
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        step = 1
        # The step limit is only checked between full passes over the data,
        # so the pass in progress always runs to completion.
        while step < 800:
            batches = batch_features_labels(X_train, Y_train, batch_size)
            for batch_x, batch_y in batches:
                print('batch_x', batch_x.shape)
                print('batch_y', batch_y.shape)

                feed = {x: batch_x, y: batch_y}
                sess.run(opt, feed_dict=feed)

                if step % 10 == 0:
                    acc = sess.run(accuracy, feed_dict=feed)
                    los = sess.run(loss, feed_dict=feed)
                    print("For iter ", step)
                    print("Accuracy ", acc)
                    print("Loss ", los)
                    print("__________________")

                step += 1
0
def create_neural_network():
    """Prepare the data and train a stacked LSTM with ``tf.nn.dynamic_rnn``.

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab``, calls
    ``preprocess_data()``, prints data shapes, builds the graph through the
    ``build_inputs`` / ``build_lstm`` / ``build_output`` / ``build_loss`` /
    ``build_optimizer`` helpers and trains for ``epochs`` passes over the
    batches from ``batch_features_labels``, printing the loss and timing of
    every step. The LSTM state is carried from one batch to the next within
    an epoch.
    """
    global vocab_to_int, int_to_vocab, counter
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()
    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)
    print('size of vocabulary', len(vocab_to_int))

    # Hyper-parameters.
    epochs = 20
    num_classes = 2
    grad_clip = 5
    batch_size = 10  # Sequences per batch
    num_steps = 500  # Number of sequence steps per batch
    lstm_size = 128  # Size of hidden layers in LSTMs
    num_layers = 2  # Number of LSTM layers
    learning_rate = 0.01  # Learning rate
    dropout_keep = 0.5  # Value fed to the keep_prob placeholder

    tf.reset_default_graph()

    # Input placeholders and the LSTM cell with its initial state.
    inputs, targets, keep_prob = build_inputs(batch_size, num_steps)
    cell, initial_state = build_lstm(lstm_size, num_layers, batch_size,
                                     keep_prob)

    # One-hot encode the input tokens with depth num_classes.
    x_one_hot = tf.one_hot(inputs, num_classes)
    print('inputs', inputs.shape)
    print('num_classes', num_classes)
    print('x_one_hot', x_one_hot.shape)

    # Run each sequence step through the RNN.
    outputs, final_state = tf.nn.dynamic_rnn(cell,
                                             x_one_hot,
                                             initial_state=initial_state)
    print('outputs', outputs.shape)

    # Softmax predictions and logits.
    prediction, logits = build_output(outputs, lstm_size, num_classes)

    # Loss and optimizer (with gradient clipping).
    loss = build_loss(logits, targets, lstm_size, num_classes)
    optimizer = build_optimizer(loss, learning_rate, grad_clip)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # NOTE(review): `counter` is declared global above, so this
        # overwrites the module-level object that was passed to
        # create_lookup_tables with the training-step count.
        counter = 0
        for epoch in range(1, epochs + 1):
            # Every epoch starts again from the initial LSTM state.
            new_state = sess.run(initial_state)
            batches = batch_features_labels(X_train, Y_train, batch_size)
            for x, y in batches:
                print('x', x.shape)
                print('y', y.shape)
                counter += 1
                start = time.time()

                feed = {
                    inputs: x,
                    targets: y,
                    keep_prob: dropout_keep,
                    initial_state: new_state,
                }
                fetches = [loss, final_state, optimizer]
                batch_loss, new_state, _ = sess.run(fetches, feed_dict=feed)

                end = time.time()
                print('Epoch: {}/{}... '.format(epoch, epochs),
                      'Training Step: {}... '.format(counter),
                      'Training loss: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end - start)))
Esempio n. 16
0
# Read the whole corpus into memory as one string.
with open('data/text8') as f:
    text = f.read()                 # all sequential text data

# Preprocessing the data
# Process the raw text (original note: symbols are replaced with text tokens)
# and get back the words as a list, in their original order.
words = utils.preprocess(text)

print("Total words: {}".format(len(words)))             # 16,680,599 in the original author's run
print("Unique words: {}".format(len(set(words))))       # 63,641 in the original author's run


# Build the lookup tables, e.g.
#   vocab_to_int['a'] = 5   -> the index of token 'a'
#   int_to_vocab[5] = 'a'
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)

# Convert the entire corpus to its token-number representation.
int_words = [vocab_to_int[word] for word in words]



# Subsampling
from collections import Counter
import random

# Subsampling threshold, corpus size and per-token occurrence counts.
threshold = 1e-5
number_of_words = len(int_words)
word_counter = Counter(int_words)

# Filled in further down the script (not visible in this excerpt).
frequencies = dict()
Esempio n. 17
0
                   encoding="utf-8",
                   compression="gzip",
                   index_col=0)

# Get messages and sentiment labels as arrays.
messages = data.message.values
labels = data.sentiment.values

# View a sample of messages with their sentiment.
print(data[:10])

# Clean every message with the project's StockTwits preprocessing helper.
messages = np.array(
    [utl.preprocess_ST_message(message) for message in messages])

# Build the vocabulary from every whitespace-separated token of the cleaned
# messages.
full_lexicon = " ".join(messages).split()
vocab_to_int, int_to_vocab = utl.create_lookup_tables(full_lexicon)

# Histogram of message lengths (len() of each cleaned message); the keys are
# the lengths, so max() over the Counter is the largest length seen.
messages_lens = Counter([len(x) for x in messages])
print("Zero-length messages: {}".format(messages_lens[0]))
print("Maximum message length: {}".format(max(messages_lens)))
print("Average message length: {}".format(np.mean([len(x) for x in messages])))

# Drop empty messages, then encode messages and labels.
messages, labels = utl.drop_empty_messages(messages, labels)
messages = utl.encode_ST_messages(messages, vocab_to_int)
labels = utl.encode_ST_labels(labels)

# Zero-pad the encoded messages with seq_len=244.
messages = utl.zero_pad_messages(messages, seq_len=244)

# Split into train / validation / test sets with split_frac=0.80.
train_x, val_x, test_x, train_y, val_y, test_y = utl.train_val_test_split(
    messages, labels, split_frac=0.80)
print("Data Set Size")