Example #1
import numpy as np
from keras.models import load_model  # the project may use tensorflow.keras instead


def sentiment_analysis(indata):
    print("Predicting: {0}".format(indata))
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    else:
        bigrams = None  # avoid a NameError below when bigram features are disabled
    batch_size = 500
    model = load_model('best_model.h5')

    test_data = [(1, get_feature_vector(indata, unigrams, bigrams))]
    n_test_batches = int(np.ceil(len(test_data) / float(batch_size)))
    predictions = np.array([])
    for test_set_X, _ in extract_features(test_data,
                                          unigrams,
                                          bigrams,
                                          feat_type=FEAT_TYPE,
                                          batch_size=batch_size,
                                          test_file=True):
        prediction = np.round(model.predict_on_batch(test_set_X).flatten())
        predictions = np.concatenate((predictions, prediction))
    predictions = [(str(j), int(predictions[j]))
                   for j in range(len(test_data))]

    if predictions[0][1] == 0:
        print("Feedback ruim")
        return 0
    else:
        print("Feedback bom")
        return 1
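
A minimal usage sketch for the function above (the input string is hypothetical, and it
assumes the module-level constants, the utils helpers, and 'best_model.h5' are available):

label = sentiment_analysis("the product arrived quickly and works great")
# prints "Feedback bom" ("Good feedback") and returns 1 for positive text, 0 for negative text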
Example #2
                  optimizer='adam', metrics=['accuracy'])
    return model


def evaluate_model(model, val_tweets):
    correct, total = 0, len(val_tweets)
    for val_set_X, val_set_y in extract_features(val_tweets, feat_type=FEAT_TYPE, test_file=False):
        prediction = model.predict_on_batch(val_set_X)
        prediction = np.round(prediction)
        correct += np.sum(prediction == val_set_y[:, None])
    return float(correct) / total


if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    nb_epochs = 20
    batch_size = 500
    model = build_model()
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    best_val_acc = 0.0
Example #3
                tweets.append(feature_vector)
                labels.append(int(sentiment))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets, np.array(labels)


if __name__ == '__main__':
    train = len(sys.argv) == 1
    np.random.seed(1337)
    vocab_size = 90000
    batch_size = 500
    max_length = 40
    filters = 600
    kernel_size = 3
    vocab = utils.top_n_words(FREQ_DIST_FILE, vocab_size, shift=1)
    glove_vectors = get_glove_vectors(vocab)
    tweets, labels = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    # Create an embedding matrix (dim is the GloVe embedding dimension, defined earlier in the script)
    embedding_matrix = np.random.randn(vocab_size + 1, dim) * 0.01
    # Seed it with GloVe vectors
    for word, i in vocab.items():
        glove_vector = glove_vectors.get(word)
        if glove_vector is not None:
            embedding_matrix[i] = glove_vector
    tweets = pad_sequences(tweets, maxlen=max_length, padding='post')
    shuffled_indices = np.random.permutation(tweets.shape[0])
    tweets = tweets[shuffled_indices]
    labels = labels[shuffled_indices]
    if train:
        model = Sequential()
Example #4
    for val_set_X, val_set_y in extractFeatures(val_tweets,
                                                500,
                                                feat_type=FeatType,
                                                test_file=False):
        prediction = model.predict_on_batch(val_set_X)
        prediction = np.round(prediction)
        # numpy rounds halves to the nearest even value ("round half to even"); this is
        # useful because it does not introduce bias. In finance, where it is sometimes
        # called "bankers' rounding", this property is especially important.
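        # Illustrative examples of this behaviour (not part of the original code):
        #   np.round(0.5) -> 0.0, np.round(1.5) -> 2.0, np.round(2.5) -> 2.0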
        correct += np.sum(prediction == val_set_y[:, None])
    return float(correct) / total


if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FreqDistFile, UnigramSize)
    if UseBigrams:
        bigrams = utils.top_n_bigrams(BiFreqDistFile, BigramSize)
    tweets = process_tweets(TrainProcessedFile, test_file=False)
    #if TRAIN:
    train_tweets, val_tweets = utils.split_data(tweets)  #validation
    # else:
    #     random.shuffle(tweets)
    #     train_tweets=tweets
    del tweets
    print('Extracting Features and Training batches')
    nb_epochs = 20
    batch_size = 500

    model = build_model()
    n_train_batches = int(np.ceil(len(train_tweets) /
Example #5
    for i in range(len(wordsList)):
        wordsList[i] = ' '.join(wordsList[i])

    print(wordsList[:3])

    # FUNCTION CALL TO SAVE THE PROCESSED DATA INTO CSV FILES
    save_processed_file(wordsList[:31963],
                        './datasets/TRAIN_PROCESSED_FILE.csv',
                        './datasets/train_tweets.csv', False)
    save_processed_file(wordsList[31962:],
                        './datasets/TEST_PROCESSED_FILE.csv',
                        './datasets/test_tweets.csv', True)

    np.random.seed(1337)
    unigrams = utils.top_n_words('./datasets/train_tweets-freqdist.pkl',
                                 UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(
            './datasets/train_tweets-freqdist-bi.pkl', BIGRAM_SIZE)

    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    # TREE CONSTRUCTION
    print('Extracting features & training batches')
    clf = DecisionTreeClassifier(max_depth=25)
    batch_size = len(train_tweets)