import random
import sys

import numpy as np
from sklearn.ensemble import RandomForestClassifier

import utils


# Performs classification using a RandomForest classifier.
if __name__ == '__main__':
    # Port of the external service used by utils.process_tweets, passed
    # as the first command-line argument.
    service_port = int(sys.argv[1])
    np.random.seed(1337)
    utils.init_ngrams()
    tweets = utils.process_tweets(utils.TRAIN_PROCESSED_FILE, service_port,
                                  test_file=False)
    if utils.TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = RandomForestClassifier(n_jobs=2, random_state=0)
    # Batch size equals the full training set, so training runs as one batch.
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in utils.extract_features(
            train_tweets, test_file=False,
    feature_vector.append(vocab.get(words[-1]))
    return feature_vector


if __name__ == '__main__':
    # Train mode when only the service port is given on the command line.
    train = len(sys.argv) == 2
    service_port = int(sys.argv[1])
    np.random.seed(1337)
    vocab_size = 90000
    batch_size = 8 * 500
    max_length = 40
    filters = 600
    kernel_size = 3
    vocab = utils.top_n_words(FREQ_DIST_FILE, vocab_size, shift=1)
    glove_vectors = get_glove_vectors(vocab)
    processed_tweets = utils.process_tweets(
        TRAIN_PROCESSED_FILE, service_port, test_file=False,
        get_feature_vector=get_feature_vector)
    tweets = [processed_tweet[-1] for processed_tweet in processed_tweets]
    labels = np.array([processed_tweet[1] for processed_tweet in processed_tweets])
    # Create an embedding matrix (dim is the GloVe vector dimension,
    # defined earlier in the file)
    embedding_matrix = np.random.randn(vocab_size + 1, dim) * 0.01
    # Seed it with GloVe vectors
    for word, i in vocab.items():
        glove_vector = glove_vectors.get(word)
        if glove_vector is not None:
            embedding_matrix[i] = glove_vector
    tweets = pad_sequences(tweets, maxlen=max_length, padding='post')
    # Shuffle tweets and labels together before training.
    shuffled_indices = np.random.permutation(tweets.shape[0])
    tweets = tweets[shuffled_indices]
    labels = labels[shuffled_indices]
    if train:
        model = Sequential()
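        # A minimal sketch of how the model definition might continue,
        # assuming a standard Keras CNN over the GloVe-seeded embeddings;
        # the layers below are illustrative, not the original code, and
        # Embedding, Conv1D, GlobalMaxPooling1D, and Dense would need to
        # be imported from keras.layers:
        #
        # model.add(Embedding(vocab_size + 1, dim, weights=[embedding_matrix],
        #                     input_length=max_length))
        # model.add(Conv1D(filters, kernel_size, activation='relu'))
        # model.add(GlobalMaxPooling1D())
        # model.add(Dense(1, activation='sigmoid'))
        # model.compile(loss='binary_crossentropy', optimizer='adam',
        #               metrics=['accuracy'])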