# Reduce the vocabulary to the num_frequent_words most frequent training words
word_freqs = x_train.sum(axis=0)
word_freqs = np.squeeze(np.asarray(word_freqs))
freq_idx = np.argsort(word_freqs)[::-1]
freq_idx = freq_idx[:num_frequent_words]
x_train = x_train[:, freq_idx]
train_vocab = [train_vocab[i] for i in freq_idx]
print("Vocabulary Size (Reduced): {}".format(len(train_vocab)))

# Construct reverse lookup vocabulary (word -> index)
reverse_vocab = {w: i for i, w in enumerate(train_vocab)}

# Process Google News word2vec file (in a memory-friendly way) and store relevant embeddings.
print("Loading pre-trained embeddings from {}...".format(embedding_file))
embeddings = data.load_word2vec(embedding_file, reverse_vocab, embedding_dim,
                                tf_VP=False)

# Process test data using the reduced train vocabulary
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    vocabulary=train_vocab)
x_test = vectorizer.fit_transform(x_test)

# Normalize data (L1 norm over each document)
x_train = x_train.astype(np.float64)
x_train = sklearn.preprocessing.normalize(x_train, axis=1, norm="l1")
x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float64)
x_test = sklearn.preprocessing.normalize(x_test, axis=1, norm="l1")
x_test = x_test.astype(np.float32)
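# --------------------------------------------------------------------------
# Illustrative sketch only (not the project's implementation): data.load_word2vec
# above is assumed to stream the binary GoogleNews word2vec file and keep only
# the vectors for words in the given vocabulary, which is what keeps memory usage
# low. The tf_VP flag presumably distinguishes a plain word -> index dict from a
# tf VocabularyProcessor vocabulary. A minimal version of the dict case could
# look like the helper below; the real helper's signature and behavior may differ.
def _load_word2vec_sketch(filename, vocab, embedding_dim):
    # Words not found in the file keep a uniform init in [-0.25, 0.25] so their
    # variance is roughly comparable to the pre-trained vectors.
    embeddings = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    with open(filename, "rb") as f:
        vocab_size, vector_size = map(int, f.readline().split())
        binary_len = np.dtype(np.float32).itemsize * vector_size
        for _ in range(vocab_size):
            # Read characters until the space separating the word from its vector
            chars = []
            while True:
                ch = f.read(1)
                if ch == b" ":
                    break
                if ch != b"\n":
                    chars.append(ch)
            word = b"".join(chars).decode("utf-8", errors="ignore")
            vector = np.frombuffer(f.read(binary_len), dtype=np.float32)
            if word in vocab:
                embeddings[vocab[word]] = vector
    return embeddings
# --------------------------------------------------------------------------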
                                     maxlen=seq_len)
x_train = train.data.astype(np.int32)
x_test = test.data.astype(np.int32)
y_train = train.labels
y_test = test.labels

# Correct sequence length if seq_len was originally None
seq_len = x_train.shape[1]

# Construct reverse lookup vocabulary
reverse_vocab = {w: i for i, w in enumerate(train.vocab)}

# Process Google News word2vec file (in a memory-friendly way) and store relevant embeddings
print("Loading pre-trained embeddings from {}...".format(embedding_file))
embeddings = data.load_word2vec(embedding_file, reverse_vocab, embedding_dim)

# Print information about the dataset
utils.print_data_info(train, x_train, x_test, y_train, y_test)

# To print for results.csv
data_str = "{{format: 'word2ind', vocab_size: {}, seq_len: {}}}".format(
    len(train.vocab), seq_len)

# Training
# ==================================================
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                  log_device_placement=log_device_placement)
    sess = tf.Session(config=session_conf)
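# --------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not the project's code) of the
# "word2ind" format referenced in data_str above: each document becomes a
# fixed-length row of vocabulary indices, truncated or padded to seq_len,
# matching the int32 x_train / x_test arrays used above.
def _to_word2ind_sketch(tokenized_docs, word_to_index, seq_len, pad_idx=0):
    out = np.full((len(tokenized_docs), seq_len), pad_idx, dtype=np.int32)
    for row, tokens in enumerate(tokenized_docs):
        idxs = [word_to_index[t] for t in tokens if t in word_to_index]
        idxs = idxs[:seq_len]  # truncate documents longer than seq_len
        out[row, :len(idxs)] = idxs
    return out
# --------------------------------------------------------------------------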
                                    test_sample_index:]
del x, y, x_shuffled, y_shuffled  # don't need these anymore

print("Vocabulary Size: {}".format(len(vocab_processor.vocabulary_)))
print("Train/Test Split: {}/{}".format(len(y_train), len(y_test)))

# Initialize embedding matrix from pre-trained word2vec embeddings. 0.25 is chosen so that
# unknown vectors have (approximately) the same variance as pre-trained ones.
embeddings = np.random.uniform(
    -0.25, 0.25, (len(vocab_processor.vocabulary_), embedding_dim))

# Process Google News word2vec file (in a memory-friendly way) and store relevant embeddings.
print("Loading pre-trained embeddings from {}...".format(embedding_file))
embeddings = data.load_word2vec(embedding_file, vocab_processor.vocabulary_,
                                embedding_dim, tf_VP=True)

# Embed the data with the extracted embeddings (mean of word vectors per document)
x_train = np.array([
    np.mean([embeddings[idx] for idx in sentence], axis=0)
    for sentence in x_train
])
x_test = np.array([
    np.mean([embeddings[idx] for idx in sentence], axis=0)
    for sentence in x_test
])

# Transform targets from one-hot arrays to class labels
y_train = np.argmax(y_train, 1)
y_test = np.argmax(y_test, 1)
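# --------------------------------------------------------------------------
# Note (assumption): if vocab_processor zero-pads shorter documents, the mean
# above also averages in the padding token's vector. A variant that ignores
# padding (index 0) is sketched below for illustration; whether that is
# preferable depends on how the original experiments were run.
def _mean_embedding_ignore_pad(sequences, embeddings, pad_idx=0):
    vecs = []
    for sentence in sequences:
        idxs = [idx for idx in sentence if idx != pad_idx]
        if not idxs:
            idxs = [pad_idx]  # fall back for documents with no known words
        vecs.append(np.mean([embeddings[idx] for idx in idxs], axis=0))
    return np.array(vecs)
# --------------------------------------------------------------------------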