import pickle

import numpy as np

from helpers import load_data_and_labels


def prepare_data():
    path = '../outputs/'
    path_data = '../data/'
    with open(path + 'train/vocab.pkl', 'rb') as f:
        vocab_train_dict = pickle.load(f)
    with open(path + 'test/vocab.pkl', 'rb') as f:
        vocab_test_dict = pickle.load(f)
    W = np.load(path + 'train/embeddings.npy')
    W_test = np.load(path + 'test/embeddings.npy')
    train, labels, test = load_data_and_labels(
        path_data + 'train_pos_clean.txt',
        path_data + 'train_neg_clean.txt',
        path_data + 'test_data_clean.txt')

    print("Vectorization of the tweet sets")
    # To be improved: a previous attempt to extract the two loops below
    # into a helper did not work (a hedged sketch follows this function).
    ls = []
    for sent in train:
        ls_temp = []
        for word in sent.split():
            try:
                ls_temp.append(vocab_train_dict[word])
            except KeyError:
                ls_temp.append(0)  # out-of-vocabulary words map to index 0
        ls.append(ls_temp)
    ls_sum = []
    for ls_in in ls:
        sum_vect = np.zeros(W.shape[1])
        for index in ls_in:
            sum_vect += W[index]
        ls_sum.append(sum_vect)
    X_train = ls_sum

    ls = []
    for sent in test:
        ls_temp = []
        for word in sent.split():
            try:
                ls_temp.append(vocab_test_dict[word])
            except KeyError:
                ls_temp.append(0)
        ls.append(ls_temp)
    ls_sum = []
    for ls_in in ls:
        sum_vect = np.zeros(W_test.shape[1])
        for index in ls_in:
            sum_vect += W_test[index]
        ls_sum.append(sum_vect)
    X_test = ls_sum
    print("Sets vectorized")
    return X_train, labels, X_test
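# A minimal sketch of the vectorization loop factored into a helper, as
# referenced above. The name `vectorize` and its signature are hypothetical;
# it reproduces the same sum-of-embeddings behaviour as the inline loops.
def vectorize(sentences, vocab, embeddings):
    """Map each sentence to the sum of its word embeddings (OOV -> index 0)."""
    vectors = []
    for sent in sentences:
        sum_vect = np.zeros(embeddings.shape[1])
        for word in sent.split():
            sum_vect += embeddings[vocab.get(word, 0)]
        vectors.append(sum_vect)
    return vectors

# Hypothetical usage inside prepare_data():
#     X_train = vectorize(train, vocab_train_dict, W)
#     X_test = vectorize(test, vocab_test_dict, W_test)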
import time
import datetime
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.contrib import learn

import helpers
from cnn import cnn
# from utils import *
# import seaborn as sns

data = pd.read_csv("./data/spam.csv", encoding='latin-1')
data = data.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
x_input, y_raw = helpers.load_data_and_labels(data)

# Map each message to a fixed-length sequence of vocabulary ids
max_length = max([len(x.split(" ")) for x in x_input])
vocabprocess = learn.preprocessing.VocabularyProcessor(max_length)
x = np.array(list(vocabprocess.fit_transform(x_input)))
y = np.array(y_raw)

# Shuffle data
np.random.seed(10)
shuffleindice = np.random.permutation(np.arange(len(y)))
xshuf = x[shuffleindice]
yshuf = y[shuffleindice]
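# A hedged sketch of the natural next step: splitting the shuffled data
# with the train_test_split already imported above. The 10% dev fraction
# is an assumption, not taken from the original script.
x_train, x_dev, y_train, y_dev = train_test_split(
    xshuf, yshuf, test_size=0.1, random_state=10)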
from keras.models import load_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords

from helpers import load_data_and_labels

# Set parameters:
max_features = 5000
maxlen = 400
batch_size = 1000
embedding_dims = 100
nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 2

print('Loading data...')
x_train, labels, x_test = load_data_and_labels('data/train_pos.txt',
                                               'data/train_neg.txt',
                                               'data/test_data.txt')

# Tokenize the words, turn the test tweets into sequences, and pad them
tokenizer = Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)
X_test = sequence.pad_sequences(sequences_test, maxlen=maxlen)

print('Loading the model...')
model = load_model('model_CNN_test.h5')

# Predictions for the test data
probas = model.predict(X_test, batch_size=32)

# Replace the predictions with -1 or +1
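# A minimal sketch of the -1/+1 step announced above, assuming the model
# ends in a sigmoid so that probabilities above 0.5 mean the positive class.
import numpy as np

preds = np.where(probas > 0.5, 1, -1).ravel()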
import gensim
import numpy as np
import tensorflow as tf

import helpers

NUM_QUESTIONS = 2000

tf.flags.DEFINE_string("training_data_file", "./datasets/training.full.tsv",
                       "Data source for the training data")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load the data
print("Loading data")
q1, q2, y_truth, q1_len, q2_len = helpers.load_data_and_labels(
    FLAGS.training_data_file)
dataset = list(zip(q1, q2, y_truth))
# q1 = [word for word in q1[0].split() if word not in stopwords.words('english')]

# Load the pre-trained word2vec model
model_file = "./GoogleNews-vectors-negative300.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)


def avg_feature_vector(words, model, num_features, index2word_set):
    """Average all word vectors in a given paragraph."""
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
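    # A hedged completion of the truncated function above: the original
    # body ends at `nwords = 0`, so the rest follows the common word2vec
    # averaging pattern rather than the author's exact code.
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec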
tf.flags.DEFINE_boolean("use_cached_embeddings", True, "Cache embeddings locally on disk for repeated runs") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") q1, q2, y, x1_lengths, x2_lengths = helpers.load_data_and_labels( FLAGS.training_data_file) #print q1[:11], y[:11], x1_lengths[:11] # Build vocabulary max_question_length = max(max([len(x.split(" ")) for x in q1]), max([len(x.split(" ")) for x in q2])) vocab_processor = learn.preprocessing.VocabularyProcessor(max_question_length) print "max_question_length: ", max_question_length #x_text = q1 + q2 x1 = np.array(list(vocab_processor.fit_transform(q1))) x2 = np.array(list(vocab_processor.fit_transform(q2))) #x = np.array(list(vocab_processor.fit_transform(x_text))) #xx1 = x[:len(q1)] #xx2 = x[len(q1):]
import os

import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

import helpers

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data
print("Loading data...")
q1, q2, y, q1_lengths, q2_lengths = helpers.load_data_and_labels(
    FLAGS.test_data_file)
x_raw = q1 + q2

# Map the raw questions to ids with the vocabulary saved during training
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))
x1_test = x_test[:len(q1)]
x2_test = x_test[len(q1):]
y_test = np.argmax(y, axis=1)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
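# A minimal sketch of the checkpoint-restoring evaluation that this header
# sets up, using the standard TF 1.x meta-graph pattern. The tensor names
# ("input_x1", "input_x2", "output/predictions") are assumptions about the
# training graph, not taken from the original code.
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Restore the saved graph definition and its trained weights
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)
        input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
        input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        # A dropout placeholder, if the graph has one, would also need feeding
        all_predictions = sess.run(predictions,
                                   {input_x1: x1_test, input_x2: x2_test})
correct = float(np.sum(all_predictions == y_test))
print("Accuracy: {:g}".format(correct / len(y_test)))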
import pickle

import numpy as np
from sklearn.ensemble import RandomForestClassifier

from helpers import load_data_and_labels
from helpers import clean_str

path = 'data/mini/'
with open(path + 'vocab.pkl', 'rb') as f:
    vocab_dict = pickle.load(f)
W = np.load(path + 'embeddings.npy')

x_text, y = load_data_and_labels(positive_data_file=path + 'pos_train.txt',
                                 negative_data_file=path + 'neg_train.txt')

# Map each tweet to a list of vocabulary indices (OOV words map to 0)
ls = []
for sent in x_text:
    ls_temp = []
    for word in sent.split():
        try:
            ls_temp.append(vocab_dict[word])
        except KeyError:
            ls_temp.append(0)
    ls.append(ls_temp)
# print(ls)

# Sum the embedding vectors of each tweet
ls_sum = []
for ls_in in ls:
    sum_vect = np.zeros(W.shape[1])
    for index in ls_in:
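        # A hedged completion of the truncated inner loop above, mirroring
        # the sum-of-embeddings step used elsewhere in this repo.
        sum_vect += W[index]
    ls_sum.append(sum_vect)

# Hypothetical random-forest fit: the classifier is imported above but its
# hyperparameters are not shown in the original, so these are assumptions.
# Collapsing one-hot labels with argmax follows the evaluation script.
X = np.array(ls_sum)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, np.argmax(y, axis=1))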
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y = helpers.load_data_and_labels(FLAGS.pos_text, FLAGS.neg_text, FLAGS.max_document_length) # Build vocabulary d_vocab, x = helpers.vocab_processor(x_text) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation x_train, x_dev = x_shuffled[:-2000], x_shuffled[-2000:] y_train, y_dev = y_shuffled[:-2000], y_shuffled[-2000:] print("Vocabulary Size: {:d}".format(len(d_vocab)))