# Excerpt from eval.py: `cfg` (the parsed YAML config), `FLAGS` (tf.flags),
# `np` (numpy), and `data_helpers` are defined earlier in the file.

dataset_name = cfg["datasets"]["default"]
print('dataset_name: ', dataset_name)

if FLAGS.eval_train:
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="test",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "political_parties":
        print('Loading political parties')
        datasets = data_helpers.get_datasets_political_parties()
    x_raw, y_test = data_helpers.load_data_labels(datasets)
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
else:
    # Fallback: evaluate on a couple of hand-written examples instead of a
    # full dataset.
    print("Flow shouldn't be here.")
    if dataset_name == "mrpolarity":
        datasets = {"target_names": ['positive_examples', 'negative_examples']}
        x_raw = ["a masterpiece four years in the making",
                 "everything is off."]
        y_test = [1, 0]
    else:
        # The original snippet is truncated below. The category list is closed
        # with the standard sklearn 20newsgroup quartet (assumption), and
        # x_raw/y_test are placeholder values so the branch still runs.
        datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med',
                                     'soc.religion.christian']}
        x_raw = ["placeholder 20newsgroup-style example",
                 "another placeholder example"]
        y_test = [0, 1]
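# data_helpers.get_datasets_political_parties() is called above but not shown
# in this excerpt. A minimal sketch of what such a loader could look like,
# assuming it returns the same sklearn-style "bunch" dict as
# get_datasets_20newsgroup (keys: "data", "target", "target_names") and a
# hypothetical layout of one UTF-8 text file of examples per party:

import os

def get_datasets_political_parties_sketch(data_dir="data/political_parties"):
    """Hypothetical loader: one <party>.txt file of examples per class."""
    target_names = sorted(
        f[:-4] for f in os.listdir(data_dir) if f.endswith(".txt"))
    data, target = [], []
    for label, name in enumerate(target_names):
        with open(os.path.join(data_dir, name + ".txt"),
                  encoding="utf-8") as f:
            examples = [line.strip() for line in f if line.strip()]
        data.extend(examples)
        target.extend([label] * len(examples))
    # load_data_labels() can then one-hot encode `target` against
    # `target_names`, mirroring the 20newsgroup path above.
    return {"data": data, "target": target, "target_names": target_names}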
# Excerpt from train.py.
import pickle

import numpy as np
from tensorflow.contrib import learn

import data_helpers


def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    datasets = data_helpers.get_datasets_political_parties()
    x_text, y = data_helpers.load_data_labels(datasets)

    # Build vocabulary: pad/trim every example to the longest document.
    max_document_length = max(len(x.split(" ")) for x in x_text)
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)

    print('Load pre-trained word vectors')
    with open('fasttext_vocab_en.dat', 'rb') as fr:
        vocab = pickle.load(fr)
    embedding = np.load('fasttext_embedding_en.npy')

    # Fit on the pre-trained vocabulary rather than the corpus, so the ids
    # produced by transform() correspond to entries of the fastText vocab
    # (VocabularyProcessor reserves id 0 for <UNK>).
    vocab_processor.fit(vocab.keys())
    x = np.array(list(vocab_processor.transform(x_text)))
    vocab_size = len(vocab)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split into train/dev/test (70/20/10).
    # TODO: This is very crude, should use cross-validation
    train_frac = 0.7
    dev_frac = 0.2  # the remaining ~10% becomes the test set

    def train_dev_test_split(a):
        train_end = int(len(a) * train_frac)
        dev_end = train_end + int(len(a) * dev_frac)
        return a[:train_end], a[train_end:dev_end], a[dev_end:]

    x_train, x_dev, x_test = train_dev_test_split(x_shuffled)
    y_train, y_dev, y_test = train_dev_test_split(y_shuffled)

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev/Test split: {:d}/{:d}/{:d}".format(
        len(y_train), len(y_dev), len(y_test)))
    print('x_train', x_train.shape)
    print('y_train', y_train.shape)
    return (x_train, y_train, vocab_processor, vocab_size, embedding,
            x_dev, y_dev, x_test, y_test)
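# How the `embedding` matrix returned by preprocess() would seed the model's
# embedding layer is not shown in this excerpt. A minimal sketch, assuming the
# usual TF1 pattern of initializing a tf.Variable from the pre-trained matrix.
# Because VocabularyProcessor reserves id 0 for <UNK>, this sketch assumes
# fasttext_embedding_en.npy stores one row per vocab.keys() entry (ids 1..V)
# and prepends a zero row for id 0; if the saved matrix already includes that
# row, the vstack step should be dropped.

import numpy as np
import tensorflow as tf

def build_embedding_layer(input_x, embedding, trainable=True):
    """input_x: int32 [batch, seq_len] ids from vocab_processor.transform()."""
    embedding_dim = embedding.shape[1]
    # Zero vector for the reserved <UNK>/padding id 0 (assumption, see above).
    init = np.vstack([np.zeros((1, embedding_dim), dtype=np.float32),
                      embedding.astype(np.float32)])
    W = tf.Variable(init, name="W_embedding", trainable=trainable)
    return tf.nn.embedding_lookup(W, input_x)  # [batch, seq_len, dim]

# Typical use inside the model graph (input_x being the model's id
# placeholder):
#   embedded_chars = build_embedding_layer(input_x, embedding)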