import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

import dataset
import word2vec_model

# cfg is a ConfigParser with a [data] section, loaded earlier in the script

# learn alphabet from training data
dataset = dataset.DatasetProvider([cfg.get('data', 'train'),
                                   cfg.get('data', 'test')])
# now load training examples and labels
train_x, train_y = dataset.load(cfg.get('data', 'train'))
# now load test examples and labels
test_x, test_y = dataset.load(cfg.get('data', 'test'))

init_vectors = None
# TODO: what are we doing for index 0 (oov words)?
# use pre-trained word embeddings?
if cfg.has_option('data', 'embed'):
  print 'embeddings:', cfg.get('data', 'embed')
  word2vec = word2vec_model.Model(cfg.get('data', 'embed'))
  init_vectors = [word2vec.select_vectors(dataset.word2int)]

# turn x and y into numpy arrays among other things
maxlen = max([len(seq) for seq in train_x + test_x])
train_x = pad_sequences(train_x, maxlen=maxlen)
train_y = pad_sequences(train_y, maxlen=maxlen)
test_x = pad_sequences(test_x, maxlen=maxlen)
test_y = pad_sequences(test_y, maxlen=maxlen)
# one-hot encode the label at each time step (3 classes)
train_y = np.array([to_categorical(seq, 3) for seq in train_y])
test_y = np.array([to_categorical(seq, 3) for seq in test_y])

print 'train_x shape:', train_x.shape
print 'train_y shape:', train_y.shape
print 'test_x shape:', test_x.shape
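# What follows is a minimal sketch of a model that could consume the padded
# sequences above, assuming a Keras 1.x LSTM tagger; the layer sizes, embedding
# dimension, and training hyperparameters are illustrative assumptions, not
# taken from the original script.
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.layers.wrappers import TimeDistributed

model = Sequential()
model.add(Embedding(len(dataset.word2int),   # assumes word2int covers the vocabulary
                    300,                     # assumed embedding dimension
                    input_length=maxlen,
                    weights=init_vectors))   # None falls back to random init
model.add(LSTM(128, return_sequences=True))  # emit one vector per time step
model.add(TimeDistributed(Dense(3, activation='softmax')))  # 3 labels per step
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.fit(train_x, train_y, nb_epoch=5, batch_size=50)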
import os

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

import dataset
import word2vec_model

# cfg, base, train_file, and test_file are defined earlier in the script

# learn alphabet from training examples
dataset = dataset.DatasetProvider(train_file)
# now load training examples and labels
train_x, train_y = dataset.load(train_file)
maxlen = max([len(seq) for seq in train_x])
# now load test examples and labels
test_x, test_y = dataset.load(test_file, maxlen=maxlen)

init_vectors = None
# TODO: what are we doing for index 0 (oov words)?
# use pre-trained word embeddings?
if cfg.has_option('data', 'embed'):
  print 'embeddings:', cfg.get('data', 'embed')
  embed_file = os.path.join(base, cfg.get('data', 'embed'))
  word2vec = word2vec_model.Model(embed_file)
  init_vectors = [word2vec.select_vectors(dataset.word2int)]

# turn x and y into numpy arrays among other things
classes = len(set(train_y))
train_x = pad_sequences(train_x, maxlen=maxlen)
train_y = to_categorical(np.array(train_y), classes)
test_x = pad_sequences(test_x, maxlen=maxlen)
test_y = to_categorical(np.array(test_y), classes)

print 'train_x shape:', train_x.shape
print 'train_y shape:', train_y.shape
print 'test_x shape:', test_x.shape
print 'test_y shape:', test_y.shape, '\n'

branches = []  # models to be merged
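# What follows is a minimal sketch of how the branches list might be filled in
# and merged, assuming a Keras 1.x multi-filter CNN over the padded sequences;
# the filter sizes, filter count, and training settings are illustrative
# assumptions, not taken from the original script.
from keras.models import Sequential
from keras.layers import Embedding, Convolution1D, MaxPooling1D, Flatten
from keras.layers import Dense, Merge

for filter_length in [2, 3, 4]:  # assumed n-gram filter sizes
  branch = Sequential()
  branch.add(Embedding(len(dataset.word2int), 300,  # assumed dimensions
                       input_length=maxlen,
                       weights=init_vectors))
  branch.add(Convolution1D(nb_filter=100,
                           filter_length=filter_length,
                           activation='relu'))
  # pool over the entire convolution output (max-over-time pooling)
  branch.add(MaxPooling1D(pool_length=maxlen - filter_length + 1))
  branch.add(Flatten())
  branches.append(branch)

model = Sequential()
model.add(Merge(branches, mode='concat'))
model.add(Dense(classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
# a merged model expects one copy of the input per branch
model.fit([train_x] * len(branches), train_y, nb_epoch=5, batch_size=50)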
import numpy as np
import sklearn as sk
import sklearn.cross_validation
import keras as k
import keras.utils.np_utils
from keras.preprocessing import sequence

import dataset
import word2vec_model

BATCH = 50
EPOCHS = 5
CLASSES = 2
EMBDIMS = 300
MAXLEN = 55
MAXFEATURES = 18000
FILTERS = 100
FILTLEN = 4
NFOLDS = 10  # assumed value; the code below uses NFOLDS without defining it

if __name__ == "__main__":

  dataset = dataset.DatasetProvider(MAXFEATURES)
  x, y = dataset.load_data()

  path = '/Users/Dima/Loyola/Data/Word2Vec/Models/GoogleNews-vectors-negative300.txt'
  word2vec = word2vec_model.Model(path)
  embedding_lookup = word2vec.select_vectors(dataset.alphabet)

  # turn x and y into numpy arrays among other things
  x = sequence.pad_sequences(x, maxlen=MAXLEN)
  y = k.utils.np_utils.to_categorical(np.array(y), CLASSES)

  # replace each word index with its pre-trained embedding vector
  x3d = np.zeros((x.shape[0], MAXLEN, EMBDIMS))
  for row in range(x.shape[0]):
    for col in range(MAXLEN):
      word = x[row, col]
      x3d[row, col, :] = embedding_lookup[word]

  scores = []
  folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS)
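  # What follows is a minimal sketch of the cross-validation loop that would
  # typically consume folds and scores; make_model() is a hypothetical helper
  # standing in for whatever builds and compiles the network (the original
  # snippet is cut off before this point).
  for train_indices, test_indices in folds:
    train_x, train_y = x3d[train_indices], y[train_indices]
    test_x, test_y = x3d[test_indices], y[test_indices]
    model = make_model()  # hypothetical; assumed to compile with metrics=['accuracy']
    model.fit(train_x, train_y, nb_epoch=EPOCHS, batch_size=BATCH)
    loss, accuracy = model.evaluate(test_x, test_y, batch_size=BATCH)
    scores.append(accuracy)

  print 'average accuracy:', np.mean(scores)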