def load_test_data(embed_size=50, perct=1., binary=False):
    """Load the test split as (embedded sentence, sentiment label) pairs.

    For every tree in ``data['test']``, ``to_labeled_lines()[0]`` yields the
    whole sentence together with its label; the sentence is split on single
    spaces into words (X) and the label is kept as Y.  By default, Y falls
    into [0, 1, 2, 3, 4].

    Args:
        embed_size: Forwarded to ``ModelEmbeddings(embed_size=...)``.
        perct: Fraction of the (possibly filtered) examples to keep, taken
            from the front of the split; the count is ``int(len(X) * perct)``.
        binary: When true, examples labelled 3 are dropped and the remaining
            labels are collapsed to 1 (label > 3) or 0 (everything else).

    Returns:
        A list of ``(embedded_sentence, label)`` tuples, one per kept
        sentence, where the embedded sentences come from
        ``ModelEmbeddings.embed_sentence``.
    """
    M = ModelEmbeddings(embed_size=embed_size)

    # One (label, sentence) entry per tree; index 0 is the root-level line.
    labeled = [labeledTree.to_labeled_lines()[0] for labeledTree in data['test']]
    X = [entry[1].split(" ") for entry in labeled]
    Y = [entry[0] for entry in labeled]

    if binary:
        print("test size binary reduce: ", len(X))
        # NOTE(review): with labels in [0..4] this discards label 3 and maps
        # only label 4 to the positive class -- confirm this matches the
        # other loaders and the intended neutral class before changing it.
        # Filter sentence/label pairs together so X and Y stay aligned.
        kept = [(x, y) for x, y in zip(X, Y) if y != 3]
        X = [x for x, _ in kept]
        Y = [1 if y > 3 else 0 for _, y in kept]
        print(" --> ", len(X))

    # Truncate before embedding so only the requested fraction is embedded.
    test_size = int(len(X) * perct)
    X = X[:test_size]
    Y = Y[:test_size]

    X = M.embed_sentence(X)
    return list(zip(X, Y))
def load_dev_data(embed_size=50, dev_perct=1., binary=False):
    """Load the dev split as (embedded sentence, sentiment label) pairs.

    Each tree in ``data['dev']`` contributes the words of its root-level
    sentence (split on single spaces) and that sentence's label, both taken
    from ``to_labeled_lines()[0]``.

    Args:
        embed_size: Forwarded to ``ModelEmbeddings(embed_size=...)``.
        dev_perct: Fraction of the (possibly filtered) examples to keep,
            taken from the front; the count is ``int(len(X) * dev_perct)``.
        binary: When true, examples labelled 3 are dropped and the remaining
            labels become 1 (label > 3) or 0 (everything else).

    Returns:
        A list of ``(embedded_sentence, label)`` tuples.
    """
    M = ModelEmbeddings(embed_size=embed_size)

    sentences = [
        tree.to_labeled_lines()[0][1].split(" ") for tree in data['dev']
    ]
    labels = [tree.to_labeled_lines()[0][0] for tree in data['dev']]

    if binary:
        # NOTE(review): drops label 3 and treats only labels above 3 as
        # positive -- confirm against the intended class mapping.
        kept_sentences = []
        kept_labels = []
        for words, label in zip(sentences, labels):
            if label != 3:
                kept_sentences.append(words)
                kept_labels.append(1 if label > 3 else 0)
        sentences = kept_sentences
        labels = kept_labels

    dev_size = int(len(sentences) * dev_perct)
    sentences = sentences[:dev_size]
    labels = labels[:dev_size]

    embedded = M.embed_sentence(sentences)

    # dev data doesn't need to be augmented, hence it's already zipped and
    # ready to be passed into model.forward()
    return list(zip(embedded, labels))