# Code example #1
0
def load_test_data(embed_size=50, perct=1., binary=False):
    '''
    Build the test split as (embedded sentence, label) pairs.

    For every tree in data['test'], labeledTree.to_labeled_lines()[0] is
    taken as that tree's entry: element [1] is the sentence text, which is
    split on single spaces into X = list of words, and element [0] is the
    sentence's labeling, Y.

    By default, Y falls into [0, 1, 2, 3, 4]

    @param embed_size: forwarded to ModelEmbeddings(embed_size=...)
    @param perct: fraction of the (possibly filtered) examples to keep; only
        the first int(len(X) * perct) examples are retained
    @param binary: when True, examples labeled 3 are dropped and the
        remaining labels are collapsed to 1 (label > 3) or 0 (otherwise);
        the example count before and after filtering is printed

    @returns: test
        test: List[(embedded sentence, sentiment)], pairing each item
        yielded by ModelEmbeddings.embed_sentence(X) with its label
    '''
    M = ModelEmbeddings(embed_size=embed_size)

    # Call to_labeled_lines() once per tree and reuse its first entry for
    # both the words and the label, instead of rebuilding it twice.
    first_lines = [
        labeledTree.to_labeled_lines()[0] for labeledTree in data['test']
    ]
    X = [line[1].split(" ") for line in first_lines]
    Y = [line[0] for line in first_lines]

    if binary:
        print("test size binary reduce: ", len(X))
        # NOTE(review): label 3 is discarded and only labels > 3 map to 1.
        # If the neutral class is meant to be 2, this threshold is off by
        # one -- confirm against the training loader before changing it.
        kept = [(x, y) for x, y in zip(X, Y) if y != 3]
        X = [x for x, _ in kept]
        Y = [1 if y > 3 else 0 for _, y in kept]
        print(" --> ", len(X))

    # Truncate after filtering so perct applies to the examples actually used.
    test_size = int(len(X) * perct)
    X = X[:test_size]
    Y = Y[:test_size]
    X = M.embed_sentence(X)

    return list(zip(X, Y))
# Code example #2
0
def load_dev_data(embed_size=50, dev_perct=1., binary=False):
    '''
    Build the dev split as (embedded sentence, label) pairs.

    For every tree in data['dev'], labeledTree.to_labeled_lines()[0] is
    taken as that tree's entry: element [1] is the sentence text, which is
    split on single spaces into a list of words (X), and element [0] is the
    sentence's labeling (Y).

    @param embed_size: forwarded to ModelEmbeddings(embed_size=...)
    @param dev_perct: fraction of the (possibly filtered) examples to keep;
        only the first int(len(X) * dev_perct) examples are retained
    @param binary: when True, examples labeled 3 are dropped and the
        remaining labels are collapsed to 1 (label > 3) or 0 (otherwise)

    @returns: dev
        dev: List[(embedded sentence, sentiment)], pairing each item
        yielded by ModelEmbeddings.embed_sentence(X) with its label
    '''
    M = ModelEmbeddings(embed_size=embed_size)

    # Call to_labeled_lines() once per tree and reuse its first entry for
    # both the words and the label, instead of rebuilding it twice.
    first_lines = [
        labeledTree.to_labeled_lines()[0] for labeledTree in data['dev']
    ]
    X = [line[1].split(" ") for line in first_lines]
    Y = [line[0] for line in first_lines]

    if binary:
        # NOTE(review): label 3 is discarded and only labels > 3 map to 1.
        # If the neutral class is meant to be 2, this threshold is off by
        # one -- confirm against the training loader before changing it.
        kept = [(x, y) for x, y in zip(X, Y) if y != 3]
        X = [x for x, _ in kept]
        Y = [1 if y > 3 else 0 for _, y in kept]

    # Truncate after filtering so dev_perct applies to the examples used.
    dev_size = int(len(X) * dev_perct)
    X = X[:dev_size]
    Y = Y[:dev_size]
    X = M.embed_sentence(X)

    # dev data doesn't need to be augmented, hence it's already zipped and
    # ready to be passed into model.forward()
    return list(zip(X, Y))