Beispiel #1
0
        else:
            predict.append(pr[i])
    return predict


if __name__ == "__main__":
    # Diagnostic script: load the Penn Treebank splits and a previously
    # pickled grammar, then print the positions of one rule and a few
    # symbols inside the grammar's rule / POS lists.

    PennTreePath = "/home/ferrone/Datasets/PTB2/"
    # Grammar = pickle.load(open("binaryGrammar23.txt", "rb"))

    # creating new grammar ordered by frequency
    # sections = "00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22".split()
    sections_train = "02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21".split()

    # Sections 02-21 are loaded as the training split, 23 as test and 24 as
    # validation (as the variable names indicate).
    treeList_train = loadPennTree(PennTreePath, sections_train, normalize=True)
    treeList_test = loadPennTree(PennTreePath, ['23'], normalize=True)
    treeList_valid = loadPennTree(PennTreePath, ['24'], normalize=True)

    #Grammar = grammar.Grammar.fromTrees(treeList, 15)
    #pickle.dump(Grammar, open('binaryGrammarMostFrequent15.txt', 'wb'))

    # Open the grammar through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    # NOTE: unpickling can execute arbitrary code; only load grammar files
    # that were produced by this project.
    with open('binaryGrammarMostFrequent1500.txt', 'rb') as grammar_file:
        Grammar = pickle.load(grammar_file)
    pos = posList(Grammar)
    rules = ruleList(Grammar)

    # Position of one specific rule in the rule list.
    iRule = rules.index(parseRule('@ADJP -> @ADJP IN'))
    print(iRule)
    # Positions of a few symbols in the POS list.
    print(pos.index('@NP'))
    print(pos.index('IN'))
    print(pos.index('JJ'))
Beispiel #2
0
            finally:
                profiler.print_stats()

        return profiled_func

    return inner


if __name__ == '__main__':
    # Script entry point: load a small sample of trees and build the three
    # kernel generators with identical settings.
    PennTreePathBinarized = "/Users/lorenzo/Documents/Universita/PHD/Lavori/Datasets/PTB2/"

    sections = ['21', '23', '24']
    N = 100

    # Only the first N trees yielded by the loader are kept.
    treeIterator = itertools.islice(
        loadPennTree(PennTreePathBinarized, sections, normalize=True), N)
    treeList = list(treeIterator)

    # All three generators share the same dimension, decay and operation.
    kernel_settings = dict(dimension=8192,
                           LAMBDA=0.6,
                           operation=op.fast_shuffled_convolution)
    dtk_generator = dtk.DT(**kernel_settings)
    dtk_generator2 = dtk2.DT(**kernel_settings)
    dtk_generator3 = dtk2.partialTreeKernel(**kernel_settings)
Beispiel #3
0
#sys.path.append("/home/ferrone/pyDTK2/src")
import dtk2 as dtk
import sentence_encoder
import loadPennTree
import operation as op
import numpy

# Module-level generator: created on import as well as when run as a script.
dtk_generator = dtk.DT(
    dimension=8192, LAMBDA=0.4, operation=op.fast_shuffled_convolution)

if __name__ == "__main__":
    # Encode the tagged sentence of every loaded tree and store the stacked
    # result as a single .npy file.
    path = "/Users/lorenzo/Desktop/Current/Greg/dtknn/PTB"
    # Two-digit section labels "00" through "24".
    sections = ["%02d" % number for number in range(25)]

    # Only sections "02" and "03" are actually loaded.
    trees = list(loadPennTree.loadPennTree(path, sections[2:4], True))
    print(len(trees))

    encoded_rows = []
    for index, tree in enumerate(trees):
        # Progress output: a dot every 100 trees, the running count every 1000.
        if index % 100 == 0:
            print(".", end='', flush=True)
        if index % 1000 == 0:
            print(index, flush=True)
        vector = sentence_encoder.encoder(tree.taggedSentence, dtk_generator, 3)
        # Each vector is wrapped in a list so that concatenation below yields
        # one row per tree.
        encoded_rows.append([vector])
        # Cache is cleared after every sentence.
        dtk_generator.cleanCache()

    matrix = numpy.concatenate(encoded_rows)
    numpy.save("/Users/lorenzo/Desktop/Current/Greg/dtknn/encoded/train.npy",
               matrix)
Beispiel #4
0
 def fromTreeBank(cls, pathToTreeBank, sections):
     """Load trees from a treebank and pass them to ``cls.fromTrees``.

     ``pathToTreeBank`` and ``sections`` are forwarded unchanged to
     ``loadPennTree``; the return value is whatever ``cls.fromTrees``
     returns for the loaded trees.
     """
     return cls.fromTrees(loadPennTree(pathToTreeBank, sections))
Beispiel #5
0
    #         s.add(r)
    #         m = len(r.right)
    #         if m == 1:
    #             print (m, r)
    #             sys.exit(0)
    # print (len(s))
    #
    # sys.exit(0)

    sections = ["23"]
    # Trees are loaded with normalize=True only when MODE is "binary";
    # any other MODE loads them with normalize=False.
    treeList = list(
        loadPennTree(PennTreePath, sections, normalize=(MODE == "binary")))
    if NUMBERIFY:
        # numberified version: the fully loaded list is converted tree by tree
        treeList = [numberify(t) for t in treeList]
Beispiel #6
0
    # Machine-specific locations: one set of paths when running on macOS
    # ('darwin'), another set on every other platform.
    if sys.platform == 'darwin':
        PennTreePathFull = "/Users/lorenzo/Documents/Universita/PHD/Lavori/Datasets/PTB3/"
        PennTreePathBinarized = "/Users/lorenzo/Documents/Universita/PHD/Lavori/Datasets/PTB2/"
        javaString = "java -classpath /Users/lorenzo/Documents/Programming/Java/StanfordParser/target/classes main"
        # path = '/Users/lorenzo/Documents/Universita/PHD/Lavori/Codice/pyCYK/'
    else:
        PennTreePathFull = "/home/ferrone/Datasets/PTB3/"
        PennTreePathBinarized = "/home/ferrone/Datasets/PTB2/"
        javaString = "java -classpath /home/ferrone/pyCYK_sync/java_debinerizer/classes main"

    # loading binarized tree to reconstruct, and full tree to compare to
    # (only the first 500 trees of section 23 are kept from each source)
    sections = ['23']
    treeListFull = [
        numberify(t) for t in list(
            loadPennTree(PennTreePathFull, sections, normalize=True))[:500]
    ]
    treeListBinarized = [
        numberify(t) for t in list(
            loadPennTree(PennTreePathBinarized, sections, normalize=True))[:500]
    ]

    # # loading grammar (binarized)
    # Grammar = pickle.load(open("binaryGrammar.txt", "rb"))
    # Use a context manager so the grammar file is closed as soon as it has
    # been read, instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load grammar files
    # that were produced by this project.
    with open("binaryGrammar23.txt", "rb") as grammar_file:
        Grammar = pickle.load(grammar_file)
    #
    # # defining parser and dtk parameters
    # dimension = 1024
    # filter = 1.5
    # LAMBDA = 0.6