def test_packer_bad_chars(tokenizer):
    """Round-trip a string with non-ASCII characters through pack/unpack."""
    text = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'
    codec = Packer(tokenizer.vocab, [])
    doc = tokenizer(text)
    encoded = codec.pack(doc)
    decoded = codec.unpack(encoded)
    assert decoded.string == doc.string
def test_serialize_empty_doc():
    """An empty Doc packs to an empty byte string and loads back with length 0."""
    vocab = spacy.en.English.Defaults.create_vocab()
    empty = Doc(vocab)
    codec = Packer(vocab, {})
    payload = codec.pack(empty)
    assert payload == b''
    restored = Doc(vocab).from_bytes(payload)
    assert len(restored) == 0
def test_packer_unannotated(tokenizer):
    """Pack/unpack preserves the text of a doc with no annotations."""
    codec = Packer(tokenizer.vocab, [])
    doc = tokenizer(u'the dog jumped')
    assert doc.string == 'the dog jumped'
    restored = codec.unpack(codec.pack(doc))
    assert restored.string == 'the dog jumped'
def test_packer_annotated(tokenizer):
    """Pack/unpack preserves TAG, DEP and HEAD annotations.

    NOTE(review): this function is duplicated verbatim later in the file;
    the later definition shadows this one at import time, so this copy
    never runs under pytest. Consider deleting one of the two.
    """
    vocab = tokenizer.vocab
    nn = vocab.strings['NN']
    dt = vocab.strings['DT']
    vbd = vocab.strings['VBD']
    jj = vocab.strings['JJ']
    det = vocab.strings['det']
    nsubj = vocab.strings['nsubj']
    adj = vocab.strings['adj']
    root = vocab.strings['ROOT']

    # Frequency tables the Packer uses to build its codecs.
    freqs = [
        (TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),
        (DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()),
        (HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items()),
    ]
    codec = Packer(vocab, freqs)

    doc = tokenizer(u'the dog jumped')
    annotations = numpy.array(
        [[dt, det, 1], [nn, nsubj, 1], [vbd, root, 0]], dtype=numpy.int32)
    doc.from_array([TAG, DEP, HEAD], annotations)
    assert doc.string == 'the dog jumped'
    assert [t.tag_ for t in doc] == ['DT', 'NN', 'VBD']
    assert [t.dep_ for t in doc] == ['det', 'nsubj', 'ROOT']
    assert [t.head.i - t.i for t in doc] == [1, 1, 0]

    restored = codec.unpack(codec.pack(doc))
    assert restored.string == 'the dog jumped'
    assert [t.tag_ for t in restored] == ['DT', 'NN', 'VBD']
    assert [t.dep_ for t in restored] == ['det', 'nsubj', 'ROOT']
    assert [t.head.i - t.i for t in restored] == [1, 1, 0]
def test_packer_annotated(tokenizer):
    """Pack/unpack preserves TAG, DEP and HEAD annotations on a tagged doc."""
    strings = tokenizer.vocab.strings
    nn, dt, vbd, jj = (strings[s] for s in ('NN', 'DT', 'VBD', 'JJ'))
    det, nsubj, adj, root = (
        strings[s] for s in ('det', 'nsubj', 'adj', 'ROOT'))

    # Per-attribute frequency tables used to build the Packer's codecs.
    attr_freqs = [
        (TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),
        (DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()),
        (HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items()),
    ]
    packer = Packer(tokenizer.vocab, attr_freqs)

    msg = tokenizer(u'the dog jumped')
    msg.from_array(
        [TAG, DEP, HEAD],
        numpy.array(
            [[dt, det, 1],
             [nn, nsubj, 1],
             [vbd, root, 0]],
            dtype=numpy.int32))

    # Sanity-check the annotations before serializing.
    assert msg.string == 'the dog jumped'
    assert [t.tag_ for t in msg] == ['DT', 'NN', 'VBD']
    assert [t.dep_ for t in msg] == ['det', 'nsubj', 'ROOT']
    assert [(t.head.i - t.i) for t in msg] == [1, 1, 0]

    # Round-trip and verify nothing was lost.
    result = packer.unpack(packer.pack(msg))
    assert result.string == 'the dog jumped'
    assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
    assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
    assert [(t.head.i - t.i) for t in result] == [1, 1, 0]