def load_chunks(fpath, limit=None):
    """Lazily yield ``Chunk`` objects parsed from the XML file at *fpath*.

    Parsing is streamed with ``pulldom`` so the whole document is never
    held in memory.  If *limit* is given, iteration stops after that many
    chunks have been yielded.
    """
    with open(fpath, 'rb') as xml_file:
        events = pulldom.parse(xml_file, parser=_create_parser())
        emitted = 0
        # Renamed loop variables: the original reused `chunk` for both the
        # event node and the nested <chunk> element, shadowing the outer one.
        for chunk_event in tqdm(_start_events(events, 'chunk'),
                                desc=f'Loading chunks from {fpath}'):
            for chunk_elem in _findall(chunk_event, 'chunk'):
                if emitted == limit:
                    return
                emitted += 1
                tokens = []
                for tok_elem in _findall(chunk_elem, 'tok'):
                    orth = _findvalue(tok_elem, 'orth')
                    lemmas, ctags = [], []
                    disamb_lemma = disamb_ctag = None
                    for lex_elem in _findall(tok_elem, 'lex'):
                        lemma = _findvalue(lex_elem, 'base')
                        ctag = _findvalue(lex_elem, 'ctag')
                        # disamb="1" marks the gold interpretation; all
                        # others are candidate analyses.
                        if lex_elem.getAttribute('disamb') == '1':
                            disamb_lemma = lemma
                            disamb_ctag = ctag
                        else:
                            lemmas.append(lemma)
                            ctags.append(ctag)
                    tokens.append(
                        Token(orth, lemmas, ctags, disamb_lemma, disamb_ctag))
                yield Chunk(tokens)
def test_MorphAnalyzer_unknown_word():
    """A word absent from the dictionary falls back to its own orth as the
    lemma and gets the 'ign' (unknown) ctag."""
    word = 'Kotkowicach'
    sentence = Chunk([Token(word)])
    analyzer = MorphAnalyzer(Dictionary({}))
    analyzer.analyze([sentence])
    analyzed = sentence.tokens[0]
    assert analyzed.lemmas == [word]
    assert analyzed.ctags == ['ign']
def test_WordEmbedEncoder():
    """WordEmbedEncoder maps each token to its 300-dim word2vec embedding,
    producing a (chunks, tokens, dims) array."""
    # TODO we should mock this — loads real vectors from disk.
    word2vec = KeyedVectors.load_word2vec_format(
        'data/nkjp+wiki-forms-all-300-skipg-ns.txt', limit=50)
    chunks = [Chunk([Token('5'), Token('kilogramów'), Token('pomiodorów')])]
    encoder = WordEmbedEncoder(word2vec)
    X = encoder.fit_transform(chunks)
    # isinstance instead of `type(X) ==` — the exact-type check is an
    # anti-pattern that would reject ndarray subclasses.
    assert isinstance(X, np.ndarray)
    assert X.shape == (1, 3, 300)
def merge_chunks(analyzed_fpath, gold_fpath, limit=None):
    """Yield chunks whose tokens carry the analyzer's candidate lemmas/ctags
    together with the gold file's disambiguated lemma/ctag.

    Raises:
        ValueError: if a pair of chunks differs in token count.
    """
    analyzed_stream = load_chunks(analyzed_fpath, limit)
    gold_stream = load_chunks(gold_fpath, limit)
    for analyzed_chunk, gold_chunk in zip(analyzed_stream, gold_stream):
        if len(analyzed_chunk.tokens) != len(gold_chunk.tokens):
            raise ValueError('Invalid tokens number')
        merged = [
            Token(a.orth, a.lemmas, a.ctags, g.disamb_lemma, g.disamb_ctag)
            for a, g in zip(analyzed_chunk.tokens, gold_chunk.tokens)
        ]
        yield Chunk(merged)
def test_MorphAnalyzer():
    """MorphAnalyzer fills a token's candidate lemmas and ctags from the
    dictionary entry for a known word."""
    known_word = 'pomidorów'
    chunks = [Chunk([Token(known_word)])]
    # Renamed from `dict`, which shadowed the builtin.
    dictionary = Dictionary(
        {known_word: [DictEntry(known_word, 'pomidor', 'xyz', '')]})
    analyzer = MorphAnalyzer(dictionary)
    analyzer.analyze(chunks)
    token = chunks[0].tokens[0]
    assert token.lemmas == ['pomidor']
    assert token.ctags == ['xyz']
def test_DisambCTagEncoder():
    """DisambCTagEncoder returns a dict of per-category arrays shaped
    (chunks, max_tokens, values) — the shorter chunk is padded to 5 tokens."""
    chunks = [
        Chunk([
            Token('5', disamb_ctag='brev:pun'),
            Token('kilogramów', disamb_ctag='qub'),
            Token('pomidorów', disamb_ctag='conj'),
        ]),
        Chunk([
            Token('5', disamb_ctag='brev:pun'),
            Token('kilogramów', disamb_ctag='qub'),
            Token('pomidorów', disamb_ctag='conj'),
            Token('i', disamb_ctag='conj'),
            Token('ogórków', disamb_ctag='conj'),
        ]),
    ]
    encoder = DisambCTagEncoder(tagset.categories)
    y = encoder.fit_transform(chunks)
    assert isinstance(y, dict)
    pos = y['pos']
    # isinstance instead of `type(pos) ==` — the exact-type check is an
    # anti-pattern that would reject ndarray subclasses.
    assert isinstance(pos, np.ndarray)
    # 35 — presumably the number of 'pos' category values in the tagset.
    assert pos.shape == (2, 5, 35)
def test_CTagsEncoder():
    """CTagsEncoder encodes each token's candidate ctags into one fixed-width
    vector per token, shaped (chunks, tokens, features)."""
    chunks = [
        Chunk([
            Token('5', ctags=['brev:pun', 'conj', 'prep:nom']),
            Token('kilogramów', ctags=['qub']),
            Token('pomidorów', ctags=['brev:pun', 'conj', 'prep:nom']),
        ])
    ]
    encoder = CTagsEncoder(tagset.categories)
    X = encoder.fit_transform(chunks)
    # isinstance instead of `type(X) ==` — the exact-type check is an
    # anti-pattern that would reject ndarray subclasses.
    assert isinstance(X, np.ndarray)
    # 88 — presumably the total feature width over all tagset categories.
    assert X.shape == (1, 3, 88)
def test_KerasInputFormatter():
    """Smoke-test that KerasInputFormatter combines several encoders."""
    # TODO Move somewhere else
    # TODO we should mock this
    word2vec = KeyedVectors.load_word2vec_format(
        'data/nkjp+wiki-forms-all-300-skipg-ns.txt', limit=5)
    formatter = KerasInputFormatter([
        ('word2vec', WordEmbedEncoder(word2vec)),
        ('tag2vec', CTagsEncoder(tagset.categories)),
    ])
    chunks = [
        Chunk([
            Token('5', ctags=['brev:pun', 'conj', 'prep:nom']),
            Token('kilogramów', ctags=['qub']),
            Token('pomidorów', ctags=['brev:pun', 'conj', 'prep:nom']),
        ])
    ]
    X = formatter.fit_transform(chunks)
    # NOTE(review): this test only prints — it asserts nothing about X.
    # Consider asserting on its structure once the expected output is known.
    print(X)