def main():
    """Prepare the SPMRL Hebrew treebank resources and print their sizes.

    The lexicon, the treebank and the morphological vocabulary are each
    loaded from their cached file when that file exists; otherwise they are
    built from the source files and saved to the cache path. Afterwards the
    five morph dataset partitions are fetched and their lengths printed.
    """
    home_path = Path('.')
    src_tokens_idx = '01'
    src_suffix = '10'

    # Source files for the treebank (tokens + gold lattices per split).
    treebank_sources = {
        'train-hebtb.tokens': f'data/clean/spmrl/hebtb/train-hebtb-{src_tokens_idx}-tokens.txt',
        'train-hebtb-gold.lattices': f'data/clean/spmrl/hebtb/train-hebtb-{src_suffix}-gold.lattices',
        'dev-hebtb.tokens': f'data/clean/spmrl/hebtb/dev-hebtb-{src_tokens_idx}-tokens.txt',
        'dev-hebtb-gold.lattices': f'data/clean/spmrl/hebtb/dev-hebtb-{src_suffix}-gold.lattices',
        'test-hebtb.tokens': f'data/clean/spmrl/hebtb/test-hebtb-{src_tokens_idx}-tokens.txt',
        'test-hebtb-gold.lattices': f'data/clean/spmrl/hebtb/test-hebtb-{src_suffix}-gold.lattices',
    }
    # Source files for the lexicon.
    lexicon_sources = {
        'pref-lex': 'data/raw/spmrl/bgulex/bgupreflex_withdef.utf8.hr',
        'lex': 'data/clean/spmrl/bgulex/bgulex-03.hr',
    }

    lexicon_cache = Path('data/processed/spmrl/bgulex.pickle')
    treebank_cache = Path('data/processed/spmrl/hebtb.pickle')
    vocab_cache = Path('data/processed/spmrl/hebtb-morph-vocab/vocab.pickle')

    # Lexicon: build and cache on first run, reuse the cache afterwards.
    if not lexicon_cache.exists():
        bgulex = Lexicon(lexicon_sources)
        bgulex.save(lexicon_cache)
    else:
        bgulex = Lexicon.load(lexicon_cache)

    # Treebank: same build-or-load scheme; building needs the lexicon.
    if not treebank_cache.exists():
        hebtb = Treebank(bgulex, treebank_sources)
        hebtb.save(treebank_cache)
    else:
        hebtb = Treebank.load(treebank_cache)

    train_sentences = hebtb.infused_train_sentences
    dev_sentences = hebtb.infused_dev_sentences
    test_sentences = hebtb.infused_test_sentences
    tb_train_size = len(train_sentences)
    tb_dev_size = len(dev_sentences)
    tb_test_size = len(test_sentences)
    print(f"Train sentences: {tb_train_size}")
    print(f"Dev sentences: {tb_dev_size}")
    print(f"Test sentences: {tb_test_size}")

    # The vocabulary is built over all infused sentences of every split.
    tb_sentences = train_sentences + dev_sentences + test_sentences
    if not vocab_cache.exists():
        tb_vocab = MorphVocab(tb_sentences)
        tb_vocab.save(vocab_cache)
    else:
        tb_vocab = MorphVocab.load(vocab_cache)
    print("Vocab tokens: {}".format(len(tb_vocab.tokens)))
    print("Vocab forms: {}".format(len(tb_vocab.forms)))
    print("Vocab lemmas: {}".format(len(tb_vocab.lemmas)))
    print("Vocab tags: {}".format(len(tb_vocab.tags)))
    print("Vocab feats: {}".format(len(tb_vocab.feats)))

    # Fetch the partitions in a fixed order (dicts preserve insertion order).
    partition_names = ('train-inf', 'dev-inf', 'test-inf', 'dev-uninf', 'test-uninf')
    datasets = {
        name: get_morph_dataset_partition(name, home_path, tb_vocab, hebtb)
        for name in partition_names
    }
    print("Train infused dataset: {}".format(len(datasets['train-inf'])))
    print("Dev infused dataset: {}".format(len(datasets['dev-inf'])))
    print("Test infused dataset: {}".format(len(datasets['test-inf'])))
    print("Dev uninfused dataset: {}".format(len(datasets['dev-uninf'])))
    print("Test uninfused dataset: {}".format(len(datasets['test-uninf'])))
def lattice(tokens: list, lex: Lexicon) -> morph.Lattice:
    """Build a lattice holding the lexicon analyses of each token.

    Every token is looked up with ``lex.entry``; the ``analyses`` of the
    resulting entry are stored in the lattice under the token's 1-based
    position in ``tokens``.
    """
    # Resolve all entries first, then create and fill the lattice.
    entries = [lex.entry(tok) for tok in tokens]
    result = morph.Lattice()
    for position, entry in enumerate(entries, start=1):
        result[position] = entry.analyses
    return result
def _create_sentence(self, lexicon: lex.Lexicon, tokens: list, md_lattice: conllx.LatticeGraph) -> nlp.Sentence:
    """Assemble a sentence from its tokens, lexicon lattice and gold lattice.

    The lexicon lattice maps each 1-based token position to the analyses of
    the token's lexicon entry. The gold lattice maps the same positions to
    the analyses created from the paths in ``md_lattice._token_paths``.

    Raises:
        ValueError: if a token does not have exactly one gold analysis.
    """
    # Lexicon side: one entry lookup per token, keyed by 1-based position.
    entries = [lexicon.entry(tok) for tok in tokens]
    lex_lattice = morph.Lattice()
    for position, entry in enumerate(entries, start=1):
        lex_lattice[position] = entry.analyses

    # Gold side: convert every path of the token into an analysis.
    gold_lattice = morph.Lattice()
    for position, _ in enumerate(tokens, start=1):
        gold_analysis = [
            self._create_analysis(path)
            for path in md_lattice._token_paths[position]
        ]
        # Exactly one gold analysis per token is required.
        if len(gold_analysis) != 1:
            raise ValueError("token gold analysis: {}".format(gold_analysis))
        gold_lattice[position] = gold_analysis

    return nlp.Sentence(tokens, lex_lattice, gold_lattice)
'dev-hebtb-gold.lattices': f'{home_path}/data/clean/spmrl/hebtb/dev-hebtb-{src_lattice_idx}-gold.lattices', 'test-hebtb.tokens': f'{home_path}/data/clean/spmrl/hebtb/test-hebtb-{src_tokens_idx}-tokens.txt', 'test-hebtb-gold.lattices': f'{home_path}/data/clean/spmrl/hebtb/test-hebtb-{src_lattice_idx}-gold.lattices' } lex_files = { 'pref-lex': 'data/raw/spmrl/bgulex/bgupreflex_withdef.utf8.hr', 'lex': 'data/clean/spmrl/bgulex/bgulex-03.hr' } bgulex_file_path = Path(f'{home_path}/data/processed/spmrl/bgulex.pickle') hebtb_file_path = Path(f'{home_path}/data/processed/spmrl/hebtb.pickle') vocab_file_path = Path( f'{home_path}/data/processed/spmrl/hebtb-token-vocab/vocab.pickle') bgulex = Lexicon.load(bgulex_file_path) hebtb = Treebank.load(hebtb_file_path) hebtb_partition = { 'train-inf': hebtb.infused_train_sentences, 'dev-inf': hebtb.infused_dev_sentences, 'test-inf': hebtb.infused_test_sentences, 'dev-uninf': hebtb.uninfused_dev_sentences, 'test-uninf': hebtb.uninfused_test_sentences } tb_vocab = TokenVocab.load(vocab_file_path) # Data train_set = get_token_dataset_partition('train-inf', home_path, tb_vocab, hebtb) dev_inf_set = get_token_dataset_partition('dev-inf', home_path, tb_vocab, hebtb)