def init_vocab(self, data):
    """Build the combined vocabulary for a training run.

    One vocabulary is created per annotation layer and the seven of them
    are bundled into a single ``MultiVocab`` keyed by layer name
    ('char', 'word', 'upos', 'xpos', 'feats', 'lemma', 'deprel').

    Args:
        data: Training data handed unchanged to every vocab constructor.
            NOTE(review): presumably sentences of per-token field lists,
            with ``idx`` selecting the field -- confirm against the caller.

    Returns:
        The ``MultiVocab`` wrapping all seven vocabularies.

    Raises:
        AssertionError: if ``self.eval`` compares equal to something other
            than ``False``; in eval mode the vocab must already exist.
    """
    # for eval vocab must exist
    assert self.eval == False

    shorthand = self.args['shorthand']

    # Entries are evaluated top to bottom, so the vocabularies are built in
    # the same order as their keys appear here.
    return MultiVocab({
        'char': CharVocab(data, shorthand),
        # Word and lemma vocabularies are lowercased and use a cutoff of 7.
        'word': WordVocab(data, shorthand, cutoff=7, lower=True),
        'upos': WordVocab(data, shorthand, idx=1),
        # The XPOS vocab class depends on the treebank shorthand.
        'xpos': xpos_vocab_factory(data, shorthand),
        'feats': FeatureVocab(data, shorthand, idx=3),
        'lemma': WordVocab(data, shorthand, cutoff=7, idx=4, lower=True),
        'deprel': WordVocab(data, shorthand, idx=6),
    })
def get_factory(sh, fn):
    """Choose the XPOS vocabulary constructor expression for one treebank.

    The training file ``data/pos/<sh>.train.in.conllu`` is loaded and the
    size of a plain ``WordVocab`` over the XPOS column is compared against
    ``XPOSVocab`` built with each candidate separator; the most compact
    option wins.

    Args:
        sh: Treebank shorthand, used to locate the training file and passed
            to the vocab constructors.
        fn: Treebank name, used only in the warning text that tells the user
            how to prepare the missing data.

    Returns:
        A string holding the source text of the chosen constructor call,
        e.g. ``'XPOSVocab(data, shorthand, idx=2, sep="-")'``.

    Warns:
        UserWarning: if the training file does not exist. The function then
            falls back to ``'WordVocab(data, shorthand, idx=2)'``.
    """
    # Function-scope import so this block does not rely on a module-level
    # `import warnings` being present in the file.
    import warnings

    print('Resolving vocab option for {}...'.format(sh))
    train_file = 'data/pos/{}.train.in.conllu'.format(sh)
    if not os.path.exists(train_file):
        # This used to *raise* UserWarning, which made the fallback promised
        # by the message unreachable. Emit it as a real warning instead and
        # carry on with the fallback.
        warnings.warn(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tstanza/utils/datasets/prepare_pos_treebank.py {}'.format(fn, fn),
            UserWarning,
            stacklevel=2)
        # without the training file, there's not much we can do
        return 'WordVocab(data, shorthand, idx=2)'

    doc = CoNLL.conll2doc(input_file=train_file)
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')

    # Baseline: every distinct XPOS tag is its own vocabulary entry.
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)

    # Splitting the tag is only considered when the baseline has more than
    # 20 entries beyond the VOCAB_PREFIX ones.
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            # Total size: each entry of _id2unit contributes its length minus
            # the VOCAB_PREFIX entries.
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            # Strict '<' keeps the earlier candidate on ties.
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    return key
def xpos_vocab_factory(data, shorthand):
    """Build the XPOS vocabulary suited to the given treebank.

    Each known treebank shorthand is assigned either a ``WordVocab`` over
    the XPOS column (ignoring "_") or an ``XPOSVocab`` with a
    treebank-specific separator.

    Args:
        data: Training data forwarded unchanged to the vocab constructor.
        shorthand: Treebank shorthand such as ``"en_ewt"``.

    Returns:
        The constructed ``WordVocab`` or ``XPOSVocab``.

    Raises:
        NotImplementedError: if ``shorthand`` is not in any known group.
    """
    # Marks the group whose tags go to WordVocab instead of XPOSVocab.
    use_word_vocab = None

    # (separator, shorthands) pairs, scanned in order; the first group that
    # contains the shorthand decides the vocab.
    groups = (
        ("", (
            "af_afribooms", "grc_perseus", "ar_padt", "bg_btb", "hr_set",
            "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut",
            "fr_partut", "gl_ctg", "it_isdt", "it_partut", "it_postwita",
            "it_twittiro", "it_vit", "ja_gsd", "lv_lvtb", "lt_alksnis",
            "ro_nonstandard", "ro_rrt", "gd_arcosg", "sr_set", "sk_snk",
            "sl_ssj", "ta_ttb", "uk_iu", "gl_treegal", "la_perseus",
            "sl_sst",
        )),
        (use_word_vocab, (
            "en_test", "grc_proiel", "hy_armtdp", "eu_bdt", "be_hse",
            "ca_ancora", "zh-hant_gsd", "zh-hans_gsdsimp", "lzh_kyoto",
            "cop_scriptorium", "da_ddt", "en_ewt", "en_gum", "et_edt",
            "fi_tdt", "fr_ftb", "fr_gsd", "fr_sequoia", "fr_spoken",
            "de_gsd", "de_hdt", "got_proiel", "el_gdt", "he_htb", "hi_hdtb",
            "hu_szeged", "ga_idt", "ja_bccwj", "la_proiel", "lt_hse",
            "mt_mudt", "mr_ufal", "nb_bokmaal", "nn_nynorsk",
            "nn_nynorsklia", "cu_proiel", "fro_srcmf", "orv_torot",
            "fa_seraji", "pt_bosque", "pt_gsd", "ru_gsd", "ru_syntagrus",
            "ru_taiga", "es_ancora", "es_gsd", "swl_sslc", "te_mtg",
            "tr_imst", "ug_udt", "vi_vtb", "wo_wtb", "bxr_bdt", "et_ewt",
            "kk_ktb", "kmr_mg", "olo_kkpp", "sme_giella", "hsb_ufal",
        )),
        ("|", ("nl_alpino", "nl_lassysmall", "la_ittb", "sv_talbanken")),
        ("-", ("en_lines", "sv_lines", "ur_udtb")),
        (",", ("fi_ftb",)),
        ("+", ("id_gsd", "ko_gsd", "ko_kaist")),
        (":", ("pl_lfg", "pl_pdb")),
    )

    for separator, members in groups:
        if shorthand not in members:
            continue
        if separator is use_word_vocab:
            return WordVocab(data, shorthand, idx=2, ignore=["_"])
        return XPOSVocab(data, shorthand, idx=2, sep=separator)

    raise NotImplementedError(
        'Language shorthand "{}" not found!'.format(shorthand))
raise UserWarning( 'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the ' 'XPOS vocabulary for this treebank properly, please run the following command first:\n' '\tbash scripts/prep_pos_data.sh {}'.format(fn, fn)) # without the training file, there's not much we can do key = 'WordVocab(data, shorthand, idx=2)' mapping[key].append(sh) continue doc = Document( CoNLL.conll2dict(input_file='data/pos/{}.train.in.conllu'.format(sh))) data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True) print(f'Original length = {len(data)}') data = filter_data(data, idx=2) print(f'Filtered length = {len(data)}') vocab = WordVocab(data, sh, idx=2, ignore=["_"]) key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])' best_size = len(vocab) - len(VOCAB_PREFIX) if best_size > 20: for sep in ['', '-', '+', '|', ',', ':']: # separators vocab = XPOSVocab(data, sh, idx=2, sep=sep) length = sum( len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values()) if length < best_size: key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep) best_size = length mapping[key].append(sh) # Generate code. This takes the XPOS vocabulary classes selected above, and generates the # actual factory class as seen in models.pos.xpos_vocab_factory. first = True