예제 #1
0
 def init_vocab(self, data):
     """Build all vocabularies from ``data`` and bundle them in a ``MultiVocab``.

     The treebank shorthand is read from ``self.args['shorthand']`` and passed
     to every vocabulary constructor. The returned ``MultiVocab`` holds the
     entries ``char``, ``word``, ``upos``, ``xpos``, ``feats``, ``lemma`` and
     ``deprel``.

     Must only be called when ``self.eval`` is ``False``.
     """
     # In eval mode the vocab must already exist, so building one here is an error.
     assert self.eval == False
     shorthand = self.args['shorthand']
     # The idx values select which field each vocab is built from
     # (presumably the column within each token -- confirm against the vocab classes).
     return MultiVocab({
         'char': CharVocab(data, shorthand),
         'word': WordVocab(data, shorthand, cutoff=7, lower=True),
         'upos': WordVocab(data, shorthand, idx=1),
         'xpos': xpos_vocab_factory(data, shorthand),
         'feats': FeatureVocab(data, shorthand, idx=3),
         'lemma': WordVocab(data, shorthand, cutoff=7, idx=4, lower=True),
         'deprel': WordVocab(data, shorthand, idx=6),
     })
예제 #2
0
def xpos_vocab_factory(data, shorthand):
    """Return the XPOS vocabulary object for the treebank named by ``shorthand``.

    Each known treebank is assigned either a ``WordVocab`` (built with
    ``ignore=["_"]``) or an ``XPOSVocab`` with a treebank-specific ``sep``.
    All vocabularies are built from ``data`` with ``idx=2``.

    Raises:
        NotImplementedError: if ``shorthand`` is not one of the listed treebanks.
    """
    # Marker for the group that is served by WordVocab rather than XPOSVocab.
    use_word_vocab = None

    # Ordered (sep, treebanks) groups; the first group containing the
    # shorthand decides which vocabulary is built.
    groups = (
        ("", ("af_afribooms", "grc_perseus")),
        (use_word_vocab, (
            "grc_proiel", "hy_armtdp", "eu_bdt", "be_hse", "ca_ancora",
            "zh-hant_gsd", "zh-hans_gsdsimp", "lzh_kyoto", "cop_scriptorium",
            "da_ddt", "en_ewt", "en_gum", "et_edt", "fi_tdt", "fr_ftb",
            "fr_gsd", "fr_sequoia", "fr_spoken", "de_gsd", "de_hdt",
            "got_proiel", "el_gdt", "he_htb", "hi_hdtb", "hu_szeged",
            "ga_idt", "ja_bccwj", "la_proiel", "lt_hse", "mt_mudt",
            "mr_ufal", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia",
            "cu_proiel", "fro_srcmf", "orv_torot", "fa_seraji", "pt_bosque",
            "pt_gsd", "ru_gsd", "ru_syntagrus", "ru_taiga", "es_ancora",
            "es_gsd", "swl_sslc", "te_mtg", "tr_imst", "ug_udt", "vi_vtb",
            "wo_wtb", "bxr_bdt", "et_ewt", "kk_ktb", "kmr_mg", "olo_kkpp",
            "sme_giella", "hsb_ufal", "ar_padt", "bg_btb", "hr_set",
            "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut",
            "fr_partut", "gl_ctg", "it_isdt", "it_partut", "it_postwita",
            "it_twittiro", "it_vit", "ja_gsd", "lv_lvtb", "lt_alksnis",
            "ro_nonstandard", "ro_rrt", "gd_arcosg", "sr_set", "sk_snk",
            "sl_ssj", "ta_ttb", "uk_iu", "gl_treegal", "la_perseus",
            "sl_sst",
        )),
        ("|", ("nl_alpino", "nl_lassysmall", "la_ittb", "sv_talbanken")),
        ("-", ("en_lines", "sv_lines", "ur_udtb")),
        (",", ("fi_ftb",)),
        ("+", ("id_gsd", "ko_gsd", "ko_kaist")),
        (":", ("pl_lfg", "pl_pdb")),
    )

    for sep, treebanks in groups:
        if shorthand not in treebanks:
            continue
        if sep is use_word_vocab:
            return WordVocab(data, shorthand, idx=2, ignore=["_"])
        return XPOSVocab(data, shorthand, idx=2, sep=sep)

    raise NotImplementedError('Language shorthand "{}" not found!'.format(shorthand))
def xpos_vocab_factory(data, shorthand):
    """Pick and build the XPOS vocabulary for the treebank ``shorthand``.

    Known treebanks map to either an ``XPOSVocab`` (with a per-treebank
    ``sep``) or a ``WordVocab`` built with ``ignore=["_"]``; every vocabulary
    is constructed from ``data`` with ``idx=2``.

    Raises:
        NotImplementedError: if ``shorthand`` is not a listed treebank.
    """
    # Treebanks handled by XPOSVocab with an empty separator.
    empty_sep_treebanks = (
        "af_afribooms", "grc_perseus", "ar_padt", "cs_cac", "cs_fictree",
        "cs_pdt", "gl_ctg", "gl_treegal", "it_isdt", "it_postwita",
        "la_perseus", "lv_lvtb", "ro_rrt", "sk_snk", "sl_sst", "uk_iu",
    )
    if shorthand in empty_sep_treebanks:
        return XPOSVocab(data, shorthand, idx=2, sep="")

    # Treebanks handled by a plain WordVocab.
    word_vocab_treebanks = (
        "grc_proiel", "hy_armtdp", "eu_bdt", "br_keb", "bxr_bdt",
        "ca_ancora", "zh_gsd", "hr_set", "cs_pud", "da_ddt", "en_ewt",
        "en_gum", "en_pud", "et_edt", "fo_oft", "fi_pud", "fi_tdt",
        "fr_gsd", "fr_sequoia", "fr_spoken", "de_gsd", "got_proiel",
        "el_gdt", "he_htb", "hi_hdtb", "hu_szeged", "ga_idt", "ja_gsd",
        "ja_modern", "kk_ktb", "kmr_mg", "la_proiel", "pcm_nsc",
        "sme_giella", "no_bokmaal", "no_nynorsk", "no_nynorsklia",
        "cu_proiel", "fro_srcmf", "fa_seraji", "pt_bosque", "ru_syntagrus",
        "ru_taiga", "sr_set", "es_ancora", "sv_pud", "th_pud", "tr_imst",
        "hsb_ufal", "ug_udt", "vi_vtb", "sl_ssj", "bg_btb",
    )
    if shorthand in word_vocab_treebanks:
        return WordVocab(data, shorthand, idx=2, ignore=["_"])

    # Remaining treebanks: XPOSVocab with a non-empty separator, checked in order.
    separated = (
        ("|", ("nl_alpino", "nl_lassysmall", "la_ittb", "sv_talbanken")),
        ("-", ("en_lines", "sv_lines", "ur_udtb")),
        (",", ("fi_ftb",)),
        ("+", ("id_gsd", "ko_gsd", "ko_kaist")),
        (":", ("pl_lfg", "pl_sz")),
    )
    for sep, treebanks in separated:
        if shorthand in treebanks:
            return XPOSVocab(data, shorthand, idx=2, sep=sep)

    raise NotImplementedError('Language shorthand "{}" not found!'.format(shorthand))
예제 #4
0
    if not os.path.exists('data/pos/{}.train.in.conllu'.format(sh)):
        raise UserWarning('Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tbash scripts/prep_pos_data.sh {}'.format(fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        mapping[key].append(sh)
        continue

    doc, metasentences = CoNLL.conll2dict(input_file='data/pos/{}.train.in.conllu'.format(sh))
    doc = Document(doc, metasentences=metasentences)
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']: # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    mapping[key].append(sh)

# Generate code. This takes the XPOS vocabulary classes selected above, and generates the
# actual factory function as seen in models.pos.xpos_vocab_factory.
first = True
with open(output_file, 'w') as f: