Example #1
0
 def init_vocab(self, data):
     assert self.eval == False  # for eval vocab must exist
     charvocab = CharVocab(data, self.args['shorthand'])
     wordvocab = WordVocab(data,
                           self.args['shorthand'],
                           cutoff=7,
                           lower=True)
     uposvocab = WordVocab(data, self.args['shorthand'], idx=1)
     xposvocab = xpos_vocab_factory(data, self.args['shorthand'])
     featsvocab = FeatureVocab(data, self.args['shorthand'], idx=3)
     lemmavocab = WordVocab(data,
                            self.args['shorthand'],
                            cutoff=7,
                            idx=4,
                            lower=True)
     deprelvocab = WordVocab(data, self.args['shorthand'], idx=6)
     vocab = MultiVocab({
         'char': charvocab,
         'word': wordvocab,
         'upos': uposvocab,
         'xpos': xposvocab,
         'feats': featsvocab,
         'lemma': lemmavocab,
         'deprel': deprelvocab
     })
     return vocab
Example #2
0
def get_factory(sh, fn):
    print('Resolving vocab option for {}...'.format(sh))
    train_file = 'data/pos/{}.train.in.conllu'.format(sh)
    if not os.path.exists(train_file):
        raise UserWarning(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tstanza/utils/datasets/prepare_pos_treebank.py {}'.format(
                fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        return key

    doc = CoNLL.conll2doc(input_file=train_file)
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    return key
Example #3
0
def xpos_vocab_factory(data, shorthand):
    if shorthand in ["af_afribooms", "grc_perseus", "ar_padt", "bg_btb", "hr_set", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gl_ctg", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "ja_gsd", "lv_lvtb", "lt_alksnis", "ro_nonstandard", "ro_rrt", "gd_arcosg", "sr_set", "sk_snk", "sl_ssj", "ta_ttb", "uk_iu", "gl_treegal", "la_perseus", "sl_sst"]:
        return XPOSVocab(data, shorthand, idx=2, sep="")
    elif shorthand in ["en_test", "grc_proiel", "hy_armtdp", "eu_bdt", "be_hse", "ca_ancora", "zh-hant_gsd", "zh-hans_gsdsimp", "lzh_kyoto", "cop_scriptorium", "da_ddt", "en_ewt", "en_gum", "et_edt", "fi_tdt", "fr_ftb", "fr_gsd", "fr_sequoia", "fr_spoken", "de_gsd", "de_hdt", "got_proiel", "el_gdt", "he_htb", "hi_hdtb", "hu_szeged", "ga_idt", "ja_bccwj", "la_proiel", "lt_hse", "mt_mudt", "mr_ufal", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "cu_proiel", "fro_srcmf", "orv_torot", "fa_seraji", "pt_bosque", "pt_gsd", "ru_gsd", "ru_syntagrus", "ru_taiga", "es_ancora", "es_gsd", "swl_sslc", "te_mtg", "tr_imst", "ug_udt", "vi_vtb", "wo_wtb", "bxr_bdt", "et_ewt", "kk_ktb", "kmr_mg", "olo_kkpp", "sme_giella", "hsb_ufal"]:
        return WordVocab(data, shorthand, idx=2, ignore=["_"])
    elif shorthand in ["nl_alpino", "nl_lassysmall", "la_ittb", "sv_talbanken"]:
        return XPOSVocab(data, shorthand, idx=2, sep="|")
    elif shorthand in ["en_lines", "sv_lines", "ur_udtb"]:
        return XPOSVocab(data, shorthand, idx=2, sep="-")
    elif shorthand in ["fi_ftb"]:
        return XPOSVocab(data, shorthand, idx=2, sep=",")
    elif shorthand in ["id_gsd", "ko_gsd", "ko_kaist"]:
        return XPOSVocab(data, shorthand, idx=2, sep="+")
    elif shorthand in ["pl_lfg", "pl_pdb"]:
        return XPOSVocab(data, shorthand, idx=2, sep=":")
    else:
        raise NotImplementedError('Language shorthand "{}" not found!'.format(shorthand))
        raise UserWarning(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tbash scripts/prep_pos_data.sh {}'.format(fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        mapping[key].append(sh)
        continue

    doc = Document(
        CoNLL.conll2dict(input_file='data/pos/{}.train.in.conllu'.format(sh)))
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    mapping[key].append(sh)

# Generate code. This takes the XPOS vocabulary classes selected above, and generates the
# actual factory class as seen in models.pos.xpos_vocab_factory.
first = True