# Example 1
 def init_vocab(self, data):
     """Build the training-time MultiVocab for this model.

     Constructs character, word, UPOS, XPOS, morphological-feature, lemma
     and deprel vocabularies from *data* and bundles them in a MultiVocab.
     Only valid during training: in eval mode the vocab must already exist.
     """
     assert self.eval == False  # for eval vocab must exist
     shorthand = self.args['shorthand']
     return MultiVocab({
         'char': CharVocab(data, shorthand),
         'word': WordVocab(data, shorthand, cutoff=self.cutoff, lower=True),
         'upos': WordVocab(data, shorthand, idx=1),
         # XPOS tagsets vary by treebank, so a factory picks the right class
         'xpos': xpos_vocab_factory(data, shorthand),
         'feats': FeatureVocab(data, shorthand, idx=3),
         'lemma': WordVocab(data, shorthand, cutoff=self.cutoff, idx=4, lower=True),
         'deprel': WordVocab(data, shorthand, idx=6),
     })
# Example 2
def xpos_vocab_factory(data, shorthand):
    """Return the appropriate XPOS vocabulary for a given treebank.

    Depending on the treebank (identified by its *shorthand*), XPOS tags are
    either atomic labels (handled by a plain WordVocab) or composite tags
    split on a treebank-specific separator (handled by XPOSVocab).

    Raises:
        NotImplementedError: if *shorthand* is not a known treebank.
    """
    # Treebanks whose XPOS tags are atomic labels; "_" (missing tag) is ignored.
    word_vocab_treebanks = {
        "grc_proiel", "hy_armtdp", "eu_bdt", "br_keb", "bxr_bdt",
        "ca_ancora", "zh_gsd", "hr_set", "cs_pud", "da_ddt", "en_ewt",
        "en_gum", "en_pud", "et_edt", "fo_oft", "fi_pud", "fi_tdt",
        "fr_gsd", "fr_sequoia", "fr_spoken", "de_gsd", "got_proiel",
        "el_gdt", "he_htb", "hi_hdtb", "hu_szeged", "ga_idt", "ja_gsd",
        "ja_modern", "kk_ktb", "kmr_mg", "la_proiel", "pcm_nsc",
        "sme_giella", "no_bokmaal", "no_nynorsk", "no_nynorsklia",
        "cu_proiel", "fro_srcmf", "fa_seraji", "pt_bosque", "ru_syntagrus",
        "ru_taiga", "sr_set", "es_ancora", "sv_pud", "th_pud", "tr_imst",
        "hsb_ufal", "ug_udt", "vi_vtb", "sl_ssj", "bg_btb",
    }
    # Composite-tag treebanks, keyed by the separator used to split the tag.
    # "" means the tag is split into individual characters.
    xpos_separator_treebanks = {
        "": {"af_afribooms", "grc_perseus", "ar_padt", "cs_cac", "cs_fictree",
             "cs_pdt", "gl_ctg", "gl_treegal", "it_isdt", "it_postwita",
             "la_perseus", "lv_lvtb", "ro_rrt", "sk_snk", "sl_sst", "uk_iu"},
        "|": {"nl_alpino", "nl_lassysmall", "la_ittb", "sv_talbanken"},
        "-": {"en_lines", "sv_lines", "ur_udtb"},
        ",": {"fi_ftb"},
        "+": {"id_gsd", "ko_gsd", "ko_kaist"},
        ":": {"pl_lfg", "pl_sz"},
    }

    if shorthand in word_vocab_treebanks:
        return WordVocab(data, shorthand, idx=2, ignore=["_"])
    for sep, treebanks in xpos_separator_treebanks.items():
        if shorthand in treebanks:
            return XPOSVocab(data, shorthand, idx=2, sep=sep)
    raise NotImplementedError(
        'Language shorthand "{}" not found!'.format(shorthand))
# Example 3
    def init_vocab(self, data_list):
        """Build a joint MultiVocab from multiple training files.

        Most vocabularies are built over the concatenation of all files; the
        word vocabulary is instead a union of the top ``vocab_cutoff`` words
        from each individual file, so every file contributes equally.
        Only valid during training.
        """
        assert self.eval == False  # for eval vocab must exist
        data_all = sum(data_list, [])
        charvocab = CharVocab(data_all, self.args['shorthand'])

        # construct wordvocab from multiple files
        wordvocabs = [WordVocab(data, self.args['shorthand'], cutoff=0, lower=True) for data in data_list]
        # Take the top `vocab_cutoff` units (after the special-symbol prefix)
        # from each per-file vocab and deduplicate.
        # FIX: sort the union so word IDs are deterministic across runs;
        # plain list(set(...)) ordering depends on string hash randomization,
        # which made vocabularies (and hence saved models) non-reproducible.
        wordset = sorted(set(sum([v._id2unit[len(VOCAB_PREFIX):len(VOCAB_PREFIX) + self.args['vocab_cutoff']] for v in wordvocabs], [])))
        wordvocab = wordvocabs[0]
        wordvocab._id2unit = VOCAB_PREFIX + wordset
        wordvocab._unit2id = {w: i for i, w in enumerate(wordvocab._id2unit)}
        print('Constructing a joint word vocabulary of size {} ...'.format(len(wordvocab)))

        uposvocab = WordVocab(data_all, self.args['shorthand'], idx=1)
        xposvocab = xpos_vocab_factory(data_all, self.args['shorthand'])
        featsvocab = FeatureVocab(data_all, self.args['shorthand'], idx=3)
        lemmavocab = WordVocab(data_all, self.args['shorthand'], cutoff=self.cutoff, idx=4, lower=True)
        vocab = MultiVocab({'char': charvocab,
                            'word': wordvocab,
                            'upos': uposvocab,
                            'xpos': xposvocab,
                            'feats': featsvocab,
                            'lemma': lemmavocab, })
        return vocab
# Example 4
# For each treebank, decide which vocabulary class best represents its XPOS
# tags, grouping treebanks by the chosen constructor expression.
mapping = defaultdict(list)
for sh, fn in zip(shorthands, fullnames):
    print('Resolving vocab option for {}...'.format(sh))
    if not os.path.exists('data/pos/{}.train.in.conllu'.format(sh)):
        # BUG FIX: this used to `raise UserWarning(...)`, which aborted the
        # script and made the fallback below unreachable dead code. Emit the
        # warning and fall back to WordVocab instead, as intended.
        print('Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
              'XPOS vocabulary for this treebank properly, please run the following command first:\n'
              '\tbash scripts/prep_pos_data.sh {}'.format(fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        mapping[key].append(sh)
        continue

    conll_file = CoNLLFile('data/pos/{}.train.in.conllu'.format(sh))
    data = conll_file.get(['word', 'upos', 'xpos', 'feats'], as_sentences=True)
    # Baseline: treat each XPOS tag as an atomic unit.
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        # Tag set is large; try splitting composite tags on each candidate
        # separator and keep whichever factorization yields fewer total units.
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    mapping[key].append(sh)

# Generate code. This takes the XPOS vocabulary classes selected above, and generates the
# actual factory class as seen in models.pos.xpos_vocab_factory.
first = True