def prepare_test(args, ConfType=None):
    # conf
    conf: OverallConf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    iconf = mconf.iconf
    # vocab
    vpack = IEVocabPackage.build_by_reading(conf)
    # prepare data
    test_streamer = get_data_reader(dconf.test, dconf.input_format, dconf.use_label0, dconf.noef_link0,
                                    dconf.aux_repr_test, max_evt_layers=dconf.max_evt_layers)
    # model
    model = build_model(conf.model_type, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = []  # todo(note): ignore this mode for this project
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code)
                                      for one_file, one_code in zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = model.aug_words_and_embs(extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # use bert? todo(note): no pre-compute here in testing!
    if dconf.use_bert:
        bmodel = get_berter(dconf.bconf)
        test_streamer = BerterDataAuger(test_streamer, bmodel, "aux_repr")
    #
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), iconf, False)
    return conf, model, vpack, test_iter
def aug_word2_vocab(self, stream, extra_stream, extra_embed_file: str):
    zlog(f"Aug another word vocab from streams and extra_embed_file={extra_embed_file}")
    word_builder = VocabBuilder("word2")
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
    # embeddings
    if len(extra_embed_file) > 0:
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        w2vec = WordVectors.load(extra_embed_file)
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=1.0, scale=1.0)
    else:
        zwarn("WARNING: No pretrain file for aug node!!")
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = None
    self.put_voc("word2", word_vocab)
    self.put_emb("word2", word_embed1)
def prepare_test(args, ConfType=None):
    # conf
    conf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    # vocab
    vpack = MLMVocabPackage.build_by_reading(dconf.dict_dir)
    # prepare data
    test_streamer = PreprocessStreamer(get_data_reader(dconf.test, dconf.input_format),
                                       lower_case=dconf.lower_case, norm_digit=dconf.norm_digit)
    # model
    model = build_model(conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # -----
    # augment with extra embeddings for test stream?
    extra_embed_files = dconf.vconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = dconf.vconf.test_extra_pretrain_codes
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code)
                                      for one_file, one_code in zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = aug_words_and_embs(model, extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    backoff_pos_idx = dconf.backoff_pos_idx
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer, backoff_pos_idx),
                             mconf.test_batch_size, mconf, False)
    return conf, model, vpack, test_iter
def prepare_test(args, ConfType=None):
    # conf
    conf = init_everything(args, ConfType)
    dconf, pconf = conf.dconf, conf.pconf
    iconf = pconf.iconf
    # vocab
    vpack = ParserVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = get_data_reader(dconf.test, dconf.input_format, dconf.code_test, dconf.use_label0,
                                    dconf.aux_repr_test, dconf.aux_score_test)
    # model
    model = build_model(conf.partype, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = dconf.test_extra_pretrain_codes
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code)
                                      for one_file, one_code in zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = model.aug_words_and_embs(extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), iconf, False)
    return conf, model, vpack, test_iter
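# A minimal driver sketch for the prepare_test variants above, assuming the usual command-line
# entry point. The function name main_test and the per-batch call model.inference_on_batch are
# illustrative assumptions only, not taken from the originals.
import sys

def main_test(args):
    conf, model, vpack, test_iter = prepare_test(args)
    for insts in test_iter:  # each item is one indexed and batched group of test instances
        model.inference_on_batch(insts)  # hypothetical decode call on one batch

# e.g. invoked as: main_test(sys.argv[1:])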
def filter_pembed(self, wv: WordVectors, init_nohit=0., scale=1.0, assert_all_hit=True, set_init=True):
    if init_nohit <= 0.:
        get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
    else:
        get_nohit = lambda s: (Random.random_sample((s,)).astype(np.float32) - 0.5) * (2 * init_nohit)
    #
    ret = [np.zeros((wv.embed_size,), dtype=np.float32)]  # init NIL is zero
    record = defaultdict(int)
    for ws in self.pools_hint_lexicon[1:]:
        res = np.zeros((wv.embed_size,), dtype=np.float32)
        for w in ws:
            hit, norm_name, norm_w = wv.norm_until_hit(w)
            if hit:
                value = np.asarray(wv.get_vec(norm_w, norm=False), dtype=np.float32)
                record[norm_name] += 1
            else:
                value = get_nohit(wv.embed_size)
                record["no-hit"] += 1
            res += value
        ret.append(res)
    #
    assert not assert_all_hit or record["no-hit"] == 0, \
        f"Filter-embed error: assert all-hit but get no-hit of {record['no-hit']}"
    zlog(f"Filter pre-trained Pembed: {record}, no-hit is inited with {init_nohit}.")
    ret = np.asarray(ret, dtype=np.float32) * scale
    if set_init:
        self.set_pool_init(ret)
    return ret
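# For reference, a standalone numpy-only sketch of the no-hit initialization used in filter_pembed
# above: unmatched entries are zeros when init_nohit <= 0, otherwise uniform samples in
# [-init_nohit, +init_nohit]. Plain np.random stands in here for the toolkit's Random wrapper.
import numpy as np

def make_nohit_vector(embed_size: int, init_nohit: float) -> np.ndarray:
    if init_nohit <= 0.:
        return np.zeros((embed_size,), dtype=np.float32)
    return (np.random.random_sample((embed_size,)).astype(np.float32) - 0.5) * (2 * init_nohit)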
def build_from_stream(conf: OverallConf, stream, extra_stream):
    dconf = conf.dconf
    zlog("Build vocabs from streams.")
    ret = IEVocabPackage({}, {}, dconf)
    # here, collect them all
    # -- basic inputs
    word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    lemma_builder = VocabBuilder("lemma")
    upos_builder = VocabBuilder("upos")
    ulabel_builder = VocabBuilder("ulabel")
    # -- outputs (event type, entity/filler type, arg role type) (type -> count)
    evt_type_builder = defaultdict(int)
    ef_type_builder = defaultdict(int)
    arg_role_builder = defaultdict(int)
    for inst in stream:
        # -- basic inputs
        for sent in inst.sents:
            word_builder.feed_stream(sent.words.vals)
            for w in sent.words.vals:
                char_builder.feed_stream(w)
            lemma_builder.feed_stream(sent.lemmas.vals)
            upos_builder.feed_stream(sent.uposes.vals)
            ulabel_builder.feed_stream(sent.ud_labels.vals)
        # -- outputs
        # assert inst.entity_fillers is not None, "For building vocabs, need to provide training instances!"
        assert inst.events is not None, "For building vocabs, need to provide training instances!"
        if inst.entity_fillers is not None:
            for one_ef in inst.entity_fillers:
                ef_type_builder[one_ef.type] += 1
        for one_evt in inst.events:
            evt_type_builder[one_evt.type] += 1
            if one_evt.links is not None:
                for one_arg in one_evt.links:
                    arg_role_builder[one_arg.role] += 1
    # build real hlabel-types
    hl_evt = HLabelVocab("event", conf.mconf.hl_evt, evt_type_builder)
    hl_ef = HLabelVocab("entity_filler", conf.mconf.hl_ef, ef_type_builder)
    hl_arg = HLabelVocab("arg", conf.mconf.hl_arg, arg_role_builder)
    # deal with pre-trained word embeddings
    w2vec = None
    if dconf.init_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        # collect extra words and lemmas
        extra_word_set = set()
        extra_lemma_set = set()
        for inst in extra_stream:
            for sent in inst.sents:
                for w in sent.words.vals:
                    extra_word_set.add(w)
                for w in sent.lemmas.vals:
                    extra_lemma_set.add(w)
        # must provide dconf.pretrain_file
        w2vec = WordVectors.load(dconf.pretrain_file)
        # first filter according to thresholds
        word_builder.filter(lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres)
                            or w2vec.has_key(ww))
        lemma_builder.filter(lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres)
                             or w2vec.has_key(ww))
        # then add extra ones
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        for w in extra_lemma_set:
            if w2vec.has_key(w) and (not lemma_builder.has_key_currently(w)):
                lemma_builder.feed_one(w)
        # finally build the vocab and embeds
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=dconf.pretrain_init_nohit,
                                              scale=dconf.pretrain_scale)
        lemma_vocab = lemma_builder.finish()
        lemma_embed1 = lemma_vocab.filter_embed(w2vec, init_nohit=dconf.pretrain_init_nohit,
                                                scale=dconf.pretrain_scale)
        # first build pool-embeds, the final decision will depend on each of the flags
        # todo(WARN): assert all hit?
        hl_evt_pembed = hl_evt.filter_pembed(w2vec, init_nohit=dconf.pretrain_init_nohit, assert_all_hit=False)
        hl_ef_pembed = hl_ef.filter_pembed(w2vec, init_nohit=dconf.pretrain_init_nohit, assert_all_hit=False)
        hl_arg_pembed = hl_arg.filter_pembed(w2vec, init_nohit=dconf.pretrain_init_nohit, assert_all_hit=False)
        # by-product of filtered output pre-trained embeddings for later faster processing
        if dconf.output_pretrain_filter:
            w2vec.save_hits(dconf.output_pretrain_filter)
    else:
        word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        lemma_vocab = lemma_builder.finish_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = lemma_embed1 = None
        #
        for one_cc in [conf.mconf.hl_evt, conf.mconf.hl_ef, conf.mconf.hl_arg]:
            if hasattr(one_cc, "pool_init_hint"):
                assert not one_cc.pool_init_hint, "cannot init pool because the overall pre-train-init flag is not set"
        hl_evt_pembed = hl_ef_pembed = hl_arg_pembed = None
    char_vocab = char_builder.finish()
    upos_vocab = upos_builder.finish()
    ulabel_vocab = ulabel_builder.finish()
    # =====
    # finally assign things
    ret.put_voc("word", word_vocab)
    ret.put_voc("lemma", lemma_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("upos", upos_vocab)
    ret.put_voc("ulabel", ulabel_vocab)
    ret.put_emb("word", word_embed1)
    ret.put_emb("lemma", lemma_embed1)
    # don't need to be jsonable since we are using pickle all at once
    # todo(WARN): the conf in vocab is also stored!!
    ret.put_voc("hl_evt", hl_evt)
    ret.put_voc("hl_ef", hl_ef)
    ret.put_voc("hl_arg", hl_arg)
    ret.put_emb("hl_evt", hl_evt_pembed)
    ret.put_emb("hl_ef", hl_ef_pembed)
    ret.put_emb("hl_arg", hl_arg_pembed)
    return ret
def build_from_stream(dconf: DConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    ret = ParserVocabPackage({}, {}, dconf)
    #
    word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    pos_builder = VocabBuilder("pos")
    label_builder = VocabBuilder("label")
    word_normer = ret.word_normer
    if dconf.vocab_add_prevalues:
        zlog(f"Add pre-defined values for upos({len(ParserVocabPackage.PRE_VALUES_UPOS)}) and "
             f"ulabel({len(ParserVocabPackage.PRE_VALUES_ULAB)}).")
        pos_builder.feed_stream(ParserVocabPackage.PRE_VALUES_UPOS)
        label_builder.feed_stream(ParserVocabPackage.PRE_VALUES_ULAB)
    for inst in stream:
        # todo(warn): only do special handling for words
        # there must be words
        word_builder.feed_stream(word_normer.norm_stream(inst.words.vals))
        for w in inst.words.vals:
            char_builder.feed_stream(w)
        # pos and label can be optional
        if inst.poses.has_vals():
            pos_builder.feed_stream(inst.poses.vals)
        if inst.labels.has_vals():
            label_builder.feed_stream(inst.labels.vals)
    #
    w2vec = None
    if dconf.init_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        extra_word_set = set()
        for inst in extra_stream:
            for w in word_normer.norm_stream(inst.words.vals):
                extra_word_set.add(w)
        # ----- load (possibly multiple) pretrain embeddings
        # must provide dconf.pretrain_file (there can be multiple pretrain files!)
        list_pretrain_file, list_code_pretrain = dconf.pretrain_file, dconf.code_pretrain
        list_code_pretrain.extend([""] * len(list_pretrain_file))  # pad default ones
        w2vec = WordVectors.load(list_pretrain_file[0], aug_code=list_code_pretrain[0])
        if len(list_pretrain_file) > 1:
            w2vec.merge_others([WordVectors.load(list_pretrain_file[i], aug_code=list_code_pretrain[i])
                                for i in range(1, len(list_pretrain_file))])
        # -----
        # first filter according to thresholds
        word_builder.filter(lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres)
                            or w2vec.has_key(ww))
        # then add extra ones
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=dconf.pretrain_init_nohit,
                                              scale=dconf.pretrain_scale)
    else:
        word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = None
    #
    char_vocab = char_builder.finish()
    # todo(+1): extra pos/label symbols?
    TARGET_END = VocabHelper.convert_special_pattern("unk")
    pos_vocab = pos_builder.finish(target_range=(1, TARGET_END))  # only real tags
    label_vocab = label_builder.finish(target_range=(1, TARGET_END))
    # assign
    ret.put_voc("word", word_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("pos", pos_vocab)
    ret.put_voc("label", label_vocab)
    ret.put_emb("word", word_embed1)
    #
    return ret
def build_from_stream(build_conf: MLMVocabPackageConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    ret = MLMVocabPackage({}, {})
    # -----
    if build_conf.add_ud2_pos_backoffs:
        ud2_pos_pre_list = list(VocabBuilder.DEFAULT_PRE_LIST) + [UD2_POS_UNK_MAP[p] for p in UD2_POS_LIST]
        word_builder = VocabBuilder("word", pre_list=ud2_pos_pre_list)
    else:
        word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    pos_builder = VocabBuilder("pos")
    deplabel_builder = VocabBuilder("deplabel")
    ner_builder = VocabBuilder("ner")
    if build_conf.add_ud2_prevalues:
        zlog(f"Add pre-defined UD2 values for upos({len(UD2_POS_LIST)}) and ulabel({len(UD2_LABEL_LIST)}).")
        pos_builder.feed_stream(UD2_POS_LIST)
        deplabel_builder.feed_stream(UD2_LABEL_LIST)
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
        for w in inst.word_seq.vals:
            char_builder.feed_stream(w)
        # todo(+N): currently we are assuming that we are using UD pos/deps, and directly go with the default ones
        # pos and label can be optional??
        # if inst.poses.has_vals():
        #     pos_builder.feed_stream(inst.poses.vals)
        # if inst.deplabels.has_vals():
        #     deplabel_builder.feed_stream(inst.deplabels.vals)
        if hasattr(inst, "ner_seq") and inst.ner_seq.has_vals():
            ner_builder.feed_stream(inst.ner_seq.vals)
    # ===== embeddings
    w2vec = None
    if build_conf.read_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        # ----- load (possibly multiple) pretrain embeddings
        # must provide build_conf.pretrain_file (there can be multiple pretrain files!)
        list_pretrain_file, list_code_pretrain = build_conf.pretrain_file, build_conf.pretrain_codes
        list_code_pretrain.extend([""] * len(list_pretrain_file))  # pad default ones
        w2vec = WordVectors.load(list_pretrain_file[0], aug_code=list_code_pretrain[0])
        if len(list_pretrain_file) > 1:
            w2vec.merge_others([WordVectors.load(list_pretrain_file[i], aug_code=list_code_pretrain[i])
                                for i in range(1, len(list_pretrain_file))])
        # -----
        # first filter according to thresholds
        word_builder.filter(lambda ww, rank, val: (val >= build_conf.word_fthres and rank <= build_conf.word_rthres)
                            or (build_conf.ignore_thresh_with_pretrain and w2vec.has_key(ww)))
        # then add extra ones
        if build_conf.ignore_thresh_with_pretrain:
            for w in extra_word_set:
                if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                    word_builder.feed_one(w)
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=build_conf.pretrain_init_nohit,
                                              scale=build_conf.pretrain_scale)
    else:
        word_vocab = word_builder.finish_thresh(rthres=build_conf.word_rthres, fthres=build_conf.word_fthres)
        word_embed1 = None
    #
    char_vocab = char_builder.finish()
    pos_vocab = pos_builder.finish(sort_by_count=False)
    deplabel_vocab = deplabel_builder.finish(sort_by_count=False)
    ner_vocab = ner_builder.finish()
    # assign
    ret.put_voc("word", word_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("pos", pos_vocab)
    ret.put_voc("deplabel", deplabel_vocab)
    ret.put_voc("ner", ner_vocab)
    ret.put_emb("word", word_embed1)
    #
    return ret
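# The build_from_stream and prepare_test variants above repeat the same "load the first pretrain
# file, then merge the rest" pattern. A small helper capturing it, assuming only the
# WordVectors.load / merge_others calls already used above; the helper name load_merged_pretrain
# itself is new and purely illustrative.
def load_merged_pretrain(pretrain_files, pretrain_codes):
    codes = list(pretrain_codes) + [""] * (len(pretrain_files) - len(pretrain_codes))  # pad default codes
    w2vec = WordVectors.load(pretrain_files[0], aug_code=codes[0])
    if len(pretrain_files) > 1:
        w2vec.merge_others([WordVectors.load(f, aug_code=c)
                            for f, c in zip(pretrain_files[1:], codes[1:])])
    return w2vec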