def prepare_test(args, ConfType=None):
    # conf
    conf: OverallConf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    iconf = mconf.iconf
    # vocab
    vpack = IEVocabPackage.build_by_reading(conf)
    # prepare data
    test_streamer = get_data_reader(dconf.test, dconf.input_format, dconf.use_label0, dconf.noef_link0,
                                    dconf.aux_repr_test, max_evt_layers=dconf.max_evt_layers)
    # model
    model = build_model(conf.model_type, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = []  # todo(note): ignore this mode for this project
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code)
                                      for one_file, one_code in zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = model.aug_words_and_embs(extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # use bert? todo(note): no pre-compute here in testing!
    if dconf.use_bert:
        bmodel = get_berter(dconf.bconf)
        test_streamer = BerterDataAuger(test_streamer, bmodel, "aux_repr")
    #
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), iconf, False)
    return conf, model, vpack, test_iter
def aug_word2_vocab(self, stream, extra_stream, extra_embed_file: str):
    zlog(f"Aug another word vocab from streams and extra_embed_file={extra_embed_file}")
    word_builder = VocabBuilder("word2")
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
    # embeddings
    if len(extra_embed_file) > 0:
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        w2vec = WordVectors.load(extra_embed_file)
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=1.0, scale=1.0)
    else:
        zwarn("WARNING: No pretrain file for aug node!!")
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = None
    self.put_voc("word2", word_vocab)
    self.put_emb("word2", word_embed1)
def prepare_test(args, ConfType=None):
    # conf
    conf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    # vocab
    vpack = MLMVocabPackage.build_by_reading(dconf.dict_dir)
    # prepare data
    test_streamer = PreprocessStreamer(get_data_reader(dconf.test, dconf.input_format),
                                       lower_case=dconf.lower_case, norm_digit=dconf.norm_digit)
    # model
    model = build_model(conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # -----
    # augment with extra embeddings for test stream?
    extra_embed_files = dconf.vconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = dconf.vconf.test_extra_pretrain_codes
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code)
                                      for one_file, one_code in zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = aug_words_and_embs(model, extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    backoff_pos_idx = dconf.backoff_pos_idx
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer, backoff_pos_idx),
                             mconf.test_batch_size, mconf, False)
    return conf, model, vpack, test_iter
def prepare_test(args, ConfType=None):
    # conf
    conf = init_everything(args, ConfType)
    dconf, pconf = conf.dconf, conf.pconf
    iconf = pconf.iconf
    # vocab
    vpack = ParserVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = get_data_reader(dconf.test, dconf.input_format, dconf.code_test, dconf.use_label0,
                                    dconf.aux_repr_test, dconf.aux_score_test)
    # model
    model = build_model(conf.partype, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = dconf.test_extra_pretrain_codes
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code)
                                      for one_file, one_code in zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = model.aug_words_and_embs(extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), iconf, False)
    return conf, model, vpack, test_iter
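# A minimal driver sketch for the prepare_test variants above, assuming the usual command-line
# entry point. The function name main_test and the per-batch call model.inference_on_batch are
# illustrative assumptions only, not taken from the originals.
import sys

def main_test(args):
    conf, model, vpack, test_iter = prepare_test(args)
    for insts in test_iter:  # each item is one indexed and batched group of test instances
        model.inference_on_batch(insts)  # hypothetical decode call on one batch

# e.g. invoked as: main_test(sys.argv[1:])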
def filter_pembed(self, wv: WordVectors, init_nohit=0., scale=1.0, assert_all_hit=True, set_init=True):
    if init_nohit <= 0.:
        get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
    else:
        get_nohit = lambda s: (Random.random_sample((s,)).astype(np.float32) - 0.5) * (2 * init_nohit)
    #
    ret = [np.zeros((wv.embed_size,), dtype=np.float32)]  # init NIL is zero
    record = defaultdict(int)
    for ws in self.pools_hint_lexicon[1:]:
        res = np.zeros((wv.embed_size,), dtype=np.float32)
        for w in ws:
            hit, norm_name, norm_w = wv.norm_until_hit(w)
            if hit:
                value = np.asarray(wv.get_vec(norm_w, norm=False), dtype=np.float32)
                record[norm_name] += 1
            else:
                value = get_nohit(wv.embed_size)
                record["no-hit"] += 1
            res += value
        ret.append(res)
    #
    assert not assert_all_hit or record["no-hit"] == 0, \
        f"Filter-embed error: assert all-hit but get no-hit of {record['no-hit']}"
    zlog(f"Filter pre-trained Pembed: {record}, no-hit is inited with {init_nohit}.")
    ret = np.asarray(ret, dtype=np.float32) * scale
    if set_init:
        self.set_pool_init(ret)
    return ret
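# For reference, a standalone numpy-only sketch of the no-hit initialization used in filter_pembed
# above: unmatched entries are zeros when init_nohit <= 0, otherwise uniform samples in
# [-init_nohit, +init_nohit]. Plain np.random stands in here for the toolkit's Random wrapper.
import numpy as np

def make_nohit_vector(embed_size: int, init_nohit: float) -> np.ndarray:
    if init_nohit <= 0.:
        return np.zeros((embed_size,), dtype=np.float32)
    return (np.random.random_sample((embed_size,)).astype(np.float32) - 0.5) * (2 * init_nohit)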
def build_from_stream(conf: OverallConf, stream, extra_stream):
    dconf = conf.dconf
    zlog("Build vocabs from streams.")
    ret = IEVocabPackage({}, {}, dconf)
    # here, collect them all
    # -- basic inputs
    word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    lemma_builder = VocabBuilder("lemma")
    upos_builder = VocabBuilder("upos")
    ulabel_builder = VocabBuilder("ulabel")
    # -- outputs (event type, entity/filler type, arg role type) (type -> count)
    evt_type_builder = defaultdict(int)
    ef_type_builder = defaultdict(int)
    arg_role_builder = defaultdict(int)
    for inst in stream:
        # -- basic inputs
        for sent in inst.sents:
            word_builder.feed_stream(sent.words.vals)
            for w in sent.words.vals:
                char_builder.feed_stream(w)
            lemma_builder.feed_stream(sent.lemmas.vals)
            upos_builder.feed_stream(sent.uposes.vals)
            ulabel_builder.feed_stream(sent.ud_labels.vals)
        # -- outputs
        # assert inst.entity_fillers is not None, "For building vocabs, need to provide training instances!"
        assert inst.events is not None, "For building vocabs, need to provide training instances!"
        if inst.entity_fillers is not None:
            for one_ef in inst.entity_fillers:
                ef_type_builder[one_ef.type] += 1
        for one_evt in inst.events:
            evt_type_builder[one_evt.type] += 1
            if one_evt.links is not None:
                for one_arg in one_evt.links:
                    arg_role_builder[one_arg.role] += 1
    # build real hlabel-types
    hl_evt = HLabelVocab("event", conf.mconf.hl_evt, evt_type_builder)
    hl_ef = HLabelVocab("entity_filler", conf.mconf.hl_ef, ef_type_builder)
    hl_arg = HLabelVocab("arg", conf.mconf.hl_arg, arg_role_builder)
    # deal with pre-trained word embeddings
    w2vec = None
    if dconf.init_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        # collect extra words and lemmas
        extra_word_set = set()
        extra_lemma_set = set()
        for inst in extra_stream:
            for sent in inst.sents:
                for w in sent.words.vals:
                    extra_word_set.add(w)
                for w in sent.lemmas.vals:
                    extra_lemma_set.add(w)
        # must provide dconf.pretrain_file
        w2vec = WordVectors.load(dconf.pretrain_file)
        # first filter according to thresholds
        word_builder.filter(lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres)
                            or w2vec.has_key(ww))
        lemma_builder.filter(lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres)
                             or w2vec.has_key(ww))
        # then add extra ones
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        for w in extra_lemma_set:
            if w2vec.has_key(w) and (not lemma_builder.has_key_currently(w)):
                lemma_builder.feed_one(w)
        # finally build the vocab and embeds
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=dconf.pretrain_init_nohit,
                                              scale=dconf.pretrain_scale)
        lemma_vocab = lemma_builder.finish()
        lemma_embed1 = lemma_vocab.filter_embed(w2vec, init_nohit=dconf.pretrain_init_nohit,
                                                scale=dconf.pretrain_scale)
        # first build pool-embeds, the final decision will depend on each of the flags
        # todo(WARN): assert all hit?
        hl_evt_pembed = hl_evt.filter_pembed(w2vec, init_nohit=dconf.pretrain_init_nohit, assert_all_hit=False)
        hl_ef_pembed = hl_ef.filter_pembed(w2vec, init_nohit=dconf.pretrain_init_nohit, assert_all_hit=False)
        hl_arg_pembed = hl_arg.filter_pembed(w2vec, init_nohit=dconf.pretrain_init_nohit, assert_all_hit=False)
        # by-product of filtered output pre-trained embeddings for later faster processing
        if dconf.output_pretrain_filter:
            w2vec.save_hits(dconf.output_pretrain_filter)
    else:
        word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        lemma_vocab = lemma_builder.finish_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = lemma_embed1 = None
        #
        for one_cc in [conf.mconf.hl_evt, conf.mconf.hl_ef, conf.mconf.hl_arg]:
            if hasattr(one_cc, "pool_init_hint"):
                assert not one_cc.pool_init_hint, "cannot init pool because the overall pre-train-init flag is not set"
        hl_evt_pembed = hl_ef_pembed = hl_arg_pembed = None
    char_vocab = char_builder.finish()
    upos_vocab = upos_builder.finish()
    ulabel_vocab = ulabel_builder.finish()
    # =====
    # finally assign things
    ret.put_voc("word", word_vocab)
    ret.put_voc("lemma", lemma_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("upos", upos_vocab)
    ret.put_voc("ulabel", ulabel_vocab)
    ret.put_emb("word", word_embed1)
    ret.put_emb("lemma", lemma_embed1)
    # don't need to be jsonable since we are using pickle all at once
    # todo(WARN): the conf in vocab is also stored!!
    ret.put_voc("hl_evt", hl_evt)
    ret.put_voc("hl_ef", hl_ef)
    ret.put_voc("hl_arg", hl_arg)
    ret.put_emb("hl_evt", hl_evt_pembed)
    ret.put_emb("hl_ef", hl_ef_pembed)
    ret.put_emb("hl_arg", hl_arg_pembed)
    return ret
def build_from_stream(dconf: DConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    ret = ParserVocabPackage({}, {}, dconf)
    #
    word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    pos_builder = VocabBuilder("pos")
    label_builder = VocabBuilder("label")
    word_normer = ret.word_normer
    if dconf.vocab_add_prevalues:
        zlog(f"Add pre-defined values for upos({len(ParserVocabPackage.PRE_VALUES_UPOS)}) and "
             f"ulabel({len(ParserVocabPackage.PRE_VALUES_ULAB)}).")
        pos_builder.feed_stream(ParserVocabPackage.PRE_VALUES_UPOS)
        label_builder.feed_stream(ParserVocabPackage.PRE_VALUES_ULAB)
    for inst in stream:
        # todo(warn): only do special handling for words
        # there must be words
        word_builder.feed_stream(word_normer.norm_stream(inst.words.vals))
        for w in inst.words.vals:
            char_builder.feed_stream(w)
        # pos and label can be optional
        if inst.poses.has_vals():
            pos_builder.feed_stream(inst.poses.vals)
        if inst.labels.has_vals():
            label_builder.feed_stream(inst.labels.vals)
    #
    w2vec = None
    if dconf.init_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        extra_word_set = set()
        for inst in extra_stream:
            for w in word_normer.norm_stream(inst.words.vals):
                extra_word_set.add(w)
        # ----- load (possibly multiple) pretrain embeddings
        # must provide dconf.pretrain_file (there can be multiple pretrain files!)
        list_pretrain_file, list_code_pretrain = dconf.pretrain_file, dconf.code_pretrain
        list_code_pretrain.extend([""] * len(list_pretrain_file))  # pad default ones
        w2vec = WordVectors.load(list_pretrain_file[0], aug_code=list_code_pretrain[0])
        if len(list_pretrain_file) > 1:
            w2vec.merge_others([WordVectors.load(list_pretrain_file[i], aug_code=list_code_pretrain[i])
                                for i in range(1, len(list_pretrain_file))])
        # -----
        # first filter according to thresholds
        word_builder.filter(lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres)
                            or w2vec.has_key(ww))
        # then add extra ones
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=dconf.pretrain_init_nohit,
                                              scale=dconf.pretrain_scale)
    else:
        word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = None
    #
    char_vocab = char_builder.finish()
    # todo(+1): extra pos/label symbols?
    TARGET_END = VocabHelper.convert_special_pattern("unk")
    pos_vocab = pos_builder.finish(target_range=(1, TARGET_END))  # only real tags
    label_vocab = label_builder.finish(target_range=(1, TARGET_END))
    # assign
    ret.put_voc("word", word_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("pos", pos_vocab)
    ret.put_voc("label", label_vocab)
    ret.put_emb("word", word_embed1)
    #
    return ret
def build_from_stream(build_conf: MLMVocabPackageConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    ret = MLMVocabPackage({}, {})
    # -----
    if build_conf.add_ud2_pos_backoffs:
        ud2_pos_pre_list = list(VocabBuilder.DEFAULT_PRE_LIST) + [UD2_POS_UNK_MAP[p] for p in UD2_POS_LIST]
        word_builder = VocabBuilder("word", pre_list=ud2_pos_pre_list)
    else:
        word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    pos_builder = VocabBuilder("pos")
    deplabel_builder = VocabBuilder("deplabel")
    ner_builder = VocabBuilder("ner")
    if build_conf.add_ud2_prevalues:
        zlog(f"Add pre-defined UD2 values for upos({len(UD2_POS_LIST)}) and ulabel({len(UD2_LABEL_LIST)}).")
        pos_builder.feed_stream(UD2_POS_LIST)
        deplabel_builder.feed_stream(UD2_LABEL_LIST)
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
        for w in inst.word_seq.vals:
            char_builder.feed_stream(w)
        # todo(+N): currently we are assuming that we are using UD pos/deps, and directly go with the default ones
        # pos and label can be optional??
        # if inst.poses.has_vals():
        #     pos_builder.feed_stream(inst.poses.vals)
        # if inst.deplabels.has_vals():
        #     deplabel_builder.feed_stream(inst.deplabels.vals)
        if hasattr(inst, "ner_seq") and inst.ner_seq.has_vals():
            ner_builder.feed_stream(inst.ner_seq.vals)
    # ===== embeddings
    w2vec = None
    if build_conf.read_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        # ----- load (possibly multiple) pretrain embeddings
        # must provide build_conf.pretrain_file (there can be multiple pretrain files!)
        list_pretrain_file, list_code_pretrain = build_conf.pretrain_file, build_conf.pretrain_codes
        list_code_pretrain.extend([""] * len(list_pretrain_file))  # pad default ones
        w2vec = WordVectors.load(list_pretrain_file[0], aug_code=list_code_pretrain[0])
        if len(list_pretrain_file) > 1:
            w2vec.merge_others([WordVectors.load(list_pretrain_file[i], aug_code=list_code_pretrain[i])
                                for i in range(1, len(list_pretrain_file))])
        # -----
        # first filter according to thresholds
        word_builder.filter(lambda ww, rank, val: (val >= build_conf.word_fthres and rank <= build_conf.word_rthres)
                            or (build_conf.ignore_thresh_with_pretrain and w2vec.has_key(ww)))
        # then add extra ones
        if build_conf.ignore_thresh_with_pretrain:
            for w in extra_word_set:
                if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                    word_builder.feed_one(w)
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=build_conf.pretrain_init_nohit,
                                              scale=build_conf.pretrain_scale)
    else:
        word_vocab = word_builder.finish_thresh(rthres=build_conf.word_rthres, fthres=build_conf.word_fthres)
        word_embed1 = None
    #
    char_vocab = char_builder.finish()
    pos_vocab = pos_builder.finish(sort_by_count=False)
    deplabel_vocab = deplabel_builder.finish(sort_by_count=False)
    ner_vocab = ner_builder.finish()
    # assign
    ret.put_voc("word", word_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("pos", pos_vocab)
    ret.put_voc("deplabel", deplabel_vocab)
    ret.put_voc("ner", ner_vocab)
    ret.put_emb("word", word_embed1)
    #
    return ret
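# The build_from_stream and prepare_test variants above repeat the same "load the first pretrain
# file, then merge the rest" pattern. A small helper capturing it, assuming only the
# WordVectors.load / merge_others calls already used above; the helper name load_merged_pretrain
# itself is new and purely illustrative.
def load_merged_pretrain(pretrain_files, pretrain_codes):
    codes = list(pretrain_codes) + [""] * (len(pretrain_files) - len(pretrain_codes))  # pad default codes
    w2vec = WordVectors.load(pretrain_files[0], aug_code=codes[0])
    if len(pretrain_files) > 1:
        w2vec.merge_others([WordVectors.load(f, aug_code=c)
                            for f, c in zip(pretrain_files[1:], codes[1:])])
    return w2vec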