def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # todo(note): here is the main change
    # make sure the model is a first-order graph model, otherwise this cannot run through
    all_results = []
    all_insts = []
    with utils.Timer(tag="Run-score", info="", print_date=True):
        for cur_insts in test_iter:
            all_insts.extend(cur_insts)
            batched_arc_scores, batched_label_scores = model.score_on_batch(cur_insts)
            batched_arc_scores, batched_label_scores = BK.get_value(batched_arc_scores), BK.get_value(batched_label_scores)
            for cur_idx in range(len(cur_insts)):
                cur_len = len(cur_insts[cur_idx]) + 1  # discarding paddings
                cur_res = (batched_arc_scores[cur_idx, :cur_len, :cur_len], batched_label_scores[cur_idx, :cur_len, :cur_len])
                all_results.append(cur_res)
    # reorder to the original order
    orig_indexes = [z.inst_idx for z in all_insts]
    orig_results = [None] * len(orig_indexes)
    for new_idx, orig_idx in enumerate(orig_indexes):
        assert orig_results[orig_idx] is None
        orig_results[orig_idx] = all_results[new_idx]
    # saving
    with utils.Timer(tag="Run-write", info=f"Writing to {dconf.output_file}", print_date=True):
        import pickle
        with utils.zopen(dconf.output_file, "wb") as fd:
            for one in orig_results:
                pickle.dump(one, fd)
    utils.printing("The end.")
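# A hedged read-back sketch (not part of the original code): the file written
# above holds one pickled (arc_scores, label_scores) tuple per sentence, stored
# in the original instance order, so unpickling until EOF recovers them all.
# `load_scores` is a hypothetical helper name; it assumes utils.zopen also
# supports the "rb" mode, mirroring the "wb" used by the writer.
def load_scores(fname):
    import pickle
    results = []
    with utils.zopen(fname, "rb") as fd:
        while True:
            try:
                results.append(pickle.load(fd))  # (arc_scores, label_scores)
            except EOFError:
                break
    return results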
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # go
    rr = ParserTestingRunner(model, vpack, dconf.output_file, dconf.test, dconf.output_format)
    x = rr.run(test_iter)
    utils.printing("The end.")
def save_hits(self, fname):
    num_hits = len(self.hits)
    printing(f"Saving hit w2v num_words={num_hits:d}, embed_size={self.embed_size:d} to {fname}.")
    with zopen(fname, "w") as fd:
        tmp_words = sorted(self.hits.keys(), key=lambda k: self.wmap[k])  # use original ordering
        tmp_vecs = [self.vecs[self.wmap[k]] for k in tmp_words]
        WordVectors.save_txt(fd, tmp_words, tmp_vecs, self.sep)
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # go
    rr = MyIETestingRunner(model, vpack, dconf.output_file, dconf.test, dconf.output_format, dconf.eval_conf, release_resources=True)
    x = rr.run(test_iter)
    utils.printing("The end.")
def _load_bin(fname):
    printing("Going to load pre-trained (binary) w2v from %s ..." % fname)
    one = WordVectors()
    # rely on gensim for the binary format (pre-4.0 API, where kv.vocab maps
    # each word to an entry carrying its .index); the saving counterpart would
    # be KeyedVectors.save_word2vec_format()
    from gensim.models import KeyedVectors
    kv = KeyedVectors.load_word2vec_format(fname, binary=True)
    one.num_words, one.embed_size = len(kv.vectors), len(kv.vectors[0])
    for w, z in kv.vocab.items():
        one.vecs.append(kv.vectors[z.index])
        one.wmap[w] = len(one.words)
        one.words.append(w)
    printing("Read ok: w2v num_words=%d, embed_size=%d." % (one.num_words, one.embed_size))
    return one
def finish(self, word_filter=(lambda ww, rank, val: True), sort_by_count=True, target_range=DEFAULT_TARGET_RANGE):
    v = self.v
    if sort_by_count:
        v.v, v.final_vals = VocabBuilder.ranking_vals(self.counts_, v.pre_list, v.post_list, self.default_val_, True, word_filter=word_filter)
    else:
        tmp_counts_ = OrderedDict([(k, self.counts_[k]) for k in self.keys_])
        v.v, v.final_vals = VocabBuilder.ranking_vals(tmp_counts_, v.pre_list, v.post_list, self.default_val_, False, word_filter=word_filter)
    v.final_words = Helper.reverse_idx(v.v)
    printing("Build Vocab %s ok, from %d to %d, as %s." % (v.name, len(self.counts_), len(v), str(v)))
    # VocabBuilder._build_check(v)
    VocabBuilder._build_target_range(v, target_range[0], target_range[1])
    VocabBuilder._build_prop(v)
    return v
def filter_embed(self, wv: 'WordVectors', init_nohit=0., scale=1.0, assert_all_hit=False):
    if init_nohit <= 0.:
        get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
    else:
        get_nohit = lambda s: (Random.random_sample((s,)).astype(np.float32) - 0.5) * (2 * init_nohit)
    #
    ret = []
    res = defaultdict(int)
    for w in self.final_words:
        hit, norm_name, norm_w = wv.norm_until_hit(w)
        if hit:
            value = np.asarray(wv.get_vec(norm_w, norm=False), dtype=np.float32)
            res[norm_name] += 1
        else:
            value = get_nohit(wv.embed_size)
            # value = np.zeros((wv.embed_size,), dtype=np.float32)
            res["no-hit"] += 1
        ret.append(value)
    #
    if assert_all_hit:
        zcheck(res["no-hit"] == 0, f"Filter-embed error: assert all-hit but get no-hit of {res['no-hit']}")
    printing("Filter pre-trained embed: %s, no-hit is inited with %s." % (res, init_nohit))
    return np.asarray(ret, dtype=np.float32) * scale
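# Hypothetical usage sketch for filter_embed: initialize an embedding matrix
# for this vocab from pre-trained vectors; words that still miss after
# normalization get small uniform noise in [-init_nohit, init_nohit].
# `load_w2v` stands in for whichever loader (_load_txt / _load_bin) is wired
# up in this repo; `vocab` is an instance of this class.
# wv = load_w2v("pretrained.vec")
# emb_arr = vocab.filter_embed(wv, init_nohit=0.1)
# emb_arr.shape == (len(vocab.final_words), wv.embed_size)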
def _load_txt(fname, sep=" "):
    printing("Going to load pre-trained (txt) w2v from %s ..." % fname)
    one = WordVectors(sep=sep)
    repeated_count = 0
    with zopen(fname) as fd:
        # first line: optional "num_words embed_size" header
        line = fd.readline()
        try:
            one.num_words, one.embed_size = [int(x) for x in line.split(sep)]
            printing("Reading w2v num_words=%d, embed_size=%d." % (one.num_words, one.embed_size))
            line = fd.readline()
        except ValueError:
            printing("Reading w2v.")
        # the rest
        while len(line) > 0:
            line = line.rstrip()
            fields = line.split(sep)
            word, vec = fields[0], [float(x) for x in fields[1:]]
            # zcheck(word not in one.wmap, "Repeated key.")
            # keep the old one
            if word in one.wmap:
                repeated_count += 1
                zwarn(f"Repeated key {word}")
                line = fd.readline()
                continue
            #
            if one.embed_size is None:
                one.embed_size = len(vec)
            else:
                zcheck(len(vec) == one.embed_size, "Unmatched embed dimension.")
            one.vecs.append(vec)
            one.wmap[word] = len(one.words)
            one.words.append(word)
            line = fd.readline()
    # final
    if one.num_words is not None:
        zcheck(one.num_words == len(one.vecs) + repeated_count, "Unmatched num of words.")
    one.num_words = len(one.vecs)
    printing(f"Read ok: w2v num_words={one.num_words:d}, embed_size={one.embed_size:d}, repeat={repeated_count:d}")
    return one
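# For reference, a minimal sketch of the txt layout _load_txt expects: an
# optional "num_words embed_size" header, then one "word v1 v2 ..." entry per
# line, with fields joined by `sep` (here the default " ", embed_size=3):
#
#   2 3
#   cat 0.1 0.2 0.3
#   dog 0.4 0.5 0.6
#
# Without the header, embed_size is inferred from the first entry; duplicated
# words keep their first vector and only increment repeated_count.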
def save(self, fname):
    printing(f"Saving w2v num_words={self.num_words:d}, embed_size={self.embed_size:d} to {fname}.")
    zcheck(self.num_words == len(self.vecs), "Internal error: unmatched number!")
    with zopen(fname, "w") as fd:
        WordVectors.save_txt(fd, self.words, self.vecs, self.sep)
def filter(self, word_filter=(lambda ww, rank, val: True)):
    new_counts = VocabBuilder.filter_vals(self.counts_, word_filter)
    printing("Filter in VocabBuilder %s ok, from %d to %d." % (self.v.name, len(self.counts_), len(new_counts)))
    self.counts_ = new_counts
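# Hypothetical usage sketch tying filter and finish together: prune rare
# entries before freezing the vocab. `builder` is assumed to be a VocabBuilder
# already fed with counts; the lambda matches the (ww, rank, val) signature
# shared by both methods, here keeping words counted at least twice.
# builder.filter(word_filter=lambda ww, rank, val: val >= 2)
# v = builder.finish(sort_by_count=True)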