Example 1
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # note: the main change is here
    # the model must be a first-order graph model, otherwise scoring will fail
    all_results = []
    all_insts = []
    with utils.Timer(tag="Run-score", info="", print_date=True):
        for cur_insts in test_iter:
            all_insts.extend(cur_insts)
            batched_arc_scores, batched_label_scores = model.score_on_batch(
                cur_insts)
            batched_arc_scores, batched_label_scores = BK.get_value(
                batched_arc_scores), BK.get_value(batched_label_scores)
            for cur_idx in range(len(cur_insts)):
                cur_len = len(cur_insts[cur_idx]) + 1
                # slice off the padded positions, keeping only the valid length (+1 for the root)
                cur_res = (batched_arc_scores[cur_idx, :cur_len, :cur_len],
                           batched_label_scores[cur_idx, :cur_len, :cur_len])
                all_results.append(cur_res)
    # reorder to the original order
    orig_indexes = [z.inst_idx for z in all_insts]
    orig_results = [None] * len(orig_indexes)
    for new_idx, orig_idx in enumerate(orig_indexes):
        assert orig_results[orig_idx] is None
        orig_results[orig_idx] = all_results[new_idx]
    # save the per-sentence score tuples, one pickle record per sentence
    import pickle
    with utils.Timer(tag="Run-write",
                     info=f"Writing to {dconf.output_file}",
                     print_date=True):
        with utils.zopen(dconf.output_file, "wb") as fd:
            for one in orig_results:
                pickle.dump(one, fd)
    utils.printing("The end.")
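Since the results above are written with one pickle.dump call per sentence, reading them back takes repeated pickle.load calls until EOF. A minimal sketch of such a reader (load_scores is a hypothetical helper, not part of the original code; it assumes the file can be opened in plain binary mode):

import pickle

def load_scores(path):
    results = []
    with open(path, "rb") as fd:
        while True:
            try:
                # one (arc_scores, label_scores) tuple per sentence
                results.append(pickle.load(fd))
            except EOFError:
                break
    return results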
Example 2
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # run the parser testing pipeline over the test set
    rr = ParserTestingRunner(model, vpack, dconf.output_file, dconf.test, dconf.output_format)
    rr.run(test_iter)
    utils.printing("The end.")
Example 3
 def save_hits(self, fname):
     num_hits = len(self.hits)
     printing(f"Saving hit w2v num_words={num_hits:d}, embed_size={self.embed_size:d} to {fname}.")
     with zopen(fname, "w") as fd:
         tmp_words = sorted(self.hits.keys(), key=lambda k: self.wmap[k])  # use original ordering
         tmp_vecs = [self.vecs[self.wmap[k]] for k in tmp_words]
         WordVectors.save_txt(fd, tmp_words, tmp_vecs, self.sep)
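The sort key above restores the original file order of the hit words by ranking them with the word-to-index map. A tiny illustration of the pattern (hypothetical toy data):

wmap = {"the": 0, "cat": 1, "sat": 2}
hits = {"sat", "the"}
ordered = sorted(hits, key=lambda k: wmap[k])  # -> ["the", "sat"]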
Example 4
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # run the IE testing pipeline over the test set
    rr = MyIETestingRunner(model,
                           vpack,
                           dconf.output_file,
                           dconf.test,
                           dconf.output_format,
                           dconf.eval_conf,
                           release_resources=True)
    rr.run(test_iter)
    utils.printing("The end.")
Example 5
 def _load_bin(fname):
     printing("Going to load pre-trained (binary) w2v from %s ..." % fname)
     one = WordVectors()
     #
     from gensim.models import KeyedVectors
     #
     kv = KeyedVectors.load_word2vec_format(fname, binary=True)
     one.num_words, one.embed_size = len(kv.vectors), len(kv.vectors[0])
     # note: kv.vocab is the gensim<4.0 API; gensim>=4.0 renamed it to key_to_index
     for w, z in kv.vocab.items():
         one.vecs.append(kv.vectors[z.index])
         one.wmap[w] = len(one.words)
         one.words.append(w)
     printing("Read ok: w2v num_words=%d, embed_size=%d." % (one.num_words, one.embed_size))
     return one
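The kv.vocab attribute used above exists only in gensim < 4.0; gensim >= 4.0 replaced it with the key_to_index mapping. A minimal sketch of the same loading loop under the newer API (load_bin_gensim4 is a hypothetical helper):

from gensim.models import KeyedVectors

def load_bin_gensim4(fname):
    kv = KeyedVectors.load_word2vec_format(fname, binary=True)
    words, vecs, wmap = [], [], {}
    for w, idx in kv.key_to_index.items():
        vecs.append(kv.vectors[idx])  # row idx of the embedding matrix
        wmap[w] = len(words)
        words.append(w)
    return words, vecs, wmap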
Example 6
 def finish(self, word_filter=(lambda ww, rank, val: True), sort_by_count=True, target_range=DEFAULT_TARGET_RANGE):
     v = self.v
     if sort_by_count:
         # rank the entries by their counts
         v.v, v.final_vals = VocabBuilder.ranking_vals(
             self.counts_, v.pre_list, v.post_list, self.default_val_, True, word_filter=word_filter)
     else:
         # keep the original insertion order of the keys
         tmp_counts_ = OrderedDict([(k, self.counts_[k]) for k in self.keys_])
         v.v, v.final_vals = VocabBuilder.ranking_vals(
             tmp_counts_, v.pre_list, v.post_list, self.default_val_, False, word_filter=word_filter)
     v.final_words = Helper.reverse_idx(v.v)
     printing("Build Vocab %s ok, from %d to %d, as %s." % (v.name, len(self.counts_), len(v), str(v)))
     #
     VocabBuilder._build_check(v)
     VocabBuilder._build_target_range(v, target_range[0], target_range[1])
     VocabBuilder._build_prop(v)
     return v
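Helper.reverse_idx above presumably inverts the word-to-index map v.v into an index-to-word list; a minimal sketch of that inversion, under the assumption that v.v maps each word to a unique index in [0, len(v.v)):

def reverse_idx(w2i):
    i2w = [None] * len(w2i)
    for w, i in w2i.items():
        i2w[i] = w
    return i2w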
Example 7
 def filter_embed(self, wv: 'WordVectors', init_nohit=0., scale=1.0, assert_all_hit=False):
     if init_nohit <= 0.:
         # zero-init for words missing from the pretrained vectors
         get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
     else:
         # uniform init in [-init_nohit, init_nohit] for missing words
         get_nohit = lambda s: (Random.random_sample((s,)).astype(np.float32) - 0.5) * (2 * init_nohit)
     #
     ret = []
     res = defaultdict(int)
     for w in self.final_words:
         hit, norm_name, norm_w = wv.norm_until_hit(w)
         if hit:
             value = np.asarray(wv.get_vec(norm_w, norm=False), dtype=np.float32)
             res[norm_name] += 1
         else:
             # no pretrained hit: fall back to the configured init
             value = get_nohit(wv.embed_size)
             res["no-hit"] += 1
         ret.append(value)
     #
     if assert_all_hit:
         zcheck(res["no-hit"] == 0, f"Filter-embed error: asserted all-hit but got no-hit count of {res['no-hit']}")
     printing("Filter pre-trained embed: %s; no-hit entries are initialized with init_nohit=%s." % (res, init_nohit))
     return np.asarray(ret, dtype=np.float32) * scale
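The same hit-or-fallback pattern, as a self-contained numpy sketch (hypothetical names; pretrained is assumed to be a plain dict mapping word -> vector):

import numpy as np

def build_embed_matrix(vocab_words, pretrained, embed_size, init_nohit=0.1, scale=1.0):
    rng = np.random.default_rng(0)
    rows = []
    for w in vocab_words:
        if w in pretrained:
            rows.append(np.asarray(pretrained[w], dtype=np.float32))
        else:
            # uniform noise in [-init_nohit, init_nohit] for missing words
            rows.append((rng.random(embed_size).astype(np.float32) - 0.5) * (2 * init_nohit))
    return np.asarray(rows, dtype=np.float32) * scale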
Example 8
 def _load_txt(fname, sep=" "):
     printing("Going to load pre-trained (txt) w2v from %s ..." % fname)
     one = WordVectors(sep=sep)
     repeated_count = 0
     with zopen(fname) as fd:
         # first line
         line = fd.readline()
         try:
             one.num_words, one.embed_size = [int(x) for x in line.split(sep)]
             printing("Reading w2v num_words=%d, embed_size=%d." % (one.num_words, one.embed_size))
             line = fd.readline()
         except ValueError:
             # no header line: infer the embed size from the first data line instead
             printing("Reading w2v (no header line).")
         # the rest
         while len(line) > 0:
             line = line.rstrip()
             fields = line.split(sep)
             word, vec = fields[0], [float(x) for x in fields[1:]]
             # on a repeated key, keep the first occurrence and skip this line
             if word in one.wmap:
                 repeated_count += 1
                 zwarn(f"Repeat key {word}")
                 line = fd.readline()
                 continue
             #
             if one.embed_size is None:
                 one.embed_size = len(vec)
             else:
                 zcheck(len(vec) == one.embed_size, "Unmatched embed dimension.")
             one.vecs.append(vec)
             one.wmap[word] = len(one.words)
             one.words.append(word)
             line = fd.readline()
     # final
     if one.num_words is not None:
         zcheck(one.num_words == len(one.vecs) + repeated_count, "Unmatched num of words.")
     one.num_words = len(one.vecs)
     printing(f"Read ok: w2v num_words={one.num_words:d}, embed_size={one.embed_size:d}, repeat={repeated_count:d}")
     return one
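For reference, the loader above parses the standard word2vec text layout: an optional header line "num_words<sep>embed_size", then one word per line followed by its values. A minimal sketch that writes such a file (write_w2v_txt is a hypothetical helper, not the module's WordVectors.save_txt):

def write_w2v_txt(path, words, vecs, sep=" "):
    with open(path, "w") as fd:
        fd.write(f"{len(words)}{sep}{len(vecs[0])}\n")  # header: num_words embed_size
        for w, v in zip(words, vecs):
            fd.write(w + sep + sep.join(f"{x:.6f}" for x in v) + "\n")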
Example 9
 def save(self, fname):
     printing(f"Saving w2v num_words={self.num_words:d}, embed_size={self.embed_size:d} to {fname}.")
     zcheck(self.num_words == len(self.vecs), "Internal error: unmatched number!")
     with zopen(fname, "w") as fd:
         WordVectors.save_txt(fd, self.words, self.vecs, self.sep)
Example 10
 def filter(self, word_filter=(lambda ww, rank, val: True)):
     new_counts = VocabBuilder.filter_vals(self.counts_, word_filter)
     printing("Filter in VocabBuilder %s ok, from %d to %d." % (self.v.name, len(self.counts_), len(new_counts)))
     self.counts_ = new_counts