def read_vocab(vocab_file, vocab_limit):
    """Read a {word: index} vocabulary and keep only the vocab_limit most frequent words."""
    if vocab_file.endswith(".json"):
        vocab = load_json(vocab_file)
    else:
        vocab = {l.strip(): c for c, l in enumerate(line_reader(vocab_file))}
    assert vocab["<s>"] == 0

    return {w: i for w, i in vocab.items() if i < vocab_limit}
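# Usage sketch for read_vocab (the path below is hypothetical). The plain-text vocabulary
# is assumed to list one word per line, most frequent first, with "<s>" as the first entry,
# so the index filter keeps only the vocab_limit most frequent words:
#
#   vocab = read_vocab("data/vocab.txt", vocab_limit=50000)   # {word: index}
#   assert all(i < 50000 for i in vocab.values())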
def save_w2v_to_sep(dirname):
    """Split a word2vec-format text file (W_w.txt) into separate vocabulary (W_v.txt)
    and embedding (W_e.txt) files in the same directory."""
    with open("{}/W_e.txt".format(dirname), "w") as out_f_e, \
            open("{}/W_v.txt".format(dirname), "w") as out_f_v:
        for l in line_reader("{}/W_w.txt".format(dirname)):
            w, e = l.split(" ", 1)
            out_f_v.write("{}\n".format(w))
            out_f_e.write(e)
def load_w2v(f):
    """Loads word2vec-format embeddings (first line: vocabulary size and dimensionality)."""
    ws = []
    with open(f) as in_f:
        m, n = map(int, in_f.readline().strip().split())
    e_m = np.zeros((m, n))
    for c, l in enumerate(line_reader(f, skip=1)):  # skip the header line with dimensions
        w, *e = l.strip().split()
        if not w or not e:
            print("Empty w or e.")
        assert len(e) == n
        ws.append(w)
        e_m[c] = e
    assert len(ws) == e_m.shape[0]
    w_index = {w: c for c, w in enumerate(ws)}

    return w_index, e_m
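# Cosine-similarity usage sketch for load_w2v. The helper below is not part of this module
# and the file path passed to it is hypothetical; it only shows how the returned
# (w_index, e_m) pair can be queried. Assumes numpy is imported as np, which load_w2v
# itself already requires.
def closest_by_cosine(emb_file, word, n=10):
    """Return the n words most similar to `word` by cosine similarity."""
    w_index, e_m = load_w2v(emb_file)
    inv_index = {i: w for w, i in w_index.items()}
    v = e_m[w_index[word]]
    sims = e_m @ v / np.maximum(np.linalg.norm(e_m, axis=1) * np.linalg.norm(v), 1e-8)
    order = np.argsort(-sims)[1:n + 1]  # drop the query word itself (rank 0)
    return [(inv_index[i], float(sims[i])) for i in order]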
def v_to_json(filename, outfilename):
    w_index = {}
    for c, l in enumerate(line_reader(filename)):
        w_index[l.strip()] = c
    save_json(w_index, outfilename)
def create(self, f, w_index=None, downcase=False):
    """Populate the collection with one Instance per line of f."""
    for c, l in enumerate(line_reader(f)):
        inst = Instance()
        inst.extract(l, w_index, downcase=downcase)
        self.append(inst)
]
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-input_dir", help="Directory containing model and vocabulary files.", required=True)
parser.add_argument("-ws", default=words, type=str, nargs="+", help="List of words to query.")
parser.add_argument("-ws_file", help="Filepath containing a list of words to query.")
parser.add_argument("-n_closest", type=int, default=15, help="Number of closest words.")
parser.add_argument("-weight_type", default="pivot", choices=["pivot", "context", "both", "shared"],
                    help="Whether to use pivot/context/shared embeddings.")
parser.add_argument("-skip_top", type=int, default=100, help="Number of most frequent words to skip.")
args = parser.parse_args()
print(args.input_dir)

if args.ws_file:
    ws = [w.strip() for w in line_reader(args.ws_file)]
else:
    ws = args.ws

w_index_path = "{}/w_index.json".format(args.input_dir)
w_ind = load_json(w_index_path)
inv_w_ind = {v: k for k, v in w_ind.items()}
print("Loaded vocabulary: {}".format(len(w_ind)))

if args.weight_type == "pivot":
    model_path = "{}/W_w.npy".format(args.input_dir)
    W_w = load_npy(model_path)
elif args.weight_type == "context":
    model_path = "{}/W_c.npy".format(args.input_dir)
    W_c = load_npy(model_path)
elif args.weight_type == "both":
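# The script is cut off above after selecting the weight matrix. Below is a minimal sketch
# of the neighbour lookup that would follow, assuming cosine similarity over the pivot
# matrix W_w and the variables defined above (ws, w_ind, inv_w_ind, args); the helper name
# print_closest is hypothetical and not part of the original script.
import numpy as np

def print_closest(W, query_words, w_ind, inv_w_ind, n_closest=15, skip_top=100):
    """Print the n_closest neighbours of each query word by cosine similarity,
    skipping the skip_top most frequent words (assumed to have the lowest indices)."""
    norms = np.maximum(np.linalg.norm(W, axis=1), 1e-8)
    for w in query_words:
        if w not in w_ind:
            print("{} not in vocabulary.".format(w))
            continue
        v = W[w_ind[w]]
        sims = (W @ v) / (norms * max(np.linalg.norm(v), 1e-8))
        order = np.argsort(-sims)
        hits = [i for i in order if i != w_ind[w] and i >= skip_top][:n_closest]
        print(w, [(inv_w_ind[i], round(float(sims[i]), 3)) for i in hits])

# e.g. print_closest(W_w, ws, w_ind, inv_w_ind, args.n_closest, args.skip_top)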