Example #1
def read_vocab(vocab_file, vocab_limit):
    """Read a vocabulary from a JSON mapping or a one-word-per-line text
    file and keep only entries with index below vocab_limit."""
    if vocab_file.endswith(".json"):
        vocab = load_json(vocab_file)
    else:
        vocab = {l.strip(): c for c, l in enumerate(line_reader(vocab_file))}
    assert vocab["<s>"] == 0  # the start-of-sentence token must map to index 0
    return {w: i for w, i in vocab.items() if i < vocab_limit}
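All of these examples lean on small I/O helpers (line_reader, load_json, save_json, load_npy) that the listings never show. A minimal sketch of what they plausibly look like, followed by a hypothetical read_vocab call; the helper bodies and the file name are assumptions, not the original implementations:

import json
import numpy as np

def line_reader(path, skip=0):
    # assumed helper: yield the lines of a text file, optionally
    # skipping the first `skip` lines
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= skip:
                yield line

def load_json(path):
    # assumed helper: thin wrapper around json.load
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def save_json(obj, path):
    # assumed helper: thin wrapper around json.dump
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f)

def load_npy(path):
    # assumed helper: thin wrapper around np.load
    return np.load(path)

vocab = read_vocab("vocab.txt", vocab_limit=10000)  # hypothetical file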
Example #2
def save_w2v_to_sep(dirname):
    """Split a word2vec-style text file (one word and its vector per
    line) into a vocabulary file and an embeddings-only file."""
    with open("{}/W_e.txt".format(dirname), "w") as out_f_e, \
            open("{}/W_v.txt".format(dirname), "w") as out_f_v:
        for l in line_reader("{}/W_w.txt".format(dirname)):
            w, e = l.split(" ", 1)  # word, then the rest of the line
            out_f_v.write("{}\n".format(w))
            out_f_e.write(e)
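A hypothetical round trip, assuming model/W_w.txt holds one word and its space-separated vector per line; the directory and values are illustrative:

import os

os.makedirs("model", exist_ok=True)
with open("model/W_w.txt", "w") as f:
    f.write("the 0.1 0.2 0.3\n")
    f.write("cat 0.4 0.5 0.6\n")

save_w2v_to_sep("model")
# model/W_v.txt now holds the words, model/W_e.txt the bare vectors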
Example #3
import numpy as np

def load_w2v(f):
    """
    Loads word2vec-format embeddings: a header line with the vocabulary
    size and dimensionality, then one word and its vector per line.
    """
    ws = []
    with open(f) as in_f:
        m, n = map(int, in_f.readline().strip().split())  # header: rows, dims
    e_m = np.zeros((m, n))
    for c, l in enumerate(line_reader(f, skip=1)):  # skip the header line
        w, *e = l.strip().split()
        if not w or not e:
            print("Empty w or e.")
        assert len(e) == n
        ws.append(w)
        e_m[c] = e  # numpy casts the string values to float
    assert len(ws) == e_m.shape[0]
    w_index = {w: c for c, w in enumerate(ws)}

    return w_index, e_m
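A small sanity check, reusing the line_reader sketch above; the toy file is hypothetical:

with open("toy.w2v", "w") as f:
    f.write("2 3\n")              # header: vocabulary size, dimensionality
    f.write("the 0.1 0.2 0.3\n")
    f.write("cat 0.4 0.5 0.6\n")

w_index, e_m = load_w2v("toy.w2v")
print(w_index["cat"])  # 1
print(e_m.shape)       # (2, 3)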
Example #4
def v_to_json(filename, outfilename):
    """Convert a one-word-per-line vocabulary file into a
    word -> index JSON mapping."""
    w_index = {}
    for c, l in enumerate(line_reader(filename)):
        w_index[l.strip()] = c
    save_json(w_index, outfilename)
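A hypothetical usage, relying on the save_json sketch above; the file names are illustrative:

with open("words.txt", "w") as f:
    f.write("<s>\nthe\ncat\n")

v_to_json("words.txt", "w_index.json")
# w_index.json now contains {"<s>": 0, "the": 1, "cat": 2}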
Example #5
def create(self, f, w_index=None, downcase=False):
    """Populate the collection with one parsed Instance per input line."""
    for c, l in enumerate(line_reader(f)):
        inst = Instance()
        inst.extract(l, w_index, downcase=downcase)
        self.append(inst)
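The self.append call implies the enclosing class subclasses list (or at least exposes append). Minimal stand-ins, purely illustrative, that let the method above run:

class Instance:
    # stand-in for the real Instance class, which presumably extracts
    # richer features than plain tokens
    def extract(self, l, w_index, downcase=False):
        self.tokens = (l.lower() if downcase else l).strip().split()

class Dataset(list):
    pass

Dataset.create = create  # attach the method above to a list subclass

dataset = Dataset()
dataset.create("train.txt", downcase=True)  # hypothetical input file
print(len(dataset))  # one Instance per input line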
Example #6
parser.add_argument("-n_closest",
                    type=int,
                    default=15,
                    help="Number of closest words.")
parser.add_argument("-weight_type",
                    default="pivot",
                    choices=["pivot", "context", "both", "shared"],
                    help="Whether to use pivot/context/shared embeddings.")
parser.add_argument("-skip_top",
                    type=int,
                    default=100,
                    help="Number of most frequent words to skip.")
args = parser.parse_args()
print(args.input_dir)

if args.ws_file:
    ws = [w.strip() for w in line_reader(args.ws_file)]
else:
    ws = args.ws

w_index_path = "{}/w_index.json".format(args.input_dir)
w_ind = load_json(w_index_path)
inv_w_ind = {v: k for k, v in w_ind.items()}
print("Loaded vocabulary: {}".format(len(w_ind)))

if args.weight_type == "pivot":
    model_path = "{}/W_w.npy".format(args.input_dir)
    W_w = load_npy(model_path)
elif args.weight_type == "context":
    model_path = "{}/W_c.npy".format(args.input_dir)
    W_c = load_npy(model_path)
elif args.weight_type == "both":
    # assumption: the listing is truncated here; mirroring the branches
    # above, "both" presumably loads the pivot and context matrices
    W_w = load_npy("{}/W_w.npy".format(args.input_dir))
    W_c = load_npy("{}/W_c.npy".format(args.input_dir))
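A hypothetical invocation, assuming the full script (see Example #7) is saved as nearest.py and the model directory holds w_index.json plus the .npy weight matrices:

python nearest.py -input_dir model -ws king queen -weight_type pivot -n_closest 10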
Example #7
]


parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-input_dir", help="Directory containing model and vocabulary files.", required=True)
parser.add_argument("-ws", default=words, type=str, nargs="+", help="List of words to query.")
parser.add_argument("-ws_file", help="Filepath containing a list of words to query.")
parser.add_argument("-n_closest", type=int, default=15, help="Number of closest words.")
parser.add_argument("-weight_type", default="pivot", choices=["pivot", "context", "both", "shared"],
                    help="Whether to use pivot/context/shared embeddings.")
parser.add_argument("-skip_top", type=int, default=100, help="Number of most frequent words to skip.")
args = parser.parse_args()
print(args.input_dir)

if args.ws_file:
    ws = [w.strip() for w in line_reader(args.ws_file)]
else:
    ws = args.ws

w_index_path = "{}/w_index.json".format(args.input_dir)
w_ind = load_json(w_index_path)
inv_w_ind = {v: k for k, v in w_ind.items()}
print("Loaded vocabulary: {}".format(len(w_ind)))


if args.weight_type == "pivot":
    model_path = "{}/W_w.npy".format(args.input_dir)
    W_w = load_npy(model_path)
elif args.weight_type == "context":
    model_path = "{}/W_c.npy".format(args.input_dir)
    W_c = load_npy(model_path)
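The listing stops before the actual query. Presumably it ranks words by cosine similarity in the selected weight matrix, skipping the skip_top most frequent entries. An illustrative sketch of that step, not the original script's code, assuming the vocabulary is frequency-ordered (index 0 = most frequent) and -weight_type pivot so W_w is loaded:

import numpy as np

def closest_words(w, W, w_ind, inv_w_ind, n_closest=15, skip_top=100):
    # cosine similarity between w's row and every row of W
    v = W[w_ind[w]]
    sims = W @ v / (np.linalg.norm(W, axis=1) * np.linalg.norm(v) + 1e-8)
    ranked = [i for i in np.argsort(-sims)
              if i >= skip_top and inv_w_ind[int(i)] != w]
    return [inv_w_ind[int(i)] for i in ranked[:n_closest]]

for w in ws:
    print(w, closest_words(w, W_w, w_ind, inv_w_ind,
                           args.n_closest, args.skip_top))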