Example #1
def run_senseinfer(mbatch_size):
    """
    Obtain sense predictions for every word in the text.
    """
    start = time.time()
    log.info("Starting inference.")
    pivots = []
    contexts = []
    labels = []
    max_y_is_1s = []
    cur_mbatch_size = 0
    all_ss = []

    for i, (p, cs, ls, max_y_is_1, n_line) in enumerate(mbatch_skipgrams_inference(line_reader(args.corpus_path),
                                                                                      v,
                                                                                      max_window_size=args.max_window_size), 1):
        pivots.append(p)
        contexts.append(cs)
        labels.append(ls)
        max_y_is_1s.append(max_y_is_1)

        cur_mbatch_size += 1
        if cur_mbatch_size != mbatch_size:
            continue
        if pivots:
            x_p = np.array(pivots, dtype="int32")  # n_batches
            X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
            L = np.array(labels, dtype="int32")  # n_batches*n_contexts
            max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
            #M_pad = X_c > 0  # mask for padded
            assert X_c.shape[1] == L.shape[1]
            assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y) #== M_pad.shape[0]
            p, ss = modl.train(x_p, X_c, L, max_y)  # n_batches
            all_ss.extend(ss)
        cur_mbatch_size = 0
        pivots = []
        contexts = []
        labels = []
        max_y_is_1s = []

        if i % 10000 == 0:
            rep = "Number of instances: {0}; sequences: {1}".format(i, n_line)
            dyn_print(rep)
    # flush the remaining instances that did not fill a complete mini-batch
    if pivots:
        x_p = np.array(pivots, dtype="int32")  # n_batches
        X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
        L = np.array(labels, dtype="int32")  # n_batches*n_contexts
        max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
        #M_pad = X_c > 0  # mask for padded
        assert X_c.shape[1] == L.shape[1]
        assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y) #== M_pad.shape[0]
        p, ss = modl.train(x_p, X_c, L, max_y)  # n_batches
        all_ss.extend(ss)
    rep = "Number of instances: {0}; sequences: {1}".format(i, n_line)
    dyn_print(rep)
    print("\n")
    log.info("Inference completed: {}s".format(time.time()-start))

    return all_ss
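
The function above accumulates instances until a full mini-batch is collected, runs the model on it, resets the buffers, and finally flushes the leftover partial batch after the loop. A minimal, self-contained sketch of that accumulate-and-flush pattern (hypothetical names, not the bimu code):

def iterate_minibatches(items, mbatch_size, process_batch):
    # process_batch is a stand-in for modl.train; items is any iterable of instances
    buf, results = [], []
    for item in items:
        buf.append(item)
        if len(buf) == mbatch_size:
            results.extend(process_batch(buf))
            buf = []
    if buf:  # flush the final, partially filled batch
        results.extend(process_batch(buf))
    return results

# e.g. iterate_minibatches(range(7), 3, lambda b: [x * x for x in b])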
Example #2
File: text.py  Project: fangzheng354/bimu
    def save(self, save_dir):
        log.info("Saving vocabulary.")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        save_json(self.w_index, "{}/w_index.json".format(save_dir))
        save_json(self.w_cn, "{}/w_cn.json".format(save_dir))
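
The two JSON files written here are read back by the matching load step used elsewhere (e.g. v.load(...) in the inference scripts). A minimal sketch of such a counterpart, assuming a load_json helper that mirrors save_json; the actual VocabBuild.load in bimu may differ:

    def load(self, save_dir):
        # assumed counterpart to save(); reads back the word index and word counts
        log.info("Loading vocabulary.")
        self.w_index = load_json("{}/w_index.json".format(save_dir))
        self.w_cn = load_json("{}/w_cn.json".format(save_dir))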
Example #3
def run(mbatch_size, S_inferred):
    def update():
        """
        :return: updated counts of senses
        """
        # pivots can repeat, intermediate updates need to be cached
        if modl_type == SensesInference:
            np.add.at(S_inferred, (p, ss), 1)
        elif modl_type == SensesExpectationInference:
            np.add.at(S_inferred, p, ss)
        else:
            sys.exit("Wrong inference type.")


    start = time.time()
    log.info("Starting inference.")
    pivots = []
    contexts = []
    labels = []
    max_y_is_1s = []
    cur_mbatch_size = 0

    for i, (p, cs, ls, max_y_is_1, n_line) in enumerate(mbatch_skipgrams_inference(line_reader(args.corpus_path),
                                                                                      v,
                                                                                      max_window_size=args.max_window_size), 1):
        pivots.append(p)
        contexts.append(cs)
        labels.append(ls)
        max_y_is_1s.append(max_y_is_1)

        cur_mbatch_size += 1
        if cur_mbatch_size != mbatch_size:
            continue
        if pivots:
            x_p = np.array(pivots, dtype="int32")  # n_batches
            X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
            L = np.array(labels, dtype="int32")  # n_batches*n_contexts
            max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
            #M_pad = X_c > 0  # mask for padded
            assert X_c.shape[1] == L.shape[1]
            assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y) #== M_pad.shape[0]

            #if args.model == "senses" or args.model == "senses_expect":
            #
            p, ss = modl.train(x_p, X_c, L, max_y)  # n_batches
            update()
        cur_mbatch_size = 0
        pivots = []
        contexts = []
        labels = []
        max_y_is_1s = []

        if i % 10000 == 0:
            rep = "Number of instances: {0}; sequences: {1}".format(i, n_line)
            dyn_print(rep)
    print("\n")
    log.info("Inference completed: {}s".format(time.time()-start))

    return S_inferred
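
The update() closure relies on np.add.at for unbuffered, in-place accumulation: when the same pivot index appears several times in one mini-batch, every occurrence is counted, which a plain fancy-indexed += would not guarantee. A small self-contained illustration:

import numpy as np

S = np.zeros((4, 3), dtype="int32")   # 4 pivot words x 3 senses
p = np.array([1, 1, 2])               # pivot indices (index 1 repeats)
ss = np.array([0, 0, 2])              # argmax sense per instance
np.add.at(S, (p, ss), 1)
# S[1, 0] == 2; with S[p, ss] += 1 the repeated index would only be counted once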
Example #4
def run_fastskipgram(nepochs, mbatch_size):
    start = time.time()
    total_eff_len = 0
    for e in range(nepochs):
        log.info("Epoch {}.".format(e))
        losses = []
        n_instances = 0
        cur_mbatch_size = 0

        pairs = []
        labels = []

        for i, line in enumerate(line_reader(args.corpus_path), 1):
            ps, ls, eff_len = skipgrams(v.line_to_seq(line),
                                        max_window_size=args.max_window_size,
                                        n_neg=args.n_neg,
                                        sampling_tab=s_tab,
                                        subsampling_tab=ss_tab)
            pairs.extend(ps)
            labels.extend(ls)
            cur_mbatch_size += 1
            total_eff_len += eff_len
            if cur_mbatch_size != mbatch_size:
                continue
            else:
                cur_mbatch_size = 0
            #if args.optimizer == "SGDDynamic":
            #    lrate = max(args.min_lr, args.lr * (1 - total_eff_len / total_len))
            #else:
            lrate = args.lr
            if pairs:
                X = np.array(pairs, dtype="int32")
                labs = np.array(labels).reshape(-1, 1)
                #loss = sg.train(X, labs, lrate)
                loss = modl.train(X, labs)
                losses.append(loss)
                n_instances += len(labels)

            pairs = []
            labels = []

            if i % 10000 == 0:
                if not w1 or not w2:
                    sc = "-"
                else:
                    sc = sim(modl.params[0].get_value()[w1], modl.params[0].get_value()[w2])
                rep = "Number of instances: {0}; sequences: {1}; Fr/Ge pivot cos:{2}".format(n_instances,
                                                                                             i,
                                                                                             sc)
                if len(losses) > 0:
                    rep = "{} {}".format(rep, "Mean loss over last {} seqs: {}.".format(len(losses) * mbatch_size,
                                                                                        np.mean(losses)))
                    losses = []
                dyn_print(rep)
        print("\n")
    log.info("Training completed: {}s/epoch".format((time.time() - start) / nepochs))
Example #5
def write_senses(sense_preds):
    """
    Write sense predictions by appending to words,
    like in 'the/1' (here, second sense is assigned, counting from 0).
    """
    corpus_path_senses = "{}.senses".format(args.corpus_path)
    log.info("Writing to {}.".format(corpus_path_senses))
    p_count = 0
    with open(corpus_path_senses, "w") as outfile:
        for l in line_reader(args.corpus_path):
            new_l = []
            for w in l.strip().split(" "):
                new_l.append("{0}/{1}".format(w, sense_preds[p_count]))
                p_count += 1
            outfile.write(" ".join(new_l)+"\n")
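
For example, given the corpus line "the bank was closed" and (hypothetical) sense predictions [1, 0, 2, 0], the written line is "the/1 bank/0 was/2 closed/0".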
Example #6
        e += entropy(v)
    return e / len(frequent)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-input_dir", help="Directory containing model and vocabulary files.", required=True)
    parser.add_argument("-f", default="W_w", help="Model file (optional), meant for models per epoch.")
    parser.add_argument("-data_path", help="Filepath containing the SCWS dataset.", default="data/SCWS/ratings.txt")
    parser.add_argument("-win_size", default=3, type=int,
                        help="Context window size (n words to the left and n to the right).")
    parser.add_argument("-n_most_freq", type=int, help="Only consider n most freq. words from vocabulary.")
    args = parser.parse_args()

    w_index_path = "{}/w_index.json".format(args.input_dir)
    # model_path = "{}/sg.pickle".format(args.input_dir)
    log.info("Loading model.")
    w_index = load_json(w_index_path)
    if args.n_most_freq:
        w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1}
        print(len(w_index))

    embs = load_npy("{}/{}.npy".format(args.input_dir, args.f))
    c_embs = load_npy("{}/W_c.npy".format(args.input_dir))
    try:
        if args.f == "W_w":
            n = ""
        else:
            n = eval(args.f[-1])
            assert 0 <= n < 9
        bias = load_npy("{}/Wb{}.npy".format(args.input_dir, n))
    except FileNotFoundError:
Example #7
File: score.py  Project: MorLong/bimu
    def Sequence_Level(self, train_file, test_file, num_label, epochs):
        log.debug("Declaring theano vars.")
        random.seed(5)
        W1 = theano.shared(0.2 * random.random([self.win * self.dimension, self.hidden1]) - 0.1)
        W2 = theano.shared(0.2 * random.random([self.hidden1, self.hidden2]) - 0.1)
        W3 = theano.shared(0.2 * random.random([self.hidden2, self.hidden3]) - 0.1)
        U = theano.shared(0.2 * random.random([self.hidden3, num_label]) - 0.1)

        x = T.dmatrix("x")  # len(l) by win*dimension
        y = T.lvector("y")
        learn_rate = T.scalar("learn_rate")

        A1 = T.dot(x, W1)
        B1 = A1 * (A1 > 0)  # ReLU activation
        A2 = T.dot(B1, W2)
        B2 = A2 * (A2 > 0)  # ReLU activation
        A3 = T.dot(B2, W3)
        B3 = A3 * (A3 > 0)  # ReLU activation
        G = T.dot(B3, U)
        L1 = T.nnet.softmax(G)  # len(l) by num_label

        #L1=T.nnet.softmax(T.dot(T.tanh(T.dot(T.tanh(T.dot(T.tanh(T.dot(x,W1)),W2)),W3)),U))

        cost = T.nnet.categorical_crossentropy(L1, y).mean()
        gw1, gw2, gw3, gu = T.grad(cost, [W1, W2, W3, U])
        #gw_x = T.grad(cost, [x])

        log.info("Compiling theano model.")
        f1 = theano.function(inputs=[x, y, learn_rate], outputs=[cost], updates=(
            (W1, W1 - learn_rate * gw1), (W2, W2 - learn_rate * gw2), (W3, W3 - learn_rate * gw3),
            (U, U - learn_rate * gu)))

        #f2 = theano.function(inputs=[x, y], outputs=cost)
        prediction = T.argmax(L1, axis=1)
        discrepancy = prediction - y
        f3 = theano.function(inputs=[x, y], outputs=[discrepancy, prediction])
        #f4 = theano.function(inputs=[x, y], outputs=gw_x)

        alpha = self.alpha
        log.info("Read-in the training and test data.")
        open_train = open(train_file, "r")
        train_lines = open_train.readlines()
        open_test = open(test_file, "r")
        test_lines = open_test.readlines()

        log.info("Start training.")
        counter = 0
        start = time.time()
        iter_ = epochs
        for j in range(0, iter_):
            log.info("Epoch: {}...".format(j+1))
            x_ = []
            y_ = []
            for i in range(len(train_lines)):
                if i % 1000 == 0:
                    log.debug(i)
                counter = counter + 1
                current_alpha = alpha * (iter_ * len(train_lines) - counter) / (iter_ * len(train_lines))
                if current_alpha < 0.01: current_alpha = 0.01
                line_ = train_lines[i]
                G = line_.split("|")
                token_line = G[0]
                label_line = G[1]
                token_list = list(fromstring(token_line, dtype=int, sep=' '))
                x_ = self.contextwin(token_list)  # len(l) by win*dimension
                y_ = fromstring(label_line, dtype=int, sep=' ')
                f1(x_, y_, current_alpha)

            total_num = 0
            total_value = 0
            goldlabels = []
            predictions = []
            for i in range(len(test_lines)):
                line_ = test_lines[i]
                G = line_.split("|")
                token_line = G[0].strip()
                label_line = G[1].strip()

                y = fromstring(label_line, dtype=int, sep=' ')
                x = self.contextwin(list(fromstring(token_line, dtype=int, sep=' ')))
                total_num = total_num + x.shape[0]
                discrep, preds = f3(x, y)
                goldlabels.extend(list(y))
                predictions.extend(list(preds))
                total_value = total_value + x.shape[0] - count_nonzero(discrep)

            assert len(goldlabels) == len(predictions)
            log.info("f1 {}".format(f1_score(goldlabels, predictions, average="weighted")))
            acc = 1.00 * total_value / total_num
            log.info("acc " + str(acc))
        log.info("Training completed: {}s/epoch".format((time.time()-start)/iter_))
Example #8
File: score.py  Project: MorLong/bimu
    parser.add_argument("-hidden3", type=int, default=50)
    parser.add_argument("-oov", type=int, default=0)
    parser.add_argument("-tag_vocab_file", default="../../../Datasets/wsj/wsjtrain.tagvocab.json")
    parser.add_argument("-test_file", default="../../../Datasets/wsj/wsjtest")
    parser.add_argument("-train_file", default="../../../Datasets/wsj/wsjtrain")
    parser.add_argument("-vocab_file", required=True, help="Either in indexed json format or one word/line format")
    parser.add_argument("-vocab_limit", type=int, default=1000000)
    parser.add_argument("-vocab_limit_noexp", action="store_true",
                        help="Will use simple avg instead of weighted avg. for all words that exceed the args.vocab_limit (ie outside of vocab_limit-most frequent words.)")
    parser.add_argument("-win", type=int, default=7)
    parser.add_argument("-infer_side_win", type=int, default=3, help="Window size to each side during sense inference step")



    args = parser.parse_args()
    log.info("Settings: {}".format(args))
    log.info("Loading embeddings.")

    # context-sensitive inference for multisense embeddings
    if args.cembedding_file is not None:
        A = ScoreExpEmbedding(args.embedding_file, args.vocab_file, oov=args.oov, hidden1=args.hidden1,
                              hidden2=args.hidden2, hidden3=args.hidden3, win=args.win, alpha=args.alpha,
                              cembedding_file=args.cembedding_file, infer_side_win=args.infer_side_win)
    else:
        A = ScoreEmbedding(args.embedding_file, args.vocab_file, oov=args.oov, hidden1=args.hidden1,
                           hidden2=args.hidden2, hidden3=args.hidden3, win=args.win, alpha=args.alpha)  # use 0th, <s> symbol as oov

    log.info("Preparing training and test data.")
    train_file, len_tag_vocab = process(args.train_file, args.train_file+".bar", args.vocab_file, args.tag_vocab_file, args.vocab_limit)
    test_file, _ = process(args.test_file, args.test_file+".bar", args.vocab_file, args.tag_vocab_file, args.vocab_limit)
Example #9
def run(nepochs, mbatch_size):
    start = time.time()
    for e in range(nepochs):
        log.info("Epoch {}.".format(e))
        losses = []
        cur_mbatch_size = 0
        print_size = 0

        pivots = []
        contexts = []
        contexts_a = []  # affiliated contexts
        labels = []
        max_y_is_1s = []

        for i, (p, cs, cs_a, ls, max_y_is_1, n_line) in enumerate(
                mbatch_skipgrams_affil(
                    line_reader(args.corpus_path_e),
                    line_reader(args.corpus_path_f),
                    line_reader(args.corpus_path_a),
                    v_e,
                    v_f,
                    max_window_size=args.max_window_size,
                    max_window_size_f=args.max_window_size_f,
                    n_neg=args.n_neg,
                    sampling_tab=s_tab,
                    subsampling_tab=ss_tab,
                    leaveout_m0=args.leaveout_m0), 1):
            if np.all(np.array(ls) == 0):
                continue
            pivots.append(p)
            contexts.append(cs)
            contexts_a.append(cs_a)
            labels.append(ls)
            max_y_is_1s.append(max_y_is_1)

            cur_mbatch_size += 1
            if cur_mbatch_size != mbatch_size:
                continue
            if pivots:
                x_p = np.array(pivots, dtype="int32")  # n_batches
                X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
                # l' contexts:
                X_c_f = np.array(contexts_a,
                                 dtype="int32")  # n_batches*n_contexts_f
                X_c_f_mask = X_c_f > 0
                L = np.array(labels, dtype="int32")  # n_batches*n_contexts
                max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
                M_pad = X_c > 0  # mask for padded
                assert X_c.shape[1] == L.shape[1]
                assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(
                    max_y) == M_pad.shape[0]
                loss = modl.train(x_p, X_c, X_c_f, X_c_f_mask, L, max_y, M_pad)
                losses.append(loss)

            pivots = []
            contexts = []
            contexts_a = []
            labels = []
            max_y_is_1s = []

            print_size += cur_mbatch_size
            if print_size % 10000 == 0:
                if not w1 or not w2:
                    sc = "-"
                else:
                    sc = sim(modl.params[0].get_value()[w1],
                             modl.params[0].get_value()[w2])
                rep = "Number of instances: {0}; sequences: {1}; Fr/Ge pivot cos:{2}".format(
                    print_size, n_line, sc)
                if len(losses) > 0:
                    rep = "{} {}".format(
                        rep, "Mean loss {}.".format(np.mean(losses)))
                    losses = []
                dyn_print(rep)
            cur_mbatch_size = 0
        print("\n")
        if args.save_model_per_ep and e != nepochs - 1:  # save of last epoch done below
            save_weights(modl, str_app=str(e))
    log.info("Training completed: {}s/epoch".format(
        (time.time() - start) / nepochs))
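
Because context windows near sentence boundaries are shorter, the context matrices are padded with index 0, and masks such as M_pad = X_c > 0 and X_c_f_mask = X_c_f > 0 mark the real positions. A tiny illustration with hypothetical values:

import numpy as np

X_c = np.array([[5, 9, 0, 0],
                [3, 7, 2, 1]], dtype="int32")  # 0 is the padding index
M_pad = X_c > 0        # [[True, True, False, False], [True, True, True, True]]
n_real = M_pad.sum(1)  # real context words per instance: [2, 4]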
Example #10
                    rep = "{} {}".format(
                        rep, "Mean loss {}.".format(np.mean(losses)))
                    losses = []
                dyn_print(rep)
            cur_mbatch_size = 0
        print("\n")
        if args.save_model_per_ep and e != nepochs - 1:  # save of last epoch done below
            save_weights(modl, str_app=str(e))
    log.info("Training completed: {}s/epoch".format(
        (time.time() - start) / nepochs))


if not os.path.exists(save_dir):
    os.makedirs(save_dir)

log.info("Saving run details to {}.".format(save_dir))
save_run_details()

log.info("Building vocabulary.")
v_e = VocabBuild(min_freq=args.min_freq,
                 max_n_words=args.max_n_words,
                 discard_n_top_freq=args.discard_n_top_freq,
                 downcase=args.downcase)

r = line_reader(args.corpus_path_e)
v_e.create(r)
log.info("Loading l' vocabulary.")
sett_f = load_json("{}/{}".format(args.model_f_dir, args.settings_save_fname))
v_f = VocabBuild(sett_f["min_freq"], sett_f["max_n_words"],
                 sett_f["discard_n_top_freq"], sett_f["downcase"])
v_f.load(args.model_f_dir)
Example #11
File: run_bimu.py  Project: MorLong/bimu
def run(nepochs, mbatch_size):
    start = time.time()
    for e in range(nepochs):
        log.info("Epoch {}.".format(e))
        losses = []
        cur_mbatch_size = 0
        print_size = 0

        pivots = []
        contexts = []
        contexts_a = []  # affiliated contexts
        labels = []
        max_y_is_1s = []

        for i, (p,
                cs,
                cs_a,
                ls,
                max_y_is_1,
                n_line) in enumerate(mbatch_skipgrams_affil(line_reader(args.corpus_path_e),
                                                            line_reader(args.corpus_path_f),
                                                            line_reader(args.corpus_path_a),
                                                            v_e,
                                                            v_f,
                                                            max_window_size=args.max_window_size,
                                                            max_window_size_f=args.max_window_size_f,
                                                            n_neg=args.n_neg,
                                                            sampling_tab=s_tab,
                                                            subsampling_tab=ss_tab,
                                                            leaveout_m0=args.leaveout_m0), 1):
            if np.all(np.array(ls) == 0):
                continue
            pivots.append(p)
            contexts.append(cs)
            contexts_a.append(cs_a)
            labels.append(ls)
            max_y_is_1s.append(max_y_is_1)

            cur_mbatch_size += 1
            if cur_mbatch_size != mbatch_size:
                continue
            if pivots:
                x_p = np.array(pivots, dtype="int32")  # n_batches
                X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
                # l' contexts:
                X_c_f = np.array(contexts_a, dtype="int32")  # n_batches*n_contexts_f
                X_c_f_mask = X_c_f > 0
                L = np.array(labels, dtype="int32")  # n_batches*n_contexts
                max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
                M_pad = X_c > 0  # mask for padded
                assert X_c.shape[1] == L.shape[1]
                assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y) == M_pad.shape[0]
                loss = modl.train(x_p, X_c, X_c_f, X_c_f_mask, L, max_y, M_pad)
                losses.append(loss)

            pivots = []
            contexts = []
            contexts_a = []
            labels = []
            max_y_is_1s = []

            print_size += cur_mbatch_size
            if print_size % 10000 == 0:
                if not w1 or not w2:
                    sc = "-"
                else:
                    sc = sim(modl.params[0].get_value()[w1], modl.params[0].get_value()[w2])
                rep = "Number of instances: {0}; sequences: {1}; Fr/Ge pivot cos:{2}".format(print_size,
                                                                                             n_line,
                                                                                             sc)
                if len(losses) > 0:
                    rep = "{} {}".format(rep, "Mean loss {}.".format(np.mean(losses)))
                    losses = []
                dyn_print(rep)
            cur_mbatch_size = 0
        print("\n")
        if args.save_model_per_ep and e != nepochs - 1:  # save of last epoch done below
            save_weights(modl, str_app=str(e))
    log.info("Training completed: {}s/epoch".format((time.time() - start) / nepochs))
Example #12
File: run_bimu.py  Project: MorLong/bimu
                                                                                             sc)
                if len(losses) > 0:
                    rep = "{} {}".format(rep, "Mean loss {}.".format(np.mean(losses)))
                    losses = []
                dyn_print(rep)
            cur_mbatch_size = 0
        print("\n")
        if args.save_model_per_ep and e != nepochs - 1:  # save of last epoch done below
            save_weights(modl, str_app=str(e))
    log.info("Training completed: {}s/epoch".format((time.time() - start) / nepochs))


if not os.path.exists(save_dir):
    os.makedirs(save_dir)

log.info("Saving run details to {}.".format(save_dir))
save_run_details()

log.info("Building vocabulary.")
v_e = VocabBuild(min_freq=args.min_freq, max_n_words=args.max_n_words,
                 discard_n_top_freq=args.discard_n_top_freq, downcase=args.downcase)

r = line_reader(args.corpus_path_e)
v_e.create(r)
log.info("Loading l' vocabulary.")
sett_f = load_json("{}/{}".format(args.model_f_dir, args.settings_save_fname))
v_f = VocabBuild(sett_f["min_freq"],
                 sett_f["max_n_words"],
                 sett_f["discard_n_top_freq"],
                 sett_f["downcase"])
v_f.load(args.model_f_dir)
Example #13
        print("Loaded model.")
    #elif args.embs_format == "mssg":  # Neelakantan et al.'s multisense embs
    #    w_index, embs = load_mssg("{}/vectors-MSSGKMeans.gz".format(args.input_dir))
    #    if args.n_most_freq:
    #        w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1}
    #        print(len(w_index))
    #    print("Loaded model.")
    elif args.embs_format == "bimu":
        try:
            w_index_path = "{}/w_index.json".format(args.input_dir)
            w_index = load_json(w_index_path)
        # model_path = "{}/sg.pickle".format(args.input_dir)
        except FileNotFoundError:
            w_index_path = "{}/W_v.txt".format(args.input_dir)
            w_index = {l.strip(): c for c, l in enumerate(line_reader(w_index_path))}
        log.info("Loading model.")

        if args.n_most_freq:
            w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1}
            print(len(w_index))
        if args.model == "senses":
            embs = load_npy("{}/{}.npy".format(args.input_dir, args.f))
            if args.sim == "local" or args.sim == "avg_exp":
                if args.f == "W_w":
                    n = ""
                else:
                    n = eval(args.f[-1])
                    assert 0 <= n < 9
                c_embs = load_npy("{}/W_c{}.npy".format(args.input_dir, n))
            try:
                if args.f == "W_w":
Example #14
    Write sense predictions by appending to words,
    like in 'the/1' (here, second sense is assigned, counting from 0).
    """
    corpus_path_senses = "{}.senses".format(args.corpus_path)
    log.info("Writing to {}.".format(corpus_path_senses))
    p_count = 0
    with open(corpus_path_senses, "w") as outfile:
        for l in line_reader(args.corpus_path):
            new_l = []
            for w in l.strip().split(" "):
                new_l.append("{0}/{1}".format(w, sense_preds[p_count]))
                p_count += 1
            outfile.write(" ".join(new_l)+"\n")


log.info("Saving run details to {}.".format("{}/{}".format(args.model_load_dir, args.settings_save_fname)))
save_run_details()

log.info("Building vocabulary.")
v = VocabBuild(min_freq=args.min_freq, max_n_words=args.max_n_words,
               discard_n_top_freq=args.discard_n_top_freq, downcase=args.downcase)
v.load(args.model_load_dir)

#v.save(save_dir)

W_w = load_npy("{}/{}.npy".format(args.model_load_dir, "W_w"))
W_c = load_npy("{}/{}.npy".format(args.model_load_dir, "W_c"))
if args.infer_type == "argmax":
    modl_type = SensesInference
elif args.infer_type == "expect":
    modl_type = SensesExpectationInference
Example #15
            update()
        cur_mbatch_size = 0
        pivots = []
        contexts = []
        labels = []
        max_y_is_1s = []

        if i % 10000 == 0:
            rep = "Number of instances: {0}; sequences: {1}".format(i, n_line)
            dyn_print(rep)
    print("\n")
    log.info("Inference completed: {}s".format(time.time()-start))

    return S_inferred

log.info("Saving run details to {}.".format("{}/{}".format(args.model_load_dir, args.settings_save_fname)))
save_run_details()

log.info("Building vocabulary.")
v = VocabBuild(min_freq=args.min_freq, max_n_words=args.max_n_words,
               discard_n_top_freq=args.discard_n_top_freq, downcase=args.downcase)
v.load(args.model_load_dir)

#v.save(save_dir)

W_w = load_npy("{}/{}.npy".format(args.model_load_dir, "W_w"))
W_c = load_npy("{}/{}.npy".format(args.model_load_dir, "W_c"))
if args.infer_type == "argmax":
    modl_type = SensesInference
elif args.infer_type == "expect":
    modl_type = SensesExpectationInference