# Example 1
def get_pc(data, We, weight4ind, params):
    """Compute the principal component(s) of the weighted-average sentence embeddings.

    Parameters
    ----------
    data : list of example items; each item's sentence(s) get embeddings
        populated in place via ``populate_embeddings``.
    We : word-embedding matrix, indexed by word id (rows are word vectors).
    weight4ind : mapping from word index to weight, consumed by
        ``data_io.seq2weight`` when ``params.weightfile`` is set.
    params : options object; reads ``params.task`` ("ent", "sim" or
        "sentiment"), ``params.weightfile`` and ``params.npc``.

    Returns
    -------
    ndarray
        The top ``params.npc`` principal components (``svd.components_``)
        of the sentence-embedding matrix.

    Raises
    ------
    ValueError
        If ``params.task`` is not one of the supported tasks.
    """

    def get_weighted_average(We, x, w):
        """Compute the weighted average vectors.

        ``x`` holds word indices per sentence, ``w`` the per-word weights.
        Each row is normalized by the count of nonzero weights, i.e. the
        number of real (non-padding) words in that sentence.
        """
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb

    # NOTE(review): `words` is not defined in this function or its parameters;
    # presumably a module-level word->index map — verify at the call site.
    for i in data:
        i[0].populate_embeddings(words)
        # Only sentiment examples are single sentences; other tasks are pairs.
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)

    # Dispatch on task; only the first sentence (g1x/g1mask) is used below.
    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(data)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(data, -1)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(data)
    else:
        # Bug fix: without this branch an unknown task fell through to a
        # confusing NameError on g1x below.
        raise ValueError('Task should be ent, sim, or sentiment.')

    # The weight-file re-weighting was duplicated identically in every
    # branch above; hoisted here once (behavior unchanged).
    if params.weightfile:
        g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)

    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
# Example 2
def train_util(model, train_data, dev, test, train, words, params):
    """Train *model* on *train_data* for ``params.epochs`` epochs.

    After each epoch the model is evaluated on the dev, test and train
    splits and the results are printed together with the cost of the last
    minibatch. Training can be interrupted with Ctrl-C; the total wall
    time is printed in all cases.

    Parameters
    ----------
    model : model object exposing ``train_function`` (and ``nout`` for
        the "sim" task).
    train_data : list of training examples, indexed by minibatch indices.
    dev, test, train : evaluation splits passed to ``eval.supervised_evaluate``.
    words : word->embedding lookup used by ``populate_embeddings``.
    params : options object; reads ``epochs``, ``batchsize``, ``task``,
        ``weightfile`` and ``weight4ind``.

    Raises
    ------
    ValueError
        If ``params.task`` is not "ent", "sim" or "sentiment".
    """
    start_time = time()
    try:
        for eidx in range(params.epochs):
            kf = data_io.get_minibatches_idx(len(train_data),
                                             params.batchsize,
                                             shuffle=True)
            for _, train_index in kf:
                batch = [train_data[t] for t in train_index]
                # Load the word ids; sentiment examples are single
                # sentences, all other tasks are sentence pairs.
                for i in batch:
                    i[0].populate_embeddings(words)
                    if not params.task == "sentiment":
                        i[1].populate_embeddings(words)
                # Load the data for this task.
                if params.task == "ent":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataEntailment(batch)
                elif params.task == "sim":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataSim(batch, model.nout)
                elif params.task == "sentiment":
                    (scores, g1x, g1mask) = data_io.getDataSentiment(batch)
                else:
                    # Bug fix: the message previously omitted "sentiment"
                    # even though it is a handled task above.
                    raise ValueError('Task should be ent, sim, or sentiment.')
                # Train on the minibatch, re-weighting masks when a
                # weight file was supplied.
                if not params.task == "sentiment":
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                        g2mask = data_io.seq2weight(g2x, g2mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g2x, g1mask,
                                                g2mask)
                else:
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g1mask)
                # Warn only; training deliberately continues on a bad cost.
                if np.isnan(cost) or np.isinf(cost):
                    print('NaN detected')
                # Undo the batch to save RAM.
                for i in batch:
                    i[0].representation = None
                    i[0].unpopulate_embeddings()
                    if not params.task == "sentiment":
                        i[1].representation = None
                        i[1].unpopulate_embeddings()
            # Per-epoch evaluation: "sim" reports (pearson, spearman)
            # pairs, the other tasks a single score per split.
            if params.task == "sim":
                dp, ds = eval.supervised_evaluate(model, words, dev, params)
                tp, ts = eval.supervised_evaluate(model, words, test, params)
                rp, rs = eval.supervised_evaluate(model, words, train, params)
                print(("evaluation: ", dp, ds, tp, ts, rp, rs))
            elif params.task == "ent" or params.task == "sentiment":
                ds = eval.supervised_evaluate(model, words, dev, params)
                ts = eval.supervised_evaluate(model, words, test, params)
                rs = eval.supervised_evaluate(model, words, train, params)
                print(("evaluation: ", ds, ts, rs))
            else:
                # Bug fix: message previously omitted "sentiment" here too.
                raise ValueError('Task should be ent, sim, or sentiment.')
            print(('Epoch ', (eidx + 1), 'Cost ', cost))
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("Training interupted")
    end_time = time()
    print(("total time:", (end_time - start_time)))