Example #1
import numpy as np
from sklearn.decomposition import TruncatedSVD

import data_io  # project module from the SIF repository

def get_pc(data, We, weight4ind, params):
    "Compute the principal components"

    def get_weighted_average(We, x, w):
        "Compute the weighted average vectors"
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb

    # note: 'words' (the word-to-index map) is not a parameter here;
    # it is assumed to be defined at module level
    for i in data:
        i[0].populate_embeddings(words)
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)
    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(data, -1)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
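
The components returned here are what the SIF scheme subtracts from each sentence vector. A minimal sketch of that removal step, assuming emb is the matrix produced by get_weighted_average and pc is the return value of get_pc:

def remove_pc(emb, pc):
    # subtract each sentence's projection onto the principal component(s)
    return emb - emb.dot(pc.T).dot(pc)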
Example #2
import numpy as np

import data_io  # project module from the SIF repository

def getAccSentiment(model, words, f, params=None):
    with open(f, 'r') as fin:
        lines = fin.readlines()
    preds = []
    golds = []
    seq1 = []
    ct = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        score = i[1]
        X1 = data_io.getSeq(p1, words)
        seq1.append(X1)
        ct += 1
        if ct % 100 == 0:
            x1, m1 = data_io.prepare_data(seq1)
            if params and params.weightfile:
                m1 = data_io.seq2weight(x1, m1, params.weight4ind)
            scores = model.scoring_function(x1, m1)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
        golds.append(score)
    if len(seq1) > 0:
        x1, m1 = data_io.prepare_data(seq1)
        if params and params.weightfile:
            m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        scores = model.scoring_function(x1, m1)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return accSentiment(preds, golds)
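
accSentiment itself is not shown on this page. A hypothetical stand-in that binarizes the model scores at 0.5 and measures agreement with the gold labels (the exact metric in the SIF codebase may differ):

import numpy as np

def accSentiment(preds, golds):
    # hypothetical helper: threshold scores into binary labels and
    # compare against the gold labels read from the file as strings
    preds = np.asarray(preds) >= 0.5
    golds = np.asarray([float(g) for g in golds]) >= 0.5
    return float(np.mean(preds == golds))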
Example #3
import numpy as np
from scipy.stats import pearsonr, spearmanr

import data_io  # project module from the SIF repository

def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    with open(f, 'r') as fin:
        lines = fin.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
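
Each line of the input file is expected to hold "sentence1<TAB>sentence2<TAB>gold score", as the parsing above implies. The scoring_function argument must accept (We, x1, x2, m1, m2, params); a minimal sketch of a compatible function, reusing the weighted average from Example #1 and skipping principal-component removal for brevity:

import numpy as np

def weighted_average_cosine(We, x1, x2, m1, m2, params):
    def avg(x, w):
        emb = np.zeros((x.shape[0], We.shape[1]))
        for i in range(x.shape[0]):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb
    e1, e2 = avg(x1, m1), avg(x2, m2)
    # row-wise cosine similarity between the two embedding matrices
    return np.sum(e1 * e2, axis=1) / (
        np.linalg.norm(e1, axis=1) * np.linalg.norm(e2, axis=1))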
Example #4
def embedding_sentence_cosine_similarity(s1, s2):
    # Word2Indx, Index2Weight, Word2vector and rmpc are assumed to be
    # module-level globals, initialized as in the final snippet on this page
    word_idx_seq_of_sentence, mask = data_io.sentences2idx([s1, s2], Word2Indx)  # array of word indices and the binary mask marking where a word is present
    print(s1,s2)
    print('word_idx_seq_of_sentence')
    print(word_idx_seq_of_sentence)
    print('mask')
    print(mask)
    word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask, Index2Weight) # get word weights
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, param) 
    s1_embed = embedding[0]
    s1_embed = embedding[0]
    s2_embed = embedding[1]

    # scipy's distance.cosine returns 1 - cosine similarity, so convert
    # back to a similarity, as the function name promises
    return 1 - distance.cosine(s1_embed, s2_embed)
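
Hypothetical usage, assuming those globals have been set up:

print(embedding_sentence_cosine_similarity(
    'a man is playing a guitar',
    'someone plays the guitar'))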
Example #5
from sklearn.metrics.pairwise import cosine_similarity  # assumed source of cosine_similarity below

def cosine_distance_by_sentence_vector(s1, s2):
    # s1 and s2 are pre-tokenized sentences (lists of tokens); Word2Indx,
    # Index2Weight, Word2vector and rmpc are again assumed to be
    # module-level globals, as in Example #4
    word_idx_seq_of_sentence, mask = data_io.sentences2idx(
        [' '.join(s1), ' '.join(s2)], Word2Indx
    )  # array of word indices and the binary mask marking where a word is present

    word_weight_of_sentence = data_io.seq2weight(
        word_idx_seq_of_sentence, mask, Index2Weight)  # get word weights
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(Word2vector,
                                            word_idx_seq_of_sentence,
                                            word_weight_of_sentence, param)
    s1_embed = embedding[0]
    s2_embed = embedding[1]

    # note: despite the function's name, this returns a similarity;
    # sklearn's cosine_similarity also expects 2-D inputs, hence the reshape
    return cosine_similarity(s1_embed.reshape(1, -1),
                             s2_embed.reshape(1, -1))[0, 0]
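
Hypothetical usage with tokenized input (contrast with Example #4, which takes raw strings):

print(cosine_distance_by_sentence_vector(
    ['a', 'man', 'is', 'playing', 'a', 'guitar'],
    ['someone', 'plays', 'the', 'guitar']))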
Example #6
import sys
from time import time

import numpy as np

import data_io, eval  # project modules from the SIF repository

def train_util(model, train_data, dev, test, train, words, params):
    "Utility function for training the model"
    start_time = time()
    try:
        for eidx in range(params.epochs):
            kf = data_io.get_minibatches_idx(len(train_data),
                                             params.batchsize,
                                             shuffle=True)
            uidx = 0
            for _, train_index in kf:
                uidx += 1
                batch = [train_data[t] for t in train_index]
                # load the word ids
                for i in batch:
                    i[0].populate_embeddings(words)
                    if not params.task == "sentiment":
                        i[1].populate_embeddings(words)
                # load the data
                if params.task == "ent":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataEntailment(batch)
                elif params.task == "sim":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataSim(batch, model.nout)
                elif params.task == "sentiment":
                    (scores, g1x, g1mask) = data_io.getDataSentiment(batch)
                else:
                    raise ValueError('Task should be ent, sim, or sentiment.')
                # train
                if not params.task == "sentiment":
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                        g2mask = data_io.seq2weight(g2x, g2mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g2x, g1mask,
                                                g2mask)
                else:
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g1mask)
                if np.isnan(cost) or np.isinf(cost):
                    print('NaN detected')
                # undo batch to save RAM
                for i in batch:
                    i[0].representation = None
                    i[0].unpopulate_embeddings()
                    if not params.task == "sentiment":
                        i[1].representation = None
                        i[1].unpopulate_embeddings()
            # evaluate
            if params.task == "sim":
                dp, ds = eval.supervised_evaluate(model, words, dev, params)
                tp, ts = eval.supervised_evaluate(model, words, test, params)
                rp, rs = eval.supervised_evaluate(model, words, train, params)
                print("evaluation: ", dp, ds, tp, ts, rp, rs)
            elif params.task == "ent" or params.task == "sentiment":
                ds = eval.supervised_evaluate(model, words, dev, params)
                ts = eval.supervised_evaluate(model, words, test, params)
                rs = eval.supervised_evaluate(model, words, train, params)
                print("evaluation: ", ds, ts, rs)
            else:
                raise ValueError('Task should be ent, sim, or sentiment.')
            print('Epoch ', (eidx + 1), 'Cost ', cost)
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("Training interupted")
    end_time = time()
    print("total time:", (end_time - start_time))
Example #7
# input
import data_io, params, SIF_embedding  # project modules from the SIF repository

wordfile = '../data/glove.840B.300d.txt'  # word vector file, can be downloaded from the GloVe website
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer'
]

# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(
    weightfile,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(
    words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(
    sentences, words
)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(
    We, x, w, params)  # embedding[i,:] is the embedding for sentence i
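
For reference, SIF_embedding.SIF_embedding boils down to the two steps shown elsewhere on this page: a weighted average of word vectors per sentence, followed by removal of the first rmpc principal component(s). A condensed sketch, assuming the same inputs as above:

import numpy as np
from sklearn.decomposition import TruncatedSVD

def sif_embedding_sketch(We, x, w, npc=1):
    # weighted average of word vectors for each sentence
    emb = np.array([w[i].dot(We[x[i]]) / np.count_nonzero(w[i])
                    for i in range(x.shape[0])])
    # remove the first npc principal component(s), as rmpc controls
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(emb)
    pc = svd.components_
    return emb - emb.dot(pc.T).dot(pc)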