Esempio n. 1
0
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    """Score the sentence pairs in a TSV file and correlate with gold scores.

    Every line of the file is split on tabs: columns 0 and 1 are the two
    sentences, column 2 is parsed as the float gold score.

    :param We: passed through unchanged to ``scoring_function``
    :param words: passed to ``data_io.getSeqs`` together with each pair
    :param f: path of the tab-separated file to read
    :param weight4ind: passed to ``data_io.seq2weight`` for both sides
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params)``
    :param params: passed through unchanged to ``scoring_function``
    :return: ``(pearson, spearman)`` correlation coefficients between the
        squeezed scores and the gold scores
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the handle even if a malformed line raises
    # while parsing (the previous version never closed the file).
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    # Debug output of the first example. The parenthesised single-argument
    # form prints the same text under both Python 2 and Python 3.
    print(seq1[0])
    print(seq2[0])
    print(scores[0])
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
Esempio n. 2
0
def getAccSentiment(model,words,f, params=[]):
    """Score the sentences of a TSV file in batches and compute accuracy.

    Every line is split on tabs: column 0 is the sentence, column 1 is kept
    verbatim (as a string) as the gold label. Sentences are scored in
    batches of 100 and the collected predictions are handed, together with
    the gold labels, to ``accSentiment``.

    :param model: object providing ``scoring_function(x1, m1)``
    :param words: passed to ``data_io.getSeq`` together with each sentence
    :param f: path of the tab-separated file to read
    :param params: optional settings object; when truthy and its
        ``weightfile`` attribute is truthy, masks are converted with
        ``data_io.seq2weight(x1, m1, params.weight4ind)``. The ``[]`` default
        is only read, never mutated.
    :return: whatever ``accSentiment(preds, golds)`` returns
    """
    batch_size = 100

    def _score_batch(batch):
        # Turn one batch of sequences into a flat list of predictions.
        x1, m1 = data_io.prepare_data(batch)
        if params and params.weightfile:
            m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        scores = np.squeeze(model.scoring_function(x1, m1))
        # A batch holding a single sentence squeezes down to a 0-d array
        # whose tolist() is a bare scalar; list.extend() would then raise
        # TypeError. atleast_1d keeps the result a list in that case and is
        # a no-op for every larger batch.
        return np.atleast_1d(scores).tolist()

    preds = []
    golds = []
    batch = []
    # The context manager closes the handle even if parsing or scoring
    # raises (the previous version never closed the file).
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            score = fields[1]
            batch.append(data_io.getSeq(p1, words))
            golds.append(score)
            if len(batch) == batch_size:
                preds.extend(_score_batch(batch))
                batch = []
    # Score whatever is left over after the last full batch.
    if batch:
        preds.extend(_score_batch(batch))
    return accSentiment(preds,golds)
Esempio n. 3
0
def sim_badSents(We, words, weight4ind, scoring_function, params, fpc, sent1,
                 sent2):
    """Score a single sentence pair and map the score through ``x * 2 + 3``.

    :param We: passed through unchanged to ``scoring_function``
    :param words: passed to ``data_io.getSeqs`` together with the pair
    :param weight4ind: passed to ``data_io.seq2weight`` for both sides
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params, fpc)``
    :param params: passed through unchanged to ``scoring_function``
    :param fpc: passed through unchanged to ``scoring_function``
    :param sent1: first sentence of the pair
    :param sent2: second sentence of the pair
    :return: the squeezed score multiplied by 2 and shifted by 3
    """
    first, second = data_io.getSeqs(sent1, sent2, words)

    # Each side is prepared as a batch containing exactly one sequence.
    idx_a, mask_a = data_io.prepare_data([first])
    idx_b, mask_b = data_io.prepare_data([second])
    weights_a = data_io.seq2weight(idx_a, mask_a, weight4ind)
    weights_b = data_io.seq2weight(idx_b, mask_b, weight4ind)

    raw = scoring_function(We, idx_a, idx_b, weights_a, weights_b, params,
                           fpc)
    return np.squeeze(raw) * 2 + 3
Esempio n. 4
0
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params, fpc,
                       test_name):
    """Score the sentence pairs in a TSV file; report Pearson r and RMSE.

    Every line of the file is split on tabs: columns 0 and 1 are the two
    sentences, column 2 is parsed as the float gold score.

    :param We: passed through unchanged to ``scoring_function``
    :param words: passed to ``data_io.getSeqs`` together with each pair
    :param f: path of the tab-separated file to read
    :param weight4ind: passed to ``data_io.seq2weight`` for both sides
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params, fpc)``
    :param params: passed through unchanged to ``scoring_function``
    :param fpc: passed through unchanged to ``scoring_function``
    :param test_name: not used by this function; kept so existing callers
        keep working
    :return: ``(pearson, rmse)`` where ``rmse`` is the square root of
        ``mean_squared_error(golds, preds)``
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the handle even if a malformed line raises
    # while parsing (the previous version never closed the file).
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    golds = np.asarray(golds)
    scores = scoring_function(We, x1, x2, m1, m2, params, fpc)
    preds = np.squeeze(scores)

    print(preds)
    # Square root of the mean squared error, i.e. the RMSE (the value was
    # previously held in a variable misleadingly called MSE).
    rmse = sqrt(mean_squared_error(golds, preds))
    return pearsonr(preds, golds)[0], rmse
Esempio n. 5
0
def getCorrelation(model,words,f, params=[]):
    """Score the sentence pairs in a TSV file with ``model`` and correlate.

    Every line of the file is split on tabs: columns 0 and 1 are the two
    sentences, column 2 is parsed as the float gold score.

    :param model: object providing ``scoring_function(x1, x2, m1, m2)``
    :param words: passed to ``data_io.getSeqs`` together with each pair
    :param f: path of the tab-separated file to read
    :param params: optional settings object; when truthy and its
        ``weightfile`` attribute is truthy, both masks are converted with
        ``data_io.seq2weight(..., params.weight4ind)``. The ``[]`` default is
        only read, never mutated.
    :return: ``(pearson, spearman)`` correlation coefficients between the
        squeezed scores and the gold scores
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the handle even if a malformed line raises
    # while parsing (the previous version never closed the file).
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        m2 = data_io.seq2weight(x2, m2, params.weight4ind)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
Esempio n. 6
0
def prepare_first_pc(We, words, weight4ind, generation_function, params, fpc):
    """Read ``../data/<fpc>`` line by line and feed it to ``generation_function``.

    Each line of the file is converted with ``data_io.getSeq``; the
    resulting sequences are prepared and weighted, then passed on.

    :param We: passed through unchanged to ``generation_function``
    :param words: passed to ``data_io.getSeq`` together with each line
    :param weight4ind: passed to ``data_io.seq2weight``
    :param generation_function: callable invoked as
        ``generation_function(We, x, m, params, fpc)``; its result is
        discarded
    :param params: passed through unchanged to ``generation_function``
    :param fpc: file name, resolved relative to ``../data/`` and also
        forwarded as the last argument of ``generation_function``
    :return: None
    """
    print("reading file: {}.".format(fpc))
    path = os.path.join("../data/", fpc)
    # The context manager closes the handle even if conversion raises
    # (the previous version never closed the file).
    with open(path, 'r') as handle:
        seq = [data_io.getSeq(line, words) for line in handle]
    x, m = data_io.prepare_data(seq)
    m = data_io.seq2weight(x, m, weight4ind)
    generation_function(We, x, m, params, fpc)
Esempio n. 7
0
def sim_getCorrelation1(We, words, file_index, weight4ind, scoring_function,
                        params):
    """Score sentence pairs whose gold scores live in a separate file.

    ``file_index[0]`` holds one tab-separated sentence pair per line and
    ``file_index[1]`` holds the gold score for the line with the same
    number. Both sentences are lower-cased before conversion. Pairs whose
    score line cannot be parsed as a float are skipped.

    :param We: passed through unchanged to ``scoring_function``
    :param words: passed to ``data_io.getSeqs`` together with each pair
    :param file_index: indexable holding the pair-file path at position 0
        and the score-file path at position 1
    :param weight4ind: passed to ``data_io.seq2weight`` for both sides
    :param scoring_function: callable invoked as
        ``scoring_function(We, x1, x2, m1, m2, params)``
    :param params: passed through unchanged to ``scoring_function``
    :return: Pearson correlation coefficient between the squeezed scores
        and the gold scores of the pairs that were kept
    :raises IndexError: if the score file has fewer lines than the pair
        file, or a pair line has no tab-separated second column
    """
    # Context managers close both handles (the previous version leaked them).
    with open(file_index[0], 'r') as pair_file:
        lines = pair_file.readlines()
    with open(file_index[1], 'r') as score_file:
        score_lines = score_file.readlines()

    # zip() below would silently truncate; keep the original failure mode
    # when a pair has no matching score line.
    if len(score_lines) < len(lines):
        raise IndexError(
            "score file has %d lines but pair file has %d"
            % (len(score_lines), len(lines)))

    golds = []
    seq1 = []
    seq2 = []
    for line, raw_score in zip(lines, score_lines):
        fields = line.split("\t")
        p1 = fields[0].lower()
        p2 = fields[1].lower()
        try:
            score = float(raw_score)
        except ValueError:
            # Unparseable (e.g. blank) gold score: skip this pair. Only the
            # parse failure is tolerated; the previous bare ``except`` also
            # hid unrelated errors and KeyboardInterrupt.
            continue
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0]
def sentences2idx(texts, words):
    """
  Take in data, output array of word indices that can be fed into the algorithms.

  Each text is stripped of surrounding whitespace and double quotes, cleaned
  with ``utils.clean_text``, split on single spaces, filtered against the
  stop-word collection, truncated to ``MAX_WORDS`` tokens and converted with
  ``data_io.getSeq``.

  :param texts: List of texts
  :param words: passed to ``data_io.getSeq`` together with each cleaned text
  :return: x1, m1 as returned by ``data_io.prepare_data``. x1[i, :] is the word indices in sentence i, m1[i,:] is the mask for sentence i (0 means no word at the location)
  """
    # Loop-invariant: fetch the stop-word collection once instead of once
    # per text.
    stopwords = utils.get_stopwords()
    seq = []
    for t in texts:
        # Doing some cleaning of the text
        text = t.strip().strip('"')
        text_clean = utils.clean_text(text)
        tokens = [w for w in text_clean.split(" ") if w not in stopwords]
        tokens = tokens[:MAX_WORDS]
        seq.append(data_io.getSeq(' '.join(tokens), words))
    x1, m1 = data_io.prepare_data(seq)
    return x1, m1
Esempio n. 9
0
# Demo script: embed four toy sentences, pickle the embedding to disk and
# print the pairwise cosine-distance matrix.

# Each sentence is pre-tokenised into a list of words by splitting on
# whitespace.
sentences = [
    'the lion is the king of the jungle'.split(),
    'tigers hunt alone at night'.split(), 'long live the emperor'.split(),
    'we call him little bobby tables'.split()
]

# Disabled alternative: load the sentences from a pickle file instead.
# import pickle
# sentences = pickle.load(open('tiger.pd', 'rb'))

db = data_io.setup_db()

# Disabled one-off step that would populate the database from the word and
# weight files.
# weights = data_io.weights_from_file(weightfile, weightpara)
# data_io.glove_to_db(wordfile, db,  weights=weights)

# load sentences
idx_mat, weight_mat, data = data_io.prepare_data(sentences, db)

# set parameters
# NOTE(review): this rebinds the name ``params`` from whatever provided
# ``params.params`` (presumably an imported module) to the new instance, so
# the statement can only run once per interpreter session.
params = params.params()
# NOTE(review): ``rmpc`` is not defined anywhere in the visible code; it
# must be defined earlier in the file for this line to work.
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(idx_mat, weight_mat, data, params)

# Persist the embedding with pickle, then show the indexed sentences so the
# rows/columns of the distance matrix below can be matched to them.
with open('svdump.pd', 'wb') as f:
    pickle.dump(embedding, f)
pprint.pprint(list(enumerate(sentences)))

# NOTE(review): the trailing comma is Python 2 print-statement syntax for
# "no newline"; under Python 3 it merely builds a discarded one-element
# tuple and the newline is still printed.
print("Cosine dist"),
# Condensed pairwise cosine distances expanded to a full square matrix.
pprint.pprint(
    scipy.spatial.distance.squareform(
        scipy.spatial.distance.pdist(embedding, 'cosine')))