def evaluateAll():
    """Evaluate negated Levenshtein distance as a similarity baseline on SICK.

    Reads the tab-separated test split (pair id, sentence A, sentence B,
    gold score), calibrates the raw distances with metrics.calibration,
    evaluates overall, then re-evaluates within each gold-score band.
    """
    levendistances = []
    scores = []
    # 'with' guarantees the dataset file is closed (the original leaked the handle).
    with open("../dataset/sick/test.txt") as testds:
        for l in testds:
            splits = l.split("\t")
            sen1 = splits[1]
            sen2 = splits[2]
            # Gold relatedness score rounded to one decimal place.
            score = float("%.1f" % float(splits[3]))
            scores.append(score)
            # Negate the edit distance so larger values mean "more similar".
            levendistances.append(-levendistance.leven(sen1, sen2))
    calibrated = metrics.calibration(levendistances)
    metrics.evaluate(calibrated, scores)
    # Bucket pairs by gold-score band: index 0 -> [1,2), 1 -> [2,3),
    # 2 -> [3,4), 3 -> [4,5].  int(5.0) - 1 would be 4 (out of range),
    # so a perfect 5.0 is pinned to the last band.
    partitionScores = [[], [], [], []]
    calibratedScores = [[], [], [], []]
    for i in range(len(scores)):
        position = 3 if scores[i] == 5.0 else int(scores[i]) - 1
        partitionScores[position].append(scores[i])
        calibratedScores[position].append(calibrated[i])
    print(partitionScores[1][1:4])
    print(calibratedScores[1][1:4])
    # NOTE(review): argument order here (gold, calibrated) is swapped relative
    # to the overall call above (calibrated, gold) — confirm metrics.evaluate
    # is symmetric or fix the ordering.
    for i in range(4):
        metrics.evaluate(partitionScores[i], calibratedScores[i])
def predict(numt):
    """Score SICK test pairs with a trained LDA topic model.

    numt: number of LDA topics, used as num_features of the similarity index.

    For each pair, both sentences are projected into topic space and the
    cosine similarity between them is taken as the predicted score; the raw
    similarities are calibrated and evaluated against the gold scores,
    overall and per gold-score band.
    """
    lda = models.LdaModel.load("../dataset/sick/model.lda")
    # lda = models.LdaModel.load("../dataset/sick/modeltfidf.lda")
    dictionary = corpora.Dictionary.load("../dataset/sick/sick.dict")

    def splitSent(sent):
        # Tokenize on commas and spaces, dropping empty tokens.
        return [word for word in re.split(",| ", sent) if word != ""]

    simscores = []
    scores = []
    # 'with' guarantees the dataset file is closed (the original leaked the handle).
    with open("../dataset/sick/test.txt") as testds:
        for l in testds:
            items = l.split("\t")
            txt1 = dictionary.doc2bow(splitSent(items[1]))
            txt2 = dictionary.doc2bow(splitSent(items[2]))
            # Index both topic vectors, then query with txt2:
            # sim[0] is the similarity between the two sentences.
            index = similarities.MatrixSimilarity(lda[[txt1, txt2]],
                                                  num_features=numt)
            sim = index[lda[txt2]]
            simscores.append(sim[0])
            # Gold relatedness score rounded to one decimal place.
            scores.append(float("%.1f" % float(items[3])))
    calibrated = metrics.calibration(simscores)
    # print calibrated
    # print scores
    metrics.evaluate(calibrated, scores)
    # Bucket pairs by gold-score band: index 0 -> [1,2), 1 -> [2,3),
    # 2 -> [3,4), 3 -> [4,5]; a perfect 5.0 is pinned to the last band.
    partitionScores = [[], [], [], []]
    calibratedScores = [[], [], [], []]
    for i in range(len(scores)):
        position = 3 if scores[i] == 5.0 else int(scores[i]) - 1
        partitionScores[position].append(scores[i])
        calibratedScores[position].append(calibrated[i])
    print(partitionScores[1][1:4])
    print(calibratedScores[1][1:4])
    # NOTE(review): argument order (gold, calibrated) is swapped relative to
    # the overall evaluate call above — confirm metrics.evaluate is symmetric.
    for i in range(4):
        metrics.evaluate(partitionScores[i], calibratedScores[i])
#losses = tf.reduce_mean(loss) train_op = tf.train.GradientDescentOptimizer(0.03).minimize(losses) lenth = len(inputs1) with tf.Session() as sess: sess.run(init) print "\n" for epoch in range(num_epoch): for i in range(lenth / batch_num): (data1, data2, labels) = next_batch(batch_num, inputs1, inputs2, originalTraining) sess.run(train_op, feed_dict={ x1: data1, x2: data2, pivot: labels }) transform_result = sess.run(prediction, feed_dict={ x1: test1[:lenthtest], x2: test2[:lenthtest], pivot: [[0, 0, 0, 0, 0]] }) newScores = [] for item in transform_result: newScores.append(item[0]) calibrated = metrics.calibration(newScores) metrics.evaluate(calibrated, originalScores)
# Bag-of-embeddings similarity graph: pool the (frozen) word embeddings of
# each sentence into one vector and take the row-wise dot product of the two.
embedding = emb
W = tf.Variable(emb, trainable=False, name="W")  # frozen pretrained embedding matrix
# Gold labels as 5-column rows; fed below but not consumed by the 'product' op.
pivot = tf.placeholder(tf.float32, shape=[None, 5], name="pivot")
# Token-id sequences (batch, time) and per-sentence length factors.
leftseqs=tf.placeholder(tf.int32, shape=[None, None], name="leftseqs")
leftlength=tf.placeholder(tf.float32, shape=[None], name="leftlength")
rightseqs=tf.placeholder(tf.int32, shape=[None, None], name='rightseqs')
rightlength=tf.placeholder(tf.float32, shape=[None], name="rightlength")
leftEmbedding = tf.nn.embedding_lookup(W, leftseqs)
rightEmbedding = tf.nn.embedding_lookup(W, rightseqs)
# Sum embeddings over the time axis, then scale each row by the fed length factor.
# NOTE(review): this MULTIPLIES by leftlength/rightlength — for the names
# '...Average' to be accurate the feeds must hold reciprocal lengths (1/len);
# confirm against the code that builds testllength/testrlength.
leftSum = tf.reduce_sum(leftEmbedding, axis=1)
leftAverage = tf.transpose(tf.multiply(tf.transpose(leftSum), leftlength))
rightSum = tf.reduce_sum(rightEmbedding, axis=1)
rightAverage = tf.transpose(tf.multiply(tf.transpose(rightSum), rightlength))
# Row-wise dot product of the two pooled sentence vectors (unnormalized similarity).
product=tf.reduce_sum(tf.multiply(leftAverage, rightAverage), axis=1)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(init)
    print "\n"
    transform_result = sess.run(product , feed_dict={leftseqs: testlbatch, rightseqs: testrbatch, leftlength:testllength, rightlength:testrlength, pivot: testscores})
    print transform_result
    print len(transform_result)
    newScores=metrics.calibration(transform_result)
    # NOTE(review): fed 'testscores' above but evaluates against 'testscore'
    # here — confirm both variables exist and are the intended ones.
    metrics.evaluate(newScores,testscore)
# Doc2Vec baseline: cosine similarity between inferred sentence vectors on the
# SICK test split (tab-separated: pair id, sentence A, sentence B, gold score).
model = gensim.models.Doc2Vec.load("../dataset/sick/doc2vec")
testdoc = open("../dataset/sick/test.txt")  # NOTE(review): handle is never closed
cosines=[]
scores=[]
for l in testdoc.readlines():
    items = l.split("\t")
    sent1 = items[1]
    sent2 = items[2]
    words1 = main.simple_preprocess(sent1)
    words2 = main.simple_preprocess(sent2)
    # Infer vectors for the (unseen) test sentences.
    vec1 = list(model.infer_vector(words1))
    vec2 = list(model.infer_vector(words2))
    # spatial.distance.cosine returns a distance; 1 - d is cosine similarity.
    cosines.append(1- spatial.distance.cosine(vec1, vec2))
    # Gold relatedness score rounded to one decimal place.
    score = float("%.1f" % float(items[3]))
    scores.append(score)
calibrated = metrics.calibration(cosines)
print calibrated
print scores
metrics.evaluate(calibrated, scores)
# Bucket pairs by gold-score band: index 0 -> [1,2), 1 -> [2,3),
# 2 -> [3,4), 3 -> [4,5].
partitionScores = [[],[],[],[]]
calibratedScores = [[],[],[],[]]
for i in range(len(scores)):
    if scores[i] == 5.0:
        # int(5.0) - 1 would index band 4 (out of range); pin 5.0 to the last band.
        partitionScores[3].append(scores[i])
        calibratedScores[3].append(calibrated[i])
    else:
        position = int(scores[i])-1
        partitionScores[position].append(scores[i])
        calibratedScores[position].append(calibrated[i])
print partitionScores[1][1:4]
if __name__=="__main__": idfmap = id.getIdf() testds = open("../dataset/sick/test.txt") tfidfs = [] scores = [] for l in testds.readlines(): items = l.split("\t") id = items[0] sentenceA = items[1] sentenceB = items[2] score = float("%.1f" % float(items[3])) tfidfs.append(tfidf(sentenceA,sentenceB)) scores.append(score) calibrated = metrics.calibration(tfidfs) print calibrated print scores metrics.evaluate(calibrated, scores) partitionScores = [[],[],[],[]] calibratedScores = [[],[],[],[]] for i in range(len(scores)): if scores[i] == 5.0: partitionScores[3].append(scores[i]) calibratedScores[3].append(calibrated[i]) else: position = int(scores[i])-1 partitionScores[position].append(scores[i]) calibratedScores[position].append(calibrated[i]) print partitionScores[1][1:4]