# Evaluation: precision@K for the trained model.
# Model / DataParser come from the repo's own modules; the exact import paths
# are assumed here (the training script below imports Model the same way).
from model3 import Model3 as Model
from dataParser import DataParser  # assumed module name

def ComputePrecisionK(modelfile, testfile, K_list):
    maxParagraphLength = 10
    maxParagraphs = 4
    #nlabels=1001
    #vocabularySize=76391
    labels = 8
    vocabularySize = 244
    model = Model(maxParagraphLength, maxParagraphs, labels, vocabularySize)
    testing = DataParser(maxParagraphLength, maxParagraphs, labels, vocabularySize)

    print(testfile)
    testing.getDataFromfile(testfile)
    print("data loading done")
    print("no of test examples: " + str(testing.totalPages))
    model.load(modelfile)
    print("model loading done")

    testing.restore()
    truePre = []
    pred = []
    # Predict one page at a time; data[0] holds the true label vector.
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    precAtK = {k: 0 for k in K_list}
    for i, v in enumerate(pred):
        # Rank label ids by predicted probability, best first.
        temp = sorted(enumerate(v), key=lambda x: x[1], reverse=True)
        for ele in K_list:
            pBag = 0
            for itr in range(ele):
                if truePre[i][0][temp[itr][0]] == 1:
                    pBag += 1
            precAtK[ele] += float(pBag) / float(ele)

    with open("results/precAtK_model3_n", "w") as f:
        for key in sorted(precAtK.keys()):
            print(precAtK[key] / len(pred))
            f.write(str(key) + "\t" + str(precAtK[key] / len(pred)) + "\n")
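# Hedged cross-check: the same precision@K computed with numpy over the
# collected predictions (truePre/pred shaped as produced above; the function
# name is an illustration, not part of the repo):
import numpy as np

def precision_at_k(truePre, pred, K_list):
    scores = {k: 0.0 for k in K_list}
    for true, probs in zip(truePre, pred):
        ranked = np.argsort(probs)[::-1]  # label ids, best first
        for k in K_list:
            hits = sum(true[0][lab] == 1 for lab in ranked[:k])
            scores[k] += hits / float(k)
    return {k: s / len(pred) for k, s in scores.items()}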
def _initialize(self, interactions):
    self._num_items = interactions.num_items
    self._num_users = interactions.num_users
    self.test_sequence = interactions.test_sequences

    self._net = Model3(self._num_users, self._num_items,
                       self.model_args).to(self._device)

    self._optimizer = optim.Adam(self._net.parameters(),
                                 weight_decay=self._l2,
                                 lr=self._learning_rate)
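# _initialize assumes `Model3` and `torch.optim as optim` are imported in this
# module and that the constructor set the fields below; a minimal sketch of
# that constructor (class name and default values are assumptions):
import torch

class Recommender(object):
    def __init__(self, model_args, l2=1e-6, learning_rate=1e-3, device="cpu"):
        self.model_args = model_args
        self._l2 = l2
        self._learning_rate = learning_rate
        self._device = torch.device(device)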
from model3 import Model3 as Model
from dataParser import DataParser  # assumed module name

maxParagraphLength = 100
maxParagraphs = 1
#nlabels=1001
#vocabularySize=76391
nlabels = 8
vocabularySize = 244

training = DataParser(maxParagraphLength, maxParagraphs, nlabels, vocabularySize)
#training.getDataFromfile("data/wiki_fea_76390_Label_1000_train")
training.getDataFromfile(
    "C:/gitrepo/Wiki-Text-Categorization/Distant Supervision/Reuter_dataset/reuters_sparse_training.txt"
)

model = Model(maxParagraphLength, maxParagraphs, nlabels, vocabularySize)

batchSize = 64
epoch = 0
epochEnd = 105
for e in range(epoch, epochEnd):
    print('Epoch: ' + str(e + 1))
    cost = 0
    for itr in range(int(training.totalPages / batchSize)):
        cost += model.train(training.nextBatch(batchSize))
    print(str(cost / training.totalPages))
    # Checkpoint every 10 epochs once past epoch 60.
    if (e + 1) % 10 == 0 and e > 60:
        print('saving model..')
        model.save("models/model3_reuter_" + str(e + 1))
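# Hedged usage sketch: a checkpoint saved by this loop can be scored with the
# precision@K helper above (the test-file path is a placeholder, not a repo path):
testfile = "Reuter_dataset/reuters_sparse_testing.txt"  # assumed test split
ComputePrecisionK("models/model3_reuter_100", testfile, [1, 3, 5])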
    tf.cast(tf.equal(tf.argmax(predictions1, 1), tf.argmax(model1.Y, 1)),
            tf.float32))

# Make model 2
model2 = Model2(X2, Y2, keep_prob2)
logits2, predictions2 = model2.build()
loss_op2 = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=model2.Y2))
train_op2 = tf.train.AdamOptimizer(
    learning_rate=model2.learning_rate).minimize(loss_op2)
accuracy2 = tf.reduce_mean(
    tf.cast(tf.equal(tf.argmax(predictions2, 1), tf.argmax(model2.Y2, 1)),
            tf.float32))

# Make model 3
model3 = Model3(X3, Y3, keep_prob3)
logits3, predictions3 = model3.build()
loss_op3 = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits3, labels=model3.Y3))
train_op3 = tf.train.AdamOptimizer(
    learning_rate=model3.learning_rate).minimize(loss_op3)
accuracy3 = tf.reduce_mean(
    tf.cast(tf.equal(tf.argmax(predictions3, 1), tf.argmax(model3.Y3, 1)),
            tf.float32))

# Make model 4
model4 = Model4(logitse1, logitse2, Y4)
logits4, predictions4 = model4.build()
loss_op4 = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits4, labels=model4.Y4))
train_op4 = tf.train.AdamOptimizer(
    learning_rate=model4.learning_rate).minimize(loss_op4)  # completion assumed, following the pattern above
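# The blocks above repeat one pattern; a hedged refactoring sketch (TF1-style;
# `model` is assumed to expose build() and learning_rate, and `labels` is the
# matching label placeholder):
import tensorflow as tf

def build_ops(model, labels):
    logits, predictions = model.build()
    loss_op = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    train_op = tf.train.AdamOptimizer(
        learning_rate=model.learning_rate).minimize(loss_op)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(predictions, 1), tf.argmax(labels, 1)),
                tf.float32))
    return logits, predictions, loss_op, train_op, accuracy

# e.g.: logits2, predictions2, loss_op2, train_op2, accuracy2 = build_ops(model2, model2.Y2)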
import sys
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix

def ComputeFscore(modelfile, testfile, outputfile):
    maxParagraphLength = int(sys.argv[1])
    maxParagraphs = int(sys.argv[2])
    filterSizes = [int(i) for i in sys.argv[3].split("-")]
    num_filters = int(sys.argv[4])
    wordEmbeddingDimension = int(sys.argv[5])
    # batchSize = int(sys.argv[6])
    # epochs = int(sys.argv[7])
    # folder_name = sys.argv[8]
    # output = sys.argv[9]
    lrate = float(sys.argv[10])  # cast added: the model expects a numeric learning rate
    poolLength = int(sys.argv[11])

    labels = 8
    vocabularySize = 244
    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, poolLength,
                  wordEmbeddingDimension, lrate)
    testing = DataParser(maxParagraphs, maxParagraphLength, labels, vocabularySize)
    testing.getDataFromfile(testfile)
    model.load(modelfile)
    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    # Using the first 50% of the data for threshold tuning - we have merged
    # the test and cv files.
    valid = int(len(truePre) * 0.5)
    ConfusionMa = {}
    fScr = {}
    thresLab = {}
    for la in range(labels):
        if la % 25 == 0:
            print("Current label", la)
        # Tune the per-label threshold on the first half ...
        t = [truePre[i][0][la] for i in range(valid)]
        p = [pred[i][la] for i in range(valid)]
        bestF, bestThre = thresholdTuning(t, p)
        # ... and score on the second half.
        t = [truePre[i][0][la] for i in range(valid, len(truePre))]
        p = np.array([pred[i][la] for i in range(valid, len(truePre))])
        fScr[la] = f1_score(t, p >= bestThre)
        ConfusionMa[la] = confusion_matrix(t, p > bestThre)
        thresLab[la] = bestThre

    f = open(outputfile, "a")
    output = sys.argv[9]
    sum_fscore = 0.0
    for i in range(labels):
        sum_fscore = sum_fscore + fScr[i]
        output = output + "," + str(fScr[i])
    # Note: the original averages over labels - 1, not labels.
    output += "," + str(sum_fscore / float(labels - 1))
    print("Fscore at " + sys.argv[7] + " epochs: " +
          str(sum_fscore / float(labels - 1)))
    f.write(output + "\n")
    f.close()
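# thresholdTuning is called above but not shown; a minimal sketch, assuming it
# sweeps candidate thresholds on the tuning split and returns the best F1 and
# the threshold that achieved it:
import numpy as np
from sklearn.metrics import f1_score

def thresholdTuning(t, p):
    t, p = np.asarray(t), np.asarray(p)
    best_f, best_thre = 0.0, 0.5
    for thre in np.unique(p):  # every observed score is a candidate cut-off
        f = f1_score(t, p >= thre)
        if f > best_f:
            best_f, best_thre = f, thre
    return best_f, best_thre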
def ComputeFscore(modelfile, testfile, outputfile):
    maxParagraphLength = 20
    maxParagraphs = 10
    #nlabels=1001
    #vocabularySize=76391
    labels = 8
    vocabularySize = 244
    model = Model(maxParagraphLength, maxParagraphs, labels, vocabularySize)
    testing = DataParser(maxParagraphLength, maxParagraphs, labels, vocabularySize)
    testing.getDataFromfile(testfile)
    model.load(modelfile)
    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    # Using the first 50% of the data for threshold tuning - we have merged
    # the test and cv files.
    valid = int(len(truePre) * 0.5)
    ConfusionMa = {}
    fScr = {}
    thresLab = {}
    for la in range(labels):
        if la % 25 == 0:
            print("Current label", la)
        t = [truePre[i][0][la] for i in range(valid)]
        p = [pred[i][la] for i in range(valid)]
        bestF, bestThre = thresholdTuning(t, p)

        t = [truePre[i][0][la] for i in range(valid, len(truePre))]
        p = np.array([pred[i][la] for i in range(valid, len(truePre))])
        fScr[la] = f1_score(t, p >= bestThre)
        ConfusionMa[la] = confusion_matrix(t, p > bestThre)
        thresLab[la] = bestThre

    with open(outputfile, "w") as f:
        sum_fscore = 0.0
        for i in range(labels):
            sum_fscore = sum_fscore + fScr[i]
            f.write(str(i) + "," + str(thresLab[i]) + "," + str(fScr[i]) + "\n")
        # Note: as in the script above, the average divides by labels - 1.
        f.write(str(sum_fscore / float(labels - 1)))
    print(sum_fscore)
    print(sum_fscore / float(labels - 1))
    return sum_fscore / float(labels - 1)
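# Hedged usage sketch: since ComputeFscore returns the averaged F-score, it
# can rank the checkpoints written by the training loop above (paths and the
# epoch list mirror that loop's save schedule; the test path is a placeholder):
testfile = "Reuter_dataset/reuters_sparse_testing.txt"  # assumed test split
scores = {e: ComputeFscore("models/model3_reuter_" + str(e), testfile,
                           "results/fscore_" + str(e) + ".txt")
          for e in [70, 80, 90, 100]}
print(max(scores, key=scores.get))  # epoch with the best averaged F-score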
# The first argv parameters are missing from this fragment; following the
# Fscore script's argv layout, they are assumed to be:
maxParagraphs = int(sys.argv[1])  # assumed
paragraphLength = int(sys.argv[2])  # assumed
filterSizes = [int(i) for i in sys.argv[3].split("-")]  # assumed
num_filters = int(sys.argv[4])
wordEmbeddingDimension = int(sys.argv[5])
batchSize = int(sys.argv[6])
epochEnd = int(sys.argv[7])
folder_name = sys.argv[8]
lrate = float(sys.argv[9])
poolLength = int(sys.argv[10])

nlabels = 10
vocabularySize = 101940
training = DataParser(maxParagraphs, paragraphLength, nlabels, vocabularySize)
training.getDataFromfile(
    "../dataset/preprocessed_data/toplabels_split/wiki10-top10labels_train.txt"
)

model = Model(maxParagraphs, paragraphLength, nlabels, vocabularySize,
              filterSizes, num_filters, poolLength,
              wordEmbeddingDimension, lrate)

costfile = open("results/costfile.txt", "a")
output = folder_name

epoch = 0
# epochEnd=400
costepochs = []
for e in range(epoch, epochEnd):
    cost = 0
    for itr in range(int(training.totalPages / batchSize)):
        cost += model.train(training.nextBatch(batchSize))
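    # Hedged continuation of the loop above, mirroring the other training
    # scripts in this repo (the save path and schedule are assumptions):
    cost = cost / training.totalPages
    costepochs.append(cost)
    print("Epoch " + str(e + 1) + " cost: " + str(cost))
    if (e + 1) % 10 == 0:
        model.save(folder_name + "/model_epoch_" + str(e + 1))

costfile.write(output + "," + ",".join(str(c) for c in costepochs) + "\n")
costfile.close()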