예제 #1
0
def predict(total_number):
    """Print the top-10 hit rate over the first ``total_number`` samples.

    For each index ``n`` below ``total_number`` the entry
    ``APIREC.target_context[n]`` is read as ``(target, change_context,
    code_context)``.  Every candidate in ``APIREC.P_list`` is scored by
    copying the target, replacing its ``'label'`` with ``candidate[2]`` and
    passing the copy to ``score``.  The sample counts as a hit when
    ``Mining.atmoic_change(target)`` equals one of the ten highest-ranked
    candidates.

    Args:
        total_number: Number of leading entries of ``APIREC.target_context``
            to evaluate.

    Returns:
        None.  The accuracy (hits divided by ``total_number``) is printed.
        When ``total_number`` is 0 the accuracy is reported as 0.0 instead
        of raising ``ZeroDivisionError``.
    """
    hit = 0

    for n in range(total_number):
        sample = APIREC.target_context[n]
        target, change_context, code_context = sample[0], sample[1], sample[2]

        # Score each candidate against a copy of the target that carries the
        # candidate's label.
        scored = []
        for candidate in APIREC.P_list:
            candidate_target = dict(target)
            candidate_target['label'] = candidate[2]
            _score = score(candidate_target, change_context, code_context)
            scored.append((_score, candidate))

        # nlargest ranks the (score, candidate) tuples directly, so the list
        # does not have to be a heap first.  Equal scores are ordered by
        # comparing the candidates themselves.
        top10 = heapq.nlargest(10, scored)
        prediction = Mining.atmoic_change(target)

        if any(candidate == prediction for _, candidate in top10):
            hit += 1

    # Guard the empty evaluation: nothing evaluated means nothing was hit.
    accuracy = 1.0 * hit / total_number if total_number else 0.0
    print("Final prediction accuracy: " + str(accuracy))
예제 #2
0
def _predict(n):
    """Report, for five cutoffs, whether sample ``n`` is predicted correctly.

    The sample ``APIREC.target_context[n]`` is read as ``(target,
    change_context, code_context)``.  Each candidate in ``APIREC.P_list`` is
    scored as ``w * change_score + (1 - w) * code_score`` with
    ``w = weights[0]``, where both partial scores come from the ``APIREC``
    transaction scorers applied to a copy of the target relabelled with
    ``candidate[2]``.

    Reads the module-level globals ``top`` (cutoff sizes, indexed 0..4) and
    ``weights``.

    Args:
        n: Index into ``APIREC.target_context``.

    Returns:
        A list of five ints.  Entry ``i`` is 1 when
        ``Mining.atmoic_change(target)`` equals one of the ``top[i]``
        highest-scoring candidates, otherwise 0.
    """
    global top
    global weights

    sample = APIREC.target_context[n]
    target, change_context, code_context = sample[0], sample[1], sample[2]

    ranked = []
    for cand in APIREC.P_list:
        relabelled = dict(target)
        relabelled['label'] = cand[2]
        s_change = APIREC.change_score_transaction(relabelled, change_context)
        s_code = APIREC.code_score_transaction(relabelled, code_context)

        alpha = weights[0]
        combined = alpha * s_change + (1 - alpha) * s_code
        heapq.heappush(ranked, (combined, cand))

    hits = []
    for k in range(5):
        best = heapq.nlargest(top[k], ranked)
        expected = Mining.atmoic_change(target)
        # 1 if any of the best-ranked candidates matches, else 0.
        hits.append(int(any(entry[1] == expected for entry in best)))
    return hits
예제 #3
0
def _predict(n, training=(0.5,)):
    """Report, per weight, whether sample ``n`` is predicted within the top 10.

    The sample ``APIREC.target_context[n]`` is read as ``(target,
    change_context, code_context)``.  For each candidate in
    ``APIREC.P_list`` the change and code scores are computed once from a
    copy of the target relabelled with ``candidate[2]``, then combined for
    every weight ``w`` in ``training`` as
    ``w * change_score + (1 - w) * code_score``.

    Args:
        n: Index into ``APIREC.target_context``.
        training: Iterable of change-score weights to evaluate.  Defaults to
            the single weight 0.5.  The default is an immutable tuple so it
            cannot be altered between calls.

    Returns:
        A list with one int per weight, in the order given.  An entry is 1
        when ``Mining.atmoic_change(target)`` equals one of the ten
        highest-scoring candidates under that weight, otherwise 0.
    """
    # Materialise once so any iterable (including a generator) is accepted.
    weights = list(training)

    sample = APIREC.target_context[n]
    target, change_context, code_context = sample[0], sample[1], sample[2]

    # One list of (score, candidate) tuples per weight.
    scored = [[] for _ in weights]

    for candidate in APIREC.P_list:
        candidate_target = dict(target)
        candidate_target['label'] = candidate[2]
        change_score = APIREC.change_score_transaction(candidate_target,
                                                       change_context)
        code_score = APIREC.code_score_transaction(candidate_target,
                                                   code_context)
        for bucket, w_change in zip(scored, weights):
            w_code = 1 - w_change
            _score = w_change * change_score + w_code * code_score
            bucket.append((_score, candidate))

    # The expected change depends only on the target, so compute it once
    # rather than once per weight.
    prediction = Mining.atmoic_change(target)

    hit = []
    for bucket in scored:
        # nlargest ranks the tuples directly; equal scores are ordered by
        # comparing the candidates themselves.
        top10 = heapq.nlargest(10, bucket)
        hit.append(int(any(candidate == prediction
                           for _, candidate in top10)))
    return hit
예제 #4
0
def test_doctorAI(
	modelFile='model.txt',
	seqFile='seq.txt',
	inputDimSize=20000,
	labelFile='label.txt',
	numClass=500,
	timeFile='',
	predictTime=False,
	useLogTime=True,
	hiddenDimSize=[200,200],
	batchSize=100,
	logEps=1e-8,
	mean_duration=20.0,
	verbose=False
):
	options = locals().copy()

	if len(timeFile) > 0: useTime = True
	else: useTime = False
	options['useTime'] = useTime

	models = np.load(modelFile)
	tparams = init_tparams(models)

	print 'build model ... ',
	if predictTime:
		x, t, mask, codePred, timePred = build_model(tparams, options)
		predict_code = theano.function(inputs=[x,t,mask], outputs=codePred, name='predict_code')
		predict_time = theano.function(inputs=[x,t,mask], outputs=timePred, name='predict_time')
	elif useTime:
		x, t, mask, codePred = build_model(tparams, options)
		predict_code = theano.function(inputs=[x,t,mask], outputs=codePred, name='predict_code')
	else:
		x, mask, codePred = build_model(tparams, options)
		predict_code = theano.function(inputs=[x,mask], outputs=codePred, name='predict_code')

	options['inputDimSize']=models['W_emb'].shape[0]
	options['numClass']=models['b_output'].shape[0]
	print 'load data ... ', 
	testSet = load_data(seqFile, labelFile, timeFile)
	n_batches = int(np.ceil(float(len(testSet[0])) / float(batchSize)))
	print 'done'

	predVec = []
	trueVec = []
	predTimeVec = []
	trueTimeVec = []
	iteration = 0
	for batchIndex in range(n_batches):
		tempX = testSet[0][batchIndex*batchSize: (batchIndex+1)*batchSize]
		tempY = testSet[1][batchIndex*batchSize: (batchIndex+1)*batchSize]
		if predictTime:
			tempT = testSet[2][batchIndex*batchSize: (batchIndex+1)*batchSize]
			x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
			codeResults = predict_code(x, t, mask)
			timeResults = predict_time(x, t, mask)
		elif useTime:
			tempT = testSet[2][batchIndex*batchSize: (batchIndex+1)*batchSize]
			x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
			codeResults = predict_code(x, t, mask)
		else:
			x, mask, lengths = padMatrixWithoutTime(tempX, options)
			codeResults = predict_code(x, mask)

		for i in range(codeResults.shape[1]):
			tensorMatrix = codeResults[:,i,:]
			thisY = tempY[i][1:]
			for timeIndex in range(lengths[i]):
				if len(thisY[timeIndex]) == 0: continue
				trueVec.append(thisY[timeIndex])
				output = tensorMatrix[timeIndex]
				predVec.append(zip(*heapq.nlargest(30, enumerate(output), key=operator.itemgetter(1)))[0])

		if predictTime:
			for i in range(timeResults.shape[1]):
				timeVec = timeResults[:,i]
				trueTimeVec.extend(tempT[i][1:])
				for timeIndex in range(lengths[i]):
					predTimeVec.append(timeVec[timeIndex])

		if (iteration % 10 == 0) and verbose: print 'iteration:%d/%d' % (iteration, n_batches)
		iteration += 1
		if iteration == 10: break
			
	recall = recallTop(trueVec, predVec)
	print 'recall@10:%f, recall@20:%f, recall@30:%f' % (recall[0], recall[1], recall[2])

	if predictTime: 
		r_squared = calculate_r_squared(trueTimeVec, predTimeVec, options)
		print 'R2:%f' % r_squared
예제 #5
0
def test_doctorAI(modelFile='model.txt',
                  seqFile='seq.txt',
                  inputDimSize=20000,
                  labelFile='label.txt',
                  numClass=500,
                  timeFile='',
                  predictTime=False,
                  useLogTime=True,
                  hiddenDimSize=[200, 200],
                  batchSize=100,
                  logEps=1e-8,
                  mean_duration=20.0,
                  verbose=False,
                  embFile='embFile.txt',
                  topicSize=20,
                  topicFile='',
                  experiment='1'):
    options = locals().copy()

    if len(timeFile) > 0: useTime = True
    else: useTime = False
    options['useTime'] = useTime

    if len(topicFile) > 0: useTopics = True
    else: useTopics = False
    options['useTopics'] = useTopics

    models = np.load(modelFile)
    tparams = init_tparams(models)

    # print tparams.keys()

    def load_embedding(infile):
        Wemb = np.array(pickle.load(open(infile, 'rb'))).astype(config.floatX)
        return Wemb

    if len(embFile) > 0:
        W_emb = load_embedding(embFile)
    else:
        W_emb = None

    print 'build model ... ',
    if useTime and useTopics:
        x, t, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, t, topics, mask],
                                       outputs=codePred,
                                       name='predict_code')
    elif useTime:
        x, t, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, t, mask],
                                       outputs=codePred,
                                       name='predict_code')
    elif useTopics:
        x, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, topics, mask],
                                       outputs=codePred,
                                       name='predict_code')
    else:
        x, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, mask],
                                       outputs=codePred,
                                       name='predict_code')

    if 'W_emb' in models.keys():
        options['inputDimSize'] = models['W_emb'].shape[0]
    elif W_emb is not None:
        options['inputDimSize'] = W_emb.shape[0]

    options['numClass'] = models['b_output'].shape[0]
    print 'load data ... ',
    testSet = load_data(seqFile, labelFile, timeFile, topicFile)
    n_batches = int(np.ceil(float(len(testSet[0])) / float(batchSize)))
    print 'done'

    predVec = []
    trueVec = []
    predTimeVec = []
    trueTimeVec = []
    iteration = 0
    for batchIndex in range(n_batches):
        tempX = testSet[0][batchIndex * batchSize:(batchIndex + 1) * batchSize]
        tempY = testSet[1][batchIndex * batchSize:(batchIndex + 1) * batchSize]
        if useTime and useTopics:
            tempT = testSet[2][batchIndex * batchSize:(batchIndex + 1) *
                               batchSize]
            tempTopics = testSet[3][batchIndex * batchSize:(batchIndex + 1) *
                                    batchSize]
            x, t, topics, mask, lengths = padMatrixWithTime(
                tempX, tempT, options, tempTopics)
            codeResults = predict_code(x, t, topics, mask)
        elif useTime:
            tempT = testSet[2][batchIndex * batchSize:(batchIndex + 1) *
                               batchSize]
            x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
            codeResults = predict_code(x, t, mask)
        elif useTopics:
            tempTopics = testSet[3][batchIndex * batchSize:(batchIndex + 1) *
                                    batchSize]
            x, topics, mask, lengths = padMatrixWithoutTime(
                tempX, options, tempTopics)  # todo
            codeResults = predict_code(x, topics, mask)
        else:
            x, mask, lengths = padMatrixWithoutTime(tempX, options)
            codeResults = predict_code(x, mask)

        for i in range(codeResults.shape[1]):
            tensorMatrix = codeResults[:, i, :]
            thisY = tempY[i][1:]
            for timeIndex in range(lengths[i]):
                if len(thisY[timeIndex]) == 0: continue
                trueVec.append(thisY[timeIndex])
                output = tensorMatrix[timeIndex]
                predVec.append(
                    zip(*heapq.nlargest(
                        30, enumerate(output), key=operator.itemgetter(1)))[0])

        if (iteration % 10 == 0) and verbose:
            print 'iteration:%d/%d' % (iteration, n_batches)
        iteration += 1
        if iteration == 10: break

    # pickle.dump(trueVec,open('experiments/trueVec_exp' + str(experiment),'wr'))
    # pickle.dump(predVec,open('experiments/predVec_exp' + str(experiment),'wr'))
    recall = recallTop(trueVec, predVec)
    # for cross validation
    # pickle.dump(recall,open('experiments-cv/recall' + str(experiment),'wr'))

    # for experiments
    pickle.dump(recall, open('experiments/recall' + str(experiment), 'wr'))

    print 'recall@10:%f, recall@20:%f, recall@30:%f' % (recall[0], recall[1],
                                                        recall[2])