Esempio n. 1
0
def loadDataset(filename, trainingSet=None, testSet=None):
    """Build labelled training rows and placeholder-labelled test rows.

    ``filename`` is read as CSV; column 0 of each row is used as a movie id
    and column 1 as its label. Every CSV movie yields one training row.
    Every movie returned by ``di.getAllMovies()`` whose id does not appear
    in the CSV yields one test row whose label is drawn at random from
    ``'0'``/``'1'``.

    A row has ``len(di.getAllTags()) + 1`` slots: the values returned by
    ``idf.tfIdfMovieTag`` for the movie (padded with ``''`` if fewer than
    the tag count) followed by the label in the last slot.

    Args:
        filename: Path of the CSV file listing the labelled movies.
        trainingSet: Optional list the training rows are appended to.
            A new list is created when omitted.
        testSet: Optional list the test rows are appended to. A new list
            is created when omitted.

    Returns:
        A tuple ``(test, trainingSet, testSet)``; ``test`` holds the ids of
        the test movies in the same order as the rows added to ``testSet``.
    """
    # Defaults are created per call: a literal ``[]`` default is shared by
    # every call and would keep accumulating rows from earlier invocations.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    with open(filename, 'r', newline='') as csvfile:
        # csv.reader yields [] for blank lines; they carry no movie id.
        dataset = [record for record in csv.reader(csvfile) if record]

    movies = di.getAllMovies()
    allTagLen = len(di.getAllTags())
    idfMovArr = idf.idfMovieTag()

    def build_row(movie_id):
        # One slot per tag plus a trailing slot reserved for the label.
        row = [''] * (allTagLen + 1)
        for j, weight in enumerate(idf.tfIdfMovieTag(movie_id, idfMovArr)):
            row[j] = weight
        return row

    # Rows are built on demand, so the number of CSV rows no longer has to
    # fit inside a buffer sized by the number of known movies.
    for record in dataset:
        row = build_row(record[0])
        row[allTagLen] = record[1]
        trainingSet.append(row)

    # A set gives O(1) membership tests in the loop over all movies.
    train_ids = {int(record[0]) for record in dataset}

    test = []
    labels = ['0', '1']
    for movie in movies:
        movie_id = movie[0]
        if int(movie_id) in train_ids:
            continue
        test.append(movie_id)
        row = build_row(movie_id)
        # The true label is unknown here, so a random placeholder is stored.
        row[allTagLen] = random.choice(labels)
        testSet.append(row)
    return test, trainingSet, testSet
def formSvdMat(numSemantics):
    """Return the leading ``numSemantics`` columns of U for the movie matrix.

    A ``(movieLen, allTagLen)`` matrix is filled row by row with the vector
    that ``idf.tfIdfMovieTag`` returns for each movie, decomposed with a
    reduced SVD, and the first ``numSemantics`` columns of the left factor
    are copied into a new ``(movieLen, numSemantics)`` array.

    Prints a message and terminates the process via ``sys.exit()`` when
    ``numSemantics`` exceeds either dimension of the matrix.
    """
    tag_matrix = np.zeros((movieLen, allTagLen))
    fits = len(tag_matrix) >= numSemantics and len(tag_matrix[0]) >= numSemantics
    if not fits:
        print("cant report top semantics")
        sys.exit()

    movie_idf = idf.idfMovieTag()
    for row_idx in range(movieLen):
        movie_id = movies[row_idx][0]
        tag_matrix[row_idx] = idf.tfIdfMovieTag(movie_id, movie_idf)

    # Only the left singular vectors are needed.
    left_vectors = np.linalg.svd(tag_matrix, full_matrices=False)[0]

    movieFacts = np.zeros((movieLen, numSemantics))
    movieFacts[:, :] = left_vectors[:, :numSemantics]
    return movieFacts
def loadDataset(filename, trainingSet=None, testSet=None):
    """Build feature rows and separate label lists for train and test movies.

    ``filename`` is read as CSV; column 0 of each row is used as a movie id
    and column 1 as its label. Every CSV movie yields one training row and
    one entry in ``labels``. Every movie returned by ``di.getAllMovies()``
    whose id does not appear in the CSV yields one test row and a random
    ``'0'``/``'1'`` placeholder in ``target``.

    A row has ``len(di.getAllTags())`` slots filled with the values returned
    by ``idf.tfIdfMovieTag`` for the movie (padded with ``''`` if fewer).

    Args:
        filename: Path of the CSV file listing the labelled movies.
        trainingSet: Optional list the training rows are appended to.
            A new list is created when omitted.
        testSet: Optional list the test rows are appended to. A new list
            is created when omitted.

    Returns:
        A tuple ``(trainingSet, testSet, labels, target, test)``.
        ``labels`` holds one label per CSV row. ``target`` has one slot per
        known movie; only the leading ``len(test)`` slots hold placeholder
        labels, the rest stay ``''``. ``test`` holds the ids of the test
        movies in the same order as the rows added to ``testSet``.
    """
    # Defaults are created per call: a literal ``[]`` default is shared by
    # every call and would keep accumulating rows from earlier invocations.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    with open(filename, 'r', newline='') as csvfile:
        # csv.reader yields [] for blank lines; they carry no movie id.
        dataset = [record for record in csv.reader(csvfile) if record]

    movies = di.getAllMovies()
    allTagLen = len(di.getAllTags())
    idfMovArr = idf.idfMovieTag()

    def build_row(movie_id):
        row = [''] * allTagLen
        for j, weight in enumerate(idf.tfIdfMovieTag(movie_id, idfMovArr)):
            row[j] = weight
        return row

    labels = []
    for record in dataset:
        row = build_row(record[0])
        labels.append(record[1])
        trainingSet.append(row)

    # A set gives O(1) membership tests in the loop over all movies.
    train_ids = {int(record[0]) for record in dataset}

    # Kept at one slot per known movie so the returned shape is unchanged
    # for existing callers; only the first len(test) slots get filled.
    target = [''] * len(movies)
    test = []
    label = ['0', '1']
    for movie in movies:
        movie_id = movie[0]
        if int(movie_id) in train_ids:
            continue
        slot = len(test)
        test.append(movie_id)
        row = build_row(movie_id)
        # The true label is unknown here, so a random placeholder is stored.
        target[slot] = random.choice(label)
        testSet.append(row)
    return trainingSet, testSet, labels, target, test
def formPPRMatrix():
    """Return a ``(movieLen, allTagLen)`` array of per-movie tag vectors.

    Row ``i`` is the vector ``idf.tfIdfMovieTag`` returns for the id stored
    in ``movies[i][0]``; rows are filled for every entry of ``movies``.
    """
    movie_idf = idf.idfMovieTag()
    tag_matrix = np.zeros((movieLen, allTagLen))
    for row_idx, movie in enumerate(movies):
        tag_matrix[row_idx] = idf.tfIdfMovieTag(movie[0], movie_idf)
    return tag_matrix
# Script entry point: builds a movie-by-tag matrix for the movies loaded from
# 'foo.csv', reduces it with svd.svdUout, then reads search parameters from
# standard input.
if __name__ == "__main__":
	trainingSet = []
	# NOTE(review): the loadDataset definitions visible in this file return a
	# tuple of several lists, while the code below treats `movies` as a flat
	# list of movie ids (`in`, `.index`, `movies[i]`) -- confirm which
	# loadDataset this script is meant to run against.
	movies = loadDataset('foo.csv')
	# Number of components requested from svd.svdUout below.
	numsemantics = 500
	allMovies = di.getAllMovies()
	tagIds = di.getAllTags()
	allTagLen = len(tagIds)
	movNames = di.getAllMovieNames()
	# NOTE(review): an array built from empty strings gets a 1-character
	# string dtype, so names assigned below are truncated to their first
	# character -- confirm whether a wider dtype / object array is intended.
	movieNames = np.array(['' for i in range(len(movies))])
	# Copy each known movie's name into the slot matching its position in
	# `movies`; assumes movNames is index-aligned with allMovies -- TODO confirm.
	for i in range(len(allMovies)):
		mov = allMovies[i]
		if(mov[0] in movies):
			idx = movies.index(mov[0])
			movieNames[idx] = movNames[i][0]
	# One row per loaded movie, one column per tag.
	mat = np.zeros((len(movies),allTagLen))
	idfMovArr = idf.idfMovieTag()
	for i in range(len(movies)):
		mat[i] = idf.tfIdfMovieTag(movies[i], idfMovArr)
	a = svd.svdUout(mat,numsemantics)
	# d and xmax are not used in the lines visible here.
	d = 5
	xmax = 20
	points = np.array(a)
	# Interactive parameters; the prompts suggest a layered-hash
	# nearest-neighbour search (presumably LSH -- TODO confirm).
	l = int(input("Enter number of layers:"))
	k = int(input("Enter number of hashes per layer:"))
	r = int(input("Enter number of neighbours:"))
	# Kept as the raw input string (not converted to int).
	q = input("Enter query movieid:")
	# Single-element lists wrapping the chosen k and l.
	ks = []
	ls = []
	ks.append(k)
	ls.append(l)
	query=[]