def loadDataset(filename, trainingSet=None, testSet=None):
    """Build labelled TF-IDF rows for the CSV movies and for all other movies.

    Each non-empty row of the CSV file ``filename`` is read as
    ``[movie_id, label, ...]``.  For every such row a feature row with
    ``allTagLen + 1`` slots is appended to ``trainingSet``: the leading slots
    receive the values returned by ``idf.tfIdfMovieTag`` and the final slot
    receives the CSV label.  Every movie from ``di.getAllMovies()`` whose id
    (compared as ``int``) does not occur in the CSV gets an analogous row
    appended to ``testSet``; its final slot holds a placeholder label drawn
    at random from ``'0'`` / ``'1'``.

    Args:
        filename: Path of the CSV file listing the labelled movies.
        trainingSet: Optional list that receives the training rows; a new
            list is created when omitted.
        testSet: Optional list that receives the test rows; a new list is
            created when omitted.

    Returns:
        Tuple ``(test, trainingSet, testSet)`` where ``test`` holds the ids of
        the movies placed in ``testSet``, in the same order.
    """
    # Fresh lists per call: the former ``[]`` defaults were created once and
    # shared, so rows from earlier calls leaked into later results.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is what the csv module asks for; blank lines are dropped
    # because they carry neither a movie id nor a label.
    with open(filename, 'r', newline='') as csvfile:
        dataset = [record for record in csv.reader(csvfile) if record]

    movies = di.getAllMovies()
    allTagLen = len(di.getAllTags())
    idfMovArr = idf.idfMovieTag()

    def feature_row(movie_id):
        # Rows are built on demand instead of being carved out of a buffer
        # sized from len(movies), which overflowed whenever the CSV and the
        # movie table disagreed.  Unfilled slots keep the '' filler.
        row = [''] * (allTagLen + 1)
        vector = idf.tfIdfMovieTag(movie_id, idfMovArr)
        for pos in range(len(vector)):
            row[pos] = vector[pos]
        return row

    for record in dataset:
        row = feature_row(record[0])
        row[allTagLen] = record[1]
        trainingSet.append(row)

    # Set membership keeps the "already used for training?" check O(1).
    train = {int(record[0]) for record in dataset}

    test = []
    labels = ['0', '1']
    for mov in movies:
        if int(mov[0]) in train:
            continue
        test.append(mov[0])
        row = feature_row(mov[0])
        # The true class is unknown here; a random placeholder is stored.
        row[allTagLen] = random.choice(labels)
        testSet.append(row)
    return test, trainingSet, testSet
def vectActMovTag():
    """Build the actor x movie x tag tensor as nested dictionaries.

    ``vect[actor_id][movie_id][tag_id]`` holds the movie's year (second
    column of ``di.getAllMovieYrs()``) when the actor appears in the movie
    and the tag is among the movie's tag ids, and ``0`` otherwise.

    Returns:
        Tuple ``(vect, actors, movies, years)`` where the last three items
        are the rows returned by ``di.getAllActors()``, ``di.getAllMovies()``
        and ``di.getAllYears()``.
    """
    actors = di.getAllActors()
    tags = di.getAllTags()
    movies = di.getAllMovies()
    years = di.getAllYears()
    movYears = {row[0]: row[1] for row in di.getAllMovieYrs()}

    # Movie ids per actor, fetched once and stored as sets for O(1) lookups.
    actMoviesDb = {}
    for act in actors:
        actMoviesDb[act[0]] = {row[0] for row in di.getActorMovieIds(act[0])}

    vect = defaultdict(lambda: defaultdict(dict))
    for mov in movies:
        movId = mov[0]
        movTags = set(di.getMovieTagIds(movId)[0][0].split(","))
        for act in actors:
            actId = act[0]
            actedIn = movId in actMoviesDb[actId]
            for tag in tags:
                tagId = tag[0]
                # Both branches index as [actor][movie][tag].  The zero fill
                # used to be written as [movie][actor][tag], which left the
                # tensor with two incompatible key orders.
                if actedIn and tagId in movTags:
                    vect[actId][movId][tagId] = movYears[movId]
                else:
                    vect[actId][movId][tagId] = 0
    return (vect, actors, movies, years)
def getActorTagMatrix():
    """Return the actor-by-tag weight matrix as a NumPy array.

    Row ``r`` belongs to the ``r``-th entry of ``di.getAllActors()`` and
    column ``c`` to the ``c``-th entry of ``di.getAllTags()``.  A cell holds
    the value that ``idf.tfIdfActorTag`` reports for that tag id, or ``0``
    when the tag id is absent from the actor's result.
    """
    all_tags = di.getAllTags()
    n_tags = len(all_tags)
    di.getAllActorNames()  # result is not used; the call itself is kept
    all_actors = di.getAllActors()
    matrix = np.zeros((len(all_actors), n_tags))
    idf_weights = idf.idfActorTag()

    for row, actor in enumerate(all_actors):
        actor_weights = idf.tfIdfActorTag(actor[0], idf_weights)
        for col in range(n_tags):
            tag_id = all_tags[col][0]
            if tag_id in actor_weights.keys():
                matrix[row][col] = actor_weights[tag_id]
    return matrix
def idfActorTag():
    """Compute an inverse-document-frequency weight per tag over all actors.

    For each tag id the number of actors whose first ``di.getActorTags`` row
    contains that id is counted; the weight is
    ``log(actor_count / matching_actors)``, or ``0`` when no actor matches.

    Returns:
        Dict mapping tag id to its weight.
    """
    allTags = di.getAllTags()
    allActors = di.getAllActors()
    actorCount = len(allActors)
    if len(allTags) == 0:
        return {}

    # Fetch each actor's tags once.  The lookup does not depend on the tag
    # being scored, yet it used to be repeated for every (tag, actor) pair.
    # NOTE(review): only the first returned row is inspected, as before;
    # confirm that this row is a collection of tag ids and not a joined string.
    actorTagRows = [di.getActorTags(actor[0])[0] for actor in allActors]

    idfActVect = {}
    for tag in allTags:
        tagId = tag[0]
        tagCount = sum(1 for row in actorTagRows if tagId in row)
        idfActVect[tagId] = math.log(actorCount / tagCount) if tagCount != 0 else 0
    return idfActVect
def idfUserTag():
    """Compute an inverse-document-frequency weight per tag over all users.

    For each tag id the number of users whose first ``di.getUserTags`` row
    contains that id is counted; the weight is
    ``log(user_count / matching_users)``, or ``0`` when no user matches.

    Returns:
        Dict mapping tag id to its weight.
    """
    allTags = di.getAllTags()
    allUsers = di.getAllUsers()
    userCount = len(allUsers)
    if len(allTags) == 0:
        return {}

    # Fetch each user's tags once.  The lookup does not depend on the tag
    # being scored, yet it used to be repeated for every (tag, user) pair.
    # NOTE(review): only the first returned row is inspected, as before;
    # confirm that this row is a collection of tag ids and not a joined string.
    userTagRows = [di.getUserTags(user[0])[0] for user in allUsers]

    idfUserVect = {}
    for tag in allTags:
        tagId = tag[0]
        tagCount = sum(1 for row in userTagRows if tagId in row)
        idfUserVect[tagId] = math.log(userCount / tagCount) if tagCount != 0 else 0
    return idfUserVect
def idfGenreTag():
    """Compute an inverse-document-frequency weight per tag over all genres.

    For each tag id the number of genres whose first ``di.getGenreTags`` row
    contains that id is counted; the weight is
    ``log(genre_count / matching_genres)``, or ``0`` when no genre matches.

    Returns:
        Dict mapping tag id to its weight.
    """
    allTags = di.getAllTags()
    allGenres = di.getAllGenres()
    genreCount = len(allGenres)
    if len(allTags) == 0:
        return {}

    # Fetch each genre's tags once.  The lookup does not depend on the tag
    # being scored, yet it used to be repeated for every (tag, genre) pair.
    # NOTE(review): only the first returned row is inspected, as before;
    # confirm that this row is a collection of tag ids and not a joined string.
    genreTagRows = [di.getGenreTags(genre[0])[0] for genre in allGenres]

    idfGenVect = {}
    for tag in allTags:
        tagId = tag[0]
        tagCount = sum(1 for row in genreTagRows if tagId in row)
        idfGenVect[tagId] = math.log(genreCount / tagCount) if tagCount != 0 else 0
    return idfGenVect
def loadDataset(filename, trainingSet=None, testSet=None):
    """Build TF-IDF feature rows plus separate label lists for a classifier.

    Each non-empty row of the CSV file ``filename`` is read as
    ``[movie_id, label, ...]``.  For every such row a feature row with
    ``allTagLen`` slots, filled from ``idf.tfIdfMovieTag``, is appended to
    ``trainingSet`` and the CSV label is collected in ``labels``.  Every
    movie from ``di.getAllMovies()`` whose id (compared as ``int``) does not
    occur in the CSV gets a feature row appended to ``testSet`` and a
    placeholder label drawn at random from ``'0'`` / ``'1'``.

    Args:
        filename: Path of the CSV file listing the labelled movies.
        trainingSet: Optional list that receives the training rows; a new
            list is created when omitted.
        testSet: Optional list that receives the test rows; a new list is
            created when omitted.

    Returns:
        Tuple ``(trainingSet, testSet, labels, target, test)``.  ``labels``
        is parallel to the CSV rows.  ``test`` lists the ids of the movies
        placed in ``testSet``.  ``target`` has one slot per movie in
        ``di.getAllMovies()``: the leading ``len(test)`` slots hold the
        random placeholder labels and the remaining slots stay ``''``.
    """
    # Fresh lists per call: the former ``[]`` defaults were created once and
    # shared, so rows from earlier calls leaked into later results.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is what the csv module asks for; blank lines are dropped
    # because they carry neither a movie id nor a label.
    with open(filename, 'r', newline='') as csvfile:
        dataset = [record for record in csv.reader(csvfile) if record]

    movies = di.getAllMovies()
    allTagLen = len(di.getAllTags())
    idfMovArr = idf.idfMovieTag()

    def feature_row(movie_id):
        # Slots the TF-IDF vector does not cover keep the '' filler.
        row = [''] * allTagLen
        vector = idf.tfIdfMovieTag(movie_id, idfMovArr)
        for pos in range(len(vector)):
            row[pos] = vector[pos]
        return row

    labels = []
    for record in dataset:
        trainingSet.append(feature_row(record[0]))
        labels.append(record[1])

    # Set membership keeps the "already used for training?" check O(1).
    train = {int(record[0]) for record in dataset}

    test = []
    target = [''] * len(movies)
    label = ['0', '1']
    for mov in movies:
        if int(mov[0]) in train:
            continue
        row = feature_row(mov[0])
        # The true class is unknown here; a random placeholder is stored.
        target[len(test)] = random.choice(label)
        test.append(mov[0])
        testSet.append(row)
    return trainingSet, testSet, labels, target, test
def getGenreMovieTags(movie):
    """Return the movie's time-weighted tag frequencies aligned to all tags.

    For every id in ``movie.getUnqTags()`` the ``getTimeWeight()`` values of
    the entries of ``movie.getTags()`` carrying that id are summed and
    divided by the total number of entries.  The result is a list with one
    slot per row of ``di.getAllTags()``; tags the movie does not carry stay
    at ``0``.
    """
    all_tags = di.getAllTags()
    unique_ids = movie.getUnqTags()
    movie_tags = movie.getTags()
    n_movie_tags = len(movie_tags)

    weights = {}
    for wanted in unique_ids:
        total = 0
        for entry in movie_tags:
            if entry.getId() == wanted:
                total += entry.getTimeWeight()
        weights[wanted] = total / n_movie_tags

    # Project the sparse dict onto the global tag order.
    return [weights[row[0]] if row[0] in weights else 0 for row in all_tags]
def idfMovieTag():
    """Return one inverse-document-frequency weight per tag over all movies.

    Position ``p`` of the returned NumPy array belongs to the ``p``-th row of
    ``di.getAllTags()``.  The weight is ``log(movie_count / movies_with_tag)``
    and stays ``0`` when no movie carries the tag.
    """
    all_tags = di.getAllTags()
    all_movies = di.getAllMovies()
    n_movies = len(all_movies)
    weights = np.zeros(len(all_tags))

    # Tag-id lists per movie, split from the comma-joined value.
    tags_per_movie = [
        di.getMovieTagIds(movie[0])[0][0].split(",") for movie in all_movies
    ]

    for pos, tag in enumerate(all_tags):
        hits = sum(1 for movie_tags in tags_per_movie if tag[0] in movie_tags)
        weights[pos] = math.log(n_movies / hits) if hits != 0 else 0
    return weights
def vectTagMovRat():
    """Build the tag x movie x rating tensor as nested dictionaries.

    ``vect[tag_id][movie_id][rating]`` is ``1`` when the tag is among the
    movie's tag ids and the rating does not exceed the movie's value from
    ``di.getAllMovieRtngs()``; otherwise it is ``0``.

    Returns:
        Tuple ``(vect, tags, movies, ratings)``.
    """
    tags = di.getAllTags()
    movies = di.getAllMovies()
    ratings = di.getAllRatings()
    avg_by_movie = {row[0]: row[1] for row in di.getAllMovieRtngs()}

    vect = defaultdict(lambda: defaultdict(dict))
    for mov in movies:
        mov_id = mov[0]
        mov_tags = di.getMovieTagIds(mov_id)[0][0].split(",")
        for tag in tags:
            tag_id = tag[0]
            for rtng in ratings:
                level = rtng[0]
                # The average is only looked up for tags the movie carries.
                if tag_id in mov_tags and level <= avg_by_movie[mov_id]:
                    vect[tag_id][mov_id][level] = 1
                else:
                    vect[tag_id][mov_id][level] = 0
    return (vect, tags, movies, ratings)
# Build a movie-by-movie similarity matrix from tag term frequencies and
# collect the names of the movies rated by `userId`.
# NOTE(review): `userId` is not assigned in this fragment; it has to be
# provided by the surrounding code.
movies = db.getAllMovies()
movieNames = db.getAllMovieNames()

# tfmovies: movie id -> {tag id: share of the movie's tag rows with that id}
tfmovies = {}
for movieId in movies:
    Taglist = db.getMovieTags(movieId[0])
    UnqTags = db.getMovieTagIds(movieId[0])[0][0].split(",")
    tfvect = {}
    # A movie without tag rows gets an empty vector instead of dividing by 0.
    if len(Taglist) > 0:
        for tag in UnqTags:
            tffact = 0
            for t in Taglist:
                if (t[0] == tag):
                    tffact += 1
            # Key by the whole tag id.  `tag` is a string, so the previous
            # `tag[0]` kept only its first character and multi-character ids
            # never matched the lookup by full id below.
            tfvect[tag] = tffact / len(Taglist)
    tfmovies[movieId[0]] = tfvect

# Dense matrix: one row per movie, one column per tag, in database order.
tagids = db.getAllTags()
movietf = np.zeros((len(tfmovies), len(tagids)))
for i in range(len(tfmovies)):
    for j in range(len(tagids)):
        if (tagids[j][0] in tfmovies[movies[i][0]].keys()):
            movietf[i][j] = tfmovies[movies[i][0]][tagids[j][0]]

# Dot product of every pair of movie rows.
matrix = np.matmul(movietf, np.transpose(movietf))

# Seed movies: the first column of the user's rated-movie rows.
seedList = db.getUserMoviesRates(userId)
seeds = []
for seed in seedList:
    seeds.append(seed[0])
seedNames = []
for i in range(len(movies)):
    if (movies[i][0] in seeds):
        seedNames.append(movieNames[i][0])
di.delRows("mltags", "movie_id", mov) di.delRows("movie_actor", "movie_id", mov) di.delRows("movie_info", "movie_id", mov) allUsers = di.getAllUsers() delUsers = [] for usr in allUsers: if (int(usr[0]) <= 71550): delUsers.append(usr[0]) print("delUsers", len(delUsers)) for usr in delUsers: di.delRows("mlratings", "user_id", usr) di.delRows("mltags", "user_id", usr) di.delRows("mlusers", "user_id", usr) print("usr ="******"actor = ", act[0]) di.delRows("imdb_actor_info", "actor_id", act[0]) for tag in allTags: if (tag[0] not in mlTg):
import dbInfo as db
import numpy as np
import utils
import tfCalc as tf
import warnings

warnings.filterwarnings("ignore")

allTags = db.getAllTags()
lenTags = len(allTags)


def genSVDMatrix(genrelist):
    """Generate the matrix that is used as input to SVD.

    One row is produced per movie of the genre object created from
    ``genrelist``; each row is the result of ``utils.getGenreMovieTags``.
    """
    genre = tf.createGenObj(genrelist)
    genre_movies = genre.getMovies()
    rows = [[0] * lenTags for _ in range(len(genre_movies))]
    for idx, movie in enumerate(genre_movies):
        rows[idx] = utils.getGenreMovieTags(movie)
    return rows


def svdCalc(mat, numSem):
    """Return the first ``numSem`` rows of the third factor of ``mat``'s SVD.

    The decomposition is computed with ``np.linalg.svd`` using
    ``full_matrices=False``; the selected rows are copied into a new array
    of shape ``(numSem, len(V[0]))``.
    """
    _, _, V = np.linalg.svd(mat, full_matrices=False)
    width = len(V[0])
    sem = np.zeros((numSem, width))
    for row in range(numSem):
        sem[row] = V[row]
    return sem
import dbInfo as di
import utils
import lda
import sys
from operator import itemgetter
import tensorDecomp as td
import persPageRank as ppr
import tfCalc as tf
import tfIdfCalc as idf
import numpy as np
from scipy.stats import mode

movies = di.getAllMovies()
tagIds = di.getAllTags()
allTagLen = len(tagIds)
movieLen = len(movies)


def formSvdMat(numSemantics):
    """Return each movie's coordinates in the first ``numSemantics`` factors.

    A movie-by-tag matrix is filled row by row from ``idf.tfIdfMovieTag`` and
    decomposed with ``np.linalg.svd`` (``full_matrices=False``).  The result
    is a ``(movieLen, numSemantics)`` array holding the leading
    ``numSemantics`` columns of the first SVD factor.

    If the matrix has fewer rows or columns than ``numSemantics`` a message
    is printed and the process exits through ``sys.exit()``.
    """
    mat = np.zeros((movieLen, allTagLen))
    too_few_rows = len(mat) < numSemantics
    if too_few_rows or len(mat[0]) < numSemantics:
        print("cant report top semantics")
        sys.exit()

    idfMovArr = idf.idfMovieTag()
    for row in range(movieLen):
        movie_id = movies[row][0]
        mat[row] = idf.tfIdfMovieTag(movie_id, idfMovArr)

    U, _, _ = np.linalg.svd(mat, full_matrices=False)

    movieFacts = np.zeros((movieLen, numSemantics))
    for row in range(movieLen):
        movieFacts[row] = U[row][:numSemantics]
    return movieFacts