def predict(q, nlpServer_Port):
    """Score every movie plot against the query and return the top picks.

    Combines five independently computed, per-movie score vectors
    (characters, entities, events, named entities, semantics), normalizes
    the totals into a probability-like distribution, and ranks the movies.

    Relies on the module-level ``topPicksCount`` constant.

    :param q: free-text user query.
    :param nlpServer_Port: port of the running CoreNLP-style server.
    :return: result of ``ranking(...)`` — the top ``topPicksCount`` movies.
    """
    print()
    print("Query = " + q)
    allMovies = AllfilesInFolder.getAllFilesInFolder("Data\Plots")

    # Each helper returns one score per movie, aligned with allMovies.
    c_score = character_score(q, nlpServer_Port)
    en_score = entity_score(q, nlpServer_Port)
    ev_score = event_score(q, nlpServer_Port)
    ne_score = ner_score(q, nlpServer_Port)
    s_score = semantic_score(q)

    print()
    print("Final Score:")
    total_score = []
    for i, movie in enumerate(allMovies):
        total = round(c_score[i] + en_score[i] + ev_score[i] + ne_score[i] + s_score[i], 5)
        total_score.append(total)
        print(movie + " Character Score: " + str(c_score[i]) + " Entity Score: " + str(en_score[i]) + " Event Score: " + str(ev_score[i]) + " NE score: " + str(ne_score[i]) + " Semantic score: " + str(s_score[i]) + " Total Score: " + str(total))

    # Normalize to a probability distribution; the component scores are
    # non-negative, so a zero norm_factor means every score is 0 and the
    # probabilities are all 0 as well (guards against division by zero).
    norm_factor = sum(total_score)
    prob = [round(x / norm_factor, 5) if norm_factor > 0 else 0 for x in total_score]

    topPicks = ranking(allMovies, prob, topPicksCount)
    return topPicks
def matchEntities(allMovies, query_entities, threshold):
    """Score each movie by WordNet similarity between query and movie entities.

    For every query entity, finds the best Wu-Palmer similarity against any
    entity of each movie; if that best score reaches ``threshold`` it is
    added to the movie's score.

    :param allMovies: list of movie titles (plot file names); indexes the score vector.
    :param query_entities: entities extracted from the user query.
    :param threshold: minimum Wu-Palmer similarity to count as a match.
    :return: list of floats, one per movie in ``allMovies`` order.
    """
    mFolder = "Data\Entities"
    mFile = str(AllfilesInFolder.getAllFilesInFolder(mFolder)[0])
    with open(mFolder + '/' + mFile, 'r') as f:
        allMovieEntities = json.loads(f.read())

    score = [0] * len(allMovies)
    if not query_entities:
        return score

    for query_entity in query_entities:
        # Hoisted: the query entity's synsets do not change per movie.
        query_synsets = wn.synsets(query_entity)
        for movie in allMovieEntities:
            movie_entities = movie["Entities"].split(",")
            en_score = 0
            for movie_entity in movie_entities:
                movie_synsets = wn.synsets(movie_entity)
                if not movie_synsets or not query_synsets:
                    continue  # no WordNet entry for one side; skip pair
                sim = movie_synsets[0].wup_similarity(query_synsets[0])
                # wup_similarity returns None when the synsets share no
                # common ancestor — treat that as "no similarity".
                if sim is not None and sim > en_score:
                    en_score = sim
            if en_score >= threshold:
                score[allMovies.index(movie["movieTitle"])] += en_score
                print("Entity '" + query_entity + "' found in " + movie["movieTitle"])
    return score
def extractVerbsFromDB(nlpServer_Port):
    """Extract verbs (events) from every plot file and persist them.

    Reads each plot under ``Data\\Plots`` and writes its space-separated
    verb list to ``Data\\Event\\AllVerbs`` under the same file name.

    :param nlpServer_Port: port of the running NLP server used by ``getVerbs``.
    """
    movies = AllfilesInFolder.getAllFilesInFolder('Data\Plots\\')
    print("Extracting events from plots")
    for movie in tqdm(movies):
        # `with` guarantees the handles close even if getVerbs raises.
        with open('Data\Plots\\' + movie, 'r') as r:
            plot = r.read()
        verbs = getVerbs(plot, nlpServer_Port)
        with open('Data\Event\AllVerbs\\' + movie, 'w') as w:
            w.writelines(["%s " % vb for vb in verbs])
def event_score(q, nlpServer_Port):
    """Compute the per-movie event score for a query.

    Extracts the unique verbs (events) from the query, matches them against
    each movie's TF-IDF event profile, and normalizes by the number of
    query events. Relies on the module-level ``threshold``.

    :param q: free-text user query.
    :param nlpServer_Port: port of the running NLP server.
    :return: list of floats, one per movie plot file.
    """
    allMovies = AllfilesInFolder.getAllFilesInFolder("Data\Plots")

    # Events
    print()
    query_events = list(set(extractEvents.getVerbs(q, nlpServer_Port)))
    if not query_events:
        print("No events identified in query")
        return [0] * len(allMovies)

    print("Events identified in query: " + str(query_events))
    event_score = Score.matchEvent(allMovies, query_events, threshold)
    # Normalize so queries with many events are not unfairly advantaged.
    return [round(x / len(query_events), 5) for x in event_score]
def ner_score(q, nlpServer_Port):
    """Compute the per-movie named-entity (NE) score for a query.

    Extracts named entities from the query, counts substring matches
    against each movie's stored NE list, and normalizes by the number of
    query NEs.

    :param q: free-text user query.
    :param nlpServer_Port: port of the running NLP server.
    :return: list of floats, one per movie plot file.
    """
    allMovies = AllfilesInFolder.getAllFilesInFolder("Data\Plots")

    # NE
    print()
    query_ne = extractNER.getNE(q, nlpServer_Port)
    if not query_ne:
        print("No NE identified in query")
        return [0] * len(allMovies)

    print("NE identified in query: " + str(query_ne))
    ne_score = Score.matchNE(allMovies, query_ne)
    # Normalize so queries with many NEs are not unfairly advantaged.
    return [round(x / len(query_ne), 5) for x in ne_score]
def extractMovieCharacersfromDB(nlpServer_Port):
    """Extract character names from every plot and save them as one JSON file.

    Writes ``Data\\Characters\\Characters.txt`` containing a list of
    ``{"movieTitle": ..., "characters": "comma,separated,names"}`` records.

    :param nlpServer_Port: port of the NLP server used by ``getCharacterNames``.
    """
    dataFolder = "Data\Plots"
    plotFiles = AllfilesInFolder.getAllFilesInFolder(dataFolder)
    characters = []
    print ("Extracting characters from plots")
    for movie in tqdm(plotFiles):
        # `with` guarantees the plot file closes even if extraction raises.
        with open(dataFolder + "/" + movie, 'r') as f:
            plot = f.read()
        allCharacters = getCharacterNames(plot, nlpServer_Port)
        characters.append({"movieTitle": movie, "characters": ",".join(allCharacters)})
    with open('Data\Characters\Characters.txt', 'w') as f:
        f.write(json.dumps(characters))
    print ("Characters file successfully created")
def entity_score(q, nlpServer_Port):
    """Compute the per-movie entity score for a query.

    Extracts entities from the query, scores them against each movie's
    stored entities via WordNet similarity, and normalizes by the number of
    query entities. Relies on the module-level ``threshold``.

    :param q: free-text user query.
    :param nlpServer_Port: port of the running NLP server.
    :return: list of floats, one per movie plot file.
    """
    allMovies = AllfilesInFolder.getAllFilesInFolder("Data\Plots")

    # Entities
    print()
    query_entities = extractEntities.getEntities(q, nlpServer_Port)
    if not query_entities:
        # Fixed typo in user-facing message ("identitifed").
        print("No entities identified in query")
        return [0] * len(allMovies)

    print("Entities identified in query: " + str(query_entities))
    entity_scores = Score.matchEntities(allMovies, query_entities, threshold)
    # Normalize so queries with many entities are not unfairly advantaged.
    return [round(x / len(query_entities), 5) for x in entity_scores]
def character_score(q, nlpServer_Port):
    """Compute the per-movie character score for a query.

    Extracts character names from the query, counts substring matches
    against each movie's stored character list, and normalizes by the
    number of query characters.

    :param q: free-text user query.
    :param nlpServer_Port: port of the running NLP server.
    :return: list of floats, one per movie plot file.
    """
    allMovies = AllfilesInFolder.getAllFilesInFolder("Data\Plots")

    # Characters
    print()
    query_characters = extractCharacters.getCharacterNames(q, nlpServer_Port)
    if not query_characters:
        # Fixed typo in user-facing message ("identitifed").
        print("No characters identified in query")
        return [0] * len(allMovies)

    print("Characters identified in query: " + str(query_characters))
    characters_score = Score.matchCharacters(allMovies, query_characters)
    # Normalize so queries with many characters are not unfairly advantaged.
    return [round(x / len(query_characters), 5) for x in characters_score]
def cleanUpDB():
    """Clean every raw plot and write the result into the plots folder.

    Reads each file in ``Data\\RawPlots``, runs ``cleanUp`` on it, and
    writes non-empty results under the same name in ``Data\\Plots``.
    Empty results are reported and skipped.
    """
    readFolder = "Data\RawPlots"
    writeFolder = "Data\Plots"
    movies = AllfilesInFolder.getAllFilesInFolder(readFolder)
    for movie in movies:
        with open(readFolder + "/" + movie, 'r', encoding="utf8") as f:
            plot = f.read()
        cleanPlot = cleanUp(plot)
        if cleanPlot:
            # Normalized the accidental "//" separator; resolves to the same path.
            with open(writeFolder + "/" + movie, 'w') as w:
                w.write(cleanPlot)
        else:
            print(movie + " is empty")
    print("Movie plots cleaned up successfully")
def semantic_score(q):
    """Compute the per-movie semantic-role score for a query.

    Extracts semantic roles from the query and matches them against the
    stored per-movie semantic roles. Relies on the module-level
    ``threshold``.

    :param q: free-text user query.
    :return: list of floats, one per movie plot file.
    """
    allMovies = AllfilesInFolder.getAllFilesInFolder("Data\Plots")

    # Semantics
    query_semantics = extractSemantics.extractSemanctic(q)
    if not query_semantics:
        print("No Semantics found in query")
        return [0] * len(allMovies)

    print("Semantics found in query:")
    for idx, query_semantic in enumerate(query_semantics):
        print(str(idx + 1) + ". " + query_semantic.toString())
    # NOTE(review): allMovies is passed twice here — looks suspicious;
    # verify against Score.matchSemantics' signature before changing.
    s_score = Score.matchSemantics(allMovies, query_semantics, allMovies, threshold)
    return [round(x, 5) for x in s_score]
def extractEntitiesFromDB(nlpServer_Port):
    """Extract entities from every plot and save them as one JSON file.

    Writes ``Data\\Entities\\Entities.txt`` containing a list of
    ``{"movieTitle": ..., "Entities": "comma,separated,entities"}`` records.

    Fixes: the output file was never closed (``f.close`` without parens);
    also removed a no-op self-assignment and a redundant ``str()`` around
    the JSON string.

    :param nlpServer_Port: port of the NLP server used by ``getEntities``.
    """
    dataFolder = "Data\Plots"
    plotFiles = AllfilesInFolder.getAllFilesInFolder(dataFolder)
    movieEntities = []
    print("Extracting Entites from plots")
    for movie in tqdm(plotFiles):
        with open(dataFolder + "/" + movie, 'r') as f:
            plot = f.read()
        Entities_list = getEntities(plot, nlpServer_Port)
        movieEntities.append({"movieTitle": movie, "Entities": ",".join(Entities_list)})
    with open("Data\Entities\Entities.txt", "w") as f:
        f.write(json.dumps(movieEntities))
    print("Entity file successfully created")
def matchNE(allMovies, query_NEs):
    """Count query named entities found in each movie's stored NE list.

    A query NE matches a movie NE by substring containment; each query NE
    contributes at most 1 per movie (the inner ``break``).

    :param allMovies: list of movie titles (plot file names); indexes the score vector.
    :param query_NEs: named entities extracted from the user query.
    :return: list of integer match counts, one per movie in ``allMovies`` order.
    """
    mFolder = "Data\\NE"
    mFile = str(AllfilesInFolder.getAllFilesInFolder(mFolder)[0])
    with open(mFolder + '/' + mFile, 'r') as f:
        allMovieNE = json.loads(f.read())

    score = [0] * len(allMovies)
    if not query_NEs:
        return score

    for movie in allMovieNE:
        movie_NEs = movie["NE"].split(",")
        for query_NE in query_NEs:
            for movie_NE in movie_NEs:
                if query_NE in movie_NE:
                    score[allMovies.index(movie["movieTitle"])] += 1
                    print("NE '" + query_NE + "' found in " + movie["movieTitle"])
                    break  # count each query NE at most once per movie
    return score
def matchCharacters(allMovies, query_characters):
    """Count query character names found in each movie's stored character list.

    A query character matches a movie character by substring containment;
    each query character contributes at most 1 per movie (the inner ``break``).

    :param allMovies: list of movie titles (plot file names); indexes the score vector.
    :param query_characters: character names extracted from the user query.
    :return: list of integer match counts, one per movie in ``allMovies`` order.
    """
    mFolder = "Data\Characters"
    mFile = str(AllfilesInFolder.getAllFilesInFolder(mFolder)[0])
    with open(mFolder + '/' + mFile, 'r') as f:
        allMovieCharacters = json.loads(f.read())

    score = [0] * len(allMovies)
    if not query_characters:
        return score

    for movie in allMovieCharacters:
        movie_chars = movie["characters"].split(",")
        for query_char in query_characters:
            for movie_char in movie_chars:
                if query_char in movie_char:
                    score[allMovies.index(movie["movieTitle"])] += 1
                    print("Character '" + query_char + "' found in " + movie["movieTitle"])
                    break  # count each query character at most once per movie
    return score
def extractSemanticFromDB():
    """Extract semantic roles from every plot and bulk-insert them into MongoDB.

    Clears the existing collection first, then for each plot converts the
    text to sentences, computes semantic roles, and stores
    ``[root, properties, weight, movieTitle]`` rows.
    """
    dataFolder = "Data\Plots"
    plotFiles = AllfilesInFolder.getAllFilesInFolder(dataFolder)
    MongoDB.delete_all()  # rebuild from scratch so stale roles don't linger
    print("Extracting semantics roles from plots")
    for movie in tqdm(plotFiles):
        with open(dataFolder + "/" + movie, 'r') as f:
            plot = f.read()
        sentences = ParagraphToSentences.PtoS(plot)
        semantics = semanticRoleSimilarity(sentences)
        semanticsMongo = [[s.root, s.properties, s.weight, movie] for s in semantics]
        MongoDB.bulk_insert(semanticsMongo)
    print(str(MongoDB.recordCount()) + " semantic roles present in DB")
def extractNEFromDB(nlpServer_Port):
    """Extract named entities from every plot and save them as one JSON file.

    Writes ``Data\\NE\\MovieNE.txt`` containing a list of
    ``{"movieTitle": ..., "NE": "comma,separated,entities"}`` records with
    duplicates removed (first occurrence kept).

    :param nlpServer_Port: port of the NLP server used by ``getNE``.
    """
    dataFolder = "Data\Plots"
    plotFiles = AllfilesInFolder.getAllFilesInFolder(dataFolder)
    all_NE = []
    print("Extracting NE from plots")
    for movie in tqdm(plotFiles):
        with open(dataFolder + "/" + movie, 'r') as f:
            plot = f.read()
        allNE = getNE(plot, nlpServer_Port)
        # De-duplicate in O(n) while preserving first-seen order
        # (replaces the quadratic membership-test loop).
        trunc_NE = list(dict.fromkeys(allNE))
        all_NE.append({"movieTitle": movie, "NE": ",".join(trunc_NE)})
    with open("Data\\NE\MovieNE.txt", "w") as f:
        f.write(json.dumps(all_NE))
    print("NE file successfully created")
def extractTfidFromDB():
    """Build per-movie TF-IDF event files from the extracted verb lists.

    Reads every verb file under ``Data\\Event\\AllVerbs``, computes TF-IDF
    for each word against the whole corpus, and writes the events scoring
    above a small epsilon (sorted by descending score) as JSON to
    ``Data\\Event\\TFIDF`` under the same file name.
    """
    movies = AllfilesInFolder.getAllFilesInFolder('Data\Event\AllVerbs\\')
    rawDocs = []
    print("Building TF-IDF for the events")
    for movie in movies:
        with open('Data\Event\AllVerbs\\' + movie, 'r') as r:
            rawDocs.append(tb(r.read()))

    # Pair each doc with its file name instead of tracking a manual counter.
    for movie, doc in tqdm(zip(movies, rawDocs), total=len(movies)):
        scores = {word: tfidf(word, doc, rawDocs) for word in doc.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        # Drop near-zero scores to keep the event profiles compact.
        Tf_Score = [{"Event": word, "Score": round(score, 5)}
                    for word, score in sorted_words if score > 0.00001]
        with open('Data\Event\TFIDF\\' + movie, 'w') as w:
            w.write(json.dumps(Tf_Score))
    print("Event files successfully created")
def matchEvent(allMovies, queryEvents, threshold):
    """Score each movie by matching query events against its TF-IDF events.

    For every query event, finds the movie event with the highest Wu-Palmer
    similarity at or above ``threshold`` and adds
    ``tfidf_score * similarity`` to the movie's score.

    :param allMovies: list of movie titles; indexes the returned score vector.
    :param queryEvents: verbs/events extracted from the user query.
    :param threshold: minimum Wu-Palmer similarity to count as a match.
    :return: list of floats, one per movie in ``allMovies`` order.
    """
    score = [0] * len(allMovies)
    mFolder = "Data\Event\TFIDF"
    mFiles = AllfilesInFolder.getAllFilesInFolder(mFolder)
    for idx, mFile in enumerate(mFiles):
        print("Movie: " + mFile)
        with open(mFolder + '/' + mFile, 'r') as f:
            allMovieEvents = json.loads(f.read())
        for queryEvent in queryEvents:
            # Hoisted: the query event's synsets do not change per movie event.
            query_synsets = wn.synsets(queryEvent)
            eScore = []
            for movieEvent in allMovieEvents:
                movie_synsets = wn.synsets(movieEvent["Event"])
                sim = None
                if movie_synsets and query_synsets:
                    # wup_similarity returns None when the synsets share no
                    # common ancestor — treated as "no match" below.
                    sim = movie_synsets[0].wup_similarity(query_synsets[0])
                if sim is not None and sim >= threshold:
                    eScore.append(round(sim, 5))
                else:
                    eScore.append(0)
            if eScore:
                max_value = max(eScore)
                if max_value > 0:
                    best = allMovieEvents[eScore.index(max_value)]
                    print(" Event = '" + queryEvent + "' matched with '" + best['Event'] + "'. " + "Similarity Score:" + str(max_value) + " Tf-Idf Score:" + str(best['Score']))
                    # Weight the movie's TF-IDF importance by how close the match is.
                    score[idx] += best['Score'] * max_value
                else:
                    print(" No matches found for Event = '" + queryEvent + "'")
            else:
                print(" No Events were found in the movie")
    return score