import csv
import time

import pymongo

import imdbUtil


def parse(mongo, collectionName):
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 1800000  # Approximate number of total lines in the file.
    count = 0
    updateCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    bulkCount = 0
    print("=== Starting Parse of language.list ===")
    startTime = time.time()
    with open("imdbdata/language.list", encoding="latin1") as tsv:
        languages = []
        title = -1
        lastTitle = -1
        for line in csv.reader(tsv, delimiter="\t"):
            valueInd = 0  # Which column in the TSV file are we reading?
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            for value in line:
                if value == "":
                    continue
                if valueInd == 0:  # Movie Title
                    if imdbUtil.isEpisode(value):
                        break
                    lastTitle = title
                    title = value
                    # We moved onto the next movie; update the database with the language info of the previous movie first.
                    if title != lastTitle and len(languages) > 0:
                        bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(lastTitle)} ).update( { "$set": { "languages":languages.copy() } } )
                        updateCount += 1
                        bulkCount += 1
                        languages = []
                elif valueInd == 1:  # Language
                    languages.append(__formatLanguage(value))  # Module-private helper, defined elsewhere in this module.
                valueInd += 1
            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0
    # Flush the language info of the final movie in the file.
    if len(languages) > 0:
        bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(title)} ).update( { "$set": { "languages":languages.copy() } } )
        updateCount += 1
        bulkCount += 1
    if bulkCount > 0:
        bulkPayload.execute()
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attempted updating", str(updateCount), "movies with language information.")

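# Compatibility note: initialize_unordered_bulk_op() / BulkOperationBuilder, used
# throughout these parsers, belong to the old PyMongo bulk API (deprecated in
# PyMongo 3.x, removed in 4.0). A minimal sketch of the modern equivalent using
# Collection.bulk_write(); `pendingUpdates`, a list of (imdbtitle, languages)
# pairs, is a hypothetical stand-in for the batch accumulated above.
def _bulkWriteLanguages(collection, pendingUpdates):
    requests = [
        pymongo.UpdateOne({"imdbtitle": title}, {"$set": {"languages": languages}})
        for title, languages in pendingUpdates
    ]
    if requests:
        # ordered=False mirrors the unordered bulk ops above: one failed update
        # does not abort the rest of the batch.
        collection.bulk_write(requests, ordered=False)
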
def parse(mongo, collectionName):
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 2700000  # Approximate number of total lines in the file.
    count = 0
    updateCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    bulkCount = 0
    # Relations we are interested in logging
    relations = ["follows", "followed by", "remake of", "remade as", "version of", "spin off from"]
    print("=== Starting Parse of movie-links.list ===")
    startTime = time.time()
    with open("imdbdata/movie-links.list", encoding="latin1") as f:
        title = -1
        links = []
        nextReady = True  # Are we finished with the current movie's information and ready to move onto the next?
        for line in f:
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            if nextReady:  # Title Line
                title = imdbUtil.formatTitle(line)
                nextReady = False
            if line.strip() == "":
                # Update the corresponding movie entries in the database with the relational info.
                if len(links) > 0 and not imdbUtil.isEpisode(title):
                    bulkPayload.find( {"imdbtitle":title} ).update( { "$set": { "related":links } } )
                    bulkCount += 1
                    updateCount += 1
                links = []
                nextReady = True
            for relation in relations:
                if relation in line:  # Relation Line
                    links.append(imdbUtil.formatTitle(line[line.index(relation)+len(relation)+1:].strip()[:-1]))
                    break
            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0
    if bulkCount > 0:
        bulkPayload.execute()
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attempted updating", str(updateCount), "movies with relational information.")

def parse(mongo, collectionName):
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 1000000  # Approximate number of total lines in the file.
    count = 0
    updateCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    bulkCount = 0
    print("=== Starting Parse of aka-titles.list ===")
    startTime = time.time()
    with open("imdbdata/aka-titles.list", encoding="latin1") as f:
        title = -1
        isEpisode = False
        akatitles = []
        for line in f:
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            if "(aka" in line:
                if not isEpisode:
                    akatitles.append(imdbUtil.simpleTitle(line[line.index("(aka")+5:]))
            elif imdbUtil.isEpisode(line):
                isEpisode = True
            else:
                # Update the corresponding movie entry in the database with the alternate title info.
                if len(akatitles) > 0:
                    bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(title)} ).update( { "$addToSet": { "title": {"$each": list(set(akatitles))} } } )
                    bulkCount += 1
                    updateCount += 1
                isEpisode = False
                title = line
                akatitles = []
            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0
    if bulkCount > 0:
        bulkPayload.execute()
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attempted updating", str(updateCount), "movies with alternate title information.")

def parse(mongo, collectionName):
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 1800000  # Approximate number of total lines in the file.
    count = 0
    updateCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    bulkCount = 0
    print("=== Starting Parse of countries.list ===")
    startTime = time.time()
    with open("imdbdata/countries.list", encoding="latin1") as tsv:
        for line in csv.reader(tsv, delimiter="\t"):
            title = -1
            country = -1
            valueInd = 0  # Which column in the TSV file are we reading?
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            for value in line:
                if value == "":
                    continue
                if valueInd == 0:  # Movie Title
                    if imdbUtil.isEpisode(value):
                        break
                    title = value
                elif valueInd == 1:  # Country
                    country = value
                valueInd += 1
            # Update the corresponding movie entries in the database with the country info.
            if title != -1 and country != -1:
                bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(title)} ).update( { "$set": { "country":country } } )
                bulkCount += 1
                updateCount += 1
            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0
    if bulkCount > 0:
        bulkPayload.execute()
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attempted updating", str(updateCount), "movies with country information.")

def parse(mongo, collectionName):
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 3700000  # Approximate number of total lines in the file.
    bulkSize = 5000  # How many documents should we store in memory before inserting them into the database in bulk?
    bulkCount = 0
    pendingDoc = {}  # Current document we are parsing data for. Once finished, will be appended to bulkPayload.
    # List of documents that will be given to the database to be inserted to the collection in bulk.
    bulkPayload = pymongo.bulk.BulkOperationBuilder(mongo.db[collectionName], ordered=False)
    count = 0
    movieCount = 0
    tvCount = 0
    skipCount = 0
    print("=== Starting Parse of movies.list ===")
    startTime = time.time()
    with open("imdbdata/movies.list", encoding="latin1") as tsv:
        for line in csv.reader(tsv, delimiter="\t"):
            title = -1
            year = -1
            valueInd = 0  # Which column in the TSV file are we reading?
            isEpisode = False
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            # Parse the text data from the read TSV line
            for value in line:
                if value == "":
                    continue
                if valueInd == 0:  # Movie Title
                    if imdbUtil.isEpisode(value):
                        isEpisode = True
                        break
                    title = value
                elif valueInd == 1:  # Year
                    year = imdbUtil.parseYear(value)
                valueInd += 1
            # This line corresponds to a movie. Add it to the database.
            if title != -1 and year != -1:
                if "imdbtitle" in pendingDoc and imdbUtil.stripEpisode(title) != imdbUtil.stripEpisode(pendingDoc["imdbtitle"]):
                    if "tv" in pendingDoc:
                        tvCount += 1
                    else:
                        movieCount += 1
                    pendingDoc["imdbtitle"] = imdbUtil.formatTitle(pendingDoc["imdbtitle"])
                    bulkPayload.insert(pendingDoc.copy())
                    bulkCount += 1
                    pendingDoc.clear()
                    if bulkCount >= bulkSize:
                        try:
                            bulkPayload.execute()
                        except pymongo.errors.OperationFailure as e:
                            skipCount += len(e.details["writeErrors"])
                        bulkPayload = pymongo.bulk.BulkOperationBuilder(mongo.db[collectionName], ordered=False)
                        bulkCount = 0
                pendingDoc["imdbtitle"] = title
                pendingDoc["title"] = [imdbUtil.simpleTitle(title)]
                pendingDoc["year"] = year
            # This line corresponds to a TV episode. Mark the previously logged movie as actually being a TV show, rather than a movie.
            if isEpisode or "(TV)" in str(title):
                pendingDoc["tv"] = 1
    # Flush the final pending document and any remaining bulk inserts.
    if pendingDoc != {}:
        if "tv" in pendingDoc:
            tvCount += 1
        else:
            movieCount += 1
        pendingDoc["imdbtitle"] = imdbUtil.formatTitle(pendingDoc["imdbtitle"])
        bulkPayload.insert(pendingDoc.copy())
        bulkCount += 1
    if bulkCount > 0:
        try:
            bulkPayload.execute()
        except pymongo.errors.OperationFailure as e:
            skipCount += len(e.details["writeErrors"])
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Found", str(movieCount), "movies.")
    print("[*] Found", str(tvCount), "TV shows.")
    print("[*] Skipped", str(skipCount), "insertions.")

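# The OperationFailure handling above counts duplicate-key write errors as skipped
# insertions, which implies a unique index on "imdbtitle" (an assumption; the index
# creation is not shown in this file). A minimal sketch of creating it before the
# first parse:
def _ensureTitleIndex(collection):
    # Unique index so duplicate titles are rejected by the server and
    # surface as writeErrors that parse() counts as skips.
    collection.create_index("imdbtitle", unique=True)
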
def parse(mongo, collectionName):
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 2200000  # Approximate number of total lines in the file.
    count = 0
    updateCount = 0
    removeCount = 0
    ignoreUntil = "8: THE GENRES LIST"  # Ignore parsing of lines until one matching this string is found
    ignoring = True
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    bulkCount = 0
    print("=== Starting Parse of genres.list ===")
    startTime = time.time()
    with open("imdbdata/genres.list", encoding="latin1") as tsv:
        genres = []
        title = -1
        lastTitle = -1
        for line in csv.reader(tsv, delimiter="\t"):
            valueInd = 0  # Which column in the TSV file are we reading?
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            for value in line:
                if value == "":
                    continue
                if value == ignoreUntil:
                    ignoring = False
                if ignoring:
                    break
                if valueInd == 0:  # Movie Title
                    if imdbUtil.isEpisode(value):
                        break
                    lastTitle = title
                    title = value
                    # We moved onto the next movie; update the database with the genre info of the previous movie first.
                    if title != lastTitle and len(genres) > 0:
                        if "Adult" in genres:
                            removeCount += 1
                            bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(lastTitle)} ).remove()
                        else:
                            updateCount += 1
                            bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(lastTitle)} ).update( { "$set": { "genres":genres.copy() } } )
                        bulkCount += 1
                        genres = []
                elif valueInd == 1:  # Genre
                    genres.append(value)
                valueInd += 1
            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0
    # Flush the genre info of the final movie in the file.
    if len(genres) > 0:
        if "Adult" in genres:
            removeCount += 1
            bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(title)} ).remove()
        else:
            updateCount += 1
            bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(title)} ).update( { "$set": { "genres":genres.copy() } } )
        bulkCount += 1
    if bulkCount > 0:
        bulkPayload.execute()
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attempted updating", str(updateCount), "movies with genre information.")
    print("[*] Removed", str(removeCount), "pornographic films from the database.")

def parse(mongo, collectionName):
    progressInterval = 250000  # How often should we print a progress report to the console?
    progressTotal = 6400000  # Approximate number of total lines in the file.
    count = 0
    updateCount = 0
    ignoreUntil = "8: THE KEYWORDS LIST"  # Ignore parsing of lines until one matching this string is found
    ignoring = True
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    bulkCount = 0
    print("=== Starting Parse of keywords.list ===")
    startTime = time.time()
    with open("imdbdata/keywords.list", encoding="latin1") as tsv:
        keywords = []
        title = -1
        lastTitle = -1
        for line in csv.reader(tsv, delimiter="\t"):
            valueInd = 0  # Which column in the TSV file are we reading?
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            for value in line:
                if value == "":
                    continue
                if value == ignoreUntil:
                    ignoring = False
                if ignoring:
                    break
                if valueInd == 0:  # Movie Title
                    if imdbUtil.isEpisode(value):
                        break
                    lastTitle = title
                    title = value
                    # We moved onto the next movie; update the database with the keyword info of the previous movie first.
                    if title != lastTitle and len(keywords) > 0:
                        bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(lastTitle)} ).update( { "$set": { "keywords":keywords.copy() } } )
                        bulkCount += 1
                        keywords = []
                elif valueInd == 1:  # Keyword
                    keywords.append(value)
                    updateCount += 1  # Counts individual keywords, not movies.
                valueInd += 1
            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0
    # Flush the keyword info of the final movie in the file.
    if len(keywords) > 0:
        bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(title)} ).update( { "$set": { "keywords":keywords.copy() } } )
        bulkCount += 1
    if bulkCount > 0:
        bulkPayload.execute()
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Added", str(updateCount), "keywords to movies in the database.")

def parse(mongo, collectionName, field, listfile, progressTotal):
    progressInterval = 250000  # How often should we print a progress report to the console?
    if progressTotal > 10000000:
        progressInterval = 500000
    count = 0
    updateCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    bulkCount = 0
    # Markers that must be found in the file before names will start (and stop) being parsed
    startFlag = False
    startFlag2 = False
    endFlag = False
    print("=== Starting Parse of " + listfile + " ===")
    startTime = time.time()
    with open("imdbdata/" + listfile, encoding="latin1") as tsv:
        name = -1
        for line in csv.reader(tsv, delimiter="\t"):
            foundOnLine = False  # Did we find any content on this line? Lines with no content reset and set up for the next name.
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))
            for value in line:
                if endFlag:
                    break
                if value.strip()[:4] == "Name":
                    startFlag = True
                    break
                # Check for the long dashed end-of-list marker before the short "----"
                # column underline, so the footer is not mistaken for the header.
                if "-----------------" in value and startFlag and startFlag2:
                    endFlag = True
                    break
                if value.strip()[:4] == "----" and startFlag:
                    startFlag2 = True
                    break
                if not startFlag or not startFlag2:
                    break
                if value == "":
                    continue
                foundOnLine = True
                if name == -1:
                    name = imdbUtil.formatName(value)
                    updateCount += 1
                # Skip logging people for TV episodes, because there's no easy way to tell if a person is a
                # "main character", and I don't want to be logging every single minor/cameo person who appeared
                # in some single episode of some series. Plus, we're more concerned about movies, not TV shows.
                elif not imdbUtil.isEpisode(value):
                    bulkPayload.find( {"imdbtitle":imdbUtil.formatTitle(value)} ).update( { "$addToSet": { field:name } } )
                    bulkCount += 1
            if not foundOnLine:
                name = -1
            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0
    if bulkCount > 0:
        bulkPayload.execute()
    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attempted updating movies with information about", str(updateCount), "people.")

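# A minimal driver sketch showing how these parsers fit together. Assumptions (not
# shown in the source): `mongo` is a connected PyMongo-style wrapper exposing .db,
# and each parse() above lives in its own module. The movies.list parser must run
# first, since it inserts the base documents that every other parser only updates.
def parseAll(mongo, collectionName, moviesParse, updaterParses):
    moviesParse(mongo, collectionName)  # Inserts the base movie documents.
    for parseFn in updaterParses:  # genres, keywords, languages, countries, aka-titles, movie-links, ...
        parseFn(mongo, collectionName)

# Hypothetical usage (module names are illustrative):
#   parseAll(mongo, "movies", movies.parse,
#            [genres.parse, keywords.parse, languages.parse,
#             countries.parse, akatitles.parse, movielinks.parse])
#   # The name-list parser takes extra arguments (field, listfile, progressTotal):
#   persons.parse(mongo, "movies", "actors", "actors.list", 17000000)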