コード例 #1
0
def parse(mongo, collectionName):
    """Parse ``imdbdata/language.list`` and record each movie's languages.

    The file is read as tab-separated rows. The first non-empty cell of a row
    is taken as the movie title and the second non-empty cell as a language;
    any further cells are ignored. Rows whose title ``imdbUtil.isEpisode``
    flags are skipped. Languages from consecutive rows with the same title are
    collected and queued as one ``$set`` of the ``languages`` field on the
    documents whose ``imdbtitle`` equals ``imdbUtil.formatTitle(title)``.
    Queued updates are sent as unordered bulk operations, ``bulkSize`` at a
    time.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to update.
    """
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 1800000  # Approximate number of total lines in the file.
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    count = 0
    updateCount = 0
    bulkCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()

    print("=== Starting Parse of language.list ===")
    startTime = time.time()
    with open("imdbdata/language.list", encoding="latin1") as tsv:
        languages = []  # Languages collected so far for the movie named by `title`.
        title = -1  # -1 means "no movie seen yet".
        for line in csv.reader(tsv, delimiter="\t"):
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))

            # Empty cells are skipped, so columns are counted over non-empty cells only.
            fields = [value for value in line if value != ""]
            if not fields or imdbUtil.isEpisode(fields[0]):
                continue

            # We moved onto the next movie; queue the language info of the previous movie first.
            if fields[0] != title and languages:
                bulkPayload.find({"imdbtitle": imdbUtil.formatTitle(title)}).update({
                    "$set": {"languages": languages}
                })
                updateCount += 1
                bulkCount += 1
                languages = []
            title = fields[0]

            if len(fields) > 1:
                languages.append(__formatLanguage(fields[1]))

            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0

    # Whatever is still pending belongs to the movie read last, i.e. `title`
    # (not the movie before it).
    if languages:
        bulkPayload.find({"imdbtitle": imdbUtil.formatTitle(title)}).update({
            "$set": {"languages": languages}
        })
        updateCount += 1
        bulkCount += 1

    if bulkCount > 0:
        bulkPayload.execute()

    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attemped updating", str(updateCount), "movies with language information.")
コード例 #2
0
def parse(mongo, collectionName):
    """Parse ``imdbdata/movie-links.list`` and record related titles per movie.

    The file is read line by line as blank-line-separated entries: the first
    line of an entry is treated as the movie title and the following lines are
    scanned for one of the phrases in ``relations``. The text after a matched
    phrase (minus its final character) is passed through
    ``imdbUtil.formatTitle`` and collected. At the end of an entry the
    collected titles are queued as a ``$set`` of the ``related`` field on the
    documents whose ``imdbtitle`` equals the formatted entry title, unless
    ``imdbUtil.isEpisode`` flags that title. Queued updates are sent as
    unordered bulk operations, ``bulkSize`` at a time.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to update.
    """
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 2700000  # Approximate number of total lines in the file.
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    count = 0
    updateCount = 0
    bulkCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()

    # Relations we are interested in logging
    relations = ["follows", "followed by", "remake of", "remade as", "version of", "spin off from"]

    print("=== Starting Parse of movie-links.list ===")
    startTime = time.time()
    title = -1
    links = []
    nextReady = True  # Are we finished with the current movie's information and ready to move onto the next?

    # `with` guarantees the file is closed even if parsing or a bulk write raises.
    with open("imdbdata/movie-links.list", encoding="latin1") as f:
        for line in f:
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))

            isTitleLine = nextReady
            if isTitleLine:
                title = imdbUtil.formatTitle(line)
                nextReady = False

            if line.strip() == "":
                # End of entry: queue the relational info gathered for this movie.
                if links and not imdbUtil.isEpisode(title):
                    bulkPayload.find({"imdbtitle": title}).update({
                        "$set": {"related": links}
                    })
                    bulkCount += 1
                    updateCount += 1
                links = []
                nextReady = True
            elif not isTitleLine:
                # Only the lines below the title can be relation lines; scanning the
                # title itself would turn a title containing e.g. "version of" into a link.
                for relation in relations:
                    if relation in line:
                        links.append(imdbUtil.formatTitle(line[line.index(relation)+len(relation)+1:].strip()[:-1]))
                        break

            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0

    # The file may end without a trailing blank line; do not lose the last entry.
    if links and not imdbUtil.isEpisode(title):
        bulkPayload.find({"imdbtitle": title}).update({
            "$set": {"related": links}
        })
        bulkCount += 1
        updateCount += 1

    if bulkCount > 0:
        bulkPayload.execute()

    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attemped updating", str(updateCount), "movies with taxonomic information.")
コード例 #3
0
def parse(mongo, collectionName):
    """Parse ``imdbdata/aka-titles.list`` and add alternate titles to movies.

    The file is read line by line. A line containing ``"(aka"`` contributes an
    alternate title: the text starting five characters after that marker,
    passed through ``imdbUtil.simpleTitle``. Any other line starts a new entry:
    if ``imdbUtil.isEpisode`` flags it, the alternate titles that follow are
    ignored; otherwise the titles collected for the previous entry are queued
    as an ``$addToSet`` (with ``$each``) on the ``title`` field of the
    documents whose ``imdbtitle`` equals ``imdbUtil.formatTitle`` of that
    entry's line. Queued updates are sent as unordered bulk operations,
    ``bulkSize`` at a time.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to update.
    """
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 1000000  # Approximate number of total lines in the file.
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    count = 0
    updateCount = 0
    bulkCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()

    print("=== Starting Parse of aka-titles.list ===")
    startTime = time.time()
    title = -1
    isEpisode = False
    akatitles = []

    # `with` guarantees the file is closed even if parsing or a bulk write raises.
    with open("imdbdata/aka-titles.list", encoding="latin1") as f:
        for line in f:
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))

            if "(aka" in line:
                if not isEpisode:
                    akatitles.append(imdbUtil.simpleTitle(line[line.index("(aka")+5:]))
            elif imdbUtil.isEpisode(line):
                # Pending titles of the previous movie stay queued under `title`
                # until the next non-episode entry (or the end of the file).
                isEpisode = True
            else:
                # Update the corresponding movie entry in the database with the alternate title info.
                if akatitles:
                    bulkPayload.find({"imdbtitle": imdbUtil.formatTitle(title)}).update({
                        "$addToSet": {"title": {"$each": list(set(akatitles))}}
                    })
                    bulkCount += 1
                    updateCount += 1

                isEpisode = False
                title = line
                akatitles = []

            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0

    # Titles still pending when the file ends belong to the last movie entry.
    if akatitles:
        bulkPayload.find({"imdbtitle": imdbUtil.formatTitle(title)}).update({
            "$addToSet": {"title": {"$each": list(set(akatitles))}}
        })
        bulkCount += 1
        updateCount += 1

    if bulkCount > 0:
        bulkPayload.execute()

    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Attemped updating", str(updateCount), "movies with alternate title information.")
コード例 #4
0
def parse(mongo, collectionName):
    """Parse ``imdbdata/countries.list`` and set each movie's country.

    Each tab-separated row is reduced to its non-empty cells: the first is the
    movie title, the second the country, the rest are ignored. Rows without
    both cells, and rows whose title ``imdbUtil.isEpisode`` flags, produce no
    update. Every remaining row queues a ``$set`` of the ``country`` field on
    the documents whose ``imdbtitle`` equals ``imdbUtil.formatTitle(title)``.
    Queued updates are sent as unordered bulk operations in batches.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to update.
    """
    reportEvery = 100000  # Print a progress report every this many lines.
    expectedLines = 1800000  # Approximate number of total lines in the file.
    batchLimit = 5000  # Number of queued queries that triggers a bulk send.
    linesRead = 0
    attempted = 0
    queued = 0
    batch = mongo.db[collectionName].initialize_unordered_bulk_op()

    print("=== Starting Parse of countries.list ===")
    began = time.time()
    with open("imdbdata/countries.list", encoding="latin1") as tsv:
        for row in csv.reader(tsv, delimiter="\t"):
            linesRead += 1
            if linesRead % reportEvery == 0:
                print(str(linesRead), "lines processed so far. ("+str(int((linesRead/expectedLines)*100))+"%%) (%0.2fs)" % (time.time()-began))

            # Columns are counted over non-empty cells only.
            cells = [cell for cell in row if cell != ""]
            if not cells:
                continue
            if imdbUtil.isEpisode(cells[0]):
                continue
            if len(cells) < 2:
                continue  # Title without a country: nothing to store.

            # Queue the country info for the matching movie entries.
            selector = {"imdbtitle": imdbUtil.formatTitle(cells[0])}
            batch.find(selector).update({"$set": {"country": cells[1]}})
            queued += 1
            attempted += 1

            if queued >= batchLimit:
                batch.execute()
                batch = mongo.db[collectionName].initialize_unordered_bulk_op()
                queued = 0

    if queued > 0:
        batch.execute()

    print("[*] Parse Complete (%0.2fs)" % (time.time()-began))
    print("[*] Attemped updating", str(attempted), "movies with country information.")
コード例 #5
0
def parse(mongo, collectionName):
    """Parse ``imdbdata/movies.list`` and insert one document per title.

    Each tab-separated row is reduced to its non-empty cells: the first is the
    title and the second is turned into a year by ``imdbUtil.parseYear``. A row
    is logged when it has a title and a year other than -1. Logged rows are
    accumulated in ``pendingDoc`` (keys ``imdbtitle``, ``title``, ``year``);
    the pending document is queued for insertion once a logged row arrives
    whose ``imdbUtil.stripEpisode`` value differs from the pending one. Rows
    that ``imdbUtil.isEpisode`` flags, and logged rows containing ``"(TV)"``,
    add ``tv = 1`` to the pending document. Inserts are sent as unordered bulk
    operations, ``bulkSize`` at a time; write errors reported by a batch are
    counted as skipped insertions instead of aborting the parse.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to insert into.
    """
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 3700000  # Approximate number of total lines in the file.
    bulkSize = 5000  # How many documents should we store in memory before inserting them into the database in bulk?
    bulkCount = 0
    pendingDoc = {}  # Current document we are parsing data for. Once finished, it is queued on bulkPayload.
    bulkPayload = pymongo.bulk.BulkOperationBuilder(mongo.db[collectionName], ordered=False)
    count = 0
    movieCount = 0
    tvCount = 0
    skipCount = 0

    print("=== Starting Parse of movies.list ===")
    startTime = time.time()
    with open("imdbdata/movies.list", encoding="latin1") as tsv:
        for line in csv.reader(tsv, delimiter="\t"):
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. (" + str(int((count / progressTotal) * 100)) + "%%) (%0.2fs)" % (time.time() - startTime))

            # Parse the text data from the read TSV line. Columns are counted
            # over non-empty cells only.
            title = -1
            year = -1
            isEpisode = False
            fields = [value for value in line if value != ""]
            if fields:
                if imdbUtil.isEpisode(fields[0]):
                    isEpisode = True
                else:
                    title = fields[0]
                    if len(fields) > 1:
                        year = imdbUtil.parseYear(fields[1])

            # This line corresponds to a movie. Add it to the database.
            isLogged = title != -1 and year != -1
            if isLogged:
                if "imdbtitle" in pendingDoc and imdbUtil.stripEpisode(title) != imdbUtil.stripEpisode(pendingDoc["imdbtitle"]):
                    if "tv" in pendingDoc:
                        tvCount += 1
                    else:
                        movieCount += 1
                    pendingDoc["imdbtitle"] = imdbUtil.formatTitle(pendingDoc["imdbtitle"])
                    bulkPayload.insert(pendingDoc.copy())
                    bulkCount += 1
                    pendingDoc.clear()

                if bulkCount >= bulkSize:
                    try:
                        bulkPayload.execute()
                    except pymongo.errors.OperationFailure as e:
                        skipCount += len(e.details["writeErrors"])
                    bulkPayload = pymongo.bulk.BulkOperationBuilder(mongo.db[collectionName], ordered=False)
                    bulkCount = 0

                pendingDoc["imdbtitle"] = title
                pendingDoc["title"] = [imdbUtil.simpleTitle(title)]
                pendingDoc["year"] = year

            # A TV episode marks the previously logged title as a TV show, but only
            # if there is one: flagging an empty pendingDoc would leak "tv" into the
            # next movie and break the final flush. A "(TV)" title marks itself, so
            # it only counts when this very line was logged.
            if isEpisode:
                marksTv = "imdbtitle" in pendingDoc
            else:
                marksTv = isLogged and "(TV)" in title
            if marksTv:
                pendingDoc["tv"] = 1

    # Flush the last document, if a title was ever logged.
    if "imdbtitle" in pendingDoc:
        if "tv" in pendingDoc:
            tvCount += 1
        else:
            movieCount += 1
        pendingDoc["imdbtitle"] = imdbUtil.formatTitle(pendingDoc["imdbtitle"])
        bulkPayload.insert(pendingDoc.copy())
        bulkCount += 1

    if bulkCount > 0:
        try:
            bulkPayload.execute()
        except pymongo.errors.OperationFailure as e:
            skipCount += len(e.details["writeErrors"])

    print("[*] Parse Complete (%0.2fs)" % (time.time() - startTime))
    print("[*] Found", str(movieCount), "movies.")
    print("[*] Found", str(tvCount), "TV shows.")
    print("[*] Skipped", str(skipCount), "insertions.")
コード例 #6
0
def parse(mongo, collectionName):
    """Parse ``imdbdata/genres.list``; store genres and drop "Adult" titles.

    Rows are ignored until one whose first non-empty cell equals
    ``ignoreUntil``. From then on the first non-empty cell of a row is the
    movie title and the second a genre; rows whose title
    ``imdbUtil.isEpisode`` flags are skipped. Genres from consecutive rows with
    the same title are collected. When the title changes, the documents whose
    ``imdbtitle`` equals ``imdbUtil.formatTitle(title)`` are either removed (if
    ``"Adult"`` is among the genres) or get their ``genres`` field ``$set``.
    Operations are sent as unordered bulk batches of ``bulkSize``.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to update.
    """
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 2200000  # Approximate number of total lines in the file.
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    ignoreUntil = "8: THE GENRES LIST"  # Ignore parsing of lines until one matching this string is found
    ignoring = True
    count = 0
    updateCount = 0
    removeCount = 0
    bulkCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()

    print("=== Starting Parse of genres.list ===")
    startTime = time.time()
    with open("imdbdata/genres.list", encoding="latin1") as tsv:
        genres = []  # Genres collected so far for the movie named by `title`.
        title = -1  # -1 means "no movie seen yet".
        for line in csv.reader(tsv, delimiter="\t"):
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. (" + str(int((count / progressTotal) * 100)) + "%%) (%0.2fs)" % (time.time() - startTime))

            # Columns are counted over non-empty cells only.
            fields = [value for value in line if value != ""]
            if not fields:
                continue
            if ignoring:
                if fields[0] != ignoreUntil:
                    continue
                ignoring = False
            if imdbUtil.isEpisode(fields[0]):
                continue

            # We moved onto the next movie; queue the genre info of the previous movie first.
            if fields[0] != title and genres:
                selector = {"imdbtitle": imdbUtil.formatTitle(title)}
                if "Adult" in genres:
                    removeCount += 1
                    bulkPayload.find(selector).remove()
                else:
                    updateCount += 1
                    bulkPayload.find(selector).update({"$set": {"genres": genres}})
                bulkCount += 1
                genres = []
            title = fields[0]

            if len(fields) > 1:
                genres.append(fields[1])

            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0

    # Flush the genres of the movie read last.
    if genres:
        selector = {"imdbtitle": imdbUtil.formatTitle(title)}
        if "Adult" in genres:
            removeCount += 1
            # Removal is expressed as find(selector).remove(), exactly as in the loop.
            bulkPayload.find(selector).remove()
        else:
            updateCount += 1
            bulkPayload.find(selector).update({"$set": {"genres": genres}})
        bulkCount += 1

    if bulkCount > 0:
        bulkPayload.execute()

    print("[*] Parse Complete (%0.2fs)" % (time.time() - startTime))
    print("[*] Attemped updating", str(updateCount),
          "movies with genre information.")
    print("[*] Removed", str(removeCount),
          "pornographic films from the database.")
コード例 #7
0
ファイル: imdbKeywords.py プロジェクト: MrHohn/MovieRecommend
def parse(mongo, collectionName):
    """Parse ``imdbdata/keywords.list`` and store each movie's keywords.

    Rows are ignored until one whose first non-empty cell equals the section
    marker. From then on the first non-empty cell of a row is the movie title
    and the second a keyword; rows whose title ``imdbUtil.isEpisode`` flags
    are skipped. Keywords from consecutive rows with the same title are
    collected, and when the title changes they are queued as a ``$set`` of the
    ``keywords`` field on the documents whose ``imdbtitle`` equals
    ``imdbUtil.formatTitle(title)``. Updates are sent as unordered bulk
    batches.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to update.
    """
    reportEvery = 250000  # Print a progress report every this many lines.
    expectedLines = 6400000  # Approximate number of total lines in the file.
    batchLimit = 5000  # Number of queued queries that triggers a bulk send.
    sectionMarker = "8: THE KEYWORDS LIST"  # Nothing is parsed before this row.
    inPreamble = True
    linesRead = 0
    keywordTotal = 0
    queued = 0
    batch = mongo.db[collectionName].initialize_unordered_bulk_op()

    print("=== Starting Parse of keywords.list ===")
    began = time.time()
    with open("imdbdata/keywords.list", encoding="latin1") as tsv:
        keywords = []
        title = -1
        for row in csv.reader(tsv, delimiter="\t"):
            linesRead += 1
            if linesRead % reportEvery == 0:
                print(str(linesRead), "lines processed so far. ("+str(int((linesRead/expectedLines)*100))+"%%) (%0.2fs)" % (time.time()-began))

            # Columns are counted over non-empty cells only.
            cells = [cell for cell in row if cell != ""]
            if not cells:
                continue
            if inPreamble:
                if cells[0] != sectionMarker:
                    continue
                inPreamble = False
            if imdbUtil.isEpisode(cells[0]):
                continue

            previous, title = title, cells[0]
            # A new movie started; queue the keywords gathered for the previous one.
            if title != previous and len(keywords) > 0:
                selector = {"imdbtitle": imdbUtil.formatTitle(previous)}
                batch.find(selector).update({"$set": {"keywords": list(keywords)}})
                queued += 1
                keywords = []

            if len(cells) > 1:
                keywords.append(cells[1])
                keywordTotal += 1

            if queued >= batchLimit:
                batch.execute()
                batch = mongo.db[collectionName].initialize_unordered_bulk_op()
                queued = 0

    # Keywords still pending belong to the movie read last.
    if len(keywords) > 0:
        selector = {"imdbtitle": imdbUtil.formatTitle(title)}
        batch.find(selector).update({"$set": {"keywords": list(keywords)}})
        queued += 1

    if queued > 0:
        batch.execute()

    print("[*] Parse Complete (%0.2fs)" % (time.time()-began))
    print("[*] Added", str(keywordTotal), "keywords to movies in the database.")
コード例 #8
0
def parse(mongo, collectionName, field, listfile, progressTotal):
    """Parse a people list file and attach each person to their movies.

    Rows are ignored until a row whose first cell starts with ``"Name"`` has
    been seen, followed by a row whose first cell starts with ``"----"``.
    After that, the first non-empty cell found while no person is active is
    taken as the person (via ``imdbUtil.formatName``); every other non-empty
    cell is treated as a title, and unless ``imdbUtil.isEpisode`` flags it, an
    ``$addToSet`` of the person's name to ``field`` is queued for the
    documents whose ``imdbtitle`` equals ``imdbUtil.formatTitle(title)``. A
    row with no content ends the current person. Parsing stops at the first
    cell containing a run of 17 dashes. Updates are sent as unordered bulk
    operations, ``bulkSize`` at a time.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to update.
        field: Document field that receives the person's name.
        listfile: File name inside ``imdbdata/`` to parse.
        progressTotal: Approximate number of lines in the file; only used for
            the progress percentage.
    """
    progressInterval = 250000  # How often should we print a progress report to the console?
    if progressTotal > 10000000:
        progressInterval = 500000
    bulkSize = 5000  # How many queries should we store in memory before sending them to the database in bulk?
    count = 0
    updateCount = 0
    bulkCount = 0
    bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()

    # Conditions that must be found in the file before names will start being parsed
    startFlag = False  # The "Name" header row was seen.
    startFlag2 = False  # The "----" underline row after the header was seen.
    endFlag = False  # The dashed footer row was seen.

    print("=== Starting Parse of " + listfile + " ===")
    startTime = time.time()
    with open("imdbdata/" + listfile, encoding="latin1") as tsv:
        name = -1  # -1 means "no person active".
        for line in csv.reader(tsv, delimiter="\t"):
            count += 1
            if count % progressInterval == 0:
                # Guard against a zero total so the report can never divide by zero.
                percent = int((count / progressTotal) * 100) if progressTotal else 0
                print(str(count), "lines processed so far. (" + str(percent) + "%%) (%0.2fs)" % (time.time() - startTime))

            # Header phase: only the first cell of each row is inspected.
            if not (startFlag and startFlag2):
                first = line[0].strip()[:4] if line else ""
                if not startFlag:
                    if first == "Name":
                        startFlag = True
                elif first == "----":
                    startFlag2 = True
                continue

            # Data phase. The header markers are deliberately not re-checked here,
            # so a person or title that merely starts with "Name" or "----" is
            # parsed as content instead of being dropped.
            foundOnLine = False  # Did we find any content on this line? Lines with no content reset and set-up for the next name.
            for value in line:
                if "-----------------" in value:
                    endFlag = True
                    break
                if value == "":
                    continue

                foundOnLine = True
                if name == -1:
                    name = imdbUtil.formatName(value)
                    updateCount += 1

                # Skipping logging people for TV episodes, because there's no easy way to tell if a person is a "main character", and I don't want to be logging
                # every single minor/cameo person who appeared in some single episode of some series. Plus, we're more concerned about movies, not TV shows.
                elif not imdbUtil.isEpisode(value):
                    bulkPayload.find({
                        "imdbtitle": imdbUtil.formatTitle(value)
                    }).update({"$addToSet": {
                        field: name
                    }})
                    bulkCount += 1

            if endFlag:
                # Nothing after the footer is list data; stop reading.
                break

            if not foundOnLine:
                name = -1

            if bulkCount >= bulkSize:
                bulkPayload.execute()
                bulkPayload = mongo.db[collectionName].initialize_unordered_bulk_op()
                bulkCount = 0

    if bulkCount > 0:
        bulkPayload.execute()

    print("[*] Parse Complete (%0.2fs)" % (time.time() - startTime))
    print("[*] Attemped updating movies with information about",
          str(updateCount), "people.")
コード例 #9
0
ファイル: imdbMovies.py プロジェクト: MrHohn/MovieRecommend
def parse(mongo, collectionName):
    """Parse ``imdbdata/movies.list`` and insert one document per title.

    Each tab-separated row is reduced to its non-empty cells: the first is the
    title and the second is turned into a year by ``imdbUtil.parseYear``. A row
    is logged when it has a title and a year other than -1. Logged rows are
    accumulated in ``pendingDoc`` (keys ``imdbtitle``, ``title``, ``year``);
    the pending document is queued for insertion once a logged row arrives
    whose ``imdbUtil.stripEpisode`` value differs from the pending one. Rows
    that ``imdbUtil.isEpisode`` flags, and logged rows containing ``"(TV)"``,
    add ``tv = 1`` to the pending document. Inserts are sent as unordered bulk
    operations, ``bulkSize`` at a time; write errors reported by a batch are
    counted as skipped insertions instead of aborting the parse.

    Args:
        mongo: Object whose ``db`` attribute is indexable by collection name.
        collectionName: Name of the collection to insert into.
    """
    progressInterval = 100000  # How often should we print a progress report to the console?
    progressTotal = 3700000  # Approximate number of total lines in the file.
    bulkSize = 5000  # How many documents should we store in memory before inserting them into the database in bulk?
    bulkCount = 0
    pendingDoc = {}  # Current document we are parsing data for. Once finished, it is queued on bulkPayload.
    bulkPayload = pymongo.bulk.BulkOperationBuilder(mongo.db[collectionName], ordered=False)
    count = 0
    movieCount = 0
    tvCount = 0
    skipCount = 0

    print("=== Starting Parse of movies.list ===")
    startTime = time.time()
    with open("imdbdata/movies.list", encoding="latin1") as tsv:
        for line in csv.reader(tsv, delimiter="\t"):
            count += 1
            if count % progressInterval == 0:
                print(str(count), "lines processed so far. ("+str(int((count/progressTotal)*100))+"%%) (%0.2fs)" % (time.time()-startTime))

            # Parse the text data from the read TSV line. Columns are counted
            # over non-empty cells only.
            title = -1
            year = -1
            isEpisode = False
            fields = [value for value in line if value != ""]
            if fields:
                if imdbUtil.isEpisode(fields[0]):
                    isEpisode = True
                else:
                    title = fields[0]
                    if len(fields) > 1:
                        year = imdbUtil.parseYear(fields[1])

            # This line corresponds to a movie. Add it to the database.
            isLogged = title != -1 and year != -1
            if isLogged:
                if "imdbtitle" in pendingDoc and imdbUtil.stripEpisode(title) != imdbUtil.stripEpisode(pendingDoc["imdbtitle"]):
                    if "tv" in pendingDoc:
                        tvCount += 1
                    else:
                        movieCount += 1
                    pendingDoc["imdbtitle"] = imdbUtil.formatTitle(pendingDoc["imdbtitle"])
                    bulkPayload.insert(pendingDoc.copy())
                    bulkCount += 1
                    pendingDoc.clear()

                if bulkCount >= bulkSize:
                    try:
                        bulkPayload.execute()
                    except pymongo.errors.OperationFailure as e:
                        skipCount += len(e.details["writeErrors"])
                    bulkPayload = pymongo.bulk.BulkOperationBuilder(mongo.db[collectionName], ordered=False)
                    bulkCount = 0

                pendingDoc["imdbtitle"] = title
                pendingDoc["title"] = [imdbUtil.simpleTitle(title)]
                pendingDoc["year"] = year

            # A TV episode marks the previously logged title as a TV show, but only
            # if there is one: flagging an empty pendingDoc would leak "tv" into the
            # next movie and break the final flush. A "(TV)" title marks itself, so
            # it only counts when this very line was logged.
            if isEpisode:
                marksTv = "imdbtitle" in pendingDoc
            else:
                marksTv = isLogged and "(TV)" in title
            if marksTv:
                pendingDoc["tv"] = 1

    # Flush the last document, if a title was ever logged.
    if "imdbtitle" in pendingDoc:
        if "tv" in pendingDoc:
            tvCount += 1
        else:
            movieCount += 1
        pendingDoc["imdbtitle"] = imdbUtil.formatTitle(pendingDoc["imdbtitle"])
        bulkPayload.insert(pendingDoc.copy())
        bulkCount += 1

    if bulkCount > 0:
        try:
            bulkPayload.execute()
        except pymongo.errors.OperationFailure as e:
            skipCount += len(e.details["writeErrors"])

    print("[*] Parse Complete (%0.2fs)" % (time.time()-startTime))
    print("[*] Found", str(movieCount), "movies.")
    print("[*] Found", str(tvCount), "TV shows.")
    print("[*] Skipped", str(skipCount), "insertions.")