-
Notifications
You must be signed in to change notification settings - Fork 3
/
movieLensToIMDB.py
106 lines (93 loc) · 3.67 KB
/
movieLensToIMDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from DataService import Mongo
import pymongo
import time
import omdb
# Retrieves movie information from IMDb for each movie in the MovieLens database.
# db name: movieRecommend
# collection name: movie
# Adds fields to collection:
# - year
# - country
# - language
# - poster
# - plot
# - type
# - runtime
# - metascore
# - rated
# - imdb_rating
# - imdb_votes
# - genres
# - director
# - actors
# - writer
# - title_imdb
def _flush_updates(collection, requests):
    """Send the queued update requests to *collection* as one unordered bulk write.

    Returns the number of individual writes reported as failed by the
    server (0 when the batch succeeded or when *requests* is empty).
    """
    if not requests:
        # Nothing queued; an empty bulk write is not a valid operation.
        return 0
    try:
        collection.bulk_write(requests, ordered=False)
    except pymongo.errors.BulkWriteError as e:
        # Unordered writes keep going after a failure; the error details
        # list every write that did not succeed.
        return len(e.details["writeErrors"])
    return 0


def retrieve(mongo):
    """Enrich every document of the ``movie`` collection with IMDb metadata.

    For each movie, the IMDb id is rebuilt from the stored numeric
    ``imdbid`` ("tt" followed by the number left-padded with zeros to seven
    digits), the movie is looked up through ``omdb.imdbid`` and the returned
    fields are written back with ``$set`` on the documents matching ``mid``.
    Updates are queued and sent to the database in unordered bulk batches.

    Progress and a final summary are printed to the console. The "Skipped"
    figure counts both failed database writes and movies for which the
    lookup returned nothing.

    Args:
        mongo: Data-service wrapper exposing the database as ``mongo.db``.
    """
    progressInterval = 100  # How often should we print a progress report to the console?
    progressTotal = 34208   # Approximate number of movies, used for the percentage only.
    bulkSize = 100          # How many updates to queue in memory before sending them in bulk.

    collection = mongo.db["movie"]
    # Update requests waiting to be sent to the database in bulk.
    pendingUpdates = []
    count = 0
    skipCount = 0

    print("[movieLensToIMDB] Starting retrieve of movie info from IMDB...")
    startTime = time.time()

    cursor = collection.find({}, no_cursor_timeout=True)
    try:
        for cur_movie in cursor:
            count += 1
            if count % progressInterval == 0:
                print("[movieLensToIMDB] %5d lines processed so far. (%d%%) (%0.2fs)" % (count, int(count * 100 / progressTotal), time.time() - startTime))

            # Construct the real imdbid: "tt" + zero-padded 7-digit number.
            cur_imdbid = "tt" + str(cur_movie["imdbid"]).zfill(7)

            # Retrieve movie info from IMDB.
            imdb_movie = omdb.imdbid(cur_imdbid)
            if not imdb_movie:
                # NOTE(review): assumes the omdb client returns an empty
                # (falsy) result for an unknown id -- confirm. Skipping keeps
                # one missing movie from aborting the whole run.
                skipCount += 1
                continue

            cur_genres = [genre.strip() for genre in imdb_movie["genre"].split(",")]
            cur_actors = [actor.strip() for actor in imdb_movie["actors"].split(",")]

            pendingUpdates.append(pymongo.UpdateMany({"mid": cur_movie["mid"]}, {"$set": {
                "year": imdb_movie["year"],
                "country": imdb_movie["country"],
                "language": imdb_movie["language"],
                "poster": imdb_movie["poster"],
                "type": imdb_movie["type"],
                "runtime": imdb_movie["runtime"],
                "plot": imdb_movie["plot"],
                "metascore": imdb_movie["metascore"],
                "rated": imdb_movie["rated"],
                "imdb_rating": imdb_movie["imdb_rating"],
                "imdb_votes": imdb_movie["imdb_votes"],
                "genres": cur_genres,
                "director": imdb_movie["director"],
                "actors": cur_actors,
                "writer": imdb_movie["writer"],
                "title_imdb": imdb_movie["title"]
            }}))

            # Flush on the number of queued updates rather than on the row
            # counter, so skipped rows can never strand or empty a batch.
            if len(pendingUpdates) >= bulkSize:
                skipCount += _flush_updates(collection, pendingUpdates)
                pendingUpdates = []
    finally:
        # A cursor opened with no_cursor_timeout is never reaped by the
        # server, so it has to be closed explicitly, even on error.
        cursor.close()

    # Send whatever is left over from the last, partially filled batch.
    skipCount += _flush_updates(collection, pendingUpdates)

    print("[movieLensToIMDB] Parse Complete (%0.2fs)" % (time.time() - startTime))
    print("[movieLensToIMDB] Found " + str(count) + " movies.")
    print("[movieLensToIMDB] Skipped " + str(skipCount) + " insertions.")
def main():
    """Build the ``movieRecommend`` data-service handle and run the IMDb retrieval on it."""
    retrieve(Mongo("movieRecommend"))


# Run the retrieval only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()