forked from rawler/bhindex
/
scraper.py
executable file
·185 lines (161 loc) · 6.07 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/python
import os.path
import sys
from time import time

# imdbpy is optional: when it cannot be imported, `ia` is left as None and
# the IMDb-backed scrapers below print a warning instead of running.
try:
    from imdb import IMDb
    ia = IMDb()
except ImportError:
    print("WARNING: failed to load imdb-scraper due to missing library imdbpy.")
    ia = None

# Make the "tvdb_api" directory that sits next to this file importable
# before importing the module of the same name.
HERE = os.path.dirname(__file__)
sys.path.append(os.path.join(HERE, "tvdb_api"))
import tvdb_api
def imdb_scraper(obj, id):
    """Populate *obj* with metadata for the IMDb title *id*.

    The title is fetched through the module-level ``ia`` handle. Rating,
    title, cover image, year, genres, plot, countries, directors and cast
    are copied onto *obj* with ``obj.update_key``; all updates made by one
    call share the same timestamp.

    Returns True when the title was found and mapped, False otherwise
    (including when imdbpy is not installed).
    """
    if not ia:
        print("WARNING: failed to scrape from imdb due to missing library imdbpy.")
        return False

    movie = ia.get_movie(id)
    if not movie:
        print("Movie not found in IMDB")
        return False

    def plot_map(plots):
        # 'plot' is a list of "text::author" entries, but the fallback key
        # 'plot outline' is a single string. Indexing that string with [0]
        # would keep only its first character, so only lists are unpacked.
        if isinstance(plots, (list, tuple)):
            if not plots:
                return None
            return plots[0].rsplit("::", 1)[0]
        return plots or None

    t = time()

    def map_item(localName, names, convert=unicode):
        # `names` is one remote key or a tuple of fallbacks tried in order;
        # the first key holding a truthy value wins.
        if not isinstance(names, tuple):
            names = (names,)
        for name in names:
            val = movie.get(name)
            if val:
                obj.update_key(localName, convert(val), t)
                return True
        print(u"No match for %s" % u", ".join(names))

    obj.update_key(u'imdb', unicode(id), t)
    map_item(u'rating', 'rating')
    map_item(u'title', 'title')
    map_item(u'image', 'cover url')
    map_item(u'year', 'year')
    map_item(u'genre', 'genres', set)
    map_item(u'plot', ('plot', 'plot outline'), plot_map)
    map_item(u'country', 'countries', set)

    directors = movie.get('director') or movie.get('directors')
    if directors:
        obj.update_key(u'director', (p['name'] for p in directors), t)

    cast = movie.get('cast')
    if cast:
        obj.update_key(u'actor', (p['name'] for p in cast), t)

    return True
def imdb_search(obj):
    """Find *obj* on IMDb by title and year, then scrape the first match.

    Every title in ``obj['title']`` is searched through the module-level
    ``ia`` handle. The first hit whose year is one of ``obj['year']`` is
    handed to ``imdb_scraper`` and that call's result is returned.

    Returns False when imdbpy is not installed or no hit matches.
    """
    if not ia:
        print("WARNING: failed to scrape from imdb due to missing library imdbpy.")
        return False

    for title in obj['title']:
        for movie in ia.search_movie(title):
            # A search hit may lack a year; indexing with movie['year']
            # would raise KeyError and abort the whole search, so such
            # hits are skipped instead.
            year = movie.get('year')
            if year is None:
                continue
            year = unicode(year)
            if year in obj['year']:
                print("IMDB Scraper found match for %s (%s)" % (title, year))
                return imdb_scraper(obj, movie.movieID)
    return False
def tvdb_search(obj):
    """Populate *obj* with episode and series metadata from TheTVDB.

    Series are resolved first through any ids stored in
    ``obj['series_tvdbid']``; only if none of those yields an episode are
    the names in ``obj['series']`` searched. For each resolved series every
    combination of ``obj['season']`` and ``obj['episode']`` is looked up,
    and each episode found is copied onto *obj* with ``obj.update_key``.
    Shows, seasons and episodes that tvdb_api cannot find are skipped.

    Returns True if at least one episode was mapped, False otherwise.
    """
    lang = obj.get('language')
    tvdb = tvdb_api.Tvdb(language=lang)

    def split_values(text, delim='|'):
        # Split a delimiter-separated field, dropping empty pieces.
        return (part for part in text.split(delim) if part)

    def split_genres(text):
        # Split a genre field on the delimiter and on ' and ' into a set.
        return set(
            genre.strip()
            for entry in split_values(text)
            for genre in entry.split(' and ')
        )

    def series_key(seriesid):
        # Stored ids are text, but tvdb_api only does an id lookup for
        # integer keys (a string key is treated as a series name search).
        # Values that are not numeric are passed through unchanged.
        try:
            return int(seriesid)
        except (TypeError, ValueError):
            return seriesid

    def map_episode(series, season, episode):
        # Copy one episode's fields, then its series' fields, onto obj.
        t = time()

        def map_item(localName, remote_dict, name, convert=unicode):
            try:
                value = remote_dict[name]
            except tvdb_api.tvdb_attributenotfound:
                return
            if value:
                obj.update_key(localName, convert(value), t)

        obj.update_key(u'episode_tvdbid', unicode(episode['id']))
        map_item(u'episode_rating', episode, 'rating')
        map_item(u'episode_name', episode, 'episodename')
        map_item(u'plot', episode, 'overview')
        map_item(u'actor', episode, 'gueststars', split_values)
        map_item(u'director', episode, 'director', split_values)
        map_item(u'writer', episode, 'writer', split_values)

        obj.update_key(u'series_tvdbid', unicode(series['id']))
        map_item(u'actor', series, 'actors', split_values)
        map_item(u'genre', series, 'genre', split_genres)
        map_item(u'image', series, 'poster', split_values)
        map_item(u'rating', series, 'rating', split_values)
        return True

    def iter_episodes(series, season):
        found = False
        for episode in obj['episode']:
            try:
                found = map_episode(series, season, season[int(episode)]) or found
            except tvdb_api.tvdb_episodenotfound:
                pass
        return found

    def iter_seasons(series):
        found = False
        for season in obj['season']:
            try:
                found = iter_episodes(series, series[int(season)]) or found
            except tvdb_api.tvdb_seasonnotfound:
                pass
        return found

    def iter_series():
        found = False
        if 'series_tvdbid' in obj:
            for seriesid in obj['series_tvdbid']:
                try:
                    found = iter_seasons(tvdb[series_key(seriesid)]) or found
                except tvdb_api.tvdb_shownotfound:
                    pass
        if found:
            return found

        # No stored id produced a result: fall back to searching by name.
        for series in obj['series']:
            try:
                year = obj.get('year')
                if year:
                    # NOTE(review): elsewhere obj['year'] is used as a
                    # collection of values; confirm that formatting it with
                    # %s yields a single plain year here.
                    series = "%s (%s)" % (series, year)
                found = iter_seasons(tvdb[series]) or found
            except tvdb_api.tvdb_shownotfound:
                pass
        return found

    return iter_series()
def scrape_for(obj):
    """Run the scraper that fits the attributes *obj* already carries.

    Strategies, in priority order:
      1. an ``imdb`` id          -> ``imdb_scraper``
      2. ``title`` and ``year``  -> ``imdb_search``
      3. ``series``, ``season`` and ``episode`` -> ``tvdb_search``

    Only the first applicable strategy is tried. Returns that scraper's
    result, or False when *obj* has too little information for any of them
    (previously this case fell off the end and returned None).
    """
    if obj.get('imdb'):
        return imdb_scraper(obj, obj['imdb'].any())
    if obj.get('title') and obj.get('year'):
        return imdb_search(obj)
    if obj.get('series') and obj.get('season') and obj.get('episode'):
        return tvdb_search(obj)
    return False
if __name__ == '__main__':
    # Command-line entry point: for every asset id given, apply the
    # attributes from -s/-a, run the scraper, and store the object if the
    # scrape succeeded.
    import db
    import config
    import cliopt

    # Use distinct names so the `config` and `db` modules are not rebound.
    settings = config.read()
    database = db.open(settings)

    usage = "usage: %prog [options] [assetid] ..."
    parser = cliopt.OptionParser(usage=usage)
    # The example flags in the help texts match the options they describe.
    parser.add_option("-a", "--add", action="append", dest="adds",
                      help="Add a value for an attr for objects, such as '-aname:monkey'. Previous tags for the attribute will be kept.")
    parser.add_option("-s", "--set", action="append", dest="attrs",
                      help="Overwrite an attr tag for objects, such as '-sname:monkey'. Previous tags for the attribute will be removed.")
    (options, args) = parser.parse_args()

    attrs = cliopt.parse_attrs(options.attrs)
    adds = cliopt.parse_attrs(options.adds)

    for arg in args:
        obj = database[arg]
        if not obj:
            continue

        # -s replaces the attribute's values, -a adds to them.
        for k, v in attrs.iteritems():
            obj[k] = v
        for k, v in adds.iteritems():
            obj.update_key(k, v)

        # The object is only written back when scraping succeeded.
        if scrape_for(obj):
            database.update(obj)
    database.commit()