def movie_from_div(div):
    """
    Build a Movie object from the contents of a single div.

    The div is wrapped with ``pq`` and each Movie attribute is filled from
    the text (or, for the cover, the ``src`` attribute) of the element
    matched by a CSS selector.
    """
    query = pq(div)
    movie = Movie()

    # Attributes that are simply the text of the element matched by a selector.
    text_selectors = (
        ('name', '.title'),
        ('other', '.other'),
        ('score', '.rating_num'),
        ('quote', '.inq'),
    )
    for attribute, selector in text_selectors:
        setattr(movie, attribute, query(selector).text())

    # The cover is read from the image's src attribute rather than its text.
    movie.cover_url = query('img').attr('src')

    # The ranking is the text of the <em> nested inside the '.pic' element.
    picture = query('.pic')
    movie.ranking = picture.find('em').text()

    return movie
# Record keys whose value is a list of plain strings, paired with the Movie
# attribute that receives the '$'-joined text.
_INFO_TEXT_LIST_FIELDS = (
    ('genres', 'genres'),
    ('color info', 'color_info'),
    ('countries', 'countries'),
    ('languages', 'languages'),
    ('sound mix', 'sound_mix'),
)

# Record keys whose value is a list of items indexed with ['name'], paired
# with the Movie attribute that receives the '$'-joined names.
_INFO_NAMED_LIST_FIELDS = (
    ('director', 'director'),
    ('writer', 'writer'),
    ('editor', 'editor'),
    ('cinematographer', 'cinematographer'),
    ('art direction', 'art_director'),
    ('costume designer', 'costume_designer'),
    ('original music', 'original_music'),
    ('production companies', 'production_companies'),
)

# Movie attributes filled from the first, second and third cast entries.
_INFO_CAST_ATTRS = ('cast_1st', 'cast_2nd', 'cast_3rd')


def _info_parse_runtime(runtimes):
    """Return the first entry of ``runtimes`` as an int, or 0 if unusable.

    An entry whose string form contains ':' is split on ':' and the second
    piece is converted; any other entry is passed to ``int`` directly.
    """
    if runtimes is None:
        return 0
    try:
        first = runtimes[0]
        text = str(first)
        if ':' in text:
            return int(text.split(':')[1])
        return int(first)
    except (IndexError, KeyError, TypeError, ValueError):
        # Best effort: an empty list or a non-numeric value counts as 0.
        # (UnicodeEncodeError from str() is a ValueError subclass.)
        return 0


def _info_build_movie(mvID):
    """Fetch the record for ``mvID`` via ``imdb_access`` and copy it into a new Movie."""
    mvIN = imdb_access.get_movie(mvID)
    mvOJ = Movie()

    # Scalar fields.
    mvOJ.id = mvID
    mvOJ.title = mvIN.get('title')
    mvOJ.cover_url = mvIN.get('cover url')
    mvOJ.giant_cover_url = mvIN.get('full-size cover url')

    # List-valued fields are flattened to one '$'-separated string. An
    # attribute is only set when the record actually carries the key.
    for key, attr in _INFO_TEXT_LIST_FIELDS:
        values = mvIN.get(key)
        if values is not None:
            setattr(mvOJ, attr, '$'.join(values))
    for key, attr in _INFO_NAMED_LIST_FIELDS:
        entries = mvIN.get(key)
        if entries is not None:
            setattr(mvOJ, attr, '$'.join(entry['name'] for entry in entries))

    # Up to three leading cast members. A record with no cast simply leaves
    # these attributes unset instead of failing the whole movie.
    cast = mvIN.get('cast') or []
    for attr, person in zip(_INFO_CAST_ATTRS, cast):
        setattr(mvOJ, attr, person['name'])

    # Numeric fields default to 0 when absent.
    year = mvIN.get('year')
    mvOJ.year = 0 if year is None else year
    mvOJ.runtimes = _info_parse_runtime(mvIN.get('runtimes'))

    # Vote counts are only fetched when the module-level mode is "old".
    if mode == "old":
        mvOJ.number_of_votes = get_rating(mvID)

    return mvOJ


def get_info():
    """Consume movie IDs from ``mvIDQ`` and publish their features on ``mvINQ``.

    Sleeps 5 seconds, then loops while ``mvIDQ`` is non-empty or the global
    ``stage`` equals 0. For every ID taken from ``mvIDQ`` a Movie object is
    built and put on ``mvINQ``. If building fails, a message is printed,
    the bare ID is put on ``mvINQ`` instead, and the loop pauses 1 second.

    ``mvIDQ.task_done()`` is called exactly once per retrieved ID, on both
    the success and the failure path, so ``mvIDQ.join()`` can complete even
    when some fetches fail.
    """
    global stage
    time.sleep(5)
    print("-CRAWLER- Start to get movie feature...")
    while (not mvIDQ.empty()) or stage == 0:
        # NOTE(review): get() blocks while the queue is empty; confirm the
        # producer always changes stage / feeds the queue so this returns.
        mvID = mvIDQ.get()
        try:
            mvINQ.put(_info_build_movie(mvID))
        except Exception as e:
            # NOTE(review): the failed ID goes onto the *info* queue, as in
            # the original code; presumably the consumer distinguishes bare
            # IDs from Movie objects -- confirm against the consumer.
            print('%s %s' % (
                '-CRAWLER- An {} exception occured!'.format(e), mvID))
            mvINQ.put(mvID)
            time.sleep(1)
        finally:
            # Pair every get() with task_done(), even when the fetch failed.
            mvIDQ.task_done()