Beispiel #1
0
def process():
    """Download poster images for the records queued in the ``imageTask`` set.

    Each record popped from redis is a comma-separated line. After quotes and
    newlines are stripped, its fourth field is the poster path relative to the
    EPG poster root; the image is streamed to the same relative path under
    ``E:/posters_5/``.

    A record without a fourth field, a failed download, or a 404 response is
    added to the ``imageTaskFailed`` set. Failed downloads and 404s are also
    appended to ``E:/404.txt``.

    Returns:
        True once ``imageTask`` is empty.
    """
    while True:
        p = rd.spop("imageTask")
        if p is None:
            # spop yields None on an empty set; without this guard the
            # p.replace() below raised AttributeError.
            return True
        data = p.replace('"', "").replace('\n', "").split(',')
        if len(data) < 4:
            # Malformed record: there is no poster path to fetch.
            rd.sadd("imageTaskFailed", p)
            print("failed", p)
            continue
        poster = data[3]
        url = 'http://183.59.160.50:8082/EPG/jsp/images/universal/film/poster/' + poster
        path = u"E:/posters_5/" + "/".join(poster.split('/')[:-1]) + "/"
        try:
            os.makedirs(path)
        except OSError:
            # Usually "directory already exists". If the directory really
            # cannot be created, the open() below reports the actual error.
            pass
        local_filename = path + url.split('/')[-1]
        r = requests_get(url)
        # Test for the failure sentinel BEFORE touching r.status_code.
        # NOTE(review): assumes requests_get signals failure with False or
        # None, as the original `r == False` comparison suggests — confirm.
        failed = r is False or r is None
        if not failed:
            print("r.status_code:", r.status_code)
            failed = r.status_code == 404
        if failed:
            with open("E:/404.txt", "a") as myfile:
                myfile.write(p)
            rd.sadd("imageTaskFailed", p)
            print("failed", p)
            continue
        # The context manager closes the file even if streaming fails midway.
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512 * 1024):
                if chunk:
                    f.write(chunk)
        print("done", local_filename)
Beispiel #2
0
def go_detail_list_task():
    """Scrape every detail-list page queued in ``config.yk_video_detail_task``.

    Each task is a JSON object carrying a ``url``. URLs already present in
    ``config.yk_video_detail_done`` are skipped. The page is fetched and
    parsed; on a parse result without data the raw task goes to
    ``config.yk_video_detail_failed``. Otherwise the stars found are queued in
    ``config.yk_star_task``, the URL is marked done and the data is inserted
    into ``youku_videos``. Every ``max_step`` stored pages the session is
    refreshed.

    Returns:
        True once the task set is empty.
    """
    stored = 0
    while True:
        raw_task = rd.spop(config.yk_video_detail_task)
        if raw_task is None:
            print(u"yk_video_detail_task sleeping 20 sec....")
            return True

        task = json.loads(raw_task)
        already_done = rd.sismember(config.yk_video_detail_done, task['url'])
        if already_done == True:
            print("pass", task['url'])
            continue

        response = requests_get(task['url'], headers=youku_home_headers)
        parsed = parse_detail_list_page(response, task['url'])
        data = parsed['data']
        if data is False or data == None:
            rd.sadd(config.yk_video_detail_failed, raw_task)
            continue

        # Star collection queue; the redis set de-duplicates entries.
        for star in parsed['stars']:
            rd.sadd(config.yk_star_task, star)
        print('detail_url done:', task['url'], data)

        # Mark the URL as finished, then persist the scraped record.
        rd.sadd(config.yk_video_detail_done, task['url'])
        youku_videos.insert(data, check_keys=False)

        # Refresh the session every max_step stored pages.
        stored += 1
        if stored % max_step == 0:
            update_session()
Beispiel #3
0
def task_types_fetch():
    """Expand every type URL in ``config.yk_types_task`` into page tasks.

    Type URLs already present in the failed or done sets are skipped. For the
    others the listing is fetched and parsed for its page count, and one
    paginated URL per page number is queued in ``config.yk_page_task`` unless
    it is already in the page failed/done sets. The type URL is then marked
    done. Every ``max_step`` processed types the session is refreshed.

    Returns:
        True once the task set is empty.
    """
    handled = 0
    while True:
        type_url = rd.spop(config.yk_types_task)
        if type_url is None:
            print(u"yk_types_task sleeping 20sec....")
            return True

        # Skip URLs that were already tried, successfully or not.
        if rd.sismember(config.yk_types_failed, type_url) == True:
            continue
        if rd.sismember(config.yk_types_done, type_url) == True:
            continue

        response = requests_get(url=type_url,
                                headers=youku_home_headers,
                                session=session)
        if response is False or response == None:
            print(u'filed task:%s' % type_url)
            rd.sadd(config.yk_types_failed, type_url)
            continue

        pages = parse_category_show(response, type_url)
        print("task_types_fetch data:", pages)

        # NOTE(review): the upper bound is exclusive, so the page numbered
        # pages['pages'] itself is never queued — confirm this is intended.
        page_limit = int(pages['pages'])
        for page in xrange(1, page_limit):
            suffix = '_s_1_d_1_p_{page}.html'.format(page=page)
            page_url = re.sub('(\.html.*)', suffix, type_url)
            print("task_types_fetch for :", page_url)
            never_failed = rd.sismember(config.yk_page_failed,
                                        page_url) == False
            if never_failed and rd.sismember(config.yk_page_done,
                                             page_url) == False:
                rd.sadd(config.yk_page_task, page_url)

        rd.sadd(config.yk_types_done, type_url)

        # Refresh the session every max_step processed types.
        handled += 1
        if handled % max_step == 0:
            update_session()
Beispiel #4
0
def process():
    """Download one star avatar from the ``stars`` redis set and record its path.

    Tasks are JSON objects. A task is only handled when exactly one of
    ``avatar`` / ``img_url`` is set; that value is fetched with
    ``requests_get``. A falsy fetch result sends the raw task to
    ``avatar_failed``. Otherwise the image is saved as
    ``E:/avatar/<_id>/<_id>.jpg`` and the relative path is written to the
    matching document in ``mongo_conn.stars``.

    Returns:
        True when the set is empty; None after the first stored avatar.
    """
    avatar_root = u"E:/avatar/"
    while True:
        raw = rd.spop("stars")
        if not raw:
            return True

        task = json.loads(raw)
        avatar = task.get("avatar")
        img_url = task.get("img_url")
        if avatar and not img_url:
            source = task['avatar']
        elif img_url and not avatar:
            source = task['img_url']
        else:
            # Neither or both sources present: skip the task.
            print("----", raw)
            continue

        im = requests_get(source)
        if not im:
            rd.sadd("avatar_failed", raw)
            print("failed", raw)
            continue

        file_name = "/".join([task.get("_id"), "%s.jpg" % (task.get("_id"))])
        try:
            # Create the parent directory; any error here is ignored.
            parent_dir = re.search('(.*/)', avatar_root + file_name).group(1)
            os.makedirs(parent_dir)
        except Exception:
            pass

        im.convert('RGB').save(avatar_root + file_name)
        result = mongo_conn.stars.update_one(
            {"_id": ObjectId(task['_id'])},
            {"$set": {"file_path": file_name}})
        print("done-----%s-----%s" % (result.modified_count,
                                      avatar_root + file_name))
        # NOTE(review): this return ends the worker after the first stored
        # avatar; it looks like a debugging leftover — confirm.
        return
Beispiel #5
0
def task_merge_doubanvideo():
    """Drain the ``task_merge_doubanvideo`` redis set.

    Every popped value is wrapped in an ``ObjectId`` and passed as an ``_id``
    query to ``Merge.merge_doubanvideo``. The loop ends on the first falsy
    pop result.
    """
    merger = Merge()
    pending = rd.spop("task_merge_doubanvideo")
    while pending:
        merger.merge_doubanvideo(query={"_id": ObjectId(pending)})
        pending = rd.spop("task_merge_doubanvideo")
Beispiel #6
0
def task_merge_youku_videos():
    """Drain the ``task_merge_youku_videos`` redis set.

    Every popped value is wrapped in an ``ObjectId`` and passed as an ``_id``
    query to ``Merge.merge_youku_videos``. The loop ends on the first falsy
    pop result.
    """
    merger = Merge()
    pending = rd.spop("task_merge_youku_videos")
    while pending:
        merger.merge_youku_videos(query={"_id": ObjectId(pending)})
        pending = rd.spop("task_merge_youku_videos")
Beispiel #7
0
def task_merge_letvstar():
    """Drain the ``task_merge_letvstar`` redis set.

    Every popped value is wrapped in an ``ObjectId`` and passed as an ``_id``
    query to ``Merge.merge_letvstar``. The loop ends on the first falsy pop
    result.
    """
    merger = Merge()
    pending = rd.spop("task_merge_letvstar")
    while pending:
        merger.merge_letvstar(query={"_id": ObjectId(pending)})
        pending = rd.spop("task_merge_letvstar")
Beispiel #8
0
def task_video():
    """Scrape the Douban page of every id queued in ``config.douban_tv_task``.

    For each id the page is fetched and parsed with ``parse_video``. The
    result is stored in ``mongo_douban_tvs``, a follow-up photos task is
    queued, and the id is added to ``config.douban_tv_done``. Ids whose fetch
    fails or whose parse yields no title go to ``config.douban_tv_failed``.
    When the block page is detected the id is put back in the task set and the
    worker waits ``block_wait``. Every ``max_step`` stored ids the session's
    ``bid`` cookie is replaced with a new random value.

    Returns:
        True once ``config.douban_tv_task`` is empty.
    """
    i = 0
    while True:
        video_id = rd.spop(config.douban_tv_task)
        if video_id is None:
            print(u"task_page sleeping....20sec")
            return True
        # This function records finished ids in config.douban_tv_done (see the
        # end of the loop), so that is the set to de-duplicate against. The
        # previous check looked in config.doubantv_ajax_task_done instead.
        if rd.sismember(config.douban_tv_done, video_id):
            print(u"already done%s" % video_id)
            continue
        url = tv_url.format(id=video_id)
        r = requests_get(url=url, headers=douban_home_headers)
        if r is False or r is None:
            rd.sadd(config.douban_tv_failed, video_id)
            continue
        try:
            check_block(r)
        except Exception as e:
            # Best effort: a failing block check must not stop the worker.
            print("check_block:", str(e))
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            # The id has already been popped; put it back so it is retried
            # after the cool-down instead of being silently lost.
            rd.sadd(config.douban_tv_task, video_id)
            delay(block_wait)
            continue
        data = parse_video(r)
        piw = piwik(page_title=page_title(r),
                    session_time=session_time,
                    origin_url=url,
                    urlref='')
        print("piw", piw)
        if data.get("title") is None:
            rd.sadd(config.douban_tv_failed, video_id)
            time.sleep(task_wait)
            print("------spider ben block...")
            continue
        data['doubanid'] = video_id
        # Dump before the insert, exactly as the original did.
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)
        photostask = json.dumps({"id": video_id, "mongoTVID": str(mongo_r)})
        # NOTE(review): this checks config.douban_star_done although the task
        # is a photos task; a photos "done" set may be what was meant. Left
        # unchanged because the intended key is not visible here — confirm.
        if (not rd.sismember(config.douban_star_done, photostask)
                and not rd.sismember(config.douban_photos_failed, photostask)):
            rd.sadd(config.douban_photos_task, photostask)
        print(photostask)
        rd.sadd(config.douban_tv_done, video_id)
        print("done.. sleep %s seconds." % task_wait)
        delay()
        i += 1
        if i % max_step == 0:
            # Rotate the bid cookie every max_step stored ids.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Beispiel #9
0
 def failed_job(self):
     """Give every task in ``config.content_work_task_failed`` one more attempt.

     Tasks are JSON objects; those without a ``contentName`` are dropped.
     Tasks that fail again are collected locally and only put back into the
     failed set after it has been drained. Previously they were re-added
     inside the loop, so a task that kept failing was popped again at once
     and the method never returned.

     Returns:
         True once the failed set has been drained.
     """
     print("go failed_job")
     still_failed = []
     try:
         while True:
             # Listen for the next parked task.
             p = rd.spop(config.content_work_task_failed)
             if p is None:
                 return True
             task = json.loads(p)
             if task.get("contentName") is None:
                 continue
             if not self.process(task):
                 still_failed.append(p)
     finally:
         # Runs on normal return and on error, so tasks already popped in
         # this pass are not lost if process() raises.
         for p in still_failed:
             rd.sadd(config.content_work_task_failed, p)
Beispiel #10
0
 def job(self):
     """Background job: process pickled tasks from ``config.gd_task_bkbk``.

     Runs forever. When the set is empty it sleeps 6 seconds and polls again.
     Tasks without a ``name`` are dropped; tasks for which ``self.process``
     returns a falsy value are put back into the same set.
     """
     while True:
         # Listen for the next task.
         payload = rd.spop(config.gd_task_bkbk.encode('latin1'))
         if payload == None:
             print("sleep 6s...")
             time.sleep(6)
             continue

         # NOTE(review): pickle.loads runs arbitrary code if the redis
         # content is not fully trusted — confirm who can write to this set.
         task = pickle.loads(payload)
         if task.get("name") == None:
             continue

         succeeded = self.process(task)
         if succeeded:
             continue
         # Failed: put the raw payload back for another attempt.
         rd.sadd(config.gd_task_bkbk, payload)
Beispiel #11
0
 def job(self):
     """Background job: process JSON tasks from ``config.content_work_task``.

     Runs forever. When the set is empty it runs ``self.failed_job()``, sleeps
     6 seconds and polls again. Tasks without a ``contentName`` are dropped;
     tasks for which ``self.process`` returns a falsy value are added to
     ``config.content_work_task_failed``.
     """
     print("go job")
     while True:
         # Listen for the next task.
         payload = rd.spop(config.content_work_task)
         if payload == None:
             # Nothing new: retry the parked failures, then wait.
             self.failed_job()
             print("sleep 6s...")
             time.sleep(6)
             continue

         task = json.loads(payload)
         if task.get("contentName") is None:
             continue

         succeeded = self.process(task)
         if succeeded:
             continue
         rd.sadd(config.content_work_task_failed, payload)
Beispiel #12
0
 def failed_job(self):
     """Give every task in ``config.gd_task_failed`` one more attempt.

     Tasks are pickled objects; those without a ``name`` are dropped. A task
     that now succeeds is added to ``config.gd_task_bkbk``. Tasks that fail
     again are collected locally and only put back into the failed set after
     it has been drained. Previously they were re-added inside the loop, so a
     task that kept failing was popped again at once and the method never
     returned.

     Returns:
         True once the failed set has been drained.
     """
     still_failed = []
     try:
         while True:
             # Listen for the next parked task.
             p = rd.spop(config.gd_task_failed)
             if p is None:
                 return True
             # NOTE(review): pickle.loads runs arbitrary code if the redis
             # content is not fully trusted — confirm who can write here.
             task = pickle.loads(p)
             if task.get("name") is None:
                 continue
             if self.process(task):
                 rd.sadd(config.gd_task_bkbk, p)
             else:
                 still_failed.append(p)
     finally:
         # Runs on normal return and on error, so tasks already popped in
         # this pass are not lost if process() raises.
         for p in still_failed:
             rd.sadd(config.gd_task_failed, p)
Beispiel #13
0
def get_detailurl_task():
    """Resolve queued play-page URLs to their detail-list page URLs.

    Pops JSON tasks of the form ``{"url": ..., "Referer": ...}`` from
    ``config.yk_get_detailurl_task``. Tasks already in
    ``config.yk_get_detailurl_done`` are skipped. The play page is fetched
    with the task's Referer and parsed by ``parse_tv_show``. A False/None
    result sends the raw task to ``config.yk_get_detailurl_field``. Otherwise
    the detail URL is queued in ``config.yk_video_detail_task`` (unless it is
    already in ``config.yk_video_detail_done``) and the raw task is marked
    done. Every ``max_step`` resolved tasks the session is refreshed.

    Returns:
        True once the task set is empty.
    """
    i = 0
    while True:
        q = rd.spop(config.yk_get_detailurl_task)
        if q is None:
            print(u"yk_get_detailurl_task sleeping 20 sec")
            return True
        to_detail_url = json.loads(q)
        # Work on a copy. Assigning the Referer on youku_home_headers itself
        # modified the shared dict, so this task's Referer leaked into every
        # later request that uses those headers.
        headers = dict(youku_home_headers)
        headers['Referer'] = to_detail_url['Referer']
        if rd.sismember(config.yk_get_detailurl_done, q):
            print("pass")
            continue
        r = requests_get(to_detail_url['url'], headers=headers)
        print("to_detail_url", to_detail_url['url'])
        detail_url = parse_tv_show(r, to_detail_url['url'])
        print("detail_url:", detail_url)
        if detail_url is False or detail_url is None:
            rd.sadd(config.yk_get_detailurl_field, q)
            continue
        if not rd.sismember(config.yk_video_detail_done, detail_url):
            added = rd.sadd(
                config.yk_video_detail_task,
                json.dumps({"url": detail_url,
                            'Referer': to_detail_url['url']}))
            if added == 1:
                print("yes")
        rd.sadd(config.yk_get_detailurl_done, q)
        # Refresh the session every max_step resolved tasks.
        i += 1
        if i % max_step == 0:
            update_session()
Beispiel #14
0
def process():
    """Download posters for the JSON tasks queued in ``config.image_v``.

    Runs forever, sleeping 6 seconds whenever the set is empty. A falsy fetch
    result sends the raw task to ``image_v_failed``; images narrower than 180
    pixels are skipped. The image is saved as
    ``E:/posters/<content_id>/<content_id>_<w>x<h>.jpg``. If
    ``mongo_conn.posters`` has no record for that file and content id yet, the
    task (with ``file_path`` and ``url`` added, ``_id`` removed) is inserted.
    """
    root = u"E:/posters/"
    while True:
        raw = rd.spop(config.image_v)
        if not raw:
            print("done! sleep 6s")
            time.sleep(6)
            continue

        task = json.loads(raw)
        poster_url = (
            u'http://183.59.160.50:8082/EPG/jsp/images/universal/film/poster/'
            + task['image_v'])
        im = requests_get(poster_url)
        if not im:
            rd.sadd("image_v_failed", raw)
            print("failed", raw)
            continue
        if im.width < 180:
            continue

        name_part = "%s_%sx%s.jpg" % (task.get("content_id"), im.width,
                                      im.height)
        file_name = "/".join([task.get("content_id"), name_part])
        try:
            # Create the parent directory; any error here is ignored.
            parent_dir = re.search('(.*/)', root + file_name).group(1)
            os.makedirs(parent_dir)
        except Exception:
            pass
        im.convert('RGB').save(root + file_name)

        existing = mongo_conn.posters.find({
            "file_path": file_name,
            "content_id": task['content_id']
        })
        if existing.count() != 0:
            # Already recorded for this content id.
            continue

        task['file_path'] = file_name
        task['url'] = task['image_v']
        if task.get("_id"):
            del task['_id']
        inserted_id = mongo_conn.posters.insert(task, check_keys=False)
        print(task['content_id'], inserted_id, file_name)
Beispiel #15
0
def task_star():
    """Retry the Douban star ids parked in ``config.douban_star_failed``.

    Ids already in ``config.douban_star_done`` are skipped. Each star page is
    fetched and parsed with ``parse_star``. A failed fetch, a failed parse, or
    a result without a ``name`` puts the id back in the failed set, refreshes
    the session and sleeps 20 seconds. When the block page is detected the id
    is put back and the worker waits ``block_wait``. A good result is stored
    in ``mongo_douban_stars`` and the id is marked done. Every ``max_step``
    stored stars the session's ``bid`` cookie is replaced.

    Returns:
        None, once the failed set is empty.
    """
    i = 0
    while True:
        task = rd.spop(config.douban_star_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            break
        if rd.sismember(config.douban_star_done, task):
            print(u"already done%s" % task)
            continue
        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)
        # NOTE(review): assumes requests_get signals failure with False or
        # None — confirm. The `in` test below used to run on that sentinel
        # and raise TypeError.
        fetched = r is not False and r is not None
        if fetched and u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            # The id has already been popped; put it back so it is retried
            # after the cool-down instead of being silently lost.
            rd.sadd(config.douban_star_failed, task)
            delay(block_wait)
            continue
        # A failed fetch takes the same recovery path as a failed parse.
        data = parse_star(r) if fetched else None
        if data is False or data is None or data.get("name") is None:
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider ben sleep 20 sec...")
            continue
        data['doubanid'] = task
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)
        i += 1
        if i % max_step == 0:
            # Rotate the bid cookie every max_step stored stars.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Beispiel #16
0
def task_page_fetch():
    """Extract follow-up tasks from every list page in ``config.yk_page_task``.

    Page URLs already in the page failed/done sets are skipped. The page is
    fetched and parsed by ``parse_page_fetch``. The entries under
    ``yk_get_detailurl_task`` and ``yk_video_detail_task`` in the result are
    JSON-encoded and queued in the redis sets of the same name, which
    de-duplicate them. The page URL is then marked done. Every ``max_step``
    processed pages the session is refreshed.

    Returns:
        True once the task set is empty.
    """
    handled = 0
    while True:
        page_url = rd.spop(config.yk_page_task)
        if page_url is None:
            print(u"task_page_fetch sleeping 20sec....")
            return True
        print("page_url", page_url)

        # Skip pages that were already tried, successfully or not.
        if rd.sismember(config.yk_page_failed, page_url) == True:
            continue
        if rd.sismember(config.yk_page_done, page_url) == True:
            continue

        response = requests_get(url=page_url,
                                headers=youku_home_headers,
                                session=session)
        if response is False or response == None:
            # Fetching the page failed.
            print(u'filed task:%s' % page_url)
            rd.sadd(config.yk_page_failed, page_url)
            continue
        print("done task_page_fetch:", page_url)

        parsed = parse_page_fetch(response, page_url)
        # Links that point straight at a play page (v_show type).
        for play_link in parsed['yk_get_detailurl_task']:
            rd.sadd(config.yk_get_detailurl_task, json.dumps(play_link))
        # Links that point at a detail-list page.
        for detail_link in parsed['yk_video_detail_task']:
            rd.sadd(config.yk_video_detail_task, json.dumps(detail_link))
        rd.sadd(config.yk_page_done, page_url)

        # Refresh the session every max_step processed pages.
        handled += 1
        if handled % max_step == 0:
            update_session()
Beispiel #17
0
def task_star():
    """Look up every star queued in ``config.le_star_task`` on the LeTV search.

    Runs forever, sleeping ``task_wait`` seconds whenever the set is empty.
    Each task is a JSON object whose first value is used as the search term
    (presumably a single ``{star id: star name}`` pair — verify against the
    producer). Tasks already in ``config.le_star_done`` are skipped. A failed
    fetch, a failed parse or a failed insert sends the task to
    ``config.le_star_failed``; a stored result marks it done. Every
    ``max_step`` handled tasks the session is refreshed.
    """
    i = 0
    while True:
        task = rd.spop(config.le_star_task)
        if task is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        print(task)
        if rd.sismember(config.le_star_done, task):
            print("already done.")
            continue
        task_json = json.loads(task)
        if not task_json:
            # An empty object carries no search term.
            rd.sadd(config.le_star_failed, task)
            continue
        # First value of the object. `task_json.keys()[0]` only works on
        # Python 2, because Python 3 dict views cannot be indexed.
        keyword = next(iter(task_json.values()))
        url = so_url.format(wd=keyword)
        r = requests_get(url=url, headers=leso_headers)
        if r is False or r is None:
            # Request failed.
            print(u'filed task:%s' % url)
            rd.sadd(config.le_star_failed, task)
            continue
        data = parse_sostar(r, task_json)
        if data is False or data is None:
            rd.sadd(config.le_star_failed, task)
            continue
        mongo_id = mongo_letv_stars.insert(data, check_keys=False)
        if mongo_id:
            rd.sadd(config.le_star_done, task)
        else:
            print(mongo_id)
            rd.sadd(config.le_star_failed, task)
        print('done.')
        # Refresh the session every max_step handled tasks.
        i += 1
        if i % max_step == 0:
            update_session()
Beispiel #18
0
def task_category():
    """Expand every category in ``config.yk_category_task`` into type tasks.

    Each task is a JSON object with ``url`` and ``name``. The category page is
    fetched and parsed for its types. When there are none, the category URL is
    itself queued as a type task. Otherwise each type URL is queued (unless it
    is already in the types done/failed sets), added to the done set, and
    stored in ``youku_video_types`` the first time it is seen. The category
    URL is then marked done. Every ``max_step`` processed categories the
    session is refreshed.

    Returns:
        True once the task set is empty.
    """
    handled = 0
    while True:
        raw_category = rd.spop(config.yk_category_task)
        if raw_category is None:
            print(u"task_category sleeping....20sec")
            return True
        category = json.loads(raw_category)
        print(category)

        response = requests_get(url=category['url'],
                                headers=youku_home_headers,
                                session=session)
        if response is False or response == None:
            # Fetching the category page failed.
            print(u'filed task:%s' % category['url'])
            rd.sadd(config.yk_category_task_failed, category['url'])
            continue

        data = parse_category_show(response, category['url'])
        print("category and types:", json.dumps(data))

        if len(data['types']) == 0:
            # The category has no types: treat its own URL as a type URL.
            rd.sadd(config.yk_types_task, category['url'])
        else:
            for type_name in data['types']:
                type_url = data['types'][type_name]
                not_done = rd.sismember(config.yk_types_done,
                                        type_url) == False
                if not_done and rd.sismember(config.yk_types_failed,
                                             type_url) == False:
                    # Queue the type for fetching.
                    rd.sadd(config.yk_types_task, type_url)
                # The done set doubles as the de-duplication guard for the
                # database insert below.
                # NOTE(review): this marks the type as done before it is
                # fetched; a consumer that skips URLs in yk_types_done would
                # never process it — confirm.
                newly_added = rd.sadd(config.yk_types_done, type_url)
                if newly_added != 0:
                    youku_video_types.insert(
                        {"name": type_name,
                         "url": data['types'][type_name],
                         "category": category['name']},
                        check_keys=False)

        rd.sadd(config.yk_category_task_done, category['url'])

        # Refresh the session every max_step processed categories.
        handled += 1
        if handled % max_step == 0:
            update_session()
Beispiel #19
0
 def job(self):
     """Background job: process pickled tasks from ``config.gd_task``.

     Runs forever. When the set is empty it runs ``self.failed_job()``, sleeps
     60 seconds and polls again. Tasks without a ``name`` are dropped. After
     ``self.process`` the raw payload is added to ``config.gd_task_bkbk`` on a
     truthy result and to ``config.gd_task_failed`` otherwise.
     """
     while True:
         # Listen for the next task.
         payload = rd.spop(config.gd_task)
         if payload == None:
             # Nothing new: retry the parked failures, then wait.
             self.failed_job()
             print("sleep 60s...")
             time.sleep(60)
             continue

         # NOTE(review): pickle.loads runs arbitrary code if the redis
         # content is not fully trusted — confirm who can write to this set.
         task = pickle.loads(payload)
         if task.get("name") == None:
             continue

         outcome = self.process(task)
         print("process", outcome)
         destination = (config.gd_task_bkbk
                        if outcome else config.gd_task_failed)
         rd.sadd(destination, payload)
Beispiel #20
0
def task_page():
    """Expand LeTV list pages into stored shows and star tasks.

    Loops forever: pops a list-page URL from ``config.le_page_task`` (sleeping
    ``task_wait`` seconds while the set is empty), fetches it and extracts the
    ``frontUrl`` AJAX endpoint from the page text. The endpoint is then
    requested page by page (``pn=1, 2, ...``). Each returned item that is not
    yet in ``config.le_tv_done`` gets timestamps and flattened fields added,
    is inserted into ``mongo_letv_tvs`` and is marked done. Its starring /
    actor / director entries are queued in ``config.le_star_task`` unless they
    are already in the star done/failed sets.
    """
    # NOTE(review): `retry` is never read in this function.
    retry = 5
    i = 0
    while True:
        url = rd.spop(config.le_page_task)
        # url = rd.spop(config.le_page_failed)
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.le_page_done, url) == True:
            print(u"already done%s" % url)
            continue
        r = requests_get(url, headers=leshi_headers)
        if r is False or r == None:  # request failed
            print(u'filed task:%s' % url)
            rd.sadd(config.le_page_failed, url)
            continue
        # The list page embeds the AJAX endpoint as `frontUrl: '...'`.
        m = re.search(
            u"frontUrl\: *'(http://list\.le\.com\/getLesoData([^',]+?))',", r)
        print("task_page:", url)
        if m:
            # Example endpoint:
            # http://list.le.com/getLesoData?from=pc&src=1&stype=1&ps=30&pn=1&ph=420001&dt=1&cg=2&or=4&stt=1&vt=180001
            ajax_url = m.group(1)
            pn = 1
            while True:
                # Point the endpoint at result page number `pn`.
                ajax_url = re.sub(u"pn=\d*", 'pn=%s' % pn, ajax_url)
                print("ajax_url:", ajax_url)
                r = requests_get(url=ajax_url, headers=leshi_ajax_headers)
                if r == False or r == None:
                    rd.sadd(config.le_page_ajax_failed, ajax_url)
                    # NOTE(review): `pn` is not advanced here, so the same URL
                    # is requested again immediately and without limit while
                    # it keeps failing — confirm this is intended.
                    continue
                pn += 1
                # print(r)
                try:
                    list_data = json.loads(r)
                except Exception as e:
                    print(str(e))
                    print(r)
                    print(ajax_url)
                    rd.sadd(config.le_page_ajax_failed, ajax_url)
                    print("continue")
                    # `pn` was already advanced, so this moves on to the next
                    # result page.
                    continue
                # NOTE(review): the loop ends before the items of this last
                # response are processed — confirm that a response with
                # more == False carries no items.
                if list_data.get("data").get("more") == False:
                    print("this url page fetch done")
                    break
                for x in list_data.get("data").get("arr"):
                    is_done = rd.sismember(config.le_tv_done, x["unique_id"])
                    if is_done == True:
                        print("already done!")
                        print(x['name'])
                        # return False
                        continue
                    # Preliminary clean-up. `data` is the same object as `x`
                    # (the empty dict is replaced at once), so every
                    # assignment below also changes `x`.
                    data = {}
                    data = x
                    data['created_at'] = time.time()
                    data['updated_at'] = time.time()
                    # (A commented-out field-by-field mapping from x to data
                    # used to be here; it is unnecessary because data is x.)
                    data["ctime"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(int(x['ctime']) /
                                       1000))  # ctime from the LeTV platform; meaning not yet analysed
                    data["mtime"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(int(x['mtime']) /
                                       1000))  # mtime from the LeTV platform; meaning not yet analysed
                    data["images"] = [{
                        "url": x['images'][k],
                        "width": k.split('*')[0],
                        "height": k.split('*')[1]
                    } for k in x['images']]  # posters, keyed by "<width>*<height>"
                    data["actors"] = "".join(
                        [x['actor'][it] + "," for it in x['actor']])  # actors
                    data["directors"] = "".join([
                        x['directory'][it] + "," for it in x['directory']
                    ])  # directors
                    starring_type = type(x['starring']).__name__
                    if starring_type != u'str':
                        # Queue each starring entry that is not yet done/failed.
                        for it in x['starring']:
                            if rd.sismember(config.le_star_done,
                                            json.dumps(it)) == True:
                                continue
                            if rd.sismember(config.le_star_failed,
                                            json.dumps(it)) == True:
                                continue
                            rd.sadd(config.le_star_task, json.dumps(it))
                        # Starring. Pitfall: data and x are the same mutable
                        # object, so x['starring'] and data["starring"] refer
                        # to the same value.
                        data["starring"] = "".join([
                            starring[starring.keys()[0]] + ","
                            for starring in x['starring']
                        ])
                    if type(x['actor']).__name__ != u'str':
                        # Queue each actor as a one-pair JSON object.
                        for it in x['actor']:
                            if rd.sismember(config.le_star_done,
                                            json.dumps({it: x['actor'][it]
                                                        })) == True:
                                continue
                            if rd.sismember(config.le_star_failed,
                                            json.dumps({it: x['actor'][it]
                                                        })) == True:
                                continue
                            print(json.dumps({it: x['actor'][it]}))
                            rd.sadd(config.le_star_task,
                                    json.dumps({it: x['actor'][it]}))
                    if type(x['directory']).__name__ != u'str':
                        # Queue each director as a one-pair JSON object.
                        for it in x['directory']:
                            if rd.sismember(
                                    config.le_star_done,
                                    json.dumps({it:
                                                x['directory'][it]})) == True:
                                continue
                            if rd.sismember(
                                    config.le_star_failed,
                                    json.dumps({it:
                                                x['directory'][it]})) == True:
                                continue
                            # NOTE(review): the result of this call is
                            # discarded; the statement has no effect.
                            json.dumps({it: x['directory'][it]})
                            rd.sadd(config.le_star_task,
                                    json.dumps({it: x['directory'][it]}))
                    # print(json.dumps(data))
                    print("done!")
                    mongo_letv_tvs.insert(data, check_keys=False)  #
                    rd.sadd(config.le_tv_done, x['unique_id'])
        else:
            # No AJAX endpoint found in the page: record the page as failed.
            print(u'filed task:%s' % url)
            rd.sadd(config.le_page_failed, url)
            continue
        # Refresh the session every max_step processed list pages.
        i += 1
        if i % max_step == 0:
            update_session()
Beispiel #21
0
def task_photos():
    """Retry the Douban poster tasks parked in ``config.douban_photos_failed``.

    Each task is a JSON object with ``id`` (Douban subject id) and
    ``mongoTVID``. Tasks already in ``config.douban_photos_done`` are skipped,
    and tasks for which ``get_photos`` returns nothing are dropped. Otherwise
    the ``poster`` field of the matching ``mongo_douban_tvs`` document is
    unset and then set to the fetched data. The task is added to the done set,
    and also to the failed set when no document was modified. Every
    ``max_step`` handled tasks the session's ``bid`` cookie is replaced.

    Returns:
        True once the failed set is empty.
    """
    handled = 0
    photos_url = u'https://movie.douban.com/subject/{id}/photos?type=R'
    while True:
        task = rd.spop(config.douban_photos_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.douban_photos_done, task) == True:
            print(u"already done%s" % task)
            continue

        spec = json.loads(task)
        url = photos_url.format(id=spec['id'])
        print(url)
        data = get_photos(url=url, id=spec['id'])
        print("++++++++++++++++%s+++++++++++++%s++++++++++++" %
              (task, len(data)))
        if len(data) == 0:
            continue
        print(json.dumps(data))

        # Clear any previous poster value first, then write the new one.
        mongo_douban_tvs.update({'_id': ObjectId(spec['mongoTVID'])},
                                {'$unset': {'poster': 1}},
                                multi=True)
        result = mongo_douban_tvs.update_one(
            {'_id': ObjectId(spec['mongoTVID'])},
            {'$set': {'poster': data}})
        if result.modified_count == 0:
            rd.sadd(config.douban_photos_failed, task)
        # NOTE(review): the task is marked done even when it was just put
        # back into the failed set — confirm this is intended.
        rd.sadd(config.douban_photos_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result.modified_count)

        handled += 1
        if handled % max_step == 0:
            # Rotate the bid cookie every max_step handled tasks.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Beispiel #22
0
def task_api():
    """Page through the Douban AJAX list URLs in ``config.doubantv_ajax_task``.

    Runs forever, sleeping ``task_wait`` seconds whenever the set is empty.
    URLs already in ``config.doubantv_ajax_task_done`` are skipped. For each
    URL the ``start=`` parameter is stepped in multiples of 20 until a page
    returns no data. Every listed id that is in neither
    ``config.douban_tv_done`` nor ``config.douban_tv_failed`` is queued in
    ``config.douban_tv_task`` and recorded in ``config.douban_tvids``. Every
    ``max_step`` fetched pages the ``bid`` cookie is replaced and the ad URL
    is requested once.
    """
    fetched_pages = 0
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.doubantv_ajax_task_done, url) == True:
            print(u"already done%s" % url)
            continue

        page_index = 0
        while True:
            offset = str(page_index * 20)
            url = re.sub(u'start=(\d*)', 'start=%s' % offset, url)
            print(url)
            response = requests_get(url, headers=douban_referer_tag_headers)
            if response is False or response == None:
                # Request failed.
                # NOTE(review): the same URL is retried at once, with no
                # delay and no limit — confirm this is intended.
                print(u'filed task:%s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                continue

            try:
                r_data = json.loads(response)
            except Exception as e:
                # Not JSON: record it, refresh the session, wait, and retry
                # the same page.
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(response)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider  ben   sleep 10 sec....")
                continue

            if len(r_data['data']) == 0:
                # An empty page marks the end of this listing.
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done%s" % origin_url)
                break

            for item in r_data['data']:
                not_done = rd.sismember(config.douban_tv_done,
                                        item['id']) == False
                if not_done and rd.sismember(config.douban_tv_failed,
                                             item['id']) == False:
                    add_task = rd.sadd(config.douban_tv_task, item['id'])
                    if add_task == 1:
                        template = "---------------join task.----%s--------------------"
                    else:
                        template = '***********task repeat-******%s********************'
                    print(template % item['id'])
                    rd.sadd(config.douban_tvids, item['id'])

            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            fetched_pages += 1
            page_index += 1
            if fetched_pages % max_step == 0:
                # Rotate the bid cookie, then request the ad URL once;
                # errors from that request are ignored.
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers,
                                timeout=timeout)
                except Exception:
                    pass