Example #1
    def crawl(self, urls):
        r = requests_get(url=urls['url'], headers=headers)
        albumId_tvId = self.parser.parse_albumId_tvId(r=r, url=urls['url'])
        print("albumId_tvId", albumId_tvId)
        if not albumId_tvId or not albumId_tvId.get("tvId"):
            return {"status": False, 'urls': urls}
        exists = self.before_crawl(albumId_tvId['tvId'])
        if exists:
            return exists
        info = self.vinfo(tvId=albumId_tvId.get("tvId"))
        if not info:
            return False
        data = self.parser.merge_fields(info)
        data = self.check_crawl_star(data)
        cast = info.get("cast") or {}
        cast_keys = ("directors", "mainActors", "singers", "actors", "guests")
        if albumId_tvId.get('play') and info.get("cast") and all(
                len(cast.get(k) or []) == 0 for k in cast_keys):
            play = requests_get(url=albumId_tvId["play"], headers=headers)
            _temp = self.parser.plays_parser(play)
            print("plays_parser result:", _temp)
            data.update(_temp)  # dict(a.items() + b.items()) is Py2-only; update() works in both
            if _temp.get("year"):
                data['year'] = _temp.get("year")
            if data.get("directors_list"):
                directors_list = []
                # print(data.get("directors_list"))
                for x in data.get("directors_list"):
                    _temp = self.crawl_star(x["iqiyi_url"])
                    if _temp is not None:
                        directors_list.append(_temp)
                data['directors_list'] = directors_list

            if data.get("starring_list"):
                starring_list = []
                for x in data.get("starring_list"):
                    _temp = self.crawl_star(x["iqiyi_url"])
                    if _temp:
                        starring_list.append(_temp)
                data['starring_list'] = starring_list
            if data.get("actors_list"):
                actors_list = []
                for x in data.get("actors_list"):
                    _temp = self.crawl_star(x["iqiyi_url"])
                    if _temp:
                        actors_list.append(_temp)
                data['actors_list'] = actors_list

        data["user_profile"] = self.user_profile(
            albumId=albumId_tvId.get("albumId"))
        data['iqiyi_playCountPCMobileCb'] = self.playCountPCMobileCb(
            albumId=albumId_tvId.get("albumId"))

        if not data or not data.get("title"):
            return False
        return self.save(data)
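Every example on this page goes through a shared requests_get helper whose definition is not shown. Judging from the call sites (url, headers, data, timeout and session keyword arguments; callers test the result for False/None and otherwise treat it as the response text), it is presumably a thin wrapper along these lines — a sketch under those assumptions, not the project's actual implementation:

import requests

def requests_get(url, headers=None, data=None, timeout=10, session=None):
    """Hypothetical sketch of the shared helper: return the response body
    as text, or None on any request error (callers also test for False)."""
    try:
        client = session if session is not None else requests
        r = client.get(url, headers=headers, params=data, timeout=timeout)
        r.encoding = r.apparent_encoding  # crawled sites mix GBK and UTF-8
        return r.text
    except requests.exceptions.RequestException:
        return None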
Example #2
 def crawl(self, urls):
     m = re.search(u'v\.qq\.com/x/cover/', urls['r_url'])
     url = urls["url"]
     if m:
         r = requests_get(url=url, headers=headers)
         url = self.parsers.detail_url_parser(r)
     r = requests_get(url=url, headers=headers)
     # class="player_title"
     data = self.parsers.vdetail_parser(r)
     data = self.check_crawl_star(data)
     if not data or not data.get("title"):
         return False
     return self.save(data)
Example #3
def go_detail_list_task():
    retry = 5
    i = 0
    while True:
        q = rd.spop(config.yk_video_detail_task)
        if q is None:
            print(u"yk_video_detail_task sleeping 20 sec....")
            # time.sleep(task_wait)
            return True
        detail_url = json.loads(q)
        if rd.sismember(config.yk_video_detail_done, detail_url['url']):
            print("pass", detail_url['url'])
            continue
        # r = go_detail_list_page(detail_url)
        r = requests_get(detail_url['url'], headers=youku_home_headers)
        d = parse_detail_list_page(r, detail_url['url'])
        data = d['data']
        if data is False or data is None:
            rd.sadd(config.yk_video_detail_failed, q)
            continue
        for x in d['stars']:
            rd.sadd(config.yk_star_task, x)  # star crawl queue; Redis set dedupes
        print('detail_url done:', detail_url['url'], data)
        done = rd.sadd(config.yk_video_detail_done,
                       detail_url['url'])  # mark finished
        youku_videos.insert(data, check_keys=False)  # save tv data
        # refresh the session every 50 steps
        i += 1
        if i % max_step == 0:
            update_session()
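Examples #3, #6, #7, #13 and #15–#18 all repeat the same Redis-backed work-queue shape: spop a task, skip it if it is already in the done set, record failures in a failed set, and mark success in the done set (Redis set semantics give deduplication for free). Stripped of the site-specific fetching and parsing, the skeleton is roughly this — a sketch with placeholder names:

def run_queue(rd, task_set, done_set, failed_set, handle):
    # Generic sketch of the queue pattern used throughout these examples;
    # rd is a redis client, handle(task) returns truthy on success.
    while True:
        task = rd.spop(task_set)
        if task is None:
            return True  # queue drained
        if rd.sismember(done_set, task):
            continue  # already processed; set membership is the dedupe
        try:
            ok = handle(task)
        except Exception:
            ok = False
        rd.sadd(done_set if ok else failed_set, task)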
Example #4
 def crawl(self, urls):
     m = re.search(u'douban\.com/subje', urls['url'])
     url = urls["url"]
     if not m:
         iid = re.search(u'(\d{5,})', urls['url'])
         if iid:
             url = u'https://movie.douban.com/subject/{}/'.format(
                 iid.group(1))
     m = re.search(u'(\d+)', url)
     if m:
         exists = self.crawl_before(doubanid=m.group(1))
         if exists:
             return exists
     r = requests_get(url=url, headers=headers)
     data = self.parsers.vdetail_parser(r)
     if not data or not data.get("doubanid"):
         return False
     poster = self.crawl_poster(data.get("doubanid"))
     if poster is None or poster is False:
         return False
     data['poster'] = poster
     data = self.check_crawl_star(data)
     if not data or not data.get("title"):
         return False
     return self.save(data)
Example #5
 def crawl_poster(self, id, data=None, result=None):
     url = self.poster_url.format(id=id)
     h = headers
     h["Referer"] = self.detail_url.format(id=id)
     data = []
     if not result:
         result = {"next": url}
     while url:
         print(url)
         r = requests_get(url=url, headers=h)
         if r is False or r is None:
             data = False
             url = False
             break
         if u'检测到有异常请求从你的 IP 发出' in r:  # Douban's "abnormal requests from your IP" block page
             print("------spider has been blocked... break......")
             url = False
             data = False
             break
         result = self.parsers.parse_photos(r, id)
         data += result.get('data')
         if result.get("next"):
             url = result.get("next")
         else:
             url = False
     return data
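crawl_poster keeps following the parser-supplied "next" link until the gallery runs out or the crawler is blocked, so the caller gets either the accumulated photo list or False. A hypothetical usage sketch (the spider instance and the subject id are illustrative):

poster_photos = spider.crawl_poster("1292052")  # some Douban subject id
if poster_photos is False:
    print("request failed or the spider was blocked")
else:
    print("collected %d photos" % len(poster_photos))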
Example #6
def task_types_fetch():
    retry = 5
    i = 0
    while True:
        type_url = rd.spop(config.yk_types_task)
        if type_url is None:
            print(u"yk_types_task sleeping 20sec....")
            return True
        if rd.sismember(config.yk_types_failed, type_url) or rd.sismember(
                config.yk_types_done, type_url):
            continue
        r = requests_get(url=type_url,
                         headers=youku_home_headers,
                         session=session)
        if r is False or r is None:
            print(u'failed task: %s' % type_url)
            rd.sadd(config.yk_types_failed, type_url)
            continue
        pages = parse_category_show(r, type_url)
        print("task_types_fetch data:", pages)
        for page in xrange(1, int(pages['pages'])):
            page_url = re.sub('(\.html.*)',
                              '_s_1_d_1_p_{page}.html'.format(page=page),
                              type_url)
            print("task_types_fetch for :", page_url)
            if not rd.sismember(config.yk_page_failed,
                                page_url) and not rd.sismember(
                                    config.yk_page_done, page_url):
                rd.sadd(config.yk_page_task, page_url)
        rd.sadd(config.yk_types_done, type_url)
        # refresh the session every 50 steps
        i += 1
        if i % max_step == 0:
            update_session()
Example #7
def task_video():
    """
    """
    retry = 5
    i = 0
    while True:
        id = rd.spop(config.douban_tv_task)
        # id = rd.spop(config.douban_tv_failed)
        if id is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.doubantv_ajax_task_done, id):
            print(u"already done %s" % id)
            continue
        url = tv_url.format(id=id)
        r = requests_get(url=url, headers=douban_home_headers)
        if r is False or r is None:
            rd.sadd(config.douban_tv_failed, id)
            continue
        try:
            cb = check_block(r)
        except Exception as e:
            print("check_block:", str(e))
        if u'检测到有异常请求从你的 IP 发出' in r:  # Douban's "abnormal requests from your IP" block page
            print("------spider has been blocked... break......")
            delay(block_wait)
            continue
        data = parse_video(r)
        piw = piwik(page_title=page_title(r),
                    session_time=session_time,
                    origin_url=url,
                    urlref='')
        print("piw", piw)
        if data.get("title") == None:
            rd.sadd(config.douban_tv_failed, id)
            time.sleep(task_wait)
            # update_session()
            print("------spider ben block...")
            continue
        data['doubanid'] = id
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)  #
        photostask = json.dumps({"id": id, "mongoTVID": str(mongo_r)})
        if not rd.sismember(config.douban_star_done,
                            photostask) and not rd.sismember(
                                config.douban_photos_failed, photostask):
            rd.sadd(config.douban_photos_task, photostask)
        print(photostask)
        # return True
        rd.sadd(config.douban_tv_done, id)
        # tv_after(id=id, url=url)
        print("done.. sleep %s seconds." % task_wait)
        delay()
        i += 1
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Example #8
 def crawl_star(self, url):
     m = re.search(u'com/celebrity/(\d*)/', url)
     if m:
         exists = mongo_conn.stars.find({"doubanid": m.group(1)})
         if exists.count() > 0:
             r = exists[0]
             r['_id'] = str(r['_id'])
             return r
     r = requests_get(url=url)
     return self.parsers.parse_star(r, url)
Example #9
def get_category():
    """获取分类,做种子"""
    start = 1
    retry = 5
    print('get_category')
    while retry > 0:
        try:
            r = requests_get(url=category_url,
                             headers=youku_home_headers,
                             timeout=timeout,
                             session=session)
            page = etree.HTML(r)
            lis = page.xpath(
                u'//label[contains(text(),"分类:")]/following-sibling::ul/li')
            o = urlparse(category_url)
            host = o.scheme + '://' + o.netloc
            categories = []
            for x in xrange(1, len(lis)):
                categories.append({
                    "name": lis[x].find('a').text,
                    'url': host + lis[x].find('a').get('href')
                })
            print("categories:", json.dumps(categories))
            if len(categories) == 0:
                update_session(proxy)
                continue
            for x in categories:
                if not rd.sismember(config.yk_category_task_done,
                                    x['url']) and not rd.sismember(
                                        config.yk_category_task_failed,
                                        x['url']):
                    task_sadd = rd.sadd(config.yk_category_task,
                                        json.dumps(x))  # seed task
                re_sadd = rd.sadd(config.yk_category_url, json.dumps(x))  # seed record
                if re_sadd != 0:  # new entry (set add dedupes)
                    youku_category.insert(x,
                                          check_keys=False)  # save categories
            return True
        except requests.exceptions.ProxyError as e:
            print("proxy error:", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("request error:", str(e))
            update_session(proxy)
        retry -= 1
    start += 1
    if start % 20 == 0:  # refresh the session periodically
        update_session()
Example #10
def piwik(page_title, session_time, origin_url, urlref=''):
    '''Report user-behaviour data (fires a Piwik tracking beacon).'''
    # https://fundin.douban.com/piwik?action_name=脱单告急 (豆瓣)&idsite=100001&rec=1&r=579246&h=20&m=14&s=21&url=https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26661189%2F&_id=7a36e03deb79996b&_idts=1525176862&_idvc=1&_idn=1&_refts=0&_viewts=1525176862&pdf=1&qt=0&realp=0&wma=0&dir=0&fla=0&java=0&gears=0&ag=0&cookie=1&res=1366x768&gt_ms=1143
    url = u'https://fundin.douban.com/piwik?action_name={page_title}&idsite=100001&rec=1&r=579246&h=20&m=14&s=21&url={origin_url}&urlref={urlref}&_id={_id}&_idts={_idts}&_idvc=1&_idn=1&_refts=0&_viewts={_viewts}&pdf=1&qt=0&realp=0&wma=0&dir=0&fla=0&java=0&gears=0&ag=0&cookie=1&res=1366x768&gt_ms=1143'
    url = url.format(page_title=page_title,
                     origin_url=origin_url,
                     _id=random_str(16, True),
                     _idts=session_time,
                     _viewts=int(time.time()) + 3,
                     urlref=urlref)
    headers = douban_home_headers
    headers['Referer'] = origin_url
    return requests_get(url=url, headers=headers)
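As Example #7 shows, this beacon is fired right after each page fetch so the traffic profile looks more like a real browser session. A minimal call, mirroring the values used there:

# Values mirror the call site in Example #7.
piw = piwik(page_title=page_title(r),   # title parsed from the fetched page
            session_time=session_time,  # _idts: when this fake session started
            origin_url=url,             # the page being reported
            urlref='')                  # empty referrer for a direct hit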
Example #11
 def crawl(self, urls):
     m = re.search(u'youku\.com/show/', urls['r_url'])
     url = urls["url"]
     if not m:
         # r = requests_get(url=url,headers=headers)
         # url = self.parsers.detail_url_parser(r)
         return None
     r = requests_get(url=url, headers=headers)
     # class="player_title"
     data = self.parser.parse_detail(r=r)
     data = self.check_crawl_star(data)
     if not data or not data.get("title"):
         return None
     data = self.save(data)
     return data
Example #12
 def crawl(self, urls):
     r = requests_get(url=urls["url"], headers=headers)
     playlistid = self.parser.playlistId_parser(r)
     if not playlistid:
         data = self.parser.vdetail_parser(r)
     exists = self.crawl_before(playlistid)
     if exists:
         return exists
     info = self.vinfo(playlistid=playlistid)
     if not info:
         return False
     data = self.parser.merge_content_fields(info)
     data = self.check_crawl_star(data)
     if data is False:
         return False
     return self.save(data)
Example #13
def get_detailurl_task():
    """
    get_detailurl_task yk_get_detailurl_task 解析到detail_list页面的url
    """
    retry = 5
    i = 0
    while True:
        q = rd.spop(config.yk_get_detailurl_task)
        if q is None:
            print(u"yk_get_detailurl_task sleeping 20 sec")
            # time.sleep(task_wait)
            return True
        to_detail_url = json.loads(q)
        headers = youku_home_headers
        headers['Referer'] = to_detail_url['Referer']
        if rd.sismember(config.yk_get_detailurl_done, q):
            print("pass")
            continue
        r = requests_get(to_detail_url['url'], headers=headers)
        print("to_detail_url",to_detail_url['url'])
        detail_url = parse_tv_show(r, to_detail_url['url'])
        print("detail_url:",detail_url)
        if detail_url == False or detail_url==None:
            rd.sadd(config.yk_get_detailurl_field, q)
            continue
        if not rd.sismember(config.yk_video_detail_done, detail_url):
            red = rd.sadd(config.yk_video_detail_task,
                          json.dumps({"url": detail_url,
                                      'Referer': to_detail_url['url']}))
            if red == 1:
                print("yes")
        rd.sadd(config.yk_get_detailurl_done, q)
        i += 1
        if i % max_step == 0:
            update_session()
Example #14
def get_photos(url, id, data=None, result=None):
    data = []
    if not result:
        result = {"next": url}
    while url:
        print('get_photos:', url)
        headers = douban_home_headers
        headers['Referer'] = tv_url.format(id=id)
        r = requests_get(url=url, headers=headers)
        cb = check_block(r)
        piwik(page_title=page_title(r),
              session_time=session_time,
              origin_url=url,
              urlref=headers['Referer'])
        if r is False or r is None:
            data = False
            url = False
            break
        if u'检测到有异常请求从你的 IP 发出' in r:  # Douban's "abnormal requests from your IP" block page
            print("------spider has been blocked... break......")
            delay(block_wait)
            url = False
            data = False
            break
        result = parse_photos(r, id)
        data += result.get('data')
        if result.get("next"):
            url = result.get("next")
        else:
            url = False
    return data
Example #15
def task_star():
    """
    """
    retry = 5
    i = 0
    while True:
        # task = rd.spop(config.douban_star_task)
        task = rd.spop(config.douban_star_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            break
        if rd.sismember(config.douban_star_done, task):
            print(u"already done %s" % task)
            continue
        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)
        if u'检测到有异常请求从你的 IP 发出' in r:  # Douban's "abnormal requests from your IP" block page
            print("------spider has been blocked... break......")
            delay(block_wait)
            continue
        data = parse_star(r)
        if data == False or data == None or data.get("name") == None:
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider ben sleep 20 sec...")
            continue
        data['doubanid'] = task
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)
        i += 1
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Example #16
def task_category():
    """
    解析每一个category下的分类,
    并获取该category 每个分类下的全部资源的url任务, 
    这里要做url任务去重
    """
    retry = 5
    i = 0
    while True:
        category = rd.spop(config.yk_category_task)
        if category is None:
            print(u"task_category sleeping....20sec")
            # time.sleep(task_wait)
            return True
        category = json.loads(category)
        print(category)
        r = requests_get(url=category['url'], headers=youku_home_headers,
                         session=session)
        if r is False or r is None:  # detail fetch failed
            print(u'failed task: %s' % category['url'])
            rd.sadd(config.yk_category_task_failed, category['url'])
            continue
        data = parse_category_show(r, category['url'])
        print("category and types:", json.dumps(data))
        if len(data['types']) == 0:  # category has no sub-types
            re_sadd = rd.sadd(config.yk_types_task, category['url'])  # types url
        else:
            for ty in data['types']:
                if not rd.sismember(config.yk_types_done, data['types'][ty]) and \
                        not rd.sismember(config.yk_types_failed, data['types'][ty]):
                    rd.sadd(config.yk_types_task, data['types'][ty])  # types fetch task
                re_sadd = rd.sadd(config.yk_types_done, data['types'][ty])  # dedupe types url
                if re_sadd == 0:  # already saved
                    continue
                youku_video_types.insert(
                    {"name": ty, "url": data['types'][ty],
                     "category": category['name']}, check_keys=False)  # save tv types
        rd.sadd(config.yk_category_task_done, category['url'])
        # refresh the session every 50 steps
        i += 1
        if i % max_step == 0:
            update_session()
Example #17
def task_star():
    """
    """
    retry = 5
    i = 0
    while True:
        task = rd.spop(config.le_star_task)
        # task = u'{"7088": "石田卓也"}'
        if task is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        print(task)
        if rd.sismember(config.le_star_done, task):
            print("already done.")
            continue
        task_json = json.loads(task)
        url = so_url.format(wd=list(task_json.values())[0])
        r = requests_get(url=url, headers=leso_headers)
        if r is False or r is None:  # request failed
            print(u'failed task: %s' % url)
            rd.sadd(config.le_star_failed, task)
            continue
        data = parse_sostar(r, task_json)
        if data is False or data is None:
            rd.sadd(config.le_star_failed, task)
            continue
        mongo_id = mongo_letv_stars.insert(data, check_keys=False)  #
        if mongo_id:
            rd.sadd(config.le_star_done, task)
        else:
            print(mongo_id)
            rd.sadd(config.le_star_failed, task)
        print('done.')
        # refresh the session every 50 steps
        i += 1
        if i % max_step == 0:
            update_session()
Example #18
def task_page_fetch():
    """
    解析每一个category下每个分类下的每一页list数据中的所有tv url,
    这里要做url任务去重
    """
    retry = 5
    i = 0
    while True:
        page_url = rd.spop(config.yk_page_task)
        # page_url = rd.spop(config.yk_page_failed) #retry
        if page_url is None:
            print(u"task_page_fetch sleeping 20sec....")
            # time.sleep(task_wait)
            return True
        print("page_url", page_url)
        if rd.sismember(config.yk_page_failed, page_url) or rd.sismember(
                config.yk_page_done, page_url):
            continue
        r = requests_get(url=page_url,
                         headers=youku_home_headers,
                         session=session)
        if r is False or r is None:  # detail fetch failed
            print(u'failed task: %s' % page_url)
            rd.sadd(config.yk_page_failed, page_url)
            continue
        print("done task_page_fetch:", page_url)
        data = parse_page_fetch(r, page_url)
        for x in data['yk_get_detailurl_task']:
            rd.sadd(config.yk_get_detailurl_task,
                    json.dumps(x))  # urls that lead straight to a V_show play page
        for x in data['yk_video_detail_task']:
            r_add = rd.sadd(config.yk_video_detail_task,
                            json.dumps(x))  # detail_list_task
        rd.sadd(config.yk_page_done, page_url)
        # refresh the session every 50 steps
        i += 1
        if i % max_step == 0:
            update_session()
Example #19
 def v_search(self):
     '''Video search across Youku, iQIYI, Tencent Video, PPTV ...'''
     from Spiders.setting import baidu_headers
     r = requests_get(url=self.url, headers=baidu_headers, data=self.params)
     result_map = BaiduParser.v_search_parser(r)
     print("result_map", result_map)
     if not result_map:
         return result_map
     for mid_url in result_map:
         print(mid_url)
         result_url = self.get_url_bymid(mid_url['url'])
         for x in self.host:
             if not result_url and mid_url.get(
                     "r_url") and x in mid_url.get("r_url"):
                 data = self.host_map[x]().crawl(mid_url)
                 if data and data.get("status") is not False:
                     return data
             elif result_url and x in result_url:
                 print(self.host_map[x])
                 mid_url['url'] = result_url
                 data = self.host_map[x]().crawl(mid_url)
                 if data and data.get("status") is not False:
                     return data
Example #20
 def vinfo(self, playlistid=None):
     r = requests_get(url=self.playlist.format(playlistid=playlistid),
                      headers=headers)
     return self.parser.parser_vinfo(r)
Example #21
 def crawl_star(self, url):
     r = requests_get(url=url, headers=headers)
     return self.parsers.star_parser(r, url=url)
Example #22
 def crawl(self, url):
     r = requests_get(url=url, headers=headers)
     return self.parsers.vdetail_parser(r)
Example #23
 def crawl(self, urls):
     from Spiders.setting import baidu_headers
     r = requests_get(url=urls["url"], headers=baidu_headers)
     return BaiduParser.baike_parser(r)
Example #24
 def get_url_bymid(self, url):
     '''Baidu result urls are intermediate redirect urls; resolve one to the target page url here.'''
     from Spiders.setting import baidu_headers
     r = requests_get(url=url, headers=baidu_headers)
     return BaiduParser.parse_mid_tourl(r)
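Baidu wraps organic results in intermediate redirect urls, which is why the example above fetches the link before parsing out the target. If you only need the final address and the target is reachable, letting requests follow the redirect chain is enough — a stand-alone sketch that bypasses the repo's own BaiduParser helper:

import requests

def resolve_baidu_link(mid_url, timeout=10):
    # Hypothetical helper: follow the redirect chain and return the final
    # url; the real parse_mid_tourl may instead read it out of the page.
    r = requests.get(mid_url, allow_redirects=True, timeout=timeout)
    return r.url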
Example #25
 def user_profile(self, albumId=None):
     r = requests_get(url=self.get_user_profile_url.format(albumId=albumId),
                      headers=headers)
     return self.parser.parse_user_profile(r)
Example #26
 def playCountPCMobileCb(self, albumId=None):
     r = requests_get(
         url=self.playCountPCMobileCb_url.format(albumId=albumId),
         headers=headers)
     return self.parser.parse_playCountPCMobileCb(r)
Example #27
 def recommend(self, uid=None, session=None):
     return requests_get(url=self.playCountPCMobileCb_url.format(uid=uid),
                         headers=headers,
                         session=session)
Example #28
def tv_after(id, url):
    headers = douban_home_headers
    headers['Referer'] = url
    headers['Accept'] = u'application/json, text/javascript, */*; q=0.01'
    return requests_get(url=verify_users_url.format(id=id), headers=headers)
Example #29
def spider_seed(tag_url=tag_url):
    """获取分类,做种子"""
    start = 1
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=tag_url,
                             headers=douban_home_headers,
                             timeout=timeout)
            # page = etree.HTML(r)
            appjs_url = re.search(
                u'<script type="text/javascript" src="((.*)app\.js)"></script>',
                r).group(1)
            print(appjs_url)
            appjs = requests_get(url=appjs_url, headers=douban_appjs_headers)
            jsdata = re.search(
                u'mixins\:\[f\.mixin\],data\:function\(\)\{return(.*)\},ready\:function\(\)\{window',
                appjs).group(1)
            print(jsdata)
            jsdata = re.sub(u'!', '', jsdata)
            jsdata = re.sub(
                u'browserHeight:document.documentElement.clientHeight', '',
                jsdata)
            jsdata = demjson.decode(jsdata)
            save_tags = rd.sadd(config.doubantv_tags,
                                json.dumps(jsdata['tag_categories']))
            if save_tags == 1:
                # mongo_douban_tags.insert(json.dumps(jsdata["tag_categories"]), check_keys=False)  #
                mongo_douban_tags.insert(
                    {"tag_categories": jsdata["tag_categories"]},
                    check_keys=False)  #
            ajax_list_url = u'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags={tags}&start=0&genres={genres}&countries={countries}'
            print(len(jsdata["tag_categories"][0]))
            print(len(jsdata["tag_categories"][1]))
            print(len(jsdata["tag_categories"][2]))
            print(len(jsdata["tag_categories"][3]))
            jsdata["tag_categories"][1][0] = ""
            jsdata["tag_categories"][2][0] = ""
            jsdata["tag_categories"][3][0] = ""
            jsdata["tag_categories"][0][0] = ""
            for x in xrange(0, len(jsdata["tag_categories"][1])):  # "全部类型"
                c1 = jsdata["tag_categories"][1][x]
                for xx in xrange(0,
                                 len(jsdata["tag_categories"][2])):  # "全部地区"
                    c2 = jsdata["tag_categories"][2][xx]
                    # "全部特色"  tag2
                    for xx in xrange(0, len(jsdata["tag_categories"][3])):
                        c3 = jsdata["tag_categories"][3][xx]
                        url = ajax_list_url.format(tags=c3,
                                                   genres=c1,
                                                   countries=c2)
                        if not rd.sismember(config.doubantv_ajax_task_failed,
                                            url) and not rd.sismember(
                                                config.doubantv_ajax_task_done,
                                                url):
                            rd.sadd(config.doubantv_ajax_task, url)
                        # rd.sadd(config.doubantv_ajax_url,url)
                        print(url)
                        # "全部形式" tag1
                        for xx in xrange(0, len(jsdata["tag_categories"][0])):
                            c0 = jsdata["tag_categories"][0][xx]
                            c3c0 = c3 + ',' + c0
                            c3c0 = re.sub(u',$', "", c3c0)
                            c3c0 = re.sub(u'^,', "", c3c0)
                            url = ajax_list_url.format(tags=c3c0,
                                                       genres=c1,
                                                       countries=c2)
                            if not rd.sismember(config.doubantv_ajax_task_failed,
                                                url) and not rd.sismember(
                                                    config.doubantv_ajax_task_done,
                                                    url):
                                rd.sadd(config.doubantv_ajax_task, url)
                            # rd.sadd(config.doubantv_ajax_url,url)
                            url = ajax_list_url.format(tags=c0,
                                                       genres=c1,
                                                       countries=c2)
                            if not rd.sismember(config.doubantv_ajax_task_failed,
                                                url) and not rd.sismember(
                                                    config.doubantv_ajax_task_done,
                                                    url):
                                rd.sadd(config.doubantv_ajax_task, url)
                            # rd.sadd(config.doubantv_ajax_url,url)
                            print(url)

            return True
        except requests.exceptions.ProxyError as e:
            print("proxy error:", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("request error:", str(e))
            update_session(proxy)
        retry -= 1
    start += 1
    if start % max_step == 0:  # refresh the session every 50 steps
        update_session()
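The three nested loops above enumerate every genre × region combination crossed with each feature, each form, and each feature+form pair. The same enumeration can be written more compactly with itertools.product — an equivalent sketch assuming the same jsdata['tag_categories'] layout (forms, genres, regions, features):

from itertools import product

def tag_combinations(tag_categories):
    # Yield the (tags, genres, countries) triples built by the nested
    # loops in spider_seed, in a flatter form.
    forms, genres, regions, features = tag_categories[:4]
    for c1, c2 in product(genres, regions):
        for c3 in features:
            yield c3, c1, c2
            for c0 in forms:
                combined = ','.join(t for t in (c3, c0) if t)  # strip empty parts
                yield combined, c1, c2
                yield c0, c1, c2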
Example #30
def task_api():
    """
    """
    retry = 5
    i = 0
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.doubantv_ajax_task_done, url):
            print(u"already done %s" % url)
            continue
        start = 0
        while True:
            url = re.sub(u'start=(\d*)', 'start=%s' % str(start * 20), url)
            print(url)
            r = requests_get(url, headers=douban_referer_tag_headers)
            if r is False or r is None:  # request failed
                print(u'failed task: %s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                continue
            try:
                r_data = json.loads(r)
            except Exception as e:
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(r)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider  ben   sleep 10 sec....")
                continue
            if len(r_data['data']) == 0:
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done%s" % origin_url)
                break
            for x in r_data['data']:
                if not rd.sismember(config.douban_tv_done,
                                    x['id']) and not rd.sismember(
                                        config.douban_tv_failed, x['id']):
                    add_task = rd.sadd(config.douban_tv_task, x['id'])
                    if add_task == 1:
                        print(
                            "---------------join task.----%s--------------------"
                            % x['id'])
                    else:
                        print(
                            '***********task repeat-******%s********************'
                            % x['id'])
                    rd.sadd(config.douban_tvids, x['id'])
            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            i += 1
            start += 1
            if i % max_step == 0:
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers,
                                timeout=timeout)
                except Exception as e:
                    pass