Esempio n. 1
0
def task_types_fetch():
    """Consume type-page URLs from the yk_types_task redis queue.

    Pops URLs until the queue is empty, fetches each type page, parses
    its page count and seeds one paginated list URL per page into
    yk_page_task (skipping pages already done or failed).  Returns True
    once the queue is drained.
    """
    i = 0
    while True:
        type_url = rd.spop(config.yk_types_task)
        if type_url is None:
            print(u"yk_types_task sleeping 20sec....")
            return True
        # Skip URLs that were already processed or permanently failed.
        if rd.sismember(config.yk_types_failed, type_url) or \
                rd.sismember(config.yk_types_done, type_url):
            continue
        r = requests_get(url=type_url,
                         headers=youku_home_headers,
                         session=session)
        if r is False or r is None:
            print(u'filed task:%s' % type_url)
            rd.sadd(config.yk_types_failed, type_url)
            continue
        pages = parse_category_show(r, type_url)
        print("task_types_fetch data:", pages)
        # NOTE(review): xrange(1, pages) excludes the final page; if the
        # last page should be crawled too the bound ought to be
        # int(pages['pages']) + 1 -- confirm against the site layout.
        for page in xrange(1, int(pages['pages'])):
            page_url = re.sub(r'(\.html.*)',
                              '_s_1_d_1_p_{page}.html'.format(page=page),
                              type_url)
            print("task_types_fetch for :", page_url)
            if not rd.sismember(config.yk_page_failed, page_url) and \
                    not rd.sismember(config.yk_page_done, page_url):
                rd.sadd(config.yk_page_task, page_url)
        rd.sadd(config.yk_types_done, type_url)
        # Refresh the HTTP session every max_step iterations.
        i += 1
        if i % max_step == 0:
            update_session()
Esempio n. 2
0
def task_video():
    """Crawl douban TV detail pages queued in douban_tv_task.

    Pops ids until the queue is empty, fetches each detail page, parses
    it, stores the result in MongoDB, and seeds a photos task for the
    new document.  Returns True once the queue is drained.
    """
    i = 0
    while True:
        tvid = rd.spop(config.douban_tv_task)
        if tvid is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.doubantv_ajax_task_done, tvid):
            print(u"already done%s" % tvid)
            continue
        url = tv_url.format(id=tvid)
        r = requests_get(url=url, headers=douban_home_headers)
        if r is False or r is None:
            rd.sadd(config.douban_tv_failed, tvid)
            continue
        try:
            cb = check_block(r)
        except Exception as e:
            print("check_block:", str(e))
        # Anti-spider block page detection: back off and retry later.
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            delay(block_wait)
            continue
        data = parse_video(r)
        piw = piwik(page_title=page_title(r),
                    session_time=session_time,
                    origin_url=url,
                    urlref='')
        print("piw", piw)
        if data.get("title") is None:
            # A parse without a title usually means we got a block page.
            rd.sadd(config.douban_tv_failed, tvid)
            time.sleep(task_wait)
            print("------spider ben block...")
            continue
        data['doubanid'] = tvid
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)
        photostask = json.dumps({"id": tvid, "mongoTVID": str(mongo_r)})
        if not rd.sismember(config.douban_star_done, photostask) and \
                not rd.sismember(config.douban_photos_failed, photostask):
            rd.sadd(config.douban_photos_task, photostask)
        print(photostask)
        rd.sadd(config.douban_tv_done, tvid)
        print("done.. sleep %s seconds." % task_wait)
        delay()
        i += 1
        # Rotate the douban bid cookie every max_step requests.
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Esempio n. 3
0
def get_category():
    """Fetch the youku category page and seed category URL tasks.

    Retries up to 5 times on request/proxy errors (or empty parses),
    refreshing the proxied session between attempts.  Returns True on
    success; returns None if every retry fails.
    """
    start = 1
    retry = 5
    print('get_category')
    while retry > 0:
        try:
            r = requests_get(url=category_url,
                             headers=youku_home_headers,
                             timeout=timeout,
                             session=session)
            page = etree.HTML(r)
            lis = page.xpath(
                u'//label[contains(text(),"分类:")]/following-sibling::ul/li')
            o = urlparse(category_url)
            host = o.scheme + '://' + o.netloc
            # Skip lis[0] (the "all" entry) and collect name/url pairs.
            categories = []
            for x in xrange(1, len(lis)):
                categories.append({
                    "name": lis[x].find('a').text,
                    'url': host + lis[x].find('a').get('href')
                })
            print("categories:", json.dumps(categories))
            if len(categories) == 0:
                # An empty parse usually means we got blocked.
                update_session(proxy)
                retry -= 1  # count this attempt to avoid an infinite loop
                continue
            for x in categories:
                if not rd.sismember(config.yk_category_task_done, x['url']) \
                        and not rd.sismember(config.yk_category_task_failed,
                                             x['url']):
                    rd.sadd(config.yk_category_task, json.dumps(x))  # seed
                re_sadd = rd.sadd(config.yk_category_url, json.dumps(x))
                if re_sadd != 0:  # first time seen: persist it
                    youku_category.insert(x, check_keys=False)
            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
        retry -= 1
    start += 1
    # NOTE(review): this tail only runs after retries are exhausted, and
    # start is always 2 here, so the refresh below never fires -- confirm
    # whether it was meant to live inside a caller's loop.
    if start % 20 == 0:
        update_session()
Esempio n. 4
0
def go_detail_list_task():
    """Consume detail-page tasks from yk_video_detail_task.

    Each queued item is a JSON string {"url": ..., "Referer": ...}.
    Parses the detail list page, queues discovered star tasks, marks
    the URL done and persists the parsed data.  Returns True when the
    queue is empty.
    """
    i = 0
    while True:
        q = rd.spop(config.yk_video_detail_task)
        if q is None:
            print(u"yk_video_detail_task sleeping 20 sec....")
            return True
        detail_url = json.loads(q)
        if rd.sismember(config.yk_video_detail_done, detail_url['url']):
            print("pass", detail_url['url'])
            continue
        r = requests_get(detail_url['url'], headers=youku_home_headers)
        d = parse_detail_list_page(r, detail_url['url'])
        data = d['data']
        if data is False or data is None:
            rd.sadd(config.yk_video_detail_failed, q)
            continue
        for x in d['stars']:
            # Star crawl queue; the redis set deduplicates for us.
            rd.sadd(config.yk_star_task, x)
        print('detail_url done:', detail_url['url'], data)
        rd.sadd(config.yk_video_detail_done, detail_url['url'])  # finished
        youku_videos.insert(data, check_keys=False)  # save tv data
        i += 1
        # Refresh the HTTP session every max_step iterations.
        if i % max_step == 0:
            update_session()
Esempio n. 5
0
def get_detailurl_task():
    """Resolve queued show URLs into detail-list page URLs.

    Each queued item in yk_get_detailurl_task is a JSON string
    {"url": ..., "Referer": ...}.  The resolved detail URL is pushed
    onto yk_video_detail_task.  Returns True when the queue is empty.
    """
    i = 0
    while True:
        q = rd.spop(config.yk_get_detailurl_task)
        if q is None:
            print(u"yk_get_detailurl_task sleeping 20 sec")
            return True
        to_detail_url = json.loads(q)
        # Copy the shared header dict: the original aliased it, so one
        # task's Referer leaked into every request using
        # youku_home_headers.
        headers = dict(youku_home_headers)
        headers['Referer'] = to_detail_url['Referer']
        if rd.sismember(config.yk_get_detailurl_done, q):
            print("pass")
            continue
        r = requests_get(to_detail_url['url'], headers=headers)
        print("to_detail_url", to_detail_url['url'])
        detail_url = parse_tv_show(r, to_detail_url['url'])
        print("detail_url:", detail_url)
        if detail_url is False or detail_url is None:
            rd.sadd(config.yk_get_detailurl_field, q)
            continue
        if not rd.sismember(config.yk_video_detail_done, detail_url):
            red = rd.sadd(
                config.yk_video_detail_task,
                json.dumps({"url": detail_url,
                            'Referer': to_detail_url['url']}))
            if red == 1:
                print("yes")
        rd.sadd(config.yk_get_detailurl_done, q)
        i += 1
        # Refresh the HTTP session every max_step iterations.
        if i % max_step == 0:
            update_session()
Esempio n. 6
0
def task_page_fetch():
    """Extract every tv URL from one paginated category list page.

    Pops list-page URLs from yk_page_task, parses out per-show URLs and
    queues them for detail crawling, deduplicating via redis sets.
    Returns True when the queue is empty.
    """
    i = 0
    while True:
        page_url = rd.spop(config.yk_page_task)
        if page_url is None:
            print(u"task_page_fetch sleeping 20sec....")
            return True
        print("page_url", page_url)
        # Skip pages already processed or permanently failed.
        if rd.sismember(config.yk_page_failed, page_url) or \
                rd.sismember(config.yk_page_done, page_url):
            continue
        r = requests_get(url=page_url,
                         headers=youku_home_headers,
                         session=session)
        if r is False or r is None:  # fetch failed
            print(u'filed task:%s' % page_url)
            rd.sadd(config.yk_page_failed, page_url)
            continue
        print("done task_page_fetch:", page_url)
        data = parse_page_fetch(r, page_url)
        for x in data['yk_get_detailurl_task']:
            # v_show-type links that point straight at a player page.
            rd.sadd(config.yk_get_detailurl_task, json.dumps(x))
        for x in data['yk_video_detail_task']:
            rd.sadd(config.yk_video_detail_task, json.dumps(x))
        rd.sadd(config.yk_page_done, page_url)
        i += 1
        # Refresh the HTTP session every max_step iterations.
        if i % max_step == 0:
            update_session()
Esempio n. 7
0
def task_category():
    """Expand one category into its type URLs and queue them.

    Pops category tasks (JSON {"name", "url"}) from yk_category_task,
    parses the category page for its sub-types, queues each unseen type
    URL for fetching and persists it, deduplicating via redis sets.
    Returns True when the queue is empty.
    """
    i = 0
    while True:
        category = rd.spop(config.yk_category_task)
        if category is None:
            print(u"task_category sleeping....20sec")
            return True
        category = json.loads(category)
        print(category)
        r = requests_get(url=category['url'],
                         headers=youku_home_headers,
                         session=session)
        if r is False or r is None:  # fetch failed
            print(u'filed task:%s' % category['url'])
            rd.sadd(config.yk_category_task_failed, category['url'])
            continue
        data = parse_category_show(r, category['url'])
        print("category and types:", json.dumps(data))
        if len(data['types']) == 0:
            # No sub-types: treat the category page itself as a type task.
            rd.sadd(config.yk_types_task, category['url'])
        else:
            for ty in data['types']:
                type_url = data['types'][ty]
                if not rd.sismember(config.yk_types_done, type_url) and \
                        not rd.sismember(config.yk_types_failed, type_url):
                    rd.sadd(config.yk_types_task, type_url)  # fetch task
                # Redis-level dedupe before persisting the type record.
                re_sadd = rd.sadd(config.yk_types_done, type_url)
                if re_sadd == 0:  # already persisted
                    continue
                youku_video_types.insert(
                    {"name": ty,
                     "url": type_url,
                     "category": category['name']},
                    check_keys=False)
        rd.sadd(config.yk_category_task_done, category['url'])
        i += 1
        # Refresh the HTTP session every max_step iterations.
        if i % max_step == 0:
            update_session()
Esempio n. 8
0
def task_star():
    """Retry crawling douban star pages from the failed queue.

    Pops ids from douban_star_failed, fetches and parses each star
    page, saves the result to MongoDB and marks the id done.  Stops
    when the queue is empty.
    """
    i = 0
    while True:
        task = rd.spop(config.douban_star_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            break
        if rd.sismember(config.douban_star_done, task):
            print(u"already done%s" % task)
            continue
        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)
        # Guard a failed fetch before the substring test below; the
        # original raised TypeError when requests_get returned False/None.
        if r is False or r is None:
            rd.sadd(config.douban_star_failed, task)
            continue
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            delay(block_wait)
            continue
        data = parse_star(r)
        if data is False or data is None or data.get("name") is None:
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider ben sleep 20 sec...")
            continue
        data['doubanid'] = task
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)
        i += 1
        # Rotate the douban bid cookie every max_step requests.
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Esempio n. 9
0
def task_star():
    """Crawl letv star search pages queued in le_star_task.

    Each task is a JSON string mapping a star id to a name; the name is
    looked up via so_url, parsed and stored in MongoDB.  Sleeps and
    polls again when the queue is empty.
    """
    i = 0
    while True:
        task = rd.spop(config.le_star_task)
        if task is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        print(task)
        if rd.sismember(config.le_star_done, task):
            print("already done.")
            continue
        task_json = json.loads(task)
        # Single-key mapping {id: name}: search by the name value.
        # (keys()[0] indexing relies on Python 2 list-returning keys().)
        url = so_url.format(wd=task_json[task_json.keys()[0]])
        r = requests_get(url=url, headers=leso_headers)
        if r is False or r is None:  # fetch failed
            print(u'filed task:%s' % url)
            rd.sadd(config.le_star_failed, task)
            continue
        data = parse_sostar(r, task_json)
        if data is False or data is None:
            rd.sadd(config.le_star_failed, task)
            continue
        mongo_id = mongo_letv_stars.insert(data, check_keys=False)
        if mongo_id:
            rd.sadd(config.le_star_done, task)
        else:
            print(mongo_id)
            rd.sadd(config.le_star_failed, task)
        print('done.')
        i += 1
        # Refresh the HTTP session every max_step iterations.
        if i % max_step == 0:
            update_session()
Esempio n. 10
0
def task_photos():
    """Retry fetching poster photos for tv documents.

    Pops JSON tasks {"id", "mongoTVID"} from douban_photos_failed,
    downloads the photo list and replaces the 'poster' field of the
    matching MongoDB document.  Returns True when the queue is empty.
    """
    i = 0
    photos_url = u'https://movie.douban.com/subject/{id}/photos?type=R'
    while True:
        task = rd.spop(config.douban_photos_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.douban_photos_done, task):
            print(u"already done%s" % task)
            continue
        T = json.loads(task)
        url = photos_url.format(id=T['id'])
        print(url)
        data = get_photos(url=url, id=T['id'])
        print("++++++++++++++++%s+++++++++++++%s++++++++++++" %
              (task, len(data)))
        if len(data) == 0:
            continue
        print(json.dumps(data))
        # Drop the old poster field before writing the fresh list.
        mongo_douban_tvs.update({'_id': ObjectId(T['mongoTVID'])},
                                {'$unset': {
                                    'poster': 1
                                }},
                                multi=True)
        result = mongo_douban_tvs.update_one({'_id': ObjectId(T['mongoTVID'])},
                                             {'$set': {
                                                 'poster': data
                                             }})
        if result.modified_count == 0:
            rd.sadd(config.douban_photos_failed, task)
        # NOTE(review): the task is marked done even when the update
        # modified nothing -- confirm this is intentional.
        rd.sadd(config.douban_photos_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result.modified_count)
        i += 1
        # Rotate the douban bid cookie every max_step requests.
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Esempio n. 11
0
def _collect_people(page, label, list_key, joined_key, data):
    """Collect the people links following the span containing *label*.

    Fills data[list_key] with {"name", "doubanid"} dicts and
    data[joined_key] with a comma-joined name string; newly seen
    celebrity ids are queued on douban_star_task.  Leaves *data*
    untouched when the section is absent.
    """
    span = page.xpath(u'//span[contains(text(),"%s")]' % label)
    if len(span) == 0:
        return
    # lxml element truthiness == "has children", matching the original.
    links = span[0].getnext()
    if not links:
        return
    data[list_key] = []
    names = ''
    for x in links.findall('a'):
        names = names + x.text + ","
        m = re.search(u'/celebrity/(\d*)/', x.get("href"))
        if m:
            doubanid = m.group(1)
            if not rd.sismember(config.douban_star_done, doubanid) and \
                    not rd.sismember(config.douban_star_failed, doubanid):
                rd.sadd(config.douban_star_task, doubanid)
        else:
            # No celebrity id in the href: fall back to the raw link.
            doubanid = x.get("href")
        data[list_key].append({"name": x.text, "doubanid": doubanid})
    data[joined_key] = names.strip(',')


def parse_video(r):
    """Parse a douban tv/movie detail page into a flat dict.

    *r* is the raw HTML text.  Returns a dict with whichever of the
    fields (title, year, people, genres, ratings, summary, ...) could
    be extracted; sections missing from the page leave their keys out.
    """
    data = {}
    page = etree.HTML(r)
    year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
    if year:
        data['year'] = year.group(1)
    title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
    if title:
        data['title'] = title.group(1)

    # Screenwriters / directors / starring share identical markup; the
    # original duplicated this extraction three times.
    _collect_people(page, u"编剧", 'screenwriter_list', 'screenwriters', data)
    _collect_people(page, u"导演", 'directors_list', 'directors', data)
    _collect_people(page, u"主演", 'starring_list', 'starring', data)

    type_el = page.xpath(u'//span[@property="v:genre"]')  # genres
    if len(type_el) > 0:
        data['type'] = ','.join(x.text for x in type_el)

    producer_country_el = page.xpath(u'//span[contains(text(),"制片国家/地区:")]')
    if len(producer_country_el) > 0:
        data['producer_country'] = page.xpath(
            u'//span[contains(text(),"制片国家/地区:")]/following::text()[1]')[0]

    language_el = page.xpath(u'//span[contains(text(),"语言:")]')
    if len(language_el) > 0:
        data['language'] = page.xpath(
            u'//span[contains(text(),"语言:")]/following::text()[1]')[0]

    all_episode = page.xpath(u'//span[contains(text(),"集数:")]')
    if len(all_episode) > 0:
        data['all_episode'] = page.xpath(
            u'//span[contains(text(),"集数:")]/following::text()[1]')[0]

    episode_time = page.xpath(u'//span[contains(text(),"单集片长:")]')
    if len(episode_time) > 0:
        data['episode_time'] = episode_time[0].text

    # Currently selected season in the season <select>, if any.
    season = page.xpath(
        u'//select[@id="season"]/option[@selected="selected"]')
    if len(season) > 0:
        data['season'] = season[0].text

    release_date_el = page.xpath(
        u'//span[@property="v:initialReleaseDate"]')  # first air dates
    if len(release_date_el) > 0:
        data['release_date'] = '|'.join(x.text for x in release_date_el)

    duration_el = page.xpath(u'//span[@property="v:runtime"]')
    if len(duration_el) > 0:
        data['duration'] = duration_el[0].text  # runtime

    alias_al = page.xpath(u'//span[contains(text(),"又名:")]')
    if len(alias_al) > 0:
        data["alias"] = page.xpath(
            u'//span[contains(text(),"又名:")]/following::text()[1]')[0]

    IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
    if len(IMDb_el) > 0:
        data["IMDb"] = IMDb_el[0].getnext().get("href")

    rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
    if rating:
        data['rating'] = rating.group(1)

    rating_sum = page.xpath(u'//span[@property="v:votes"]')
    if len(rating_sum) > 0:
        data['rating_sum'] = rating_sum[0].text

    # Prefer the full (hidden) summary over the truncated one.
    summary_all = page.xpath(u'//span[@class="all hidden"]')
    summary = page.xpath(u'//span[@property="v:summary"]')
    if len(summary_all) > 0:
        data['summary'] = ''.join(
            page.xpath(u'//span[@class="all hidden"]/text()'))
    elif len(summary) > 0:
        data['summary'] = ''.join(
            page.xpath(u'//span[@property="v:summary"]/text()'))

    img_url = page.xpath(u'//img[@title="点击看更多海报"]')
    nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
    if len(img_url) > 0:
        data["img_url"] = img_url[0].get("src")
    elif len(nbgnbg) > 0:
        data["img_url"] = nbgnbg[0].get("href")

    tags = page.xpath(u'//div[@class="tags-body"]/a')
    data['tags'] = ','.join(x.text for x in tags)
    if len(data) == 0:
        print(r)
    return data
Esempio n. 12
0
def task_api():
    """Page through one queued douban ajax search URL.

    Pops a base URL from doubantv_ajax_task, then walks its result
    pages by rewriting the start= parameter 20 ids at a time, queueing
    every unseen tv id onto douban_tv_task until an empty page is
    returned.  Sleeps and polls again when the queue is empty.
    """
    i = 0
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.doubantv_ajax_task_done, url):
            print(u"already done%s" % url)
            continue
        start = 0
        while True:
            # Rewrite the pagination offset in place (20 ids per page).
            url = re.sub(u'start=(\d*)', 'start=%s' % str(start * 20), url)
            print(url)
            r = requests_get(url, headers=douban_referer_tag_headers)
            if r is False or r is None:  # fetch failed
                print(u'filed task:%s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                # NOTE(review): start is not advanced here, so a
                # persistently failing page retries forever -- confirm.
                continue
            try:
                r_data = json.loads(r)
            except Exception as e:
                # Non-JSON response usually means a block page.
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(r)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider  ben   sleep 10 sec....")
                continue
            if len(r_data['data']) == 0:
                # Ran off the end of the listing: the base URL is done.
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done%s" % origin_url)
                break
            for x in r_data['data']:
                if not rd.sismember(config.douban_tv_done, x['id']) and \
                        not rd.sismember(config.douban_tv_failed, x['id']):
                    add_task = rd.sadd(config.douban_tv_task, x['id'])
                    if add_task == 1:
                        print(
                            "---------------join task.----%s--------------------"
                            % x['id'])
                    else:
                        print(
                            '***********task repeat-******%s********************'
                            % x['id'])
                    rd.sadd(config.douban_tvids, x['id'])
            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            i += 1
            start += 1
            # Rotate the bid cookie every max_step requests and warm it
            # up with a best-effort ad-url request.
            if i % max_step == 0:
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers,
                                timeout=timeout)
                except Exception as e:
                    pass
Esempio n. 13
0
def spider_seed(tag_url=tag_url):
    """Fetch the douban tag categories and seed ajax search-URL tasks.

    Downloads the tag page, locates the versioned app.js bundle,
    extracts the tag_categories array from the bundled component data,
    saves it once, then enumerates genre x country x tag combinations
    into doubantv_ajax_task.  Retries up to 5 times on request errors.
    """
    start = 1
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=tag_url,
                             headers=douban_home_headers,
                             timeout=timeout)
            # page = etree.HTML(r)
            # The tag data only exists inside the app.js bundle.
            appjs_url = re.search(
                u'<script type="text/javascript" src="((.*)app\.js)"></script>',
                r).group(1)
            print(appjs_url)
            appjs = requests_get(url=appjs_url, headers=douban_appjs_headers)
            # Grab the object literal returned by the component's data().
            jsdata = re.search(
                u'mixins\:\[f\.mixin\],data\:function\(\)\{return(.*)\},ready\:function\(\)\{window',
                appjs).group(1)
            print(jsdata)
            # Strip JS-only tokens so demjson can decode the literal.
            jsdata = re.sub(u'!', '', jsdata)
            jsdata = re.sub(
                u'browserHeight:document.documentElement.clientHeight', '',
                jsdata)
            jsdata = demjson.decode(jsdata)
            save_tags = rd.sadd(config.doubantv_tags,
                                json.dumps(jsdata['tag_categories']))
            if save_tags == 1:
                # Persist only when the tag set is newly seen in redis.
                mongo_douban_tags.insert(
                    {"tag_categories": jsdata["tag_categories"]},
                    check_keys=False)  #
            ajax_list_url = u'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags={tags}&start=0&genres={genres}&countries={countries}'
            print(len(jsdata["tag_categories"][0]))
            print(len(jsdata["tag_categories"][1]))
            print(len(jsdata["tag_categories"][2]))
            print(len(jsdata["tag_categories"][3]))
            # Blank each leading "all ..." entry so it matches everything.
            jsdata["tag_categories"][1][0] = ""
            jsdata["tag_categories"][2][0] = ""
            jsdata["tag_categories"][3][0] = ""
            jsdata["tag_categories"][0][0] = ""
            for x in xrange(0, len(jsdata["tag_categories"][1])):  # all genres
                c1 = jsdata["tag_categories"][1][x]
                for xx in xrange(0,
                                 len(jsdata["tag_categories"][2])):  # all regions
                    c2 = jsdata["tag_categories"][2][xx]
                    # all features (tag2)
                    for xx in xrange(0, len(jsdata["tag_categories"][3])):
                        c3 = jsdata["tag_categories"][3][xx]
                        url = ajax_list_url.format(tags=c3,
                                                   genres=c1,
                                                   countries=c2)
                        if rd.sismember(config.doubantv_ajax_task_failed,
                                        url) == False and rd.sismember(
                                            config.doubantv_ajax_task_done,
                                            url) == False:
                            rd.sadd(config.doubantv_ajax_task, url)
                        # rd.sadd(config.doubantv_ajax_url,url)
                        print(url)
                        # all forms (tag1)
                        for xx in xrange(0, len(jsdata["tag_categories"][0])):
                            c0 = jsdata["tag_categories"][0][xx]
                            # Combine feature+form tags, trimming stray commas.
                            c3c0 = c3 + ',' + c0
                            c3c0 = re.sub(u',$', "", c3c0)
                            c3c0 = re.sub(u'^,', "", c3c0)
                            url = ajax_list_url.format(tags=c3c0,
                                                       genres=c1,
                                                       countries=c2)
                            if rd.sismember(config.doubantv_ajax_task_failed,
                                            url) == False and rd.sismember(
                                                config.doubantv_ajax_task_done,
                                                url) == False:
                                rd.sadd(config.doubantv_ajax_task, url)
                            # rd.sadd(config.doubantv_ajax_url,url)
                            url = ajax_list_url.format(tags=c0,
                                                       genres=c1,
                                                       countries=c2)
                            if rd.sismember(config.doubantv_ajax_task_failed,
                                            url) == False and rd.sismember(
                                                config.doubantv_ajax_task_done,
                                                url) == False:
                                rd.sadd(config.doubantv_ajax_task, url)
                            # rd.sadd(config.doubantv_ajax_url,url)
                            print(url)

            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
            # retry = 5
        # except requests.exceptions.InvalidProxyURL as e:
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
            # retry = 5
        retry -= 1
    start += 1
    # NOTE(review): start is always 2 here, so this refresh only fires
    # when max_step divides 2 -- confirm this tail is intentional.
    if start % max_step == 0:  # refresh session every max_step steps
        update_session()
Esempio n. 14
0
def spider_seed(category_url=category_url):
    """Fetch the LeTV category listing and seed Redis with page-crawl tasks.

    Parses the category links out of ``category_url``, then for each
    category walks its listing pages (via ``parse_all_url``) and pushes
    every discovered URL onto the ``le_page_task`` / ``le_page_urls``
    Redis sets, skipping URLs already marked done or failed.

    Returns:
        True once all categories have been seeded.  On proxy/request
        errors the whole pass is retried up to 5 times, rotating the
        proxy session each time; after the retries are exhausted the
        function falls through and returns None.
    """
    start = 1
    list_url = u'http://list.youku.com'
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=category_url,
                             headers=leshi_headers,
                             timeout=timeout)
            page = etree.HTML(r)
            category_el = page.xpath(
                u'//div[@class="list_box"]/div[@class="column"]/ul[@class="list_cnt"]/li'
            )
            categories = []
            for x in category_el:
                if x.find("a") is not None:
                    categories.append({
                        "url":
                        list_url + x.find("a").get("href"),
                        "title":
                        x.find("a").text.replace(" ", "").replace("\n", "")
                    })
                else:
                    # <li> without a link: fall back to the page we started on.
                    categories.append({
                        "url":
                        category_url,
                        "title":
                        x.text.replace(" ", "").replace("\n", "")
                    })
            print(json.dumps(categories))
            for x in categories:
                rd.sadd(config.le_page_task, x['url'])
                rd.sadd(config.le_page_urls, x['url'])
                urls = parse_all_url(x["url"])  # collect the URLs under this category
                if urls == False:
                    # BUG FIX: was ``re.sadd`` (the regex module has no sadd);
                    # record the failure in Redis via ``rd``.
                    rd.sadd(config.le_getpage_task, x["url"])  # failed to get sub-URLs
                    continue
                for xx in urls:  # walk each URL and harvest every URL on that page
                    rd.sadd(config.le_page_task, xx['url'])
                    rd.sadd(config.le_page_urls, xx['url'])
                    print(xx['url'])
                    print(xx['title'])
                    # NOTE(review): original passed ``r=r`` here — presumably a
                    # typo; fetch the page the same way as the other call sites.
                    r = requests_get(url=xx["url"], headers=leshi_headers)
                    rr_urls = parse_all_url(r)
                    if rr_urls == False:
                        # BUG FIX: was ``re.sadd`` — same typo as above.
                        rd.sadd(config.le_getpage_task,
                                x["url"])  # failed to get sub-URLs
                        continue
                    for xxx in rr_urls:
                        if rd.sismember(config.le_page_failed,
                                        xxx['url']) == True:
                            continue
                        if rd.sismember(config.le_page_done,
                                        xxx['url']) == True:
                            continue
                        rd.sadd(config.le_page_task, xxx['url'])
                        rd.sadd(config.le_page_urls, xxx['url'])
            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
        retry -= 1
    start += 1
    if start % 20 == 0:  # rotate the session every 20 steps
        update_session()
Esempio n. 15
0
def task_page():
    """Consume LeTV listing-page URLs from the ``le_page_task`` Redis set.

    For each popped page URL this worker:
      1. fetches the page and extracts its ``frontUrl`` AJAX endpoint;
      2. pages through that endpoint's JSON listing (``pn=`` parameter);
      3. lightly normalizes every show record, queues actor / director /
         starring detail tasks onto ``le_star_task``;
      4. stores the record in MongoDB (``mongo_letv_tvs``) and marks the
         show done in ``le_tv_done``.

    Runs forever, sleeping ``task_wait`` seconds when the queue is empty.
    Failed page / AJAX fetches are recorded in ``le_page_failed`` /
    ``le_page_ajax_failed``.
    """
    retry = 5
    i = 0
    while True:
        url = rd.spop(config.le_page_task)
        # url = rd.spop(config.le_page_failed)
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.le_page_done, url) == True:
            print(u"already done%s" % url)
            continue
        r = requests_get(url, headers=leshi_headers)
        if r is False or r == None:  # fetch failed
            print(u'filed task:%s' % url)
            rd.sadd(config.le_page_failed, url)
            continue
        # Pull the AJAX listing endpoint ("frontUrl") out of the page source.
        m = re.search(
            u"frontUrl\: *'(http://list\.le\.com\/getLesoData([^',]+?))',", r)
        print("task_page:", url)
        if m:
            # e.g. http://list.le.com/getLesoData?from=pc&src=1&stype=1&ps=30&pn=1&ph=420001&dt=1&cg=2&or=4&stt=1&vt=180001
            ajax_url = m.group(1)
            pn = 1
            while True:
                # Page through the listing by rewriting the pn= query parameter.
                ajax_url = re.sub(u"pn=\d*", 'pn=%s' % pn, ajax_url)
                print("ajax_url:", ajax_url)
                r = requests_get(url=ajax_url, headers=leshi_ajax_headers)
                if r == False or r == None:
                    rd.sadd(config.le_page_ajax_failed, ajax_url)
                    continue
                pn += 1
                # print(r)
                try:
                    list_data = json.loads(r)
                except Exception as e:
                    print(str(e))
                    print(r)
                    print(ajax_url)
                    rd.sadd(config.le_page_ajax_failed, ajax_url)
                    print("continue")
                    continue
                # "more" == False means we have walked past the last page.
                if list_data.get("data").get("more") == False:
                    print("this url page fetch done")
                    break
                for x in list_data.get("data").get("arr"):
                    is_done = rd.sismember(config.le_tv_done, x["unique_id"])
                    if is_done == True:
                        print("already done!")
                        print(x['name'])
                        # return False
                        continue
                    # Preliminary cleaning: note ``data`` ALIASES ``x`` (no copy),
                    # so writes below also mutate the parsed JSON record.
                    data = {}
                    data = x
                    data['created_at'] = time.time()
                    data['updated_at'] = time.time()
                    # print(json.dumps(x))
                    # data["summary"] = x['description']
                    # data["category"] = x['categoryName']
                    # data["title"] = x['name']
                    # data["alias"] = x['otherName']
                    # data["subname"] = x['subname']
                    # data["englishName"] = x['englishName']
                    # data["language"] = x['language']
                    # data["area"] = x['areaName']
                    # data["plays_num"] = x['playCount']
                    # data["le_score"] = x['rating']
                    # # data["isEnd"] = x['isEnd']
                    # data["subCategoryName"] = x['subCategoryName']
                    # data["videoTypeName"] = x['videoTypeName']
                    # data["duration"] = x['duration'] # duration: seconds for a single video, minutes-per-episode for TV series
                    # data["doubanid"] = x['doubanid'] #doubanid
                    # data["urlLink"] = x['urlLink']
                    # data["copyright"] = x['copyright']
                    # data["imgUrl"] = x['imgUrl']
                    # data["tag"] = x['tag']
                    # data["vids"] = x['vids']  # sub-episode ids
                    # data["shortDesc"] = x['shortDesc']
                    # data["monthCount"] = x['monthCount']
                    # data["intro"] = x['intro']
                    # data["publishCompany"] = x['publishCompany']
                    # data["fitAge"] = x['fitAge']
                    # data["weekCount"] = x['weekCount']
                    # data["style"] = x['style']
                    # data["letv_original_id"] = x['letv_original_id']
                    # data["global_id"] = x['global_id']
                    # data["tvTitle"] = x['tvTitle']
                    # data["videoBaseType"] = x['videoBaseType']
                    # data["pubName"] = x['pubName']
                    # data["nameQuanpin"] = x['nameQuanpin']
                    # data["nameJianpin"] = x['nameJianpin']
                    # data["allowforeign"] = x['allowforeign']
                    # data["subSrc"] = x['subSrc']
                    # data["updataInfo"] = x['updataInfo']
                    # data["downloadPlatform"] = x['downloadPlatform']
                    # data["pushFlag"] = x['pushFlag']
                    # data["payPlatform"] = x['payPlatform']
                    # data["vid"] = x['vid']
                    # data["episodes"] = x['episodes']  # episode count
                    # data["nowEpisodes"] = x['nowEpisodes'] # latest episode released so far
                    # data["ispay"] = x['ispay']
                    # data["country"] = x['country']
                    # data["videoList"] = x['videoList']
                    # try:
                    # 	data["published_at"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(x['releaseDate'])/1000)) # publish time on the LeTV platform
                    # except Exception as e:
                    # 	data["published_at"] = x['releaseDate']  # has values like -28800000, -126259200000
                    data["ctime"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(int(x['ctime']) /
                                       1000))  # LeTV ctime (ms); exact meaning unclear, to analyze
                    data["mtime"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(int(x['mtime']) /
                                       1000))  # LeTV mtime (ms); exact meaning unclear, to analyze
                    data["images"] = [{
                        "url": x['images'][k],
                        "width": k.split('*')[0],
                        "height": k.split('*')[1]
                    } for k in x['images']]  # poster images, keyed by "W*H" size
                    data["actors"] = "".join(
                        [x['actor'][it] + "," for it in x['actor']])  # actors
                    data["directors"] = "".join([
                        x['directory'][it] + "," for it in x['directory']
                    ])  # directors
                    starring_type = type(x['starring']).__name__
                    if starring_type != u'str':
                        for it in x['starring']:
                            if rd.sismember(config.le_star_done,
                                            json.dumps(it)) == True:
                                continue
                            if rd.sismember(config.le_star_failed,
                                            json.dumps(it)) == True:
                                continue
                            rd.sadd(config.le_star_task, json.dumps(it))
                        # Lead actors.  Gotcha: Python assigns mutable objects by
                        # reference, so x['starring'] and data["starring"] shared
                        # one object until this reassignment.
                        # NOTE(review): dict.keys()[0] is Python-2-only syntax.
                        data["starring"] = "".join([
                            starring[starring.keys()[0]] + ","
                            for starring in x['starring']
                        ])
                    if type(x['actor']).__name__ != u'str':
                        for it in x['actor']:
                            if rd.sismember(config.le_star_done,
                                            json.dumps({it: x['actor'][it]
                                                        })) == True:
                                continue
                            if rd.sismember(config.le_star_failed,
                                            json.dumps({it: x['actor'][it]
                                                        })) == True:
                                continue
                            print(json.dumps({it: x['actor'][it]}))
                            rd.sadd(config.le_star_task,
                                    json.dumps({it: x['actor'][it]}))
                    if type(x['directory']).__name__ != u'str':
                        for it in x['directory']:
                            if rd.sismember(
                                    config.le_star_done,
                                    json.dumps({it:
                                                x['directory'][it]})) == True:
                                continue
                            if rd.sismember(
                                    config.le_star_failed,
                                    json.dumps({it:
                                                x['directory'][it]})) == True:
                                continue
                            # NOTE(review): result of this call is discarded (no-op).
                            json.dumps({it: x['directory'][it]})
                            rd.sadd(config.le_star_task,
                                    json.dumps({it: x['directory'][it]}))
                    # print(json.dumps(data))
                    print("done!")
                    mongo_letv_tvs.insert(data, check_keys=False)  # persist raw record
                    rd.sadd(config.le_tv_done, x['unique_id'])
        else:
            print(u'filed task:%s' % url)
            rd.sadd(config.le_page_failed, url)
            continue
        # Refresh the session every max_step iterations.
        i += 1
        if i % max_step == 0:
            update_session()