Beispiel #1
0
def task_video():
    """Drain the TV-subject queue: fetch, parse and store each subject page.

    Subject ids are popped from the ``config.douban_tv_task`` Redis set. For
    each id the subject page (``tv_url``) is downloaded, a piwik page-view is
    reported, the page is parsed with ``parse_video`` and the result is
    inserted into ``mongo_douban_tvs``. A follow-up photos task is queued and
    the id is recorded in ``config.douban_tv_done``.

    Failure handling:
      * request failed or page has no title -> id goes to
        ``config.douban_tv_failed``;
      * the "abnormal requests from your IP" block page -> id is put back on
        the task set and the worker waits ``block_wait`` seconds.

    Every ``max_step`` stored subjects the session's ``bid`` cookie is
    replaced with a fresh random value.

    Returns:
        True once the task set is empty.
    """
    processed = 0
    while True:
        subject_id = rd.spop(config.douban_tv_task)
        # subject_id = rd.spop(config.douban_tv_failed)
        if subject_id is None:
            print(u"task_page sleeping....20sec")
            return True
        # Finished ids are recorded in douban_tv_done at the end of this loop.
        # (doubantv_ajax_task_done holds list URLs written by task_api, so
        # checking an id against it could never match.)
        if rd.sismember(config.douban_tv_done, subject_id):
            print(u"already done%s" % subject_id)
            continue
        url = tv_url.format(id=subject_id)
        r = requests_get(url=url, headers=douban_home_headers)
        if r is False or r is None:
            rd.sadd(config.douban_tv_failed, subject_id)
            continue
        try:
            # Return value is not used; called only for whatever checks or
            # side effects check_block performs.
            check_block(r)
        except Exception as e:
            print("check_block:", str(e))
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            # The id was already popped; put it back so it is not lost.
            rd.sadd(config.douban_tv_task, subject_id)
            delay(block_wait)
            continue
        data = parse_video(r)
        piw = piwik(page_title=page_title(r),
                    session_time=session_time,
                    origin_url=url,
                    urlref='')
        print("piw", piw)
        # Treat a falsy parse result the same as a page without a title.
        if not data or data.get("title") is None:
            rd.sadd(config.douban_tv_failed, subject_id)
            time.sleep(task_wait)
            # update_session()
            print("------spider ben block...")
            continue
        data['doubanid'] = subject_id
        # Print before inserting: the dict is still plain JSON-serialisable.
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)
        photostask = json.dumps({"id": subject_id, "mongoTVID": str(mongo_r)})
        # Photo tasks are tracked in the douban_photos_* sets (task_photos
        # writes douban_photos_done), not in douban_star_done.
        already_done = rd.sismember(config.douban_photos_done, photostask)
        already_failed = rd.sismember(config.douban_photos_failed, photostask)
        if not already_done and not already_failed:
            rd.sadd(config.douban_photos_task, photostask)
        print(photostask)
        rd.sadd(config.douban_tv_done, subject_id)
        # tv_after(id=subject_id, url=url)
        print("done.. sleep %s seconds." % task_wait)
        delay()
        processed += 1
        if processed % max_step == 0:
            # Rotate the 'bid' cookie periodically.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Beispiel #2
0
def piwik(page_title, session_time, origin_url, urlref=''):
    """Report a page view to Douban's piwik endpoint.

    Args:
        page_title: Title of the visited page (sent as ``action_name``).
        session_time: Timestamp sent as ``_idts``.
        origin_url: URL of the visited page; sent as ``url`` and also used as
            the ``Referer`` header of the report request.
        urlref: Referrer of the visited page (sent as ``urlref``).

    Returns:
        Whatever ``requests_get`` returns for the report request.
    """
    # Example of a real request:
    # https://fundin.douban.com/piwik?action_name=脱单告急 (豆瓣)&idsite=100001&rec=1&r=579246&h=20&m=14&s=21&url=https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26661189%2F&_id=7a36e03deb79996b&_idts=1525176862&_idvc=1&_idn=1&_refts=0&_viewts=1525176862&pdf=1&qt=0&realp=0&wma=0&dir=0&fla=0&java=0&gears=0&ag=0&cookie=1&res=1366x768&gt_ms=1143
    # NOTE(review): values are interpolated without URL-encoding, unlike the
    # example above where `url` is percent-encoded — confirm this is intended.
    url = u'https://fundin.douban.com/piwik?action_name={page_title}&idsite=100001&rec=1&r=579246&h=20&m=14&s=21&url={origin_url}&urlref={urlref}&_id={_id}&_idts={_idts}&_idvc=1&_idn=1&_refts=0&_viewts={_viests}&pdf=1&qt=0&realp=0&wma=0&dir=0&fla=0&java=0&gears=0&ag=0&cookie=1&res=1366x768&gt_ms=1143'
    url = url.format(page_title=page_title,
                     origin_url=origin_url,
                     _id=random_str(16, True),
                     _idts=session_time,
                     _viests=int(time.time()) + 3,
                     urlref=urlref)
    # Work on a copy: assigning Referer on the shared douban_home_headers dict
    # would leak it into every later request that uses those headers.
    headers = dict(douban_home_headers)
    headers['Referer'] = origin_url
    return requests_get(url=url, headers=headers)
Beispiel #3
0
def update_session(proxy=None):
    """Refresh the shared session's identity.

    Sets a new random ``bid`` cookie on the module-level ``session`` and
    resets the module-level ``session_time`` to the current Unix time.

    Args:
        proxy: Currently unused; the proxy-rotation code that consumed it is
            disabled. Kept so existing callers keep working.
    """
    # Without this declaration the assignment below only created a local
    # variable and the module-level session_time was never refreshed.
    global session_time
    bid = random_str(10)
    session.cookies.set('bid', bid, domain='.douban.com', path='/')
    # session.cookies['ll'] = '218319'
    # NOTE(review): session.adapters is the mapping of mounted adapters, so
    # this attribute assignment probably does not configure retries — confirm.
    session.adapters.DEFAULT_RETRIES = 5
    session_time = int(time.time())
Beispiel #4
0
def task_star():
    """Re-process celebrity ("star") tasks from the failed set.

    Tasks are popped from ``config.douban_star_failed``. For each one the
    celebrity page (``star_url``) is downloaded, parsed with ``parse_star``
    and inserted into ``mongo_douban_stars``; the task is then recorded in
    ``config.douban_star_done``.

    Failure handling:
      * request failed, or the parse result is falsy or has no ``name`` ->
        task goes back to the failed set, the session is refreshed and the
        worker sleeps 20 seconds;
      * the "abnormal requests from your IP" block page -> task goes back to
        the failed set and the worker waits ``block_wait`` seconds.

    Every ``max_step`` stored records the session's ``bid`` cookie is
    replaced. The function returns (``None``) when the set is empty.
    """
    processed = 0
    while True:
        # task = rd.spop(config.douban_star_task)
        task = rd.spop(config.douban_star_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            break
        if rd.sismember(config.douban_star_done, task):
            print(u"already done%s" % task)
            continue
        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)
        if r is False or r is None:
            # A failed request cannot be searched or parsed; fall through to
            # the common failure path below instead of raising on `in r`.
            data = None
        elif u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            # The task was already popped; put it back so it is not lost.
            rd.sadd(config.douban_star_failed, task)
            delay(block_wait)
            continue
        else:
            data = parse_star(r)
        if not data or data.get("name") is None:
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider ben sleep 20 sec...")
            continue
        data['doubanid'] = task
        # Print before inserting: the dict is still plain JSON-serialisable.
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)
        processed += 1
        if processed % max_step == 0:
            # Rotate the 'bid' cookie periodically.
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Beispiel #5
0
def task_photos():
    """Re-process poster/photo tasks from the failed set.

    Each task is a JSON string with the keys ``id`` (Douban subject id) and
    ``mongoTVID`` (id of the matching document in ``mongo_douban_tvs``). It
    is popped from ``config.douban_photos_failed``; the photo list returned
    by ``get_photos`` replaces the document's ``poster`` field, and the task
    is recorded in ``config.douban_photos_done``.

    Tasks whose photo list is empty are skipped without being re-queued. If
    the final update modified no document the task is also added to the
    failed set. Every ``max_step`` handled tasks the session's ``bid``
    cookie is replaced.

    Returns:
        True once the failed set is empty.
    """
    url_template = u'https://movie.douban.com/subject/{id}/photos?type=R'
    handled = 0
    while True:
        # (A thread lock around the pop was tried here and is disabled.)
        # task = rd.spop(config.douban_photos_task)
        task = rd.spop(config.douban_photos_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.douban_photos_done, task) == True:
            print(u"already done%s" % task)
            continue

        job = json.loads(task)
        subject_id = job['id']
        url = url_template.format(id=subject_id)
        print(url)

        data = get_photos(url=url, id=subject_id)
        photo_count = len(data)
        print("++++++++++++++++%s+++++++++++++%s++++++++++++" %
              (task, photo_count))
        if photo_count == 0:
            # Nothing found: the task is dropped (not re-queued).
            continue
        print(json.dumps(data))

        # Replace the poster field: remove the old value first, then set the
        # freshly collected list.
        tv_oid = ObjectId(job['mongoTVID'])
        mongo_douban_tvs.update({'_id': tv_oid},
                                {'$unset': {'poster': 1}},
                                multi=True)
        result = mongo_douban_tvs.update_one({'_id': tv_oid},
                                             {'$set': {'poster': data}})
        modified = result.modified_count
        if modified == 0:
            rd.sadd(config.douban_photos_failed, task)
        rd.sadd(config.douban_photos_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % modified)

        handled += 1
        if handled % max_step == 0:
            # Rotate the 'bid' cookie periodically.
            session.cookies.set('bid',
                                random_str(10),
                                domain='.douban.com',
                                path='/')
Beispiel #6
0
def task_api():
    """Walk paginated search-API URLs and enqueue the subject ids they list.

    List URLs are popped from ``config.doubantv_ajax_task``. For each one the
    ``start=`` parameter is advanced in steps of 20 until a page comes back
    with an empty ``data`` list. Every listed id that is in neither
    ``config.douban_tv_done`` nor ``config.douban_tv_failed`` is added to
    ``config.douban_tv_task`` and ``config.douban_tvids``.

    A page whose request fails, or whose body is not JSON containing a
    ``data`` key, is recorded in ``config.doubantv_ajax_task_failed`` and
    retried. After ``retry`` consecutive failures on the same list URL the
    worker gives up on it and moves to the next task.

    Every ``max_step`` fetched pages the session's ``bid`` cookie is replaced
    and ``ad_url`` is requested once (best effort).

    This function polls forever and does not return.
    """
    retry = 5  # max consecutive failures tolerated per list URL
    i = 0
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        # if rd.sismember(config.doubantv_ajax_task_done, url) == True or rd.sismember(config.doubantv_ajax_task_failed, url) == True:
        if rd.sismember(config.doubantv_ajax_task_done, url):
            print(u"already done%s" % url)
            continue
        start = 0
        failures = 0
        while True:
            # Raw string: '\d' in a normal literal is an invalid escape.
            url = re.sub(r'start=(\d*)', 'start=%s' % str(start * 20), url)
            print(url)
            r = requests_get(url, headers=douban_referer_tag_headers)
            if r is False or r is None:  # request failed
                print(u'filed task:%s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                failures += 1
                if failures >= retry:
                    print(u'giving up task:%s' % origin_url)
                    break
                continue
            try:
                # A JSON body without 'data' is treated like an unparsable
                # body instead of crashing the worker with a KeyError.
                items = json.loads(r)['data']
            except (TypeError, ValueError, KeyError) as e:
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(r)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider  ben   sleep 10 sec....")
                failures += 1
                if failures >= retry:
                    print(u'giving up task:%s' % origin_url)
                    break
                continue
            failures = 0
            if not items:
                # Empty page: the listing is exhausted.
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done%s" % origin_url)
                break
            for x in items:
                is_done = rd.sismember(config.douban_tv_done, x['id'])
                is_failed = rd.sismember(config.douban_tv_failed, x['id'])
                if not is_done and not is_failed:
                    add_task = rd.sadd(config.douban_tv_task, x['id'])
                    if add_task == 1:
                        print(
                            "---------------join task.----%s--------------------"
                            % x['id'])
                    else:
                        print(
                            '***********task repeat-******%s********************'
                            % x['id'])
                    rd.sadd(config.douban_tvids, x['id'])
            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            i += 1
            start += 1
            if i % max_step == 0:
                # Rotate the 'bid' cookie periodically.
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers,
                                timeout=timeout)
                except Exception:
                    # Best effort only: a failed ad request must not stop
                    # the crawl.
                    pass
Beispiel #7
0
# URL templates; the {placeholders} are filled in with str.format().
# Subject (TV/movie) detail page.
tv_url = u'https://movie.douban.com/subject/{id}/'
# Paginated search listing.
ajax_list_url = u'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags={tags}&start=0&genres={genres}&countries={countries}'
# Mobile API endpoint for a subject's verified users.
verify_users_url = u'https://m.douban.com/rexxar/api/v2/movie/{id}/verify_users?start=0&count=2&ck='
# Celebrity ("star") detail page.
star_url = u'https://movie.douban.com/celebrity/{id}/'

# One HTTP session shared by the whole module (cookies are set on it).
session = requests.Session()
# NOTE(review): session.adapters is the mapping of mounted adapters, so this
# attribute assignment probably does not configure retries — confirm.
session.adapters.DEFAULT_RETRIES = 5
# session.cookies[] = u'll="118318"; bid=JMjve9nh9Ug; __yadk_uid=rma3RP9OuF1JDekWWGEQLIVRGDlSc5wR; _vwo_uuid_v2=D4BE7289F6AA483D6B792C38D0EC9C2F1|992a80a86f70b1cd20ef12e7e7959793; ap=1; dbcl2="154152988:v2gmo0C6RvA"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.15415; ck=sy5V; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1525319368%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=74d116d143255fe8.1525244970.3.1525319368.1525256304.; _pk_ses.100001.4cf6=*; __utma=30149280.487994295.1525244966.1525256302.1525319370.3; __utmc=30149280; __utmz=30149280.1525319370.3.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.310121982.1525244970.1525256304.1525319370.3; __utmb=223695111.0.10.1525319370; __utmc=223695111; __utmz=223695111.1525319370.3.3.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.2.10.1525319370'
# session.get(url=home_url, headers=douban_home_headers, timeout=10)
# Timeout passed to session.get() for the ad request.
timeout = 3
proxy = ''
# Seconds slept after a failed or empty fetch.
task_wait = 0
# Seconds to wait after the "abnormal requests" block page is detected.
block_wait = 10
# The 'bid' cookie is replaced every max_step processed items.
max_step = 2  # The more threads, the smaller this should be (about 40 / thread count) to reduce blocking
# presumably the value of Douban's 'll' cookie — TODO confirm
ll = 118318
bid = random_str(10)
# Timestamp taken at import time; sent as `_idts` in the piwik report.
session_time = int(time.time())

# Ad-impression URL requested after each 'bid' rotation in the list crawler.
ad_url = u'https://erebor.douban.com/count/?ad=195767&bid={bid}&unit=dale_movie_tag_bottom_banner&type=impression'


def delay(wait=0):
    """Pause the calling thread for ``wait`` seconds (no pause by default)."""
    pause_seconds = wait
    return time.sleep(pause_seconds)


def update_session(proxy=None):
    """
    更新session
    proxy:
    """
    # if proxy != None: