Example #1
def get_current_reposts(url, weibo_mid):
    """
    Main crawling routine: crawl the current weibo and its child reposts, without tracing back to the source weibo
    """
    crawler.info('crawling the weibo at url {}'.format(url))
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    result = _get_current_source(url, weibo_mid)
    if result is None:
        weibosearch_dao.update_weibo_url(weibo_mid, 2)
        return
    reposts_count, user_id, user_name = result

    if reposts_count > 0:
        soc = SpreadOtherCache()
        soc.set_id(user_id)
        soc.set_name(user_name)
        spread_other_caches.append(soc)

        page = _get_total_page(weibo_mid)
        if page == 0:
            weibosearch_dao.update_weibo_url(weibo_mid, 2)
            return

        page_counter = 0
        _crawl_loop(page, page_counter, weibo_mid, user_id, user_name,
                    spread_other_and_caches, spread_others,
                    spread_other_caches)
        _save_spread_other(spread_others, spread_other_caches, user_id)
        crawler.info(
            '一共获取了{num}条转发信息,该条微博的转发信息已经采集完成'.format(num=len(spread_others)))

    weibosearch_dao.update_weibo_url(weibo_mid, 1)
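
This example (and Example #29 later on) leans on a small SpreadOtherCache container whose definition is not shown. The class below is a minimal sketch of that assumed interface, covering only the set_id/set_name setters used here and the get_id/get_name accessors used in Example #29; the real project class may carry more fields.

class SpreadOtherCache:
    """Hypothetical minimal cache object holding a repost user's id and name."""

    def __init__(self):
        self._id = ''
        self._name = ''

    def set_id(self, user_id):
        self._id = user_id

    def set_name(self, user_name):
        self._name = user_name

    def get_id(self):
        return self._id

    def get_name(self):
        return self._name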
Example #2
def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time()*1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)

        if not comment_datas and cur_page == 1:
            crawler.warning('failed to crawl comments for weibo id {}, please check the reason'.format(mid))
            return

        save_comments(comment_datas)
        # Each page here is derived from the previous one, so a network call is a poor fit (mainly because it is cumbersome)
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('finished crawling comments for weibo {}'.format(mid))
            return
        cur_page += 1
Example #3
def _get_current_source(url, wb_mid):
    """
    :param url: url of the current weibo
    :param wb_mid: mid of the current weibo
    :return: repost count, weibo user id, user name
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts = parse_status.get_repostcounts(html)
    comments = parse_status.get_commentcounts(html)

    # Update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid,
                                          reposts=reposts,
                                          comments=comments)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    comments_count = parse_status.get_commentcounts(html)
    reposts_count = parse_status.get_repostcounts(html)
    root_user = user.get_profile(user_id)
    # Store the information of the source weibo
    spread_original_dao.save(root_user, wb_mid, post_time, device,
                             reposts_count, comments_count, root_url)

    crawler.info('this weibo has {counts} reposts'.format(counts=reposts_count))
    return reposts_count, user_id, user_name
Example #4
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('no weibo related to keyword {} was fetched this time, the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo already exists in the database; since results are sorted by time by default, an existing record means it has already been crawled, so exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('all newly updated weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # Use a network call rather than a local call here for now; weigh the benefits of both approaches
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} finished'.format(keyword))
            return
Example #5
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example #6
    def save_data(self, data):
        tasks = []
        score = None  # keep the return value defined even if data is empty
        for item in data:
            try:
                dic = {}
                uuid = item.get("uuid")
                dic["uuid"] = uuid
                dic["url"] = f"https://www.infoq.cn/article/{uuid}"
                dic["title"] = item.get("article_title")
                dic["cover"] = item.get("article_cover")
                dic["summary"] = item.get("article_summary")
                author = item.get("author")
                if author:
                    dic["author"] = author[0].get("nickname")
                else:
                    dic["author"] = item.get("no_author", "").split(":")[-1]
                score = item.get("publish_time")
                dic["publish_time"] = datetime.datetime.utcfromtimestamp(
                    score / 1000).strftime("%Y-%m-%d %H:%M:%S")
                dic["tags"] = ",".join(
                    [data.get("name") for data in item.get("topic")])
                translate = item.get("translator")
                dic["translator"] = dic["author"]
                if translate:
                    dic["translator"] = translate[0].get("nickname")
                dic["status"] = 0
                dic["update_time"] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                tasks.append(dic)
            except IndexError:
                crawler.error("failed to parse item")
        Mongo().save_data(tasks)
        crawler.info(f"add {len(tasks)} items to mongodb")
        return score
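
The save_data method above hands the collected dicts to a Mongo().save_data(...) helper that is not part of the listing. A minimal sketch of such a wrapper, assuming pymongo and placeholder database/collection names, could look like the following; the real class presumably also handles connection configuration.

import pymongo

class Mongo:
    """Hypothetical MongoDB helper assumed by Example #6; names are illustrative."""

    def __init__(self, uri="mongodb://localhost:27017", db="infoq", col="articles"):
        self.col = pymongo.MongoClient(uri)[db][col]

    def save_data(self, tasks):
        # insert_many raises on an empty list, so skip the call when nothing was parsed
        if tasks:
            self.col.insert_many(tasks)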
Example #7
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    #crawler.info(limit)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        #crawler.info(search_page)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                #continue
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example #8
    async def get_session(self,
                          url,
                          _kwargs: dict = {},
                          source_type="text",
                          status_code=200) -> Response:
        '''
        :param url: target url
        :param _kwargs: request kwargs (headers, data, params, ...); may also carry
                        "method" (default "get") and "timeout" (default 5s)
        :param source_type: "text" for the decoded body, "buff" for raw bytes
        :param status_code: expected HTTP status, default 200
        '''
        # Copy the kwargs so the shared default dict (and the caller's dict) is not mutated.
        kwargs = dict(_kwargs)
        if USE_PROXY:
            kwargs["proxy"] = await self.get_proxy()
        method = kwargs.pop("method", "get")
        timeout = kwargs.pop("timeout", 5)
        source = ""  # fall back to an empty body if the status check below does not match
        with async_timeout.timeout(timeout):
            async with getattr(self.session, method)(url, **kwargs) as req:
                status = req.status
                if status in [status_code, 201]:
                    if source_type == "text":
                        source = await req.text()
                    elif source_type == "buff":
                        source = await req.read()

        crawler.info(f"get url:{url},status:{status}")
        res = Response(status=status, source=source)
        return res
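
get_session wraps the result in a Response(status=..., source=...) value defined elsewhere in the project. A minimal sketch of such a container, assuming it only needs those two fields, is shown below.

from dataclasses import dataclass
from typing import Union

@dataclass
class Response:
    """Hypothetical response container assumed by get_session."""
    status: int = 0
    source: Union[str, bytes] = ""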
Example #9
def excute_repost_task():
    # Analyse with the current weibo as the source weibo and do not trace upward; if you need to trace back, check yourself whether this weibo is the root weibo
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('there are {} weibos whose reposts need to be crawled this round'.format(len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
Example #10
async def run(data):
    crawler.info("Start Spider")
    async with aiohttp.connector.TCPConnector(
            limit=300, force_close=True, enable_cleanup_closed=True) as tc:
        async with aiohttp.ClientSession(connector=tc) as session:
            coros = (asyncio.ensure_future(bound_fetch(item, session))
                     for item in data)
            await start_branch(coros)
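
run() schedules one bound_fetch coroutine per item and hands the futures to start_branch, neither of which appears in the listing. A plausible sketch under those assumptions is shown below: bound_fetch caps concurrency with a semaphore and start_branch simply gathers the futures. The semaphore size, the "url" key, and the fetch body are illustrative, not the project's actual values.

import asyncio

sem = asyncio.Semaphore(100)  # assumed concurrency cap, not taken from the original project

async def bound_fetch(item, session):
    # Limit how many requests run at once before delegating to the real fetch/parse logic.
    async with sem:
        async with session.get(item.get("url")) as resp:
            return await resp.text()

async def start_branch(coros):
    # Drive all scheduled futures to completion, tolerating individual failures.
    await asyncio.gather(*coros, return_exceptions=True)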
Example #11
def excute_repost_task():
    datas = weibosearch_dao.get_crawl_urls()
    crawler.info('fetched {len} weibos to crawl in total'.format(len=len(datas)))
    # Distribute the crawl tasks to the worker machines
    for data in datas:
        app.send_task('tasks.repost.get_current_reposts', args=(data['url'], data['mid']))

    crawler.info('task distribution finished for this round')
Example #12
def excute_repost_task():
    # regard current weibo url as the original url, you can also analyse from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page', args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler', routing_key='repost_info')
Example #13
async def get_buff(item, session):
    url = item.get("cover")
    with async_timeout.timeout(60):
        async with session.get(url) as r:
            if r.status == 200:
                buff = await r.read()
                if len(buff):
                    crawler.info(f"NOW_IMAGE_URL:, {url}")
                    await get_img(item, buff)
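
get_buff passes the downloaded bytes to a get_img(item, buff) coroutine that is not shown. A minimal sketch, assuming the image is simply written to disk under a name derived from the item's uuid (both the directory and the naming scheme are guesses), might be:

import os
import aiofiles

async def get_img(item, buff):
    # Hypothetical sink: write the image bytes into ./images, named after the item's uuid.
    os.makedirs("images", exist_ok=True)
    path = os.path.join("images", f"{item.get('uuid', 'unknown')}.jpg")
    async with aiofiles.open(path, "wb") as f:
        await f.write(buff)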
Example #14
    def log(*args, **kwargs):
        try:
            if f:
                crawler.info(f"{func.__name__} is run")
            return func(*args, **kwargs)
        except Exception:
            crawler.error(
                f"{func.__name__} raised an error, details: {traceback.format_exc()}"
            )
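
The log function above is only the inner wrapper of a decorator: func and f come from an enclosing scope that the listing cuts off. A sketch of the full decorator, assuming f is just a flag that switches the entry log on or off, could look like this:

import traceback
from functools import wraps

def logged(f=True):
    """Hypothetical outer decorator supplying func and the f flag to the wrapper above."""
    def decorate(func):
        @wraps(func)
        def log(*args, **kwargs):
            try:
                if f:
                    crawler.info(f"{func.__name__} is run")
                return func(*args, **kwargs)
            except Exception:
                crawler.error(
                    f"{func.__name__} raised an error, details: {traceback.format_exc()}"
                )
        return log
    return decorate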
Example #15
async def run(data):
    crawler.info("Start Spider")
    # TCPConnector keeps a connection pool and caps the total number of concurrent connections;
    # when the pool is full, a new request is admitted only after another finishes. The default is 100, and limit=0 means unlimited.
    # ClientSession builds connections through the TCPConnector; the session can be shared.
    async with aiohttp.connector.TCPConnector(
        limit=300, force_close=True, enable_cleanup_closed=True) as tc:
        async with aiohttp.ClientSession(connector=tc) as session:
            coros = (asyncio.ensure_future(bound_fetch(item, session))
                     for item in data)
            await start_branch(coros)
Example #16
def get_page(url, need_login=True):
    """
    :param url: url to be crawled
    :param need_login: if the url requires login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text

            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue
            # slow down to avoid being banned
            time.sleep(interal)

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'exceptions happened when crawling {}, specific infos are {}'.
                format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {},check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
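
Several of the get_page variants fetch a (account, cookies) pair from a Cookies pool. The sketch below shows one plausible shape for fetch_cookies under the assumption of a Redis-backed pool whose hash values are JSON cookie dicts; the key name, connection settings, and two-element return value (matching how name_cookies[0] and name_cookies[1] are used here) are illustrative only.

import json
import random
import redis

cookies_db = redis.StrictRedis(host="127.0.0.1", port=6379, db=1)  # assumed connection settings

class Cookies:
    @classmethod
    def fetch_cookies(cls):
        # Return (account_name, cookies_dict), or None when the pool is empty.
        accounts = cookies_db.hkeys("account_cookies")
        if not accounts:
            return None
        name = random.choice(accounts).decode()
        raw = cookies_db.hget("account_cookies", name)
        return name, json.loads(raw)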
Example #17
def excute_repost_task():
    # regard current weibo url as the original url, you can also analyse from the root url
    weibo_datas = wb_data.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(
        len(weibo_datas)))

    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page',
                      args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler',
                      routing_key='repost_info')
Example #18
async def run():
    '''
    Entry function
    :return:
    '''
    data = await MotorBase().find()
    crawler.info("Start Spider")
    async with aiohttp.connector.TCPConnector(
            limit=300, force_close=True, enable_cleanup_closed=True) as tc:
        async with aiohttp.ClientSession(connector=tc) as session:
            coros = (asyncio.ensure_future(bound_fetch(item, session))
                     async for item in data)
            await branch(coros)
Example #19
    async def get_proxy(self) -> Optional[str]:
        '''
        Fetch a proxy
        '''
        while True:
            proxy = await proxy_helper.get_proxy(isown=1,
                                                 protocol=2,
                                                 site='dianping')
            if proxy:
                host = proxy[0].get('ip')
                port = proxy[0].get('port')
                ip = f"http://{host}:{port}"
                return ip
            else:
                crawler.info("代理超时开始等待")

                await asyncio.sleep(5)
Example #20
def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('failed to crawl the repost url {}'.format(ajax_url))
        return 0

    crawler.info('the repost info url for this round is {}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('exception while parsing repost info from {url} as json, details: {why}'.format(url=ajax_url, why=why))
        return 0
    else:
        return total_page
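
_get_total_page expects the repost ajax endpoint to return JSON whose data.page.totalpage field carries the page count. A tiny self-test with a hand-built payload of that assumed shape illustrates the parsing step:

import json

# Illustrative payload only; the real endpoint returns far more data.
sample = json.dumps({"data": {"page": {"totalpage": 12, "pagenum": 1}}})
repost_json = json.loads(sample)
total_page = int(repost_json['data']['page']['totalpage'])
assert total_page == 12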
Example #21
    async def fetch_detail_page(self, item: dict):
        '''
        Visit the detail page and start parsing
        :param url:
        :return:

        '''
        detail_url = item.get("detail_url")
        kwargs = {"headers": DEFAULT_HEADRS}
        # Set the seed URL's status to 1 to mark that crawling has started.
        condition = {'detail_url': detail_url}
        await MotorOperation().change_status(condition, col="discogs_index_data", status_code=1)
        response = await self.get_session(detail_url, kwargs)
        if response.status == 200:
            source = response.source
            # await self.more_images(source)
            try:
                await self.get_list_info(item, detail_url, source)
            except Exception:
                crawler.info(f"failed to parse: {detail_url}")
Example #22
def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    if row.endTime:
        endTime = row.endTime.strftime('%Y-%m-%d')
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning(
                'no weibo related to keyword {} was fetched this time, the page source is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # First check whether the weibo already exists in the database; since results are sorted by time by default, an existing record means it has already been crawled, so exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('all newly updated weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # Use a network call rather than a local call here for now; weigh the benefits of both approaches
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} finished'.format(keyword))
            return
Example #23
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: whether the page may show a captcha (403 on search pages is not handled yet); False means it is an ajax request for reposts
    """
    crawler.info('the url to crawl this time is {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
            encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('crawling stopped at {curtime}'.format(curtime=time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)
            if is_404(page):
                crawler.warning('the link at url {url} does not exist'.format(url=url))
                return ''
            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the target server timed out while crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the target server refused the connection; the program sleeps for 1 minute, details: {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
Example #24
def get_all(d):
    while not d:
        crawler.info('no valid session obtained yet')
        time.sleep(60)

    datas = weibosearch_dao.get_crawl_urls()
    crawler.info('fetched {len} weibos to crawl in total'.format(len=len(datas)))

    for data in datas:
        # The session is fetched inside the loop in case a crawl queue gets too long or a weibo has too many reposts
        session = d.get('session')

        crawler.info('crawling the weibo at url {url}'.format(url=data['url']))
        _get_current_reposts(data['url'], session, data['mid'])
        weibosearch_dao.update_weibo_url(data['mid'])

    crawler.info('crawling finished for this round')
Example #25
def get_page(url, user_verify=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax requests never do; weibo or user-info pages may); False means it is an ajax request for reposts
    :return: the fetched page; returns an empty string on 404, 403, or any other exception
    """
    crawler.info('the url to crawl this time is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # Switch cookies on every retry, different from the one used last time
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookie in the cookie pool; please check whether the accounts and the login task are working. The crawler is exiting.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # Sleep after every fetch to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link at url {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached max retries for {}; check this url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #26
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax requests never do; weibo or user-info pages may); False means it is an ajax request for reposts
    :param need_login: whether crawling the page requires login; skipping login reduces pressure on the accounts
    :return: the fetched page; returns an empty string on 404, 403, or any other exception
    """
    crawler.info('the url to crawl this time is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # Switch cookies on every retry, different from the last one; if there is only one account, reusing the same cookie is allowed
            name_cookies, cookies_count = Cookies.fetch_cookies()
            
            if name_cookies is None:
                crawler.warning('no cookie in the cookie pool; checking whether any account is available')
                rs = get_login_info()

                # Log in with an account whose status is normal; if none is usable, stop the celery workers
                if len(rs) == 0:
                    crawler.error('no account is available, please check the health of your accounts')
                    # Kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('re-fetching cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            # Only switch to a different cookie on each retry when there is more than one cookie in total
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # Sleep after every fetch to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link at url {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached max retries for {}; check this url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #27
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: if it's ajax url, the value is False, else True
    :param need_login: if the url requires login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text

            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked,you should use your phone to unlock it'.
                                    format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happened when crawling {}, specific infos are {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {},check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #28
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: if it's ajax url, the value is False, else True
    :param need_login: if the url is need to login, the value is True, else False
    :return: return '' if exception happens or status_code != 200
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()
            print(name_cookies)

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # check adver_timers: skip cookies that have already hit the message quota
            if int(name_cookies[3]) >= int(adver_timers):
                continue
        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source='+str(name_cookies[2]),
                              data={'text': adver_message, 'uid':str(target_uid)},
                              cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
                else:
                    # update adver_times
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None

            #     if "$CONFIG['islogin'] = '******'" in resp.text:
            #         crawler.warning('account {} has been banned'.format(name_cookies[0]))
            #         freeze_account(name_cookies[0], 0)
            #         Cookies.delete_cookies(name_cookies[0])
            #         continue
            # # else:
            # #     resp = requests.get(url, headers=headers, timeout=time_out, verify=False)
            #
            # page = resp.text
            #
            # if page:
            #     page = page.encode('utf-8', 'ignore').decode('utf-8')
            # else:
            #     continue
            #
            # # slow down to aviod being banned
            # time.sleep(interal)
            #
            # if user_verify:
            #     if is_banned(resp.url) or is_403(page):
            #         crawler.warning('account {} has been banned'.format(name_cookies[0]))
            #         freeze_account(name_cookies[0], 0)
            #         Cookies.delete_cookies(name_cookies[0])
            #         count += 1
            #         continue
            #
            #     if 'verifybmobile' in resp.url:
            #         crawler.warning('account {} has been locked,you should use your phone to unlock it'.
            #                         format(name_cookies[0]))
            #
            #         freeze_account(name_cookies[0], -1)
            #         Cookies.delete_cookies(name_cookies[0])
            #         continue
            #
            #     if not is_complete(page):
            #         count += 1
            #         continue
            #
            #     if is_404(page):
            #         crawler.warning('send_personal_message{uid} seems to be 404'.format(uid=str(target_uid)))
            #         return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happened when sending personal message to {}, specific infos are {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)

        else:
            # Urls.store_crawl_url(url, 1)
            # return page
            return None

    crawler.warning('max tries for {},check the target_uid in redis db2'.format(target_uid))
    # Urls.store_crawl_url(url, 0)
    return ''
Example #29
def _get_current_reposts(url, session, weibo_mid):
    """
    Modified main crawling routine: because weibo rate limits are strict, only the current weibo and its child reposts are crawled, not the source weibo
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    reposts = status_parse.get_repostcounts(html)
    comments = status_parse.get_commentcounts(html)

    # Update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=weibo_mid,
                                          reposts=reposts,
                                          comments=comments)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = user.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device,
                                 reposts_count, comments_count, root_url)

        crawler.info('this weibo has {counts} reposts'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1
            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)

            crawler.info('the repost info url for this round is: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                parser.error('exception while parsing repost info from {url} as json, details: {why}'.format(
                    url=ajax_url, why=why))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        parser.error(
                            'exception while parsing repost info from {url} as json, details: {why}'.format(
                                url=ajax_url, why=why))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        # Ordering logic for the repost nodes
                        for repost_url in repost_urls:
                            repost_cont = status.get_status_info(
                                repost_url, session, user_id, user_name,
                                headers, mid)

                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)

                        for soac in spread_other_and_caches:
                            if soac.get_so().id != '':
                                spread_others.append(soac.get_so())
                                spread_other_caches.append(soac.get_soc())
                    finally:
                        print('currently on page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

                for so in spread_others:
                    if so.verify_type == '':
                        so.verify_type = 0

                    for i in spread_other_caches:
                        if so.upper_user_name == i.get_name():
                            so.upper_user_id = i.get_id()
                            break
                        else:
                            so.upper_user_id = user_id

                spread_others = list(set(spread_others))

                spread_other_dao.save(spread_others)
                crawler.info('fetched {num} repost records in total; repost crawling for this weibo is finished'.format(
                    num=len(spread_others)))
    else:
        crawler.info('{url} is a 404 page'.format(url=url))
Example #30
def send_jd_seckill_task(jd_user_string, address_string, task_id, skuId, netproxy):
    """
    """
    s = requests.session()
    s.timeout = session_timeout
    s.proxies = netproxy

    jd_user_json = json.loads(jd_user_string)
    address_json = json.loads(address_string)
    cookies_encode = jd_user_json['cookies'].encode()
    cookies_decode = base64.b64decode(cookies_encode).decode()
    # cookies_dict = json.loads(cookies_decode)

    rawdata = '__jdv=122270672|direct|-|none|-|1504798597931; o2-webp=true; TrackID=1d8yuf-8hCib8xjpwDjMwOLGCD0gmGtLEjJFNZQwBIvwskJdwUNnq1kiTmBcsfXw2nATZkxctFmE3r1fN0yVk9egAz0M5KDHytNxuRLuHtOk; pinId=7iwdYGSz99W1ffsfn98I-w; pin=xtuyaowu; thor=C3888A1807C299F45E21294E559BB739649F3F90C26DB309D58688491645C60E7745B49FBD8CD722E210B31A2EE861DAF9C0782F8A06AAF23606C377C1953E40B92BA29EED15FF5F57F2A0165047E0C44F71D5CA5FF000281EC43042F0403E24E8A7B703856EC818D09300F82CB14986EF55754C61CA47D6A3F1A6ADE7E1FE0B99D7576D0BD2721B0E8F279EE5980A2B; _tp=gs6zPQLXL133eDDGdm%2Bv%2Fg%3D%3D; _pst=xtuyaowu; ceshi3.com=000; __jda=122270672.15047985979311779686273.1504798598.1504798598.1504798598.1; __jdb=122270672.3.15047985979311779686273|1.1504798598; __jdc=122270672; __jdu=15047985979311779686273'
    cookie = SimpleCookie()
    cookie.load(cookies_decode)

    # Even though SimpleCookie is dictionary-like, it internally uses a Morsel object
    # which is incompatible with requests. Manually construct a dictionary instead.
    cookies = {}
    for key, morsel in cookie.items():
        cookies[key] = morsel.value

    crawler.info('the send_jd_seckill_task jd_user is {uid}'.format(uid=str(jd_user_string)))
    celery_stask_status = 7
    try:
        # First request: fetch the usual address list
        resp = s.get('https://marathon.jd.com/async/getUsualAddressList.action?skuId='+str(skuId), headers=headers,
                            cookies=cookies, timeout=time_out, verify=False)

        # [{
        #     "name": "冷月",
        #     "id": 138356479,
        #     "addressDetail": "广州外国语学校-凤凰大道 丰巢快递柜",
        #     "provinceId": 19,
        #     "cityId": 1601,
        #     "countyId": 50259,
        #     "townId": 51886,
        #     "mobile": "",
        #     "provinceName": "广东",
        #     "cityName": "广州市",
        #     "countyName": "南沙区",
        #     "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3",
        #     "email": "",
        #     "townName": "城区",
        #     "mobileWithXing": "131****5409"
        # }, {
        #     "name": "冷月",
        #     "id": 138359040,
        #     "addressDetail": "中信香樟墅1街12号",
        #     "provinceId": 19,
        #     "cityId": 1601,
        #     "countyId": 50284,
        #     "townId": 50451,
        #     "mobile": "",
        #     "provinceName": "广东",
        #     "cityName": "广州市",
        #     "countyName": "增城区",
        #     "mobileKey": "5fe7bdd8ce0aa7af84b7d1380d8141a3",
        #     "email": "",
        #     "townName": "中新镇",
        #     "mobileWithXing": "131****5409"
        # }]
        #
        # TODO: validate the response of the first request
        if not resp.text:
            save_task_monitor(task_id, celery_stask_status, "do not contain address")
            return None
        if '登录' in resp.text:
            save_task_monitor(task_id, celery_stask_status, "cookies失败")
            return None

        address_list = json.loads(resp.text)
        if len(address_list) >0:
            address_dict = address_list[0]
            if 'addressDetail' not in address_dict:
                crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
                save_task_monitor(task_id, celery_stask_status, resp.text)
                return None

        # TODO: seckill - the parameters still need to be confirmed
        resp = s.post('https://marathon.jd.com/seckill/submitOrder.action?skuId='+str(skuId)+'&vid= HTTP/1.1',
                      data={'orderParam.name':address_dict['name'],
                            'orderParam.addressDetail':address_dict['addressDetail'],
                            'orderParam.mobile':address_dict['mobileWithXing'],
                            'orderParam.email':address_dict['email'],
                            'orderParam.provinceId':address_dict['provinceId'],
                            'orderParam.cityId':address_dict['cityId'],
                            'orderParam.countyId':address_dict['countyId'],
                            'orderParam.townId':address_dict['townId'],
                            'orderParam.paymentType':4,
                            'orderParam.password':'',
                            'orderParam.invoiceTitle':4,
                            'orderParam.invoiceContent':1,
                            'orderParam.invoiceCompanyName':'',
                            'orderParam.invoiceTaxpayerNO':'',
                            'orderParam.usualAddressId':address_dict['id'],
                            'skuId':skuId,
                            'num':1,
                            'orderParam.provinceName':address_dict['provinceName'],
                            'orderParam.cityName':address_dict['cityName'],
                            'orderParam.countyName':address_dict['countyName'],
                            'orderParam.townName':address_dict['townName'],
                            'orderParam.codTimeType':3,
                            'orderParam.mobileKey':address_dict['mobileKey'],
                            'eid':jd_user_json['eid'],
                            'fp':jd_user_json['fp']
                            },
                             cookies=cookies, headers=personal_message_headers)

        # Validate the seckill response
        if "//marathon.jd.com/koFail.html?reason=" in resp.text:
            crawler.warning('task_id {} has been banned, resp.text is: {}'.format(task_id, resp.text))
        else:
            celery_stask_status = 8

    except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
        print(e)
        crawler.warning('excepitons happens when task_id {},specific infos are {}'.format(task_id, e))
        time.sleep(excp_interal)

    dbc = class_MongoDB.MongoClient(uri, class_logger.getLogger('MongoDB_Users'), 'JD')
    dbc.setUnique('Users', 'username')
    dbc.update('Users', {'username': jd_user_json['username']}, {'status': 2})

    save_task_monitor(task_id, celery_stask_status, resp.text)
    return ''
Example #31
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax requests never do; weibo or user-info pages may);
    False means it is an ajax request for reposts
    :param need_login: whether crawling the page requires login; skipping login reduces pressure on the accounts
    :return: the fetched page; returns an empty string on 404, 403, or any other exception
    """
    crawler.info('the url to crawl this time is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookie in the cookie pool, please check whether the accounts are working')
                other.warning('shutting down the crawler...')
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')
        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # Sleep after every fetch to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked and needs to be unlocked with its phone'.format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link at url {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached max retries for {}; check this url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #32
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax requests never do; weibo or user-info pages may); False means it is an ajax request for reposts
    :param need_login: whether crawling the page requires login; skipping login reduces pressure on the accounts
    :return: the fetched page; returns an empty string on 404, 403, or any other exception
    """
    crawler.info('the url to crawl this time is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # Switch cookies on every retry, different from the one used last time
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookie in the cookie pool; checking whether any account is available')
                rs = get_login_info()

                if len(rs) == 0:
                    crawler.error('no account is available, please check the health of your accounts')
                    # Kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If a usable account exists, log in with it. A local call is used here, which may not be
                    # ideal: if the login queue is not on this machine the call has no effect, while with a
                    # network call it is unclear how to keep logins off nodes in unusual login locations, or
                    # whether there is a better way around the remote-login restriction.
                    # TODO: weigh a network call against calling login.get_session() directly; this is probably not ideal
                    # Captchas during node login are ignored for now; for large-scale logins the login_queue
                    # nodes should be placed where the accounts usually log in
                    crawler.info('re-fetching cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # Sleep after every fetch to reduce the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the link at url {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached max retries for {}; check this url in the redis failure queue and investigate'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''