Example 1
def get_url_from_web(user_id):
    """
    Get user info according to the user id.
    If the user domain is 100505, the url is simply 100505 + user id;
    if the user domain is 103505 or 100306, one more request is needed to get the info;
    if the user type is enterprise or service, only the home page info is crawled.
    :param user_id: user id
    :return: user entity, or None on failure
    """
    if not user_id:
        return None

    url = BASE_URL.format('100505', user_id)
    html = get_page(url, auth_level=1)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = BASE_URL.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
            samefollow_uid = get_samefollow_uid()
            if samefollow_uid.strip() != '':
                samefollow_uid = samefollow_uid.split(',')
                url = SAMEFOLLOW_URL.format(user_id)
                isFanHtml = get_page(url, auth_level=2)
                person.get_isFan(isFanHtml, samefollow_uid, user_id)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            UserOper.add_one(user)
            storage.info(
                'Stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None

    else:
        return None
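
A minimal usage sketch for the function above, under assumptions: the surrounding crawler package (BASE_URL, get_page, public, person, UserOper, storage and the cookie pool) is already configured, and the uid below is a placeholder rather than a real account.

# Hypothetical call; '1234567890' is a placeholder uid and the crawler
# package must already be configured (cookies, database, config constants).
user = get_url_from_web('1234567890')
if user is not None:
    print(user.name, user.verify_type, user.level)
else:
    print('no user stored: 404 page, missing name, or parsing failed')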
Example 2
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 means nothing is needed, 1 means cookies are needed but no login, 2 means a login is needed
    :param is_ajax: whether the request is an ajax request
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; returns '' when an exception is raised or all retries fail
    """
    crawler.info('Crawling url {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired'
                )
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # The same proxy address is used for both http and https.
            proxy = {
                'http': name_cookies[2],
                'https': name_cookies[2],
            }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')

        try:
            if auth_level == 2:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=COOKIES,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=TIME_OUT,
                                    verify=False,
                                    proxies=proxy)
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'Exceptions were raised when crawling {}. Here are the details: {}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(
                    name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

            if not is_ajax and not is_complete(page):
                count += 1
                continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
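
A minimal usage sketch for get_page, under assumptions: the crawler's cookie pool, proxy helper and config constants (MAX_RETRIES, TIME_OUT, INTERAL, EXCP_INTERAL) are already set up, and the weibo.cn URLs below are placeholders chosen only for illustration.

# Hypothetical calls; URLs are placeholders and assume the cookie pool
# and proxy configuration have already been initialised.
home_html = get_page('https://weibo.cn/u/1234567890', auth_level=1)      # cookies, no login
fans_html = get_page('https://weibo.cn/1234567890/fans', auth_level=2)   # needs a logged-in cookie
if home_html == '':
    # get_page returns '' on a 404 page or after MAX_RETRIES failed attempts
    pass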