Example #1
    def get_url_body(self, tmp_url):
        '''
        Fetch the response body of a url.
        :param tmp_url: the url to crawl
        :return: str
        '''
        # Set a proxy ip
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool(
        )  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

        tmp_proxies = {
            'http': self.proxy,
        }
        # print('------>>>| Crawling with proxy ip: {} ... |<<<------'.format(self.proxy))

        try:
            response = requests.get(
                tmp_url, headers=self.headers, proxies=tmp_proxies,
                timeout=10)  # when building the request, note that query params like &xxx= outside the url also have to be constructed first
            data = response.content.decode('utf-8')
            # print(data)
        except Exception:
            print('requests.get() timed out....')
            print("today's data is empty!")
            data = '{}'
        return data
    def get_url_body(self, url):
        '''
        Fetch the required data for the given url.
        :param url: str
        :return: str
        '''
        # Set a proxy ip
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool(
        )  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

        tmp_proxies = {
            'http': self.proxy,
        }
        print('------>>>>>>| Crawling with proxy %s ...... |<<<<<<------' %
              tmp_proxies['http'])

        try:
            content = requests.get(url,
                                   headers=self.headers,
                                   proxies=tmp_proxies,
                                   timeout=12).json()  # requests' built-in json() is convenient and avoids decoding errors
        except Exception:
            content = {}

        return content.get('data', '')
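
Both get_url_body variants above repeat the same proxy-selection steps. A minimal sketch of a helper that factors them out, assuming get_proxy_ip_from_ip_pool() returns a dict shaped like {'http': ['http://ip:port', ...]} as the inline comments indicate; the name pick_random_http_proxy is hypothetical:

from random import randint

from fzutils.ip_pools import MyIpPools


def pick_random_http_proxy(ip_object):
    '''
    Hypothetical helper: pick one proxy from the pool and build the mapping
    that requests expects. Returns {} when the pool is empty so callers can
    fall back to a direct connection.
    '''
    proxies = ip_object.get_proxy_ip_from_ip_pool()  # expected: {'http': ['xx', 'yy', ...]}
    http_list = proxies.get('http') if isinstance(proxies, dict) else None
    if not http_list:
        return {}
    return {'http': http_list[randint(0, len(http_list) - 1)]}

# usage sketch:
# tmp_proxies = pick_random_http_proxy(MyIpPools())
# response = requests.get(tmp_url, headers=headers, proxies=tmp_proxies, timeout=10)
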
Example #3
    def from_ip_pool_set_proxy_ip_to_phantomjs(self):
        ip_object = MyIpPools()
        proxy_ip = ip_object._get_random_proxy_ip()
        if not proxy_ip:  # returns False on failure
            return False

        # print('------>>>| Crawling with proxy ip: {} ... |<<<------'.format(proxy_ip))
        proxy_ip = re.compile(r'http://').sub('', proxy_ip)  # strip 'http://'
        proxy_ip = proxy_ip.split(':')  # split into ['host', 'port']

        try:
            tmp_js = {
                'script':
                "phantom.setProxy('{}', {});".format(proxy_ip[0], proxy_ip[1]),
                'args': []
            }
            self.driver.command_executor._commands['executePhantomScript'] = (
                'POST', '/session/$sessionId/phantom/execute')
            self.driver.execute('executePhantomScript', tmp_js)

        except Exception:
            print('Failed to switch the proxy ip dynamically')
            return False

        return True
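
Since the method above returns True/False, a caller can retry a few times before giving up and crawling with the current ip. A small usage sketch; the wrapper name and the spider argument (any object exposing from_ip_pool_set_proxy_ip_to_phantomjs) are hypothetical:

from time import sleep


def switch_phantomjs_proxy_with_retry(spider, max_attempts=3):
    # Hypothetical wrapper: try the dynamic proxy switch a few times before
    # falling back to whatever ip the driver is currently using.
    for _ in range(max_attempts):
        if spider.from_ip_pool_set_proxy_ip_to_phantomjs():
            return True
        sleep(1)
    return False
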
Example #4
    def set_cookies_key_api_uid(self):
        '''
        Add a cookie to headers containing a key named api_uid.
        :return:
        '''
        # Set a proxy ip
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool(
        )  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

        tmp_proxies = {
            'http': self.proxy,
        }
        # Get the value of the cookie key named api_uid
        host_url = 'http://mobile.yangkeduo.com'
        try:
            response = requests.get(
                host_url,
                headers=self.headers,
                proxies=tmp_proxies,
                timeout=10)  # when building the request, note that query params like &xxx= outside the url also have to be constructed first
            api_uid = response.cookies.get('api_uid')
            # print(response.cookies.items())
            # if api_uid is None:
            #     api_uid = 'rBQh+FoXerAjQWaAEOcpAg=='
            self.headers['Cookie'] = 'api_uid=' + str(api_uid) + ';'
            # print(api_uid)
        except Exception:
            print('requests.get() timed out....')
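
One detail worth guarding against: when the request times out or the cookie is missing, the method still writes 'api_uid=None;' into the headers because it stores str(api_uid). A minimal, hypothetical check a caller could run before relying on the cookie:

def has_valid_api_uid(headers):
    # Hypothetical helper: set_cookies_key_api_uid() stores str(api_uid) even
    # when the cookie was not returned, so 'api_uid=None;' marks a failed attempt.
    cookie = headers.get('Cookie', '')
    return cookie.startswith('api_uid=') and 'api_uid=None;' not in cookie
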
Example #5
    def _init_chrome(self):
        '''
        If using chrome, set page_timeout=30.
        :return:
        '''
        print('--->>> Initializing the chrome driver <<<---')
        chrome_options = webdriver.ChromeOptions()
        # chrome_options.add_argument('--headless')     # note: with headless enabled the pages could not be accessed
        # the Chrome docs mention this flag is needed to work around a bug
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument(
            '--no-sandbox'
        )  # required when running as the root user, otherwise you get "no sandbox" errors

        # chrome_options.add_argument('window-size=1200x600')   # set the window size

        # Disable image loading
        prefs = {
            'profile.managed_default_content_settings.images': 2,
        }
        chrome_options.add_experimental_option("prefs", prefs)

        # Set the proxy
        ip_object = MyIpPools()
        random_proxy = ip_object._get_random_proxy_ip()
        proxy_ip = random_proxy.replace('http://', '') if isinstance(
            random_proxy, str) else ''
        if proxy_ip != '':
            chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))
        '''Workaround for https pages failing to open'''
        # Ignore ssl errors
        capabilities = webdriver.DesiredCapabilities.CHROME.copy()
        capabilities['acceptSslCerts'] = True
        capabilities['acceptInsecureCerts'] = True

        # Set the user-agent
        chrome_options.add_argument('--user-agent={0}'.format(
            get_random_pc_ua()))

        # Ignore certificate errors
        chrome_options.add_experimental_option('excludeSwitches',
                                               ['ignore-certificate-errors'])

        self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                                       chrome_options=chrome_options,
                                       desired_capabilities=capabilities)
        wait = ui.WebDriverWait(self.driver, 30)  # explicit wait of up to 30s, checking every 0.5s whether the page has finished loading
        print('------->>> Initialization finished <<<-------')
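
A short usage sketch for the driver created above, assuming spider is an instance of the class that owns _init_chrome() and CHROME_DRIVER_PATH points at a matching chromedriver; the helper name crawl_once is hypothetical:

def crawl_once(spider, url):
    # Hypothetical helper: drive the chrome instance created by _init_chrome()
    # and make sure the browser process is always released.
    spider._init_chrome()
    try:
        spider.driver.set_page_load_timeout(30)  # matches the docstring's advice
        spider.driver.get(url)
        return spider.driver.page_source
    finally:
        spider.driver.quit()
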
Example #6
    def from_ip_pool_set_proxy_ip_to_phantomjs(self):
        ip_object = MyIpPools()
        ip_list = ip_object.get_proxy_ip_from_ip_pool().get('http')
        proxy_ip = ''
        try:
            proxy_ip = ip_list[randint(0, len(ip_list) - 1)]        # pick a random proxy ip
        except Exception:
            print('Failed to get a random ip from the ip pool... crawling with the local ip!')
        # print('------>>>| Crawling with proxy ip: {} ... |<<<------'.format(proxy_ip))
        proxy_ip = re.compile(r'http://').sub('', proxy_ip)     # strip 'http://'
        proxy_ip = proxy_ip.split(':')                          # split into ['host', 'port']

        try:
            tmp_js = {
                'script': "phantom.setProxy('{}', {});".format(proxy_ip[0], proxy_ip[1]),
                'args': []
            }
            self.driver.command_executor._commands['executePhantomScript'] = ('POST', '/session/$sessionId/phantom/execute')
            self.driver.execute('executePhantomScript', tmp_js)
        except Exception:
            print('Failed to switch the proxy ip dynamically')
    def parse(self):
        while True:
            if self.index > 48:
                print('-' * 100 + 'One full crawl cycle finished')
                print()
                print('-' * 100 + 'About to restart crawling....')
                ip_object = MyIpPools()
                self.proxies = ip_object.get_proxy_ip_from_ip_pool(
                )  # fetch a fresh proxy pool
                self.index = 1

            else:
                sleep(5)
                tmp_number = randint(1, 8)  # pick a random number to choose a random crawl range

                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                tmp_index = 1
                for i in range(0, 49):  # loop over each category
                    bozhu = {}

                    if self.index == 49:
                        break

                    tmp_type = self.species[self.index][1]
                    number = self.species[self.index][0]

                    domain = '102803_ctg1_{}_-_ctg1_{}'.format(
                        str(number), str(number))
                    id = domain

                    tmp_pagebar_index = 0
                    tmp_pre_page_index = 1
                    tmp_page_index = 1

                    for count in self.page_range[
                            tmp_number]:  # another pitfall (most hot pages stop loading after ~30 pages): after working out the pattern, different hot pages stop returning data once you scroll past a certain page count
                        if tmp_index % 50 == 0:  # reconnect every 50 iterations to avoid a single long-lived connection hanging without a response
                            print('Resetting and establishing a new database connection...')
                            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                            print('New database connection established...')

                        if my_pipeline.is_connect_success:
                            print('============| Collecting the content of page %d ...... |' %
                                  (count + 1, ))
                            # analyze pagebar
                            #                    5              11             17
                            # pagebar: 0 1 2 3 4 none 0 1 2 3 4 none 0 1 2 3 4 none....
                            if tmp_pagebar_index > 5:  # keep the index within range
                                tmp_pagebar_index = 0
                            pagebar = str(self.pagebar[tmp_pagebar_index])

                            current_page = str(count + 1)
                            script_uri = r'/102803_ctg1_{}_-_ctg1_{}'.format(
                                str(number), str(number))
                            domain_op = domain
                            # 1506471533330
                            __rnd = str(15064) + str(randint(1, 9)) + str(
                                randint(1, 9)) + str(randint(1, 9)) + str(
                                    randint(1, 9)) + str(randint(1, 9)) + str(
                                        randint(1, 9)) + str(randint(
                                            1, 9)) + str(randint(1, 9))
                            # __rnd = str(1506471533330)
                            if (count) % 6 == 0:  # observed: pre_page increments when count is a multiple of 6
                                tmp_pre_page_index += 1
                            pre_page = str(tmp_pre_page_index)

                            if (count + 1) % 6 == 0:  # observed: page increments when count + 1 is a multiple of 6
                                tmp_page_index += 1
                            page = str(tmp_page_index)

                            url = 'https://d.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&from=faxian_hot&mod=fenlei&tab=home&pl_name=Pl_Core_NewMixFeed__3&feed_type=1&domain={}&pagebar={}&current_page={}&id={}&script_uri={}&domain_op={}&__rnd={}&pre_page={}&page={}' \
                                .format(domain, pagebar, current_page, id, script_uri, domain_op, __rnd, pre_page, page)
                            print(url)
                            sleep(2)  # wait a bit to keep weibo from redirecting the page

                            # observed pattern: after crawling a certain number of pages, weibo redirects and stops responding for a long time, hence the periodic sleep
                            # if count == 50 or count == 100 or count == 150 or count == 200 or count == 250:
                            #     print('============| >>>>>> the crawler is sleeping ...... <<<<<<')
                            #     time.sleep(100)

                            tmp_html = self.get_url_body(url=url)

                            if len(tmp_html) <= 100000:
                                print(
                                    '==========| content["data"] came back empty, the crawler will sleep briefly ....... |'
                                )
                                print('==========| Please wait, crawling will resume shortly ------>>>>>')
                                sleep(2)
                                tmp_html = self.get_url_body(url=url)
                                # print(tmp_html)

                            for item in Selector(
                                    text=tmp_html).css('div.face a').extract():
                                tmp_nick_name = Selector(text=item).css(
                                    'img::attr("title")').extract_first()
                                tmp_head_img_url = 'https:' + Selector(
                                    text=item).css(
                                        'img::attr("src")').extract_first()

                                bozhu['nick_name'] = self.wash_nick_name(
                                    nick_name=tmp_nick_name)
                                bozhu['sina_type'] = tmp_type
                                bozhu['head_img_url'] = tmp_head_img_url

                                print('---->> ', [
                                    tmp_nick_name, tmp_type, tmp_head_img_url
                                ])

                                # yield bozhu
                                my_pipeline.insert_into_sina_weibo_table(
                                    item=bozhu)
                                gc.collect()

                            print('============| Finished collecting the content of page %d |' %
                                  (count + 1, ))
                            tmp_pagebar_index += 1  # increment

                        else:
                            print('Database connection failed!')
                        tmp_index += 1
                    self.index += 1  # move on to the next category index
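
The pagination bookkeeping in parse() can be expressed separately from the crawling loop. A minimal sketch following the rules noted in the inline comments (pre_page grows when count is a multiple of 6, page grows when count + 1 is, and pagebar cycles through six slots); the default pagebar_values tuple is an assumption based on the "0 1 2 3 4 none" pattern, since self.pagebar itself is not shown here:

def weibo_page_params(counts, pagebar_values=('0', '1', '2', '3', '4', '')):
    # Hypothetical pure rewrite of the per-page bookkeeping above: yields
    # (pagebar, pre_page, page) for each count in the given iterable.
    pagebar_index, pre_page, page = 0, 1, 1
    for count in counts:
        if pagebar_index > 5:          # same reset rule as tmp_pagebar_index
            pagebar_index = 0
        if count % 6 == 0:
            pre_page += 1
        if (count + 1) % 6 == 0:
            page += 1
        yield pagebar_values[pagebar_index], str(pre_page), str(page)
        pagebar_index += 1

# usage sketch:
# for pagebar, pre_page, page in weibo_page_params(range(0, 30)):
#     ...
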
Example #8
            def getsource(url):
                payload = {
                    '_':
                    self.datetime_to_timestamp_in_milliseconds(
                        datetime.datetime.now()),
                    'mid':
                    url.replace('https://space.bilibili.com/', '')
                }
                ua = random.choice(self.uas)
                self.head['User-Agent'] = ua
                self.head['Referer'] = 'https://space.bilibili.com/' + str(
                    i) + '?from=search&seid=' + str(
                        random.randint(10000, 50000))

                # Set an ip proxy
                ip_object = MyIpPools()
                proxies = ip_object.get_proxy_ip_from_ip_pool(
                )  # {'http': ['xx', 'yy', ...]}
                if not proxies:  # bail out to avoid an error
                    return None

                proxy = proxies['http'][randint(0, len(proxies['http']) - 1)]

                tmp_proxies = {
                    'http': proxy,
                }

                try:
                    jscontent = requests.session().post(
                        url='http://space.bilibili.com/ajax/member/GetInfo',
                        headers=self.head,
                        data=payload,
                        proxies=tmp_proxies,
                        timeout=8).text
                except Exception:
                    return None

                time2 = time.time()
                try:
                    try:
                        jsDict = json.loads(jscontent)
                        statusJson = jsDict[
                            'status'] if 'status' in jsDict.keys() else False
                    except Exception:
                        return None

                    if statusJson == True:
                        if 'data' in jsDict.keys():
                            jsData = jsDict['data']
                            try:
                                mid = jsData['mid']
                                name = jsData['name']
                                # sex = jsData['sex']
                                face = jsData['face']
                            except Exception:
                                return None
                            # coins = jsData['coins']
                            # spacesta = jsData['spacesta']
                            # birthday = jsData['birthday'] if 'birthday' in jsData.keys() else 'nobirthday'
                            # place = jsData['place'] if 'place' in jsData.keys() else 'noplace'
                            # description = jsData['description']
                            # article = jsData['article']
                            # playnum = jsData['playNum']
                            # sign = jsData['sign']
                            # level = jsData['level_info']['current_level']
                            # exp = jsData['level_info']['current_exp']
                            # pprint(jsData)

                            if re.compile(
                                    r'5d2c92beb774a4bb30762538bb102d23670ae9c0.gif'
                            ).findall(face) != []:
                                return None

                            if re.compile(r'noface.gif').findall(face) != []:
                                return None

                            if name in self.db_nick_name_list:
                                print('[%d] this nick_name already exists in the db' % self.index)
                                self.index += 1
                                return None

                            print("(索引值为: %d) Succeed: " % self.index + mid +
                                  "\t" + str(time2 - time1))

                            bozhu = {
                                'nick_name': name,
                                'sina_type': 'bilibili',
                                'head_img_url': face,
                            }
                            print('---->> ', [name, 'bilibili', face])

                            my_pipeline.insert_into_sina_weibo_table(
                                item=bozhu)
                            gc.collect()

                            # try:
                            #     res = requests.get('https://api.bilibili.com/x/space/navnum?mid=' + str(mid) + '&jsonp=jsonp', headers=head, proxies=tmp_proxies).text
                            #     js_fans_data = json.loads(res)
                            #     following = js_fans_data['data']['following']
                            #     fans = js_fans_data['data']['follower']
                            # except:
                            #     following = 0
                            #     fans = 0
                        else:
                            print('no data now')
                        self.index += 1
                        # try:
                        #     conn = pymysql.connect(
                        #         host='localhost', user='******', passwd='123456', db='bilibili', charset='utf8')
                        #     cur = conn.cursor()
                        #     cur.execute('INSERT INTO bilibili_user_info(mid, name, sex, face, coins, spacesta, \
                        #     birthday, place, description, article, following, fans, playnum, sign, level, exp) \
                        #     VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")'
                        #                 % (
                        #                     mid, name, sex, face, coins, spacesta,
                        #                     birthday, place, description, article,
                        #                     following, fans, playnum, sign, level, exp
                        #                 ))
                        #     conn.commit()
                        # except Exception:
                        #     print("MySQL Error")
                    else:
                        print("Error: " + url)
                except ValueError:
                    pass
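
The two regex checks against face above can be collected into a single predicate. A minimal sketch; the names DEFAULT_FACE_PATTERNS and is_default_face are hypothetical:

import re

# Hypothetical extraction of the avatar filter used above: the crawler skips
# accounts whose face url is one of bilibili's placeholder avatars.
DEFAULT_FACE_PATTERNS = (
    re.compile(r'5d2c92beb774a4bb30762538bb102d23670ae9c0\.gif'),
    re.compile(r'noface\.gif'),
)


def is_default_face(face_url):
    return any(p.search(face_url) for p in DEFAULT_FACE_PATTERNS)
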
Example #9
# coding:utf-8
'''
@author = super_fazai
@File    : 清空ip池.py
@Time    : 2017/9/25 21:07
@connect : [email protected]
'''

from fzutils.ip_pools import MyIpPools

_ = MyIpPools()
_._empty_ip_pools()
Example #10
async def get_proxy():
    # Set a proxy ip
    ip_object = MyIpPools()
    ip_list = ip_object.get_proxy_ip_from_ip_pool()['http']
    proxy = ip_list[randint(0, len(ip_list) - 1)]
    return proxy
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--proxy-server=http://183.136.218.253:80')
chrome_options.add_argument('--headless')  # note: with headless enabled the pages could not be accessed
chrome_options.add_argument('--disable-gpu')

# Disable image loading
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option("prefs", prefs)
'''Workaround for https pages failing to open'''
# Ignore ssl errors
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
capabilities['acceptSslCerts'] = True
capabilities['acceptInsecureCerts'] = True

# Method 1: set the proxy
ip_object = MyIpPools()
random_proxy = ip_object._get_random_proxy_ip()
proxy_ip = random_proxy.replace('http://', '') if isinstance(random_proxy, str) else ''
if proxy_ip != '':
    chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

# Method 2:
# ip_object = MyIpPools()
# proxy_ip = ip_object._get_random_proxy_ip().replace('http://', '') if isinstance(ip_object._get_random_proxy_ip(), str) else ''
# # Change the proxy properties of that copy.
# capabilities['proxy'] = {
#     "httpProxy": proxy_ip,
#     "ftpProxy": proxy_ip,
#     "sslProxy": proxy_ip,
#     "noProxy": None,
#     "proxyType": "MANUAL",