def get_url_body(self, tmp_url):
    '''
    Fetch the body of the given url.
    :param tmp_url: the url to crawl
    :return: str
    '''
    # Set up a proxy ip
    ip_object = MyIpPools()
    self.proxies = ip_object.get_proxy_ip_from_ip_pool()    # {'http': ['xx', 'yy', ...]}
    self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

    tmp_proxies = {
        'http': self.proxy,
    }
    # print('------>>>| Crawling with proxy ip: {} ... |<<<------'.format(self.proxy))

    try:
        # When passing data with requests and building the headers, note that the &xxx=
        # parameters outside the url also have to be constructed first
        response = requests.get(tmp_url, headers=self.headers, proxies=tmp_proxies, timeout=10)
        data = response.content.decode('utf-8')
        # print(data)
    except Exception:
        print('requests.get() timed out....')
        print("the 'today' data is empty!")
        data = '{}'

    return data
def get_url_body(self, url):
    '''
    Fetch the required data for the given url.
    :param url: str
    :return: str
    '''
    # Set up a proxy ip
    ip_object = MyIpPools()
    self.proxies = ip_object.get_proxy_ip_from_ip_pool()    # {'http': ['xx', 'yy', ...]}
    self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

    tmp_proxies = {
        'http': self.proxy,
    }
    print('------>>>>>>| Crawling with proxy %s ...... |<<<<<<------' % tmp_proxies['http'])

    try:
        # Letting requests decode the json itself is convenient and avoids errors
        content = requests.get(url, headers=self.headers, proxies=tmp_proxies, timeout=12).json()
    except Exception:
        content = {}

    return content.get('data', '')
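# Both get_url_body() variants above repeat the same "pick a random proxy" step.
# A minimal sketch of pulling that step out, assuming get_proxy_ip_from_ip_pool()
# keeps returning {'http': ['xx', 'yy', ...]}; the helper name is hypothetical.
from random import randint
from fzutils.ip_pools import MyIpPools

def pick_random_http_proxy(ip_object):
    proxies = ip_object.get_proxy_ip_from_ip_pool()     # {'http': ['xx', 'yy', ...]}
    ip_list = proxies.get('http') or []
    if not ip_list:                                     # empty pool, so no proxy
        return None
    return {'http': ip_list[randint(0, len(ip_list) - 1)]}

# Usage sketch: tmp_proxies = pick_random_http_proxy(MyIpPools()) or {}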
def from_ip_pool_set_proxy_ip_to_phantomjs(self):
    ip_object = MyIpPools()
    proxy_ip = ip_object._get_random_proxy_ip()
    if not proxy_ip:    # return False on failure
        return False
    # print('------>>>| Crawling with proxy ip: {} ... |<<<------'.format(proxy_ip))

    proxy_ip = re.compile(r'http://').sub('', proxy_ip)    # strip the 'http://' prefix
    proxy_ip = proxy_ip.split(':')                         # split into ['host', 'port']
    try:
        tmp_js = {
            'script': 'phantom.setProxy("{}", {});'.format(proxy_ip[0], proxy_ip[1]),
            'args': []
        }
        self.driver.command_executor._commands['executePhantomScript'] = (
            'POST', '/session/$sessionId/phantom/execute')
        self.driver.execute('executePhantomScript', tmp_js)
    except Exception:
        print('Dynamic ip switch failed')
        return False

    return True
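# A minimal usage sketch for the helper above, assuming self.driver is a PhantomJS
# instance and that the proxy should be rotated before each page load; the method
# name, retry count and return value are illustrative, not part of the original code.
def get_page_with_fresh_proxy(self, url, max_retries=3):
    for _ in range(max_retries):
        if self.from_ip_pool_set_proxy_ip_to_phantomjs():   # True once a proxy is applied
            break
    self.driver.get(url)
    return self.driver.page_source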
def set_cookies_key_api_uid(self):
    '''
    Add a cookie to headers that contains a key named api_uid.
    :return:
    '''
    # Set up a proxy ip
    ip_object = MyIpPools()
    self.proxies = ip_object.get_proxy_ip_from_ip_pool()    # {'http': ['xx', 'yy', ...]}
    self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

    tmp_proxies = {
        'http': self.proxy,
    }

    # Read the value of the cookie key named api_uid
    host_url = 'http://mobile.yangkeduo.com'
    try:
        # When passing data with requests and building the headers, note that the &xxx=
        # parameters outside the url also have to be constructed first
        response = requests.get(host_url, headers=self.headers, proxies=tmp_proxies, timeout=10)
        api_uid = response.cookies.get('api_uid')
        # print(response.cookies.items())
        # if api_uid is None:
        #     api_uid = 'rBQh+FoXerAjQWaAEOcpAg=='
        self.headers['Cookie'] = 'api_uid=' + str(api_uid) + ';'
        # print(api_uid)
    except Exception:
        print('requests.get() timed out....')
        pass
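# An alternative minimal sketch, assuming the same headers and proxies as above:
# a requests.Session keeps cookies such as api_uid automatically, so the Cookie
# header would not need to be rebuilt by hand; the function name is hypothetical.
import requests

def make_session_with_api_uid(headers, tmp_proxies):
    s = requests.Session()
    s.headers.update(headers)
    # the api_uid cookie set by the response now lives in s.cookies and is sent on later requests
    s.get('http://mobile.yangkeduo.com', proxies=tmp_proxies, timeout=10)
    return s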
def _init_chrome(self):
    '''
    When using chrome, set page_timeout=30.
    :return:
    '''
    print('--->>> initializing the chrome driver <<<---')
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')  # NOTE: with headless set, the page cannot be reached
    # Google's docs mention this flag is needed to work around a bug
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')  # required when running as root user, otherwise you get "no sandbox" errors
    # chrome_options.add_argument('window-size=1200x600')  # set the window size

    # Disable image loading
    prefs = {
        'profile.managed_default_content_settings.images': 2,
    }
    chrome_options.add_experimental_option('prefs', prefs)

    # Set a proxy
    ip_object = MyIpPools()
    proxy_ip = ip_object._get_random_proxy_ip()
    proxy_ip = proxy_ip.replace('http://', '') if isinstance(proxy_ip, str) else ''
    if proxy_ip != '':
        chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

    '''Workaround for https pages that fail to open'''
    # Ignore ssl errors
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    # Override the user-agent
    chrome_options.add_argument('--user-agent={0}'.format(get_random_pc_ua()))
    # Ignore certificate errors
    chrome_options.add_experimental_option('excludeSwitches', ['ignore-certificate-errors'])

    self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                                   chrome_options=chrome_options,
                                   desired_capabilities=capabilities)
    wait = ui.WebDriverWait(self.driver, 30)    # explicit wait of up to n seconds, checking every 0.5s whether the page has finished loading
    print('------->>> initialization complete <<<-------')
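# A minimal usage sketch, assuming _init_chrome() belongs to a crawler class that
# exposes self.driver; the method name, target url and wait condition are illustrative,
# and the element lookup follows the Selenium 3 API used in these snippets.
from selenium.webdriver.support import ui

def get_page_source(self, url):
    self.driver.get(url)
    # explicit wait: block until <body> is present (or 30s elapse), then read the html
    ui.WebDriverWait(self.driver, 30).until(
        lambda driver: driver.find_element_by_tag_name('body'))
    return self.driver.page_source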
def from_ip_pool_set_proxy_ip_to_phantomjs(self):
    ip_object = MyIpPools()
    ip_list = ip_object.get_proxy_ip_from_ip_pool().get('http')
    proxy_ip = ''
    try:
        proxy_ip = ip_list[randint(0, len(ip_list) - 1)]    # pick a random proxy ip
    except Exception:
        print('Failed to get a random ip from the ip pool... crawling with the local ip instead!')
        return
    # print('------>>>| Crawling with proxy ip: {} ... |<<<------'.format(proxy_ip))

    proxy_ip = re.compile(r'http://').sub('', proxy_ip)    # strip the 'http://' prefix
    proxy_ip = proxy_ip.split(':')                         # split into ['host', 'port']
    try:
        tmp_js = {
            'script': 'phantom.setProxy("{}", {});'.format(proxy_ip[0], proxy_ip[1]),
            'args': []
        }
        self.driver.command_executor._commands['executePhantomScript'] = (
            'POST', '/session/$sessionId/phantom/execute')
        self.driver.execute('executePhantomScript', tmp_js)
    except Exception:
        print('Dynamic ip switch failed')
        pass
def parse(self):
    while True:
        if self.index > 48:
            print('-' * 100 + 'one full crawl loop finished')
            print()
            print('-' * 100 + 'about to start crawling again....')
            ip_object = MyIpPools()
            self.proxies = ip_object.get_proxy_ip_from_ip_pool()    # fetch a fresh proxy pool
            self.index = 1
        else:
            sleep(5)
            tmp_number = randint(1, 8)    # random number used to pick a random crawl range
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            tmp_index = 1
            for i in range(0, 49):    # loop over each category
                bozhu = {}
                if self.index == 49:
                    break
                tmp_type = self.species[self.index][1]
                number = self.species[self.index][0]
                domain = '102803_ctg1_{}_-_ctg1_{}'.format(str(number), str(number))
                id = domain
                tmp_pagebar_index = 0
                tmp_pre_page_index = 1
                tmp_page_index = 1
                # Another pitfall (most hot pages stop loading after ~30 pages): after working out
                # the pattern, it turns out that on different hot pages, scrolling past a certain
                # page count simply stops returning data... bad luck
                for count in self.page_range[tmp_number]:
                    if tmp_index % 50 == 0:    # reconnect every 50 requests to avoid a long idle connection erroring out
                        print('Resetting and opening a new database connection...')
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        print('New database connection established...')

                    if my_pipeline.is_connect_success:
                        print('============| Crawling the content of page %d ...... |' % (count + 1,))
                        # Analysing pagebar
                        #            5              11             17
                        # pagebar: 0 1 2 3 4 (none) 0 1 2 3 4 (none) 0 1 2 3 4 (none)....
                        if tmp_pagebar_index > 5:    # keep it from exceeding 5
                            tmp_pagebar_index = 0
                        pagebar = str(self.pagebar[tmp_pagebar_index])
                        current_page = str(count + 1)
                        script_uri = r'/102803_ctg1_{}_-_ctg1_{}'.format(str(number), str(number))
                        domain_op = domain
                        # 1506471533330
                        __rnd = str(15064) + ''.join(str(randint(1, 9)) for _ in range(8))
                        # __rnd = str(1506471533330)
                        if (count) % 6 == 0:        # observed: pre_page increases by 1 whenever count is a multiple of 6
                            tmp_pre_page_index += 1
                        pre_page = str(tmp_pre_page_index)
                        if (count + 1) % 6 == 0:    # observed: page increases by 1 whenever count+1 is a multiple of 6
                            tmp_page_index += 1
                        page = str(tmp_page_index)
                        url = 'https://d.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&from=faxian_hot&mod=fenlei&tab=home&pl_name=Pl_Core_NewMixFeed__3&feed_type=1&domain={}&pagebar={}&current_page={}&id={}&script_uri={}&domain_op={}&__rnd={}&pre_page={}&page={}' \
                            .format(domain, pagebar, current_page, id, script_uri, domain_op, __rnd, pre_page, page)
                        print(url)

                        sleep(2)    # wait a bit so weibo does not redirect the page
                        # Observed pattern: after crawling a certain number of pages the site redirects
                        # and stops responding for a long time, hence the periodic sleeps
                        # if count == 50 or count == 100 or count == 150 or count == 200 or count == 250:
                        #     print('============| >>>>>> the crawler is sleeping ...... <<<<<<')
                        #     time.sleep(100)
                        tmp_html = self.get_url_body(url=url)
                        if len(tmp_html) <= 100000:
                            print('==========| the returned content["data"] is empty, the crawler is taking a short nap ....... |')
                            print('==========| please wait, crawling will resume shortly ------>>>>>')
                            sleep(2)
                            tmp_html = self.get_url_body(url=url)
                        # print(tmp_html)

                        for item in Selector(text=tmp_html).css('div.face a').extract():
                            tmp_nick_name = Selector(text=item).css('img::attr("title")').extract_first()
                            tmp_head_img_url = 'https:' + Selector(text=item).css('img::attr("src")').extract_first()
                            bozhu['nick_name'] = self.wash_nick_name(nick_name=tmp_nick_name)
                            bozhu['sina_type'] = tmp_type
                            bozhu['head_img_url'] = tmp_head_img_url
                            print('---->> ', [tmp_nick_name, tmp_type, tmp_head_img_url])
                            # yield bozhu
                            my_pipeline.insert_into_sina_weibo_table(item=bozhu)
                            gc.collect()
                        print('============| Crawling the content of page %d done |' % (count + 1,))
                        tmp_pagebar_index += 1    # increment by 1
                    else:
                        print('Database connection failed!')
                        pass
                    tmp_index += 1
                self.index += 1    # move on to the next index
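# The __rnd value above imitates a millisecond timestamp (e.g. 1506471533330) by
# concatenating random digits. A minimal sketch of deriving it from the real clock
# instead, assuming the parameter really is just the current time in milliseconds:
import time

def make_rnd():
    return str(int(time.time() * 1000))    # e.g. '1506471533330'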
def getsource(url):
    payload = {
        '_': self.datetime_to_timestamp_in_milliseconds(datetime.datetime.now()),
        'mid': url.replace('https://space.bilibili.com/', '')
    }
    ua = random.choice(self.uas)
    self.head['User-Agent'] = ua
    self.head['Referer'] = 'https://space.bilibili.com/' + str(i) + '?from=search&seid=' + str(random.randint(10000, 50000))

    # Set up an ip proxy
    ip_object = MyIpPools()
    proxies = ip_object.get_proxy_ip_from_ip_pool()    # {'http': ['xx', 'yy', ...]}
    if not proxies:    # bail out to avoid errors
        return None
    proxy = proxies['http'][randint(0, len(proxies['http']) - 1)]

    tmp_proxies = {
        'http': proxy,
    }

    try:
        jscontent = requests.session().post(
            url='http://space.bilibili.com/ajax/member/GetInfo',
            headers=self.head,
            data=payload,
            proxies=tmp_proxies,
            timeout=8).text
    except Exception:
        return None

    time2 = time.time()
    try:
        try:
            jsDict = json.loads(jscontent)
            statusJson = jsDict['status'] if 'status' in jsDict.keys() else False
        except Exception:
            return None

        if statusJson == True:
            if 'data' in jsDict.keys():
                jsData = jsDict['data']
                try:
                    mid = jsData['mid']
                    name = jsData['name']
                    # sex = jsData['sex']
                    face = jsData['face']
                except Exception:
                    return None
                # coins = jsData['coins']
                # spacesta = jsData['spacesta']
                # birthday = jsData['birthday'] if 'birthday' in jsData.keys() else 'nobirthday'
                # place = jsData['place'] if 'place' in jsData.keys() else 'noplace'
                # description = jsData['description']
                # article = jsData['article']
                # playnum = jsData['playNum']
                # sign = jsData['sign']
                # level = jsData['level_info']['current_level']
                # exp = jsData['level_info']['current_exp']
                # pprint(jsData)

                # Skip accounts that still use a default avatar
                if re.compile(r'5d2c92beb774a4bb30762538bb102d23670ae9c0.gif').findall(face) != []:
                    return None
                if re.compile(r'noface.gif').findall(face) != []:
                    return None

                if name in self.db_nick_name_list:
                    print('[%d] this nick_name already exists in the db' % self.index)
                    self.index += 1
                    return None

                print('(index value: %d) Succeed: ' % self.index + mid + '\t' + str(time2 - time1))
                bozhu = {
                    'nick_name': name,
                    'sina_type': 'bilibili',
                    'head_img_url': face,
                }
                print('---->> ', [name, 'bilibili', face])
                my_pipeline.insert_into_sina_weibo_table(item=bozhu)
                gc.collect()
                # try:
                #     res = requests.get('https://api.bilibili.com/x/space/navnum?mid=' + str(mid) + '&jsonp=jsonp', headers=head, proxies=tmp_proxies).text
                #     js_fans_data = json.loads(res)
                #     following = js_fans_data['data']['following']
                #     fans = js_fans_data['data']['follower']
                # except:
                #     following = 0
                #     fans = 0
            else:
                print('no data now')
            self.index += 1
            # try:
            #     conn = pymysql.connect(
            #         host='localhost', user='******', passwd='123456', db='bilibili', charset='utf8')
            #     cur = conn.cursor()
            #     cur.execute('INSERT INTO bilibili_user_info(mid, name, sex, face, coins, spacesta, \
            #         birthday, place, description, article, following, fans, playnum, sign, level, exp) \
            #         VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")'
            #         % (
            #             mid, name, sex, face, coins, spacesta,
            #             birthday, place, description, article,
            #             following, fans, playnum, sign, level, exp
            #         ))
            #     conn.commit()
            # except Exception:
            #     print("MySQL Error")
        else:
            print("Error: " + url)
    except ValueError:
        pass
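# The two regex checks on `face` above only test for a fixed substring. A minimal
# equivalent sketch using plain substring membership, reusing the same default-avatar
# filenames; the helper name is hypothetical.
DEFAULT_AVATARS = ('5d2c92beb774a4bb30762538bb102d23670ae9c0.gif', 'noface.gif')

def is_default_avatar(face_url):
    # True when the avatar url points at one of the known default images
    return any(name in face_url for name in DEFAULT_AVATARS)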
# coding:utf-8

'''
@author = super_fazai
@File : 清空ip池.py
@Time : 2017/9/25 21:07
@connect : [email protected]
'''

from fzutils.ip_pools import MyIpPools

_ = MyIpPools()
_._empty_ip_pools()
async def get_proxy():
    # Set up a proxy ip
    ip_object = MyIpPools()
    ip_list = ip_object.get_proxy_ip_from_ip_pool()['http']
    proxy = ip_list[randint(0, len(ip_list) - 1)]

    return proxy
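# A minimal sketch of using get_proxy() together with aiohttp, assuming the pool
# entries already carry the 'http://' scheme (other snippets above strip that prefix
# before use); the function name and target url are illustrative.
import aiohttp

async def fetch(url):
    proxy = await get_proxy()
    async with aiohttp.ClientSession() as session:
        # aiohttp supports http proxies passed per-request via proxy=
        async with session.get(url, proxy=proxy) as resp:
            return await resp.text()

# e.g. asyncio.get_event_loop().run_until_complete(fetch('https://www.example.com'))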
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--proxy-server=http://183.136.218.253:80')
chrome_options.add_argument('--headless')    # NOTE: with headless set, the page cannot be reached
chrome_options.add_argument('--disable-gpu')

# Disable image loading
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs', prefs)

'''Workaround for https pages that fail to open'''
# Ignore ssl errors
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
capabilities['acceptSslCerts'] = True
capabilities['acceptInsecureCerts'] = True

# Method 1: set the proxy via a command-line switch
ip_object = MyIpPools()
proxy_ip = ip_object._get_random_proxy_ip()
proxy_ip = proxy_ip.replace('http://', '') if isinstance(proxy_ip, str) else ''
if proxy_ip != '':
    chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

# Method 2: set the proxy via the capabilities copy
# ip_object = MyIpPools()
# proxy_ip = ip_object._get_random_proxy_ip().replace('http://', '') if isinstance(ip_object._get_random_proxy_ip(), str) else ''
# # Change the proxy properties of that copy.
# capabilities['proxy'] = {
#     "httpProxy": proxy_ip,
#     "ftpProxy": proxy_ip,
#     "sslProxy": proxy_ip,
#     "noProxy": None,
#     "proxyType": "MANUAL",
# }
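# A minimal sketch of building the driver from the options and capabilities above,
# mirroring the _init_chrome() snippet; CHROME_DRIVER_PATH is assumed to point at a
# local chromedriver binary, and the target url is illustrative.
driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                          chrome_options=chrome_options,
                          desired_capabilities=capabilities)
driver.get('https://www.example.com')
print(driver.title)
driver.quit()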