def you_get(url, print_info, extra_args):
    try:
        command = ['you-get', '-u']
        if print_info:
            command.append('-i')
        if extra_args:
            command.append(extra_args)
        command.append(url)
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        try:
            output = process.communicate()[0]
            # detect() may report encoding=None, so fall back to utf-8 explicitly;
            # dict.get('encoding', 'utf-8') would still return the None value.
            output = output.decode(chardet.detect(output).get('encoding') or 'utf-8', 'replace')
        except KeyboardInterrupt:
            process.terminate()
            return '', []
        if print_info:
            print(output)
            return '', []
        name_match = re.compile(r'title:\s*(.*?)(\r|\n)').search(output)
        name = name_match.group(1) if name_match else 'Unknown'
        url_re = re.compile(r'(http.*?)(\r|\n)')
        url_match = url_re.search(output)
        video_url = []
        while url_match:
            video_url.append(url_match.group(1))
            url_match = url_re.search(output, url_match.end(0))
        return name, video_url
    except Exception as e:
        logger.error('parse video failed {}'.format(e))
        return '', []
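# A minimal usage sketch for the wrapper above (the URL is hypothetical; it
# assumes the you-get CLI is on PATH and that subprocess, re, chardet, and a
# module-level logger are already imported):
#
#     name, video_urls = you_get('https://example.com/video', print_info=False, extra_args='')
#     print(name, video_urls)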
def get_content_ip(url, proxy, data=None):
    """Fetch a page's HTML through the given proxy IP."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    }
    # Create a ProxyHandler, build an opener around it, and install it globally.
    proxy_support = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
    # Randomized timeout
    timeout = random.choice(range(80, 180))
    while True:
        try:
            req = request.Request(url, headers=header)
            response = request.urlopen(req, timeout=timeout)
            html = response.read()
            c = chardet.detect(html)
            # print(c)
            print(response.status)
            # html = html.decode(c["encoding"])
            break
        # Retry after a random pause on HTTP and URL errors.
        except error.HTTPError as e:
            print('HTTPError: no such resource', e)
            time.sleep(random.choice(range(10, 30)))
        except error.URLError as e:
            print('URLError: site unreachable', e)
            time.sleep(random.choice(range(10, 30)))
    return html
def check_public_ip_info():
    url = 'https://checkip.amazonaws.com'
    try:
        opener = util.http_opener()
        response = opener.open(url, timeout=2000)
        # Raw bytes of the response body
        resp_bytes = response.read()
        # Guess the charset, falling back to utf-8 if detection fails
        charset = chardet.detect(resp_bytes)['encoding'] or 'utf-8'
        # Decode to a string
        content = str(resp_bytes, charset)
        _content = content.replace('\n', '')
        if is_ipv4_address(_content):
            return _content
    except Exception as e:
        print(e)
        logger.log(e)
def get_content(url, data=None):
    """Fetch a page's HTML."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    }
    # Randomized timeout
    timeout = random.choice(range(80, 180))
    while True:
        try:
            req = request.Request(url, headers=header)
            response = request.urlopen(req, timeout=timeout)
            html = response.read()
            # Detect the page encoding before decoding
            c = chardet.detect(html)
            print(response.status)
            html = html.decode(c["encoding"])
            break
        # Retry after a random pause on HTTP and URL errors.
        except error.HTTPError as e:
            print('HTTPError: no such resource', e)
            time.sleep(random.choice(range(10, 30)))
        except error.URLError as e:
            print('URLError: site unreachable', e)
            time.sleep(random.choice(range(10, 30)))
    return html
def get_public_ip_info():
    """
    Query ip138 for the public IP address and location info.
    :return: dict with 'ip' and 'info' keys
    """
    url = 'http://2019.ip138.com/ic.asp'
    try:
        opener = util.http_opener()
        response = opener.open(url, timeout=2000)
        # Raw bytes of the response body
        resp_bytes = response.read()
        charset = chardet.detect(resp_bytes)['encoding']
        # Decode to a string
        html = resp_bytes.decode(charset)
        if 'html' in response.getheader('Content-Type'):
            # The info line sits inside the page's <center> element
            info_reg = r'(?<=<center>).*?(?=</center>)'
            info_all_match = re.findall(info_reg, html, re.S | re.M)
            info = info_all_match[0]
            # The address itself is wrapped in square brackets
            ip_reg = r'(?<=\[).*?(?=\])'
            ip_matched = re.findall(ip_reg, info, re.S | re.M)
            return {'ip': ip_matched[0], 'info': info}
    except Exception as e:
        print(e)
        logger.log(e)
def getPlayerInfo(playerid):
    html = getHtml('www.csgola.com', '/player/' + playerid)
    q = pyq(html)
    avatar = q('img.avatar.center-block.img-responsive').attr('src')
    playername = q('.personaname').text()
    statTit = q('.col-md-10 .title').text().encode('utf-8')
    statVal = q('.col-md-10 .datala')
    chartVal = q('.polar-detail .datala')
    statiscsName = q('.list-group .list-group-item span.stats-title')
    staticsData = q('.list-group .list-group-item span.stats-count.pull-right')
    print(pyq(statiscsName[0]).text())
    print(chardet.detect(pyq(statiscsName[0]).text().encode('utf-8')))
    result = {
        'error': 0,
        'playerinfo': {
            'avatar': avatar,
            'name': playername,
        },
        'stats': {
            'jishashu': pyq(statVal[0]).text().encode('utf-8'),
            'baotoulv': pyq(statVal[1]).text().encode('utf-8'),
            'kd': pyq(statVal[2]).text().encode('utf-8'),
            'shenglv': pyq(statVal[3]).text().encode('utf-8'),
            'zhengwangshu': pyq(statVal[4]).text().encode('utf-8'),
            'mingzhonglv': pyq(statVal[5]).text().encode('utf-8'),
            'juanzengwuqi': pyq(statVal[6]).text().encode('utf-8'),
            'mvpcishu': pyq(statVal[7]).text().encode('utf-8'),
        },
        'chart': {
            'zonghe': pyq(chartVal[0]).text().encode('utf-8'),
            'kd': pyq(chartVal[1]).text().encode('utf-8'),
            'mingzhonglv': pyq(chartVal[2]).text().encode('utf-8'),
            'baotoulv': pyq(chartVal[3]).text().encode('utf-8'),
            'shenglv': pyq(chartVal[4]).text().encode('utf-8'),
        },
        # Pair each of the 19 stat titles with its value instead of
        # hand-writing every entry.
        'staData': {pyq(statiscsName[i]).text(): pyq(staticsData[i]).text()
                    for i in range(19)},
    }
    return result
def getProvince(mainUrl):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    req = urllib2.Request(mainUrl, headers=headers)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    char_type = chardet.detect(respHtml)
    # print char_type
    # The page is GBK-encoded; re-encode it as UTF-8.
    respHtml = unicode(respHtml, "GBK").encode("utf8")
    # pattern = re.compile(u'<ul class="interval01-list">')
    # results = respHtml[respHtml.rfind('<ul class="interval01-list">') + 1:respHtml.rfind('<!--有参数配置 start-->')]
    print respHtml


print getProvince("https://car.autohome.com.cn/price/brand-25.html")
def get_public_ip_info_2():
    url = 'http://ip.chinaz.com'
    try:
        opener = util.http_opener()
        response = opener.open(url, timeout=2000)
        # Raw bytes of the response body
        resp_bytes = response.read()
        charset = chardet.detect(resp_bytes)['encoding']
        # Decode to a string
        html = resp_bytes.decode(charset)
        if 'html' in response.getheader('Content-Type'):
            # The info block sits inside <dl class="IpMRig-tit">
            info_reg = r'(?<=<dl class="IpMRig-tit">).*?(?=</dl>)'
            info_all_match = re.findall(info_reg, html, re.S | re.M)
            info = info_all_match[0]
            # The IP itself is inside <dd class="fz24">
            ip_reg = r'(?<=<dd class="fz24">).*?(?=</dd>)'
            ip_matched = re.findall(ip_reg, info, re.S | re.M)
            return {'ip': ip_matched[0], 'info': info}
    except Exception as e:
        print(e)
        logger.log(e)
def craw(self):
    start_time = time.time()
    domain = self.data.get_domain(self.domainpath)
    if domain is None:
        self.filelogs.writeLogs('domain is None')
        exit()
    self._domain_array(domain)
    if self.domain_id and self.domain_id is not None:
        self.filelogs.writeFile(self.domainpath, self.domain_id)
    self.urls.add_new_url(self.domain)
    self.filelogs.writeLogs('crawl started, ID:%d' % (self.domain_id))
    while self.urls.has_new_url():
        try:
            new_url = self.urls.get_new_url()
            # Complete the link with its http scheme
            new_full_url = self.parser.get_http_url(new_url)
            self.filelogs.writeLogs('page download started')
            html_cont = self.downloader.download(new_full_url)
            self.filelogs.writeLogs('page download finished')
            # Detect the encoding once; treat GB2312 as the wider gbk codec
            encoding = chardet.detect(html_cont)['encoding']
            if encoding and encoding.lower() == 'gb2312':
                iconv_type = 'gbk'
            else:
                iconv_type = encoding
            html_cont_coding = html_cont.decode(iconv_type).encode('utf-8')
            # If the url is a novel's table-of-contents url, collect that novel into the database
            book_preg = urlparse.urljoin(new_full_url, self.book_regular)
            if self.parser.is_book_link(new_full_url, book_preg):
                book_id = int(self.parser.get_book_id(new_full_url, book_preg))
                # Skip book_ids that have already been collected
                if self.data.is_collect(book_id, self.domain_id) is False:
                    # Extract the novel's title, author, and synopsis
                    book_name = self.parser.get_book_name(html_cont_coding, self.bookname_regular)
                    author = self.parser.get_author(html_cont_coding, self.author_regular)
                    descript = self.parser.get_descript(html_cont_coding, self.descript_regular)
                    if book_name is not None and author is not None and descript is not None:
                        # Save the novel's info into the database
                        self.data.save_book(self.domain_id, book_id, book_name, author, descript)
            self.filelogs.writeLogs('parsing content')
            new_urls = self.parser.parse(new_url, html_cont_coding, self.parser.get_http_url(self.domain))
            self.urls.add_new_urls(new_urls)
            if (time.time() - start_time) > self.outtime:
                self.filelogs.writeLogs('crawl finished')
                break
        except:
            self.filelogs.writeLogs('Failed:')
from datetime import datetime

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
}
url = "http://www.weather.com.cn/weather/101020100.shtml"
# Randomized timeout
timeout = random.choice(range(80, 180))
while True:
    try:
        req = request.Request(url, headers=header)
        response = request.urlopen(req, timeout=timeout)
        html = response.read()
        # Detect the page encoding before decoding
        c = chardet.detect(html)
        # print(c)
        print("Status:", response.status)
        html = html.decode(c["encoding"])
        break
    # Retry after a random pause on HTTP and URL errors.
    except error.HTTPError as e:
        print('HTTPError: no such resource', e)
        time.sleep(random.choice(range(10, 30)))
    except error.URLError as e:
        print('URLError: site unreachable', e)
        time.sleep(random.choice(range(10, 30)))

final_info = []
# Create the bs4 instance
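# A minimal sketch of the bs4 step flagged above (assumes BeautifulSoup 4 is
# installed; the generic selectors below are placeholders, not taken from the
# actual weather.com.cn markup):
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(html, 'html.parser')
# for li in soup.find_all('li'):
#     final_info.append(li.get_text(strip=True))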
# Common third-party module: chardet
import chardet  # install via pip rather than reaching into pip's vendored copy

result = chardet.detect(b'Hello, world!')
print(result)

# gbk
data = '离离原上草,一岁一枯荣'.encode('gbk')
result = chardet.detect(data)
print(result)

# utf-8
data = '离离原上草,一岁一枯荣'.encode('utf-8')
result = chardet.detect(data)
print(result)

# Detect Japanese text
data = '最新の主要ニュース'.encode('euc-jp')
result = chardet.detect(data)
print('Japanese: %s' % result)
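# Note: detect() reports {'encoding': None, 'confidence': 0.0} for empty or
# undecidable input, so guard before decoding; a minimal sketch:
raw = b''
guess = chardet.detect(raw)
text = raw.decode(guess['encoding'] or 'utf-8', errors='replace')
print(repr(text))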
print(qiuhe(100))

# with request.urlopen('https://api.douban.com/v2/book/2129650') as f:
#     data = f.read()
#     print("Status:", f.status, f.reason)
#     for k, v in f.getheaders():
#         print('%s:%s' % (k, v))
#     d = json.loads(data.decode('utf-8'))
#     for p, s in d.items():
#         print('%s:%s' % (p, s))

# r = requests.get('https://www.douban.com/')
# print(r.text)

d = chardet.detect(b'hello world !')
print(d)

data = '哈哈'.encode('gbk')
print(type(data))
d = chardet.detect(data)
print(d)
s = data.decode('gbk')
print(type(s))
print(s)

print(psutil.cpu_times())
r3 = requests.get('https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20weather.forecast%20where%20woeid%20%3D%202151330&format=json')
print(r3.json())
print(r3.json()['query']['created'])

# POST
# r4 = requests.post('https://accounts.douban.com/login', data={'form_email': '*****@*****.**', 'form_password': '******'})

# Passing JSON data
# params = {'key': 'value'}
# r = requests.post(url, json=params)  # serialized to JSON internally

# Uploading files via the files parameter
# upload_files = {'file': open('report.xls', 'rb')}
# r = requests.post(url, files=upload_files)

# To send cookies with a request, just pass a dict as the cookies parameter:
# cs = {'token': '12345', 'status': 'working'}
# r = requests.get(url, cookies=cs)

# To set a timeout, pass the timeout parameter in seconds:
# r = requests.get(url, timeout=2.5)  # time out after 2.5 seconds

# This is where the third-party chardet library comes in handy: use it to detect the encoding
print(chardet.detect(b'Hello, world!'))
data = '离离原上草,一岁一枯荣'.encode('utf-8')
print(data)
print(chardet.detect(data))
def craw(self):
    start_time = time.time()
    domain = self.data.get_domain(self.domainpath)
    if domain is None:
        self.filelogs.writeLogs('domain is None')
        exit()
    self._domain_array(domain)
    if self.domain_id and self.domain_id is not None:
        self.filelogs.writeFile(self.domainpath, self.domain_id)
    self.filelogs.writeLogs('crawl started, ID:%d' % (self.domain_id))
    # Count consecutive failures
    fail_count = 0
    while (time.time() - start_time) <= self.outtime:
        new_url = self.book_regular
        time.sleep(5)
        # Complete the link with its http scheme
        domian_host = self.parser.get_http_url(self.domain)
        self.start_id = self.data.get_book_id(self.domain_id, self.start_id)
        # Build the url for this book_id
        full_url = self.parser.fill_url_book_id(new_url, self.start_id, self.book_mark_id)
        if full_url and full_url != '':
            self.filelogs.writeLogs('page download started')
            html_cont = self.downloader.download(full_url)
            self.filelogs.writeLogs('page download finished')
            if html_cont is None:
                fail_count = fail_count + 1
            else:
                # Detect the encoding once; treat GB2312 as the wider gbk codec
                encoding = chardet.detect(html_cont)['encoding']
                if encoding and encoding.lower() == 'gb2312':
                    iconv_type = 'gbk'
                else:
                    iconv_type = encoding
                html_cont_coding = html_cont.decode(iconv_type, 'ignore').encode('utf-8')
                book_id = int(self.start_id)
                # Extract the novel's title
                book_name = self.parser.get_book_name(html_cont_coding, self.bookname_regular)
                # Extract the author
                author = self.parser.get_author(html_cont_coding, self.author_regular)
                # Extract the synopsis
                descript = self.parser.get_descript(html_cont_coding, self.descript_regular)
                # Strip leading whitespace from the synopsis
                if descript is not None:
                    descript = self.parser.del_space(descript)
                if book_name is not None and author is not None:
                    fail_count = 0
                    # Save the novel's info into the database
                    self.data.save_book(self.domain_id, book_id, book_name, author, descript)
                else:
                    fail_count = fail_count + 1
        # Stop after 50 consecutive failures
        if fail_count == 50:
            self.filelogs.writeLogs('failure count: ' + str(fail_count))
            break
        if (time.time() - start_time) > self.outtime:
            break
    self.filelogs.writeLogs('crawl finished')