def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()
        round = roundPage.cssselect(
            "li[id='tpRound'] a")[0].text_content().replace(
            "round ", "").replace(" Rankings", "").strip()
        print "Round: " + round
        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # specified in the footer
        pageLinks = roundRows[-1].cssselect("a")
        # remove the "next page" link
        del pageLinks[-1]
        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL
            scrapePage(linkURL, round)
        calculateExtraStats(round)
def get_every_j_data(_url):
    url = "https://www.jleague.jp{}".format(_url)
    __level = getlevel(re.findall('/match/(.*)/2022', _url)[0])
    __round: str = ""
    __st = ""
    __status = False
    __bc = ""
    __zhu = ""
    __ke = ""
    __zc = ""
    __kc = ""
    Logi("start request {}".format(url))
    try:
        html = requests.get(url, verify=False, headers=J_Header, timeout=23)  # , proxies=self.proxies
    except Exception:
        Loge("{} request error".format(url))
        return
    Logi("request status:={}".format(html.status_code))
    time.sleep(.1)
    selector = lxml.html.fromstring(html.text)
    soup = BeautifulSoup(html.text, 'lxml')
    # get match info
    info = selector.cssselect("p span")
    __zhu = info[0].text   # home team
    __ke = info[-2].text   # away team
    __bc = info[3].text + "-" + info[5].text  # half-time score
    __zc = info[12].text   # zc
    __kc = info[14].text   # kc
    __zj = selector.cssselect(".leagLeftScore")[0].text   # home goals
    __kj = selector.cssselect(".leagRightScore")[0].text  # away goals
    __round = re.findall(
        "第(.*?)節",
        "".join(selector.xpath('//span[@class=\'matchVsTitle__league\']/text()')))[-1]
    __date = re.findall("2022/(.*?)/live", soup.link['href'])[0]
    # get weather: the weather info is loaded dynamically via ajax, so build the ajax URL
    weather_url = url.replace("live/", "ajax_live?_={}T{}")
    weather_url = weather_url.format(
        datetime.date.today(),
        time.strftime('%H:%M', time.localtime(time.time())))
    weather_page = requests.get(weather_url, verify=False, headers=J_Header).content.decode()
    time.sleep(1)
    weather_info = lxml.html.fromstring(weather_page)
    weather_info_list = weather_info.cssselect(".bgGray + td")
    weather = "".join(re.findall("(.*?)\/", weather_info_list[4].text))[:1]
    # get start time
    ss = pyquery.PyQuery(html.text)
    sk = ss.find(ryaml("sj_css", "css")).text()
    st = re.findall("\d+\:\d+", sk)[0].replace(":", "")
    _sql = "INSERT INTO `j22` VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');" % (
        __date, st, __level, str(int(__round)), weather, __zhu, "", __ke, __bc,
        __zj, __kj, __zc, __kc, "9.99", "9.99", "9.99")
    Logi(f"sql:={_sql}")
    html.close()
    w2db(_sql)
    return True
def extract_links_from_html(base, body):
    try:
        html = LinkParser(base)
        html.feed(body)
        html.close()
        for link in html.get_abs_links():
            yield link
    except HTMLParseError, ex:
        logging.warning("html parse error")
def getHtml(self, url):
    if (self.testUrl(url) is True):
        html = urllib.request.urlopen(url)
        mybytes = html.read()
        mystr = mybytes.decode("utf8")
        html.close()
        return mystr
    else:
        return None
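# For comparison, a minimal sketch of the same fetch using a with block: Python 3's
# urllib.request response objects support the context-manager protocol, so the handle is
# closed even if read() or decode() raises. fetch_html is a hypothetical free-function name.
import urllib.request

def fetch_html(url):
    # the response is closed automatically when the with block exits
    with urllib.request.urlopen(url) as html:
        return html.read().decode("utf8")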
def spider(self, url):
    html = None
    try:
        html = requests.get(url, headers=config.headers)
        html.encoding = 'utf-8'
        return html.text
    except Exception as e:
        print(e, 'failed to fetch this URL', url)
        # only close the response if the request actually returned one
        if html is not None:
            html.close()
        return ''
def extract_links_from_html(base, body):
    try:
        html = LinkParser(base)
        html.feed(body)
        html.close()
        for link in html.get_abs_links():
            yield link
    except HTMLParseError as ex:
        logging.warning("html parse error")
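# A hedged usage sketch for the generator above, assuming LinkParser (defined elsewhere)
# resolves links against the given base URL; the starting page and the use of requests to
# obtain the already-decoded HTML body are illustrative only.
import requests

base = 'https://example.com/'                      # illustrative starting page
body = requests.get(base).text
for link in extract_links_from_html(base, body):
    print(link)                                    # absolute URLs resolved against base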
def post2markdown(tree):
    # Process html; keep only the <article> content, which is where the blog post actually is
    article = (tree.xpath('//article'))[0]
    header = (tree.xpath('//header[@class="article-header"]'))[0]
    p_blog = (article.xpath('.//p[@id="breadcrumb"]'))[0]  # contains: "Blog:"
    header.remove(p_blog)
    if (article.xpath('.//footer')):
        footer = (article.xpath('.//footer'))[0]
        article.remove(footer)
    iframes = article.xpath('//iframe')
    post_videos(iframes)  # videos: replace each video's iframe with <a><img>
    images = (article.xpath('.//img'))
    post_imgs(images)
    post_clean_html(article)
    # author
    # add class to author wrapping <a>
    author_tag = (article.xpath('.//span[@class="author"]/a'))
    # author_tag[0].set('class', 'author')
    # author_tag[0].attrib.pop('rel')
    # author_tag[0].attrib.pop('title')
    # author_tag[0].set('title', '')
    # print lxml.html.tostring(author_tag[0])
    # get info
    date = ((article.xpath('//time'))[0]).attrib['datetime']
    author = (article.xpath('//a[@rel="author"]'))[0].text
    title = (article.xpath('//h1[@class="entry-title single-title"]'))[0].text
    # save modified html
    html_article = lxml.html.tostring(article, pretty_print=True,
                                      include_meta_content_type=True,
                                      encoding='utf-8', method='html',
                                      with_tail=False)
    html = open('tmp_article.html', 'w')
    html.write(html_article)
    html.close()
    return (date, author, title)
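# A hedged usage sketch: post2markdown expects an lxml tree of the blog page and relies on
# the post_videos/post_imgs/post_clean_html helpers defined elsewhere in the same script;
# the input file name below is hypothetical.
import lxml.html

tree = lxml.html.parse('blog_post.html')   # or lxml.html.fromstring(page_source)
date, author, title = post2markdown(tree)
print(title)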
def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()
        round = roundPage.cssselect("li[id='tpRound'] a")[0].text_content().replace("round ", "").replace(" Rankings", "").strip()
        print "Round: " + round
        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # specified in the footer
        pageLinks = roundRows[-1].cssselect("a")
        # remove the "next page" link
        del pageLinks[-1]
        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL
            scrapePage(linkURL, round)
def __init__(self, *args, **kwargs):
    html = None
    elements = []
    self._base_url = None
    self.parser = kwargs.pop("parser", None)

    if (
        len(args) >= 1
        and (not PY3k and isinstance(args[0], basestring)
             or (PY3k and isinstance(args[0], str)))
        and args[0].split("://", 1)[0] in ("http", "https")
    ):
        kwargs["url"] = args[0]
        if len(args) >= 2:
            kwargs["data"] = args[1]
        args = []

    if "parent" in kwargs:
        self._parent = kwargs.pop("parent")
    else:
        self._parent = no_default

    if "css_translator" in kwargs:
        self._translator = kwargs.pop("css_translator")
    elif self.parser in ("xml",):
        self._translator = self._translator_class(xhtml=True)
    elif self._parent is not no_default:
        self._translator = self._parent._translator
    else:
        self._translator = self._translator_class(xhtml=False)

    namespaces = kwargs.pop("namespaces", {})

    if kwargs:
        # specific case to get the dom
        if "filename" in kwargs:
            html = open(kwargs["filename"])
        elif "url" in kwargs:
            url = kwargs.pop("url")
            if "opener" in kwargs:
                opener = kwargs.pop("opener")
                html = opener(url, **kwargs)
            else:
                html = url_opener(url, kwargs)
            if not self.parser:
                self.parser = "html"
            self._base_url = url
        else:
            raise ValueError("Invalid keyword arguments %s" % kwargs)
        elements = fromstring(html, self.parser)
        # close open descriptor if possible
        if hasattr(html, "close"):
            try:
                html.close()
            except:
                pass
    else:
        # get nodes
        # determine context and selector if any
        selector = context = no_default
        length = len(args)
        if length == 1:
            context = args[0]
        elif length == 2:
            selector, context = args
        else:
            raise ValueError("You can't do that. Please, provide arguments")

        # get context
        if isinstance(context, basestring):
            try:
                elements = fromstring(context, self.parser)
            except Exception:
                raise
        elif isinstance(context, self.__class__):
            # copy
            elements = context[:]
        elif isinstance(context, list):
            elements = context
        elif isinstance(context, etree._Element):
            elements = [context]

        # select nodes
        if elements and selector is not no_default:
            xpath = self._css_to_xpath(selector)
            results = []
            for tag in elements:
                results.extend(tag.xpath(xpath, namespaces=namespaces))
            elements = results

    list.__init__(self, elements)
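# The constructor above appears to be pyquery's PyQuery.__init__; a short sketch of the
# documented entry points it dispatches on (a markup string, filename=, url=), with an
# illustrative URL and file name.
from pyquery import PyQuery

d1 = PyQuery('<div><p class="msg">hi</p></div>')  # parse a markup string
print(d1('p.msg').text())                         # -> 'hi'

d2 = PyQuery(url='http://example.com/')           # fetched, parsed, then the descriptor is closed
d3 = PyQuery(filename='page.html')                # hypothetical local file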
def getHtml(url):
    html = urllib2.urlopen(url)
    page = lxml.html.fromstring(html.read())
    html.close()
    return page
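# A short usage sketch for getHtml above: the URL and the CSS selector are illustrative,
# and cssselect() is the standard lxml API available on the returned document.
page = getHtml('http://example.com/rankings')   # illustrative URL
for cell in page.cssselect('table td'):         # illustrative selector
    print(cell.text_content())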
def get_html_ulib(self, url_p, type_p='rp', chartset_p='utf-8'):
    html = urllib.request.urlopen(url=url_p)
    txt = html.read().decode(chartset_p)
    html.close()
    return txt
def get_html(self,url_p="",dic_p={},type_p='rg',chartset_p='utf-8',timeout_p=10): chartset_get = "n/a" # 爬取数据的字符形式编码 headers_p = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"} txt = "nothing" # 获取网页源码 try: # request_get的方法 if (type_p == 'rg'): html = requests.get(url=url_p,timeout=timeout_p,headers=headers_p) chartset_get = self.get_encodings_from_content(html.text) # 用英文字符匹配法重新识别编码 if (self.code_standard_is(chartset_p=chartset_get)): print ("<<原文编码识别[通过]>>") if (chartset_get.lower() == "iso-8859-1"): print ("rg模式,<<原文编码iso-8859-1特殊处理") try: txt = html.content.decode("GBK") except: txt = html.content.decode("gb2312") else: print ("rg模式,按照识别的" + chartset_get + "特殊处理") txt = html.content.decode(chartset_get) else: print ("<<原文编码识别[未通过]>>") txt = "" print ("<<<rg>>>过程:"," ","原文编码:",chartset_get) html.close() # request_get的方法 只输出字节码 if (type_p == 'rg_byte'): txt = b"" html = requests.get(url=url_p,timeout=timeout_p,headers=headers_p) chartset_get = self.get_encodings_from_content(html.text) # 用英文字符匹配法重新识别编码 txt = html.content print ("<<<rg_byte>>>过程:"," ","原文编码:",chartset_get) html.close() # request_post的方法 if (type_p == 'rp'): conn_p = requests.session() rep_p = conn_p.post(url=url_p,data=dic_p,timeout=timeout_p,headers=headers_p) txt = rep_p.content chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore")) if (self.code_standard_is(chartset_p=chartset_get)): print ("<<原文编码识别[通过]>>") txt = txt.decode(chartset_get, "ignore") else: print ("<<原文编码识别[未通过]>>") code_is = chardet.detect(txt) if ("encoding" in code_is): chartset_get = code_is["encoding"] txt = txt.decode(code_is["encoding"], "ignore") print ("<<<rp>>>过程:"," ","原文编码:",chartset_get) # urllib的get方法 if (type_p == 'ug'): html = urllib.request.urlopen(url=url_p) txt = html.read() chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore")) # 尝试编码 获得内部编码信息 if (self.code_standard_is(chartset_p=chartset_get)): print ("<<原文编码识别[通过]>>") txt = txt.decode(chartset_get, "ignore") else: # 进行编码判别 print ("<<原文编码识别[未通过]>>") code_is = chardet.detect(txt) if ("encoding" in code_is): chartset_get = code_is["encoding"] txt = txt.decode(code_is["encoding"], "ignore") print ("<<<ug>>>过程:"," ","原文编码:",chartset_get) # urllib的post方法 if (type_p == 'up'): #将字典格式化成能用的形式 data_p = urllib.parse.urlencode(dic_p).encode('utf-8') #创建一个request,放入我们的地址、数据、头 request = urllib.request.Request(url_p, data_p, headers_p) #访问 txt = urllib.request.urlopen(request).read() chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore")) # 尝试编码 获得内部编码信息 if (self.code_standard_is(chartset_p=chartset_get)): print ("<<原文编码识别[通过]>>") txt = txt.decode(chartset_get, "ignore") else: # 进行编码判别 print ("<<原文编码识别[未通过]>>") code_is = chardet.detect(txt) if ("encoding" in code_is): chartset_get = code_is["encoding"] txt = txt.decode(code_is["encoding"], "ignore") print ("<<<up>>>过程:"," ","原文编码:",chartset_get) # session的方法 if (type_p == 'ss'): res_addr = self.session.get(url_p, timeout=timeout_p, headers=headers_p) res_addr.encoding = chardet.detect(res_addr.content)["encoding"] txt = bs_4(res_addr.text, "lxml") print ("<<<ss>>>过程:"," ","原文编码:",chartset_get) # Selenium的方法 待完善 if (type_p == 'se'): self.driver.get(url_p) js = "var q=document.body.scrollTop=100000" self.driver.execute_script(js) self.driver.implicitly_wait(30) # 据说此方法是智能等待,看效果还不错,数据加载完就返回了 30 代表等待秒 txt = self.driver.page_source chartset_get = self.get_encodings_from_content(txt) print 
("<<<se>>>过程:"," ","原文编码:",chartset_get) # login的方法 待完善 if (type_p == 'lg'): print ("<<<lg>>>过程:"," ","原文编码:",chartset_get) except Exception as e: print("html爬虫处理失败", e) html = requests.get(url=url_p, headers=headers_p) chartset_get = "n/a" print ("爬虫的最后处理,按照默认的" + chartset_p + "编码输出") try: txt = html.content.decode(chartset_p) except: txt = html.content.decode("gbk") html.close() return txt,chartset_get # 返回文本型html编码 加上自定义编码头
import time
import sys
import codecs
import lxml.html
import urllib2

query = 'http://www39.atwiki.jp/osakahennyu/?cmd=backup&action=source&pageid=<PLACEHOLDER>&num=0'

for line in open(sys.argv[1], 'r'):
    url = query.replace('<PLACEHOLDER>', line.rstrip())
    while True:
        try:
            html = urllib2.urlopen(url)
            code = unicode(html.read(), 'utf-8')
            dom = lxml.html.fromstring(code)
            wiki = dom.xpath('//pre')[0]
            fout = codecs.open(line.rstrip() + '.txt', 'w', 'utf-8')
            fout.write(wiki.text)
            fout.close()
            html.close()
            break
        except urllib2.HTTPError:
            raw_input('>>> error! press continue...')
    time.sleep(1)
def __init__(self, *args, **kwargs):
    html = None
    elements = []
    self._base_url = None
    self.parser = kwargs.get('parser', None)
    if 'parser' in kwargs:
        del kwargs['parser']

    if len(args) >= 1 and \
       (not PY3k and isinstance(args[0], basestring) or \
       (PY3k and isinstance(args[0], str))) and \
       args[0].split('://', 1)[0] in ('http', 'https'):
        kwargs['url'] = args[0]
        if len(args) >= 2:
            kwargs['data'] = args[1]
        args = []

    if 'parent' in kwargs:
        self._parent = kwargs.pop('parent')
    else:
        self._parent = no_default

    if 'css_translator' in kwargs:
        self._translator = kwargs.pop('css_translator')
    elif self.parser in ('xml',):
        self._translator = JQueryTranslator(xhtml=True)
    elif self._parent is not no_default:
        self._translator = self._parent._translator
    else:
        self._translator = JQueryTranslator(xhtml=False)

    namespaces = kwargs.get('namespaces', {})
    if 'namespaces' in kwargs:
        del kwargs['namespaces']

    if kwargs:
        # specific case to get the dom
        if 'filename' in kwargs:
            html = open(kwargs['filename'])
        elif 'url' in kwargs:
            url = kwargs.pop('url')
            if 'opener' in kwargs:
                opener = kwargs.pop('opener')
                html = opener(url, **kwargs)
            else:
                html = url_opener(url, kwargs)
            if not self.parser:
                self.parser = 'html'
            self._base_url = url
        else:
            raise ValueError('Invalid keyword arguments %s' % kwargs)
        elements = fromstring(html, self.parser)
        # close open descriptor if possible
        if hasattr(html, 'close'):
            try:
                html.close()
            except:
                pass
    else:
        # get nodes
        # determine context and selector if any
        selector = context = no_default
        length = len(args)
        if length == 1:
            context = args[0]
        elif length == 2:
            selector, context = args
        else:
            raise ValueError("You can't do that." +
                             " Please, provide arguments")

        # get context
        if isinstance(context, basestring):
            try:
                elements = fromstring(context, self.parser)
            except Exception:
                raise
        elif isinstance(context, self.__class__):
            # copy
            elements = context[:]
        elif isinstance(context, list):
            elements = context
        elif isinstance(context, etree._Element):
            elements = [context]

        # select nodes
        if elements and selector is not no_default:
            xpath = self._css_to_xpath(selector)
            results = []
            for tag in elements:
                results.extend(tag.xpath(xpath, namespaces=namespaces))
            elements = results

    list.__init__(self, elements)
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
lib_methods = dir(html)
print('HTML Built-In Library Methods:\n', lib_methods, end='\n\n')
soup = BeautifulSoup(html, 'html.parser')
#print(type(soup.prettify()))

with open('site.html', 'w+', newline='') as f:
    html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
    mybytes = html.read()
    mystr = mybytes.decode("utf8")
    html.close()
    f.write(mystr)