def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()

        round = roundPage.cssselect(
            "li[id='tpRound'] a")[0].text_content().replace(
                "round ", "").replace(" Rankings", "").strip()
        print "Round: " + round

        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # specified in the footer
        pageLinks = roundRows[-1].cssselect("a")

        #remove the "next page" link
        del pageLinks[-1]

        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL

            scrapePage(linkURL, round)

        calculateExtraStats(round)
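The helper closes the response by hand, which leaks the handle if read() or the lxml parse raises first. A minimal sketch of the same fetch wrapped in contextlib.closing (fetch_page is a hypothetical name, not part of the example above):

from contextlib import closing

import lxml.html
import urllib2


def fetch_page(url):
    # closing() releases the response even if read() or fromstring() raises,
    # so no explicit html.close() is needed
    with closing(urllib2.urlopen(url)) as html:
        return lxml.html.fromstring(html.read())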
Example 2
def get_every_j_data(_url):
    url = "https://www.jleague.jp{}".format(_url)
    __level = getlevel(re.findall('/match/(.*)/2022', _url)[0])
    __round: str = ""
    __st = ""
    __status = False
    __bc = ""
    __zhu = ""
    __ke = ""
    __zc = ""
    __kc = ""
    Logi("start request {}".format(url))
    try:
        html = requests.get(url, verify=False, headers=J_Header,
                            timeout=23)  # , proxies=self.proxies
    except (Exception, ):
        Loge("{} request error".format(url))
        return
    Logi("request stutas:={}".format(html.status_code))
    time.sleep(.1)
    selector = lxml.html.fromstring(html.text)
    soup = BeautifulSoup(html.text, 'lxml')
    # get match info
    info = selector.cssselect("p span")
    __zhu = info[0].text  # home team
    __ke = info[-2].text  # away team
    __bc = info[3].text + "-" + info[5].text  # half-time score (bc)
    __zc = info[12].text  # zc
    __kc = info[14].text  # kc
    __zj = selector.cssselect(".leagLeftScore")[0].text  # zj: home goals
    __kj = selector.cssselect(".leagRightScore")[0].text  # kj: away goals
    __round = re.findall(
        "第(.*?)節", "".join(
            selector.xpath(
                '//span[@class=\'matchVsTitle__league\']/text()')))[-1]
    __date = re.findall("2022/(.*?)/live", soup.link['href'])[0]
    # get weather
    weather_url = url.replace("live/",
                              "ajax_live?_={}T{}")  # the weather info is loaded via ajax, so build the ajax URL
    # str.format returns a new string, so the result has to be kept
    weather_url = weather_url.format(
        datetime.date.today(),
        time.strftime('%H:%M', time.localtime(time.time())))
    weather_page = requests.get(weather_url, verify=False,
                                headers=J_Header).content.decode()
    time.sleep(1)
    weather_info = lxml.html.fromstring(weather_page)
    weather_info_list = weather_info.cssselect(".bgGray + td")
    weather = "".join(re.findall("(.*?)\/", weather_info_list[4].text))[:1]
    # get start time
    ss = pyquery.PyQuery(html.text)
    sk = ss.find(ryaml("sj_css", "css")).text()
    st = re.findall("\d+\:\d+", sk)[0].replace(":", "")
    _sql = "INSERT INTO `j22` VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');" % (
        __date, st, __level, str(int(__round)), weather, __zhu, "", __ke, __bc,
        __zj, __kj, __zc, __kc, "9.99", "9.99", "9.99")
    Logi(f"sql:={_sql}")
    html.close()
    w2db(_sql)
    return True
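The INSERT above is assembled with %-formatting, which breaks on quotes in team names and is injection-prone. A sketch of the same write using DB-API placeholders; it assumes a sqlite3-style connection, since the backend behind w2db() is not shown:

import sqlite3  # assumption: any DB-API driver works, though the placeholder style may differ


def insert_match(conn, row):
    # let the driver quote the 16 values instead of splicing them into the SQL
    sql = "INSERT INTO j22 VALUES ({})".format(", ".join("?" * len(row)))
    with conn:  # commit on success, roll back on error
        conn.execute(sql, row)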
Example 3
 def extract_links_from_html(base, body):
     try:
         html = LinkParser(base)
         html.feed(body)
         html.close()
         for link in html.get_abs_links():
             yield link
     except HTMLParseError, ex:
         logging.warning("html parse error: %s", ex)
Example 4
 def getHtml(self, url):
     if (self.testUrl(url) is True):
         html = urllib.request.urlopen(url)
         mybytes = html.read()
         mystr = mybytes.decode("utf8")
         html.close()
         return mystr
     else:
         return None
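In Python 3 the object returned by urllib.request.urlopen is a context manager, so the read/decode/close sequence above can drop the explicit close(). A sketch (get_html_text is a made-up name):

import urllib.request


def get_html_text(url):
    # the with block closes the response even if read() or decode() fails
    with urllib.request.urlopen(url) as html:
        return html.read().decode("utf8")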
Example 5
 def spider(self, url):
     try:
         html = requests.get(url, headers=config.headers)
         html.encoding = 'utf-8'
         return html.text
     except Exception as e:
         # if requests.get itself failed, html was never bound,
         # so there is nothing to close here
         print(e, 'failed to fetch this URL', url)
         return ''
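If requests.get itself raises in the method above, html is never bound, so the except branch cannot close it. A sketch that hands the cleanup to contextlib.closing and surfaces HTTP errors via raise_for_status (fetch_text and the 10-second timeout are assumptions):

import contextlib

import requests


def fetch_text(url, headers=None):
    try:
        # closing() releases the connection whether or not decoding succeeds
        with contextlib.closing(requests.get(url, headers=headers, timeout=10)) as resp:
            resp.raise_for_status()
            resp.encoding = 'utf-8'
            return resp.text
    except requests.RequestException as exc:
        print(exc, 'failed to fetch URL', url)
        return ''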
Example 6
 def extract_links_from_html(base, body):
     try:
         html = LinkParser(base)
         html.feed(body)
         html.close()
         for link in html.get_abs_links():
             yield link
     except HTMLParseError as ex:
         logging.warning("html parse error: %s", ex)
Example 7
def post2markdown(
    tree
):  # Process html; Keep only the <article> content - where the blogpost actually is
    article = (tree.xpath('//article'))[0]
    header = (tree.xpath('//header[@class="article-header"]'))[0]
    p_blog = (article.xpath('.//p[@id="breadcrumb"]'))[0]  # contains: "Blog:"
    header.remove(p_blog)
    if (article.xpath('.//footer')):
        footer = (article.xpath('.//footer'))[0]
        article.remove(footer)
    iframes = article.xpath('//iframe')
    post_videos(iframes)  # videos: replace video's iframe with <a><img>

    images = (article.xpath('.//img'))
    post_imgs(images)
    post_clean_html(article)

    # author # add class to author wrapping <a>
    author_tag = (article.xpath('.//span[@class="author"]/a'))
    #    author_tag[0].set('class', 'author')
    #    author_tag[0].attrib.pop('rel')
    #    author_tag[0].attrib.pop('title')
    #    author_tag[0].set('title', '')

    #    print lxml.html.tostring(author_tag[0])

    # get info
    date = ((article.xpath('//time'))[0]).attrib['datetime']
    author = (article.xpath('//a[@rel="author"]'))[0].text
    title = (article.xpath('//h1[@class="entry-title single-title"]'))[0].text

    #save modified html
    html_article = lxml.html.tostring(article,
                                      pretty_print=True,
                                      include_meta_content_type=True,
                                      encoding='utf-8',
                                      method='html',
                                      with_tail=False)
    html = open('tmp_article.html', 'w')
    html.write(html_article)
    html.close()
    return (date, author, title)
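post2markdown expects an already-parsed tree and returns (date, author, title) while writing the cleaned article to tmp_article.html. A hypothetical driver showing how it might be called (convert_post and the URL argument are not part of the original):

import lxml.html


def convert_post(url):
    tree = lxml.html.parse(url).getroot()  # lxml fetches and parses the URL directly
    date, author, title = post2markdown(tree)
    # tmp_article.html now holds the cleaned <article> markup
    print(date, author, title)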
Example 8
def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()
        
        round = roundPage.cssselect("li[id='tpRound'] a")[0].text_content().replace("round ", "").replace(" Rankings", "").strip()
        print "Round: " + round
        
        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # specified in the footer
        pageLinks = roundRows[-1].cssselect("a")
        
        #remove the "next page" link
        del pageLinks[-1]
        
        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL
        
            scrapePage(linkURL, round)
Example 9
    def __init__(self, *args, **kwargs):
        html = None
        elements = []
        self._base_url = None
        self.parser = kwargs.pop("parser", None)

        if (
            len(args) >= 1
            and (not PY3k and isinstance(args[0], basestring) or (PY3k and isinstance(args[0], str)))
            and args[0].split("://", 1)[0] in ("http", "https")
        ):
            kwargs["url"] = args[0]
            if len(args) >= 2:
                kwargs["data"] = args[1]
            args = []

        if "parent" in kwargs:
            self._parent = kwargs.pop("parent")
        else:
            self._parent = no_default

        if "css_translator" in kwargs:
            self._translator = kwargs.pop("css_translator")
        elif self.parser in ("xml",):
            self._translator = self._translator_class(xhtml=True)
        elif self._parent is not no_default:
            self._translator = self._parent._translator
        else:
            self._translator = self._translator_class(xhtml=False)

        namespaces = kwargs.pop("namespaces", {})

        if kwargs:
            # specific case to get the dom
            if "filename" in kwargs:
                html = open(kwargs["filename"])
            elif "url" in kwargs:
                url = kwargs.pop("url")
                if "opener" in kwargs:
                    opener = kwargs.pop("opener")
                    html = opener(url, **kwargs)
                else:
                    html = url_opener(url, kwargs)
                if not self.parser:
                    self.parser = "html"
                self._base_url = url
            else:
                raise ValueError("Invalid keyword arguments %s" % kwargs)

            elements = fromstring(html, self.parser)
            # close open descriptor if possible
            if hasattr(html, "close"):
                try:
                    html.close()
                except:
                    pass

        else:
            # get nodes

            # determine context and selector if any
            selector = context = no_default
            length = len(args)
            if length == 1:
                context = args[0]
            elif length == 2:
                selector, context = args
            else:
                raise ValueError("You can't do that. Please, provide arguments")

            # get context
            if isinstance(context, basestring):
                try:
                    elements = fromstring(context, self.parser)
                except Exception:
                    raise
            elif isinstance(context, self.__class__):
                # copy
                elements = context[:]
            elif isinstance(context, list):
                elements = context
            elif isinstance(context, etree._Element):
                elements = [context]

            # select nodes
            if elements and selector is not no_default:
                xpath = self._css_to_xpath(selector)
                results = []
                for tag in elements:
                    results.extend(tag.xpath(xpath, namespaces=namespaces))
                elements = results

        list.__init__(self, elements)
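For reference, the branches above map onto the usual construction forms; a short usage sketch, assuming this __init__ belongs to pyquery's PyQuery class (which the url/filename/parent/css_translator keywords suggest):

from pyquery import PyQuery as pq

doc = pq("<p><a href='#'>x</a></p>")          # string context -> fromstring()
links = pq('a', "<p><a href='#'>x</a></p>")   # (selector, context) two-argument form
page = pq(url='https://example.com/')         # 'url' kwarg -> url_opener(), and the open
                                              # descriptor is closed if it has close()
local = pq(filename='page.html')              # 'filename' kwarg -> open() a local file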
Example 10
def getHtml(url):
    html = urllib2.urlopen(url)
    page = lxml.html.fromstring(html.read())
    html.close()

    return page
Example 11
 def get_html_ulib(self,url_p, type_p='rp', chartset_p='utf-8'):
     html = urllib.request.urlopen(url=url_p)
     txt = html.read().decode(chartset_p)
     html.close()
     return txt
Example 12
    def get_html(self,url_p="",dic_p={},type_p='rg',chartset_p='utf-8',timeout_p=10):
        
        chartset_get = "n/a" # 爬取数据的字符形式编码
        headers_p = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}
        txt = "nothing"
                
        # fetch the page source
        try:
        
            # requests GET
            if (type_p == 'rg'):
            
                html = requests.get(url=url_p,timeout=timeout_p,headers=headers_p)
                chartset_get = self.get_encodings_from_content(html.text) # re-detect the encoding from the page content itself
                
                if (self.code_standard_is(chartset_p=chartset_get)):
                
                    print ("<<原文编码识别[通过]>>")
                    
                    if (chartset_get.lower() == "iso-8859-1"):
                    
                        print ("rg模式,<<原文编码iso-8859-1特殊处理")
                        try:
                            txt = html.content.decode("GBK")
                        except:
                            txt = html.content.decode("gb2312")
                            
                    else:
                    
                        print ("rg模式,按照识别的" + chartset_get + "特殊处理")
                        txt = html.content.decode(chartset_get)
                    
                else:
                
                    print ("<<原文编码识别[未通过]>>")
                    
                    txt = ""
                    
                print ("<<<rg>>>过程:"," ","原文编码:",chartset_get)
                html.close()
            
            # requests GET, returning raw bytes only
            if (type_p == 'rg_byte'):
                txt = b""
                html = requests.get(url=url_p,timeout=timeout_p,headers=headers_p)
                chartset_get = self.get_encodings_from_content(html.text) # re-detect the encoding from the page content itself
                txt = html.content
                print ("<<<rg_byte>>>过程:"," ","原文编码:",chartset_get)
                html.close()
            
            # requests POST
            if (type_p == 'rp'):
            
                conn_p = requests.session()
                rep_p = conn_p.post(url=url_p,data=dic_p,timeout=timeout_p,headers=headers_p)
                txt = rep_p.content
                chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore"))
                if (self.code_standard_is(chartset_p=chartset_get)):
                    print ("<<原文编码识别[通过]>>")
                    txt = txt.decode(chartset_get, "ignore")
                else:
                    print ("<<原文编码识别[未通过]>>")
                    code_is = chardet.detect(txt)
                    if ("encoding" in code_is):
                        chartset_get = code_is["encoding"]
                        txt = txt.decode(code_is["encoding"], "ignore")
                    
                print ("<<<rp>>>过程:"," ","原文编码:",chartset_get)
                
            # urllib GET
            if (type_p == 'ug'):

                html = urllib.request.urlopen(url=url_p)
                txt = html.read()
                chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore")) # trial decode to read the declared encoding
                if (self.code_standard_is(chartset_p=chartset_get)):
                    print ("<<原文编码识别[通过]>>")
                    txt = txt.decode(chartset_get, "ignore")
                else:
                    # fall back to encoding detection
                    print ("<< source encoding check [failed] >>")
                    code_is = chardet.detect(txt)
                    if ("encoding" in code_is):
                        chartset_get = code_is["encoding"]
                        txt = txt.decode(code_is["encoding"], "ignore")
                        
                print ("<<<ug>>>过程:"," ","原文编码:",chartset_get)
            
            # urllib POST
            if (type_p == 'up'):
                
                # urlencode the dict into usable POST data
                data_p = urllib.parse.urlencode(dic_p).encode('utf-8')
                # build a Request with our url, data and headers
                request = urllib.request.Request(url_p, data_p, headers_p)
                # send the request
                txt = urllib.request.urlopen(request).read()
                chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore")) # trial decode to read the declared encoding
                if (self.code_standard_is(chartset_p=chartset_get)):
                    print ("<<原文编码识别[通过]>>")
                    txt = txt.decode(chartset_get, "ignore")
                else:
                    # fall back to encoding detection
                    print ("<< source encoding check [failed] >>")
                    code_is = chardet.detect(txt)
                    if ("encoding" in code_is):
                        chartset_get = code_is["encoding"]
                        txt = txt.decode(code_is["encoding"], "ignore")
                        
                print ("<<<up>>>过程:"," ","原文编码:",chartset_get)

                
            # requests session GET
            if (type_p == 'ss'):
                res_addr = self.session.get(url_p, timeout=timeout_p, headers=headers_p)
                res_addr.encoding = chardet.detect(res_addr.content)["encoding"]
                txt = bs_4(res_addr.text, "lxml")
                print ("<<<ss>>>过程:"," ","原文编码:",chartset_get)

            # Selenium method (still to be finished)
            if (type_p == 'se'):
                self.driver.get(url_p)
                js = "var q=document.body.scrollTop=100000"
                self.driver.execute_script(js)
                self.driver.implicitly_wait(30)  # implicit (smart) wait: returns as soon as the page data has loaded; 30 = seconds to wait
                txt = self.driver.page_source
                chartset_get = self.get_encodings_from_content(txt)
                print ("<<<se>>>过程:"," ","原文编码:",chartset_get)

            # login method (still to be finished)
            if (type_p == 'lg'):
                print ("<<<lg>>>过程:"," ","原文编码:",chartset_get)

        except Exception as e:
            
            print("html爬虫处理失败", e)
            
            html = requests.get(url=url_p, headers=headers_p)
            chartset_get = "n/a"
            print ("爬虫的最后处理,按照默认的" + chartset_p + "编码输出")
            try:
                txt = html.content.decode(chartset_p)
            except:
                txt = html.content.decode("gbk")
                
            html.close()
            
        return txt,chartset_get # return the decoded html text together with the detected encoding
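Every branch above repeats the same detect-then-fall-back decode dance. A compact version of that pattern as a standalone helper (decode_best_effort is hypothetical and not part of the class):

import chardet


def decode_best_effort(raw, declared=None):
    # try the declared charset, then chardet's guess, then permissive utf-8
    for enc in (declared, chardet.detect(raw).get("encoding")):
        if not enc:
            continue
        try:
            return raw.decode(enc), enc
        except (LookupError, UnicodeDecodeError):
            continue
    return raw.decode("utf-8", "ignore"), "utf-8"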
Example 13
import time
import sys
import codecs
import lxml.html
import urllib2

query = 'http://www39.atwiki.jp/osakahennyu/?cmd=backup&action=source&pageid=<PLACEHOLDER>&num=0'

for line in open(sys.argv[1], 'r'):
	url = query.replace('<PLACEHOLDER>', line.rstrip())

	while True:
		try:
			html = urllib2.urlopen(url)

			code = unicode(html.read(), 'utf-8')
			dom  = lxml.html.fromstring(code)
			wiki = dom.xpath('//pre')[0]
			
			fout = codecs.open(line.rstrip() + '.txt', 'w', 'utf-8')
			fout.write(wiki.text)
			fout.close()

			html.close()
			break
			
		except urllib2.HTTPError:
			raw_input('>>> error! press continue...')

	time.sleep(1)
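The retry loop above blocks on raw_input whenever the wiki returns an HTTP error. A non-interactive variant that backs off and eventually gives up (fetch_with_retry, the attempt count and the delays are all assumptions):

import time
import urllib2


def fetch_with_retry(url, retries=3, delay=5):
    for attempt in range(retries):
        try:
            html = urllib2.urlopen(url)
            try:
                return html.read()
            finally:
                html.close()
        except urllib2.HTTPError:
            time.sleep(delay * (attempt + 1))  # back off instead of prompting
    raise RuntimeError('giving up on ' + url)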
Example 14
    def __init__(self, *args, **kwargs):
        html = None
        elements = []
        self._base_url = None
        self.parser = kwargs.get('parser', None)
        if 'parser' in kwargs:
            del kwargs['parser']

        if len(args) >= 1 and \
           (not PY3k and isinstance(args[0], basestring) or \
           (PY3k and isinstance(args[0], str))) and \
           args[0].split('://', 1)[0] in ('http', 'https'):
            kwargs['url'] = args[0]
            if len(args) >= 2:
                kwargs['data'] = args[1]
            args = []

        if 'parent' in kwargs:
            self._parent = kwargs.pop('parent')
        else:
            self._parent = no_default

        if 'css_translator' in kwargs:
            self._translator = kwargs.pop('css_translator')
        elif self.parser in ('xml',):
            self._translator = JQueryTranslator(xhtml=True)
        elif self._parent is not no_default:
            self._translator = self._parent._translator
        else:
            self._translator = JQueryTranslator(xhtml=False)

        namespaces = kwargs.get('namespaces', {})
        if 'namespaces' in kwargs:
            del kwargs['namespaces']

        if kwargs:
            # specific case to get the dom
            if 'filename' in kwargs:
                html = open(kwargs['filename'])
            elif 'url' in kwargs:
                url = kwargs.pop('url')
                if 'opener' in kwargs:
                    opener = kwargs.pop('opener')
                    html = opener(url, **kwargs)
                else:
                    html = url_opener(url, kwargs)
                if not self.parser:
                    self.parser = 'html'
                self._base_url = url
            else:
                raise ValueError('Invalid keyword arguments %s' % kwargs)

            elements = fromstring(html, self.parser)
            # close open descriptor if possible
            if hasattr(html, 'close'):
                try:
                    html.close()
                except:
                    pass

        else:
            # get nodes

            # determine context and selector if any
            selector = context = no_default
            length = len(args)
            if length == 1:
                context = args[0]
            elif length == 2:
                selector, context = args
            else:
                raise ValueError("You can't do that." +\
                        " Please, provide arguments")

            # get context
            if isinstance(context, basestring):
                try:
                    elements = fromstring(context, self.parser)
                except Exception:
                    raise
            elif isinstance(context, self.__class__):
                # copy
                elements = context[:]
            elif isinstance(context, list):
                elements = context
            elif isinstance(context, etree._Element):
                elements = [context]

            # select nodes
            if elements and selector is not no_default:
                xpath = self._css_to_xpath(selector)
                results = []
                for tag in elements:
                    results.extend(tag.xpath(xpath, namespaces=namespaces))
                elements = results

        list.__init__(self, elements)
Example 15
def getHtml(url):
    html = urllib2.urlopen(url)
    page = lxml.html.fromstring(html.read())
    html.close()

    return page
Example 16
    def __init__(self, *args, **kwargs):
        html = None
        elements = []
        self._base_url = None
        self.parser = kwargs.get('parser', None)
        if 'parser' in kwargs:
            del kwargs['parser']

        if len(args) >= 1 and \
           (not PY3k and isinstance(args[0], basestring) or \
           (PY3k and isinstance(args[0], str))) and \
           args[0].split('://', 1)[0] in ('http', 'https'):
            kwargs['url'] = args[0]
            if len(args) >= 2:
                kwargs['data'] = args[1]
            args = []

        if 'parent' in kwargs:
            self._parent = kwargs.pop('parent')
        else:
            self._parent = no_default

        if 'css_translator' in kwargs:
            self._translator = kwargs.pop('css_translator')
        elif self.parser in ('xml', ):
            self._translator = JQueryTranslator(xhtml=True)
        elif self._parent is not no_default:
            self._translator = self._parent._translator
        else:
            self._translator = JQueryTranslator(xhtml=False)

        namespaces = kwargs.get('namespaces', {})
        if 'namespaces' in kwargs:
            del kwargs['namespaces']

        if kwargs:
            # specific case to get the dom
            if 'filename' in kwargs:
                html = open(kwargs['filename'])
            elif 'url' in kwargs:
                url = kwargs.pop('url')
                if 'opener' in kwargs:
                    opener = kwargs.pop('opener')
                    html = opener(url, **kwargs)
                else:
                    html = url_opener(url, kwargs)
                if not self.parser:
                    self.parser = 'html'
                self._base_url = url
            else:
                raise ValueError('Invalid keyword arguments %s' % kwargs)

            elements = fromstring(html, self.parser)
            # close open descriptor if possible
            if hasattr(html, 'close'):
                try:
                    html.close()
                except:
                    pass

        else:
            # get nodes

            # determine context and selector if any
            selector = context = no_default
            length = len(args)
            if length == 1:
                context = args[0]
            elif length == 2:
                selector, context = args
            else:
                raise ValueError("You can't do that." +\
                        " Please, provide arguments")

            # get context
            if isinstance(context, basestring):
                try:
                    elements = fromstring(context, self.parser)
                except Exception:
                    raise
            elif isinstance(context, self.__class__):
                # copy
                elements = context[:]
            elif isinstance(context, list):
                elements = context
            elif isinstance(context, etree._Element):
                elements = [context]

            # select nodes
            if elements and selector is not no_default:
                xpath = self._css_to_xpath(selector)
                results = []
                for tag in elements:
                    results.extend(tag.xpath(xpath, namespaces=namespaces))
                elements = results

        list.__init__(self, elements)
Example 17
import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup


html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')

lib_methods = dir(html)
print('HTML Built-In Library Methods:\n', lib_methods, end='\n\n')

soup = BeautifulSoup(html, 'html.parser')
#print(type(soup.prettify()))

with open('site.html', 'w+', newline='') as f:
  html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
  mybytes = html.read()
  mystr = mybytes.decode("utf8")
  html.close()
  f.write(mystr)
f.close()

#print(type(lines))
#print(len(x))

#for line in lines:
#print(lines)

"""# Widgets"""

# Source article
# https://towardsdatascience.com/bring-your-jupyter-notebook-to-life-with-interactive-widgets-bc12e03f0916
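csv is imported above but never used; a sketch of what it was presumably meant for, reading the saved page back and dumping the first comparison table (the wikitable class and the output filename are assumptions):

import csv

from bs4 import BeautifulSoup

with open('site.html', encoding='utf8') as f:
    soup = BeautifulSoup(f, 'html.parser')

table = soup.find('table', class_='wikitable')  # first comparison table on the page
with open('editors.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    for row in table.find_all('tr'):
        cells = [c.get_text(strip=True) for c in row.find_all(['th', 'td'])]
        writer.writerow(cells)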