Example #1
    def load_async(self, time):
        # load content of loading divs.
        lst = self.document.xpath(
            '//input[@type="hidden" and starts-with(@id, "asynch")]')
        if len(lst) > 0:
            params = {}
            for i, input in enumerate(lst):
                params['key%s' % i] = input.attrib['name']
                params['div%s' % i] = input.attrib['value']
            params['time'] = time

            r = self.browser.openurl(
                self.browser.buildurl('/AsynchAjax', **params))
            data = json.load(r)

            for i, d in enumerate(data['data']):
                div = self.document.xpath('//div[@id="%s"]' % d['key'])[0]
                html = d['flux']
                div.clear()
                div.attrib['id'] = d['key']  # needed because clear() also removes all attributes
                div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

            if 'time' in data:
                sleep(float(data['time']) / 1000.0)
                return self.load_async(time)
Example #2
def resolving():
    html = etree.parse('RoomTable.html', etree.HTMLParser())
    tr_list = html.xpath("//center/table[3]")
    cell = tr_list[0].xpath("./tr/td/text() | ./tr/td/a/attribute::href")
    a = tr_list[0].xpath("./tr/td/a/attribute::*")
    print(cell)
    data = []
    tmp_obj = []
    k = 0
    for i in cell:
        content = replaceCoding(i)
        if len(content) == 0:
            continue
        k += 1
        if k == 6:
            tmp_obj.append(getCode(i))
            data.append(tmp_obj)
            tmp_obj = []
            k = 0
        else:
            tmp_obj.append(content)
    # result=html.xpath("//center/table/tr/td/a[1]/attribute::*")
    # print(result)
    sql = ""
    for i in data:
        sql += "('%s','%s',%s,'%s','%s')," % (i[1], i[2], i[3], i[4], i[5])
    sql = "insert into c_origin_data (`city`,`location`,`num`,`category`,`code`) values " + sql[
        0:-1]
    print(sql)
    cur.execute(sql)
    db.commit()
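
A minimal alternative sketch, assuming the same cur cursor and db connection (MySQLdb/pymysql-style, hence the %s paramstyle) and the data rows built above: the same insert expressed as a parameterized query, which avoids hand-building the quoted value list.

rows = [tuple(i[1:6]) for i in data]
cur.executemany(
    "insert into c_origin_data (`city`,`location`,`num`,`category`,`code`) "
    "values (%s,%s,%s,%s,%s)",  # placeholders filled by the driver, no manual quoting
    rows)
db.commit()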
Example #3
def read_data(page_count, filepath="./datas/job_python/"):
    """
    :param page_count: 需要爬取的页数
    :param filepath: 源文件存放的路径
    :return: 无返回值,在函数的最后将解析后的数据存储文件中
    """
    parser = etree.HTMLParser(encoding='utf-8')

    for i in range(1, page_count):
        html_tree = etree.parse(filepath + f"python_{i}.html", parser=parser)
        path = "//div[@class='dw_table']/div[@class='el']"
        jobs = html_tree.xpath(path)

        jobs_list = []

        for job in jobs:
            dict_job = std_job(job)
            jobs_list.append(dict_job)
            # job_title = job.xpath('./p/span/a')[0].text
            # job_company = job.xpath('./span/a')[0].text
            # job_place = job.xpath('./span[@class="t3"]')[0].text
            # job_salary = job.xpath('./span[@class="t4"]')[0].text
            # job_date = job.xpath('./span[@class="t5"]')[0].text

            # append the row to the CSV file
        # save the information parsed from this page to a CSV file
        save_csv(
            f"./handled_data/job_python_{str(datetime.datetime.now()).split(' ')[0]}.csv",
            jobs_list)
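
A hypothetical call for orientation: because the loop uses range(1, page_count), passing 6 parses python_1.html through python_5.html under the default path.

read_data(6)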
Example #4
    def load_async(self, time):
        total = 0
        restart = True
        while restart:
            restart = False

            # load content of loading divs.
            lst = self.doc.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
            if len(lst) > 0:
                params = {}
                for i, input in enumerate(lst):
                    params['key%s' % i] = input.attrib['name']
                    params['div%s' % i] = input.attrib['value']
                params['time'] = time

                r = self.browser.open('/AsynchAjax', params=params)
                data = json.loads(r.content)

                for i, d in enumerate(data['data']):
                    div = self.doc.xpath('//div[@id="%s"]' % d['key'])[0]
                    html = d['flux']
                    div.clear()
                    div.attrib['id'] = d['key'] # needed because clear() also removes all attributes
                    div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

                if 'time' in data:
                    wait = float(data['time'])/1000.0
                    self.logger.debug('should wait %f more seconds', wait)
                    total += wait
                    if total > 120:
                        raise BrowserUnavailable('too long time to wait')

                    sleep(wait)
                    restart = True
Example #5
    def on_loaded(self):
        warn = self.document.xpath('//div[@id="message_renouvellement_mot_passe"]')
        if len(warn) > 0:
            raise BrowserIncorrectPassword(warn[0].text)

        # load content of loading divs.
        divs = []
        for div in self.document.xpath('//div[starts-with(@id, "as_")]'):
            loading = div.xpath('.//span[@class="loading"]')
            if len(loading) == 0:
                continue

            input = div.xpath('.//input')[0]
            divs.append([div, input.attrib['name']])

        if len(divs) > 0:
            args = {}
            for i, (div, name) in enumerate(divs):
                args['key%s' % i] = name
                args['div%s' % i] = div.attrib['id']
            args['time'] = 0
            r = self.browser.openurl(self.browser.buildurl('/AsynchAjax', **args))
            data = json.load(r)

            for i, (div, name) in enumerate(divs):
                html = data['data'][i]['flux']
                div.clear()
                div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))
Example #6
def check_enc_fixed(url):
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print("Origignal encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}".format(chardet.detect(r.content))

    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of an encoding
    if (declared_enc and enc != declared_enc):
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            enc = declared_enc
    print "CHOOSED ENCODING: {}".format(enc)
    # if page contains any characters that differ from the main
    # encodin we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)

    text = re.sub('\s+', ' ', text)
    print text[:200]
Example #7
def get_params():
    '''get form parameters for session use'''
    r = _session.get(URL, headers={'User-Agent': _UA})
    tree = etree.fromstring(r.text, etree.HTMLParser())
    # Get all input tags
    params = {
        x.attrib['name']: x.attrib.get('value', '')
        for x in tree.xpath('.//input')
    }
    return r.text, params
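
A small usage sketch, assuming the module-level URL, _UA and _session used above; the username field name is hypothetical and stands in for whatever the real form expects.

text, params = get_params()
params['username'] = 'someone@example.com'  # hypothetical form field
resp = _session.post(URL, data=params, headers={'User-Agent': _UA})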
Example #8
def get_links(html):
    parser = etree.HTMLParser()

    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError as ex:
        return []

    if tree is None:
        return []
    links = tree.xpath('//a/@href')
    return links
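
A minimal usage sketch with an inline snippet (assumes XMLSyntaxError was imported from lxml.etree alongside etree):

sample = '<html><body><a href="https://example.com/a">a</a><a href="/b">b</a></body></html>'
print(get_links(sample))  # ['https://example.com/a', '/b']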
Example #9
def get_load_test_result(load_test_report_path):
    if os.path.exists(load_test_report_path):
        parser = etree.HTMLParser(encoding='utf-8')
        html = etree.parse(load_test_report_path, parser=parser)
        cbft_name = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[1]/td[1]/text()')[0]
        cbft_status = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[1]/td[2]/text()')[0]
        cbft_tps = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[1]/td[3]/text()')[0]
        cbft_cpu = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[1]/td[4]/text()')[0]
        cbft_memory = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[1]/td[5]/text()')[0]
        cbft_bwup = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[1]/td[6]/text()')[0]
        cbft_bwdown = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[1]/td[7]/text()')[0]
        wasm_name = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[3]/td[1]/text()')[0]
        wasm_status = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[3]/td[2]/text()')[0]
        wasm_tps = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[3]/td[3]/text()')[0]
        wasm_cpu = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[3]/td[4]/text()')[0]
        wasm_memory = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[3]/td[5]/text()')[0]
        wasm_bwup = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[3]/td[6]/text()')[0]
        wasm_bwdown = html.xpath(
            '/html/body/div/div[2]/div/table/tbody/tr[3]/td[7]/text()')[0]
        cbft = '场景:' + cbft_name + '<br>' + \
               '状态:' + cbft_status + '<br>' + \
               '每秒事务数:' + cbft_tps + '<br>' + \
               '处理器(%):' + cbft_cpu + '<br>' + \
               '内存(%):' + cbft_memory + '<br>' + \
               '带宽上行(Mb/s):' + cbft_bwup + '<br>' + \
               '带宽下行(Mb/s):' + cbft_bwdown
        wasm = '场景:' + wasm_name + '<br>' + \
               '状态:' + wasm_status + '<br>' + \
               '每秒事务数:' + wasm_tps + '<br>' + \
               '处理器(%):' + wasm_cpu + '<br>' + \
               '内存(%):' + wasm_memory + '<br>' + \
               '带宽上行(Mb/s):' + wasm_bwup + '<br>' + \
               '带宽下行(Mb/s):' + wasm_bwdown
        if cbft_status == 'success' and wasm_status == 'success':
            TestResult = 'PASS'
        else:
            TestResult = 'FAIL'
        return TestResult, cbft + '<br><br>' + wasm
    else:
        return 'ERROR', '测试被人为中断或测试代码出错,未能生成报告'
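
A more compact variant, sketched under the assumption that the report keeps this exact layout and every cell carries a text node: read a whole table row with a single XPath call instead of one call per cell.

def read_row(html, row):
    # returns [name, status, tps, cpu, memory, bwup, bwdown] for the given 1-based row
    return html.xpath(
        '/html/body/div/div[2]/div/table/tbody/tr[%d]/td/text()' % row)

cbft_row = read_row(html, 1)
wasm_row = read_row(html, 3)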
Example #10
def get_links(html):
    parser = etree.HTMLParser()
    
    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError as ex:
        log.warn('html parsing error')
        return []

    if tree is None:
        log.warn("html not parsed")
        return []
    links = tree.xpath('//a/@href')
    return links
Example #11
    def _get_text(self, remove_newlines=True):
        """ Retrieves html with provided url and parses it to fully remove
        all html tags, style declarations and scripts.

        Args:
            remove_newlines (bool): whether to clean up \n\r
                sequences or not.

        Returns:
            unicode object of the whole text without html tags

        """
        if not self.text:
            url = self.url
            try:
                self.log.debug("Try to get content from page {}".format(url))
                r = requests.get(url)
            except requests.exceptions.RequestException as e:
                self.log.warn("Unable to get page content of the url: {url}. "
                              "The reason: {exc!r}".format(url=url, exc=e))
                raise ParsingError(e.strerror)

            ud = UnicodeDammit(r.content, is_html=True)

            enc = ud.original_encoding.lower()
            declared_enc = ud.declared_html_encoding
            if declared_enc:
                declared_enc = declared_enc.lower()
            # possible misrecognition of an encoding
            if (declared_enc and enc != declared_enc):
                detect_dict = chardet.detect(r.content)
                det_conf = detect_dict["confidence"]
                det_enc = detect_dict["encoding"].lower()
                if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
                    enc = declared_enc
            # if the page contains any characters that differ from the main
            # encoding, we will ignore them
            content = r.content.decode(enc, "ignore").encode(enc)
            htmlparser = etree.HTMLParser(encoding=enc)
            root = etree.HTML(content, parser=htmlparser)
            etree.strip_elements(root, html.etree.Comment, "script", "style")
            text = html.tostring(root, method="text", encoding="unicode")

            if remove_newlines:
                self.log.debug(str(type(text)))
                text = re.sub('\s+', ' ', text)
            self.text = text

        return self.text
Example #12
def checkdocument(url):
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print("Origignal encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}".format(chardet.detect(r.content))
    content = ud.unicode_markup.encode(ud.original_encoding, "ignore")
    root = etree.HTML(content,
                      parser=etree.HTMLParser(encoding=ud.original_encoding))
    lxml.html.etree.strip_elements(root, lxml.etree.Comment, "script", "style")
    text = lxml.html.tostring(root, method="text", encoding="utf-8")
    text = re.sub('\s+', ' ', text)
    print text[:200]
Example #13
    def filter(
        self,
        html: str,
        inline: bool = False,
        outgoing: bool = False,
        display_name_mentions: Optional[Dict[str, str]] = None,
    ) -> str:
        """Filter and return HTML."""

        mentions = display_name_mentions

        sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
        html = sanit.sanitize(html).rstrip("\n")

        if not html.strip():
            return html

        tree = etree.fromstring(
            html,
            parser=etree.HTMLParser(encoding="utf-8"),
        )

        for a_tag in tree.iterdescendants("a"):
            self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)

            if not outgoing:
                self._matrix_to_links_add_classes(a_tag)

        html = etree.tostring(tree, encoding="utf-8", method="html").decode()
        html = sanit.sanitize(html).rstrip("\n")

        if outgoing:
            return html

        # Client-side modifications

        html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

        if not inline:
            return html

        return self.inline_quote_regex.sub(
            r'\1<span class="quote">\2</span>',
            html,
        )
Example #14
def get_congress(cong):
    params = urllib.urlencode({'congress': cong})
    results = urllib.urlopen('http://bioguide.congress.gov/biosearch/biosearch1.asp', params)
    page = etree.parse(StringIO.StringIO(results.read()), etree.HTMLParser())
    nas = 1
    for member in page.xpath("//table")[1].xpath("tr")[1:]:
        name = member.xpath("td/a/text()")
        print name
        if len(name) == 0:
            name = ""
            print nas
            nas += 1
            continue        
        else:
            name = name[0]
            pid = member.xpath("td/a/@href")[0].split("=")[1]
        stats = member.xpath("td/text()")
        c.execute('''INSERT OR IGNORE INTO terms (pid, name, dates, position, party, state, congress)
            VALUES (?,?,?,?,?,?,?)''', (pid, name, stats[0], stats[1][0], stats[2], stats[3], int(stats[4])))        
    conn.commit()
Example #15
def parse_detail_page(detail_page_source):
    '''
    Parse the detail page: extract the font-file id (e.g. /2d/2df8ui) and the company's approval date.
    Problem 1: how do we avoid re-parsing font files that were already handled? If a font id has been
    seen before, use its stored mapping dict directly instead of running image recognition again;
    only unseen font ids need recognition.
    Approach: keep a dict of every parsed font id and, whenever a font id is obtained later, check
    whether the dict already contains it; if so, reuse the stored mapping, e.g.
    {'2df8ui': {'1': '2'}, 'b64s4f': {'1': '3'}}
    Should the mapping for every id be kept, or only the latest one?
    Keep them all, so an id seen before never has to be re-parsed.
    :param detail_page_source: taken from the tuple returned by get_detail_page()
    :return:
    '''
    obj = etree.HTML(detail_page_source,
                     parser=etree.HTMLParser(encoding='utf8'))
    # extract the font id from the detail page
    pattern = re.compile(
        r'<head>.*?<base.*?<link.*?href="https://static\.tianyancha\.com/fonts-styles/css/(.*?)/font\.css">',
        re.S)
    font_id = re.search(pattern, detail_page_source).group(1)
    # after obtaining font_id, check whether Map_Dict already has this key; only call parse_map_rule() when it does not
    if font_id not in Map_Dict:
        map_dic = parse_map_rule(font_id)
    else:
        map_dic = Map_Dict[font_id]
    print(map_dic)
    exit()
    # convert the approval date using the mapping dict
    try:
        hezhunriqi = obj.xpath(
            '//td[@colspan="2"]/text[contains(@class,"tyc-num")]/text()')[0]
        name = obj.xpath('//h1[@class="name"]/text()')[0]
        result = ''
        for char in hezhunriqi:
            if char != '-':
                real_num = map_dic[char]
                result += real_num
            else:
                result += '-'
        print(name, result)
    except:
        pass
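
The docstring above describes caching one mapping dict per font id, but the visible snippet only ever reads Map_Dict. A minimal sketch of the write side, assuming parse_map_rule(font_id) returns the mapping dict for that font:

if font_id not in Map_Dict:
    Map_Dict[font_id] = parse_map_rule(font_id)  # recognize once, reuse on later pages
map_dic = Map_Dict[font_id]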
Example #16
def auth(user, passwd):
    os = 'Linux'
    useragent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1'
    AuthCheckURL = 'http://google.com/'

    parser = etree.HTMLParser()

    config = ConfigParser.SafeConfigParser({
        'os': os,
        'useragent': useragent,
        'debug': 'False'
    })
    config.read('unCleanAccess.cfg')
    if not config.has_section('login'):
        config.add_section('login')

    debug = config.get('login', 'debug') in ('True', 'true')

    print 'Checking if Authenticated'
    if not debug:
        responseAuthCheck = urllib2.urlopen(AuthCheckURL)
        AuthCheckhtml = responseAuthCheck.read()
    else:
        f = open('unCleanAuthCheckunauthed.html', 'r')
        AuthCheckhtml = f.read()
        f.close()

    if AuthCheckhtml.find('/auth/perfigo_weblogin.jsp') != -1:
        print 'Not Authenticated Yet'
        urlSplit = AuthCheckhtml.split('URL=')
        if len(urlSplit) != 2:
            print 'Error extracting redirect URL (1)'
        else:
            urlSplit = re.split("'>|;", urlSplit[1])
            if len(urlSplit) < 2:
                print 'Error extracting redirect URL (2)'
            else:
                print 'Fetching Login Page'
                if not debug:
                    responseAuthPage = urllib2.urlopen(urlSplit[0])
                    AuthPagehtml = etree.parse(responseAuthPage, parser)
                else:
                    f = open('authPage.html', 'r')
                    AuthPagehtml = etree.parse(f, parser)
                    f.close()

                print 'Parsing Login Page'
                POSTDataItems = dict()
                for formInput in AuthPagehtml.xpath(
                        ".//form[@name='loginform']//input"):
                    if formInput.get('name'):
                        POSTDataItems[formInput.get('name')] = formInput.get(
                            'value')

                POSTDataItems['pm'] = config.get('login', 'os')
                POSTDataItems['username'] = user
                POSTDataItems['password'] = passwd

                authData = urllib.urlencode(POSTDataItems)
                authHeaders = {
                    'Referer': urlSplit[0].split('perfigo_weblogin.jsp', 1)[0],
                    'User-Agent': config.get('login', 'useragent')
                }

                print 'Logging in'
                authReq = urllib2.Request(
                    urlSplit[0].split('auth/perfigo_weblogin.jsp', 1)[0] +
                    AuthPagehtml.xpath(".//form[@name='loginform']")[0].get(
                        'action').split('/', 1)[1], authData, authHeaders)
                responseAuthReq = urllib2.urlopen(authReq)
                authReqhtml = responseAuthReq.read()

                if authReqhtml.find(
                        'You have been successfully logged on the network'
                ) != -1:
                    print 'Successfully Authenticated!'
                else:
                    print 'Invalid credentials'
                    (userName, password) = setCreds(regKeyVal)
                    auth(userName, password)

    else:
        print 'Already Authenticated'
Example #17
import requests
from lxml.html import etree

headers = {
    "cookie":
    "__cfduid=dfa5a44a56e1f4818da6dc1c0442d32e61555031717; _"
    "ga=GA1.2.446599568.1555031722; trc_cookie_storage=taboola%2520global%253Auser-id%3Df47e0355-c5e3-4ac8-8d9c-69e65b8be1c0-tuct3a468dd; "
    "ShowSubtitleDetails=true; ShowSubtitlePreview=true; "
    "HearingImpaired=2; ForeignOnly=False; _gid=GA1.2.1534139390.1556500043; LanguageFilter=28; "
    "cookieconsent_dismissed=yes; cf_clearance=5c22147cf3e89737a1f9ac602ed6b8491cc6bc33-1556588618-31536000-150",
    "pragma":
    "no-cache",
    "upgrade-insecure-requests":
    "1",
    "user-agent":
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}
# for i in range(2, 51):
url = "https://subscene.com/browse/popular/all/2"
session = requests.session()
resp = session.get(url=url, headers=headers)

e = etree.HTML(resp.text, etree.HTMLParser(encoding='utf8'))
result = e.xpath(
    '//div[@class="content clearfix"]/table/tbody/tr/td[@class="a1"]/a/@href')
result = ["https://subscene.com" + short_url for short_url in result]
print('\n'.join(result))
# with open("url.txt", 'a', encoding='utf8')as f:
#     f.write('\n'.join(result))
Example #18
# lxml usage: extract data with XPath expressions; CSS-selector syntax is also supported.

# etree: ElementTree document tree; turns HTML source code into a document-tree object.
from lxml.html import etree

html = """
<a class="one" id="first" href="xxx">百度一下</a>
<a class="two three four" href="==="><p>111</p>a标签内部的内容。<img src="---"></a>
<div>
    <span>五险一金</span>
    <span>双休</span>
    <span>餐补</span>
</div>
"""
# parser: the parser to use.
obj = etree.HTML(html, parser=etree.HTMLParser(encoding='utf8'))
# <class 'lxml.etree._Element'>
print(type(obj))
# a[@class="one"]:查找class="one"的a标签,[@class="xx"]固定用法
# //:表示从html源码中的任意位置查找标签。
# 提取标签属性的写法
# /@href /@src /@id /@class
content = obj.xpath('//a[@class="one"]/@href')
print(content)

# syntax for extracting a tag's text content
text = obj.xpath('//a[@class="one"]/text()')
print(text)

# To extract several things from one tag (attributes, text, ...), the style above is usually avoided because too much of the XPath gets repeated. Locate the tag first, then extract each piece from that element separately (continued below).
a = obj.xpath('//a[@class="one"]')[0]
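# A small continuation sketch of the comment above: relative XPath ('./...') now runs
# against the already-located element instead of repeating the full path.
print(a.xpath('./@href'))   # ['xxx']
print(a.xpath('./text()'))  # ['百度一下']
# attributes are also available directly through the element API
print(a.get('id'))          # 'first'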
Example #19
    html = driver.page_source.encode('utf-8')
    page_num = 0

    # NOTE: the bare `break` below short-circuits the loop, so the "load more" link is never clicked
    while driver.find_element_by_xpath('//*[@class="load_more mt3"]/a'):
        break
        driver.find_element_by_xpath('//*[@class="load_more mt3"]/a').click()
        page_num += 1
        print("getting page number " + str(page_num))
        time.sleep(1)
        if page_num == 1:
            break

    return driver.page_source.encode('utf-8')


response = getMoreRequests(
    'https://www.kickstarter.com/discover/advanced?category_id=16')

htmlparser = etree.HTMLParser()
# response is a byte string, so parse it with fromstring (etree.parse expects a file or filename)
tree = etree.fromstring(response, parser=htmlparser)

soup = BeautifulSoup(response, 'html.parser')
# projects_grid = soup.find('div', id="projects")
projects = soup.find_all(
    'div',
    {"class": "js-react-proj-card grid-col-12 grid-col-6-sm grid-col-4-lg"})

for project in projects:
    print(project['data-projects'])

## '//div[contains(@data-project)]/@data-project').getall()
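
For reference, an lxml counterpart of the selector noted in the comment above, sketched against the tree parsed earlier (the attribute name is taken from that comment):

data_projects = tree.xpath('//div[@data-project]/@data-project')
print(data_projects)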