def load_async(self, time):
    # load content of loading divs.
    lst = self.document.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
    if len(lst) > 0:
        params = {}
        for i, input in enumerate(lst):
            params['key%s' % i] = input.attrib['name']
            params['div%s' % i] = input.attrib['value']
        params['time'] = time

        r = self.browser.openurl(self.browser.buildurl('/AsynchAjax', **params))
        data = json.load(r)

        for i, d in enumerate(data['data']):
            div = self.document.xpath('//div[@id="%s"]' % d['key'])[0]
            html = d['flux']
            div.clear()
            div.attrib['id'] = d['key']  # needed because clear() also removes all attributes
            div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

        if 'time' in data:
            sleep(float(data['time']) / 1000.0)
            return self.load_async(time)
def resolving():
    html = etree.parse('RoomTable.html', etree.HTMLParser())
    tr_list = html.xpath("//center/table[3]")
    cell = tr_list[0].xpath("./tr/td/text() | ./tr/td/a/attribute::href")
    a = tr_list[0].xpath("./tr/td/a/attribute::*")
    print(cell)

    data = []
    tmp_obj = []
    k = 0
    for i in cell:
        content = replaceCoding(i)
        if len(content) == 0:
            continue
        k += 1
        if k == 6:
            tmp_obj.append(getCode(i))
            data.append(tmp_obj)
            tmp_obj = []
            k = 0
        else:
            tmp_obj.append(content)

    # result = html.xpath("//center/table/tr/td/a[1]/attribute::*")
    # print(result)
    sql = ""
    for i in data:
        sql += "('%s','%s',%s,'%s','%s')," % (i[1], i[2], i[3], i[4], i[5])
    sql = "insert into c_origin_data (`city`,`location`,`num`,`category`,`code`) values " + sql[0:-1]
    print(sql)
    cur.execute(sql)
    db.commit()
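# A minimal alternative sketch, not taken from the snippet above: building the INSERT by
# string interpolation breaks if any parsed cell contains a quote, so the same bulk insert
# can instead use driver placeholders. It assumes a MySQL-style DB-API cursor such as the
# `cur`/`db` pair used above (pymysql/MySQLdb, hence %s placeholders) and the same row
# layout as `data`.
def insert_rows(cur, db, data):
    sql = ("insert into c_origin_data "
           "(`city`,`location`,`num`,`category`,`code`) "
           "values (%s, %s, %s, %s, %s)")
    # rows in `data` keep the layout built above: indexes 1-5 hold city, location, num, category, code
    cur.executemany(sql, [(i[1], i[2], i[3], i[4], i[5]) for i in data])
    db.commit()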
def read_data(page_count, filepath="./datas/job_python/"):
    """
    :param page_count: number of pages to crawl
    :param filepath: directory where the source HTML files are stored
    :return: nothing; the parsed data is written to a csv file at the end
    """
    parser = etree.HTMLParser(encoding='utf-8')
    for i in range(1, page_count):
        html_tree = etree.parse(filepath + f"python_{i}.html", parser=parser)
        path = "//div[@class='dw_table']/div[@class='el']"
        jobs = html_tree.xpath(path)
        jobs_list = []
        for job in jobs:
            dict_job = std_job(job)
            jobs_list.append(dict_job)
            # job_title = job.xpath('./p/span/a')[0].text
            # job_company = job.xpath('./span/a')[0].text
            # job_place = job.xpath('./span[@class="t3"]')[0].text
            # job_salary = job.xpath('./span[@class="t4"]')[0].text
            # job_date = job.xpath('./span[@class="t5"]')[0].text
        # save the jobs parsed from this page to a csv file
        save_csv(
            f"./handled_data/job_python_{str(datetime.datetime.now()).split(' ')[0]}.csv",
            jobs_list)
def load_async(self, time):
    total = 0
    restart = True
    while restart:
        restart = False

        # load content of loading divs.
        lst = self.doc.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
        if len(lst) > 0:
            params = {}
            for i, input in enumerate(lst):
                params['key%s' % i] = input.attrib['name']
                params['div%s' % i] = input.attrib['value']
            params['time'] = time

            r = self.browser.open('/AsynchAjax', params=params)
            data = json.loads(r.content)

            for i, d in enumerate(data['data']):
                div = self.doc.xpath('//div[@id="%s"]' % d['key'])[0]
                html = d['flux']
                div.clear()
                div.attrib['id'] = d['key']  # needed because clear() also removes all attributes
                div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

            if 'time' in data:
                wait = float(data['time']) / 1000.0
                self.logger.debug('should wait %f more seconds', wait)
                total += wait
                if total > 120:
                    raise BrowserUnavailable('too long time to wait')

                sleep(wait)
                restart = True
def on_loaded(self):
    warn = self.document.xpath('//div[@id="message_renouvellement_mot_passe"]')
    if len(warn) > 0:
        raise BrowserIncorrectPassword(warn[0].text)

    # load content of loading divs.
    divs = []
    for div in self.document.xpath('//div[starts-with(@id, "as_")]'):
        loading = div.xpath('.//span[@class="loading"]')
        if len(loading) == 0:
            continue
        input = div.xpath('.//input')[0]
        divs.append([div, input.attrib['name']])

    if len(divs) > 0:
        args = {}
        for i, (div, name) in enumerate(divs):
            args['key%s' % i] = name
            args['div%s' % i] = div.attrib['id']
        args['time'] = 0
        r = self.browser.openurl(self.browser.buildurl('/AsynchAjax', **args))
        data = json.load(r)
        for i, (div, name) in enumerate(divs):
            html = data['data'][i]['flux']
            div.clear()
            div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))
def check_enc_fixed(url):
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print("Original encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}".format(chardet.detect(r.content))

    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of an encoding
    if declared_enc and enc != declared_enc:
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            enc = declared_enc
    print "CHOSEN ENCODING: {}".format(enc)

    # if the page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)
    text = re.sub('\s+', ' ', text)
    print text[:200]
def get_params():
    '''get form parameters for session use'''
    r = _session.get(URL, headers={'User-Agent': _UA})
    tree = etree.fromstring(r.text, etree.HTMLParser())
    # Get all input tags
    params = {
        x.attrib['name']: x.attrib.get('value', '')
        for x in tree.xpath('.//input')
    }
    return r.text, params
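# A minimal usage sketch, not part of the snippet above: fetch the form, keep the hidden
# input values collected by get_params(), and submit the filled form back over the same
# session. `_session` and `URL` are the module-level names used above; the
# 'username'/'password' field names are hypothetical and depend on the real form.
def login(user, password):
    _, params = get_params()
    params.update({'username': user, 'password': password})
    return _session.post(URL, data=params)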
def get_links(html):
    parser = etree.HTMLParser()
    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError as ex:
        return []
    if tree is None:
        return []
    links = tree.xpath('//a/@href')
    return links
def get_load_test_result(load_test_report_path):
    if not os.path.exists(load_test_report_path):
        return 'ERROR', '测试被人为中断或测试代码出错,未能生成报告'

    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse(load_test_report_path, parser=parser)

    def read_row(row):
        # one table row = name, status, tps, cpu, memory, bandwidth up, bandwidth down
        base = '/html/body/div/div[2]/div/table/tbody/tr[%d]/td[%d]/text()'
        return [html.xpath(base % (row, col))[0] for col in range(1, 8)]

    def format_row(values):
        labels = ['场景:', '状态:', '每秒事务数:', '处理器(%):', '内存(%):',
                  '带宽上行(Mb/s):', '带宽下行(Mb/s):']
        return '<br>'.join(label + value for label, value in zip(labels, values))

    cbft = read_row(1)  # tr[1]: cbft scenario
    wasm = read_row(3)  # tr[3]: wasm scenario

    if cbft[1] == 'success' and wasm[1] == 'success':
        TestResult = 'PASS'
    else:
        TestResult = 'FAIL'
    return TestResult, format_row(cbft) + '<br><br>' + format_row(wasm)
def get_links(html):
    parser = etree.HTMLParser()
    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError as ex:
        log.warn('html parsing error')
        return []
    if tree is None:
        log.warn("html not parsed")
        return []
    links = tree.xpath('//a/@href')
    return links
def _get_text(self, remove_newlines=True):
    """
    Retrieves html with the provided url and parses it to fully remove all
    html tags, style declarations and scripts.

    Args:
        remove_newlines (bool): whether to collapse \n\r sequences or not.

    Returns:
        unicode object of the whole text without html tags
    """
    if not self.text:
        url = self.url
        try:
            self.log.debug("Try to get content from page {}".format(url))
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            self.log.warn("Unable to get page content of the url: {url}. "
                          "The reason: {exc!r}".format(url=url, exc=e))
            raise ParsingError(e.strerror)

        ud = UnicodeDammit(r.content, is_html=True)

        enc = ud.original_encoding.lower()
        declared_enc = ud.declared_html_encoding
        if declared_enc:
            declared_enc = declared_enc.lower()
        # possible misrecognition of an encoding
        if declared_enc and enc != declared_enc:
            detect_dict = chardet.detect(r.content)
            det_conf = detect_dict["confidence"]
            det_enc = detect_dict["encoding"].lower()
            if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
                enc = declared_enc

        # if the page contains any characters that differ from the main
        # encoding we will ignore them
        content = r.content.decode(enc, "ignore").encode(enc)
        htmlparser = etree.HTMLParser(encoding=enc)
        root = etree.HTML(content, parser=htmlparser)
        etree.strip_elements(root, html.etree.Comment, "script", "style")
        text = html.tostring(root, method="text", encoding="unicode")
        if remove_newlines:
            self.log.debug(str(type(text)))
            text = re.sub('\s+', ' ', text)
        self.text = text
    return self.text
def checkdocument(url):
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print("Original encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}".format(chardet.detect(r.content))

    content = ud.unicode_markup.encode(ud.original_encoding, "ignore")
    root = etree.HTML(content, parser=etree.HTMLParser(encoding=ud.original_encoding))
    lxml.html.etree.strip_elements(root, lxml.etree.Comment, "script", "style")
    text = lxml.html.tostring(root, method="text", encoding="utf-8")
    text = re.sub('\s+', ' ', text)
    print text[:200]
def filter(
    self,
    html: str,
    inline: bool = False,
    outgoing: bool = False,
    display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
    """Filter and return HTML."""
    mentions = display_name_mentions

    sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
    html = sanit.sanitize(html).rstrip("\n")

    if not html.strip():
        return html

    tree = etree.fromstring(
        html, parser=etree.HTMLParser(encoding="utf-8"),
    )

    for a_tag in tree.iterdescendants("a"):
        self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)

        if not outgoing:
            self._matrix_to_links_add_classes(a_tag)

    html = etree.tostring(tree, encoding="utf-8", method="html").decode()
    html = sanit.sanitize(html).rstrip("\n")

    if outgoing:
        return html

    # Client-side modifications
    html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

    if not inline:
        return html

    return self.inline_quote_regex.sub(
        r'\1<span class="quote">\2</span>', html,
    )
def get_congress(cong):
    params = urllib.urlencode({'congress': cong})
    results = urllib.urlopen('http://bioguide.congress.gov/biosearch/biosearch1.asp', params)
    page = etree.parse(StringIO.StringIO(results.read()), etree.HTMLParser())
    nas = 1
    for member in page.xpath("//table")[1].xpath("tr")[1:]:
        name = member.xpath("td/a/text()")
        print name
        if len(name) == 0:
            name = ""
            print nas
            nas += 1
            continue
        else:
            name = name[0]
        pid = member.xpath("td/a/@href")[0].split("=")[1]
        stats = member.xpath("td/text()")
        c.execute('''INSERT OR IGNORE INTO terms
                     (pid, name, dates, position, party, state, congress)
                     VALUES (?,?,?,?,?,?,?)''',
                  (pid, name, stats[0], stats[1][0], stats[2], stats[3], int(stats[4])))
    conn.commit()
def parse_detail_page(detail_page_source):
    '''
    Parse the detail page, extract the id of the font file (e.g. /2d/2df8ui), and extract
    the company's approval date.

    Question 1: how do we avoid re-parsing a font file that has already been parsed? If it
    was parsed before, use the cached mapping dict directly instead of running the image
    recognition again; only unseen font files go through image recognition.
    Approach: keep a dict that stores the mapping for every parsed font id. When a font id
    is obtained later, check whether it already exists in the dict; if so, reuse the stored
    mapping, e.g. {'2df8ui': {'1': '2'}, 'b64s4f': {'1': '3'}}.
    Keep the mapping for every id, or only the latest one? Keep them all, so an id seen
    before never has to be parsed again.

    :param detail_page_source: page source from get_detail_page().
    :return:
    '''
    obj = etree.HTML(detail_page_source, parser=etree.HTMLParser(encoding='utf8'))

    # extract the font id from the detail page
    pattern = re.compile(
        r'<head>.*?<base.*?<link.*?href="https://static\.tianyancha\.com/fonts-styles/css/(.*?)/font\.css">',
        re.S)
    font_id = re.search(pattern, detail_page_source).group(1)

    # after getting font_id, first check whether the key exists in Map_Dict;
    # only call parse_map_rule() when it does not
    if font_id not in Map_Dict:
        map_dic = parse_map_rule(font_id)
    else:
        map_dic = Map_Dict[font_id]
    print(map_dic)
    exit()  # debugging stop left in the original; the date conversion below is never reached

    # convert the approval date using the mapping dict
    try:
        hezhunriqi = obj.xpath(
            '//td[@colspan="2"]/text[contains(@class,"tyc-num")]/text()')[0]
        name = obj.xpath('//h1[@class="name"]/text()')[0]
        result = ''
        for char in hezhunriqi:
            if char != '-':
                real_num = map_dic[char]
                result += real_num
            else:
                result += '-'
        print(name, result)
    except:
        pass
def auth(user, passwd):
    os = 'Linux'
    useragent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1'
    AuthCheckURL = 'http://google.com/'
    parser = etree.HTMLParser()
    config = ConfigParser.SafeConfigParser({
        'os': os,
        'useragent': useragent,
        'debug': 'False'
    })
    config.read('unCleanAccess.cfg')
    if not config.has_section('login'):
        config.add_section('login')
    debug = config.get('login', 'debug') in ('True', 'true')

    print 'Checking if Authenticated'
    if not debug:
        responseAuthCheck = urllib2.urlopen(AuthCheckURL)
        AuthCheckhtml = responseAuthCheck.read()
    else:
        f = open('unCleanAuthCheckunauthed.html', 'r')
        AuthCheckhtml = f.read()
        f.close()

    if AuthCheckhtml.find('/auth/perfigo_weblogin.jsp') != -1:
        print 'Not Authenticated Yet'
        urlSplit = AuthCheckhtml.split('URL=')
        if len(urlSplit) != 2:
            print 'Error extracting redirect URL (1)'
        else:
            urlSplit = re.split("'>|;", urlSplit[1])
            if len(urlSplit) < 2:
                print 'Error extracting redirect URL (2)'
            else:
                print 'Fetching Login Page'
                if not debug:
                    responseAuthPage = urllib2.urlopen(urlSplit[0])
                    AuthPagehtml = etree.parse(responseAuthPage, parser)
                else:
                    f = open('authPage.html', 'r')
                    AuthPagehtml = etree.parse(f, parser)
                    f.close()

                print 'Parsing Login Page'
                POSTDataItems = dict()
                for formInput in AuthPagehtml.xpath(".//form[@name='loginform']//input"):
                    if formInput.get('name'):
                        POSTDataItems[formInput.get('name')] = formInput.get('value')
                POSTDataItems['pm'] = config.get('login', 'os')
                POSTDataItems['username'] = user
                POSTDataItems['password'] = passwd
                authData = urllib.urlencode(POSTDataItems)
                authHeaders = {
                    'Referer': urlSplit[0].split('perfigo_weblogin.jsp', 1)[0],
                    'User-Agent': config.get('login', 'useragent')
                }

                print 'Logging in'
                authReq = urllib2.Request(
                    urlSplit[0].split('auth/perfigo_weblogin.jsp', 1)[0] +
                    AuthPagehtml.xpath(".//form[@name='loginform']")[0].get('action').split('/', 1)[1],
                    authData, authHeaders)
                responseAuthReq = urllib2.urlopen(authReq)
                authReqhtml = responseAuthReq.read()
                if authReqhtml.find('You have been successfully logged on the network') != -1:
                    print 'Successfully Authenticated!'
                else:
                    print 'Invalid credentials'
                    (userName, password) = setCreds(regKeyVal)
                    auth(userName, password)
    else:
        print 'Already Authenticated'
import requests
from lxml.html import etree

headers = {
    "cookie": "__cfduid=dfa5a44a56e1f4818da6dc1c0442d32e61555031717; _"
              "ga=GA1.2.446599568.1555031722; trc_cookie_storage=taboola%2520global%253Auser-id%3Df47e0355-c5e3-4ac8-8d9c-69e65b8be1c0-tuct3a468dd; "
              "ShowSubtitleDetails=true; ShowSubtitlePreview=true; "
              "HearingImpaired=2; ForeignOnly=False; _gid=GA1.2.1534139390.1556500043; LanguageFilter=28; "
              "cookieconsent_dismissed=yes; cf_clearance=5c22147cf3e89737a1f9ac602ed6b8491cc6bc33-1556588618-31536000-150",
    "pragma": "no-cache",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}

# for i in range(2, 51):
url = "https://subscene.com/browse/popular/all/2"
session = requests.session()
resp = session.get(url=url, headers=headers)
e = etree.HTML(resp.text, etree.HTMLParser(encoding='utf8'))
result = e.xpath(
    '//div[@class="content clearfix"]/table/tbody/tr/td[@class="a1"]/a/@href')
result = ["https://subscene.com" + short_url for short_url in result]
print('\n'.join(result))
# with open("url.txt", 'a', encoding='utf8') as f:
#     f.write('\n'.join(result))
# lxml usage: extract data with xpath expressions; CSS selector syntax is also supported.
# etree: element tree; converts HTML source code into a document tree object.
from lxml.html import etree

html = """
<a class="one" id="first" href="xxx">百度一下</a>
<a class="two three four" href="==="><p>111</p>a标签内部的内容。<img src="---"></a>
<div>
    <span>五险一金</span>
    <span>双休</span>
    <span>餐补</span>
</div>
"""

# parser: the parser to use.
obj = etree.HTML(html, parser=etree.HTMLParser(encoding='utf8'))
# <class 'lxml.etree._Element'>
print(type(obj))

# a[@class="one"]: find the <a> tag with class="one"; [@class="xx"] is the fixed syntax for this.
# //: search for the tag anywhere in the HTML source.

# extracting a tag attribute
# /@href  /@src  /@id  /@class
content = obj.xpath('//a[@class="one"]/@href')
print(content)

# extracting the text content of a tag
text = obj.xpath('//a[@class="one"]/text()')
print(text)

# To extract several things from one tag (attributes, text, ...), the style above is usually
# not used, because too much of the xpath path gets repeated. Locate the tag first, then
# extract what you need from that element separately (see the sketch below).
a = obj.xpath('//a[@class="one"]')[0]
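# A minimal continuation sketch, not part of the original tutorial: once the element `a`
# is located, relative xpath expressions starting with "./" pull out its pieces without
# repeating the full path.
href = a.xpath('./@href')[0]   # 'xxx'
text = a.xpath('./text()')[0]  # '百度一下'
print(href, text)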
# function header reconstructed from the call below; the original snippet started mid-function
def getMoreRequests(url):
    html = driver.page_source.encode('utf-8')
    page_num = 0
    while driver.find_element_by_xpath('//*[@class="load_more mt3"]/a'):
        break  # present in the original; it short-circuits the load-more loop below
        driver.find_element_by_xpath('//*[@class="load_more mt3"]/a').click()
        page_num += 1
        print("getting page number " + str(page_num))
        time.sleep(1)
        if page_num == 1:
            break
    return driver.page_source.encode('utf-8')


response = getMoreRequests(
    'https://www.kickstarter.com/discover/advanced?category_id=16')
htmlparser = etree.HTMLParser()
tree = etree.fromstring(response, htmlparser)  # fromstring: `response` is an HTML string, not a file path
soup = BeautifulSoup(response, 'html.parser')
# projects_grid = soup.find('div', id="projects")
projects = soup.find_all(
    'div',
    {"class": "js-react-proj-card grid-col-12 grid-col-6-sm grid-col-4-lg"})
for project in projects:
    print(project['data-projects'])
# '//div[contains(@data-project)]/@data-project').getall()