def parse_page(self, district: str, page: int = 1) -> None:
    """Parse every listing page for *district*, starting at *page*.

    For each page the flats are parsed, progress is saved, and iteration
    continues while a pagination link for the next page number exists.
    A 302 response or an empty flat list is treated as a temporary
    failure: wait 10 seconds and retry the same page.

    :param district: Given district
    :param page: Page number to start from
    """
    # Iterative rewrite of the original self-recursion: retrying failed
    # pages recursively grew the call stack without bound.
    while True:
        print("Parse", self.domain, district, page)
        r = self.get(district=district, page=page)
        html = HTML(html=r.content)
        flats = html.xpath("//div[@data-name='LinkArea']")
        for flat in flats:
            self.parse_flat(flat)
        if r.status_code == 302 or not flats:
            # Redirect or empty page: back off and retry the same page.
            print(r.status_code, "Failed page")
            sleep(10)
            continue
        page += 1
        if not html.xpath(
            f'//div[@data-name="Pagination"]//ul//li//a[text()="{page}"]'
        ):
            return  # no link to the next page: last page reached
        self.save_parsed(district, page)
async def test_get_poll(http_client, choices_fixtures):
    """GET /poll/1 responds 200 and renders the three choice labels."""
    response = await http_client.get('/poll/1')
    assert response.status == 200
    page = HTML(html=await response.text())
    expected_labels = {
        'choice1': 'Not much',
        'choice2': 'The sky',
        'choice3': 'Just hacking again',
    }
    for choice_id, label in expected_labels.items():
        node = page.xpath(f"//label[@for='{choice_id}']", first=True)
        assert node.text == label
def html_parsing_chromium(fp=r'utils/commands.py'):
    """Regenerate *fp* with a ``Chromium`` constants class parsed from the
    saved copy of Peter Beverloo's switch-list page.

    :param fp: path of the generated Python module
    """
    p = """# Licensed to the White Turing under one or more
# contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The SFC licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

'''List of Chromium Command Line Switches.'''


class Chromium(object):
    '''Frequently used commands mappings.

    There are lots of command lines which can be used with the Google Chrome browser.
    Some change behavior of features, others are for debugging or experimenting.
    This page lists the available switches including their conditions and descriptions.
    Last update occurred on 2018-06-08 from `https://peter.sh/experiments/chromium-command-line-switches/`.
    '''
"""
    with open(
            r'html/List of Chromium Command Line Switches « Peter Beverloo.html',
            encoding='utf-8') as f:
        doc = f.read()
    html = HTML(html=doc)
    condition = html.xpath('//tr/@id')
    explanation = html.xpath('//tr/td[2]/text()')
    # Fix: the original removed the file (raising if absent) and then
    # leaked one append-mode handle per printed line; a single 'w'-mode
    # context manager replaces remove()+append and closes deterministically.
    with open(fp, 'w', encoding='utf-8') as out:
        print(p, file=out)
        for i, j in zip(condition, explanation):
            # Switch id -> UPPER_SNAKE_CASE constant name.
            k = i.split(' ')[0].replace("'", '').replace('-', '_').replace('.', '_').upper()
            j = j.replace('\n', '')
            if len(k) < 1 or not k[0].isalpha():
                continue
            print(f'    {k} = {i.strip()!r}  # {j.strip()}', file=out)
def parse_edb_cve(self, url, item, html):
    """Parse one exploit-db detail page and store it as an EdbRecord.

    :param url: URL of the detail page
    :param item: tuple (raw_id, title, author, type, platform, rport, published)
    :param html: raw HTML of the detail page
    """
    edb_html = HTML(html=html)
    raw_id, edb_title, edb_author, edb_type, edb_platform, edb_rport, edb_published = item
    edb_id = "EDB-{}".format(raw_id)
    edb_url = url
    edb_verified = get_val(edb_html.xpath(element_xpath['edb_verified']))
    # Bug fix: edb_cve was unbound (NameError at EdbRecord(...)) when the
    # xpath matched nothing without raising; default it up front.
    edb_cve = 'N/A'
    try:
        edb_cve_num = [
            i.strip() for i in edb_html.xpath(element_xpath['edb_cve'])
        ]
        if edb_cve_num:
            edb_cve = ','.join(
                "CVE-{}".format(cve_id) for cve_id in edb_cve_num)
            tqdm.write("Detected {} <--> {}".format(edb_id, edb_cve))
    except Exception:
        edb_cve = 'N/A'
    # The verified flag is rendered as an icon; 'mdi-close' means failed.
    if 'mdi-close' in edb_verified:
        edb_verified = 'Unverified'
    else:
        edb_verified = 'Verified'
    edb_exploit_raw_url = 'https://www.exploit-db.com/raw/{}'.format(
        raw_id)
    edb_vulnerable_app_url = get_val(
        edb_html.xpath(element_xpath['edb_vulnerable_app_url']))
    if edb_vulnerable_app_url != "":
        edb_vulnerable_app_url = 'https://www.exploit-db.com' + edb_vulnerable_app_url
    edb_collect_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    exploit_record = EdbRecord(
        edb_id=edb_id,
        edb_title=edb_title,
        edb_url=edb_url,
        edb_author=edb_author,
        edb_cve=edb_cve,
        edb_type=edb_type,
        edb_platform=edb_platform,
        edb_remote_ports=edb_rport,
        edb_verified=edb_verified,
        edb_vulnerable_app_url=edb_vulnerable_app_url,
        edb_exploit_raw_url=edb_exploit_raw_url,
        edb_published=edb_published,
        edb_collect_date=edb_collect_date)
    self.insert_record(exploit_record)
def html_parsing_ios(fp=r'utils/ios.json'):
    """Extract the iOS version -> build mapping from the saved Wikipedia
    page and dump it to *fp* as JSON.

    :param fp: output JSON path
    """
    import json
    with open(r'html/iOS version history - Wikipedia.htm',
              encoding='utf-8') as f:
        doc = f.read()
    html = HTML(html=doc)
    # Slice [64:-1] skips earlier table rows of the saved snapshot —
    # TODO confirm offsets still match the file on disk.
    nv = html.xpath('//tr[@valign="top"]/th[not(@colspan)]/text()[1]')[64:-1]
    cv = html.xpath('//tr[@valign="top"]/td[1]/text()[1]')[64:-1]
    names = (x.strip() for x in nv)
    builds = (x.strip().split('/')[-1] for x in cv)
    # Fix: json.dump(fp=open(...)) leaked the output file handle.
    with open(fp, 'w') as out:
        json.dump(dict(zip(names, builds)), out)
def extract(html, number):
    """Scan search-result links and return the href of the first one whose
    title contains *number* (alphanumerics only, case-insensitive).

    Note: extracting titles by xpath from a search page is brittle and
    likely to break when the page layout changes.

    Args:
        html: raw HTML of the search page
        number: code expected to appear in the matching title

    Returns:
        The matching link's href, or None when nothing matches.
    """
    page = HTML(html=html)
    wanted = "".join(filter(str.isalnum, number))
    # Hand-maintained list of title xpaths.
    title_paths = ("//h3/div/text()", "//h3/span/text()")
    for anchor in page.xpath("//a"):
        for title_path in title_paths:
            title = anchor.xpath(title_path, first=True)
            if not title:
                continue
            haystack = "".join(filter(str.isalnum, title))
            if re.search(wanted, haystack, flags=re.I):
                href = anchor.xpath("//@href", first=True)
                if href:
                    return href
def worker(domain):
    """Consume URLs from LINKS_QUEUE forever: render each page in Chrome,
    persist its title/h1 as a Page row, and enqueue unseen same-domain
    links found on the page."""
    while True:
        url = LINKS_QUEUE.get()
        SCANNED_LINKS.add(url)
        try:
            with webdriver.Chrome(executable_path='./chromedriver') as browser:
                browser.get(url)
                html_code = browser.page_source
        except Exception as e:
            print(e, type(e))
            continue
        html = HTML(html=html_code)

        def first_text(xpath):
            # 'Not Found' when the page has no such element.
            nodes = html.xpath(xpath)
            return nodes[0].text if nodes else 'Not Found'

        Page.create(url=url, title=first_text('//title'), h1=first_text('//h1'))
        print('[OK]', url)
        for link in html.absolute_links:
            link = link.split('#')[0]
            if domain not in link or link in SCANNED_LINKS:
                continue
            if any(part in link for part in BAD_PARTS):
                continue
            LINKS_QUEUE.put(link)
def find_raw_file_link(self, file_page: HTML):
    """Return the absolute URL of the raw-file link found on *file_page*.

    :param file_page: parsed HTML of the file's page
    :raises ContinueException: when the link element or its href is missing
    :return: self.domain_url joined with the relative raw URL
    """
    raw_link_element = file_page.xpath(self.raw_link_xpath, first=True)
    # Fix: xpath(..., first=True) returns None on no match; the original
    # crashed with AttributeError instead of raising ContinueException.
    if raw_link_element is None:
        raise ContinueException(
            f"Failed to find the raw file link element with xpath: "
            f"{self.raw_link_xpath}."
        )
    raw_url = raw_link_element.attrs.get("href")
    if raw_url is None:
        raise ContinueException(
            f"Failed to find the raw file link in the attrs: "
            f"{raw_link_element.attrs}."
        )
    full_raw_url = self.domain_url + raw_url
    return full_raw_url
def crawler(link):
    """Open a Coursera course page in the shared driver and scrape its name
    plus a {week heading: [item names]} table of contents."""
    driver.get(link)
    base_url = 'https://www.coursera.org'
    page = HTML(html=driver.page_source)
    course_name = page.xpath(
        '//div[@class="rc-Welcome"]/div/div//h1/text()')[0]
    week_paths = page.xpath(
        '//*[@id="rendered-content"]/div/div/div/div[2]/nav/div[1]/div/a/@href'
    )
    table_of_content = {}
    for week_path in week_paths:
        driver.get(base_url + week_path)
        time.sleep(7)  # let the single-page app finish rendering
        week_page = HTML(html=driver.page_source)
        for section in week_page.xpath('//div[@class="rc-NamedItemList"]'):
            heading = section.xpath('//h3/text()')[0]
            items = section.xpath(
                '//div[@class="rc-WeekItemName headline-1-text"]/text()')
            table_of_content[heading] = items
    return course_name, table_of_content
def parser(page_source):
    """Extract (cleaned text, attached file) pairs from every user post in
    *page_source*; posts without a message body are skipped."""
    page = HTML(html=page_source)
    results = []
    for post in page.xpath('//div[contains(@class, "userContent")]'):
        messages = post.xpath('//div[contains(@data-testid, "post_message")]')
        if not messages:
            continue
        text = cleanup_text(messages[0].text)
        results.append((text, get_attached_file(post)))
    return results
def step_5(self):
    """Scroll to the page bottom and click the "load more" button if present.

    Raises:
        DownloadOver: no button remains, everything has been loaded.
        DownloadInterrupt: the button is in the DOM but Selenium cannot
            locate it.
    """
    # Scroll down so the button, if any, gets rendered.
    self._browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    button_xpath = '//*[@id="app"]/div/div[2]/div/div[2]/button'
    page = HTML(html=self._browser.page_source)
    if not page.xpath(button_xpath):
        # No "load more" button left: the download is complete.
        raise DownloadOver('[step_5] download is over')
    try:
        self._browser.find_element_by_xpath(button_xpath).click()
        self.Log('[step_5] <more button> clicked, waiting for load more pics......')
        time.sleep(10)
    except EX.NoSuchElementException:
        raise DownloadInterrupt('[step_5] [error] can not find <more button>')
class ECNU_WEBGRAB:
    """Scrape FZ RDF-style data for a resource URI from the ECNU endpoint."""

    def __init__(self, uri):
        self.uri = uri    # resource URI to query
        self.root = None  # parsed HTML, populated by get_fz_data()

    def get_fz_data(self):
        """Fetch the page for self.uri, parse it, and walk its data."""
        page = query_fz_from_ecnu(self.uri)
        self.root = HTML(html=page)
        return self.__get_data()

    def __get_data(self):
        # Direct properties live under the #directs element.
        prefix_uri = self.root.xpath(r'//*[@id="directs"]/label/a/@href')
        prefixes = cleanup(self.root.xpath(r'//*[@id="directs"]/label/a/text()'))
        names = self.root.xpath(r'//*[@id="directs"]/label/a/span/text()')
        props = makeprops(prefixes, names)
        vals = [
            v.strip()
            for v in self.root.xpath(r'//*[@id="directs"]/div/*/*/text()')
        ]
        return self.__unfold_fz_data(dict(zip(props, vals)))

    def __unfold_fz_data(self, brief):
        # Blank-node values (prefixed '_:') would need unfolding; for now
        # the #bnodes section is only dumped for inspection.
        need2unfold = {k: v for k, v in brief.items() if v.startswith(r'_:')}
        prefixes = cleanup(self.root.xpath(r'//*[@id="bnodes"]/label/a/text()'))
        names = self.root.xpath(r'//*[@id="bnodes"]/label/a/span/text()')
        props = makeprops(prefixes, names)
        for node in self.root.xpath(
                r'//*[@id="bnodes"]/div[@class="c2 valuecnt"]'):
            print(node.absolute_links)
            print(node.text)
            print(node.links)
            print("-------------")
def parse_movie(doc):
    """Parse a Douban movie page and print director, genre, runtime,
    3D flag, score, and the actor list."""
    html = HTML(html=doc)
    title_str = html.xpath("head/title")[0].text
    director_str = html.find('#info > span a[rel="v:directedBy"]')[0].text
    # 'type' renamed to avoid shadowing the builtin; first word of the genre.
    genre_str = html.find('#info > span[property="v:genre"]')[0].text.split()[0]
    length_in_minute_str = \
        html.find('#info > span[property="v:runtime"]')[0].attrs['content']
    is3D = "3D" in html.text
    score_str = html.find('#interest_sectl strong[property="v:average"]')[0].text
    print(director_str)
    print(genre_str)
    print(length_in_minute_str)
    print(is3D)
    print(score_str)
    actor_anchors = html.find('#info > span.actor > span.attrs')[0].find('a')
    actor_list = [(a.attrs['href'], a.text.split()[0]) for a in actor_anchors]
    print(actor_list)
def new_instance(submit_url: str, html: HTML):
    """Build a GoogleForm from a rendered form page.

    Parses the FB_PUBLIC_LOAD_DATA_ script blob embedded in the page and
    splits the question list into pages at PAGE_SWITCH questions.

    :param submit_url: URL the filled form is submitted to
    :param html: parsed HTML of the form page
    :raises FormNotFoundException: when the definition script is absent
    :return: a populated GoogleForm
    """
    form_definition_script = \
        html.xpath('//script[text()[contains(.,"FB_PUBLIC_LOAD_DATA_")]]',
                   first=True)
    if not form_definition_script:
        raise FormNotFoundException()
    form_definition = \
        GoogleForm.__parse_form_definition_script(form_definition_script)
    # Index layout of the decoded blob (observed, undocumented Google
    # format): [1][8] title, [1][0] description, [3] file name,
    # [1][10] login settings, [1][1] question list — TODO confirm against
    # current Forms output; Google can change this at any time.
    title = form_definition[1][8]
    description = form_definition[1][0]
    file_name = form_definition[3]
    if form_definition[1][10] is not None:
        requires_login = bool(form_definition[1][10][1])
        requires_email = bool(form_definition[1][10][4])
    else:
        requires_login = False
        requires_email = False
    questions_data = form_definition[1][1]
    pages = []
    # The first page inherits the form's own title/description; each
    # PAGE_SWITCH question closes the current page and titles the next.
    page_title = title
    page_description = description
    page_questions = []
    for question_data in questions_data:
        question = GoogleFormQuestion.new_instance(question_data)
        if question.question_type is GoogleFormQuestion.Type.PAGE_SWITCH:
            pages.append(
                GoogleFormPage(page_title, page_description, page_questions))
            page_title = question.title
            page_description = question.description
            page_questions = []
        else:
            page_questions.append(question)
    # Close the final (or only) page.
    pages.append(
        GoogleFormPage(page_title, page_description, page_questions))
    return GoogleForm(submit_url, title, description, file_name,
                      requires_login, requires_email, pages)
def change_content(content, xpath, url=None):
    """Normalize <img> tags inside *content* and return the matched node's
    text.

    Every src="..." value is made absolute against *url*, each whole
    <img ...> tag is rewritten through the module-level ``img_str_link``
    template, and the text of the first node matching *xpath* is returned.

    :param content: raw HTML fragment
    :param xpath: xpath selecting the node whose text is returned
    :param url: base URL used to absolutize image sources
    :return: text content of the matched node
    """
    html_body = content
    # Absolutize each src value against the base url.
    for img_url in re.compile(r'src="(.*?)"', re.S).findall(content):
        html_body = html_body.replace(img_url, urljoin(url, img_url))
    # Rewrite whole <img ...> tags through the img_str_link template.
    for img in re.findall(r'<img.*?>', html_body):
        # Fix: the original rebound the *url* parameter here; use a local.
        src = re.findall(r'src="(.*?)"', img)
        if src == []:
            logging.error(content)
        img_str = img_str_link.format(src[0])
        html_body = html_body.replace(img, img_str)
    from requests_html import HTML
    return HTML(html=html_body).xpath(xpath)[0].text
class ExtractionInteret(object):
    """Extract points of interest (key/value pairs, hyperlinks, tagged
    information, identifiers) from a titled text or HTML source.

    The source is classified as HTML when REGEX_HTML_TAGS matches; in that
    case tables are mined via pandas and links via requests_html, otherwise
    the plain text is split into sentences and scanned with regexes.
    """

    # Matches either a void/opening HTML tag or a paired open/close tag;
    # used only to decide whether the source should be parsed as HTML.
    REGEX_HTML_TAGS = re.compile(
        r'<(br|basefont|hr|input|source|frame|param|area|meta|!--|col|link|option|base|img|wbr|!DOCTYPE|html|head).*?>|<(a|abbr|acronym|address|applet|article|aside|audio|b|bdi|bdo|big|blockquote|body|button|canvas|caption|center|cite|code|colgroup|command|datalist|dd|del|details|dfn|dialog|dir|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|frameset|head|header|hgroup|h1|h2|h3|h4|h5|h6|html|i|iframe|ins|kbd|keygen|label|legend|li|map|mark|menu|meter|nav|noframes|noscript|object|ol|optgroup|output|p|pre|progress|q|rp|rt|ruby|s|samp|script|section|select|small|span|strike|strong|style|sub|summary|sup|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|tt|u|ul|var|video).*?</\2>'
    )

    def __init__(self, titre, source):
        """
        :param str titre: document title (newlines stripped)
        :param str source: raw text or HTML body to mine
        """
        self._titre = titre.replace('\r', '').replace('\n', '')
        self._source = source.lstrip().strip('\r')
        # Central store of everything extracted; keys are slugified.
        self._interets = {'informations': list(), 'titre': self._titre,
                          'hyperliens': list(), 'identifiants': list()}
        # Raw lines/values kept for later regex passes ("recycled" corpus).
        self._recyles = list()
        self._may_html = re.search(ExtractionInteret.REGEX_HTML_TAGS,
                                   self._source) is not None
        # <br> variants get a newline appended so full_text keeps line breaks.
        self._dom = HTML(html=source.replace('<br>', '<br>\n').replace(
            '<br/>', '<br/>\n')) if self._may_html else None
        nb_interet_pre = len(self._interets.keys())
        if self._may_html:
            # Mine key/value pairs out of HTML tables (except Word-export
            # MsoNormalTable layout tables) via pandas.
            for table in self._dom.find('table'):
                if table.attrs.get('class') is not None and \
                        'MsoNormalTable' in table.attrs.get('class'):
                    continue
                try:
                    df = pd.read_html(table.html)
                except ValueError:
                    continue
                if len(df) == 0:
                    continue
                for el in df[0].to_dict(orient='records'):
                    keys = el.keys()
                    if 0 in keys and 1 in keys:
                        # Two-column row: treat as key/value.
                        possible_key = str(el[0]).lstrip().rstrip()
                        possible_value = str(el[1]).lstrip().rstrip()
                        self[possible_key] = possible_value
                        self._recyles.append(possible_value)
                    elif 1 not in keys and 0 in keys:
                        # Single-column row: keep as raw corpus line.
                        possible_line = str(el[0])
                        self._recyles.append(possible_line)
            self._interets['hyperliens'] = list(self._dom.links)
        if self._may_html is True and \
                len(self._interets.keys()) == nb_interet_pre:
            # HTML yielded no table data: fall back to its plain text.
            self._source = self._dom.full_text.replace('\r', '\n')
            self._may_html = False
        self._sentences = ExtractionInteret.extract_sentences(
            self._source.replace('\n', '\n '))
        # Line-by-line pass: harvest "key: value"-style associations and
        # hyperlinks from the body and the title.
        for line in self._source.split('\n') + [self._titre]:
            self._recyles.append(line)
            mes_associations = re.findall(
                r'(([^\w])|^)([a-zA-Z $\u00C0-\u017F\'_]{3,})(:|→|⟶|-->|->)(.+?(?=[\n\"><\]\[]))',
                line + '\n')
            mes_hyperliens = re.findall(
                r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                line)
            self._interets['hyperliens'] += [
                el.strip('<>') for el in mes_hyperliens
                if el.strip('<>') not in self._interets['hyperliens']]
            for association in mes_associations:  # type: tuple[str, str, str, str, str]
                a, b, c, e, d = association
                partie_possible_cle, partie_possible_valeur = \
                    c.rstrip().lstrip(), d.rstrip().lstrip()
                # Skip scheme-less URL remainders ('//...') and duplicates.
                if not partie_possible_valeur.startswith('//') and \
                        (self[partie_possible_cle] is None or
                         self[partie_possible_cle] != partie_possible_valeur):
                    self[partie_possible_cle] = partie_possible_valeur
        self._interets['informations'] = self.retrive_informations_balisees()
        self._interets['identifiants'] = self.retrieve_identifer(
            None, multiple=True)

    @property
    def recycles(self):
        # Raw corpus lines accumulated during construction.
        return self._recyles

    def __contains__(self, item):
        # Membership is tested on the slugified key.
        return slugify(item) in self._interets.keys()

    def __getitem__(self, key):
        # Returns None (instead of raising) for unknown keys.
        key = slugify(key)
        return self._interets[key] if key in self._interets.keys() else None

    def __setitem__(self, key, value):
        # On collision the value is stored under '<key>-0' (recursively).
        key = slugify(key)
        if self[key] is not None:
            self[key + '-0'] = value
            return
        self._interets[key] = value

    def injecter_interet(self, cle, donnee):
        """Insert a new interest; raises KeyError if the key already exists.

        :param str cle: key (slugified before storage)
        :param str donnee: value to store
        """
        cle = slugify(cle)
        if cle in self._interets.keys():
            raise KeyError
        self._interets[cle] = donnee

    @property
    def interets(self):
        return self._interets

    @property
    def source(self):
        return self._source

    @property
    def sentences(self):
        return self._sentences

    def retrieve_xpath(self, expression_xpath):
        """Evaluate an xpath on the HTML DOM; None for plain-text sources.

        :param str expression_xpath: xpath to evaluate
        """
        if self._may_html is False:
            return None
        r = self._dom.xpath(expression_xpath, first=True)
        return r.full_text if r is not None else None

    def retrive_informations_balisees(self, focus=None):
        """Collect '[tagged]' and '#hashtag' markers from the corpus.

        :param focus: None = everything, 'corpus' = raw lines + sentences,
            otherwise a single interest key.
        """
        def extract(my_string):
            # '[...]' bracket tags plus '#word' hashtags.
            mes_informations = [el[1:-1] for el in re.findall(
                r'\[[a-zA-Z0-9:\-# _\'\u00C0-\u017F]{1,36}\]', my_string)] + \
                [''.join(el) for el in re.findall(
                    r'(([^\w#])|^)#(\w*[0-9a-zA-Z]+\w*[0-9a-zA-Z])', my_string)]
            return mes_informations
        informations = list()
        if focus is None:
            cts_listes = self._recyles + self._sentences + [
                self._interets[el] for el in self._interets.keys()
                if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus]
                          if isinstance(self._interets[focus], str)
                          else str(self._interets[focus])]
        for my_str in cts_listes:
            informations += extract(my_str)
        return list(set(informations))

    def retrieve_expression_reguliere(self, expression_reguliere, focus=None):
        """Return the first match of *expression_reguliere* in the corpus,
        or None."""
        expression_reguliere = re.compile(expression_reguliere)
        if focus is None:
            cts_listes = self._recyles + self._sentences + [
                self._interets[el] for el in self._interets.keys()
                if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus]
                          if isinstance(self._interets[focus], str)
                          else str(self._interets[focus])]
        for my_str in cts_listes:
            my_extract = re.findall(expression_reguliere, my_str)
            if len(my_extract) > 0:
                return str(my_extract[0]) if isinstance(my_extract, list) \
                    else ''.join(my_extract)
        return None

    def retrieve_date(self, prefix, focus=None, multiple=False):
        """Find a date following *prefix* in the corpus.

        Tries RFC 3339, RFC 2822, FR (dd/mm/yyyy), US (yyyy/mm/dd) and the
        reduced mm/yyyy and yyyy/mm forms, in that order.

        :param str prefix: literal text preceding the date
        :param bool multiple: return a list of all matches instead of the
            first
        """
        def extract(my_string):
            """
            :param str my_string:
            :return: matched date with the prefix stripped, or None
            :rtype: str
            """
            date_fr_regex = re.compile(
                r'{}'.format(re.escape(prefix + ' ')) + r'([0-2 ][0-9]|(3)[0-1])([\/-])(((0)[0-9])|((1)[0-2]))([\/-])\d{2,4}'
            )
            date_us_regex = re.compile(
                r'{}'.format(re.escape(prefix + ' ')) + r'\d{4}([\/-])(((0)[0-9])|((1)[0-2]))([\/-])([0-2][0-9]|(3)[0-1])'
            )
            date_rfc_3339 = re.compile(
                r'{}'.format(re.escape(prefix + ' ')) + r'((?:(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2}(?:\.\d+)?))(Z|[+-]\d{2}:\d{2})?)'
            )
            date_rfc_2822 = re.compile(
                r'{}'.format(re.escape(prefix + ' ')) + r'(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s+)?(0[1-9]|[1-2]?[0-9]|3[01])\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(19[0-9]{2}|[2-9][0-9]{3})\s+(2[0-3]|[0-1][0-9]):([0-5][0-9])(?::(60|[0-5][0-9]))?\s+([-\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))(\s+|\(([^\(\)]+|\\\(|\\\))*\))*'
            )
            date_fr_reduite_regex = re.compile(
                r'{}'.format(re.escape(prefix + ' ')) + r'(((0)[0-9])|((1)[0-2]))([\/-])\d{4}'
            )
            date_us_reduite_regex = re.compile(
                r'{}'.format(re.escape(prefix + ' ')) + r'\d{4}([\/-])(((0)[0-9])|((1)[0-2]))'
            )
            dates_expressions_regulieres = [
                date_rfc_3339,
                date_rfc_2822,
                date_fr_regex,
                date_us_regex,
                date_fr_reduite_regex,
                date_us_reduite_regex
            ]
            for el in dates_expressions_regulieres:
                mt = re.search(el, my_string)
                if mt is not None:
                    return mt.group().replace(prefix, '')
            return None
        dates = list()
        if focus is None:
            cts_listes = self._interets['informations'] + self._recyles + \
                self._sentences + [
                    self._interets[el] for el in self._interets.keys()
                    if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus]
                          if isinstance(self._interets[focus], str)
                          else str(self._interets[focus])]
        for my_str in cts_listes:
            ma_date = extract(my_str)
            if ma_date:
                if multiple is False:
                    return ma_date
                dates.append(ma_date)
        return None if multiple is False else dates

    def retrieve_inner_expression(self, expr_left, expr_right, focus=None,
                                  multiple=False):
        """Return the text between *expr_left* and *expr_right* (either may
        be empty/None); comparison is accent-insensitive and lowercase.

        :param str focus: None, 'corpus', or an interest key
        :param str expr_left: left delimiter
        :param str expr_right: right delimiter
        :param bool multiple: return all matches instead of the first
        """
        expr_left = unidecode(expr_left).lower() \
            if expr_left is not None else ''
        expr_right = unidecode(expr_right).lower() \
            if expr_right is not None else ''

        def extract(ma_chaine):
            """
            :param str ma_chaine:
            :return: inner slice of the original string, or None
            """
            ma_chaine_unidecoded = unidecode(ma_chaine).lower()
            if expr_left is not None and len(expr_left) > 0 and \
                    expr_left in ma_chaine_unidecoded:
                if expr_right is None or len(expr_right) == 0:
                    return ma_chaine[ma_chaine_unidecoded.index(expr_left) +
                                     len(expr_left):].lstrip().rstrip()
                if expr_right in ma_chaine_unidecoded[
                        ma_chaine_unidecoded.index(expr_left) +
                        len(expr_left) - 1:]:
                    return ma_chaine[ma_chaine_unidecoded.index(expr_left) +
                                     len(expr_left):
                                     ma_chaine_unidecoded.index(expr_right)
                                     ].lstrip().rstrip()
            elif (expr_left is None or len(expr_left) == 0) and \
                    expr_right is not None and \
                    expr_right in ma_chaine_unidecoded:
                return ma_chaine[:ma_chaine_unidecoded.index(expr_right) - 1
                                 ].lstrip().rstrip()
            return None
        expressions = list()
        if focus is None:
            cts_listes = self._interets['informations'] + self._recyles + \
                self._sentences + [
                    self._interets[el] for el in self._interets.keys()
                    if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus]
                          if isinstance(self._interets[focus], str)
                          else str(self._interets[focus])]
        for my_str in cts_listes:
            k = extract(my_str)
            if k is not None:
                if multiple is False:
                    return k
                expressions.append(k)
        return None if multiple is False else expressions

    def retrieve_identifer(self, prefix, focus=None, exclude_prefix=False,
                           cast_integer=False, multiple=False):
        """Find identifiers ("<prefix><word+digits>") in the corpus.

        :param str prefix: literal prefix (None/empty -> generic word prefix)
        :param bool exclude_prefix: return only the digits
        :param bool cast_integer: cast the digits to int (with exclude_prefix)
        :param bool multiple: return a de-duplicated list of all matches
        """
        def extract(ma_chaine):
            matchs = re.search(
                r'(([^\w-])|^){prefix}([^\W\n]+[\d]+)'.format(
                    prefix=prefix.replace(' ', '\\ ')), ma_chaine)
            if matchs:
                digits = ExtractionInteret.extract_digits(matchs.group())
                if digits is not None:
                    return (int(digits) if cast_integer is True else digits) \
                        if exclude_prefix is True \
                        else matchs.group().replace(matchs.group(1), '')
            return None
        if prefix is None or len(prefix) == 0:
            prefix = '[A-Za-z-°]+'
        identifiants = list()
        if focus is None:
            cts_listes = self._interets['informations'] + self._recyles + \
                self._sentences + [
                    self._interets[el] for el in self._interets.keys()
                    if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus]
                          if isinstance(self._interets[focus], str)
                          else str(self._interets[focus])]
        for my_str in cts_listes:
            identifiant = extract(my_str)
            if identifiant is not None:
                if multiple is False:
                    return identifiant
                if identifiant not in identifiants:
                    identifiants.append(identifiant)
        return None if multiple is False else identifiants

    def has_expression_cle(self, expression_cle, focus=None):
        """True when *expression_cle* appears anywhere in the corpus
        (accent-insensitive, lowercase).

        :param str expression_cle: expression to look for
        """
        expression_cle = unidecode(expression_cle).lower()
        if focus is None:
            cts_listes = self._recyles + self._sentences + \
                self.interets['informations'] + [self.interets['titre']]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus]
                          if isinstance(self._interets[focus], str)
                          else str(self._interets[focus])]
        for el in cts_listes:
            if not isinstance(el, str):
                continue
            if expression_cle in unidecode(el).lower():
                return True
        return False

    def has_expression_dans_cle(self, ma_cle, mon_expression):
        """True when *mon_expression* appears in the value stored under
        *ma_cle* (string or list of strings)."""
        mon_expression = unidecode(mon_expression).lower()
        if self.has_interet(ma_cle) is True:
            el = self[ma_cle]
            if isinstance(el, str):
                return mon_expression in unidecode(el).lower()
            elif isinstance(el, list):
                for el_l in el:
                    if isinstance(el_l, str):
                        if mon_expression in unidecode(el_l).lower():
                            return True
        return False

    def has_information(self, information_cible, focus=None):
        """True when *information_cible* appears in the tagged information.

        :param focus: None, 'corpus', or an interest key
        :param str information_cible: tag text to look for
        """
        information_cible = unidecode(information_cible).lower()
        for el in self.interets['informations'] if focus is None \
                else self.retrive_informations_balisees(focus):
            if information_cible in unidecode(el).lower():
                return True
        return False

    def has_interet(self, interet_cible):
        """True when a (slugified) interest key exists.

        :param str interet_cible: key to test
        """
        return slugify(interet_cible) in self.interets.keys()

    def get_interet(self, interet_cible):
        # None-safe lookup by slugified key.
        return self.interets[slugify(interet_cible)] \
            if self.has_interet(interet_cible) else None

    @staticmethod
    def extract_digits(string):
        """Return the first contiguous run of digits in *string*, or None.

        :param str string:
        """
        final_str = ''
        first_digit_mt = False
        for c in string:
            if c.isdigit():
                first_digit_mt = True
                final_str += c
            elif first_digit_mt is True and c.isdigit() is False:
                break
        return final_str if final_str != '' else None

    @staticmethod
    def alnum_percentage(source):
        """Ratio of alphanumeric characters in *source* (0..1).

        :param string source:
        """
        o_len = len(source)
        f_len = 0
        for el in source:
            if el.isalnum():
                f_len += 1
        return f_len / o_len

    @staticmethod
    def extract_sentences(source):
        """Split *source* into crude sentences on punctuation / word shape.

        Words are accepted by a length-limited pattern; a non-word token
        either closes the current sentence (if it holds more than three
        words) or discards it.

        :param str source:
        """
        source_splitted = source.split(' ')
        sentences = ['']
        for possible_word in source_splitted:
            if len(possible_word) == 0:
                continue
            if re.fullmatch(r'[\w\'’/.,!?;\-\u00C0-\u017F\n]{1,26}',
                            possible_word):
                sentences[-1] += ' ' + possible_word \
                    if len(sentences[-1]) > 0 else possible_word
                if possible_word in ['.', '?', '!', '\n'] or \
                        sentences[-1][-1] in ['.', '?', '!', '\n']:
                    sentences.append('')
            elif sentences[-1] != '':
                if len(sentences[-1].split(' ')) > 3:
                    sentences.append('')
                else:
                    sentences[-1] = ''
        return sentences
def parse(self, response):
    """Parse a ResearchGate publication page: dump its metadata to a JSON
    file and yield follow-up requests for every reference found, both in
    the page body and via the paginated reference endpoint.

    :param response: scrapy Response for the publication page
    """
    follow_urls = set()
    paper_info = {
        'title': response.xpath(path.TITLE).get(),
        'url': response.url,
        'date': response.xpath(path.DATE).get(),
        'DOI': response.xpath(path.DOI).get(),
        'conference': response.xpath(path.CONFERENCE).get(),
        'citation count': transform_number(response.xpath(path.CITATIONS_COUNT).get()),
        'reference count': transform_number(response.xpath(path.REFERENCES_COUNT).get())
    }
    self.file_name = paper_info['title'].replace(" ", "_")
    # Fix: close the output file deterministically (was open/write/close).
    with open(f'../output/{self.file_name}.json', 'w') as target_file:
        target_file.write('{"result": [' + json.dumps(paper_info, indent=4) + ',\n')
    publication_id = paper_info['url'][paper_info['url'].find("publication"
                                                             ) + 12:paper_info['url'].find("_")]
    request_token = response.xpath(path.RG_REQUEST_TOKEN).attrib['content']
    offset = 10
    # Fix: the original issued up to three identical HTTP requests per
    # iteration (status probe, while-condition, body) plus a token-less
    # request in the log line; fetch once per offset and reuse.
    ref_response = get_reference(token=request_token, uid=publication_id,
                                 offset=offset)
    if ref_response.status_code != 200:
        self.logger.info(
            f"response status {ref_response.status_code} instead of 200, possibly need to update cookies & token"
        )
    while ref_response.status_code == 200:
        if ref_response.text == '':
            break
        html = HTML(html=ref_response.text)
        links = html.xpath(path.REFERENCES_LINK)
        if len(links) == 0:
            break
        for link in links:
            follow_urls.add(path.BASE_URL + link)
        offset = offset + 5
        ref_response = get_reference(token=request_token, uid=publication_id,
                                     offset=offset)
    # References embedded directly in the page body.
    for reference in response.xpath(path.REFERENCES):
        ref_link = reference.xpath(path.REFERENCES_LINK).get()
        if ref_link is not None:
            follow_urls.add(path.BASE_URL + ref_link)
    self.logger.info(f"total urls to follow: {len(follow_urls)}")
    for url in follow_urls:
        if url is not None:
            yield response.follow(url, self.reference_parse)
from requests_html import HTML
import codecs

# Cross-check heading anchors against TOC links in the saved Node.js
# documentation page.
# Fix: the codecs file handle was never closed; use a context manager.
with codecs.open(
        "About this Documentation _ Node.js v8.9.4 Documentation.html",
        "r", "utf-8") as fp:
    html = HTML(html=fp.read())

h1s = html.xpath("./body/div/div/div/h1/span/a")
for h1 in h1s:
    print(h1.attrs["id"])
print(len(h1s))

h2s = html.xpath("./body/div/div/ul/li/a")
# Pair each TOC link with the matching heading anchor id
# (zip replaces the index loop and tolerates a short h2s list).
for h1, h2 in zip(h1s, h2s):
    print(h2.attrs["href"])
    print("#" + h1.attrs["id"])