def getDaeCc(nsd, nsr, infos):
    logging.info('Getting Dados da Empresa - Composicao do Capital - %s %s', nsd, nsr)
    page = HtmlElement(openUrl(buildUrl(URL_DCC, PARMS_GERAL, PARMS_BPP, PARMS_BPP,
                                        'NumeroSequencialDocumento=' + nsd,
                                        'NumeroSequencialRegistroCvm=' + nsr)))
    # Multiplier: values are reported in thousands when the header contains '(Mil)'
    em = page.xpath('.//div[@id="UltimaTabela"]/table/tr/td/b/text()')
    m = 1000 if len(em) > 0 and em[0].find('(Mil)') != -1 else 1
    for i in ['QtdAordCapiItgz', 'QtdAprfCapiItgz', 'QtdTotAcaoCapiItgz',
              'QtdAordTeso', 'QtdAprfTeso', 'QtdTotAcaoTeso']:
        qnt = page.get_element_by_id('ctl00_cphPopUp_{0}_1'.format(i))
        infos[i] = 0 if qnt is None else toInt(qnt.text) * m
def __init__(self, html_element: HtmlElement = None, title=None, broadcast_datetime=None,
             enclosure_url=None, enclosure_content_length=None, enclosure_content_type=None,
             show_slug=None):
    """
    Scrape PodcastEpisode information from html_element if given,
    otherwise use passed-through values.
    """
    if html_element:
        logging.info('Parsing PodcastEpisode from HTML element…')
        try:
            enclosure_url = html_element.xpath('./div/div[@class="download icon"]/a/@href')[0].strip()
            broadcast_date = datetime.strptime(
                html_element.xpath('.//div[contains(@class, "listen")]/a/@data-air-day')[0],
                '%Y-%m-%d')
            broadcast_time = datetime.strptime(
                html_element.xpath('.//div[contains(@class, "listen")]/a/@data-airtime')[0],
                '%H')
            broadcast_datetime = datetime.combine(broadcast_date.date(), broadcast_time.time())
            title = " ".join(html_element.xpath(".//h3//text()")).strip()
        except IndexError:
            logging.warning("Failed parsing PodcastEpisode.", exc_info=True)
            raise

        try:
            show_url = html_element.xpath(".//h3/a/@href")
            if isinstance(show_url, list):
                # in case the XPath returned multiple "href"s
                show_url = show_url[0]
            show_slug = Show.parse_slug(furl(show_url)) if show_url else None
        except IndexError:
            logging.info("Could not find Show URL for PodcastEpisode <%s>.", enclosure_url)

        # Get accurate download information for the RSS Enclosure
        download_headers = http_session.head(enclosure_url).headers
        enclosure_content_length = download_headers.get('content-length')
        enclosure_content_type = download_headers.get('content-type')

        logging.info('Scraped PodcastEpisode <%s>.', enclosure_url)

    self.guid = enclosure_url
    self.title = title
    self.broadcast_datetime = broadcast_datetime
    self.enclosure_url = enclosure_url
    self.enclosure_content_length = enclosure_content_length
    self.enclosure_content_type = enclosure_content_type
    self.show_slug = show_slug
def getDfpConBPA(url, infos):
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Ativo')
    t = dict()
    page = HtmlElement(openUrl(url))
    table = page.get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Cash: sum of accounts 1.01.01 and 1.01.02
    cxa = toInt(t['1.01.01'][1]) * m if '1.01.01' in t else 0
    apf = toInt(t['1.01.02'][1]) * m if '1.01.02' in t else 0
    infos['CAIXA'] = cxa + apf
def getDfpConBPA(nsd, nsr, infos):
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Ativo - %s %s', nsd, nsr)
    t = dict()
    page = HtmlElement(openUrl(buildUrl(URL_FDF, PARMS_GERAL, PARMS_BPP,
                                        'NumeroSequencialDocumento=' + nsd,
                                        'NumeroSequencialRegistroCvm=' + nsr,
                                        'Informacao=2', 'Demonstracao=2')))
    table = page.get_element_by_id(TABLE_BPP)
    for row in getRows(table):
        rowToDict(row, t)
    # Multiplier
    m = getMultiplicador(page)
    # Cash: sum of accounts 1.01.01 and 1.01.02
    cxa = toInt(t['1.01.01'][1]) * m if '1.01.01' in t else 0
    apf = toInt(t['1.01.02'][1]) * m if '1.01.02' in t else 0
    infos['CAIXA'] = cxa + apf
def get_text(elem: LH.HtmlElement) -> Text:
    # Convert default emojis to their unicode equivalent.
    classes = elem.get("class", "")
    if "emoji" in classes:
        match = re.search(r"emoji-(?P<emoji_code>\S+)", classes)
        if match:
            emoji_code = match.group('emoji_code')
            char_repr = ""
            for codepoint in emoji_code.split('-'):
                char_repr += chr(int(codepoint, 16))
            return char_repr
    # Handles realm emojis, avatars etc.
    if elem.tag == "img":
        return elem.get("alt", "")
    return elem.text or ""
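# A minimal usage sketch of get_text() above, on a synthetic emoji span. The class
# value "emoji emoji-1f600" is an illustrative assumption, not taken from any real
# page, and the module's own imports (re, LH, Text) are assumed to be in place.
from lxml import html

emoji_span = html.fromstring('<span class="emoji emoji-1f600">:grinning:</span>')
# The hex codepoints in the class are converted to a single unicode character.
assert get_text(emoji_span) == "\U0001F600"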
def getEnterprises():
    a = HtmlElement(openUrl(buildUrl(URL_ELIST, 'idioma=pt-br')))
    d = {
        '__EVENTTARGET': 'ctl00:contentPlaceHolderConteudo:BuscaNomeEmpresa1:btnTodas',
        '__VIEWSTATE': a.get_element_by_id('__VIEWSTATE').get('value'),
        '__EVENTVALIDATION': a.get_element_by_id('__EVENTVALIDATION').get('value'),
        'ctl00_contentPlaceHolderConteudo_AjaxPanelBuscaPostDataValue':
            'ctl00_contentPlaceHolderConteudo_AjaxPanelBusca,ActiveElement,'
            'ctl00_contentPlaceHolderConteudo_BuscaNomeEmpresa1_btnTodas;',
        'ctl00$contentPlaceHolderConteudo$tabMenuEmpresaListada':
            '{"State":{},"TabState":{"ctl00_contentPlaceHolderConteudo_tabMenuEmpresaListada_tabNome":{"Selected":true}}}',
        'ctl00$contentPlaceHolderConteudo$BuscaNomeEmpresa1$txtNomeEmpresa$txtNomeEmpresa': '',
        'ctl00$contentPlaceHolderConteudo$mpgPaginas_Selected': '0',
        'RadAJAXControlID': 'ctl00_contentPlaceHolderConteudo_AjaxPanelBusca',
        'httprequest': 'true'
    }
    page = etree.HTML(requests.post(buildUrl(URL_ELIST, 'idioma=pt-br'), data=d).text)
    tables = getTables(page, DIV_ENTERPRISES)
    if len(tables) < 1:
        return
    for row in tables[0].find('tbody').findall('.//tr'):
        link = row.find('.//td/a[@href]')
        yield (parse_qs(urlparse(link.get('href')).query,
                        keep_blank_values=True)['codigoCvm'][0],
               link.text)
def preview(docx_content):
    docx_file = StringIO(docx_content)
    zip_ = ZipFile(docx_file)
    doc_file = zip_.open('word/document.xml')
    doc = fromstring(doc_file.read())
    # Append a newline-only element to every node so that text_content()
    # keeps the paragraph breaks.
    elements = tuple(doc.iter())
    for el in elements:
        p = HtmlElement('p')
        p.text = '\n'
        el.append(p)
    text = doc.text_content()
    while ' \n' in text:
        text = text.replace(' \n', '\n')
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')
    text = text.strip()
    return text
def _remove_quotes(post: HtmlElement):
    remove_these = []  # TODO: Summarize instead
    for e in post.iter():
        if 'spoiler' in e.classes:
            remove_these.append(e)
        if 'quote' in e.classes:
            remove_these.append(e)
        if 'Spoiler' in e.get('onclick', ''):
            remove_these.append(e)
    for quote in remove_these:
        quote.getparent().remove(quote)
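# A small, hedged demonstration of _remove_quotes() above on a synthetic post
# fragment; the markup below is an illustrative assumption about the forum's structure.
from lxml import html

post = html.fromstring('<div><div class="quote">quoted text</div><p>actual reply</p></div>')
_remove_quotes(post)
# The .quote div is gone; only <p>actual reply</p> remains.
print(html.tostring(post, encoding="unicode"))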
def get_mpn(tree: html.HtmlElement):
    """
    Parses mpn from page

    :param tree: html.HtmlElement from lxml
    :return: str, manuf. product number, if exists; else None
    """
    path = '//span[@class="product-label-value"]'
    try:
        mpn_tag = tree.xpath(path)[1]
    except IndexError as e:
        # Combo deals/splash pages/etc
        frys_logger.error(f'{e.__class__}: {e}')
        return None
    else:
        return mpn_tag.text.strip()
def get_xpath(path: str, tree: html.HtmlElement):
    """
    Looks for path/element in tree

    :param path: str, valid xpath search string
    :param tree: html.HtmlElement from lxml
    :return: element, based on path; or None if not found
    """
    try:
        element = tree.xpath(path)[0]
    except IndexError as e:
        # attribute doesn't exist = sale splash page/bad link/local pickup, etc
        ebay_logger.error(f'{e.__class__}: {path}: {e}')
        return None
    else:
        return element
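# A minimal usage sketch for get_xpath() above. The HTML and xpath are illustrative
# assumptions, and the module's own ebay_logger/imports are assumed to be in place.
from lxml import html

tree = html.fromstring('<html><body><h1 id="title">Hello</h1></body></html>')
heading = get_xpath('//h1[@id="title"]', tree)
# Returns the first matching element, or None when the path misses.
print(heading.text if heading is not None else 'not found')  # -> Hello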
def __fake__iterchildren(obj, tag=None, reversed=False):
    # Cache the children on first use.
    if obj._children_container is None:
        obj._children_container = []
        for e in HtmlElement.iterchildren(obj):
            obj._children_container.append(e)
    # The `reversed` argument shadows the builtin of the same name, so build the
    # index sequence directly instead of calling reversed().
    if reversed:
        seq = xrange(len(obj._children_container) - 1, -1, -1)
    else:
        seq = xrange(0, len(obj._children_container))
    for i in seq:
        e = obj._children_container[i]
        if tag is not None:
            if e.tag == tag:
                yield e
        else:
            yield e
def get_open_box(tree: html.HtmlElement):
    """
    Given html.HtmlElement for single MC store, parse and return open box price

    :param tree: html.HtmlElement
    :return: str, open box price, ex '$249.99'; None if doesn't exist
    """
    path = '//span[@id="opCostNew"]'
    try:
        open_box = tree.xpath(path)[0].text
    except IndexError as e:
        # No opCostNew id found = no open box available at location
        mc_logger.info(f'{e.__class__}: {e}')
    else:
        return open_box
    return None
def get_inventory(tree: html.HtmlElement):
    """
    Given html.HtmlElement for single MC store, parse and return inventory count

    :param tree: html.HtmlElement
    :return: str, inventory, ex '9 in stock'; None if parse error
    """
    path = '//span[@class="inventoryCnt"]'
    try:
        inventory = tree.xpath(path)[0].text
    except IndexError as e:
        # No inventoryCnt class found = sold out at location
        mc_logger.error(f'{e.__class__}: {e}')
    else:
        if inventory != 'Sold Out':
            return inventory
    return None
def get_price(tree: html.HtmlElement):
    """
    Parses price from page

    :param tree: html.HtmlElement from lxml
    :return: int, rounded, if exists; else None
    """
    path = '//span[@id="did_price1valuediv"]'
    try:
        price_tag = tree.xpath(path)[0].text
        price = int(round(float(price_tag[1:])))
    except IndexError as e:
        # Combo deals/splash pages/etc
        frys_logger.error(f'{e.__class__}: {e}')
        return None
    except TypeError as e:
        frys_logger.error(f'{e.__class__}: {e}')
        return None
    else:
        return price
def posts_from_page(main_content: HtmlElement):
    post_elems = main_content.cssselect('.fpost')
    posts = []
    for e_post in post_elems:
        post_id = int(e_post.cssselect('div > a')[0].get('id'))
        user = e_post.cssselect('div.solid > section .fpost-username')[0][0].text
        content = e_post.cssselect('div.solid > section article.forumPost > section')[0]
        # TODO: Summarize quotes and spoilers
        _remove_quotes(content)
        text = []
        for t in content.itertext():
            t = t.strip()
            if t and not _is_punctuation(t):
                text.append(t)
        posts.append(Post(post_id, user, text))
    return posts
def get_author_name(root: html.HtmlElement, num: int) -> str:
    if author := root.cssselect('div.bo_div')[num].cssselect('b')[1].text.strip():
        return author
def _dfs(
    tree: _ElementTree,
    element: HtmlElement,
    parent_language: str,
    infractions: List[LanguageInfraction],
) -> Tuple[str, Optional[str], Optional[str], List[LanguageInfraction]]:
    """Check for infractions against WCAG 3.1.2 using a recursive DFS.

    Parameters
    ----------
    tree : _ElementTree
        The root element tree of the web page, used to calculate the xpath
    element : HtmlElement
        The current element to check for infractions
    parent_language : str
        The defined language of the current element's parent
    infractions : List[LanguageInfraction]
        The infractions found until now

    Returns
    -------
    Tuple[str, Optional[str], Optional[str], List[LanguageInfraction]]
        A tuple containing:
        - the explicitly defined language of the current element
        - the detected language of the current element
        - the text of the current element
        - the infractions found
    """
    # If the current element does not have a `lang` attribute, take the parent's language
    defined_language = element.get("lang", parent_language).lower()[:2]

    # The text contained in this element (and in this element alone)
    text = (element.text or "").replace("\n", " ").strip()

    # Check for infractions against WCAG 3.1.2 in hidden attributes
    hidden_infraction = _check_hidden_attributes(tree, element, defined_language)
    if hidden_infraction is not None:
        infractions.append(hidden_infraction)

    # Check for infractions against WCAG 3.1.2 in the current element's children
    children = element.getchildren()
    if children:
        # Run this function on each of the children (= recursion)
        children_results = [
            _dfs(tree, child, defined_language, infractions) for child in children
        ]

        # Differ between children that are similar and children that are different
        if len({(defined, detected) for defined, detected, _, _ in children_results}) == 1:
            # All children have the same language defined and detected
            child_defined_language, child_detected_language, _, _ = children_results[0]
            children_text = " ".join(
                str(child_result[2]) for child_result in children_results)
            if child_detected_language and child_defined_language != child_detected_language:
                # All children are wrong
                # Give a warning for the current element instead of for each of its children
                infractions.append(
                    LanguageInfraction(
                        wcag_criterion="WCAG_3_1_2",
                        xpath=tree.getpath(element),
                        html_language=child_defined_language,
                        predicted_language=child_detected_language,
                        text=children_text,
                    ))
        else:
            # The children have different values for their defined and detected languages
            for child, child_result in zip(children, children_results):
                child_defined_language, child_detected_language, child_text, _ = child_result
                if child_detected_language and child_detected_language != child_defined_language:
                    # This child is wrong, give a warning for only this child
                    infractions.append(
                        LanguageInfraction(
                            wcag_criterion="WCAG_3_1_2",
                            xpath=tree.getpath(child),
                            html_language=child_defined_language,
                            predicted_language=child_detected_language,
                            text=str(child_text),
                        ))

        # If any of the children of the current element contains a very short piece of text,
        # add it to the current element's text
        for _, _, child_text, _ in children_results:
            if child_text is None:
                continue
            child_text = child_text.strip()
            # We only add the child text if it is short
            if count_words(child_text) >= MIN_WORDS_DEFAULT:
                continue
            current_text = (text or "").strip()
            # We only add the child text if the current text doesn't end with the child text
            if not current_text.endswith(child_text):
                text = current_text + " " + child_text

    # If the current element's text is long enough, predict its language
    if count_words(text) >= MIN_WORDS_DEFAULT:
        detected_language = predict_language(text)
    else:
        detected_language = None

    return defined_language, detected_language, text, infractions
def _parse_table(self, node: HtmlElement, state: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a table node.

    :param node: The lxml table node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    if not self.tabular:
        logger.error("Called _parse_table without tabular activated.")
        return state

    if node.tag == "table":
        table_idx = state["table"]["idx"]
        stable_id = f"{state['document'].name}::{'table'}:{state['table']['idx']}"

        # Set name for Table
        name = node.attrib["name"] if "name" in node.attrib else None

        # Create the Table in the DB
        parts = {}
        parts["document"] = state["document"]
        parts["stable_id"] = stable_id
        parts["name"] = name
        parts["position"] = table_idx
        parent = state["parent"][node]
        if isinstance(parent, Cell):
            parts["section"] = parent.table.section
        elif isinstance(parent, Section):
            parts["section"] = parent
        else:
            raise NotImplementedError("Table is not within a Section or Cell")
        state["context"][node] = Table(**parts)

        # Local state for each table. This is required to support nested tables
        state["table"][table_idx] = {
            "grid": defaultdict(int),
            "cell_pos": 0,
            "row_idx": -1,
            "col_idx": 0,
        }

        # Increment table counter
        state["table"]["idx"] += 1
    elif node.tag == "tr":
        if not isinstance(state["parent"][node], Table):
            raise NotImplementedError("Table row parent must be a Table.")
        state["table"][state["parent"][node].position]["col_idx"] = 0
        state["table"][state["parent"][node].position]["row_idx"] += 1
    elif node.tag in ["td", "th"]:
        if not isinstance(state["parent"][node], Table):
            raise NotImplementedError("Cell parent must be a Table.")
        if not state["table"][state["parent"][node].position]["row_idx"] >= 0:
            raise NotImplementedError("Table cell encountered before a table row.")

        # calculate row_start/col_start
        while state["table"][state["parent"][node].position]["grid"][(
                state["table"][state["parent"][node].position]["row_idx"],
                state["table"][state["parent"][node].position]["col_idx"],
        )]:
            # while a cell on the grid is occupied, keep moving
            state["table"][state["parent"][node].position]["col_idx"] += 1
        col_start = state["table"][state["parent"][node].position]["col_idx"]
        row_start = state["table"][state["parent"][node].position]["row_idx"]

        # calculate row_end/col_end
        row_end = row_start
        if "rowspan" in node.attrib:
            try:
                row_end += int(node.get("rowspan")) - 1
            except ValueError:
                logger.error(f"Rowspan has invalid value: '{node.get('rowspan')}'")
        col_end = col_start
        if "colspan" in node.attrib:
            try:
                col_end += int(node.get("colspan")) - 1
            except ValueError:
                logger.error(f"Colspan has invalid value: '{node.get('colspan')}'")

        # update grid with occupied cells
        for r, c in itertools.product(list(range(row_start, row_end + 1)),
                                      list(range(col_start, col_end + 1))):
            state["table"][state["parent"][node].position]["grid"][(r, c)] = 1

        # Set name for Cell
        name = node.attrib["name"] if "name" in node.attrib else None

        # construct cell
        parts = defaultdict(list)
        parts["document"] = state["document"]
        parts["name"] = name
        parts["table"] = state["parent"][node]
        parts["row_start"] = row_start
        parts["row_end"] = row_end
        parts["col_start"] = col_start
        parts["col_end"] = col_end
        parts["position"] = state["table"][state["parent"][node].position]["cell_pos"]
        stable_id = (f"{parts['document'].name}"
                     f"::"
                     f"{'cell'}"
                     f":"
                     f"{parts['table'].position}"
                     f":"
                     f"{row_start}"
                     f":"
                     f"{col_start}")
        parts["stable_id"] = stable_id

        # Create the Cell in the DB
        state["context"][node] = Cell(**parts)

        # Update position
        state["table"][state["parent"][node].position]["col_idx"] += 1
        state["table"][state["parent"][node].position]["cell_pos"] += 1

    return state
def _parse_blockquote(self, blockquote: HtmlElement):
    for item in blockquote.getchildren():
        yield from item.text_content().splitlines()
def _get_text_from_element(self, element: HtmlElement) -> Optional[str]:
    if element.text and element.text.strip():
        return element.text.strip()
    if element.text_content() and element.text_content().strip():
        return element.text_content().strip()
    return None
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()
def _commoninfo(self, lx_content: lxml.HtmlElement) -> ResultCommonInfo:
    """
    Race-level result information other than the individual racers:
    water-surface weather, the winning technique (kimarite), and
    whether any boats were refunded.
    """
    # Collect the fields here and build the typed result at the end.
    content_dict = {}

    # Water-surface weather information
    table_xpath = ("/html/body/main/div/div/div/div[2]"
                   "/div[5]/div[2]/div[1]/div[1]/div/div[1]")
    content_dict["weather_info"] = self.__common_methods.getweatherinfo(
        lx_content=lx_content, table_xpath=table_xpath)

    # Extract the refund table.
    # Refunded boats are kept as a list in the dict;
    # if there were no refunds the list is empty.
    table_xpath = ("/html/body/main/div/div/div/div[2]"
                   "/div[5]/div[2]/div[1]/div[2]/div[1]"
                   "/table/tbody/tr/td/div/div/span")
    henkantei_list = lx_content.xpath(table_xpath)

    # Convert the refunded boat numbers to strings; anything unexpected
    # becomes None (not expected to happen).
    def teistr2str(tei_str):
        tei = re.search(r"[1-6]", tei_str)
        if tei is not None:
            return str(tei.group(0))
        else:
            return None

    henkantei_list = list(map(lambda x: teistr2str(x.text), henkantei_list))

    # If there was a refund, the list has at least one entry
    if len(henkantei_list) > 0:
        is_henkan = True
    else:
        is_henkan = False
    henkantei_str = ",".join(henkantei_list)
    content_dict["henkantei_list"] = henkantei_str
    content_dict["is_henkan"] = is_henkan

    # Winning technique (kimarite)
    table_xpath = ("/html/body/main/div/div/div/div[2]/div[5]"
                   "/div[2]/div[1]/div[2]/div[2]/table/tbody/tr/td")
    kimarite = lx_content.xpath(table_xpath)[0].text.strip()
    content_dict["kimarite"] = kimarite

    # Remarks
    table_xpath = ("/html/body/main/div/div/div/div[2]/div[5]"
                   "/div[2]/div[2]/table/tbody/tr/td")
    biko = lx_content.xpath(table_xpath)[0].text.strip()
    content_dict["biko"] = biko

    # Payouts
    table_xpath = ("/html/body/main/div/div/div/div[2]"
                   "/div[5]/div[1]/div/table/tbody/tr[1]/td[3]/span")
    pay_contents = lx_content.xpath(table_xpath)
    pay_list = list(
        map(lambda x: int(re.sub(r"[^\d]", "", x.text)), pay_contents))

    # Popularity
    # TODO: the ordering may well be wrong here
    table_xpath = ("/html/body/main/div/div/div/div[2]"
                   "/div[5]/div[1]/div/table/tbody/tr[1]/td[4]")
    popular_contents = lx_content.xpath(table_xpath)
    popular_list = list(map(lambda x: x.text.strip(), popular_contents))

    content_dict["payout_3tan"] = pay_list[0]
    content_dict["popular_3tan"] = self.__common_methods.rmletter2int(popular_list[0])
    content_dict["payout_3fuku"] = pay_list[1]
    content_dict["popular_3fuku"] = self.__common_methods.rmletter2int(popular_list[1])
    content_dict["payout_2tan"] = pay_list[2]
    content_dict["popular_2tan"] = self.__common_methods.rmletter2int(popular_list[2])
    content_dict["payout_2fuku"] = pay_list[3]
    content_dict["popular_2fuku"] = self.__common_methods.rmletter2int(popular_list[3])
    content_dict["payout_1tan"] = pay_list[5]

    return ResultCommonInfo(**content_dict)
def get_result_rows(cls, tree: html.HtmlElement) -> Iterable[html.HtmlElement]:
    return tree.xpath(cls.rows_xpath)[1:]
def extract_from_user_xpath(self, publish_time_xpath: str, element: HtmlElement) -> str:
    if publish_time_xpath:
        publish_time = ''.join(element.xpath(publish_time_xpath))
        return publish_time
    return ''
def get_series2(lx: html.HtmlElement) -> str:
    return lx.xpath(
        "/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
def get_title(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
def get_outline(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
def callback(_: int, elem: HtmlElement) -> None:
    id_ = elem.get("id", "")
    if id_.isdigit():
        cur.execute(sql, (PyQuery(elem).outer_html(), int(id_)))
def get_text(root: html.HtmlElement, num: int) -> str:
    return ' '.join([
        item.strip()
        for item in root.cssselect("div.bo_div")[num].xpath('./text()')
    ]).strip()
def _parse_sentence(self, paragraph: Paragraph, node: HtmlElement,
                    state: Dict[str, Any]) -> Iterator[Sentence]:
    """Parse the Sentences of the node.

    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    text = state["paragraph"]["text"]
    field = state["paragraph"]["field"]

    # Set name for Sentence
    name = node.attrib["name"] if "name" in node.attrib else None

    # Lingual Parse
    document = state["document"]
    for parts in self.lingual_parser.split_sentences(text):
        abs_offset = state["sentence"]["abs_offset"]
        parts["abs_char_offsets"] = [
            char_offset + abs_offset for char_offset in parts["char_offsets"]
        ]
        parts["document"] = document
        # NOTE: Why do we overwrite this from the spacy parse?
        parts["position"] = state["sentence"]["idx"]
        abs_sentence_offset_end = (state["sentence"]["abs_offset"] +
                                   parts["char_offsets"][-1] +
                                   len(parts["words"][-1]))
        parts["stable_id"] = construct_stable_id(
            document,
            "sentence",
            state["sentence"]["abs_offset"],
            abs_sentence_offset_end,
        )
        parts["name"] = name
        state["sentence"]["abs_offset"] = abs_sentence_offset_end
        if self.structural:
            context_node = node.getparent() if field == "tail" else node
            tree = lxml.etree.ElementTree(state["root"])
            parts["xpath"] = tree.getpath(context_node)
            parts["html_tag"] = context_node.tag
            parts["html_attrs"] = [
                "=".join(x) for x in list(context_node.attrib.items())
            ]

            # Extending html style attribute with the styles
            # from inline style class for the element.
            cur_style_index = None
            for index, attr in enumerate(parts["html_attrs"]):
                if attr.find("style") >= 0:
                    cur_style_index = index
                    break
            head = state["root"].find("head")
            styles = None
            if head is not None:
                styles = head.find("style")
            if styles is not None:
                for x in list(context_node.attrib.items()):
                    if x[0] == "class":
                        exp = r"(." + x[1] + r")([\n\s\r]*)\{(.*?)\}"
                        r = re.compile(exp, re.DOTALL)
                        if r.search(styles.text) is not None:
                            if cur_style_index is not None:
                                parts["html_attrs"][cur_style_index] += (
                                    r.search(styles.text).group(3).replace(
                                        "\r", "").replace("\n", "").replace("\t", ""))
                            else:
                                parts["html_attrs"].extend([
                                    "style=" + re.sub(
                                        r"\s{1,}",
                                        " ",
                                        r.search(styles.text).group(3).replace(
                                            "\r", "").replace("\n", "").replace(
                                                "\t", "").strip(),
                                    )
                                ])
                        break
        parts["position"] = state["sentence"]["idx"]

        # If tabular, consider own Context first in case a Cell
        # was just created. Otherwise, defer to the parent.
        parent = paragraph
        if isinstance(parent, Paragraph):
            parts["section"] = parent.section
            parts["paragraph"] = parent
            if parent.cell:  # if True self.tabular is also always True
                parts["table"] = parent.cell.table
                parts["cell"] = parent.cell
                parts["row_start"] = parent.cell.row_start
                parts["row_end"] = parent.cell.row_end
                parts["col_start"] = parent.cell.col_start
                parts["col_end"] = parent.cell.col_end
        else:
            raise NotImplementedError("Sentence parent must be Paragraph.")

        yield Sentence(**parts)

        state["sentence"]["idx"] += 1
def get_title(lx: html.HtmlElement) -> str:
    return str(
        lx.xpath(
            "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()"
        )[0]).strip()
def _parse_figure(self, node: HtmlElement, state: Dict[str, Any]) -> Dict[str, Any]:
    """Parse the figure node.

    :param node: The lxml img node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    if node.tag not in ["img", "figure"]:
        return state

    # Process the Figure
    stable_id = (f"{state['document'].name}"
                 f"::"
                 f"{'figure'}"
                 f":"
                 f"{state['figure']['idx']}")

    # Set name for Figure
    name = node.attrib["name"] if "name" in node.attrib else None

    # img within a Figure gets processed in the parent Figure
    if node.tag == "img" and isinstance(state["parent"][node], Figure):
        return state

    # NOTE: We currently do NOT support nested figures.
    parts: Dict[str, Any] = {}
    parent = state["parent"][node]
    if isinstance(parent, Section):
        parts["section"] = parent
    elif isinstance(parent, Cell):
        parts["section"] = parent.table.section
        parts["cell"] = parent
    else:
        logger.warning(f"Figure is nested within {state['parent'][node]}")
        return state

    parts["document"] = state["document"]
    parts["stable_id"] = stable_id
    parts["name"] = name
    parts["position"] = state["figure"]["idx"]

    # If processing a raw img
    if node.tag == "img":
        # Create the Figure entry in the DB
        parts["url"] = node.get("src")
        state["context"][node] = Figure(**parts)
    elif node.tag == "figure":
        # Pull the image from a child img node, if one exists
        imgs = [child for child in node if child.tag == "img"]

        if len(imgs) > 1:
            logger.warning("Figure contains multiple images.")
            # Right now we don't support multiple URLs in the Figure context.
            # As a workaround, just ignore the outer Figure and allow processing
            # of the individual images. We ignore the accompanying figcaption
            # by marking it as visited.
            captions = [child for child in node if child.tag == "figcaption"]
            state["visited"].update(captions)
            return state

        img = imgs[0]
        state["visited"].add(img)

        # Create the Figure entry in the DB
        parts["url"] = img.get("src")
        state["context"][node] = Figure(**parts)

    state["figure"]["idx"] += 1
    return state
def get_year(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def _playerinfo(self, lx_content: lxml.HtmlElement) -> Iterator[ResultPlayerInfo]:
    target_table_xpath = (
        "/html/body/main/div/div/div/div[2]/div[4]/div[1]/div/table/tbody")

    rank_xpath = "/".join([target_table_xpath, "/tr/td[1]"])
    rank_el_list = lx_content.xpath(rank_xpath)
    rank_list = list(
        map(
            lambda x: int(x.text) if x.text.isdecimal() else -1,
            rank_el_list,
        ))

    waku_xpath = "/".join([target_table_xpath, "/tr/td[2]"])
    waku_el_list = lx_content.xpath(waku_xpath)
    waku_list = list(
        map(
            lambda x: int(x.text) if x.text.isdecimal() else -1,
            waku_el_list,
        ))

    name_xpath = "/".join([target_table_xpath, "/tr/td[3]/span[2]"])
    name_el_list = lx_content.xpath(name_xpath)
    name_list = list(
        map(lambda x: x.text.replace("\u3000", "").strip(), name_el_list))

    reg_no_xpath = "/".join([target_table_xpath, "/tr/td[3]/span[1]"])
    reg_el_list = lx_content.xpath(reg_no_xpath)
    reg_no_list = list(
        map(
            lambda x: int(x.text) if x.text.isdecimal() else -1,
            reg_el_list,
        ))

    racetime_xpath = "/".join([target_table_xpath, "/tr/td[4]"])
    racetime_el_list = lx_content.xpath(racetime_xpath)
    racetime_list = list(
        map(lambda x: self._racetime_str_to_sec(x.text), racetime_el_list))

    waku_dict = {}
    for i, waku in enumerate(waku_list):
        waku_dict[waku] = {
            "rank": rank_list[i],
            "name": name_list[i],
            "no": reg_no_list[i],
            "racetime": racetime_list[i],
        }

    for waku in range(1, 7):
        # Get the course and start-timing info from the result ST table
        tbody_xpath = ("/html/body/main/div/div/div/"
                       "div[2]/div[4]/div[2]/div/table/tbody")
        course, st_time = self.__common_methods.getSTtable(
            lx_content=lx_content,
            tbody_xpath=tbody_xpath,
            waku=waku,
            table_type="result",
        )
        yield ResultPlayerInfo(
            waku,
            waku_dict[waku]["rank"],
            waku_dict[waku]["name"],
            waku_dict[waku]["no"],
            waku_dict[waku]["racetime"],
            course,
            st_time,
        )
def process(self, default_idname=None, extra_filters=None, reqinfo=None):
    '''
    Process the extracted element, before rendering as a string

    This is for an HTML element that has been extracted and parsed from
    the document source.  We apply certain transformations and mods
    needed before it can be rendered into a string.

    Operates on self.elem, replacing it as a side effect.

    The element will be wrapped in a new div, which is given the class
    and ID according to the classvalue and idname member variables.

    default_idname is used as a fallback idname; If self.idname has
    already been set, that will be used instead.  It is a runtime error
    if neither are set.

    @param elem            : HTML element to process
    @type  elem            : lxml.html.HtmlElement

    @param default_idname  : Optional fallback ID attribute to apply to the enclosing div
    @type  default_idname  : str

    @param extra_filters   : Additional filters to post-apply, from moplate
    @type  extra_filters   : list of callable; or None for no filters (empty list)

    @return                : New element with the applied changes
    @rtype                 : lxml.html.HtmlElement
    '''
    from lxml.html import HtmlElement
    if extra_filters is None:
        extra_filters = []

    def applyfilters(elem):
        from itertools import chain

        def relevant(filt):
            _is_relevant = True
            if hasattr(filt, 'relevant'):
                assert callable(filt.relevant), filt.relevant
                _is_relevant = filt.relevant(reqinfo)
            return _is_relevant

        for filt in chain(self.filters, extra_filters):
            if relevant(filt):
                filt(elem)

    assert type(self.elems) is list, self.elems
    if self.idname is None:
        assert default_idname is not None, 'cannot determine an idname!'
        idname = default_idname
    else:
        idname = self.idname
    if self.filtermode == FILT_EACHELEM:
        # applying filters to extracted elements individually
        for elem in self.elems:
            applyfilters(elem)
    # wrap in special mobilize class, id
    if self.innerhtml and len(self.elems) == 1:
        newelem = copy.deepcopy(self.elems[0])
        newelem.tag = self.tag
    else:
        newelem = HtmlElement()
        newelem.tag = self.tag
        for elem in self.elems:
            newelem.append(elem)
    if self.filtermode == FILT_COLLAPSED:
        # applying filters to the single collapsed element
        applyfilters(newelem)
    newelem.attrib['class'] = self.classvalue
    newelem.attrib['id'] = idname
    if bool(self.style):
        newelem.attrib['style'] = self.style
    self.elem = newelem
    return newelem
def _parse_tags_group(self, start_tag: HtmlElement):
    tag: HtmlElement = start_tag.getnext()
    while tag is not None and tag.tag not in ["h3", "h4"]:
        yield tag
        tag = tag.getnext()
def get_data(data: HtmlElement, htmlattr: HTMLAttr):
    item = data.xpath(htmlattr.xpath)[0]
    data = item.attrib
    data["text"] = item.text
    return data[htmlattr.attr]
def _parse_list(self, data: HtmlElement):
    for item in data.getchildren():
        yield " - " + item.text_content()
def _page_numbers(main_content: HtmlElement):
    pagination = main_content.cssselect('.pagination')[0]
    page_elements = [int(p.text) for p in pagination if p.text.isnumeric()]
    return range(1, page_elements[-1] + 1)
def _key_element_to_cell(self, key: str, key_element: HtmlElement) -> bool:
    """Converts a |key_element| Element to a table cell and tries to modify
    the corresponding value to a cell.

    Args:
        key: (string) the key that |key_element| represents
        key_element: (HtmlElement) the element to be modified

    Returns:
        True if a modification was made and False otherwise.
    """
    # <foo><bar>key</bar>value</foo>
    # Create a new td element containing the following-sibling's text and
    # add it after the key cell.
    following_siblings = key_element.xpath("following-sibling::text()")
    if following_siblings:
        following_text = following_siblings[0].strip()
        if following_text:
            key_element.tag = "td"
            following_cell = HtmlElement(following_text)
            following_cell.tag = "td"
            key_element.addnext(following_cell)
            return True

    # <foo>key</foo><bar>value</bar>
    # The key and value are already adjacent, so just make them both cells.
    if key_element.getnext() is not None:
        key_element.tag = "td"
        key_element.getnext().tag = "td"
        return True

    # <foo><bar/><baz></baz>key: value</foo>
    # Create new td elements for the key and the value and insert them.
    for child in key_element:
        if child.tail and child.tail.startswith(key):
            if self._insert_cells_from_text(key, child.tail, key_element):
                return True

    # <foo>key<bar>value</bar></foo>
    # Create a new td element containing the key and add it before the
    # value cell.
    if len(key_element) == 1:
        key_cell = HtmlElement(key)
        key_cell.tag = "td"
        value_cell = key_element[0]
        value_cell.tag = "td"
        value_cell.addprevious(key_cell)
        return True

    # <foo>key : value</foo>
    # Create new td elements for the key and the value and insert them.
    text = self._get_text_from_element(key_element)
    if text and text.startswith(key):
        if self._insert_cells_from_text(key, text, key_element):
            return True

    return False
def text_content(element: HtmlElement, strip: bool = True):
    text = element.text_content()
    return text.strip() if strip else text
def get_cover(lx: html.HtmlElement) -> str:
    return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
def is_empty_element(node: HtmlElement):
    return not node.getchildren() and not node.text
def get_release(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/', '-')
def get_book_quantities(root: html.HtmlElement) -> int:
    try:
        count = len(root.cssselect('table.record'))
    except AttributeError:
        return 0
    return count
def get_runtime(lx: html.HtmlElement) -> str:
    return str(
        lx.xpath(
            "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
def _parse_paragraph(self, node: HtmlElement, state: Dict[str, Any]) -> Iterator[Sentence]:
    """Parse a Paragraph of the node.

    :param node: The lxml node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    """
    # Both Paragraphs will share the same parent
    parent = (state["context"][node]
              if node in state["context"] else state["parent"][node])

    # Set name for Paragraph
    name = node.attrib["name"] if "name" in node.attrib else None

    if len(node.getchildren()) == 0:  # leaf node
        fields = ["text", "tail"]
    elif node.get("visited") == "text":  # .text was parsed already
        fields = ["tail"]
        node.set("visited", "true")
    else:
        fields = ["text"]
        node.set("visited", "text")
        self.stack.append(node)  # will visit again later for tail

    for field in fields:
        text = getattr(node, field)
        text = text.strip() if text and self.strip else text

        # Skip if "" or None
        if not text:
            continue

        # Run RegEx replacements
        for (rgx, replace) in self.replacements:
            text = rgx.sub(replace, text)

        # Process the Paragraph
        stable_id = (f"{state['document'].name}"
                     f"::"
                     f"{'paragraph'}"
                     f":"
                     f"{state['paragraph']['idx']}")
        parts = {}
        parts["stable_id"] = stable_id
        parts["name"] = name
        parts["document"] = state["document"]
        parts["position"] = state["paragraph"]["idx"]
        if isinstance(parent, Caption):
            if parent.table:
                parts["section"] = parent.table.section
            elif parent.figure:
                parts["section"] = parent.figure.section
            parts["caption"] = parent
        elif isinstance(parent, Cell):
            parts["section"] = parent.table.section
            parts["cell"] = parent
        elif isinstance(parent, Section):
            parts["section"] = parent
        elif isinstance(parent, Figure):
            # occurs with text in the tail of an img
            parts["section"] = parent.section
        elif isinstance(parent, Table):
            # occurs with text in the tail of a table
            parts["section"] = parent.section
        else:
            raise NotImplementedError(
                f"Para '{text}' parent must be Section, Caption, or Cell, "
                f"not {parent}")

        # Create the entry in the DB
        paragraph = Paragraph(**parts)

        state["paragraph"]["idx"] += 1
        state["paragraph"]["text"] = text
        state["paragraph"]["field"] = field

        yield from self._parse_sentence(paragraph, node, state)