コード例 #1
0
ファイル: getfinancialdata.py プロジェクト: luciomp/bmf
def getDaeCc(nsd, nsr, infos):
    """Fill *infos* with capital-composition share counts for a company.

    Downloads the "Dados da Empresa - Composicao do Capital" page for the
    document/record pair (nsd, nsr) and stores one integer per share field,
    scaled when the page reports figures in thousands.
    """
    logging.info('Getting Dados da Empresa - Composicao do Capital - %s %s', nsd, nsr)
    # NOTE(review): PARMS_BPP is passed twice here — looks suspicious; confirm
    # against buildUrl's contract before changing.
    url = buildUrl(URL_DCC, PARMS_GERAL, PARMS_BPP, PARMS_BPP,
        'NumeroSequencialDocumento=' + nsd, 'NumeroSequencialRegistroCvm=' + nsr)
    page = HtmlElement(openUrl(url))

    # Figures may be expressed in thousands ("(Mil)"): pick the scale factor.
    header = page.xpath('.//div[@id="UltimaTabela"]/table/tr/td/b/text()')
    scale = 1000 if header and '(Mil)' in header[0] else 1

    fields = ('QtdAordCapiItgz', 'QtdAprfCapiItgz', 'QtdTotAcaoCapiItgz',
              'QtdAordTeso', 'QtdAprfTeso', 'QtdTotAcaoTeso')
    for field in fields:
        cell = page.get_element_by_id('ctl00_cphPopUp_{0}_1'.format(field))
        infos[field] = toInt(cell.text) * scale if cell is not None else 0
コード例 #2
0
    def __init__(self,
                 html_element: HtmlElement =None,
                 title=None,
                 broadcast_datetime=None,
                 enclosure_url=None,
                 enclosure_content_length=None,
                 enclosure_content_type=None,
                 show_slug=None):
        """
        Build a PodcastEpisode either by scraping html_element (when given)
        or from the explicitly passed-through values.
        """
        if html_element:
            logging.info('Parsing PodcastEpisode from HTML element…')
            try:
                # The download link doubles as enclosure URL and GUID.
                enclosure_url = html_element.xpath('./div/div[@class="download icon"]/a/@href')[0].strip()

                air_day = html_element.xpath('.//div[contains(@class, "listen")]/a/@data-air-day')[0]
                day_part = datetime.strptime(air_day, '%Y-%m-%d')
                air_hour = html_element.xpath('.//div[contains(@class, "listen")]/a/@data-airtime')[0]
                hour_part = datetime.strptime(air_hour, '%H')
                broadcast_datetime = datetime.combine(day_part.date(), hour_part.time())
                title = " ".join(html_element.xpath(".//h3//text()")).strip()
            except IndexError:
                # A missing mandatory node is fatal for this episode.
                logging.warning("Failed parsing PodcastEpisode.", exc_info=True)
                raise

            try:
                hrefs = html_element.xpath(".//h3/a/@href")
                # the XPath may return multiple "href"s; keep the first one
                show_url = hrefs[0] if isinstance(hrefs, list) else hrefs
                show_slug = Show.parse_slug(furl(show_url)) if show_url else None
            except IndexError:
                logging.info("Could not find Show URL for PodcastEpisode <%s>.", enclosure_url)

            # Get accurate download information for the RSS Enclosure
            download_headers = http_session.head(enclosure_url).headers
            enclosure_content_length = download_headers.get('content-length')
            enclosure_content_type = download_headers.get('content-type')

            logging.info('Scraped PodcastEpisode <%s>.', enclosure_url)

        self.guid = enclosure_url
        self.title = title
        self.broadcast_datetime = broadcast_datetime
        self.enclosure_url = enclosure_url
        self.enclosure_content_length = enclosure_content_length
        self.enclosure_content_type = enclosure_content_type
        self.show_slug = show_slug
コード例 #3
0
ファイル: getfinancialdata.py プロジェクト: luciomp/bmf
def getDfpConBPA(url, infos):
    """Store the consolidated cash position ('CAIXA') parsed from *url* in *infos*."""
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Ativo')
    page = HtmlElement(openUrl(url))
    rows = dict()
    for row in getRows(page.get_element_by_id(TABLE_BPP)):
        rowToDict(row, rows)

    # Scale factor (figures may be expressed in thousands)
    scale = getMultiplicador(page)

    # CAIXA = cash account (1.01.01) + short-term investments (1.01.02);
    # a missing account counts as zero.
    caixa = toInt(rows['1.01.01'][1]) * scale if '1.01.01' in rows else 0
    aplicacoes = toInt(rows['1.01.02'][1]) * scale if '1.01.02' in rows else 0
    infos['CAIXA'] = caixa + aplicacoes
コード例 #4
0
ファイル: getfinancialdata.py プロジェクト: luciomp/bmf
def getDfpConBPA(nsd, nsr, infos):
    """Store the consolidated cash position ('CAIXA') for document (nsd, nsr) in *infos*."""
    logging.info('Getting DFs Consolidadas - Balanco Patrimonial Ativo - %s %s', nsd, nsr)
    url = buildUrl(URL_FDF, PARMS_GERAL, PARMS_BPP,
        'NumeroSequencialDocumento=' + nsd, 'NumeroSequencialRegistroCvm=' + nsr,
        'Informacao=2', 'Demonstracao=2')
    page = HtmlElement(openUrl(url))
    rows = dict()
    for row in getRows(page.get_element_by_id(TABLE_BPP)):
        rowToDict(row, rows)

    # Scale factor (figures may be expressed in thousands)
    scale = getMultiplicador(page)

    # CAIXA = cash account (1.01.01) + short-term investments (1.01.02);
    # a missing account counts as zero.
    caixa = toInt(rows['1.01.01'][1]) * scale if '1.01.01' in rows else 0
    aplicacoes = toInt(rows['1.01.02'][1]) * scale if '1.01.02' in rows else 0
    infos['CAIXA'] = caixa + aplicacoes
コード例 #5
0
ファイル: push_notifications.py プロジェクト: gnprice/zulip
    def get_text(elem: LH.HtmlElement) -> Text:
        """Return the plain-text representation of a single HTML element.

        Default emoji spans (class ``emoji-<codepoints>``) become their
        unicode characters; ``<img>`` tags (realm emojis, avatars, etc.)
        fall back to their ``alt`` text; anything else yields ``elem.text``.
        """
        # Convert default emojis to their unicode equivalent.
        classes = elem.get("class", "")
        if "emoji" in classes:
            # FIX: use a raw string — "\S" is an invalid escape sequence in a
            # plain literal (SyntaxWarning on Python 3.12+).
            match = re.search(r"emoji-(?P<emoji_code>\S+)", classes)
            if match:
                emoji_code = match.group('emoji_code')
                # An emoji may be a sequence of hyphen-separated hex codepoints.
                char_repr = ""
                for codepoint in emoji_code.split('-'):
                    char_repr += chr(int(codepoint, 16))
                return char_repr
        # Handles realm emojis, avatars etc.
        if elem.tag == "img":
            return elem.get("alt", "")

        return elem.text or ""
コード例 #6
0
ファイル: utils.py プロジェクト: luciomp/bmf
def getEnterprises():
    """Yield (codigoCvm, company name) tuples for every listed enterprise."""
    doc = HtmlElement(openUrl(buildUrl(URL_ELIST, 'idioma=pt-br')))
    # Replays the ASP.NET postback that the "all companies" button triggers;
    # the hidden __VIEWSTATE/__EVENTVALIDATION values come from the first GET.
    form_data = {
        '__EVENTTARGET': 'ctl00:contentPlaceHolderConteudo:BuscaNomeEmpresa1:btnTodas',
        '__VIEWSTATE': doc.get_element_by_id('__VIEWSTATE').get('value'),
        '__EVENTVALIDATION': doc.get_element_by_id('__EVENTVALIDATION').get('value'),
        'ctl00_contentPlaceHolderConteudo_AjaxPanelBuscaPostDataValue': 'ctl00_contentPlaceHolderConteudo_AjaxPanelBusca,ActiveElement,ctl00_contentPlaceHolderConteudo_BuscaNomeEmpresa1_btnTodas;',
        'ctl00$contentPlaceHolderConteudo$tabMenuEmpresaListada': '{"State":{},"TabState":{"ctl00_contentPlaceHolderConteudo_tabMenuEmpresaListada_tabNome":{"Selected":true}}}',
        'ctl00$contentPlaceHolderConteudo$BuscaNomeEmpresa1$txtNomeEmpresa$txtNomeEmpresa': '',
        'ctl00$contentPlaceHolderConteudo$mpgPaginas_Selected': '0',
        'RadAJAXControlID': 'ctl00_contentPlaceHolderConteudo_AjaxPanelBusca',
        'httprequest': 'true'
    }
    page = etree.HTML(requests.post(buildUrl(URL_ELIST, 'idioma=pt-br'), data=form_data).text)
    tables = getTables(page, DIV_ENTERPRISES)
    if not tables:
        return
    for row in tables[0].find('tbody').findall('.//tr'):
        link = row.find('.//td/a[@href]')
        query = parse_qs(urlparse(link.get('href')).query, keep_blank_values=True)
        yield (query['codigoCvm'][0], link.text)
コード例 #7
0
ファイル: previewdocx.py プロジェクト: petrushev/gists
def preview(docx_content):
    """Extract a plain-text preview from the raw contents of a .docx file."""
    archive = ZipFile(StringIO(docx_content))
    doc = fromstring(archive.open('word/document.xml').read())

    # Append a newline-bearing <p> to every element so that text_content()
    # keeps paragraph boundaries.  Materialize the iterator first: we are
    # adding children while walking the tree.
    for el in tuple(doc.iter()):
        newline = HtmlElement('p')
        newline.text = '\n'
        el.append(newline)

    text = doc.text_content()
    # Collapse spaces before newlines, then runs of blank lines.
    while ' \n' in text:
        text = text.replace(' \n', '\n')
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')
    return text.strip()
コード例 #8
0
ファイル: scraper.py プロジェクト: TripleSnail/mafia_analyze
def _remove_quotes(post: HtmlElement):
    """Remove quote and spoiler elements from *post* in place.

    :param post: lxml element of a forum post; its subtree is mutated.
    """
    remove_these = []  # TODO: Summarize instead
    for e in post.iter():
        # FIX: an element can match several criteria (e.g. both the 'spoiler'
        # and 'quote' classes).  The original appended it once per match, and
        # the second removal attempt blew up because the element no longer
        # had a parent.  Append each element at most once.
        if ('spoiler' in e.classes
                or 'quote' in e.classes
                or 'Spoiler' in e.get('onclick', '')):
            remove_these.append(e)

    for quote in remove_these:
        quote.getparent().remove(quote)
コード例 #9
0
ファイル: frys.py プロジェクト: krober/bapcs-stock-checker
def get_mpn(tree: html.HtmlElement):
    """Return the manufacturer product number scraped from a product page.

    :param tree: html.HtmlElement from lxml
    :return: str with the MPN when present, otherwise None
    """
    labels = tree.xpath('//span[@class="product-label-value"]')
    try:
        mpn_tag = labels[1]
    except IndexError as e:
        # Combo deals/splash pages/etc
        frys_logger.error(f'{e.__class__}: {e}')
        return None
    return mpn_tag.text.strip()
コード例 #10
0
ファイル: ebay.py プロジェクト: krober/bapcs-stock-checker
def get_xpath(path: str, tree: html.HtmlElement):
    """Return the first element matching *path* in *tree*, or None.

    :param path: str, valid xpath search string
    :param tree: html.HtmlElement from lxml
    """
    try:
        first_match = tree.xpath(path)[0]
    except IndexError as e:
        # attribute doesnt exist = sale splash page/bad link/local pickup, etc
        ebay_logger.error(f'{e.__class__}: {path}: {e}')
        return None
    return first_match
def __fake__iterchildren(obj, tag=None, reversed=False):
    """Iterate over cached children of *obj*, optionally filtered by tag.

    The children produced by HtmlElement.iterchildren are memoised on
    ``obj._children_container`` so repeated iteration does not re-walk
    the element.

    :param tag: if given, only children whose ``.tag`` equals it are yielded
    :param reversed: iterate the cached children back-to-front.  NOTE: the
        parameter name shadows the builtin; kept for backward compatibility.
    """
    if obj._children_container is None:
        obj._children_container = []
        for e in HtmlElement.iterchildren(obj):
            obj._children_container.append(e)
    # FIX: the original called ``reversed(seq)`` here, but the boolean
    # parameter of the same name shadows the builtin, so ``reversed=True``
    # raised TypeError.  Build the index sequence directly instead.
    if reversed:
        seq = xrange(len(obj._children_container) - 1, -1, -1)
    else:
        seq = xrange(0, len(obj._children_container))
    for i in seq:
        e = obj._children_container[i]
        if tag is not None:
            if e.tag == tag:
                yield e
        else:
            yield e
コード例 #12
0
def get_open_box(tree: html.HtmlElement):
    """Return the open-box price for a single MC store page.

    :param tree: html.HtmlElement
    :return: str like '$249.99', or None when no open-box offer exists
    """
    matches = tree.xpath('//span[@id="opCostNew"]')
    try:
        open_box = matches[0].text
    except IndexError as e:
        # No opCostNew id found = no open box available at location
        mc_logger.info(f'{e.__class__}: {e}')
        return None
    return open_box
コード例 #13
0
def get_inventory(tree: html.HtmlElement):
    """Return the inventory count for a single MC store page.

    :param tree: html.HtmlElement
    :return: str like '9 in stock'; None when sold out or unparseable
    """
    try:
        inventory = tree.xpath('//span[@class="inventoryCnt"]')[0].text
    except IndexError as e:
        # No inventoryCnt class found = sold out at location
        mc_logger.error(f'{e.__class__}: {e}')
        return None
    return inventory if inventory != 'Sold Out' else None
コード例 #14
0
ファイル: frys.py プロジェクト: krober/bapcs-stock-checker
def get_price(tree: html.HtmlElement):
    """Return the rounded integer price parsed from a product page.

    :param tree: html.HtmlElement from lxml
    :return: int when the price is present and parseable, else None
    """
    try:
        # Drop the leading currency symbol before converting.
        tag_text = tree.xpath('//span[@id="did_price1valuediv"]')[0].text
        price = int(round(float(tag_text[1:])))
    except (IndexError, TypeError) as e:
        # Combo deals/splash pages/etc (missing node or None text)
        frys_logger.error(f'{e.__class__}: {e}')
        return None
    return price
コード例 #15
0
ファイル: scraper.py プロジェクト: TripleSnail/mafia_analyze
def posts_from_page(main_content: HtmlElement):
    """Build Post objects from every '.fpost' element on a forum page."""
    posts = []
    for e_post in main_content.cssselect('.fpost'):
        post_id = int(e_post.cssselect('div > a')[0].get('id'))
        user = e_post.cssselect('div.solid > section .fpost-username')[0][0].text
        content = e_post.cssselect('div.solid > section article.forumPost > section')[0]

        # TODO: Summarize quotes and spoilers
        _remove_quotes(content)
        words = [t.strip() for t in content.itertext()
                 if t.strip() and not _is_punctuation(t.strip())]
        posts.append(Post(post_id, user, words))

    return posts
コード例 #16
0
def get_author_name(root: html.HtmlElement, num: int) -> str:
    """Return the author name from the num-th 'div.bo_div' block (None if blank)."""
    author = root.cssselect('div.bo_div')[num].cssselect('b')[1].text.strip()
    return author if author else None
コード例 #17
0
def _dfs(
    tree: _ElementTree,
    element: HtmlElement,
    parent_language: str,
    infractions: List[LanguageInfraction],
) -> Tuple[str, Optional[str], Optional[str], List[LanguageInfraction]]:
    """Check for infractions against WCAG 3.1.2 using a recursive DFS.

    Parameters
    ----------
    tree : _ElementTree
        The root element tree of the web page, used to calculate the xpath
    element : HtmlElement
        The current element to check for infractions
    parent_language : str
        The defined language of the current element's parent
    infractions : List[LanguageInfraction]
        The infractions found until now (mutated in place and also returned)

    Returns
    -------
    Tuple[str, Optional[str], Optional[str], List[LanguageInfraction]]
        A tuple containing:
          - the explicitly defined language of the current element
          - the detected language of the current element
          - the text of the current element
          - the infractions found
    """
    # If the current element does not have a `lang` attribute, take the parent's
    # language.  Normalised to the lowercase primary subtag ("en-US" -> "en").
    defined_language = element.get("lang", parent_language).lower()[:2]

    # The text contained in this element (and in this element alone)
    text = (element.text or "").replace("\n", " ").strip()

    # Check for infractions against WCAG 3.1.2 in hidden attributes
    hidden_infraction = _check_hidden_attributes(tree, element,
                                                 defined_language)
    if hidden_infraction is not None:
        infractions.append(hidden_infraction)

    # Check for infractions against WCAG 3.1.2 in the current element's children
    children = element.getchildren()
    if children:
        # Run this function on each of the children (= recursion)
        children_results = [
            _dfs(tree, child, defined_language, infractions)
            for child in children
        ]

        # Distinguish children that all agree on (defined, detected) language
        # from children that differ among themselves
        if len({(defined, detected)
                for defined, detected, _, _ in children_results}) == 1:
            # All children have the same language defined and detected
            child_defined_language, child_detected_language, _, _ = children_results[
                0]
            children_text = " ".join(
                str(child_result[2]) for child_result in children_results)
            if child_detected_language and child_defined_language != child_detected_language:
                # All children are wrong
                # Give a warning for the current element instead of for each of its children
                infractions.append(
                    LanguageInfraction(
                        wcag_criterion="WCAG_3_1_2",
                        xpath=tree.getpath(element),
                        html_language=child_defined_language,
                        predicted_language=child_detected_language,
                        text=children_text,
                    ))
        else:
            # The children have different values for their defined and detected languages
            for child, child_result in zip(children, children_results):
                child_defined_language, child_detected_language, child_text, _ = child_result
                if child_detected_language and child_detected_language != child_defined_language:
                    # This child is wrong, give a warning for only this child
                    infractions.append(
                        LanguageInfraction(
                            wcag_criterion="WCAG_3_1_2",
                            xpath=tree.getpath(child),
                            html_language=child_defined_language,
                            predicted_language=child_detected_language,
                            text=str(child_text),
                        ))

        # If any of the children of the current element contains a very short piece of text, add it
        # to the current element's text
        for _, _, child_text, _ in children_results:
            if child_text is None:
                continue
            child_text = child_text.strip()

            # We only add the child text if it is short
            if count_words(child_text) >= MIN_WORDS_DEFAULT:
                continue
            current_text = (text or "").strip()

            # We only add the child text if the current text doesn't end with the child text
            if not current_text.endswith(child_text):
                text = current_text + " " + child_text

    # If the current element's text is long enough, predict its language
    # (short snippets give unreliable language predictions, so they are skipped)
    if count_words(text) >= MIN_WORDS_DEFAULT:
        detected_language = predict_language(text)
    else:
        detected_language = None

    return defined_language, detected_language, text, infractions
コード例 #18
0
    def _parse_table(self, node: HtmlElement,
                     state: Dict[str, Any]) -> Dict[str, Any]:
        """Parse a table node.

        :param node: The lxml table node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: The (possibly updated) global parsing state.
        """
        if not self.tabular:
            logger.error("Called _parse_table without tabular activated.")
            return state

        if node.tag == "table":
            table_idx = state["table"]["idx"]
            stable_id = f"{state['document'].name}::{'table'}:{state['table']['idx']}"

            # Set name for Table
            name = node.attrib["name"] if "name" in node.attrib else None

            # Create the Table in the DB
            parts = {}
            parts["document"] = state["document"]
            parts["stable_id"] = stable_id
            parts["name"] = name
            parts["position"] = table_idx
            parent = state["parent"][node]
            if isinstance(parent, Cell):
                # Nested table: inherit the section of the enclosing table
                parts["section"] = parent.table.section
            elif isinstance(parent, Section):
                parts["section"] = parent
            else:
                raise NotImplementedError(
                    "Table is not within a Section or Cell")

            state["context"][node] = Table(**parts)

            # Local state for each table. This is required to support nested
            # tables
            state["table"][table_idx] = {
                "grid": defaultdict(int),
                "cell_pos": 0,
                "row_idx": -1,
                "col_idx": 0,
            }

            # Increment table counter
            state["table"]["idx"] += 1

        elif node.tag == "tr":
            if not isinstance(state["parent"][node], Table):
                raise NotImplementedError("Table row parent must be a Table.")

            # New row: reset the column cursor and advance the row cursor
            state["table"][state["parent"][node].position]["col_idx"] = 0
            state["table"][state["parent"][node].position]["row_idx"] += 1

        elif node.tag in ["td", "th"]:
            if not isinstance(state["parent"][node], Table):
                raise NotImplementedError("Cell parent must be a Table.")

            if not state["table"][state["parent"]
                                  [node].position]["row_idx"] >= 0:
                raise NotImplementedError(
                    "Table cell encountered before a table row.")

            # calculate row_start/col_start
            while state["table"][state["parent"][node].position]["grid"][(
                    state["table"][state["parent"][node].position]["row_idx"],
                    state["table"][state["parent"][node].position]["col_idx"],
            )]:  # while a cell on the grid is occupied, keep moving
                state["table"][state["parent"][node].position]["col_idx"] += 1
            col_start = state["table"][state["parent"]
                                       [node].position]["col_idx"]
            row_start = state["table"][state["parent"]
                                       [node].position]["row_idx"]

            # calculate row_end/col_end
            row_end = row_start
            if "rowspan" in node.attrib:
                try:
                    row_end += int(node.get("rowspan")) - 1
                except ValueError:
                    logger.error(
                        f"Rowspan has invalid value: '{node.get('rowspan')}'")

            col_end = col_start
            if "colspan" in node.attrib:
                try:
                    col_end += int(node.get("colspan")) - 1
                except ValueError:
                    logger.error(
                        f"Colspan has invalid value: '{node.get('colspan')}'")

            # update grid with occupied cells
            # (a spanning cell claims every grid position it covers, so later
            # cells in the same row get pushed past it by the while-loop above)
            for r, c in itertools.product(list(range(row_start, row_end + 1)),
                                          list(range(col_start, col_end + 1))):
                state["table"][state["parent"][node].position]["grid"][(r,
                                                                        c)] = 1

            # Set name for Cell
            name = node.attrib["name"] if "name" in node.attrib else None

            # construct cell
            parts = defaultdict(list)
            parts["document"] = state["document"]
            parts["name"] = name
            parts["table"] = state["parent"][node]
            parts["row_start"] = row_start
            parts["row_end"] = row_end
            parts["col_start"] = col_start
            parts["col_end"] = col_end
            parts["position"] = state["table"][state["parent"]
                                               [node].position]["cell_pos"]
            stable_id = (f"{parts['document'].name}"
                         f"::"
                         f"{'cell'}"
                         f":"
                         f"{parts['table'].position}"
                         f":"
                         f"{row_start}"
                         f":"
                         f"{col_start}")
            parts["stable_id"] = stable_id
            # Create the Cell in the DB
            state["context"][node] = Cell(**parts)

            # Update position
            state["table"][state["parent"][node].position]["col_idx"] += 1
            state["table"][state["parent"][node].position]["cell_pos"] += 1

        return state
コード例 #19
0
ファイル: parser.py プロジェクト: aiogram/tg-codegen
 def _parse_blockquote(self, blockquote: HtmlElement):
     """Yield each text line found in the blockquote's child elements."""
     for child in blockquote.getchildren():
         for line in child.text_content().splitlines():
             yield line
コード例 #20
0
 def _get_text_from_element(self, element: HtmlElement) -> Optional[str]:
     """Return the element's stripped text, preferring .text over text_content().

     Returns None when neither source yields non-whitespace text.
     """
     direct = element.text
     if direct and direct.strip():
         return direct.strip()
     full = element.text_content()
     if full and full.strip():
         return full.strip()
     return None
コード例 #21
0
ファイル: javlib.py プロジェクト: xyx208/AV_Data_Capture
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    """Return the first xpath match, stripped of surrounding whitespace."""
    first_match = lx.xpath(xpath)[0]
    return first_match.strip()
コード例 #22
0
    def _commoninfo(self, lx_content: lxml.HtmlElement) -> ResultCommonInfo:
        """Collect race-result information that is not about individual racers.

        Covers water-surface weather, the winning move (kimarite), refunded
        boats, remarks, and payout/popularity figures.
        """
        # Collect all fields here and pour them into the result type at the end.
        content_dict = {}
        # Water-surface weather information
        table_xpath = ("/html/body/main/div/div/div/div[2]"
                       "/div[5]/div[2]/div[1]/div[1]/div/div[1]")
        content_dict["weather_info"] = self.__common_methods.getweatherinfo(
            lx_content=lx_content, table_xpath=table_xpath)

        # Extract the refund table.
        # Refunded boats are stored in the dict as a list;
        # the list is empty when no boat was refunded.
        table_xpath = ("/html/body/main/div/div/div/div[2]"
                       "/div[5]/div[2]/div[1]/div[2]/div[1]"
                       "/table/tbody/tr/td/div/div/span")
        henkantei_list = lx_content.xpath(table_xpath)

        # Normalise each refunded-boat label to its digit (as a string);
        # anything odd becomes None (not expected to happen).
        def teistr2str(tei_str):
            tei = re.search(r"[1-6]", tei_str)
            if tei is not None:
                return str(tei.group(0))
            else:
                return None

        henkantei_list = list(map(lambda x: teistr2str(x.text),
                                  henkantei_list))

        # A non-empty list means at least one boat was refunded
        if len(henkantei_list) > 0:
            is_henkan = True
        else:
            is_henkan = False
        henkantei_str = ",".join(henkantei_list)
        content_dict["henkantei_list"] = henkantei_str
        content_dict["is_henkan"] = is_henkan

        # Winning move (kimarite)
        table_xpath = ("/html/body/main/div/div/div/div[2]/div[5]"
                       "/div[2]/div[1]/div[2]/div[2]/table/tbody/tr/td")
        kimarite = lx_content.xpath(table_xpath)[0].text.strip()
        content_dict["kimarite"] = kimarite

        # Remarks (biko)
        table_xpath = ("/html/body/main/div/div/div/div[2]/div[5]"
                       "/div[2]/div[2]/table/tbody/tr/td")
        biko = lx_content.xpath(table_xpath)[0].text.strip()
        content_dict["biko"] = biko

        # Payouts (digits only, non-digit characters stripped)
        table_xpath = ("/html/body/main/div/div/div/div[2]"
                       "/div[5]/div[1]/div/table/tbody/tr[1]/td[3]/span")
        pay_contents = lx_content.xpath(table_xpath)
        pay_list = list(
            map(lambda x: int(re.sub(r"[^\d]", "", x.text)), pay_contents))

        # Popularity ranks
        # TODO: the ordering here may well be wrong
        table_xpath = ("/html/body/main/div/div/div/div[2]"
                       "/div[5]/div[1]/div/table/tbody/tr[1]/td[4]")
        popular_contents = lx_content.xpath(table_xpath)
        popular_list = list(map(lambda x: x.text.strip(), popular_contents))
        content_dict["payout_3tan"] = pay_list[0]
        content_dict["popular_3tan"] = self.__common_methods.rmletter2int(
            popular_list[0])
        content_dict["payout_3fuku"] = pay_list[1]
        content_dict["popular_3fuku"] = self.__common_methods.rmletter2int(
            popular_list[1])
        content_dict["payout_2tan"] = pay_list[2]
        content_dict["popular_2tan"] = self.__common_methods.rmletter2int(
            popular_list[2])
        content_dict["payout_2fuku"] = pay_list[3]
        content_dict["popular_2fuku"] = self.__common_methods.rmletter2int(
            popular_list[3])
        # NOTE(review): index 4 is skipped here — presumably a bet type that is
        # not stored; confirm against the page layout.
        content_dict["payout_1tan"] = pay_list[5]

        return ResultCommonInfo(**content_dict)
コード例 #23
0
ファイル: grabbers.py プロジェクト: ADR-007/tolokaindex
 def get_result_rows(cls,
                     tree: html.HtmlElement) -> Iterable[html.HtmlElement]:
     """Return every result row, skipping the leading header row."""
     all_rows = tree.xpath(cls.rows_xpath)
     return all_rows[1:]
コード例 #24
0
 def extract_from_user_xpath(self, publish_time_xpath: str,
                             element: HtmlElement) -> str:
     """Return the text at a caller-supplied publish-time XPath ('' if none given)."""
     if not publish_time_xpath:
         return ''
     return ''.join(element.xpath(publish_time_xpath))
コード例 #25
0
def get_series2(lx: html.HtmlElement) -> str:
    """Return the series name text from its fixed position on the page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")
    return matches[0]
コード例 #26
0
def get_title(lx: html.HtmlElement) -> str:
    """Return the stripped page title taken from the <h3> heading."""
    raw_title = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0]
    return raw_title.strip()
コード例 #27
0
def get_outline(lx: html.HtmlElement) -> str:
    """Return the outline text from its fixed position on the page."""
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")
    return matches[0]
コード例 #28
0
ファイル: processor.py プロジェクト: lggruspe/slipbox
 def callback(_: int, elem: HtmlElement) -> None:
     """Persist the outer HTML of *elem* when its id attribute is all digits."""
     id_ = elem.get("id", "")
     if not id_.isdigit():
         return
     cur.execute(sql, (PyQuery(elem).outer_html(), int(id_)))
コード例 #29
0
def get_text(root: html.HtmlElement, num: int) -> str:
    """Return the whitespace-normalised direct text of the num-th 'div.bo_div'."""
    fragments = root.cssselect("div.bo_div")[num].xpath('./text()')
    stripped = [fragment.strip() for fragment in fragments]
    return ' '.join(stripped).strip()
コード例 #30
0
    def _parse_sentence(self, paragraph: Paragraph, node: HtmlElement,
                        state: Dict[str, Any]) -> Iterator[Sentence]:
        """Parse the Sentences of the node.

        :param paragraph: The Paragraph context the produced Sentences belong to
        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        text = state["paragraph"]["text"]
        field = state["paragraph"]["field"]

        # Set name for Sentence
        name = node.attrib["name"] if "name" in node.attrib else None

        # Lingual Parse
        document = state["document"]
        for parts in self.lingual_parser.split_sentences(text):
            # Translate sentence-local character offsets to document-absolute ones
            abs_offset = state["sentence"]["abs_offset"]
            parts["abs_char_offsets"] = [
                char_offset + abs_offset
                for char_offset in parts["char_offsets"]
            ]
            parts["document"] = document
            # NOTE: Why do we overwrite this from the spacy parse?
            parts["position"] = state["sentence"]["idx"]
            abs_sentence_offset_end = (state["sentence"]["abs_offset"] +
                                       parts["char_offsets"][-1] +
                                       len(parts["words"][-1]))
            parts["stable_id"] = construct_stable_id(
                document,
                "sentence",
                state["sentence"]["abs_offset"],
                abs_sentence_offset_end,
            )
            parts["name"] = name
            state["sentence"]["abs_offset"] = abs_sentence_offset_end
            if self.structural:
                # For tail text the structural context is the parent of the
                # node that carries the tail, not the node itself
                context_node = node.getparent() if field == "tail" else node
                tree = lxml.etree.ElementTree(state["root"])
                parts["xpath"] = tree.getpath(context_node)
                parts["html_tag"] = context_node.tag
                parts["html_attrs"] = [
                    "=".join(x) for x in list(context_node.attrib.items())
                ]

                # Extending html style attribute with the styles
                # from inline style class for the element.
                cur_style_index = None
                for index, attr in enumerate(parts["html_attrs"]):
                    if attr.find("style") >= 0:
                        cur_style_index = index
                        break
                head = state["root"].find("head")
                styles = None
                if head is not None:
                    styles = head.find("style")
                if styles is not None:
                    for x in list(context_node.attrib.items()):
                        if x[0] == "class":
                            # Look up the CSS rule body for this class in <style>
                            exp = r"(." + x[1] + r")([\n\s\r]*)\{(.*?)\}"
                            r = re.compile(exp, re.DOTALL)
                            if r.search(styles.text) is not None:
                                if cur_style_index is not None:
                                    parts["html_attrs"][cur_style_index] += (
                                        r.search(styles.text).group(3).replace(
                                            "\r",
                                            "").replace("\n",
                                                        "").replace("\t", ""))
                                else:
                                    parts["html_attrs"].extend([
                                        "style=" + re.sub(
                                            r"\s{1,}",
                                            " ",
                                            r.search(
                                                styles.text).group(3).replace(
                                                    "\r", "").replace(
                                                        "\n", "").replace(
                                                            "\t", "").strip(),
                                        )
                                    ])
                            break
            parts["position"] = state["sentence"]["idx"]

            # If tabular, consider own Context first in case a Cell
            # was just created. Otherwise, defer to the parent.
            parent = paragraph
            if isinstance(parent, Paragraph):
                parts["section"] = parent.section
                parts["paragraph"] = parent
                if parent.cell:  # if True self.tabular is also always True
                    parts["table"] = parent.cell.table
                    parts["cell"] = parent.cell
                    parts["row_start"] = parent.cell.row_start
                    parts["row_end"] = parent.cell.row_end
                    parts["col_start"] = parent.cell.col_start
                    parts["col_end"] = parent.cell.col_end
            else:
                raise NotImplementedError("Sentence parent must be Paragraph.")
            yield Sentence(**parts)
            state["sentence"]["idx"] += 1
コード例 #31
0
def get_title(lx: html.HtmlElement) -> str:
    """Return the movie title from the detail page, whitespace-stripped.

    :param lx: parsed document root of the movie page.
    """
    query = ("//div[@class='movie-info section']"
             "/div[@class='heading']/h1[@itemprop='name']/text()")
    title = lx.xpath(query)[0]
    return str(title).strip()
コード例 #32
0
    def _parse_figure(self, node: HtmlElement,
                      state: Dict[str, Any]) -> Dict[str, Any]:
        """Parse the figure node.

        :param node: The lxml img or figure node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: The (possibly updated) global state.
        """
        # Only <img> and <figure> nodes are handled; everything else passes
        # through untouched.
        if node.tag not in ["img", "figure"]:
            return state

        # Stable identifier for this Figure within the document.
        stable_id = (f"{state['document'].name}"
                     f"::"
                     f"{'figure'}"
                     f":"
                     f"{state['figure']['idx']}")

        # Set name for Figure (None when the node carries no name attribute).
        name = node.attrib.get("name")

        # img within a Figure get's processed in the parent Figure
        if node.tag == "img" and isinstance(state["parent"][node], Figure):
            return state

        # NOTE: We currently do NOT support nested figures.
        parts: Dict[str, Any] = {}
        parent = state["parent"][node]
        if isinstance(parent, Section):
            parts["section"] = parent
        elif isinstance(parent, Cell):
            parts["section"] = parent.table.section
            parts["cell"] = parent
        else:
            logger.warning(f"Figure is nested within {state['parent'][node]}")
            return state

        parts["document"] = state["document"]
        parts["stable_id"] = stable_id
        parts["name"] = name
        parts["position"] = state["figure"]["idx"]

        # If processing a raw img
        if node.tag == "img":
            # Create the Figure entry in the DB
            parts["url"] = node.get("src")
            state["context"][node] = Figure(**parts)
        elif node.tag == "figure":
            # Pull the image from a child img node, if one exists
            imgs = [child for child in node if child.tag == "img"]

            if len(imgs) > 1:
                logger.warning("Figure contains multiple images.")
                # Right now we don't support multiple URLs in the Figure
                # context. As a workaround, just ignore the outer Figure and
                # allow processing of the individual images. We ignore the
                # accompanying figcaption by marking it as visited.
                captions = [
                    child for child in node if child.tag == "figcaption"
                ]
                state["visited"].update(captions)
                return state

            # BUGFIX: a <figure> without any <img> child used to raise
            # IndexError at imgs[0]; skip it gracefully instead.
            if not imgs:
                logger.warning("Figure contains no image; skipping.")
                return state

            img = imgs[0]
            state["visited"].add(img)

            # Create the Figure entry in the DB
            parts["url"] = img.get("src")
            state["context"][node] = Figure(**parts)

        state["figure"]["idx"] += 1
        return state
コード例 #33
0
def get_year(lx: html.HtmlElement) -> str:
    """Return the 4-character year prefix of the second spec-content entry."""
    values = lx.xpath("//li[2]/span[@class='spec-content']/text()")
    return values[0][:4]
コード例 #34
0
    def _playerinfo(
            self, lx_content: lxml.HtmlElement) -> Iterator[ResultPlayerInfo]:
        """Yield one ResultPlayerInfo per lane (waku 1..6) scraped from the
        race-result page.

        Builds per-column lists (rank, lane, name, registration number,
        race time) from the result table, keys them by lane number, then
        merges in start-timing info for each lane.
        """
        # Root tbody of the result table; all column cells are addressed
        # relative to this path.
        target_table_xpath = (
            "/html/body/main/div/div/div/div[2]/div[4]/div[1]/div/table/tbody")
        # NOTE(review): joining with a leading "/" yields "tbody//tr/..."
        # (descendant axis, not a direct child) -- confirm this is intended.
        rank_xpath = "/".join([target_table_xpath, "/tr/td[1]"])
        rank_el_list = lx_content.xpath(rank_xpath)
        # Non-numeric rank cells (e.g. disqualification marks) map to -1.
        rank_list = list(
            map(
                lambda x: int(x.text) if x.text.isdecimal() else -1,
                rank_el_list,
            ))

        # Lane (waku) numbers from the second column.
        waku_xpath = "/".join([target_table_xpath, "/tr/td[2]"])
        waku_el_list = lx_content.xpath(waku_xpath)
        waku_list = list(
            map(
                lambda x: int(x.text) if x.text.isdecimal() else -1,
                waku_el_list,
            ))

        # Player names, with full-width spaces (U+3000) removed.
        name_xpath = "/".join([target_table_xpath, "/tr/td[3]/span[2]"])
        name_el_list = lx_content.xpath(name_xpath)
        name_list = list(
            map(lambda x: x.text.replace("\u3000", "").strip(), name_el_list))

        # Player registration numbers; non-numeric cells map to -1.
        reg_no_xpath = "/".join([target_table_xpath, "/tr/td[3]/span[1]"])
        reg_el_list = lx_content.xpath(reg_no_xpath)
        reg_no_list = list(
            map(
                lambda x: int(x.text) if x.text.isdecimal() else -1,
                reg_el_list,
            ))

        # Race times, converted to seconds by the helper.
        racetime_xpath = "/".join([target_table_xpath, "/tr/td[4]"])
        racetime_el_list = lx_content.xpath(racetime_xpath)
        racetime_list = list(
            map(lambda x: self._racetime_str_to_sec(x.text), racetime_el_list))

        # Re-key the parallel column lists by lane number so the lookups
        # below are independent of table row order.
        waku_dict = {}
        for i, waku in enumerate(waku_list):
            waku_dict[waku] = {
                "rank": rank_list[i],
                "name": name_list[i],
                "no": reg_no_list[i],
                "racetime": racetime_list[i],
            }

        for waku in range(1, 7):
            # Fetch course and start timing from the result ST table.
            tbody_xpath = ("/html/body/main/div/div/div/"
                           "div[2]/div[4]/div[2]/div/table/tbody")
            course, st_time = self.__common_methods.getSTtable(
                lx_content=lx_content,
                tbody_xpath=tbody_xpath,
                waku=waku,
                table_type="result",
            )

            yield ResultPlayerInfo(
                waku,
                waku_dict[waku]["rank"],
                waku_dict[waku]["name"],
                waku_dict[waku]["no"],
                waku_dict[waku]["racetime"],
                course,
                st_time,
            )
コード例 #35
0
ファイル: extracted.py プロジェクト: redsymbol/mobilize
    def process(self, default_idname=None, extra_filters=None, reqinfo=None):
        '''
        Process the extracted element, before rendering as a string

        This is for an HTML element that has been extracted and parsed
        from the document source.  We apply certain transformations and
        mods needed before it can be rendered into a string.

        Operates on self.elem, replacing it as a side effect.

        The element will be wrapped in a new div, which is given the
        class and ID according to the classvalue and idname member
        variables.  default_idname is used as a fallback idname; If
        self.idname has already been set, that will be used instead.
        It is a runtime error if neither are set.

        @param default_idname : Optional fallback ID attribute to apply to the enclosing div
        @type  default_idname : str

        @param extra_filters  : Additional filters to post-apply, from moplate
        @type  extra_filters  : list of callable; or None for no filters (empty list)

        @param reqinfo        : Request info passed to each filter's optional relevant() hook
        @type  reqinfo        : object or None

        @return               : New element with the applied changes
        @rtype                : lxml.html.HtmlElement
        '''
        from lxml.html import HtmlElement
        if extra_filters is None:
            extra_filters = []
        def applyfilters(elem):
            # Run every configured filter (own + extra) over elem, skipping
            # any filter whose optional relevant(reqinfo) hook returns False.
            from itertools import chain
            def relevant(filt):
                _is_relevant = True
                if hasattr(filt, 'relevant'):
                    assert callable(filt.relevant), filt.relevant
                    _is_relevant = filt.relevant(reqinfo)
                return _is_relevant
            for filt in chain(self.filters, extra_filters):
                if relevant(filt):
                    filt(elem)
        assert type(self.elems) is list, self.elems
        # Resolve the wrapper element's ID: self.idname wins over the
        # caller-supplied fallback.
        if self.idname is None:
            assert default_idname is not None, 'cannot determine an idname!'
            idname = default_idname
        else:
            idname = self.idname
        if self.filtermode == FILT_EACHELEM:
            # applying filters to extracted elements individually
            for elem in self.elems:
                applyfilters(elem)
        # wrap in special mobilize class, id
        if self.innerhtml and len(self.elems) == 1:
            # Single element whose inner HTML is wanted: deep-copy it and
            # retag the copy instead of wrapping it in a new container.
            newelem = copy.deepcopy(self.elems[0])
            newelem.tag = self.tag
        else:
            # Wrap all extracted elements in a fresh container element.
            # NOTE(review): append() moves the originals into the container
            # (lxml semantics) rather than copying them.
            newelem = HtmlElement()
            newelem.tag = self.tag
            for elem in self.elems:
                newelem.append(elem)
        if self.filtermode == FILT_COLLAPSED:
            # applying filters to the single collapsed element
            applyfilters(newelem)
        newelem.attrib['class'] = self.classvalue
        newelem.attrib['id'] = idname
        if bool(self.style):
            newelem.attrib['style'] = self.style
        # Replace self.elem as a side effect, and return the new element.
        self.elem = newelem
        return newelem
コード例 #36
0
ファイル: parser.py プロジェクト: aiogram/tg-codegen
 def _parse_tags_group(self, start_tag: HtmlElement):
     tag: HtmlElement = start_tag.getnext()
     while tag is not None and tag.tag not in ["h3", "h4"]:
         yield tag
         tag: HtmlElement = tag.getnext()
コード例 #37
0
def get_data(data: HtmlElement, htmlattr: HTMLAttr):
    """Return the attribute named by htmlattr.attr from the first node
    matching htmlattr.xpath; the node text is exposed under the
    pseudo-attribute "text" (note: this writes into the node's attrib).
    """
    node = data.xpath(htmlattr.xpath)[0]
    attrs = node.attrib
    attrs["text"] = node.text
    return attrs[htmlattr.attr]
コード例 #38
0
ファイル: parser.py プロジェクト: aiogram/tg-codegen
 def _parse_list(self, data: HtmlElement):
     for item in data.getchildren():
         yield " - " + item.text_content()
コード例 #39
0
ファイル: scraper.py プロジェクト: TripleSnail/mafia_analyze
def _page_numbers(main_content: HtmlElement):
    pagination = main_content.cssselect('.pagination')[0]
    page_elements = [int(p.text) for p in pagination
                     if p.text.isnumeric()]

    return range(1, page_elements[-1] + 1)
コード例 #40
0
    def _key_element_to_cell(self, key: str, key_element: HtmlElement) -> bool:
        """Converts a |key_element| Element to a table cell and tries to modify
        the corresponding value to a cell.

        Tries several sibling/child layouts in order; the first layout that
        matches is rewritten in place.

        Args:
            key: (string) the key that |key_element| represents
            key_element: (HtmlElement) the element to be modified
        Returns:
            True if a modification was made and False otherwise.
        """

        # <foo><bar>key</bar>value</foo>
        # Create a new td element containing the following-sibling's text and
        # add it after the key cell.
        following_siblings = key_element.xpath("following-sibling::text()")
        if following_siblings:
            following_text = following_siblings[0].strip()
            if following_text:
                key_element.tag = "td"
                # NOTE(review): relies on lxml's ElementBase constructor
                # accepting text content -- confirm against lxml docs.
                following_cell = HtmlElement(following_text)
                following_cell.tag = "td"
                key_element.addnext(following_cell)
                return True

        # <foo>key</foo><bar>value</bar>
        # The key and value are already adjacent, so just make them both cells.
        if key_element.getnext() is not None:
            key_element.tag = "td"
            key_element.getnext().tag = "td"
            return True

        # <foo><bar/><baz></baz>key: value</foo>
        # Create new td elements for the key and the value and insert them.
        for child in key_element:
            if child.tail and child.tail.startswith(key):
                if self._insert_cells_from_text(key, child.tail, key_element):
                    return True

        # <foo>key<bar>value</bar></foo>
        # Create a new td element containing the key and add it before the
        # value cell.
        if len(key_element) == 1:
            key_cell = HtmlElement(key)
            key_cell.tag = "td"
            value_cell = key_element[0]
            value_cell.tag = "td"
            value_cell.addprevious(key_cell)
            return True

        # <foo>key : value</foo>
        # Create new td elements for the key and the value and insert them.
        text = self._get_text_from_element(key_element)
        if text and text.startswith(key):
            if self._insert_cells_from_text(key, text, key_element):
                return True

        return False
コード例 #41
0
 def text_content(element: HtmlElement, strip: bool = True):
     """Return the element's full text content, whitespace-stripped
     unless *strip* is False.
     """
     raw = element.text_content()
     if strip:
         return raw.strip()
     return raw
コード例 #42
0
ファイル: jav321.py プロジェクト: lededev/AV_Data_Capture
def get_cover(lx: html.HtmlElement) -> str:
    """Return the cover image URL from the detail page."""
    sources = lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")
    return sources[0]
コード例 #43
0
def is_empty_element(node: HtmlElement):
    """True when the node has neither child elements nor text content."""
    if node.getchildren():
        return False
    return not node.text
コード例 #44
0
def get_release(lx: html.HtmlElement) -> str:
    """Return the release date with '/' separators normalised to '-'."""
    raw = lx.xpath("//li[2]/span[@class='spec-content']/text()")[0]
    return raw.replace('/', '-')
コード例 #45
0
def get_book_quantities(root: html.HtmlElement) -> int:
    """Count 'table.record' elements under *root*; 0 when the root
    cannot be queried (AttributeError).
    """
    try:
        return len(root.cssselect('table.record'))
    except AttributeError:
        return 0
コード例 #46
0
def get_runtime(lx: html.HtmlElement) -> str:
    """Return the movie duration text, whitespace-stripped."""
    query = "//span[@class='spec-content']/span[@itemprop='duration']/text()"
    return str(lx.xpath(query)[0]).strip()
コード例 #47
0
    def _parse_paragraph(self, node: HtmlElement,
                         state: Dict[str, Any]) -> Iterator[Sentence]:
        """Parse a Paragraph of the node.

        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :yield: Sentences produced from the node's .text and/or .tail.
        """
        # Both Paragraphs will share the same parent
        parent = (state["context"][node]
                  if node in state["context"] else state["parent"][node])
        # Set name for Paragraph
        name = node.attrib["name"] if "name" in node.attrib else None

        # Decide which text fields to process on this visit: leaf nodes
        # yield .text and .tail at once; nodes with children are visited
        # twice (.text now, .tail after the children), tracked by the
        # "visited" attribute and a revisit pushed onto self.stack.
        if len(node.getchildren()) == 0:  # leaf node
            fields = ["text", "tail"]
        elif node.get("visited") == "text":  # .text was parsed already
            fields = ["tail"]
            node.set("visited", "true")
        else:
            fields = ["text"]
            node.set("visited", "text")
            self.stack.append(node)  # will visit again later for tail
        for field in fields:
            text = getattr(node, field)
            text = text.strip() if text and self.strip else text

            # Skip if "" or None
            if not text:
                continue

            # Run RegEx replacements
            for (rgx, replace) in self.replacements:
                text = rgx.sub(replace, text)

            # Process the Paragraph
            stable_id = (f"{state['document'].name}"
                         f"::"
                         f"{'paragraph'}"
                         f":"
                         f"{state['paragraph']['idx']}")
            parts = {}
            parts["stable_id"] = stable_id
            parts["name"] = name
            parts["document"] = state["document"]
            parts["position"] = state["paragraph"]["idx"]
            # Attach the Paragraph to the correct structural ancestors
            # depending on what kind of context object the parent is.
            if isinstance(parent, Caption):
                if parent.table:
                    parts["section"] = parent.table.section
                elif parent.figure:
                    parts["section"] = parent.figure.section
                parts["caption"] = parent
            elif isinstance(parent, Cell):
                parts["section"] = parent.table.section
                parts["cell"] = parent
            elif isinstance(parent, Section):
                parts["section"] = parent
            elif isinstance(parent,
                            Figure):  # occurs with text in the tail of an img
                parts["section"] = parent.section
            elif isinstance(parent,
                            Table):  # occurs with text in the tail of a table
                parts["section"] = parent.section
            else:
                raise NotImplementedError(
                    f"Para '{text}' parent must be Section, Caption, or Cell, "
                    f"not {parent}")

            # Create the entry in the DB
            paragraph = Paragraph(**parts)

            state["paragraph"]["idx"] += 1

            state["paragraph"]["text"] = text
            state["paragraph"]["field"] = field

            yield from self._parse_sentence(paragraph, node, state)