Code Example #1
File: parser.py  Project: nickmvincent/SerpScrap
    def __init__(self, config={}, html='', query=''):
        """Create new Parser instance and parse all information."""
        self.config = config
        self.searchtype = self.config.get('search_type', 'normal')
        assert self.searchtype in self.search_types, 'search type "{}" is not supported in {}'.format(
            self.searchtype, self.__class__.__name__)

        self.query = query
        self.html = html
        self.dom = None
        self.search_results = {}
        self.num_results_for_query = ''
        self.num_results = 0
        self.effective_query = ''
        self.page_number = -1
        self.no_results = False
        self.related_keywords = {}

        # to be set by the implementing sub classes
        self.search_engine = ''

        # short alias because we use it so extensively
        self.css_to_xpath = HTMLTranslator().css_to_xpath

        if self.html:
            self.parse()
Code Example #2
    def __init__(self, html=None, query=''):
        """Create new Parser instance and parse all information.

        Args:
            html: The raw html from the search engine search. If not provided, you can parse 
                    the data later by calling parse(html) directly.
            searchtype: The search type. By default "normal"
            
        Raises:
            AssertionError if the subclassed
            specific parser cannot handle the settings.
        """
        self.searchtype = Config['SCRAPING'].get('search_type', 'normal')
        assert self.searchtype in self.search_types, 'search type "{}" is not supported in {}'.format(
            self.searchtype, self.__class__.__name__)

        self.query = query
        self.html = html
        self.dom = None
        self.search_results = {}
        self.num_results_for_query = ''
        self.num_results = 0
        self.effective_query = ''
        self.page_number = -1
        self.no_results = False

        # to be set by the implementing sub classes
        self.search_engine = ''

        # short alias because we use it so extensively
        self.css_to_xpath = HTMLTranslator().css_to_xpath

        if self.html:
            self.parse()
Code Example #3
# Assumed imports for this excerpt (parse_script and to_map are helpers defined elsewhere):
import urllib.request
from lxml import html
from cssselect import HTMLTranslator


def get_game_data(username):
    wishlist_url = 'http://steamcommunity.com/id/%s/games/?tab=all' % (username,)

    response = urllib.request.urlopen(wishlist_url)
    html_data = response.read().decode('utf-8')
    doc = html.document_fromstring(html_data)
    translator = HTMLTranslator()
    row_selector = translator.css_to_xpath('script[language=javascript]')

    games = None
    for el in doc.xpath(row_selector):
        variables = parse_script(el.text_content())
        for variable in variables:
            if variable.identifier.value == 'rgGames':
                games = variable

    return [to_map(item) for item in games.initializer.items]
Code Example #4
def CSSSelect(expr):
    try:
        return css_select_cache[expr]
    except KeyError:
        from cssselect import HTMLTranslator
        from lxml.etree import XPath
        ans = css_select_cache[expr] = XPath(
            HTMLTranslator().css_to_xpath(expr))
        return ans
Code Example #5
File: htmls.py  Project: sss/calibre
def shorten_title(doc):
    title = doc.find('.//title').text
    if not title:
        return ''

    title = orig = norm_title(title)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    from cssselect import HTMLTranslator
    css_to_xpath = HTMLTranslator().css_to_xpath
    for item in ('#title', '#head', '#heading', '.pageTitle', '.news_title',
            '.title', '.head', '.heading', '.contentheading',
            '.small_header_red'):
        for e in doc.xpath(css_to_xpath(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title
Code Example #6
def generate_examples_from_file(file_path):
    """Extracts a list of strings representing header and example elements
    from the file specified by `file_path`.

    """

    expression = HTMLTranslator().css_to_xpath(EXTRACTED_SELECTORS)
    document = lxml.html.parse(file_path)

    elements = document.xpath(expression)
    elements = filter_nonexample_headers(elements)
    elements = filter_duplicated_descendants(elements)

    for el in elements:
        html = lxml.etree.tostring(el, pretty_print=True, method='html')
        yield rewrite_asset_urls(html)
Code Example #7
def mutate_selector_del(selector, method, expression):
    """Under the covers, Selectors contain an lxml.etree.Element document
       root, which is not exposed by the Selector interface. It can be mutated
       using the .remove method on parts of the selector.root document tree.
       Unfortunately, there is no native content removal interface in scrapy.

       As this is not using a published interface for Selector, it must be
       considered risky. In particular, it is feasible (though not likely) that
       scrapy could change its selector implementation to use a different
       HTML/XML parsing library, at which point this would fail.
    """
    try:
        if method == 'xpath':
            s = expression
        elif method == 'css':
            s = HTMLTranslator().css_to_xpath(expression)
        else:
            raise NotImplementedError

        for node in selector.root.xpath(s):
            node.getparent().remove(node)
    except Exception as e:
        logger.error('mutate_selector_del({}, {}, {},) failed: {}'.format(
            selector, method, expression, e))
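
A minimal usage sketch for the helper above (not part of the project): it assumes scrapy is installed, a module-level logger, and that HTMLTranslator is in scope; the markup and selector are invented for illustration.

# Hypothetical usage of mutate_selector_del(); markup and selector are invented.
import logging
from scrapy.selector import Selector
from cssselect import HTMLTranslator  # used by the 'css' branch inside mutate_selector_del

logger = logging.getLogger(__name__)

sel = Selector(text='<div><p class="ad">buy now</p><p>keep me</p></div>')
mutate_selector_del(sel, 'css', 'p.ad')      # removes the matching nodes in place
print(sel.css('p::text').getall())           # ['keep me']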
Code Example #8
    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
            from calibre.ebooks.oeb.stylizer import fix_namespace
            css_to_xpath = HTMLTranslator().css_to_xpath
            self.page_break_selectors = set([])
            stylesheets = [
                x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES
            ]
            for rule in rules(stylesheets):
                before = getattr(
                    rule.style.getPropertyCSSValue('page-break-before'),
                    'cssText', '').strip().lower()
                after = getattr(
                    rule.style.getPropertyCSSValue('page-break-after'),
                    'cssText', '').strip().lower()
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((XPath(
                            fix_namespace(css_to_xpath(rule.selectorText))),
                                                       True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((XPath(
                            fix_namespace(css_to_xpath(rule.selectorText))),
                                                       False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except:
                    pass
        page_breaks = set([])
        for selector, before in self.page_break_selectors:
            body = item.data.xpath('//h:body', namespaces=NAMESPACES)
            if not body:
                continue
            for elem in selector(body[0]):
                if elem not in body:
                    elem.set('pb_before', '1' if before else '0')
                    page_breaks.add(elem)

        for i, elem in enumerate(item.data.iter()):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Can't set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x: int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d' % i))
            id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]' % id)
            except:
                try:
                    xp = XPath("//*[@id='%s']" % id)
                except:
                    # The id contains both a quote and an apostrophe (or some other
                    # problem). Just replace it, since I doubt it's going to work
                    # anywhere else either.
                    id = 'calibre_pb_%d' % i
                    x.set('id', id)
                    xp = XPath('//*[@id=%r]' % id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(id)

        for elem in item.data.iter():
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids
Code Example #9
File: test_helpers.py  Project: mrooney/metakv
 def css_select(self, response, css_selector):
     document = self.parse_response(response)
     expression = HTMLTranslator().css_to_xpath(css_selector)
     return document.xpath(expression)
Code Example #10
File: parsing.py  Project: truebit/GoogleScraper
class GoogleParser():
    """Parses data from Google SERP pages."""

    # Named tuple type for the search results
    Result = namedtuple('LinkResult',
                        'link_title link_snippet link_url link_position')

    # short alias because we use it so extensively
    _xp = HTMLTranslator().css_to_xpath

    # Valid URL (taken from django)
    _REGEX_VALID_URL = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)
    _REGEX_VALID_URL_SIMPLE = re.compile(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    def __init__(self, html, searchtype='normal'):
        self.html = html
        self.searchtype = searchtype
        self.dom = None

        self.search_results = {'num_results_for_kw': []}

        # Try to parse the google HTML result using lxml
        try:
            doc = UnicodeDammit(self.html, is_html=True)
            parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
            self.dom = lxml.html.document_fromstring(self.html, parser=parser)
            self.dom.resolve_base_href()
        except Exception as e:
            print(
                'Some error occurred while lxml tried to parse: {}'.format(e))

        # Very redundant by now, but might change in the near future
        if self.searchtype == 'normal':
            self.search_results.update({
                'results': [],  # List of Result, list of named tuples
                'ads_main': [],  # The google ads in the main result set.
                'ads_aside': [],  # The google ads on the right aside.
            })
        elif self.searchtype == 'video':
            self.search_results.update({
                'results': [],  # Video search results
                'ads_main': [],  # The google ads in the main result set.
                'ads_aside': [],  # The google ads on the right aside.
            })
        elif self.searchtype == 'image':
            self.search_results.update({
                'results': [],  # Images links
            })
        elif self.searchtype == 'news':
            self.search_results.update({
                'results': [],  # Links from news search
                'ads_main': [],  # The google ads in the main result set.
                'ads_aside': [],  # The google ads on the right aside.
            })

        ### the actual PARSING happens here
        parsing_actions = {
            'normal': self._parse_normal_search,
            'image': self._parse_image_search,
            'video': self._parse_video_search,
            'news': self._parse_news_search,
        }
        # Call the correct parsing method
        parsing_actions.get(self.searchtype)(self.dom)

        # Clean the results
        self._clean_results()

    def __iter__(self):
        """Simple magic method to iterate quickly over found non-ad results"""
        for result in self.search_results['results']:
            yield (result.link_title, result.link_snippet, result.link_url)

    def num_results(self):
        """Returns the number of pages found by keyword as shown in top of SERP page."""
        return self.search_results['num_results_for_kw']

    @property
    def results(self):
        """Returns all results including sidebar and main result advertisements"""
        return {
            k: v
            for k, v in self.search_results.items()
            if k not in ('num_results_for_kw', )
        }

    @property
    def all_results(self):
        return self.search_results

    @property
    def links(self):
        """Only returns non ad results"""
        return self.search_results['results']

    def _clean_results(self):
        """Cleans/extracts the found href or data-href attributes."""

        # Now try to create ParseResult objects from the URL
        for key in ('results', 'ads_aside', 'ads_main'):
            for i, e in enumerate(self.search_results[key]):
                # First try to extract the url from the strange relative /url?sa= format
                matcher = re.search(r'/url\?q=(?P<url>.*?)&sa=U&ei=',
                                    e.link_url)
                if matcher:
                    url = matcher.group(1)
                else:
                    url = e.link_url

                self.search_results[key][i] = \
                    self.Result(link_title=e.link_title, link_url=urllib.parse.urlparse(url),
                                link_snippet=e.link_snippet, link_position=e.link_position)

    def _parse_num_results(self):
        # try to get the number of results for our search query
        try:
            self.search_results['num_results_for_kw'] = \
                self.dom.xpath(self._xp('div#resultStats'))[0].text_content()
        except Exception as e:
            logger.debug(
                'Cannot parse number of results for keyword from SERP page: {}'
                .format(e))

    def _parse_normal_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for a normal search.

        @param dom The page source to parse.
        """

        # There might be several list of different css selectors to handle different SERP formats
        css_selectors = {
            # to extract all links of non-ad results, including their snippets(descriptions) and titles.
            'results': (['li.g', 'h3.r > a', 'div.s span.st'], ),
            # to parse the centered ads
            'ads_main':
            (['div#center_col li.ads-ad', 'h3.r > a', 'div.ads-creative'],
             ['div#tads li', 'h3 > a:first-child', 'span:last-child']),
            # the ads on the right
            'ads_aside':
            (['#rhs_block li.ads-ad', 'h3.r > a', 'div.ads-creative'], ),
        }
        self._parse(dom, css_selectors)

    def _parse_image_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for an image search."""
        css_selectors = {
            'results': (['div.rg_di', 'a:first-child', 'span.rg_ilmn'], )
        }
        self._parse(dom, css_selectors)

    def _parse_video_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for a video search.

        Very similar to a normal search. Basically the same. But this is a unique method
        because the parsing logic may change over time.
        """
        css_selectors = {
            # to extract all links of non-ad results, including their snippets(descriptions) and titles.
            'results': (['li.g', 'h3.r > a:first-child', 'div.s > span.st'], ),
            # to parse the centered ads
            'ads_main':
            (['div#center_col li.ads-ad', 'h3.r > a', 'div.ads-creative'],
             ['div#tads li', 'h3 > a:first-child', 'span:last-child']),
            # the ads on the right
            'ads_aside':
            (['#rhs_block li.ads-ad', 'h3.r > a', 'div.ads-creative'], ),
        }
        self._parse(dom, css_selectors)

    def _parse_news_search(self, dom):
        """Specifies the CSS selectors to extract links/snippets for a news search.

        Is also similar to a normal search. But must be a separate function since
        https://news.google.com/nwshp? needs own parsing code...
        """
        css_selectors = {
            # to extract all links of non-ad results, including their snippets(descriptions) and titles.
            # The first CSS selector is the wrapper element where the search results are situated
            # the second CSS selector selects the link and the title. If there are 4 elements in the list, then
            # the second and the third element are for the link and the title.
            # the 4th selector is for the snippet.
            'results': (['li.g', 'h3.r > a:first-child', 'div.s span.st'], ),
            # to parse the centered ads
            'ads_main':
            (['div#center_col li.ads-ad', 'h3.r > a', 'div.ads-creative'],
             ['div#tads li', 'h3 > a:first-child', 'span:last-child']),
            # the ads on the right
            'ads_aside':
            (['#rhs_block li.ads-ad', 'h3.r > a', 'div.ads-creative'], ),
        }
        self._parse(dom, css_selectors)

    def _parse(self, dom, css_selectors):
        """Generic parse method"""
        for key, slist in css_selectors.items():
            for selectors in slist:
                self.search_results[key].extend(
                    self._parse_links(dom, *selectors))
        self._parse_num_results()

    def _parse_links(self, dom, container_selector, link_selector,
                     snippet_selector):
        links = []
        # Try to extract all links of non-ad results, including their snippets(descriptions) and titles.
        # The parsing should be as robust as possible. Sometimes we can't extract all data, but as much as humanly
        # possible.
        rank = 0
        try:
            li_g_results = dom.xpath(self._xp(container_selector))
            for i, e in enumerate(li_g_results):
                snippet = link = title = ''
                try:
                    link_element = e.xpath(self._xp(link_selector))
                    link = link_element[0].get('href')
                    title = link_element[0].text_content()
                    # For every result where we can parse the link and title, increase the rank
                    rank += 1
                except IndexError as err:
                    logger.debug(
                        'Error while parsing link/title element with selector={}: {}'
                        .format(link_selector, err))
                try:
                    for r in e.xpath(self._xp(snippet_selector)):
                        snippet += r.text_content()
                except Exception as err:
                    logger.debug(
                        'Error in parsing snippet with selector={}. Error: {}'.format(
                            snippet_selector, err))

                links.append(
                    self.Result(link_title=title,
                                link_url=link,
                                link_snippet=snippet,
                                link_position=rank))
        # Catch further errors besides parsing errors that take shape as IndexErrors
        except Exception as err:
            logger.error(
                'Error in parsing result links with selector={}: {}'.format(
                    container_selector, err))
        return links or []
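
The container / link / snippet selector triples consumed by _parse_links() above can be exercised in isolation. The following stand-alone sketch uses an invented SERP-like fragment; real Google markup changes frequently, so the selectors are illustrative only.

# Stand-alone sketch of the container/link/snippet pattern used by _parse_links().
import lxml.html
from cssselect import HTMLTranslator

_xp = HTMLTranslator().css_to_xpath
fragment = '''
<ol>
  <li class="g">
    <h3 class="r"><a href="/url?q=http://example.com">Example title</a></h3>
    <div class="s"><span class="st">Example snippet text.</span></div>
  </li>
</ol>
'''
dom = lxml.html.fromstring(fragment)
for container in dom.xpath(_xp('li.g')):
    link = container.xpath(_xp('h3.r > a'))[0]
    snippet = ''.join(s.text_content() for s in container.xpath(_xp('div.s span.st')))
    print(link.get('href'), link.text_content(), snippet)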
Code Example #11
# Assumed imports for this excerpt:
import requests
from lxml import etree
from cssselect import HTMLTranslator

g = requests.get('http://www.cetip.com.br')
# #ctl00_Banner_lblTaxDI
# //*[@id="ctl00_Banner_lblTaxDI"]
tree = etree.HTML(g.text)
res = etree.tostring(tree, pretty_print=True, method="html")
res = tree.xpath('//*[@id="ctl00_Banner_lblTaxDI"]')
print(res[0].text)

# body > div:nth-child(1) > div:nth-child(17) > table > tbody > tr > td > div > table
# /html/body/div[1]/div[6]/table/tbody/tr/td/div/table

g = requests.get('http://www.portalbrasil.net/ipca.htm')
tree = etree.HTML(g.text)
res = etree.tostring(tree, pretty_print=True, method="html")
xpath = HTMLTranslator().css_to_xpath('table:nth-last-child(1)')
print(xpath)
res = tree.xpath(xpath)
print(etree.tostring(res[0], pretty_print=True, method="html"))
# print(res)


g = requests.get('http://www2.bmf.com.br/pages/portal/bmfbovespa/boletim1/TxRef1.asp')
tree = etree.HTML(g.text)
res = etree.tostring(tree, pretty_print=True, method="html")
# tit <- xpathSApply(doc, "//td[contains(@class, 'tabelaTitulo')]", xmlValue)
# tit <- str_replace_all(tit, '\\s+', ' ')
# bases <- xpathSApply(doc, "//td[contains(@class, 'tabelaItem')]", xmlValue)
# bases <- str_replace_all(bases, '\\s+', '')
# bases <- str_replace_all(bases, '^(\\d+)[^\\d].*', '\\1')
# bases <- as.numeric(bases)
Code Example #12
import vim
from urllib import parse as uparse
from lxml import html, etree
from cssselect import HTMLTranslator
import sys

htmltrans = HTMLTranslator()

old_search_pattern = vim.eval('@/')
base_url="http://etymonline.com/index.php?term={}"
etymologynr = int(vim.eval("bufwinnr('^etymology$')"))
word_to_look_up = sys.argv[0]
term_start = "{} {{{{{{"
term_end = "}}}"

if etymologynr > -1:
	vim.command('{}wincmd w'.format(etymologynr))
else:
	vim.command('silent keepalt belowright split etymology')

vim.command('setlocal noswapfile nobuflisted nospell nowrap modifiable')
vim.command('setlocal buftype=nofile bufhidden=hide')
vim.command('setlocal foldmethod=marker textwidth=80 wrapmargin=0')

term_xpath = etree.XPath(htmltrans.css_to_xpath('dt'))
linkfixes = etree.XPath(htmltrans.css_to_xpath("a.crossreference"))
foreignfixes = etree.XPath(htmltrans.css_to_xpath("span.foreign"))

definitions = html.parse(base_url.format(uparse.quote_plus(word_to_look_up)))
lines = []
for foreignfix in foreignfixes(definitions):
Code Example #13
File: enact.py  Project: ohpauleez/enact
 def cssToXpath(css_selector, translator=None):
     if not translator:
         translator = HTMLTranslator()
     return translator.css_to_xpath(css_selector)
Code Example #14
File: helpers.py  Project: sanders41/docs-scraper
def css_to_xpath(css):
    return HTMLTranslator().css_to_xpath(css) if len(css) > 0 else ""
Code Example #15
def CSSSelect(expr):
    from cssselect import HTMLTranslator
    from lxml.etree import XPath
    return XPath(HTMLTranslator().css_to_xpath(expr))
Code Example #16
File: htmlparser.py  Project: pije76/dragline
def extract(self, rules, strict=False):
    parselet = Parselet(rules, strict=strict)
    return parselet.extract(self)


def cssselect(self, expr):
    return self._css_translator.css_to_xpath(expr)


def css(self, expr):
    return self.xpath(self.cssselect(expr))


html.HtmlElement.extract_text = extract_text
html.HtmlElement._css_translator = HTMLTranslator()
html.HtmlElement.cssselect = cssselect
html.HtmlElement.css = css
html.HtmlElement.extract = extract
html.HtmlElement.extract_urls = extract_urls


def HtmlParser(response):
    """
    :param response:
    :type response: :class:`dragline.http.Response`

    This function takes a response object as its argument and returns
    an lxml etree object.

    HtmlParser returns an lxml object of type HtmlElement, which gains a few additional helper methods.
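
Assuming the monkey-patched helpers above have been applied to html.HtmlElement (note that they replace lxml's built-in cssselect() with one that returns an XPath string), a minimal usage sketch with invented markup:

# Sketch only: relies on the cssselect/css patches installed on html.HtmlElement above.
from lxml import html

doc = html.fromstring('<div class="post"><a href="/x">read more</a></div>')
print(doc.cssselect('div.post a'))                      # the translated XPath string
print([a.get('href') for a in doc.css('div.post a')])   # ['/x']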
Code Example #17
# Searching on class names with a dash ('-')
from cssselect import HTMLTranslator
result = lxml_document.xpath(HTMLTranslator().css_to_xpath('div.reddit-entry'))
Code Example #18
    def _parse(self):
        """Parse the dom according to the provided css selectors.
        
        Raises: InvalidSearchTypeExcpetion if no css selectors for the searchtype could be found.
        """
        # try to parse the number of results.
        attr_name = self.searchtype + '_search_selectors'
        selector_dict = getattr(self, attr_name, None)

        # short alias because we use it so extensively
        css_to_xpath = HTMLTranslator().css_to_xpath

        # get the appropriate css selectors for the num_results for the keyword
        num_results_selector = getattr(self, 'num_results_search_selectors',
                                       None)
        if num_results_selector:
            self.search_results['num_results'] = self.dom.xpath(
                css_to_xpath(num_results_selector))[0].text_content()

        if not selector_dict:
            raise InvalidSearchTypeExcpetion(
                'There is no such attribute: {}. No selectors found'.format(
                    attr_name))

        for result_type, selectors in selector_dict.items():
            self.search_results[result_type] = []

            results = self.dom.xpath(
                css_to_xpath(
                    '{container} {result_container}'.format(**selectors)))

            to_extract = set(
                selectors.keys()) - {'container', 'result_container'}
            selectors_to_use = dict(((key, selectors[key])
                                     for key in to_extract
                                     if key in selectors.keys()))

            for index, result in enumerate(results):
                # Let's add primitive support for CSS3 pseudo selectors
                # We just need two of them
                # ::text
                # ::attr(someattribute)

                # You say we should use xpath expressions instead?
                # Maybe you're right, but they are complicated when it comes to classes,
                # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                serp_result = {}
                for key, selector in selectors_to_use.items():
                    value = None
                    if selector.endswith('::text'):
                        try:
                            value = result.xpath(
                                css_to_xpath(selector.split('::')
                                             [0]))[0].text_content()
                        except IndexError as e:
                            pass
                    else:
                        attr = re.search(r'::attr\((?P<attr>.*)\)$',
                                         selector).group('attr')
                        if attr:
                            try:
                                value = result.xpath(
                                    css_to_xpath(
                                        selector.split('::')[0]))[0].get(attr)
                            except IndexError as e:
                                pass
                        else:
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector))[0].text_content()
                            except IndexError as e:
                                pass
                    serp_result[key] = value
                if serp_result:
                    self.search_results[result_type].append(serp_result)
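
The ::text and ::attr(...) suffixes handled above are not understood by cssselect itself; the base selector is split off before translation. A stand-alone sketch of the same idea (markup and selectors invented):

# Stand-alone sketch of the ::text / ::attr(...) handling used in _parse() above.
import re
import lxml.html
from cssselect import HTMLTranslator

css_to_xpath = HTMLTranslator().css_to_xpath
result = lxml.html.fromstring('<div class="r"><a href="/hit" class="l">Title</a></div>')

def apply_selector(node, selector):
    if selector.endswith('::text'):
        return node.xpath(css_to_xpath(selector.split('::')[0]))[0].text_content()
    m = re.search(r'::attr\((?P<attr>.*)\)$', selector)
    if m:
        return node.xpath(css_to_xpath(selector.split('::')[0]))[0].get(m.group('attr'))
    return node.xpath(css_to_xpath(selector))[0].text_content()

print(apply_selector(result, 'a.l::text'))         # Title
print(apply_selector(result, 'a.l::attr(href)'))   # /hit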
Code Example #19
File: parsing.py  Project: sisteamnik/GoogleScraper
    def _parse(self):
        """Internal method that parses the dom according to the provided css selectors.

        Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
        """

        # Try to parse the provided HTML string using lxml
        # strip all unnecessary information to save space
        cleaner = Cleaner()
        cleaner.scripts = True
        cleaner.javascript = True
        cleaner.style = True

        try:
            parser = lxml.html.HTMLParser(encoding='utf-8')
            self.dom = lxml.html.document_fromstring(self.html, parser=parser)
            self.dom = cleaner.clean_html(self.dom)
            self.dom.resolve_base_href()
        except Exception as e:
            # maybe wrong encoding
            logger.error(e)

        # try to parse the number of results.
        attr_name = self.searchtype + '_search_selectors'
        selector_dict = getattr(self, attr_name, None)

        # short alias because we use it so extensively
        css_to_xpath = HTMLTranslator().css_to_xpath

        # get the appropriate css selectors for the num_results for the keyword
        num_results_selector = getattr(self, 'num_results_search_selectors',
                                       None)
        self.search_results['num_results'] = ''

        if isinstance(num_results_selector, list) and num_results_selector:
            for selector in num_results_selector:
                try:
                    self.search_results['num_results'] = self.dom.xpath(
                        css_to_xpath(selector))[0].text_content()
                except IndexError as e:
                    logger.warning(
                        'Cannot parse num_results from serp page with selector {}'
                        .format(selector))
                else:  # leave when first selector grabbed something
                    break

        if not selector_dict and not isinstance(selector_dict, dict):
            raise InvalidSearchTypeException(
                'There is no such attribute: {}. No selectors found'.format(
                    attr_name))

        for result_type, selector_class in selector_dict.items():

            self.search_results[result_type] = []

            for selector_specific, selectors in selector_class.items():

                results = self.dom.xpath(
                    css_to_xpath(
                        '{container} {result_container}'.format(**selectors)))
                to_extract = set(
                    selectors.keys()) - {'container', 'result_container'}
                selectors_to_use = {
                    key: selectors[key]
                    for key in to_extract if key in selectors.keys()
                }

                for index, result in enumerate(results):
                    # Let's add primitive support for CSS3 pseudo selectors
                    # We just need two of them
                    # ::text
                    # ::attr(attribute)

                    # You say we should use xpath expressions instead?
                    # Maybe you're right, but they are complicated when it comes to classes,
                    # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                    serp_result = {}
                    # keys are, for example, 'link', 'snippet', ...
                    # selector is the selector to grab these items
                    for key, selector in selectors_to_use.items():
                        value = None
                        if selector.endswith('::text'):
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector.split('::')
                                                 [0]))[0].text_content()
                            except IndexError as e:
                                pass
                        else:
                            attr = re.search(r'::attr\((?P<attr>.*)\)$',
                                             selector).group('attr')
                            if attr:
                                try:
                                    value = result.xpath(
                                        css_to_xpath(selector.split('::')
                                                     [0]))[0].get(attr)
                                except IndexError as e:
                                    pass
                            else:
                                try:
                                    value = result.xpath(css_to_xpath(
                                        selector))[0].text_content()
                                except IndexError as e:
                                    pass

                        serp_result[key] = value
                    # only add items that have not None links.
                    # Avoid duplicates. Detect them by the link.
                    # If statement below: Lazy evaluation. The more probable case first.
                    if 'link' in serp_result and serp_result['link'] and \
                            not [e for e in self.search_results[result_type] if e['link'] == serp_result['link']]:
                        self.search_results[result_type].append(serp_result)
Code Example #20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'


# pip install cssselect
from cssselect import HTMLTranslator
css_to_xpath = HTMLTranslator(xhtml=True).css_to_xpath


if __name__ == '__main__':
    xpath_expr = css_to_xpath('div#main > a[href]')
    print(xpath_expr)  # descendant-or-self::div[@id = 'main']/a[@href]

    xpath_expr = css_to_xpath('div')
    print(xpath_expr)  # descendant-or-self::div

    xpath_expr = css_to_xpath('table:nth-last-child(1)')
    print(xpath_expr)  # descendant-or-self::table[count(following-sibling::*) = 0]

    print()

    for item in ('#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head',
                 '.heading', '.contentheading', '.small_header_red'):
        xpath_expr = css_to_xpath(item)
        print(xpath_expr)
Code Example #21
File: amazon_plugin.py  Project: liwp-stephen/calibre
def CSSSelect(expr):
    from cssselect import HTMLTranslator
    return HTMLTranslator().css_to_xpath(expr)
Code Example #22
File: selector.py  Project: djangosporti/cyborg
import logging
from cssselect import HTMLTranslator, SelectorError
from functools import lru_cache


class SelectorException(RuntimeError):
    def __init__(self, selector):
        self.selector = selector


translator = HTMLTranslator()


@lru_cache()
def xpath(pattern):
    return translator.css_to_xpath(pattern)


logger = logging.getLogger("selector")


class Selector(object):
    def __init__(self, document):
        self.document = document
        self.translator = HTMLTranslator()

    def find(self, pattern):
        expression = xpath(pattern)
        results = [Selector(d) for d in self.document.xpath(expression)]
        if len(results) == 0:
            logger.warning("Selector {0} found 0 results".format(pattern))
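A small usage sketch for the cached xpath() helper above (markup invented); repeated patterns hit the lru_cache instead of being re-translated.

# Usage sketch for the module-level xpath() helper defined above.
import lxml.html

doc = lxml.html.fromstring('<ul><li class="hit">a</li><li>b</li></ul>')
expr = xpath('li.hit')                      # translated once, cached afterwards
print([e.text for e in doc.xpath(expr)])    # ['a']
xpath('li.hit')                             # second call is served from the cache
print(xpath.cache_info().hits)              # 1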
Code Example #23
 def scrap_value(doc, selector):
     xpath_selector = HTMLTranslator().css_to_xpath(selector)
     elems = doc.xpath(xpath_selector)
     return elems
Code Example #24
File: selector.py  Project: djangosporti/cyborg
 def __init__(self, document):
     self.document = document
     self.translator = HTMLTranslator()
Code Example #25
import re
from dataclasses import asdict, dataclass
from functools import lru_cache
from typing import Callable, Iterable, Iterator, List, Optional, Tuple, TypeVar
from urllib.parse import urljoin

import arrow
import attr
import requests
from cssselect import HTMLTranslator
from lxml.html import fromstring
from requests import Session
from robobrowser import RoboBrowser
from tqdm import tqdm

_ctx = lru_cache()(HTMLTranslator().css_to_xpath)
parser = arrow.parser.DateTimeParser('en_au', 100)
T = TypeVar('T')


def ctx(el, selector):
    return el.xpath(_ctx(selector))


def parse(string, fmt=None):
    if fmt is None:
        t = parser.parse_iso(string)
    else:
        t = parser.parse(string, fmt)
    return arrow.Arrow.fromdatetime(t)
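
A minimal usage sketch for the cached ctx() helper above (markup and selector invented):

# Usage sketch for ctx(); the markup below is invented for illustration.
from lxml.html import fromstring

doc = fromstring('<ul><li class="item">one</li><li class="item">two</li></ul>')
print([li.text for li in ctx(doc, 'li.item')])   # ['one', 'two']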
Code Example #26
 def process_query(self, query):
     xpath_query = HTMLTranslator().css_to_xpath(query)
     return super(CssSelector, self).process_query(xpath_query)
Code Example #27
    def _search(self):
        """The actual search and parsing of the results.
 
        Private, internal method.
        Parsing is done with lxml and cssselect. The html structure of the Google Search
        results may change over time. Effective: February 2014
        """
        self._build_query()

        if DO_CACHING:
            html = get_cached(self._SEARCH_PARAMS)
            self.SEARCH_RESULTS['cache_file'] = os.path.join(
                CACHEDIR, cached_file_name(self._SEARCH_PARAMS))
        else:
            html = False

        if not html:
            try:
                r = requests.get(self._SEARCH_URL,
                                 headers=self._HEADERS,
                                 params=self._SEARCH_PARAMS,
                                 timeout=3.0)

                logger.debug("Scraped with url: {}".format(r.url))

            except requests.ConnectionError as cerr:
                print('Network problem occurred: {}'.format(cerr))
                return False
            except requests.Timeout as terr:
                print('Connection timeout: {}'.format(terr))
                return False

            if not r.ok:
                print('HTTP Error:', r.status_code)
                if str(r.status_code)[0] == '5':
                    print('Maybe google recognizes you as sneaky spammer after'
                          ' you requested their services too inexhaustibly :D')
                return False

            html = r.text
            # cache fresh results
            if DO_CACHING:
                cache_results(self._SEARCH_PARAMS, html)
                self.SEARCH_RESULTS['cache_file'] = os.path.join(
                    CACHEDIR, cached_file_name(self._SEARCH_PARAMS))

        # Try to parse the google HTML result using lxml
        try:
            doc = UnicodeDammit(html, is_html=True)
            parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
            dom = lxml.html.document_fromstring(html, parser=parser)
            dom.resolve_base_href()
        except Exception as e:
            print('Some error occurred while lxml tried to parse: {}'.format(e))
            return False

        # Try to extract all links of non-ad results, including their snippets(descriptions) and titles.
        try:
            li_g_results = dom.xpath(HTMLTranslator().css_to_xpath('li.g'))
            links = []
            for e in li_g_results:
                try:
                    link_element = e.xpath(
                        HTMLTranslator().css_to_xpath('h3.r > a:first-child'))
                    link = link_element[0].get('href')
                    title = link_element[0].text_content()
                except IndexError as err:
                    logger.error(
                        'Error while parsing link/title element: {}'.format(
                            err))
                    continue
                try:
                    snippet_element = e.xpath(
                        HTMLTranslator().css_to_xpath('div.s > span.st'))
                    snippet = snippet_element[0].text_content()
                except IndexError as err:
                    logger.error(
                        'Error while parsing snippet element: {}'.format(err))
                    continue

                links.append(
                    self.Result(link_title=title,
                                link_url=link,
                                link_snippet=snippet))
        # Catch further errors besides parsing errors that take shape as IndexErrors
        except Exception as err:
            logger.error('Error in parsing result links: {}'.format(err))

        self.SEARCH_RESULTS['results'].extend(links)

        # try to get the number of results for our search query
        try:
            self.SEARCH_RESULTS['num_results_for_kw'] = \
                dom.xpath(HTMLTranslator().css_to_xpath('div#resultStats'))[0].text_content()
        except Exception as e:
            logger.critical(e)
Code Example #28
File: scrape.py  Project: dcloud/thatsfantastic
import lxml.html
from cssselect import HTMLTranslator
import re
from scraper.models import FilmDict
from scraper.utils import (decode_html, unicode_normalize,
                           clean_string, string_to_list,
                           correct_countries_list)
from cinema.utils import (titlecase, country_title)

html_translator = HTMLTranslator()
META_XPATH = html_translator.css_to_xpath('header.carousel-caption > h6')
ANCHOR_XPATH = html_translator.css_to_xpath('ul.thumbnails > li .thumbnail > a:nth-of-type(1)')
SYNOPSIS_GRAPHS_XPATH = "//div[@class='lead']/p"
DESCRIPTION_GRAPHS_XPATH = '//article/h4[2]/following-sibling::p'
DIRECTOR_REG = r'dir\.\s+([^\d]+)'
COUNTRIES_REG = r'(?:\,\s+(\w[\'\w\s]+)+)'


class HTMLScraper:
    """docstring for HTMLScraper"""
    def __init__(self, raw_html, source_url=None):
        super(HTMLScraper, self).__init__()
        self.source_url = source_url
        self.raw_html = raw_html
        self._tree = None

    @property
    def tree(self):
        if self._tree is None:
            self._tree = self.make_tree()
        return self._tree
Code Example #29
class LinkExtractor(ABC):
    """
    The abstract class LinkExtractor defines the behavior for extracting links from a specific
    response while following specific rules; each subclass must implement the function _process(),
    which represents the link-extraction logic.

    Each element in the extracted links must be an object Link from common_crawler.link.
    """

    _css_translator = HTMLTranslator()

    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 tags=('a', 'area'),
                 attrs=('href', ),
                 canonicalize=False,
                 unique=True,
                 process_attr=None,
                 deny_extensions=None,
                 strip=True,
                 restrict_xpaths=(),
                 restrict_css=()):
        """
        :param allow:
            a regular expression tuple(or single value) that the URLs must match in order to extract.
        :param deny:
            a regular expression tuple(or single value), the match the successful URLs will not be extracted.
        :param allow_domains:
            a tuple(or single value) of a string containing domains which will be considered for extracting the links.
        :param deny_domains:
            a tuple(or single value) of a string containing domains which won't be considered for extracting the links.
        :param tags:
            a tags list(or single value) to consider when extracting links.
        :param attrs:
            an attribute list(or single value) which should be considered when looking for links to extract, only for those tags specified in the tags param.
        :param canonicalize:
            canonicalize each extracted url (using w3lib.url.canonicalize_url).
        :param unique:
            whether duplicate filtering should be applied to extracted links.
        :param process_attr:
            a function which receives each value extracted from the tag and attributes scanned and can modify the value and return a new one.
        :param deny_extensions:
            an extension list(or single value) that should be ignored when extracting links.
        :param strip:
            whether to strip whitespaces from extracted attributes, according to HTML5 standard.
        :param restrict_xpaths:
            an XPath or list of XPath which defines regions inside the response where links should
            be extracted from.
        :param restrict_css:
            a CSS or list of CSS which defines regions inside the response where links should
            be extracted from.
        """

        self.unique = unique
        self.strip = strip

        self.allowed_rule = compile_regexes(arg_to_iter(allow))
        self.denied_rule = compile_regexes(arg_to_iter(deny))
        self.allow_domains = set(arg_to_iter(allow_domains))
        self.deny_domains = set(arg_to_iter(deny_domains))

        self.deny_extensions = deny_extensions or IGNORED_EXTENSIONS
        self.deny_extensions = {'.' + x for x in arg_to_iter(self.deny_extensions)}

        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        self.scan_tag_func, self.scan_attr_func = lambda x: x in tags, lambda x: x in attrs
        self.process_attr = process_attr if callable(
            process_attr) else lambda v: v

        self.canonicalize = canonicalize
        if canonicalize:
            self.link_key = lambda link: link.url
        else:
            self.link_key = lambda link: canonicalize_url(link.url,
                                                          keep_fragments=True)

        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
        self.restrict_xpaths += tuple(
            map(self._css_translator.css_to_xpath, arg_to_iter(restrict_css)))

    def _link_allowed(self, link):
        """Return true if the link meets the requirements of the rules."""
        if not is_valid_url(link.url):
            return False
        if self.allowed_rule and not matches(link.url, self.allowed_rule):
            return False
        if self.denied_rule and matches(link.url, self.denied_rule):
            return False
        parsed_url = parse_url(link.url)
        if self.allow_domains and not url_in_domains(parsed_url,
                                                     self.allow_domains):
            return False
        if self.deny_domains and url_in_domains(parsed_url, self.deny_domains):
            return False
        if self.deny_extensions and url_has_extension(parsed_url,
                                                      self.deny_extensions):
            return False
        return True

    def _get_response_text(self, response, func_name='text', encoding='utf-8'):
        """
        Return the text of the response by invoking the specified function,
        or return the response itself if it is already a string.
        """
        if isinstance(response, str):
            return response
        if hasattr(response, func_name):
            text = getattr(response, func_name)
            text = text() if callable(text) else text
            return text.decode(encoding) if isinstance(text, bytes) else text
        raise ValueError(
            'The response must be a str or have a function/attribute for getting the text'
        )

    def _deduplicate(self, links):
        """Remove duplicate links."""
        if self.unique:
            return unique_list(list_=links, key=self.link_key)
        return links

    def extract_links(self, response, encoding='utf-8'):
        """
        Return extracted links from the specific response according to rules,
        invoke the function _link_allowed() for filtering invalid links.
        """
        links = self._process(response, encoding)
        links = [x for x in links if self._link_allowed(x)]
        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)
        links = self._deduplicate(links)
        return links

    @abstractmethod
    def _process(self, response, encoding='utf-8'):
        """
        Specific extract link logic that subclass implementation,
        need basis on the params tags, attrs, process_attr and strip to extracts.
        """
        raise NotImplementedError
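
The restrict_css patterns accepted by the constructor are translated to XPath up front (last lines of __init__ above). The project helpers (Link, arg_to_iter, and so on) are omitted here; the following stand-alone sketch shows just the cssselect/lxml part, with invented markup.

# Stand-alone sketch of how restrict_css regions narrow where links are taken from.
import lxml.html
from cssselect import HTMLTranslator

translator = HTMLTranslator()
restrict_css = ('div.content',)
restrict_xpaths = tuple(translator.css_to_xpath(c) for c in restrict_css)

doc = lxml.html.fromstring(
    '<div class="content"><a href="/keep">keep</a></div>'
    '<nav><a href="/skip">skip</a></nav>'
)
regions = [node for xp in restrict_xpaths for node in doc.xpath(xp)]
print([a.get('href') for region in regions for a in region.iter('a')])   # ['/keep']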
Code Example #30
except ImportError:
    raise RuntimeError('You need cssutils >= 0.9.9 for calibre')
from cssutils import (profile as cssprofiles, parseString, parseStyle, log as
                      cssutils_log, CSSParser, profiles, replaceUrls)
from lxml import etree
from cssselect import HTMLTranslator

from calibre import force_unicode
from calibre.ebooks import unit_convert
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize

cssutils_log.setLevel(logging.WARN)

_html_css_stylesheet = None
css_to_xpath = HTMLTranslator().css_to_xpath


def html_css_stylesheet():
    global _html_css_stylesheet
    if _html_css_stylesheet is None:
        html_css = open(os.path.join(os.path.dirname(__file__), 'html.css'),
                        'rb').read()
        _html_css_stylesheet = parseString(html_css, validate=False)
        _html_css_stylesheet.namespaces['h'] = XHTML_NS
    return _html_css_stylesheet


XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS

INHERITED = set([
Code Example #31
    def test_select(self):
        document = etree.fromstring(HTML_IDS)
        sort_key = dict(
            (el, count)
            for count, el in enumerate(document.getiterator())).__getitem__
        css_to_xpath = GenericTranslator().css_to_xpath
        html_css_to_xpath = HTMLTranslator().css_to_xpath

        def select_ids(selector, html_only):
            xpath = css_to_xpath(selector)
            items = document.xpath(xpath)
            if html_only:
                assert items == []
                xpath = html_css_to_xpath(selector)
                items = document.xpath(xpath)
            items.sort(key=sort_key)
            return [element.get('id', 'nil') for element in items]

        def pcss(main, *selectors, **kwargs):
            html_only = kwargs.pop('html_only', False)
            result = select_ids(main, html_only)
            for selector in selectors:
                assert select_ids(selector, html_only) == result
            return result

        all_ids = pcss('*')
        assert all_ids[:6] == [
            'html', 'nil', 'link-href', 'link-nohref', 'nil', 'outer-div'
        ]
        assert all_ids[-1:] == ['foobar-span']
        assert pcss('div') == ['outer-div', 'li-div', 'foobar-div']
        assert pcss('DIV',
                    html_only=True) == ['outer-div', 'li-div', 'foobar-div'
                                        ]  # case-insensitive in HTML
        assert pcss('div div') == ['li-div']
        assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div']
        assert pcss('a[name]') == ['name-anchor']
        assert pcss('a[NAme]',
                    html_only=True) == ['name-anchor'
                                        ]  # case-insensitive in HTML:
        assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor']
        assert pcss('a[rel="tag"]') == ['tag-anchor']
        assert pcss('a[href*="localhost"]') == ['tag-anchor']
        assert pcss('a[href*=""]') == []
        assert pcss('a[href^="http"]') == ['tag-anchor', 'nofollow-anchor']
        assert pcss('a[href^="http:"]') == ['tag-anchor']
        assert pcss('a[href^=""]') == []
        assert pcss('a[href$="org"]') == ['nofollow-anchor']
        assert pcss('a[href$=""]') == []
        assert pcss('div[foobar~="bc"]',
                    'div[foobar~="cde"]') == ['foobar-div']
        assert pcss('[foobar~="ab bc"]', '[foobar~=""]',
                    '[foobar~=" \t"]') == []
        assert pcss('div[foobar~="cd"]') == []
        assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ['second-li']
        # Attribute values are case sensitive
        assert pcss('*[lang|="en"]', '[lang|="en-US"]') == []
        assert pcss('*[lang|="e"]') == []
        # ... :lang() is not.
        assert pcss(':lang("EN")', '*:lang(en-US)',
                    html_only=True) == ['second-li', 'li-div']
        assert pcss(':lang("e")', html_only=True) == []
        assert pcss('li:nth-child(3)') == ['third-li']
        assert pcss('li:nth-child(10)') == []
        assert pcss(
            'li:nth-child(2n)', 'li:nth-child(even)',
            'li:nth-child(2n+0)') == ['second-li', 'fourth-li', 'sixth-li']
        assert pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') == [
            'first-li', 'third-li', 'fifth-li', 'seventh-li'
        ]
        assert pcss('li:nth-child(2n+4)') == ['fourth-li', 'sixth-li']
        # FIXME: I'm not 100% sure this is right:
        assert pcss('li:nth-child(3n+1)') == [
            'first-li', 'fourth-li', 'seventh-li'
        ]
        assert pcss('li:nth-last-child(0)') == ['seventh-li']
        assert pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)') == [
            'second-li', 'fourth-li', 'sixth-li'
        ]
        assert pcss('li:nth-last-child(2n+2)') == ['second-li', 'fourth-li']
        assert pcss('ol:first-of-type') == ['first-ol']
        assert pcss('ol:nth-child(1)') == []
        assert pcss('ol:nth-of-type(2)') == ['second-ol']
        # FIXME: like above', '(1) or (2)?
        assert pcss('ol:nth-last-of-type(1)') == ['first-ol']
        assert pcss('span:only-child') == ['foobar-span']
        assert pcss('li div:only-child') == ['li-div']
        assert pcss('div *:only-child') == ['li-div', 'foobar-span']
        self.assertRaises(ExpressionError, pcss, 'p *:only-of-type')
        assert pcss('p:only-of-type') == ['paragraph']
        assert pcss('a:empty', 'a:EMpty') == ['name-anchor']
        assert pcss('li:empty') == [
            'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li'
        ]
        assert pcss(':root', 'html:root') == ['html']
        assert pcss('li:root', '* :root') == []
        assert pcss('*:contains("link")', ':CONtains("link")') == [
            'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor'
        ]
        assert pcss('*:contains("LInk")') == []  # case sensitive
        assert pcss('*:contains("e")') == [
            'html', 'nil', 'outer-div', 'first-ol', 'first-li', 'paragraph',
            'p-em'
        ]
        assert pcss('*:contains("E")') == []  # case-sensitive
        assert pcss('.a', '.b', '*.a', 'ol.a') == ['first-ol']
        assert pcss('.c', '*.c') == ['first-ol', 'third-li', 'fourth-li']
        assert pcss('ol *.c', 'ol li.c', 'li ~ li.c',
                    'ol > li.c') == ['third-li', 'fourth-li']
        assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li']
        assert pcss('li div', 'li > div', 'div div') == ['li-div']
        assert pcss('div > div') == []
        assert pcss('div>.c', 'div > .c') == ['first-ol']
        assert pcss('div + div') == ['foobar-div']
        assert pcss('a ~ a') == ['tag-anchor', 'nofollow-anchor']
        assert pcss('a[rel="tag"] ~ a') == ['nofollow-anchor']
        assert pcss('ol#first-ol li:last-child') == ['seventh-li']
        assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li']
        assert pcss('#outer-div:first-child') == ['outer-div']
        assert pcss('#outer-div :first-child') == [
            'name-anchor', 'first-li', 'li-div', 'p-b',
            'checkbox-fieldset-disabled', 'area-href'
        ]
        assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor']
        assert pcss(':not(*)') == []
        assert pcss('a:not([href])') == ['name-anchor']
        assert pcss('ol :Not(li[class])') == [
            'first-li', 'second-li', 'li-div', 'fifth-li', 'sixth-li',
            'seventh-li'
        ]
        # Invalid characters in XPath element names, should not crash
        assert pcss(r'di\a0 v', r'div\[') == []
        assert pcss(r'[h\a0 ref]', r'[h\]ref]') == []

        # HTML-specific
        assert pcss(':link', html_only=True) == [
            'link-href', 'tag-anchor', 'nofollow-anchor', 'area-href'
        ]
        assert pcss(':visited', html_only=True) == []
        assert pcss(':enabled', html_only=True) == [
            'link-href', 'tag-anchor', 'nofollow-anchor', 'checkbox-unchecked',
            'text-checked', 'checkbox-checked', 'area-href'
        ]
        assert pcss(':disabled', html_only=True) == [
            'checkbox-disabled', 'checkbox-disabled-checked', 'fieldset',
            'checkbox-fieldset-disabled'
        ]
        assert pcss(':checked', html_only=True) == [
            'checkbox-checked', 'checkbox-disabled-checked'
        ]
Code Example #32
import vim
from urllib import parse as uparse
from lxml import html, etree
from cssselect import HTMLTranslator
import sys

htmltrans = HTMLTranslator()

old_search_pattern = vim.eval('@/')
base_url = "http://etymonline.com/index.php?term={}"
etymologynr = int(vim.eval("bufwinnr('^etymology$')"))
word_to_look_up = sys.argv[0]
term_start = "{} {{{{{{"
term_end = "}}}"

if etymologynr > -1:
    vim.command('{}wincmd w'.format(etymologynr))
else:
    vim.command('silent keepalt belowright split etymology')

vim.command('setlocal noswapfile nobuflisted nospell nowrap modifiable')
vim.command('setlocal buftype=nofile bufhidden=hide')
vim.command('setlocal foldmethod=marker textwidth=80 wrapmargin=0')

term_xpath = etree.XPath(htmltrans.css_to_xpath('dt'))
linkfixes = etree.XPath(htmltrans.css_to_xpath("a.crossreference"))
foreignfixes = etree.XPath(htmltrans.css_to_xpath("span.foreign"))

definitions = html.parse(base_url.format(uparse.quote_plus(word_to_look_up)))
lines = []
for foreignfix in foreignfixes(definitions):
Code Example #33
            return "{expression}{element_name}".format(
                expression=expression, element_name=element_names[0])
        elif len(element_names) > 1:
            element_names_xpath = " | ".join([
                "self::{element_name}".format(element_name=element_name)
                for element_name in element_names
            ])
            return "{expression}*[{element_names}]".format(
                expression=expression, element_names=element_names_xpath)
        else:
            return "{expression}*".format(expression=expression)


def to_xpath(node, exact=False):
    """
    Converts a given XPath :class:`Expression` into a corresponding string query.

    Args:
        node (Expression): An XPath :class:`Expression` to convert.
        exact (bool, optional): Whether the generated query should perform exact or approximate
            locator matches. Defaults to False.

    Returns:
        str: A valid XPath query corresponding to the given :class:`Expression`.
    """

    return Renderer(exact=exact).render(node)


_selector_to_xpath = partial(HTMLTranslator().selector_to_xpath, prefix=None)
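
Unlike css_to_xpath(), selector_to_xpath() operates on an already-parsed cssselect selector object, and prefix=None drops the usual descendant-or-self:: prefix. A minimal sketch of what the partial above produces (the selector string is invented):

# Sketch of the partial defined above; cssselect.parse() returns parsed Selector objects.
from functools import partial
from cssselect import HTMLTranslator, parse

_selector_to_xpath = partial(HTMLTranslator().selector_to_xpath, prefix=None)
print(_selector_to_xpath(parse('div.main > a[href]')[0]))   # no leading descendant-or-self::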