def fillemissionindb(self, query=""):
    self.cleardb()
    conn = sqlite3.connect('podcast.db')
    c = conn.cursor()
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    page = html.parse(self.url)
    try:
        expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
        expressionurl = GenericTranslator().css_to_xpath(self.argurl)
    except SelectorError as exc:
        raise ValueError('Invalid CSS selector') from exc
    for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)):
        try:
            title = re.search('.* au podcast (.*)', e.text).group(1)
            found = re.search(r'^.*sound/(.*)\.xml', eid.get("href")).group(1)
        except AttributeError:
            # Both stay empty when either pattern fails to match.
            title = ''
            found = ''
        etemp = emissioneurope1(title, found)
        print(self.name, etemp.name, etemp.podcasturl, etemp.idpod)
        # Parameterized insert; building the SQL by string concatenation
        # breaks on titles containing quotes.
        c.execute(
            "INSERT INTO emissions (station, title, podcasturl, idemission)"
            " VALUES (?, ?, ?, ?)",
            (self.name, etemp.name, etemp.podcasturl, str(etemp.idpod)),
        )
    conn.commit()
    conn.close()

def search_page(self):
    wiki_url = "https://en.wikipedia.org"
    search_param = "+".join(self.search_string.split(" ") + ["film"])
    url = f"{wiki_url}/w/index.php?search={search_param}&title=Special%3ASearch&go=Go"
    out = requests.get(url)
    if "index.php?search" not in out.url:  # If we guessed page name
        return out.url
    # text = "".join(re.split('<head>.*</head>', out.text,
    #                         flags=re.IGNORECASE | re.DOTALL))
    parser = XMLParser(recover=True)
    document = fromstring(out.text, parser=parser)
    expression = GenericTranslator().css_to_xpath('.mw-search-result')
    all_results = document.xpath(expression)
    if not all_results:
        raise NotFound(url)
    first_result = all_results[0]
    link_selector = GenericTranslator().css_to_xpath('a')
    first_link = first_result.xpath(link_selector)[0]
    first_result_url = f"{wiki_url}{first_link.get('href')}"
    return first_result_url

def transform_result(cls, text):
    translator = GenericTranslator()
    item_xpath = translator.css_to_xpath('div.card.movies a.title')
    document = html.fromstring(text)
    elements = document.xpath(item_xpath)
    log.debug(
        'found %r matching elements for xpath %r',
        len(elements), item_xpath,
    )

    def absolutize(path, base=urlsplit(cls.ENDPOINT_URL)):
        return urlunsplit((base.scheme, base.netloc, path, '', ''))

    items = ((e.get('title'), e.get('href')) for e in elements)
    return (SearchResult(title, absolutize(path), cls.SOURCE)
            for (title, path) in items)

def __init__(self, contentcss, titlecss=None):
    contentxpath = GenericTranslator().css_to_xpath(contentcss)
    titlexpath = None
    if titlecss is not None:
        titlexpath = GenericTranslator().css_to_xpath(titlecss)
    self.xpathparser = XPathParser(contentxpath=contentxpath,
                                   titlexpath=titlexpath)

def _get_request_body(self, element):
    """
    Get body params from sampler
    :param element:
    :return: dict
    """
    raw_body = self._get_bool_prop(element, 'HTTPSampler.postBodyRaw')
    xpath = GenericTranslator().css_to_xpath(
        "elementProp>collectionProp>elementProp")
    if raw_body:
        http_args_element = element.xpath(xpath)[0]
        body = self._get_string_prop(http_args_element, 'Argument.value')
        if body:
            self.log.debug('Got %s for body in %s (%s)',
                           body, element.tag, element.get("name"))
            return {"body": body}
        return {}
    body_params = {}
    http_args_collection = element.xpath(xpath)
    # Use a distinct loop variable so the debug log below still refers to
    # the sampler element, not the last argument node.
    for arg in http_args_collection:
        body_params[arg.get("name")] = self._get_string_prop(
            arg, 'Argument.value')
    if body_params:
        self.log.debug('Got %s for body in %s (%s)',
                       body_params, element.tag, element.get("name"))
        return {"body": body_params}
    return {}

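# A note on the selector trick above, as a minimal runnable sketch:
# cssselect's GenericTranslator translates element names that aren't HTML
# at all, so a CSS child chain can address JMX XML nodes directly.
from cssselect import GenericTranslator

print(GenericTranslator().css_to_xpath("elementProp>collectionProp>elementProp"))
# descendant-or-self::elementProp/collectionProp/elementProp
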
def get_best_albums():
    expression1 = GenericTranslator().css_to_xpath(".fr_list_heading.fr-text p")
    expression2 = GenericTranslator().css_to_xpath(".fr_list_sub_heading.fr-text p")
    content = requests.get(
        "https://www.factmag.com/2018/12/13/the-50-best-albums-of-2018/").text
    xml_tree = lxml.html.fromstring(content)
    artists = [element.text_content() for element in xml_tree.xpath(expression1)]
    albums = [element.text_content() for element in xml_tree.xpath(expression2)]
    return zip(artists, albums)

def test_unicode(self):
    if sys.version_info[0] < 3:
        css = '.a\xc1b'.decode('ISO-8859-1')
    else:
        css = '.a\xc1b'

    xpath = GenericTranslator().css_to_xpath(css)
    assert css[1:] in xpath
    xpath = xpath.encode('ascii', 'xmlcharrefreplace').decode('ASCII')
    # After xmlcharrefreplace, U+00C1 becomes the character reference &#193;.
    assert xpath == (
        "descendant-or-self::*[@class and contains("
        "concat(' ', normalize-space(@class), ' '), ' a&#193;b ')]")

def fillemissionindb(self, query=""):
    emissions = []
    self.cleardb()
    conn = sqlite3.connect('podcast.db')
    c = conn.cursor()
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    page = html.parse(self.url)
    try:
        expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
    except SelectorError as exc:
        raise ValueError('Invalid CSS selector') from exc
    for e in page.xpath(expressiontitle):
        try:
            found = re.search(
                'https://www.rtbf.be/' + self.nomcode + '/.*?programId=([^"]*)',
                e.get("href")).group(1)
        except AttributeError:
            found = ''
        etemp = emissionrtbf(e.get("title"), found)
        print(self.name, etemp.name, etemp.podcasturl, etemp.idpod)
        # Parameterized insert; string concatenation breaks on quoted titles.
        c.execute(
            "INSERT INTO emissions (station, title, podcasturl, idemission)"
            " VALUES (?, ?, ?, ?)",
            (self.name, etemp.name, etemp.podcasturl, str(etemp.idpod)),
        )
        emissions.append(etemp)
    self.emissions = emissions
    conn.commit()
    conn.close()

def execute(self, step):
    param = step['param']
    print("-------------------Executing %s -------------------" % step["name"])
    if self.config['iter'] == 0:
        browser_copy = self.browser_copy
        feed_copy = self.feed_copy
        if param['path']:
            try:
                expression = GenericTranslator().css_to_xpath(param['path'])
            except SelectorError:
                print('Invalid selector')
                return ["Finish Iteration", self.config['iter']]
            parser = etree.HTMLParser()
            tree = etree.parse(io.StringIO(browser_copy), parser)
            nodes = tree.xpath(expression)
            number_of_nodes = len(nodes)
        else:
            number_of_nodes = self.config['number_of_nodes']
        # Remember the count so later iterations can re-check the bound.
        self.config['number_of_nodes'] = number_of_nodes
    else:
        number_of_nodes = self.config['number_of_nodes']
    if self.config['iter'] <= number_of_nodes:
        if self.config['iter'] == 0:
            self.config['iter'] += 1
            return [browser_copy, feed_copy, self.config['iter'],
                    number_of_nodes]
        self.config['iter'] += 1
        return ["Continuing Iteration", self.config['iter']]
    return ["Finish Iteration", self.config['iter']]

def convert_css_to_xpath(css):
    """
    Convert CSS Selectors to XPath Selectors.
    Example:
        convert_css_to_xpath('button:contains("Next")')
        Output => "//button[contains(., 'Next')]"
    """
    xpath = GenericTranslator().css_to_xpath(css, prefix='//')
    return xpath

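# A minimal check of convert_css_to_xpath above, mirroring its own docstring
# example. It assumes cssselect's GenericTranslator, whose css_to_xpath()
# accepts a `prefix` keyword replacing the default 'descendant-or-self::' axis.
assert convert_css_to_xpath('button:contains("Next")') == \
    "//button[contains(., 'Next')]"
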
def get_ambient():
    expression = GenericTranslator().css_to_xpath(
        'div.entry-content > hr + p + p strong')
    expression2 = GenericTranslator().css_to_xpath('strong a')
    content = requests.get(
        "https://www.factmag.com/2018/12/16/best-ambient-2018/").text
    xml_tree = lxml.html.fromstring(content)
    albums = []
    for element in xml_tree.xpath(expression):
        links = element.xpath(expression2)
        if links:
            text_content = element.text_content()
            if text_content.startswith("Read next"):
                continue
            artist = text_content.split("\n")[0]
            album = links[0].text_content()
            albums.append((artist, album))
    return albums

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('html', nargs='?', type=argparse.FileType('rb'),
                        default=sys.stdin, help="HTML", metavar="HTML")
    parser.add_argument('-a', '--argument', default="",
                        help="argument to extract from tag")
    parser.add_argument('-b', '--body', action='store_true', default=False,
                        help="Enclose output with HTML and BODY tags")
    parser.add_argument('-e', '--expression', default=[], action='append',
                        help="XPath query or CSS3 selector")
    parser.add_argument('-f', '--file', default='',
                        help="File to read input from")
    parser.add_argument('-x', '--check-existance', action='store_true',
                        default=False,
                        help="Process return value signifying existence")
    parser.add_argument('-r', '--rawinput', action='store_true', default=False,
                        help="Do not parse HTML before feeding etree (useful "
                             "for escaping CData)")
    args = parser.parse_args()

    from cssselect import GenericTranslator
    expression = [e if e.startswith('//')
                  else GenericTranslator().css_to_xpath(e)
                  for e in args.expression]
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    inp = open(args.file, 'rb') if args.file else args.html
    if args.rawinput:
        document = etree.fromstring(inp.read())
    else:
        document = etree.parse(inp, html_parser)
    if args.body:
        sys.stdout.write("<!DOCTYPE html>\n<html>\n<body>\n")
    for e in expression:
        els = list(document.xpath(e))
        if args.check_existance:
            sys.exit(1 if len(els) == 0 else 0)
        for el in els:
            if isinstance(el, str):
                text = el
            elif not args.argument:
                text = etree.tostring(el, encoding='unicode')
            else:
                text = el.get(args.argument)
            if text is not None:
                sys.stdout.write(text.strip() + "\t")
    if args.body:
        sys.stdout.write("</body>\n</html>")
    sys.stdout.write('\n')
    sys.stdout.flush()

def movie_page(self, url):
    out = requests.get(url)
    document = fromstring(out.text)
    expression_thumb = GenericTranslator().css_to_xpath('.thumbborder')
    all_results = document.xpath(expression_thumb)
    if not all_results:
        # Fall back to the first image on the page
        expression_img = GenericTranslator().css_to_xpath('img')
        all_results = document.xpath(expression_img)
        if not all_results:
            raise NotFound(url)
    first_result = all_results[0]
    url = first_result.get("src")
    if url.startswith("//"):
        url = "https:" + url
    return url

def is_css(cls, selector):
    """Check whether the given selector is valid CSS."""
    try:
        GenericTranslator().css_to_xpath(selector)
    except SelectorError:
        return False
    return True

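# Hypothetical usage of is_css above; `Selector` is an assumed stand-in for
# whatever class defines it. cssselect rejects XPath syntax with a
# SelectorError, so raw XPath strings come back False.
assert Selector.is_css('div.item > a')    # parses as CSS
assert not Selector.is_css('//div[@id]')  # XPath, not valid CSS
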
def build(self) -> None:
    try:
        xpath_expr = GenericTranslator().css_to_xpath(self.expr)
    except SelectorError as exc:
        raise ExprError(extractor=self, exc=exc) from exc
    self._extractor = XPathExtractor(xpath_expr)
    self._extractor.build()
    self.built = True

def get_table_rows():
    document: str = read_document()
    tree = html.fromstring(document)
    expression = GenericTranslator().css_to_xpath('.coming_list tbody tr')
    elements = tree.xpath(expression)
    return elements

def fillemissionindb(self, query=""): self.cleardb() conn = connecttodb() c = conn.cursor() html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) page = html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): if eid.get("href"): try: if self.name == "France culture": foundb = re.search('/podcast/(.*)', eid.get("href")).group(1) pageb = html.parse( "https://www.franceculture.fr/podcast/" + foundb) aaa = pageb.xpath( GenericTranslator().css_to_xpath(".lien-rss"))[0] found = re.search("https.*rss_(.*)\.xml", aaa.get("href")).group(1) print(found) else: found = re.search('https.*rss_(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' else: found = "" etemp = emissionradiofrance(e.text, found) qqq = "INSERT INTO emissions (station, title, podcasturl, idemission) VALUES (\"" + self.name + "\",\"" + etemp.name + "\",'" + etemp.podcasturl + "','" + str( etemp.idpod) + "')" print(qqq) c.execute(qqq) conn.commit() conn.close()
def test_quoting(self):
    css_to_xpath = GenericTranslator().css_to_xpath
    assert css_to_xpath('*[aval="\'"]') == (
        '''descendant-or-self::*[@aval = "'"]''')
    assert css_to_xpath('*[aval="\'\'\'"]') == (
        """descendant-or-self::*[@aval = "'''"]""")
    assert css_to_xpath('*[aval=\'"\']') == (
        '''descendant-or-self::*[@aval = '"']''')
    assert css_to_xpath('*[aval=\'"""\']') == (
        '''descendant-or-self::*[@aval = '"""']''')

def get(self, selector):
    """
    Returns tree elements by CSS selector
    :type selector: str
    :return:
    """
    expression = GenericTranslator().css_to_xpath(selector)
    nodes = self.tree.xpath(expression)
    return nodes

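# What get() above does, inlined as a self-contained sketch: translate the
# CSS selector once, then run the resulting XPath against an lxml tree
# (the tree here is a stand-in for self.tree).
from lxml import html
from cssselect import GenericTranslator

tree = html.fromstring('<ul><li class="item">a</li><li class="item">b</li></ul>')
expression = GenericTranslator().css_to_xpath('li.item')
print([node.text for node in tree.xpath(expression)])  # ['a', 'b']
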
def to_xpath(cls, selector):
    """
    Convert a CSS selector to XPath. If a valid XPath expression is
    passed in, it is returned unchanged.
    """
    try:
        return GenericTranslator().css_to_xpath(selector)
    except SelectorError:
        if cls.is_xpath(selector):
            return selector
        return None

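# The CSS branch of to_xpath above, checked directly: this is cssselect's
# exact translation for a simple id selector. The XPath fallback depends on
# is_xpath, which is defined elsewhere in the class.
from cssselect import GenericTranslator

assert GenericTranslator().css_to_xpath('#main') == \
    "descendant-or-self::*[@id = 'main']"
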
def test_unicode_escapes(self):
    # \22 == '"'  \20 == ' '
    css_to_xpath = GenericTranslator().css_to_xpath
    assert css_to_xpath(r'*[aval="\'\22\'"]') == (
        '''descendant-or-self::*[@aval = concat("'",'"',"'")]''')
    assert css_to_xpath(r'*[aval="\'\22 2\'"]') == (
        '''descendant-or-self::*[@aval = concat("'",'"2',"'")]''')
    assert css_to_xpath(r'*[aval="\'\20 \'"]') == (
        '''descendant-or-self::*[@aval = "' '"]''')
    assert css_to_xpath('*[aval="\'\\20\r\n \'"]') == (
        '''descendant-or-self::*[@aval = "' '"]''')

def contains(cls, element, *text):
    """
    Build an XPath expression matching an element that contains the given
    text. Either CSS or XPath may be passed in; both end up as XPath.
    """
    condition = ""
    for string in text:
        condition += "[contains(., {})]".format(
            GenericTranslator().xpath_literal(string))
    xpath = '{0}{1}'.format(cls.to_xpath(element), condition)
    return xpath

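# xpath_literal does the heavy lifting in contains() above: it quotes
# arbitrary text safely for use inside an XPath string literal, picking
# whichever quote style the text does not contain.
from cssselect import GenericTranslator

gt = GenericTranslator()
print(gt.xpath_literal("O'Reilly"))      # "O'Reilly"  (double-quoted)
print(gt.xpath_literal('he said "hi"'))  # 'he said "hi"'  (single-quoted)
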
def fillemission(self, query="", iditt=1): emissions = [] html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) if iditt == 1: page = html.parse(self.url) else: page = html.parse(self.url + "?page=" + str(iditt) + "#results-list") try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) expressionother = GenericTranslator().css_to_xpath(".nav-pages a") except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): if eid.get("href"): try: found = re.search('.*/([^/]*)$', eid.get("href")).group(1) except AttributeError: found = '' else: found = "" etemp = emissionbbc(e.text, found, self.nomcode) emissions.append(etemp) for eoth in page.xpath(expressionother): totest = self.url + "?page=" + str(iditt + 1) + "#results-list" if eoth.get("href") == totest: print("yes " + eoth.get("href")) self.fillemission(query, iditt + 1) break if iditt == 11: self.emissions = emissions else: self.emissions += emissions
def __init__(self, css=None, xpath=None, namespaces=None):
    if xpath and css:
        raise ParserError(
            'At most one of "xpath" or "css" attributes can be specified.')
    if xpath:
        self.raw_xpath = xpath
    elif css:
        self.raw_xpath = GenericTranslator().css_to_xpath(css)
    else:
        self.raw_xpath = 'self::*'
    self.namespaces = namespaces
    self._compiled_xpath = None  # compile xpath lazily

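# Usage sketch for the constructor above; `Matcher` is an assumed stand-in
# for whatever class defines this __init__. Exactly one of css/xpath may be
# given, and omitting both selects the context node itself.
m_css = Matcher(css='table tr')      # raw_xpath via cssselect translation
m_xp = Matcher(xpath='//table//tr')  # raw_xpath used verbatim
m_self = Matcher()                   # raw_xpath == 'self::*'
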
def fillemission(self, query):
    emissions = []
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    theurl = self.url + query
    page = html.parse(theurl)
    try:
        expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
        expressionurl = GenericTranslator().css_to_xpath(self.argpodcast)
    except SelectorError as exc:
        raise ValueError('Invalid CSS selector') from exc
    if self.code == "rtlfr":
        for e in page.xpath(expressiontitle):
            try:
                found = re.search('https://www.rtl.fr/emission/([^"]*)',
                                  e.get("href")).group(1)
            except AttributeError:
                found = ''
            etemp = emissionrtl(e.get("title"), found)
            emissions.append(etemp)
    elif self.code == "rtl2fr":
        for e, eid in zip(page.xpath(expressiontitle),
                          page.xpath(expressionurl)):
            if eid.get("href"):
                try:
                    found = re.search('https://www.rtl2.fr/podcast/(.*).xml',
                                      eid.get("href")).group(1)
                except AttributeError:
                    found = ''
            else:
                found = ""
            # print(eid.get("href") + " " + found + " " + e.text)
            etemp = emissionrtl(e.text, found, True)
            emissions.append(etemp)
    self.emissions = emissions

def _load_foreach(self):
    # Scrape data over multiple pages
    page = requests.get(self.url)
    tree = etree.HTML(page.text)
    selector = GenericTranslator().css_to_xpath(self.foreach)
    elements = tree.xpath(selector)
    # Concatenate results from all pages
    return [
        sublist
        for e in elements
        for sublist in self._load_data(e.get('href'), e.text)
    ]

def fillemission(self, query=""): emissions = [] html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) page = html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): if eid.get("href"): try: if self.name == "France culture": foundb = re.search('/podcast/(.*)', eid.get("href")).group(1) pageb = html.parse( "https://www.franceculture.fr/podcast/" + foundb) aaa = pageb.xpath( GenericTranslator().css_to_xpath(".lien-rss"))[0] found = re.search("https.*rss_(.*)\.xml", aaa.get("href")).group(1) print(found) else: found = re.search('https.*rss_(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' else: found = "" etemp = emissionradiofrance(e.text, found) emissions.append(etemp) self.emissions = emissions
def fillemission(self, query=""): emissions = [] html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) page = html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: parser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): try: title = re.search('.* au podcast (.*)', e.text).group(1) found = re.search('^.*sound/(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' etemp = emissioneurope1(title, found) emissions.append(etemp) self.emissions = emissions
def preprocess_query(queries):
    for qs in queries:
        qs = filter(None, (x.strip() for x in qs.split("|")))
        # Convert CSS queries to XPath
        for query in qs:
            if not (query.startswith("//") or query.startswith("@")):
                from cssselect import GenericTranslator, SelectorError
                try:
                    # Try to interpret the selector as CSS
                    query = GenericTranslator().css_to_xpath(query)
                except SelectorError:
                    # Else fall back to XPath
                    pass
            yield query

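# A small run of preprocess_query above: pipe-separated entries are split
# and stripped, CSS parts are translated, and anything starting with '//'
# or '@' passes through untouched.
for q in preprocess_query(['li.item | //div[@id="x"]']):
    print(q)
# descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' item ')]
# //div[@id="x"]
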
def __init__(self, expr: str):
    super().__init__(expr)
    if _missing_cssselect:
        _missing_dependency("cssselect")

    # Third Party Library
    from cssselect import GenericTranslator
    from cssselect.parser import SelectorError

    try:
        xpath_expr = GenericTranslator().css_to_xpath(self.expr)
    except SelectorError as exc:
        raise ExprError(extractor=self, exc=exc) from exc
    self._extractor = XPathExtractor(xpath_expr)

    elem.text = comment.text
    elem.tail = comment.tail
    pbody = comment.getparent()
    converted_blockquotes.append((elem, pbody, pbody.index(comment)))
for elem, pbody, pbodyidx in converted_blockquotes:
    pbody[pbodyidx] = elem
for ul in body.iter(tag=['ul', 'ol']):
    for li in ul.iter('li'):
        neighbor = li.getnext()
        while neighbor is not None and neighbor.tag != 'li':
            li.append(neighbor)
            neighbor = li.getnext()
css_translator = GenericTranslator()
unique_links = set()
for link in body.xpath(css_translator.css_to_xpath('a[href]')):
    unique_links.add(link.attrib['href'])
for url in unique_links:
    elem = html.Element('a')
    elem.attrib['href'] = url
    textparts = []
    duplinks = css_translator.css_to_xpath('a[href="{}"]'.format(url))
    first_dup_link = None
    more_things = False
    for duplink in body.xpath(duplinks):
        if first_dup_link is None:
            first_dup_link = duplink

def _fetch_img_of_character(char, root_folder, dict_not_found):
    root_char = os.path.join(root_folder, char)
    if not os.path.exists(root_char):
        os.makedirs(root_char)
    url_root = 'http://www.chineseetymology.org'
    url = ('http://www.chineseetymology.org/CharacterEtymology.aspx'
           '?characterInput=' + quote(char))
    attempts = 0
    max_attempts = 20
    while attempts < max_attempts:
        try:
            page = urlopen(url).read().decode('utf8')
            break
        except (TimeoutError, URLError, ConnectionError) as e:
            attempts += 1
            if isinstance(e, TimeoutError):
                msg = 'Time out when opening page %s. Retrying.' % url
            elif isinstance(e, URLError):
                msg = ('Error "%s" occurs when opening page %s. Retrying.'
                       % (e.reason, url))
            elif isinstance(e, ConnectionError):
                msg = ('Error "%s" occurs when opening page %s. Retrying.'
                       % (str(e), url))
            else:
                msg = 'Reached impossible branch.'
            _logger.warning(msg)
    if attempts == max_attempts:
        _logger.error('Max attempts reached. Fail to open page ' + url)
        return
    page = fromstring(page)
    gt = GenericTranslator()
    seal_selector = gt.css_to_xpath("span#SealImages img")
    lst_selector = gt.css_to_xpath("span#LstImages img")
    bronze_selector = gt.css_to_xpath("span#BronzeImages img")
    oracle_selector = gt.css_to_xpath("span#OracleImages img")
    seal_img = [img.get('src') for img in page.xpath(seal_selector)]
    lst_img = [img.get('src') for img in page.xpath(lst_selector)]
    bronze_img = [img.get('src') for img in page.xpath(bronze_selector)]
    oracle_img = [img.get('src') for img in page.xpath(oracle_selector)]
    all_img = {"seal": seal_img, "lst": lst_img,
               "bronze": bronze_img, "oracle": oracle_img}
    for folder in all_img:
        folder_full = os.path.join(root_char, folder)
        if not os.path.exists(folder_full):
            os.makedirs(folder_full)
        for img_src in all_img[folder]:
            (_, gif_name) = os.path.split(img_src)
            gif_full_path = os.path.join(folder_full, gif_name)
            if os.path.exists(gif_full_path):
                continue
            img_url = url_root + img_src
            attempts = 0
            while attempts < max_attempts:
                try:
                    urlretrieve(img_url, gif_full_path)
                    break
                except TimeoutError:
                    # Count every failure, otherwise the loop never ends.
                    attempts += 1
                    _logger.warning('Time out when downloading %s to %s.'
                                    ' Retrying.' % (img_url, gif_full_path))
                except HTTPError as e:
                    attempts += 1
                    msg = ('Error "%s" occurs when downloading %s to %s'
                           % (e.reason, img_url, gif_full_path))
                    if e.code == 404:
                        dict_not_found[gif_full_path] = img_url
                        _logger.warning(msg)
                        break
                    _logger.warning(msg + ' Retrying.')
                except URLError as e:
                    attempts += 1
                    _logger.warning('Error "%s" occurs when downloading %s'
                                    ' to %s. Retrying.'
                                    % (e.reason, img_url, gif_full_path))
                except ConnectionError as e:
                    attempts += 1
                    _logger.warning('Error "%s" occurs when downloading %s'
                                    ' to %s. Retrying.'
                                    % (str(e), img_url, gif_full_path))
            if attempts == max_attempts:
                _logger.error('Max attempts reached. Fail to download image '
                              + img_url)
