def fillemissionindb(self, query=""):
    self.cleardb()
    conn = sqlite3.connect('podcast.db')
    c = conn.cursor()
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    page = html.parse(self.url)
    try:
        expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
        expressionurl = GenericTranslator().css_to_xpath(self.argurl)
    except SelectorError as exc:
        raise ValueError('Invalid CSS selector') from exc
    for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)):
        try:
            title = re.search('.* au podcast (.*)', e.text).group(1)
            found = re.search(r'^.*sound/(.*)\.xml', eid.get("href")).group(1)
        except AttributeError:
            # Both stay empty when either pattern fails to match.
            title = ''
            found = ''
        etemp = emissioneurope1(title, found)
        print(self.name, etemp.name, etemp.podcasturl, etemp.idpod)
        # Parameterized insert; building the SQL by string concatenation
        # breaks on titles containing quotes.
        c.execute(
            "INSERT INTO emissions (station, title, podcasturl, idemission)"
            " VALUES (?, ?, ?, ?)",
            (self.name, etemp.name, etemp.podcasturl, str(etemp.idpod)),
        )
    conn.commit()
    conn.close()

def search_page(self):
    wiki_url = "https://en.wikipedia.org"
    search_param = "+".join(self.search_string.split(" ") + ["film"])
    url = f"{wiki_url}/w/index.php?search={search_param}&title=Special%3ASearch&go=Go"
    out = requests.get(url)
    if "index.php?search" not in out.url:  # If we guessed page name
        return out.url
    # text = "".join(re.split('<head>.*</head>', out.text,
    #                         flags=re.IGNORECASE | re.DOTALL))
    parser = XMLParser(recover=True)
    document = fromstring(out.text, parser=parser)
    expression = GenericTranslator().css_to_xpath('.mw-search-result')
    all_results = document.xpath(expression)
    if not all_results:
        raise NotFound(url)
    first_result = all_results[0]
    link_selector = GenericTranslator().css_to_xpath('a')
    first_link = first_result.xpath(link_selector)[0]
    first_result_url = f"{wiki_url}{first_link.get('href')}"
    return first_result_url

def transform_result(cls, text):
    translator = GenericTranslator()
    item_xpath = translator.css_to_xpath('div.card.movies a.title')
    document = html.fromstring(text)
    elements = document.xpath(item_xpath)
    log.debug(
        'found %r matching elements for xpath %r',
        len(elements), item_xpath,
    )

    def absolutize(path, base=urlsplit(cls.ENDPOINT_URL)):
        return urlunsplit((base.scheme, base.netloc, path, '', ''))

    items = ((e.get('title'), e.get('href')) for e in elements)
    return (SearchResult(title, absolutize(path), cls.SOURCE)
            for (title, path) in items)

def __init__(self, contentcss, titlecss=None):
    contentxpath = GenericTranslator().css_to_xpath(contentcss)
    titlexpath = None
    if titlecss is not None:
        titlexpath = GenericTranslator().css_to_xpath(titlecss)
    self.xpathparser = XPathParser(contentxpath=contentxpath,
                                   titlexpath=titlexpath)

def _get_request_body(self, element):
    """
    Get body params from sampler
    :param element:
    :return: dict
    """
    raw_body = self._get_bool_prop(element, 'HTTPSampler.postBodyRaw')
    xpath = GenericTranslator().css_to_xpath(
        "elementProp>collectionProp>elementProp")
    if raw_body:
        http_args_element = element.xpath(xpath)[0]
        body = self._get_string_prop(http_args_element, 'Argument.value')
        if body:
            self.log.debug('Got %s for body in %s (%s)',
                           body, element.tag, element.get("name"))
            return {"body": body}
        return {}
    body_params = {}
    http_args_collection = element.xpath(xpath)
    # Use a distinct loop variable so the debug log below still refers to
    # the sampler element, not the last argument node.
    for arg in http_args_collection:
        body_params[arg.get("name")] = self._get_string_prop(
            arg, 'Argument.value')
    if body_params:
        self.log.debug('Got %s for body in %s (%s)',
                       body_params, element.tag, element.get("name"))
        return {"body": body_params}
    return {}

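# A note on the selector trick above, as a minimal runnable sketch:
# cssselect's GenericTranslator translates element names that aren't HTML
# at all, so a CSS child chain can address JMX XML nodes directly.
from cssselect import GenericTranslator

print(GenericTranslator().css_to_xpath("elementProp>collectionProp>elementProp"))
# descendant-or-self::elementProp/collectionProp/elementProp
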
def get_best_albums():
    expression1 = GenericTranslator().css_to_xpath(".fr_list_heading.fr-text p")
    expression2 = GenericTranslator().css_to_xpath(".fr_list_sub_heading.fr-text p")
    content = requests.get(
        "https://www.factmag.com/2018/12/13/the-50-best-albums-of-2018/").text
    xml_tree = lxml.html.fromstring(content)
    artists = [element.text_content() for element in xml_tree.xpath(expression1)]
    albums = [element.text_content() for element in xml_tree.xpath(expression2)]
    return zip(artists, albums)

def test_unicode(self):
    if sys.version_info[0] < 3:
        css = '.a\xc1b'.decode('ISO-8859-1')
    else:
        css = '.a\xc1b'

    xpath = GenericTranslator().css_to_xpath(css)
    assert css[1:] in xpath
    xpath = xpath.encode('ascii', 'xmlcharrefreplace').decode('ASCII')
    # After xmlcharrefreplace, U+00C1 becomes the character reference &#193;.
    assert xpath == (
        "descendant-or-self::*[@class and contains("
        "concat(' ', normalize-space(@class), ' '), ' a&#193;b ')]")

def fillemissionindb(self, query=""):
    emissions = []
    self.cleardb()
    conn = sqlite3.connect('podcast.db')
    c = conn.cursor()
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    page = html.parse(self.url)
    try:
        expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
    except SelectorError as exc:
        raise ValueError('Invalid CSS selector') from exc
    for e in page.xpath(expressiontitle):
        try:
            found = re.search(
                'https://www.rtbf.be/' + self.nomcode + '/.*?programId=([^"]*)',
                e.get("href")).group(1)
        except AttributeError:
            found = ''
        etemp = emissionrtbf(e.get("title"), found)
        print(self.name, etemp.name, etemp.podcasturl, etemp.idpod)
        # Parameterized insert; string concatenation breaks on quoted titles.
        c.execute(
            "INSERT INTO emissions (station, title, podcasturl, idemission)"
            " VALUES (?, ?, ?, ?)",
            (self.name, etemp.name, etemp.podcasturl, str(etemp.idpod)),
        )
        emissions.append(etemp)
    self.emissions = emissions
    conn.commit()
    conn.close()

def execute(self, step):
    param = step['param']
    print("-------------------Executing %s -------------------" % step["name"])
    if self.config['iter'] == 0:
        browser_copy = self.browser_copy
        feed_copy = self.feed_copy
        if param['path']:
            try:
                expression = GenericTranslator().css_to_xpath(param['path'])
            except SelectorError:
                print('Invalid selector')
                return ["Finish Iteration", self.config['iter']]
            parser = etree.HTMLParser()
            tree = etree.parse(io.StringIO(browser_copy), parser)
            nodes = tree.xpath(expression)
            number_of_nodes = len(nodes)
        else:
            number_of_nodes = self.config['number_of_nodes']
        # Remember the count so later iterations can re-check the bound.
        self.config['number_of_nodes'] = number_of_nodes
    else:
        number_of_nodes = self.config['number_of_nodes']
    if self.config['iter'] <= number_of_nodes:
        if self.config['iter'] == 0:
            self.config['iter'] += 1
            return [browser_copy, feed_copy, self.config['iter'],
                    number_of_nodes]
        self.config['iter'] += 1
        return ["Continuing Iteration", self.config['iter']]
    return ["Finish Iteration", self.config['iter']]

def convert_css_to_xpath(css):
    """
    Convert CSS Selectors to XPath Selectors.
    Example:
        convert_css_to_xpath('button:contains("Next")')
        Output => "//button[contains(., 'Next')]"
    """
    xpath = GenericTranslator().css_to_xpath(css, prefix='//')
    return xpath

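# A minimal check of convert_css_to_xpath above, mirroring its own docstring
# example. It assumes cssselect's GenericTranslator, whose css_to_xpath()
# accepts a `prefix` keyword replacing the default 'descendant-or-self::' axis.
assert convert_css_to_xpath('button:contains("Next")') == \
    "//button[contains(., 'Next')]"
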
def get_ambient():
    expression = GenericTranslator().css_to_xpath(
        'div.entry-content > hr + p + p strong')
    expression2 = GenericTranslator().css_to_xpath('strong a')
    content = requests.get(
        "https://www.factmag.com/2018/12/16/best-ambient-2018/").text
    xml_tree = lxml.html.fromstring(content)
    albums = []
    for element in xml_tree.xpath(expression):
        links = element.xpath(expression2)
        if links:
            text_content = element.text_content()
            if text_content.startswith("Read next"):
                continue
            artist = text_content.split("\n")[0]
            album = links[0].text_content()
            albums.append((artist, album))
    return albums

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('html', nargs='?', type=argparse.FileType('rb'),
                        default=sys.stdin, help="HTML", metavar="HTML")
    parser.add_argument('-a', '--argument', default="",
                        help="argument to extract from tag")
    parser.add_argument('-b', '--body', action='store_true', default=False,
                        help="Enclose output with HTML and BODY tags")
    parser.add_argument('-e', '--expression', default=[], action='append',
                        help="XPath query or CSS3 selector")
    parser.add_argument('-f', '--file', default='',
                        help="File to read input from")
    parser.add_argument('-x', '--check-existance', action='store_true',
                        default=False,
                        help="Process return value signifying existence")
    parser.add_argument('-r', '--rawinput', action='store_true', default=False,
                        help="Do not parse HTML before feeding etree (useful "
                             "for escaping CData)")
    args = parser.parse_args()

    from cssselect import GenericTranslator
    expression = [e if e.startswith('//')
                  else GenericTranslator().css_to_xpath(e)
                  for e in args.expression]
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    inp = open(args.file, 'rb') if args.file else args.html
    if args.rawinput:
        document = etree.fromstring(inp.read())
    else:
        document = etree.parse(inp, html_parser)
    if args.body:
        sys.stdout.write("<!DOCTYPE html>\n<html>\n<body>\n")
    for e in expression:
        els = list(document.xpath(e))
        if args.check_existance:
            sys.exit(1 if len(els) == 0 else 0)
        for el in els:
            if isinstance(el, str):
                text = el
            elif not args.argument:
                text = etree.tostring(el, encoding='unicode')
            else:
                text = el.get(args.argument)
            if text is not None:
                sys.stdout.write(text.strip() + "\t")
    if args.body:
        sys.stdout.write("</body>\n</html>")
    sys.stdout.write('\n')
    sys.stdout.flush()

def movie_page(self, url):
    out = requests.get(url)
    document = fromstring(out.text)
    expression_thumb = GenericTranslator().css_to_xpath('.thumbborder')
    all_results = document.xpath(expression_thumb)
    if not all_results:
        # Fall back to the first image on the page
        expression_img = GenericTranslator().css_to_xpath('img')
        all_results = document.xpath(expression_img)
        if not all_results:
            raise NotFound(url)
    first_result = all_results[0]
    url = first_result.get("src")
    if url.startswith("//"):
        url = "https:" + url
    return url

def is_css(cls, selector):
    """Check whether the given selector is valid CSS."""
    try:
        GenericTranslator().css_to_xpath(selector)
    except SelectorError:
        return False
    return True

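# Hypothetical usage of is_css above; `Selector` is an assumed stand-in for
# whatever class defines it. cssselect rejects XPath syntax with a
# SelectorError, so raw XPath strings come back False.
assert Selector.is_css('div.item > a')    # parses as CSS
assert not Selector.is_css('//div[@id]')  # XPath, not valid CSS
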
def build(self) -> None:
    try:
        xpath_expr = GenericTranslator().css_to_xpath(self.expr)
    except SelectorError as exc:
        raise ExprError(extractor=self, exc=exc) from exc
    self._extractor = XPathExtractor(xpath_expr)
    self._extractor.build()
    self.built = True

def get_table_rows():
    document: str = read_document()
    tree = html.fromstring(document)
    expression = GenericTranslator().css_to_xpath('.coming_list tbody tr')
    elements = tree.xpath(expression)
    return elements

def fillemissionindb(self, query=""): self.cleardb() conn = connecttodb() c = conn.cursor() html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) page = html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): if eid.get("href"): try: if self.name == "France culture": foundb = re.search('/podcast/(.*)', eid.get("href")).group(1) pageb = html.parse( "https://www.franceculture.fr/podcast/" + foundb) aaa = pageb.xpath( GenericTranslator().css_to_xpath(".lien-rss"))[0] found = re.search("https.*rss_(.*)\.xml", aaa.get("href")).group(1) print(found) else: found = re.search('https.*rss_(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' else: found = "" etemp = emissionradiofrance(e.text, found) qqq = "INSERT INTO emissions (station, title, podcasturl, idemission) VALUES (\"" + self.name + "\",\"" + etemp.name + "\",'" + etemp.podcasturl + "','" + str( etemp.idpod) + "')" print(qqq) c.execute(qqq) conn.commit() conn.close()
def test_quoting(self):
    css_to_xpath = GenericTranslator().css_to_xpath
    assert css_to_xpath('*[aval="\'"]') == (
        '''descendant-or-self::*[@aval = "'"]''')
    assert css_to_xpath('*[aval="\'\'\'"]') == (
        """descendant-or-self::*[@aval = "'''"]""")
    assert css_to_xpath('*[aval=\'"\']') == (
        '''descendant-or-self::*[@aval = '"']''')
    assert css_to_xpath('*[aval=\'"""\']') == (
        '''descendant-or-self::*[@aval = '"""']''')

def get(self, selector):
    """
    Returns tree elements by CSS selector
    :type selector: str
    :return:
    """
    expression = GenericTranslator().css_to_xpath(selector)
    nodes = self.tree.xpath(expression)
    return nodes

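# What get() above does, inlined as a self-contained sketch: translate the
# CSS selector once, then run the resulting XPath against an lxml tree
# (the tree here is a stand-in for self.tree).
from lxml import html
from cssselect import GenericTranslator

tree = html.fromstring('<ul><li class="item">a</li><li class="item">b</li></ul>')
expression = GenericTranslator().css_to_xpath('li.item')
print([node.text for node in tree.xpath(expression)])  # ['a', 'b']
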
def to_xpath(cls, selector):
    """
    Convert a CSS selector to XPath. If a valid XPath expression is
    passed in, it is returned unchanged.
    """
    try:
        return GenericTranslator().css_to_xpath(selector)
    except SelectorError:
        if cls.is_xpath(selector):
            return selector
        return None

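# The CSS branch of to_xpath above, checked directly: this is cssselect's
# exact translation for a simple id selector. The XPath fallback depends on
# is_xpath, which is defined elsewhere in the class.
from cssselect import GenericTranslator

assert GenericTranslator().css_to_xpath('#main') == \
    "descendant-or-self::*[@id = 'main']"
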
def test_unicode_escapes(self):
    # \22 == '"'  \20 == ' '
    css_to_xpath = GenericTranslator().css_to_xpath
    assert css_to_xpath(r'*[aval="\'\22\'"]') == (
        '''descendant-or-self::*[@aval = concat("'",'"',"'")]''')
    assert css_to_xpath(r'*[aval="\'\22 2\'"]') == (
        '''descendant-or-self::*[@aval = concat("'",'"2',"'")]''')
    assert css_to_xpath(r'*[aval="\'\20 \'"]') == (
        '''descendant-or-self::*[@aval = "' '"]''')
    assert css_to_xpath('*[aval="\'\\20\r\n \'"]') == (
        '''descendant-or-self::*[@aval = "' '"]''')

def contains(cls, element, *text):
    """
    Build an XPath expression matching an element that contains the given
    text. Either CSS or XPath may be passed in; both end up as XPath.
    """
    condition = ""
    for string in text:
        condition += "[contains(., {})]".format(
            GenericTranslator().xpath_literal(string))
    xpath = '{0}{1}'.format(cls.to_xpath(element), condition)
    return xpath

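# xpath_literal does the heavy lifting in contains() above: it quotes
# arbitrary text safely for use inside an XPath string literal, picking
# whichever quote style the text does not contain.
from cssselect import GenericTranslator

gt = GenericTranslator()
print(gt.xpath_literal("O'Reilly"))      # "O'Reilly"  (double-quoted)
print(gt.xpath_literal('he said "hi"'))  # 'he said "hi"'  (single-quoted)
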
def fillemission(self, query="", iditt=1): emissions = [] html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) if iditt == 1: page = html.parse(self.url) else: page = html.parse(self.url + "?page=" + str(iditt) + "#results-list") try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) expressionother = GenericTranslator().css_to_xpath(".nav-pages a") except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): if eid.get("href"): try: found = re.search('.*/([^/]*)$', eid.get("href")).group(1) except AttributeError: found = '' else: found = "" etemp = emissionbbc(e.text, found, self.nomcode) emissions.append(etemp) for eoth in page.xpath(expressionother): totest = self.url + "?page=" + str(iditt + 1) + "#results-list" if eoth.get("href") == totest: print("yes " + eoth.get("href")) self.fillemission(query, iditt + 1) break if iditt == 11: self.emissions = emissions else: self.emissions += emissions
def __init__(self, css=None, xpath=None, namespaces=None):
    if xpath and css:
        raise ParserError(
            'At most one of "xpath" or "css" attributes can be specified.')
    if xpath:
        self.raw_xpath = xpath
    elif css:
        self.raw_xpath = GenericTranslator().css_to_xpath(css)
    else:
        self.raw_xpath = 'self::*'
    self.namespaces = namespaces
    self._compiled_xpath = None  # compile xpath lazily

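# Usage sketch for the constructor above; `Matcher` is an assumed stand-in
# for whatever class defines this __init__. Exactly one of css/xpath may be
# given, and omitting both selects the context node itself.
m_css = Matcher(css='table tr')      # raw_xpath via cssselect translation
m_xp = Matcher(xpath='//table//tr')  # raw_xpath used verbatim
m_self = Matcher()                   # raw_xpath == 'self::*'
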
def fillemission(self, query):
    emissions = []
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    theurl = self.url + query
    page = html.parse(theurl)
    try:
        expressiontitle = GenericTranslator().css_to_xpath(self.argtitle)
        expressionurl = GenericTranslator().css_to_xpath(self.argpodcast)
    except SelectorError as exc:
        raise ValueError('Invalid CSS selector') from exc
    if self.code == "rtlfr":
        for e in page.xpath(expressiontitle):
            try:
                found = re.search('https://www.rtl.fr/emission/([^"]*)',
                                  e.get("href")).group(1)
            except AttributeError:
                found = ''
            etemp = emissionrtl(e.get("title"), found)
            emissions.append(etemp)
    elif self.code == "rtl2fr":
        for e, eid in zip(page.xpath(expressiontitle),
                          page.xpath(expressionurl)):
            if eid.get("href"):
                try:
                    found = re.search('https://www.rtl2.fr/podcast/(.*).xml',
                                      eid.get("href")).group(1)
                except AttributeError:
                    found = ''
            else:
                found = ""
            # print(eid.get("href") + " " + found + " " + e.text)
            etemp = emissionrtl(e.text, found, True)
            emissions.append(etemp)
    self.emissions = emissions

def _load_foreach(self):
    # Scrape data over multiple pages
    page = requests.get(self.url)
    tree = etree.HTML(page.text)
    selector = GenericTranslator().css_to_xpath(self.foreach)
    elements = tree.xpath(selector)
    # Concatenate results from all pages
    return [
        sublist
        for e in elements
        for sublist in self._load_data(e.get('href'), e.text)
    ]

def fillemission(self, query=""): emissions = [] html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) page = html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: return 0 #feedparser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): if eid.get("href"): try: if self.name == "France culture": foundb = re.search('/podcast/(.*)', eid.get("href")).group(1) pageb = html.parse( "https://www.franceculture.fr/podcast/" + foundb) aaa = pageb.xpath( GenericTranslator().css_to_xpath(".lien-rss"))[0] found = re.search("https.*rss_(.*)\.xml", aaa.get("href")).group(1) print(found) else: found = re.search('https.*rss_(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' else: found = "" etemp = emissionradiofrance(e.text, found) emissions.append(etemp) self.emissions = emissions
def fillemission(self, query=""): emissions = [] html_parser = etree.HTMLParser(encoding='utf-8', recover=True, strip_cdata=True) page = html.parse(self.url) try: expressiontitle = GenericTranslator().css_to_xpath(self.argtitle) expressionurl = GenericTranslator().css_to_xpath(self.argurl) except SelectorError: parser.error('Invalid CSS selector') for e, eid in zip(page.xpath(expressiontitle), page.xpath(expressionurl)): try: title = re.search('.* au podcast (.*)', e.text).group(1) found = re.search('^.*sound/(.*)\.xml', eid.get("href")).group(1) except AttributeError: found = '' etemp = emissioneurope1(title, found) emissions.append(etemp) self.emissions = emissions
def preprocess_query(queries):
    for qs in queries:
        qs = filter(None, (x.strip() for x in qs.split("|")))
        # Convert CSS queries to XPath
        for query in qs:
            if not (query.startswith("//") or query.startswith("@")):
                from cssselect import GenericTranslator, SelectorError
                try:
                    # Try to interpret the selector as CSS
                    query = GenericTranslator().css_to_xpath(query)
                except SelectorError:
                    # Else fall back to XPath
                    pass
            yield query

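# A small run of preprocess_query above: pipe-separated entries are split
# and stripped, CSS parts are translated, and anything starting with '//'
# or '@' passes through untouched.
for q in preprocess_query(['li.item | //div[@id="x"]']):
    print(q)
# descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' item ')]
# //div[@id="x"]
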
def __init__(self, expr: str):
    super().__init__(expr)
    if _missing_cssselect:
        _missing_dependency("cssselect")

    # Third Party Library
    from cssselect import GenericTranslator
    from cssselect.parser import SelectorError

    try:
        xpath_expr = GenericTranslator().css_to_xpath(self.expr)
    except SelectorError as exc:
        raise ExprError(extractor=self, exc=exc) from exc
    self._extractor = XPathExtractor(xpath_expr)

    elem.text = comment.text
    elem.tail = comment.tail
    pbody = comment.getparent()
    converted_blockquotes.append((elem, pbody, pbody.index(comment)))
for elem, pbody, pbodyidx in converted_blockquotes:
    pbody[pbodyidx] = elem
for ul in body.iter(tag=['ul', 'ol']):
    for li in ul.iter('li'):
        neighbor = li.getnext()
        while neighbor is not None and neighbor.tag != 'li':
            li.append(neighbor)
            neighbor = li.getnext()
css_translator = GenericTranslator()
unique_links = set()
for link in body.xpath(css_translator.css_to_xpath('a[href]')):
    unique_links.add(link.attrib['href'])
for url in unique_links:
    elem = html.Element('a')
    elem.attrib['href'] = url
    textparts = []
    duplinks = css_translator.css_to_xpath('a[href="{}"]'.format(url))
    first_dup_link = None
    more_things = False
    for duplink in body.xpath(duplinks):
        if first_dup_link is None:
            first_dup_link = duplink

def _fetch_img_of_character(char, root_folder, dict_not_found):
    root_char = os.path.join(root_folder, char)
    if not os.path.exists(root_char):
        os.makedirs(root_char)
    url_root = 'http://www.chineseetymology.org'
    url = ('http://www.chineseetymology.org/CharacterEtymology.aspx'
           '?characterInput=' + quote(char))
    attempts = 0
    max_attempts = 20
    while attempts < max_attempts:
        try:
            page = urlopen(url).read().decode('utf8')
            break
        except (TimeoutError, URLError, ConnectionError) as e:
            attempts += 1
            if isinstance(e, TimeoutError):
                msg = 'Time out when opening page %s. Retrying.' % url
            elif isinstance(e, URLError):
                msg = ('Error "%s" occurs when opening page %s. Retrying.'
                       % (e.reason, url))
            elif isinstance(e, ConnectionError):
                msg = ('Error "%s" occurs when opening page %s. Retrying.'
                       % (str(e), url))
            else:
                msg = 'Reached impossible branch.'
            _logger.warning(msg)
    if attempts == max_attempts:
        _logger.error('Max attempts reached. Fail to open page ' + url)
        return
    page = fromstring(page)
    gt = GenericTranslator()
    seal_selector = gt.css_to_xpath("span#SealImages img")
    lst_selector = gt.css_to_xpath("span#LstImages img")
    bronze_selector = gt.css_to_xpath("span#BronzeImages img")
    oracle_selector = gt.css_to_xpath("span#OracleImages img")
    seal_img = [img.get('src') for img in page.xpath(seal_selector)]
    lst_img = [img.get('src') for img in page.xpath(lst_selector)]
    bronze_img = [img.get('src') for img in page.xpath(bronze_selector)]
    oracle_img = [img.get('src') for img in page.xpath(oracle_selector)]
    all_img = {"seal": seal_img, "lst": lst_img,
               "bronze": bronze_img, "oracle": oracle_img}
    for folder in all_img:
        folder_full = os.path.join(root_char, folder)
        if not os.path.exists(folder_full):
            os.makedirs(folder_full)
        for img_src in all_img[folder]:
            (_, gif_name) = os.path.split(img_src)
            gif_full_path = os.path.join(folder_full, gif_name)
            if os.path.exists(gif_full_path):
                continue
            img_url = url_root + img_src
            attempts = 0
            while attempts < max_attempts:
                try:
                    urlretrieve(img_url, gif_full_path)
                    break
                except TimeoutError:
                    # Count every failure, otherwise the loop never ends.
                    attempts += 1
                    _logger.warning('Time out when downloading %s to %s.'
                                    ' Retrying.' % (img_url, gif_full_path))
                except HTTPError as e:
                    attempts += 1
                    msg = ('Error "%s" occurs when downloading %s to %s'
                           % (e.reason, img_url, gif_full_path))
                    if e.code == 404:
                        dict_not_found[gif_full_path] = img_url
                        _logger.warning(msg)
                        break
                    _logger.warning(msg + ' Retrying.')
                except URLError as e:
                    attempts += 1
                    _logger.warning('Error "%s" occurs when downloading %s'
                                    ' to %s. Retrying.'
                                    % (e.reason, img_url, gif_full_path))
                except ConnectionError as e:
                    attempts += 1
                    _logger.warning('Error "%s" occurs when downloading %s'
                                    ' to %s. Retrying.'
                                    % (str(e), img_url, gif_full_path))
            if attempts == max_attempts:
                _logger.error('Max attempts reached. Fail to download image '
                              + img_url)
