def extract(tag, regex, type, link, webContent):
    if type == "text":
        try:
            select = cssselect.CSSSelector(tag)
            root = etree.HTML(webContent)
            result = select(root)
            result2 = result[0].text
            if regex == "":
                return result2
            try:
                # Important: enable the re.S flag so '.' also matches newlines.
                match = re.search(regex, result2, re.S)
                try:
                    return match.group(0)
                except AttributeError:
                    print("No groups found")
            except re.error:
                print("No regex entry")
        except IndexError:
            print('No tags found')
    if type == "attribute":
        select = cssselect.CSSSelector(tag)
        root = etree.HTML(webContent)
        result = select(root)
        # Extract the host part of the link (everything up to ".com").
        link1 = re.search('(.+?.com)', link, re.S)
        host = link1.group(0)
        return host + result[0].attrib["href"]
def split_text_image_and_links(description):
    splitted_content = []
    document = html.document_fromstring(description)
    raw_text = document.text_content()
    splitted_content.append({'type': 'text', 'content': raw_text})
    document = html.fromstring(description)
    select = cssselect.CSSSelector("img")
    images = [el.get('src') for el in select(document)]
    if len(images) == 1:
        images = images[0]
    splitted_content.append({'type': 'image', 'content': images})
    document = html.fromstring(description)
    select = cssselect.CSSSelector("a")
    links = [el.get('href') for el in select(document)]
    splitted_content.append({'type': 'links', 'content': links})
    return splitted_content
def read_verses(tree, book: str, chapter: int) -> dict[str, Verse]:
    """Finds `Verse`s in the current document.

    Args:
        tree: ElementTree.
        book: Short name of the book.
        chapter: Chapter or section number.

    Returns:
        Dict of `Verse`s keyed by the reference form (e.g. "1 Ne. 3:7").
    """
    verses = {}
    for verse_element in cssselect.CSSSelector(".verse-first,.verse")(tree):
        verse = None
        for element in verse_element.iter():
            if element.get("class") == "verseNumber":
                verse = int(list(element.itertext())[0])
            # Remove verse numbers and reference markers.
            if element.get("class") in ["verseNumber", "marker"]:
                element.clear(keep_tail=True)
        text = "".join(verse_element.itertext())
        if not verse:
            if text.startswith(("After prayer",)):
                continue  # D&C 102:34.
            raise ValueError(
                f"could not find verse number for {book} {chapter}: {text}")
        key = f"{book} {chapter}:{verse}"
        verses[key] = Verse(book=book, chapter=chapter, verse=verse, text=text)
    return verses
def get_title(tree) -> str:
    """Extracts the title from an ElementTree."""
    selector = cssselect.CSSSelector("default|title", namespaces=NAMESPACES)
    headers = selector(tree)
    if len(headers) != 1:
        raise ValueError(f"unexpected number of titles: {headers}")
    return headers[0].text
def test_token_created_before_last_updated_password_cannot_be_used(self):
    self.data_api_client.get_user.return_value = self.user(
        123, "*****@*****.**", 1234, 'email', 'Name', is_token_valid=False)
    token = generate_token(self._user, self.app.config['SHARED_EMAIL_KEY'],
                           self.app.config['RESET_PASSWORD_TOKEN_NS'])
    url = '/user/reset-password/{}'.format(token)
    res = self.client.post(url, data={
        'password': '******',
        'confirm_password': '******'
    }, follow_redirects=True)
    assert res.status_code == 200
    document = html.fromstring(res.get_data(as_text=True))
    error_selector = cssselect.CSSSelector('div.dm-alert.dm-alert--error')
    error_elements = error_selector(document)
    assert len(error_elements) == 1
    assert reset_password.EXPIRED_PASSWORD_RESET_TOKEN_MESSAGE in error_elements[0].text_content()
    assert self.data_api_client.update_user_password.called is False
def set_custom_embed_code(self, data):
    """
    Return the code that embeds the content, either with the original size
    or the custom chosen one.
    """
    if 'embed_html' not in data:
        return
    tree = etree.HTML(data['embed_html'])
    sel = cssselect.CSSSelector('body > *')
    el = sel(tree)
    # Add a wrapping div if the embed code contains more than one element.
    if len(el) > 1:
        el = DIV(*el)
    else:
        el = el[0]
    # width and height attributes should not be set on a div tag.
    if el.tag in ['iframe', 'object']:
        if data.get('width', None):
            el.attrib['width'] = data['width'] and str(data['width']) or el.attrib['width']
        if data.get('height', None):
            el.attrib['height'] = data['height'] and str(data['height']) or el.attrib['height']
    data['embed_html'] = sanitize_iframe_tag(html.tostring(el))
def get_links(self):
    selector = cssselect.CSSSelector("a")
    return [
        # (stringify_children(l) or '', l.get("href"))
        (l.text or '', l.get("href"))
        for l in selector(self.lxml)
    ]
def extract_links(self, selector):
    """
    Method for performing the link extraction for the crawler implementation.

    As in the extract_content method, the cssselect library is used to translate \
    the CSS selector expression into an XPath expression.

    The selector passed as the argument is a selector to point to the anchor tags \
    that the crawler should pass through. A list of links is obtained, and the links \
    are iterated through. The relative paths are converted into absolute paths and \
    a ``CssSelector`` object is created with the URL of the next page as the argument \
    and this created object is yielded.

    The extract_links method basically generates ``CssSelector`` objects for all of \
    the links to be crawled through.

    :param selector: The selector for the anchor tags to be crawled through
    :return: A ``CssSelector`` object for every page to be crawled through
    """
    sel = cssselect.CSSSelector(selector)
    links = sel(self.tree)
    for link in links:
        next_url = urljoin(self.url, link.get('href'))
        yield CssSelector(next_url)
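# Minimal standalone sketch of the link-extraction idea described above, using
# lxml and cssselect directly (an illustrative assumption, not part of the
# original class; the HTML and base URL below are made up).
from urllib.parse import urljoin
from lxml import etree
from lxml import cssselect

_demo_html = '<html><body><a href="/page2">next</a><a href="/page3">more</a></body></html>'
_demo_tree = etree.HTML(_demo_html)
# Translate the CSS selector into XPath and select every anchor tag.
_demo_links = cssselect.CSSSelector("a")(_demo_tree)
# Resolve each relative href against the page URL, as extract_links does.
_demo_urls = [urljoin("http://example.com/page1", a.get("href")) for a in _demo_links]
# _demo_urls == ['http://example.com/page2', 'http://example.com/page3']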
def extract_next_links(self, url_data):
    """
    The url_data coming from the fetch_url method will be given as a parameter to
    this method. url_data contains the fetched url, the url content in binary
    format, and the size of the content in bytes.

    This method should return a list of urls in their absolute form (some links in
    the content are relative and need to be converted to the absolute form).
    Validation of links is done later via the is_valid method. It is not required
    to remove duplicates that have already been fetched; the frontier takes care
    of that.

    Suggested library: lxml
    """
    outputLinks = []
    root = html.fromstring(url_data['content'])
    # make_links_absolute rewrites the hrefs in place, so every extracted link
    # below is already in absolute form.
    root.make_links_absolute(url_data['url'], resolve_base_href=True)
    select = cssselect.CSSSelector("a")
    links = [element.get('href') for element in select(root)]
    for link in links:
        if link is not None:
            outputLinks.append(link)
    return outputLinks
def quick_register(self, element_key, element_selector):
    if not lxml_available:
        raise RuntimeError(
            "You can't use CSS selectors unless you install lxml. Installing it is "
            "pretty easy. Check our docs at http://www.pyccuracy.org to know more.")
    selector = cssselect.CSSSelector(element_selector)
    xpath = selector.path.replace("descendant-or-self::", "//")
    self.register_element(element_key, xpath)
def css_select(self, selector: str) -> list:
    """
    Shortcut to select elements based on CSS selector.
    """
    return self.xpath(
        cssselect.CSSSelector(selector, translator="html",
                              namespaces=se.XHTML_NAMESPACES).path)
def loopload(tag, webContent, link, max_iter=100):
    linklist = []
    select = cssselect.CSSSelector(tag)
    root = etree.HTML(webContent)
    result = select(root)
    link1 = re.search('(.+?.com)', link, re.S)
    host = link1.group(0)
    for k in result:
        linklist.append(host + k.attrib["href"])
    return linklist
def extract_columns(self, *args, **kwargs):
    """
    Column data extraction for extract_tabular
    """
    result_list = []
    result = kwargs.get('result', {})
    try:
        if type(kwargs.get('selector', '')) in [str, unicode]:
            selectors = [kwargs.get('selector', '')]
        elif type(kwargs.get('selector', '')) == list:
            selectors = kwargs.get('selector', '')
        else:
            raise Exception(
                "Use a list of selector expressions for the various columns")
        from itertools import izip, count
        pairs = izip(kwargs.get('table_headers', []), selectors)
        columns = {}
        for head, selector in pairs:
            sel = cssselect.CSSSelector(selector)
            columns[head] = sel(self.tree)
        try:
            for i in count(start=0):
                r = result.copy()
                for head in columns.keys():
                    if kwargs.get('verbosity', 0) > 1:
                        print("\nExtracting", head, "attribute", sep=' ', end='')
                    col = columns[head][i]
                    if kwargs.get('attr', 'text') == "text":
                        try:
                            content = kwargs.get('connector', '').join(
                                [make_ascii(x).strip() for x in col.itertext()])
                        except Exception:
                            content = kwargs.get('default', '')
                        content = content.replace("\n", " ").strip()
                    else:
                        content = col.get(kwargs.get('attr', 'text'))
                        if kwargs.get('attr', 'text') in ["href", "src"]:
                            content = urljoin(self.url, content)
                    r[head] = content
                result_list.append(r)
        except IndexError:
            pass
    except TypeError:
        raise Exception("Selector expression string to be provided. Got " + selector)
    return result_list
def parseCard(html_code, i):
    result = {}
    parser = etree.HTMLParser()
    html = etree.fromstring(html_code, parser)
    # card id
    result['id'] = i
    # chinese name
    select = cssselect.CSSSelector("table.table_out a span")
    if not select(html):
        # no data page
        return
    result['cname'] = select(html)[0].text.strip()
    # english name
    select = cssselect.CSSSelector("table.table_out a br")
    result['ename'] = select(html)[0].tail.strip()
    # image url
    select = cssselect.CSSSelector("#card_book_container img")
    result['img'] = select(html)[0].get('src')
    # others
    select = cssselect.CSSSelector(
        "div table div table.table_out div table tr td")
    foo = select(html)
    result['class'] = foo[7].text
    # result['source'] = foo[9].text
    result['level'] = foo[8].text
    result['type'] = foo[9].text
    result['race'] = foo[10].text
    result['mp'] = foo[11].getchildren()[0].text
    result['atk'] = foo[12].getchildren()[0].text
    result['hp'] = foo[13].getchildren()[0].text
    bar = foo[15].find('div')
    if bar is not None:
        result['eeffect'] = bar.text
        bar.clear()
    else:
        result['eeffect'] = ''
    result['ceffect'] = "".join(
        [t.strip() for t in foo[15].itertext() if t.strip()])
    result['desc'] = foo[17].text
    result['misc'] = "".join(
        [t.strip() for t in foo[len(foo) - 1].itertext() if t.strip()])
    return result
def css_selector(selector: str) -> cssselect.CSSSelector:
    """
    Create a CSS selector for the given selector string.
    Return a cached CSS selector if one already exists.
    """
    sel = CSS_SELECTOR_CACHE.get(selector)
    if not sel:
        sel = cssselect.CSSSelector(selector, translator="xhtml",
                                    namespaces=se.XHTML_NAMESPACES)
        CSS_SELECTOR_CACHE[selector] = sel
    return sel
def find_link(self, pattern):
    selector = cssselect.CSSSelector("a")
    links = [
        # (stringify_children(l) or '', l.get("href"))
        (l.text or '', l.get("href"))
        for l in selector(self.lxml)
    ]
    for link in links:
        if re.search(pattern, link[0]) or re.search(pattern, link[1]):
            return link[1]
    return ''
def match_selector(rule, tree):
    """Yield the ``(element, specificity)`` in ``tree`` matching ``rule``."""
    for selector in rule.selectorList:
        specificity = selector.specificity
        try:
            matcher = cssselect.CSSSelector(selector.selectorText)
        except cssselect.ExpressionError:
            # Unsupported selector
            # TODO: warn
            continue
        for element in matcher(tree):
            yield element, specificity
def extract_tabular(self, *args, **kwargs):
    """
    Method for performing the extraction of tabular data.

    As in the extract_content method, the cssselect library is used to translate \
    the CSS selector expression into an XPath expression.

    :param result: A dictionary containing the extracted data so far
    :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
    A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
    A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
    :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
    :param prefix: A prefix to be added to each header
    :param suffix: A suffix to be added to each header
    :param selector: For row extraction, this is a selector that gives the row to be extracted. \
    For column extraction, this is a list of selectors for each column.
    :param attr: The attribute to be extracted from the selected tag
    :param default: The default value to be used if the selector does not return any data
    :param verbosity: The verbosity set as the argument for scrapple run
    :return: A 2-tuple containing the list of all the column headers extracted and the list of \
    dictionaries which contain (header, content) pairs
    """
    result = kwargs.get('result', {})
    result_list = []
    if type(kwargs.get('header', [])) in [str, unicode]:
        try:
            sel = cssselect.CSSSelector(kwargs.get('header', []))
            header_list = sel(self.tree)
            table_headers = [
                kwargs.get('prefix', '') + h.text + kwargs.get('suffix', '')
                for h in header_list
            ]
            if len(table_headers) == 0:
                raise Exception("Invalid CSS selector " + kwargs.get('header', []))
        except TypeError:
            raise Exception(
                "Selector expression string to be provided. Got " +
                kwargs.get('header', []))
    else:
        table_headers = [
            kwargs.get('prefix', '') + h + kwargs.get('suffix', '')
            for h in kwargs.get('header', [])
        ]
    if kwargs.get('table_type', 'rows') not in ["rows", "columns"]:
        raise Exception("Specify 'rows' or 'columns' in table_type")
    kwargs.update({'table_headers': table_headers})
    if kwargs.get('table_type', 'rows') == "rows":
        result_list = self.extract_rows(**kwargs)
    else:
        result_list = self.extract_columns(**kwargs)
    return table_headers, result_list
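# Illustrative sketch of the "columns" table_type described above, reduced to
# plain lxml + cssselect (an assumption for demonstration, not the original
# Scrapple API; the table markup is made up). Headers come from one selector,
# each column from its own selector, and rows are zipped into header-value dicts.
from lxml import etree
from lxml import cssselect

_demo_table = etree.HTML("""
<table>
  <tr><th>name</th><th>price</th></tr>
  <tr><td class="name">apple</td><td class="price">1</td></tr>
  <tr><td class="name">pear</td><td class="price">2</td></tr>
</table>
""")
_headers = [th.text for th in cssselect.CSSSelector("th")(_demo_table)]
_columns = [
    [td.text for td in cssselect.CSSSelector("td." + h)(_demo_table)]
    for h in _headers
]
# One dict per table row, mapping each header to the cell in its column.
_rows = [dict(zip(_headers, cells)) for cells in zip(*_columns)]
# _rows == [{'name': 'apple', 'price': '1'}, {'name': 'pear', 'price': '2'}]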
def setTable(self):
    # some bug
    select_tables = cssselect.CSSSelector("table")
    symbol_count = 0
    # print(select_tables(self.code))
    for t in select_tables(self.code):
        cleaned = self.cleaner.clean_html(t)
        txt = sub("[\r\n\t ]", "", cleaned.text_content())
        # print(t)
        # print(txt)
        symbol_count += len(txt)
    if not self.features["size_of_text"] == 0:
        self.features["fraction_of_table"] = float(symbol_count) / self.features["size_of_text"]
def search(word):
    res = requests.get(queryurl(word))
    doc = lhtml.fromstring(res.text)
    w = {}
    # word
    el_word = cssselect.CSSSelector("#headword > h1 > strong")(doc)
    if len(el_word) != 1:
        return None
    w["word"] = el_word[0].text_content()
    # pronounces
    el_prons = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > div.hd_area > div.hd_tf_lh > div > div:nth-child(even) > a"
    )(doc)
    pronounces = {}
    if len(el_prons) > 0:
        if len(el_prons) == 2:
            prEng = el_prons[1].get("onmouseover")
            prEng = re.search(r"https?://.*\.mp3", prEng).group()
            pronounces["eng"] = prEng
        prUs = el_prons[0].get("onmouseover")
        prUs = re.search(r"https?://.*\.mp3", prUs).group()
        pronounces["us"] = prUs
    w["pronounces"] = pronounces
    # definitions
    el_defs = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > ul > li"
    )(doc)
    definitions = []
    for el in el_defs:
        pos = cssselect.CSSSelector(".pos")(el)[0].text_content()
        defi = cssselect.CSSSelector(".def")(el)[0].text_content()
        definitions.append({"pos": pos, "def": defi})
    w["definitions"] = definitions
    # variants
    el_varis_kind = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > div.hd_div1 > div > span"
    )(doc)
    el_varis_word = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > div.hd_div1 > div > a"
    )(doc)
    variants = []
    for (kind, word) in itertools.izip(el_varis_kind, el_varis_word):
        variants.append({
            "kind": kind.text_content(),
            "word": word.text_content()
        })
    w["variants"] = variants
    return w
def getUsers(self):
    ula = cssselect.CSSSelector('div.mw-spcontent > ul > li > a')
    list_links = ula(self.lxml_root)
    total_users = []
    for link in list_links:
        # The original condition here was redacted in the source; presumably it
        # skips links whose href does not point at a user page.
        if 'User:' not in link.attrib['href']:
            continue
        new_user = UserFromUserList(self.site, link.text)
        total_users.append(new_user)
        if link.get('class') == 'new':
            new_user.forceUserPage(False)
        else:
            new_user.forceUserPage(True)
    return total_users
def css_select(self, selector: str):
    """
    Shortcut to select elements based on CSS selector.
    """
    try:
        sel = CSS_SELECTOR_CACHE.get(selector)
        if not sel:
            sel = cssselect.CSSSelector(selector, translator="xhtml",
                                        namespaces=self.namespaces)
            CSS_SELECTOR_CACHE[selector] = sel
        return self.xpath(sel.path)
    except parser.SelectorSyntaxError as ex:
        raise se.InvalidCssException(f"Invalid selector: [css]{selector}[/]") from ex
def extract_content(self, *args, **kwargs):
    """
    Method for performing the content extraction for the given CSS selector.

    The cssselect library is used to handle CSS selector expressions. \
    XPath expressions have a higher speed of execution, so the given CSS selector \
    expression is translated into the corresponding XPath expression, by the \
    ``cssselect.CSSSelector`` class. This selector can be used to extract content \
    from the element tree corresponding to the fetched web page.

    If the selector is "url", the URL of the current web page is returned.

    Otherwise, the selector expression is used to extract content. The particular \
    attribute to be extracted ("text", "href", etc.) is specified in the method \
    arguments, and this is used to extract the required content. If the content \
    extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
    to convert the relative path into an absolute path.

    If the selector does not fetch any content, the default value is returned. \
    If no default value is specified, an exception is raised.

    :param selector: The CSS selector expression
    :param attr: The attribute to be extracted from the selected tag
    :param default: The default value to be used if the selector does not return any data
    :return: The extracted content
    """
    try:
        selector, attr, default, connector = [
            kwargs.get(x, '') for x in ['selector', 'attr', 'default', 'connector']
        ]
        if selector == "url":
            return self.url
        sel = cssselect.CSSSelector(selector)
        if attr == "text":
            tag = sel(self.tree)[0]
            content = connector.join(
                [make_ascii(x).strip() for x in tag.itertext()])
            content = content.replace("\n", " ").strip()
        else:
            content = sel(self.tree)[0].get(attr)
            if attr in ["href", "src"]:
                content = urljoin(self.url, content)
        return content
    except IndexError:
        if default != "":
            return default
        raise Exception("There is no content for the selector " + selector)
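# Minimal standalone sketch of the behaviour described above (an assumption for
# illustration, not the original class): translate the CSS selector, join the
# element's text, and absolutize link attributes. The markup and URL are made up.
from urllib.parse import urljoin
from lxml import etree
from lxml import cssselect

_demo_page = etree.HTML(
    '<p class="intro">Hello <b>world</b></p><a class="more" href="/about">about</a>')
# attr == "text": join all text nodes under the first matched element.
_text = " ".join(t.strip() for t in cssselect.CSSSelector("p.intro")(_demo_page)[0].itertext())
# attr == "href": read the attribute and resolve it against the page URL.
_href = urljoin("http://example.com/index.html",
                cssselect.CSSSelector("a.more")(_demo_page)[0].get("href"))
# _text == "Hello world", _href == "http://example.com/about"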
def selector_exists(parsed_code, selector, namespaces_dict, is_xhtml):
    """
    Converts the selector's text to XPath and searches for it in the xhtml file.

    Returns True if it finds a correspondence, or if the translation of the
    selector to XPath is not yet implemented by cssselect; False otherwise.
    """
    translator = 'xhtml' if is_xhtml else 'xml'
    try:
        if cssselect.CSSSelector(selector, translator=translator,
                                 namespaces=namespaces_dict)(parsed_code):
            return True
    except SelectorError:
        return True
    return False
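# Usage sketch (the markup and namespace map are illustrative, not from the
# original project): a selector that matches returns True, one that matches
# nothing returns False, and a selector cssselect cannot translate is treated
# as existing, per the docstring above.
from lxml import etree

_xhtml_doc = etree.fromstring(
    '<html xmlns="http://www.w3.org/1999/xhtml"><body><p class="note">hi</p></body></html>')
_xhtml_ns = {"xhtml": "http://www.w3.org/1999/xhtml"}

assert selector_exists(_xhtml_doc, "xhtml|p.note", _xhtml_ns, is_xhtml=True)
assert not selector_exists(_xhtml_doc, "xhtml|div.missing", _xhtml_ns, is_xhtml=True)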
def read_headers(tree) -> tuple[Optional[str], Optional[int]]:
    """Finds the book and chapter for the given document.

    Returns:
        book: Short name of the book (or None if not found).
        chapter: Chapter or section number (or None if not found).
    """
    title = get_title(tree)
    book = title.split("Chapter")[0].split("Section")[0].split("Psalm ")[0].strip()
    book_short = scripture_graph.BOOKS_SHORT[book]
    title_number = cssselect.CSSSelector(".titleNumber")(tree)
    if not title_number:
        return None, None  # Table of contents, etc.
    chapter = int(list(title_number[0].itertext())[0].split()[-1])
    return book_short, chapter
def __init__(self, seed_url_list, title=''):
    self.depth = 1
    self.url_num, self.domain_num = 0, 0
    self.file_no, self.file_cnt = 1, 0
    self.css_selector = cssselect.CSSSelector("a")
    self.seed_url_list = seed_url_list
    self.domain_map = {}  # {domain: domain_id}
    # {domain_id: {'domain': domain, 'robot': robot_parser}}
    self.domain_nodes = {}
    self.url_map = {}  # {url: url_id}
    # url_nodes: {url_id: {'domain_id': domain_id, 'url': url,
    #                      'out_links': [out_link_url], in_link: (url_id)}}
    self.url_nodes = {}
    self.initializeSeedURL(title)
def getDecks(url):
    print('\033[1;31m>> start get decks list\033[m')
    decks = []
    string = urllib.request.urlopen(url).read()
    parser = etree.HTMLParser()
    html = etree.fromstring(string, parser)
    select = cssselect.CSSSelector(r'#decks td')
    items = select(html)
    i = 0
    while i < len(items):
        deck = {}
        # Deck Name
        a = items[i].find(r'div/span/a')
        deck['name'] = a.text.strip()
        deck['url'] = 'http://www.hearthpwn.com' + a.get('href')
        arena = items[i].get('class').find('t-arena-cell') != -1
        i = i + 1
        # Deck Type
        deck['type'] = items[i].text.strip()
        i = i + 1
        # Mana
        i = i + 1
        # Class
        deck['class'] = items[i].text.strip()
        i = i + 1
        # Rating
        deck['rating'] = items[i].find(r'div').text.strip()
        i = i + 1
        # Views
        deck['views'] = items[i].text.strip()
        i = i + 1
        # Comments
        i = i + 1
        deck['comments'] = items[i].text.strip()
        # Cost
        deck['cost'] = items[i].text.strip()
        i = i + 1
        # Updated
        deck['updated'] = items[i].find(r'abbr').get('title')
        deck['patch'] = items[i].find(r'span').text.strip()
        i = i + 1
        # if arena deck pass
        if not arena:
            decks.append(deck)
    return decks
def process_macros(html_input):
    """
    A template tag that processes a "ductus-html5" string into viewable html5.
    For now, it only runs macros.
    """
    from lxml import etree, cssselect
    source = etree.HTML(html_input)
    macro_tags = cssselect.CSSSelector('div.ductus-macro')(source)
    for mt in macro_tags:
        macro_name = mt.get('data-macro-name')
        try:
            mt = _registered_html_macros[macro_name](mt, source)
        except KeyError:
            # Macros are simply <div> tags in the input; fail silently if we
            # don't know how to process them.
            pass
    return mark_safe(etree.tostring(source))
def filter(ElementTree, **kwargs):
    if "filter" not in kwargs or kwargs["filter"] is None:
        return
    selector = cssselect.CSSSelector(kwargs["filter"])
    for element in selector(ElementTree.getroot()):
        previous = element.getprevious()
        parent = element.getparent()
        # Preserve the element's tail text by attaching it to the previous
        # sibling's tail (or the parent's text) before removing the element.
        if element.tail is not None:
            if previous is not None:
                if previous.tail is not None:
                    previous.tail = previous.tail + element.tail
                else:
                    previous.tail = element.tail
            else:
                if parent.text is not None:
                    parent.text = parent.text + element.tail
                else:
                    parent.text = element.tail
        parent.remove(element)
def duanzi_scrapter(html_doc, page_num=1):
    html_after_cleaner = cleaner.clean_html(html_doc)
    # Strip <br> tags and newlines from the joke content.
    pattern = re.compile('<br>|\n')
    html_after_cleaner = re.sub(pattern, '', html_after_cleaner)
    document = etree.fromstring(html_after_cleaner, parser)
    print('Parsing jokes on page %s...' % str(page_num))
    try:
        sel = cssselect.CSSSelector('#content-left > div')
        for e in sel(document):
            try:
                # Get the joke info from the content anchor.
                a = e.find('.//a[@class="contentHerf"]')
                a_href = a.attrib['href']  # format: /article/105323928
                spans = e.findall('.//a[@class="contentHerf"]/div/span')
                if len(spans) > 1:  # "Read full text" link is present.
                    urls.add_new_url(a_href)  # Save the joke link for later.
                else:
                    duanzi_info = {}
                    duanzi_info['dz_url'] = 'https://www.qiushibaike.com' + a_href  # joke URL
                    duanzi_info['dzContent'] = spans[0].text  # joke content
                    # div stats
                    spans = e.findall('.//div[@class="stats"]/span')
                    for span in spans:
                        i = span.find('.//i')
                        if span.get('class') == 'stats-vote':
                            duanzi_info['vote_num'] = i.text  # number of votes
                        elif span.get('class') == 'stats-comments':
                            duanzi_info['comment_num'] = i.text  # number of comments
                    collect_data(duanzi_info)
            except Exception as err:
                print('Error extracting a joke, moving on to the next one')
                continue
        print('Finished parsing page %s' % str(page_num))
        next_page(page_num + 1)  # Go to the next page.
    except TimeoutException as err:
        print('Error parsing the page:', err.args)
        return next_page(page_num + 1)  # Exception caught; go straight to the next page.