Code example #1
File: tools.py Project: rubensan112/Scrapper
def extract(tag, regex, type, link, webContent):
    # Extract the text of the first element matching the CSS selector `tag`
    # (optionally narrowed by `regex`), or build an absolute link from its href.
    if type == "text":
        try:
            select = cssselect.CSSSelector(tag)
            root = etree.HTML(webContent)
            result = select(root)
            result2 = result[0].text
            if regex == "":
                match2 = result2
                return match2
            else:
                try:
                    match = re.search(regex, result2,
                                      re.S)  # important: enable the DOTALL (s) flag
                    try:
                        match2 = match.group(0)
                        return match2
                    except AttributeError:  # re.search returned None (no match)
                        print("No groups found")
                except re.error:  # invalid regular expression
                    print("No regex entry")

        except Exception:  # selector failed or matched nothing
            print('No tags found')
    if type == "attribute":
        select = cssselect.CSSSelector(tag)
        root = etree.HTML(webContent)
        result = select(root)
        link1 = re.search(r'(.+?\.com)', link, re.S)  # escape the dot so only ".com" matches
        host = link1.group(0)
        result2 = (host + result[0].attrib["href"])
        return result2
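A minimal sketch of how the extract helper above might be invoked; the markup, URL, and imports here are assumptions for illustration (lxml.cssselect, lxml.etree and re are what the function body relies on):

from lxml import cssselect, etree
import re

webContent = "<html><body><h1>Price: 42 EUR</h1></body></html>"
# type="text": take the first <h1>'s text, then keep only the digits via the regex.
print(extract("h1", r"\d+", "text", "http://example.com/page", webContent))  # -> 42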
Code example #2
File: services.py Project: rgcarrasqueira/vejasoh
def split_text_image_and_links(description):

    splitted_content = []

    document = html.document_fromstring(description)
    raw_text = document.text_content()

    splitted_content.append({'type': 'text', 'content': raw_text})

    document = html.fromstring(description)
    select = cssselect.CSSSelector("img")
    images = [el.get('src') for el in select(document)]

    if len(images) == 1:
        images = images[0]

    splitted_content.append({'type': 'image', 'content': images})

    document = html.fromstring(description)
    select = cssselect.CSSSelector("a")
    links = [el.get('href') for el in select(document)]

    splitted_content.append({'type': 'links', 'content': links})

    return splitted_content
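A hypothetical call showing the shape of the returned list (the markup is invented; note the single image is unwrapped from its list by the len(images) == 1 branch):

parts = split_text_image_and_links(
    '<p>Hello <img src="pic.png"/> <a href="/more">more</a></p>')
# -> [{'type': 'text', 'content': 'Hello  more'},
#     {'type': 'image', 'content': 'pic.png'},
#     {'type': 'links', 'content': ['/more']}]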
Code example #3
File: graph_lib.py Project: skearnes/scripture-graph
def read_verses(tree, book: str, chapter: int) -> dict[str, Verse]:
    """Finds `Verse`s in the current document.

    Args:
        tree: ElementTree.
        book: Short name of the book.
        chapter: Chapter or section number.

    Returns:
        Dict of `Verse`s keyed by the reference form (e.g. "1 Ne. 3:7").
    """
    verses = {}
    for verse_element in cssselect.CSSSelector(".verse-first,.verse")(tree):
        verse = None
        for element in verse_element.iter():
            if element.get("class") == "verseNumber":
                verse = int(list(element.itertext())[0])
            # Remove verse numbers and reference markers.
            if element.get("class") in ["verseNumber", "marker"]:
                element.clear(keep_tail=True)
        text = "".join(verse_element.itertext())
        if not verse:
            if text.startswith(("After prayer", )):
                continue  # D&C 102:34.
            raise ValueError(
                f"could not find verse number for {book} {chapter}: {text}")
        key = f"{book} {chapter}:{verse}"
        verses[key] = Verse(book=book, chapter=chapter, verse=verse, text=text)
    return verses
Code example #4
File: graph_lib.py Project: skearnes/scripture-graph
def get_title(tree) -> str:
    """Extracts the title from an ElementTree."""
    selector = cssselect.CSSSelector("default|title", namespaces=NAMESPACES)
    headers = selector(tree)
    if len(headers) != 1:
        raise ValueError(f"unexpected number of titles: {headers}")
    return headers[0].text
Code example #5
    def test_token_created_before_last_updated_password_cannot_be_used(self):
        self.data_api_client.get_user.return_value = self.user(
            123,
            "*****@*****.**",
            1234,
            'email',
            'Name',
            is_token_valid=False)
        token = generate_token(self._user, self.app.config['SHARED_EMAIL_KEY'],
                               self.app.config['RESET_PASSWORD_TOKEN_NS'])
        url = '/user/reset-password/{}'.format(token)

        res = self.client.post(url,
                               data={
                                   'password': '******',
                                   'confirm_password': '******'
                               },
                               follow_redirects=True)

        assert res.status_code == 200
        document = html.fromstring(res.get_data(as_text=True))
        error_selector = cssselect.CSSSelector('div.dm-alert.dm-alert--error')
        error_elements = error_selector(document)
        assert len(error_elements) == 1
        assert (reset_password.EXPIRED_PASSWORD_RESET_TOKEN_MESSAGE
                in error_elements[0].text_content())
        assert self.data_api_client.update_user_password.called is False
Code example #6
    def set_custom_embed_code(self, data):
        """ Return the markup that embeds the content, either at the
            original size or at the custom size chosen.
        """
        if 'embed_html' not in data:
            return
        tree = etree.HTML(data['embed_html'])
        sel = cssselect.CSSSelector('body > *')
        el = sel(tree)
        # wrap the elements in a div if the embed code contains more than one
        # (DIV presumably comes from lxml.html.builder)
        if len(el) > 1:
            el = DIV(*el)
        else:
            el = el[0]

        # width and height attributes should not be set in a div tag
        if el.tag in ['iframe', 'object']:
            if data.get('width', None):
                el.attrib['width'] = str(data['width'])
            if data.get('height', None):
                el.attrib['height'] = str(data['height'])

        data['embed_html'] = sanitize_iframe_tag(html.tostring(el))
Code example #7
File: utils.py Project: mvdbeek/twill3
    def get_links(self):
        selector = cssselect.CSSSelector("a")
        return [
            # (stringify_children(l) or '', l.get("href"))
            (l.text or '', l.get("href"))
            for l in selector(self.lxml)
        ]
Code example #8
File: css.py Project: twocngdagz/scrapple
    def extract_links(self, selector):
        """
        Method for performing the link extraction for the crawler implementation.

        As in the extract_content method, the cssselect library is used to translate
        the CSS selector expression into an XPath expression.

        The selector passed as the argument is a selector to point to the anchor tags
        that the crawler should pass through. A list of links is obtained, and the links
        are iterated through. The relative paths are converted into absolute paths and
        a ``CssSelector`` object is created with the URL of the next page as the argument
        and this created object is yielded.

        The extract_links method basically generates ``CssSelector`` objects for all of
        the links to be crawled through.

        :param selector: The selector for the anchor tags to be crawled through
        :return: A ``CssSelector`` object for every page to be crawled through

        """
        sel = cssselect.CSSSelector(selector)
        links = sel(self.tree)
        for link in links:
            next_url = urljoin(self.url, link.get('href'))
            yield CssSelector(next_url)
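The CSS-to-XPath translation the docstring refers to can be inspected directly on the compiled selector object; a small sketch (the XPath output is shown approximately):

from lxml.cssselect import CSSSelector

sel = CSSSelector("a.next")
print(sel.css)   # a.next
print(sel.path)  # roughly: descendant-or-self::a[@class and contains(
                 #   concat(' ', normalize-space(@class), ' '), ' next ')]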
Code example #9
    def extract_next_links(self, url_data):
        """
        The url_data coming from the fetch_url method will be given as a parameter to this method. url_data contains the
        fetched url, the url content in binary format, and the size of the content in bytes. This method should return a
        list of urls in their absolute form (some links in the content are relative and needs to be converted to the
        absolute form). Validation of links is done later via is_valid method. It is not required to remove duplicates
        that have already been fetched. The frontier takes care of that.

        Suggested library: lxml
        """
        outputLinks = []
        root = html.fromstring(url_data['content'])
        # make_links_absolute mutates the tree in place and returns None,
        # so its return value is not worth binding to a name
        root.make_links_absolute(url_data['url'], resolve_base_href=True)

        parsed_uri = urlparse(url_data['url'])
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)

        # the ElementTree round-trip is unnecessary; the selector can run on root directly
        select = cssselect.CSSSelector("a")
        links = [element.get('href') for element in select(root)]
        #currentPath = url_data['url'].strip('/')
        for link in links:
            if link is not None:
                outputLinks.append(link)
        return outputLinks
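For reference, make_links_absolute works on the parsed tree in place, which is why the snippet above should not rely on its return value; a minimal sketch with invented markup:

from lxml import html

root = html.fromstring('<p><a href="/about">about</a></p>')
root.make_links_absolute("http://example.com/index.html")  # in place; returns None
print(root.find(".//a").get("href"))  # http://example.com/about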
Code example #10
File: page.py Project: sardor9/pyccuracy
    def quick_register(self, element_key, element_selector):
        if not lxml_available:
            raise RuntimeError(
                "You can't use CSS selectors unless you install lxml. Installing it is pretty easy. Check our docs at http://www.pyccuracy.org to know more."
            )
        selector = cssselect.CSSSelector(element_selector)
        xpath = selector.path.replace("descendant-or-self::", "//")
        self.register_element(element_key, xpath)
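The string replace on selector.path rewrites lxml's verbose axis into the // shorthand; a sketch of the transformation (output shown approximately):

from lxml.cssselect import CSSSelector

selector = CSSSelector("input.error")
print(selector.path.replace("descendant-or-self::", "//"))
# roughly: //input[@class and contains(concat(' ', normalize-space(@class), ' '), ' error ')]

Note that this purely textual rewrite is only safe for simple selectors: a descendant combinator such as "ul li" also produces descendant-or-self:: in the middle of the expression, where the substitution yields invalid XPath.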
Code example #11
    def css_select(self, selector: str) -> list:
        """
        Shortcut to select elements based on CSS selector.
        """

        return self.xpath(
            cssselect.CSSSelector(selector,
                                  translator="html",
                                  namespaces=se.XHTML_NAMESPACES).path)
Code example #12
File: tools2.py Project: rubensan112/Scrapper
def loopload(tag, webContent, link, max_iter=100):
    linklist = []
    select = cssselect.CSSSelector(tag)
    root = etree.HTML(webContent)
    result = select(root)
    link1 = re.search(r'(.+?\.com)', link, re.S)  # escape the dot so only ".com" matches
    host = link1.group(0)
    for k in result[:max_iter]:  # honor the max_iter cap, which the original never used
        linklist.append(host + k.attrib["href"])
    return linklist
Code example #13
    def extract_columns(self, *args, **kwargs):
        """
        Column data extraction for extract_tabular
        """
        result_list = []
        result = kwargs.get('result', {})

        try:
            if isinstance(kwargs.get('selector', ''), str):  # unicode merged into str in Python 3
                selectors = [kwargs.get('selector', '')]
            elif isinstance(kwargs.get('selector', ''), list):
                selectors = kwargs.get('selector', '')
            else:
                raise Exception(
                    "Use a list of selector expressions for the various columns"
                )
            from itertools import count  # izip is Python 2 only; the built-in zip is lazy in Python 3
            pairs = zip(kwargs.get('table_headers', []), selectors)
            columns = {}
            for head, selector in pairs:
                sel = cssselect.CSSSelector(selector)
                columns[head] = sel(self.tree)
            try:
                for i in count(start=0):
                    r = result.copy()
                    for head in columns.keys():
                        if kwargs.get('verbosity', 0) > 1:
                            print("\nExtracting",
                                  head,
                                  "attribute",
                                  sep=' ',
                                  end='')
                        col = columns[head][i]
                        if kwargs.get('attr', 'text') == "text":
                            try:
                                content = kwargs.get('connector', '').join([
                                    make_ascii(x).strip()
                                    for x in col.itertext()
                                ])
                            except Exception:
                                content = kwargs.get('default', '')
                            content = content.replace("\n", " ").strip()
                        else:
                            content = col.get(kwargs.get('attr', 'text'))
                            if kwargs.get('attr', 'text') in ["href", "src"]:
                                content = urljoin(self.url, content)
                        r[head] = content
                    result_list.append(r)
            except IndexError:
                pass
        except TypeError:
            raise Exception("Selector expression string to be provided. Got " +
                            selector)

        return result_list
Code example #14
def parseCard(html_code, i):
    result = {}
    parser = etree.HTMLParser()
    html = etree.fromstring(html_code, parser)
    # card id
    result['id'] = i
    # chinese name
    select = cssselect.CSSSelector("table.table_out a span")
    if not select(html):  # no data page
        return
    result['cname'] = select(html)[0].text.strip()
    # english name
    select = cssselect.CSSSelector("table.table_out a br")
    result['ename'] = select(html)[0].tail.strip()
    # image url
    select = cssselect.CSSSelector("#card_book_container img")
    result['img'] = select(html)[0].get('src')
    # others
    select = cssselect.CSSSelector(
        "div table div table.table_out div table tr td")
    foo = select(html)

    result['class'] = foo[7].text
    #result['source'] = foo[9].text
    result['level'] = foo[8].text
    result['type'] = foo[9].text
    result['race'] = foo[10].text
    result['mp'] = foo[11].getchildren()[0].text
    result['atk'] = foo[12].getchildren()[0].text
    result['hp'] = foo[13].getchildren()[0].text
    bar = foo[15].find('div')
    if bar is not None:
        result['eeffect'] = bar.text
        bar.clear()
    else:
        result['eeffect'] = ''
    result['ceffect'] = "".join(
        [t.strip() for t in foo[15].itertext() if t.strip()])
    result['desc'] = foo[17].text
    result['misc'] = "".join(
        [t.strip() for t in foo[len(foo) - 1].itertext() if t.strip()])
    return result
Code example #15
File: easy_xml.py Project: ksmaheshkumar/tools-3
def css_selector(selector: str) -> cssselect.CSSSelector:
	"""
	Create a CSS selector for the given selector string. Return a cached CSS selector if
	one already exists.
	"""

	sel = CSS_SELECTOR_CACHE.get(selector)
	if not sel:
		sel = cssselect.CSSSelector(selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES)
		CSS_SELECTOR_CACHE[selector] = sel
	return sel
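A standalone sketch of the same memoization pattern without the project-specific translator and namespaces (the cache dict and helper name here are invented):

from lxml.cssselect import CSSSelector

_CACHE: dict[str, CSSSelector] = {}

def cached_selector(selector: str) -> CSSSelector:
    # Translate the CSS expression to XPath once, then reuse the compiled object.
    sel = _CACHE.get(selector)
    if sel is None:
        sel = CSSSelector(selector)
        _CACHE[selector] = sel
    return sel

assert cached_selector("p.quote") is cached_selector("p.quote")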
Code example #16
File: utils.py Project: fabioperrella/fedora
    def find_link(self, pattern):
        selector = cssselect.CSSSelector("a")

        links = [
            # (stringify_children(l) or '', l.get("href"))
            (l.text or '', l.get("href")) for l in selector(self.lxml)
        ]
        for link in links:
            if re.search(pattern, link[0]) or re.search(pattern, link[1] or ''):
                return link[1]
        return ''
Code example #17
File: css.py Project: egradman/tweetcutter
def match_selector(rule, tree):
    """Yield the ``(element, specificity)`` in ``tree`` matching ``rule``."""
    for selector in rule.selectorList:
        specificity = selector.specificity
        try:
            matcher = cssselect.CSSSelector(selector.selectorText)
        except cssselect.ExpressionError:
            # Unsupported selector
            # TODO: warn
            continue
        for element in matcher(tree):
            yield element, specificity
Code example #18
    def extract_tabular(self, *args, **kwargs):
        """
        Method for performing the extraction of tabular data.

        As in the extract_content method, the cssselect library is used to translate
        the CSS selector expression into an XPath expression.

        :param result: A dictionary containing the extracted data so far
        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted.
            A row extraction is when there is a single row to be extracted and mapped to a set of headers.
            A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
        :param prefix: A prefix to be added to each header
        :param suffix: A suffix to be added to each header
        :param selector: For row extraction, this is a selector that gives the row to be extracted.
            For column extraction, this is a list of selectors for each column.
        :param attr: The attribute to be extracted from the selected tag
        :param default: The default value to be used if the selector does not return any data
        :param verbosity: The verbosity set as the argument for scrapple run
        :return: A 2-tuple containing the list of all the column headers extracted and the list of
            dictionaries which contain (header, content) pairs
        """
        result = kwargs.get('result', {})
        result_list = []
        if isinstance(kwargs.get('header', []), str):  # unicode merged into str in Python 3
            try:
                sel = cssselect.CSSSelector(kwargs.get('header', []))
                header_list = sel(self.tree)
                table_headers = [
                    kwargs.get('prefix', '') + h.text +
                    kwargs.get('suffix', '') for h in header_list
                ]
                if len(table_headers) == 0:
                    raise Exception("Invalid CSS selector " +
                                    kwargs.get('header', []))
            except TypeError:
                raise Exception(
                    "Selector expression string to be provided. Got " +
                    kwargs.get('header', []))
        else:
            table_headers = [
                kwargs.get('prefix', '') + h + kwargs.get('suffix', '')
                for h in kwargs.get('header', [])
            ]
        if kwargs.get('table_type', 'rows') not in ["rows", "columns"]:
            raise Exception("Specify 'rows' or 'columns' in table_type")
        kwargs.update({'table_headers': table_headers})
        if kwargs.get('table_type', 'rows') == "rows":
            result_list = self.extract_rows(**kwargs)
        else:
            result_list = self.extract_columns(**kwargs)
        return table_headers, result_list
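A hypothetical call illustrating the kwargs shape this method expects (page stands in for an object with a parsed tree; all values are invented):

headers, rows = page.extract_tabular(
    table_type="columns",
    header=["name", "price"],          # or a CSS selector string for the header cells
    selector=["td.name", "td.price"],  # one selector per column
    attr="text",
    default="",
    verbosity=0,
)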
Code example #19
    def setTable(self):
        # some bug
        select_tables = cssselect.CSSSelector("table")
        symbol_count = 0
        # print(select_tables(self.code))
        for t in select_tables(self.code):
            cleaned = self.cleaner.clean_html(t)
            txt = sub("[\r\n\t ]", "", cleaned.text_content())
            # print(t)
            # print(txt)
            symbol_count += len(txt)
        if self.features["size_of_text"] != 0:
            self.features["fraction_of_table"] = float(symbol_count) / self.features["size_of_text"]
Code example #20
def search(word):
    res = requests.get(queryurl(word))
    doc = lhtml.fromstring(res.text)
    w = {}

    # word
    el_word = cssselect.CSSSelector("#headword > h1 > strong")(doc)
    if len(el_word) != 1:
        return None
    w["word"] = el_word[0].text_content()

    # pronounces
    el_prons = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > div.hd_area > div.hd_tf_lh > div > div:nth-child(even) > a"
    )(doc)
    pronounces = {}
    if len(el_prons) > 0:
        if len(el_prons) == 2:
            prEng = el_prons[1].get("onmouseover")
            prEng = re.search(r"https?://.*\.mp3", prEng).group()
            pronounces["eng"] = prEng
        prUs = el_prons[0].get("onmouseover")
        prUs = re.search(r"https?://.*\.mp3", prUs).group()
        pronounces["us"] = prUs
    w["pronounces"] = pronounces

    # definitions
    el_defs = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > ul > li"
    )(doc)
    definitions = []
    for el in el_defs:
        pos = cssselect.CSSSelector(".pos")(el)[0].text_content()
        defi = cssselect.CSSSelector(".def")(el)[0].text_content()
        definitions.append({"pos": pos, "def": defi})
    w["definitions"] = definitions

    # variants
    el_varis_kind = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > div.hd_div1 > div > span"
    )(doc)
    el_varis_word = cssselect.CSSSelector(
        "body > div.contentPadding > div > div > div.lf_area > div.qdef > div.hd_div1 > div > a"
    )(doc)
    variants = []
    for (kind, word) in zip(el_varis_kind, el_varis_word):  # itertools.izip is Python 2 only
        variants.append({
            "kind": kind.text_content(),
            "word": word.text_content()
        })
    w["variants"] = variants

    return w
Code example #21
    def getUsers(self):
        ula = cssselect.CSSSelector('div.mw-spcontent > ul > li > a')
        list_links = ula(self.lxml_root)
        total_users = []
        for link in list_links:
            # Reconstructed: the original condition was masked by the source
            # site; it most plausibly skipped links that are not user pages.
            if 'User:' not in link.attrib['href']:
                continue
            new_user = UserFromUserList(self.site, link.text)
            total_users.append(new_user)
            if link.get('class') == 'new':
                new_user.forceUserPage(False)
            else:
                new_user.forceUserPage(True)
        return total_users
Code example #22
	def css_select(self, selector: str):
		"""
		Shortcut to select elements based on CSS selector.
		"""

		try:
			sel = CSS_SELECTOR_CACHE.get(selector)
			if not sel:
				sel = cssselect.CSSSelector(selector, translator="xhtml", namespaces=self.namespaces)
				CSS_SELECTOR_CACHE[selector] = sel

			return self.xpath(sel.path)
		except parser.SelectorSyntaxError as ex:
			raise se.InvalidCssException(f"Invalid selector: [css]{selector}[/]") from ex
Code example #23
    def extract_content(self, *args, **kwargs):
        """
        Method for performing the content extraction for the given CSS selector.

        The cssselect library is used to handle CSS selector expressions.
        XPath expressions have a higher speed of execution, so the given CSS selector
        expression is translated into the corresponding XPath expression, by the
        ``cssselect.CSSSelector`` class. This selector can be used to extract content
        from the element tree corresponding to the fetched web page.

        If the selector is "url", the URL of the current web page is returned.
        Otherwise, the selector expression is used to extract content. The particular
        attribute to be extracted ("text", "href", etc.) is specified in the method
        arguments, and this is used to extract the required content. If the content
        extracted is a link (from an attr value of "href" or "src"), the URL is parsed
        to convert the relative path into an absolute path.

        If the selector does not fetch any content, the default value is returned.
        If no default value is specified, an exception is raised.

        :param selector: The CSS selector expression
        :param attr: The attribute to be extracted from the selected tag
        :param default: The default value to be used if the selector does not return any data
        :return: The extracted content

        """
        try:
            selector, attr, default, connector = [
                kwargs.get(x, '')
                for x in ['selector', 'attr', 'default', 'connector']
            ]
            if selector == "url":
                return self.url
            sel = cssselect.CSSSelector(selector)
            if attr == "text":
                tag = sel(self.tree)[0]
                content = connector.join(
                    [make_ascii(x).strip() for x in tag.itertext()])
                content = content.replace("\n", " ").strip()
            else:
                content = sel(self.tree)[0].get(attr)
                if attr in ["href", "src"]:
                    content = urljoin(self.url, content)
            return content
        except IndexError:
            if default != "":  # "is not" compares identity, not string equality
                return default
            raise Exception("There is no content for the selector " + selector)
Code example #24
def selector_exists(parsed_code, selector, namespaces_dict, is_xhtml):
    """
    Converts the selector's text to XPath and searches the XHTML file.
    Returns True if it finds a match, or if the translation of the
    selector to XPath is not yet implemented by cssselect; False otherwise.
    """

    translator = 'xhtml' if is_xhtml else 'xml'
    try:
        if cssselect.CSSSelector(selector,
                                 translator=translator,
                                 namespaces=namespaces_dict)(parsed_code):
            return True
    except SelectorError:
        return True
    return False
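A minimal sketch of calling selector_exists, assuming the function above is in scope and cssselect here is lxml.cssselect (markup and prefix invented):

from lxml import etree

doc = etree.fromstring(
    '<html xmlns="http://www.w3.org/1999/xhtml"><body><p class="x"/></body></html>')
ns = {"h": "http://www.w3.org/1999/xhtml"}
print(selector_exists(doc, "h|p.x", ns, is_xhtml=True))  # True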
Code example #25
File: graph_lib.py Project: skearnes/scripture-graph
def read_headers(tree) -> tuple[Optional[str], Optional[int]]:
    """Finds the book and chapter for the given document.

    Returns:
        book: Short name of the book (or None if not found).
        chapter: Chapter or section number (or None if not found).
    """
    title = get_title(tree)
    book = title.split("Chapter")[0].split("Section")[0].split(
        "Psalm ")[0].strip()
    book_short = scripture_graph.BOOKS_SHORT[book]
    title_number = cssselect.CSSSelector(".titleNumber")(tree)
    if not title_number:
        return None, None  # Table of contents, etc.
    chapter = int(list(title_number[0].itertext())[0].split()[-1])
    return book_short, chapter
Code example #26
File: crawler.py Project: wenty2015/web-crawler
    def __init__(self, seed_url_list, title=''):
        self.depth = 1
        self.url_num, self.domain_num = 0, 0
        self.file_no, self.file_cnt = 1, 0

        self.css_selector = cssselect.CSSSelector("a")

        self.seed_url_list = seed_url_list
        self.domain_map = {}  # {domain: domain_id}
        # {domain_id: {'domain':domain, 'robot':robot_parser}}
        self.domain_nodes = {}
        self.url_map = {}  # {url: url_id}
        ''' url_nodes: {url_id: {'domain_id': domain_id, 'url': url,
                                'out_links': [out_link_url], in_link: (url_id)}}'''
        self.url_nodes = {}
        self.initializeSeedURL(title)
Code example #27
File: 4-hearthpwn.py Project: summerwxy/hsc
def getDecks(url):
    print('\033[1;31m>> start get decks list\033[m')
    decks = []
    string = urllib.request.urlopen(url).read()
    parser = etree.HTMLParser()
    html = etree.fromstring(string, parser)
    select = cssselect.CSSSelector(r'#decks td')
    items = select(html)
    i = 0
    while i < len(items):
        deck = {}
        # Deck Name
        a = items[i].find(r'div/span/a')
        deck['name'] = a.text.strip()
        deck['url'] = 'http://www.hearthpwn.com' + a.get('href')
        arena = 't-arena-cell' in (items[i].get('class') or '')
        i = i + 1
        # Deck Type
        deck['type'] = items[i].text.strip()
        i = i + 1
        # Mana
        i = i + 1
        # Class
        deck['class'] = items[i].text.strip()
        i = i + 1
        # Rating
        deck['rating'] = items[i].find(r'div').text.strip()
        i = i + 1
        # Views
        deck['views'] = items[i].text.strip()
        i = i + 1
        # Comments
        i = i + 1
        deck['comments'] = items[i].text.strip()
        # Cost (note: this reads the same items[i] cell as 'comments'; the index is not advanced in between)
        deck['cost'] = items[i].text.strip()
        i = i + 1
        # Updated
        deck['updated'] = items[i].find(r'abbr').get('title')
        deck['patch'] = items[i].find(r'span').text.strip()
        i = i + 1
        # skip arena decks
        if not arena:
            decks.append(deck)
    return decks
Code example #28
File: textwiki.py Project: jennspics/ductus
def process_macros(html_input):
    """
    A template tag that processes a "ductus-html5" string into viewable html5.
    For now, it only runs macros.
    """

    from lxml import etree, cssselect
    source = etree.HTML(html_input)
    macro_tags = cssselect.CSSSelector('div.ductus-macro')(source)
    for mt in macro_tags:
        macro_name = mt.get('data-macro-name')
        try:
            mt = _registered_html_macros[macro_name](mt, source)
        except KeyError:
            pass  # macros are simply <div> tags in the input, fail silently if we don't know how to process them

    return mark_safe(etree.tostring(source))
Code example #29
File: filter.py Project: gsnedders/anolis
def filter(ElementTree, **kwargs):
    if "filter" not in kwargs or kwargs["filter"] is None:
        return
    selector = cssselect.CSSSelector(kwargs["filter"])
    for element in selector(ElementTree.getroot()):
        previous = element.getprevious()
        parent = element.getparent()
        # preserve the removed element's tail text by splicing it into the
        # previous sibling's tail or the parent's text
        if element.tail is not None:
            if previous is not None:
                if previous.tail is not None:
                    previous.tail = previous.tail + element.tail
                else:
                    previous.tail = element.tail
            else:
                if parent.text is not None:
                    parent.text = parent.text + element.tail
                else:
                    parent.text = element.tail
        parent.remove(element)
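The tail bookkeeping above matters because lxml stores the text that follows an element in its .tail, which would otherwise disappear with the removed element; a minimal sketch with invented markup:

from lxml import etree

root = etree.fromstring("<p>keep <b>drop</b> this tail</p>")
b = root.find("b")
# Naive root.remove(b) would also drop " this tail"; splice it into root.text first.
root.text = (root.text or "") + (b.tail or "")
root.remove(b)
print(etree.tostring(root))  # b'<p>keep  this tail</p>'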
Code example #30
File: qiushibaike.py Project: yiliqsmy/pythonsrc
def duanzi_scrapter(html_doc, page_num=1):
    html_after_cleaner = cleaner.clean_html(html_doc)
    # remove <br> tags from the joke content
    pattern = re.compile('<br>|\n')
    html_after_cleaner = re.sub(pattern, '', html_after_cleaner)
    document = etree.fromstring(html_after_cleaner, parser)
    print('Parsing jokes on page %s...' % str(page_num))
    try:
        sel = cssselect.CSSSelector('#content-left > div')
        for e in sel(document):

            try:
                # get the joke info from the content anchor
                a = e.find('.//a[@class="contentHerf"]')
                a_href = a.attrib['href']  # format: /article/105323928
                spans = e.findall('.//a[@class="contentHerf"]/div/span')
                if len(spans) > 1:  # a "read the full text" link is present
                    urls.add_new_url(a_href)  # save the joke link for later
                else:
                    duanzi_info = {}
                    duanzi_info[
                        'dz_url'] = 'https://www.qiushibaike.com' + a_href  # joke URL
                    duanzi_info['dzContent'] = spans[0].text  # joke content

                    # div stats
                    spans = e.findall('.//div[@class="stats"]/span')
                    for span in spans:
                        i = span.find('.//i')
                        if span.get('class') == 'stats-vote':
                            duanzi_info['vote_num'] = i.text  # vote count
                        elif span.get('class') == 'stats-comments':  # comment count
                            duanzi_info['comment_num'] = i.text
                    collect_data(duanzi_info)

            except Exception as err:
                print('Error extracting a joke, moving on to the next one')
                continue
        print('Finished parsing jokes on page %s' % str(page_num))
        next_page(page_num + 1)  # move on to the next page
    except TimeoutException as err:
        print('Error parsing the page:', err.args)
        return next_page(page_num + 1)  # on error, go straight to the next page