def html(self, value=no_default, **kwargs):
    """This cannot be wrapped and needs (almost) full override. """
    if value is no_default:
        html = super(MyQuery, self).html(value, **kwargs)
        return self.strip_namespaces(html)
    else:
        if isinstance(value, self.__class__):
            new_html = unicode(value)
        elif isinstance(value, basestring):
            new_html = value
        elif not value:
            new_html = ''
        else:
            raise ValueError(type(value))

        for tag in self:
            for child in tag.getchildren():
                tag.remove(child)
            root = fromstring(self._wrap_root(new_html), self.parser)[0]
            children = root.getchildren()
            if children:
                tag.extend(children)
            tag.text = root.text
            tag.tail = root.tail
        return self
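# Hypothetical usage sketch for the MyQuery.html() override above; assumes
# MyQuery is a PyQuery subclass providing strip_namespaces() and _wrap_root(),
# so the getter returns markup with namespace prefixes removed and the setter
# re-parses the new fragment with the same XML parser.
doc = MyQuery('<item xmlns="urn:example"><name>old</name></item>')
print(doc.html())             # get: serialized children, namespaces stripped
doc.html('<name>new</name>')  # set: children of <item> replaced in place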
def parse_credit_assign(html):
    q = pq(fromstring(html))
    ttl_amt = 0
    ls_credits = []
    for each in q("div.invest-item"):
        each = pq(each)
        amt0 = float(each("p.project-info span.decimal")[0].text.replace(',', ''))
        amt1 = float(each("p.project-info span.decimal")[1].text.replace(',', ''))
        amt0 = amt1 - amt0
        ttl_amt += amt1
        logger.debug("credit: %.2f,%.2f" % (amt0, amt1))
        ls_credits.append({
            "url": urlparse.urljoin(JIMU_BASE_URL, each.parent().attr.href),
            "amount": amt1,
            "bal": amt0,
            "interest_rate": float(
                each("div.invest-item-feature span.invest-item-profit")[0].text),
            "remain_days": int(
                each("div.invest-item-feature span.invest-item-profit")[1]
                .text.replace(',', '')),
            "title": each("div.invest-item-subtitle").text()
        })
    if len(ls_credits) > 0:
        logger.info("total credit#: %d, amount:%.2f" % (len(ls_credits), ttl_amt))
    return ls_credits
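# Hypothetical usage sketch; assumes JIMU_BASE_URL and logger are configured
# elsewhere in the module and `html` holds the fetched credit-assignment
# listing page that contains the "div.invest-item" markup parsed above.
for credit in parse_credit_assign(html):
    print(credit["title"], credit["amount"], credit["interest_rate"])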
def get_origion_prj_id(session, url):
    rsp = session.get(url)
    root = pq(fromstring(rsp.text))
    prj_href = root('#creditAssignData > div.row-fluid.credit-assign-content > '
                    'div.span8 > div.credit-assign-title > h5 > a').attr.href
    if prj_href:
        return prj_href.split("/")[-1]
    else:
        return None
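# Hypothetical usage sketch; assumes a logged-in requests.Session and a
# credit-assignment detail URL such as the "url" field produced by
# parse_credit_assign() above.
import requests

session = requests.Session()
prj_id = get_origion_prj_id(session, credit["url"])  # original project id, or None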
def _get_root(self, value):
    """When the value is created from a string, the namespace has to be
    passed along.
    """
    if isinstance(value, basestring):
        root = fromstring(self._wrap_root(value), self.parser)[0]
        # TODO: warn if root has more than one child?
        value = root.getchildren()[0]
    return super(MyQuery, self)._get_root(value)
def html(self, value=no_default):
    """Get or set the html representation of sub nodes.

    Get the text value::

        >>> d = PyQuery('<div><span>toto</span></div>')
        >>> print(d.html())
        <span>toto</span>

    Set the text value::

        >>> d.html('<span>Youhou !</span>')
        [<div>]
        >>> print(d)
        <div><span>Youhou !</span></div>
    """
    if value is no_default:
        if not self:
            return None
        tag = self[0]
        children = tag.getchildren()
        if not children:
            return tag.text
        html = tag.text or ''
        html += unicode('').join([etree.tostring(e, encoding=unicode)
                                  for e in children])
        return html
    else:
        if isinstance(value, PyQuery):
            new_html = unicode(value)
        elif isinstance(value, basestring):
            new_html = value
        elif not value:
            new_html = ''
        else:
            raise ValueError(type(value))

        for tag in self:
            for child in tag.getchildren():
                tag.remove(child)
            root = fromstring(
                unicode('<root>') + new_html + unicode('</root>'),
                self.parser)[0]
            children = root.getchildren()
            if children:
                tag.extend(children)
            tag.text = root.text
            tag.tail = root.tail
        return self
def parse_prj_list(html):
    """Return the project list and the next-page URL."""
    import re

    root = pq(fromstring(html))
    ls_prj = []
    date_ = None
    hour_ = None
    for oel in root("a.invest-item"):
        title_ = pq(oel)("div.invest-item-title").text()
        if isinstance(title_, unicode):
            title_ = title_.encode("utf8")
        m = re.findall(r"(\d{6})-(\d{2})$", title_)
        if m:
            date_ = m[0][0]
            hour_ = m[0][1]
        ls_prj.append({
            "href": pq(oel).attr.href,
            "title": title_.decode('utf-8', 'ignore'),
            "amount": re.findall(r'[0-9.]+',
                                 pq(oel)("p.project-info span").text()
                                 .split('/')[-1].strip())[0],
            "interest": pq(oel)("span.invest-item-profit")[0].text.strip(),
            "term_months": pq(oel)("span.invest-item-profit")[1].text.strip(),
            "date": date_,
            "hour": hour_,
        })
    next_page_url = None
    next_page_el = root("div.pagination.pagination-centered li.active").next()
    if next_page_el:
        next_page_url = next_page_el("a").attr.href
        next_page_url = urlparse.urljoin(JIMU_BASE_URL, next_page_url)
    return ls_prj, next_page_url
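# Hypothetical usage sketch; assumes the JIMU_BASE_URL listing page has been
# fetched into `html`, and that next_url is None once the last page is reached.
projects, next_url = parse_prj_list(html)
for prj in projects:
    print(prj["title"], prj["amount"], prj["interest"], prj["date"], prj["hour"])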
def __init__(self, *args, **kwargs):
    namespaces = kwargs.get('namespaces')
    # if namespaces:
    if 1:
        kwargs['parser'] = 'xml'
        length = len(args)
        if length == 1:
            selector, context = None, args[0]
        elif length == 2:
            selector, context = args
        if isinstance(context, basestring):
            root = fromstring(self._wrap_root(context, namespaces),
                              kwargs.get('parser'))[0]
            if selector is not None:
                args = (selector, root.getchildren())
            else:
                args = (root.getchildren(), )
    super(MyQuery, self).__init__(*args, **kwargs)
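# Hypothetical usage sketch for the MyQuery constructor above; assumes
# _wrap_root() wraps the fragment in a root element that declares the given
# namespaces so the forced 'xml' parser accepts the prefixed tags.
doc = MyQuery('<ns:item>toto</ns:item>', namespaces={'ns': 'urn:example'})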
from lxml import etree
from pyquery import PyQuery as pq
from pyquery.pyquery import fromstring  # pyquery's internal parsing helper

# html_doc is assumed to be an HTML document string defined elsewhere.

pyq_str = pq(html_doc)
head = pyq_str('head')
print(head.html())
print(head.text())

pyq_str = pq(etree.fromstring(html_doc))
head = pyq_str('head')
print(head.html())
print(head.text())

pyq_url = pq(url='http://www.baidu.com', encoding='utf-8')
head = pyq_url('head')
print(head.html())
print(head.text())

pyq_fromstring = fromstring(html_doc, 'html')
print(pyq_fromstring[0].head.tag)

pyq_str = pq(html_doc)
body = pyq_str('body')
print(body.html())
print(body.text())

css = 'a[id="link1"]'
print(pyq_str(css).attr.id)
print(pyq_str(css).attr['id'])
print(pyq_str(css).attr['class'])
print(pyq_str(css).parent())  # parent element
print('-' * 20)

css_f = "p[class='story']"
print(pyq_str(css_f).children())  # child elements
def __init__(self, *args, **kwargs):
    html = None
    elements = []
    self._base_url = None
    self.parser = kwargs.pop('parser', None)

    if (len(args) >= 1 and
            (not PY3k and isinstance(args[0], basestring) or
             (PY3k and isinstance(args[0], str))) and
            args[0].split('://', 1)[0] in ('http', 'https')):
        kwargs['url'] = args[0]
        if len(args) >= 2:
            kwargs['data'] = args[1]
        args = []

    if 'parent' in kwargs:
        self._parent = kwargs.pop('parent')
    else:
        self._parent = no_default

    if 'css_translator' in kwargs:
        self._translator = kwargs.pop('css_translator')
    elif self.parser in ('xml',):
        self._translator = self._translator_class(xhtml=True)
    elif self._parent is not no_default:
        self._translator = self._parent._translator
    else:
        self._translator = self._translator_class(xhtml=False)

    namespaces = kwargs.pop('namespaces', {})

    if kwargs:
        # specific case to get the dom
        if 'filename' in kwargs:
            html = open(kwargs['filename'])
        elif 'url' in kwargs:
            url = kwargs.pop('url')
            if 'opener' in kwargs:
                opener = kwargs.pop('opener')
                html = opener(url, **kwargs)
            else:
                html = url_opener(url, kwargs)
            if not self.parser:
                self.parser = 'html'
            self._base_url = url
        else:
            raise ValueError('Invalid keyword arguments %s' % kwargs)

        elements = fromstring(html, self.parser)
        # close open descriptor if possible
        if hasattr(html, 'close'):
            try:
                html.close()
            except:
                pass
    else:
        # get nodes

        # determine context and selector if any
        selector = context = no_default
        length = len(args)
        if length == 1:
            context = args[0]
        elif length == 2:
            selector, context = args
        else:
            raise ValueError(
                "You can't do that. Please, provide arguments")

        # get context
        if isinstance(context, basestring):
            try:
                elements = fromstring(context, self.parser)
            except Exception:
                raise
        elif isinstance(context, self.__class__):
            # copy
            elements = context[:]
        elif isinstance(context, list):
            elements = context
        elif isinstance(context, etree._Element):
            elements = [context]

        # select nodes
        if elements and selector is not no_default:
            xpath = self._css_to_xpath(selector)
            results = []
            for tag in elements:
                results.extend(tag.xpath(xpath, namespaces=namespaces))
            elements = results

    list.__init__(self, elements)
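# Usage sketch for the constructor paths above (markup string, local file,
# URL); the filename is a hypothetical example and the URL form needs
# network access.
d1 = PyQuery('<div><span>toto</span></div>')             # parse a markup string
d2 = PyQuery(filename='page.html')                       # parse a local file
d3 = PyQuery(url='http://example.com/', parser='html')   # fetch, then parse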