Ejemplo n.º 1
0
    def html(self, value=no_default, **kwargs):
        """Get or set the inner markup of the matched elements.

        The parent implementation cannot simply be wrapped, so the setter
        half is (almost) fully re-implemented here.

        Getter (``value`` omitted): the parent's ``html()`` result is passed
        through ``self.strip_namespaces`` and returned.

        Setter: ``value`` may be an instance of this class, a string, or any
        falsy object (treated as empty markup); any other value raises
        ``ValueError`` carrying its type. The children of every matched
        element are replaced and ``self`` is returned.
        """
        if value is no_default:
            raw = super(MyQuery, self).html(value, **kwargs)
            return self.strip_namespaces(raw)

        # Normalise the argument to a markup string.
        if isinstance(value, self.__class__):
            markup = unicode(value)
        elif isinstance(value, basestring):
            markup = value
        elif value:
            raise ValueError(type(value))
        else:
            markup = ''

        for node in self:
            # Drop whatever the element currently contains.
            for old_child in node.getchildren():
                node.remove(old_child)
            # The markup is re-parsed for each target element, inside a
            # wrapper built by ``self._wrap_root``.
            wrapper = fromstring(self._wrap_root(markup), self.parser)[0]
            fresh_children = wrapper.getchildren()
            if fresh_children:
                node.extend(fresh_children)
            # Text and tail are taken over from the parsed wrapper.
            node.text = wrapper.text
            node.tail = wrapper.tail

        return self
Ejemplo n.º 2
0
def parse_credit_assign(html):
    """Extract the credit-assignment entries from a listing page.

    Each ``div.invest-item`` element found in ``html`` produces one dict
    with the keys ``url``, ``amount``, ``bal``, ``interest_rate``,
    ``remain_days`` and ``title``. ``amount`` is the second
    ``span.decimal`` value of the item and ``bal`` is that value minus the
    first one (thousands separators are removed before conversion).

    Returns the list of dicts; when it is non-empty a summary line with the
    entry count and the summed ``amount`` is logged at INFO level.
    """
    doc = pq(fromstring(html))
    credits = []
    total = 0
    for node in doc("div.invest-item"):
        item = pq(node)

        decimals = item("p.project-info span.decimal")
        first = float(decimals[0].text.replace(',', ''))
        amount = float(decimals[1].text.replace(',', ''))
        balance = amount - first
        total += amount
        logger.debug("credit: %.2f,%.2f" % (balance, amount))

        # The link is read from the item's parent element.
        url = urlparse.urljoin(JIMU_BASE_URL, item.parent().attr.href)
        profits = item("div.invest-item-feature span.invest-item-profit")
        rate = float(profits[0].text)
        days = int(profits[1].text.replace(',', ''))
        title = item("div.invest-item-subtitle").text()

        credits.append({
            "url": url,
            "amount": amount,
            "bal": balance,
            "interest_rate": rate,
            "remain_days": days,
            "title": title,
        })

    if credits:
        logger.info("total credit#: %d, amount:%.2f" % (len(credits), total))
    return credits
Ejemplo n.º 3
0
def get_origion_prj_id(session, url):
    """Fetch ``url`` and return the id taken from the title link's href.

    The page is requested with ``session.get`` and the ``href`` of the
    anchor inside the credit-assign title heading is looked up. The id is
    the last ``/``-separated segment of that href; ``None`` is returned
    when no href is found.
    """
    response = session.get(url)
    doc = pq(fromstring(response.text))
    title_link = doc('#creditAssignData > div.row-fluid.credit-assign-content > div.span8 > div.credit-assign-title > h5 > a')
    href = title_link.attr.href
    return href.split("/")[-1] if href else None
Ejemplo n.º 4
0
    def _get_root(self, value):
        """Resolve ``value`` through the parent's ``_get_root``.

        A string is first wrapped with ``self._wrap_root`` and parsed with
        ``self.parser`` (the wrapper is how the namespace gets passed along
        when a value is created from a string); the first child of the
        parsed wrapper is then forwarded. Any other value is forwarded
        unchanged.
        """
        if not isinstance(value, basestring):
            return super(MyQuery, self)._get_root(value)

        wrapper = fromstring(self._wrap_root(value), self.parser)[0]
        # TODO: warn if root has more than one child?
        first_child = wrapper.getchildren()[0]
        return super(MyQuery, self)._get_root(first_child)
Ejemplo n.º 5
0
    def html(self, value=no_default):
        """Get or set the html representation of sub nodes.

        Get the html value::

            >>> d = PyQuery('<div><span>toto</span></div>')
            >>> print(d.html())
            <span>toto</span>

        Set the html value::

            >>> d.html('<span>Youhou !</span>')
            [<div>]
            >>> print(d)
            <div><span>Youhou !</span></div>

        The getter returns ``None`` for an empty selection and otherwise
        works on the first element only. The setter accepts a ``PyQuery``,
        a string or any falsy object (empty markup), raises ``ValueError``
        for anything else, and returns ``self``.
        """
        if value is not no_default:
            # --- setter ---
            if isinstance(value, PyQuery):
                markup = unicode(value)
            elif isinstance(value, basestring):
                markup = value
            elif value:
                raise ValueError(type(value))
            else:
                markup = ''

            for node in self:
                for old_child in node.getchildren():
                    node.remove(old_child)
                # The markup is re-parsed for each target, wrapped in a
                # temporary <root> element.
                wrapped = unicode('<root>') + markup + unicode('</root>')
                holder = fromstring(wrapped, self.parser)[0]
                fresh_children = holder.getchildren()
                if fresh_children:
                    node.extend(fresh_children)
                node.text = holder.text
                node.tail = holder.tail
            return self

        # --- getter ---
        if not self:
            return None
        first = self[0]
        kids = first.getchildren()
        if not kids:
            return first.text
        leading_text = first.text or ''
        serialized = [etree.tostring(kid, encoding=unicode) for kid in kids]
        return leading_text + unicode('').join(serialized)
Ejemplo n.º 6
0
def parse_prj_list(html):
    """Return the project list and the next-page URL parsed from ``html``.

    Every ``a.invest-item`` element yields one dict with the keys ``href``,
    ``title``, ``amount``, ``interest``, ``term_months``, ``date`` and
    ``hour``. ``date`` and ``hour`` are taken from a trailing
    ``<6 digits>-<2 digits>`` suffix of the item's own title and are both
    ``None`` when the title has no such suffix.

    Returns:
        A tuple ``(projects, next_page_url)``. ``next_page_url`` is ``None``
        when the pagination block has no element after the active page.

    Raises:
        IndexError: if an item has no numeric amount or fewer than two
            ``span.invest-item-profit`` elements.
    """
    import re

    root = pq(fromstring(html))
    ls_prj = []
    for oel in root("a.invest-item"):
        item = pq(oel)
        title_ = item("div.invest-item-title").text()
        if isinstance(title_, unicode):
            title_ = unicode.encode(title_, "utf8")

        # Reset per item so a value parsed from an earlier title can never
        # leak into this entry.
        date_ = hour_ = None
        m = re.search(r"(\d{6})-(\d{2})$", title_)
        if m:
            date_, hour_ = m.groups()

        # The amount is the first number in the part after the last '/'.
        amount_text = item("p.project-info span").text().split('/')[-1].strip()
        profits = item("span.invest-item-profit")
        ls_prj.append({
            "href": item.attr.href,
            "title": title_.decode('utf-8', 'ignore'),
            "amount": re.findall(r'[0-9\.]+', amount_text)[0],
            "interest": profits[0].text.strip(),
            "term_months": profits[1].text.strip(),
            "date": date_,
            "hour": hour_,
        })

    # Default to None so the return below is always well-defined.
    next_page_url = None
    next_page_el = root("div.pagination.pagination-centered li.active").next()
    if next_page_el:
        next_page_url = next_page_el("a").attr.href
        next_page_url = urlparse.urljoin(JIMU_BASE_URL, next_page_url)
    return ls_prj, next_page_url
Ejemplo n.º 7
0
    def __init__(self, *args, **kwargs):
        """Build the query object, always using the ``'xml'`` parser.

        Positional arguments are either ``(context,)`` or
        ``(selector, context)``. When ``context`` is a string it is wrapped
        with ``self._wrap_root`` (together with the ``namespaces`` keyword
        argument, if given), parsed, and the children of the parsed wrapper
        are handed to the parent constructor in place of the string.

        Any other argument combination (including keyword-only
        construction) is forwarded to the parent constructor untouched,
        apart from the forced ``parser`` keyword.
        """
        namespaces = kwargs.get('namespaces')
        # The XML parser is forced unconditionally, not only when
        # namespaces are supplied.
        kwargs['parser'] = 'xml'

        # Defaults keep both names bound when the argument count is not 1
        # or 2; the parent constructor then decides what to do with args.
        selector = context = None
        if len(args) == 1:
            context = args[0]
        elif len(args) == 2:
            selector, context = args

        if isinstance(context, basestring):
            root = fromstring(self._wrap_root(context, namespaces),
                              kwargs.get('parser'))[0]
            children = root.getchildren()
            if selector is not None:
                args = (selector, children)
            else:
                args = (children, )

        super(MyQuery, self).__init__(*args, **kwargs)
Ejemplo n.º 8
0
# Demo script: build query objects from several kinds of input and print
# parts of the result. ``html_doc``, ``pq``, ``etree`` and ``fromstring``
# are expected to be defined or imported elsewhere.

# 1) From a markup string: print the html and the text of <head>.
pyq_str = pq(html_doc)
head = pyq_str('head')
print(head.html())
print(head.text())

# 2) From an element parsed with etree.fromstring.
pyq_str = pq(etree.fromstring(html_doc))
head = pyq_str('head')
print(head.html())
print(head.text())

# 3) From a URL, passed by keyword.
# NOTE(review): presumably performs a network request — confirm before
# running this offline.
pyq_url = pq(url='http://www.baidu.com', encoding='utf-8')
head = pyq_url('head')
print(head.html())
print(head.text())

# 4) Call fromstring directly with the 'html' parser argument and print the
# tag of the ``head`` attribute of the first returned item.
pyq_fromstring = fromstring(html_doc, 'html')
print(pyq_fromstring[0].head.tag)

# 5) Back to the string-based document: inspect <body> and a single link.
pyq_str = pq(html_doc)
body = pyq_str('body')
print(body.html())
print(body.text())
# NOTE(review): ``[@id=...]`` looks like XPath-style attribute syntax rather
# than standard CSS — confirm the selector translator in use accepts it.
css = 'a[@id="link1"]'
print(pyq_str(css).attr.id)
print(pyq_str(css).attr['id'])
print(pyq_str(css).attr['class'])
print(pyq_str(css).parent())  # parent tag
print('-' * 20)
css_f = "p[@class='story']"
print(pyq_str(css_f).children())  # child tags
Ejemplo n.º 9
0
    def __init__(self, *args, **kwargs):
        """Initialise the list of elements from markup, a file, a URL or nodes.

        Positional arguments are ``(context,)`` or ``(selector, context)``
        where ``context`` is a string, an instance of this class, a list or
        an ``etree._Element``. If the first positional argument is a string
        starting with ``http://`` or ``https://`` it is treated as the
        ``url`` keyword (a second positional argument becomes ``data``).

        Recognised keyword arguments: ``parser``, ``parent``,
        ``css_translator``, ``namespaces``, ``filename``, ``url`` and
        ``opener``. Keywords remaining after those are consumed are passed
        to the opener.

        Raises:
            ValueError: if keyword arguments remain but contain neither
                ``filename`` nor ``url``, or if no keyword arguments remain
                and the number of positional arguments is not 1 or 2.
        """
        html = None
        elements = []
        self._base_url = None
        self.parser = kwargs.pop('parser', None)

        # A leading http(s) URL string is shorthand for the ``url`` keyword.
        if (len(args) >= 1 and
                (not PY3k and isinstance(args[0], basestring) or
                (PY3k and isinstance(args[0], str))) and
                args[0].split('://', 1)[0] in ('http', 'https')):
            kwargs['url'] = args[0]
            if len(args) >= 2:
                kwargs['data'] = args[1]
            args = []

        self._parent = kwargs.pop('parent', no_default)

        # Translator precedence: explicit > xml parser > inherited > default.
        if 'css_translator' in kwargs:
            self._translator = kwargs.pop('css_translator')
        elif self.parser in ('xml',):
            self._translator = self._translator_class(xhtml=True)
        elif self._parent is not no_default:
            self._translator = self._parent._translator
        else:
            self._translator = self._translator_class(xhtml=False)

        namespaces = kwargs.pop('namespaces', {})

        if kwargs:
            # specific case to get the dom
            if 'filename' in kwargs:
                html = open(kwargs['filename'])
            elif 'url' in kwargs:
                url = kwargs.pop('url')
                if 'opener' in kwargs:
                    opener = kwargs.pop('opener')
                    html = opener(url, **kwargs)
                else:
                    html = url_opener(url, kwargs)
                if not self.parser:
                    self.parser = 'html'
                self._base_url = url
            else:
                raise ValueError('Invalid keyword arguments %s' % kwargs)

            try:
                elements = fromstring(html, self.parser)
            finally:
                # Close the descriptor (if it is one) even when parsing
                # fails, so the handle is never leaked. Closing stays
                # best-effort: an error here must not mask the result.
                if hasattr(html, 'close'):
                    try:
                        html.close()
                    except Exception:
                        pass

        else:
            # get nodes

            # determine context and selector if any
            selector = context = no_default
            length = len(args)
            if length == 1:
                context = args[0]
            elif length == 2:
                selector, context = args
            else:
                raise ValueError(
                    "You can't do that. Please, provide arguments")

            # get context; an unrecognised type leaves ``elements`` empty
            if isinstance(context, basestring):
                elements = fromstring(context, self.parser)
            elif isinstance(context, self.__class__):
                # copy
                elements = context[:]
            elif isinstance(context, list):
                elements = context
            elif isinstance(context, etree._Element):
                elements = [context]

            # select nodes
            if elements and selector is not no_default:
                xpath = self._css_to_xpath(selector)
                results = []
                for tag in elements:
                    results.extend(tag.xpath(xpath, namespaces=namespaces))
                elements = results

        list.__init__(self, elements)
Ejemplo n.º 10
0
    def __init__(self, *args, **kwargs):
        """Initialise the list of elements from markup, a file, a URL or nodes.

        Positional arguments are ``(context,)`` or ``(selector, context)``
        where ``context`` is a string, an instance of this class, a list or
        an ``etree._Element``. If the first positional argument is a string
        starting with ``http://`` or ``https://`` it is treated as the
        ``url`` keyword (a second positional argument becomes ``data``).

        Recognised keyword arguments: ``parser``, ``parent``,
        ``css_translator``, ``namespaces``, ``filename``, ``url`` and
        ``opener``. Keywords remaining after those are consumed are passed
        to the opener.

        Raises:
            ValueError: if keyword arguments remain but contain neither
                ``filename`` nor ``url``, or if no keyword arguments remain
                and the number of positional arguments is not 1 or 2.
        """
        html = None
        elements = []
        self._base_url = None
        self.parser = kwargs.pop('parser', None)

        # A leading http(s) URL string is shorthand for the ``url`` keyword.
        if (len(args) >= 1 and (not PY3k and isinstance(args[0], basestring) or
                                (PY3k and isinstance(args[0], str)))
                and args[0].split('://', 1)[0] in ('http', 'https')):
            kwargs['url'] = args[0]
            if len(args) >= 2:
                kwargs['data'] = args[1]
            args = []

        self._parent = kwargs.pop('parent', no_default)

        # Translator precedence: explicit > xml parser > inherited > default.
        if 'css_translator' in kwargs:
            self._translator = kwargs.pop('css_translator')
        elif self.parser in ('xml', ):
            self._translator = self._translator_class(xhtml=True)
        elif self._parent is not no_default:
            self._translator = self._parent._translator
        else:
            self._translator = self._translator_class(xhtml=False)

        namespaces = kwargs.pop('namespaces', {})

        if kwargs:
            # specific case to get the dom
            if 'filename' in kwargs:
                html = open(kwargs['filename'])
            elif 'url' in kwargs:
                url = kwargs.pop('url')
                if 'opener' in kwargs:
                    opener = kwargs.pop('opener')
                    html = opener(url, **kwargs)
                else:
                    html = url_opener(url, kwargs)
                if not self.parser:
                    self.parser = 'html'
                self._base_url = url
            else:
                raise ValueError('Invalid keyword arguments %s' % kwargs)

            try:
                elements = fromstring(html, self.parser)
            finally:
                # Close the descriptor (if it is one) even when parsing
                # fails, so the handle is never leaked. Closing stays
                # best-effort: an error here must not mask the result.
                if hasattr(html, 'close'):
                    try:
                        html.close()
                    except Exception:
                        pass

        else:
            # get nodes

            # determine context and selector if any
            selector = context = no_default
            length = len(args)
            if length == 1:
                context = args[0]
            elif length == 2:
                selector, context = args
            else:
                raise ValueError(
                    "You can't do that. Please, provide arguments")

            # get context; an unrecognised type leaves ``elements`` empty
            if isinstance(context, basestring):
                elements = fromstring(context, self.parser)
            elif isinstance(context, self.__class__):
                # copy
                elements = context[:]
            elif isinstance(context, list):
                elements = context
            elif isinstance(context, etree._Element):
                elements = [context]

            # select nodes
            if elements and selector is not no_default:
                xpath = self._css_to_xpath(selector)
                results = []
                for tag in elements:
                    results.extend(tag.xpath(xpath, namespaces=namespaces))
                elements = results

        list.__init__(self, elements)