Ejemplo n.º 1
0
 def _parse_html(self, doc):
     """Collect profile fields from a parsed user page into a dict.

     ``doc`` is called with CSS selectors (presumably a PyQuery document;
     confirm against the caller).

     The returned dict always has a ``'name'`` key, taken from the ``alt``
     attribute of the avatar image in ``#user_header``.  Depending on what
     the page lists it may also carry ``'height'``, ``'sex'``, ``'age'``,
     ``'hairstyle'``, ``'location'``, ``'brandLike'`` and ``'brandUse'``.
     The two brand values are ``'|'``-joined strings and are only present
     when at least one list item was found.
     """
     header = doc('#user_header')

     # Walk down to the avatar <img>; its alt text is used as the name.
     avatar = header.children('#user_sub').children('.image')
     avatar = avatar.children('.img').children('img')
     profile = {'name': avatar.attr('alt')}

     # The slice skips the first child of the info list.  Each remaining
     # entry is classified by its text; the checks run in this exact order
     # and anything unrecognised is stored as the location.
     for node in header('ul.info').children()[1:]:
         label = PyQuery(node).text()
         if label.endswith('cm'):
             key = 'height'
         elif label in ('MEN', 'WOMEN', 'KIDS'):
             key = 'sex'
         elif label.endswith(u'嵗'):
             key = 'age'
         elif u'髮' in label:
             key = 'hairstyle'
         else:
             key = 'location'
         profile[key] = label

     favorite_items = header('.favorite')('ul').items('li')
     liked = [item.text() for item in favorite_items]
     if liked:
         profile['brandLike'] = '|'.join(liked)

     related_items = doc('#gbl_related_link')('ul').items('li')
     used = [item.text() for item in related_items]
     if used:
         profile['brandUse'] = '|'.join(used)

     return profile
Ejemplo n.º 2
0
def get_diy_links(host, html, pattern, suffix):
    """Yield hrefs found under ``pattern`` that end with ``suffix``.

    Every element matched by the CSS selector ``"<pattern> a"`` is
    inspected.  An href for which ``urlparse`` reports no network location
    is prefixed with ``host`` by plain string concatenation (no URL
    joining is performed).

    Args:
        host: Prefix prepended to hrefs that carry no network location.
        html: Markup handed to ``PyQuery`` for parsing.
        pattern: CSS selector of the container whose anchors are scanned.
        suffix: Only hrefs ending with this value are yielded.

    Yields:
        The UTF-8 encoded href of each matching anchor, in document order.

    Anchors without an ``href`` attribute are skipped.  If the selector
    cannot be evaluated, the generator finishes without yielding anything.
    """
    d = PyQuery(html)
    selector = '{} a'.format(pattern)
    try:
        link_list = d(selector)
    except Exception:
        # Best effort: a selector that cannot be evaluated produces no
        # links.  A bare ``yield`` here would hand a stray ``None`` to the
        # consumer, so end the generator instead.
        return

    for link in link_list:
        href = PyQuery(link).attr('href')
        if href is None:
            # <a> without an href attribute: nothing to follow, and
            # calling .encode() on None would abort the whole iteration.
            continue
        href = href.encode('utf-8')
        if not urlparse.urlparse(href).netloc:
            href = host + href
        if href.endswith(suffix):
            yield href