Example #1
 def parse_org_info(self, response):
     """ 抽取组织主页的信息
     """
     request_list = self.make_request_list()
     for request in request_list:
         self.crawler.stats.inc_value('request_issued')
         yield request
         
     user_info_item = UserInfoItem()
     user_info_item['is_org'] = True
     user_info_item['uid'] = response.meta['uid']
     
     #import ipdb; ipdb.set_trace()
     # find user name
     # if a redirect happened, the account uses a personalized (vanity) domain
     m = re_UserNamePage.search(response.url)
     if m != None:
         user_info_item['username'] = user_info_item['uid']
     else:
         m = re_Site.search(response.url)
         user_info_item['username'] = m.group(1)
         
     soup = beautiful_soup(response.body)
     for script in soup.find_all('script'):
         text = script.text
         if text.startswith('FM.view'):
             text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
             data = json.loads(text)
             domid = data['domid']
             if domid.startswith('Pl_Core_T8CustomTriColumn__'):
                 info_soup = beautiful_soup(data['html'])
                 td_list = info_soup.find_all('td', attrs = {'class':'S_line1'})
                 if len(td_list) != 3:
                     log.msg('Error parsing: %s' % response.url, log.INFO)
                 else:
                     user_info_item['n_follows'] = int(td_list[0].find('strong').text.strip())
                     user_info_item['n_fans'] = int(td_list[1].find('strong').text.strip())
                     
             elif domid.startswith('Pl_Core_UserInfo__'):
                 info_soup = beautiful_soup(data['html'])
                 user_info_item['nickname'] = info_soup.find('p', attrs={'class':'info'}).find('span').text.strip()
                 ul_list = info_soup.find('ul', attrs={'class':'ul_detail'})
                 li_list = ul_list.find_all('li', attrs={'class':'item S_line2 clearfix'})
                 if len(li_list) == 0:
                     log.msg('Error parsing: %s' % response.url, log.INFO)
                 else:
                     user_info_item['category'] = li_list[0].find('span', attrs={'class':'item_text W_fl'}).text.strip()
                     if len(li_list) > 1:
                         user_info_item['intro'] = li_list[1].find('span', attrs={'class':'item_text W_fl'}).text.strip()
                 
             else:
                 pass
     
     #print user_info_item
     #import ipdb; ipdb.set_trace()
     
     yield user_info_item
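
The snippet above relies on module-level helpers that are not shown: the re_UserNamePage and re_Site regexes and a beautiful_soup wrapper. A minimal sketch of what they might look like, assuming BeautifulSoup 4 and Weibo's numeric /u/<uid> home-page URLs versus vanity-domain URLs; the names match the code above, but these definitions are assumptions, not the originals.

# Hypothetical helper definitions assumed by parse_org_info (not taken from the original module)
import re
from bs4 import BeautifulSoup

# matches numeric-uid home pages such as http://weibo.com/u/1234567890
re_UserNamePage = re.compile(r'weibo\.com/u/(\d+)')
# matches vanity-domain home pages such as http://weibo.com/somename
re_Site = re.compile(r'weibo\.com/([^/?]+)')

def beautiful_soup(markup):
    """Thin wrapper so the parser backend is configured in one place."""
    return BeautifulSoup(markup, 'lxml')
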
Example #2
def store_info(country=None):
  """Fetch metadata for all stores"""
  browse_page = utils.beautiful_soup('http://www.bricklink.com/browse.asp')
  country_links = (
    browse_page
    .find(text='Stores:').parent.parent.next_sibling
    .find_all('a', href=re.compile('countryID'))
  )

  result = []

  for country_link in country_links:
    country_name = country_link.text
    country_id = utils.get_params(country_link['href'])['countryID']

    # skip this country link if we're only gathering data on one country
    if country is not None and country_id != country:
      continue

    country_page = utils.beautiful_soup('http://www.bricklink.com' + country_link['href'])
    store_links = country_page.find_all('a', href=re.compile('store.asp'))

    for store_link in store_links:
      store_page = utils.beautiful_soup('http://www.bricklink.com' + '/' + store_link['href'])
      params = utils.get_params(store_page.find('frame', src=re.compile('^storeTop.asp'))['src'])

      store_name = params['storeName']
      store_id = params['uID']
      country_name = params['cn']
      country_id = params['c']
      seller_name = params['p_seller']
      feedback = params['p_feedback']

      store_splash = utils.beautiful_soup("http://www.bricklink.com/storeSplash.asp?uID=" + store_id)
      min_buy_elem = store_splash.find(text="Minimum Buy:")
      if min_buy_elem is not None:
        min_buy = min_buy_elem.parent.parent.parent.parent.next_sibling.find("font").text
        try:
          min_buy = re.search("US \$([0-9.]+)", min_buy).group(1)
          min_buy = float(min_buy)
        except AttributeError:
          # there's a minimum buy in a foreign currency :(
          continue
      else:
        min_buy = 0.0

      ships_to_elem = store_splash.find(text="Store Ships To:")
      if ships_to_elem is not None:
        ships = ships_to_elem.parent.parent.parent.parent.next_sibling.find_all(text=True)
        ships = map(lambda x: unicode(x), ships)
      else:
        ships = []


      entry = {
        'store_name': store_name,
        'store_id': int(store_id),
        'country_name': country_name,
        'country_id': country_id,
        'seller_name': seller_name,
        'feedback': int(feedback),
        'minimum_buy': min_buy,
        'ships': ships
      }
      print entry

      result.append(entry)

  return result
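
store_info leans on two helpers from utils that are not shown here: beautiful_soup, which apparently fetches a URL and returns a parsed tree, and get_params, which pulls the query-string parameters out of a link. A rough sketch of implementations consistent with how they are called above; the bodies are assumptions, not the project's actual code.

# Plausible stand-ins for the utils helpers used above (assumed)
import urllib2
import urlparse
from bs4 import BeautifulSoup

def beautiful_soup(url):
    """Fetch a page and return it as a BeautifulSoup tree."""
    return BeautifulSoup(urllib2.urlopen(url).read(), 'html.parser')

def get_params(href):
    """Return the query-string parameters of a URL as a flat dict of strings."""
    query = urlparse.urlparse(href).query
    return dict((k, v[0]) for k, v in urlparse.parse_qs(query).items())
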
Example #3
            self.body['pre_page'] = self.body['page']
            page_url = page_url + urllib.urlencode(self.body)
        elif page == 3:
            self.body['pagebar'] = '1'
            self.body['pre_page'] = self.body['page']
            page_url = page_url + urllib.urlencode(self.body)

        try:
            #print page_url
            req = urllib2.Request(page_url)
            req.add_header('User-Agent', self.headers)
            res = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            weiboMain.log.error('++++Open url [%s] failed. (Error code: %d) <@parser.get_weibo>++++' % (page_url, e.code))
            return
        soup = beautiful_soup(res.read())
        if soup is None:
            return 0

        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftProfileFeed__'):
                    page_soup = beautiful_soup(data['html'])
                    if page_soup is None:
                        weiboMain.log.error('++++Can not get weibo feed! <@parser.get_weibo>++++')
                        return 0
                    weibo_list = page_soup.find('div', attrs = {'class': 'WB_feed'})
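
Note that the FM.view handling above removes every semicolon from the script text before calling json.loads, which also strips semicolons that occur inside the embedded HTML (in entities such as &amp; or &nbsp;, for example). A more defensive way to pull out the JSON argument is to slice between the first '(' and the last ')'. This is only a sketch of an alternative, not what the project itself does.

import json

def extract_fm_view_payload(script_text):
    """Return the dict passed to FM.view(...), or None for other scripts."""
    text = script_text.strip()
    if not text.startswith('FM.view('):
        return None
    start = text.index('(') + 1
    end = text.rindex(')')
    return json.loads(text[start:end])
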
Example #4
def store_info(country=None):
    """Fetch metadata for all stores"""
    browse_page = utils.beautiful_soup('http://www.bricklink.com/browse.asp')
    country_links = (browse_page.find(
        text='Stores:').parent.parent.next_sibling.find_all(
            'a', href=re.compile('countryID')))

    result = []

    for country_link in country_links:
        country_name = country_link.text
        country_id = utils.get_params(country_link['href'])['countryID']

        # skip this country link if we're only gathering data on one country
        if country is not None and country_id != country:
            continue

        country_page = utils.beautiful_soup('http://www.bricklink.com' +
                                            country_link['href'])
        store_links = country_page.find_all('a', href=re.compile('store.asp'))

        for store_link in store_links:
            store_page = utils.beautiful_soup('http://www.bricklink.com' +
                                              '/' + store_link['href'])
            params = utils.get_params(
                store_page.find('frame',
                                src=re.compile('^storeTop.asp'))['src'])

            store_name = params['storeName']
            store_id = params['uID']
            country_name = params['cn']
            country_id = params['c']
            seller_name = params['p_seller']
            feedback = params['p_feedback']

            store_splash = utils.beautiful_soup(
                "http://www.bricklink.com/storeSplash.asp?uID=" + store_id)
            min_buy_elem = store_splash.find(text="Minimum Buy:")
            if min_buy_elem is not None:
                min_buy = min_buy_elem.parent.parent.parent.parent.next_sibling.find(
                    "font").text
                try:
                    min_buy = re.search("US \$([0-9.]+)", min_buy).group(1)
                    min_buy = float(min_buy)
                except AttributeError:
                    # there's a minimum buy in a foreign currency :(
                    continue
            else:
                min_buy = 0.0

            ships_to_elem = store_splash.find(text="Store Ships To:")
            if ships_to_elem is not None:
                ships = ships_to_elem.parent.parent.parent.parent.next_sibling.find_all(
                    text=True)
                ships = map(lambda x: unicode(x), ships)
            else:
                ships = []

            entry = {
                'store_name': store_name,
                'store_id': int(store_id),
                'country_name': country_name,
                'country_id': country_id,
                'seller_name': seller_name,
                'feedback': int(feedback),
                'minimum_buy': min_buy,
                'ships': ships
            }
            print entry

            result.append(entry)

    return result
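
The minimum-buy handling above only understands amounts quoted as 'US $...' and skips stores whose minimum is stated in another currency. Pulling that rule into a tiny helper makes it easy to exercise on its own; this is a sketch restating the regex logic above, not part of the original module.

import re

def parse_min_buy(text):
    """Return the minimum buy in USD as a float, or None for non-USD amounts."""
    m = re.search(r'US \$([0-9.]+)', text)
    if m is None:
        return None
    return float(m.group(1))

# e.g. parse_min_buy('Minimum Buy: US $10.00') -> 10.0; parse_min_buy('EUR 5,00') -> None
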
Example #5
    def parse_user_info(self, response):  # default callback
        """ Crawl an ordinary user's profile information, e.g.:
        """
        #import ipdb; ipdb.set_trace()

        # queue the requests to be processed next
        request_list = self.make_request_list()
        for request in request_list:
            self.crawler.stats.inc_value('request_issued')
            yield request

        is_valid = True  # whether the parsed item is valid
        log.msg('Parse url: %s' % response.url, level=log.INFO)
        #log.msg('Response body: %s' % response.body)

        # organization pages come back as an error page here
        if response.url.find('pagenotfound') > 0:
            # match up with the originating request
            #import ipdb; ipdb.set_trace()
            is_valid = False
            log.msg('Page not found: %s' % response.url, log.ERROR)
            uid = response.meta['uid']
            new_url = 'http://weibo.com/u/%s' % (uid)  # go straight to the organization account's home page
            request = Request(url=new_url,
                              callback=self.parse_org_info,
                              meta={'uid': uid})
            self.crawler.stats.inc_value('request_issued')
            yield request

        # TODO: detect users reported as non-existent: http://weibo.com/sorry?usernotexists&code=100001
        # (even though the user actually does exist)
        # TODO: detect blocked accounts: http://weibo.com/sorry?userblock&is_viewer&code=20003
        # http://sass.weibo.com/accessdeny?uid=5445629123&ip=2682434316&location=1&callbackurl=http%3A%2F%2Fweibo.com%2Fu%2F2029154257

        # extract user info from the user's profile page
        user_info_item = UserInfoItem()
        user_info_item['is_org'] = False

        # extract uid and page_id from the raw response body
        m = re_UID.search(response.body)
        if m != None:
            user_info_item['uid'] = m.group(1)
        else:
            log.msg('Error parsing uid: %s' % response.url, log.ERROR)

        # extract page_id
        m = re_PAGEID.search(response.body)
        if m != None:
            user_info_item['page_id'] = m.group(1)
        else:
            log.msg('Error parsing page id: %s' % response.url, log.ERROR)

        # parse with bs4
        soup = beautiful_soup(response.body)

        new_style = False
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title = block_div.find('h4',
                                                     attrs={
                                                         'class': 'obj_name'
                                                     }).text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    user_info_item['avatar'] = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    user_info_item['n_follows'] = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    user_info_item['n_fans'] = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    user_info_item['n_follows'] = int(
                        tds[0].find('strong').text)
                    user_info_item['n_fans'] = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    user_info_item['avatar'] = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']

                    #import ipdb; ipdb.set_trace()
                    # determine whether the user is a verified account
                    photo_div = header_soup.find_all('div',
                                                     attrs={
                                                         'class': 'pf_photo',
                                                         'node-type': 'photo'
                                                     })
                    if len(photo_div) > 0:
                        result = photo_div[0].find_all(
                            'a',
                            attrs={'href': 'http://verified.weibo.com/verify'})
                        if len(result) > 0:
                            user_info_item['is_verified'] = True
                        else:
                            user_info_item['is_verified'] = False
                    else:
                        log.msg('Can not find photo div: %s' % response.url,
                                log.ERROR)

            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    user_info_item['avatar'] = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    user_info_item[profile_map[k]['field']] = v
                    #setattr(user_info_item, profile_map[k]['field'], v)

        user_info_item['work'] = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = dict()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info['name'] = a.text
                            text = p.text
                            if '(' in text:
                                work_info['date'] = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info['location'] = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info['position'] = text.split(u':', 1)[1]
                            else:
                                work_info['detail'] = text
                    user_info_item['work'].append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = dict()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info['name'] = a.text
                    if '(' in text:
                        work_info['date'] = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info['location'] = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info['position'] = l.split(u':', 1)[1]
                        else:
                            work_info['detail'] = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    user_info_item['work'].append(work_info)

        user_info_item['edu'] = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = dict()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info['name'] = a.text
                            if '(' in text:
                                edu_info['date'] = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info['detail'] = text

                    user_info_item['edu'].append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = dict()
                    edu_info['name'] = name
                    if '(' in text:
                        edu_info['date'] = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info['detail'] = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    user_info_item['edu'].append(edu_info)

        user_info_item['tags'] = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        user_info_item['tags'].append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    user_info_item['tags'].append(a.text.strip())

        # is_valid is True when the item should be emitted
        log.msg('parse %s finish' % response.url, log.INFO)

        # check whether the spider has been banned
        if not user_info_item['n_follows']:
            log.msg('The spider may have been banned.', log.ERROR)
        else:
            if is_valid:
                #print user_info_item
                yield user_info_item
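
parse_user_info depends on two module-level regexes, re_UID and re_PAGEID, that are not included in the example. Weibo profile pages of this vintage embedded values like $CONFIG['oid'] and $CONFIG['page_id'] in inline scripts, so definitions consistent with how the matches are used above could look like this; these patterns are an assumption, and the real ones depend on the page version being crawled.

import re

# Assumed patterns for extracting uid and page_id from the raw page source
re_UID = re.compile(r"\$CONFIG\['oid'\]='(\d+)'")
re_PAGEID = re.compile(r"\$CONFIG\['page_id'\]='(\d+)'")
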
Example #6
    def parse_org_info(self, response):
        """ 抽取组织主页的信息
        """
        request_list = self.make_request_list()
        for request in request_list:
            self.crawler.stats.inc_value('request_issued')
            yield request

        user_info_item = UserInfoItem()
        user_info_item['is_org'] = True
        user_info_item['uid'] = response.meta['uid']

        #import ipdb; ipdb.set_trace()
        # find user name
        # if a redirect happened, the account uses a personalized (vanity) domain
        m = re_UserNamePage.search(response.url)
        if m != None:
            user_info_item['username'] = user_info_item['uid']
        else:
            m = re_Site.search(response.url)
            user_info_item['username'] = m.group(1)

        soup = beautiful_soup(response.body)
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    info_soup = beautiful_soup(data['html'])
                    td_list = info_soup.find_all('td',
                                                 attrs={'class': 'S_line1'})
                    if len(td_list) != 3:
                        log.msg('Error parsing: %s' % response.url, log.INFO)
                    else:
                        user_info_item['n_follows'] = int(
                            td_list[0].find('strong').text.strip())
                        user_info_item['n_fans'] = int(
                            td_list[1].find('strong').text.strip())

                elif domid.startswith('Pl_Core_UserInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    user_info_item['nickname'] = info_soup.find(
                        'p', attrs={
                            'class': 'info'
                        }).find('span').text.strip()
                    ul_list = info_soup.find('ul',
                                             attrs={'class': 'ul_detail'})
                    li_list = ul_list.find_all(
                        'li', attrs={'class': 'item S_line2 clearfix'})
                    if len(li_list) == 0:
                        log.msg('Error parsing: %s' % response.url, log.INFO)
                    else:
                        user_info_item['category'] = li_list[0].find(
                            'span', attrs={
                                'class': 'item_text W_fl'
                            }).text.strip()
                        if len(li_list) > 1:
                            user_info_item['intro'] = li_list[1].find(
                                'span', attrs={
                                    'class': 'item_text W_fl'
                                }).text.strip()

                else:
                    pass

        #print user_info_item
        #import ipdb; ipdb.set_trace()

        yield user_info_item
Example #7
    def parse_user_card_info_text(self, uid, html_data):
        """ 给定html body解析 user info, 用户信息包括:avatar, uid, nickname, desc, location
        NOTE: 将该解析单独作为一个函数有两个作用:1)便于单独测试;2)使得parse_user_card_info函数结构清晰
        """
        user_info_item = UserInfoItem()
        #user_info_item['raw_html'] = html_data  # keep the full raw html
        user_info_item['uid'] = uid
        user_info_item['existed'] = True

        soup = beautiful_soup(html_data)

        nc_head = soup.find('div', attrs={'class': 'nc_head'})
        nc_content = soup.find('div', attrs={'class': 'nc_content'})

        pic_box = nc_head.find('div', attrs={'class': 'pic_box'})
        alist = pic_box.find_all('a')

        url = alist[0]['href']
        m = re_UserCardUrl.search(url)
        user_info_item['username'] = m.group(1)
        if user_info_item['username'][:2] == 'u/':
            user_info_item['username'] = user_info_item['username'][2:]

        user_info_item['nickname'] = pic_box.a.img['title']
        user_info_item['avatar'] = pic_box.a.img['src']

        user_info_item['is_org'] = False
        if len(alist) > 1:
            user_info_item['is_verified'] = True
            # decide whether this account is an organization or a verified individual
            if alist[1].i['class'][1] == 'icon_pf_approve_co':
                user_info_item['is_org'] = True
        else:
            user_info_item['is_verified'] = False

        mask = nc_head.find('div', attrs={'class': 'mask'})
        name = mask.find('div', attrs={'class': 'name'})
        user_info_item['nickname'] = name.a['title']
        if name.em['title'] == u'男':
            user_info_item['sex'] = True
        else:
            user_info_item['sex'] = False

        intro = mask.find('div', attrs={'class': 'intro W_autocut'})
        if intro.text.strip() != '':  # the personal intro may be missing
            user_info_item['intro'] = intro.span['title']

        # follow and fan counts
        def parse_number(text):
            # extract the numeric count of follows/fans
            num = int(re.search(r'\d+', text).group())
            if text.find(u'万') >= 0:
                num *= 10000
            return num

        follow_text = nc_content.find('span', attrs={
            'class': 'c_follow W_fb'
        }).text.strip()
        user_info_item['n_follows'] = parse_number(follow_text)
        fans_text = nc_content.find('span', attrs={
            'class': 'c_fans W_fb'
        }).text.strip()
        user_info_item['n_fans'] = parse_number(fans_text)
        weibo_text = nc_content.find('span', attrs={
            'class': 'c_weibo W_fb'
        }).text.strip()
        user_info_item['n_weibo'] = parse_number(weibo_text)

        user_info_list = nc_content.find_all('li', attrs={'class': 'info_li'})
        if len(user_info_list) >= 1:  # the user card may not contain location info, e.g.: http://weibo.com/1947597977/info
            user_info_item['location'] = user_info_list[0].a['title']
            if len(user_info_list) >= 2:
                if user_info_list[1].text.find(u'毕业于') >= 0:
                    user_info_item['edu'] = user_info_list[1].a['title']
                    if len(user_info_list) >= 3:
                        user_info_item['work'] = user_info_list[2].a['title']
                else:
                    user_info_item['work'] = user_info_list[1].a['title']

        return user_info_item
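
The nested parse_number helper above turns counter strings such as u'3456' or u'56万' into integers ('万' means ten thousand). Lifting it to module level, as sketched here, would make that conversion easy to check in isolation, in line with the docstring's note about testability.

import re

def parse_number(text):
    """Convert a Weibo counter string to an int; a trailing '万' multiplies by 10000."""
    num = int(re.search(r'\d+', text).group())
    if text.find(u'万') >= 0:
        num *= 10000
    return num

# e.g. parse_number(u'3456') -> 3456, parse_number(u'56万') -> 560000
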
Example #8
    def parse_user_info(self, response): # default callback
        """ Crawl an ordinary user's profile information, e.g.:
        """
        #import ipdb; ipdb.set_trace()
        
        # queue the requests to be processed next
        request_list = self.make_request_list()
        for request in request_list:
            self.crawler.stats.inc_value('request_issued')
            yield request
        
        is_valid = True # whether the parsed item is valid
        log.msg('Parse url: %s' % response.url, level=log.INFO)
        #log.msg('Response body: %s' % response.body)
        
        # organization pages come back as an error page here
        if response.url.find('pagenotfound') > 0:
            # match up with the originating request
            #import ipdb; ipdb.set_trace()
            is_valid = False
            log.msg('Page not found: %s' % response.url, log.ERROR)
            uid = response.meta['uid']
            new_url = 'http://weibo.com/u/%s' % (uid) # go straight to the organization account's home page
            request = Request(url=new_url, callback=self.parse_org_info, meta={'uid':uid})
            self.crawler.stats.inc_value('request_issued')
            yield request
        
        # TODO: detect users reported as non-existent: http://weibo.com/sorry?usernotexists&code=100001
        # (even though the user actually does exist)
        # TODO: detect blocked accounts: http://weibo.com/sorry?userblock&is_viewer&code=20003
        # http://sass.weibo.com/accessdeny?uid=5445629123&ip=2682434316&location=1&callbackurl=http%3A%2F%2Fweibo.com%2Fu%2F2029154257
        
        # extract user info from the user's profile page
        user_info_item = UserInfoItem()
        user_info_item['is_org'] = False
        
        # extract uid and page_id from the raw response body
        m = re_UID.search(response.body)
        if m != None:
            user_info_item['uid'] = m.group(1)
        else:
            log.msg('Error parsing uid: %s' % response.url, log.ERROR)
            
        # extract page_id
        m = re_PAGEID.search(response.body)
        if m != None:
            user_info_item['page_id'] = m.group(1)
        else:
            log.msg('Error parsing page id: %s' % response.url, log.ERROR)
        
        # parse with bs4
        soup = beautiful_soup(response.body)
        
        new_style = False
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                        block_title = block_div.find('h4', attrs={'class': 'obj_name'}).text.strip()
                        inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    user_info_item['avatar'] = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    user_info_item['n_follows'] = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    user_info_item['n_fans'] = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    user_info_item['n_follows'] = int(tds[0].find('strong').text)
                    user_info_item['n_fans'] = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    user_info_item['avatar'] = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
                    
                    #import ipdb; ipdb.set_trace()
                    # determine whether the user is a verified account
                    photo_div = header_soup.find_all('div', attrs={'class':'pf_photo', 'node-type':'photo'})
                    if len(photo_div) > 0:
                        result = photo_div[0].find_all('a', attrs={'href':'http://verified.weibo.com/verify'})
                        if len(result) > 0:
                            user_info_item['is_verified'] = True
                        else:
                            user_info_item['is_verified'] = False
                    else:
                        log.msg('Can not find photo div: %s' % response.url, log.ERROR)
                    
            elif 'STK' in text:
                text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    user_info_item['avatar'] = soup.find('img')['src']
        
        profile_map = {
            u'昵称': {'field': 'nickname'},
            u'所在地': {'field': 'location'},
            u'性别': {'field': 'sex', 
                    'func': lambda s: True if s == u'男' else False},
            u'生日': {'field': 'birth'},
            u'博客': {'field': 'blog'},
            u'个性域名': {'field': 'site'},
            u'简介': {'field': 'intro'},
            u'邮箱': {'field': 'email'},
            u'QQ': {'field': 'qq'},
            u'MSN': {'field': 'msn'}
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    user_info_item[profile_map[k]['field']] = v
                    #setattr(user_info_item, profile_map[k]['field'], v)
                
        user_info_item['work'] = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = dict()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info['name'] = a.text
                            text = p.text
                            if '(' in text:
                                work_info['date'] = text.strip().split('(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info['location'] = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info['position'] = text.split(u':', 1)[1]
                            else:
                                work_info['detail'] = text
                    user_info_item['work'].append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = dict()
                    
                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info['name'] = a.text
                    if '(' in text:
                        work_info['date'] = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info['location'] = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info['position'] = l.split(u':', 1)[1]
                        else:
                            work_info['detail'] = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()
                    
                    user_info_item['work'].append(work_info)
            
        user_info_item['edu'] = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = dict()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info['name'] = a.text
                            if '(' in text:
                                edu_info['date'] = text.strip().split('(')[1].strip().strip(')')
                        else:
                            edu_info['detail'] = text
                            
                    user_info_item['edu'].append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)
                
                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx+1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos: end_pos]
                    
                    edu_info = dict()
                    edu_info['name'] = name
                    if '(' in text:
                        edu_info['date'] = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')')+1:]
                    text = text[end_pos:]
                    edu_info['detail'] = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    user_info_item['edu'].append(edu_info)
                    
        user_info_item['tags'] = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        user_info_item['tags'].append(a.text)
            else:
                for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'):
                    user_info_item['tags'].append(a.text.strip())
                
        # is_valid is True when the item should be emitted
        log.msg('parse %s finish' % response.url, log.INFO)
        
        # check whether the spider has been banned
        if not user_info_item['n_follows']:
            log.msg('The spider may have been banned.', log.ERROR)
        else:
            if is_valid:
                #print user_info_item
                yield user_info_item
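
The profile_map table above pairs each on-page label with a target item field plus an optional conversion function, falling back to the identity when no 'func' is given. Read in isolation, the lookup amounts to the small dispatch helper below; this is a sketch restating the logic above, not a separate part of the project.

def apply_profile_field(item, profile_map, label, raw_value):
    """Store raw_value on item under the mapped field, applying the optional converter."""
    if label not in profile_map:
        return
    entry = profile_map[label]
    convert = entry.get('func', lambda s: s)
    item[entry['field']] = convert(raw_value)
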
Example #9
 def parse_user_card_info_text(self, uid, html_data):
     """ 给定html body解析 user info, 用户信息包括:avatar, uid, nickname, desc, location
     NOTE: 将该解析单独作为一个函数有两个作用:1)便于单独测试;2)使得parse_user_card_info函数结构清晰
     """
     user_info_item = UserInfoItem()
     #user_info_item['raw_html'] = html_data # keep the full raw html
     user_info_item['uid'] = uid
     user_info_item['existed'] = True
     
     soup = beautiful_soup(html_data)
     
     nc_head = soup.find('div', attrs={'class':'nc_head'})
     nc_content = soup.find('div', attrs={'class':'nc_content'})
     
     pic_box = nc_head.find('div', attrs={'class':'pic_box'})
     alist = pic_box.find_all('a')
     
     url = alist[0]['href']
     m = re_UserCardUrl.search(url)
     user_info_item['username'] = m.group(1)
     if user_info_item['username'][:2] == 'u/':
         user_info_item['username'] = user_info_item['username'][2:]
         
     user_info_item['nickname'] = pic_box.a.img['title']
     user_info_item['avatar'] = pic_box.a.img['src']
     
     user_info_item['is_org'] = False
     if len(alist) > 1:
         user_info_item['is_verified'] = True
         # decide whether this account is an organization or a verified individual
         if alist[1].i['class'][1] == 'icon_pf_approve_co':
             user_info_item['is_org'] = True
     else:
         user_info_item['is_verified'] = False
                         
     mask = nc_head.find('div', attrs={'class':'mask'})
     name = mask.find('div', attrs={'class':'name'})
     user_info_item['nickname'] = name.a['title']
     if name.em['title'] == u'男':
         user_info_item['sex'] = True
     else:
         user_info_item['sex'] = False
         
     intro = mask.find('div', attrs={'class':'intro W_autocut'})
     if intro.text.strip() != '':  # the personal intro may be missing
         user_info_item['intro'] = intro.span['title']
     
     # follow and fan counts
     def parse_number(text):
         # extract the numeric count of follows/fans
         num = int(re.search(r'\d+', text).group())
         if text.find(u'万') >= 0:
             num *= 10000
         return num
         
     follow_text = nc_content.find('span', attrs={'class':'c_follow W_fb'}).text.strip()
     user_info_item['n_follows'] = parse_number(follow_text)
     fans_text = nc_content.find('span', attrs={'class':'c_fans W_fb'}).text.strip()
     user_info_item['n_fans'] = parse_number(fans_text)
     weibo_text = nc_content.find('span', attrs={'class':'c_weibo W_fb'}).text.strip()
     user_info_item['n_weibo'] = parse_number(weibo_text)
     
     user_info_list = nc_content.find_all('li', attrs={'class':'info_li'})
     if len(user_info_list) >= 1: # the user card may not contain location info, e.g.: http://weibo.com/1947597977/info
         user_info_item['location'] = user_info_list[0].a['title']
         if len(user_info_list) >= 2:
             if user_info_list[1].text.find(u'毕业于') >= 0:
                 user_info_item['edu'] = user_info_list[1].a['title']
                 if len(user_info_list) >= 3:
                     user_info_item['work'] = user_info_list[2].a['title']
             else:
                 user_info_item['work'] = user_info_list[1].a['title']
     
     return user_info_item
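
All of these examples populate a UserInfoItem whose definition is not shown. A hypothetical Scrapy Item reconstructed from the fields assigned across the snippets above would look roughly like this; the real class may declare more fields.

import scrapy

# Hypothetical reconstruction of UserInfoItem based on the fields used above
class UserInfoItem(scrapy.Item):
    uid = scrapy.Field()
    page_id = scrapy.Field()
    username = scrapy.Field()
    nickname = scrapy.Field()
    avatar = scrapy.Field()
    is_org = scrapy.Field()
    is_verified = scrapy.Field()
    existed = scrapy.Field()
    sex = scrapy.Field()
    birth = scrapy.Field()
    location = scrapy.Field()
    blog = scrapy.Field()
    site = scrapy.Field()
    intro = scrapy.Field()
    email = scrapy.Field()
    qq = scrapy.Field()
    msn = scrapy.Field()
    category = scrapy.Field()
    n_follows = scrapy.Field()
    n_fans = scrapy.Field()
    n_weibo = scrapy.Field()
    work = scrapy.Field()
    edu = scrapy.Field()
    tags = scrapy.Field()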