def load_item(self, user_item):
    """Build a GithubUserItem from one entry of a user listing.

    ``user_item`` is a selector positioned on a single list entry; every
    XPath below is evaluated relative to it. Fields that are absent from
    the markup come back as empty strings because the extracted fragments
    are simply joined together.

    Returns the populated item.
    """

    def joined(xpath):
        # Concatenate every fragment the relative XPath yields.
        return ''.join(user_item.xpath(xpath).extract())

    info = 'div[@class="user-list-info"]'

    # The username is the last path segment of the profile link; the
    # stored URL is then rebuilt from the host and that segment.
    profile_link = host + joined('a/@href')
    login = profile_link.split('/')[-1]

    user = GithubUserItem()
    user['url'] = host + "/" + login
    user['_id'] = user['username'] = login
    user['nickname'] = joined(
        info + '/a/following-sibling::text()[1]').strip()

    # The numeric id is the avatar file name without its query string.
    avatar_src = joined('a/img/@src')
    user['user_id'] = avatar_src.split("/")[-1].split("?")[0]

    user['location'] = joined(info + '/ul/li/span/text()').strip()
    user['email'] = joined(info + '/ul/li/span/a/text()')
    user['join_date'] = joined(info + '/ul/li/span/time/@datetime')
    user['update_time'] = str(datetime.now())
    return user
def parse_user_all_info(self, response):
    """Parse a profile page into a single GithubUserItem and yield it.

    The page is treated as an organization when it contains a ``div``
    whose ``itemtype`` is ``http://schema.org/Organization`` (``type`` is
    set to 1); otherwise it is treated as a regular user (``type`` 0).
    Text fields missing from the markup are stored as empty strings.

    Args:
        response: the downloaded page; only ``response.url`` and the
            markup handed to ``Selector`` are used.

    Yields:
        The populated GithubUserItem.
    """
    selector = Selector(response)

    def joined(xpath):
        # Concatenate every fragment the XPath yields ('' when none).
        return ''.join(selector.xpath(xpath).extract())

    user = GithubUserItem()
    user['_id'] = user['username'] = response.url.split('/')[-1]
    user['url'] = response.url
    user['update_time'] = str(datetime.now())

    is_org = selector.xpath(
        '//div[@itemtype="http://schema.org/Organization"]').extract()
    if is_org:
        org_meta = '//div[@class="org-header-info"]/ul/li[@class="meta-item"]'
        user['type'] = 1
        user['nickname'] = joined(
            '//div[@class="org-header-info"]/h1/span/text()')
        # The id is the avatar file name without its query string.
        user['user_id'] = joined(
            '//div[@class="org-header-wrapper"]/img/@src'
        ).split("/")[-1].split("?")[0]
        user['location'] = joined(
            org_meta + '/span[@itemprop="location"]/text()')
        user['website'] = joined(org_meta + '/a[@itemprop="url"]/text()')
        user['email'] = joined(org_meta + '/a[@itemprop="email"]/text()')
        # NOTE(review): this XPath has no trailing /text(), so the whole
        # <span> element markup is stored - confirm that is intended.
        user['member_num'] = joined(
            '//div[@class="org-module simple-box"]'
            '/h3[@class="org-module-title org-members-title"]/a/span[1]')
    else:
        vcard = '//div[@class="column one-fourth vcard"]'
        detail = '//ul[@class="vcard-details"]/li[@class="vcard-detail"]'
        user['type'] = 0
        user['nickname'] = joined(
            vcard + '/h1/span[@class="vcard-fullname"]/text()')
        user['user_id'] = joined(
            vcard + '/a[1]/@href').split("/")[-1].split("?")[0]
        user['company'] = joined(
            '//ul[@class="vcard-details"]/li[@itemprop="worksFor"]/text()')
        user['location'] = joined(
            '//ul[@class="vcard-details"]/li[@itemprop="homeLocation"]/text()')
        user['email'] = urllib.unquote(
            joined(detail + '/a[@class="email"]/text()'))
        user['website'] = joined(detail + '/a[@class="url"]/text()')
        user['join_date'] = joined(
            detail + '/time[@class="join-date"]/@datetime')

        stats = selector.xpath(
            vcard + '/div[@class="vcard-stats"]/a/strong/text()').extract()
        # Pad to three entries so a page exposing fewer counters yields
        # empty strings instead of raising IndexError and losing the item.
        stats = (stats + [''] * 3)[:3]
        (user['follower_num'],
         user['star_num'],
         user['followee_num']) = stats

        user['organizations'] = selector.xpath(
            '//div[@class="clearfix"]/a[@itemprop="follows"]/@href'
        ).extract()
    yield user
def parse_user(self, response):
    """Parse a profile page, yield the resulting item, then announce it.

    A page containing a ``div`` with ``itemtype``
    ``http://schema.org/Organization`` is recorded with ``type`` "Org";
    any other page is recorded as "User". Text fields missing from the
    markup end up as empty strings. After the item has been yielded (i.e.
    once the generator is resumed) a ``NEW:<username>`` line is printed.
    """
    selector = Selector(response)

    def joined(xpath):
        # Concatenate every fragment the XPath yields ('' when none).
        return ''.join(selector.xpath(xpath).extract())

    login = response.url.split('/')[-1]
    user = GithubUserItem()
    user['_id'] = user['username'] = login
    user['url'] = response.url

    org_marker = selector.xpath(
        '//div[@itemtype="http://schema.org/Organization"]').extract()
    if org_marker:
        meta_item = '//ul[@class="org-header-meta"]/li[@class="meta-item"]'
        user['type'] = "Org"
        user['nickname'] = joined(
            '//div[@class="org-header-info"]/h1/span/text()')
        user['location'] = joined(
            meta_item + '/span[@itemprop="location"]/text()')
        user['email'] = urllib.unquote(
            joined(meta_item + '/a[@itemprop="email"]/text()'))
        user['website'] = joined(meta_item + '/a[@itemprop="url"]/text()')
        user['member_num'] = joined(
            '//div[@class="org-module simple-box"]'
            '/h3[@class="org-module-title org-members-title"]/a/span[1]')
    else:
        vcard = '//div[@class="column one-fourth vcard"]'
        detail = '//ul[@class="vcard-details"]/li[@class="vcard-detail"]'
        stats = '//div[@class="vcard-stats"]'
        user['type'] = "User"
        # Last path segment of the joined hrefs, minus any query string.
        href_tail = joined(vcard + '/a/@href').split("/")[-1]
        user['user_id'] = href_tail.split("?")[0]
        user['nickname'] = joined(
            vcard + '/h1/span[@class="vcard-fullname"]/text()')
        user['company'] = joined(
            '//ul[@class="vcard-details"]/li/span[@itemprop="worksFor"]/text()')
        user['location'] = joined(
            '//ul[@class="vcard-details"]/li[@itemprop="homeLocation"]/text()')
        user['email'] = urllib.unquote(
            joined(detail + '/a[starts-with(@href,"mailto")]/@data-email'))
        user['website'] = joined(detail + '/a[@class="url"]/text()')
        user['join_date'] = joined(
            detail + '/span[@class="join-date"]/text()')
        user['follower_num'] = joined(stats + '/a[1]/strong/text()')
        user['star_num'] = joined(stats + '/a[2]/strong/text()')
        user['followee_num'] = joined(stats + '/a[3]/strong/text()')
        # NOTE: 'repo_num' is intentionally not populated here.
        org_links = selector.xpath(
            '//div[@class="vcard-orgs"]/div/a').extract()
        user['org_num'] = str(len(org_links))

    # Both kinds of page are timestamped last, just before being emitted.
    user['update_time'] = str(datetime.now())
    yield user
    print('NEW:%s' % user['username'])