def parse_user_details(self, response):
    # Organization pages share this URL shape; hand them off.
    if response.css('.org-name'):
        return self.parse_org_details(response, response.url)
    baseInfo_div = response.css(".vcard")[0]
    des = UserDetailItem()
    des["name"] = baseInfo_div.css(".vcard-fullname")
    des["login"] = baseInfo_div.css(".vcard-username")
    des["company"] = baseInfo_div.css('li[itemprop="worksFor"]')
    des["location"] = baseInfo_div.css('li[itemprop="homeLocation"]')
    des["email"] = baseInfo_div.css(".email")
    des["created_at"] = baseInfo_div.css(".join-date")
    des["blog"] = baseInfo_div.css(".url")
    # Flatten each selector to its text content; missing fields become None.
    for i in des:
        try:
            des[i] = des[i].xpath('string()').extract()[0]
        except IndexError:
            des[i] = None
    orgs = response.xpath(
        '//*[contains(@class, "avatar-group-item")]/@href').extract()
    des["orgs"] = orgs
    new_task(response.url + "/following")
    new_task(response.url + "/followers")
    new_task(response.url + "?tab=repositories")
    new_task("/".join(response.url.split("/")[:-1]) + "/stars/" +
             response.url.split("/")[-1])
    for i in orgs:
        new_task("https://github.com" + i)  # add orgs
    return des
def parse_org_details(self, response, url):
    org_details = OrgDetailItem()
    org_details["login"] = url.split("/")[-1]
    org_details["name"] = response.css(".org-name").xpath('string()').extract()
    try:
        org_details["description"] = response.css(
            ".org-description").xpath('string()').extract()[0]
    except IndexError:
        pass
    # The meta items appear in a fixed order: location, link, email.
    meta_soup = response.css(".meta-item")
    try:
        org_details["location"] = meta_soup[0].xpath('string()').extract()[0]
    except IndexError:
        pass
    try:
        org_details["link"] = meta_soup[1].xpath('string()').extract()[0]
    except IndexError:
        pass
    try:
        org_details["email"] = meta_soup[2].xpath('string()').extract()[0]
    except IndexError:
        pass
    new_task("/".join(url.split("/")[:-1]) + "/orgs/" +
             url.split("/")[-1] + "/people")
    new_task(url + "?page=1")
    return org_details
def _parse_topic_list_info(self, response):
    soup = BeautifulSoup(response.body)
    all_link = soup.findAll("link")
    base_topic_url = u"/api/topics/show.json?id={}"
    # Skip the first two <link> tags, which do not point at topics;
    # each remaining href ends in the topic id.
    for link_item in all_link[2:]:
        topic_id = link_item.attrs["href"].split("#")[0].split("/")[-1]
        new_task(base_topic_url.format(topic_id))
def get_repo_detail(self, response):
    repo_detail = RepoDetail()
    full_name = "/" + response.url.split("/")[-2] + "/" + \
        response.url.split("/")[-1]
    # Empty repositories carry a marker <h3> and nothing else worth parsing.
    try:
        if u'This repository is empty.' == response.xpath(
                "//h3/text()").extract()[0]:
            repo_detail["full_name"] = full_name
            repo_detail["empty"] = True
            return repo_detail
    except IndexError:
        pass
    try:
        repo_detail["description"] = response.css(
            ".repository-meta").xpath('string()').extract()[0]
    except IndexError:
        repo_detail["description"] = None
    if response.css("#readme"):
        repo_detail["readme"] = response.css("#readme").xpath(
            'string()').extract()[0]
    else:
        repo_detail["readme"] = ""
    try:
        repo_detail["fork"] = "/" + response.css(".fork-flag").xpath(
            './/a/text()').extract()[0]
    except IndexError:
        repo_detail["fork"] = None
    # Language stats, e.g. aria-label="Python 82.3%" -> {"Python": "82.3%"}.
    language_list = []
    for i in response.css(".language-color"):
        language_detail = {}
        l = i.xpath(".//@aria-label").extract()
        if l:
            l = l[0]
            language_detail[l.split()[0]] = l.split()[1]
            language_list.append(language_detail)
    nums = response.css(".num")
    repo_detail["master_commit_num"] = int(
        nums[0].xpath('string()').extract()[0].replace(",", "").replace(
            "+", ""))
    repo_detail["branch_num"] = int(
        nums[1].xpath('string()').extract()[0].replace(",", ""))
    repo_detail["release_num"] = int(
        nums[2].xpath('string()').extract()[0].replace(",", ""))
    repo_detail["language_list"] = language_list
    repo_detail["full_name"] = full_name
    counts = response.css('.social-count').xpath('text()').extract()
    repo_detail['watch_count'] = counts[0].strip().replace(',', '')
    repo_detail['star_count'] = counts[1].strip().replace(',', '')
    repo_detail['fork_count'] = counts[2].strip().replace(',', '')
    repo_detail["empty"] = False
    new_task(response.url + "/graphs/contributors-data")
    return repo_detail
def parse_directory(self, response):
    links = response.xpath(self.dir_link_xpath).extract()
    for link in links:
        if link.startswith('http'):
            if 'linkedin.com/pub' in link:
                # Public profile URLs go straight to the user set.
                r = get_redis()
                r.sadd('linkedin:user', link)
                continue
            new_task(link)
def get_user_repos(self, response):
    user_repos_item = UserRepos()
    repos = response.xpath(
        '//h3[@class="repo-list-name"]/a/@href').extract()
    user_repos_item["repos"] = repos
    user_repos_item["login"] = response.url.split("/")[-1].split("?")[0]
    for i in repos:
        new_task("https://github.com" + i)
    return user_repos_item
def __init__(self, *args, **kwargs):
    super(BlogSpider, self).__init__(*args, **kwargs)
    r = get_redis()
    r.delete('blog:tasks:fingerprint')
    # Seed the queue with at most 100 blog URLs from the list file.
    cnt = 0
    with open(os.path.join(cpath, 'github_user_blog_list')) as fi:
        for line in fi:
            cnt += 1
            if cnt > 100:
                break
            new_task(line.strip())
def add_pagination_newtask(self, soup):
    # `soup` is actually a Scrapy response/selector; the name is left over
    # from an earlier BeautifulSoup-based version.
    pages_soup = soup.css(".pagination")
    if pages_soup:
        for i in pages_soup[0].xpath('a/@href').extract():
            try:
                new_task("https://github.com" + i)
            except Exception:
                pass
def _parse_user_tags_info(self, response):
    tags = [
        self._f(k) for k in response.xpath(
            "//li[contains(@class, 'tagPopup')]/a/text()").extract()
    ]
    if tags:
        next_page = response.xpath("//li[@class='next']/a/@href").extract()
        if next_page:
            new_task(next_page[0])
        item = SegmentfaultUserTagItem()
        item["login"] = response.url.split("/")[4]
        item["tags"] = tags
        return item
def _parse_user_follower_info(self, response):
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    login = response.url.split("/")[4]
    for follower_item in response.xpath(
            "//div[contains(@class, 'stream-list__item')]/a/@href"):
        item = SegmentfaultUserFollowerItem()
        item["login"] = login
        item["follower"] = self._f(follower_item.extract()).split("/")[2]
        yield item
def __init__(self, *args, **kwargs):
    super(LinkedinSpider, self).__init__(*args, **kwargs)
    filepath = os.path.join(get_main_path(),
                            'spiders/linkedin/pub_china_user_list')
    r = get_redis()
    r.delete('linkedin:tasks:fingerprint')
    # Seed the queue with roughly the first 100 profile URLs.
    with open(filepath) as fi:
        cnt = 0
        for line in fi:
            new_task(line.strip())
            cnt += 1
            if cnt > 100:
                break
def _parse_user_bookmark_info(self, response):
    login = response.url.split("/")[4]
    for bookmark_list in response.xpath(
            "//section[contains(@class, 'stream-list__item')]"):
        if bookmark_list:
            item = SegmentfaultUserBookmarkItem()
            item["login"] = login
            link = self._f(
                bookmark_list.xpath(".//strong/a/@href")[0].extract())
            item["bookmark_id"] = link.split("/")[2]
            new_task(link)
            yield item
def get_org_repos(self, response):
    repos_list = response.xpath("//h3//a/@href").extract()
    org_repos_item = OrgRepos()
    org_repos_item["repos"] = repos_list
    org_repos_item["login"] = response.url.split("/")[-1].split("?")[0]
    self.add_pagination_newtask(response)
    for i in repos_list:
        new_task("https://github.com" + i)
    return org_repos_item
def get_org_members(self, response):
    member_list_soup = response.css(".member-username")
    org_members_item = OrgMembers()
    members_list = []
    for i in member_list_soup:
        members_list.append(i.xpath('string()').extract()[0])
    org_members_item['members'] = members_list
    for i in members_list:
        new_task("https://github.com/" + i)
    org_members_item["login"] = response.url.split("/")[-2]
    self.add_pagination_newtask(response)
    return org_members_item
def _parse_user_info(self, response):
    f_response = json.loads(response.body)
    if not f_response or f_response["status"] == "notfound":
        return
    item = V2ExUserItem()
    item["id"] = f_response["id"]
    item["login"] = f_response["username"]
    item["tagline"] = f_response["tagline"].strip()
    item["description"] = f_response["bio"].strip()
    item["avatar_url"] = f_response["avatar_large"]
    item["created_at"] = self._f_timestr(f_response["created"])
    # Also queue the member's HTML profile page.
    new_task(u"/member/{}".format(item["login"]))
    return item
def get_user_following(self, response):
    user_following_item = UserFollowing()
    following = response.xpath(
        '//div[@class="users"]//h3//a/@href').extract()
    user_following_item["following"] = following
    user_following_item["login"] = response.url.split("/")[-2]
    if "page=" not in response.url:
        # First page: read the total count and queue every page.
        following_num = int(
            response.css(".counter")[0].xpath(
                'string()').extract()[0].replace(",", ""))
        for j in xrange(following_num / self.following_num + 1):
            new_task(response.url + "?page=" + str(j + 1))
    for i in following:
        new_task("https://github.com" + i)
    return user_following_item
def parse(self, response):
    """
    Do not use yield in this method; if there are multiple
    results, just return them as a list.
    """
    ids = response.xpath('//a/text()').extract()
    for id in ids:
        # Tasks go into Redis, which deduplicates them there.
        new_task(id.encode('utf-8'), self.get_fingerprint)
        # Alternatively let scrapy schedule the request itself,
        # but then no deduplication happens:
        #yield Request('http://127.0.0.1:8080/%s' % id.encode('utf-8'))
    item = TestItem()
    item['uid'] = response.url
    return [item]
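# new_task is defined elsewhere in the project. Below is a minimal sketch of
# how its Redis-backed dedup plausibly works, inferred from the
# r.delete('...:tasks:fingerprint') calls in the spider constructors and the
# optional get_fingerprint argument above. The queue and fingerprint key
# names are assumptions, not the project's actual implementation.
import hashlib

def new_task(url, get_fingerprint=None, prefix='blog'):
    r = get_redis()
    fp = get_fingerprint(url) if get_fingerprint \
        else hashlib.sha1(url).hexdigest()
    # SADD returns 1 only for members not already in the set,
    # so each fingerprint is queued at most once.
    if r.sadd('%s:tasks:fingerprint' % prefix, fp):
        r.lpush('%s:tasks' % prefix, url)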
def get_user_followers(self, response):
    user_followers_item = UserFollowers()
    followers = response.css(".follow-list").xpath("li/a/@href").extract()
    user_followers_item["followers"] = followers
    user_followers_item["login"] = response.url.split("/")[-2]
    if "page=" not in response.url:
        # First page: read the total count and queue every page.
        followers_num = int(
            response.css(".counter")[0].xpath(
                'string()').extract()[0].replace(",", ""))
        for j in xrange(followers_num / self.followers_num + 1):
            new_task(response.url + "?page=" + str(j + 1))
    for i in followers:
        new_task("https://github.com" + i)
    return user_followers_item
def get_user_stared(self, response):
    user_stared = response.css(".repo-list-item")
    user_stars_item = UserStars()
    stars = []
    for i in user_stared:
        one_star = {}
        one_star["name"] = i.css("h3").css("a::attr(href)").extract()[0]
        one_star["time"] = i.css(".repo-list-meta")[0].css(
            "time::attr(datetime)").extract()[0]
        stars.append(one_star)
    user_stars_item["stars"] = stars
    user_stars_item["login"] = response.url.split("/")[-1].split("?")[0]
    # Follow the pagination links, if any.
    next_soup = response.css(".pagination")
    if next_soup:
        for i in next_soup[0].xpath("a/@href").extract():
            new_task(i)
    return user_stars_item
def _parse_topic_info(self, response):
    f_response = json.loads(response.body)
    if "error" in f_response:
        return None
    item = RubychinaTopicItem()
    item["id"] = f_response["id"]
    item["title"] = f_response["title"]
    item["created_at"] = f_response["created_at"]
    item["replies_count"] = f_response["replies_count"]
    item["owner_login"] = f_response["user"]["login"]
    item["content"] = f_response["body_html"]
    item["hits"] = f_response["hits"]
    # Queue every replier's profile for crawling.
    for reply_item in f_response["replies"]:
        new_task(reply_item["user"]["login"])
    return item
def _parse_topic_replies_info(self, response):
    f_response = json.loads(response.body)
    if not f_response:
        return
    topic_id = response.url.split("?")[-1]
    for reply_item in f_response:
        item = V2ExTopicReplyItem()
        item["reply_id"] = reply_item["id"]
        item["topic_id"] = topic_id
        item["thank_count"] = reply_item["thanks"]
        item["content"] = reply_item["content_rendered"].strip()
        item["author"] = reply_item["member"]["username"]
        if item["author"]:
            new_task("/api/members/show.json?id={}".format(
                reply_item["member"]["id"]))
        item["created_at"] = self._f_timestr(reply_item["created"])
        yield item
def parse_followees(self, response):
    ret = json.loads(response.body)
    item = ZhihuRelationItem()
    item['uid'] = self.get_uid_from_url(response.url)
    item['followees'] = []
    # One item per result page, keyed by uid and page offset.
    item['_id'] = '%s:%s' % (item['uid'],
                             self.get_offset_from_url(response.url))
    for people in ret['data']:
        item['followees'].append({
            'uid': people.get('id'),
            # Other fields available in the API response:
            #'email': people.get('email'),
            #'gender': people.get('gender'),
            #'sina_weibo_url': people.get('sina_weibo_url'),
            #'sina_weibo_name': people.get('sina_weibo_name'),
            #'headline': people.get('headline'),
            #'description': people.get('description'),
        })
        new_task(people['id'])
    yield item
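# get_uid_from_url and get_offset_from_url are defined elsewhere in the
# spider. Hypothetical reconstructions, assuming followee API URLs shaped
# like https://www.zhihu.com/api/v4/members/<uid>/followees?offset=20 --
# the real helpers may differ:
import urlparse

def get_uid_from_url(url):
    # .../members/<uid>/followees -> <uid>
    return urlparse.urlparse(url).path.rstrip('/').split('/')[-2]

def get_offset_from_url(url):
    query = urlparse.parse_qs(urlparse.urlparse(url).query)
    return query.get('offset', ['0'])[0]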
def _parse_topic_info(self, response):
    item = SegmentfaultTopicItem()
    item["question_id"] = response.url.split("/")[4]
    item["title"] = self._f(
        response.xpath("//h1[@id='questionTitle']/a/text()")[0].extract())
    item["author"] = self._f(
        response.xpath("//div[@class='author']/a/@href")
        [0].extract().split("/")[-1])
    item["created_at"] = self._format_timestr(
        self._f("".join([
            k.strip() for k in response.xpath(
                "//div[@class='author']/text()").extract()
        ]).split()[0]))
    # The sidebar lists follower, collection, and hit counts in order.
    topic_status = response.xpath(
        '//div[@class="col-md-3"][1]/ul/li/strong/text()')
    item["follower_count"] = self._format_count(
        self._f(topic_status[0].extract()))
    item["collection_count"] = self._format_count(
        self._f(topic_status[1].extract()))
    item["hit_count"] = self._format_count(
        self._f(topic_status[2].extract()))
    question_tag = response.xpath(
        "//article[@class='widget-question__item']")
    item["like_count"] = self._format_count(
        self._f(question_tag.xpath(
            ".//span[@class='count']/text()")[0].extract()))
    item["question"] = self._f(
        question_tag.xpath(".//div[@class='post-offset']/div[1]")
        [0].extract())
    item["tags"] = [
        self._f(k) for k in question_tag.xpath(
            ".//div[@class='post-offset']/ul[1]/li/a/text()").extract()
    ]
    answer_list = []
    for answer_item in response.xpath(
            "//article[contains(@class, 'widget-answers__item')]"):
        answer_id = self._f(answer_item.xpath(".//@id")[0].extract())
        answer_like_count = self._format_count(
            self._f(answer_item.xpath(
                ".//span[@class='count']/text()")[0].extract()))
        answer_accepted = bool(
            answer_item.xpath(".//div[@class='accepted-flag']"))
        answer_author = self._f(
            answer_item.xpath(".//div[@class='post-offset']/a/@href")
            [0].extract())
        answer_created_at = self._format_timestr(
            self._f(answer_item.xpath(
                ".//span[contains(@class, 'text-muted')]/text()")
                [0].extract().strip().split()[0]))
        answer = self._f(
            answer_item.xpath(".//div[contains(@class, 'answer')]")
            [0].extract())
        answer_list.append(
            dict(like_count=answer_like_count,
                 accepted=answer_accepted,
                 author=answer_author,
                 created_at=answer_created_at,
                 answer=answer,
                 answer_id=answer_id))
    item["answers"] = answer_list
    # Queue every user profile linked from the page.
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    return item
def _parse_article_info(self, response):
    item = SegmentfaultArticleItem()
    item["article_id"] = response.url.split("/")[4]
    item["title"] = self._f(
        response.xpath("//h1[@id='articleTitle']/a/text()")[0].extract())
    item["author"] = self._f(
        response.xpath("//div[@class='author']/a/@href")
        [0].extract().split("/")[-1])
    item["created_at"] = self._format_timestr(
        self._f(response.xpath("//div[@class='author']/text()")
                [-1].extract().strip().split()[0]))
    # The sidebar lists follower, collection, and hit counts in order.
    topic_status = response.xpath(
        '//div[@class="col-md-3"][1]/ul/li/strong/text()')
    item["follower_count"] = self._format_count(
        self._f(topic_status[0].extract()))
    item["collection_count"] = self._format_count(
        self._f(topic_status[1].extract()))
    item["hit_count"] = self._format_count(
        self._f(topic_status[2].extract()))
    item["article"] = self._f(
        response.xpath("//div[contains(@class, 'article')]")[0].extract())
    item["tags"] = list(set([
        self._f(k) for k in response.xpath(
            "//li[@class='tagPopup']/a/text()").extract()
    ]))
    # Queue every user profile linked from the page.
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    return item
def _parse_topic_info(self, response):
    f_response = json.loads(response.body)
    if not f_response:
        return
    f_response = f_response[0]
    item = V2ExTopicItem()
    item["topic_id"] = f_response["id"]
    item["title"] = f_response["title"]
    item["content"] = f_response["content_rendered"]
    item["author"] = f_response["member"]["username"]
    if item["author"]:
        new_task(u"/api/members/show.json?id={}".format(
            f_response["member"]["id"]))
    item["node_name"] = f_response["node"]["name"]
    item["created_at"] = self._f_timestr(f_response["created"])
    item["reply_count"] = f_response["replies"]
    if item["reply_count"]:
        new_task(u"/api/replies/show.json?topic_id={}".format(
            item["topic_id"]))
    return item
def _parse_user_info(self, response):
    item = SegmentfaultUserItem()
    profile_tag = response.xpath(
        "//div[contains(@class, 'profile-header')]")
    item["login"] = response.url.split("/")[4]
    item["avatar_url"] = self._f(
        profile_tag.xpath(".//img/@src")[0].extract())
    if not item["avatar_url"].startswith("http"):
        item["avatar_url"] = None
    item["name"] = self._f(profile_tag.xpath(".//h4/text()")[0].extract())
    item["social_contact"] = self._format_social_contact(
        profile_tag.xpath(".//li/a"))
    # Map the Chinese profile labels to item fields:
    # location, position, education, blog URL.
    profile_item_map = {
        u"所在城市": "location",
        u"现任职位": "position",
        u"院校专业": "education",
        u"个人网站": "blog_url",
    }
    item.update([(v, None) for _, v in profile_item_map.iteritems()])
    item["major"] = None
    for profile_item in response.xpath(
            "//ul[contains(@class, 'profile-links')]/li"):
        # Strip the markup, then split each "label:value" row on the colon.
        f_profile_item = [
            self._f(k) for k in re.sub(
                '<[^>]*>', '', profile_item.extract()).strip().split(u":")
        ]
        key = f_profile_item[0]
        value = " ".join(f_profile_item[1:])
        if key in profile_item_map:
            item[profile_item_map[key]] = value
    if item["education"]:
        # "school major" -> separate education and major fields.
        education_list = item["education"].split(" ")
        item["education"] = self._f(education_list[0])
        if len(education_list) > 1:
            item["major"] = self._f(u" ".join(education_list[1:]))
    item["follower_count"] = self._format_count(
        self._f(
            response.xpath("//a[@class='funsCount']/text()")[0].extract()))
    profile_detail_tag = response.xpath("//div[@class='col-md-4 profile']")
    reputation_count, emblem_count, like_count = [
        self._f(k.extract())
        for k in profile_detail_tag.xpath(".//strong/text()")[:3]
    ]
    item["reputation_count"] = self._format_count(reputation_count)
    item["emblem_count"] = self._format_count(emblem_count)
    item["like_count"] = self._format_count(like_count)
    item["introduction"] = self._f(
        profile_detail_tag.xpath(
            ".//div[contains(@class, 'profile-bio')]")[0].extract())
    profile_info_list = response.xpath(
        "//div[@class='col-md-4 profile']/following-sibling::div/ul/li")
    item["answer_count"] = self._format_count(
        self._f(profile_info_list[3].xpath(".//span/text()")[0].extract()))
    item["question_count"] = self._format_count(
        self._f(profile_info_list[4].xpath(".//span/text()")[0].extract()))
    item["article_count"] = self._format_count(
        self._f(profile_info_list[5].xpath(".//span/text()")[0].extract()))
    item["following_tag_count"] = self._format_count(
        self._f(profile_info_list[-1].xpath(
            ".//span[@class='badge']/text()")[0].extract()))
    # Queue every paginated listing for this user.
    user_href = "/" + "/".join(response.url.split("/")[3:])
    if item["answer_count"]:
        pages = int(
            math.ceil(item["answer_count"] /
                      float(self.answer_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/answers?page={}".format(user_href, i))
    if item["question_count"]:
        pages = int(
            math.ceil(item["question_count"] /
                      float(self.question_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/questions?page={}".format(user_href, i))
    if item["article_count"]:
        pages = int(
            math.ceil(item["article_count"] /
                      float(self.article_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/blogs?page={}".format(user_href, i))
    if item["following_tag_count"]:
        pages = int(
            math.ceil(item["following_tag_count"] /
                      float(self.following_tag_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/following/tags?page={}".format(user_href, i))
    if item["follower_count"]:
        pages = int(
            math.ceil(item["follower_count"] /
                      float(self.follower_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/followed/users?page={}".format(user_href, i))
    # tags and bookmarks
    new_task("{}/tags?page={}".format(user_href, 1))
    new_task("{}/bookmarks?page={}".format(user_href, 1))
    new_task("{}/bookmarks?page={}".format(user_href, 2))
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    return item
def _parse_toplic_and_article_info(self, response):
    xpath_list = "//h2[@class='title']/a/@href"
    for topic_href in response.xpath(xpath_list):
        new_task(topic_href.extract().strip())