class BlogSpider(BaseSpider):
    name = 'blog'
    redis_start_urls = tasks_generator(build_url)
    start_urls = redis_start_urls

    # ref: http://blog.trojanhunter.com/2012/09/26/the-best-regex-to-validate-an-email-address/
    email_regex = '''(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'''

    def __init__(self, *args, **kwargs):
        super(BlogSpider, self).__init__(*args, **kwargs)
        cnt = 0
        r = get_redis()
        r.delete('blog:tasks:fingerprint')
        # seed the queue with at most 100 blog URLs from the local list
        with open(os.path.join(cpath, 'github_user_blog_list')) as fi:
            for line in fi:
                cnt += 1
                if cnt > 100:
                    break
                new_task(line.strip())

    def parse(self, response):
        yields = []
        hrefs = response.xpath('//a/@href').extract()
        for href in hrefs:
            href = href.encode('utf-8')
            """
            if href.endswith('.png') or href.endswith('.css') or href.endswith('.js') \
                    or href.endswith('.jpeg') or href.endswith('gif') or href.endswith('.jpg') \
                    or href.endswith('.tar') or href.endswith('.gz') or href.endswith('.exe') \
                    or href.endswith('.apk'):
                continue
            """
            # follow absolute links on the same host or a sibling subdomain;
            # resolve relative links against the current host
            if href.startswith('http'):
                hostname1 = urlparse(href).hostname
                hostname2 = urlparse(response.url).hostname
                if hostname1 == hostname2:
                    yields.append(Request(href))
                    continue
                hostname_arr1 = hostname1.split('.')
                hostname_arr2 = hostname2.split('.')
                if len(hostname_arr1) >= 3 and len(hostname_arr2) >= 3 \
                        and hostname_arr1[1:] == hostname_arr2[1:]:
                    yields.append(Request(href))
            else:
                yields.append(Request('http://%s/%s' % (urlparse(response.url).hostname, href)))
        email = response.selector.re(self.email_regex)
        #item = BlogItem()
        item = HTMLItem()
        item['uid'] = response.url
        item['html'] = response.body
        if email:
            item['other'] = email
        yields.append(item)
        return yields

    def get_fingerprint(self, task_string):
        return task_string

class TestSpider(BaseSpider):
    # The spider's name; scrapy uses it when you run the crawl.
    name = 'test'
    seeds_start_urls = ['http://127.0.0.1:8080/%d' % i for i in xrange(10)]
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(seeds_start_urls, redis_start_urls)

    def __init__(self, *args, **kwargs):
        super(TestSpider, self).__init__(*args, **kwargs)
        """
        Spider initialization.
        """
        pass

    def parse(self, response):
        """
        Do not use yield in this method; if there are multiple results,
        just return them as a list.
        """
        ids = response.xpath('//a/text()').extract()
        for id in ids:
            # The task goes into redis, which deduplicates it.
            new_task(id.encode('utf-8'), self.get_fingerprint)
            # Or let scrapy schedule the request itself, but then no deduplication is done:
            #yield Request('http://127.0.0.1:8080/%s' % id.encode('utf-8'))
        item = TestItem()
        item['uid'] = response.url
        # yield item
        return [item]

    def get_fingerprint(self, task_string):
        return task_string

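# All of these spiders rely on the same redis-backed task queue helpers
# (get_redis, new_task, tasks_generator, build_url) imported from the
# project's base modules and not shown in this section. The sketch below is
# only an illustration of how that pattern could be wired up, matching the
# behaviour described in TestSpider (tasks deduplicated by fingerprint in
# redis, start URLs drained from the queue). The key names, signatures, and
# connection settings here are assumptions, not the project's actual code.
import redis

REDIS_TASK_KEY = 'test:tasks'                     # assumed pending-task list
REDIS_FINGERPRINT_KEY = 'test:tasks:fingerprint'  # assumed dedup set


def get_redis():
    # One shared connection; host and port are assumptions.
    return redis.StrictRedis(host='127.0.0.1', port=6379)


def new_task(task_string, get_fingerprint=lambda s: s):
    """Queue a task unless its fingerprint has been seen before."""
    r = get_redis()
    fp = get_fingerprint(task_string)
    # SADD returns 1 only for members not already in the set,
    # which is where the deduplication happens.
    if r.sadd(REDIS_FINGERPRINT_KEY, fp):
        r.rpush(REDIS_TASK_KEY, task_string)


def build_url(task_string):
    # Each spider maps a task string to a crawlable URL; this one mirrors
    # TestSpider's local test server (an assumed example mapping).
    return 'http://127.0.0.1:8080/%s' % task_string


def tasks_generator(build_url, block_seconds=5):
    """Yield start URLs by popping queued tasks from redis."""
    r = get_redis()
    while True:
        task = r.blpop(REDIS_TASK_KEY, timeout=block_seconds)
        if task is None:
            break
        yield build_url(task[1])
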
class GithubSpider(scrapy.Spider):
    name = "github"
    allowed_domains = ["github.com"]
    seeds_start_urls = (
        'https://github.com/BYVoid',
        'https://github.com/fqj1994',
        'https://github.com/binhe22',
        'https://github.com/atupal',
        'https://github.com/fqj1994',
        'https://github.com/BYVoid',
    )
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(seeds_start_urls, redis_start_urls)
    following_num = 51
    followers_num = 51

    def parse(self, response):
        func_type = self.get_func(response.url)
        print response.url
        response_type_map = {
            "user_detail": self.parse_user_details,
            "followers": self.get_user_followers,
            "following": self.get_user_following,
            "repos": self.get_user_repos,
            "stars": self.get_user_stared,
            "repo_detail": self.get_repo_detail,
            "contributors": self.get_repo_contributor,
            "org_people": self.get_org_members,
            "org_repos": self.get_org_repos,
        }
        print response.url, func_type
        try:
            return response_type_map[func_type](response)
        except Exception, e:
            logger.error(response.url + " " + str(e))
            logger.error(str(response.status) + str(response.body))

class SegmentfaultAllSpider(twcrawler.spiders.base.base_spider.BaseSpider):

    def get_fingerprint(self, url):
        return url

    name = "segmentfault_all"
    allowed_domains = ["segmentfault.com"]
    base_topiclist_url = u"http://segmentfault.com/questions/newest?page={}"
    base_articlelist_url = u"http://segmentfault.com/blogs/newest?page={}"
    answer_items_each_page = 20
    question_items_each_page = 20
    article_items_each_page = 20
    following_tag_items_each_page = 20
    follower_items_each_page = 20
    max_topiclist_page = 800
    #max_topiclist_page = 2
    max_article_page = 500
    #max_article_page = 2
    # TODO(test)
    #topiclist_urls = [base_topiclist_url.format(i) for i in xrange(1, max_topiclist_page)]
    #articlelist_urls = [base_articlelist_url.format(i) for i in xrange(1, max_article_page)]
    #topiclist_urls.extend(articlelist_urls)
    #topiclist_urls = []
    redis_start_urls = tasks_generator(build_url)
    #start_urls = itertools.chain(topiclist_urls, redis_start_urls)
    start_urls = itertools.chain(all_segmentfault_users, redis_start_urls)

    def _format_timestr(self, timestr):
        # timestr examples: "2天前" (2 days ago), "2小时前" (2 hours ago),
        # "3月18日" (March 18), "2014年12月09日" (December 9, 2014)
        timestr = re.sub(ur"(年|月|日)", "-", timestr)
        if timestr.endswith("-"):
            timestr = timestr[:-1]
            result_time = parser.parse(timestr)
        else:
            timegroup = re.search(r"^([0-9]{1,})(.*)$", timestr)
            if timegroup.group(2) == u"小时前":      # hours ago
                f_time = datetime.datetime.now() - datetime.timedelta(
                    hours=int(timegroup.group(1)))
            elif timegroup.group(2) == u"分钟前":    # minutes ago
                f_time = datetime.datetime.now() - datetime.timedelta(
                    minutes=int(timegroup.group(1)))
            elif timegroup.group(2) == u"天前":      # days ago
                f_time = datetime.datetime.now() - datetime.timedelta(
                    days=int(timegroup.group(1)))
            else:
                logging.error(("EEEEEEEEEEEE", timestr))
                raise
            result_time = datetime.datetime(year=f_time.year,
                                            month=f_time.month,
                                            day=f_time.day,
                                            hour=f_time.hour,
                                            minute=f_time.minute)
        return result_time.strftime("%Y-%m-%d %H:%M:%S")

    def _format_count(self, count):
        if count.startswith("-"):
            negative = -1
            count = count[1:]
        else:
            negative = 1
        if count.endswith("k"):
            return int(float(count[:-1]) * 1000) * negative
        if count.endswith("w"):
            return int(float(count[:-1]) * 10000) * negative
        if re.match(r"^[0-9]{1,}$", count):
            return int(count) * negative
        else:
            logging.error(("EEEEEEEEEEEEEE", count))
            raise

    def parse(self, response):
        f_url = response.url.split("?")[0].split("/")
        rep_type = f_url[3]
        if len(f_url) > 5:
            child_rep_type = f_url[5]
            rep_type = u"{}_{}".format(rep_type, child_rep_type)
            if len(f_url) > 6:
                rep_type = u"{}_{}".format(rep_type, f_url[6])
        response_type_map = {
            "questions": self._parse_toplic_and_article_info,
            "blogs": self._parse_toplic_and_article_info,
            "q": self._parse_topic_info,
            "u": self._parse_user_info,
            "a": self._parse_article_info,
            "bookmark": self._parse_bookmark_info,
            "u_answers": self._parse_user_answer_info,
            "u_questions": self._parse_user_question_info,
            "u_blogs": self._parse_user_article_info,
            "u_following_tags": self._parse_user_following_tags_info,
            "u_bookmarks": self._parse_user_bookmark_info,
            "u_followed_users": self._parse_user_follower_info,
            "u_tags": self._parse_user_tags_info,
        }
        return response_type_map[rep_type](response)

    def _format_social_contact(self, social_item_list):
        social_contact = dict()
        for social_item in social_item_list:
            key = self._f(
                social_item.xpath(".//@class")[0].extract().split("-")[-1])
            value = self._f(social_item.xpath(".//@href")[0].extract())
            if value != u"javascript:void(0);":
                social_contact[key] = value
        return social_contact

    def _f(self, val):
        return val.strip()

    def _parse_toplic_and_article_info(self, response):
        xpath_list = "//h2[@class='title']/a/@href"
        for topic_href in response.xpath(xpath_list):
            new_task(topic_href.extract().strip())

    def _parse_topic_info(self, response):
        item = SegmentfaultTopicItem()
        item["question_id"] = response.url.split("/")[4]
        item["title"] = self._f(
            response.xpath("//h1[@id='questionTitle']/a/text()")[0].extract())
        item["author"] = self._f(
            response.xpath("//div[@class='author']/a/@href")
            [0].extract().split("/")[-1])
        item["created_at"] = self._format_timestr(
            self._f("".join([
                k.strip() for k in response.xpath(
                    "//div[@class='author']/text()").extract()
            ]).split()[0]))
        topic_status = response.xpath(
            '//div[@class="col-md-3"][1]/ul/li/strong/text()')
        item["follower_count"] = self._format_count(
            self._f(topic_status[0].extract()))
        item["collection_count"] = self._format_count(
            self._f(topic_status[1].extract()))
        item["hit_count"] = self._format_count(
            self._f(topic_status[2].extract()))
        question_tag = response.xpath(
            "//article[@class='widget-question__item']")
        item["like_count"] = self._format_count(
            self._f(
                question_tag.xpath(".//span[@class='count']/text()")
                [0].extract()))
        item["question"] = self._f(
            question_tag.xpath(".//div[@class='post-offset']/div[1]")
            [0].extract())
        item["tags"] = [
            self._f(k) for k in question_tag.xpath(
                ".//div[@class='post-offset']/ul[1]/li/a/text()").extract()
        ]
        answer_list_tag = response.xpath(
            "//article[contains(@class, 'widget-answers__item')]")
        answer_list = []
        for answer_item in answer_list_tag:
            answer_id = self._f(answer_item.xpath(".//@id")[0].extract())
            answer_like_count = self._format_count(
                self._f(
                    answer_item.xpath(".//span[@class='count']/text()")
                    [0].extract()))
            answer_accepted = bool(
                answer_item.xpath(".//div[@class='accepted-flag']"))
            answer_author = self._f(
                answer_item.xpath(".//div[@class='post-offset']/a/@href")
                [0].extract())
            answer_created_at = self._format_timestr(
                self._f(
                    answer_item.xpath(
                        ".//span[contains(@class, 'text-muted')]/text()")
                    [0].extract().strip().split()[0]))
            answer = self._f(
                answer_item.xpath(".//div[contains(@class, 'answer')]")
                [0].extract())
            answer_list.append(
                dict(like_count=answer_like_count,
                     accepted=answer_accepted,
                     author=answer_author,
                     created_at=answer_created_at,
                     answer=answer,
                     answer_id=answer_id))
        item["answers"] = answer_list
        # add new tasks for every user linked from the page
        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)
        return item

    def _parse_user_info(self, response):
        item = SegmentfaultUserItem()
        profile_tag = response.xpath(
            "//div[contains(@class, 'profile-header')]")
        item["login"] = response.url.split("/")[4]
        item["avatar_url"] = self._f(
            profile_tag.xpath(".//img/@src")[0].extract())
        if not item["avatar_url"].startswith("http"):
            item["avatar_url"] = None
        item["name"] = self._f(profile_tag.xpath(".//h4/text()")[0].extract())
        item["social_contact"] = self._format_social_contact(
            profile_tag.xpath(".//li/a"))
        profile_item_map = {
            u"所在城市": "location",   # city of residence
            u"现任职位": "position",   # current position
            u"院校专业": "education",  # school and major
            u"个人网站": "blog_url",   # personal website
        }
        item.update([(v, None) for _, v in profile_item_map.iteritems()])
        item["major"] = None
        for profile_item in response.xpath(
                "//ul[contains(@class, 'profile-links')]/li"):
            f_profile_item = [
                self._f(k) for k in re.sub(
                    '<[^>]*>', '', profile_item.extract()).strip().split(u":")
            ]
            key = f_profile_item[0]
            value = " ".join(f_profile_item[1:])
            if key in profile_item_map:
                item[profile_item_map[key]] = value
        if item["education"]:
            education_list = item["education"].split(" ")
            item["education"] = self._f(education_list[0])
            if len(education_list) > 1:
                item["major"] = self._f(u" ".join(education_list[1:]))
        item["follower_count"] = self._format_count(
            self._f(
                response.xpath("//a[@class='funsCount']/text()")[0].extract()))
        profile_detail_tag = response.xpath("//div[@class='col-md-4 profile']")
        reputation_count, emblem_count, like_count = [
            self._f(k.extract())
            for k in profile_detail_tag.xpath(".//strong/text()")[:3]
        ]
        item["reputation_count"] = self._format_count(reputation_count)
        item["emblem_count"] = self._format_count(emblem_count)
        item["like_count"] = self._format_count(like_count)
        item["introduction"] = self._f(
            profile_detail_tag.xpath(".//div[contains(@class, 'profile-bio')]")
            [0].extract())
        profile_info_list = response.xpath(
            "//div[@class='col-md-4 profile']/following-sibling::div/ul/li")
        item["answer_count"] = self._format_count(
            self._f(profile_info_list[3].xpath(".//span/text()")[0].extract()))
        item["question_count"] = self._format_count(
            self._f(profile_info_list[4].xpath(".//span/text()")[0].extract()))
        item["article_count"] = self._format_count(
            self._f(profile_info_list[5].xpath(".//span/text()")[0].extract()))
        item["following_tag_count"] = self._format_count(
            self._f(profile_info_list[-1].xpath(
                ".//span[@class='badge']/text()")[0].extract()))
        user_href = "/" + "/".join(response.url.split("/")[3:])
        if item["answer_count"]:
            pages = int(
                math.ceil(item["answer_count"] /
                          float(self.answer_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/answers?page={}".format(user_href, i))
        if item["question_count"]:
            pages = int(
                math.ceil(item["question_count"] /
                          float(self.question_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/questions?page={}".format(user_href, i))
        if item["article_count"]:
            pages = int(
                math.ceil(item["article_count"] /
                          float(self.article_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/blogs?page={}".format(user_href, i))
        if item["following_tag_count"]:
            pages = int(
                math.ceil(item["following_tag_count"] /
                          float(self.following_tag_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/following/tags?page={}".format(user_href, i))
        if item["follower_count"]:
            pages = int(
                math.ceil(item["follower_count"] /
                          float(self.follower_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/followed/users?page={}".format(user_href, i))
        # tags
        new_task("{}/tags?page={}".format(user_href, 1))
        new_task("{}/bookmarks?page={}".format(user_href, 1))
        new_task("{}/bookmarks?page={}".format(user_href, 2))
        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)
        return item

    def _parse_user_answer_info(self, response):
        login = response.url.split("/")[4]
        for answer_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultUserAnswerItem()
            answer_link = self._f(answer_item.extract()).split("/")
            item["question_id"] = answer_link[2]
            item["answer_id"] = answer_link[3]
            item["login"] = login
            yield item

    def _parse_user_question_info(self, response):
        login = response.url.split("/")[4]
        for question_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultUserQuestionItem()
            question_link = self._f(question_item.extract()).split("/")
            item["question_id"] = question_link[2]
            item["login"] = login
            yield item

    def _parse_user_article_info(self, response):
        login = response.url.split("/")[4]
        for article_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultUserArticleItem()
            article_link = self._f(article_item.extract()).split("/")
            item["article_id"] = article_link[2]
            item["login"] = login
            yield item

    def _parse_user_following_tags_info(self, response):
        login = response.url.split("/")[4]
        item = SegmentfaultUserFollowingTagItem()
        item["login"] = login
        item["following_tags"] = list(
            set([
                self._f(k) for k in response.xpath(
                    "//h4[contains(@class, 'h5')]/a/text()").extract()
            ]))
        return item

    def _parse_article_info(self, response):
        item = SegmentfaultArticleItem()
        item["article_id"] = response.url.split("/")[4]
        item["title"] = self._f(
            response.xpath("//h1[@id='articleTitle']/a/text()")[0].extract())
        item["author"] = self._f(
            response.xpath("//div[@class='author']/a/@href")
            [0].extract().split("/")[-1])
        item["created_at"] = self._format_timestr(
            self._f(
                response.xpath("//div[@class='author']/text()")
                [-1].extract().strip().split()[0]))
        topic_status = response.xpath(
            '//div[@class="col-md-3"][1]/ul/li/strong/text()')
        item["follower_count"] = self._format_count(
            self._f(topic_status[0].extract()))
        item["collection_count"] = self._format_count(
            self._f(topic_status[1].extract()))
        item["hit_count"] = self._format_count(
            self._f(topic_status[2].extract()))
        item["article"] = self._f(
            response.xpath("//div[contains(@class, 'article')]")[0].extract())
        item["tags"] = list(
            set([
                self._f(k) for k in response.xpath(
                    "//li[@class='tagPopup']/a/text()").extract()
            ]))
        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)
        return item

    def _parse_user_bookmark_info(self, response):
        login = response.url.split("/")[4]
        for bookmark_list in response.xpath(
                "//section[contains(@class, 'stream-list__item')]"):
            if bookmark_list:
                item = SegmentfaultUserBookmarkItem()
                item["login"] = login
                link = self._f(
                    bookmark_list.xpath(".//strong/a/@href")[0].extract())
                item["bookmark_id"] = link.split("/")[2]
                new_task(link)
                yield item

    def _parse_bookmark_info(self, response):
        bookmark_id = response.url.split("/")[4]
        for bookmark_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultBookmarkItem()
            bookmark_link = self._f(bookmark_item.extract()).split("/")
            item["bookmark_id"] = bookmark_id
            item["bookmark_list"] = dict(type=bookmark_link[1],
                                         id=bookmark_link[2])
            yield item

    def _parse_user_follower_info(self, response):
        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)
        login = response.url.split("/")[4]
        for follower_item in response.xpath(
                "//div[contains(@class, 'stream-list__item')]/a/@href"):
            item = SegmentfaultUserFollowerItem()
            item["login"] = login
            item["follower"] = self._f(follower_item.extract()).split("/")[2]
            yield item

    def _parse_user_tags_info(self, response):
        tags = [
            self._f(k) for k in response.xpath(
                "//li[contains(@class, 'tagPopup')]/a/text()").extract()
        ]
        if tags:
            next_page = response.xpath("//li[@class='next']/a/@href").extract()
            if next_page:
                new_task(next_page[0])
        item = SegmentfaultUserTagItem()
        item["login"] = response.url.split("/")[4]
        item["tags"] = tags
        return item

class StackoverflowQuestionSpider(scrapy.Spider):
    handle_httpstatus_list = [
        201, 202, 203, 204, 205, 206,
        400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
        410, 411, 412, 413, 414, 415, 416, 417,
        500, 501, 502, 503, 504, 505,
    ]
    name = "stackoverflow_question"
    allowed_domains = ["stackoverflow.com"]
    start = 4
    #end = 28000000
    end = 104
    #start_urls = (
    #    'http://stackoverflow.com/feeds/question/%d'%i for i in xrange(start, end)
    #)
    start_urls = tasks_generator(build_url)

    id_xpath = '//entry/id'
    rank_xpath = '//entry/rank'
    title_xpath = '//entry/title'
    tag_xpath = '//entry/category/@term'
    author_name_xpath = '//entry/author/name'
    author_uri_xpath = '//entry/author/uri'
    link_xpath = '//entry/link/@href'
    published_xpath = '//entry/published'
    updated_xpath = '//entry/updated'
    content_xpath = '//entry/summary'

    HTML_200_STRING = []
    HTML_404_STRING = []
    HTML_MOBILE_STRING = []

    def parse(self, response):
        if response.status in self.handle_httpstatus_list:
            return
        if 'StackExchange.ready' in response.body and "Page Not Found" in response.body:
            return
        response.selector.remove_namespaces()
        ids = extract_text_null(self, 'id', response)
        ranks = extract_text_null(self, 'rank', response)
        titles = extract_text_null(self, 'title', response)
        tags = response.xpath(self.tag_xpath).extract()
        author_names = extract_text_null(self, 'author_name', response)
        author_uris = extract_text_null(self, 'author_uri', response)
        links = response.xpath(self.link_xpath).extract()
        publisheds = extract_text_null(self, 'published', response)
        updateds = extract_text_null(self, 'updated', response)
        contents = extract_text_null(self, 'content', response)

        item = StackoverflowQuestionItem()
        item['uid'] = response.url.rstrip('/').split('/')[-1]
        item['rank'] = ranks[0]
        item['title'] = titles[0]
        item['tags'] = tags
        item['author_name'] = author_names[0]
        item['author_uri'] = author_uris[0]
        item['author_uid'] = author_uris[0].split('/')[-1]
        item['link'] = links[0]
        item['published'] = publisheds[0]
        item['updated'] = updateds[0]
        item['content'] = contents[0]
        item['answers'] = []

        pipeline = metric.get_redis().pipeline()
        for i in xrange(1, len(ids)):
            answer = {}
            answer['uid'] = ids[i].split('#')[-1]
            pipeline.hincrby(':'.join([metric.metric_key, 'answer']),
                             answer['uid'], 1)
            answer['rank'] = ranks[i]
            answer['author_name'] = author_names[i]
            answer['author_uri'] = author_uris[i]
            answer['author_uid'] = author_uris[i].split('/')[-1]
            answer['link'] = links[i]
            answer['published'] = publisheds[i]
            answer['updated'] = updateds[i]
            answer['content'] = contents[i]
            item['answers'].append(answer)
        pipeline.execute()
        return item

    def parse_list(self, response):
        pass

    def parse_page(self, response):
        pass

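# StackoverflowQuestionSpider depends on an extract_text_null helper defined
# elsewhere in the project and not shown here. A plausible sketch follows,
# under the assumption that it resolves the spider's `<name>_xpath` attribute
# and extracts one text value per <entry>, padding missing nodes with None so
# the parallel lists (ids, ranks, titles, ...) stay index-aligned. This is an
# illustration of the assumed behaviour, not the project's actual helper.
def extract_text_null(spider, name, response):
    xpath = getattr(spider, '%s_xpath' % name)
    values = []
    for entry in response.xpath('//entry'):
        # The class attributes are absolute paths rooted at //entry, so reuse
        # the tail of the expression relative to the current entry node.
        node = entry.xpath('.' + xpath[len('//entry'):] + '/text()')
        values.append(node[0].extract() if node else None)
    return values
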
class LinkedinSpider(BaseSpider):
    name = 'linkedin'
    seeds_start_urls = (('http://cn.linkedin.com/directory/people-%s/' % i)
                        for i in string.lowercase)
    seeds_start_urls = itertools.chain(
        seeds_start_urls,
        (('http://cn.linkedin.com/directory/people-%d/' % i)
         for i in xrange(1, 27)))
    seeds_start_urls = []
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(seeds_start_urls, redis_start_urls)

    full_name_xpath = '//span[@class="full-name"]/text()'
    title_xpath = '//p[@class="title"]/text()'
    location_xpath = '//a[@name="location"]/text() | //span[@class="locality"]/text()'
    industry_xpath = '//a[@name="industry"]/text() | //dd[@class="industry"]/text()'
    language_xpath = 'string(//div[@id="languages"])'
    experience_header_h4_xpath = '//span[@class="experience-date-locale"]/preceding-sibling::*//h4'
    #experience_header_h5_xpath = 'string(//span[@class="experience-date-locale"]/preceding-sibling::*//h5)'
    experience_header_h5_xpath = '//span[@class="experience-date-locale"]/preceding-sibling::*//h5//a[@dir="auto"]'
    experience_time_xpath = '//span[@class="experience-date-locale"]'
    education_school_xpath = '//span[@class="education-date"]/preceding-sibling::header//h4'
    education_degree_xpath = '//span[@class="education-date"]/preceding-sibling::header//*[@class="degree"]/text()'
    education_major_xpath = '//span[@class="education-date"]/preceding-sibling::header//*[@class="major"]'
    education_date_xpath = '//span[@class="education-date"]'
    project_name_xpath = '//span[@class="projects-date"]/preceding-sibling::*//span[@dir="auto"]/text()'
    project_date_xpath = '//span[@class="projects-date"]'
    project_detail_xpath = '//span[@class="projects-date"]/following-sibling::p/text()'
    summary_xpath = 'string(//div[@class="summary"]/p)'
    skill_num_endorsements_xpath = '//span[@class="num-endorsements"]/text()'
    skill_name_xpath = '//span[@class="endorse-item-name"]/*/text()'
    member_connections_xpath = '//div[@class="member-connections"]/strong/text()'
    dir_link_xpath = '//li[@class="content"]/a/@href'

    def __init__(self, *args, **kwargs):
        super(LinkedinSpider, self).__init__(*args, **kwargs)
        filepath = os.path.join(get_main_path(),
                                'spiders/linkedin/pub_china_user_list')
        r = get_redis()
        r.delete('linkedin:tasks:fingerprint')
        with open(filepath) as fi:
            cnt = 0
            for line in fi:
                new_task(line.strip())
                cnt += 1
                if cnt > 100:
                    break

    def parse(self, response):
        if 'linkedin.com/pub' in response.url:
            item = LinkedinItem()
            for field in [
                    'full_name', 'title', 'location', 'industry', 'language',
                    'summary', 'member_connections'
            ]:
                try:
                    item[field] = response.xpath(
                        getattr(self, '%s_xpath' % field)).extract()[0]
                except:
                    item[field] = ''
            experience_h4 = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.experience_header_h4_xpath)
            ]
            experience_h5 = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.experience_header_h5_xpath)
            ]
            experience_time = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.experience_time_xpath)
            ]
            experience = [{
                'title': x,
                'company': y,
                'time': z
            } for x, y, z in zip(experience_h4, experience_h5, experience_time)]
            item['experience'] = experience
            education_school = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.education_school_xpath)
            ]
            education_degree = response.xpath(
                self.education_degree_xpath).extract()
            education_major = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.education_major_xpath)
            ]
            education_date = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.education_date_xpath)
            ]
            education = [{
                'school': x,
                'degree': y,
                'major': z,
                'date': d
            } for x, y, z, d in zip(education_school, education_degree,
                                    education_major, education_date)]
            project_name = response.xpath(self.project_name_xpath).extract()
            project_date = [
                i.xpath("string()").extract()[0]
                for i in response.xpath(self.project_date_xpath)
            ]
            project_detail = response.xpath(
                self.project_detail_xpath).extract()
            item['project'] = [{
                'name': x,
                'date': y,
                'detail': z
            } for x, y, z in zip(project_name, project_date, project_detail)]
            skill_name = response.xpath(self.skill_name_xpath).extract()
            skill_num_endorsements = response.xpath(
                self.skill_num_endorsements_xpath).extract()
            if not skill_num_endorsements:
                skill_num_endorsements = [0] * len(skill_name)
            item['skills'] = zip(skill_name, skill_num_endorsements)
            item['education'] = education
            item['public_profile_url'] = response.url
            return item

    def parse_directory(self, response):
        links = response.xpath(self.dir_link_xpath).extract()
        for link in links:
            if link.startswith('http'):
                if 'linkedin.com/pub' in link:
                    r = get_redis()
                    r.sadd('linkedin:user', link)
                    continue
                new_task(link)

    def get_fingerprint(self, task_string):
        return task_string

class RubychinaAllSpider(twcrawler.spiders.base.base_spider.BaseSpider):
    name = "rubychina_all"
    allowed_domains = ["ruby-china.org"]
    base_topic_url = u"https://ruby-china.org/api/topics/{}.json"
    # max_topic_number = 25000
    # start_urls = [base_topic_url.format(k) for k in xrange(1, max_topic_number + 1)]
    start_urls = [base_topic_url.format(1)]
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(start_urls, redis_start_urls)

    def parse(self, response):
        tag = response.url.split("/")[4]
        tag_map = {
            "users": self._parse_user_info,
            "topics": self._parse_topic_info
        }
        return tag_map[tag](response)

    def _parse_topic_info(self, response):
        f_response = json.loads(response.body)
        if "error" in f_response:
            return None
        item = RubychinaTopicItem()
        item["id"] = f_response["id"]
        item["title"] = f_response["title"]
        item["created_at"] = f_response["created_at"]
        item["replies_count"] = f_response["replies_count"]
        item["owner_login"] = f_response["user"]["login"]
        item["content"] = f_response["body_html"]
        item["hits"] = f_response["hits"]
        for reply_item in f_response["replies"]:
            new_task(reply_item["user"]["login"])
        return item

    def _format_user_v(self, val):
        if val:
            f_val = val.strip()
            return f_val if f_val else None

    def _parse_user_info(self, response):
        if response.headers["Content-Type"] != "application/json":
            return None
        f_response = json.loads(response.body)
        item = RubychinaUserItem()
        item["name"] = self._format_user_v(f_response["name"])
        item["login"] = f_response["login"].strip()
        item["location"] = self._format_user_v(f_response["location"])
        item["company"] = self._format_user_v(f_response["company"])
        item["twitter_id"] = self._format_user_v(f_response["twitter"])
        item["blog_url"] = self._format_user_v(f_response["website"])
        item["bio"] = self._format_user_v(f_response["bio"])
        item["tagline"] = self._format_user_v(f_response["tagline"])
        item["github_id"] = self._format_user_v(f_response["github_url"])
        item["github_id"] = (item["github_id"][len("https://github.com/"):]
                             if item["github_id"] else None)
        item["email"] = self._format_user_v(f_response["email"])
        item["avatar_url"] = self._format_user_v(f_response["avatar_url"])
        return item

class ZhihuUserSpider(twcrawler.spiders.base.base_spider.BaseSpider):
    name = "zhihu_user"
    allowed_domains = ["zhihu.com"]
    seeds_urls = (
        'http://api.zhihu.com/people/54e5f13a0d10654a461b318d27978341',
        'http://api.zhihu.com/people/0970f947b898ecc0ec035f9126dd4e08',
        'http://api.zhihu.com/people/1d56ab293e529b7d78acfe621d8173ed',
        'http://api.zhihu.com/people/46a588b2d664291f7fb5fec44bc60f71',
        'http://api.zhihu.com/people/08c4f44866e0e339c4e519f4631befd0',
        'http://api.zhihu.com/people/d01aeaca8cc7de042044213bb540268a',
        'http://api.zhihu.com/people/2ae8b3af01d40abc77ebeda7ecc350a9',
        'http://api.zhihu.com/people/44a8c67957c79ce9b229ed2774dcaa1a',
        'http://api.zhihu.com/people/44faf17ff5f5a4ccc4cf9bda47de8da2',
        'http://api.zhihu.com/people/a06cfb38e37dac1658e6457df4d7f032',
        'http://api.zhihu.com/people/b6d28ac2b88b7f230552bab4a0aceaca',
        'http://api.zhihu.com/people/3ec3b166992a5a90a1083945d2490d38',
        'http://api.zhihu.com/people/d073f194bcabc1cec5ef69d0b534de99',
        'http://api.zhihu.com/people/78e3b98074a915b222ae1be4ab038a6e',
        'http://api.zhihu.com/people/b6f80220378c8b0b78175dd6a0b9c680',
        'http://api.zhihu.com/people/9dcef282c46c0342b5f76d6baa0b3631',
        'http://api.zhihu.com/people/227135f01257c4d6bdb21a726ef6d53a',
        'http://api.zhihu.com/people/5dfdfcfb9b22d25bc8b639a86ae3692a',
        'http://api.zhihu.com/people/fcbb342dc45f1c6e2e18925c3d2cf264',
        'http://api.zhihu.com/people/99953853cc4219fabe8327301058357c',
        #'https://api.zhihu.com/people/54e5f13a0d10654a461b318d27978341',
    )
    url_generator = tasks_generator(build_url)
    start_urls = chain(seeds_urls, url_generator)
    #followees_url = 'https://api.zhihu.com/people/%(uid)s/followees?limit=%(limit)d&offset=%(offset)d'
    followees_url = 'http://api.zhihu.com/people/%(uid)s/followees?limit=%(limit)d&offset=%(offset)d'

    def compress(self, item, *foi):
        if not item:
            return item
        ret = {}
        for key in foi:
            ret[key] = item.get(key)
        return ret

    def parse(self, response):
        if 'followees' in response.url:
            for item in self.parse_followees(response):
                yield item
            return
        ret = json.loads(response.body)
        item = ZhihuUserItem()
        for key in item.fields:
            if key == 'uid':
                item[key] = ret.get('id')
            #elif key == 'education':
            #    item[key] = []
            #    for edu in ret.get(key, []):
            #        item[key].append(self.compress(edu, 'id', 'name', 'experience', 'type'))
            #elif key == 'employment':
            #    item[key] = []
            #    raw = ret.get('employment', [])
            #    for emp in raw:
            #        item[key].append({
            #            'id': emp[0].get('id'),
            #            'name': emp[0].get('name'),
            #            'job_name': emp[1].get('name'),
            #            'type': emp[0].get('topic'),
            #        })
            #elif key == 'business':
            #    item[key] = self.compress(ret.get(key), 'id', 'name', 'type')
            #elif key == 'location':
            #    item[key] = []
            #    for loc in ret.get(key, []):
            #        item[key].append(self.compress(loc, 'id', 'name', 'experience', 'type'))
            else:
                item[key] = ret.get(key)
        item['_id'] = item['uid']
        yield item

        offset = 0
        limit = 20
        while offset < int(ret['following_count']):
            url = self.followees_url % ({
                'uid': ret['id'],
                'limit': limit,
                'offset': offset,
            })
            yield scrapy.Request(url)
            #yield scrapy.Request(url, callback=self.parse_followees)
            offset += 20

    def parse_followees(self, response):
        ret = json.loads(response.body)
        item = ZhihuRelationItem()
        item['uid'] = self.get_uid_from_url(response.url)
        item['followees'] = []
        item['_id'] = '%s:%s' % (item['uid'],
                                 self.get_offset_from_url(response.url))
        for people in ret['data']:
            item['followees'].append({
                'uid': people.get('id'),
                #'email': people.get('email'),
                #'gender': people.get('gender'),
                #'sina_weibo_url': people.get('sina_weibo_url'),
                #'sina_weibo_name': people.get('sina_weibo_name'),
                #'headline': people.get('headline'),
                #'description': people.get('description'),
            })
            new_task(people['id'])
        yield item

    def get_uid_from_url(self, url):
        #idx_b = url.find('people/') + len('people/')
        idx_b = url.find('people/') + 7
        idx_e = url[idx_b:].find('/')
        return url[idx_b:idx_b + idx_e]

    def get_offset_from_url(self, url):
        b = url.find('offset=')
        if b == -1:
            return '0'
        return url[b + 7:]

    def get_fingerprint(self, url):
        return url

class V2exSpider(twcrawler.spiders.base.base_spider.BaseSpider):
    name = "v2ex_all"
    allowed_domains = ["www.v2ex.com"]
    max_topic_page = 8000
    base_topic_list = u"https://www.v2ex.com/feed/tab/all.xml?page={}"
    raw_start_urls = [base_topic_list.format(k) for k in xrange(1, max_topic_page)]
    raw_start_urls = []
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(raw_start_urls, redis_start_urls)
    #start_urls = ["https://www.v2ex.com/member/Livid"]
    #start_urls = ["https://www.v2ex.com/member/jianshuio"]
    #start_urls = ["https://www.v2ex.com/feed/tab/all.xml?page=1"]
    #start_urls = ["https://www.v2ex.com/api/topics/show.json?id=182731"]
    start_urls = ["https://www.v2ex.com/api/replies/show.json?topic_id=182731"]

    def get_fingerprint(self, task_string):
        return task_string

    def _f_timestr(self, timestamp):
        return datetime.datetime.fromtimestamp(timestamp).strftime(
            "%Y-%m-%d %H:%M:%S")

    def _f(self, item):
        return [k.strip() for k in item.extract()]

    def parse(self, response):
        type_ = "_".join(response.url.split("/")[3:-1])
        response_map_type = {
            "feed_tab": self._parse_topic_list_info,
            "api_topics": self._parse_topic_info,
            "api_replies": self._parse_topic_replies_info,
            "api_members": self._parse_user_info,
            "member": self._parse_user_detail_info,
        }
        return response_map_type[type_](response)

    def _parse_topic_list_info(self, response):
        soup = BeautifulSoup(response.body)
        all_link = soup.findAll("link")
        base_topic_url = u"/api/topics/show.json?id={}"
        for link_item in all_link[2:]:
            topic_id = link_item.attrs["href"].split("#")[0].split("/")[-1]
            new_task(base_topic_url.format(topic_id))

    def _parse_topic_info(self, response):
        f_response = json.loads(response.body)
        if not f_response:
            return
        f_response = f_response[0]
        item = V2ExTopicItem()
        item["topic_id"] = f_response["id"]
        item["title"] = f_response["title"]
        item["content"] = f_response["content_rendered"]
        item["author"] = f_response["member"]["username"]
        if item["author"]:
            new_task(u"/api/members/show.json?id={}".format(
                f_response["member"]["id"]))
        item["node_name"] = f_response["node"]["name"]
        item["created_at"] = self._f_timestr(f_response["created"])
        item["reply_count"] = f_response["replies"]
        if item["reply_count"]:
            new_task(u"/api/replies/show.json?topic_id={}".format(
                item["topic_id"]))
        return item

    def _parse_topic_replies_info(self, response):
        f_response = json.loads(response.body)
        if not f_response:
            return
        topic_id = response.url.split("?")[-1]
        for reply_item in f_response:
            item = V2ExTopicReplyItem()
            item["reply_id"] = reply_item["id"]
            item["topic_id"] = topic_id
            item["thank_count"] = reply_item["thanks"]
            item["content"] = reply_item["content_rendered"].strip()
            item["author"] = reply_item["member"]["username"]
            if item["author"]:
                new_task("/api/members/show.json?id={}".format(
                    reply_item["member"]["id"]))
            item["created_at"] = self._f_timestr(reply_item["created"])
            yield item

    def _parse_user_info(self, response):
        f_response = json.loads(response.body)
        if not f_response or f_response["status"] == "notfound":
            return
        item = V2ExUserItem()
        item["id"] = f_response["id"]
        item["login"] = f_response["username"]
        item["tagline"] = f_response["tagline"].strip()
        item["description"] = f_response["bio"].strip()
        item["avatar_url"] = f_response["avatar_large"]
        item["created_at"] = self._f_timestr(f_response["created"])
        new_task(u"/member/{}".format(item["login"]))
        return item

    def _parse_user_detail_info(self, response):
        item = V2ExUserDetailItem()
        item["login"] = response.url.split("/")[-1]
        balance_type_list = ["gold", "silver", "bronze"]
        item["balance"] = dict.fromkeys(balance_type_list, 0)
        balance_item = response.xpath("//div[@class='balance_area']")
        if balance_item:
            balance_type = [
                k.split("/")[-1].split(".")[0].strip()
                for k in self._f(balance_item.xpath(".//img/@src"))
            ]
            balance_score = [
                int(k) for k in self._f(balance_item.xpath(".//text()"))
            ]
            for k_, v_ in zip(balance_type, balance_score):
                if k_ in balance_type_list:
                    item["balance"][k_] = v_
        social_link_item = response.xpath(
            "//div[contains(@class, 'markdown_body')][1]/a")
        social_link_map = {
            "social_twitter": "twitter_login",
            "social_home": "blog_url",
            "social_geo": "location",
            "social_instagram": "instagram_login",
            "social_ps": "ps_login",
            "social_twitch": "twitch_login",
            "social_steam": "steam_login",
            "social_bn": "battle_tag_login",
            "social_dribbble": "dribble_login",
            "social_github": "github_login",
            "social_btc": "btc_address"
        }
        for _, v in social_link_map.iteritems():
            item[v] = None
        if social_link_item:
            social_type = [
                k.split("/")[-1].split(".")[0].strip()
                for k in self._f(social_link_item.xpath(".//img/@src"))
            ]
            social_value = self._f(social_link_item.xpath(".//text()"))
            for t_, v_ in zip(social_type, social_value):
                if t_ in social_link_map:
                    item[social_link_map[t_]] = v_
        return item