Example #1
class BlogSpider(BaseSpider):

    name = 'blog'
    redis_start_urls = tasks_generator(build_url)
    start_urls = redis_start_urls

    # ref: http://blog.trojanhunter.com/2012/09/26/the-best-regex-to-validate-an-email-address/
    email_regex = '''(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'''

    def __init__(self, *args, **kwargs):
        super(BlogSpider, self).__init__(*args, **kwargs)
        cnt = 0
        r = get_redis()
        r.delete('blog:tasks:fingerprint')
        with open(os.path.join(cpath, 'github_user_blog_list')) as fi:
            for line in fi:
                cnt += 1
                if cnt > 100: break
                new_task(line.strip())

    def parse(self, response):
        yields = []
        hrefs = response.xpath('//a/@href').extract()
        for href in hrefs:
            href = href.encode('utf-8')
            """
            if href.endswith('.png') or href.endswith('.css') or href.endswith('.js') \
                or href.endswith('.jpeg') or href.endswith('gif') or href.endswith('.jpg') \
                or href.endswith('.tar') or href.endswith('.gz') or href.endswith('.exe') \
                or href.endswith('.apk'):
                    continue
            """

            if href.startswith('http'):
                hostname1 = urlparse(href).hostname
                hostname2 = urlparse(response.url).hostname
                if hostname1 == hostname2:
                    yields.append(Request(href))
                    continue
                hostname_arr1 = hostname1.split('.')
                hostname_arr2 = hostname2.split('.')
                if len(hostname_arr1) >= 3 and len(hostname_arr2) >= 3 and hostname_arr1[1:] == hostname_arr2[1:]:
                    yields.append(Request(href))
            else:
                yields.append(Request('http://%s/%s' % (urlparse(response.url).hostname, href.lstrip('/'))))

        email = response.selector.re(self.email_regex)

        #item = BlogItem()
        item = HTMLItem()
        item['uid'] = response.url
        item['html'] = response.body
        if email:
            item['other'] = email
        yields.append(item)

        return yields

    def get_fingerprint(self, task_string):
        return task_string
Example #2
class TestSpider(BaseSpider):

    # The spider's name; scrapy uses it to select which spider to run
    name = 'test'
    seeds_start_urls = ['http://127.0.0.1:8080/%d' % i for i in xrange(10)]
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(seeds_start_urls, redis_start_urls)

    def __init__(self, *args, **kwargs):
        super(TestSpider, self).__init__(*args, **kwargs)
        """ 爬虫初始化
        """
        pass

    def parse(self, response):
        """ 此方法不要使用 yield,如果有多个结果,返回一个 list 就可以了。
        """
        ids = response.xpath('//a/text()').extract()

        for id in ids:
            # Tasks are stored in redis, which also handles deduplication here
            new_task(id.encode('utf-8'), self.get_fingerprint)
            # Or let scrapy schedule the request itself, but then no deduplication is done
            #yield Request('http://127.0.0.1:8080/%s' % id.encode('utf-8'))

        item = TestItem()
        item['uid'] = response.url

        # yield item
        return [item]

    def get_fingerprint(self, task_string):
        return task_string
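
The helpers these examples rely on (get_redis, new_task, tasks_generator, build_url) are not shown anywhere in this listing. Purely as an assumption inferred from the call sites above (the '<spider>:tasks:fingerprint' set that gets cleared, new_task(task, fingerprint_func), tasks_generator(build_url)), a minimal sketch of such a Redis-backed task queue might look like the following; it is not the project's actual implementation.

# Hypothetical sketch only: names, key layout and signatures are assumptions
# inferred from how the spiders in this listing call these helpers.
import redis

_redis_conn = None
SPIDER_NAME = 'test'  # assumed; the real helpers presumably derive this per spider


def get_redis():
    # Assumed localhost defaults for a shared connection.
    global _redis_conn
    if _redis_conn is None:
        _redis_conn = redis.StrictRedis(host='localhost', port=6379, db=0)
    return _redis_conn


def new_task(task_string, fingerprint_func=None):
    # Queue a task, deduplicating via a '<spider>:tasks:fingerprint' set
    # (the key the BlogSpider/LinkedinSpider __init__ methods clear).
    r = get_redis()
    fp = fingerprint_func(task_string) if fingerprint_func else task_string
    if r.sadd('%s:tasks:fingerprint' % SPIDER_NAME, fp):  # 1 only when new
        r.rpush('%s:tasks' % SPIDER_NAME, task_string)


def build_url(task_string):
    # Assumed passthrough; a real build_url probably maps task strings to URLs.
    return task_string


def tasks_generator(url_builder):
    # Yield start URLs by draining the task list; a real implementation would
    # likely block or poll instead of stopping when the queue is empty.
    r = get_redis()
    while True:
        task = r.lpop('%s:tasks' % SPIDER_NAME)
        if task is None:
            break
        yield url_builder(task)
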
Example #3
class GithubSpider(scrapy.Spider):
    name = "github"
    allowed_domains = ["github.com"]
    seeds_start_urls = (
        'https://github.com/BYVoid',
        'https://github.com/fqj1994',
        'https://github.com/binhe22',
        'https://github.com/atupal',
        'https://github.com/fqj1994',
        'https://github.com/BYVoid',
    )
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(seeds_start_urls, redis_start_urls)
    following_num = 51
    followers_num = 51

    def parse(self, response):
        func_type = self.get_func(response.url)
        print response.url
        response_type_map = {
            "user_detail": self.parse_user_details,
            "followers": self.get_user_followers,
            "following": self.get_user_following,
            "repos": self.get_user_repos,
            "stars": self.get_user_stared,
            "repo_detail": self.get_repo_detail,
            "contributors": self.get_repo_contributor,
            "org_people": self.get_org_members,
            "org_repos": self.get_org_repos,
        }

        print response.url, func_type
        try:
            return response_type_map[func_type](response)
        except Exception as e:
            logger.error(response.url + " " + str(e))
            logger.error(str(response.status) + str(response.body))
Example #4
class SegmentfaultAllSpider(twcrawler.spiders.base.base_spider.BaseSpider):
    def get_fingerprint(self, url):
        return url

    name = "segmentfault_all"
    allowed_domains = ["segmentfault.com"]

    base_topiclist_url = u"http://segmentfault.com/questions/newest?page={}"
    base_articlelist_url = u"http://segmentfault.com/blogs/newest?page={}"

    answer_items_each_page = 20
    question_items_each_page = 20
    article_items_each_page = 20
    following_tag_items_each_page = 20
    follower_items_each_page = 20

    max_topiclist_page = 800
    #max_topiclist_page = 2
    max_article_page = 500
    #max_article_page = 2

    # TODO(test)
    #topiclist_urls = [base_topiclist_url.format(i) for i in xrange(1, max_topiclist_page)]
    #articlelist_urls = [base_articlelist_url.format(i) for i in xrange(1, max_article_page)]
    #topiclist_urls.extend(articlelist_urls)
    #topiclist_urls = []

    redis_start_urls = tasks_generator(build_url)
    #start_urls = itertools.chain(topiclist_urls, redis_start_urls)
    start_urls = itertools.chain(all_segmentfault_users, redis_start_urls)

    def _format_timestr(self, timestr):
        # timestr examples: 2天前 (2 days ago), 2小时前 (2 hours ago), 3月18日 (March 18), 2014年12月09日 (December 9, 2014)
        timestr = re.sub(ur"(年|月|日)", "-", timestr)
        if timestr.endswith("-"):
            timestr = timestr[:-1]
            result_time = parser.parse(timestr)
        else:
            timegroup = re.search(r"^([0-9]{1,})(.*)$", timestr)
            if timegroup.group(2) == u"小时前":
                f_time = datetime.datetime.now() - datetime.timedelta(
                    hours=int(timegroup.group(1)))
            elif timegroup.group(2) == u"分钟前":
                f_time = datetime.datetime.now() - datetime.timedelta(
                    minutes=int(timegroup.group(1)))
            elif timegroup.group(2) == u"天前":
                f_time = datetime.datetime.now() - datetime.timedelta(
                    days=int(timegroup.group(1)))
            else:
                logging.error(("EEEEEEEEEEEE", timestr))
                raise
            result_time = datetime.datetime(year=f_time.year,
                                            month=f_time.month,
                                            day=f_time.day,
                                            hour=f_time.hour,
                                            minute=f_time.minute)
        return result_time.strftime("%Y-%m-%d %H:%M:%S")

    def _format_count(self, count):
        if count.startswith("-"):
            negative = -1
            count = count[1:]
        else:
            negative = 1

        if count.endswith("k"):
            return int(float(count[:-1]) * 1000) * negative
        if count.endswith("w"):
            return int(float(count[:-1]) * 10000) * negative

        if re.match(r"^[0-9]{1,}$", count):
            return int(count) * negative
        else:
            logging.error(("EEEEEEEEEEEEEE", count))
            raise

    def parse(self, response):
        f_url = response.url.split("?")[0].split("/")
        rep_type = f_url[3]
        if len(f_url) > 5:
            child_rep_type = f_url[5]
            rep_type = u"{}_{}".format(rep_type, child_rep_type)
            if len(f_url) > 6:
                rep_type = u"{}_{}".format(rep_type, f_url[6])

        response_type_map = {
            "questions": self._parse_toplic_and_article_info,
            "blogs": self._parse_toplic_and_article_info,
            "q": self._parse_topic_info,
            "u": self._parse_user_info,
            "a": self._parse_article_info,
            "bookmark": self._parse_bookmark_info,
            "u_answers": self._parse_user_answer_info,
            "u_questions": self._parse_user_question_info,
            "u_blogs": self._parse_user_article_info,
            "u_following_tags": self._parse_user_following_tags_info,
            "u_bookmarks": self._parse_user_bookmark_info,
            "u_followed_users": self._parse_user_follower_info,
            "u_tags": self._parse_user_tags_info,
        }
        return response_type_map[rep_type](response)

    def _format_social_contact(self, social_item_list):
        social_contact = dict()
        for social_item in social_item_list:
            key = self._f(
                social_item.xpath(".//@class")[0].extract().split("-")[-1])
            value = self._f(social_item.xpath(".//@href")[0].extract())
            if value != u"javascript:void(0);":
                social_contact[key] = value
        return social_contact

    def _f(self, val):
        return val.strip()

    def _parse_toplic_and_article_info(self, response):
        xpath_list = "//h2[@class='title']/a/@href"
        for topic_href in response.xpath(xpath_list):
            new_task(topic_href.extract().strip())

    def _parse_topic_info(self, response):
        item = SegmentfaultTopicItem()
        item["question_id"] = response.url.split("/")[4]
        item["title"] = self._f(
            response.xpath("//h1[@id='questionTitle']/a/text()")[0].extract())
        item["author"] = self._f(
            response.xpath("//div[@class='author']/a/@href")
            [0].extract().split("/")[-1])
        item["created_at"] = self._format_timestr(
            self._f("".join([
                k.strip() for k in response.xpath(
                    "//div[@class='author']/text()").extract()
            ]).split()[0]))

        topic_status = response.xpath(
            '//div[@class="col-md-3"][1]/ul/li/strong/text()')
        item["follower_count"] = self._format_count(
            self._f(topic_status[0].extract()))
        item["collection_count"] = self._format_count(
            self._f(topic_status[1].extract()))
        item["hit_count"] = self._format_count(
            self._f(topic_status[2].extract()))

        question_tag = response.xpath(
            "//article[@class='widget-question__item']")
        item["like_count"] = self._format_count(
            self._f(
                question_tag.xpath(".//span[@class='count']/text()")
                [0].extract()))
        item["question"] = self._f(
            question_tag.xpath(".//div[@class='post-offset']/div[1]")
            [0].extract())
        item["tags"] = [
            self._f(k) for k in question_tag.xpath(
                ".//div[@class='post-offset']/ul[1]/li/a/text()").extract()
        ]

        answer_list_tag = response.xpath(
            "//article[contains(@class, 'widget-answers__item')]")
        answer_list = []
        for answer_item in answer_list_tag:
            answer_id = self._f(answer_item.xpath(".//@id")[0].extract())
            answer_like_count = self._format_count(
                self._f(
                    answer_item.xpath(".//span[@class='count']/text()")
                    [0].extract()))
            answer_accepted = bool(
                answer_item.xpath(".//div[@class='accepted-flag']"))
            answer_author = self._f(
                answer_item.xpath(".//div[@class='post-offset']/a/@href")
                [0].extract())

            answer_created_at = self._format_timestr(
                self._f(
                    answer_item.xpath(
                        ".//span[contains(@class, 'text-muted')]/text()")
                    [0].extract().strip().split()[0]))
            answer = self._f(
                answer_item.xpath(".//div[contains(@class, 'answer')]")
                [0].extract())
            answer_list.append(
                dict(like_count=answer_like_count,
                     accepted=answer_accepted,
                     author=answer_author,
                     created_at=answer_created_at,
                     answer=answer,
                     answer_id=answer_id))
            item["answers"] = answer_list

        # add new task
        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)
        return item

    def _parse_user_info(self, response):
        item = SegmentfaultUserItem()
        profile_tag = response.xpath(
            "//div[contains(@class, 'profile-header')]")

        item["login"] = response.url.split("/")[4]
        item["avatar_url"] = self._f(
            profile_tag.xpath(".//img/@src")[0].extract())
        if not item["avatar_url"].startswith("http"):
            item["avatar_url"] = None

        item["name"] = self._f(profile_tag.xpath(".//h4/text()")[0].extract())
        item["social_contact"] = self._format_social_contact(
            profile_tag.xpath(".//li/a"))

        profile_item_map = {
            u"所在城市": "location",
            u"现任职位": "position",
            u"院校专业": "education",
            u"个人网站": "blog_url",
        }
        item.update([(v, None) for _, v in profile_item_map.iteritems()])
        item["major"] = None
        for profile_item in response.xpath(
                "//ul[contains(@class, 'profile-links')]/li"):
            f_profile_item = [
                self._f(k) for k in re.sub(
                    '<[^>]*>', '', profile_item.extract()).strip().split(u":")
            ]
            key = f_profile_item[0]
            value = " ".join(f_profile_item[1:])
            if key in profile_item_map:
                item[profile_item_map[key]] = value
        if item["education"]:
            education_list = item["education"].split(" ")
            item["education"] = self._f(education_list[0])

            if len(education_list) > 1:
                item["major"] = self._f(u" ".join(education_list[1:]))

        item["follower_count"] = self._format_count(
            self._f(
                response.xpath("//a[@class='funsCount']/text()")[0].extract()))

        profile_detail_tag = response.xpath("//div[@class='col-md-4 profile']")
        reputation_count, emblem_count, like_count = [
            self._f(k.extract())
            for k in profile_detail_tag.xpath(".//strong/text()")[:3]
        ]
        item["reputation_count"] = self._format_count(reputation_count)
        item["emblem_count"] = self._format_count(emblem_count)
        item["like_count"] = self._format_count(like_count)
        item["introduction"] = self._f(
            profile_detail_tag.xpath(".//div[contains(@class, 'profile-bio')]")
            [0].extract())

        profile_info_list = response.xpath(
            "//div[@class='col-md-4 profile']/following-sibling::div/ul/li")
        item["answer_count"] = self._format_count(
            self._f(profile_info_list[3].xpath(".//span/text()")[0].extract()))
        item["question_count"] = self._format_count(
            self._f(profile_info_list[4].xpath(".//span/text()")[0].extract()))
        item["article_count"] = self._format_count(
            self._f(profile_info_list[5].xpath(".//span/text()")[0].extract()))
        item["following_tag_count"] = self._format_count(
            self._f(profile_info_list[-1].xpath(
                ".//span[@class='badge']/text()")[0].extract()))

        user_href = "/" + "/".join(response.url.split("/")[3:])
        if item["answer_count"]:
            pages = int(
                math.ceil(item["answer_count"] /
                          float(self.answer_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/answers?page={}".format(user_href, i))

        if item["question_count"]:
            pages = int(
                math.ceil(item["question_count"] /
                          float(self.question_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/questions?page={}".format(user_href, i))

        if item["article_count"]:
            pages = int(
                math.ceil(item["article_count"] /
                          float(self.article_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/blogs?page={}".format(user_href, i))

        if item["following_tag_count"]:
            pages = int(
                math.ceil(item["following_tag_count"] /
                          float(self.following_tag_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/following/tags?page={}".format(user_href, i))

        if item["follower_count"]:
            pages = int(
                math.ceil(item["follower_count"] /
                          float(self.follower_items_each_page)))
            for i in xrange(1, pages + 1):
                new_task("{}/followed/users?page={}".format(user_href, i))

        # tags
        new_task("{}/tags?page={}".format(user_href, 1))
        new_task("{}/bookmarks?page={}".format(user_href, 1))
        new_task("{}/bookmarks?page={}".format(user_href, 2))

        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)
        return item

    def _parse_user_answer_info(self, response):
        login = response.url.split("/")[4]
        for answer_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultUserAnswerItem()
            answer_link = self._f(answer_item.extract()).split("/")
            item["question_id"] = answer_link[2]
            item["answer_id"] = answer_link[3]
            item["login"] = login
            yield item

    def _parse_user_question_info(self, response):
        login = response.url.split("/")[4]
        for question_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultUserQuestionItem()
            question_link = self._f(question_item.extract()).split("/")
            item["question_id"] = question_link[2]
            item["login"] = login
            yield item

    def _parse_user_article_info(self, response):
        login = response.url.split("/")[4]
        for article_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultUserArticleItem()
            article_link = self._f(article_item.extract()).split("/")
            item["article_id"] = article_link[2]
            item["login"] = login
            yield item

    def _parse_user_following_tags_info(self, response):
        login = response.url.split("/")[4]
        item = SegmentfaultUserFollowingTagItem()
        item["login"] = login
        item["following_tags"] = list(
            set([
                self._f(k) for k in response.xpath(
                    "//h4[contains(@class, 'h5')]/a/text()").extract()
            ]))
        return item

    def _parse_article_info(self, response):
        item = SegmentfaultArticleItem()
        item["article_id"] = response.url.split("/")[4]
        item["title"] = self._f(
            response.xpath("//h1[@id='articleTitle']/a/text()")[0].extract())
        item["author"] = self._f(
            response.xpath("//div[@class='author']/a/@href")
            [0].extract().split("/")[-1])
        item["created_at"] = self._format_timestr(
            self._f(
                response.xpath("//div[@class='author']/text()")
                [-1].extract().strip().split()[0]))

        topic_status = response.xpath(
            '//div[@class="col-md-3"][1]/ul/li/strong/text()')
        item["follower_count"] = self._format_count(
            self._f(topic_status[0].extract()))
        item["collection_count"] = self._format_count(
            self._f(topic_status[1].extract()))
        item["hit_count"] = self._format_count(
            self._f(topic_status[2].extract()))

        item["article"] = self._f(
            response.xpath("//div[contains(@class, 'article')]")[0].extract())
        item["tags"] = list(
            set([
                self._f(k) for k in response.xpath(
                    "//li[@class='tagPopup']/a/text()").extract()
            ]))

        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)
        return item

    def _parse_user_bookmark_info(self, response):
        login = response.url.split("/")[4]
        for bookmark_list in response.xpath(
                "//section[contains(@class, 'stream-list__item')]"):
            if bookmark_list:
                item = SegmentfaultUserBookmarkItem()
                item["login"] = login
                link = self._f(
                    bookmark_list.xpath(".//strong/a/@href")[0].extract())
                item["bookmark_id"] = link.split("/")[2]

                new_task(link)
                yield item

    def _parse_bookmark_info(self, response):
        bookmark_id = response.url.split("/")[4]
        for bookmark_item in response.xpath("//h2[@class='title']/a/@href"):
            item = SegmentfaultBookmarkItem()
            bookmark_link = self._f(bookmark_item.extract()).split("/")
            item["bookmark_id"] = bookmark_id
            item["bookmark_list"] = dict(type=bookmark_link[1],
                                         id=bookmark_link[2])
            yield item

    def _parse_user_follower_info(self, response):
        member_list = [
            self._f(k) for k in set(
                response.xpath(
                    "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
        ]
        for member in member_list:
            new_task(member)

        login = response.url.split("/")[4]
        for follower_item in response.xpath(
                "//div[contains(@class, 'stream-list__item')]/a/@href"):
            item = SegmentfaultUserFollowerItem()
            item["login"] = login
            item["follower"] = self._f(follower_item.extract()).split("/")[2]
            yield item

    def _parse_user_tags_info(self, response):
        tags = [
            self._f(k) for k in response.xpath(
                "//li[contains(@class, 'tagPopup')]/a/text()").extract()
        ]
        if tags:
            next_page = response.xpath("//li[@class='next']/a/@href").extract()
            if next_page:
                new_task(next_page[0])

            item = SegmentfaultUserTagItem()
            item["login"] = response.url.split("/")[4]
            item["tags"] = tags
            return item
Example #5
class StackoverflowQuestionSpider(scrapy.Spider):

    # 2xx (other than 200), 4xx and 5xx statuses handled explicitly in parse()
    handle_httpstatus_list = range(201, 207) + range(400, 418) + range(500, 506)

    name = "stackoverflow_question"
    allowed_domains = ["stackoverflow.com"]
    start = 4
    #end = 28000000
    end = 104
    #start_urls = (
    #    'http://stackoverflow.com/feeds/question/%d'%i for i in xrange(start, end)
    #)
    start_urls = tasks_generator(build_url)

    id_xpath = '//entry/id'
    rank_xpath = '//entry/rank'
    title_xpath = '//entry/title'
    tag_xpath = '//entry/category/@term'
    author_name_xpath = '//entry/author/name'
    author_uri_xpath = '//entry/author/uri'
    link_xpath = '//entry/link/@href'
    published_xpath = '//entry/published'
    updated_xpath = '//entry/updated'
    content_xpath = '//entry/summary'

    HTML_200_STRING = []
    HTML_404_STRING = []
    HTML_MOBILE_STRING = []

    def parse(self, response):
        if response.status in self.handle_httpstatus_list:
            return

        if 'StackExchange.ready' in response.body and "Page Not Found" in response.body:
            return

        response.selector.remove_namespaces()

        ids = extract_text_null(self, 'id', response)
        ranks = extract_text_null(self, 'rank', response)
        titles = extract_text_null(self, 'title', response)
        tags = response.xpath(self.tag_xpath).extract()
        author_names = extract_text_null(self, 'author_name', response)
        author_uris = extract_text_null(self, 'author_uri', response)
        links = response.xpath(self.link_xpath).extract()
        publisheds = extract_text_null(self, 'published', response)
        updateds = extract_text_null(self, 'updated', response)
        contents = extract_text_null(self, 'content', response)

        item = StackoverflowQuestionItem()
        item['uid'] = response.url.rstrip('/').split('/')[-1]
        item['rank'] = ranks[0]
        item['title'] = titles[0]
        item['tags'] = tags
        item['author_name'] = author_names[0]
        item['author_uri'] = author_uris[0]
        item['author_uid'] = author_uris[0].split('/')[-1]
        item['link'] = links[0]
        item['published'] = publisheds[0]
        item['updated'] = updateds[0]
        item['content'] = contents[0]
        item['answers'] = []

        pipeline = metric.get_redis().pipeline()
        for i in xrange(1, len(ids)):
            answer = {}
            answer['uid'] = ids[i].split('#')[-1]
            pipeline.hincrby(':'.join([metric.metric_key, 'answer']),
                             answer['uid'], 1)
            answer['rank'] = ranks[i]
            answer['author_name'] = author_names[i]
            answer['author_uri'] = author_uris[i]
            answer['author_uid'] = author_uris[i].split('/')[-1]
            answer['link'] = links[i]
            answer['published'] = publisheds[i]
            answer['updated'] = updateds[i]
            answer['content'] = contents[i]
            item['answers'].append(answer)
        pipeline.execute()

        return item

    def parse_list(self, response):
        pass

    def parse_page(self, response):
        pass
Example #6
class LinkedinSpider(BaseSpider):

    name = 'linkedin'
    seeds_start_urls = (('http://cn.linkedin.com/directory/people-%s/' % i)
                        for i in string.lowercase)
    seeds_start_urls = itertools.chain(
        seeds_start_urls, (('http://cn.linkedin.com/directory/people-%d/' % i)
                           for i in xrange(1, 27)))
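    # NOTE: the directory seed URLs built above are discarded on the next line;
    # start URLs come instead from the redis task queue seeded in __init__ below.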
    seeds_start_urls = []
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(seeds_start_urls, redis_start_urls)

    full_name_xpath = '//span[@class="full-name"]/text()'
    title_xpath = '//p[@class="title"]/text()'
    location_xpath = '//a[@name="location"]/text() | //span[@class="locality"]/text()'
    industry_xpath = '//a[@name="industry"]/text() | //dd[@class="industry"]/text()'
    language_xpath = 'string(//div[@id="languages"])'

    experience_header_h4_xpath = '//span[@class="experience-date-locale"]/preceding-sibling::*//h4'
    #experience_header_h5_xpath = 'string(//span[@class="experience-date-locale"]/preceding-sibling::*//h5)'
    experience_header_h5_xpath = '//span[@class="experience-date-locale"]/preceding-sibling::*//h5//a[@dir="auto"]'
    experience_time_xpath = '//span[@class="experience-date-locale"]'

    education_school_xpath = '//span[@class="education-date"]/preceding-sibling::header//h4'
    education_degree_xpath = '//span[@class="education-date"]/preceding-sibling::header//*[@class="degree"]/text()'
    education_major_xpath = '//span[@class="education-date"]/preceding-sibling::header//*[@class="major"]'
    education_date_xpath = '//span[@class="education-date"]'

    project_name_xpath = '//span[@class="projects-date"]/preceding-sibling::*//span[@dir="auto"]/text()'
    project_date_xpath = '//span[@class="projects-date"]'
    project_detail_xpath = '//span[@class="projects-date"]/following-sibling::p/text()'

    summary_xpath = 'string(//div[@class="summary"]/p)'
    skill_num_endorsements_xpath = '//span[@class="num-endorsements"]/text()'
    skill_name_xpath = '//span[@class="endorse-item-name"]/*/text()'
    member_connections_xpath = '//div[@class="member-connections"]/strong/text()'

    dir_link_xpath = '//li[@class="content"]/a/@href'

    def __init__(self, *args, **kwargs):
        super(LinkedinSpider, self).__init__(*args, **kwargs)
        filepath = os.path.join(get_main_path(),
                                'spiders/linkedin/pub_china_user_list')
        r = get_redis()
        r.delete('linkedin:tasks:fingerprint')
        with open(filepath) as fi:
            cnt = 0
            for line in fi:
                new_task(line.strip())
                cnt += 1
                if cnt > 100: break

    def parse(self, response):
        if 'linkedin.com/pub' in response.url:
            item = LinkedinItem()
            for field in [
                    'full_name', 'title', 'location', 'industry', 'language',
                    'summary', 'member_connections'
            ]:
                try:
                    item[field] = response.xpath(
                        getattr(self, '%s_xpath' % field)).extract()[0]
                except Exception:
                    item[field] = ''

            experience_h4 = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.experience_header_h4_xpath)
            ]
            experience_h5 = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.experience_header_h5_xpath)
            ]
            experience_time = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.experience_time_xpath)
            ]
            experience = [{
                'title': x,
                'company': y,
                'time': z
            } for x, y, z in zip(experience_h4, experience_h5, experience_time)
                          ]
            item['experience'] = experience
            education_school = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.education_school_xpath)
            ]
            education_degree = response.xpath(
                self.education_degree_xpath).extract()
            education_major = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.education_major_xpath)
            ]
            education_date = [
                i.xpath('string()').extract()[0]
                for i in response.xpath(self.education_date_xpath)
            ]
            education = [{
                'school': x,
                'degree': y,
                'major': z,
                'date': d
            } for x, y, z, d in zip(education_school, education_degree,
                                    education_major, education_date)]

            project_name = response.xpath(self.project_name_xpath).extract()
            project_date = [
                i.xpath("string()").extract()[0]
                for i in response.xpath(self.project_date_xpath)
            ]
            project_detail = response.xpath(
                self.project_detail_xpath).extract()
            item['project'] = [{
                'name': x,
                'date': y,
                'detail': z
            } for x, y, z in zip(project_name, project_date, project_detail)]

            skill_name = response.xpath(self.skill_name_xpath).extract()
            skill_num_endorsements = response.xpath(
                self.skill_num_endorsements_xpath).extract()
            if not skill_num_endorsements:
                skill_num_endorsements = [0] * len(skill_name)
            item['skills'] = zip(skill_name, skill_num_endorsements)
            item['education'] = education
            item['public_profile_url'] = response.url

            return item

    def parse_directory(self, response):
        links = response.xpath(self.dir_link_xpath).extract()
        for link in links:
            if link.startswith('http'):
                if 'linkedin.com/pub' in link:
                    r = get_redis()
                    r.sadd('linkedin:user', link)
                    continue
                new_task(link)

    def get_fingerprint(self, task_string):
        return task_string
Example #7
class RubychinaAllSpider(twcrawler.spiders.base.base_spider.BaseSpider):
    name = "rubychina_all"
    allowed_domains = ["ruby-china.org"]

    base_topic_url = u"https://ruby-china.org/api/topics/{}.json"

    # max_topic_number = 25000
    # start_urls = [base_topic_url.format(k) for k in xrange(1, max_topic_number + 1)]
    start_urls = [base_topic_url.format(1)]
    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(start_urls, redis_start_urls)

    def parse(self, response):
        tag = response.url.split("/")[4]
        tag_map = {
            "users": self._parse_user_info,
            "topics": self._parse_topic_info
        }
        return tag_map[tag](response)

    def _parse_topic_info(self, response):
        f_response = json.loads(response.body)
        if "error" in f_response:
            return None

        item = RubychinaTopicItem()
        item["id"] = f_response["id"]
        item["title"] = f_response["title"]
        item["created_at"] = f_response["created_at"]
        item["replies_count"] = f_response["replies_count"]
        item["owner_login"] = f_response["user"]["login"]
        item["content"] = f_response["body_html"]
        item["hits"] = f_response["hits"]

        for reply_item in f_response["replies"]:
            new_task(reply_item["user"]["login"])
        return item

    def _format_user_v(self, val):
        if val:
            f_val = val.strip()
            return f_val if f_val else None

    def _parse_user_info(self, response):
        if response.headers["Content-Type"] != "application/json":
            return None

        f_response = json.loads(response.body)
        item = RubychinaUserItem()
        item["name"] = self._format_user_v(f_response["name"])
        item["login"] = f_response["login"].strip()
        item["location"] = self._format_user_v(f_response["location"])
        item["company"] = self._format_user_v(f_response["company"])
        item["twitter_id"] = self._format_user_v(f_response["twitter"])
        item["blog_url"] = self._format_user_v(f_response["website"])
        item["bio"] = self._format_user_v(f_response["bio"])
        item["tagline"] = self._format_user_v(f_response["tagline"])

        item["github_id"] = self._format_user_v(f_response["github_url"])
        item["github_id"] = item["github_id"][len("https://github.com/"):] if item["github_id"] else None
        item["email"] = self._format_user_v(f_response["email"])
        item["avatar_url"] = self._format_user_v(f_response["avatar_url"])
        return item
Example #8
class ZhihuUserSpider(twcrawler.spiders.base.base_spider.BaseSpider):
    name = "zhihu_user"
    allowed_domains = ["zhihu.com"]
    seeds_urls = (
        'http://api.zhihu.com/people/54e5f13a0d10654a461b318d27978341',
        'http://api.zhihu.com/people/0970f947b898ecc0ec035f9126dd4e08',
        'http://api.zhihu.com/people/1d56ab293e529b7d78acfe621d8173ed',
        'http://api.zhihu.com/people/46a588b2d664291f7fb5fec44bc60f71',
        'http://api.zhihu.com/people/08c4f44866e0e339c4e519f4631befd0',
        'http://api.zhihu.com/people/d01aeaca8cc7de042044213bb540268a',
        'http://api.zhihu.com/people/2ae8b3af01d40abc77ebeda7ecc350a9',
        'http://api.zhihu.com/people/44a8c67957c79ce9b229ed2774dcaa1a',
        'http://api.zhihu.com/people/44faf17ff5f5a4ccc4cf9bda47de8da2',
        'http://api.zhihu.com/people/a06cfb38e37dac1658e6457df4d7f032',
        'http://api.zhihu.com/people/b6d28ac2b88b7f230552bab4a0aceaca',
        'http://api.zhihu.com/people/3ec3b166992a5a90a1083945d2490d38',
        'http://api.zhihu.com/people/d073f194bcabc1cec5ef69d0b534de99',
        'http://api.zhihu.com/people/78e3b98074a915b222ae1be4ab038a6e',
        'http://api.zhihu.com/people/b6f80220378c8b0b78175dd6a0b9c680',
        'http://api.zhihu.com/people/9dcef282c46c0342b5f76d6baa0b3631',
        'http://api.zhihu.com/people/227135f01257c4d6bdb21a726ef6d53a',
        'http://api.zhihu.com/people/5dfdfcfb9b22d25bc8b639a86ae3692a',
        'http://api.zhihu.com/people/fcbb342dc45f1c6e2e18925c3d2cf264',
        'http://api.zhihu.com/people/99953853cc4219fabe8327301058357c',
        #'https://api.zhihu.com/people/54e5f13a0d10654a461b318d27978341',
    )
    url_generator = tasks_generator(build_url)
    start_urls = chain(seeds_urls, url_generator)

    #followees_url = 'https://api.zhihu.com/people/%(uid)s/followees?limit=%(limit)d&offset=%(offset)d'
    followees_url = 'http://api.zhihu.com/people/%(uid)s/followees?limit=%(limit)d&offset=%(offset)d'

    def compress(self, item, *foi):
        if not item:
            return item
        ret = {}
        for key in foi:
            ret[key] = item.get(key)
        return ret

    def parse(self, response):
        if 'followees' in response.url:
            for item in self.parse_followees(response):
                yield item
            return

        ret = json.loads(response.body)
        item = ZhihuUserItem()
        for key in item.fields:
            if key == 'uid':
                item[key] = ret.get('id')
            #elif key == 'education':
            #  item[key] = []
            #  for edu in ret.get(key, []):
            #    item[key].append( self.compress(edu, 'id', 'name', 'experience', 'type') )
            #elif key == 'employment':
            #  item[key] = []
            #  raw = ret.get('employment', [])
            #  for emp in raw:
            #    item[key].append({
            #      'id': emp[0].get('id'),
            #      'name': emp[0].get('name'),
            #      'job_name': emp[1].get('name'),
            #      'type': emp[0].get('topic'),
            #      })
            #elif key == 'business':
            #  item[key] = self.compress(ret.get(key), 'id', 'name', 'type')
            #elif key == 'location':
            #  item[key] = []
            #  for loc in ret.get(key, []):
            #    item[key].append( self.compress(loc, 'id', 'name', 'experience', 'type') )
            else:
                item[key] = ret.get(key)

        item['_id'] = item['uid']
        yield item

        offset = 0
        limit = 20
        while offset < int(ret['following_count']):
            url = self.followees_url % ({
                'uid': ret['id'],
                'limit': limit,
                'offset': offset,
            })
            yield scrapy.Request(url)
            #yield scrapy.Request(url, callback=self.parse_followees)
            offset += limit

    def parse_followees(self, response):
        ret = json.loads(response.body)
        item = ZhihuRelationItem()
        item['uid'] = self.get_uid_from_url(response.url)
        item['followees'] = []
        item['_id'] = '%s:%s' % (item['uid'],
                                 self.get_offset_from_url(response.url))
        for people in ret['data']:
            item['followees'].append({
                'uid': people.get('id'),
                #'email': people.get('email'),
                #'gender': people.get('gender'),
                #'sina_weibo_url': people.get('sina_weibo_url'),
                #'sina_weibo_name': people.get('sina_weibo_name'),
                #'headline': people.get('headline'),
                #'description': people.get('description'),
            })
            new_task(people['id'])
        yield item

    def get_uid_from_url(self, url):
        #idx_b = url.find('people/') + len('people/')
        idx_b = url.find('people/') + 7
        idx_e = url[idx_b:].find('/')
        if idx_e == -1:
            return url[idx_b:]
        return url[idx_b:idx_b + idx_e]

    def get_offset_from_url(self, url):
        b = url.find('offset=')
        if b == -1:
            return '0'
        return url[b + 7:]

    def get_fingerprint(self, url):
        return url
Example #9
class V2exSpider(twcrawler.spiders.base.base_spider.BaseSpider):

    name = "v2ex_all"
    allowed_domains = ["www.v2ex.com"]

    max_topic_page = 8000
    base_topic_list = u"https://www.v2ex.com/feed/tab/all.xml?page={}"
    raw_start_urls = [base_topic_list.format(k) for k in xrange(1, max_topic_page)]
    raw_start_urls = []

    redis_start_urls = tasks_generator(build_url)
    start_urls = itertools.chain(raw_start_urls, redis_start_urls)

    #start_urls = ["https://www.v2ex.com/member/Livid"]
    #start_urls = ["https://www.v2ex.com/member/jianshuio"]
    #start_urls = ["https://www.v2ex.com/feed/tab/all.xml?page=1"]
    #start_urls = ["https://www.v2ex.com/api/topics/show.json?id=182731"]
    start_urls = ["https://www.v2ex.com/api/replies/show.json?topic_id=182731"]

    def get_fingerprint(self, task_string):
        return task_string

    def _f_timestr(self, timestamp):
        return datetime.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")

    def _f(self, item):
        return [k.strip() for k in item.extract()]

    def parse(self, response):
        type_ = "_".join(response.url.split("/")[3:-1])
        response_map_type = {
            "feed_tab": self._parse_topic_list_info,
            "api_topics": self._parse_topic_info,
            "api_replies": self._parse_topic_replies_info,
            "api_members": self._parse_user_info,
            "member": self._parse_user_detail_info,
        }
        return response_map_type[type_](response)

    def _parse_topic_list_info(self, response):
        soup = BeautifulSoup(response.body)
        all_link = soup.findAll("link")
        base_topic_url = u"/api/topics/show.json?id={}"
        for link_item in all_link[2:]:
            topic_id = link_item.attrs["href"].split("#")[0].split("/")[-1]
            new_task(base_topic_url.format(topic_id))

    def _parse_topic_info(self, response):
        f_response = json.loads(response.body)
        if not f_response:
            return
        f_response = f_response[0]
        item = V2ExTopicItem()

        item["topic_id"] = f_response["id"]
        item["title"] = f_response["title"]
        item["content"] = f_response["content_rendered"]
        item["author"] = f_response["member"]["username"]

        if item["author"]:
            new_task(u"/api/members/show.json?id={}".format(f_response["member"]["id"]))

        item["node_name"] = f_response["node"]["name"]
        item["created_at"] = self._f_timestr(f_response["created"])
        item["reply_count"] = f_response["replies"]

        if item["reply_count"]:
            new_task(u"/api/replies/show.json?topic_id={}".format(item["topic_id"]))
        return item

    def _parse_topic_replies_info(self, response):
        f_response = json.loads(response.body)
        if not f_response:
            return

        topic_id = response.url.split("?")[-1]
        for reply_item in f_response:
            item = V2ExTopicReplyItem()

            item["reply_id"] = reply_item["id"]
            item["topic_id"] = topic_id
            item["thank_count"] = reply_item["thanks"]
            item["content"] = reply_item["content_rendered"].strip()
            item["author"] = reply_item["member"]["username"]
            if item["author"]:
                new_task("/api/members/show.json?id={}".format(reply_item["member"]["id"]))
            item["created_at"] = self._f_timestr(reply_item["created"])
            yield item

    def _parse_user_info(self, response):
        f_response = json.loads(response.body)
        if not f_response or f_response["status"] == "notfound":
            return

        item = V2ExUserItem()
        item["id"] = f_response["id"]
        item["login"] = f_response["username"]
        item["tagline"] = f_response["tagline"].strip()
        item["description"] = f_response["bio"].strip()
        item["avatar_url"] = f_response["avatar_large"]
        item["created_at"] = self._f_timestr(f_response["created"])
        new_task(u"/member/{}".format(item["login"]))
        return item

    def _parse_user_detail_info(self, response):
        item = V2ExUserDetailItem()
        item["login"] = response.url.split("/")[-1]

        balance_type_list = ["gold", "silver", "bronze"]
        item["balance"] = dict.fromkeys(balance_type_list, 0)
        balance_item = response.xpath("//div[@class='balance_area']")
        if balance_item:
            balance_type = [k.split("/")[-1].split(".")[0].strip() for k in self._f(balance_item.xpath(".//img/@src"))]
            balance_score = [int(k) for k in self._f(balance_item.xpath(".//text()"))]
            for k_, v_ in zip(balance_type, balance_score):
                if k_ in balance_type_list:
                    item["balance"][k_] = v_

        social_link_item = response.xpath("//div[contains(@class, 'markdown_body')][1]/a")
        social_link_map = {
            "social_twitter": "twitter_login",
            "social_home": "blog_url",
            "social_geo": "location",
            "social_instagram": "instagram_login",
            "social_ps": "ps_login",
            "social_twitch": "twitch_login",
            "social_steam": "steam_login",
            "social_bn": "battle_tag_login",
            "social_dribbble": "dribble_login",
            "social_github": "github_login",
            "social_btc": "btc_address"
        }
        for _, v in social_link_map.iteritems():
            item[v] = None

        if social_link_item:
            social_type = [k.split("/")[-1].split(".")[0].strip() for k in
                           self._f(social_link_item.xpath(".//img/@src"))]
            social_value = self._f(social_link_item.xpath(".//text()"))
            for t_, v_ in zip(social_type, social_value):
                if t_ in social_link_map:
                    item[social_link_map[t_]] = v_
        return item