def parse_user_details(self, response):
    # Organization pages share this URL shape; hand them off.
    if response.css('.org-name'):
        return self.parse_org_details(response, response.url)
    baseInfo_div = response.css(".vcard")[0]
    des = UserDetailItem()
    des["name"] = baseInfo_div.css(".vcard-fullname")
    des["login"] = baseInfo_div.css(".vcard-username")
    des["company"] = baseInfo_div.css('li[itemprop="worksFor"]')
    des["location"] = baseInfo_div.css('li[itemprop="homeLocation"]')
    des["email"] = baseInfo_div.css(".email")
    des["created_at"] = baseInfo_div.css(".join-date")
    des["blog"] = baseInfo_div.css(".url")
    # Flatten each selector to its text content; missing fields become None.
    for i in des:
        try:
            des[i] = des[i].xpath('string()').extract()[0]
        except IndexError:
            des[i] = None
    orgs = response.xpath(
        '//*[contains(@class, "avatar-group-item")]/@href').extract()
    des["orgs"] = orgs
    new_task(response.url + "/following")
    new_task(response.url + "/followers")
    new_task(response.url + "?tab=repositories")
    new_task("/".join(response.url.split("/")[:-1]) + "/stars/" +
             response.url.split("/")[-1])
    for i in orgs:
        new_task("https://github.com" + i)  # add orgs
    return des
def parse_org_details(self, response, url):
    org_details = OrgDetailItem()
    org_details["login"] = url.split("/")[-1]
    org_details["name"] = response.css(".org-name").xpath('string()').extract()
    try:
        org_details["description"] = response.css(
            ".org-description").xpath('string()').extract()[0]
    except IndexError:
        pass
    # The meta items appear in a fixed order: location, link, email.
    meta_soup = response.css(".meta-item")
    try:
        org_details["location"] = meta_soup[0].xpath('string()').extract()[0]
    except IndexError:
        pass
    try:
        org_details["link"] = meta_soup[1].xpath('string()').extract()[0]
    except IndexError:
        pass
    try:
        org_details["email"] = meta_soup[2].xpath('string()').extract()[0]
    except IndexError:
        pass
    new_task("/".join(url.split("/")[:-1]) + "/orgs/" +
             url.split("/")[-1] + "/people")
    new_task(url + "?page=1")
    return org_details
def _parse_topic_list_info(self, response):
    soup = BeautifulSoup(response.body)
    all_link = soup.findAll("link")
    base_topic_url = u"/api/topics/show.json?id={}"
    # Skip the first two <link> tags, which do not point at topics;
    # each remaining href ends in the topic id.
    for link_item in all_link[2:]:
        topic_id = link_item.attrs["href"].split("#")[0].split("/")[-1]
        new_task(base_topic_url.format(topic_id))
def get_repo_detail(self, response):
    repo_detail = RepoDetail()
    full_name = "/" + response.url.split("/")[-2] + "/" + \
        response.url.split("/")[-1]
    # Empty repositories carry a marker <h3> and nothing else worth parsing.
    try:
        if u'This repository is empty.' == response.xpath(
                "//h3/text()").extract()[0]:
            repo_detail["full_name"] = full_name
            repo_detail["empty"] = True
            return repo_detail
    except IndexError:
        pass
    try:
        repo_detail["description"] = response.css(
            ".repository-meta").xpath('string()').extract()[0]
    except IndexError:
        repo_detail["description"] = None
    if response.css("#readme"):
        repo_detail["readme"] = response.css("#readme").xpath(
            'string()').extract()[0]
    else:
        repo_detail["readme"] = ""
    try:
        repo_detail["fork"] = "/" + response.css(".fork-flag").xpath(
            './/a/text()').extract()[0]
    except IndexError:
        repo_detail["fork"] = None
    # Language stats, e.g. aria-label="Python 82.3%" -> {"Python": "82.3%"}.
    language_list = []
    for i in response.css(".language-color"):
        language_detail = {}
        l = i.xpath(".//@aria-label").extract()
        if l:
            l = l[0]
            language_detail[l.split()[0]] = l.split()[1]
            language_list.append(language_detail)
    nums = response.css(".num")
    repo_detail["master_commit_num"] = int(
        nums[0].xpath('string()').extract()[0].replace(",", "").replace(
            "+", ""))
    repo_detail["branch_num"] = int(
        nums[1].xpath('string()').extract()[0].replace(",", ""))
    repo_detail["release_num"] = int(
        nums[2].xpath('string()').extract()[0].replace(",", ""))
    repo_detail["language_list"] = language_list
    repo_detail["full_name"] = full_name
    counts = response.css('.social-count').xpath('text()').extract()
    repo_detail['watch_count'] = counts[0].strip().replace(',', '')
    repo_detail['star_count'] = counts[1].strip().replace(',', '')
    repo_detail['fork_count'] = counts[2].strip().replace(',', '')
    repo_detail["empty"] = False
    new_task(response.url + "/graphs/contributors-data")
    return repo_detail
def parse_directory(self, response):
    links = response.xpath(self.dir_link_xpath).extract()
    for link in links:
        if link.startswith('http'):
            if 'linkedin.com/pub' in link:
                # Public profile URLs go straight to the user set.
                r = get_redis()
                r.sadd('linkedin:user', link)
                continue
            new_task(link)
def get_user_repos(self, response):
    user_repos_item = UserRepos()
    repos = response.xpath(
        '//h3[@class="repo-list-name"]/a/@href').extract()
    user_repos_item["repos"] = repos
    user_repos_item["login"] = response.url.split("/")[-1].split("?")[0]
    for i in repos:
        new_task("https://github.com" + i)
    return user_repos_item
def __init__(self, *args, **kwargs):
    super(BlogSpider, self).__init__(*args, **kwargs)
    r = get_redis()
    r.delete('blog:tasks:fingerprint')
    # Seed the queue with at most 100 blog URLs from the list file.
    cnt = 0
    with open(os.path.join(cpath, 'github_user_blog_list')) as fi:
        for line in fi:
            cnt += 1
            if cnt > 100:
                break
            new_task(line.strip())
def add_pagination_newtask(self, soup):
    # `soup` is actually a Scrapy response/selector; the name is left over
    # from an earlier BeautifulSoup-based version.
    pages_soup = soup.css(".pagination")
    if pages_soup:
        for i in pages_soup[0].xpath('a/@href').extract():
            try:
                new_task("https://github.com" + i)
            except Exception:
                pass
def _parse_user_tags_info(self, response):
    tags = [
        self._f(k) for k in response.xpath(
            "//li[contains(@class, 'tagPopup')]/a/text()").extract()
    ]
    if tags:
        next_page = response.xpath("//li[@class='next']/a/@href").extract()
        if next_page:
            new_task(next_page[0])
        item = SegmentfaultUserTagItem()
        item["login"] = response.url.split("/")[4]
        item["tags"] = tags
        return item
def _parse_user_follower_info(self, response):
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    login = response.url.split("/")[4]
    for follower_item in response.xpath(
            "//div[contains(@class, 'stream-list__item')]/a/@href"):
        item = SegmentfaultUserFollowerItem()
        item["login"] = login
        item["follower"] = self._f(follower_item.extract()).split("/")[2]
        yield item
def __init__(self, *args, **kwargs):
    super(LinkedinSpider, self).__init__(*args, **kwargs)
    filepath = os.path.join(get_main_path(),
                            'spiders/linkedin/pub_china_user_list')
    r = get_redis()
    r.delete('linkedin:tasks:fingerprint')
    # Seed the queue with roughly the first 100 profile URLs.
    with open(filepath) as fi:
        cnt = 0
        for line in fi:
            new_task(line.strip())
            cnt += 1
            if cnt > 100:
                break
def _parse_user_bookmark_info(self, response):
    login = response.url.split("/")[4]
    for bookmark_list in response.xpath(
            "//section[contains(@class, 'stream-list__item')]"):
        if bookmark_list:
            item = SegmentfaultUserBookmarkItem()
            item["login"] = login
            link = self._f(
                bookmark_list.xpath(".//strong/a/@href")[0].extract())
            item["bookmark_id"] = link.split("/")[2]
            new_task(link)
            yield item
def get_org_repos(self, response):
    repos_list = response.xpath("//h3//a/@href").extract()
    org_repos_item = OrgRepos()
    org_repos_item["repos"] = repos_list
    org_repos_item["login"] = response.url.split("/")[-1].split("?")[0]
    self.add_pagination_newtask(response)
    for i in repos_list:
        new_task("https://github.com" + i)
    return org_repos_item
def get_org_members(self, response):
    member_list_soup = response.css(".member-username")
    org_members_item = OrgMembers()
    members_list = []
    for i in member_list_soup:
        members_list.append(i.xpath('string()').extract()[0])
    org_members_item['members'] = members_list
    for i in members_list:
        new_task("https://github.com/" + i)
    org_members_item["login"] = response.url.split("/")[-2]
    self.add_pagination_newtask(response)
    return org_members_item
def _parse_user_info(self, response):
    f_response = json.loads(response.body)
    if not f_response or f_response["status"] == "notfound":
        return
    item = V2ExUserItem()
    item["id"] = f_response["id"]
    item["login"] = f_response["username"]
    item["tagline"] = f_response["tagline"].strip()
    item["description"] = f_response["bio"].strip()
    item["avatar_url"] = f_response["avatar_large"]
    item["created_at"] = self._f_timestr(f_response["created"])
    # Also queue the member's HTML profile page.
    new_task(u"/member/{}".format(item["login"]))
    return item
def get_user_following(self, response):
    user_following_item = UserFollowing()
    following = response.xpath(
        '//div[@class="users"]//h3//a/@href').extract()
    user_following_item["following"] = following
    user_following_item["login"] = response.url.split("/")[-2]
    if "page=" not in response.url:
        # First page: read the total count and queue every page.
        following_num = int(
            response.css(".counter")[0].xpath(
                'string()').extract()[0].replace(",", ""))
        for j in xrange(following_num / self.following_num + 1):
            new_task(response.url + "?page=" + str(j + 1))
    for i in following:
        new_task("https://github.com" + i)
    return user_following_item
def parse(self, response):
    """
    Do not use yield in this method; if there are multiple
    results, just return them as a list.
    """
    ids = response.xpath('//a/text()').extract()
    for id in ids:
        # Tasks go into Redis, which deduplicates them there.
        new_task(id.encode('utf-8'), self.get_fingerprint)
        # Alternatively let scrapy schedule the request itself,
        # but then no deduplication happens:
        #yield Request('http://127.0.0.1:8080/%s' % id.encode('utf-8'))
    item = TestItem()
    item['uid'] = response.url
    return [item]
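# new_task is defined elsewhere in the project. Below is a minimal sketch of
# how its Redis-backed dedup plausibly works, inferred from the
# r.delete('...:tasks:fingerprint') calls in the spider constructors and the
# optional get_fingerprint argument above. The queue and fingerprint key
# names are assumptions, not the project's actual implementation.
import hashlib

def new_task(url, get_fingerprint=None, prefix='blog'):
    r = get_redis()
    fp = get_fingerprint(url) if get_fingerprint \
        else hashlib.sha1(url).hexdigest()
    # SADD returns 1 only for members not already in the set,
    # so each fingerprint is queued at most once.
    if r.sadd('%s:tasks:fingerprint' % prefix, fp):
        r.lpush('%s:tasks' % prefix, url)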
def get_user_followers(self, response):
    user_followers_item = UserFollowers()
    followers = response.css(".follow-list").xpath("li/a/@href").extract()
    user_followers_item["followers"] = followers
    user_followers_item["login"] = response.url.split("/")[-2]
    if "page=" not in response.url:
        # First page: read the total count and queue every page.
        followers_num = int(
            response.css(".counter")[0].xpath(
                'string()').extract()[0].replace(",", ""))
        for j in xrange(followers_num / self.followers_num + 1):
            new_task(response.url + "?page=" + str(j + 1))
    for i in followers:
        new_task("https://github.com" + i)
    return user_followers_item
def get_user_stared(self, response):
    user_stared = response.css(".repo-list-item")
    user_stars_item = UserStars()
    stars = []
    for i in user_stared:
        one_star = {}
        one_star["name"] = i.css("h3").css("a::attr(href)").extract()[0]
        one_star["time"] = i.css(".repo-list-meta")[0].css(
            "time::attr(datetime)").extract()[0]
        stars.append(one_star)
    user_stars_item["stars"] = stars
    user_stars_item["login"] = response.url.split("/")[-1].split("?")[0]
    # Follow the pagination links, if any.
    next_soup = response.css(".pagination")
    if next_soup:
        for i in next_soup[0].xpath("a/@href").extract():
            new_task(i)
    return user_stars_item
def _parse_topic_info(self, response):
    f_response = json.loads(response.body)
    if "error" in f_response:
        return None
    item = RubychinaTopicItem()
    item["id"] = f_response["id"]
    item["title"] = f_response["title"]
    item["created_at"] = f_response["created_at"]
    item["replies_count"] = f_response["replies_count"]
    item["owner_login"] = f_response["user"]["login"]
    item["content"] = f_response["body_html"]
    item["hits"] = f_response["hits"]
    # Queue every replier's profile for crawling.
    for reply_item in f_response["replies"]:
        new_task(reply_item["user"]["login"])
    return item
def _parse_topic_replies_info(self, response):
    f_response = json.loads(response.body)
    if not f_response:
        return
    topic_id = response.url.split("?")[-1]
    for reply_item in f_response:
        item = V2ExTopicReplyItem()
        item["reply_id"] = reply_item["id"]
        item["topic_id"] = topic_id
        item["thank_count"] = reply_item["thanks"]
        item["content"] = reply_item["content_rendered"].strip()
        item["author"] = reply_item["member"]["username"]
        if item["author"]:
            new_task("/api/members/show.json?id={}".format(
                reply_item["member"]["id"]))
        item["created_at"] = self._f_timestr(reply_item["created"])
        yield item
def parse_followees(self, response):
    ret = json.loads(response.body)
    item = ZhihuRelationItem()
    item['uid'] = self.get_uid_from_url(response.url)
    item['followees'] = []
    # One item per result page, keyed by uid and page offset.
    item['_id'] = '%s:%s' % (item['uid'],
                             self.get_offset_from_url(response.url))
    for people in ret['data']:
        item['followees'].append({
            'uid': people.get('id'),
            # Other fields available in the API response:
            #'email': people.get('email'),
            #'gender': people.get('gender'),
            #'sina_weibo_url': people.get('sina_weibo_url'),
            #'sina_weibo_name': people.get('sina_weibo_name'),
            #'headline': people.get('headline'),
            #'description': people.get('description'),
        })
        new_task(people['id'])
    yield item
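# get_uid_from_url and get_offset_from_url are defined elsewhere in the
# spider. Hypothetical reconstructions, assuming followee API URLs shaped
# like https://www.zhihu.com/api/v4/members/<uid>/followees?offset=20 --
# the real helpers may differ:
import urlparse

def get_uid_from_url(url):
    # .../members/<uid>/followees -> <uid>
    return urlparse.urlparse(url).path.rstrip('/').split('/')[-2]

def get_offset_from_url(url):
    query = urlparse.parse_qs(urlparse.urlparse(url).query)
    return query.get('offset', ['0'])[0]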
def _parse_topic_info(self, response):
    item = SegmentfaultTopicItem()
    item["question_id"] = response.url.split("/")[4]
    item["title"] = self._f(
        response.xpath("//h1[@id='questionTitle']/a/text()")[0].extract())
    item["author"] = self._f(
        response.xpath("//div[@class='author']/a/@href")
        [0].extract().split("/")[-1])
    item["created_at"] = self._format_timestr(
        self._f("".join([
            k.strip() for k in response.xpath(
                "//div[@class='author']/text()").extract()
        ]).split()[0]))
    # The sidebar lists follower, collection, and hit counts in order.
    topic_status = response.xpath(
        '//div[@class="col-md-3"][1]/ul/li/strong/text()')
    item["follower_count"] = self._format_count(
        self._f(topic_status[0].extract()))
    item["collection_count"] = self._format_count(
        self._f(topic_status[1].extract()))
    item["hit_count"] = self._format_count(
        self._f(topic_status[2].extract()))
    question_tag = response.xpath(
        "//article[@class='widget-question__item']")
    item["like_count"] = self._format_count(
        self._f(question_tag.xpath(
            ".//span[@class='count']/text()")[0].extract()))
    item["question"] = self._f(
        question_tag.xpath(".//div[@class='post-offset']/div[1]")
        [0].extract())
    item["tags"] = [
        self._f(k) for k in question_tag.xpath(
            ".//div[@class='post-offset']/ul[1]/li/a/text()").extract()
    ]
    answer_list = []
    for answer_item in response.xpath(
            "//article[contains(@class, 'widget-answers__item')]"):
        answer_id = self._f(answer_item.xpath(".//@id")[0].extract())
        answer_like_count = self._format_count(
            self._f(answer_item.xpath(
                ".//span[@class='count']/text()")[0].extract()))
        answer_accepted = bool(
            answer_item.xpath(".//div[@class='accepted-flag']"))
        answer_author = self._f(
            answer_item.xpath(".//div[@class='post-offset']/a/@href")
            [0].extract())
        answer_created_at = self._format_timestr(
            self._f(answer_item.xpath(
                ".//span[contains(@class, 'text-muted')]/text()")
                [0].extract().strip().split()[0]))
        answer = self._f(
            answer_item.xpath(".//div[contains(@class, 'answer')]")
            [0].extract())
        answer_list.append(
            dict(like_count=answer_like_count,
                 accepted=answer_accepted,
                 author=answer_author,
                 created_at=answer_created_at,
                 answer=answer,
                 answer_id=answer_id))
    item["answers"] = answer_list
    # Queue every user profile linked from the page.
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    return item
def _parse_article_info(self, response):
    item = SegmentfaultArticleItem()
    item["article_id"] = response.url.split("/")[4]
    item["title"] = self._f(
        response.xpath("//h1[@id='articleTitle']/a/text()")[0].extract())
    item["author"] = self._f(
        response.xpath("//div[@class='author']/a/@href")
        [0].extract().split("/")[-1])
    item["created_at"] = self._format_timestr(
        self._f(response.xpath("//div[@class='author']/text()")
                [-1].extract().strip().split()[0]))
    # The sidebar lists follower, collection, and hit counts in order.
    topic_status = response.xpath(
        '//div[@class="col-md-3"][1]/ul/li/strong/text()')
    item["follower_count"] = self._format_count(
        self._f(topic_status[0].extract()))
    item["collection_count"] = self._format_count(
        self._f(topic_status[1].extract()))
    item["hit_count"] = self._format_count(
        self._f(topic_status[2].extract()))
    item["article"] = self._f(
        response.xpath("//div[contains(@class, 'article')]")[0].extract())
    item["tags"] = list(set([
        self._f(k) for k in response.xpath(
            "//li[@class='tagPopup']/a/text()").extract()
    ]))
    # Queue every user profile linked from the page.
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    return item
def _parse_topic_info(self, response):
    f_response = json.loads(response.body)
    if not f_response:
        return
    f_response = f_response[0]
    item = V2ExTopicItem()
    item["topic_id"] = f_response["id"]
    item["title"] = f_response["title"]
    item["content"] = f_response["content_rendered"]
    item["author"] = f_response["member"]["username"]
    if item["author"]:
        new_task(u"/api/members/show.json?id={}".format(
            f_response["member"]["id"]))
    item["node_name"] = f_response["node"]["name"]
    item["created_at"] = self._f_timestr(f_response["created"])
    item["reply_count"] = f_response["replies"]
    if item["reply_count"]:
        new_task(u"/api/replies/show.json?topic_id={}".format(
            item["topic_id"]))
    return item
def _parse_user_info(self, response):
    item = SegmentfaultUserItem()
    profile_tag = response.xpath(
        "//div[contains(@class, 'profile-header')]")
    item["login"] = response.url.split("/")[4]
    item["avatar_url"] = self._f(
        profile_tag.xpath(".//img/@src")[0].extract())
    if not item["avatar_url"].startswith("http"):
        item["avatar_url"] = None
    item["name"] = self._f(profile_tag.xpath(".//h4/text()")[0].extract())
    item["social_contact"] = self._format_social_contact(
        profile_tag.xpath(".//li/a"))
    # Map the Chinese profile labels to item fields:
    # location, position, education, blog URL.
    profile_item_map = {
        u"所在城市": "location",
        u"现任职位": "position",
        u"院校专业": "education",
        u"个人网站": "blog_url",
    }
    item.update([(v, None) for _, v in profile_item_map.iteritems()])
    item["major"] = None
    for profile_item in response.xpath(
            "//ul[contains(@class, 'profile-links')]/li"):
        # Strip the markup, then split each "label:value" row on the colon.
        f_profile_item = [
            self._f(k) for k in re.sub(
                '<[^>]*>', '', profile_item.extract()).strip().split(u":")
        ]
        key = f_profile_item[0]
        value = " ".join(f_profile_item[1:])
        if key in profile_item_map:
            item[profile_item_map[key]] = value
    if item["education"]:
        # "school major" -> separate education and major fields.
        education_list = item["education"].split(" ")
        item["education"] = self._f(education_list[0])
        if len(education_list) > 1:
            item["major"] = self._f(u" ".join(education_list[1:]))
    item["follower_count"] = self._format_count(
        self._f(
            response.xpath("//a[@class='funsCount']/text()")[0].extract()))
    profile_detail_tag = response.xpath("//div[@class='col-md-4 profile']")
    reputation_count, emblem_count, like_count = [
        self._f(k.extract())
        for k in profile_detail_tag.xpath(".//strong/text()")[:3]
    ]
    item["reputation_count"] = self._format_count(reputation_count)
    item["emblem_count"] = self._format_count(emblem_count)
    item["like_count"] = self._format_count(like_count)
    item["introduction"] = self._f(
        profile_detail_tag.xpath(
            ".//div[contains(@class, 'profile-bio')]")[0].extract())
    profile_info_list = response.xpath(
        "//div[@class='col-md-4 profile']/following-sibling::div/ul/li")
    item["answer_count"] = self._format_count(
        self._f(profile_info_list[3].xpath(".//span/text()")[0].extract()))
    item["question_count"] = self._format_count(
        self._f(profile_info_list[4].xpath(".//span/text()")[0].extract()))
    item["article_count"] = self._format_count(
        self._f(profile_info_list[5].xpath(".//span/text()")[0].extract()))
    item["following_tag_count"] = self._format_count(
        self._f(profile_info_list[-1].xpath(
            ".//span[@class='badge']/text()")[0].extract()))
    # Queue every paginated listing for this user.
    user_href = "/" + "/".join(response.url.split("/")[3:])
    if item["answer_count"]:
        pages = int(
            math.ceil(item["answer_count"] /
                      float(self.answer_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/answers?page={}".format(user_href, i))
    if item["question_count"]:
        pages = int(
            math.ceil(item["question_count"] /
                      float(self.question_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/questions?page={}".format(user_href, i))
    if item["article_count"]:
        pages = int(
            math.ceil(item["article_count"] /
                      float(self.article_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/blogs?page={}".format(user_href, i))
    if item["following_tag_count"]:
        pages = int(
            math.ceil(item["following_tag_count"] /
                      float(self.following_tag_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/following/tags?page={}".format(user_href, i))
    if item["follower_count"]:
        pages = int(
            math.ceil(item["follower_count"] /
                      float(self.follower_items_each_page)))
        for i in xrange(1, pages + 1):
            new_task("{}/followed/users?page={}".format(user_href, i))
    # tags and bookmarks
    new_task("{}/tags?page={}".format(user_href, 1))
    new_task("{}/bookmarks?page={}".format(user_href, 1))
    new_task("{}/bookmarks?page={}".format(user_href, 2))
    member_list = [
        self._f(k) for k in set(
            response.xpath(
                "//a[re:test(@href, '^\/u\/([^\/])*$')]/@href").extract())
    ]
    for member in member_list:
        new_task(member)
    return item
def _parse_toplic_and_article_info(self, response):
    xpath_list = "//h2[@class='title']/a/@href"
    for topic_href in response.xpath(xpath_list):
        new_task(topic_href.extract().strip())