Example #1
    def _craw(self, url, param=None, *args):
        # while True:
        res = requests.get(url, params=param, headers=headers)
        # param["page"] = param["page"] + 1
        if res.status_code == 200:
            # JSON response
            body_json = res.json()
            if body_json:
                res_list = []
                for arti in body_json:
                    arti = arti['object']['data']
                    # skip articles that are already stored
                    data = third_post_db.find_by_pt_id(
                        "jianshu-" + str(arti['id']), self.third_id)
                    if data is None:
                        # build the post record
                        post = ThirdPost(self.third_id, self.third_name, 0)
                        post.tags = ''
                        # order: post id, title, tags, author, like count,
                        # comment count, redirect url, creation time
                        post.post_id = "jianshu-" + str(arti['id'])
                        post.title = arti['title']
                        post.author = arti['user']['nickname']
                        post.content = arti['public_abbr']
                        post.like_num = arti['likes_count']
                        post.comment_num = arti['public_comments_count']
                        post.redirect_url = 'https://www.jianshu.com/p/' + \
                            arti["slug"]
                        post.creatime = arrow.get(
                            arti['first_shared_at']).format(
                                'YYYY-MM-DD HH:mm:ss')
                        res_list.append(post)
                log.info("[%s] crawled -> %s, %d records", self.third_name, url,
                         len(res_list))
                self.batch_insert(res_list)
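The commented-out `while True:` and `param["page"] = param["page"] + 1` lines hint at pagination that was disabled. Below is a minimal standalone sketch of that loop; it assumes the endpoint accepts a `page` query parameter and returns an empty JSON list once exhausted, and the `headers` dict is a placeholder for whatever the original module defines.

import requests

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder; the real headers live in the original module

def fetch_all_pages(url, param):
    # Hypothetical helper: keep requesting pages until one comes back empty.
    param = dict(param or {}, page=1)
    articles = []
    while True:
        res = requests.get(url, params=param, headers=headers)
        if res.status_code != 200:
            break
        body_json = res.json()
        if not body_json:            # an empty list means there are no more pages
            break
        articles.extend(body_json)
        param["page"] += 1           # what the commented-out line would do
    return articles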
Example #2
    def _craw(self, url, param=None, *args):
        # POST the parameters as a JSON-encoded request body
        res = requests.post(url, json.dumps(param), headers=headers)
        if res.status_code == 200:
            # JSON response
            body_json = res.json()
            print(body_json)

            article_list = body_json['data']
            res_list = []

            for post in article_list:
                p = ThirdPost(self.third_id, self.third_name, 0)
                p.title = post['article_title']

                tags = []
                for t in post['topic']:
                    tags.append(t['name'])
                p.tags = ",".join(tags)

                p.post_id = "infoq-" + post['uuid']
                if 'author' in post:
                    p.author = post['author'][0]['nickname']
                else:
                    p.author = "InfoQ"
                p.content = post['article_summary']
                p.redirect_url = "https://www.infoq.cn/article/" + post['uuid']
                p.creatime = arrow.get(post['utime'] /
                                       1000).format('YYYY-MM-DD HH:mm:ss')

                # only keep articles that are not already stored
                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s] crawled -> %s, %d records", self.third_name, url,
                     len(res_list))
            self.batch_insert(res_list)
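Here the serialized dict is passed positionally, which is requests' `data` argument, so the Content-Type header has to come from `headers`. An equivalent and slightly more idiomatic call (supported since requests 2.4.2) lets the library serialize the body and set the header itself:

res = requests.post(url, json=param, headers=headers)  # requests serializes param to JSON and sets Content-Type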
Example #3
    def _craw(self, url, param=None, *args):

        res = requests.get(url)
        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, "html.parser")

            # all article entries
            posts = soup.find_all("div", class_="list_article_item")
            res_list = []

            # parse each article
            for post in posts:

                p = ThirdPost(self.third_id, self.third_name, 0)

                tip_spans = post.find("div", class_="tip").find_all("span")

                # post id
                p.post_id = post['data-id']
                # title
                p.title = post.find("div", class_="title").a.string
                # redirect url
                p.redirect_url = host + post.find("div",
                                                  class_="title").a['href']
                # creation time (the page omits the year, so prepend the current one)
                now_year = datetime.datetime.now().year
                p.creatime = str(now_year) + "-" + tip_spans[2].string.strip()
                # author
                p.author = tip_spans[0].string.strip()
                # tags are supplied by the caller
                p.tags = args[0]

                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)

            log.info("[%s] crawled -> %s, %d records", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
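The chained lookups above (`post.find(...).a.string`, indexing into `tip_spans`) raise `AttributeError` or `IndexError` as soon as the page markup changes, because `find` returns `None` for a missing node. A more defensive variant of the title/URL extraction, purely as a sketch using a hypothetical helper, could look like this:

def find_title_link(post):
    # Hypothetical helper: return (title, href), or None when the markup has changed.
    title_div = post.find("div", class_="title")
    if title_div is None or title_div.a is None:
        return None
    return title_div.a.get_text(strip=True), title_div.a.get("href")

# inside the loop above:
#     parsed = find_title_link(post)
#     if parsed is None:
#         continue          # skip entries that no longer match the expected markup
#     p.title, href = parsed
#     p.redirect_url = host + href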
Example #4
    def _craw(self, url, param=None, *args):

        res = requests.get(url, params=param)
        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, 'html.parser')

            res_list = []

            # all article entries
            posts = soup.find_all("div", class_="article-lwrap")

            for post in posts:

                p = ThirdPost(self.third_id, self.third_name, 0)

                post_a = post.find("a", "title")
                # title
                p.title = post_a.p.string
                # redirect url
                p.redirect_url = host + post_a['href']
                # author
                p.author = post.find("a", class_='nickName').string.strip()
                # creation time (the listing shows no publish date, so stamp the crawl time)
                p.creatime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                # post id
                p.post_id = "/imooc" + post_a['href']
                # tags
                p_skills = post.find("span", class_="skill")
                p_tags = p_skills.find_all("a")
                tags = []
                for tag in p_tags:
                    tags.append(tag.span.string)
                p.tags = ",".join(tags)

                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s] crawled -> %s, %d records",
                     self.third_name, url, len(res_list))
            self.batch_insert(res_list)
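Note that this example calls `datetime.now()` while Example #3 calls `datetime.datetime.now()`; the two spellings imply different imports in their respective modules, illustrated below.

# Example #3 style: import the module and qualify the class.
import datetime
now = datetime.datetime.now()

# Example #4 style: import the class directly (shadows the module name, so use one style per module).
# from datetime import datetime
# now = datetime.now()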
Example #5
    def _craw(self, url, param=None, *args):
        # POST the query parameters as a JSON-encoded request body
        res = requests.post(url, json.dumps(param), headers=header)
        if res.status_code == 200:
            like_total = args[0]  # minimum number of likes required
            # juejin response
            body_json = res.json()
            print(body_json)
            if body_json['data'] is None:
                log.error("crawling juejin failed: %s", body_json['errors'])
                return
            article_list = body_json['data']['articleFeed']['items']['edges']

            res_list = []
            for artiCol in article_list:

                arti = artiCol['node']

                data = third_post_db.find_by_pt_id(
                    arti['id'], self.third_id)

                # only keep new articles above the like threshold
                if data is None and arti['likeCount'] > like_total:
                    # build the post record
                    post = ThirdPost(self.third_id, self.third_name, 0)
                    tags = []
                    for t in arti['tags']:
                        tags.append(t['title'])
                    post.tags = ",".join(tags)
                    # order: post id, title, tags, author, like count,
                    # comment count, redirect url, creation time
                    post.post_id = arti['id']
                    post.title = arti['title']
                    post.author = arti['user']['username']
                    post.content = arti['content']
                    post.like_num = arti['likeCount']
                    post.comment_num = arti['commentsCount']
                    post.redirect_url = arti['originalUrl']
                    post.creatime = arrow.get(
                        arti['createdAt']).format('YYYY-MM-DD HH:mm:ss')

                    res_list.append(post)
            log.info("[%s] crawled -> %s, %d records", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
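All five methods lean on module-level names that are not shown here: `headers`/`header`, `host`, `log`, `ThirdPost`, `third_post_db.find_by_pt_id(post_id, third_id)`, and the crawler's own `batch_insert(posts)`. The sketch below is an assumption inferred from how those names are used above, not code from the original project, but it indicates roughly what the surrounding module needs to provide.

import datetime
import json
import logging

import arrow
import requests
from bs4 import BeautifulSoup

log = logging.getLogger(__name__)
headers = header = {"User-Agent": "Mozilla/5.0"}   # assumed request headers
host = "https://www.example.com"                   # assumed site root for resolving relative hrefs

class ThirdPost:
    # Assumed value object; the field names mirror the attribute accesses above.
    def __init__(self, third_id, third_name, source_type):
        self.third_id = third_id
        self.third_name = third_name
        self.source_type = source_type
        self.post_id = self.title = self.tags = ''
        self.author = self.content = ''
        self.redirect_url = self.creatime = ''
        self.like_num = self.comment_num = 0

# third_post_db is assumed to be a persistence helper exposing
# find_by_pt_id(post_id, third_id) -> existing record or None,
# and the crawler class is assumed to provide batch_insert(posts).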