Code Example #1
    def _craw(self, url, param=None, *args):
        res = requests.post(url, param)
        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, 'html.parser')
            # all posts on the page
            posts = soup.find_all("div", class_="post_item")

            res_list = []
            for post in posts:

                p = ThirdPost(self.third_id, self.third_name, 0)
                post_a = post.find("a", class_="titlelnk")
                # redirect URL
                p.redirect_url = post_a['href']
                # postId
                p.post_id = re.findall(r"/p/(.+?)\.html", p.redirect_url)[0]
                # title
                p.title = post_a.string
                # creation time
                p.creatime = post_a.next_sibling.string

                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s]爬取-> %s  %d条记录", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
Code Example #2
File: jianshu.py Project: luorixin/craw_tech
 def _craw(self, url, param=None, *args):
     # while True:
     res = requests.get(url, params=param, headers=headers)
     # param["page"] = param["page"] + 1
     if res.status_code == 200:
         # response
         body_json = res.json()
         if body_json:
             res_list = []
             for arti in body_json:
                 arti = arti['object']['data']
                 data = third_post_db.find_by_pt_id(
                     "jianshu-" + str(arti['id']), self.third_id)
                 if data is None:
                     # build the post object
                     post = ThirdPost(self.third_id, self.third_name, 0)
                     post.tags = ''
                     # order: post id, title, tags, author, like count, comment count, redirect URL, creation time
                     post.post_id = "jianshu-" + str(arti['id'])
                     post.title = arti['title']
                     post.author = arti['user']['nickname']
                     post.content = arti['public_abbr']
                     post.like_num = arti['likes_count']
                     post.comment_num = arti['public_comments_count']
                     post.redirect_url = 'https://www.jianshu.com/p/' + \
                         arti["slug"]
                     post.creatime = arrow.get(
                         arti['first_shared_at']).format(
                             'YYYY-MM-DD HH:mm:ss')
                     res_list.append(post)
             log.info("[%s]爬取-> %s  %d条记录", self.third_name, url,
                      len(res_list))
             self.batch_insert(res_list)
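
Example #2 above, and several of the examples below, pass a module-level headers/header dict that is not shown in these snippets. It is presumably an ordinary requests header mapping; a hypothetical stand-in, whose field values are assumptions rather than the project's actual ones, could be:

# Hypothetical stand-in for the module-level `headers` used by several crawlers;
# the real values are not shown in these examples. A browser-like User-Agent plus
# a JSON content type covers both the plain GET calls and the json.dumps POSTs.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Content-Type": "application/json",
}
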
Code Example #3
    def _craw(self, url, param=None, *args):
        res = requests.post(url, json.dumps(param), headers=headers)
        if res.status_code == 200:
            # JSON response body
            body_json = res.json()
            print(body_json)

            article_list = body_json['data']
            res_list = []

            for post in article_list:

                p = ThirdPost(self.third_id, self.third_name, 0)

                p.title = post['article_title']

                tags = []
                for t in post['topic']:
                    tags.append(t['name'])
                p.tags = ",".join(tags)
                p.post_id = "infoq-" + post['uuid']
                if 'author' in post.keys():
                    p.author = post['author'][0]['nickname']
                else:
                    p.author = "InfoQ"
                p.content = post['article_summary']
                p.redirect_url = "https://www.infoq.cn/article/" + post['uuid']
                p.creatime = arrow.get(post['utime'] /
                                       1000).format('YYYY-MM-DD HH:mm:ss')
                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s]爬取-> %s   %d条记录", self.third_name, url,
                     len(res_list))
            self.batch_insert(res_list)
Code Example #4
File: third_post_db.py Project: luorixin/craw_tech
 def find_by_pt_id(self, post_id, third_id):
     db = mysql_db.mysql
     sql = "select title from tb_third_post where object_id=%s and third_type=%s"
     try:
         data = db.query_one(sql, (post_id, third_id))
         return data
     except Exception as e:
         log.info("执行Mysql: %s 时出错:%s" % (sql, e))
Code Example #5
File: crawler.py Project: luorixin/craw_tech
 def batch_insert(self, res_list):
     if len(res_list) > 0:
         r_list = []
         for p in res_list:
             item = (p.third_id, p.third_name, p.post_id, p.title, p.tags, p.author, p.content,
                     p.like_num, p.comment_num, p.redirect_url, p.creatime, p.can_analysis)
             r_list.append(item)
         # batch insert into the database
         log.info("执行db操作,%s文章入库", self.third_name)
         third_post_db.batch_insert(r_list)
Code Example #6
File: third_post_db.py Project: luorixin/craw_tech
    def batch_insert(self, rows):

        db = mysql_db.mysql
        sql = "insert into tb_third_post (third_type,third_name," \
              "object_id,title,tags,author,content,like_num,comment_num,redirect_url,creatime,can_analysis,created_at) values(%s,%s,%s,%s," \
              "%s,%s,%s,%s,%s,%s,%s,%s,now())"
        try:
            db.executemany(sql, rows)
        except Exception as e:
            log.info("Error executing MySQL: %s: %s" % (sql, e))
Code Example #7
File: kf_toutiao.py Project: luorixin/craw_tech
    def _craw(self, url, param=None, *args):
        res = requests.get(url, param, headers=headers)
        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, "html.parser")
            # all posts on the page
            posts = soup.find_all("div", class_="post")

            res_list = []
            # parse each post
            for post in posts:
                p = ThirdPost(self.third_id, self.third_name, 0)
                post_content = post.find("div", class_="content")
                meta = post_content.find("div", class_="meta")
                if param is None:
                    user_info = post.find("div", class_="user-info").find(
                        "div", class_="info")
                    # author
                    p.author = user_info.h4.text
                    # postId
                    p.post_id = post_content.p.a["href"]
                    # content
                    p.content = post_content.p.a.string
                else:
                    # author
                    author_wrap = soup.find("div", class_="m-b").h3.text
                    p.author = author_wrap
                    # postId
                    p.post_id = post_content["data-url"]
                    p.content = ""
                # title
                p.title = post_content.h3.a.string
                # like count
                p.like_num = post.find(
                    "a", class_="like-button").find("span").string
                # comment count
                p.comment_num = list(meta.find("span").stripped_strings)[0]
                # redirect URL
                p.redirect_url = host + post_content.h3.a['href']

                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s]爬取-> %s  %d条记录", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
Code Example #8
File: importnew.py Project: luorixin/craw_tech
    def _craw(self, url, param=None, *args):

        res = requests.post(url)

        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, 'html.parser')

            res_list = []

            # all posts on the page
            archive = soup.find("div", id="archive")
            posts = archive.find_all("div", class_="post")
            for post in posts:

                p = ThirdPost(self.third_id, self.third_name, 0)

                post_meta = post.find("div", class_="post-meta")
                post_a = post_meta.find("a", "meta-title")
                # redirect URL
                p.redirect_url = post_a['href']
                # postId
                p.post_id = "importnew-" + re.findall(r"m/(.+?)\.html",
                                                      p.redirect_url)[0]
                # title
                p.title = post_a.string
                # default the author to the platform name
                p.author = self.third_name
                # creation time
                p.creatime = post_a.next_sibling.next_sibling.split(
                    "|")[0].strip()
                # content
                p.content = post.find("span", class_="excerpt").p.string
                if p.content is None:
                    p.content = ""
                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s]爬取-> %s   %d条记录", self.third_name, url,
                     len(res_list))
            self.batch_insert(res_list)
Code Example #9
File: tuiku.py Project: luorixin/craw_tech
    def _craw(self, url, param=None, *args):

        res = requests.get(url)
        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, "html.parser")

            # all posts on the page
            posts = soup.find_all("div", class_="list_article_item")
            res_list = []

            # parse each post
            for post in posts:

                p = ThirdPost(self.third_id, self.third_name, 0)

                tip_spans = post.find("div", class_="tip").find_all("span")

                # postId
                p.post_id = post['data-id']
                # title
                p.title = post.find("div", class_="title").a.string
                # redirect URL
                p.redirect_url = host + post.find("div",
                                                  class_="title").a['href']
                # creation time
                now_year = datetime.datetime.now().year
                p.creatime = str(now_year) + "-" + list(
                    tip_spans)[2].string.strip()
                # author
                p.author = list(tip_spans)[0].string.strip()
                # tags
                p.tags = args[0]

                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)

            log.info("[%s]爬取-> %s  %d条记录", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
Code Example #10
    def _craw(self, url, param=None, *args):

        res = requests.get(url, param)
        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, 'html.parser')

            res_list = []

            # all posts on the page
            posts = soup.find_all("div", class_="article-lwrap")

            for post in posts:

                p = ThirdPost(self.third_id, self.third_name, 0)

                post_a = post.find("a", "title")
                # title
                p.title = post_a.p.string
                # redirect URL
                p.redirect_url = host + post_a['href']
                p.author = post.find("a", class_='nickName').string.strip()
                # creation time
                p.creatime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                # postId
                p.post_id = "/imooc"+post_a['href']
                # tags
                p_skills = post.find("span", class_="skill")
                p_tags = p_skills.find_all("a")
                tags = []
                for tag in p_tags:
                    tags.append(tag.span.string)
                p.tags = ",".join(tags)
                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s]爬取-> %s   %d条记录",
                     self.third_name, url, len(res_list))
            self.batch_insert(res_list)
Code Example #11
    def _craw(self, url, param=None, *args):
        res = requests.post(url, json.dumps(param), headers=header)
        if res.status_code == 200:
            like_total = args[0]  # minimum number of likes required
            # juejin response
            body_json = res.json()
            print(body_json)
            if body_json['data'] is None:
                log.error("爬取掘金失败" + body_json['errors'])
                return
            article_list = body_json['data']['articleFeed']['items']['edges']

            res_list = []
            for artiCol in article_list:

                arti = artiCol['node']

                data = third_post_db.find_by_pt_id(
                    arti['id'], self.third_id)

                if data is None and arti['likeCount'] > like_total:  # keep posts above the like threshold
                    # build the post object
                    post = ThirdPost(self.third_id, self.third_name, 0)
                    tags = []
                    for t in arti['tags']:
                        tags.append(t['title'])
                    post.tags = ",".join(tags)
                    # order: post id, title, tags, author, like count, comment count, redirect URL, creation time
                    post.post_id = arti['id']
                    post.title = arti['title']
                    post.author = arti['user']['username']
                    post.content = arti['content']
                    post.like_num = arti['likeCount']
                    post.comment_num = arti['commentsCount']
                    post.redirect_url = arti['originalUrl']
                    post.creatime = arrow.get(
                        arti['createdAt']).format('YYYY-MM-DD HH:mm:ss')

                    res_list.append(post)
            log.info("[%s]爬取-> %s  %d条记录", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
Code Example #12
File: segment.py Project: luorixin/craw_tech
    def _craw(self, url, param=None, *args):
        res = requests.get(url)
        if res.status_code == 200:
            # HTML document
            htmls = res.text
            soup = BeautifulSoup(htmls, 'html.parser')
            news_list = soup.find("div", class_="news-list")
            # all posts on the page
            posts = news_list.find_all("div", class_="news-item")

            res_list = []

            for post in posts:

                p = ThirdPost(self.third_id, self.third_name, 0)

                post_title = post.find("h4", "news__item-title")
                p.title = post_title.text

                post_href = post.find("a", target="_blank")
                p.redirect_url = host + post_href['href']

                post_author = post.find("span", class_="author")
                p.author = post_author.a.text
                # skip posts published by SegmentFault itself
                if p.author == "SegmentFault":
                    continue
                # creation time
                p.creatime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                # postId
                p.post_id = "segment-" + post_href['href']
                # content
                p.content = post.find("div",
                                      class_="article-excerpt").text.strip()
                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s]爬取-> %s   %d条记录", self.third_name, url,
                     len(res_list))
            self.batch_insert(res_list)
Code Example #13
    def _craw(self, url, param=None, *args):
        res = requests.get(url)
        if res.status_code == 200:
            # HTML document
            htmls = res.text
            soup = BeautifulSoup(htmls, 'html.parser')
            # parse the total page count
            total_box = soup.find("div", class_="statistics_t")
            total = int(total_box.find("span").text)
            total_page = int(math.ceil(total / 20))
            # crawl page by page
            index = 1
            while total_page >= 1 and index <= total_page:
                article_list = []
                param = {"page": index}
                res = requests.get(url, param)
                if res.status_code == 200:
                    # HTML document
                    htmls = res.text
                    soup = BeautifulSoup(htmls, 'html.parser')
                    detail_list = soup.find(
                        "ul", class_="detail_list").find_all("li")
                    for detail in detail_list:
                        href = detail.find("a")["href"]
                        article_list.append(href)

                    res_list = []
                    log.info("该专题有%s篇文章", len(article_list))

                    for i, article in enumerate(article_list):

                        log.info("%s-->%d", article, i + 1)
                        res = requests.get(article)
                        if res.status_code == 200:
                            # HTML document
                            htmls = res.text
                            soup = BeautifulSoup(htmls, 'html.parser')

                            p = ThirdPost(self.third_id, self.third_name,
                                          self.can_analysis)
                            p.redirect_url = article
                            p.post_id = re.findall(r"/article/(.+)",
                                                   p.redirect_url)[0]
                            p.title = soup.find("h1",
                                                class_="title-article").text
                            p.author = soup.find("a", id="uid").text
                            # article content
                            a = soup.find("article")
                            # drop the "read more" box and the copyright block
                            ar = a.find("div", class_="article-copyright")
                            hide = a.find("div", class_="hide-article-box")
                            if hide is not None:
                                hide.replace_with("")
                            if ar is not None:
                                ar.replace_with("")
                            # rewrite image URLs through a proxy to bypass hotlink protection
                            imgList = a.find_all("img")
                            for img in imgList:
                                url_str = "https://www.chaoyer.com/api/file/proxy?proxy=https://blog.csdn.net&img=" + str(
                                    img["src"])
                                img["src"] = url_str
                                p.content = html.escape((str(a)))
                            p.creatime = datetime.now().strftime(
                                "%Y-%m-%d %H:%M:%S")
                            data = third_post_db.find_by_pt_id(
                                p.post_id, p.third_id)
                            if data is None:
                                res_list.append(p)
                    log.info("[%s]爬取-> %s   %d条记录", self.third_name, url,
                             len(res_list))
                    self.batch_insert(res_list)
                # advance to the next page even if the request failed, so the loop cannot run forever
                index = index + 1