Beispiel #1
0
    def parse(self, src, acinfolist):
        """Turn scraped link records into submission rows and store the new ones.

        Each record in ``src`` is read as ``record[0]`` (a site-relative link
        such as ``/v/ac123``) and ``record[1]`` (the raw title text, whose
        first seven characters are dropped).  Records whose link does not
        start with ``/v/ac``, ``/a/ac`` or ``/v/ab`` are ignored, and records
        that raise while being processed are skipped with a logged warning.

        When at least one row with a not-yet-known id was produced, the rows
        are handed to ``self.create_more`` together with the largest numeric
        id seen, and then inserted through
        ``self.ac_comments.db_proc.ACCommentsInfo.insert``.

        Args:
            src: iterable of indexable records as described above.  Records
                are only read, never written to, so tuples work as well as
                lists.
            acinfolist: list of already known submission ids (compared as
                strings).  Ids of newly accepted rows are appended in place.

        Returns:
            The same ``acinfolist`` object that was passed in.
        """
        # Imported locally so the method does not rely on a module-level import.
        import logging

        logger = logging.getLogger(__name__)

        # Link prefix -> (text put in front of the stored id, submission type).
        link_kinds = {
            '/v/ac': ('', '视频'),
            '/a/ac': ('', '文章'),
            # Bangumi ids differ from the other ids, so they are stored with
            # a leading minus sign to tell them apart.
            '/v/ab': ('-', '番剧'),
        }

        # Largest numeric id seen; passed to create_more to look up further
        # submissions below it.
        max_id = 0
        rows = []
        # Set copy of the known ids for O(1) duplicate checks; kept in sync
        # with acinfolist, which is still the object returned to the caller.
        known_ids = set(acinfolist)

        for data in src:
            try:
                link = data[0]
                kind = link_kinds.get(link[0:5])
                if kind is None:
                    continue
                id_prefix, type_name = kind
                id_text = link[5:]

                row = ACcommentsInfoPO()  # holds what was scraped for one submission
                row.set_id(id_prefix + id_text)
                row.set_type(type_name)
                row.set_url(ACFUN + link)

                # NOTE(review): bangumi ids also feed max_id although they are
                # stored as a separate (negated) id space -- confirm whether
                # create_more should see them.
                max_id = max(max_id, int(id_text))

                # Drop the first seven characters of the raw title.  The
                # record itself is left untouched.
                row.set_title(data[1][7:])
                row.set_check_time(str(datetime.datetime.now()))
            except Exception as exc:
                # Best effort: one malformed record must not abort the whole
                # pass, but the failure should at least be visible.
                logger.warning("skipping malformed record %r: %s", data, exc)
                continue

            row_id = str(row.get_id())
            if row_id not in known_ids:
                rows.append(row)
                acinfolist.append(row_id)
                known_ids.add(row_id)

        if rows:
            # Let create_more extend the batch first, then store everything;
            # the submission info goes into its own table.
            self.create_more(rows, max_id, acinfolist)
            self.ac_comments.db_proc.ACCommentsInfo.insert(rows)

        return acinfolist