Ejemplo n.º 1
0
class CommentsTask(object):
    """
    Task helper for comment crawling.

    Polls the ``newslist_v2`` table for news that still has no comments and
    reads the crawler accounts from the ``weibo_account`` collection.
    """
    # Lower bound of the next polling window. Evaluated once, when the class
    # body runs; get_news() shadows it with a per-instance value after each
    # successful query.
    last_ts = datetime.datetime.now()
    # Connection provider (getconn/putconn) and default database handle.
    # Both are created at class-definition time and shared by all instances.
    pg = PG()
    db_m = DB_M.get_default_database()

    def get_hot_news(self):
        """
        TODO: fetch hot/recommended news whose comments should be crawled.

        Not implemented yet.

        :return: None
        """
        pass

    def get_news(self):
        """
        Block until there is news whose comments should be crawled.

        Queries ``newslist_v2`` for rows with ``comment=0`` (excluding
        channels 28, 34 and 1) whose ``ctime`` lies in
        ``[self.last_ts, now)``. When nothing is found the method sleeps
        60 seconds and polls again.

        The window's lower bound only moves forward after a successful
        query, so a failed query is retried over the same time range
        instead of silently skipping it.

        :return: list of ``(url, title, ctime, docid)`` tuples (never empty)
        """
        news_last = []
        while True:
            current_ts = datetime.datetime.now()
            sql = """select url,title,ctime,docid from newslist_v2
                     where
                     ctime >= %s and ctime < %s
                     and comment=0
                     and chid not in (28,34,1);"""
            params = (self.last_ts, current_ts)
            conn = self.pg.getconn()
            cursor = None
            rows = None
            try:
                cursor = conn.cursor()
                cursor.execute(sql, params)
                rows = [(news[0], news[1], news[2], news[3])
                        for news in cursor.fetchall()]
            except Exception as e:
                logging.warning(e)
            finally:
                # Release the cursor on every path, and always hand the
                # connection back even if closing the cursor fails.
                try:
                    if cursor is not None:
                        cursor.close()
                finally:
                    self.pg.putconn(conn)
            if rows is not None:
                news_last.extend(rows)
                # Advance the window only once its rows were actually read.
                self.last_ts = current_ts
            if news_last:
                break
            time.sleep(60)
        return news_last

    def get_account_info(self):
        """
        Load the accounts used for comment crawling.

        Reads every document of the ``weibo_account`` collection with
        ``type == "comm"`` and ``state == True``.

        :return: list of account documents
        """
        logging.info("To Get comm-account info")
        comm_info = self.db_m.weibo_account.find({
            "type": "comm",
            "state": True
        })
        return list(comm_info)
Ejemplo n.º 2
0
 def __init__(self):
     """Create the PG handle and the default database handle of DB_M."""
     pg_handle = PG()
     # presumably a MongoDB database (get_default_database) -- TODO confirm
     default_db = DB_M.get_default_database()
     self.pg = pg_handle
     self.db_m = default_db
Ejemplo n.º 3
0
class CommentsUpload(object):
    """
    Uploads crawled comments and records the outcome of each crawl.

    Comments are inserted into ``commentlist_v2`` through ``self.pg``;
    crawl statistics are written to the ``weibo_comment_stat`` collection
    of ``self.db_m``.
    """
    def __init__(self):
        self.pg = PG()
        self.db_m = DB_M.get_default_database()

    @staticmethod
    def format_time(t=None):
        """
        Parse a ``"%Y-%m-%d %H:%M:%S"`` time string.

        :param t: time string; ``None`` means "now"
        :return: the parsed ``datetime``; the current local time
                 (``datetime.now()``, not UTC) when ``t`` is ``None`` or
                 cannot be parsed (the parse error is logged)
        """
        if t is None:
            return datetime.datetime.now()
        try:
            return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            logging.warning(e, exc_info=True)
        return datetime.datetime.now()

    @staticmethod
    def _to_bytes(value):
        """Return ``value`` as bytes for hashing; text is encoded as UTF-8."""
        if isinstance(value, bytes):
            return value
        if not isinstance(value, type(u"")):
            value = str(value)
            if isinstance(value, bytes):
                # Python 2: str() already produced a byte string.
                return value
        return value.encode("utf-8")

    @classmethod
    def unique(cls, news_id, user_name, content):
        """
        Build a uniqueness key for a comment.

        The key is the MD5 hex digest of the news id followed by the user
        name and the comment content (each skipped when empty). Text is
        hashed as UTF-8, so non-ASCII names and contents are accepted.

        :param news_id: news id
        :param user_name: user name (may be empty/None)
        :param content: comment content (may be empty/None)
        :return: 32-character hex digest
        """
        m = hashlib.md5()
        m.update(cls._to_bytes(news_id))
        if user_name:
            m.update(cls._to_bytes(user_name))
        if content:
            m.update(cls._to_bytes(content))
        return m.hexdigest()

    def upload_pg(self, comment, docid):
        """
        Insert one comment and bump the comment counter of its news row.

        Both statements run in one transaction: it is committed on success
        and rolled back on any error. The cursor is closed and the
        connection returned to ``self.pg`` on every path.

        :param comment: dict with the keys ``comment_id``, ``user_name``,
                        ``user_logo``, ``create_time``, ``like_number``
                        and ``content``
        :param docid: docid of the news the comment belongs to
        :return: True when the transaction was committed, False otherwise
        :raises KeyError: when ``comment`` lacks one of the keys above
        """
        sql_insert = """INSERT INTO commentlist_v2
                        (cid, docid, uname, avatar, ctime, commend, content)
                        VALUES
                        (%s, %s, %s, %s, %s, %s, %s);"""
        params_insert = (
            # The stored cid is the source comment id plus a fixed suffix.
            str(comment["comment_id"]) + "tacey",
            docid,
            comment["user_name"],
            comment["user_logo"],
            self.format_time(comment["create_time"]),
            comment["like_number"],
            comment["content"],
        )
        sql_update = "UPDATE newslist_v2 SET comment=comment+1 WHERE docid=%s"
        params_update = (docid, )

        flag = True
        conn = self.pg.getconn()
        cursor = None
        try:
            cursor = conn.cursor()
            cursor.execute(sql_insert, params_insert)
            cursor.execute(sql_update, params_update)
            conn.commit()
        except Exception as e:
            logging.warning(e, exc_info=True)
            conn.rollback()
            flag = False
        finally:
            # Always release the cursor, and hand the connection back even
            # if closing the cursor fails.
            try:
                if cursor is not None:
                    cursor.close()
            finally:
                self.pg.putconn(conn)
        return flag

    def upload_comment(self, docid, comment):
        """
        Upload a single comment, never raising.

        :param docid: docid of the news the comment belongs to
        :param comment: a single comment dict
        :return: True only when the comment was stored; False when the
                 upload failed or raised
        """
        try:
            flag = self.upload_pg(comment, docid)
        except Exception as e:
            logging.warning(e, exc_info=True)
            return False
        if flag:
            logging.info("Comment Upload Success")
        else:
            logging.warning("Comment Upload Failed")
        return flag

    def upload_comments(self, news_url, docid, comments, meta):
        """
        Upload a list of comments and record the overall outcome.

        Records result 0 when at least one comment was stored, otherwise 1.

        :param news_url: news URL
        :param docid: docid of the news
        :param comments: list of comment dicts
        :param meta: information about the Weibo post the comments came from
        :return: None
        """
        count = sum(1 for comment in comments
                    if self.upload_comment(docid, comment))
        result = 0 if count > 0 else 1
        self.stat(news_url, result, meta=meta)

    def stat(self, news_url, result, w_account=None, meta=None):
        """
        Record the outcome of a comment crawl.

        Inserts one document, stamped with the current UTC time, into the
        ``weibo_comment_stat`` collection.

        :param news_url: news URL
        :param result: outcome code
                       0: comments fetched and stored
                       1: comments fetched but storing failed
                       2: news title too short
                       3: search found no Weibo post with comments
                       4: a Weibo post with comments was found but no
                          comments could be fetched
                       5: banned while searching for the news
        :param w_account: stored as-is in the record
        :param meta: stored as-is in the record
        :return: None
        """
        process_time = datetime.datetime.utcnow()
        self.db_m.weibo_comment_stat.insert({
            "news_url": news_url,
            "result": result,
            "process_time": process_time,
            "w_account": w_account,
            "meta": meta
        })

    @staticmethod
    def show_comments(comments):
        """
        Print crawled comments; meant for debugging.

        :param comments: list of comment dicts
        :return: None
        """
        fields = (
            ("User Name:       ", "user_name"),
            ("User Logo:       ", "user_logo"),
            ("Comment Time:    ", "create_time"),
            ("Comment Content: ", "content"),
            ("Like Number:     ", "like_number"),
            ("Comment ID:      ", "comment_id"),
        )
        for comment in comments:
            for label, key in fields:
                # A single pre-formatted argument prints the same text under
                # both the Python 2 print statement and the print function.
                print("%s %s" % (label, comment[key]))
            print("=" * 20)
Ejemplo n.º 4
0
 def __init__(self):
     """Create the uploader and PG handle and start the window 10 minutes back."""
     self.uploader = CommentsUpload()
     self.pg = PG()
     # The first polling window reaches 10 minutes into the past.
     lookback = datetime.timedelta(minutes=10)
     self.last_ts = datetime.datetime.now() - lookback
Ejemplo n.º 5
0
class CommentTask(object):
    """
    Fetches comments for hot news via ``Toutiao`` and uploads them.
    """
    def __init__(self):
        self.uploader = CommentsUpload()
        self.pg = PG()
        # The first polling window reaches 10 minutes into the past.
        self.last_ts = datetime.datetime.now() - datetime.timedelta(minutes=10)

    def get_hot_news(self):
        """
        Block until there is hot news whose comments should be crawled.

        Selects the ``newslist_v2`` rows whose ``nid`` appears in
        ``newsrecommendhot`` with ``ctime`` in ``[self.last_ts, now)``.
        When nothing is found the method sleeps 60 seconds and polls again.

        The window's lower bound only moves forward after a successful
        query, so a failed query is retried over the same time range
        instead of silently skipping it.

        :return: list of dicts with the keys ``nid``, ``title`` and
                 ``docid`` (never empty)
        """
        news_last = []
        while True:
            current_ts = datetime.datetime.now()
            sql = """select nid,title,docid from newslist_v2
                     where nid in (select nid from newsrecommendhot where
                     ctime >= %s and ctime < %s)"""
            params = (self.last_ts, current_ts)
            conn = self.pg.getconn()
            cursor = None
            rows = None
            try:
                cursor = conn.cursor()
                cursor.execute(sql, params)
                rows = [{"nid": news[0], "title": news[1], "docid": news[2]}
                        for news in cursor.fetchall()]
            except Exception as e:
                logging.warning(e)
            finally:
                # Release the cursor on every path, and always hand the
                # connection back even if closing the cursor fails.
                try:
                    if cursor is not None:
                        cursor.close()
                finally:
                    self.pg.putconn(conn)
            if rows is not None:
                news_last.extend(rows)
                # Advance the window only once its rows were actually read.
                self.last_ts = current_ts
            if news_last:
                break
            time.sleep(60)
        return news_last

    def get_news(self):
        """
        Return a fixed, single-item news list (hard-coded sample data).

        :return: list with one dict holding ``title``, ``nid`` and ``docid``
        """
        return [{
            "title": "鹿晗首次演唱《如果》",
            "nid": 0,
            "docid": 'https://kuaibao.qq.com/s/20170409G01WXA00'
        }]

    def get_comment(self, title):
        """
        Search ``Toutiao`` for ``title`` and fetch the comments of the hit.

        :param title: news title used as the search query
        :return: whatever ``Toutiao.get_comments`` returns for the selected
                 search result
        """
        search_list = Toutiao.get_search_list(title)
        top_select = Toutiao.select_search(search_list)
        try:
            logging.info("[Toutiao-Title]" +
                         top_select.get("title", "Not Get Toutiao-Comment"))
        except Exception as e:
            # Logging the selected title is best-effort only.
            logging.error(e)
        comments = Toutiao.get_comments(top_select)
        return comments

    def upload_comment(self, docid, comments):
        """
        Upload ``comments`` for the news identified by ``docid``.

        :param docid: docid of the news
        :param comments: list of comment dicts
        :return: None
        """
        self.uploader.upload_comments(docid=docid, comments=comments)

    def run(self, sleep_min, sleep_max):
        """
        Process one batch of hot news: fetch and upload comments for each.

        Sleeps a random whole number of seconds in
        ``[sleep_min, sleep_max]`` after every news item.

        :param sleep_min: lower bound of the pause, in seconds
        :param sleep_max: upper bound of the pause, in seconds
        :return: None
        """
        logging.info("->To get the hot news")
        news_list = self.get_hot_news()
        logging.info("->To get comment")
        for news in news_list:
            title = news.get("title")
            docid = news.get("docid")
            logging.info(title)
            comments = self.get_comment(title)
            self.upload_comment(docid, comments)
            time.sleep(random.randint(sleep_min, sleep_max))

    def test(self, title):
        """
        Fetch the comments for ``title`` and print them (debug helper).

        :param title: news title used as the search query
        :return: None
        """
        comments = self.get_comment(title)
        self.uploader.show_comments(comments)
Ejemplo n.º 6
0
 def __init__(self):
     """Create the PG handle used by this object."""
     pg_handle = PG()
     self.pg = pg_handle
Ejemplo n.º 7
0
class CommentsUpload(object):
    """
    Uploads crawled comments into ``commentlist_v2`` through ``self.pg``.
    """
    def __init__(self):
        self.pg = PG()

    @staticmethod
    def format_time(t=None):
        """
        Convert a POSIX timestamp to a local ``datetime``.

        :param t: timestamp accepted by ``datetime.fromtimestamp``;
                  ``None`` means "now"
        :return: the converted ``datetime``; the current local time
                 (``datetime.now()``, not UTC) when ``t`` is ``None`` or
                 cannot be converted (the error is logged)
        """
        if t is None:
            return datetime.datetime.now()
        try:
            return datetime.datetime.fromtimestamp(t)
        except Exception as e:
            logging.warning(e, exc_info=True)
        return datetime.datetime.now()

    @staticmethod
    def _to_bytes(value):
        """Return ``value`` as bytes for hashing; text is encoded as UTF-8."""
        if isinstance(value, bytes):
            return value
        if not isinstance(value, type(u"")):
            value = str(value)
            if isinstance(value, bytes):
                # Python 2: str() already produced a byte string.
                return value
        return value.encode("utf-8")

    @classmethod
    def unique(cls, news_id, user_name, content):
        """
        Build a uniqueness key for a comment.

        The key is the MD5 hex digest of the news id followed by the user
        name and the comment content (each skipped when empty). Text is
        hashed as UTF-8, so non-ASCII names and contents are accepted.

        :param news_id: news id
        :param user_name: user name (may be empty/None)
        :param content: comment content (may be empty/None)
        :return: 32-character hex digest
        """
        m = hashlib.md5()
        m.update(cls._to_bytes(news_id))
        if user_name:
            m.update(cls._to_bytes(user_name))
        if content:
            m.update(cls._to_bytes(content))
        return m.hexdigest()

    def upload_pg(self, comment, docid):
        """
        Insert one comment and bump the comment counter of its news row.

        Both statements run in one transaction: it is committed on success
        and rolled back on any error. The cursor is closed and the
        connection returned to ``self.pg`` on every path.

        :param comment: dict with the keys ``comment_id``, ``user_name``,
                        ``user_logo``, ``create_time``, ``like_number``
                        and ``content``
        :param docid: docid of the news the comment belongs to
        :return: True when the transaction was committed, False otherwise
        :raises KeyError: when ``comment`` lacks one of the keys above
        """
        sql_insert = """INSERT INTO commentlist_v2
                        (cid, docid, uname, avatar, ctime, commend, content)
                        VALUES
                        (%s, %s, %s, %s, %s, %s, %s);"""
        params_insert = (
            # The stored cid is the source comment id plus a fixed suffix.
            str(comment["comment_id"]) + "tacey",
            docid,
            comment["user_name"],
            comment["user_logo"],
            self.format_time(comment["create_time"]),
            comment["like_number"],
            comment["content"],
        )
        sql_update = "UPDATE newslist_v2 SET comment=comment+1 WHERE docid=%s"
        params_update = (docid, )

        flag = True
        conn = self.pg.getconn()
        cursor = None
        try:
            cursor = conn.cursor()
            cursor.execute(sql_insert, params_insert)
            cursor.execute(sql_update, params_update)
            conn.commit()
        except Exception as e:
            logging.warning(e, exc_info=True)
            conn.rollback()
            flag = False
        finally:
            # Always release the cursor, and hand the connection back even
            # if closing the cursor fails.
            try:
                if cursor is not None:
                    cursor.close()
            finally:
                self.pg.putconn(conn)
        return flag

    def upload_comment(self, docid, comment):
        """
        Upload a single comment, never raising.

        :param docid: docid of the news the comment belongs to
        :param comment: a single comment dict
        :return: True only when the comment was stored; False when the
                 upload failed or raised
        """
        try:
            flag = self.upload_pg(comment, docid)
        except Exception as e:
            logging.warning(e, exc_info=True)
            return False
        if flag:
            logging.info("Comment Upload Success")
        else:
            logging.warning("Comment Upload Failed")
        return flag

    def upload_comments(self, docid, comments):
        """
        Upload every comment in ``comments`` for the news ``docid``.

        Failures of individual comments are logged by ``upload_comment``
        and do not stop the loop.

        :param docid: docid of the news
        :param comments: list of comment dicts
        :return: None
        """
        for comment in comments:
            self.upload_comment(docid, comment)

    @staticmethod
    def show_comments(comments):
        """
        Print crawled comments; meant for debugging.

        :param comments: list of comment dicts
        :return: None
        """
        fields = (
            ("User Name:       ", "user_name"),
            ("User Logo:       ", "user_logo"),
            ("Comment Time:    ", "create_time"),
            ("Comment Content: ", "content"),
            ("Like Number:     ", "like_number"),
            ("Comment ID:      ", "comment_id"),
        )
        for comment in comments:
            for label, key in fields:
                # A single pre-formatted argument prints the same text under
                # both the Python 2 print statement and the print function.
                print("%s %s" % (label, comment[key]))
            print("=" * 20)