class CommentsTask(object):
    """Select news that still need comments and load the crawler accounts.

    Class attributes:
        last_ts: Lower bound (inclusive) of the next ``ctime`` window queried
            by :meth:`get_news`; advanced after every poll.
        pg: PostgreSQL helper shared by all instances.
        db_m: Default Mongo database shared by all instances.
    """

    # NOTE(review): the initial value was missing in the source. "Current UTC
    # time" is assumed here; confirm the time zone used by newslist_v2.ctime.
    last_ts = datetime.datetime.utcnow()
    pg = PG()
    db_m = DB_M.get_default_database()

    # Seconds to wait before polling again when a window returned no news.
    _POLL_INTERVAL = 60

    _NEWS_SQL = (
        "select url,title,ctime,docid from newslist_v2 "
        "where ctime >= %s and ctime < %s and comment=0 "
        "and chid not in (28,34,1);"
    )

    def _connect(self):
        """Return a DB-API connection obtained from ``self.pg``.

        NOTE(review): the accessor call was missing in the source;
        ``get_conn()`` is an assumption -- confirm against the PG helper.
        """
        return self.pg.get_conn()

    @staticmethod
    def _close_quietly(cursor):
        """Close *cursor*, logging (not raising) if the driver complains."""
        try:
            cursor.close()
        except Exception as e:
            logging.warning(e, exc_info=True)

    def get_hot_news(self):
        """TODO: fetch hot/recommended news that still need comments.

        :return: currently always ``None`` (not implemented).
        """
        pass

    def get_news(self):
        """Block until at least one news item without comments is found.

        Polls ``newslist_v2`` for rows whose ``ctime`` falls into
        ``[self.last_ts, now)``, sleeping between polls while the window is
        empty.

        :return: list of ``(url, title, ctime, docid)`` tuples.
        """
        news_last = []
        while True:
            current_ts = datetime.datetime.utcnow()
            conn = self._connect()
            cursor = conn.cursor()
            try:
                cursor.execute(self._NEWS_SQL, (self.last_ts, current_ts))
                for news in cursor.fetchall():
                    news_last.append((news[0], news[1], news[2], news[3]))
            except Exception as e:
                logging.warning(e, exc_info=True)
            finally:
                # Close on failure as well; the cursor used to leak whenever
                # the query raised.
                self._close_quietly(cursor)
                # Kept from the original: the window moves forward even when
                # the query failed, so a failed window is not retried.
                self.last_ts = current_ts
            if news_last:
                return news_last
            time.sleep(self._POLL_INTERVAL)

    def get_account_info(self):
        """Load the Weibo accounts used for comment crawling from Mongo.

        :return: list of ``weibo_account`` documents with ``type == "comm"``
            and ``state == True``.
        """
        logging.info("To Get comm-account info")
        comm_info = self.db_m.weibo_account.find({
            "type": "comm",
            "state": True,
        })
        return list(comm_info)
def __init__(self):
    """Attach the PostgreSQL helper and the default Mongo database to *self*.

    NOTE(review): this definition sits at module level and is not attached to
    any class; it looks like a leftover copy of a constructor. Confirm whether
    it can be deleted. The attribute name ``pg`` was missing in the source and
    is assumed.
    """
    self.pg = PG()
    self.db_m = DB_M.get_default_database()
class CommentsUpload(object):
    """Store crawled comments in PostgreSQL and record the outcome in Mongo."""

    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    _SQL_INSERT = (
        "INSERT INTO commentlist_v2 "
        "(cid, docid, uname, avatar, ctime, commend, content) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s);"
    )
    _SQL_UPDATE = "UPDATE newslist_v2 SET comment=comment+1 WHERE docid=%s"

    # (label, comment key) pairs printed by show_comments, in output order.
    _DEBUG_FIELDS = (
        ("User Name: ", "user_name"),
        ("User Logo: ", "user_logo"),
        ("Comment Time: ", "create_time"),
        ("Comment Content: ", "content"),
        ("Like Number: ", "like_number"),
        ("Comment ID: ", "comment_id"),
    )

    def __init__(self):
        # NOTE(review): the attribute name was missing in the source;
        # ``pg`` is assumed.
        self.pg = PG()
        self.db_m = DB_M.get_default_database()

    def _connect(self):
        """Return a DB-API connection obtained from ``self.pg``.

        NOTE(review): the accessor call was missing in the source;
        ``get_conn()`` is an assumption -- confirm against the PG helper.
        """
        return self.pg.get_conn()

    @staticmethod
    def _close_quietly(cursor):
        """Close *cursor*, logging (not raising) if the driver complains."""
        try:
            cursor.close()
        except Exception as e:
            logging.warning(e, exc_info=True)

    @staticmethod
    def _to_bytes(value):
        """Return *value* as UTF-8 bytes so it can be fed to a hash object."""
        text_type = type(u"")
        if not isinstance(value, (bytes, text_type)):
            value = str(value)
        if isinstance(value, bytes):
            return value
        return value.encode("utf-8")

    @staticmethod
    def format_time(t=None):
        """Parse a ``YYYY-mm-dd HH:MM:SS`` string into a ``datetime``.

        :param t: time string; ``None`` or an unparsable value falls back to
            the current UTC time.
        :return: ``datetime.datetime`` instance (never ``None``).
        """
        # NOTE(review): both fallback values were missing in the source; UTC
        # "now" follows the original docstring -- confirm.
        if t is None:
            return datetime.datetime.utcnow()
        try:
            return datetime.datetime.strptime(t, CommentsUpload._TIME_FORMAT)
        except (TypeError, ValueError) as e:
            logging.warning(e, exc_info=True)
            return datetime.datetime.utcnow()

    @classmethod
    def unique(cls, news_id, user_name, content):
        """Build a uniqueness key: MD5 of news id, user name and content.

        Empty or ``None`` *user_name* / *content* are skipped. Text is hashed
        as UTF-8, so non-ASCII names and comments no longer raise.

        :param news_id: news id
        :param user_name: commenter name
        :param content: comment text
        :return: hex digest string
        """
        digest = hashlib.md5()
        digest.update(cls._to_bytes(news_id))
        for part in (user_name, content):
            if part:
                digest.update(cls._to_bytes(part))
        return digest.hexdigest()

    def upload_pg(self, comment, docid):
        """Insert one comment and bump the news comment counter.

        Both statements run in one transaction, which is rolled back if
        either fails.

        :param comment: dict with ``comment_id``, ``user_name``,
            ``user_logo``, ``create_time``, ``like_number`` and ``content``
        :param docid: docid of the news the comment belongs to
        :return: ``True`` when committed, ``False`` when rolled back
        """
        params_insert = (
            str(comment["comment_id"]) + "tacey",
            docid,
            comment["user_name"],
            comment["user_logo"],
            self.format_time(comment["create_time"]),
            comment["like_number"],
            comment["content"],
        )
        params_update = (docid, )

        conn = self._connect()
        cursor = conn.cursor()
        try:
            cursor.execute(self._SQL_INSERT, params_insert)
            cursor.execute(self._SQL_UPDATE, params_update)
            conn.commit()
        except Exception as e:
            logging.warning(e, exc_info=True)
            # Failure is decided before rollback() is attempted, so a
            # rollback error can no longer be reported as success.
            try:
                conn.rollback()
            except Exception as rollback_error:
                logging.warning(rollback_error, exc_info=True)
            return False
        finally:
            self._close_quietly(cursor)
        return True

    def upload_comment(self, docid, comment):
        """Upload a single comment, never raising.

        :param docid: docid of the news the comment belongs to
        :param comment: one comment dict
        :return: ``True`` only when the comment was actually stored
        """
        try:
            flag = self.upload_pg(comment, docid)
        except Exception as e:
            logging.warning(e, exc_info=True)
            return False
        if flag:
            logging.info("Comment Upload Success")
        else:
            logging.warning("Comment Upload Failed")
        # Return the real outcome; a failed insert used to count as success.
        return flag

    def upload_comments(self, news_url, docid, comments, meta):
        """Upload a batch of comments and record the outcome.

        Records result 0 when at least one comment was stored, otherwise 1.

        :param news_url: news URL
        :param docid: docid
        :param comments: list of comment dicts
        :param meta: information about the source Weibo post
        """
        count = 0
        for comment in comments:
            if self.upload_comment(docid, comment):
                count += 1
        result = 0 if count > 0 else 1
        self.stat(news_url, result, meta=meta)

    def stat(self, news_url, result, w_account=None, meta=None):
        """Record the outcome of a comment crawl in ``weibo_comment_stat``.

        :param news_url: news URL
        :param result: outcome code
            0: comments fetched and stored
            1: comments fetched but storing failed
            2: news title too short
            3: no Weibo post with comments found
            4: post with comments found, but no comments fetched
            5: search was banned
        :param w_account: stored as-is in the record
        :param meta: stored as-is in the record
        """
        process_time = datetime.datetime.utcnow()
        self.db_m.weibo_comment_stat.insert({
            "news_url": news_url,
            "result": result,
            "process_time": process_time,
            "w_account": w_account,
            "meta": meta,
        })

    @staticmethod
    def show_comments(comments):
        """Print crawled comments; meant for debugging.

        :param comments: list of comment dicts
        """
        for comment in comments:
            for label, key in CommentsUpload._DEBUG_FIELDS:
                # Single-argument print produces the same text on Python 2
                # and 3: the label, one separating space, then the value.
                print("%s %s" % (label, comment[key]))
            print("=" * 20)
def __init__(self):
    """Attach an uploader, the PostgreSQL helper and a start timestamp to *self*.

    NOTE(review): this definition sits at module level and is not attached to
    any class; it looks like a leftover copy of a constructor. Confirm whether
    it can be deleted. The attribute name ``pg`` and the clock read were
    missing in the source; ``pg`` and current UTC time are assumed.
    """
    self.uploader = CommentsUpload()
    self.pg = PG()
    # Start the first query window ten minutes in the past.
    self.last_ts = datetime.datetime.utcnow() - datetime.timedelta(minutes=10)
class CommentTask(object):
    """Fetch Toutiao comments for hot news and hand them to the uploader."""

    # Seconds to wait before polling again when a window returned no news.
    _POLL_INTERVAL = 60

    _HOT_NEWS_SQL = (
        "select nid,title,docid from newslist_v2 where nid in "
        "(select nid from newsrecommendhot where ctime >= %s and ctime < %s)"
    )

    def __init__(self):
        self.uploader = CommentsUpload()
        # NOTE(review): the attribute name and the clock read were missing in
        # the source; ``pg`` and current UTC time are assumed -- confirm the
        # time zone used by newsrecommendhot.ctime.
        self.pg = PG()
        # Start the first query window ten minutes in the past.
        self.last_ts = (datetime.datetime.utcnow()
                        - datetime.timedelta(minutes=10))

    def _connect(self):
        """Return a DB-API connection obtained from ``self.pg``.

        NOTE(review): the accessor call was missing in the source;
        ``get_conn()`` is an assumption -- confirm against the PG helper.
        """
        return self.pg.get_conn()

    @staticmethod
    def _close_quietly(cursor):
        """Close *cursor*, logging (not raising) if the driver complains."""
        try:
            cursor.close()
        except Exception as e:
            logging.warning(e, exc_info=True)

    def get_hot_news(self):
        """Block until at least one hot news item is found.

        Polls for news recommended as hot within ``[self.last_ts, now)``,
        sleeping between polls while the window is empty.

        :return: list of dicts with keys ``nid``, ``title`` and ``docid``.
        """
        news_last = []
        while True:
            current_ts = datetime.datetime.utcnow()
            conn = self._connect()
            cursor = conn.cursor()
            try:
                cursor.execute(self._HOT_NEWS_SQL, (self.last_ts, current_ts))
                for news in cursor.fetchall():
                    news_last.append({
                        "nid": news[0],
                        "title": news[1],
                        "docid": news[2],
                    })
            except Exception as e:
                logging.warning(e, exc_info=True)
            finally:
                # Close on failure as well; the cursor used to leak whenever
                # the query raised.
                self._close_quietly(cursor)
                # Kept from the original: the window moves forward even when
                # the query failed, so a failed window is not retried.
                self.last_ts = current_ts
            if news_last:
                return news_last
            time.sleep(self._POLL_INTERVAL)

    def get_news(self):
        """Return a fixed single-item news list (hard-coded sample data)."""
        return [{
            "title": "鹿晗首次演唱《如果》",
            "nid": 0,
            "docid": '',
        }]

    def get_comment(self, title):
        """Search Toutiao for *title* and fetch comments of the selected hit.

        :param title: news title used as the search query
        :return: whatever ``Toutiao.get_comments`` returns for the selection
        """
        search_list = Toutiao.get_search_list(title)
        top_select = Toutiao.select_search(search_list)
        try:
            logging.info("[Toutiao-Title]"
                         + top_select.get("title", "Not Get Toutiao-Comment"))
        except Exception as e:
            # Logging the title is best-effort; the fetch below still runs.
            logging.error(e)
        return Toutiao.get_comments(top_select)

    def upload_comment(self, docid, comments):
        """Pass *comments* for *docid* to the uploader."""
        self.uploader.upload_comments(docid=docid, comments=comments)

    def run(self, sleep_min, sleep_max):
        """Process one batch of hot news.

        :param sleep_min: minimum pause in seconds (inclusive)
        :param sleep_max: maximum pause in seconds (inclusive)
        """
        logging.info("->To get the hot news")
        news_list = self.get_hot_news()
        logging.info("->To get comment")
        for news in news_list:
            comments = self.get_comment(news.get("title"))
            self.upload_comment(news.get("docid"), comments)
            # NOTE(review): the collapsed source does not show whether this
            # pause belongs inside the loop; per-item pacing is assumed.
            time.sleep(random.randint(sleep_min, sleep_max))

    def test(self, title):
        """Fetch comments for *title* and print them (debug helper)."""
        comments = self.get_comment(title)
        self.uploader.show_comments(comments)
def __init__(self):
    """Attach the PostgreSQL helper to *self*.

    NOTE(review): this definition sits at module level and is not attached to
    any class; it looks like a leftover copy of a constructor. Confirm whether
    it can be deleted. The attribute name ``pg`` was missing in the source and
    is assumed.
    """
    self.pg = PG()
class CommentsUpload(object):
    """Store crawled comments in PostgreSQL."""

    _SQL_INSERT = (
        "INSERT INTO commentlist_v2 "
        "(cid, docid, uname, avatar, ctime, commend, content) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s);"
    )
    _SQL_UPDATE = "UPDATE newslist_v2 SET comment=comment+1 WHERE docid=%s"

    # (label, comment key) pairs printed by show_comments, in output order.
    _DEBUG_FIELDS = (
        ("User Name: ", "user_name"),
        ("User Logo: ", "user_logo"),
        ("Comment Time: ", "create_time"),
        ("Comment Content: ", "content"),
        ("Like Number: ", "like_number"),
        ("Comment ID: ", "comment_id"),
    )

    def __init__(self):
        # NOTE(review): the attribute name was missing in the source;
        # ``pg`` is assumed.
        self.pg = PG()

    def _connect(self):
        """Return a DB-API connection obtained from ``self.pg``.

        NOTE(review): the accessor call was missing in the source;
        ``get_conn()`` is an assumption -- confirm against the PG helper.
        """
        return self.pg.get_conn()

    @staticmethod
    def _close_quietly(cursor):
        """Close *cursor*, logging (not raising) if the driver complains."""
        try:
            cursor.close()
        except Exception as e:
            logging.warning(e, exc_info=True)

    @staticmethod
    def _to_bytes(value):
        """Return *value* as UTF-8 bytes so it can be fed to a hash object."""
        text_type = type(u"")
        if not isinstance(value, (bytes, text_type)):
            value = str(value)
        if isinstance(value, bytes):
            return value
        return value.encode("utf-8")

    @staticmethod
    def format_time(t=None):
        """Convert a POSIX timestamp into a ``datetime``.

        :param t: timestamp accepted by ``datetime.fromtimestamp``; ``None``
            or an unusable value falls back to the current UTC time.
        :return: ``datetime.datetime`` instance (never ``None``).
        """
        # NOTE(review): both fallback values were missing in the source; UTC
        # "now" follows the original docstring -- confirm.
        if t is None:
            return datetime.datetime.utcnow()
        try:
            return datetime.datetime.fromtimestamp(t)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            logging.warning(e, exc_info=True)
            return datetime.datetime.utcnow()

    @classmethod
    def unique(cls, news_id, user_name, content):
        """Build a uniqueness key: MD5 of news id, user name and content.

        Empty or ``None`` *user_name* / *content* are skipped. Text is hashed
        as UTF-8, so non-ASCII names and comments no longer raise.

        :param news_id: news id
        :param user_name: commenter name
        :param content: comment text
        :return: hex digest string
        """
        digest = hashlib.md5()
        digest.update(cls._to_bytes(news_id))
        for part in (user_name, content):
            if part:
                digest.update(cls._to_bytes(part))
        return digest.hexdigest()

    def upload_pg(self, comment, docid):
        """Insert one comment and bump the news comment counter.

        Both statements run in one transaction, which is rolled back if
        either fails.

        :param comment: dict with ``comment_id``, ``user_name``,
            ``user_logo``, ``create_time``, ``like_number`` and ``content``
        :param docid: docid of the news the comment belongs to
        :return: ``True`` when committed, ``False`` when rolled back
        """
        params_insert = (
            str(comment["comment_id"]) + "tacey",
            docid,
            comment["user_name"],
            comment["user_logo"],
            self.format_time(comment["create_time"]),
            comment["like_number"],
            comment["content"],
        )
        params_update = (docid, )

        conn = self._connect()
        cursor = conn.cursor()
        try:
            cursor.execute(self._SQL_INSERT, params_insert)
            cursor.execute(self._SQL_UPDATE, params_update)
            conn.commit()
        except Exception as e:
            logging.warning(e, exc_info=True)
            # Failure is decided before rollback() is attempted, so a
            # rollback error can no longer be reported as success.
            try:
                conn.rollback()
            except Exception as rollback_error:
                logging.warning(rollback_error, exc_info=True)
            return False
        finally:
            self._close_quietly(cursor)
        return True

    def upload_comment(self, docid, comment):
        """Upload a single comment, never raising.

        :param docid: docid of the news the comment belongs to
        :param comment: one comment dict
        :return: ``True`` only when the comment was actually stored
        """
        try:
            flag = self.upload_pg(comment, docid)
        except Exception as e:
            logging.warning(e, exc_info=True)
            return False
        if flag:
            logging.info("Comment Upload Success")
        else:
            logging.warning("Comment Upload Failed")
        # Return the real outcome; a failed insert used to count as success.
        return flag

    def upload_comments(self, docid, comments):
        """Upload every comment in *comments* for the news *docid*.

        Failures are logged per comment and do not stop the batch.

        :param docid: docid of the news the comments belong to
        :param comments: list of comment dicts
        """
        for comment in comments:
            self.upload_comment(docid, comment)

    @staticmethod
    def show_comments(comments):
        """Print crawled comments; meant for debugging.

        :param comments: list of comment dicts
        """
        for comment in comments:
            for label, key in CommentsUpload._DEBUG_FIELDS:
                # Single-argument print produces the same text on Python 2
                # and 3: the label, one separating space, then the value.
                print("%s %s" % (label, comment[key]))
            print("=" * 20)