def query_status(self, id):
    """Look up the `status` column of one row in the rank table.

    :param id: primary-key value of the row (name kept for backward
        compatibility even though it shadows the builtin ``id``).
    :return: the status value, or None when the query fails.
    """
    # NOTE(review): the id is interpolated straight into the SQL text;
    # safe only while callers pass trusted integer ids — TODO confirm
    # whether StoreMysql exposes a parameterized query API.
    db = StoreMysql(**config.baidu_spider_move)
    query_status_sql = 'select `status` from {} where id = {}'.format(
        self.rank_store.table, id)
    try:
        result = db.query(query_status_sql)
        return result[0][0]
    except Exception:
        # Narrowed from a bare except; still best-effort (returns None).
        print("query_status error")
        traceback.print_exc()
    finally:
        # Always release the connection — the original leaked it whenever
        # query() or the result indexing raised.
        db.close()
def start_requests(self):
    """Poll the keywords table forever and fan out sogou-weixin list-page
    requests (pages 1-10 per keyword) to the spider queue.

    Runs until the process is killed; any exception escapes the loop and
    is printed once before the method returns.
    """
    try:
        while 1:
            # Only fetch new tasks when the downstream pipeline has
            # capacity (semantics of is_get_tasks() defined elsewhere).
            if self.is_get_tasks():
                db = StoreMysql(**config.local_content)
                # "YYYY-MM-DD HH:MM:SS" — drop the microsecond part.
                update_time = str(datetime.now()).split(".")[0]
                # Oldest-updated keywords first, then priority, one batch
                # of self.step rows per poll.
                sql = "select id, keyword from keywords where status = 1 order by update_time asc, priority desc limit 0, {}".format(
                    self.step)
                rows = db.query(sql)
                self.log_record.info(
                    "datetime:{},task_results length:{}".format(
                        datetime.now(), len(rows)))
                ids = list()  # id/update_time pairs to mark as dispatched
                if rows:
                    for word in rows:
                        task_id = word[0]
                        ids.append({
                            "id": task_id,
                            "update_time": update_time
                        })
                        keyword = word[1]
                        # One request per result page, pages 1..10.
                        for i in range(1, 11):
                            send_url = "http://weixin.sogou.com/weixin?query={}&_sug_type_=&s_from=input&_sug_=n&type=2&page={}&ie=utf8".format(
                                keyword, i)
                            urls = [{
                                "url": send_url,
                                "type": 1,
                                "ext_type": 1,
                                'keyword': keyword,
                                'task_id': task_id,
                                'unique_key': self.get_unique_key()
                            }]
                            self.send_get_spider(urls)
                    # Bump update_time on the dispatched rows so the next
                    # poll picks different keywords (type/field semantics
                    # defined by the store helper).
                    self.stores[0].store_table(ids, "keywords", type=2, field="id")
                else:
                    # No pending keywords: back off for 10 minutes.
                    time.sleep(60 * 10)
                db.close()
            # Pace the polling loop: one pass per minute.
            time.sleep(60 * 1)
    except Exception:
        print traceback.format_exc()
def save(self, device, keyword_id, rank):
    """Upsert one daily rank row for (device, keyword).

    Inserts into rank_daily; on duplicate key the row's rank is only
    overwritten when the incoming value is between 1 and 10, so an
    out-of-range re-crawl never clobbers a good rank.

    :param device: platform identifier stored in the `platform` column.
    :param keyword_id: stored as `out_id`.
    :param rank: rank value for today.
    """
    # NOTE(review): values are interpolated into the SQL string; safe only
    # for trusted numeric/enum inputs — TODO check for a parameterized API.
    db = StoreMysql(**config.baidu_spider_move)
    sql = """insert into rank_daily(platform, out_id, rank, date, created_at, updated_at) values('{}', {}, {}, date(now()), now(), now()) on duplicate key update updated_at = now(), rank = case when values(rank) between 1 and 10 then values(rank) else rank end""".format(
        device, keyword_id, rank)
    try:
        db.do(sql)
    except Exception:
        # Narrowed from a bare except; best-effort logging, no re-raise.
        print(traceback.format_exc())
    finally:
        # Single close on every path — the original closed twice when
        # db.do() raised after a successful close, and not at all if the
        # error happened before its in-try close.
        db.close()
def exec_sql(self, sql):
    """Run an arbitrary query against the rank database.

    :param sql: full SQL text to execute (caller is responsible for its
        safety — nothing is escaped here).
    :return: the result rows from StoreMysql.query, or None on failure
        (same contract as the original implementation).
    """
    db = StoreMysql(**config.baidu_spider_move)
    try:
        return db.query(sql)
    except Exception:
        # The original swallowed the error with no trace at all; keep the
        # best-effort None return but make the failure visible.
        traceback.print_exc()
        return None
    finally:
        # One close for both paths instead of duplicating it.
        db.close()
def start_requests(self):
    """Poll content_center.keywords forever and fan out one sogou-weixin
    search request per keyword to the spider queue.

    A new batch is only fetched when every internal queue is below
    self.sended_queue_maxsize, throttling against the downstream pipeline.
    Runs until killed; an exception escapes the loop, is printed once,
    and the method returns.
    """
    try:
        while 1:
            # Backpressure gate: all four pipeline queues must have room.
            if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                    and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                db = StoreMysql(**config.weixin_content)
                source = SourceStore(config.weixin_content)
                # "YYYY-MM-DD HH:MM:SS" — drop the microsecond part.
                update_time = str(datetime.now()).split(".")[0]
                # Oldest-updated keywords first, then priority; one batch
                # of self.step rows per poll.
                sql = "select id, keyword from content_center.keywords order by update_time asc, priority desc limit 0, {};".format(
                    self.step)
                rows = db.query(sql)
                self.log.info("datetime:{},task_results length:{}".format(
                    datetime.now(), len(rows)))
                ids = list()  # id/update_time pairs to mark as dispatched
                if rows:
                    for word in rows:
                        task_id = word[0]
                        ids.append({
                            "id": task_id,
                            "update_time": update_time
                        })
                        keyword = word[1]
                        # Single search-page request per keyword (unlike
                        # the 10-page variant elsewhere in this file).
                        send_url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query={}&ie=utf8&_sug_=y&_sug_type_='.format(
                            keyword)
                        urls = [{
                            "url": send_url,
                            "type": 1,
                            "ext_type": 3,
                            'keyword': keyword,
                            'task_id': task_id,
                            'unique_key': self.get_unique_key()
                        }]
                        self.send_get_spider(urls)
                    # Bump update_time on dispatched rows so the next poll
                    # rotates to other keywords.
                    source.store_table(ids, "keywords", type=2, field="id")
                db.close()
            # Pace the polling loop: one pass every two minutes.
            time.sleep(60 * 2)
    except Exception:
        print traceback.format_exc()
def to_store_results(self, results, stores):
    """Persist crawl results for the weixin pipeline.

    type 1: list page — dedupe on md5(title + account name), then enqueue
            the article URL for detail crawling.
    type 2: detail page — store the article body, rotating to a fresh
            news_N table every self.table_count rows.

    :param results: dict {"result": ..., "type": 1|2} produced upstream.
    :param stores: store helpers; stores[0] is the MySQL-backed one.
    :return: None. All failures are caught and printed (best-effort).
    """
    try:
        result = results["result"]  # payload: list of dicts (type 1) or one dict (type 2)
        type = results["type"]      # dispatch flag (shadows the builtin)
        if type == 1:
            # log_start = time.time()
            for info in result:
                # Dedupe key covers title AND account name, so the same
                # title from different accounts is still crawled.
                log_md5 = UtilMD5.md5(info["title"] + info["we_name"])
                # `insert ignore` + unique md5 column: rowcount > 0 means
                # this item was not seen before.
                sql = "insert ignore into {}(md5) values('{}')".format(
                    self.md5_table, str(log_md5))
                s_id = stores[0].insert_row(sql)
                if s_id > 0:
                    # self.spider_count += 1
                    # New item: request the detail page (ext_type 2).
                    urls = [{
                        "url": info['url'],
                        "type": 1,
                        "ext_type": 2,
                        'info': info,
                        'unique_key': self.get_unique_key()
                    }]
                    self.send_get_spider(urls)
                else:
                    # Duplicate; log the counter once per 1000 repeats.
                    self.repeat_count += 1
                    if self.repeat_count > 1000:
                        self.log_record.info("repeat_count:{}".format(
                            self.repeat_count))
                        self.repeat_count = 0
        elif type == 2:
            data = result
            if not self.judge_china(data["content"]):
                # No Chinese text — skip; log the counter once per 1000.
                self.no_china_count += 1
                if self.no_china_count > 1000:
                    self.log_record.info("no_china_count:{}".format(
                        self.no_china_count))
                    self.no_china_count = 0
                return
            weixin_content = {
                "summary": data.get("summary", ""),
                "content": data.get("content", ""),
                "keyword": data.get("keyword", ""),
                "title": data.get("title", ""),
                "wechat_name": data.get("wechat_name", ""),
                "wechat_num": data.get("wechat_num", "")
            }
            s_id = stores[0].store_table_one(
                weixin_content, "news_{}".format(self.table_index))
            if s_id > 0:
                # Table rollover: when the auto-id hits a multiple of
                # table_count, mark the current news_N full, create
                # news_{N+1} and register it in spider_table.
                if s_id % self.table_count == 0:
                    db = StoreMysql(**config.weixin_content)
                    update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                        self.table_index)
                    db.do(update_sql)
                    self.table_index += 1
                    db.do(self.create_table_sql.format(self.table_index))
                    insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                        self.table_index)
                    db.do(insert_sql)
                    time.sleep(1)
                    db.close()
            else:
                # Store failed — brief backoff before the next item.
                time.sleep(0.1)
            # Pace detail-page storage.
            time.sleep(2)
    except:
        print(traceback.format_exc())
def to_store_results(self, results, stores):
    """Persist crawl results for the toutiao pipeline.

    type 1: list page — dedupe on md5(title) only, then enqueue the
            article URL for detail crawling.
    type 2: detail page — store the article body, rotating to a fresh
            news_N table every self.table_count rows.

    :param results: dict read from the store queue, e.g.
        {'result': {'title': '...', 'abstract': '...'}, 'type': 2}.
    :param stores: store helpers; stores[0] is the MySQL-backed one.
    :return: None. All failures are caught and printed (best-effort).
    """
    try:
        result = results["result"]  # actual payload (dict or list of dicts)
        type = results["type"]      # 1 = list page, 2 = detail page
        if type == 1:
            # log_start = time.time()
            # keyword = results["keyword"]
            for info in result:
                # Dedupe key is the title alone (unlike the weixin
                # variant, which also hashes the account name).
                log_md5 = UtilMD5.md5(info["title"])
                sql = "insert ignore into {}(md5) values('{}')".format(
                    self.md5_table, str(log_md5))
                # insert_row returns the affected rowcount; > 0 means the
                # md5 was new.
                s_id = stores[0].insert_row(sql)
                if s_id > 0:
                    # self.spider_count += 1
                    # New item: request the detail page (ext_type 2).
                    urls = [{
                        "url": info['url'],
                        "type": 1,
                        "ext_type": 2,
                        'info': info,
                        'unique_key': self.get_unique_key()
                    }]
                    self.send_get_spider(urls)
                else:
                    # Duplicate; log the counter once per 1000 repeats.
                    self.repeat_count += 1
                    if self.repeat_count > 1000:
                        self.log_record.info("repeat_count:{}".format(
                            self.repeat_count))
                        self.repeat_count = 0
        elif type == 2:
            data = result
            if not self.judge_china(data["content"]):
                # No Chinese text — skip; log the counter once per 1000.
                self.no_china_count += 1
                if self.no_china_count > 1000:
                    self.log_record.info("no_china_count:{}".format(
                        self.no_china_count))
                    self.no_china_count = 0
                return
            toutiao_content = {
                "category": data.get("category", ""),
                "content": data.get("content", ""),
                "publish_time": data.get("publish_time", ""),
                "title": data.get("title", ""),
                "abstract": data.get("abstract", ""),
                "tags": data.get("tags", ""),
                'url': data.get('url', ''),
                'keyword': data.get('keyword', '')
            }
            s_id = stores[0].store_table_one(
                toutiao_content, "news_{}".format(self.table_index))
            if s_id > 0:
                # Table rollover: when the auto-id hits a multiple of
                # table_count, mark the current news_N full, create
                # news_{N+1} and register it in spider_table.
                if s_id % self.table_count == 0:
                    db = StoreMysql(**config.toutiao_content)
                    update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                        self.table_index)
                    db.do(update_sql)
                    self.table_index += 1
                    db.do(self.create_table_sql.format(self.table_index))
                    insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                        self.table_index)
                    db.do(insert_sql)
                    time.sleep(1)
                    db.close()
            else:
                # Store failed — brief backoff before the next item.
                time.sleep(0.1)
            # Pace detail-page storage (slower than the weixin variant).
            time.sleep(4)
    except:
        print(traceback.format_exc())