def find_how_many_is_spider():
    # Count how many zhubo (hosts) are currently marked as living.
    MYSQL_CONN = MYSQL()
    zhubo_rows = MYSQL_CONN.select_from_table(ZHUBO_LIVE_TABLE, "is_live = 1")
    return sum(1 for _ in zhubo_rows)

def delete_live_goods_temp(live_id):
    MYSQL_CONN = MYSQL()
    try:
        MYSQL_CONN.delete_from_table(LIVE_GOODS_TEMP_TABLE,
                                     "live_id={}".format(live_id))
    except Exception as e:
        logging.error(live_id)
        logging.error(e)

def spider_basic(user_id):
    # Crawl one zhubo's live-room page, then fetch the broadcaster info and
    # goods list APIs, merge the results and store them in LIVE_BASIC_TABLE.
    MYSQL_CONN = MYSQL()
    url_info = "https://taobaolive.taobao.com/api/broadcaster_info/1.0?accountId={}".format(user_id)
    url_goods_list = "https://taobaolive.taobao.com/api/item_list/1.0?type=0&liveId="
    url_live = "https://taobaolive.taobao.com/room/index.htm?userId={}".format(user_id)
    HEADERS = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        # "Host": "taobaolive.taobao.com",
        "Referer": url_live,
    }
    SESSION.headers.update(HEADERS)
    response = session_get(url_live)
    response_text = response.text
    live_id = re.search(r'liveId":(\d+)', response_text).group(1)

    goods_json_dict = get_goods_list(url_goods_list, live_id)
    info_json_dict = get_info(url_info)
    live_basic_info = dict(goods_json_dict, **info_json_dict)
    live_basic_info["zhubo_id"] = user_id
    live_basic_info["live_id"] = live_id
    # A live_id of "0" means the zhubo is not broadcasting at the moment.
    if live_id == str(0):
        live_basic_info["is_live"] = 0
    else:
        live_basic_info["is_live"] = 1
    live_basic_info["crawl_time"] = datetime.now()
    live_basic_info["live_url"] = url_live

    logging.info("Spider one item into {}".format(LIVE_BASIC_TABLE))
    MYSQL_CONN.insert_into_table(live_basic_info, LIVE_BASIC_TABLE)

def get_zhubo_id_list():
    zhubo_id_list = []
    MYSQL_CONN = MYSQL()
    zhubo_id_rows = MYSQL_CONN.select_from_table(ZHUBO_INFO_TABLE, [])
    for each_zhubo in zhubo_id_rows:
        zhubo_id_list.append(each_zhubo["zhubo_id"])
    return zhubo_id_list

def insert_to_db(results, which_table):
    MYSQL_CONN = MYSQL()
    for each_result in results:
        try:
            MYSQL_CONN.insert_into_table(each_result, which_table)
            logging.info("spider one item into {}".format(which_table))
        except Exception as e:
            logging.error(str(each_result))
            logging.error(e)

def get_goods_id_list():
    goods_id_list = []
    MYSQL_CONN = MYSQL()
    live_goods_rows = MYSQL_CONN.select_from_table(LIVE_GOODS_TABLE, [])
    for each_goods in live_goods_rows:
        goods_id_list.append(each_goods["goods_id"])
    return goods_id_list

def get_goods_id_list_from_temp(live_id):
    goods_id_list = []
    MYSQL_CONN = MYSQL()
    live_goods_rows = MYSQL_CONN.select_from_table(LIVE_GOODS_TEMP_TABLE,
                                                   "live_id={}".format(live_id))
    for each_goods in live_goods_rows:
        goods_id_list.append(each_goods["goods_id"])
    return goods_id_list

def get_rows():
    # Yield every goods_id from the crawled live-goods table, then close the connection.
    MYSQL_CONN = MYSQL()
    rows = MYSQL_CONN.select_from_table("live_taobao_webstar_crawl_live_goods", [])
    for row in rows:
        yield row["goods_id"]
    MYSQL_CONN.close_db()

def update_zhubo_from_db():
    # Re-check the zhubo currently marked as not living and yield the ones
    # that have started broadcasting again.
    MYSQL_CONN = MYSQL()
    zhubo_rows = MYSQL_CONN.select_from_table(ZHUBO_LIVE_TABLE, "is_live != 1")
    for zhubo_row in zhubo_rows:
        zhubo_id = zhubo_row["zhubo_id"]
        if str(0) != get_live_id(zhubo_id):
            logging.info("{} is living!.........".format(zhubo_id))
            yield zhubo_id
        else:
            logging.info("{} is not living!".format(zhubo_id))

def from_live_goods_to_temp(live_id):
    # Copy the (goods_id, live_id) pairs of one live broadcast from
    # LIVE_GOODS_TABLE into the temporary table.
    goods_live_id_list = []
    MYSQL_CONN = MYSQL()
    live_goods_rows = MYSQL_CONN.select_from_table(LIVE_GOODS_TABLE,
                                                   "live_id={}".format(live_id))
    for each_goods in live_goods_rows:
        goods_live_id_list.append({"goods_id": each_goods["goods_id"],
                                   "live_id": each_goods["live_id"]})
    insert_to_db(goods_live_id_list, LIVE_GOODS_TEMP_TABLE)

class TASK_OBJECT(object):
    '''
    A generic crawl task. It takes 8 parameters:
    from_table: the table the module reads its input rows from
    from_table_condition: the WHERE condition for that select; the default is an empty list
    need_to_update: whether each row must be refreshed via get_update_state before use
    which_module: the crawl function run on every input value
    into_table: the table the module's results are inserted into
    need_to_return: whether the worker threads return data that has to be stored
    which_need_in_row: the column of each row whose value is passed to which_module
    update_into_table: whether inserts should replace existing rows
    '''

    def __init__(self, from_table, from_table_condition, need_to_update,
                 which_module, into_table, need_to_return, which_need_in_row,
                 update_into_table):
        super(TASK_OBJECT, self).__init__()
        self.MYSQL_CONN = MYSQL()
        self.from_table = from_table
        self.from_table_condition = from_table_condition
        self.need_to_update = need_to_update
        self.which_module = which_module
        self.into_table = into_table
        self.need_to_return = need_to_return
        self.which_need_in_row = which_need_in_row
        self.update_into_table = update_into_table

    def get_rows(self):
        # Read the source rows, optionally refreshing each one first.
        rows = self.MYSQL_CONN.select_from_table(self.from_table,
                                                 self.from_table_condition)
        for row in rows:
            if self.need_to_update:
                row = get_update_state(row)
            if row:
                yield row

    def multiprocess_task(self, new_list):
        # Run which_module over one batch with a thread pool.
        pool = ThreadPool(THREAD_NUM)
        results = pool.map(self.which_module, new_list)
        pool.close()
        pool.join()
        if self.need_to_return:
            return results

    def insert_to_db(self, results):
        for each_result in results:
            try:
                if self.update_into_table:
                    self.MYSQL_CONN.insert_into_table_with_replace(each_result,
                                                                   self.into_table)
                else:
                    self.MYSQL_CONN.insert_into_table(each_result, self.into_table)
            except Exception as e:
                logging.error(str(each_result))
                logging.error(e)

    def task_main(self):
        # Collect input values THREAD_NUM at a time and crawl each batch.
        new_list = []
        for row in self.get_rows():
            new_list.append(row[self.which_need_in_row])
            if len(new_list) % THREAD_NUM == 0:
                if self.need_to_return:
                    results = self.multiprocess_task(new_list)
                    self.insert_to_db(results)
                else:
                    self.multiprocess_task(new_list)
                new_list = []
        if new_list:
            if self.need_to_return:
                results = self.multiprocess_task(new_list)
                self.insert_to_db(results)
            else:
                self.multiprocess_task(new_list)

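# A minimal usage sketch of TASK_OBJECT, for illustration only: the table,
# condition string, the choice of spider_basic as the worker and the
# "zhubo_id" column are assumptions, not taken from the original entry point.
def example_spider_basic_task():
    task = TASK_OBJECT(
        from_table=ZHUBO_LIVE_TABLE,         # read hosts from the live-state table
        from_table_condition="is_live = 1",  # only the ones marked as living
        need_to_update=False,                # use the rows as stored
        which_module=spider_basic,           # worker run by the thread pool
        into_table=None,                     # spider_basic inserts its own rows
        need_to_return=False,                # nothing to collect or insert here
        which_need_in_row="zhubo_id",        # column handed to spider_basic
        update_into_table=False)
    task.task_main()
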
#coding:utf-8
import logging
import sys
import json
import re
sys.path.append("..")
from MySql_InterFace.mysql_interface import MYSQL
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from spider_zhubo import spider_zhubo

MYSQL_CONN = MYSQL()


def main():
    # Extract each zhubo's room URL from the stored user_info_json blobs
    # and process them in batches of ten.
    rows = MYSQL_CONN.select_from_table("live_taobao_webstar_crawl_live_basic", [])
    n = 1
    list_url = []
    for row in rows:
        user_info_json = json.loads(row["user_info_json"])
        zhubo_url = user_info_json["result"]["model"]["broadCaster"]["jumpUrl"]
        n = n + 1
        list_url.append(zhubo_url)
        if n % 10 == 0:
            # The original loop body ends here; assumed completion: hand the
            # accumulated batch to spider_zhubo and start a new one.
            spider_zhubo(list_url)
            list_url = []

def end_liveing(zhubo_id):
    MYSQL_CONN = MYSQL()
    logging.info("{} zhubo is not living!".format(zhubo_id))
    info_dict = {"zhubo_id": zhubo_id, "is_live": "0"}
    MYSQL_CONN.insert_into_table_exist_update(info_dict, ZHUBO_LIVE_TABLE, "is_live=0")

def start_liveing(zhubo_id):
    info_dict = {"zhubo_id": zhubo_id, "is_live": "1"}
    MYSQL_CONN = MYSQL()
    MYSQL_CONN.insert_into_table_exist_update(info_dict, ZHUBO_LIVE_TABLE, "is_live=1")

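# An illustrative refresh pass (assumed wiring, not the original scheduler):
# mark every zhubo that update_zhubo_from_db() finds broadcasting again as living.
def example_refresh_live_state():
    for zhubo_id in update_zhubo_from_db():
        start_liveing(zhubo_id)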