class PageDataConfDao:
    """Data-access object for the ``ec_spider.t_page_data_conf`` table."""

    def __init__(self):
        self.db = DataBase()

    def insert(self, page_data_id, p_type, p_key, p_value, p_description):
        """Insert one configuration row and commit.

        ``created`` and ``updated`` are each filled from a separate
        ``get_current_timestamp()`` call.

        :return: the value returned by ``self.db.insert``
        """
        sql = (
            "insert into ec_spider.t_page_data_conf "
            "(page_data_id, p_type, p_key, p_value, p_description, created, updated) "
            "values(%s{})".format(", %s" * 6)
        )
        params = (
            page_data_id,
            p_type,
            p_key,
            p_value,
            p_description,
            get_current_timestamp(),
            get_current_timestamp(),
        )
        key = self.db.insert(sql, params)
        self.db.commit()
        return key

    def query_by_page_data_id(self, page_data_id):
        """Return the rows whose ``page_data_id`` matches, as given by ``self.db.query``.

        NOTE(review): the id is interpolated into the SQL text, so callers
        must only pass trusted numeric ids.
        """
        sql = (
            "select id, page_data_id, p_type, p_key, p_value, p_description, created, updated "
            "from ec_spider.t_page_data_conf where page_data_id = {}".format(page_data_id)
        )
        return self.db.query(sql)

    def delete(self, id):
        """Delete the row with primary key ``id`` and commit.

        The commit was previously missing here, unlike
        ``delete_by_page_data_id``, so the deletion was left uncommitted
        by this method.
        """
        self.db.delete("delete from ec_spider.t_page_data_conf where id = {}".format(id))
        self.db.commit()

    def delete_by_page_data_id(self, page_data_id):
        """Delete every row belonging to ``page_data_id`` and commit."""
        self.db.delete(
            "delete from ec_spider.t_page_data_conf where page_data_id = {}".format(page_data_id)
        )
        self.db.commit()
class DataTabColumnDao:
    """Data-access object for the ``ec_spider.t_data_tab_column`` table."""

    def __init__(self):
        self.db = DataBase()

    def insert(self, data_tab_id, col_name, col_type, col_type_length, col_description,
               check_col_name, is_file_column, is_primary_key, is_data_maintenance_pk):
        """Insert one column-definition row and commit.

        ``created`` and ``updated`` are each filled from a separate
        ``get_current_timestamp()`` call.

        :return: the value returned by ``self.db.insert``
        """
        sql = (
            "insert into ec_spider.t_data_tab_column "
            "(data_tab_id, col_name, col_type, col_type_length, col_description, "
            "check_col_name, is_file_column, is_primary_key, is_data_maintenance_pk, "
            "created, updated) "
            "values(%s{})".format(", %s" * 10)
        )
        params = (
            data_tab_id,
            col_name,
            col_type,
            col_type_length,
            col_description,
            check_col_name,
            is_file_column,
            is_primary_key,
            is_data_maintenance_pk,
            get_current_timestamp(),
            get_current_timestamp(),
        )
        key = self.db.insert(sql, params)
        self.db.commit()
        return key

    def query_by_tab_id(self, tab_id):
        """Return the column rows of data table ``tab_id``, as given by ``self.db.query``.

        NOTE(review): the id is interpolated into the SQL text, so callers
        must only pass trusted numeric ids.
        """
        sql = (
            "select id, data_tab_id, col_name, col_type, col_type_length, col_description, "
            "check_col_name, is_file_column, is_primary_key, is_data_maintenance_pk, "
            "created, updated "
            "from ec_spider.t_data_tab_column where data_tab_id = {}".format(tab_id)
        )
        return self.db.query(sql)

    def delete(self, id):
        """Delete the row with primary key ``id`` and commit.

        The commit was previously missing here, unlike
        ``delete_by_data_tab_id``, so the deletion was left uncommitted
        by this method.
        """
        self.db.delete("delete from ec_spider.t_data_tab_column where id = {}".format(id))
        self.db.commit()

    def delete_by_data_tab_id(self, data_tab_id):
        """Delete every column row belonging to ``data_tab_id`` and commit."""
        self.db.delete(
            "delete from ec_spider.t_data_tab_column where data_tab_id = {}".format(data_tab_id)
        )
        self.db.commit()
class PageDao:
    """Persistence helper for rows of ``ec_spider.t_page``."""

    def __init__(self):
        self.db = DataBase()

    def insert(self, website, name, menu_level_first, url, menu_level_second=None, menu_level_third=None):
        """Store a new page row, commit, and return the result of ``db.insert``.

        The signature takes ``url`` before the optional second/third menu
        levels, while the column list places ``url`` after them.
        """
        statement = (
            "insert into ec_spider.t_page "
            "(website, name, menu_level_first, menu_level_second, menu_level_third, url, created, updated) "
            "values(%s{})".format(", %s" * 7)
        )
        created_at = get_current_timestamp()
        updated_at = get_current_timestamp()
        row = (
            website,
            name,
            menu_level_first,
            menu_level_second,
            menu_level_third,
            url,
            created_at,
            updated_at,
        )
        new_key = self.db.insert(statement, row)
        self.db.commit()
        return new_key

    def query(self, id):
        """Look up the page with primary key ``id`` and return what ``db.query`` yields."""
        statement = (
            "select id, website, name, menu_level_first, menu_level_second, menu_level_third, "
            "url, created, updated from ec_spider.t_page where id = {}".format(id)
        )
        return self.db.query(statement)

    def delete(self, id):
        """Remove the page with primary key ``id`` and commit."""
        statement = "delete from ec_spider.t_page where id = {}".format(id)
        self.db.delete(statement)
        self.db.commit()
class DataTabDao:
    """Persistence helper for rows of ``ec_spider.t_data_tab``."""

    def __init__(self):
        self.db = DataBase()

    def insert(self, name, page_data_id, check_name_rule, business_columns, pre_cnt=1):
        """Store a new data-table row, commit, and return the result of ``db.insert``."""
        statement = (
            "insert into ec_spider.t_data_tab "
            "(name, page_data_id, check_name_rule, business_columns, pre_cnt, created, updated) "
            "values(%s{})".format(", %s" * 6)
        )
        created_at = get_current_timestamp()
        updated_at = get_current_timestamp()
        row = (
            name,
            page_data_id,
            check_name_rule,
            business_columns,
            pre_cnt,
            created_at,
            updated_at,
        )
        new_key = self.db.insert(statement, row)
        self.db.commit()
        return new_key

    def query_by_page_data_id(self, page_data_id):
        """Return the data-table rows attached to ``page_data_id``."""
        statement = (
            "select id, name, page_data_id, check_name_rule, business_columns, pre_cnt, "
            "created, updated from ec_spider.t_data_tab where page_data_id = {}".format(page_data_id)
        )
        return self.db.query(statement)

    def query(self, id):
        """Return the data-table row(s) with primary key ``id``."""
        statement = (
            "select id, name, page_data_id, check_name_rule, business_columns, pre_cnt, "
            "created, updated from ec_spider.t_data_tab where id = {}".format(id)
        )
        return self.db.query(statement)

    def delete(self, id):
        """Remove the data-table row with primary key ``id`` and commit."""
        statement = "delete from ec_spider.t_data_tab where id = {}".format(id)
        self.db.delete(statement)
        self.db.commit()
class PageDataDao:
    """Persistence helper for rows of ``ec_spider.t_page_data``."""

    def __init__(self):
        self.db = DataBase()

    def insert(self, page_id, name, status, data_source_type, data_update_freq, data_update_time,
               rule_read_file_prefix, rule_save_path_suffix):
        """Store a new page-data row, commit, and return the result of ``db.insert``."""
        statement = (
            "insert into ec_spider.t_page_data "
            "(page_id, name, status, data_source_type, data_update_freq, data_update_time, "
            "rule_read_file_prefix, rule_save_path_suffix, created, updated) "
            "values(%s{})".format(", %s" * 9)
        )
        created_at = get_current_timestamp()
        updated_at = get_current_timestamp()
        row = (
            page_id,
            name,
            status,
            data_source_type,
            data_update_freq,
            data_update_time,
            rule_read_file_prefix,
            rule_save_path_suffix,
            created_at,
            updated_at,
        )
        new_key = self.db.insert(statement, row)
        self.db.commit()
        return new_key

    def query(self, id):
        """Return the page-data row(s) with primary key ``id``."""
        statement = (
            "select id, page_id, name, status, data_source_type, data_update_freq, data_update_time, "
            "rule_read_file_prefix, rule_save_path_suffix, created, updated "
            "from ec_spider.t_page_data where id = {}".format(id)
        )
        return self.db.query(statement)

    def query_by_page_id(self, page_id):
        """Return the page-data rows that belong to page ``page_id``."""
        statement = (
            "select id, page_id, name, status, data_source_type, data_update_freq, data_update_time, "
            "rule_read_file_prefix, rule_save_path_suffix, created, updated "
            "from ec_spider.t_page_data where page_id = {}".format(page_id)
        )
        return self.db.query(statement)

    def delete(self, id):
        """Remove the page-data row with primary key ``id`` and commit."""
        statement = "delete from ec_spider.t_page_data where id = {}".format(id)
        self.db.delete(statement)
        self.db.commit()
def __init__(self, store_id, page_data_id, port):
    """
    Initialise the information a spider task needs.

    1. Instantiate the objects: Store, PageData, Table
    2. Initialise the environment
    3. Confirm the web_driver connection
    4. Confirm the store LOGIN through web_driver; login_flag is set to True
       once browsing works and the store is logged in

    Any exception raised during set-up is logged through ``Logging.error``
    and recorded as ``ErrorEnum.ERROR_1000`` in ``self.error`` instead of
    propagating; attributes assigned after the failing step stay unset.

    :param store_id: store id, used to fetch the store object
    :param page_data_id: id of the page data block to crawl, used to fetch
        the page data block object
    :param port: port of the already running browser service
    """
    self.error = None
    self.login_flag = False
    try:
        self.store = StoreService().get_store(store_id)
        self.page_data = PageDataService().get_page_data(page_data_id)
        self.page = self.page_data.page
        self.db = DataBase()
        self.port = port

        self.FILE_PART_PATH = self.store.name + '/' + self.page_data.name + '/' + self.page_data.data_update_freq
        self.FILE_DOWNLOAD_PATH = setting.FILE_DOWNLOAD_PATH_PREFIX + '/' + self.store.name
        self.FILE_PROCESS_PATH = setting.FILE_PROCESS_PATH_PREFIX + '/' + self.FILE_PART_PATH
        self.FILE_BACKUP_PATH = setting.FILE_BACKUP_PATH_PREFIX + '/' + self.FILE_PART_PATH

        # exist_ok=True avoids the check-then-create race: the download
        # directory depends only on the store name, so two tasks for the same
        # store could both pass an "exists?" check and one would then fail.
        for directory in (self.FILE_DOWNLOAD_PATH, self.FILE_PROCESS_PATH, self.FILE_BACKUP_PATH):
            os.makedirs(directory, exist_ok=True)

        # Clean up the download directory.
        self.clear_download_path()

        # Initialise the webdriver and check whether the store is logged in.
        self.driver = None
        self.init_web_driver()
        self.check_store_login()

        # Data dimension dictionary.
        self.data_dimension_dict = {}

        # Needed when reading data from downloaded files.
        self.file_names = []

        # Single file, single data table storage, e.g. [DataFrame]
        # Multiple files/sheets, single data table storage,
        # e.g. [DataFrame, DataFrame, DataFrame]
        # TODO none at present, ignore
        # Multiple files/sheets, multiple data table storage:
        # condition is page_data.is_multiple_tab()
        # e.g. [{'tab.name', [DataFrame]}, {'tab.name', [DataFrame, DataFrame]}]
        self.source_data_list = []
        self.data_list = []
    except Exception as e:
        Logging.error(e)
        self.error = ErrorEnum.ERROR_1000
class StoreDao:
    """Persistence helper for rows of ``ec_spider.t_store``."""

    def __init__(self):
        self.db = DataBase()

    def insert(self, name, plt_name, plt_store_id, login_username=None, url=None, status=1):
        """Store a new store row, commit, and return the result of ``db.insert``."""
        statement = (
            "insert into ec_spider.t_store "
            "(name, plt_name, plt_store_id, login_username, url, status, created, updated) "
            "values(%s{})".format(', %s' * 7)
        )
        created_at = get_current_timestamp()
        updated_at = get_current_timestamp()
        row = (
            name,
            plt_name,
            plt_store_id,
            login_username,
            url,
            status,
            created_at,
            updated_at,
        )
        new_key = self.db.insert(statement, row)
        self.db.commit()
        return new_key

    def query(self, id):
        """Return the store row(s) with primary key ``id``."""
        statement = (
            "select id, name, plt_name, plt_store_id, login_username, url, status, created, updated "
            "from ec_spider.t_store where id = {}".format(id)
        )
        return self.db.query(statement)

    def query_by_name(self, store_name):
        """Return the store row(s) whose ``name`` equals ``store_name``.

        NOTE(review): ``store_name`` is placed between single quotes in the SQL
        text as-is, so a name containing a quote changes the statement.
        Consider a parameterized query if ``DataBase.query`` supports one.
        """
        statement = (
            "select id, name, plt_name, plt_store_id, login_username, url, status, created, updated "
            "from ec_spider.t_store where name = '{}'".format(store_name)
        )
        return self.db.query(statement)

    def delete(self, id):
        """Remove the store with primary key ``id``, commit, and return ``True``."""
        statement = "delete from ec_spider.t_store where id = {}".format(id)
        self.db.delete(statement)
        self.db.commit()
        return True
def __init__(self):
    """Start with no recorded error and a new ``DataBase()`` instance."""
    # No error has been recorded yet.
    self.error = None
    # Database access object used by this instance.
    self.db = DataBase()
def __init__(self):
    """Create the ``DataBase()`` instance this object works through."""
    self.db = DataBase()
class StorePropertyDao:
    """Persistence helper for rows of ``ec_spider.t_store_property``."""

    def __init__(self):
        self.db = DataBase()

    def insert(self, store_id, p_type, p_key, p_value, p_description):
        """Store a new store-property row, commit, and return the result of ``db.insert``."""
        statement = (
            "insert into ec_spider.t_store_property "
            "(store_id, p_type, p_key, p_value, p_description, created, updated) "
            "values(%s{})".format(', %s' * 6)
        )
        created_at = get_current_timestamp()
        updated_at = get_current_timestamp()
        row = (
            store_id,
            p_type,
            p_key,
            p_value,
            p_description,
            created_at,
            updated_at,
        )
        new_key = self.db.insert(statement, row)
        self.db.commit()
        return new_key

    def query_by_store_id(self, store_id):
        """Return the property rows that belong to store ``store_id``."""
        statement = (
            "select id, store_id, p_type, p_key, p_value, p_description, created, updated "
            "from ec_spider.t_store_property where store_id = {}".format(store_id)
        )
        return self.db.query(statement)

    def delete(self, id):
        """Remove the property row with primary key ``id`` and commit."""
        statement = "delete from ec_spider.t_store_property where id = {}".format(id)
        self.db.delete(statement)
        self.db.commit()

    def delete_by_store_id(self, store_id):
        """Remove every property row of store ``store_id`` and commit."""
        statement = "delete from ec_spider.t_store_property where store_id = {}".format(store_id)
        self.db.delete(statement)
        self.db.commit()
from common.db import DataBase

if __name__ == '__main__':
    # Manual check: create two DataBase objects, dispose only the second one,
    # then print both connection attributes and the pool for inspection.
    first = DataBase()
    # Optional extra steps, left disabled:
    # data1 = first.query('select * from temp_test')
    # first.dispose()
    second = DataBase()
    second.dispose()
    print(first.db_conn, second.db_conn)
    # NOTE(review): `_DB__pool` is the name-mangled form of a `__pool`
    # attribute declared in a class named `DB`; a `__pool` declared directly
    # on `DataBase` would be `_DataBase__pool`. Confirm against common.db.
    print(DataBase._DB__pool)