def CreateTask(settings, *args, **kwargs):
    """Dispatch every due ``NEW`` task found in ``CRAWLER_SPIDER_TASK``.

    Reads the table's column names from ``information_schema``, selects the
    rows whose ``NextRunTime`` has passed and whose ``STATUS`` is ``'NEW'``,
    converts each row into a ``{column_name: value}`` dict, and then, for
    each task, pushes ``NextRunTime`` forward by ``IntervalTime`` minutes and
    hands the task to ``createSpiderTask`` together with a check-point
    timestamp.

    Args:
        settings: Settings object forwarded to ``SQLServer.from_settings``
            and to ``createSpiderTask``.
        *args: Accepted but not used.
        **kwargs: Accepted but not used.
    """
    mysql_client = SQLServer.from_settings(
        settings,
        cf.get("MYSQL_SERVER", "type"),
        db=cf.get("MYSQL_SERVER", "db"),
    )
    # try/finally so the connection is released even when a query or the
    # task creation raises.
    try:
        # Query all column names of the task table.
        sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'CRAWLER_SPIDER_TASK';"
        column_name_list = [x[0] for x in mysql_client.select(sql)]

        # Query every task that is due right now and convert each row into
        # a dict keyed by column name.
        sql = "SELECT {} FROM `CRAWLER_SPIDER_TASK` WHERE NextRunTime<NOW() AND `STATUS`='NEW';".format(",".join(column_name_list))
        site_info_dict_list = [
            dict(zip(column_name_list, row))
            for row in mysql_client.select(sql)
        ]

        for site_info in site_info_dict_list:
            # Advance the schedule first, then create the spider task.
            sql = "UPDATE CRAWLER_SPIDER_TASK SET NextRunTime=DATE_ADD(NextRunTime,interval IntervalTime MINUTE) WHERE id = {id}".format(id=site_info["ID"])
            mysql_client.execute(sql)
            CHECK_POINT = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            createSpiderTask(site_info, settings, CHECK_POINT)
    finally:
        mysql_client.close()
def CreateTask(settings, *args, **kwargs):
    """Schedule a crawl for every due ``TEST`` task found in ``Task``.

    Reads the table's column names from ``information_schema``, selects the
    rows whose ``NextRunTime`` has passed and whose ``STATUS`` is ``'TEST'``,
    and converts each row into a ``{column_name: value}`` dict. For each
    task, the host whose IP equals the ``MASTER_HOST`` setting pushes
    ``NextRunTime`` forward by ``IntervalTime`` minutes; the task dict is
    then extended with ``cf`` and a ``CHECK_POINT`` timestamp, printed, and
    passed as keyword arguments to ``crawler_process.crawl``.

    Args:
        settings: Settings object forwarded to ``SQLServer.from_settings``;
            its ``CONCURRENT_REQUESTS`` value is overwritten per task.
        *args: Accepted but not used.
        **kwargs: Accepted but not used.
    """
    mysql_client = SQLServer.from_settings(
        settings,
        cf.get("MYSQL_SERVER", "type"),
        db=cf.get("MYSQL_SERVER", "db"),
    )
    # try/finally so the connection is always released once the tasks have
    # been handed over (the client itself is never passed to the crawler).
    try:
        # Query all column names of the task table.
        sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'Task';"
        column_name_list = [x[0] for x in mysql_client.select(sql)]

        # Query every task that is due right now and convert each row into
        # a dict keyed by column name.
        sql = "SELECT {} FROM `Task` WHERE NextRunTime<NOW() AND `STATUS`='TEST';".format(",".join(column_name_list))
        site_info_dict_list = [
            dict(zip(column_name_list, row))
            for row in mysql_client.select(sql)
        ]

        for site_info in site_info_dict_list:
            # NOTE(review): only the NextRunTime update is treated as
            # master-only; every host still schedules the crawl below --
            # confirm this matches the intended behaviour.
            if get_current_ip() == settings.get("MASTER_HOST", ""):
                sql = "UPDATE Task SET NextRunTime=DATE_ADD(NextRunTime,interval IntervalTime MINUTE) WHERE id = {id}".format(
                    id=site_info["ID"])
                mysql_client.execute(sql)

            site_info["cf"] = cf
            site_info["CHECK_POINT"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(site_info)
            # Falls back to 16 only when the row has no CONCURRENT_REQUESTS key.
            settings.set(
                "CONCURRENT_REQUESTS",
                site_info.get("CONCURRENT_REQUESTS", 16),
                priority="project",
            )
            crawler_process.crawl(site_info["SpiderName"], **site_info)
    finally:
        mysql_client.close()
def runAllSpiderConsume():
    """Schedule a crawl for every ``CRAWLER_SPIDER_TASK`` row with ``isUse=1``.

    Reads the table's column names from ``information_schema``, selects the
    enabled rows, and converts each one into a ``{column_name: value}``
    dict. Each dict is extended with ``cf`` and passed as keyword arguments
    to ``crawler_process.crawl`` under the spider named by its
    ``SpiderName`` column. Uses the module-level ``settings`` object, whose
    ``CONCURRENT_REQUESTS`` value is overwritten per task.
    """
    mysql_client = SQLServer.from_settings(
        settings,
        cf.get("MYSQL_SERVER", "type"),
        db=cf.get("MYSQL_SERVER", "db"),
    )
    # The client is only needed for the two SELECTs, so release it as soon
    # as the rows are materialised -- even if a query raises.
    try:
        # Query all column names of the task table.
        sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'CRAWLER_SPIDER_TASK';"
        column_name_list = [x[0] for x in mysql_client.select(sql)]

        # Query every enabled task and convert each row into a dict keyed
        # by column name.
        sql = "SELECT {} FROM `CRAWLER_SPIDER_TASK` WHERE isUse=1;".format(",".join(column_name_list))
        site_info_dict_list = [
            dict(zip(column_name_list, row))
            for row in mysql_client.select(sql)
        ]
    finally:
        mysql_client.close()

    for site_info in site_info_dict_list:
        site_info["cf"] = cf
        # Falls back to 16 only when the row has no CONCURRENT_REQUESTS key.
        settings.set(
            "CONCURRENT_REQUESTS",
            site_info.get("CONCURRENT_REQUESTS", 16),
            priority="project",
        )
        crawler_process.crawl(site_info["SpiderName"], **site_info)
def select(settings, SITE_ID):
    """Return the whitelisted hotel/site rows for one site.

    Selects ``BIG_DATA_HOTEL_ID``, ``SITE_ID`` and ``URL_CRAWL_INFO`` from
    ``MS_EST_WH_HOTEL_SITE_REL`` for rows whose ``STATUS`` is ``'NORMAL'``
    and whose ``SITE_ID`` matches, then keeps only the rows whose first
    column (``BIG_DATA_HOTEL_ID``) is in the hard-coded whitelist below.

    Args:
        settings: Settings object forwarded to ``SQLServer.from_settings``.
        SITE_ID: Site identifier interpolated into the ``WHERE`` clause.
            NOTE(review): the value is formatted straight into the SQL
            text, so callers must pass a trusted value.

    Returns:
        A list of the result rows that passed the whitelist filter.
    """
    # TODO: query the info of all hotels that need to be crawled
    # (currently restricted to the whitelist below).
    allowed_hotel_ids = {
        'WYN5180672', 'WYN5180201', 'WYN5181316', 'WYN5180311', 'WYN5181313',
        'WYN5181092', 'WYN5181311', 'WYN5181081', 'WYN5181153', 'WYN5181112',
        'WYN5181111', 'WYN5181161', 'WYN5181125', 'WYN5181041', 'WYN5181281',
        'WYN5104701', 'WYN5282312', 'WYN5282531', 'WYN5419991', 'WYN5410022',
        'WYN5102351', 'WYN5114951', 'WYN4300221', 'WYN5238431', 'WYN5240001',
        'WYN4100051', 'WYN5300004', 'WYN0300021', 'WYN3000501', 'WYN7100002',
        'WYN2018242', 'WYN2016001', 'WYN2154001', 'WYN2151013', 'WYN2151681',
        'WYN3100141', 'WYN3100171', 'WYN3300021', 'WYN2230011', 'WYN3500071',
        'WYN2610111', 'WYN2100361', 'WYN2132991',
    }

    mysql_client = SQLServer.from_settings(settings, cf.get("MYSQL_SERVER", "type"), "bigdata")
    # try/finally so the connection is released even when the query raises.
    try:
        sql = "SELECT BIG_DATA_HOTEL_ID,SITE_ID,URL_CRAWL_INFO FROM `MS_EST_WH_HOTEL_SITE_REL` WHERE `STATUS`='NORMAL' AND SITE_ID={SITE_ID};".format(
            SITE_ID=SITE_ID)
        results = mysql_client.select(sql)
        # Filter while the connection is still open, in case the result is
        # produced lazily; set membership keeps the check O(1) per row.
        results = [x for x in results if x[0] in allowed_hotel_ids]
    finally:
        mysql_client.close()
    return results
) ENGINE=InnoDB AUTO_INCREMENT=30722 DEFAULT CHARSET=utf8;""" CreateXieChengTableSql = """ CREATE TABLE `stage_tree` ( `ID` int(11) NOT NULL AUTO_INCREMENT, `cityName` varchar(255) DEFAULT NULL, `brand` varchar(255) DEFAULT NULL, `cityNamePY` varchar(255) DEFAULT NULL, `checkIn` varchar(255) DEFAULT NULL, `checkOut` varchar(255) DEFAULT NULL, `hotelAmount` varchar(255) DEFAULT NULL, `hotelId` varchar(255) DEFAULT NULL, `fullname` text, `price` varchar(255) DEFAULT NULL, `sorceInfo` text, `info` text, `CREATE_TIME` timestamp NULL DEFAULT CURRENT_TIMESTAMP, `UPDATE_TIME` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=30722 DEFAULT CHARSET=utf8;""" CreateExtractRuleTree = """ """ if __name__ == '__main__': settings = get_project_settings() client = SQLServer.from_settings(settings, cf.get("MYSQL_SERVER", "type")) # client.do_execute(CreateTaskSQL) # client.do_execute(CreateSettingsSql) client.do_execute(CreateXieChengTableSql)