Ejemplo n.º 1
0
def CreateTask(settings,*args,**kwargs):
    """Advance every due ``NEW`` task and hand it to ``createSpiderTask``.

    Selects all columns of the ``CRAWLER_SPIDER_TASK`` rows whose
    ``NextRunTime`` is in the past and whose ``STATUS`` is ``'NEW'``. For each
    such row, in order, ``NextRunTime`` is pushed forward by the row's
    ``IntervalTime`` minutes and ``createSpiderTask`` is then called with the
    row (as a column-name -> value dict), ``settings`` and the current local
    time formatted as ``%Y-%m-%d %H:%M:%S``.

    The database client is closed when the function exits, including when a
    query or ``createSpiderTask`` raises.

    Args:
        settings: Forwarded to ``SQLServer.from_settings`` and
            ``createSpiderTask``.
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.
    """
    mysql_client = SQLServer.from_settings(
        settings, cf.get("MYSQL_SERVER", "type"), db=cf.get("MYSQL_SERVER", "db"))
    try:
        # Look up every column name of the task table.
        # NOTE(review): the lookup is not restricted by table_schema, so a
        # same-named table in another schema would add its columns too - confirm.
        sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'CRAWLER_SPIDER_TASK';"
        column_name_list = [x[0] for x in mysql_client.select(sql)]

        # Fetch all tasks that are due now and convert each row to a dict.
        sql = "SELECT {} FROM `CRAWLER_SPIDER_TASK` WHERE NextRunTime<NOW() AND `STATUS`='NEW';".format(",".join(column_name_list))
        site_info_dict_list = [
            dict(zip(column_name_list, row)) for row in mysql_client.select(sql)
        ]

        for site_info in site_info_dict_list:
            # Reschedule first, so the task's next run is recorded before the
            # spider task is created.
            sql = "UPDATE CRAWLER_SPIDER_TASK SET NextRunTime=DATE_ADD(NextRunTime,interval IntervalTime MINUTE) WHERE id = {id}".format(id=site_info["ID"])
            mysql_client.execute(sql)
            check_point = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            createSpiderTask(site_info, settings, check_point)
    finally:
        # Release the connection even when a query or task creation fails.
        mysql_client.close()
Ejemplo n.º 2
0
def CreateTask(settings,*args,**kwargs):
    """Schedule a crawl for every due ``TEST`` task in the ``Task`` table.

    Selects all columns of the ``Task`` rows whose ``NextRunTime`` is in the
    past and whose ``STATUS`` is ``'TEST'``. For each row:

    * if ``get_current_ip()`` equals the ``MASTER_HOST`` setting, the row's
      ``NextRunTime`` is pushed forward by its ``IntervalTime`` minutes;
    * ``cf`` and a ``CHECK_POINT`` timestamp (current local time,
      ``%Y-%m-%d %H:%M:%S``) are added to the row dict;
    * the row dict is printed;
    * ``CONCURRENT_REQUESTS`` is set on ``settings`` from the row (default 16);
    * ``crawler_process.crawl`` is called with the row's ``SpiderName`` and
      the row dict expanded as keyword arguments.

    The database client is closed when the function exits, including on error.

    Args:
        settings: Settings object; read for ``MASTER_HOST`` and updated with
            ``CONCURRENT_REQUESTS`` for each task.
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.
    """
    mysql_client = SQLServer.from_settings(
        settings, cf.get("MYSQL_SERVER", "type"), db=cf.get("MYSQL_SERVER", "db"))
    try:
        # Look up every column name of the Task table.
        sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'Task';"
        column_name_list = [x[0] for x in mysql_client.select(sql)]

        # Fetch all tasks that are due now and convert each row to a dict.
        sql = "SELECT {} FROM `Task` WHERE NextRunTime<NOW() AND `STATUS`='TEST';".format(",".join(column_name_list))
        site_info_dict_list = [
            dict(zip(column_name_list, row)) for row in mysql_client.select(sql)
        ]

        # Only the host named by MASTER_HOST reschedules tasks; the check does
        # not depend on the task, so evaluate it once.
        is_master = get_current_ip() == settings.get("MASTER_HOST", "")

        for site_info in site_info_dict_list:
            if is_master:
                sql = "UPDATE Task SET NextRunTime=DATE_ADD(NextRunTime,interval IntervalTime MINUTE) WHERE id = {id}".format(
                    id=site_info["ID"])
                mysql_client.execute(sql)
            site_info["cf"] = cf
            site_info["CHECK_POINT"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(site_info)
            settings.set("CONCURRENT_REQUESTS", site_info.get("CONCURRENT_REQUESTS", 16), priority="project")
            crawler_process.crawl(site_info["SpiderName"], **site_info)
    finally:
        # The client is local to this call and is not passed to the crawler,
        # so release it here instead of leaking the connection.
        mysql_client.close()
Ejemplo n.º 3
0
def runAllSpiderConsume():
    """Schedule a crawl for every ``CRAWLER_SPIDER_TASK`` row with ``isUse=1``.

    Each selected row is converted to a column-name -> value dict, ``cf`` is
    added to it, ``CONCURRENT_REQUESTS`` is set on the module-level
    ``settings`` from the row (default 16), and ``crawler_process.crawl`` is
    called with the row's ``SpiderName`` and the dict expanded as keyword
    arguments.

    The database client is only needed for the two reads, so it is closed
    before any crawl is scheduled, including when a read fails.
    """
    mysql_client = SQLServer.from_settings(settings, cf.get("MYSQL_SERVER", "type"), db=cf.get("MYSQL_SERVER", "db"))
    try:
        # Look up every column name of the task table.
        sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'CRAWLER_SPIDER_TASK';"
        column_name_list = [x[0] for x in mysql_client.select(sql)]

        # Fetch all enabled tasks and convert each row to a dict.
        sql = "SELECT {} FROM `CRAWLER_SPIDER_TASK` WHERE isUse=1;".format(",".join(column_name_list))
        site_info_dict_list = [
            dict(zip(column_name_list, row)) for row in mysql_client.select(sql)
        ]
    finally:
        mysql_client.close()

    for site_info in site_info_dict_list:
        site_info["cf"] = cf
        settings.set("CONCURRENT_REQUESTS", site_info.get("CONCURRENT_REQUESTS", 16), priority="project")
        crawler_process.crawl(site_info["SpiderName"], **site_info)
Ejemplo n.º 4
0
def select(settings, SITE_ID):
    """Return the allow-listed hotel/site rows for ``SITE_ID``.

    Queries ``MS_EST_WH_HOTEL_SITE_REL`` in the ``bigdata`` database for rows
    with ``STATUS='NORMAL'`` and the given ``SITE_ID``, then keeps only the
    rows whose first column (``BIG_DATA_HOTEL_ID``) is in the hard-coded
    allow-list below.

    Args:
        settings: Forwarded to ``SQLServer.from_settings``.
        SITE_ID: Site identifier, interpolated unquoted into the SQL text.

    Returns:
        list: Rows as returned by ``mysql_client.select``, each being
        ``(BIG_DATA_HOTEL_ID, SITE_ID, URL_CRAWL_INFO)``, filtered by the
        allow-list and in the original order.
    """
    # TODO: query all store (hotel) records that need to be crawled.
    # Hotels to keep. A set makes the per-row membership test O(1).
    hotel_id = {
        'WYN5180672', 'WYN5180201', 'WYN5181316', 'WYN5180311', 'WYN5181313',
        'WYN5181092', 'WYN5181311', 'WYN5181081', 'WYN5181153', 'WYN5181112',
        'WYN5181111', 'WYN5181161', 'WYN5181125', 'WYN5181041', 'WYN5181281',
        'WYN5104701', 'WYN5282312', 'WYN5282531', 'WYN5419991', 'WYN5410022',
        'WYN5102351', 'WYN5114951', 'WYN4300221', 'WYN5238431', 'WYN5240001',
        'WYN4100051', 'WYN5300004', 'WYN0300021', 'WYN3000501', 'WYN7100002',
        'WYN2018242', 'WYN2016001', 'WYN2154001', 'WYN2151013', 'WYN2151681',
        'WYN3100141', 'WYN3100171', 'WYN3300021', 'WYN2230011', 'WYN3500071',
        'WYN2610111', 'WYN2100361', 'WYN2132991',
    }

    mysql_client = SQLServer.from_settings(settings,
                                           cf.get("MYSQL_SERVER", "type"),
                                           "bigdata")
    try:
        # NOTE(review): SITE_ID is formatted straight into the statement. If
        # it can ever come from untrusted input, switch to a parameterised
        # query once the client's select() signature is confirmed.
        sql = "SELECT BIG_DATA_HOTEL_ID,SITE_ID,URL_CRAWL_INFO FROM `MS_EST_WH_HOTEL_SITE_REL` WHERE `STATUS`='NORMAL' AND SITE_ID={SITE_ID};".format(
            SITE_ID=SITE_ID)
        results = mysql_client.select(sql)
        return [x for x in results if x[0] in hotel_id]
    finally:
        # Release the connection even when the query raises.
        mysql_client.close()
Ejemplo n.º 5
0
) ENGINE=InnoDB AUTO_INCREMENT=30722 DEFAULT CHARSET=utf8;"""

# DDL that creates the `stage_tree` table (InnoDB, utf8, AUTO_INCREMENT
# starting at 30722). Executed by the __main__ block of this file through
# client.do_execute.
# Every data column is stored as varchar(255) or text; CREATE_TIME and
# UPDATE_TIME are filled in by the database, and UPDATE_TIME is refreshed on
# every row update.
# NOTE(review): `sorceInfo` looks like a misspelling, but it is the column
# name the statement creates, so it is left untouched here.
CreateXieChengTableSql = """
CREATE TABLE `stage_tree` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `cityName` varchar(255) DEFAULT NULL,
  `brand` varchar(255) DEFAULT NULL,
  `cityNamePY` varchar(255) DEFAULT NULL,
  `checkIn` varchar(255) DEFAULT NULL,                                                       
  `checkOut` varchar(255) DEFAULT NULL,
  `hotelAmount` varchar(255) DEFAULT NULL,
  `hotelId` varchar(255) DEFAULT NULL,
  `fullname` text,
  `price` varchar(255) DEFAULT NULL,
  `sorceInfo` text,
  `info` text,
  `CREATE_TIME` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  `UPDATE_TIME` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=30722 DEFAULT CHARSET=utf8;"""

# Placeholder: currently contains only blank lines, no SQL statement.
# Nothing in the visible part of this file executes it.
CreateExtractRuleTree = """

"""

if __name__ == '__main__':
    # Script entry point: run the `stage_tree` DDL against the configured
    # server. The other DDL statements are left disabled, as in the original.
    settings = get_project_settings()
    client = SQLServer.from_settings(settings, cf.get("MYSQL_SERVER", "type"))
    try:
        # client.do_execute(CreateTaskSQL)
        # client.do_execute(CreateSettingsSql)
        client.do_execute(CreateXieChengTableSql)
    finally:
        # Release the connection whether or not the DDL succeeds.
        client.close()