def fetch_light_infos(lightInfosService, __use_id, __type_id, __technology_id):
    """Fetch the lamp entries for one use/type/technology combination.

    The URL is built from ``Constants.LAMPS_BY_USE_URL``.  If
    ``urlManagerService`` reports it as already fetched, only a log line is
    written.  Otherwise the JSON is downloaded; when its ``error`` field is 0,
    every entry of ``result`` becomes a ``LightInfosBean`` handed to
    ``lightInfosService.safe_insert`` and the URL is then registered with
    ``urlManagerService``.

    Args:
        lightInfosService: object whose ``safe_insert`` receives each bean.
        __use_id: first value substituted into the URL template.
        __type_id: second value substituted into the URL template.
        __technology_id: third value substituted into the URL template.
    """
    lamps_url = Constants.LAMPS_BY_USE_URL.format(__use_id, __type_id,
                                                  __technology_id)
    if urlManagerService.has_fetch(lamps_url):
        logger.info("light_infos(lamps)_URL : {} 已经抓取过".format(lamps_url))
        return

    response = get_json(lamps_url)
    if response["error"] != 0:
        logger.info("light_infos(lamps)获取失败...")
        return

    # JSON keys in the positional order expected by LightInfosBean, following
    # the three ids and the running order number.
    bean_fields = (
        "bullet_list",
        "lamp_info",
        "linecard_id",
        "linecard_name",
        "linksAutomotive",
        "osram_bestnr",
        "osram_ean",
        "osram_ece",
        "pillar_id",
        "pillar_image",
        "pillar_name",
        "prio",
        "product_image",
        "product_zmp",
        "usp",
    )
    # ``position`` records where the entry appeared in the response.
    for position, entry in enumerate(response["result"]):
        field_values = [entry[field] for field in bean_fields]
        bean = LightInfosBean(__use_id, __type_id, __technology_id, position,
                              *field_values)
        lightInfosService.safe_insert(bean)

    # Register the URL only after all entries were handed to the service.
    urlManagerService.safe_insert(UrlManager(lamps_url, "light_infos(lamps)"))
def fetch_technology(technologyService, lightInfosService, _type_id, _use_id):
    """Fetch the technologies of one use/type pair, then their lamp entries.

    The URL is built from ``Constants.ALL_TECHNOLOGY_URL``.  If
    ``urlManagerService`` reports it as already fetched, only a log line is
    written.  Otherwise the JSON is downloaded; when its ``error`` field is 0,
    each technology becomes a ``TechnologyBean`` handed to
    ``technologyService.safe_insert`` and ``fetch_light_infos`` is called for
    it.  The URL is then registered with ``urlManagerService``.

    Args:
        technologyService: object whose ``safe_insert`` receives each bean.
        lightInfosService: forwarded unchanged to ``fetch_light_infos``.
        _type_id: second value substituted into the URL template.
        _use_id: first value substituted into the URL template.
    """
    _url = Constants.ALL_TECHNOLOGY_URL.format(_use_id, _type_id)
    if urlManagerService.has_fetch(_url):
        logger.info("technology_URL : {} 已经抓取过".format(_url))
        return

    all_technology = get_json(_url)
    if all_technology["error"] != 0:
        logger.info("technology获取失败...")
        return

    result = all_technology["result"]
    # The payload arrives either as a mapping (key -> technology) or as a
    # plain list of technologies.  Normalise once, before the loop, and use
    # isinstance so dict subclasses are recognised as mappings too.
    technologies = result.values() if isinstance(result, dict) else result
    for technology in technologies:
        technology_id = technology["technology_id"]
        technology_name = technology["technology_name"]
        technologyService.safe_insert(
            TechnologyBean(_use_id, _type_id, technology_id, technology_name))
        fetch_light_infos(lightInfosService, _use_id, _type_id, technology_id)

    # Register the URL only after every technology was processed.
    urlManagerService.safe_insert(UrlManager(_url, "technology"))
def export_excel_work(manufacturer_data):
    """Write one manufacturer's lamp table into a sheet and save the workbook.

    Layout produced by the calls below:

    * rows 0-1 are the header: every entry of the module-level ``title_list``
      is merged over both rows, followed by three group captions in row 0
      (front, rear and interior lamps) with the lamp names beneath them in
      row 1;
    * data rows start at row 2, one per entry of ``get_light_dataset``;
    * in column 0, consecutive rows with the same ``model_name`` are merged.

    The workbook is saved to ``Constants.EXCEL_PATH`` under the
    manufacturer's name, even when there are no data rows.

    Args:
        manufacturer_data: object providing ``manufacturer_name``; it is also
            forwarded to ``get_light_titles`` and ``get_light_dataset``.
    """
    # The header occupies two rows.
    title_low_num = 2
    # First sheet row of the model-name group currently being collected.
    model_merge_start = title_low_num

    sheet = create_sheet(manufacturer_data.manufacturer_name)
    front_title_list, rear_title_list, internal_title_list = get_light_titles(
        manufacturer_data)
    light_list = [*front_title_list, *rear_title_list, *internal_title_list]

    title_len = len(title_list)

    # Group captions in row 0, laid out left to right after the fixed titles.
    # A group with no lamps is skipped: merging it would ask for a range whose
    # last column lies before its first column.
    caption_groups = (
        ("前灯", front_title_list, 2),
        ("后灯", rear_title_list, 3),
        ("内灯", internal_title_list, 5),
    )
    group_start = title_len
    for caption, group_titles, style_index in caption_groups:
        group_len = len(group_titles)
        if group_len:
            sheet.write_merge(0, 0, group_start, group_start + group_len - 1,
                              caption, set_title_style(style_index))
        group_start += group_len

    # Fixed titles span both header rows.
    for col_num, title in enumerate(title_list):
        sheet.write_merge(0, 1, col_num, col_num, title, set_title_style())
    write_row(sheet, light_list, 1, title_len, set_title_style())

    for col_num in range(len(light_list) + title_len):
        sheet.col(col_num).width = 256 * 20

    dataset = get_light_dataset(manufacturer_data, front_title_list,
                                rear_title_list, internal_title_list)
    logger.info("-------------------")
    for i, row in enumerate(dataset):
        write_row(sheet, list(row.values()), i + title_low_num, 0)
        if i == 0 or row["model_name"] == dataset[i - 1]["model_name"]:
            continue
        # The model name changed: close the group that ended on the
        # previous row.
        group_last_row = i - 1 + title_low_num
        # NOTE(review): the threshold ``> 1`` leaves groups of exactly two
        # rows unmerged; kept as in the original - confirm this is intended.
        if group_last_row - model_merge_start > 1:
            sheet.write_merge(model_merge_start, group_last_row, 0, 0,
                              dataset[i - 1]["model_name"], default_style)
        model_merge_start = i + title_low_num

    # Close the trailing group.  With no data rows there is nothing to merge,
    # and indexing the last row would fail.
    if dataset:
        sheet.write_merge(model_merge_start,
                          len(dataset) - 1 + title_low_num, 0, 0,
                          dataset[-1]["model_name"], default_style)

    workbook.save(Constants.EXCEL_PATH +
                  "{}.xls".format(manufacturer_data.manufacturer_name))
def get_html(__url, fetch_num=None):
    """Download ``__url`` with a limited number of retries.

    Args:
        __url: address to request.
        fetch_num: maximum number of attempts; ``DEFAULT_FETCH_NUM`` is used
            when it is ``None``.

    Returns:
        The response object of the first successful ``requests.get`` call,
        with its ``encoding`` set to ``"UTF-8"``.

    Raises:
        ValueError: if ``fetch_num`` allows no attempt at all.
        requests.RequestException: the error of the last attempt, when every
            attempt failed.
    """
    if fetch_num is None:
        fetch_num = DEFAULT_FETCH_NUM
    if fetch_num < 1:
        raise ValueError(
            "fetch_num must be at least 1, got {!r}".format(fetch_num))

    headers = {'User-Agent': USER_AGENT}
    html = None
    last_error = None
    for i in range(fetch_num):
        logger.info("开始抓取[{}] : {} ".format(i + 1, __url))
        try:
            html = requests.get(__url, headers=headers)
            break
        except requests.RequestException as e:
            # Retry on request failures only, so KeyboardInterrupt and
            # SystemExit are no longer swallowed; log each failure.
            last_error = e
            logger.warning("抓取失败[{}] : {} --> {}".format(i + 1, __url, e))

    if html is None:
        # Every attempt failed: raise the real cause rather than hitting an
        # unbound local variable below.
        raise last_error

    html.encoding = "UTF-8"
    # Page content goes to the debug log only.
    logger.debug("网页内容 : {} ".format(html.text))
    return html
def fetch_all_light2(lightService, technologyService, lightInfosService,
                     type_list):
    """Fetch the lights of every given type, then each light's technologies.

    For each element of ``type_list`` the URL is built from
    ``Constants.ALL_LIGHT_URL``.  URLs that ``urlManagerService`` reports as
    already fetched are only logged.  Otherwise the JSON is downloaded; when
    its ``error`` field is 0, each entry of ``result`` becomes a ``LightBean``
    handed to ``lightService.safe_insert`` and ``fetch_technology`` is called
    for it.  The URL is then registered with ``urlManagerService``.

    Args:
        lightService: object whose ``safe_insert`` receives each ``LightBean``.
        technologyService: forwarded unchanged to ``fetch_technology``.
        lightInfosService: forwarded unchanged to ``fetch_technology``.
        type_list: iterable of objects exposing a ``type_id`` attribute.
    """
    for vehicle_type in type_list:
        type_id = vehicle_type.type_id
        lights_url = Constants.ALL_LIGHT_URL.format(type_id)

        if urlManagerService.has_fetch(lights_url):
            logger.info("light_URL : {} 已经抓取过".format(lights_url))
            continue

        response = get_json(lights_url)
        if response["error"] != 0:
            logger.info("light获取失败...")
            continue

        lights = response["result"]
        # ``result`` is indexed by the values it yields when iterated.
        for key in lights:
            entry = lights[key]
            bean = LightBean(type_id, entry["use_id"], entry["pos_id"],
                             entry["use_name"])
            lightService.safe_insert(bean)
            fetch_technology(technologyService, lightInfosService, type_id,
                             entry["use_id"])

        # Register the URL once all of its lights have been processed.
        urlManagerService.safe_insert(UrlManager(lights_url, "light"))
def get_light_titles(__manufacturer_data):
    """Compute the header titles for front, rear and interior lamps.

    Walks every model returned by ``modelService.select_by_manufacturer``,
    each model's ``typeList`` and each type's ``lightList``.  A light's
    ``use_name`` is collected into the front, rear or interior group
    according to which ``Constants.*_POS_ID_LIST`` contains its ``pos_id``;
    lights matching none of the lists are ignored.

    Args:
        __manufacturer_data: object providing ``manufacturer_id``.

    Returns:
        A tuple ``(front, rear, internal)`` of sorted, duplicate-free lists
        of lamp names.
    """
    front_names, rear_names, internal_names = set(), set(), set()

    models = modelService.select_by_manufacturer(
        __manufacturer_data.manufacturer_id)
    for model in models:
        for vehicle_type in model.typeList:
            for light in vehicle_type.lightList:
                position = light.pos_id
                if position in Constants.FRONT_POS_ID_LIST:
                    front_names.add(light.use_name)
                elif position in Constants.REAR_POS_ID_LIST:
                    rear_names.add(light.use_name)
                elif position in Constants.INTERNAL_POS_ID_LIST:
                    internal_names.add(light.use_name)

    front_title_list = sorted(front_names)
    rear_title_list = sorted(rear_names)
    internal_title_list = sorted(internal_names)

    logger.info("front_title_list = {}".format(front_title_list))
    logger.info("rear_title_list = {}".format(rear_title_list))
    logger.info("internal_title_list = {}".format(internal_title_list))
    return front_title_list, rear_title_list, internal_title_list
def fetch_type(__url, __typeService):
    """Fetch the type data found at ``__url``.

    If ``urlManagerService`` reports the URL as already fetched, only a log
    line is written.  Otherwise the JSON is downloaded; when its ``error``
    field is 0, each entry of ``result`` becomes a ``TypeBean`` handed to
    ``__typeService.safe_insert``, and the URL is then registered with
    ``urlManagerService``.

    Args:
        __url: address of the type listing.
        __typeService: object whose ``safe_insert`` receives each bean.
    """
    if urlManagerService.has_fetch(__url):
        logger.info("type_URL : {} 已经抓取过".format(__url))
        return

    response = get_json(__url)
    if response["error"] != 0:
        logger.info("type获取失败...")
        return

    for item in response["result"]:
        logger.info("type : {} --> {}".format(item["type_id"], item))

        # The first four characters and the following two are stored
        # separately as "year" and "month" (assumes a YYYYMM... layout -
        # TODO confirm against the API).
        valid_from = item["type_from"]
        from_year, from_month = valid_from[:4], valid_from[4:6]
        valid_to = item["type_to"]
        to_year, to_month = valid_to[:4], valid_to[4:6]

        bean = TypeBean(
            item["type_id"],
            item["model_id"],
            item["type_from"],
            from_year,
            from_month,
            item["type_to"],
            to_year,
            to_month,
            item["type_kw"],
            item["type_axles"],
            item["type_tonnage"],
            item["variant_id"],
            item["type_name"],
            item["kba"],
        )
        __typeService.safe_insert(bean)

    # Register the URL once all of its types have been processed.
    urlManagerService.safe_insert(UrlManager(__url, "type"))
def fetch_model(__url, __manufacturer_id, __modelService, __typeService):
    """Fetch the model data found at ``__url``, then each model's types.

    If ``urlManagerService`` reports the URL as already fetched, only a log
    line is written.  Otherwise the JSON is downloaded; when its ``error``
    field is 0, each entry of ``result`` becomes a ``ModelBean`` handed to
    ``__modelService.safe_insert`` and ``fetch_type`` is called with the
    model's type URL.  The URL is then registered with ``urlManagerService``.

    Args:
        __url: address of the model listing.
        __manufacturer_id: stored on every created ``ModelBean``.
        __modelService: object whose ``safe_insert`` receives each bean.
        __typeService: forwarded unchanged to ``fetch_type``.
    """
    if urlManagerService.has_fetch(__url):
        logger.info("Model_URL : {} 已经抓取过".format(__url))
        return

    response = get_json(__url)
    if response["error"] != 0:
        logger.info("model获取失败...")
        return

    for entry in response["result"]:
        logger.info("model : {} --> {}".format(entry["model_id"],
                                               entry["model_name"]))
        bean = ModelBean(entry["model_id"], entry["model_name"],
                         __manufacturer_id)
        __modelService.safe_insert(bean)

        types_url = Constants.ALL_TYPE_URL.format(entry["model_id"])
        fetch_type(types_url, __typeService)

    # Register the URL once all of its models have been processed.
    model_record = UrlManager(__url, "model")
    urlManagerService.safe_insert(model_record)
def fetch_manufacturer():
    """Fetch all manufacturers (brands), then the models of each one.

    If ``urlManagerService`` reports ``Constants.ALL_MANUFACTURER_URL`` as
    already fetched, only a log line is written.  Otherwise the JSON is
    downloaded; when its ``error`` field is 0, each entry of ``result``
    becomes a ``ManufacturerBean`` (id and name passed through
    ``StringUtil.strip``) handed to ``manufacturerService.safe_insert`` and
    ``fetch_model`` is called with the manufacturer's model URL.  The
    manufacturer URL is then registered with ``urlManagerService``.
    """
    manufacturers_url = Constants.ALL_MANUFACTURER_URL
    if urlManagerService.has_fetch(manufacturers_url):
        logger.info("manufacturer_URL : {} 已经抓取过".format(manufacturers_url))
        return

    response = get_json(manufacturers_url)
    if response["error"] != 0:
        logger.info("品牌获取失败...")
        return

    manufacturers = response["result"]
    # ``result`` is indexed by the manufacturer ids it yields when iterated.
    for manufacturer_id in manufacturers:
        details = manufacturers[manufacturer_id]
        logger.info("{} --> {}".format(manufacturer_id, details))

        bean = ManufacturerBean(
            StringUtil.strip(manufacturer_id),
            StringUtil.strip(details["Manufacturer_name"]))
        manufacturerService.safe_insert(bean)

        # The id is used here exactly as it appears in the response.
        models_url = Constants.ALL_MODEL_URL.format(manufacturer_id)
        fetch_model(models_url, manufacturer_id, modelService, typeService)

    # Register the URL once every manufacturer has been processed.
    urlManagerService.safe_insert(
        UrlManager(manufacturers_url, "manufacturer"))
session = DBSession() return session.query(TypeBean).filter(TypeBean.type_id == __id).first() def select_by_model(self, __model_id, session=None): if session is None: session = DBSession() return session.query(TypeBean).filter_by(model_id=__model_id).all() def safe_insert(self, __type, session=None): if session is None: session = DBSession() try: if self.find_by_key(__type.type_id, session) is None: session.add(__type) session.commit() except BaseException as e: session.rollback() logger.error(e) def select_by_page(self, offset, limit, session=None): """查询,offset起始位置,limit 返回条数 """ if session is None: session = DBSession() return session.query(TypeBean).offset(offset).limit(limit).all() if __name__ == '__main__': typeService = TypeService() logger.info(typeService.count_type()) logger.info(typeService.select_by_page(0, 10))
if session is None: session = DBSession() return session.query(ModelBean).filter_by( manufacturer_id=__manufacturer_id).all() def safe_insert(self, __model, session=None): if session is None: session = DBSession() try: if self.find_by_key(__model.model_id, session) is None: session.add(__model) session.commit() except BaseException as e: session.rollback() logger.error(e) if __name__ == '__main__': service = ModelService() # model = ModelBean(2, "test53223", 1) # service.insert(model) # # selectall = service.select_all(ModelBean) # logger.info("----") # logger.info("--> {}".format(selectall)) model = service.find_by_key(11) logger.info("----") logger.info(model) logger.info("2222")