def get_value_and_key(file_path):
    """Collect the translatable entries of an xml resource file.

    Every ``<string name="...">`` element contributes one entry keyed by its
    ``name`` attribute.  Every ``<item>`` inside a ``<string-array name="...">``
    contributes one entry keyed ``<name>-INDEX-<position>`` (position is
    0-based).  Values are obtained through ``XMLParse.get_text_node_value``.

    :param file_path: path of the xml file to read
    :return: an ``OrderedDict`` mapping key -> value in document order
             (strings first, then array items), or ``None`` when the path is
             empty or the file does not exist
    """
    if not (file_path and os.path.exists(file_path)):
        Log.error("xml 文件不存在")
        return

    document = xml.dom.minidom.parse(file_path)
    result = collections.OrderedDict()

    # Plain <string> entries.
    for string_node in document.getElementsByTagName('string'):
        name = string_node.getAttribute("name")
        result[name] = XMLParse.get_text_node_value(string_node)

    # <string-array> entries: one synthetic key per <item>.
    for array_node in document.getElementsByTagName("string-array"):
        array_name = array_node.getAttribute('name')
        items = array_node.getElementsByTagName('item')
        for position, item_node in enumerate(items):
            item_key = array_name + "-INDEX-" + str(position)
            result[item_key] = XMLParse.get_text_node_value(item_node)

    return result
def xls2xml(self, xls_path, file_path, target_language, target_dir_path):
    """Open an excel workbook and hand its first sheet to ``convert``.

    The three target arguments are stored on the instance (``filePath``,
    ``targetLanguage``, ``dirPath``) before ``convert`` runs.

    :param xls_path: path of the excel workbook
    :param file_path: path of the target xml file
    :param target_language: target language
    :param target_dir_path: directory holding the target xml files
    :return: ``Constant.Error(Constant.ERROR_EXCEL_NOT_EXIST)`` when the
             workbook path is empty or missing, otherwise whatever
             ``self.convert`` returns
    """
    Log.info("--- xls2xml ---")

    # The workbook is mandatory input.
    excel_missing = not xls_path or not os.path.exists(xls_path)
    if excel_missing:
        return Constant.Error(Constant.ERROR_EXCEL_NOT_EXIST)

    self.filePath = file_path
    self.targetLanguage = target_language
    self.dirPath = target_dir_path

    # Only the first sheet (index 0) is processed.
    workbook = XLSParse()
    workbook.open_excel(xls_path)
    first_sheet = workbook.sheet_by_index(0)

    sheet_summary = (first_sheet.name, first_sheet.nrows, first_sheet.ncols)
    Log.info("name = %s, rows number = %s,clos number = %s" % sheet_summary)
    return self.convert(first_sheet)
def addParser():
    """Parse the command line and return the parsed options.

    Registers the ``-i/--input``, ``-f/--targetFilePath``,
    ``-l/--targetLanguage`` and ``-d/--targetDirPath`` options, parses the
    process arguments, logs the outcome and returns the options object
    produced by ``OptionParser.parse_args``.
    """
    option_specs = (
        ("-i", "--input", "excel file path"),
        ("-f", "--targetFilePath",
         "means target output is xml file and input the file path"),
        ("-l", "--targetLanguage",
         "target language shortname(just for output is file)"),
        ("-d", "--targetDirPath",
         "means target output is dir contains xml file(s)"),
    )

    parser = OptionParser()
    for short_flag, long_flag, help_text in option_specs:
        parser.add_option(short_flag, long_flag, help=help_text)

    parsed = parser.parse_args()
    options, args = parsed
    Log.info("options: %s, args: %s" % (options, args))
    return options
def update_xml_value(file_path, keys, values):
    """Overwrite the text of matching entries in an xml resource file.

    ``keys`` and ``values`` are parallel sequences.  A ``<string>`` element
    is matched by its ``name`` attribute; an ``<item>`` of a
    ``<string-array>`` is matched by ``<array name>-INDEX-<position>``.
    A match only replaces the text when the new value is non-empty.  When a
    key occurs several times, the last non-empty value wins.  Elements that
    have no child node are left untouched.  The document is then written
    back to ``file_path`` as UTF-8.

    :param file_path: xml file to update in place; nothing happens when it
                      does not exist
    :param keys: entry keys
    :param values: replacement texts, aligned with ``keys``
    :return: None
    """
    if not os.path.exists(file_path):
        return

    xml_doc = xml.dom.minidom.parse(file_path)

    # --- <string> entries ---
    for node in xml_doc.getElementsByTagName('string'):
        if node.firstChild is None:
            continue
        xml_key = node.getAttribute("name")
        old_value = XMLParse.get_text_node_value(node)  # only used for logging
        for key, value in zip(keys, values):
            if key == xml_key and len(value) != 0:
                node.firstChild.data = value
                Log.info("%s : %s -- >%s " % (xml_key, old_value, node.firstChild.data))

    # --- <string-array> entries ---
    for array_node in xml_doc.getElementsByTagName('string-array'):
        array_name = array_node.getAttribute('name')
        child_nodes = array_node.getElementsByTagName('item')
        for idx, child_node in enumerate(child_nodes):
            # An empty <item/> has no text node to update; skip it instead of
            # failing on ``None.data`` (same guard as the <string> branch).
            if child_node.firstChild is None:
                continue
            item_key = array_name + "-INDEX-" + str(idx)
            old_value = child_node.firstChild.data
            for key, value in zip(keys, values):
                if key == item_key and len(value) != 0:
                    child_node.firstChild.data = value
                    Log.info("%s : %s --> %s" % (item_key, old_value, child_node.firstChild.data))

    # ``toxml('utf-8')`` produces encoded bytes, so write in binary mode; the
    # context manager closes the file even if the write fails.
    with open(file_path, 'wb') as write_file:
        write_file.write(xml_doc.toxml('utf-8'))
def update_multi_xml_value(sub_dir_path, keys, values, modules):
    """Update the per-module xml files of one language directory.

    ``keys``, ``values`` and ``modules`` are parallel sequences (one entry
    per excel row).  Rows whose module is ``None`` or ``""`` are ignored.
    The remaining rows are split into runs of consecutive rows that share
    the same module; for every run ``XMLParse.update_xml_value`` is called
    on ``sub_dir_path + module + ".xml"`` with that run's keys and values.
    Rows are not sorted, so a module that appears in several separate runs
    is updated once per run.

    :param sub_dir_path: target sub directory, e.g. ``value-zh``; it is used
                         as a plain string prefix of the file name
    :param keys: entry keys
    :param values: replacement texts, aligned with ``keys``
    :param modules: module (file base name) of each row, aligned with ``keys``
    :return: None
    """
    Log.info("\n\n" + sub_dir_path + "\n\n")

    if len(modules) == 0:
        return

    # Build runs of consecutive rows sharing a module: (module, keys, values).
    # Building them directly avoids indexing into an empty list when every
    # module cell is blank and never produces an empty run.
    runs = []
    for module, key, value in zip(modules, keys, values):
        if module is None or module == "":
            continue
        if not runs or runs[-1][0] != module:
            runs.append((module, [], []))
        runs[-1][1].append(key)
        runs[-1][2].append(value)

    for module, sub_keys, sub_values in runs:
        file_path = sub_dir_path + module + ".xml"
        XMLParse.update_xml_value(file_path, sub_keys, sub_values)
import DocUtils
import os
from ExcelUtils import Excel
import shutil
import re
import time
from LogUtils import Log

log = Log("config/config.txt")
tdr_data_list = []


def test():
    """Convert the documents under ./doc and feed the matching ones to excel.

    Steps performed:
      1. make sure the ``excel`` and ``docx`` working directories exist and
         empty them with ``clear_dir``;
      2. call ``DocUtils.doc_2_docx`` from ``<cwd>/doc`` to ``<cwd>/docx``;
      3. walk ``<cwd>/docx`` and pass every file whose name matches
         ``.*格式件.*`` to ``deal_data_2_excel``.

    A file that raises during step 3 is reported through ``log.log_error``
    and skipped; the walk continues with the next file.
    """
    log.log_info("开始处理!")

    work_dirs = ("excel", "docx")
    for work_dir in work_dirs:
        if not os.path.exists(work_dir):
            os.mkdir(work_dir)
    for work_dir in work_dirs:
        clear_dir(work_dir)

    DocUtils.doc_2_docx(os.getcwd() + "/doc", os.getcwd() + "/docx")

    docx_root = os.getcwd() + "/docx"
    for parent, _directories, file_names in os.walk(docx_root):
        for file_name in file_names:
            if re.match(".*格式件.*", file_name) is None:
                continue
            try:
                deal_data_2_excel(parent + "/" + file_name)
            except Exception:
                # Best effort: report the unreadable file and keep going.
                log.log_error("文件:" + parent + "/" + file_name + "格式有误,读取失败,请人工读取!")
# Mail API endpoint and base url of the novel pages.
url = 'http://util.online/spider/api/mail'
weburl = 'https://util.online/spider/novel/'

# Template of the notification mail payload.
body = {
    "to": "",
    "subject": "XX小说出新章节咯",
    "text": "新的章节是 http://www.baidu.com",
    "html": "<h1>Welcome</h1><p>That was easy!</p ><a href=' '>新的章节</a >"
}
headers = {'content-type': "application/json"}

# NOTE(review): host and credentials are hard-coded in source; consider
# loading them from configuration or the environment.
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts the
# connection parameters positionally.
db = pymysql.connect(host="39.104.226.149", user="root", password="root",
                     database="spider", charset='utf8')
keyVa = {}
log = Log()


def main():
    """Run one polling cycle and schedule the next one in 60 seconds."""
    # Query all novels.
    da = seloss()
    # Query the latest chapter stored in the key_value table.
    selKeyValue(da)
    # Check whether any novel has been updated.
    selNovel()
    threading.Timer(60, main).start()


# Query all novels.
def seloss():
    ossCur = db.cursor()
_create_unverified_https_context = ssl._create_unverified_context except AttributeError: pass else: ssl._create_default_https_context = _create_unverified_https_context # 根据传入参数设置从哪里开始下载 starturl = "https://www.dingdiann.com" searchurl = "https://www.dingdiann.com/searchbook.php?keyword=" db = pymysql.connect("39.104.226.149", "root", "root", "spider", charset='utf8') id_not_in = {} logger = Log() # 获取章节内容 def spiderContent(url, id, name): try: response = urllib2.urlopen(url, timeout=60) the_page = response.read() soup = BeautifulSoup(the_page, "html.parser") bookName = soup.select("div[class='bookname'] > h1")[0].text bookContent = soup.select("div[id='content']")[0] nextPage = soup.select("div[class='bottem1'] > a")[3]["href"] li_plants = bookContent.script if li_plants: li_plants.clear() data = str(bookContent).replace("\\", "").replace(
def convert(self, sheet):
    """Apply the translations of an excel sheet to the target xml file(s).

    The first row of the sheet is the header.  The columns titled
    ``self.keyTitle``, ``self.moduleTitle`` and ``self.targetLanguage`` are
    located in it (the last matching column wins).

    * File mode - ``self.filePath`` is set and the target language column
      exists: that column is applied to ``self.filePath`` through
      ``XMLParse.update_xml_value``.
    * Directory mode - otherwise: for every header column whose index is at
      least ``self.fromIndex`` the column title is treated as a language,
      mapped to a sub directory with ``covertTargetPath`` and, when that
      directory exists, applied with ``XMLParse.update_multi_xml_value``.
      Expected layout, e.g.::

          android/
              values-zh/strings_device.xml, strings_me.xml, ...
              values-de/
              values-ko/

    :param sheet: excel sheet object (must provide ``row_values`` and
                  ``col_values``)
    :return: a ``Constant.Error`` carrying ``SUCCESS`` or the code of the
             first problem found (unreadable sheet, key/module column
             missing, target directory missing or not given)
    """
    Log.info("--- convert ---")

    # Header row: list of all cell values of row 0.
    try:
        firstRow = sheet.row_values(0)
    except Exception as e:
        # ``e.message`` only exists on Python 2; fall back to ``str(e)``.
        detail = getattr(e, "message", None) or str(e)
        return Constant.Error(Constant.EXCEPTION_EXL_FILE, detail)
    if len(firstRow) == 0:
        return Constant.Error(Constant.ERROR_KEY_NOT_FOUND)

    keyIndex = -1
    moduleIndex = -1
    tempLanguageIndex = None
    for index, title in enumerate(firstRow):
        if title == self.keyTitle:
            keyIndex = index
        elif title == self.moduleTitle:
            moduleIndex = index
        elif title == self.targetLanguage:
            tempLanguageIndex = index

    if keyIndex == -1:
        return Constant.Error(Constant.ERROR_KEY_NOT_FOUND)

    # Key column without its title cell.
    xlsKeys = sheet.col_values(keyIndex)
    del xlsKeys[0]

    # File mode.  Compare against None explicitly: column index 0 is a valid
    # position and must not be mistaken for "language not found".
    if self.filePath and tempLanguageIndex is not None:
        Log.debug("keyIndex = %s moduleIndex = %s languageIndex = %s" % (keyIndex, moduleIndex, tempLanguageIndex))
        # Value column without its title cell.
        xlsValues = sheet.col_values(tempLanguageIndex)
        del xlsValues[0]
        XMLParse.update_xml_value(self.filePath, xlsKeys, xlsValues)
        return Constant.Error(Constant.SUCCESS)

    Log.debug("Not file")
    if moduleIndex == -1:
        return Constant.Error(Constant.ERROR_MODULE_NOT_FOUND)

    if not self.dirPath:
        # No target directory given.
        Log.error("Error:输入不合法")
        return Constant.Error(Constant.ERROR_IMPORT_INPUT)

    if not os.path.exists(self.dirPath):
        Log.error("Error:目标目录不存在 %s" % self.dirPath)
        return Constant.Error(Constant.ERROR_DIR_NOT_EXIST)

    # The key and module columns are the same for every language, so they are
    # read once instead of once per language column.
    xlsModules = sheet.col_values(moduleIndex)
    del xlsModules[0]

    for index, title in enumerate(firstRow):
        if index < self.fromIndex:
            continue

        # Sub directory of this language, e.g. values-zh.
        sub_dir_path = covertTargetPath(self.dirPath, title)
        print(sub_dir_path)
        if not os.path.exists(sub_dir_path):
            continue

        xlsValues = sheet.col_values(index)
        del xlsValues[0]
        XMLParse.update_multi_xml_value(sub_dir_path, xlsKeys, xlsValues, xlsModules)

    return Constant.Error(Constant.SUCCESS)