def listDir(self, dir1, timeRange=None):
    """
    List the keys of all objects stored under a directory (prefix) of the bucket.

    The listing is attempted up to three times; after each failed attempt the
    exception is logged and the method sleeps one second.

    :param dir1: bucket directory, e.g. downData/www_tianyancha_com/detail/company_1
    :param timeRange: optional "begin-end" filter, fmt:2018020100-2018020200.
        A key is kept when the yyyy/mm/dd/hh part of its path satisfies
        begin <= ts < end. While a filter is active, keys without such a
        path part are skipped.
    :return: list of matching object keys; an empty list when every attempt failed
    """
    def getNameTS(name):
        # Turn the ".../yyyy/mm/dd/hh..." part of a key into "yyyymmddhh";
        # None when the key carries no such part.
        m = re.search(r"(\d{4})/(\d{2})/(\d{2})/(\d{2})", name)
        return "".join(m.groups()) if m else None

    tryTime = 0
    while tryTime < 3:
        # Every attempt starts from an empty list, otherwise a listing that
        # broke half-way would leave duplicated keys in the final result.
        objects = []
        try:
            # Always derive the prefix from the caller's value so that a retry
            # does not pre-process an already pre-processed directory.
            prefix = preProcessDir(dir1)
            beginTime, endTime = timeRange.split("-") if timeRange else (
                None, None)
            import oss2
            for idx, object_info in enumerate(
                    oss2.ObjectIterator(self.oss, prefix=prefix)):
                if beginTime and endTime:
                    ts = getNameTS(object_info.key)
                    if ts is None or ts < beginTime or ts >= endTime:
                        continue
                objects.append(object_info.key)
                logDebug("%s:%s" % (idx, object_info.key))
            return objects
        except Exception:
            logException()
            tryTime += 1
            time.sleep(1)
    # All attempts failed: hand back an empty list so callers can iterate safely.
    return []
def extract(self):
    """
    Export matching records from downloaded gzip files into one text file.

    Every *.gz file found eight directory levels below the hard-coded source
    root is read line by line as JSON. Records whose "name" contains u"上海"
    are written as one "##"-separated line to shanghailiepin.txt in the
    hard-coded output root. Lines that fail are logged and skipped.

    :return: None
    """
    srcRoot = "C:/tempAliyun/downData/www_liepin_com"
    outRoot = "E:/shanghai51"
    header = ("name", "address", "website", "type", "name2", "info")
    with codecs.open("%s/shanghailiepin.txt" % outRoot, "w",
                     encoding="utf-8") as out:
        out.write("%s##%s##%s##%s##%s##%s\n" % header)
        gzFiles = glob.glob("%s/*/*/*/*/*/*/*/*.gz" % srcRoot)
        total = 0
        for idx, gzName in enumerate(gzFiles):
            num = 0  # records exported from the current file
            with gzip.open(gzName, 'rb') as src:
                for line in src:
                    try:
                        record = json.loads(line)
                        if u"上海" not in record.get("name", ""):
                            continue
                        row = (record.get("name", ""),
                               record.get("address", ""),
                               record.get("website", ""),
                               record.get("type", ""),
                               record.get("businessLicense", ""),
                               str2line(record.get("info", "")))
                        out.write(u"%s##%s##%s##%s##%s##%s\n" % row)
                        num += 1
                    except Exception:
                        logException()
            total += num
            logInfo("%s:num=%s,total=%s" % (idx, num, total))
def readJsonFromAliyun(self, aliDir, timeRange=None, dest=None):
    """
    Generator yielding every JSON line of the gzip files under a bucket directory.

    Usage:
        lines = AliYun().readJsonFromAliyun('downData/www_tianyancha_com/detail/company_1', '2018020100-2018020300')
        for line in lines:
            logDebug(line['name'])

    A file that is not yet present below ``dest`` is downloaded first. When a
    file fails (download, gzip or JSON error) the exception is logged and the
    generator moves on to the next file.

    :param aliDir: downData/www_tianyancha_com/detail/company_1
    :param timeRange: fmt:2018020100-2018020200
    :param dest: local download root; when empty the default download root is
        used (recommended)
    :return: generator of decoded JSON objects
    """
    if not dest:
        dest = self._getDefaultDownRoot()
    # A failed listing may come back as None; treat it as "nothing to read"
    # instead of failing with a TypeError on the first next().
    files = self.listDir(aliDir, timeRange) or []
    for fileName in files:
        try:
            fileName2 = os.path.join(dest, fileName)
            if not os.path.exists(fileName2):
                self.downFile(fileName, dest)
            with gzip.open(fileName2, 'rb') as f:
                for line in f:
                    yield json.loads(line)
        except Exception:
            logException()
def report(title, content, t_address, jobName=None, batch=None, needLog=1):
    """
    Send a report e-mail, optionally with log files attached.

    :param title: mail subject
    :param content: mail body
    :param t_address: recipient address
    :param jobName: passed to getLogFileName to locate the log files
    :param batch: passed to getLogFileName to locate the log files
    :param needLog: bit mask - 0: no log, 1: job log file, 2: node log, 3: both
    :return: None
    """
    attaches = []

    def _attach(logPath):
        # Attach an existing log file; files above 500k are gzipped first.
        if not os.path.exists(logPath):
            return
        size = os.path.getsize(logPath)
        if size > 1024 * 500:
            logPath = gzipOneFile(logPath)
        attaches.append((os.path.split(logPath)[1], logPath))
        logInfo("add one attach%s,size=%s" % (logPath, size))

    if needLog:
        try:
            jobLog, nodeLog = getLogFileName(batch, jobName)
            logInfo("jobLog=%s\nnodeLog=%s" % (jobLog, nodeLog))
            for flag, logPath in ((1, jobLog), (2, nodeLog)):
                if needLog & flag:
                    _attach(logPath)
        except Exception:
            # A problem while collecting logs must not prevent the mail itself.
            logException()
    # Send the e-mail with whatever attachments were collected.
    Mail.sendEmail(title, content, t_address, attaches=attaches)
def saveSyncPoint(self, result, sync2Remote=False):
    """
    Save the sync point carried by a result and strip it from the result.

    Nothing is saved when self.index is empty or when the result carries no
    CFG_DOWN_SYNCINFO entry. Otherwise the sync info is written to
    self.localFile and, on save number 1, 1+interval, 1+2*interval, ...
    (interval = gConfig CFG_DOWN_SYNCINTERVAL, default 5) or whenever
    sync2Remote is set, also pushed with self.syncToRemote.

    :param result: result dict, possibly containing a CFG_DOWN_SYNCINFO entry
    :param sync2Remote: force a remote sync for this save
    :return: the result without its sync info
    """
    if not self.index:
        return result
    try:
        if CFG_DOWN_SYNCINFO in result:
            # Take the sync info out of the result so that it is not stored
            # together with the payload.
            syncInfo = result.pop(CFG_DOWN_SYNCINFO)
            index = self.index
            data = {
                "id": md5(index),
                "idx": index,
                "syncInfo": syncInfo,
                "upTime": getTimestamp()
            }
            json2File(self.localFile, data)
            self.saveNum += 1
            interval = gConfig.get(CFG_DOWN_SYNCINTERVAL, 5)
            # Comparing with `1 % interval` keeps the 1, 1+interval, ...
            # schedule and makes an interval of 1 mean "every save"
            # (a plain `== 1` can never match when interval is 1).
            if sync2Remote or (self.saveNum % interval == 1 % interval):
                self.syncToRemote(data)
    except Exception:
        logException()
    return result
def resetDb(self):
    """
    (Re)connect to the database described by self.curCfg.

    The current connection is closed first. Up to 10 attempts are made; each
    failed attempt is logged. On success self.conn holds the new connection.
    A DictCursor is used as cursor class when self.dictCursor is set.

    :return: None
    """
    tryTime = 10  # number of connection attempts left
    while tryTime > 0:
        try:
            self.close()
            database, host, user, passwd, port = self.curCfg
            connArgs = dict(host=host, user=user, passwd=passwd, db=database,
                            port=port, charset="utf8")
            if self.dictCursor:
                from MySQLdb.cursors import DictCursor
                connArgs["cursorclass"] = DictCursor
            self.conn = MySQLdb.connect(**connArgs)
            if self.conn:
                return
        except Exception:
            logException("tryTime-%s" % tryTime)
        # Count the attempt on every path so the loop always terminates.
        tryTime -= 1
def createDb(dbNameKey, dbParams=None):
    """
    Create a database object and register it in gTop under dbNameKey.

    Does nothing when CFG_DB_DISABLE is set in gConfig or when no connection
    parameters are available. Errors are logged, not raised.

    :param dbNameKey: config key of the database; currently only db.monitor
        (data monitoring) and db.business (business data)
    :param dbParams: connection parameters; DEV and TEST have defaults, ONLINE
        gets them assigned through jobManager. When empty they are looked up
        with AccountManager().getAccount(dbNameKey)
    :return: None
    """
    from superbase.globalData import gConfig, gTop
    # CFG_DB_DISABLE switches database usage off (deprecated)
    if gConfig.get(CFG_DB_DISABLE, 0):
        return
    dbName = gConfig.get(dbNameKey)
    try:
        db_params = dbParams or AccountManager().getAccount(dbNameKey)
        if not db_params:
            return
        db = createDb2(dbName, db_params,
                       dictCursor=gConfig.get(CFG_DB_DICTCURSOR, 0))
        # Make the connection reachable through the global registry.
        gTop.set(dbNameKey, db)
    except Exception:
        logException()
def downLists(self, listConf, listItemConf, resultHandlerClass, urlMgr):
    """
    Download every list page provided by the url manager.

    Each page url is handed to self.downOneList together with a fresh result
    handler instance. After every downloaded page self.checkDownStatus is
    asked for the status; the loop stops as soon as IS_ERROR reports it as an
    error. An exception on one page is logged and the next page is tried.

    :param listConf: list configuration
    :param listItemConf: list item configuration
    :param resultHandlerClass: result handler class, instantiated per page
    :param urlMgr: urlManager providing the page urls
    :return: the last status returned by checkDownStatus (0 if none)
    """
    logInfo("%s_begin downLists" % getTimestamp())
    err = 0
    num = 0  # pages downloaded so far
    for url in urlMgr.pageUrls(listConf):
        try:
            logDebug(url)
            handler = resultHandlerClass()
            self.downOneList(url, listConf, listItemConf, handler)
            num += 1
            err = self.checkDownStatus(num)
            if IS_ERROR(err):
                break
        except Exception:
            logException()
    return err
def __decorator(*params, **kwargs):
    """
    Call the wrapped function after announcing the job start.

    self.jobBegin() is called first, then the wrapped function with the given
    positional and keyword arguments. An exception from either call is stored
    in self.jobError and logged instead of being raised.

    :return: the wrapped function's result, None when an exception occurred
    """
    try:
        self.jobBegin()
        return func(*params, **kwargs)
    except Exception as e:
        self.jobError = e
        logException()
def asyncRun(cmd, shell=True):
    """
    Start a command in the background without waiting for it.

    The child's stdout and stderr are discarded. Errors while starting the
    command are logged, not raised.

    :param cmd: command string (or argument sequence) handed to subprocess.Popen
    :param shell: passed through to subprocess.Popen
    :return: None
    """
    try:
        import os
        # (cmd,) keeps the format working when cmd is a tuple
        logDebug("asyncRun:%s" % (cmd,))
        # Nobody ever reads the child's output. Connecting it to a pipe would
        # block the child once the pipe buffer is full, so send it to the
        # null device instead.
        devnull = open(os.devnull, "wb")
        try:
            subprocess.Popen(cmd, stdout=devnull, stderr=subprocess.STDOUT,
                             shell=shell)
        finally:
            # The child keeps its own copy of the handle.
            devnull.close()
    except Exception:
        logException()
def __decorator(*params, **kwargs):
    """
    Call the wrapped function, retrying on any exception.

    At most maxTry attempts are made. After each failed attempt the exception
    is logged together with the first positional argument and the call sleeps
    tsleep seconds.

    :return: the wrapped function's result, None when every attempt failed
    """
    tryTime = 0
    while tryTime < maxTry:
        try:
            return func(*params, **kwargs)
        except Exception:
            # Guard the lookup: an IndexError raised here would escape the
            # retry loop when the function was called without positional args.
            firstArg = params[0] if params else None
            logException("url=%s" % (firstArg,))
            tryTime += 1
            time.sleep(tsleep)
def reloadModule(name):
    """
    Reload a module that has already been imported.

    :param name: module name as registered in sys.modules
    :return: None; a failure (e.g. unknown module name) is logged
    """
    try:
        module = sys.modules[name]
        reload(module)
    except Exception:
        logException()
def downDir(self, dir1, dest=None, timeRange=None):
    """
    Download every object listed under a bucket directory.

    A failure on one object is logged and the remaining objects are still
    downloaded.

    :param dir1: bucket directory to download
    :param dest: local download root; the default download root when empty
    :param timeRange: optional time filter handed to listDir
    :return: None
    """
    if not dest:
        dest = self._getDefaultDownRoot()
    # A failed listing may come back as None; treat it as "nothing to
    # download" instead of raising a TypeError.
    objects = self.listDir(dir1, timeRange) or []
    for obj in objects:
        try:
            self.downFile(obj, dest)
        except Exception:
            logException()
def callFunction(func, argv):
    """
    Call a function with a sequence of positional arguments.

    :param func: the callable to invoke
    :param argv: sequence of positional arguments
    :return: the callable's result; None when the call raised (the exception
        is logged)
    """
    try:
        # Argument unpacking replaces the deprecated apply() builtin.
        return func(*argv)
    except Exception:
        logException()
def close(self):
    """
    Close the database connection and release it.

    A failure while closing is logged; self.conn is reset to None either way.

    :return: None
    """
    try:
        conn = self.conn
        if conn:
            conn.close()
    except Exception:
        logException()
    self.conn = None
def closeCursor(self, cur):
    """
    Close a cursor, logging instead of raising when closing fails.

    :param cur: the cursor to close; a falsy value is ignored
    :return: None
    """
    try:
        if not cur:
            return
        cur.close()
    except Exception:
        logException()
def applyFunc(obj, strFunc, arrArgs):
    """
    Call a method of an object by name.

    :param obj: the object to use
    :param strFunc: method name
    :param arrArgs: arguments handed to callFunc
    :return: the result of callFunc; None when it raised (the exception is
        logged)
    """
    try:
        return callFunc(obj, strFunc, arrArgs)
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        logException()
def safeReg1(reg, str, tag):
    """
    Return the first group of a pattern's first match in a string.

    :param reg: compiled pattern
    :param str: the text to search
    :param tag: label used in the log message, for debugging
    :return: group(1) of the match; None when there is no match or the search
        failed (logged together with tag and the text)
    """
    try:
        match = reg.search(str)
        return match.group(1)
    except Exception:
        logException("regError:%s--%s" % (tag, str))
def handle(self, result):
    """
    Pre-process a result and save it to a file.

    The pre-processed result (if not empty) first passes through
    self.sync.saveSyncPoint; what that returns is written with self.saveFile
    to the file name built from getSavePath() and getFileName(). Exceptions
    are logged, not raised.

    :param result: the raw result
    :return: None
    """
    try:
        result = self.preProcess(result)
        if not result:
            return
        stripped = self.sync.saveSyncPoint(result)
        target = os.path.join(self.getSavePath(), self.getFileName())
        self.saveFile(target, stripped)
    except Exception:
        logException()
def upDir(self, dir1, gzfirst=True):
    """
    Upload every gz file of a local directory.

    A failure on one file is logged and the remaining files are still tried.

    :param dir1: local directory
    :param gzfirst: gzip the files of the directory (gzAllFiles) before
        collecting the gz files
    :return: number of gz files found (whether or not each upload succeeded)
    """
    if gzfirst:
        gzAllFiles(dir1)
    files = getGZFromDir(dir1)
    for gzPath in files:
        try:
            # NOTE: a former "skip files smaller than 30 bytes" filter was
            # disabled in the original code; every file is uploaded.
            self.upFile(gzPath)
        except Exception:
            logException()
    return len(files)
def getResult(parent, template, result):
    """
    Parse every element of a template and store the values in ``result``.

    :param parent: pyquery node the elements are parsed against
    :param template: key:value mapping, value is one of CssElement,
        ListElement, EmmbedElement
    :param result: dict receiving the parsed value of every key
    :return: None; parsing stops at the first exception, which is logged
    """
    try:
        for name, node in template.items():
            result[name] = node.parse(parent)
    except Exception:
        logException()
def upFile(self, file1):
    """
    Upload one local file to the bucket, retrying on failure.

    The destination key is the local path without self.prefix, passed through
    self.preProcessPath. Up to MAX_TRY_UP attempts are made, one second apart;
    an attempt counts as failed when it raises or when the response status is
    not 200.

    :param file1: local file path, expected to start with self.prefix
    :return: None
    """
    dest = self.preProcessPath(file1[len(self.prefix):])
    MAX_TRY_UP = 10
    for _ in range(MAX_TRY_UP):
        try:
            with open(file1, 'rb') as fileobj:
                res = self.oss.put_object(dest, fileobj)
            if res and res.status == 200:
                logInfo("ret=%s,file=%s" % (res.status, dest))
                return
        except Exception:
            logException()
        # Reached after an exception as well as after a non-200 response, so
        # every failed attempt is counted and throttled.
        time.sleep(1)
    # Make the final failure visible instead of returning silently.
    logInfo("upload failed after %s tries,file=%s" % (MAX_TRY_UP, dest))
def handle(self, parent, css, attr):
    """
    Obtain a value through the callback self.func.

    :param parent: the parent node
    :param css: css selector path
    :param attr: attribute to read
    :return: whatever self.func(parent, css, attr, self.otherParams) returns;
        "" when the callback raised (the exception is logged)
    """
    try:
        return self.func(parent, css, attr, self.otherParams)
    except Exception:
        logException()
    return ""
def safeEval(data, ret=None):
    """
    Evaluate a Python literal with ast.literal_eval, never raising.

    :param data: literal text, e.g. {'a':1}
    :param ret: value returned when data is empty or cannot be evaluated
    :return: the evaluated literal, otherwise ``ret``
    """
    try:
        if not data:
            return ret
        import ast
        # literal_eval only accepts Python literals, not arbitrary expressions
        return ast.literal_eval(data)
    except Exception:
        from superbase.utility.logUtil import logException
        logException(data)
        return ret
def getPageSync(driver, url, key, val, timeout=0):
    """
    Open a page and, if a locator is given, wait for an element to appear.

    :param driver: the web driver
    :param url: page to open
    :param key: locator strategy handed to find_element; when falsy no
        element is waited for
    :param val: locator value handed to find_element
    :param timeout: seconds to wait for the element; 10 when 0
    :return: the found element; None when no locator was given or when
        loading/waiting failed (the exception is logged)
    """
    found = None
    try:
        driver.get(url)
        if key:
            waitSec = timeout or 10
            waiter = WebDriverWait(driver, waitSec)
            found = waiter.until(lambda x: x.find_element(key, val))
    except Exception:
        logException()
    return found
def getElementSync(driver, keyVal, timeout=0):
    """
    Wait until an element can be found and return it.

    :param driver: the web driver
    :param keyVal: (key,val),eg.(By.ID,"id1")
    :param timeout: seconds to wait; 10 when 0
    :return: the found element; None when waiting failed (the exception is
        logged)
    """
    try:
        key, val = keyVal
        waitSec = timeout or 10
        # NOTE(review): 0.1 and True are passed positionally exactly as in the
        # original - confirm their meaning against the WebDriverWait
        # signature of the selenium version in use.
        waiter = WebDriverWait(driver, waitSec, 0.1, True)
        return waiter.until(lambda x: x.find_element(key, val))
    except Exception:
        logException()
    return None
def handle(self, parent, css, attr):
    """
    Follow a link found on the page and obtain data through self.func.

    :param parent: the parent node
    :param css: css selector path locating the next-level url
    :param attr: attribute to read
    :return: whatever self.func(url, self.conf, self.otherParams) returns;
        "" when extraction or the callback raised (the exception is logged)
    """
    try:
        # url of the next-level page, taken from the selected node
        nextUrl = Extractor.getValue(parent, css, attr)
        return self.func(nextUrl, self.conf, self.otherParams)
    except Exception:
        logException()
    return ""
def handle(self, parent, css, attr):
    """
    Extract a value and reduce it to the first group matched by self.pat.

    :param parent: the parent node
    :param css: css selector path
    :param attr: attribute to read
    :return: the stripped first regex group; when the pattern does not match,
        the extracted value as-is; "" when the extraction itself failed.
        Failures are logged.
    """
    value = ""
    try:
        value = Extractor.getValue(parent, css, attr)
        # value keeps the raw extraction if the next line raises
        value = self.pat.search(value).group(1).strip()
    except Exception:
        logException("regex-error:value=%s css=%s attr=%s pat=%s" % (value, css, attr, self.debugInfo))
    return value
def deleteNullDir(dirr):
    """
    Recursively remove the empty directories below (and including) a path.

    Sub-directories are processed first, so a directory holding only empty
    directories is removed as well. A failing removal is logged.

    :param dirr: path to clean up; anything that is not a directory is ignored
    :return: None
    """
    if not os.path.isdir(dirr):
        return
    for entry in os.listdir(dirr):
        child = os.path.join(dirr, entry)
        if os.path.isdir(child):
            deleteNullDir(child)
    # List again: the recursion above may just have emptied this directory.
    if os.listdir(dirr):
        return
    info = 'del empty dir: %s' % dirr
    try:
        os.rmdir(dirr)
        logInfo("done:%s" % info)
    except Exception:
        logException("fail-%s" % info)
def safeExecute(self, sql, values=None):
    """
    Execute a SQL statement on a new cursor without raising.

    :param sql: the SQL statement
    :param values: optional parameters for the statement
    :return: the cursor holding the result; None when creating the cursor or
        executing the statement failed (the exception is logged). The caller
        is responsible for closing a returned cursor.
    """
    cur = None
    try:
        cur = self.conn.cursor()
        if values:
            cur.execute(sql, values)
        else:
            cur.execute(sql)
        return cur
    except Exception:
        logException()
        # The caller never sees this cursor, so release it here.
        if cur is not None:
            try:
                cur.close()
            except Exception:
                logException()
    return None