def crawlXimalaya():
    # Local folder the downloaded album is saved into
    albumPath = 'D:\\lldxx\\'
    htmlUrl = 'https://www.ximalaya.com/ertong/33647650/'
    apiUrl = 'https://www.ximalaya.com/revision/play/v1/audio?id={0}&ptype=1'
    crawler = Crawler()
    # Fetch every trackName and trackId from the album page
    content = crawler.downloadHtml(htmlUrl)
    idList = cleaner.getAlbumIdList(content)
    for item in idList:
        sections = item[1].split('/')
        trackId = sections[-1]
        url = apiUrl.format(trackId)
        # Fetch the download URL of this track
        data = crawler.getJson(url)
        audioUrl = data['data']['src']
        # Download the track and save it to disk
        data = crawler.downloadFile(audioUrl)
        FileUtil.writeFile(albumPath, '{0}.m4a'.format(item[0]), data.content)
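The crawler above leans on project-local helpers that are not shown (Crawler, cleaner, FileUtil). Below is a minimal sketch of what Crawler and FileUtil.writeFile would need to provide to satisfy the call sites, assuming the requests library; the method bodies are illustrative, not the project's own, and the cleaner HTML-parsing helper is omitted.

import os
import requests

class Crawler(object):
    # Illustrative stand-in matching the calls in crawlXimalaya
    def downloadHtml(self, url):
        # Fetch a page and return its HTML text
        return requests.get(url).text

    def getJson(self, url):
        # Fetch a URL and decode the JSON body
        return requests.get(url).json()

    def downloadFile(self, url):
        # Fetch a binary file; .content on the response holds the bytes
        return requests.get(url)

class FileUtil(object):
    @staticmethod
    def writeFile(folder, name, data):
        # Write raw bytes under folder/name, creating the folder if needed
        if not os.path.exists(folder):
            os.makedirs(folder)
        with open(os.path.join(folder, name), 'wb') as f:
            f.write(data)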
usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath" \ "baseDataset baseDatasetFormat" \ "outputFilename outoutFileFormat" parser = OptionParser() parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t") parser.add_option("-p", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=10) (c_options, args) = parser.parse_args() inputFilename1 = args[0] inputFileFormat1 = args[1] inputPath = args[2] baseFilename = args[3] baseFormat = args[4] outputFilename = args[5] outputFileFormat = args[6] print "Got options:", c_options, ", " \ "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \ ", base:", baseFilename, ",", baseFormat print "Write output to:", outputFilename fileUtil = FileUtil(sc) input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options).partitionBy(c_options.numPartitions) base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options) result_rdd = EntityCleaner.clean_rdds(input_rdd1, inputPath, base_rdd, c_options.numPartitions) fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
inputFilename1 = args[0]
inputFileFormat1 = args[1]
inputPath = args[2]
baseFilename = args[3]
baseFormat = args[4]
joinResultFilename = args[5]
joinFormat = args[6]
outputFilename = args[7]
outputFileFormat = args[8]

# Optional comma-separated list of elements to drop during the merge
removeElementsStr = c_options.remove
removeElements = []
if len(removeElementsStr) > 0:
    removeElements = removeElementsStr.split(",")

print "Got options:", c_options, ", " \
    "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \
    ", base:", baseFilename, ",", baseFormat, ", join:", joinResultFilename
print "Write output to:", outputFilename

fileUtil = FileUtil(sc)
input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options)
base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options)
join_rdd = fileUtil.load_json_file(joinResultFilename, joinFormat, c_options)

result_rdd = EntityMerger.merge_rdds(input_rdd1, inputPath, base_rdd, join_rdd,
                                     removeElements, c_options.numPartitions)
fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
# Then pass the result to merge-rdds along with input-rdd, or output-rdd if defined;
# set output-rdd to the result of the merge.
# Return the output.
if __name__ == "__main__":
    sc = SparkContext(appName="DIG-FRAMER")

    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    parser.add_option("-n", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=5)
    (c_options, args) = parser.parse_args()

    frameFilename = args[0]
    rddFilename = args[1]
    outputFilename = args[2]
    if len(args) > 3:
        outputFileFormat = args[3]
    else:
        outputFileFormat = "text"

    # Load the type-to-RDD mapping and the frame definition from disk
    type_to_rdd_json_input = open(rddFilename)
    type_to_rdd_json = json.load(type_to_rdd_json_input)
    type_to_rdd_json_input.close()

    frame_input = open(frameFilename)
    frame = json.load(frame_input)
    frame_input.close()

    fileUtil = FileUtil(sc)
    # Attach a loaded RDD to every entry in the mapping, then frame them
    for key, val in type_to_rdd_json.items():
        val["rdd"] = fileUtil.load_json_file(val["path"], val["format"], c_options)

    output_rdd = frame_json(frame, type_to_rdd_json)
    print "Write output to:", outputFilename
    fileUtil.save_json_file(output_rdd, outputFilename, outputFileFormat, c_options)
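For reference, the framer driver expects rddFilename to hold a JSON object keyed by type name, each value carrying the "path" and "format" fields read in the loop above. The type names and paths below are invented for illustration:

{
    "WebPage": {"path": "data/webpages.seq", "format": "sequence"},
    "Offer": {"path": "data/offers.json", "format": "text"}
}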
def index():
    f = FileUtil.get_all_file(app.config["UPLOAD_FOLDER"])
    return render_template("index.html", files=f)
def status(name):
    data = dict()
    data["name"] = name
    data["complete"] = FileUtil.is_exists(RESULT_FOLDER + FileUtil.spit_filename(name) + ".log")
    data["verify"] = FileUtil.is_exists(WORK_FOLDER + FileUtil.spit_filename(name))
    return jsonify(result=data)
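Assuming FileUtil.spit_filename strips the extension from the uploaded name, a status request for a finished job would answer with a payload of roughly this shape (values invented):

{"result": {"name": "report.csv", "complete": true, "verify": false}}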
#!/usr/bin/env python

from pyspark import SparkContext
from optparse import OptionParser
from fileUtil import FileUtil

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-TEXT-TO-SEQ")

    usage = "usage: %prog [options] inputDataset outputFilename"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    inputFilename1 = args[0]
    outputFilename = args[1]

    fileUtil = FileUtil(sc)
    # Re-save the text-format JSON dataset as a sequence file
    input_rdd = fileUtil.load_json_file(inputFilename1, "text", c_options)
    print "Write output to:", outputFilename
    fileUtil.save_json_file(input_rdd, outputFilename, "sequence", c_options)
if __name__ == "__main__":
    sc = SparkContext(appName="DIG-TEXT-TO-SEQ")

    usage = "usage: %prog [options] inputDataset outputFilename"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    inputFilename1 = args[0]
    outputFilename = args[1]

    fileUtil = FileUtil(sc)

    def load_input(x, sep):
        # Parse "<uri><sep><name>" into a (uri, json) pair; None for bad lines
        parts = x.split(sep)
        if len(parts) >= 2:
            uri = parts[0]
            name = parts[1]
            return uri, {"uri": uri, "name": name}
        else:
            print "\n\n****************** Got non-parseable line:", x
            return None

    input_rdd = sc.textFile(inputFilename1) \
        .map(lambda x: load_input(x, c_options.separator)) \
        .filter(lambda x: x is not None)

    print "Write output to:", outputFilename
    fileUtil.save_json_file(input_rdd, outputFilename, "text", c_options)
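For reference, a worked example of what load_input produces with the default tab separator; the sample line is invented:

sample_line = "http://example.org/person/1\tAlice"
print load_input(sample_line, "\t")
# ('http://example.org/person/1', {'uri': 'http://example.org/person/1', 'name': 'Alice'})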
    else:
        seen_objs.add(json.dumps(part))
    return input_json


if __name__ == "__main__":
    sc = SparkContext(appName="DIG-ENTITY_DEDUPLICATOR")

    usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath " \
            "outputFilename outputFileFormat"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    inputFilename = args[0]
    inputFileFormat = args[1]
    inputPath = args[2]
    print "Read", inputFileFormat, "file from", inputFilename, "with path:", inputPath

    outputFilename = args[3]
    outputFileFormat = args[4]
    print "Write output to:", outputFilename

    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_json_file(inputFilename, inputFileFormat, c_options)
    # Deduplicate the entities found at inputPath inside every record
    result_rdd = input_rdd.mapValues(lambda x: EntityDeduplicator().deduplicate(x, inputPath))
    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
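Only the tail of the deduplicate method survives above. Under the assumption that it walks the list found at the given path and uses json.dumps of each element as its identity key, the whole method plausibly looks like the following sketch; this is a reconstruction, not the project's code:

def deduplicate(self, input_json, path):
    # Keep only the first occurrence of every serialized element at `path`
    seen_objs = set()
    kept = []
    for part in input_json.get(path, []):
        key = json.dumps(part)
        if key in seen_objs:
            continue
        else:
            seen_objs.add(key)
            kept.append(part)
    input_json[path] = kept
    return input_json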
from pyspark import SparkContext
from optparse import OptionParser
from fileUtil import FileUtil
import json

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-ENTITY_MERGER")

    usage = "usage: %prog [options] inputDataset inputDatasetFormat " \
            "outputFilename"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    (c_options, args) = parser.parse_args()

    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    outputFilename = args[2]
    print "Got options:", c_options, ", input:", inputFilename1 + ", output:", outputFilename

    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options)

    def write_result(x):
        # Emit each entity as a self-match: its uri matched against itself
        key = x[0]
        return json.dumps({"uri": key, "matches": [{"uri": key}]})

    result = input_rdd1.map(write_result)
    result.saveAsTextFile(outputFilename)
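write_result emits one JSON line per entity; for a pair keyed by an invented uri it behaves like this:

print write_result(("http://example.org/person/1", {"name": "Alice"}))
# prints (key order may vary):
# {"uri": "http://example.org/person/1", "matches": [{"uri": "http://example.org/person/1"}]}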
def __init__(self):
    logger = logging.getLogger()
    handler = logging.FileHandler("Log_test.txt")
    logger.addHandler(handler)
    logger.setLevel(logging.NOTSET)
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"initializing the system")
    # Initialize the processing date; it can also be read from the config
    parseDate = DateClass()
    parseconfig = config()
    parsehtml = html()
    self.parseday = parseconfig.date
    # Initialize configuration values
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"log processing date initialized: " + self.parseday)
    self.binpath = parseconfig.getBinPath() + '\\'
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"executable directory initialized: " + self.binpath)
    self.logpath = parseconfig.getLogPath() + '\\'
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"log directory initialized: " + self.logpath)
    self.docpath = parseconfig.getDocPath() + '\\'
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"document directory initialized: " + self.docpath)
    self.dbpath = parseconfig.getDbPath() + '\\'
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"database directory initialized: " + self.dbpath)
    self.filepath = parseconfig.getFilePath() + '\\'
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"file directory initialized: " + self.filepath)
    self.header = parsehtml.header
    self.css = parsehtml.css
    self.start = parsehtml.start
    self.end = parsehtml.end
    self.js = parsehtml.js
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"html parameters initialized")
    # Set the database table name
    self.tablename = "sn" + parseconfig.date
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"database table name initialized: " + self.tablename)
    # Storage path of the database file
    self.dbfile = self.dbpath + self.parseday + '.db'
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"database file initialized: " + self.dbfile)
    self.export = self.docpath + parseconfig.date + "\\"
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"html document directory initialized: " + self.export)
    ff = FileUtil()
    ff.mkDir(self.export)
    logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"html export directory created: " + self.export)
    fileCheck = FileUtil()
    errorLog = self.logpath + self.parseday
    fileCheck.checkFile('log/' + errorLog)
    fileCheck.checkFile('db/' + self.parseday)