Example 1
def crawlXimalaya():
    # Directory to save the crawled files
    albumPath='D:\\lldxx\\'
    htmlUrl='https://www.ximalaya.com/ertong/33647650/'
    apiUrl='https://www.ximalaya.com/revision/play/v1/audio?id={0}&ptype=1'
    crawler=Crawler()
    # Get all trackName and trackId values
    content=crawler.downloadHtml(htmlUrl)
    idList=cleaner.getAlbumIdList(content)
    # print(idList)
    for item in idList:
        sections=item[1].split('/')
        trackId=sections[-1]
        url=apiUrl.format(trackId)
        # Get the download URL for each track
        data=crawler.getJson(url)
        audioUrl=data['data']['src']
        # Download and save the track
        response=crawler.downloadFile(audioUrl)
        FileUtil.writeFile(albumPath,'{0}.m4a'.format(item[0]),response.content)
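For reference, the writeFile call above implies a static FileUtil.writeFile(directory, fileName, bytes) helper. A minimal sketch of such a helper, assuming the target directory may not exist yet (illustrative only, not necessarily the project's actual FileUtil):

import os

class FileUtil(object):
    @staticmethod
    def writeFile(dirPath, fileName, data):
        # Create the target directory if needed, then write the raw bytes to dirPath/fileName.
        if not os.path.exists(dirPath):
            os.makedirs(dirPath)
        with open(os.path.join(dirPath, fileName), 'wb') as f:
            f.write(data)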
Example 2
    usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath " \
            "baseDataset baseDatasetFormat " \
            "outputFilename outputFileFormat"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    parser.add_option("-p", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)

    (c_options, args) = parser.parse_args()
    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    inputPath = args[2]

    baseFilename = args[3]
    baseFormat = args[4]

    outputFilename = args[5]
    outputFileFormat = args[6]
    print "Got options:", c_options, ", " \
                         "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \
                         ", base:", baseFilename, ",", baseFormat
    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options).partitionBy(c_options.numPartitions)
    base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options)

    result_rdd = EntityCleaner.clean_rdds(input_rdd1, inputPath, base_rdd, c_options.numPartitions)

    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
Example 3
    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    inputPath = args[2]

    baseFilename = args[3]
    baseFormat = args[4]

    joinResultFilename = args[5]
    joinFormat = args[6]

    outputFilename = args[7]
    outputFileFormat = args[8]

    removeElementsStr = c_options.remove
    removeElements = []
    if len(removeElementsStr) > 0:
        removeElements = removeElementsStr.split(",")

    print "Got options:", c_options, ", " \
                         "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \
                         ", base:", baseFilename, ",", baseFormat, ", join:", joinResultFilename

    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options)
    base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options)
    join_rdd = fileUtil.load_json_file(joinResultFilename, joinFormat, c_options)

    result_rdd = EntityMerger.merge_rdds(input_rdd1, inputPath, base_rdd, join_rdd, removeElements, c_options.numPartitions)

    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
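The option backing c_options.remove is not registered in this excerpt; it is assumed to hold a comma-separated list of element names to strip during the merge. An illustrative value under that assumption (the names are made up):

# Illustrative only: a comma-separated value for the option behind c_options.remove
removeElementsStr = "telephone,email"
removeElements = removeElementsStr.split(",")   # -> ["telephone", "email"]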
Example 4
# then pass in result to merge-rdds along with input-rdd or output-rdd if defined,
# set output-rdd as result from merge
# return output

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-FRAMER")
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t")
    parser.add_option("-n", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=5)

    (c_options, args) = parser.parse_args()
    frameFilename = args[0]
    rddFilename = args[1]
    outputFilename = args[2]
    if len(args) > 3:
        outputFileFormat = args[3]
    else:
        outputFileFormat = "text"
    type_to_rdd_json_input = open(rddFilename)
    type_to_rdd_json = json.load(type_to_rdd_json_input)
    type_to_rdd_json_input.close()
    frame_input = open(frameFilename)
    frame = json.load(frame_input)
    frame_input.close()
    fileUtil = FileUtil(sc)
    for key, val in type_to_rdd_json.items():
        val["rdd"] = fileUtil.load_json_file(val["path"], val["format"], c_options)
    output_rdd = frame_json(frame, type_to_rdd_json)
    print "Write output to:", outputFilename
    fileUtil.save_json_file(output_rdd, outputFilename, outputFileFormat, c_options)
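The mapping file loaded into type_to_rdd_json is expected to associate each frame type with a path and a format, which the loop above replaces with the loaded RDD. A hypothetical example of its shape after json.load (type names and paths are assumptions):

# Hypothetical contents of rddFilename after json.load; the type names and paths
# are illustrative only.
type_to_rdd_json = {
    "offer":  {"path": "hdfs:///user/dig/offers.seq",  "format": "sequence"},
    "seller": {"path": "hdfs:///user/dig/sellers.json", "format": "text"}
}
# The loop above then attaches the loaded RDD to each entry under the "rdd" key.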
Example 5
def index():
    f = FileUtil.get_all_file(app.config["UPLOAD_FOLDER"])
    return render_template("index.html", files=f)
Example 6
def status(name):
    data = dict()
    data["name"] = name
    data["complete"] = FileUtil.is_exists(RESULT_FOLDER + FileUtil.spit_filename(name) + ".log")
    data["verify"] = FileUtil.is_exists(WORK_FOLDER + FileUtil.spit_filename(name))
    return jsonify(result=data)
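For a file named report.csv (hypothetical), jsonify(result=data) would return a JSON body along the lines of {"result": {"name": "report.csv", "complete": true, "verify": false}}, where the two booleans reflect the existence checks above.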
Example 7
#!/usr/bin/env python

from pyspark import SparkContext

from optparse import OptionParser
from fileUtil import FileUtil

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-TEXT-TO-SEQ")

    usage = "usage: %prog [options] inputDataset outputFilename"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename1 = args[0]
    outputFilename = args[1]

    print "Read input from:", inputFilename1
    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_json_file(inputFilename1, "text", c_options)

    print "Write output to:", outputFilename
    fileUtil.save_json_file(input_rdd, outputFilename, "sequence", c_options)
Example 8
if __name__ == "__main__":
    sc = SparkContext(appName="DIG-TEXT-TO-SEQ")

    usage = "usage: %prog [options] inputDataset outputFilename"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename1 = args[0]
    outputFilename = args[1]

    print "Read input from:", inputFilename1
    fileUtil = FileUtil(sc)
    def load_input(x, sep):
        parts = x.split(sep)
        if len(parts) >= 2:
            uri = parts[0]
            name = parts[1]
            return uri, {"uri":uri, "name":name}
        else:
            print "\n\n****************** Got unparseable line:", x
            return None
    input_rdd = sc.textFile(inputFilename1).map(lambda x: load_input(x, c_options.separator)).filter(lambda x: x is not None)

    print "Write output to:", outputFilename
    fileUtil.save_json_file(input_rdd, outputFilename, "text", c_options)
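To illustrate load_input, here is a tab-separated line and the pair it yields (URI and name are made up):

# Illustrative only:
# load_input("http://example.org/entity/1\tAlice", "\t")
# -> ("http://example.org/entity/1", {"uri": "http://example.org/entity/1", "name": "Alice"})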


Example 9
                        else:
                            seen_objs.add(json.dumps(part))

        return input_json


if __name__ == "__main__":
    sc = SparkContext(appName="DIG-ENTITY_DEDUPLICATOR")

    usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath " \
            "outputFilename outputFileFormat"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename = args[0]
    inputFileFormat = args[1]
    inputPath = args[2]

    print "Read", inputFileFormat, "file from", inputFilename, "with path:", inputPath
    outputFilename = args[3]
    outputFileFormat = args[4]

    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_json_file(inputFilename, inputFileFormat, c_options)
    result_rdd = input_rdd.mapValues(lambda x: EntityDeduplicator().deduplicate(x, inputPath))

    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
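The mapValues call above implies that load_json_file yields (key, JSON object) pairs, with deduplication applied only to the value. A hypothetical record of that shape (URI and fields are illustrative):

# Hypothetical element of input_rdd; deduplicate() is applied to the dict only,
# the key is left untouched.
record = ("http://example.org/offer/1",
          {"uri": "http://example.org/offer/1",
           "hasTitle": ["Room for rent", "Room for rent"]})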
Example 10
from pyspark import SparkContext
from optparse import OptionParser
from fileUtil import FileUtil
import json

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-ENTITY_MERGER")

    usage = "usage: %prog [options] inputDataset inputDatasetFormat " \
            "outputFilename"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    outputFilename = args[2]

    print "Got options:", c_options, ", input:", inputFilename1, ", output:", outputFilename

    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options)

    def write_result(x):
        key = x[0]
        #print "Got key:", key
        return json.dumps({"uri":key, "matches":[{"uri": key}]})

    result = input_rdd1.map(write_result)
    result.saveAsTextFile(outputFilename)
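write_result keeps only the key of each pair and emits it as a self-match. For a hypothetical key http://example.org/seller/42, each saved line would look like:

# {"uri": "http://example.org/seller/42", "matches": [{"uri": "http://example.org/seller/42"}]}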
Example 11
    def __init__(self):

        handler = logging.FileHandler("Log_test.txt")
        logger.addHandler(handler)
        logger.setLevel(logging.NOTSET)
        logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing system")
        # Initialize the processing date; it can be read from the config
        parseDate = DateClass()
        parseconfig = config()
        parsehtml = html()
        self.parseday = parseconfig.date
        # Initialize configuration info
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing log processing date: " +
            self.parseday)
        self.binpath = parseconfig.getBinPath() + '\\'
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing executable file directory: " +
            self.binpath)
        self.logpath = parseconfig.getLogPath() + '\\'
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing log file directory: " +
            self.logpath)
        self.docpath = parseconfig.getDocPath() + '\\'
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing document directory: " +
            self.docpath)
        self.dbpath = parseconfig.getDbPath() + '\\'
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing database file directory: " +
            self.dbpath)
        self.filepath = parseconfig.getFilePath() + '\\'
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing file directory: " +
            self.filepath)
        self.header = parsehtml.header
        self.css = parsehtml.css
        self.start = parsehtml.start
        self.end = parsehtml.end
        self.js = parsehtml.js
        logging.info(time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing html parameters")
        # Set the database table name
        self.tablename = "sn" + parseconfig.date
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing database table name: " +
            self.tablename)
        # Database file storage path
        self.dbfile = self.dbpath + self.parseday + '.db'
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing database file: " +
            self.dbfile)
        self.export = self.docpath + parseconfig.date + "\\"
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Initializing html document directory: " +
            self.export)
        ff = FileUtil()
        ff.mkDir(self.export)
        logging.info(
            time.strftime('%Y-%m-%d %H:%M:%S') + "---" + u"Creating html document export directory: " +
            self.export)
        fileCheck = FileUtil()
        errorLog = self.logpath + self.parseday
        fileCheck.checkFile('log/' + errorLog)
        fileCheck.checkFile('db/' + self.parseday)
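ff.mkDir(self.export) above is assumed to create the export directory if it is missing. A minimal sketch of such a helper under that assumption (illustrative only, not necessarily this project's actual FileUtil):

import os

class FileUtil(object):
    def mkDir(self, path):
        # Create the directory (and any missing parents) if it does not already exist.
        if not os.path.exists(path):
            os.makedirs(path)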