Example #1
#!/usr/bin/env python

from pyspark import SparkContext

from optparse import OptionParser
from fileUtil import FileUtil

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-TEXT-TO-SEQ")

    usage = "usage: %prog [options] inputDataset outputFilename"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename1 = args[0]
    outputFilename = args[1]

    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_json_file(inputFilename1, "text", c_options)

    print "Write output to:", outputFilename
    fileUtil.save_json_file(input_rdd, outputFilename, "sequence", c_options)
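
The FileUtil helper is imported from fileUtil but not shown on this page. As a rough sketch only, assuming "text" means one key-separator-JSON record per line and "sequence" means a Hadoop SequenceFile of (key, JSON-string) pairs, it might look like this:

import json

class FileUtil(object):
    def __init__(self, sc):
        self.sc = sc

    def load_json_file(self, filename, file_format, options):
        # "sequence": Hadoop SequenceFile of (key, json-string) pairs (assumed).
        if file_format == "sequence":
            return self.sc.sequenceFile(filename).mapValues(json.loads)
        # "text": one "<key><separator><json>" record per line (assumed).
        sep = options.separator
        return self.sc.textFile(filename) \
            .map(lambda line: line.split(sep, 1)) \
            .map(lambda parts: (parts[0], json.loads(parts[1])))

    def save_json_file(self, rdd, filename, file_format, options):
        if file_format == "sequence":
            rdd.mapValues(json.dumps).saveAsSequenceFile(filename)
        else:
            sep = options.separator
            rdd.map(lambda kv: kv[0] + sep + json.dumps(kv[1])).saveAsTextFile(filename)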
Example #2
    usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath" \
            "baseDataset baseDatasetFormat" \
            "outputFilename outoutFileFormat"
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    parser.add_option("-p", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)

    (c_options, args) = parser.parse_args()
    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    inputPath = args[2]

    baseFilename = args[3]
    baseFormat = args[4]

    outputFilename = args[5]
    outputFileFormat = args[6]
    print "Got options:", c_options, ", " \
                         "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \
                         ", base:", baseFilename, ",", baseFormat
    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options).partitionBy(c_options.numPartitions)
    base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options)

    result_rdd = EntityCleaner.clean_rdds(input_rdd1, inputPath, base_rdd, c_options.numPartitions)

    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
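
EntityCleaner.clean_rdds is likewise not shown here. Purely as an assumption about its shape (not the actual implementation), it could re-key the input by the field named by inputPath, join against the base (reference) dataset, and substitute the cleaned entity:

class EntityCleaner(object):
    # Hypothetical stand-in for the real EntityCleaner, which this page does not show.
    @staticmethod
    def clean_rdds(input_rdd, input_path, base_rdd, num_partitions):
        def rekey(kv):
            # (uri, obj) -> (obj[input_path], (uri, obj)) so we can join on the raw value
            uri, obj = kv
            return obj.get(input_path), (uri, obj)

        def merge(pair):
            # Replace the raw field value with the cleaned entity from the base dataset
            (uri, obj), base_obj = pair
            obj[input_path] = base_obj
            return uri, obj

        keyed = input_rdd.map(rekey).partitionBy(num_partitions)
        return keyed.join(base_rdd).values().map(merge)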
Example #3
#!/usr/bin/env python

from pyspark import SparkContext

from optparse import OptionParser
from fileUtil import FileUtil

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-TEXT-TO-SEQ")

    usage = "usage: %prog [options] inputDataset outputFilename"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename1 = args[0]
    outputFilename = args[1]

    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    def load_input(x, sep):
        # Parse one "<uri><sep><name>" line into a (uri, json) pair;
        # lines without a separator return None and are filtered out below.
        parts = x.split(sep)
        if len(parts) >= 2:
            uri = parts[0]
            name = parts[1]
            return uri, {"uri": uri, "name": name}
        else:
            print "\n\n****************** Got non-parseable line:", x

    input_rdd = sc.textFile(inputFilename1) \
        .map(lambda x: load_input(x, c_options.separator)) \
        .filter(lambda x: x is not None)

    print "Write output to:", outputFilename
    fileUtil.save_json_file(input_rdd, outputFilename, "text", c_options)
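
As a quick local sanity check (no Spark needed), the parsing helper behaves like this; the compact redefinition below mirrors load_input from the snippet, and the URI and name are made-up sample data:

def load_input(x, sep):
    parts = x.split(sep)
    if len(parts) >= 2:
        return parts[0], {"uri": parts[0], "name": parts[1]}

print load_input("http://example.org/entity/1\tAcme Corp", "\t")
# ('http://example.org/entity/1', {'uri': 'http://example.org/entity/1', 'name': 'Acme Corp'})
# (dict key order may vary)
print load_input("no-separator-here", "\t")
# None -- such records are dropped by the .filter(...) step in the job above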