Example #1
    usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath" \
            "baseDataset baseDatasetFormat" \
            "outputFilename outoutFileFormat"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    parser.add_option("-p", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)

    (c_options, args) = parser.parse_args()
    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    inputPath = args[2]

    baseFilename = args[3]
    baseFormat = args[4]

    outputFilename = args[5]
    outputFileFormat = args[6]
    print "Got options:", c_options, ", " \
                         "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \
                         ", base:", baseFilename, ",", baseFormat
    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options).partitionBy(c_options.numPartitions)
    base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options)

    result_rdd = EntityCleaner.clean_rdds(input_rdd1, inputPath, base_rdd, c_options.numPartitions)

    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
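A hedged example of how this script might be submitted; the script name, the HDFS paths, and the inputPath value are hypothetical, but the seven positional arguments follow the usage string above:

# spark-submit entityCleaner.py \
#     hdfs:///user/dig/input-json sequence hasOffer \
#     hdfs:///user/dig/base-json sequence \
#     hdfs:///user/dig/cleaned-json sequence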
Example #2
# then pass the result to merge-rdds along with input-rdd, or output-rdd if already defined
# set output-rdd to the result of the merge
# return output

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-FRAMER")
    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t")
    parser.add_option("-n", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=5)

    (c_options, args) = parser.parse_args()
    frameFilename = args[0]
    rddFilename = args[1]
    outputFilename = args[2]
    if len(args) > 3:
        outputFileFormat = args[3]
    else:
        outputFileFormat = "text"
    type_to_rdd_json_input = open(rddFilename)
    type_to_rdd_json = json.load(type_to_rdd_json_input)
    type_to_rdd_json_input.close()
    frame_input = open(frameFilename)
    frame = json.load(frame_input)
    frame_input.close()
    fileUtil = FileUtil(sc)
    for key, val in type_to_rdd_json.items():
        val["rdd"] = fileUtil.load_json_file(val["path"], val["format"], c_options)
    output_rdd = frame_json(frame, type_to_rdd_json)
    print "Write output to:", outputFilename
    fileUtil.save_json_file(output_rdd, outputFilename, outputFileFormat, c_options)
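The loop above expects the file named by rddFilename to map each dataset type to a "path" and a "format"; a hedged sketch of what json.load would return from such a file (the type names and paths are hypothetical, and the script itself adds the "rdd" key after loading each dataset):

type_to_rdd_json = {
    "offer":   {"path": "hdfs:///user/dig/offer-json",   "format": "sequence"},
    "webpage": {"path": "hdfs:///user/dig/webpage-json", "format": "text"}
}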
Example #3
    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    inputPath = args[2]

    baseFilename = args[3]
    baseFormat = args[4]

    joinResultFilename = args[5]
    joinFormat = args[6]

    outputFilename = args[7]
    outputFileFormat = args[8]

    removeElementsStr = c_options.remove
    removeElements = []
    if len(removeElementsStr) > 0:
        removeElements = removeElementsStr.split(",")

    print "Got options:", c_options, ", " \
                         "input:", inputFilename1, ",", inputFileFormat1, ",", inputPath, \
                         ", base:", baseFilename, ",", baseFormat, ", join:", joinResultFilename

    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options)
    base_rdd = fileUtil.load_json_file(baseFilename, baseFormat, c_options)
    join_rdd = fileUtil.load_json_file(joinResultFilename, joinFormat, c_options)

    result_rdd = EntityMerger.merge_rdds(input_rdd1, inputPath, base_rdd, join_rdd, removeElements, c_options.numPartitions)

    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
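This excerpt starts after its option parsing; a hedged sketch of the setup it appears to assume, based on the attributes it reads (c_options.remove and c_options.numPartitions). The app name, the flags other than --separator, and the EntityMerger import path are guesses:

from pyspark import SparkContext
from optparse import OptionParser
from fileUtil import FileUtil
from entityMerger import EntityMerger  # module path is a guess

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-ENTITY-MERGER")

    parser = OptionParser()
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")
    parser.add_option("-p", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)
    parser.add_option("-x", "--remove", dest="remove", type="string",
                      help="comma-separated element names to remove after merging", default="")

    (c_options, args) = parser.parse_args()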
Example #4
#!/usr/bin/env python

from pyspark import SparkContext

from optparse import OptionParser
from fileUtil import FileUtil

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-TEXT-TO-SEQ")

    usage = "usage: %prog [options] inputDataset outputFilename"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename1 = args[0]
    outputFilename = args[1]

    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_json_file(inputFilename1, "text", c_options)

    print "Write output to:", outputFilename
    fileUtil.save_json_file(input_rdd, outputFilename, "sequence", c_options)
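For orientation, a rough plain-PySpark sketch of this conversion, assuming FileUtil's "text" format stores one record per line as key, separator, JSON document; that layout and the paths are assumptions, not taken from this script:

from pyspark import SparkContext

sc = SparkContext(appName="TEXT-TO-SEQ-SKETCH")

# read separator-delimited lines and split each into a (key, json_string) pair
pairs = sc.textFile("hdfs:///user/dig/input-text") \
          .map(lambda line: tuple(line.split("\t", 1)))

# write the pairs out as a Hadoop SequenceFile of Text records
pairs.saveAsSequenceFile("hdfs:///user/dig/output-seq")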
Example #5
                        else:
                            # first time this sub-object is seen: record it so later duplicates are skipped
                            seen_objs.add(json.dumps(part))

        return input_json


if __name__ == "__main__":
    sc = SparkContext(appName="DIG-ENTITY_DEDUPLICATOR")

    usage = "usage: %prog [options] inputDataset inputDatasetFormat inputPath " \
            "outputFilename outoutFileFormat"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename = args[0]
    inputFileFormat = args[1]
    inputPath = args[2]

    print "Read ", inputFileFormat, " file from ", inputFilename, " with path:", inputPath
    outputFilename = args[3]
    outputFileFormat = args[4]

    print "Write output to:", outputFilename
    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_json_file(inputFilename, inputFileFormat, c_options)
    result_rdd = input_rdd.mapValues(lambda x: EntityDeduplicator().deduplicate(x, inputPath))

    fileUtil.save_json_file(result_rdd, outputFilename, outputFileFormat, c_options)
Example #6

from pyspark import SparkContext
from optparse import OptionParser
from fileUtil import FileUtil
import json

if __name__ == "__main__":
    sc = SparkContext(appName="DIG-ENTITY_MERGER")

    usage = "usage: %prog [options] inputDataset inputDatasetFormat" \
            "outputFilename"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    inputFilename1 = args[0]
    inputFileFormat1 = args[1]
    outputFilename = args[2]

    print "Got options:", c_options, ",input:", inputFilename1 + ", output:", outputFilename

    fileUtil = FileUtil(sc)
    input_rdd1 = fileUtil.load_json_file(inputFilename1, inputFileFormat1, c_options)

    def write_result(x):
        key = x[0]
        #print "Got key:", key
        return json.dumps({"uri":key, "matches":[{"uri": key}]})

    result = input_rdd1.map(write_result)
    result.saveAsTextFile(outputFilename)
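write_result turns every entity into a trivial self-match, so each output line pairs a URI with itself; a hedged illustration with a hypothetical URI (the JSON key order may differ):

#   {"uri": "http://example.org/entity/1", "matches": [{"uri": "http://example.org/entity/1"}]}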