Example No. 1
def testTokenizer(sc,
                  input_dir,
                  output_dir,
                  config,
                  limit=None,
                  input_file_format="sequence",
                  input_data_type="json",
                  output_file_format="sequence",
                  output_data_type="json",
                  **kwargs):

    print(limit)

    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir,
                                 file_format=kwargs.get("file_format"),
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    ## TOKENIZE
    #tokOptions = {"file_format": input_file_format,
    #             "data_type": input_data_type}
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized,
                    output_dir,
                    file_format=output_file_format,
                    data_type=output_data_type,
                    **outOptions)
Example No. 2
def testTokenizer(sc, input_dir, output_dir, config,
                  limit=None, 
                  input_file_format="sequence",
                  input_data_type="json",
                  output_file_format="sequence",
                  output_data_type="json",
                  **kwargs):

    print(limit)

    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir, file_format=kwargs.get("file_format"),
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    ## TOKENIZE
    #tokOptions = {"file_format": input_file_format,
    #              "data_type": input_data_type}
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized, output_dir, file_format=output_file_format, 
                    data_type=output_data_type, 
                    **outOptions)
Example No. 3
def preprocess(sc, inputDir, file_format, outputDir):
    """
    Reads the offer file and creates the vertex RDD and edge RDD required for GraphX.
    The vertex RDD holds the node uri and type; the edge RDD holds node a, node b, edge type.
    :param inputDir: directory containing the input offer file
    :param file_format: input file format (text/sequence)
    :param outputDir: output directory for the 'vertex' and 'edges' files
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir,
                                  file_format=file_format,
                                  data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x: nodes_mapper(x))
    #rdd = vertexrdd.foreach(lambda (x,y):f(x,y))

    edgerdd = inputrdd.flatMapValues(lambda x: edges_mapper(x))
    fileUtil.save_file(vertexrdd,
                       outputDir + 'vertex',
                       file_format='text',
                       data_type='json')
    fileUtil.save_file(edgerdd,
                       outputDir + 'edges',
                       file_format='text',
                       data_type='json')
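A possible call for the example above, with placeholder paths (the SparkContext sc and the directory names are assumptions, not part of the source):

# Hypothetical invocation: reads JSON offers from 'offers/' and writes
# 'graph/vertex' and 'graph/edges' as text JSON for use with GraphX.
preprocess(sc, 'offers/', 'sequence', 'graph/')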
Example No. 4
def testLSH(sc, inputFilename,outputFilename,configFilename,
             **kwargs):

    '''
    kwargs is a dictionary of options; a sample input would look like:
    options = {
               "file_format": "sequence",
               "data_type": "json",
               "numHashes": 100,
               "numItemsInBand": 10,
               "computeSimilarity": True,
               "threshold": 0.8,
               "base": "saam-city.json",
               "topk": 3,
               "candidatesName": "candidates"
               }
    '''
    futil = FileUtil(sc)


    #Tokenize
    ######################
    rdd_input = futil.load_file(inputFilename, file_format=kwargs['file_format'], data_type="json")
    rdd_input.setName('rdd_input')

    tokenizer = Tokenizer(configFilename, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_input)

    futil = FileUtil(sc)
    outOptions = {}

    #you can save the tokens file here by using
    #futil.save_file(rdd_tokenized,outputFilename,file_format='sequence',data_type='json',**outOptions)

    #Hashing
    #######################

    hasher = Hasher(**kwargs)
    rdd_minHashes = hasher.perform(rdd_tokenized)
    rdd_minHashes.setName('rdd_minhashes')
    #futil.save_file(rdd_minHashes,outputFilename,file_format='sequence',data_type='json',**outOptions)


    #clustering
    #########################
    clusterer = Clusterer(**kwargs)
    rdd_clusters = clusterer.perform(rdd_minHashes)
    #futil.save_file(rdd_clusters,outputFilename,file_format='text',data_type='json',**outOptions)

    #unionfind
    #########################
    unionFind = UnionFind(**kwargs)
    rdd_unionfind = unionFind.perform(rdd_clusters)

    # SAVE DATA
    futil.save_file(rdd_unionfind,outputFilename,file_format='text',data_type='json',**outOptions)
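A minimal sketch of how the example above might be driven, using the option names from its docstring; the paths, the config filename, and the SparkContext setup are assumptions:

# Hypothetical options and paths; the config file drives the Tokenizer fields.
options = {
    "file_format": "sequence",
    "data_type": "json",
    "numHashes": 100,
    "numItemsInBand": 10,
    "computeSimilarity": True,
    "threshold": 0.8,
    "base": "saam-city.json",
    "topk": 3,
    "candidatesName": "candidates"
}
testLSH(sc, "input_seq", "lsh_output", "tokenizer_config.json", **options)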
Example No. 5
def testSignature(sc, inputFileName, outputFileName, file_format):

    config = {"runlimit": 5, "field": "title_signature"}

    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,
                          file_format=file_format,
                          data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)

    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
Example No. 6
def testSignature(sc,inputFileName,outputFileName,file_format):
    """
    :param sc: the SparkContext needed for Spark
    :param inputFileName: input directory whose records need a cluster_id added
    :param outputFileName: output directory
    :param file_format: text/seq
    """
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,file_format=file_format,data_type="json")
    addClusterId = AddClusterId()
    rdd = addClusterId.perform(rdd)
    fUtil.save_file(rdd,outputFileName,file_format='text',data_type='json')
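A hedged usage sketch for the function above; the directory names are placeholders:

# Adds a cluster_id to each record in 'clusters_in' and writes text JSON to 'clusters_out'.
testSignature(sc, 'clusters_in', 'clusters_out', 'sequence')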
Example No. 7
def save_jsonlines(sc,
                   rdd,
                   output_dir,
                   file_format='sequence',
                   data_type='json',
                   separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd,
                    output_dir,
                    file_format=file_format,
                    data_type=data_type,
                    separator=separator)
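A possible call for the wrapper above (the RDD and output directory are assumptions):

# Writes any key/value RDD as tab-separated JSON lines in text format.
save_jsonlines(sc, rdd, 'jsonlines_out', file_format='text')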
Example No. 8
def testSignature(sc,inputFileName,outputFileName,file_format):

    config = {
    "runlimit" : 5,
    "field" : "title_signature"
    }

    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,file_format=file_format,data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)

    fUtil.save_file(rdd,outputFileName,file_format="text",data_type="json")
Example No. 9
def preprocess(sc,inputDir,file_format,outputDir):
    """
    Reads the offer file and creates the vertex RDD and edge RDD required for GraphX.
    The vertex RDD holds the node uri and type; the edge RDD holds node a, node b, edge type.
    :param inputDir: directory containing the input offer file
    :param file_format: input file format (text/sequence)
    :param outputDir: output directory for the 'vertex' and 'edges' files
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd=fileUtil.load_file(inputDir,file_format=file_format,data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x:nodes_mapper(x))
    #rdd = vertexrdd.foreach(lambda (x,y):f(x,y))

    edgerdd = inputrdd.flatMapValues(lambda x : edges_mapper(x))
    fileUtil.save_file(vertexrdd,outputDir+'vertex',file_format='text',data_type='json')
    fileUtil.save_file(edgerdd,outputDir+'edges',file_format='text',data_type='json')
Example No. 10
                                return decoder.line_to_predictions(
                                    ner_fea, Decoder(params), data,
                                    attribute_name, content_type)
                        return data

                    cdr_extractions_rdd = cdr_extractions_isi_rdd\
                            .mapValues(lambda x : apply_bbn_extractor(x))\
                            .repartition(numPartitions)\
                            .persist(StorageLevel.MEMORY_AND_DISK)

                    cdr_extractions_rdd.setName("cdr_extractions")

                if args.incremental is True:
                    if len(since) > 0:
                        fileUtil.save_file(
                            cdr_extractions_rdd,
                            outputFilename + '/cdr_extractions/' + since,
                            outputFileType, "json")
                    else:
                        fileUtil.save_file(
                            cdr_extractions_rdd,
                            outputFilename + '/cdr_extractions/initial',
                            outputFileType, "json")
                else:
                    fileUtil.save_file(cdr_extractions_rdd,
                                       outputFilename + '/cdr_extractions',
                                       outputFileType, "json")
            else:
                if args.incremental is True:
                    if len(since) > 0:
                        cdr_extractions_rdd = sc.sequenceFile(
                            outputFilename + '/cdr_extractions/' +
Example No. 11
    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    #1. Apply the first karma Model
    outputRDD1 = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions)

    #2. Apply the second Karma Model
    outputRDD2 = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model2.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions)

    #3. Combine the data and then apply the Karma JSON Reducer
    reducedRDD = workflow.reduce_rdds(numPartitions, outputRDD1, outputRDD2)

    #4. Save the output
    fileUtil.save_file(reducedRDD, outputFilename, "text", "json")
Example No. 12
        "name": "E82_Actor_Appellation",
        "uri": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation"
    }, {
        "name": "E67_Birth",
        "uri": "http://www.cidoc-crm.org/cidoc-crm/E67_Birth"
    }, {
        "name": "E69_Death",
        "uri": "http://www.cidoc-crm.org/cidoc-crm/E69_Death"
    }, {
        "name": "E52_Time-Span",
        "uri": "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span"
    }]
    frames = [{
        "name": "npgConstituents",
        "url": "https://raw.githubusercontent.com/american-art/aac-alignment/master/frames/npgConsitituents.json-ld"
    }]

    framer_output = workflow.apply_framer(reducedRDD, types, frames, 5, 2)
    for frame_name in framer_output:
        outputRDD = workflow.apply_context(framer_output[frame_name],
                                           contextUrl)
        outputRDD_after = outputRDD.mapValues(mapFunc)
        if not outputRDD_after.isEmpty():
            fileUtil.save_file(outputRDD_after,
                               outputFilename + "/" + frame_name, 'text',
                               'json')
            print "Save to:", ("---" + frame_name)
            # workflow.save_rdd_to_es(outputRDD, es_server, es_port, es_index + "/" + frame_name)
Example No. 13
def convert(sc,inputFileName,outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName,file_format="text",data_type="json")
    fileUtil.save_file(rdd,outputFileName)


def save_jsonlines(sc, rdd, output_dir, file_format='sequence', data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd, output_dir, file_format=file_format, data_type=data_type, separator=separator)
Example No. 15
                      dest="threshold", default=0.0, help="similarity threshold")
    parser.add_option("-e", "--base", dest="base", type="string",
                      help="base file", default="")
    parser.add_option("-o", "--outputformat", dest="outputformat", type="string",
                      help="output file format: text/sequence", default="text")
    parser.add_option("-y", "--outputtype", dest="outputtype", type="string",
                      help="output type: csv/json", default="json")
    parser.add_option("-k", "--topk", dest="topk", type="int",
                      help="top n matches", default=3)
    parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)
    parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string",
                        help="name for json element for matching candidates", default="candidates")
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    inputFilename = args[0]
    outputFilename = args[1]
    print "Save to:", outputFilename

    kwargs = as_dict(c_options)
    clusterer = Clusterer(**kwargs)
    fileUtil = FileUtil(sc)

    rdd = fileUtil.load_file(inputFilename,file_format='text')

    cluster_rdd = clusterer.compute_clusters(rdd)

    fileUtil.save_file(cluster_rdd,outputFilename,file_format='text')

Example No. 16
    #3. Save the output
    # fileUtil.save_file(outputRDD, outputFilename, "text", "json")

    #4. Reduce rdds
    reducedRDD = workflow.reduce_rdds(numFramerPartitions, outputRDD)
    reducedRDD.persist()

    types = [
        {"name": "E39_Actor", "uri": "http://www.cidoc-crm.org/cidoc-crm/E39_Actor"},
        {"name": "E82_Actor_Appellation", "uri": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation"},
        {"name": "E67_Birth", "uri": "http://www.cidoc-crm.org/cidoc-crm/E67_Birth"},
        {"name": "E69_Death", "uri": "http://www.cidoc-crm.org/cidoc-crm/E69_Death"},
        {"name": "E52_Time-Span", "uri": "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span"}
    ]
    frames = [
        {"name": "npgConstituents", "url": "https://raw.githubusercontent.com/american-art/aac-alignment/master/frames/npgConsitituents.json-ld"}
    ]

    type_to_rdd_json = workflow.apply_partition_on_types(reducedRDD, types)

    #5. Apply framer
    framer_output = workflow.apply_framer(reducedRDD, type_to_rdd_json, frames, numFramerPartitions, 10)

    for frame_name in framer_output:
        #6. Map function
        framer_output[frame_name] = framer_output[frame_name].mapValues(mapFunc)
        fileUtil.save_file(framer_output[frame_name], outputFilename + "/" + frame_name, 'text', 'json')
        print "Save to:", ("---" + frame_name)

Example No. 17
                        default="sequence")
    parser.add_argument("-q",
                        "--query",
                        help="HIVE query to get data",
                        default="",
                        required=False)

    args = parser.parse_args()
    print("Got arguments:", args)

    inputTable = args.inputTable.strip()
    outputFilename = args.output.strip()
    outputFileType = args.outputtype.strip()
    hiveQuery = args.query.strip()
    numPartitions = int(args.partitions)

    numFramerPartitions = numPartitions / 2

    if len(hiveQuery) > 0:
        cdr_data = workflow.load_cdr_from_hive_query(hiveQuery)\
            .partitionBy(numPartitions) \
            .persist(StorageLevel.MEMORY_AND_DISK)
    else:
        cdr_data = workflow.load_cdr_from_hive_table(inputTable) \
            .partitionBy(numPartitions) \
            .persist(StorageLevel.MEMORY_AND_DISK)

    cdr_data.setName("cdr_data")

    fileUtil.save_file(cdr_data, outputFilename, outputFileType, "json")
Example No. 18
    cityRDD = cityRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityRDD.setName("cityRDD")

    stateRDD = workflow.apply_context(stateRDD1, state_context)
    stateRDD = stateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    stateRDD.setName("stateRDD")


    countryRDD = workflow.apply_context(countryRDD1, country_context)
    countryRDD = countryRDD.persist(StorageLevel.MEMORY_AND_DISK)
    countryRDD.setName("countryRDD")

    cityAlternateRDD = workflow.apply_context(city_alternate_names_rdd, city_context)
    cityAlternateRDD = cityAlternateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityAlternateRDD.setName("cityAlternateRDD")
    fileUtil.save_file(cityAlternateRDD, outputFilename+"_cityalternate", "text", "json")

    city_reduced_rdd = workflow.reduce_rdds(10, cityRDD, cityAlternateRDD)
    city_reduced_rdd = city_reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
    city_reduced_rdd.setName("city_reduced_rdd")

    # fileUtil.save_file(countryRDD, outputFilename+"_Country", "text", "json")
    # fileUtil.save_file(city_reduced_rdd, outputFilename+"_City", "text", "json")
    # fileUtil.save_file(stateRDD, outputFilename+"_State", "text", "json")



    mergeRDD1 = EntityMerger.merge_rdds(city_reduced_rdd, "address.addressCountry", countryRDD,10)
    # fileUtil.save_file(mergeRDD1, outputFilename+"_State_Country", "text", "json")

    mergeRDD2 = EntityMerger.merge_rdds(mergeRDD1, "address.addressRegion", stateRDD,10)
    inputRDD = workflow.batch_read_csv(input)
    outputFileType = "sequence"

    #2. Apply the karma Model
    reduced_rdd = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/effect-alignment/master/models/ransonware/ransomware-model.ttl",
        "http://effect.isi.edu/data/",
        "http://schema.dig.isi.edu/ontology/Malware1",
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={"karma.input.delimiter": ","})
    if reduced_rdd is not None:
        reduced_rdd = reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
        fileUtil.save_file(reduced_rdd, outputFilename + '/reduced_rdd',
                           "sequence", "json")
        reduced_rdd.setName("karma_out_reduced")

        types = [{
            "name": "AttackEvent",
            "uri": "http://schema.dig.isi.edu/ontology/AttackEvent"
        }, {
            "name": "EmailAddress",
            "uri": "http://schema.dig.isi.edu/ontology/EmailAddress"
        }, {
            "name": "GeoCoordinates",
            "uri": "http://schema.org/GeoCoordinates"
        }, {
            "name": "Organization",
            "uri": "http://schema.org/Organization"
        }, {
Example No. 20
        "url": github_base + "/frames/email.json"
    }]

    type_to_rdd_json = workflow.apply_partition_on_types(reduced_rdd, types)
    for type_name in type_to_rdd_json:
        type_to_rdd_json[type_name]["rdd"] = type_to_rdd_json[type_name][
            "rdd"].persist(StorageLevel.MEMORY_AND_DISK)
        type_to_rdd_json[type_name]["rdd"].setName(type_name)

    framer_output = workflow.apply_framer(reduced_rdd, type_to_rdd_json,
                                          frames, numFramerPartitions, 10)

    # We have the framer output. Now we can save it to disk and load it into Elastic Search
    for frame_name in framer_output:
        framer_output[frame_name] = framer_output[frame_name].coalesce(numFramerPartitions)\
                                                .persist(StorageLevel.MEMORY_AND_DISK)
        fileUtil.save_file(framer_output[frame_name],
                           outputFilename + "/" + frame_name, "text", "json")

        if not framer_output[frame_name].isEmpty():
            if loadelasticsearch:
                workflow.save_rdd_to_es(framer_output[frame_name], es_server,
                                        es_port, es_index + "/" + frame_name)

    reduced_rdd.unpersist()
    for type_name in type_to_rdd_json:
        type_to_rdd_json[type_name]["rdd"] = type_to_rdd_json[type_name][
            "rdd"].unpersist()

    for frame_name in framer_output:
        framer_output[frame_name].unpersist()
Example No. 21
      --archives ../karma.zip \
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaContextWorkflow.py \
      ../sample-data/sample-unicode-jsonld.json text \
      https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
      ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaContextWorkflow")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    contextUrl = argv[3]
    outputFilename = argv[4]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # Apply the context
    outputRDD = workflow.apply_context(inputRDD, contextUrl)

    # Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
Example No. 22
                                   "http://www.cidoc-crm.org/cidoc-crm/E22_Man-Made_Object1",
                                   "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json",
                                   data_type="csv",
                                   additional_settings={"karma.input.delimiter":","})

    #3. Save the output
    # fileUtil.save_file(outputRDD, outputFilename, "text", "json")

    reducedRDD = workflow.reduce_rdds(outputRDD)

    reducedRDD.persist()
    types = [
        {"name": "E82_Actor_Appellation", "uri": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation"}
    ]
    frames = [
        {"name": "AutryMakers", "url": "https://raw.githubusercontent.com/american-art/aac-alignment/master/frames/autryMakers.json-ld"}
    ]

    context = workflow.read_json_file(contextUrl)
    framer_output = workflow.apply_framer(reducedRDD, types, frames)
    for frame_name in framer_output:
        outputRDD = workflow.apply_context(framer_output[frame_name], context, contextUrl)
        #apply mapValues function
        outputRDD_after = outputRDD.mapValues(mapFunc)

        if not outputRDD_after.isEmpty():
            fileUtil.save_file(outputRDD_after, outputFilename + "/" + frame_name, 'text', 'json')
            print "Save to:", ("---" + frame_name)
            # workflow.save_rdd_to_es(outputRDD, es_server, es_port, es_index + "/" + frame_name)

Example No. 23
from pyspark import SparkContext, SparkConf, StorageLevel
from optparse import OptionParser
from digSparkUtil.fileUtil import FileUtil

if __name__ == '__main__':
    parser = OptionParser()

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    filename = args[0]
    file_format = args[1]
    out_filename = args[2]
    out_format = args[3]
    uris = args[4].split(",")

    print "Filename", filename, file_format
    print "Output:", out_filename, out_format
    print "Filter:", args[4]

    sc = SparkContext(appName="DIG-FILTER")
    conf = SparkConf()

    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_file(filename, file_format, "json")
    output_rdd = input_rdd.filter(lambda x: x[0] in uris).coalesce(1)
    fileUtil.save_file(output_rdd, out_filename, out_format, "json")
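A hedged invocation sketch for the filter script above; the script filename and the URIs are placeholders, with the positional arguments in the order read from args[0] through args[4]:

spark-submit --py-files lib/python-lib.zip filter_by_uri.py \
    input_seq sequence filtered_out text \
    "http://dig.isi.edu/ht/data/1,http://dig.isi.edu/ht/data/2"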
Example No. 24
def convert(sc, inputFileName, outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName,
                             file_format="text",
                             data_type="json")
    fileUtil.save_file(rdd, outputFileName)
Example No. 25
    cityRDD = cityRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityRDD.setName("cityRDD")

    stateRDD = workflow.apply_context(stateRDD1, state_context)
    stateRDD = stateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    stateRDD.setName("stateRDD")

    countryRDD = workflow.apply_context(countryRDD1, country_context)
    countryRDD = countryRDD.persist(StorageLevel.MEMORY_AND_DISK)
    countryRDD.setName("countryRDD")

    cityAlternateRDD = workflow.apply_context(city_alternate_names_rdd,
                                              city_context)
    cityAlternateRDD = cityAlternateRDD.persist(StorageLevel.MEMORY_AND_DISK)
    cityAlternateRDD.setName("cityAlternateRDD")
    fileUtil.save_file(cityAlternateRDD, outputFilename + "_cityalternate",
                       "text", "json")

    city_reduced_rdd = workflow.reduce_rdds(10, cityRDD, cityAlternateRDD)
    city_reduced_rdd = city_reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
    city_reduced_rdd.setName("city_reduced_rdd")

    # fileUtil.save_file(countryRDD, outputFilename+"_Country", "text", "json")
    # fileUtil.save_file(city_reduced_rdd, outputFilename+"_City", "text", "json")
    # fileUtil.save_file(stateRDD, outputFilename+"_State", "text", "json")

    mergeRDD1 = EntityMerger.merge_rdds(city_reduced_rdd,
                                        "address.addressCountry", countryRDD,
                                        10)
    # fileUtil.save_file(mergeRDD1, outputFilename+"_State_Country", "text", "json")

    mergeRDD2 = EntityMerger.merge_rdds(mergeRDD1, "address.addressRegion",