Esempio n. 1
0
    def merge_rdds(inputRDD, inputPath, baseRDD, joinRDD, removeElements, numPartitions):
        """Merge base JSON into the input records by way of a linking RDD.

        The record layouts described here are carried over from the original
        step comments. load_linking_row, JSONUtil and merge_json are defined
        outside this block, so their exact behavior is not visible here.

        Args:
            inputRDD: pair RDD of (input_uri, input_json).
            inputPath: path given to JSONUtil.extract_values_from_path to pull
                the source_uri values out of each input_json; it is also
                forwarded to merge_json.
            baseRDD: pair RDD of (base_uri, base_json).
            joinRDD: linking rows; each row goes through
                load_linking_row(row, False), which is expected to return
                (base_uri, source_uri).
            removeElements: forwarded unchanged to merge_json.
            numPartitions: partition count for the join in step 3; the value
                -1 means "do not pass a partition count to join".

        Returns:
            The RDD obtained by cogrouping inputRDD with the grouped matches
            and applying merge_json to every value; keys are input_uri.
        """
        # 1. Merge baseRDD and joinRDD
        # baseRDD: base_uri base_json
        # joinRDD: source_uri base_uri
        # output:  source_uri, base_json
        # load_linking_row(x, False) gets back base_uri -> source_uri
        join_rdd_on_base = joinRDD.map(lambda x: load_linking_row(x, False))
        base_merge = join_rdd_on_base.join(baseRDD).map(lambda x: (x[1][0], x[1][1]))

        # 2. Extract the source_uri from inputRDD
        # output: source_uri, input_uri
        # Index access instead of a tuple-parameter lambda: the latter is a
        # SyntaxError on Python 3 (PEP 3113), indexing works on 2 and 3.
        input_source_rdd = inputRDD.flatMapValues(lambda x: JSONUtil.extract_values_from_path(x, inputPath)) \
            .map(lambda kv: (kv[1], kv[0]))

        # 3. JOIN extracted source_uri with base
        # output: source_uri, (input_uri, base_json)
        if numPartitions != -1:
            merge3 = input_source_rdd.join(base_merge, numPartitions)
        else:
            merge3 = input_source_rdd.join(base_merge)

        # 4. Make input_uri the key
        # kv is (source_uri, (input_uri, base_json))
        # output: input_uri, (source_uri, base_json)
        merge4 = merge3.map(lambda kv: (kv[1][0], (kv[0], kv[1][1])))

        # 5. Group results by input_uri
        # output: input_uri, list(source_uri, base_json)
        merge5 = merge4.groupByKey()

        # 6. Merge in input_json
        # output: input_uri, list(input_json), list(source_uri, base_json)
        merge6 = inputRDD.cogroup(merge5)

        # 7. Replace JSON as necessary (delegated to merge_json)
        result = merge6.mapValues(lambda x: merge_json(x[0], x[1], inputPath, removeElements))
        return result
Esempio n. 2
0
    def clean_rdds(inputRDD, inputPath, baseRDD, numPartitions):
        """Join input records with base records and rebuild them via merge_json.

        The record layouts described here are carried over from the original
        step comments. JSONUtil and merge_json are defined outside this block,
        so their exact behavior is not visible here.

        Args:
            inputRDD: pair RDD of (input_uri, input_json).
            inputPath: path given to JSONUtil.extract_values_from_path to pull
                the source_uri values out of each input_json; it is also
                forwarded to merge_json.
            baseRDD: pair RDD keyed by source_uri with base_json values.
            numPartitions: passed as-is to both the join and the groupByKey.

        Returns:
            The RDD obtained by cogrouping inputRDD with the grouped matches
            and applying merge_json (with an empty list as its last argument)
            to every value; keys are input_uri.
        """
        # 1. Extract the source_uri from inputRDD
        # output: source_uri, input_uri
        # Index access instead of a tuple-parameter lambda: the latter is a
        # SyntaxError on Python 3 (PEP 3113), indexing works on 2 and 3.
        input_source_rdd = inputRDD.flatMapValues(lambda x: JSONUtil.extract_values_from_path(x, inputPath)) \
            .map(lambda kv: (kv[1], kv[0]))

        # 2. JOIN extracted source_uri with base
        # output: source_uri, (input_uri, base_json)
        merge3 = input_source_rdd.join(baseRDD, numPartitions)

        # 3. Make input_uri the key
        # kv is (source_uri, (input_uri, base_json))
        # output: input_uri, (source_uri, base_json)
        merge4 = merge3.map(lambda kv: (kv[1][0], (kv[0], kv[1][1])))

        # 4. Group results by input_uri
        # output: input_uri, list(source_uri, base_json)
        merge5 = merge4.groupByKey(numPartitions)

        # 5. Merge in input_json
        # output: input_uri, list(input_json), list(source_uri, base_json)
        merge6 = inputRDD.cogroup(merge5)

        # 6. Replace JSON as necessary (delegated to merge_json)
        result = merge6.mapValues(lambda x: merge_json(x[0], x[1], inputPath, []))
        return result
Esempio n. 3
0
    def merge_rdds(inputRDD, inputPath, baseRDD, numPartitions):
        """Merge base JSON into the input records that reference it.

        The record layouts described here are carried over from the original
        comments. JSONUtil, reduceLists and merge_json are defined outside
        this block, so their exact behavior is not visible here.

        Args:
            inputRDD: pair RDD of (input_uri, input_json).
            inputPath: path in input_json where merge_uri will be found; given
                to JSONUtil.extract_values_from_path and forwarded to
                merge_json.
            baseRDD: pair RDD of (merge_uri, base_json).
            numPartitions: partition count passed to partitionBy before the
                per-key reduction.

        Returns:
            The RDD obtained by cogrouping inputRDD with the reduced matches
            and applying merge_json to every value; keys are input_uri.
        """
        # 2. Extract the merge_uri from inputRDD
        # output: merge_uri, input_uri
        # Index access instead of a tuple-parameter lambda: the latter is a
        # SyntaxError on Python 3 (PEP 3113), indexing works on 2 and 3.
        input_source_rdd = inputRDD.flatMapValues(lambda x: JSONUtil.extract_values_from_path(x, inputPath)) \
            .map(lambda kv: (kv[1], kv[0]))

        # 3. JOIN extracted merge_uri with base
        # output: merge_uri, (input_uri, base_json)
        merge3 = input_source_rdd.join(baseRDD)

        # 4. Make input_uri the key
        # kv is (merge_uri, (input_uri, base_json)). Each value is wrapped in
        # a one-element list so that reduceLists can combine values per key.
        # output: input_uri, [(merge_uri, base_json)]
        merge4 = merge3.map(lambda kv: (kv[1][0], [(kv[0], kv[1][1])])).partitionBy(numPartitions)

        # 5. Group results by input_uri
        # output: input_uri, list(merge_uri, base_json)
        merge5 = merge4.reduceByKey(reduceLists)

        # 6. Merge in input_json
        # output: input_uri, list(input_json), list(merge_uri, base_json)
        merge6 = inputRDD.cogroup(merge5)

        # 7. Replace JSON as necessary (delegated to merge_json)
        result = merge6.mapValues(lambda x: merge_json(x[0], x[1], inputPath))
        return result