def merge_rdds(inputRDD, inputPath, baseRDD, joinRDD, removeElements, numPartitions):
    """Merge base JSON documents into input documents through a linking RDD.

    Key/value shapes below follow the original inline comments; they are not
    enforced by this function.

    Args:
        inputRDD: pair RDD of (input_uri, input_json).
        inputPath: path inside input_json at which source URIs are looked up
            via ``JSONUtil.extract_values_from_path``.
        baseRDD: pair RDD of (base_uri, base_json).
        joinRDD: linking rows (source_uri, base_uri); each row is passed
            through ``load_linking_row(row, False)`` so that it comes back
            keyed by base_uri (per the original comment -- confirm against
            ``load_linking_row``).
        removeElements: forwarded unchanged to ``merge_json``.
        numPartitions: partition count for the source_uri join; ``-1`` means
            "do not pass a partition count" (Spark picks its default).

    Returns:
        A pair RDD keyed by input_uri whose values are the results of
        ``merge_json(input_jsons, matches, inputPath, removeElements)``.
    """
    # 1. Merge baseRDD and joinRDD.
    #    output: source_uri, base_json
    links_by_base = joinRDD.map(lambda row: load_linking_row(row, False))
    base_merge = links_by_base.join(baseRDD).map(lambda kv: (kv[1][0], kv[1][1]))

    # 2. Extract the source_uri values from inputRDD and flip the pair.
    #    Index-based lambdas are used instead of tuple parameters, which are
    #    a syntax error on Python 3.
    #    output: source_uri, input_uri
    input_source_rdd = inputRDD.flatMapValues(
        lambda doc: JSONUtil.extract_values_from_path(doc, inputPath)
    ).map(lambda kv: (kv[1], kv[0]))

    # 3. Join the extracted source_uri with the base documents.
    #    output: source_uri, (input_uri, base_json)
    if numPartitions != -1:
        merge3 = input_source_rdd.join(base_merge, numPartitions)
    else:
        merge3 = input_source_rdd.join(base_merge)

    # 4. Make input_uri the key.
    #    output: input_uri, (source_uri, base_json)
    merge4 = merge3.map(lambda kv: (kv[1][0], (kv[0], kv[1][1])))

    # 5. Group results by input_uri.
    #    output: input_uri, list(source_uri, base_json)
    merge5 = merge4.groupByKey()

    # 6. Merge in input_json. cogroup keeps every input_uri, including those
    #    with no match in merge5.
    #    output: input_uri, (list(input_json), list(list(source_uri, base_json)))
    merge6 = inputRDD.cogroup(merge5)

    # 7. Replace JSON as necessary.
    result = merge6.mapValues(
        lambda groups: merge_json(groups[0], groups[1], inputPath, removeElements)
    )
    return result
def clean_rdds(inputRDD, inputPath, baseRDD, numPartitions):
    """Join input documents with base documents and rebuild them via merge_json.

    Key/value shapes below follow the original inline comments; they are not
    enforced by this function.

    Args:
        inputRDD: pair RDD of (input_uri, input_json).
        inputPath: path inside input_json at which source URIs are looked up
            via ``JSONUtil.extract_values_from_path``.
        baseRDD: pair RDD of (source_uri, base_json).
        numPartitions: partition count used for the join and the grouping.
            ``-1`` is treated as "use Spark's default", matching the sentinel
            used by the six-argument ``merge_rdds``.

    Returns:
        A pair RDD keyed by input_uri whose values are the results of
        ``merge_json(input_jsons, matches, inputPath, [])``.
    """
    # Passing None lets Spark choose its default partitioning.
    partitions = None if numPartitions == -1 else numPartitions

    # 1. Extract the source_uri values from inputRDD and flip the pair.
    #    Index-based lambdas are used instead of tuple parameters, which are
    #    a syntax error on Python 3.
    #    output: source_uri, input_uri
    input_source_rdd = inputRDD.flatMapValues(
        lambda doc: JSONUtil.extract_values_from_path(doc, inputPath)
    ).map(lambda kv: (kv[1], kv[0]))

    # 2. Join the extracted source_uri with the base documents.
    #    output: source_uri, (input_uri, base_json)
    merge3 = input_source_rdd.join(baseRDD, partitions)

    # 3. Make input_uri the key.
    #    output: input_uri, (source_uri, base_json)
    merge4 = merge3.map(lambda kv: (kv[1][0], (kv[0], kv[1][1])))

    # 4. Group results by input_uri.
    #    output: input_uri, list(source_uri, base_json)
    merge5 = merge4.groupByKey(partitions)

    # 5. Merge in input_json. cogroup keeps every input_uri, including those
    #    with no match in merge5.
    #    output: input_uri, (list(input_json), list(list(source_uri, base_json)))
    merge6 = inputRDD.cogroup(merge5)

    # 6. Replace JSON as necessary; an empty list is passed as the last
    #    merge_json argument.
    result = merge6.mapValues(
        lambda groups: merge_json(groups[0], groups[1], inputPath, [])
    )
    return result
def merge_rdds(inputRDD, inputPath, baseRDD, numPartitions):
    """Merge base JSON documents into the input documents that reference them.

    Key/value shapes below follow the original inline comments; they are not
    enforced by this function.

    Args:
        inputRDD: pair RDD of (input_uri, input_json).
        inputPath: path in input_json where the merge_uri will be found
            (looked up via ``JSONUtil.extract_values_from_path``).
        baseRDD: pair RDD of (merge_uri, base_json).
        numPartitions: number of partitions the re-keyed join result is
            partitioned into before it is reduced.

    Returns:
        A pair RDD keyed by input_uri whose values are the results of
        ``merge_json(input_jsons, matches, inputPath)``.
    """
    # 1. Extract the merge_uri values from inputRDD and flip the pair.
    #    Index-based lambdas are used instead of tuple parameters, which are
    #    a syntax error on Python 3.
    #    output: merge_uri, input_uri
    input_source_rdd = inputRDD.flatMapValues(
        lambda doc: JSONUtil.extract_values_from_path(doc, inputPath)
    ).map(lambda kv: (kv[1], kv[0]))

    # 2. Join the extracted merge_uri with the base documents.
    #    output: merge_uri, (input_uri, base_json)
    merge3 = input_source_rdd.join(baseRDD)

    # 3. Make input_uri the key. Each value is wrapped in a one-element list
    #    so the per-key values can be combined by reduceLists below.
    #    output: input_uri, [(merge_uri, base_json)]
    merge4 = merge3.map(
        lambda kv: (kv[1][0], [(kv[0], kv[1][1])])
    ).partitionBy(numPartitions)

    # 4. Combine results per input_uri.
    #    NOTE(review): reduceLists is defined elsewhere; presumably it
    #    concatenates two lists -- confirm.
    #    output: input_uri, list(merge_uri, base_json)
    merge5 = merge4.reduceByKey(reduceLists)

    # 5. Merge in input_json. cogroup keeps every input_uri, including those
    #    with no match in merge5.
    #    output: input_uri, (list(input_json), list(list(merge_uri, base_json)))
    merge6 = inputRDD.cogroup(merge5)

    # 6. Replace JSON as necessary.
    result = merge6.mapValues(
        lambda groups: merge_json(groups[0], groups[1], inputPath)
    )
    return result