def deduplicate(input_json, input_path):
    """Drop repeated entries from the list stored at ``input_path``.

    ``input_path`` is a dot-separated path. Every segment except the last is
    resolved with ``JSONUtil.extract_objects_from_path``; when the path has a
    single segment the candidate objects come from
    ``JSONUtil.to_list(input_json)`` instead. For each candidate object that
    contains the last segment as a key and whose value there is a list, the
    list is replaced by a new list holding the first occurrence of every
    element, in the original order. Non-list values are left untouched.

    Two elements count as duplicates when their JSON serializations are
    equal. Keys are sorted while serializing, so objects that differ only in
    key order are treated as the same element.

    Returns ``input_json``. The candidate objects are updated by assignment;
    presumably they are references into ``input_json`` (depends on JSONUtil).
    """
    path_parts = input_path.strip().split(".")
    last_path_elem = path_parts[-1]

    if len(path_parts) > 1:
        parent_path = ".".join(path_parts[:-1])
        input_objs = JSONUtil.extract_objects_from_path(input_json, parent_path)
    else:
        input_objs = JSONUtil.to_list(input_json)

    for input_obj in input_objs:
        if last_path_elem not in input_obj:
            continue
        values = input_obj[last_path_elem]
        if not isinstance(values, list):
            continue

        seen = set()
        unique = []
        for part in values:
            # Serialize once per element; sort_keys makes the fingerprint
            # independent of dict key ordering.
            fingerprint = json.dumps(part, sort_keys=True)
            if fingerprint in seen:
                continue
            seen.add(fingerprint)
            unique.append(part)
        input_obj[last_path_elem] = unique

    return input_json
def clean_rdds(inputRDD, inputPath, baseRDD, numPartitions):
    """Join the values found at ``inputPath`` with ``baseRDD`` and merge them back.

    Per the step comments below, ``inputRDD`` holds (input_uri, input_json)
    pairs and ``baseRDD`` is keyed by the uri extracted from ``inputPath``.
    ``numPartitions`` is passed to both the join and the groupByKey.

    Returns the cogrouped RDD with each value replaced by the result of
    ``merge_json(input_jsons, grouped_matches, inputPath, [])``.

    The lambdas index into their pair argument instead of using
    tuple-parameter unpacking, which is a syntax error on Python 3.
    """
    # 1. Extract the source_uri from inputRDD
    # output: source_uri, input_uri
    input_source_rdd = inputRDD.flatMapValues(
        lambda input_json: JSONUtil.extract_values_from_path(input_json, inputPath)
    ).map(lambda pair: (pair[1], pair[0]))

    # 2. JOIN extracted source_uri with base
    # output: source_uri, (input_uri, base_json)
    merge3 = input_source_rdd.join(baseRDD, numPartitions)

    # 3. Make input_uri the key
    # output: input_uri, (source_uri, base_json)
    merge4 = merge3.map(lambda row: (row[1][0], (row[0], row[1][1])))

    # 4. Group results by input_uri
    # output: input_uri, list(source_uri, base_json)
    merge5 = merge4.groupByKey(numPartitions)

    # 5. Merge in input_json
    # output: input_uri, list(input_json), list(source_uri, base_json)
    merge6 = inputRDD.cogroup(merge5)

    # 6. Replace JSON as necessary
    result = merge6.mapValues(lambda x: merge_json(x[0], x[1], inputPath, []))
    return result
def merge_rdds(inputRDD, inputPath, baseRDD, joinRDD, removeElements, numPartitions):
    """Merge base documents into input documents through a linking RDD.

    Per the step comments below, ``baseRDD`` holds (base_uri, base_json) and
    ``joinRDD`` links source_uri to base_uri; each ``joinRDD`` row is first
    converted with ``load_linking_row(row, False)`` (presumably into a pair
    keyed by base_uri -- confirm against that helper).

    ``numPartitions`` is applied to the join of the extracted uris with the
    re-keyed base documents unless it is ``-1``, in which case the join uses
    its default. ``removeElements`` is forwarded unchanged to ``merge_json``.

    Returns the cogrouped RDD with each value replaced by the result of
    ``merge_json(input_jsons, grouped_matches, inputPath, removeElements)``.

    The lambdas index into their pair argument instead of using
    tuple-parameter unpacking, which is a syntax error on Python 3.
    """
    # 1. Merge baseRDD and joinRDD
    # baseRDD: base_uri base_json
    # joinRDD: source_uri base_uri
    # output: source_uri, base_json
    join_rdd_on_base = joinRDD.map(lambda x: load_linking_row(x, False))
    # Drop the join key and keep only the two joined values as the new pair.
    base_merge = join_rdd_on_base.join(baseRDD).map(lambda x: (x[1][0], x[1][1]))

    # 2. Extract the source_uri from inputRDD
    # output: source_uri, input_uri
    input_source_rdd = inputRDD.flatMapValues(
        lambda input_json: JSONUtil.extract_values_from_path(input_json, inputPath)
    ).map(lambda pair: (pair[1], pair[0]))

    # 3. JOIN extracted source_uri with base
    # output: source_uri, (input_uri, base_json)
    if numPartitions != -1:
        merge3 = input_source_rdd.join(base_merge, numPartitions)
    else:
        merge3 = input_source_rdd.join(base_merge)

    # 4. Make input_uri the key
    # output: input_uri, (source_uri, base_json)
    merge4 = merge3.map(lambda row: (row[1][0], (row[0], row[1][1])))

    # 5. Group results by input_uri
    # output: input_uri, list(source_uri, base_json)
    merge5 = merge4.groupByKey()

    # 6. Merge in input_json
    # output: input_uri, list(input_json), list(source_uri, base_json)
    merge6 = inputRDD.cogroup(merge5)

    # 7. Replace JSON as necessary
    result = merge6.mapValues(
        lambda x: merge_json(x[0], x[1], inputPath, removeElements)
    )
    return result
def merge_rdds(inputRDD, inputPath, baseRDD, numPartitions):
    """Merge base documents into input documents at ``inputPath``.

    inputRDD: input_uri, input_json
    baseRDD:  merge_uri, base_json
    inputPath: path in input_json where merge_uri will be found

    Matches are wrapped in single-element lists, partitioned into
    ``numPartitions`` partitions and combined per input_uri with
    ``reduceLists``. The result is cogrouped with ``inputRDD`` and every
    value is replaced by ``merge_json(input_jsons, grouped_matches, inputPath)``.

    The lambdas index into their pair argument instead of using
    tuple-parameter unpacking, which is a syntax error on Python 3.
    """
    # 2. Extract the source_uri from inputRDD
    # output: merge_uri, input_uri
    input_source_rdd = inputRDD.flatMapValues(
        lambda input_json: JSONUtil.extract_values_from_path(input_json, inputPath)
    ).map(lambda pair: (pair[1], pair[0]))

    # 3. JOIN extracted source_uri with base
    # output: merge_uri, (input_uri, base_json)
    merge3 = input_source_rdd.join(baseRDD)

    # 4. Make input_uri the key; the value is a one-element list so that
    #    reduceByKey can combine the lists.
    # output: input_uri, [(merge_uri, base_json)]
    merge4 = merge3.map(
        lambda row: (row[1][0], [(row[0], row[1][1])])
    ).partitionBy(numPartitions)

    # 5. Group results by input_uri
    # output: input_uri, list(merge_uri, base_json)
    merge5 = merge4.reduceByKey(reduceLists)

    # 6. Merge in input_json
    # output: input_uri, list(input_json), list(merge_uri, base_json)
    merge6 = inputRDD.cogroup(merge5)

    # 7. Replace JSON as necessary
    result = merge6.mapValues(lambda x: merge_json(x[0], x[1], inputPath))
    return result
def merge_json(input_jsons, merge_uri_and_jsons, input_path):
    """Apply a batch of uri -> json replacements to the first input document.

    Only the first element yielded by ``input_jsons`` is used. Each entry of
    ``merge_uri_and_jsons`` is indexed as ``entry[0]`` (uri) and ``entry[1]``
    (replacement json); a later entry with the same uri overrides an earlier
    one. The resulting mapping is handed to
    ``JSONUtil.replace_values_at_path_batch`` together with ``input_path`` and
    an empty list, and the same document object is returned afterwards.

    If ``input_jsons`` yields nothing, the document name is never bound and
    using it raises ``UnboundLocalError``.
    """
    # Take the first document only; any remaining ones are ignored.
    for first_doc in input_jsons:
        input_json = first_doc
        break

    # uri -> replacement json
    replacements = dict(
        (entry[0], entry[1]) for entry in merge_uri_and_jsons
    )

    # The return value of the batch call is not used; the document passed in
    # is what gets returned.
    JSONUtil.replace_values_at_path_batch(input_json, input_path, replacements, [])
    return input_json
def merge_json(input_jsons, merge_uri_and_jsons, input_path, removeElements):
    """Apply grouped (uri, json) replacements to the first input document.

    Only the first element yielded by ``input_jsons`` is used.
    ``merge_uri_and_jsons`` is an iterable of groups; each group is an
    iterable of entries indexed as ``entry[0]`` (uri) and ``entry[1]``
    (replacement json). For every entry the current document is replaced by
    the return value of ``JSONUtil.replace_values_at_path`` called with
    ``input_path``, the uri, the json and ``removeElements``.

    Returns the document after the last replacement, or the first input
    document unchanged when there are no entries. If ``input_jsons`` yields
    nothing, the document name is never bound and using it raises
    ``UnboundLocalError``.
    """
    # Take the first document only; any remaining ones are ignored.
    for first_doc in input_jsons:
        input_json = first_doc
        break

    for group in merge_uri_and_jsons:
        # The whole group is materialized before any replacement is applied.
        for entry in list(group):
            input_json = JSONUtil.replace_values_at_path(
                input_json, input_path, entry[0], entry[1], removeElements
            )
    return input_json
def frame_json(frame, type_to_rdd, numPartitions=10):
    """Build the RDD for ``frame`` by recursively merging in its child frames.

    The starting RDD is ``type_to_rdd[frame["@type"]]["rdd"]``. When the frame
    has more than one entry:

    * if ``frame["@explicit"] == True`` every (uri, json) pair is mapped
      through ``JSONUtil.frame_include_only_values(json, frame)``;
    * for each entry whose key does not start with ``"@"``, the value is
      framed recursively and merged in with
      ``EntityMerger.merge_rdds(output_rdd, key, child_rdd, numPartitions)``.
      Dict values without an ``"@type"`` key, or with ``"@embed" == False``,
      are skipped.

    ``numPartitions`` defaults to 10, the value that used to be hard-coded,
    and is passed down to the recursive calls.

    Returns the resulting RDD; a frame containing only ``"@type"`` returns
    the looked-up RDD unchanged.
    """
    document_type = frame["@type"]
    output_rdd = type_to_rdd[document_type]["rdd"]

    if len(frame) <= 1:
        return output_rdd

    # Equality (not identity/truthiness) is kept on purpose so the accepted
    # values stay exactly what they were.
    if frame.get("@explicit") == True:
        # Index into the pair rather than unpacking it in the lambda
        # signature: tuple parameters are a syntax error on Python 3.
        output_rdd = output_rdd.map(
            lambda pair: (pair[0], JSONUtil.frame_include_only_values(pair[1], frame))
        )

    for key, val in frame.items():
        if key[0] == "@":
            continue
        if isinstance(val, dict):
            if "@type" not in val:
                continue
            if val.get("@embed") == False:
                continue
        # NOTE(review): non-dict values also reach this point and are framed
        # recursively -- should this be every value?
        child_rdd = frame_json(val, type_to_rdd, numPartitions)
        output_rdd = EntityMerger.merge_rdds(output_rdd, key, child_rdd, numPartitions)

    return output_rdd
def merge_json(input_jsons, merge_uri_and_jsons, input_path):
    """Apply grouped (uri, json) replacements to the first input document.

    Only the first element yielded by ``input_jsons`` is used.
    ``merge_uri_and_jsons`` is an iterable of groups; each group is an
    iterable of entries indexed as ``entry[0]`` (uri) and ``entry[1]``
    (replacement json). For every entry the current document is replaced by
    the return value of ``JSONUtil.replace_values_at_path`` called with
    ``input_path``, the uri, the json and an empty list.

    Each group is iterated exactly once, so one-shot iterators work as well
    as lists.

    Returns the document after the last replacement, or the first input
    document unchanged when there are no entries. If ``input_jsons`` yields
    nothing, the document name is never bound and using it raises
    ``UnboundLocalError``.
    """
    # Take the first document only; any remaining ones are ignored.
    for first_doc in input_jsons:
        input_json = first_doc
        break

    # Example of merge_uri_and_jsons for B/cluster:
    # [
    #   [
    #     ["C", {"image": {"isSimilarTo": [{"isSimilarTo": {"uri": "I5"}}, {"isSimilarTo": {"uri": "I6"}}]}, "uri": "C"}],
    #     ["B", {"image": {"isSimilarTo": [{"isSimilarTo": {"uri": "I1"}}, {"isSimilarTo": {"uri": "I3"}}, {"isSimilarTo": {"uri": "I5"}}]}, "uri": "B"}]
    #   ]
    # ]
    for group in merge_uri_and_jsons:
        for entry in group:
            input_json = JSONUtil.replace_values_at_path(
                input_json, input_path, entry[0], entry[1], []
            )
    return input_json