Ejemplo n.º 1
0
    def deduplicate(input_json, input_path):
        """Remove duplicate entries from the list stored at ``input_path``.

        ``input_path`` is a dot-separated path.  All segments except the last
        are resolved with ``JSONUtil.extract_objects_from_path`` to obtain the
        parent objects; when the path has a single segment the parents are
        ``JSONUtil.to_list(input_json)``.  The last segment is the key whose
        value is de-duplicated.  Values that are not lists are left untouched.

        Two entries count as duplicates when their JSON serialisations (with
        sorted keys, so dict key order does not matter) are equal.  The first
        occurrence is kept and the relative order is preserved.

        Returns ``input_json``.  The de-duplicated list is assigned back onto
        each parent object; this presumably mutates ``input_json`` in place,
        assuming the JSONUtil helpers return references into it (TODO confirm).
        """
        path_parts = input_path.strip().split(".")
        if len(path_parts) > 1:
            parent_path = ".".join(path_parts[:-1])
            parents = JSONUtil.extract_objects_from_path(input_json, parent_path)
        else:
            parents = JSONUtil.to_list(input_json)

        leaf_key = path_parts[-1]
        for parent in parents:
            if leaf_key not in parent:
                continue
            candidates = parent[leaf_key]
            if not isinstance(candidates, list):
                continue

            seen = set()
            unique = []
            for item in candidates:
                # Serialise once; sort_keys makes equal dicts compare equal
                # regardless of the order their keys were inserted in.
                fingerprint = json.dumps(item, sort_keys=True)
                if fingerprint in seen:
                    continue
                seen.add(fingerprint)
                unique.append(item)
            parent[leaf_key] = unique

        return input_json
Ejemplo n.º 2
0
    def clean_rdds(inputRDD, inputPath, baseRDD, numPartitions):
        """Join the values found at ``inputPath`` with ``baseRDD`` and merge them.

        Record shapes below follow the step comments of the original code:

        * ``inputRDD``: (input_uri, input_json)
        * ``baseRDD``:  (source_uri, base_json)

        ``numPartitions`` is passed to the join and to the group-by step.
        The final merge is delegated to ``merge_json`` with an empty
        remove-elements list.  Returns the RDD produced by ``mapValues``.
        """
        # 1. Extract the source_uri values from each input document and swap
        #    the pair so the extracted value becomes the key.
        #    output: (source_uri, input_uri)
        # Index-based lambdas are used instead of tuple parameters, which are
        # a syntax error on Python 3.
        input_source_rdd = inputRDD \
            .flatMapValues(lambda doc: JSONUtil.extract_values_from_path(doc, inputPath)) \
            .map(lambda kv: (kv[1], kv[0]))

        # 2. Join the extracted source_uri with the base documents.
        #    output: (source_uri, (input_uri, base_json))
        joined = input_source_rdd.join(baseRDD, numPartitions)

        # 3. Re-key by input_uri.
        #    output: (input_uri, (source_uri, base_json))
        by_input_uri = joined.map(lambda kv: (kv[1][0], (kv[0], kv[1][1])))

        # 4. Group all matches belonging to the same input_uri.
        #    output: (input_uri, iterable of (source_uri, base_json))
        grouped = by_input_uri.groupByKey(numPartitions)

        # 5. Bring the original input documents back next to their matches.
        #    output: (input_uri, (iterable of input_json, iterable of groups))
        cogrouped = inputRDD.cogroup(grouped)

        # 6. Replace the JSON values as necessary.
        result = cogrouped.mapValues(lambda pair: merge_json(pair[0], pair[1], inputPath, []))
        return result
Ejemplo n.º 3
0
    def merge_rdds(inputRDD, inputPath, baseRDD, joinRDD, removeElements, numPartitions):
        """Merge base documents into input documents via a linking RDD.

        Record shapes below follow the step comments of the original code:

        * ``inputRDD``: (input_uri, input_json)
        * ``baseRDD``:  (base_uri, base_json)
        * ``joinRDD``:  linking rows (source_uri, base_uri); each row is passed
          through ``load_linking_row(row, False)``, which per the original
          comment yields (base_uri, source_uri) -- TODO confirm against that
          helper.

        ``numPartitions`` is only applied to the join of the extracted
        source URIs with the base documents; ``-1`` means "use the default".
        ``removeElements`` is forwarded unchanged to ``merge_json``.
        Returns the RDD produced by ``mapValues``.
        """
        # 1. Re-key the base documents by source_uri.
        #    output: (source_uri, base_json)
        join_rdd_on_base = joinRDD.map(lambda row: load_linking_row(row, False))
        base_merge = join_rdd_on_base.join(baseRDD).map(lambda kv: (kv[1][0], kv[1][1]))

        # 2. Extract the source_uri values from each input document and swap
        #    the pair so the extracted value becomes the key.
        #    output: (source_uri, input_uri)
        # Index-based lambdas are used instead of tuple parameters, which are
        # a syntax error on Python 3.
        input_source_rdd = inputRDD \
            .flatMapValues(lambda doc: JSONUtil.extract_values_from_path(doc, inputPath)) \
            .map(lambda kv: (kv[1], kv[0]))

        # 3. Join the extracted source_uri with the re-keyed base documents.
        #    output: (source_uri, (input_uri, base_json))
        if numPartitions != -1:
            joined = input_source_rdd.join(base_merge, numPartitions)
        else:
            joined = input_source_rdd.join(base_merge)

        # 4. Re-key by input_uri.
        #    output: (input_uri, (source_uri, base_json))
        by_input_uri = joined.map(lambda kv: (kv[1][0], (kv[0], kv[1][1])))

        # 5. Group all matches belonging to the same input_uri.
        #    output: (input_uri, iterable of (source_uri, base_json))
        grouped = by_input_uri.groupByKey()

        # 6. Bring the original input documents back next to their matches.
        #    output: (input_uri, (iterable of input_json, iterable of groups))
        cogrouped = inputRDD.cogroup(grouped)

        # 7. Replace the JSON values as necessary.
        result = cogrouped.mapValues(
            lambda pair: merge_json(pair[0], pair[1], inputPath, removeElements))
        return result
Ejemplo n.º 4
0
    def merge_rdds(inputRDD, inputPath, baseRDD, numPartitions):
        """Merge base documents into input documents at ``inputPath``.

        Record shapes below follow the comments of the original code:

        * ``inputRDD``: (input_uri, input_json)
        * ``baseRDD``:  (merge_uri, base_json)
        * ``inputPath``: path inside input_json where the merge_uri is found

        The re-keyed matches are partitioned with ``numPartitions`` before
        being combined per key with ``reduceLists`` (defined elsewhere;
        presumably concatenates the single-element lists -- TODO confirm).
        Returns the RDD produced by ``mapValues``.
        """
        # 1. Extract the merge_uri values from each input document and swap
        #    the pair so the extracted value becomes the key.
        #    output: (merge_uri, input_uri)
        # Index-based lambdas are used instead of tuple parameters, which are
        # a syntax error on Python 3.
        input_source_rdd = inputRDD \
            .flatMapValues(lambda doc: JSONUtil.extract_values_from_path(doc, inputPath)) \
            .map(lambda kv: (kv[1], kv[0]))

        # 2. Join the extracted merge_uri with the base documents.
        #    output: (merge_uri, (input_uri, base_json))
        joined = input_source_rdd.join(baseRDD)

        # 3. Re-key by input_uri, wrapping each match in a one-element list so
        #    the lists can be combined in the next step.
        #    output: (input_uri, [(merge_uri, base_json)])
        by_input_uri = joined \
            .map(lambda kv: (kv[1][0], [(kv[0], kv[1][1])])) \
            .partitionBy(numPartitions)

        # 4. Combine the per-match lists belonging to the same input_uri.
        #    output: (input_uri, list of (merge_uri, base_json))
        grouped = by_input_uri.reduceByKey(reduceLists)

        # 5. Bring the original input documents back next to their matches.
        #    output: (input_uri, (iterable of input_json, iterable of lists))
        cogrouped = inputRDD.cogroup(grouped)

        # 6. Replace the JSON values as necessary.
        result = cogrouped.mapValues(lambda pair: merge_json(pair[0], pair[1], inputPath))
        return result
Ejemplo n.º 5
0
def merge_json(input_jsons, merge_uri_and_jsons, input_path):
    """Apply a batch of URI -> JSON replacements to the first input document.

    Only the first element of ``input_jsons`` is used.  Each element of
    ``merge_uri_and_jsons`` is indexed as ``[0]`` (the URI) and ``[1]`` (the
    JSON object); when a URI occurs more than once the last entry wins.

    The replacement itself is delegated to
    ``JSONUtil.replace_values_at_path_batch``, whose return value is ignored;
    the first input document is returned.
    """
    # Pick the first document only; the iterable may not support indexing.
    for first_doc in input_jsons:
        input_json = first_doc
        break

    # URI -> replacement JSON lookup handed to the batch replace helper.
    replacements = {entry[0]: entry[1] for entry in merge_uri_and_jsons}

    JSONUtil.replace_values_at_path_batch(input_json, input_path, replacements, [])
    return input_json
Ejemplo n.º 6
0
def merge_json(input_jsons, merge_uri_and_jsons, input_path, removeElements):
    """Merge groups of (uri, json) pairs into the first input document.

    Only the first element of ``input_jsons`` is used.  Every element of
    ``merge_uri_and_jsons`` is itself an iterable of pairs; for each pair
    ``JSONUtil.replace_values_at_path`` is called with the current document,
    ``input_path``, the pair's two items and ``removeElements``, and its
    return value becomes the current document.

    Returns the document after all replacements were applied.
    """
    # Pick the first document only; the iterable may not support indexing.
    for first_doc in input_jsons:
        input_json = first_doc
        break

    for group in merge_uri_and_jsons:
        # Materialise the group before applying its replacements.
        pairs = list(group)
        for pair in pairs:
            input_json = JSONUtil.replace_values_at_path(
                input_json, input_path, pair[0], pair[1], removeElements)
    return input_json
Ejemplo n.º 7
0
def frame_json(frame, type_to_rdd):
    """Recursively build the RDD described by a frame dictionary.

    ``frame["@type"]`` selects the starting RDD via
    ``type_to_rdd[<type>]["rdd"]``.  When the frame has more than one entry:

    * if ``"@explicit"`` is set to a true value equal to ``True``, every
      record is mapped through ``JSONUtil.frame_include_only_values``;
    * every key not starting with ``"@"`` is treated as a child frame, except
      dict values without ``"@type"`` or with ``"@embed"`` equal to ``False``,
      which are skipped.  Each child RDD is built recursively and merged in
      with ``EntityMerger.merge_rdds(output_rdd, key, child_rdd, 10)``.

    Returns the resulting RDD.
    """
    document_type = frame["@type"]
    output_rdd = type_to_rdd[document_type]["rdd"]
    if len(frame) <= 1:
        # Only "@type" present: nothing to filter or embed.
        return output_rdd

    if "@explicit" in frame and frame["@explicit"] == True:
        # Index-based lambda: tuple parameters are a syntax error on
        # Python 3, and the old parameter name shadowed the json module.
        output_rdd = output_rdd.map(
            lambda record: (record[0], JSONUtil.frame_include_only_values(record[1], frame)))

    for key, val in frame.items():
        # startswith() also copes with an empty key, where key[0] would raise.
        if key.startswith("@"):
            continue
        if isinstance(val, dict):
            if "@type" not in val:
                continue
            if "@embed" in val and val["@embed"] == False:
                continue
        # should this be every value?
        child_rdd = frame_json(val, type_to_rdd)
        output_rdd = EntityMerger.merge_rdds(output_rdd, key, child_rdd, 10)
    return output_rdd
Ejemplo n.º 8
0
def merge_json(input_jsons, merge_uri_and_jsons, input_path):
    """Merge groups of (uri, json) pairs into the first input document.

    Only the first element of ``input_jsons`` is used.  Every element of
    ``merge_uri_and_jsons`` is itself an iterable of pairs; for each pair
    ``JSONUtil.replace_values_at_path`` is called with the current document,
    ``input_path``, the pair's two items and an empty remove-elements list,
    and its return value becomes the current document.

    Each group is iterated exactly once, so one-shot iterators work too.

    Returns the document after all replacements were applied.
    """
    # Pick the first document only; the iterable may not support indexing.
    for first_doc in input_jsons:
        input_json = first_doc
        break

    # Example of merge_uri_and_jsons for key "B/cluster":
    # [
    #     [
    #         ["C", {"image": {"isSimilarTo": [{"isSimilarTo": {"uri": "I5"}}, {"isSimilarTo": {"uri": "I6"}}]}, "uri": "C"}],
    #         ["B", {"image": {"isSimilarTo": [{"isSimilarTo": {"uri": "I1"}}, {"isSimilarTo": {"uri": "I3"}}, {"isSimilarTo": {"uri": "I5"}}]}, "uri": "B"}]
    #     ]
    # ]
    for group in merge_uri_and_jsons:
        # A single pass: the previous unused copy loop consumed the group
        # first, which left nothing to process for one-shot iterators.
        for uri_and_json in group:
            input_json = JSONUtil.replace_values_at_path(
                input_json, input_path, uri_and_json[0], uri_and_json[1], [])
    return input_json