Python project Examples, DoD.data_processing_utils.project Python Examples

Example #1

0

Show file

File: dod.py Project: singletrips/aurum-datadiscovery

def test_e2e(dod, number_jps=5):
    """Interactively page through the views found for a hard-coded query.

    Each result yielded by ``dod.virtual_schema_iterative_search`` is
    projected onto the requested attributes with ``dpu.project``; the first
    10 rows of the projection are printed and the loop then blocks on
    ``input`` before moving on to the next result.

    Args:
        dod: object exposing ``virtual_schema_iterative_search(attrs, values,
            debug_enumerate_all_jps=...)`` that yields
            ``(mjp, attrs_project, metadata)`` triples.
        number_jps: currently unused. The ``i > number_jps`` cut-off it used
            to drive is disabled; the parameter is kept so existing callers
            keep working.
    """
    # Alternative queries kept for manual experimentation; swap one in for
    # the active ``attrs`` / ``values`` pair below.

    # attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    # values = ["968548423", "kimball", "Mechanical Engineering"]

    # # cannot search for numbers
    # attrs = ["s_name", "s_address", "ps_availqty"]
    # values = ["Supplier#000000001", "N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ", "7340"]

    # attrs = ["s_name", "s_address", "ps_comment"]
    # values = ["Supplier#000000001", "N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ",
    #           "dly final packages haggle blithely according to the pending packages. slyly regula"]

    # attrs = ["n_name", "s_name", "c_name", "o_clerk"]
    # values = ["CANADA", "Supplier#000000013", "Customer#000000005", "Clerk#000000400"]

    # attrs = ["o_clerk", "o_orderpriority", "n_name"]
    # values = ["Clerk#000000951", "5-LOW", "JAPAN"]

    # attrs = ["Subject", "Title", "Publisher"]
    # values = ["", "Man who would be king and other stories", "Oxford university press, incorporated"]

    # attrs = ["Iap Category Name", "Person Name", "Person Email"]
    # # values = ["", "Meghan Kenney", "*****@*****.**"]
    # values = ["Engineering", "", ""]

    # attrs = ["Building Name Long", "Ext Gross Area", "Building Room", "Room Square Footage"]
    # values = ["", "", "", ""]

    # attrs = ["c_name", "c_phone", "n_name", "l_tax"]
    # values = ["Customer#000000001", "25-989-741-2988", "BRAZIL", ""]

    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    # attrs = ["Neighborhood ", "Total Population ", "Graduate Degree %"]
    # values = ["Cambridgeport", "", ""]

    attrs = ["Email Address", "Department Full Name"]
    values = ["*****@*****.**", ""]

    # enumerate() supplies the running index. The hand-rolled counter was
    # never incremented, so every view used to be labelled "JP: 0".
    search = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    for i, (mjp, attrs_project, _metadata) in enumerate(search):
        print("JP: " + str(i))

        proj_view = dpu.project(mjp, attrs_project)

        print(str(proj_view.head(10)))

        print("")
        # Pause so the printed sample can be inspected before the next view.
        input("Press any key to continue...")

Example #2

0

Show file

File: dod.py Project: damienrrb/aurum-datadiscovery

def test_e2e(dod, number_jps=5):
    """Compare every view found for a hard-coded query against the first one.

    The first materialized join path yielded by
    ``dod.virtual_schema_iterative_search`` is remembered together with the
    key reported by ``mva.most_likely_key`` for it. Every yielded join path
    (the first one included) is then passed with that key to
    ``mva.inconsistent_value_on_key``; when the returned ``conflicting_pair``
    is non-empty it is printed.

    Args:
        dod: object exposing ``virtual_schema_iterative_search(attrs, values,
            debug_enumerate_all_jps=...)`` that yields
            ``(mjp, attrs_project)`` pairs.
        number_jps: currently unused. The ``i > number_jps`` cut-off it used
            to drive is disabled; the parameter is kept so existing callers
            keep working.
    """
    # Alternative queries kept for manual experimentation; swap one in for
    # the active ``attrs`` / ``values`` pair below.

    # attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    # values = ["968548423", "kimball", "Mechanical Engineering"]

    attrs = ["Subject", "Title", "Publisher"]
    values = [
        "", "Man who would be king and other stories",
        "Oxford university press, incorporated"
    ]

    # attrs = ["Iap Category Name", "Person Name", "Person Email"]
    # # values = ["", "Meghan Kenney", "*****@*****.**"]
    # values = ["Engineering", "", ""]

    # attrs = ["Building Name Long", "Ext Gross Area", "Building Room", "Room Square Footage"]
    # values = ["", "", "", ""]

    # attrs = ["c_name", "c_phone", "n_name", "l_tax"]
    # values = ["Customer#000000001", "25-989-741-2988", "BRAZIL", ""]

    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    first_mjp = None
    most_likely_key = None
    # enumerate() supplies the running index. The hand-rolled counter was
    # never incremented, so every view used to be labelled "JP: 0".
    search = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    for i, (mjp, attrs_project) in enumerate(search):
        print("JP: " + str(i))

        # NOTE(review): the projection is not used further down. The call is
        # kept because dpu.project may raise or have side effects -- confirm
        # before removing it.
        proj_view = dpu.project(mjp, attrs_project)

        if i == 0:
            # The first view is the reference all later views are checked
            # against; its top-ranked key is used for every comparison.
            first_mjp = mjp
            most_likely_keys_info = mva.most_likely_key(first_mjp)
            most_likely_key = most_likely_keys_info[0][0]
        missing_keys, non_unique_df1, non_unique_df2, conflicting_pair = \
            mva.inconsistent_value_on_key(first_mjp, mjp, key=most_likely_key)
        if len(conflicting_pair) > 0:
            print(str(conflicting_pair))

Example #3

0

Show file

File: app.py Project: Florents-Tselai/aurum-datadiscovery

def findvs():
    """Start a new view search from the posted payload and return the first view.

    On a POST request the JSON body's ``payload`` field (itself a JSON
    string) is decoded into a mapping. The first character of each key is
    read as the row index and the third character as the column index:
    row 0 supplies attribute names, any other row supplies sample values.
    A fresh generator over ``dod.virtual_schema_iterative_search`` is stored
    in the module-level ``view_generator``, its first result is projected
    with ``dpu.project``, and the projection is stored in the module-level
    ``matview``.

    Returns:
        A ``jsonify`` response with the HTML of the first 10 rows
        (``view``), the result of ``obtain_view_analysis`` (``analysis``)
        and the view metadata (``joingraph``). If the search yields no view
        at all, ``{"view": "no-more-views", "analysis": "no"}`` is returned
        instead. For non-POST requests the function falls through and
        returns ``None``.
    """
    global view_generator, matview

    if request.method == 'POST':
        json_request = request.get_json()
        payload = json.loads(json_request['payload'])

        # Prepare input parameters to DoD: one slot per row-0 key.
        num_attributes = sum(1 for k in payload if k[0] == "0")
        list_attributes = [""] * num_attributes
        # Template for samples, 1 row only for now.
        list_samples = [""] * num_attributes
        for k, v in payload.items():
            # NOTE(review): only a single digit is read for each index, so
            # this presumably breaks beyond 10 rows/columns -- confirm the
            # key format with the client before generalizing.
            row_idx = int(k[0])
            col_idx = int(k[2])
            if row_idx == 0:
                list_attributes[col_idx] = v
            else:
                list_samples[col_idx] = v

        # Always create a new view_generator; we assume these are new views.
        view_generator = iter(
            dod.virtual_schema_iterative_search(list_attributes, list_samples,
                                                {}))
        try:
            mvs, attrs_to_project, view_metadata = next(view_generator)
        except StopIteration:
            # The search produced nothing; answer with the same
            # "no-more-views" payload instead of letting StopIteration
            # escape the handler.
            print("no views found for the requested schema")
            return jsonify({"view": "no-more-views", "analysis": 'no'})

        proj_view = dpu.project(mvs, attrs_to_project)
        analysis = obtain_view_analysis(proj_view)
        sample_view = proj_view.head(10)
        html_dataframe = sample_view.to_html()

        matview = proj_view

        return jsonify({
            "view": html_dataframe,
            "analysis": analysis,
            "joingraph": view_metadata
        })

Example #4

0

Show file

def next_view():
    """Advance the module-level ``view_generator`` and return the next view.

    Only POST requests are served; anything else falls through and returns
    ``None``. The next result is projected with ``dpu.project``, stored in
    the module-level ``matview``, and returned as a ``jsonify`` response
    carrying the HTML of its first 10 rows (``view``), the result of
    ``obtain_view_analysis`` (``analysis``) and the view metadata
    (``joingraph``). When the generator is exhausted the response is
    ``{"view": "no-more-views", "analysis": "no"}``.
    """
    global view_generator, matview

    if request.method != 'POST':
        return None

    try:
        joined, projection_attrs, join_graph = next(view_generator)
    except StopIteration:
        print("finished exploring views")
        return jsonify({"view": "no-more-views", "analysis": 'no'})

    projected = dpu.project(joined, projection_attrs)
    view_analysis = obtain_view_analysis(projected)
    preview_html = projected.head(10).to_html()

    # Publish the full projection only after analysis and rendering succeeded.
    matview = projected

    response_body = {
        "view": preview_html,
        "analysis": view_analysis,
        "joingraph": join_graph
    }
    return jsonify(response_body)

Example #5

0

Show file

File: evaluate-dod.py Project: zyxzyxzyx/aurum-datadiscovery

def run_dod(dod, attrs, values, output_path, max_hops=2, name=None):
    """Run one DoD search, optionally store every view, and print statistics.

    Each result yielded by ``dod.virtual_schema_iterative_search`` is
    projected with ``dpu.project``. When ``output_path`` is not ``None`` the
    projection is written as CSV to ``<output_path>/view_<i>``. After the
    search, the elapsed wall-clock time is stored in
    ``perf_stats['et_runtime']`` and the statistics are printed between
    ``#$#`` marker lines.

    Args:
        dod: object exposing ``virtual_schema_iterative_search``; it receives
            a dict (``perf_stats``) as third positional argument.
        attrs: attribute names forwarded to the search.
        values: sample values forwarded to the search.
        output_path: directory prefix for the CSV files, or ``None`` to skip
            writing.
        max_hops: forwarded to the search unchanged.
        name: label printed on the first marker line.
    """
    # NOTE(review): this mapping is filled but never read or returned here.
    view_metadata_mapping = dict()
    perf_stats = dict()
    total_views = 0
    st_runtime = time.time()
    search = dod.virtual_schema_iterative_search(
        attrs,
        values,
        perf_stats,
        max_hops=max_hops,
        debug_enumerate_all_jps=False)
    for mjp, attrs_project, metadata in search:
        proj_view = dpu.project(mjp, attrs_project)

        if output_path is not None:
            view_path = output_path + "/view_" + str(total_views)
            proj_view.to_csv(view_path, encoding='latin1',
                             index=False)  # always store this
            # store metadata associated to that view
            view_metadata_mapping[view_path] = metadata

        total_views += 1
    perf_stats['et_runtime'] = time.time() - st_runtime

    print("#$# " + str(name))
    print("#$# ")
    print("")
    pp.pprint(perf_stats)
    # The search may finish without recording these counters (for example
    # when it yields nothing); skip the totals rather than die with a
    # KeyError after all the work has been done.
    if 'num_join_graphs_per_candidate_group' in perf_stats:
        total_join_graphs = sum(
            perf_stats['num_join_graphs_per_candidate_group'])
        print("Total join graphs: " + str(total_join_graphs))
    if 'materializable_join_graphs' in perf_stats:
        total_materializable_join_graphs = sum(
            perf_stats['materializable_join_graphs'])
        print("Total materializable join graphs: " +
              str(total_materializable_join_graphs))
    print("")
    print("Total views: " + str(total_views))
    print("#$# ")

Example #6

0

Show file

File: dod.py Project: nato16/aurum-datadiscovery

def main(args):
    """Command-line driver: load the model, run one search, print/store views.

    Reads ``model_path``, ``separator``, ``list_attributes``,
    ``list_values``, ``output_path``, ``full_view`` and ``interactive`` from
    ``args``. ``list_attributes`` and ``list_values`` are ``;``-separated
    strings and must split into the same number of items.

    For each result of ``dod.virtual_schema_iterative_search`` the first 10
    rows of the projection and the metadata are printed. When
    ``args.output_path`` is truthy the projection is written to
    ``<output_path>/view_<i>`` and, if ``args.full_view`` is truthy, the
    unprojected join result to ``<output_path>/raw_view_<i>``.

    Raises:
        ValueError: if the number of attributes and values differ.
    """
    model_path = args.model_path
    separator = args.separator

    store_client = StoreHandler()
    network = fieldnetwork.deserialize_network(model_path)
    dod = DoD(network=network,
              store_client=store_client,
              csv_separator=separator)

    attrs = args.list_attributes.split(";")
    values = args.list_values.split(";")
    print(attrs)
    print(values)
    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would let mismatched input through.
    if len(attrs) != len(values):
        raise ValueError(
            "list_attributes and list_values must have the same number of "
            "';'-separated items (got %d and %d)" % (len(attrs), len(values)))

    search = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    for i, (mjp, attrs_project, metadata) in enumerate(search):
        print("JP: " + str(i))
        proj_view = dpu.project(mjp, attrs_project)
        print(str(proj_view.head(10)))
        print("Metadata")
        print(metadata)
        if args.output_path:
            if args.full_view:
                mjp.to_csv(args.output_path + "/raw_view_" + str(i),
                           encoding='latin1',
                           index=False)
            proj_view.to_csv(args.output_path + "/view_" + str(i),
                             encoding='latin1',
                             index=False)  # always store this
        # The flag is compared against the literal string "True".
        if args.interactive == "True":
            print("")
            input("Press any key to continue...")

Example #7

0

Show file

def test_e2e(dod,
             attrs,
             values,
             number_jps=5,
             output_path=None,
             full_view=False,
             interactive=False,
             run_4c=False):
    """Run the core DoD search end to end and print performance statistics.

    Each result of ``dod.virtual_schema_iterative_search`` is projected with
    ``dpu.project``. When ``output_path`` is not ``None`` the projection is
    written to ``<output_path>/view_<i>`` (and, with ``full_view``, the
    unprojected join result to ``<output_path>/raw_view_<i>``). After the
    search, the elapsed time is stored in ``perf_stats['runtime']`` and the
    statistics are printed.

    By default the process then terminates via ``sys.exit()``, exactly as
    before. With ``run_4c=True`` the function instead continues into the 4C
    stage: ``v4c.main(output_path)`` is called and the size of each
    compatible / contained / complementary / contradictory group is printed.

    Args:
        dod: object exposing ``virtual_schema_iterative_search``; it receives
            a dict (``perf_stats``) as third positional argument.
        attrs: attribute names forwarded to the search.
        values: sample values forwarded to the search.
        number_jps: currently unused. The ``i > number_jps`` cut-off it used
            to drive is disabled; the parameter is kept so existing callers
            keep working.
        output_path: directory prefix for the CSV files, or ``None`` to skip
            writing.
        full_view: also store the unprojected join result.
        interactive: wait on ``input`` after every view.
        run_4c: opt in to the 4C stage instead of exiting.
            NOTE(review): this stage was unreachable before (it sat behind
            an unconditional ``exit()``); verify it, in particular with
            ``output_path=None``, before relying on it.

    Raises:
        SystemExit: after the statistics are printed, unless ``run_4c`` is
            true.
    """
    import sys  # local import: only needed for the explicit exit below

    ###
    # Run Core DoD
    ###
    # NOTE(review): this mapping is filled but never read or returned here.
    view_metadata_mapping = dict()
    i = 0
    perf_stats = dict()
    st_runtime = time.time()
    for mjp, attrs_project, metadata in dod.virtual_schema_iterative_search(
            attrs, values, perf_stats, max_hops=2,
            debug_enumerate_all_jps=False):
        print("JP: " + str(i))

        proj_view = dpu.project(mjp, attrs_project)

        if output_path is not None:
            if full_view:
                raw_view_path = output_path + "/raw_view_" + str(i)
                mjp.to_csv(raw_view_path, encoding='latin1', index=False)
            view_path = output_path + "/view_" + str(i)
            proj_view.to_csv(view_path, encoding='latin1',
                             index=False)  # always store this
            # store metadata associated to that view
            view_metadata_mapping[view_path] = metadata

        i += 1

        if interactive:
            print("")
            input("Press any key to continue...")
    et_runtime = time.time()
    perf_stats['runtime'] = (et_runtime - st_runtime)
    pp.pprint(perf_stats)
    if 'num_join_graphs_per_candidate_group' in perf_stats:
        total_join_graphs = sum(
            perf_stats['num_join_graphs_per_candidate_group'])
        print("Total join graphs: " + str(total_join_graphs))
    if 'materializable_join_graphs' in perf_stats:
        total_materializable_join_graphs = sum(
            perf_stats['materializable_join_graphs'])
        print("Total materializable join graphs: " +
              str(total_materializable_join_graphs))

    print("Total views: " + str(i))

    if not run_4c:
        # Same default outcome as before (SystemExit), but via sys.exit():
        # the bare exit() helper is installed by the site module for
        # interactive use and is not meant for programs.
        sys.exit()

    ###
    # Run 4C
    ###
    groups_per_column_cardinality = v4c.main(output_path)

    for k, v in groups_per_column_cardinality.items():
        compatible_groups = v['compatible']
        contained_groups = v['contained']
        complementary_group = v['complementary']
        contradictory_group = v['contradictory']

        print("Compatible views: " + str(len(compatible_groups)))
        print("Contained views: " + str(len(contained_groups)))
        print("Complementary views: " + str(len(complementary_group)))
        print("Contradictory views: " + str(len(contradictory_group)))