Example no. 1
# shared imports for the snippets below (parse_dataset is a project module;
# pds, bmm, pt, schm and DEBUG_MODE are project-level aliases/constants used by later snippets)
import json
import os
import pathlib
import pprint
import random
import time
from operator import itemgetter

import pandas as pd

import parse_dataset


def gather_statistics(schema_set, datasources_with_tag, stats_output_p, datasets_input_p):

    for source_name in datasources_with_tag:
        all_stats = {}

        # skip sources whose stats file has already been written
        file_path = stats_output_p + source_name + '.json'
        if pathlib.Path(file_path).exists():
            continue

        path = datasets_input_p + source_name + '.csv'
        dataset = pd.read_csv(path, index_col=0, header=0)
        dataset = df_rename_cols(dataset)

        attr_schema = schema_set[source_name]
        df_columns = list(dataset.columns.values)

        # print(attr_schema)
        attr_schema = [{'name': parse_dataset.clean_name(attr['name'], False, False)} for attr in attr_schema]
        df_columns = [parse_dataset.clean_name(attr, False, False) for attr in df_columns]

        cols_to_delete = find_attrs_to_delete(attr_schema, df_columns)
        dataset = df_delete_cols(dataset, cols_to_delete)

        schema = schema_set[source_name]
        attributes_list = [parse_dataset.clean_name(attr['name'], False, False) for attr in schema]
        attributes_list = [item for item in attributes_list if item not in cols_to_delete]

        print(source_name, attributes_list, cols_to_delete)

        for attr in attributes_list:
            stat, groups, uniques = groupby_unique(attr, dataset)

            all_stats[attr] = stat

            # TODO more types of stats needed

        with open(file_path, 'w') as fp:
            json.dump(all_stats, fp, sort_keys=True, indent=2)

    return
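gather_statistics skips any source whose stats file already exists, so repeated runs only fill in what is missing. A minimal, self-contained sketch of that write-once pattern (dump_stats_once and its arguments are hypothetical names, not part of the project):

import json
import pathlib

def dump_stats_once(stats, out_path):
    # skip the work if a previous run already produced this file
    path = pathlib.Path(out_path)
    if path.exists():
        return False
    with path.open('w') as fp:
        json.dump(stats, fp, sort_keys=True, indent=2)
    return True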
def load_prematching_metadata(p, m, pds):
    # map each topic to the datasources it appears in, with the topic's synsets
    all_topics = {}

    with open(p.enriched_topics_json_dir, 'r') as topic_contexts_f:
        topic_contexts = json.load(topic_contexts_f)

    # TODO get all available topics
    for source_name in m.datasources_with_tag:
        source_name = pds.clean_name(source_name, False, False)
        for src_top in topic_contexts[source_name]:
            if src_top not in all_topics:
                all_topics[src_top] = {}

            all_topics[src_top][source_name] = topic_contexts[source_name][src_top]  # synsets

    with open(p.enriched_attrs_json_dir, 'r') as attrs_contexts_f:
        attrs_contexts = json.load(attrs_contexts_f)

    return all_topics, attrs_contexts, topic_contexts
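The all_topics structure inverts the per-source topic contexts into a per-topic view. A toy illustration of the inversion with made-up data:

topic_contexts = {
    'parks': {'trees': ['tree.n.01']},
    'streets': {'trees': ['tree.n.01'], 'lighting': ['light.n.02']},
}
all_topics = {}
for source, topics in topic_contexts.items():
    for topic, synsets in topics.items():
        all_topics.setdefault(topic, {})[source] = synsets
# all_topics == {'trees': {'parks': [...], 'streets': [...]},
#                'lighting': {'streets': [...]}}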
def load_per_source_metadata(p, m, datasources, source_name, pds, bmm):
    # column names come from the saved stats file rather than the raw CSV
    with open(p.dataset_stats + source_name + '.json', 'r') as stats_f:
        stats = json.load(stats_f)
    df_columns = list(stats.keys())

    schema = m.schema_set[source_name]
    metadata = m.dataset_metadata_set[source_name]['tags']
    dataset = pd.DataFrame()  # values are not needed here, only the metadata

    datasources[source_name] = (source_name, dataset, schema, metadata)

    print(source_name)

    # initialization schema matching
    tags_list = [tag['display_name'] for tag in metadata]
    # use enriched tags instead
    with open(p.enriched_topics_json_dir, 'r') as tags_list_enriched_f:
        tags_list_enriched = json.load(tags_list_enriched_f)
    tags_list_enriched_dataset = tags_list_enriched[source_name]
    tags_list_enriched_names = list(tags_list_enriched[source_name].keys())
    # TODO add non-overlapping homonyms to context

    attributes_list = [
        pds.clean_name(attr['name'], False, False) for attr in schema
    ]
    cols_to_delete = bmm.find_attrs_to_delete(attributes_list, df_columns)
    attributes_list = [
        item for item in attributes_list if item not in cols_to_delete
    ]

    return tags_list_enriched_dataset, tags_list_enriched_names, attributes_list, schema, datasources, stats
Example no. 4
def print_metadata_head(source_name, dataset, schema, metadata):
    print('dataset_name:', source_name)
    print('dataset_values.head \n', dataset.head())
    print('dataset_schema[0]', parse_dataset.clean_name(schema[0]['name'], False, False))
    print('dataset_schema[1]', parse_dataset.clean_name(schema[1]['name'], False, False))
    print('dataset_tags[0]', metadata[0]['display_name'])
Example no. 5
def df_rename_cols(dataset):
    # normalize every column name with parse_dataset.clean_name
    cleaned = [parse_dataset.clean_name(col) for col in dataset.columns]
    dataset.rename(columns=dict(zip(dataset.columns, cleaned)), inplace=True)
    return dataset
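A quick demonstration of the renaming behaviour, with a stand-in cleaner since parse_dataset.clean_name is project-specific:

import pandas as pd

def clean_name(name):  # stand-in for parse_dataset.clean_name
    return name.strip().lower().replace(' ', '_')

df = pd.DataFrame({' Tree ID ': [1, 2], 'Common Name': ['elm', 'oak']})
df = df.rename(columns={c: clean_name(c) for c in df.columns})
# df.columns is now ['tree_id', 'common_name']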
Example no. 6
    datasets_path = './thesis_project_dataset_clean/'
    datasources = {}
    for source_name in datasources_with_tag:
        dataset = pd.read_csv(datasets_path + source_name + '.csv', index_col=0, header=0)
        schema = schema_set[source_name]
        metadata = dataset_metadata_set[source_name]['tags']

        dataset = df_rename_cols(dataset)

        datasources[source_name] = (source_name, dataset, schema, metadata)
        print_metadata_head(source_name, dataset, schema, metadata)

        # initialization schema matching
        tags_list = [tag['display_name'] for tag in metadata]
        attributes_list = [parse_dataset.clean_name(attr['name'], False, False) for attr in schema]
        sim_matrix = build_local_similarity_matrix(tags_list, attributes_list)
        sim_frame = pd.DataFrame(data=sim_matrix, columns=attributes_list, index=tags_list)
        print(sim_frame.to_string())

        # find the best-matching attribute for the 'trees' tag
        tree_matches = sim_frame.loc['trees']
        attrs = list(sim_frame.columns.values)
        max_score = 0
        arg_max_score = None
        arg_i = -1
        for attr_i in range(len(attrs)):
            attr = attrs[attr_i]
            score = sim_frame.loc['trees', attr]
            if score > max_score:
                max_score = score
                arg_max_score = attr
                arg_i = attr_i
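The manual argmax loop above can also be expressed directly with pandas: Series.idxmax returns the first column label holding the maximum, matching the loop's first-strictly-greater semantics (the loop additionally leaves arg_max_score as None when every score is zero). A self-contained illustration with toy scores:

import pandas as pd

sim_frame = pd.DataFrame([[0.1, 0.8, 0.8]],
                         index=['trees'],
                         columns=['ward', 'tree_id', 'species'])
arg_max_score = sim_frame.loc['trees'].idxmax()  # 'tree_id' (first maximum)
max_score = sim_frame.loc['trees'].max()         # 0.8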
def perform_matching(p, dataset_metadata_set, schema_set, datasources_with_tag,
                     kb, params):
    comparison_count = 0
    comparison_count_o = [comparison_count]
    sim_matrices = {}

    for source_name in datasources_with_tag:
        t2 = time.time()

        dataset = pd.read_csv(p.datasets_path + source_name + '.csv',
                              index_col=0,
                              header=0)
        dataset = bmm.df_rename_cols(dataset)

        schema = schema_set[source_name]
        metadata = dataset_metadata_set[source_name]['tags']
        schema = [{
            'name': pds.clean_name(attr['name'], False, False)
        } for attr in schema]

        schema_attr_names = []
        for attr in schema:
            # attr['name'] = pds.clean_name(attr['name'], False, False)
            schema_attr_names.append(attr['name'])
        schema_attr_names.sort()

        for concept in kb:
            for datasource in kb[concept]['matches']:
                src_attr = kb[concept]['matches'][datasource]['attribute']
                src_vals = kb[concept]['matches'][datasource]['example_values']
                # do not match with self
                if source_name == datasource:
                    continue
                # do not match if no populated values
                if src_vals is None:
                    continue

                src_data = pd.DataFrame({src_attr: src_vals})
                print("[concept:%s, datasource:%s(%s) <=> dataset:%s]" %
                      (concept, datasource, src_attr, source_name))

                # groupby values for each column and obtain count for each unique value, then multiply counts when comparison succeeds
                tar_schema = list(dataset.columns.values)
                cols_to_delete = bmm.find_attrs_to_delete(schema, tar_schema)
                tar_schema = [
                    item for item in tar_schema if item not in cols_to_delete
                ]

                attrs_stat = {}
                max_len = 0
                for attr in tar_schema:
                    # TODO save this output to file for later use
                    # stat, groups, uniques = bmm.groupby_unique(attr, dataset)
                    stat, uniques = bmm.get_attr_stats(p.dataset_stats,
                                                       source_name, attr)
                    uniques.sort()

                    # save coded values on the schema entry for later use;
                    # entries here only carry 'name', so use .get()
                    try:
                        arg_i = schema_attr_names.index(attr)
                        if schema[arg_i].get('domain') is None:
                            schema[arg_i]['coded_values'] = uniques
                            schema[arg_i]['domain'] = 'coded_values_groupby'
                    except ValueError:
                        # attr is not part of the declared schema
                        pass

                    attrs_stat[attr] = (stat, uniques)
                    if len(uniques) > max_len:
                        max_len = len(uniques)
                tar_df = pd.DataFrame()
                for attr in tar_schema:
                    uniques = attrs_stat[attr][1]
                    attrs_stat[attr] = attrs_stat[attr][0]
                    attr_vals = uniques + ['None'] * (max_len - len(uniques))
                    tar_df[attr] = attr_vals

                # collect stats first, also compare data types
                src_datatype = kb[concept]['matches'][datasource]['data_type']
                attr_schema = schema_set[datasource]
                cols_to_delete = bmm.compare_datatypes(src_datatype,
                                                       attr_schema, tar_schema)
                tar_df = bmm.df_delete_cols(tar_df, cols_to_delete)

                # TODO datatypes must match! need to move to a different matcher
                sim_matrix, confidence = bmm.match_table_by_values(
                    src_data, tar_df, params['match_threshold'],
                    comparison_count_o, attrs_stat, params['sample_ratio'],
                    params['sample_min_count'], params['sample_max_count'])

                src_names = list(src_data.columns.values)
                tar_names = list(tar_df.columns.values)
                sim_matrix2 = schm.matcher_name_matrix(src_names, tar_names)

                sim_matrix = schm.combine_scores_matrix(
                    sim_matrix, sim_matrix2, params['proportions'])

                print(sim_matrix.to_string())

                # save similarity matrices
                filename = '%s|%s|%s||%s.csv' % (concept, datasource, src_attr,
                                                 source_name)
                sim_matrices[filename] = sim_matrix

        t3 = time.time()
        total = t3 - t2
        print('time %s sec' % (total))
        print('-----')

    return kb, sim_matrices
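Inside perform_matching, the unique values of each target attribute are padded to a common length so they fit in one rectangular DataFrame. The padding step in isolation, with illustrative data:

import pandas as pd

uniques_by_attr = {'species': ['ash', 'elm', 'oak'], 'ward': ['north', 'south']}
max_len = max(len(vals) for vals in uniques_by_attr.values())
tar_df = pd.DataFrame({
    attr: vals + ['None'] * (max_len - len(vals))
    for attr, vals in uniques_by_attr.items()
})
# shorter columns are filled with the string 'None', as in the code above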
def initialize_matching(p, input_topics, dataset_metadata_set, schema_set,
                        datasources_with_tag, reverse_index, kb):

    datasources = {}
    for source_name in datasources_with_tag:
        # column names come from the saved stats file rather than the raw CSV
        with open(p.dataset_stats + source_name + '.json', 'r') as stats_f:
            stats = json.load(stats_f)
        df_columns = list(stats.keys())

        schema = schema_set[source_name]
        metadata = dataset_metadata_set[source_name]['tags']
        dataset = pd.DataFrame()

        # dataset = bmm.df_rename_cols(dataset)

        datasources[source_name] = (source_name, dataset, schema, metadata)

        print(source_name)
        if DEBUG_MODE:
            bmm.print_metadata_head(source_name, dataset, schema, metadata)

        # initialization schema matching
        tags_list = [tag['display_name'] for tag in metadata]
        attributes_list = [
            pds.clean_name(attr['name'], False, False) for attr in schema
        ]
        cols_to_delete = bmm.find_attrs_to_delete(attributes_list, df_columns)
        attributes_list = [
            item for item in attributes_list if item not in cols_to_delete
        ]

        sim_matrix = bmm.build_local_similarity_matrix(tags_list,
                                                       attributes_list)
        sim_frame = pd.DataFrame(data=sim_matrix,
                                 columns=attributes_list,
                                 index=tags_list)

        # print(sim_frame.to_string())

        # TODO during new concepts stage, add second best tag and so on
        attrs = list(sim_frame.columns.values)

        # if the stats file is empty there is nothing to match for this source
        if len(attrs) == 0:
            continue

        for topic in reverse_index[source_name]:

            max_score = 0
            arg_max_score = None
            arg_i = -1
            for attr_i in range(len(attrs)):
                attr = attrs[attr_i]
                score = sim_frame.loc[topic, attr]
                if score > max_score:
                    max_score = score
                    arg_max_score = attr
                    arg_i = attr_i

            # no attribute scored above zero for this topic
            if arg_i == -1:
                continue

            arg_max_examples_vals = None
            example_value = None
            if schema[arg_i].get('domain') is not None:
                arg_max_examples_vals = schema[arg_i]['coded_values']
                arg_max_examples_vals.sort()
                if len(arg_max_examples_vals) > 0:
                    example_value = arg_max_examples_vals[0]
            else:
                # loading from stats file
                _, uniques = bmm.get_attr_stats(p.dataset_stats, source_name,
                                                attrs[arg_i])
                # stat, _, uniques = bmm.groupby_unique(attrs[arg_i], dataset)

                uniques.sort()
                schema[arg_i]['coded_values'] = uniques
                arg_max_examples_vals = schema[arg_i]['coded_values']

                if len(arg_max_examples_vals) > 0:
                    print('arg_max_examples_vals', arg_max_examples_vals[0])

                schema[arg_i]['domain'] = 'coded_values_groupby'

            print('best match:', topic, arg_max_score, max_score,
                  example_value)

            kb_match_entry = {
                'concept': topic,
                'datasource': source_name,
                'attribute': arg_max_score,
                'match_score': max_score,
                'example_values': arg_max_examples_vals,
                'data_type': schema[arg_i]['data_type']
            }

            bmm.update_kb_json(kb, kb_match_entry)
        print('-----')

    # done initialization

    return kb, datasources_with_tag, schema_set
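bmm.update_kb_json itself is not shown in this section; below is a minimal sketch that is merely consistent with how perform_matching reads the kb (kb[concept]['matches'][datasource]), offered as an assumption rather than the actual implementation:

def update_kb_json(kb, entry):
    # index the match under its concept, then under its datasource
    # (sketch only; the real bmm.update_kb_json may store more fields)
    concept = entry['concept']
    kb.setdefault(concept, {'matches': {}})
    kb[concept]['matches'][entry['datasource']] = {
        'attribute': entry['attribute'],
        'match_score': entry['match_score'],
        'example_values': entry['example_values'],
        'data_type': entry['data_type'],
    }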
def create_attributes_contexts(datasets, m, p, r):
    contexts = {}

    for dataset in datasets:
        contexts[dataset] = {}

        schema = m.schema_set[dataset]

        dataset_existing_tags = m.dataset_metadata_set[dataset]['tags']
        dataset_existing_groups = m.dataset_metadata_set[dataset]['groups']
        dataset_notes = m.dataset_metadata_set[dataset]['notes']

        # concatenate the group descriptions into one context string
        desc = ''
        for group in dataset_existing_groups:
            desc += ' ' + group['description']

        dataset_existing_tags = [
            tag['display_name'] for tag in dataset_existing_tags
        ]
        dataset_existing_groups = [
            group['display_name'] for group in dataset_existing_groups
        ]
        dataset_notes = [
            word for word in dataset_notes.split() if "http://" not in word
        ]

        notes = ' '.join(dataset_notes)

        with open(p.dataset_stats + dataset + '.json', 'r') as stats_f:
            stats = json.load(stats_f)

        df_columns = list(stats.keys())

        attributes_list = [
            pds.clean_name(attr['name'], False, False) for attr in schema
        ]
        cols_to_delete = bmm.find_attrs_to_delete(attributes_list, df_columns)
        attributes_list = [
            item for item in attributes_list if item not in cols_to_delete
        ]

        for attr in attributes_list:
            # other_attrs = attributes_list.copy()
            # other_attrs.remove(attr)
            other_attrs = []

            # list() so the values can be random.sample()d below
            attr_values = list(stats[attr].keys())
            # TODO get average of val length, place attr vals in notes if length is long

            length = 0
            if len(attr_values) > 0:

                if len(attr_values) > r.vals_truncate_sample:

                    num_to_select = r.vals_truncate_sample
                    attr_values = random.sample(attr_values, num_to_select)

                length = sum(len(val) for val in attr_values) / len(attr_values)

            if length >= r.sentence_threshold:
                # long values read like sentences; fold them into the notes
                notes = notes + '. ' + '. '.join(attr_values)
                # print('>>>>>', notes)
            else:
                other_attrs.extend(attr_values)
                # print('>>>>>', other_attrs)

            pt.enrich_homonyms(dataset, attr, desc, notes, other_attrs)

    m.dataset_attributes_contexts = contexts
    return contexts
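The sentence-length heuristic in create_attributes_contexts routes long values into the notes text and short ones into the attribute context. The decision in isolation, with illustrative values and thresholds standing in for r.vals_truncate_sample and r.sentence_threshold:

import random

attr_values = ['Planted along the east boulevard in 1998',
               'Removed after storm damage in 2004']
vals_truncate_sample = 100  # illustrative threshold
sentence_threshold = 20     # illustrative threshold

if len(attr_values) > vals_truncate_sample:
    attr_values = random.sample(attr_values, vals_truncate_sample)

avg_len = sum(len(v) for v in attr_values) / len(attr_values)
if avg_len >= sentence_threshold:
    print('treat values as sentences, append to notes')
else:
    print('treat values as context terms')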
def initialize_matching(p, m, r):

    all_topics, attrs_contexts, topic_contexts = load_prematching_metadata(
        p, m, pds)

    wordnet = pt.load_dict()

    pair_dict_all = {}

    datasources = {}
    for source_name in m.datasources_with_tag:
        tags_list_enriched_dataset, tags_list_enriched_names, attributes_list, schema, _, _ = load_per_source_metadata(
            p, m, datasources, source_name, pds, bmm)

        score_names = m.score_names

        sim_matrix1 = build_local_similarity_matrix(tags_list_enriched_dataset,
                                                    attributes_list, r)
        # TODO build_local_similarity_matrix using context

        if source_name not in attrs_contexts:
            print('ERROR: DATASOURCE NOT FOUND', source_name, '\n', '-----')
            continue
        attribute_contexts = attrs_contexts[source_name]

        # topic_contexts is all datasets, attribute_contexts is per dataset
        sim_matrix2, sim_matrix3, pair_dict = build_local_context_similarity_matrix(
            topic_contexts, attribute_contexts, source_name, wordnet,
            all_topics)

        sim_frame1 = pd.DataFrame(data=sim_matrix1,
                                  columns=attributes_list,
                                  index=tags_list_enriched_names)
        sim_frame2 = pd.DataFrame(data=sim_matrix2,
                                  columns=attributes_list,
                                  index=tags_list_enriched_names)
        sim_frame3 = pd.DataFrame(data=sim_matrix3,
                                  columns=attributes_list,
                                  index=tags_list_enriched_names)

        # chance of getting external topics
        pair_dict_all.update(pair_dict)

        # print(sim_frame.to_string())

        attrs = list(sim_frame1.columns.values)

        # if stats file is empty
        if len(attrs) == 0:
            print('ERROR: empty dataset', source_name, '\n', '-----')
            continue

        # get example values
        for attr_i in range(len(schema)):
            # print(attr_i)
            if schema[attr_i].get('domain') is None:

                attr_name = schema[attr_i]['name']
                attr_name = pds.clean_name(attr_name, False, False)

                # loading from stats file
                _, uniques = bmm.get_attr_stats(p.dataset_stats, source_name,
                                                attr_name)
                if uniques is None:
                    continue

                # stat, _, uniques = bmm.groupby_unique(attrs[arg_i], dataset)

                uniques.sort()
                schema[attr_i]['coded_values'] = uniques
                # arg_max_examples_vals = schema[attr_i]['coded_values']

                # if len(arg_max_examples_vals) > 0: print('arg_max_examples_vals', arg_max_examples_vals[0])

                schema[attr_i]['domain'] = 'coded_values_groupby'

        # init kb
        build_kb_json(tags_list_enriched_names, source_name, m)

        # during new concepts stage, add second best tag and so on

        for attr_i in range(len(attrs)):
            scores1 = [[
                attr_i, attrs[attr_i], sim_frame1.loc[topic, attrs[attr_i]],
                topic
            ] for topic in tags_list_enriched_names]
            scores2 = [[
                attr_i, attrs[attr_i], sim_frame2.loc[topic, attrs[attr_i]],
                topic
            ] for topic in tags_list_enriched_names]
            scores3 = [[
                attr_i, attrs[attr_i], sim_frame3.loc[topic, attrs[attr_i]],
                topic
            ] for topic in tags_list_enriched_names]

            score_len = len(scores1[0]) if scores1 else 0

            # for topic in tags_list_enriched_names:
            #     scores1 = [[attr_i, attrs[attr_i], sim_frame1.loc[topic, attrs[attr_i]]] for attr_i in range(len(attrs))]
            #     scores2 = [[attr_i, attrs[attr_i], sim_frame2.loc[topic, attrs[attr_i]]] for attr_i in range(len(attrs))]

            scores = []
            for i in range(len(scores1)):
                if scores1[i][2] >= scores2[i][2]:
                    # print(scores2[i][2])
                    # print(scores3[i][2])
                    scores.append([
                        attr_i, attrs[attr_i], scores1[i][2], scores1[i][3],
                        score_names[0]
                    ])
                else:
                    # scale the context score by the third matrix when available
                    multiplier = scores3[i][2] if scores3[i][2] != 0 else 1.0
                    scores.append([
                        attr_i, attrs[attr_i],
                        min(scores2[i][2] * multiplier, 1.0), scores2[i][3],
                        score_names[1]
                    ])

            scores.sort(key=lambda tup: tup[2], reverse=True)
            scores_examples = []
            for attr_score in scores:
                # example_value = None
                # print(attr_score, attr_score[0], schema[attr_score[0]])
                if 'coded_values' not in schema[attr_score[0]]:
                    continue
                arg_max_examples_vals = schema[attr_score[0]]['coded_values']
                arg_max_examples_vals.sort()
                scores_examples.append(attr_score +
                                       [schema[attr_score[0]]['coded_values']])
                # print('here')

            top = 0
            output = []
            for score in scores_examples:
                if len(score) <= score_len:
                    # no example values were attached for this attribute
                    continue
                if score[2] > r.topic_to_attr_threshold and top <= r.topic_to_attr_count:
                    output.append(score)
                    top += 1
            # if len(output) == 0:
            #     output.append(scores_examples[0])

            # max_score = 0
            # arg_max_score = None
            # arg_i = -1
            # for attr_i in range(len(attrs)):
            #     attr = attrs[attr_i]
            #     score = sim_frame.loc[topic, attr]
            #     if score > max_score:
            #         max_score = score
            #         arg_max_score = attr
            #         arg_i = attr_i

            # if len(arg_max_examples_vals) > 0: example_value = arg_max_examples_vals[0]
            # print('best match:', topic, arg_max_score, max_score, example_value)

            # print('=====output', output)

            for match in output:
                kb_match_entry = {
                    'concept': match[3],
                    'datasource': source_name,
                    'attribute': match[1],
                    'match_score': match[2],
                    'example_values': match[5],
                    'data_type': schema[match[0]]['data_type'],
                    'score_name': match[4]
                }

                update_kb_json(m.kbs[source_name], kb_match_entry)

                # for debugging:
                kb_match_entry['example_values'] = kb_match_entry['example_values'][:5]
                pprint.pprint(kb_match_entry)

        print('-----')

    m.pair_dict_all = pair_dict_all

    # done initialization

    return True
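Note that the selection loop above keeps every score above r.topic_to_attr_threshold until top exceeds r.topic_to_attr_count, i.e. at most count + 1 entries per attribute. The rule in isolation, with stand-in values:

threshold, count = 0.5, 2            # stand-ins for r.topic_to_attr_threshold / r.topic_to_attr_count
scores = [0.9, 0.8, 0.7, 0.6, 0.4]   # already sorted descending

top, output = 0, []
for score in scores:
    if score > threshold and top <= count:
        output.append(score)
        top += 1
# output == [0.9, 0.8, 0.7]  (count + 1 = 3 entries)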
def initialize_matching_full(p, m, r):

    all_topics, attrs_contexts, topic_contexts = load_prematching_metadata(
        p, m, pds)

    wordnet = pt.load_dict()

    pair_dict_all = {}

    kb_curr_file = './outputs/kb_file.json'  # TODO <=
    if not os.path.exists(kb_curr_file):
        with open(kb_curr_file, 'w') as fp:
            json.dump({}, fp, sort_keys=True, indent=2)

    with open(kb_curr_file, 'r') as fp:
        m.kbs = json.load(fp)

    len_all_ds = len(m.datasources_with_tag)

    datasources = {}
    for source_name in m.datasources_with_tag:
        if source_name in m.kbs:
            print('--already have local mapping ', source_name)
            continue  # already have local mapping, skip

        _, _, attributes_list, schema, _, _ = load_per_source_metadata(
            p, m, datasources, source_name, pds, bmm)

        attributes_list_orig = [
            pds.clean_name(attr['name'], False, False) for attr in schema
        ]

        score_names = m.score_names

        # 700+ topics
        sim_matrix1 = build_local_similarity_matrix(all_topics,
                                                    attributes_list, r)

        if source_name not in attrs_contexts:
            print('ERROR: DATASOURCE NOT FOUND', source_name, '\n', '---!--')
            continue
        attribute_contexts = attrs_contexts[source_name]

        # topic_contexts is all datasets, attribute_contexts is per dataset
        sim_matrix2, pair_dict, sim_matrix3 = build_local_context_similarity_matrix_full(
            topic_contexts, attribute_contexts, source_name, wordnet,
            all_topics, p.server_ip)

        pprint.pprint(pair_dict)

        sim_frame1 = pd.DataFrame(data=sim_matrix1,
                                  columns=attributes_list,
                                  index=all_topics.keys())
        sim_frame2 = pd.DataFrame(data=sim_matrix2,
                                  columns=list(attribute_contexts.keys()),
                                  index=all_topics.keys())
        sim_frame3 = pd.DataFrame(data=sim_matrix3,
                                  columns=list(attribute_contexts.keys()),
                                  index=all_topics.keys())

        pair_dict_all.update(pair_dict)
        attrs = list(sim_frame1.columns.values)

        if len(attrs) == 0:
            print('ERROR: empty dataset', source_name, '\n', '-----')
            continue

        for attr_i in range(len(schema)):
            if schema[attr_i].get('domain') is None:

                attr_name = schema[attr_i]['name']
                attr_name = pds.clean_name(attr_name, False, False)

                _, uniques = bmm.get_attr_stats(p.dataset_stats, source_name,
                                                attr_name)
                if uniques is None:
                    continue

                uniques.sort()
                schema[attr_i]['coded_values'] = uniques

                schema[attr_i]['domain'] = 'coded_values_groupby'

        # init kb
        build_kb_json(all_topics, source_name, m)

        for attr_i in range(len(attrs)):
            scores1 = [[
                attr_i, attrs[attr_i], sim_frame1.loc[topic, attrs[attr_i]],
                topic, None
            ] for topic in all_topics]
            scores2 = [[
                attr_i, attrs[attr_i], sim_frame2.loc[topic,
                                                      attrs[attr_i]], topic,
                pair_dict[(source_name, pds.clean_name(attrs[attr_i]), topic)]
            ] for topic in all_topics]
            scores3 = [[
                attr_i, attrs[attr_i], sim_frame3.loc[topic, attrs[attr_i]],
                topic, None
            ] for topic in all_topics]

            score_len = len(scores1[0]) if scores1 else 0

            # TODO change score to weighted average
            weights = [0.40, 0.6, 0.0]  # TODO train the weights
            scores = []
            for i in range(len(scores1)):

                scores_tmp = [scores1[i][2], scores2[i][2], scores3[i][2]]
                # index of the strongest individual matcher, used to label the score
                index, _ = max(enumerate(scores_tmp), key=itemgetter(1))

                # if scores1[i][2] >= scores2[i][2]:
                #     scores.append([attr_i, attrs[attr_i],scores1[i][2],scores1[i][3], scores1[i][4], score_names[0]])
                # else:
                #     scores.append([attr_i, attrs[attr_i],scores2[i][2],scores2[i][3], scores2[i][4], score_names[1]])

                # weighted average of the three matcher scores
                score_tmp = (weights[0] * scores_tmp[0] +
                             weights[1] * scores_tmp[1] +
                             weights[2] * scores_tmp[2])

                scores.append([
                    attr_i, attrs[attr_i], score_tmp, scores2[i][3],
                    scores2[i][4], score_names[index]
                ])

            scores.sort(key=lambda tup: tup[2], reverse=True)
            scores_examples = []
            for attr_score in scores:
                # attr_splt = attr_score[1].split()
                ind = attributes_list_orig.index(attr_score[1])
                if 'coded_values' not in schema[ind]:
                    continue
                arg_max_examples_vals = schema[ind]['coded_values']
                arg_max_examples_vals.sort()
                scores_examples.append(attr_score + [arg_max_examples_vals])

            top = 0
            output = []
            for score in scores_examples:
                if len(score) <= score_len:
                    continue
                if score[2] > r.topic_to_attr_threshold and top <= r.topic_to_attr_count:
                    output.append(score)
                    top += 1

            for match in output:
                kb_match_entry = {
                    'concept': match[3],
                    'datasource': source_name,
                    'attribute': match[1],
                    'match_score': match[2],
                    'example_values': match[6],
                    'topic_source': match[5],
                    'data_type': schema[match[0]]['data_type'],
                    'score_name': match[4]
                }

                update_kb_json(m.kbs[source_name], kb_match_entry)

                # for debugging:
                kb_match_entry['example_values'] = kb_match_entry['example_values'][:5]
                pprint.pprint(kb_match_entry)

        with open(p.schema_p, 'w') as fp:
            json.dump(m.schema_set, fp, sort_keys=True, indent=2)

        with open(kb_curr_file, 'w') as fp:
            json.dump(m.kbs, fp, sort_keys=True, indent=2)

        print('done saving kb_file', source_name)
        print('^^^ PROGRESS', len(m.kbs) / len_all_ds)

    return
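initialize_matching_full replaces the either/or choice of the previous function with a fixed-weight average of the three matchers, while still recording which individual matcher was strongest. The arithmetic in isolation, with illustrative scores:

from operator import itemgetter

weights = [0.40, 0.6, 0.0]    # as in the code above; the TODO notes these should be trained
scores_tmp = [0.2, 0.5, 0.1]  # illustrative per-matcher scores
index, _ = max(enumerate(scores_tmp), key=itemgetter(1))    # index == 1: the second matcher wins
combined = sum(w * s for w, s in zip(weights, scores_tmp))  # 0.40*0.2 + 0.6*0.5 + 0.0*0.1 = 0.38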