Beispiel #1
0
    def row_similarity_ub(self, query, beta):
        """Score every stored table against ``query`` with a row-similarity bound.

        For each table name in ``self.tables`` other than ``query.name``:

        * if the name is in ``self.cache_tables``, the score stored under
          ``self.pre_state.state["row_sim_ub"]["score"]`` is reused as is
          (it is not multiplied by ``beta`` here);
        * otherwise the score is ``beta`` times the largest value returned by
          ``jaccard_similarity`` over the column pairs recorded in
          ``self.mappings`` for that table, or ``0`` when no mapping exists.

        Args:
            query: object exposing ``name`` (compared with the keys of
                ``self.tables``) and ``value`` (indexed by column name).
            beta: multiplier applied to freshly computed bounds.

        Returns:
            A list of ``(table_name, score)`` tuples, in the iteration order
            of ``self.tables``.
        """
        rank_candidate = []

        for name in self.tables.keys():
            if name == query.name:
                continue

            if name in self.cache_tables:
                cached_score = self.pre_state.state["row_sim_ub"]["score"][name]
                rank_candidate.append((name, cached_score))
                continue

            # self.mappings is keyed by the table name without its first six
            # characters.
            tname = name[6:]

            tableA = query.value
            tableB = self.tables[name]

            row_sim_ub = 0
            if tname in self.mappings:
                initial_mapping = self.mappings[tname]
                for key in initial_mapping.keys():
                    # NOTE(review): ``.keys()`` yields the index labels of the
                    # non-null rows of A, while B contributes its values; this
                    # looks like it was meant to be ``.values`` -- confirm
                    # before changing, since it alters the scores.
                    Avalue = tableA[key].dropna().keys()
                    Bvalue = tableB[initial_mapping[key]].dropna().values
                    try:
                        row_sim = jaccard_similarity(Avalue, Bvalue)
                    except Exception:
                        # Best effort: a pair that cannot be compared counts
                        # as no overlap. KeyboardInterrupt/SystemExit are no
                        # longer swallowed.
                        row_sim = 0
                    if row_sim > row_sim_ub:
                        row_sim_ub = row_sim

            rank_candidate.append((name, beta * row_sim_ub))

        return rank_candidate
Beispiel #2
0
    def mapping(tableA, tableB, Mpair=None, MpairR=None):
        """Rank column pairs of two tables by ``jaccard_similarity``.

        Every column of ``tableA`` is compared with every column of
        ``tableB`` that has the same dtype. Columns already present in
        ``Mpair`` (A side) or ``MpairR`` (B side) are skipped, as are columns
        named ``"Unnamed: 0"`` or whose name contains ``"index"``.

        NOTE(review): there is no ``self`` parameter although the definition
        is indented like a method -- presumably a ``@staticmethod`` or nested
        helper; the decorator is not visible here.

        Args:
            tableA: DataFrame whose columns are matched.
            tableB: DataFrame providing the candidate columns.
            Mpair: container of A-side column names to skip (only membership
                is tested). Defaults to nothing skipped.
            MpairR: container of B-side column names to skip. Defaults to
                nothing skipped.

        Returns:
            A list of ``(nameA, nameB, similarity)`` tuples sorted by
            similarity, highest first.
        """
        # ``None`` replaces the former shared ``{}`` defaults.
        Mpair = {} if Mpair is None else Mpair
        MpairR = {} if MpairR is None else MpairR

        def is_index_like(name):
            # True for the labels this routine ignores.
            if name == "Unnamed: 0":
                return True
            try:
                return "index" in name
            except TypeError:
                # Non-iterable labels (e.g. integer column names) cannot
                # contain "index"; previously this raised.
                return False

        def drop_missing(values):
            # Best-effort removal of NaN / None entries from an array.
            try:
                return values[~np.isnan(values)]
            except Exception:
                try:
                    return values[values != np.array(None)]
                except Exception:
                    return values

        matching = []
        scma = tableA.columns.values
        scmb = tableB.columns.values

        # Filtered B columns are computed on first use and then reused for
        # every A column.
        bcol_values = {}

        for nameA in scma:
            if nameA in Mpair:
                continue
            if is_index_like(nameA):
                continue

            seriesA = tableA[nameA]
            distinctA = list(set(seriesA[~pd.isnull(seriesA)].values))

            for nameB in scmb:
                if nameB in MpairR:
                    continue
                if is_index_like(nameB):
                    continue
                if tableA[nameA].dtype != tableB[nameB].dtype:
                    continue

                if nameB not in bcol_values:
                    seriesB = tableB[nameB]
                    bcol_values[nameB] = drop_missing(
                        seriesB[~pd.isnull(seriesB)].values)

                sim_col = jaccard_similarity(distinctA, bcol_values[nameB])
                matching.append((nameA, nameB, sim_col))

        matching.sort(key=lambda d: d[2], reverse=True)
        return matching
Beispiel #3
0
    def continue_full_mapping(self, tableA, tableB, mapped):
        """Extend an existing column mapping with float columns of ``tableA``.

        Only columns of ``tableA`` whose dtype equals ``float`` and that are
        not yet keys of ``mapped`` are considered. Each is compared, via
        ``jaccard_similarity`` on the non-null values, with the columns of
        ``tableB`` that share its dtype and are not yet mapping targets.
        Pairs scoring above ``self.sim_thres`` become candidates; the scan of
        B columns for one A column stops at the first score above 0.8.
        Candidates are then accepted greedily from the highest score down, so
        that every column is used at most once.

        Args:
            tableA: DataFrame supplying the columns to be mapped.
            tableB: DataFrame supplying the target columns.
            mapped: dict ``{column of A: column of B}``; updated in place.

        Returns:
            The same ``mapped`` dict, including any newly accepted pairs.
        """
        Mpair = mapped
        # Reverse view of the mapping: B column -> A column.
        MpairR = {target: source for source, target in Mpair.items()}

        columns_a = tableA.columns.tolist()
        columns_b = tableB.columns.tolist()

        candidates = []
        for col_a in columns_a:
            if col_a in Mpair:
                continue
            if tableA[col_a].dtype != float:
                continue

            values_a = tableA[col_a].dropna().tolist()

            for col_b in columns_b:
                if col_b in MpairR:
                    continue
                if tableA[col_a].dtype != tableB[col_b].dtype:
                    continue

                values_b = tableB[col_b].dropna().tolist()
                score = jaccard_similarity(values_a, values_b)
                if score > self.sim_thres:
                    candidates.append((col_a, col_b, score))
                    if score > 0.8:
                        # Good enough: stop looking at further B columns.
                        break

        candidates.sort(key=lambda entry: entry[2], reverse=True)

        for col_a, col_b, score in candidates:
            if score < self.sim_thres:
                break
            if col_a not in Mpair and col_b not in MpairR:
                Mpair[col_a] = col_b
                MpairR[col_b] = col_a

        return Mpair
Beispiel #4
0
    def row_similarity(self, detected_key, SM, tableA, tableB):
        """Return the best ``jaccard_similarity`` over the mapped key columns.

        For every column in ``detected_key`` that also appears in ``SM``, the
        values of that column in ``tableA`` are compared with the values of
        column ``SM[key]`` in ``tableB``; the largest score is returned.

        Args:
            detected_key: iterable of column names of ``tableA``.
            SM: dict mapping columns of ``tableA`` to columns of ``tableB``.
            tableA: table indexed by the names in ``detected_key``.
            tableB: table indexed by the values of ``SM``.

        Returns:
            The maximum similarity found, or ``0`` when ``detected_key`` is
            empty, none of its entries is in ``SM``, or every comparison
            failed.
        """
        max_row_sim = 0
        for key in detected_key:
            if key not in SM:
                # Previously the comparison below still ran for unmapped keys,
                # reading an unbound (first key) or stale ``row_sim``.
                continue

            Avalue = tableA[key].values
            Bvalue = tableB[SM[key]].values
            try:
                row_sim = jaccard_similarity(Avalue, Bvalue)
            except Exception:
                # Best effort: an incomparable pair counts as no overlap.
                row_sim = 0

            if row_sim > max_row_sim:
                max_row_sim = row_sim

        return max_row_sim
Beispiel #5
0
    def mapping_naive_incremental(self,
                                  tableA,
                                  tableB,
                                  gid,
                                  meta_mapping,
                                  schema_linking,
                                  unmatched,
                                  mapped=None):
        """Map columns of ``tableA`` to columns of ``tableB`` incrementally.

        The mapping is built in three steps:

        1. Columns are paired through the stored links: a column of A that
           has an entry in ``meta_mapping[gid]`` is paired with a column of B
           when ``schema_linking[gid]`` assigns both the same value and their
           dtypes are equal.
        2. The remaining columns are compared pairwise with
           ``jaccard_similarity``, skipping pairs already listed in
           ``unmatched[gid]``. Pairs scoring below ``self.sim_thres`` are
           recorded in ``unmatched[gid]``.
        3. The scored pairs are accepted greedily from the highest score
           down, each column being used at most once.

        Afterwards ``meta_mapping[gid]`` gains an entry for every newly
        mapped column of A, and every other column of B is recorded as
        unmatched for it.

        Args:
            tableA: DataFrame whose columns are mapped.
            tableB: DataFrame providing the target columns.
            gid: key selecting the entries of ``meta_mapping``,
                ``schema_linking`` and ``unmatched`` to use.
            meta_mapping: dict of dicts; updated in place.
            schema_linking: dict of dicts; must contain every candidate
                column of ``tableB`` under ``gid``.
            unmatched: dict of dicts of dicts; ``unmatched[gid][nameA]`` must
                already exist for every column of A that gets compared.
                Updated in place.
            mapped: optional dict ``{column of A: column of B}`` to start
                from; updated in place when supplied. A new dict is created
                for each call when omitted.

        Returns:
            Tuple ``(Mpair, meta_mapping, unmatched, elapsed_seconds)``.
        """
        start_time = timeit.default_timer()

        def is_index_like(name):
            # True for the labels this routine ignores.
            if name == "Unnamed: 0":
                return True
            try:
                return "index" in name
            except TypeError:
                # Non-iterable labels (e.g. integer column names) cannot
                # contain "index"; previously this raised.
                return False

        def drop_missing(values):
            # Best-effort removal of NaN / None entries from an array.
            try:
                return values[~np.isnan(values)]
            except Exception:
                try:
                    return values[values != np.array(None)]
                except Exception:
                    return values

        # The former ``mapped={}`` default was one dict shared by every call
        # that omitted the argument, so pairs leaked from call to call.
        Mpair = {} if mapped is None else mapped
        MpairR = {}
        for i in Mpair.keys():
            MpairR[Mpair[i]] = i

        scma = tableA.columns.tolist()
        scmb = tableB.columns.tolist()

        # Step 1: pair columns through the stored schema links.
        t_mapping = {}
        for nameA in scma:
            if nameA in Mpair:
                continue
            if nameA not in meta_mapping[gid]:
                continue
            t_mapping[schema_linking[gid][meta_mapping[gid][nameA]]] = nameA

        for nameB in scmb:
            if nameB in MpairR:
                continue
            link = schema_linking[gid][nameB]
            if link in t_mapping:
                linkedA = t_mapping[link]
                if tableB[nameB].dtype != tableA[linkedA].dtype:
                    continue
                Mpair[linkedA] = nameB
                MpairR[nameB] = linkedA

        # Step 2: score the remaining pairs by value overlap.
        matching = []
        for nameA in scma:
            if nameA in Mpair:
                continue
            if is_index_like(nameA):
                continue

            seriesA = tableA[nameA]
            colA = seriesA[~pd.isnull(seriesA)].values

            for nameB in scmb:
                if nameB in MpairR:
                    continue
                if is_index_like(nameB):
                    continue
                if tableA[nameA].dtype != tableB[nameB].dtype:
                    continue
                if nameB in unmatched[gid][nameA]:
                    continue

                seriesB = tableB[nameB]
                colB = drop_missing(seriesB[~pd.isnull(seriesB)].values)

                sim_col = jaccard_similarity(colA, colB)

                if sim_col < self.sim_thres:
                    unmatched[gid][nameA][nameB] = ''

                matching.append((nameA, nameB, sim_col))

        matching.sort(key=lambda d: d[2], reverse=True)

        # Step 3: greedy one-to-one assignment, best score first.
        for nameA, nameB, sim_col in matching:
            if sim_col < self.sim_thres:
                break
            if nameA not in Mpair and nameB not in MpairR:
                Mpair[nameA] = nameB
                MpairR[nameB] = nameA

        # Persist the outcome for later incremental calls.
        for nameA in scma:
            if nameA not in Mpair:
                continue
            if nameA not in meta_mapping[gid]:
                meta_mapping[gid][nameA] = Mpair[nameA]
            for nameB in scmb:
                if nameB != Mpair[nameA]:
                    unmatched[gid][nameA][nameB] = ''

        end_time = timeit.default_timer()
        time_total = end_time - start_time
        return Mpair, meta_mapping, unmatched, time_total
Beispiel #6
0
    def mapping_naive_groups(self, tableA, tableA_valid, schema_element):
        """List the groups that have a column similar enough to ``tableA``.

        For every group in ``schema_element``, each column of ``tableA`` that
        is listed in ``tableA_valid`` (and is not named ``"Unnamed: 0"`` or
        containing ``"index"``) is compared with every column of the group
        using ``jaccard_similarity``. A group is kept when its best score is
        not below ``self.sim_thres``.

        Args:
            tableA: DataFrame to compare.
            tableA_valid: container of the column names of ``tableA`` that
                take part in the comparison.
            schema_element: dict ``{group: {column name: values}}``.

        Returns:
            The qualifying groups, in the iteration order of
            ``schema_element``.
        """

        def is_index_like(name):
            # True for the labels this routine ignores.
            if name == "Unnamed: 0":
                return True
            try:
                return "index" in name
            except TypeError:
                # Non-iterable labels (e.g. integer column names) cannot
                # contain "index"; previously this raised.
                return False

        def drop_missing(values):
            # Best-effort removal of NaN / None entries from an array.
            try:
                return values[~np.isnan(values)]
            except Exception:
                try:
                    return values[values != np.array(None)]
                except Exception:
                    return values

        scma = tableA.columns.values
        # Distinct non-null values per column of A, filled on first use so
        # the filtering is not repeated for every group.
        acol_set = {}

        group_list = []
        for group in schema_element.keys():
            scores = []

            for nameA in scma:
                if nameA not in tableA_valid:
                    continue
                if is_index_like(nameA):
                    continue

                if nameA not in acol_set:
                    seriesA = tableA[nameA]
                    acol_set[nameA] = list(
                        set(seriesA[~pd.isnull(seriesA)].values))

                for nameB in schema_element[group].keys():
                    colB = drop_missing(np.array(schema_element[group][nameB]))
                    scores.append(jaccard_similarity(acol_set[nameA], colB))

            if not scores:
                continue

            scores.sort(reverse=True)
            if scores[0] < self.sim_thres:
                continue
            group_list.append(group)

        return group_list
Beispiel #7
0
    def mapping_naive_tables_join(self,
                                  tableA,
                                  valid_keys,
                                  schema_element_sample,
                                  schema_element,
                                  schema_dtype,
                                  unmatched,
                                  tflag=False):
        """Map columns of ``tableA`` to the columns of every schema group.

        For each group in ``schema_element`` two passes collect scored
        column pairs:

        1. columns of ``tableA`` listed in ``valid_keys`` against the values
           in ``schema_element[group]``;
        2. all columns of ``tableA`` against the values in
           ``schema_element_sample[group]``, skipping pairs already recorded
           in ``unmatched[group]``.

        Only pairs whose dtype in ``schema_dtype[group]`` equals the dtype of
        the column of A are scored, and columns named ``"Unnamed: 0"`` or
        containing ``"index"`` are ignored on both sides. Pairs scoring below
        ``self.sim_thres`` are recorded in ``unmatched``. The scored pairs
        are then accepted greedily from the highest score down, each column
        being used at most once per group.

        Args:
            tableA: DataFrame whose columns are mapped.
            valid_keys: container of the columns of A used in the first pass.
            schema_element_sample: dict ``{group: {column: values}}`` for the
                second pass; must contain every group of ``schema_element``.
            schema_element: dict ``{group: {column: values}}`` for the first
                pass.
            schema_dtype: dict ``{group: {column: dtype}}``.
            unmatched: dict ``{group: {column of A: {column of B: ''}}}``;
                the inner dicts must already exist for every compared column
                of A. Updated in place.
            tflag: when true, print the elapsed time and the time spent
                computing similarities.

        Returns:
            Tuple ``(Mpair, unmatched)`` where ``Mpair`` is
            ``{group: {column of A: column of B}}``.
        """
        start_time = timeit.default_timer()
        time1 = 0

        def is_index_like(name):
            # True for the labels this routine ignores.
            if name == "Unnamed: 0":
                return True
            try:
                return "index" in name
            except TypeError:
                # Non-iterable labels (e.g. integer column names) cannot
                # contain "index"; previously this raised.
                return False

        def drop_missing(values):
            # Best-effort removal of NaN / None entries from an array.
            try:
                return values[~np.isnan(values)]
            except Exception:
                try:
                    return values[values != np.array(None)]
                except Exception:
                    return values

        Mpair = {}
        MpairR = {}

        scma = tableA.columns.values
        acol_set = {}

        for group in schema_element.keys():

            Mpair[group] = {}
            MpairR[group] = {}
            matching = []

            # Pass 1: valid key columns of A against the full group values.
            for nameA in scma:
                if is_index_like(nameA):
                    continue
                if nameA not in valid_keys:
                    continue

                if nameA not in acol_set:
                    colA = tableA[nameA][~pd.isnull(tableA[nameA])].values
                    acol_set[nameA] = list(set(colA))
                else:
                    colA = acol_set[nameA]

                for nameB in schema_element[group].keys():
                    if is_index_like(nameB):
                        continue

                    # Dtypes are compared by value, as in the other mapping
                    # routines; identity only holds for some dtype objects.
                    if schema_dtype[group][nameB] != tableA[nameA].dtype:
                        continue

                    colB = drop_missing(np.array(schema_element[group][nameB]))

                    s1 = timeit.default_timer()

                    try:
                        sim_col = jaccard_similarity(colA, colB)
                    except Exception:
                        print(colA)
                        print(colB)
                        # No score could be computed: skip the pair. Before,
                        # execution continued with an unbound or stale
                        # ``sim_col``.
                        continue

                    if sim_col < self.sim_thres:
                        unmatched[group][nameA][nameB] = ''

                    e1 = timeit.default_timer()
                    time1 += e1 - s1

                    matching.append((nameA, nameB, sim_col))

            # Pass 2: every column of A against the sampled group values.
            for nameB in schema_element_sample[group].keys():
                if is_index_like(nameB):
                    continue

                colB = drop_missing(
                    np.array(schema_element_sample[group][nameB]))

                for nameA in scma:
                    if is_index_like(nameA):
                        continue

                    if nameB in unmatched[group][nameA]:
                        continue

                    if nameA not in acol_set:
                        colA = tableA[nameA][~pd.isnull(tableA[nameA])].values
                        acol_set[nameA] = list(set(colA))
                    else:
                        colA = acol_set[nameA]

                    if schema_dtype[group][nameB] != tableA[nameA].dtype:
                        continue

                    s1 = timeit.default_timer()
                    sim_col = jaccard_similarity(colA, colB)
                    e1 = timeit.default_timer()
                    time1 += e1 - s1

                    if sim_col < self.sim_thres:
                        unmatched[group][nameA][nameB] = ''

                    matching.append((nameA, nameB, sim_col))

            matching.sort(key=lambda d: d[2], reverse=True)

            # Greedy one-to-one assignment, best score first.
            for nameA, nameB, sim_col in matching:
                if sim_col < self.sim_thres:
                    break
                if nameA not in Mpair[group] and nameB not in MpairR[group]:
                    Mpair[group][nameA] = nameB
                    MpairR[group][nameB] = nameA

        end_time = timeit.default_timer()

        if tflag:
            print('raw schema mapping: ', end_time - start_time)
            print('sim schema mapping: ', time1)

        return Mpair, unmatched
Beispiel #8
0
    def mapping_naive_tables(self,
                             tableA,
                             valid_keys,
                             schema_element,
                             schema_dtype,
                             tflag=False):
        """Map columns of ``tableA`` to the columns of every schema group.

        For each group in ``schema_element``, every column of ``tableA``
        listed in ``valid_keys`` is scored with ``jaccard_similarity``
        against every column of the group whose dtype in
        ``schema_dtype[group]`` equals the dtype of the column of A. Columns
        named ``"Unnamed: 0"`` or containing ``"index"`` are ignored on both
        sides. The scored pairs are accepted greedily from the highest score
        down, stopping at the first score below ``self.sim_thres``; each
        column is used at most once per group.

        Args:
            tableA: DataFrame whose columns are mapped.
            valid_keys: container of the columns of A to consider.
            schema_element: dict ``{group: {column: values}}``.
            schema_dtype: dict ``{group: {column: dtype}}``.
            tflag: when true, print the elapsed time.

        Returns:
            Dict ``{group: {column of A: column of B}}``.
        """
        start_time = timeit.default_timer()

        def is_index_like(name):
            # True for the labels this routine ignores.
            if name == "Unnamed: 0":
                return True
            try:
                return "index" in name
            except TypeError:
                # Non-iterable labels (e.g. integer column names) cannot
                # contain "index"; previously this raised.
                return False

        def drop_missing(values):
            # Best-effort removal of NaN / None entries from an array.
            try:
                return values[~np.isnan(values)]
            except Exception:
                try:
                    return values[values != np.array(None)]
                except Exception:
                    return values

        Mpair = {}
        MpairR = {}

        scma = tableA.columns.values
        acol_set = {}

        for group in schema_element.keys():

            Mpair[group] = {}
            MpairR[group] = {}
            matching = []

            for nameA in scma:
                if is_index_like(nameA):
                    continue
                if nameA not in valid_keys:
                    continue

                if nameA not in acol_set:
                    colA = tableA[nameA][~pd.isnull(tableA[nameA])].values
                    acol_set[nameA] = list(set(colA))
                else:
                    colA = acol_set[nameA]

                for nameB in schema_element[group].keys():
                    if is_index_like(nameB):
                        continue

                    # Dtypes are compared by value, as in the other mapping
                    # routines; identity only holds for some dtype objects.
                    # The check runs before the values are converted so
                    # mismatching columns cost nothing.
                    if schema_dtype[group][nameB] != tableA[nameA].dtype:
                        continue

                    colB = drop_missing(np.array(schema_element[group][nameB]))

                    sim_col = jaccard_similarity(colA, colB)
                    matching.append((nameA, nameB, sim_col))

            matching.sort(key=lambda d: d[2], reverse=True)

            # Greedy one-to-one assignment, best score first.
            for nameA, nameB, sim_col in matching:
                if sim_col < self.sim_thres:
                    break
                if nameA not in Mpair[group] and nameB not in MpairR[group]:
                    Mpair[group][nameA] = nameB
                    MpairR[group][nameB] = nameA

        end_time = timeit.default_timer()

        if tflag:
            print('Schema Mapping Before Search: %s Seconds.' %
                  (end_time - start_time))

        return Mpair