Ejemplo n.º 1
0
    def preciseBatchMatch(self,
                          path,
                          min_similarity,
                          outputfilename,
                          gold_mapping_path,
                          kno=5,
                          max_similarity=1.0):
        """Match every source node against the target indices and log labelled pairs.

        For each node id of the source graph, the values of its indexed
        properties are looked up in every target index. The returned target
        ids are counted (hits); the 100 most frequent ones are re-ranked by
        hits (descending) and by the length of one indexed target value
        (ascending), and the first ``kno`` of them are kept. A kept candidate
        is accepted when::

            hits > max(source n-gram total, target length total) * min_similarity

        Every accepted pair is appended to ``path`` as a lower-cased,
        tab-separated ``src_id, tgt_id, label`` line terminated by CRLF.
        ``label`` is 1 when the pair (with ``<`` and ``>`` stripped) occurs
        in the gold mapping and 0 otherwise.

        Args:
            path: File the labelled pairs are appended to (opened in "a+").
            min_similarity: Factor applied to the n-gram total to obtain the
                minimum number of hits a candidate needs.
            outputfilename: Unused; kept for interface compatibility.
            gold_mapping_path: Header-less, tab-separated file with the
                columns src_id, tgt_id, label.
            kno: Number of top-ranked candidates examined per source node.
            max_similarity: Unused; kept for interface compatibility.

        Returns:
            dict mapping a source node id to ``None`` when no candidate was
            found at all, or to an accepted target id. When several
            candidates are accepted the entry holds the last one written.
            Nodes that had candidates but none accepted are absent.
        """
        gold_mapping = pd.read_csv(gold_mapping_path,
                                   sep='\t',
                                   encoding="UTF-8",
                                   header=None,
                                   names=['src_id', 'tgt_id', 'label'])
        # Membership in this set answers the same question as filtering the
        # frame on both columns, without rescanning it for every pair.
        gold_pairs = set(zip(gold_mapping.src_id, gold_mapping.tgt_id))

        print('Starting matching')
        src_graph = self.src_graphmanager.graph
        tgt_graph = self.tgt_graphmanager.graph
        src_indices = self.src_graphmanager.indices
        tgt_indices = self.tgt_graphmanager.indices

        correspondences = dict()
        total_size = len(src_graph.keys())
        # The context manager closes the file even if matching raises.
        with open(path, "a+", encoding="UTF-8") as f:
            for i, nodeid in enumerate(src_graph.keys(), start=1):
                if i % 5000 == 0:
                    print("  " + str(int(100 * i / (total_size))) + "% done")

                # Collect every target id returned for any indexed source
                # value; duplicates are kept because they are the hit count.
                indices = []
                for src_prop in src_indices.keys():
                    try:
                        value = src_graph[nodeid][src_prop]
                    except KeyError:
                        # The node does not carry this property.
                        continue
                    for tgt_prop in tgt_indices.keys():
                        try:
                            indices.extend(
                                tgt_indices[tgt_prop].getIndicesForValue(value))
                        except KeyError:
                            pass

                if not indices:
                    correspondences[nodeid] = None
                    continue

                hit_counts = Counter(indices)
                best_matching_resources = heapq.nlargest(
                    100, hit_counts.items(), key=operator.itemgetter(1))

                # Column 0: target id, column 1: hits, column 2: tie-break
                # length of one indexed target value.
                bmr = pd.DataFrame(np.array(best_matching_resources))
                tgt_props = list(tgt_indices.keys())
                try:
                    bmr[2] = bmr.apply(
                        lambda row: len(tgt_graph[row[0]][tgt_props[0]]),
                        axis=1)
                except (KeyError, TypeError):
                    # Some candidate cannot be measured on the first indexed
                    # property; measure all of them on the second instead.
                    # NOTE(review): this still raises when a candidate lacks
                    # the second property as well (or there is none).
                    bmr[2] = bmr.apply(
                        lambda row: len(tgt_graph[row[0]][tgt_props[1]]),
                        axis=1)
                bmr[1] = bmr[1].astype('int32')
                bmr[2] = bmr[2].astype('int32')
                bmr.sort_values(by=[1, 2],
                                ascending=[False, True],
                                inplace=True,
                                axis=0)
                bmr = bmr.head(kno)

                # The source side does not depend on the candidate, so its
                # n-gram total is computed once per node.
                maximum_ngrams_x = 0
                for src_prop in src_indices.keys():
                    try:
                        maximum_ngrams_x += len(
                            getNGrams(src_graph[nodeid][src_prop]))
                    except KeyError:
                        pass

                for tgt_id, hits in zip(bmr[0], bmr[1]):
                    hits = int(hits)

                    # NOTE(review): the target side adds the raw length of
                    # the stored value while the source side counts n-grams;
                    # confirm this asymmetry is intended.
                    maximum_ngrams_y = 0
                    for tgt_prop in tgt_indices.keys():
                        try:
                            maximum_ngrams_y += len(tgt_graph[tgt_id][tgt_prop])
                        except KeyError:
                            pass

                    no_of_ngrams = max(maximum_ngrams_x, maximum_ngrams_y)
                    if hits > no_of_ngrams * min_similarity:
                        correspondences[nodeid] = tgt_id
                        srcid = str(nodeid).replace("<", "").replace(">", "")
                        tgtid = str(tgt_id).replace("<", "").replace(">", "")
                        lbl = 1 if (srcid, tgtid) in gold_pairs else 0
                        line = srcid + "\t" + tgtid + "\t" + str(lbl) + "\r\n"
                        try:
                            f.write(line.lower())
                            f.flush()
                        except (OSError, UnicodeError) as exc:
                            # Stay best-effort for a single unwritable line,
                            # but report it instead of hiding it.
                            print("  could not write match " + srcid +
                                  " -> " + tgtid + ": " + str(exc))
        return correspondences
    def match_src(self,
                  nodeid,
                  min_similarity=0.01,
                  max_similarity=1.0,
                  exclude_nodeid=None):
        """Find the best target match for a single source node.

        The values of the node's indexed properties are looked up in every
        target index and the returned target ids are counted (hits). Only the
        target id with the most hits is considered. It is accepted when all
        of the following hold, where ``x`` is the source n-gram total, ``y``
        the summed length of the candidate's indexed target values and
        ``n = max(x, y)``::

            n > 10
            n * min_similarity < hits < n
            hits <= min(x, y) * max_similarity

        Args:
            nodeid: Id of the source node to match.
            min_similarity: Factor applied to ``n`` for the lower hit bound.
            max_similarity: Factor applied to ``min(x, y)`` for the upper
                hit bound.
            exclude_nodeid: Target id that must never be returned, or
                ``None`` to allow every target.

        Returns:
            ``None`` when no candidate remains, an empty string when the best
            candidate is rejected, otherwise one line holding the source id
            and the target id (``<`` and ``>`` stripped), separated by a tab
            and terminated by a newline.
        """
        src_graph = self.src_graphmanager.graph
        tgt_graph = self.tgt_graphmanager.graph
        src_indices = self.src_graphmanager.indices
        tgt_indices = self.tgt_graphmanager.indices

        # Duplicates are kept on purpose: they are the hit count.
        indices = []
        for tgt_prop in tgt_indices.keys():
            for src_prop in src_indices.keys():
                try:
                    indices.extend(tgt_indices[tgt_prop].getIndicesForValue(
                        src_graph[nodeid][src_prop]))
                except KeyError:
                    pass

        if exclude_nodeid is not None:
            # Drop every occurrence. list.remove() would only drop the first
            # one and leave the excluded node in the ranking.
            indices = [index for index in indices if index != exclude_nodeid]

        hit_counts = dict()
        for index in indices:
            hit_counts[index] = hit_counts.get(index, 0) + 1

        if not hit_counts:
            return None

        # On ties the id that was seen first wins.
        tgt_id, hits = max(hit_counts.items(), key=operator.itemgetter(1))

        maximum_ngrams_x = 0
        for src_prop in src_indices.keys():
            try:
                maximum_ngrams_x += len(getNGrams(src_graph[nodeid][src_prop]))
            except KeyError:
                pass

        # NOTE(review): the target side adds the raw length of the stored
        # value while the source side counts n-grams; confirm this is
        # intended.
        maximum_ngrams_y = 0
        for tgt_prop in tgt_indices.keys():
            try:
                maximum_ngrams_y += len(tgt_graph[tgt_id][tgt_prop])
            except KeyError:
                pass

        no_of_ngrams = max(maximum_ngrams_x, maximum_ngrams_y)
        accepted = (no_of_ngrams > 10
                    and hits > no_of_ngrams * min_similarity
                    and hits < no_of_ngrams
                    and hits <= min(maximum_ngrams_x,
                                    maximum_ngrams_y) * max_similarity)
        if not accepted:
            return ""

        srcid = str(nodeid).replace("<", "").replace(">", "")
        tgtid = str(tgt_id).replace("<", "").replace(">", "")
        return srcid + "\t" + tgtid + "\n"
Ejemplo n.º 3
0
    def preciseBatchMatch(self, path, min_similarity, max_similarity=1.0):
        """Match every source node and log its best match plus runners-up.

        For each node id of the source graph, the values of its indexed
        properties are looked up in every target index and the returned
        target ids are counted (hits). The ten ids with the most hits are
        examined in rank order. With ``x`` the source n-gram total, ``y`` the
        summed length of the candidate's indexed target values and
        ``n = max(x, y)``, the top-ranked candidate is accepted when::

            n > 10
            n * min_similarity < hits < n
            hits <= min(x, y) * max_similarity

        An accepted best match is appended to ``path`` with label 1. Only
        then are the remaining candidates with ``n > 10`` appended with
        label 0. Lines are lower-cased, tab-separated
        ``src_id, tgt_id, label`` (``<`` and ``>`` stripped) and terminated
        by CRLF.

        Args:
            path: File the labelled pairs are appended to (opened in "a+").
            min_similarity: Factor applied to ``n`` for the lower hit bound.
            max_similarity: Factor applied to ``min(x, y)`` for the upper
                hit bound.

        Returns:
            dict mapping a source node id to the accepted best target id, or
            to ``None`` when there was no candidate or the best one was
            rejected. Nodes skipped by the random draw are absent.
        """
        print('Starting matching')
        src_graph = self.src_graphmanager.graph
        tgt_graph = self.tgt_graphmanager.graph
        src_indices = self.src_graphmanager.indices
        tgt_indices = self.tgt_graphmanager.indices

        correspondences = dict()
        total_size = len(src_graph.keys())
        # The context manager closes the file even if matching raises.
        with open(path, "a+", encoding="UTF-8") as f:
            for i, nodeid in enumerate(src_graph.keys(), start=1):
                if i % 1000 == 0:
                    print("  " + str(int(100 * i / (total_size))) + "% done")
                # randint is inclusive on both ends, so a node is skipped
                # only when 101 is drawn.
                # NOTE(review): looks like a leftover sampling knob; confirm
                # it is still wanted.
                if random.randint(1, 101) > 100:
                    continue

                # Duplicates are kept on purpose: they are the hit count.
                indices = []
                for src_prop in src_indices.keys():
                    try:
                        value = src_graph[nodeid][src_prop]
                    except KeyError:
                        # The node does not carry this property.
                        continue
                    for tgt_prop in tgt_indices.keys():
                        try:
                            indices.extend(
                                tgt_indices[tgt_prop].getIndicesForValue(value))
                        except KeyError:
                            pass

                hit_counts = dict()
                for index in indices:
                    hit_counts[index] = hit_counts.get(index, 0) + 1

                if not hit_counts:
                    correspondences[nodeid] = None
                    continue

                best_matching_resources = heapq.nlargest(
                    10, hit_counts.items(), key=operator.itemgetter(1))

                # The source side does not depend on the candidate, so its
                # n-gram total is computed once per node.
                maximum_ngrams_x = 0
                for src_prop in src_indices.keys():
                    try:
                        maximum_ngrams_x += len(
                            getNGrams(src_graph[nodeid][src_prop]))
                    except KeyError:
                        pass

                srcid = str(nodeid).replace("<", "").replace(">", "")
                for rank, (tgt_id, hits) in enumerate(best_matching_resources):
                    maximum_ngrams_y = 0
                    for tgt_prop in tgt_indices.keys():
                        try:
                            maximum_ngrams_y += len(tgt_graph[tgt_id][tgt_prop])
                        except KeyError:
                            pass
                    no_of_ngrams = max(maximum_ngrams_x, maximum_ngrams_y)

                    if rank == 0:
                        accepted = (
                            no_of_ngrams > 10
                            and hits > no_of_ngrams * min_similarity
                            and hits < no_of_ngrams
                            and hits <= min(maximum_ngrams_x,
                                            maximum_ngrams_y) * max_similarity)
                        if not accepted:
                            correspondences[nodeid] = None
                            # Runners-up are only logged after an accepted
                            # best match, so nothing is left to do.
                            break
                        correspondences[nodeid] = tgt_id
                        lbl = 1
                    elif no_of_ngrams > 10:
                        # Runner-up: logged as a label-0 pair but not stored
                        # in correspondences, which keeps the accepted match.
                        lbl = 0
                    else:
                        continue

                    tgtid = str(tgt_id).replace("<", "").replace(">", "")
                    line = srcid + "\t" + tgtid + "\t" + str(lbl) + "\r\n"
                    f.write(line.lower())
                    f.flush()
        return correspondences