Example #1
0
def find_relation_class_attr_name_sem_matchings(network, kr_handlers):
    """Semantically match attribute (column) names to ontology class names.

    Embeds every distinct field name in ``network`` and every class of each
    knowledge-representation handler as a list of GloVe word vectors, then
    emits a match for every (attribute, class) pair whose semantic similarity
    exceeds 0.8.

    :param network: field network; must expose ``iterate_values()`` yielding
        ``(db_name, source_name, field_name, _)`` tuples
    :param kr_handlers: dict mapping kr_name -> handler exposing ``classes()``
    :return: list of ((db_name, source_name, field_name), (kr_name, class_name))
    """
    st = time.time()
    # Hoisted: stopwords.words() re-reads the corpus on every call, so calling
    # it once per token inside the loops was extremely expensive.
    english_stopwords = set(stopwords.words('english'))

    def _embeddings_for(name):
        # Normalize camelCase/kebab/snake case to lowercase space-separated
        # tokens and collect the embedding of every known non-stopword token.
        name = nlp.camelcase_to_snakecase(name)
        name = name.replace('-', ' ').replace('_', ' ').lower()
        svs = []
        for token in name.split():
            if token not in english_stopwords:
                sv = glove_api.get_embedding_for_word(token)
                if sv is not None:
                    svs.append(sv)
        return svs

    names = []
    seen_fields = set()  # set gives O(1) membership; the original list was O(n)
    for (db_name, source_name, field_name, _) in network.iterate_values():
        if field_name not in seen_fields:
            seen_fields.add(field_name)
            names.append(
                ('attribute', (db_name, source_name, field_name),
                 _embeddings_for(field_name)))

    num_attributes_inserted = len(names)

    # Retrieve class names; classes are appended after all attributes so the
    # index ranges below can separate the two kinds.
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            names.append(('class', (kr_name, cl), _embeddings_for(cl)))

    matchings = []
    for idx_rel in range(0, num_attributes_inserted):
        # Compare each attribute only against the class entries.
        for idx_class in range(num_attributes_inserted, len(names)):
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(svs_rel, svs_cla)
            if semantic_sim > 0.8:
                # match format: (db_name, source_name, field_name) -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings
Example #2
0
def find_relation_class_name_matchings(network, kr_handlers):
    """Syntactically match relation (table) names to ontology class names.

    Builds a 32-permutation MinHash signature over the non-stopword tokens of
    every distinct source name in ``network`` and of every class of each KR
    handler, indexes all signatures in an LSH index (Jaccard threshold 0.5),
    and reports every cross-kind (relation, class) pair the index returns.

    :param network: field network exposing ``iterate_values()``
    :param kr_handlers: dict mapping kr_name -> handler exposing ``classes()``
    :return: list of ((db_name, source_name, "_"), (kr_name, class_name))
    """
    st = time.time()
    # Hoisted: loading the stopword corpus per token was extremely expensive.
    english_stopwords = set(stopwords.words('english'))

    def _minhash_for(name):
        # Normalize to lowercase space-separated tokens and fold each
        # non-stopword token into a MinHash signature.
        name = nlp.camelcase_to_snakecase(name)
        name = name.replace('-', ' ').replace('_', ' ').lower()
        m = MinHash(num_perm=32)
        for token in name.split():
            if token not in english_stopwords:
                m.update(token.encode('utf8'))
        return m

    names = []
    seen_sources = set()  # O(1) membership; the original list scan was O(n)
    for (db_name, source_name, _, _) in network.iterate_values():
        if source_name not in seen_sources:
            seen_sources.add(source_name)
            names.append(
                ('relation', (db_name, source_name), _minhash_for(source_name)))

    num_relations_inserted = len(names)

    # Retrieve class names (appended after all relations).
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            names.append(('class', (kr_name, cl), _minhash_for(cl)))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)
    for idx in range(len(names)):
        lsh_index.insert(idx, names[idx][2])

    matchings = []
    for idx in range(0, num_relations_inserted):  # query only relations
        for n in lsh_index.query(names[idx][2]):
            # Keep only cross-kind hits (relation -> class); the index also
            # returns same-kind neighbors and the query item itself.
            if names[n][0] != names[idx][0]:
                # match format: (db_name, source_name, "_") -> class
                match = ((names[idx][1][0], names[idx][1][1], "_"),
                         names[n][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
Example #3
0
def minhash(str_values):
    """Compute a k-permutation MinHash signature for an iterable of strings.

    Reproduces Java arithmetic: token hashes are combined with the
    module-level ``random_seeds`` using signed 64-bit (Java long) overflow
    semantics and Java's truncated remainder, so signatures stay compatible
    with a Java producer of the same scheme.

    :param str_values: iterable of raw names/values to tokenize and hash
    :return: list of ``k`` signed 64-bit minimum hash values
    """
    def java_long(number):
        # Wrap an arbitrary Python int into Java's signed 64-bit range.
        return (number + 2**63) % 2**64 - 2**63

    def remainder(a, b):
        # Java's % operator: truncated remainder whose sign follows the
        # dividend.  Computed with exact integer arithmetic — the original
        # `a - b * int(a / b)` used float division, which loses precision
        # for |a| > 2**53 and produced wrong remainders for 64-bit inputs.
        r = abs(a) % abs(b)
        return -r if a < 0 else r

    def hash_this(value):
        # String.hashCode-style rolling hash, seeded here with the
        # module-level mersenne_prime rather than Java's 0.
        h = mersenne_prime
        for ch in value:
            h = 31 * h + ord(ch)
        return h

    # Hoisted: loading the stopword corpus per token was extremely expensive.
    english_stopwords = set(stopwords.words('english'))

    mh = [9223372036854775807] * k  # Long.MAX_VALUE sentinel per permutation

    for v in str_values:
        v = nlp.camelcase_to_snakecase(v)
        v = v.replace('_', ' ').replace('-', ' ').lower()
        for token in v.split(' '):
            if token not in english_stopwords:
                raw_hash = hash_this(token)
                for i in range(k):
                    # (seed0 * hash + seed1) mod mersenne_prime, all in
                    # Java long overflow semantics.
                    first_part = java_long(random_seeds[i][0] * raw_hash)
                    second_part = java_long(random_seeds[i][1])
                    nomodule = java_long(first_part + second_part)
                    h = java_long(remainder(nomodule, mersenne_prime))
                    if h < mh[i]:
                        mh[i] = h
    return mh
Example #4
0
def find_sem_coh_matchings(network, kr_handlers):
    """Match tables to ontology classes via cohesive attribute-token groups.

    Extracts cohesive token groups (each with a cohesion score) from every
    table's attributes, embeds every ontology class name, and emits a match
    whenever a class is semantically more similar to a group than the group's
    own cohesion score.

    :param network: field network passed to ``SS.read_table_columns``
    :param kr_handlers: dict mapping kr_name -> handler exposing ``classes()``
    :return: (matchings, table_groups) where matchings is a list of
        ((db_name, table_name, "_"), (kr_name, class_name)) and table_groups
        maps (db, table) -> extracted groups
    """
    # Hoisted: loading the stopword corpus per token was extremely expensive.
    english_stopwords = set(stopwords.words('english'))

    # Get all relations with their cohesive token groups.
    table_groups = dict()
    for db, t, attrs in SS.read_table_columns(None, network=network):
        groups = SS.extract_cohesive_groups(t, attrs)
        table_groups[(db, t)] = groups  # (score, [set()])

    # Retrieve and embed class names.
    names = []
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ').replace('_', ' ').lower()
            svs = []
            for token in cl.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))

    matchings = []
    for (db_name, table_name), groups in table_groups.items():
        class_seen = set()  # filter out classes already matched for this table
        for g_score, g_tokens in groups:
            g_svs = []
            for t in g_tokens:
                sv = glove_api.get_embedding_for_word(t)
                if sv is not None:
                    g_svs.append(sv)
            for _, (kr_name, class_name), class_svs in names:
                sim = SS.compute_semantic_similarity(class_svs, g_svs)
                # Match when the class is more similar to the group than the
                # group's internal cohesion score.
                if sim > g_score and class_name not in class_seen:
                    class_seen.add(class_name)
                    match = ((db_name, table_name, "_"), (kr_name, class_name))
                    matchings.append(match)

    return matchings, table_groups
Example #5
0
def test_find_semantic_sim():
    """Interactive REPL to probe word-vs-ontology-class similarity.

    Loads a parsed ontology and a GloVe model, embeds every ontology class
    name, then repeatedly reads words from stdin and prints every class whose
    semantic similarity with the entered words exceeds 0.4.  Type ``EXIT`` to
    quit.
    """
    # Load onto
    om = SSAPI(None, None, None, None)
    # Load parsed ontology
    om.add_krs([("dbpedia", "cache_onto/schemaorg.pkl")], parsed=True)

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    print("Loading ontology classes...")
    # Hoisted: loading the stopword corpus per token was extremely expensive.
    english_stopwords = set(stopwords.words('english'))
    names = []
    # Embed every class name of every loaded ontology.
    for kr_name, kr_handler in om.kr_handlers.items():
        for cl in kr_handler.classes():
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ').replace('_', ' ').lower()
            svs = []
            for token in cl.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', cl, svs))
    print("Loading ontology classes...OK")

    while True:
        # Read words from the user until EXIT is entered.
        i = input("introduce two words separated by space to get similarity. EXIT to exit")
        tokens = i.split(' ')
        if tokens[0] == "EXIT":
            print("bye!")
            break
        svs = []
        for t in tokens:
            sv = glove_api.get_embedding_for_word(t)
            if sv is not None:
                svs.append(sv)
            else:
                print("No vec for : " + str(t))
        for _, cl, vecs in names:
            sim = SS.compute_semantic_similarity(svs, vecs)
            if sim > 0.4:
                print(str(cl) + " -> " + str(sim))
Example #6
0
def retrieve_class_names(kr_handlers, num_perm=32):
    """Build MinHash signatures for every class name of every KR handler.

    :param kr_handlers: dict mapping kr_name -> handler exposing ``classes()``
    :param num_perm: number of MinHash permutations per signature
    :return: list of ('class', (kr_name, original_class_name), MinHash)
    """
    # Hoisted: loading the stopword corpus per token was extremely expensive.
    english_stopwords = set(stopwords.words('english'))

    names = list()
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            original_cl_name = cl
            # Normalize camelCase/kebab/snake to lowercase space-separated.
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ').replace('_', ' ').lower()
            m = MinHash(num_perm=num_perm)
            for token in cl.split():
                if token not in english_stopwords:
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))
    return names
def find_matching_to_text(network,
                          semantic_sim_threshold=0.5,
                          sensitivity_neg_signal=0.5,
                          negative_signal_threshold=0.4,
                          penalize_unknown_word=False,
                          add_exact_matches=True,
                          reference_name="",
                          reference_gen=None):
    """Match attribute names against a stream of reference (class) names.

    Embeds every distinct (source, field) attribute in ``network`` and every
    name yielded by ``reference_gen``, then classifies each attribute/reference
    pair: positive when the signal is strong and similarity exceeds
    ``semantic_sim_threshold``, negative when the signal is strong and
    similarity is below ``negative_signal_threshold``.

    :param network: field network exposing ``iterate_values()``
    :param reference_gen: iterable of reference names; ``None`` is treated as
        empty (the original crashed iterating the None default)
    :return: (pos_matchings, neg_matchings), each a list of
        ((db_name, source_name, field_name), (reference_name, ref_value))
    """
    st = time.time()
    # Hoisted: loading the stopword corpus per token was extremely expensive.
    english_stopwords = set(stopwords.words('english'))

    def _embeddings_for(name):
        # Lowercase, split on '-'/'_' separators, embed known non-stopwords.
        name = name.replace('-', ' ').replace('_', ' ').lower()
        svs = []
        for token in name.split():
            if token not in english_stopwords:
                sv = glove_api.get_embedding_for_word(token)
                if sv is not None:
                    svs.append(sv)
        return svs

    names = []
    seen_fields = set()
    for (db_name, source_name, field_name, _) in network.iterate_values():
        # Deduplicate per (source, field): the same field name may appear in
        # several sources and each occurrence is kept.
        key_seen = source_name + field_name
        if key_seen not in seen_fields:
            seen_fields.add(key_seen)
            names.append(
                ('attribute', (db_name, source_name, field_name),
                 _embeddings_for(nlp.camelcase_to_snakecase(field_name))))

    num_attributes_inserted = len(names)

    # Retrieve reference names; a missing generator simply means no classes.
    for cl in (reference_gen or ()):
        names.append(('class', (reference_name, cl), _embeddings_for(cl)))

    print("N equals: " + str(len(names)))

    pos_matchings = []
    neg_matchings = []
    for idx_class in range(num_attributes_inserted, len(names)):
        for idx_rel in range(0, num_attributes_inserted):  # all attributes
            # NOTE(review): get_ban_indexes/remove_banned_vectors presumably
            # drop vectors of tokens shared verbatim by both names before
            # scoring — confirm against their definitions.
            ban_index1, ban_index2 = get_ban_indexes(names[idx_rel][1][2],
                                                     names[idx_class][1][1])
            svs_rel = remove_banned_vectors(ban_index1, names[idx_rel][2])
            svs_cla = remove_banned_vectors(ban_index2, names[idx_class][2])
            semantic_sim, strong_signal = SS.compute_semantic_similarity(
                svs_rel,
                svs_cla,
                penalize_unknown_word=penalize_unknown_word,
                add_exact_matches=add_exact_matches,
                signal_strength_threshold=sensitivity_neg_signal)
            if strong_signal and semantic_sim > semantic_sim_threshold:
                # match format: (db, source, field) -> (reference, value)
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                pos_matchings.append(match)
                continue  # FIXME: one matching per entity
            elif strong_signal and semantic_sim < negative_signal_threshold:
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                neg_matchings.append(match)
    et = time.time()
    print("l52: " + str(et - st))
    return pos_matchings, neg_matchings
Example #8
0
def find_relation_class_name_sem_matchings(network, kr_handlers):
    """Semantically match relation (table) names to ontology class names.

    Embeds every distinct source name in ``network`` and every class of each
    KR handler.  Unlike the syntactic variant, missing embeddings are kept as
    ``None`` entries so the similarity computation can penalize unknown
    words; a match is emitted when similarity exceeds 0.5.

    :param network: field network exposing ``iterate_values()``
    :param kr_handlers: dict mapping kr_name -> handler exposing ``classes()``
    :return: list of ((db_name, source_name, "_"), (kr_name, class_name))
    """
    st = time.time()
    # Hoisted: loading the stopword corpus per token was extremely expensive.
    english_stopwords = set(stopwords.words('english'))

    names = []
    seen_sources = set()  # O(1) membership; the original list scan was O(n)
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.add(source_name)
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            svs = []
            for token in source_name.split():
                if token not in english_stopwords:
                    # Append even None so unknown words stay visible to the
                    # penalization step in compute_semantic_similarity.
                    svs.append(glove_api.get_embedding_for_word(token))
            names.append(('relation', (db_name, original_source_name), svs))

    num_relations_inserted = len(names)

    # Retrieve class names (appended after all relations).
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ').replace('_', ' ').lower()
            svs = []
            for token in cl.split():
                if token not in english_stopwords:
                    # Keep None entries here too (see above).
                    svs.append(glove_api.get_embedding_for_word(token))
            names.append(('class', (kr_name, original_cl_name), svs))

    matchings = []
    for idx_rel in range(0, num_relations_inserted):
        # Compare each relation only against the class entries.
        for idx_class in range(num_relations_inserted, len(names)):
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(
                svs_rel,
                svs_cla,
                penalize_unknown_word=True,
                add_exact_matches=False)
            if semantic_sim > 0.5:
                # match format: (db_name, source_name, "_") -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1], "_"),
                         names[idx_class][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings