Ejemplo n.º 1
0
def clustering_3_initials(data):
    """Yield the clusters produced by key collision on initialized names.

    Every index `i` is keyed by `initialize(DATA[i]['transliteration'])` and
    the grouping itself is delegated to `key_collision` with `max_size=2`.
    Clusters are yielded exactly as `key_collision` produces them.
    """

    def initials_key(i):
        return initialize(DATA[i]['transliteration'])

    # NOTE: an extra check discarding clusters in which no item has initials
    # is deliberately not applied here: per the original author's note, it
    # deprives us of some clusters where Aleks would match Aleksander.
    yield from key_collision(data, key=initials_key, max_size=2)
Ejemplo n.º 2
0
def clustering_0a_external_identifiers(data):
    """Yield pairs of items whose shared external identifiers agree exactly.

    Candidate clusters come from `key_collision`, where each index `i` is
    keyed by the entries `EXTERNAL` holds for `DATA[i]['wikidata_code']`.
    A candidate pair is kept only when, restricted to the identifier sources
    that both items have in common, the two sets of entries are identical.

    Entries of `EXTERNAL` are iterated as 2-tuples whose first element is
    treated as the source key (presumably `(source, value)` pairs; verify
    against the code building `EXTERNAL`).
    """

    # Shared fallback handed to `key_collision` for codes unknown to EXTERNAL.
    no_keys = []

    def keys(i):
        code = DATA[i].get('wikidata_code')
        return None if code is None else EXTERNAL.get(code, no_keys)

    for cluster in key_collision(data, keys=keys, max_size=2):

        # TODO: remove when fog is fixed
        if len(cluster) > 2:
            continue

        first_code = DATA[cluster[0]].get('wikidata_code')
        second_code = DATA[cluster[1]].get('wikidata_code')

        # A falsy code means the item has no external identifiers at all.
        first_ids = EXTERNAL.get(first_code, set()) if first_code else set()
        second_ids = EXTERNAL.get(second_code, set()) if second_code else set()

        # NOTE: we could instead rely on the overlap coefficient of both
        # sets being exactly 1, but we only compare the shared sources.
        first_sources = {source for source, _ in first_ids}
        second_sources = {source for source, _ in second_ids}
        shared_sources = first_sources & second_sources

        # The pair collided on a key, so some source is expected in common.
        assert len(shared_sources) > 0

        first_shared = {pair for pair in first_ids if pair[0] in shared_sources}
        second_shared = {pair for pair in second_ids if pair[0] in shared_sources}

        # We want a perfect match on every shared source.
        if first_shared == second_shared:
            yield cluster
Ejemplo n.º 3
0
def clustering_7_rusalka(data):
    """Return the key collision clusters keyed by `rusalka` of the transliteration."""

    def rusalka_key(i):
        return rusalka(DATA[i]['transliteration'])

    return key_collision(data, key=rusalka_key)
Ejemplo n.º 4
0
def clustering_6_cologne(data):
    """Return the key collision clusters keyed by `safe_cologne` of the transliteration."""

    def cologne_key(i):
        return safe_cologne(DATA[i]['transliteration'])

    return key_collision(data, key=cologne_key)
Ejemplo n.º 5
0
def clustering_5_bigram_fingerprinting(data):
    """Return the key collision clusters keyed by the 2-gram fingerprint.

    Each index `i` is keyed by `ngrams_fingerprint(2, ...)` applied to
    `DATA[i]['transliteration']`.
    """

    def bigram_key(i):
        return ngrams_fingerprint(2, DATA[i]['transliteration'])

    return key_collision(data, key=bigram_key)
Ejemplo n.º 6
0
def clustering_4_fingerprinting(data):
    """Return the key collision clusters keyed by `fingerprint` of the transliteration."""

    def fingerprint_key(i):
        return fingerprint(DATA[i]['transliteration'])

    return key_collision(data, key=fingerprint_key)
Ejemplo n.º 7
0
def clustering_2_harsh_normalization(data):
    """Return the key collision clusters keyed by `process_harsher` of the transliteration."""

    def harsh_key(i):
        return process_harsher(DATA[i]['transliteration'])

    return key_collision(data, key=harsh_key)
Ejemplo n.º 8
0
def clustering_0b_exact(data):
    """Return the key collision clusters keyed by the raw `name` of each item."""

    def name_key(i):
        return DATA[i]['name']

    return key_collision(data, key=name_key)