コード例 #1
0
def get_lang_pared_char_sets_dists():
    """Record pairwise character-set cosine distances between language codes.

    Builds one feature vector per language code with
    ``get_pared_down_chars_as_features`` and, for every ordered pair of codes
    (a code is also paired with itself), stores the cosine distance between
    the two vectors under the ``'transliterable'`` key of the global
    ``code_to_code_to_scorestruct``.

    Codes whose count from ``get_code_to_ne_counts`` is below 500 are dropped
    before the features are built. Codes that have no count entry at all are
    kept.

    Returns:
        None. Results are written into ``code_to_code_to_scorestruct``.
    """
    min_ne_count = 500

    lang_to_pared_char_set, lang_to_char_set, all_pared_chars = get_lang_to_pared_char_set_short()
    code_to_ne_counts = get_code_to_ne_counts()

    # Remove codes with very little data. A code may have a count without
    # appearing in one (or either) of the character-set mappings, so pop with
    # a default instead of letting a KeyError abort the pruning halfway.
    for code, ne_count in code_to_ne_counts.items():
        if ne_count < min_ne_count:
            lang_to_pared_char_set.pop(code, None)
            lang_to_char_set.pop(code, None)

    lang_to_char_features = get_pared_down_chars_as_features(lang_to_pared_char_set, all_pared_chars)

    # Sort once; the same ordering drives both the outer and the inner loop.
    sorted_features = sorted(lang_to_char_features.items())

    for iso_code1, char_features1 in sorted_features:
        for iso_code2, char_features2 in sorted_features:
            cos_dist = scipy.spatial.distance.cosine(char_features1, char_features2)
            # str() renders a float in exponent notation ("1e-05") once its
            # magnitude is tiny, so this clamps such values to exactly zero --
            # presumably to squash floating-point noise for identical vectors.
            if 'e' in str(cos_dist):
                cos_dist = 0.0
            # NOTE(review): assumes the nested lookup yields (or creates) a
            # score struct for any pair of codes -- confirm against the
            # definition of code_to_code_to_scorestruct.
            code_to_code_to_scorestruct[iso_code1][iso_code2].set_dist('transliterable', cos_dist)
コード例 #2
0
def get_script_unicode_dists():
    """Record pairwise character-set cosine distances between scripts.

    Turns the mapping returned by ``get_script_to_pared_chars`` into one
    feature vector per script, pretty-prints that mapping, and then stores
    the cosine distance for every ordered pair of scripts (a script is also
    paired with itself) under the ``'transliterable'`` key of the global
    ``script_to_script_to_scorestruct``.

    Returns:
        None. Results are written into ``script_to_script_to_scorestruct``.
    """
    pared_chars_by_script, pared_char_inventory = get_script_to_pared_chars()
    features_by_script = get_pared_down_chars_as_features(pared_chars_by_script, pared_char_inventory)
    pprint.pprint(pared_chars_by_script)

    # One sorted snapshot is reused for both sides of the pairing.
    ordered_scripts = sorted(features_by_script.items())

    for left_script, left_features in ordered_scripts:
        for right_script, right_features in ordered_scripts:
            distance = scipy.spatial.distance.cosine(left_features, right_features)
            # A float whose text form contains an exponent ("1e-05") is tiny;
            # such values are replaced by exactly zero.
            distance = 0.0 if 'e' in str(distance) else distance
            script_to_script_to_scorestruct[left_script][right_script].set_dist('transliterable', distance)