def get_lang_pared_char_sets_dists(min_ne_count=500):
    """Record pairwise 'transliterable' distances between languages.

    Builds a feature vector per language code from its pared-down character
    set, computes the cosine distance between every ordered pair of codes
    (including each code with itself), and stores the result via
    ``set_dist('transliterable', ...)`` on the corresponding entry of the
    module-level ``code_to_code_to_scorestruct``.

    Args:
        min_ne_count: Codes whose count in ``get_code_to_ne_counts()`` is
            below this value are dropped before distances are computed.
            Defaults to 500, the previously hard-coded threshold.

    Returns:
        None. Results are written into ``code_to_code_to_scorestruct``.
    """
    lang_to_pared_char_set, lang_to_char_set, all_pared_chars = (
        get_lang_to_pared_char_set_short()
    )
    code_to_ne_counts = get_code_to_ne_counts()

    # Remove codes with very little data.  A default is passed to pop() so a
    # code that has a count but no character-set entry is skipped instead of
    # raising KeyError.
    # NOTE(review): lang_to_char_set is not used again in this function; it is
    # still pruned in case the helper returns shared objects -- confirm.
    for code, ne_count in code_to_ne_counts.items():
        if ne_count < min_ne_count:
            lang_to_pared_char_set.pop(code, None)
            lang_to_char_set.pop(code, None)

    lang_to_char_features = get_pared_down_chars_as_features(
        lang_to_pared_char_set, all_pared_chars
    )

    # Sort once; the same ordering is used for both the outer and inner loop.
    sorted_features = sorted(lang_to_char_features.items())
    for iso_code1, char_features1 in sorted_features:
        for iso_code2, char_features2 in sorted_features:
            cos_dist = scipy.spatial.distance.cosine(char_features1, char_features2)
            # Snap floating-point noise to exactly zero.  1e-4 is the
            # magnitude below which str(float) switches to scientific
            # notation, which is what the previous "'e' in str(...)" check
            # was detecting.
            if abs(cos_dist) < 1e-4:
                cos_dist = 0.0
            code_to_code_to_scorestruct[iso_code1][iso_code2].set_dist(
                'transliterable', cos_dist
            )
def get_script_unicode_dists():
    """Record pairwise 'transliterable' distances between scripts.

    Builds a feature vector per script from its pared-down character set,
    pretty-prints the script-to-characters mapping, computes the cosine
    distance between every ordered pair of scripts (including each script
    with itself), and stores the result via
    ``set_dist('transliterable', ...)`` on the corresponding entry of the
    module-level ``script_to_script_to_scorestruct``.

    Returns:
        None. Results are written into ``script_to_script_to_scorestruct``.
    """
    script_to_pared_chars, all_pared_chars = get_script_to_pared_chars()
    script_to_char_features = get_pared_down_chars_as_features(
        script_to_pared_chars, all_pared_chars
    )
    pprint.pprint(script_to_pared_chars)

    # Sort once; the same ordering is used for both the outer and inner loop.
    sorted_features = sorted(script_to_char_features.items())
    for script1, char_features1 in sorted_features:
        for script2, char_features2 in sorted_features:
            cos_dist = scipy.spatial.distance.cosine(char_features1, char_features2)
            # Snap floating-point noise to exactly zero.  1e-4 is the
            # magnitude below which str(float) switches to scientific
            # notation, which is what the previous "'e' in str(...)" check
            # was detecting.
            if abs(cos_dist) < 1e-4:
                cos_dist = 0.0
            script_to_script_to_scorestruct[script1][script2].set_dist(
                'transliterable', cos_dist
            )