def phono_edit_distance_wrapper(w1, w2, sequence_type, features, max_distance):
    """Return the ``phono_edit_distance`` score of two words if it is small enough.

    ``w1`` and ``w2`` are passed positionally to ``phono_edit_distance``;
    ``sequence_type`` and ``features`` are forwarded as keyword arguments.

    Returns:
        The computed score when it is less than or equal to ``max_distance``,
        otherwise ``None``.
    """
    distance = phono_edit_distance(
        w1, w2, sequence_type=sequence_type, features=features)
    return distance if distance <= max_distance else None
Example #2
0
def _is_phono_edit_distance_neighbor(w, query, sequence_type, specifier,
                                     max_distance):
    """Tell whether ``w`` is within ``max_distance`` of ``query``.

    The distance is whatever ``phono_edit_distance`` returns when called with
    ``w``, ``query``, ``sequence_type`` and ``specifier`` (all positional).
    The bound is inclusive.
    """
    distance = phono_edit_distance(w, query, sequence_type, specifier)
    return distance <= max_distance
def phono_edit_distance_wrapper(w1, w2, sequence_type, features, max_distance):
    """Score two words with ``phono_edit_distance`` and apply a cut-off.

    ``sequence_type`` and ``features`` are handed to ``phono_edit_distance``
    as keyword arguments. The score is returned only when it does not exceed
    ``max_distance``; in every other case the result is ``None``.
    """
    options = {'sequence_type': sequence_type, 'features': features}
    result = phono_edit_distance(w1, w2, **options)
    if result <= max_distance:
        return result
    return None
Example #4
0
def find_distances(corpus,
                   phono_dict,
                   features,
                   model,
                   use_stoplist=False,
                   n=1000):
    """Score corpus words against their most similar model words.

    For every unique corpus token that is known to ``model`` and (ignoring
    letter case) to ``phono_dict``, the ``n`` most similar words are requested
    from ``model``. Each of those words that is also known to ``model`` and
    ``phono_dict`` and differs from the token (ignoring case) is scored with
    ``phono_edit_distance.phono_edit_distance`` using the ``'transcription'``
    sequence type.

    Args:
        corpus: Passed to ``Sentence`` to obtain the corpus tokens.
        phono_dict: Mapping from word to the value handed to the distance
            function (presumably a transcription -- confirm against caller).
            Keys may be in any letter case; lookups are case-insensitive.
        features: Forwarded unchanged to the distance function.
        model: Object supporting ``word in model`` and
            ``model.most_similar(word, topn=...)``, whose result items carry
            the word as their first element.
        use_stoplist: If true, use ``tokens_cased_without_stop`` of the
            ``Sentence`` instead of ``tokens_cased``.
        n: Number of similar words requested per token.

    Returns:
        A ``pandas.DataFrame`` with the single column ``'Phono Edit Distance'``
        and a two-level index named ``'Corpus Word'`` and
        ``'Similar Word from Model'``.
    """
    class_text = Sentence(corpus)

    # Case-insensitive view of the phonological dictionary. Membership was
    # always tested on upper-cased keys, so the values must be fetched through
    # the same upper-cased keys; looking up ``phono_dict`` directly returned
    # None for any key that was not already upper case.
    phono_lookup = {
        word.upper(): transcription
        for word, transcription in phono_dict.items()
    }
    # Keys that are already upper case take precedence, so inputs that worked
    # with the direct lookup keep resolving to exactly the same entry.
    phono_lookup.update(
        (word, transcription)
        for word, transcription in phono_dict.items()
        if word == word.upper())

    # Create set of all unique tokens in the corpus
    if use_stoplist:
        corpus_tokens = set(class_text.tokens_cased_without_stop)
    else:
        corpus_tokens = set(class_text.tokens_cased)

    # Keep only tokens known to both the model and the phonological corpus.
    # ``phono_lookup`` is a dict, so each membership test is O(1).
    corpus_tokens = {
        token
        for token in corpus_tokens
        if token in model and token.upper() in phono_lookup
    }

    # Word pairs (index) and distances (column) of the resulting DataFrame
    distances = []
    word_tuples = []

    widgets = [
        progressbar.Percentage(), ' ',
        progressbar.Bar(marker='#', left='[', right=']'), ' ',
        progressbar.ETA(), ' ',
        progressbar.Counter(format='Completed %(value)d/%(max_value)d')
    ]

    pbar = progressbar.ProgressBar(widgets=widgets,
                                   maxval=len(corpus_tokens))
    pbar.start()

    # Count from 1 so the bar shows the number of tokens actually completed.
    for completed, token in enumerate(corpus_tokens, start=1):
        token_key = token.upper()
        token_transcription = phono_lookup[token_key]

        # Only the word (first element) of each similarity result is needed.
        for item in model.most_similar(token, topn=n):
            sim_word = item[0]
            if sim_word not in model:
                continue
            sim_key = sim_word.upper()
            if sim_key not in phono_lookup or sim_key == token_key:
                continue

            distance = phono_edit_distance.phono_edit_distance(
                token_transcription, phono_lookup[sim_key], 'transcription',
                features)
            word_tuples.append((token, sim_word))
            distances.append(distance)

        pbar.update(completed)
    pbar.finish()

    index = pd.MultiIndex.from_tuples(
        tuples=word_tuples, names=['Corpus Word', 'Similar Word from Model'])
    columns = ['Phono Edit Distance']

    return pd.DataFrame(distances, index=index, columns=columns)
def _is_phono_edit_distance_neighbor(w, query, sequence_type, specifier, max_distance):
    """Return True when ``phono_edit_distance`` puts ``w`` close enough to ``query``.

    ``w``, ``query``, ``sequence_type`` and ``specifier`` are passed
    positionally, in that order, to ``phono_edit_distance``; the result is
    compared against ``max_distance`` with an inclusive upper bound.
    """
    call_args = (w, query, sequence_type, specifier)
    return phono_edit_distance(*call_args) <= max_distance