Ejemplo n.º 1
0
def phono_edit_distance(word1, word2, sequence_type, features):
    """Compute a feature-based analogue of Levenshtein edit distance.

    The named sequence is read from each word, both sequences are handed
    to an ``Aligner`` configured to use phonological features, and the
    score stored in the final cell of the resulting matrix is returned.

    Parameters
    ----------
    word1: Word
        First word; must expose an attribute named by ``sequence_type``

    word2: Word
        Second word; must expose an attribute named by ``sequence_type``

    sequence_type: string
        Attribute name (transcription or a tier) looked up on both words

    features: FeatureMatrix
        Feature specification handed to the ``Aligner``; it is expected
        to cover every segment occurring in the two sequences

    Returns
    -------
    float
        The value stored under ``'f'`` in the last cell of the last row
        of the similarity matrix, i.e. the phonological edit distance
    """
    # Look up the sequences first (word1, then word2) so that a missing
    # attribute surfaces before any aligner is built.
    first_seq, second_seq = [getattr(word, sequence_type)
                             for word in (word1, word2)]

    aligner = Aligner(features_tf=True, features=features)
    matrix = aligner.make_similarity_matrix(first_seq, second_seq)

    # The bottom-right cell holds the score for the complete alignment.
    final_cell = matrix[-1][-1]
    return final_cell['f']
Ejemplo n.º 2
0
def find_mutation_minpairs(corpus_context,
                           query,
                           tier_type=None,
                           stop_check=None,
                           call_back=None):
    """Find all minimal pairs of the query word based only on segment
    mutations (not deletions/insertions)

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus; it is iterated word by word and its
        ``sequence_type`` attribute names the sequence to compare
    query : Word
        The word whose minimal pairs to find (normalised through
        ``ensure_query_is_word`` before use)
    tier_type : optional
        Forwarded unchanged to ``ensure_query_is_word``
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the function

    Returns
    -------
    tuple or None
        ``(count, neighbors)`` where ``neighbors`` is a list of the distinct
        string forms of the sequences at substitution distance 1 from the
        query and ``count`` is its length. ``None`` is returned when
        ``stop_check`` requests early termination.
    """
    sequence_type = corpus_context.sequence_type
    query = ensure_query_is_word(query, corpus_context,
                                 sequence_type, tier_type)
    # Resolved once, before the loop: it does not change per word, and it is
    # needed after the loop even when the corpus yields no words at all.
    query_sequence = getattr(query, sequence_type)
    query_len = len(query_sequence)

    if call_back is not None:
        call_back('Finding neighbors...')
        call_back(0, len(corpus_context))

    # Infinite insertion/deletion penalties leave substitution as the only
    # finite-cost operation.
    al = Aligner(features_tf=False,
                 ins_penalty=float('inf'),
                 del_penalty=float('inf'),
                 sub_penalty=1)

    matches = set()
    for cur, w in enumerate(corpus_context, start=1):
        if stop_check is not None and stop_check():
            return None
        # Report progress every tenth word only.
        if call_back is not None and cur % 10 == 0:
            call_back(cur)

        w_sequence = getattr(w, sequence_type)
        # Cheap pre-filter: skip words whose length differs from the query
        # by more than one segment before running the alignment.
        if abs(len(w_sequence) - query_len) > 1:
            continue

        m = al.make_similarity_matrix(query_sequence, w_sequence)
        # Keep only words exactly one substitution away from the query.
        if m[-1][-1]['f'] != 1:
            continue
        matches.add(str(w_sequence))

    # The query's own form is never reported as its own neighbor.
    matches.discard(str(query_sequence))
    neighbors = list(matches)
    return (len(neighbors), neighbors)
def find_mutation_minpairs(corpus_context, query, tier_type=None,
                           collapse_homophones=False,
                           stop_check=None, call_back=None):
    """Find all minimal pairs of the query word based only on segment
    mutations (not deletions/insertions)

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus; it is iterated word by word and its
        ``sequence_type`` attribute names the sequence to compare
    query : Word
        The word whose minimal pairs to find (normalised through
        ``ensure_query_is_word`` before use)
    tier_type : optional
        Forwarded unchanged to ``ensure_query_is_word``
    collapse_homophones : bool
        If True, a word whose sequence equals that of an already accepted
        match is skipped, so only the first word per sequence is kept
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the function

    Returns
    -------
    tuple or None
        ``(count, neighbors)`` where ``neighbors`` is a list of the distinct
        spellings of the words at substitution distance 1 from the query
        and ``count`` is its length. ``None`` is returned when
        ``stop_check`` requests early termination.
    """
    sequence_type = corpus_context.sequence_type
    query = ensure_query_is_word(query, corpus_context,
                                 sequence_type, tier_type)
    # Resolved once, before the loop: it does not change per word, and it is
    # needed after the loop even when the corpus yields no words at all.
    query_sequence = getattr(query, sequence_type)
    query_len = len(query_sequence)

    if call_back is not None:
        call_back('Finding neighbors...')
        call_back(0, len(corpus_context))

    # Infinite insertion/deletion penalties leave substitution as the only
    # finite-cost operation.
    al = Aligner(features_tf=False, ins_penalty=float('inf'),
                 del_penalty=float('inf'), sub_penalty=1)

    matches = []
    for cur, w in enumerate(corpus_context, start=1):
        if stop_check is not None and stop_check():
            return None
        # Report progress every tenth word only.
        if call_back is not None and cur % 10 == 0:
            call_back(cur)

        w_sequence = getattr(w, sequence_type)
        # Cheap pre-filter: skip words whose length differs from the query
        # by more than one segment before running the alignment.
        if abs(len(w_sequence) - query_len) > 1:
            continue

        m = al.make_similarity_matrix(query_sequence, w_sequence)
        # Keep only words exactly one substitution away from the query.
        if m[-1][-1]['f'] != 1:
            continue

        # Homophone collapsing compares sequences, not spellings.
        if collapse_homophones and any(
                getattr(found, sequence_type) == w_sequence
                for found in matches):
            continue
        matches.append(w)

    spellings = {found.spelling for found in matches}
    # NOTE(review): this removes the string form of the query's *sequence*
    # from a set of *spellings*; kept as in the original behaviour, but
    # confirm whether the query's spelling was intended instead.
    neighbors = list(spellings - {str(query_sequence)})
    return (len(neighbors), neighbors)
def phono_edit_distance(word1, word2, sequence_type, features):
    """Return a Levenshtein-like distance computed over phonological
    features rather than characters.

    Parameters
    ----------
    word1: Word
        Word whose ``sequence_type`` attribute supplies the first sequence

    word2: Word
        Word whose ``sequence_type`` attribute supplies the second sequence

    sequence_type: string
        Name of the attribute (transcription or a tier) read from both words

    features: FeatureMatrix
        Feature matrix passed to the ``Aligner``; it should contain all
        segments that appear in either sequence

    Returns
    -------
    float
        The ``'f'`` entry of the bottom-right cell of the similarity
        matrix built for the two sequences
    """
    seq_a = getattr(word1, sequence_type)
    seq_b = getattr(word2, sequence_type)

    # Feature-based alignment (features_tf=True) over the supplied matrix.
    similarity = Aligner(
        features_tf=True,
        features=features,
    ).make_similarity_matrix(seq_a, seq_b)

    last_row = similarity[-1]
    return last_row[-1]['f']