def phono_edit_distance(word1, word2, sequence_type, features):
    """Returns an analogue to Levenshtein edit distance but uses
    phonological features instead of characters

    Parameters
    ----------
    word1: Word
        Word object containing transcription tiers which will be compared
        to another word containing transcription tiers
    word2: Word
        The other word containing transcription tiers to which word1 will
        be compared
    sequence_type: string
        Name of the sequence type (transcription or a tier) to use for
        comparisons
    features: FeatureMatrix
        FeatureMatrix that contains all the segments in both transcriptions
        to be compared

    Returns
    -------
    float
        the phonological edit distance between two words
    """
    # Pull the requested tier from each word, then align them with a
    # feature-aware aligner; the bottom-right cell holds the total score.
    seq1 = getattr(word1, sequence_type)
    seq2 = getattr(word2, sequence_type)
    aligner = Aligner(features_tf=True, features=features)
    sim_matrix = aligner.make_similarity_matrix(seq1, seq2)
    return sim_matrix[-1][-1]['f']
def find_mutation_minpairs(corpus_context, query, tier_type=None, stop_check=None, call_back=None):
    """Find all minimal pairs of the query word based only on segment
    mutations (not deletions/insertions)

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    query : Word
        The word whose minimal pairs to find
    tier_type : string or None
        Name of the tier used to look up the query word in the corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the function

    Returns
    -------
    tuple
        (number of minimal pairs, list of their sequence strings), or
        None when stop_check requests early termination
    """
    matches = []
    sequence_type = corpus_context.sequence_type
    query = ensure_query_is_word(query, corpus_context,
                                 corpus_context.sequence_type, tier_type)
    # Hoisted out of the loop: the query's sequence is loop-invariant, and
    # computing it up front also avoids a NameError at the final set
    # difference when the corpus is empty (the original only bound it
    # inside the loop body).
    query_sequence = getattr(query, sequence_type)
    if call_back is not None:
        call_back('Finding neighbors...')
        call_back(0, len(corpus_context))
    cur = 0
    # Infinite insertion/deletion penalties force a substitution-only
    # alignment, so a total score of exactly 1 means "one segment swapped".
    al = Aligner(features_tf=False, ins_penalty=float('inf'),
                 del_penalty=float('inf'), sub_penalty=1)
    for w in corpus_context:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 10 == 0:
                call_back(cur)
        w_sequence = getattr(w, sequence_type)
        # Lengths differing by more than one segment can never yield a
        # minimal pair; skip the expensive alignment entirely.
        if (len(w_sequence) > len(query_sequence) + 1
                or len(w_sequence) < len(query_sequence) - 1):
            continue
        m = al.make_similarity_matrix(query_sequence, w_sequence)
        if m[-1][-1]['f'] != 1:
            continue
        matches.append(str(w_sequence))
    # Deduplicate and drop the query itself from its own neighbor list.
    neighbors = list(set(matches) - set([str(query_sequence)]))
    return (len(neighbors), neighbors)
def find_mutation_minpairs(corpus_context, query, tier_type=None, collapse_homophones=False, stop_check=None, call_back=None):
    """Find all minimal pairs of the query word based only on segment
    mutations (not deletions/insertions)

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    query : Word
        The word whose minimal pairs to find
    tier_type : string or None
        Name of the tier used to look up the query word in the corpus
    collapse_homophones : bool
        If True, words whose sequence duplicates an already-collected
        match are skipped
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the function

    Returns
    -------
    tuple
        (number of minimal pairs, list of their spellings), or None when
        stop_check requests early termination
    """
    matches = []
    sequence_type = corpus_context.sequence_type
    query = ensure_query_is_word(query, corpus_context,
                                 corpus_context.sequence_type, tier_type)
    # Hoisted out of the loop: the query's sequence is loop-invariant, and
    # computing it up front also avoids a NameError at the final set
    # difference when the corpus is empty (the original only bound it
    # inside the loop body).
    query_sequence = getattr(query, sequence_type)
    if call_back is not None:
        call_back('Finding neighbors...')
        call_back(0, len(corpus_context))
    cur = 0
    # Infinite insertion/deletion penalties force a substitution-only
    # alignment, so a total score of exactly 1 means "one segment swapped".
    al = Aligner(features_tf=False, ins_penalty=float('inf'),
                 del_penalty=float('inf'), sub_penalty=1)
    for w in corpus_context:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 10 == 0:
                call_back(cur)
        w_sequence = getattr(w, sequence_type)
        # Lengths differing by more than one segment can never yield a
        # minimal pair; skip the expensive alignment entirely.
        if (len(w_sequence) > len(query_sequence) + 1
                or len(w_sequence) < len(query_sequence) - 1):
            continue
        sim = al.make_similarity_matrix(query_sequence, w_sequence)
        if sim[-1][-1]['f'] != 1:
            continue
        # Optionally collapse homophones: skip any word whose sequence is
        # already represented among the collected matches. (Renamed the
        # generator variable from the original's `m`, which shadowed the
        # similarity-matrix name.)
        if collapse_homophones and any(getattr(match, sequence_type) == w_sequence
                                       for match in matches):
            continue
        matches.append(w)
    matches = [match.spelling for match in matches]
    # NOTE(review): the query is removed by its *sequence* string while
    # matches hold *spellings*; these only coincide when sequence_type is
    # spelling. Preserved as-is to keep behavior identical — confirm intent.
    neighbors = list(set(matches) - set([str(query_sequence)]))
    return (len(neighbors), neighbors)
def phono_edit_distance(word1, word2, sequence_type, features):
    """Returns an analogue to Levenshtein edit distance but uses
    phonological features instead of characters

    Parameters
    ----------
    word1: Word
        Word object containing transcription tiers which will be compared
        to another word containing transcription tiers
    word2: Word
        The other word containing transcription tiers to which word1 will
        be compared
    sequence_type: string
        Name of the sequence type (transcription or a tier) to use for
        comparisons
    features: FeatureMatrix
        FeatureMatrix that contains all the segments in both transcriptions
        to be compared

    Returns
    -------
    float
        the phonological edit distance between two words
    """
    # Align the two requested tiers feature-by-feature and read the
    # accumulated distance from the final cell of the similarity matrix.
    matrix = Aligner(features_tf=True, features=features).make_similarity_matrix(
        getattr(word1, sequence_type),
        getattr(word2, sequence_type))
    return matrix[-1][-1]['f']