def run(self):
    """Run a string-similarity analysis and report the outcome via signals.

    Parameters are taken from ``self.kwargs``: ``context``,
    ``corpusModel``, ``sequence_type``, ``type_token``,
    ``frequency_cutoff``, ``query`` and ``algorithm`` are consumed here;
    whatever remains is forwarded to ``string_similarity`` as keyword
    arguments.

    Emits ``errorEncountered`` when the analysis fails or the context
    value is not recognised, ``finishedCancelling`` when ``self.stopped``
    is set after the analysis, and ``dataReady`` with ``self.results``
    otherwise.
    """
    time.sleep(0.1)
    # Work on a copy: popping straight from self.kwargs would strip the
    # worker's stored parameters, so a second run() with the same kwargs
    # would fail with KeyError.
    kwargs = dict(self.kwargs)
    context = kwargs.pop('context')
    if context == RestrictedContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == RestrictedContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    else:
        # Without this branch ``cm`` stays unbound and the method dies with
        # an UnboundLocalError instead of using the error signal.
        self.errorEncountered.emit(PCTPythonError(ValueError(
            'Unrecognized variant context: {!r}'.format(context))))
        return
    corpus = kwargs.pop('corpusModel').corpus
    st = kwargs.pop('sequence_type')
    tt = kwargs.pop('type_token')
    ft = kwargs.pop('frequency_cutoff')
    with cm(corpus, st, tt, frequency_threshold=ft) as c:
        try:
            query = kwargs.pop('query')
            alg = kwargs.pop('algorithm')
            self.results = string_similarity(c, query, alg, **kwargs)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            # Wrap unexpected errors so receivers always get a PCT error.
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(self.results)
def run(self):
    """Run the analysis selected by ``self.name`` and emit its result.

    Supported names are ``'functional_load'``, ``'string_similarity'``,
    ``'phonotactic_probability'`` and ``'kullback_leibler'``; the
    arguments for each are read from ``self.kwargs``. On success the
    result is sent through ``dataReady``. Any exception raised while
    gathering arguments, running the analysis or emitting the result is
    reported through ``errorEncountered`` as ``'<name>:<error>'``.

    Raises
    ------
    UnLuckyException
        If ``self.name`` is not one of the supported analyses.
    """
    analysis = self.name
    supported = ('functional_load', 'string_similarity',
                 'phonotactic_probability', 'kullback_leibler')
    if analysis not in supported:
        raise UnLuckyException(
            'No analysis function called {} could be found'.format(
                self.name))

    try:
        if analysis == 'functional_load':
            results = minpair_fl(
                self.kwargs['corpus'],
                self.kwargs['segment_pair'],
                stop_check=self.kwargs['stop_check'],
                call_back=self.kwargs['call_back'])
        elif analysis == 'string_similarity':
            results = string_similarity(
                self.kwargs['corpus'],
                self.kwargs['query'],
                self.kwargs['algorithm'],
                stop_check=self.kwargs['stop_check'],
                call_back=self.kwargs['call_back'])
        elif analysis == 'phonotactic_probability':
            results = phonotactic_probability_vitevitch(
                self.kwargs['corpus'],
                self.kwargs['query'],
                self.kwargs['sequence_type'],
                probability_type=self.kwargs['probability_type'],
                stop_check=self.kwargs['stop_check'],
                call_back=self.kwargs['call_back'])
        else:
            # Only 'kullback_leibler' is left after the guard above.
            results = KullbackLeibler(
                self.kwargs['corpus'],
                self.kwargs['seg1'],
                self.kwargs['seg2'],
                self.kwargs['side'],
                stop_check=self.kwargs['stop_check'],
                call_back=self.kwargs['call_back'])
        self.dataReady.emit(results)
    except Exception as e:
        self.errorEncountered.emit('{}:{}'.format(self.name, e))
def run(self):
    """Run a string-similarity analysis and report the outcome via signals.

    Parameters are taken from ``self.kwargs``: ``context``,
    ``corpusModel``, ``sequence_type``, ``type_token``, ``query`` and
    ``algorithm`` are consumed here; whatever remains is forwarded to
    ``string_similarity`` as keyword arguments.

    Emits ``errorEncountered`` when the analysis fails or the context
    value is not recognised, ``finishedCancelling`` when ``self.stopped``
    is set after the analysis, and ``dataReady`` with ``self.results``
    otherwise.
    """
    time.sleep(0.1)
    # Work on a copy: popping straight from self.kwargs would strip the
    # worker's stored parameters, so a second run() with the same kwargs
    # would fail with KeyError.
    kwargs = dict(self.kwargs)
    context = kwargs.pop("context")
    if context == RestrictedContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == RestrictedContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    else:
        # Without this branch ``cm`` stays unbound and the method dies with
        # an UnboundLocalError instead of using the error signal.
        self.errorEncountered.emit(PCTPythonError(ValueError(
            "Unrecognized variant context: {!r}".format(context))))
        return
    corpus = kwargs.pop("corpusModel").corpus
    st = kwargs.pop("sequence_type")
    tt = kwargs.pop("type_token")
    # The fourth positional argument of the context is passed as None here.
    with cm(corpus, st, tt, None) as c:
        try:
            query = kwargs.pop("query")
            alg = kwargs.pop("algorithm")
            self.results = string_similarity(c, query, alg, **kwargs)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            # Wrap unexpected errors so receivers always get a PCT error.
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(self.results)
def test_mass_relate_transcription_token(unspecified_test_corpus):
    """Check 'khorsi' scores on transcriptions with the 'token' setting.

    For each query word the reference rows are ordered by descending
    score and compared position by position with the rows returned by
    ``string_similarity``; scores must agree to within 0.0001.
    """
    corpus = unspecified_test_corpus
    reference = (
        ('atema', (
            ('atema', 12.10974787),
            ('enuta', -15.29756722),
            ('mashomisi', -16.05808867),
            ('mata', -8.574032654),
            ('nata', -6.823215263),
            ('sasi', -14.77671518),
            ('shashi', -13.71767966),
            ('shisata', -11.34309371),
            ('shushoma', -11.19329949),
            ('ta', -9.205644162),
            ('tatomi', -13.74726148),
            ('tishenishu', -23.12247048),
            ('toni', -15.1191937),
            ('tusa', -13.79217439),
            ('ʃi', -15.68503325),
        )),
        ('sasi', (
            ('atema', -14.77671518),
            ('enuta', -15.43519993),
            ('mashomisi', -13.96361833),
            ('mata', -11.58324408),
            ('nata', -11.67727303),
            ('sasi', 8.126877557),
            ('shashi', -9.734809346),
            ('shisata', -7.840021077),
            ('shushoma', -15.95332831),
            ('ta', -6.848974285),
            ('tatomi', -16.85050186),
            ('tishenishu', -20.51761446),
            ('toni', -12.51433768),
            ('tusa', -4.829191506),
            ('ʃi', -5.994066536),
        )),
    )
    for query, scored_words in reference:
        expected = [(corpus.find(query), corpus.find(other), score)
                    for other, score in scored_words]
        expected = sorted(expected, key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'transcription', 'token') as c:
            calced = string_similarity(c, corpus.find(query), 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
def test_mass_relate_transcription_type(unspecified_test_corpus):
    """Check 'khorsi' scores on transcriptions with the 'type' setting.

    For each query word the reference rows are ordered by descending
    score and compared position by position with the rows returned by
    ``string_similarity``; scores must agree to within 0.0001.
    """
    corpus = unspecified_test_corpus
    reference = (
        ('atema', (
            ('atema', 10.54988612),
            ('enuta', -13.35737022),
            ('mashomisi', -16.64202823),
            ('mata', -5.95476627),
            ('nata', -8.178638789),
            ('sasi', -14.85026877),
            ('shashi', -13.67469544),
            ('shisata', -12.0090178),
            ('shushoma', -12.51154463),
            ('ta', -8.296421824),
            ('tatomi', -13.01231991),
            ('tishenishu', -23.85818691),
            ('toni', -14.54716897),
            ('tusa', -13.85402179),
            ('ʃi', -14.60340869),
        )),
        ('sasi', (
            ('atema', -14.85026877),
            ('enuta', -16.64202823),
            ('mashomisi', -12.94778139),
            ('mata', -11.67221494),
            ('nata', -12.07768004),
            ('sasi', 8.812614836),
            ('shashi', -11.93742415),
            ('shisata', -7.90637444),
            ('shushoma', -18.22899329),
            ('ta', -7.683230889),
            ('tatomi', -16.91136117),
            ('tishenishu', -21.83498509),
            ('toni', -12.52396715),
            ('tusa', -5.239146233),
            ('ʃi', -6.943894326),
        )),
    )
    for query, scored_words in reference:
        expected = [(corpus.find(query), corpus.find(other), score)
                    for other, score in scored_words]
        expected = sorted(expected, key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'transcription', 'type') as c:
            calced = string_similarity(c, corpus.find(query), 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
def test_mass_relate_spelling_token(unspecified_test_corpus):
    """Check 'khorsi' scores on spellings with the 'token' setting.

    For each query word the reference rows are ordered by descending
    score and compared position by position with the rows returned by
    ``string_similarity``; scores must agree to within 0.0001.
    """
    corpus = unspecified_test_corpus
    reference = (
        ('atema', (
            ('atema', 12.9671688),
            ('enuta', -16.49795651),
            ('mashomisi', -17.65533907),
            ('mata', -7.337667817),
            ('nata', -9.088485208),
            ('sasi', -13.8251823),
            ('shashi', -17.52074498),
            ('shisata', -12.59737574),
            ('shushoma', -14.82488063),
            ('ta', -9.8915809),
            ('tatomi', -14.6046824),
            ('tishenishu', -27.61147254),
            ('toni', -16.14809881),
            ('tusa', -13.8308605),
            ('ʃi', -22.4838445),
        )),
        ('sasi', (
            ('atema', -13.8251823),
            ('enuta', -14.48366705),
            ('mashomisi', -16.62778969),
            ('mata', -10.46022702),
            ('nata', -10.55425597),
            ('sasi', 6.832376308),
            ('shashi', -7.235843913),
            ('shisata', -9.913037922),
            ('shushoma', -19.77169406),
            ('ta', -5.382988852),
            ('tatomi', -16.07045316),
            ('tishenishu', -24.92713472),
            ('toni', -11.39132061),
            ('tusa', -5.172159875),
            ('ʃi', -10.12650306),
        )),
    )
    for query, scored_words in reference:
        expected = [(corpus.find(query), corpus.find(other), score)
                    for other, score in scored_words]
        expected = sorted(expected, key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'spelling', 'token') as c:
            calced = string_similarity(c, corpus.find(query), 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
def test_mass_relate_spelling_type(unspecified_test_corpus):
    """Check 'khorsi' scores on spellings with the 'type' setting.

    For each query word the reference rows are ordered by descending
    score and compared position by position with the rows returned by
    ``string_similarity``; scores must agree to within 0.0001.
    """
    corpus = unspecified_test_corpus
    reference = (
        ('atema', (
            ('atema', 11.0766887),
            ('enuta', -14.09489383),
            ('mashomisi', -18.35890071),
            ('mata', -6.270847817),
            ('nata', -8.494720336),
            ('sasi', -13.57140897),
            ('shashi', -18.17657916),
            ('shisata', -13.51516925),
            ('shushoma', -16.90806783),
            ('ta', -8.717863887),
            ('tatomi', -13.53912249),
            ('tishenishu', -28.78151269),
            ('toni', -15.17933206),
            ('tusa', -13.53067344),
            ('ʃi', -17.53815687),
        )),
        ('sasi', (
            ('atema', -13.57140897),
            ('enuta', -15.36316844),
            ('mashomisi', -16.92481569),
            ('mata', -10.28799462),
            ('nata', -10.69345973),
            ('sasi', 7.323034009),
            ('shashi', -8.971692634),
            ('shisata', -10.26267682),
            ('shushoma', -20.30229654),
            ('ta', -6.088289546),
            ('tatomi', -15.73786189),
            ('tishenishu', -25.52902026),
            ('toni', -11.13974683),
            ('tusa', -5.449867265),
            ('ʃi', -7.54617756),
        )),
    )
    for query, scored_words in reference:
        expected = [(corpus.find(query), corpus.find(other), score)
                    for other, score in scored_words]
        expected = sorted(expected, key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'spelling', 'type') as c:
            calced = string_similarity(c, corpus.find(query), 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
def test_spelling(unspecified_test_corpus):
    """Check 'edit_distance' results on spellings against stored values.

    For each query word, both the reference rows and the rows returned
    by ``string_similarity`` are sorted on their second element (the
    compared word) and must then be equal row for row.
    """
    corpus = unspecified_test_corpus

    def compared_word(row):
        return row[1]

    reference = (
        ('atema', (
            ('atema', 0),
            ('enuta', 4),
            ('mashomisi', 7),
            ('mata', 3),
            ('nata', 3),
            ('sasi', 5),
            ('shashi', 6),
            ('shisata', 6),
            ('shushoma', 6),
            ('ta', 3),
            ('tatomi', 3),
            ('tishenishu', 9),
            ('toni', 4),
            ('tusa', 3),
            ('ʃi', 5),
        )),
        ('sasi', (
            ('atema', 5),
            ('enuta', 5),
            ('mashomisi', 6),
            ('mata', 3),
            ('nata', 3),
            ('sasi', 0),
            ('shashi', 2),
            ('shisata', 5),
            ('shushoma', 6),
            ('ta', 3),
            ('tatomi', 4),
            ('tishenishu', 8),
            ('toni', 3),
            ('tusa', 3),
            ('ʃi', 3),
        )),
    )
    for query, distances in reference:
        expected = [(corpus.find(query), corpus.find(other), distance)
                    for other, distance in distances]
        expected.sort(key=compared_word)
        with CanonicalVariantContext(corpus, 'spelling', 'type') as c:
            calced = string_similarity(c, corpus.find(query),
                                       'edit_distance')
        calced.sort(key=compared_word)
        for position, row in enumerate(expected):
            assert calced[position] == row
def run(self):
    """Run the analysis selected by ``self.name`` and emit its result.

    Each supported name maps to a deferred call that reads its arguments
    from ``self.kwargs``. On success the result is sent through
    ``dataReady``. Any exception raised while gathering arguments,
    running the analysis or emitting the result is reported through
    ``errorEncountered`` as ``'<name>:<error>'``.

    Raises
    ------
    UnLuckyException
        If ``self.name`` is not one of the supported analyses.
    """
    # The lambdas defer every kwargs lookup until inside the try below,
    # so a missing key is reported rather than raised.
    analyses = {
        'functional_load': lambda: minpair_fl(
            self.kwargs['corpus'],
            self.kwargs['segment_pair'],
            stop_check=self.kwargs['stop_check'],
            call_back=self.kwargs['call_back']),
        'string_similarity': lambda: string_similarity(
            self.kwargs['corpus'],
            self.kwargs['query'],
            self.kwargs['algorithm'],
            stop_check=self.kwargs['stop_check'],
            call_back=self.kwargs['call_back']),
        'phonotactic_probability': lambda: phonotactic_probability_vitevitch(
            self.kwargs['corpus'],
            self.kwargs['query'],
            self.kwargs['sequence_type'],
            probability_type=self.kwargs['probability_type'],
            stop_check=self.kwargs['stop_check'],
            call_back=self.kwargs['call_back']),
        'kullback_leibler': lambda: KullbackLeibler(
            self.kwargs['corpus'],
            self.kwargs['seg1'],
            self.kwargs['seg2'],
            self.kwargs['side'],
            stop_check=self.kwargs['stop_check'],
            call_back=self.kwargs['call_back']),
    }
    analysis = analyses.get(self.name)
    if analysis is None:
        raise UnLuckyException(
            'No analysis function called {} could be found'.format(self.name))

    try:
        results = analysis()
        self.dataReady.emit(results)
    except Exception as e:
        message = '{}:{}'.format(self.name, e)
        self.errorEncountered.emit(message)
def _differ_in_exactly_one_position(first, second):
    """Return True when two equal-length sequences differ at exactly one index."""
    if len(first) != len(second):
        return False
    count_diff = 0
    for i in range(len(first)):
        if first[i] != second[i]:
            count_diff += 1
            if count_diff > 1:
                # More than one mismatch: no need to look further.
                return False
    return count_diff == 1


def calc_freq_of_alt(corpus_context, seg1, seg2, algorithm,
                     output_filename=None, min_rel=None, max_rel=None,
                     phono_align=False, min_pairs_okay=False,
                     from_gui=False, stop_check=None, call_back=None):
    """Measure the frequency of alternation of two sounds in a corpus.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    seg1 : char
        A sound segment, e.g. 's', 't'
    seg2 : char
        A sound segment
    algorithm : string
        The string similarity algorithm
    output_filename : str, optional
        When given, the related word pairs are passed to
        ``print_freqalt_results`` together with this filename
    min_rel : double, optional
        Filters out all word pairs that score lower than min_rel on the
        relatedness measure
    max_rel : double, optional
        Filters out all word pairs that score higher than max_rel on the
        relatedness measure
    phono_align : bool, optional
        True means only count alternations that the phonological aligner
        judges to be related; defaults to not forcing alignment
    min_pairs_okay : bool, optional
        True means allow minimal pairs (e.g. in English, 's' and 't' do
        not alternate in minimal pairs, so allowing minimal pairs may
        skew results)
    from_gui : bool, optional
        Accepted for interface compatibility; not used by this function
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the
        function

    Returns
    -------
    tuple of (int, int, float) or None
        The number of words containing either segment, the number of
        words taking part in an alternation, and their ratio (the
        frequency of alternation). The ratio is 0.0 when no word contains
        either segment. ``None`` is returned when ``stop_check`` asks for
        early termination.
    """
    list_seg1 = []
    list_seg2 = []
    all_words = set()
    if call_back is not None:
        call_back('Finding instances of segments...')
        call_back(0, len(corpus_context))
        cur = 0
    for w in corpus_context:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 1000 == 0:
                call_back(cur)
        tier = getattr(w, corpus_context.sequence_type)
        if seg1 in tier:
            list_seg1.append(w)
            all_words.add(w.spelling)
        if seg2 in tier:
            list_seg2.append(w)
            all_words.add(w.spelling)

    if call_back is not None:
        call_back('Calculating string similarities...')
        call_back(0, len(list_seg1) * len(list_seg2))
        cur = 0
    related_list = []
    if phono_align:
        al = pam.Aligner(features=corpus_context.specifier)
    for w1 in list_seg1:
        for w2 in list_seg2:
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                if cur % 1000 == 0:
                    call_back(cur)
            if w1 == w2:
                continue
            ss = string_similarity(corpus_context, (w1, w2), algorithm)
            if min_rel is not None and ss[0][-1] < min_rel:
                continue
            if max_rel is not None and ss[0][-1] > max_rel:
                continue
            if not min_pairs_okay and _differ_in_exactly_one_position(
                    w1.transcription, w2.transcription):
                # Minimal pairs are excluded unless explicitly allowed.
                continue
            if phono_align:
                alignment = al.align(w1.transcription, w2.transcription)
                if not al.morpho_related(alignment, seg1, seg2):
                    continue
            related_list.append(ss[0])

    words_with_alt = set()
    if call_back is not None:
        call_back('Calculating frequency of alternation...')
        call_back(0, len(related_list))
        cur = 0
    for w1, w2, score in related_list:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 100 == 0:
                call_back(cur)
        words_with_alt.add(w1.spelling)
        words_with_alt.add(w2.spelling)

    # Sets guarantee that a word containing both seg1 and seg2 is counted
    # once. Guard the division: with no matching words at all the ratio
    # would otherwise raise ZeroDivisionError.
    if all_words:
        freq_of_alt = len(words_with_alt) / len(all_words)
    else:
        freq_of_alt = 0.0

    if output_filename:
        print_freqalt_results(output_filename, related_list)

    return len(all_words), len(words_with_alt), freq_of_alt
def test_mass_relate_transcription_token(unspecified_test_corpus):
    """Verify 'khorsi' similarity on transcriptions with the 'token' setting.

    The reference scores for the queries 'atema' and 'sasi' are ranked
    from highest to lowest and matched by position against the output of
    ``string_similarity`` with a tolerance of 0.0001.
    """
    find = unspecified_test_corpus.find

    def check(query, scored_words):
        expected = [(find(query), find(word), score)
                    for word, score in scored_words]
        expected.sort(key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(unspecified_test_corpus,
                                     'transcription', 'token') as c:
            calced = string_similarity(c, find(query), 'khorsi')
        for index, row in enumerate(expected):
            assert abs(calced[index][2] - row[2]) < 0.0001

    check('atema', [
        ('atema', 12.10974787),
        ('enuta', -15.29756722),
        ('mashomisi', -16.05808867),
        ('mata', -8.574032654),
        ('nata', -6.823215263),
        ('sasi', -14.77671518),
        ('shashi', -13.71767966),
        ('shisata', -11.34309371),
        ('shushoma', -11.19329949),
        ('ta', -9.205644162),
        ('tatomi', -13.74726148),
        ('tishenishu', -23.12247048),
        ('toni', -15.1191937),
        ('tusa', -13.79217439),
        ('ʃi', -15.68503325),
    ])
    check('sasi', [
        ('atema', -14.77671518),
        ('enuta', -15.43519993),
        ('mashomisi', -13.96361833),
        ('mata', -11.58324408),
        ('nata', -11.67727303),
        ('sasi', 8.126877557),
        ('shashi', -9.734809346),
        ('shisata', -7.840021077),
        ('shushoma', -15.95332831),
        ('ta', -6.848974285),
        ('tatomi', -16.85050186),
        ('tishenishu', -20.51761446),
        ('toni', -12.51433768),
        ('tusa', -4.829191506),
        ('ʃi', -5.994066536),
    ])
def test_mass_relate_transcription_type(unspecified_test_corpus):
    """Verify 'khorsi' similarity on transcriptions with the 'type' setting.

    The reference scores for the queries 'atema' and 'sasi' are ranked
    from highest to lowest and matched by position against the output of
    ``string_similarity`` with a tolerance of 0.0001.
    """
    find = unspecified_test_corpus.find

    def check(query, scored_words):
        expected = [(find(query), find(word), score)
                    for word, score in scored_words]
        expected.sort(key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(unspecified_test_corpus,
                                     'transcription', 'type') as c:
            calced = string_similarity(c, find(query), 'khorsi')
        for index, row in enumerate(expected):
            assert abs(calced[index][2] - row[2]) < 0.0001

    check('atema', [
        ('atema', 10.54988612),
        ('enuta', -13.35737022),
        ('mashomisi', -16.64202823),
        ('mata', -5.95476627),
        ('nata', -8.178638789),
        ('sasi', -14.85026877),
        ('shashi', -13.67469544),
        ('shisata', -12.0090178),
        ('shushoma', -12.51154463),
        ('ta', -8.296421824),
        ('tatomi', -13.01231991),
        ('tishenishu', -23.85818691),
        ('toni', -14.54716897),
        ('tusa', -13.85402179),
        ('ʃi', -14.60340869),
    ])
    check('sasi', [
        ('atema', -14.85026877),
        ('enuta', -16.64202823),
        ('mashomisi', -12.94778139),
        ('mata', -11.67221494),
        ('nata', -12.07768004),
        ('sasi', 8.812614836),
        ('shashi', -11.93742415),
        ('shisata', -7.90637444),
        ('shushoma', -18.22899329),
        ('ta', -7.683230889),
        ('tatomi', -16.91136117),
        ('tishenishu', -21.83498509),
        ('toni', -12.52396715),
        ('tusa', -5.239146233),
        ('ʃi', -6.943894326),
    ])
def test_mass_relate_spelling_token(unspecified_test_corpus):
    """Check 'khorsi' string similarity on spellings with token counts.

    For each query word ('atema', then 'sasi') the reference scores are
    ordered from highest to lowest, and the score (index 2) of the i-th
    result returned by ``string_similarity`` must lie within 0.0001 of the
    i-th reference score.
    """
    lookup = unspecified_test_corpus.find
    tolerance = 0.0001

    # (query word, [(compared word, reference score), ...])
    reference_scores = (
        ('atema', [
            ('atema', 12.9671688),
            ('enuta', -16.49795651),
            ('mashomisi', -17.65533907),
            ('mata', -7.337667817),
            ('nata', -9.088485208),
            ('sasi', -13.8251823),
            ('shashi', -17.52074498),
            ('shisata', -12.59737574),
            ('shushoma', -14.82488063),
            ('ta', -9.8915809),
            ('tatomi', -14.6046824),
            ('tishenishu', -27.61147254),
            ('toni', -16.14809881),
            ('tusa', -13.8308605),
            ('ʃi', -22.4838445),
        ]),
        ('sasi', [
            ('atema', -13.8251823),
            ('enuta', -14.48366705),
            ('mashomisi', -16.62778969),
            ('mata', -10.46022702),
            ('nata', -10.55425597),
            ('sasi', 6.832376308),
            ('shashi', -7.235843913),
            ('shisata', -9.913037922),
            ('shushoma', -19.77169406),
            ('ta', -5.382988852),
            ('tatomi', -16.07045316),
            ('tishenishu', -24.92713472),
            ('toni', -11.39132061),
            ('tusa', -5.172159875),
            ('ʃi', -10.12650306),
        ]),
    )

    for query, scored_words in reference_scores:
        # Every word is looked up in the corpus while the rows are built,
        # so a word missing from the fixture surfaces here.
        rows = [(lookup(query), lookup(other), score)
                for other, score in scored_words]
        # Ascending sort followed by a full reversal: highest score first.
        expected = sorted(rows, key=lambda row: row[2])[::-1]

        with CanonicalVariantContext(unspecified_test_corpus,
                                     'spelling', 'token') as c:
            calced = string_similarity(c, lookup(query), 'khorsi')

        for position, row in enumerate(expected):
            difference = calced[position][2] - row[2]
            assert abs(difference) < tolerance
def test_mass_relate_spelling_type(unspecified_test_corpus):
    """Check 'khorsi' string similarity on spellings with type counts.

    For each query word ('atema', then 'sasi') the reference scores are
    ordered from highest to lowest, and the score (index 2) of the i-th
    result returned by ``string_similarity`` must lie within 0.0001 of the
    i-th reference score.
    """
    lookup = unspecified_test_corpus.find
    tolerance = 0.0001

    # (query word, [(compared word, reference score), ...])
    reference_scores = (
        ('atema', [
            ('atema', 11.0766887),
            ('enuta', -14.09489383),
            ('mashomisi', -18.35890071),
            ('mata', -6.270847817),
            ('nata', -8.494720336),
            ('sasi', -13.57140897),
            ('shashi', -18.17657916),
            ('shisata', -13.51516925),
            ('shushoma', -16.90806783),
            ('ta', -8.717863887),
            ('tatomi', -13.53912249),
            ('tishenishu', -28.78151269),
            ('toni', -15.17933206),
            ('tusa', -13.53067344),
            ('ʃi', -17.53815687),
        ]),
        ('sasi', [
            ('atema', -13.57140897),
            ('enuta', -15.36316844),
            ('mashomisi', -16.92481569),
            ('mata', -10.28799462),
            ('nata', -10.69345973),
            ('sasi', 7.323034009),
            ('shashi', -8.971692634),
            ('shisata', -10.26267682),
            ('shushoma', -20.30229654),
            ('ta', -6.088289546),
            ('tatomi', -15.73786189),
            ('tishenishu', -25.52902026),
            ('toni', -11.13974683),
            ('tusa', -5.449867265),
            ('ʃi', -7.54617756),
        ]),
    )

    for query, scored_words in reference_scores:
        # Every word is looked up in the corpus while the rows are built,
        # so a word missing from the fixture surfaces here.
        rows = [(lookup(query), lookup(other), score)
                for other, score in scored_words]
        # Ascending sort followed by a full reversal: highest score first.
        expected = sorted(rows, key=lambda row: row[2])[::-1]

        with CanonicalVariantContext(unspecified_test_corpus,
                                     'spelling', 'type') as c:
            calced = string_similarity(c, lookup(query), 'khorsi')

        for position, row in enumerate(expected):
            difference = calced[position][2] - row[2]
            assert abs(difference) < tolerance