Ejemplo n.º 1
0
 def run(self):
     """Run a string-similarity query and report the outcome through signals.

     Reads its parameters from ``self.kwargs`` ('context', 'corpusModel',
     'sequence_type', 'type_token', 'frequency_cutoff', 'query' and
     'algorithm'); whatever other entries remain are forwarded to
     ``string_similarity`` as keyword arguments.

     Emits ``errorEncountered`` with a PCTError (other exceptions are wrapped
     in PCTPythonError) when the calculation fails, ``finishedCancelling``
     when ``self.stopped`` is set afterwards, and otherwise ``dataReady``
     with ``self.results``.
     """
     time.sleep(0.1)
     # Work on a copy: popping from self.kwargs itself strips the worker of
     # its parameters, so a second run() would fail looking up 'context'.
     kwargs = dict(self.kwargs)
     context = kwargs.pop('context')
     if context == RestrictedContextWidget.canonical_value:
         cm = CanonicalVariantContext
     elif context == RestrictedContextWidget.frequent_value:
         cm = MostFrequentVariantContext
     else:
         # Without this branch ``cm`` stays unbound and the method dies with
         # an UnboundLocalError instead of reporting through the signal.
         self.errorEncountered.emit(PCTPythonError(ValueError(
             'Unknown variant context: {!r}'.format(context))))
         return
     corpus = kwargs.pop('corpusModel').corpus
     st = kwargs.pop('sequence_type')
     tt = kwargs.pop('type_token')
     ft = kwargs.pop('frequency_cutoff')
     with cm(corpus, st, tt, frequency_threshold=ft) as c:
         try:
             query = kwargs.pop('query')
             alg = kwargs.pop('algorithm')
             self.results = string_similarity(c,
                                              query, alg, **kwargs)
         except PCTError as e:
             self.errorEncountered.emit(e)
             return
         except Exception as e:
             # Wrap unexpected errors so listeners always get a PCT error.
             e = PCTPythonError(e)
             self.errorEncountered.emit(e)
             return
     if self.stopped:
         self.finishedCancelling.emit()
         return
     self.dataReady.emit(self.results)
Ejemplo n.º 2
0
    def run(self):
        """Run the analysis selected by ``self.name`` and emit its outcome.

        The arguments for the analysis are read from ``self.kwargs``. On
        success the result is sent through ``dataReady``; any exception is
        reported through ``errorEncountered`` as the text '<name>:<error>'.

        Raises
        ------
        UnLuckyException
            If ``self.name`` is not one of the supported analyses.
        """
        def attempt(compute):
            # One shared success/failure path for every analysis.
            try:
                results = compute(self.kwargs)
                self.dataReady.emit(results)
            except Exception as e:
                message = '{}:{}'.format(self.name, e)
                self.errorEncountered.emit(message)

        if self.name == 'functional_load':
            attempt(lambda kw: minpair_fl(
                kw['corpus'],
                kw['segment_pair'],
                stop_check=kw['stop_check'],
                call_back=kw['call_back']))
        elif self.name == 'string_similarity':
            attempt(lambda kw: string_similarity(
                kw['corpus'],
                kw['query'],
                kw['algorithm'],
                stop_check=kw['stop_check'],
                call_back=kw['call_back']))
        elif self.name == 'phonotactic_probability':
            attempt(lambda kw: phonotactic_probability_vitevitch(
                kw['corpus'],
                kw['query'],
                kw['sequence_type'],
                probability_type=kw['probability_type'],
                stop_check=kw['stop_check'],
                call_back=kw['call_back']))
        elif self.name == 'kullback_leibler':
            attempt(lambda kw: KullbackLeibler(
                kw['corpus'],
                kw['seg1'],
                kw['seg2'],
                kw['side'],
                stop_check=kw['stop_check'],
                call_back=kw['call_back']))
        else:
            raise UnLuckyException(
                'No analysis function called {} could be found'.format(
                    self.name))
Ejemplo n.º 3
0
 def run(self):
     """Run a string-similarity query and report the outcome through signals.

     Reads its parameters from ``self.kwargs`` ("context", "corpusModel",
     "sequence_type", "type_token", "query" and "algorithm"); whatever other
     entries remain are forwarded to ``string_similarity`` as keyword
     arguments.

     Emits ``errorEncountered`` with a PCTError (other exceptions are wrapped
     in PCTPythonError) when the calculation fails, ``finishedCancelling``
     when ``self.stopped`` is set afterwards, and otherwise ``dataReady``
     with ``self.results``.
     """
     time.sleep(0.1)
     # Work on a copy: popping from self.kwargs itself strips the worker of
     # its parameters, so a second run() would fail looking up "context".
     kwargs = dict(self.kwargs)
     context = kwargs.pop("context")
     if context == RestrictedContextWidget.canonical_value:
         cm = CanonicalVariantContext
     elif context == RestrictedContextWidget.frequent_value:
         cm = MostFrequentVariantContext
     else:
         # Without this branch ``cm`` stays unbound and the method dies with
         # an UnboundLocalError instead of reporting through the signal.
         self.errorEncountered.emit(PCTPythonError(ValueError(
             "Unknown variant context: {!r}".format(context))))
         return
     corpus = kwargs.pop("corpusModel").corpus
     st = kwargs.pop("sequence_type")
     tt = kwargs.pop("type_token")
     # The fourth positional argument of the context is passed as None here.
     with cm(corpus, st, tt, None) as c:
         try:
             query = kwargs.pop("query")
             alg = kwargs.pop("algorithm")
             self.results = string_similarity(c, query, alg, **kwargs)
         except PCTError as e:
             self.errorEncountered.emit(e)
             return
         except Exception as e:
             # Wrap unexpected errors so listeners always get a PCT error.
             e = PCTPythonError(e)
             self.errorEncountered.emit(e)
             return
     if self.stopped:
         self.finishedCancelling.emit()
         return
     self.dataReady.emit(self.results)
Ejemplo n.º 4
0
 def run(self):
     """Run a string-similarity query and report the outcome through signals.

     Reads its parameters from ``self.kwargs`` ('context', 'corpusModel',
     'sequence_type', 'type_token', 'frequency_cutoff', 'query' and
     'algorithm'); whatever other entries remain are forwarded to
     ``string_similarity`` as keyword arguments.

     Emits ``errorEncountered`` with a PCTError (other exceptions are wrapped
     in PCTPythonError) when the calculation fails, ``finishedCancelling``
     when ``self.stopped`` is set afterwards, and otherwise ``dataReady``
     with ``self.results``.
     """
     time.sleep(0.1)
     # Copy first: popping from self.kwargs directly would leave the worker
     # without its parameters, breaking any later run() at the 'context' pop.
     kwargs = dict(self.kwargs)
     context = kwargs.pop('context')
     if context == RestrictedContextWidget.canonical_value:
         cm = CanonicalVariantContext
     elif context == RestrictedContextWidget.frequent_value:
         cm = MostFrequentVariantContext
     else:
         # An unrecognised context used to leave ``cm`` unbound, crashing
         # with UnboundLocalError rather than reporting via the signal.
         self.errorEncountered.emit(PCTPythonError(ValueError(
             'Unknown variant context: {!r}'.format(context))))
         return
     corpus = kwargs.pop('corpusModel').corpus
     st = kwargs.pop('sequence_type')
     tt = kwargs.pop('type_token')
     ft = kwargs.pop('frequency_cutoff')
     with cm(corpus, st, tt, frequency_threshold=ft) as c:
         try:
             query = kwargs.pop('query')
             alg = kwargs.pop('algorithm')
             self.results = string_similarity(c,
                                              query, alg, **kwargs)
         except PCTError as e:
             self.errorEncountered.emit(e)
             return
         except Exception as e:
             # Wrap unexpected errors so listeners always get a PCT error.
             e = PCTPythonError(e)
             self.errorEncountered.emit(e)
             return
     if self.stopped:
         self.finishedCancelling.emit()
         return
     self.dataReady.emit(self.results)
def test_mass_relate_transcription_token(unspecified_test_corpus):
    """Check 'khorsi' scores on transcriptions with token frequencies.

    For each query word the stored scores are ranked from highest to lowest
    and compared, position by position, with the scores returned by
    ``string_similarity`` (tolerance 0.0001).
    """
    corpus = unspecified_test_corpus
    reference_scores = (
        ('atema', (
            ('atema', 12.10974787),
            ('enuta', -15.29756722),
            ('mashomisi', -16.05808867),
            ('mata', -8.574032654),
            ('nata', -6.823215263),
            ('sasi', -14.77671518),
            ('shashi', -13.71767966),
            ('shisata', -11.34309371),
            ('shushoma', -11.19329949),
            ('ta', -9.205644162),
            ('tatomi', -13.74726148),
            ('tishenishu', -23.12247048),
            ('toni', -15.1191937),
            ('tusa', -13.79217439),
            ('ʃi', -15.68503325),
        )),
        ('sasi', (
            ('atema', -14.77671518),
            ('enuta', -15.43519993),
            ('mashomisi', -13.96361833),
            ('mata', -11.58324408),
            ('nata', -11.67727303),
            ('sasi', 8.126877557),
            ('shashi', -9.734809346),
            ('shisata', -7.840021077),
            ('shushoma', -15.95332831),
            ('ta', -6.848974285),
            ('tatomi', -16.85050186),
            ('tishenishu', -20.51761446),
            ('toni', -12.51433768),
            ('tusa', -4.829191506),
            ('ʃi', -5.994066536),
        )),
    )
    for query_spelling, rows in reference_scores:
        query = corpus.find(query_spelling)
        # Highest score first; only the score column is compared below.
        expected = sorted(
            ((query, corpus.find(spelling), score) for spelling, score in rows),
            key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'transcription', 'token') as ctx:
            calced = string_similarity(ctx, query, 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
def test_mass_relate_transcription_type(unspecified_test_corpus):
    """Check 'khorsi' scores on transcriptions with type frequencies.

    For each query word the stored scores are ranked from highest to lowest
    and compared, position by position, with the scores returned by
    ``string_similarity`` (tolerance 0.0001).
    """
    corpus = unspecified_test_corpus
    reference_scores = (
        ('atema', (
            ('atema', 10.54988612),
            ('enuta', -13.35737022),
            ('mashomisi', -16.64202823),
            ('mata', -5.95476627),
            ('nata', -8.178638789),
            ('sasi', -14.85026877),
            ('shashi', -13.67469544),
            ('shisata', -12.0090178),
            ('shushoma', -12.51154463),
            ('ta', -8.296421824),
            ('tatomi', -13.01231991),
            ('tishenishu', -23.85818691),
            ('toni', -14.54716897),
            ('tusa', -13.85402179),
            ('ʃi', -14.60340869),
        )),
        ('sasi', (
            ('atema', -14.85026877),
            ('enuta', -16.64202823),
            ('mashomisi', -12.94778139),
            ('mata', -11.67221494),
            ('nata', -12.07768004),
            ('sasi', 8.812614836),
            ('shashi', -11.93742415),
            ('shisata', -7.90637444),
            ('shushoma', -18.22899329),
            ('ta', -7.683230889),
            ('tatomi', -16.91136117),
            ('tishenishu', -21.83498509),
            ('toni', -12.52396715),
            ('tusa', -5.239146233),
            ('ʃi', -6.943894326),
        )),
    )
    for query_spelling, rows in reference_scores:
        query = corpus.find(query_spelling)
        # Highest score first; only the score column is compared below.
        expected = sorted(
            ((query, corpus.find(spelling), score) for spelling, score in rows),
            key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'transcription', 'type') as ctx:
            calced = string_similarity(ctx, query, 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
def test_mass_relate_spelling_token(unspecified_test_corpus):
    """Check 'khorsi' scores on spellings with token frequencies.

    For each query word the stored scores are ranked from highest to lowest
    and compared, position by position, with the scores returned by
    ``string_similarity`` (tolerance 0.0001).
    """
    corpus = unspecified_test_corpus
    reference_scores = (
        ('atema', (
            ('atema', 12.9671688),
            ('enuta', -16.49795651),
            ('mashomisi', -17.65533907),
            ('mata', -7.337667817),
            ('nata', -9.088485208),
            ('sasi', -13.8251823),
            ('shashi', -17.52074498),
            ('shisata', -12.59737574),
            ('shushoma', -14.82488063),
            ('ta', -9.8915809),
            ('tatomi', -14.6046824),
            ('tishenishu', -27.61147254),
            ('toni', -16.14809881),
            ('tusa', -13.8308605),
            ('ʃi', -22.4838445),
        )),
        ('sasi', (
            ('atema', -13.8251823),
            ('enuta', -14.48366705),
            ('mashomisi', -16.62778969),
            ('mata', -10.46022702),
            ('nata', -10.55425597),
            ('sasi', 6.832376308),
            ('shashi', -7.235843913),
            ('shisata', -9.913037922),
            ('shushoma', -19.77169406),
            ('ta', -5.382988852),
            ('tatomi', -16.07045316),
            ('tishenishu', -24.92713472),
            ('toni', -11.39132061),
            ('tusa', -5.172159875),
            ('ʃi', -10.12650306),
        )),
    )
    for query_spelling, rows in reference_scores:
        query = corpus.find(query_spelling)
        # Highest score first; only the score column is compared below.
        expected = sorted(
            ((query, corpus.find(spelling), score) for spelling, score in rows),
            key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'spelling', 'token') as ctx:
            calced = string_similarity(ctx, query, 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
def test_mass_relate_spelling_type(unspecified_test_corpus):
    """Check 'khorsi' scores on spellings with type frequencies.

    For each query word the stored scores are ranked from highest to lowest
    and compared, position by position, with the scores returned by
    ``string_similarity`` (tolerance 0.0001).
    """
    corpus = unspecified_test_corpus
    reference_scores = (
        ('atema', (
            ('atema', 11.0766887),
            ('enuta', -14.09489383),
            ('mashomisi', -18.35890071),
            ('mata', -6.270847817),
            ('nata', -8.494720336),
            ('sasi', -13.57140897),
            ('shashi', -18.17657916),
            ('shisata', -13.51516925),
            ('shushoma', -16.90806783),
            ('ta', -8.717863887),
            ('tatomi', -13.53912249),
            ('tishenishu', -28.78151269),
            ('toni', -15.17933206),
            ('tusa', -13.53067344),
            ('ʃi', -17.53815687),
        )),
        ('sasi', (
            ('atema', -13.57140897),
            ('enuta', -15.36316844),
            ('mashomisi', -16.92481569),
            ('mata', -10.28799462),
            ('nata', -10.69345973),
            ('sasi', 7.323034009),
            ('shashi', -8.971692634),
            ('shisata', -10.26267682),
            ('shushoma', -20.30229654),
            ('ta', -6.088289546),
            ('tatomi', -15.73786189),
            ('tishenishu', -25.52902026),
            ('toni', -11.13974683),
            ('tusa', -5.449867265),
            ('ʃi', -7.54617756),
        )),
    )
    for query_spelling, rows in reference_scores:
        query = corpus.find(query_spelling)
        # Highest score first; only the score column is compared below.
        expected = sorted(
            ((query, corpus.find(spelling), score) for spelling, score in rows),
            key=lambda row: row[2], reverse=True)
        with CanonicalVariantContext(corpus, 'spelling', 'type') as ctx:
            calced = string_similarity(ctx, query, 'khorsi')
        for rank, row in enumerate(expected):
            assert abs(calced[rank][2] - row[2]) < 0.0001
Ejemplo n.º 9
0
def test_spelling(unspecified_test_corpus):
    """Check 'edit_distance' results on spellings against stored distances.

    Expected and computed triples are both ordered by their second element
    (the compared word) and must then be equal position by position.
    """
    corpus = unspecified_test_corpus
    reference_distances = (
        ('atema', (
            ('atema', 0),
            ('enuta', 4),
            ('mashomisi', 7),
            ('mata', 3),
            ('nata', 3),
            ('sasi', 5),
            ('shashi', 6),
            ('shisata', 6),
            ('shushoma', 6),
            ('ta', 3),
            ('tatomi', 3),
            ('tishenishu', 9),
            ('toni', 4),
            ('tusa', 3),
            ('ʃi', 5),
        )),
        ('sasi', (
            ('atema', 5),
            ('enuta', 5),
            ('mashomisi', 6),
            ('mata', 3),
            ('nata', 3),
            ('sasi', 0),
            ('shashi', 2),
            ('shisata', 5),
            ('shushoma', 6),
            ('ta', 3),
            ('tatomi', 4),
            ('tishenishu', 8),
            ('toni', 3),
            ('tusa', 3),
            ('ʃi', 3),
        )),
    )
    for query_spelling, rows in reference_distances:
        query = corpus.find(query_spelling)
        expected = sorted(
            [(query, corpus.find(spelling), distance)
             for spelling, distance in rows],
            key=lambda row: row[1])
        with CanonicalVariantContext(corpus, 'spelling', 'type') as ctx:
            calced = string_similarity(ctx, query, 'edit_distance')
        # Align the computed rows with the expected ones before comparing.
        calced.sort(key=lambda row: row[1])
        for position, row in enumerate(expected):
            assert calced[position] == row
Ejemplo n.º 10
0
    def run(self):
        """Dispatch on ``self.name`` to one analysis and emit what it yields.

        Arguments come from ``self.kwargs``. A successful result goes out on
        ``dataReady``; any exception is turned into the text '<name>:<error>'
        and sent on ``errorEncountered``.

        Raises
        ------
        UnLuckyException
            If ``self.name`` matches none of the known analyses.
        """
        def emit_outcome(analysis):
            # Shared wrapper: every analysis reports success and failure alike.
            try:
                results = analysis(self.kwargs)
                self.dataReady.emit(results)
            except Exception as e:
                message = '{}:{}'.format(self.name,e)
                self.errorEncountered.emit(message)

        if self.name == 'functional_load':
            emit_outcome(lambda params: minpair_fl(
                params['corpus'], params['segment_pair'],
                stop_check=params['stop_check'],
                call_back=params['call_back']))

        elif self.name == 'string_similarity':
            emit_outcome(lambda params: string_similarity(
                params['corpus'], params['query'], params['algorithm'],
                stop_check=params['stop_check'],
                call_back=params['call_back']))

        elif self.name == 'phonotactic_probability':
            emit_outcome(lambda params: phonotactic_probability_vitevitch(
                params['corpus'], params['query'],
                params['sequence_type'],
                probability_type=params['probability_type'],
                stop_check=params['stop_check'],
                call_back=params['call_back']))

        elif self.name == 'kullback_leibler':
            emit_outcome(lambda params: KullbackLeibler(
                params['corpus'], params['seg1'], params['seg2'],
                params['side'],
                stop_check=params['stop_check'],
                call_back=params['call_back']))

        else:
            raise UnLuckyException('No analysis function called {} could be found'.format(self.name))
Ejemplo n.º 11
0
def calc_freq_of_alt(corpus_context, seg1, seg2, algorithm, output_filename=None,
                     min_rel=None, max_rel=None, phono_align=False,
                     min_pairs_okay=False, from_gui=False, stop_check=None,
                     call_back=None):
    """Measure the frequency of alternation of two sounds in a corpus.

    Words containing ``seg1`` are paired with words containing ``seg2``;
    a pair counts as an alternation when it passes the relatedness,
    minimal-pair and (optional) phonological-alignment filters.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    seg1: char
        A sound segment, e.g. 's', 't'
    seg2: char
        A sound segment
    algorithm: string
        The string similarity algorithm
    output_filename : str, optional
        If given, the list of related pairs is passed to
        ``print_freqalt_results`` together with this filename
    min_rel: double
        Filters out all words that are lower than min_rel from a relatedness measure
    max_rel: double
        Filters out all words that are higher than max_rel from a relatedness measure
    phono_align: boolean (1 or 0), optional
        1 means 'only count alternations that are likely phonologically aligned,'
        defaults to not force phonological alignment
    min_pairs_okay: bool, optional
        True means allow minimal pairs (e.g. in English, 's' and 't' do not
        alternate in minimal pairs,
        so allowing minimal pairs may skew results)
    from_gui : bool, optional
        Accepted for interface compatibility; not used by this function
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    tuple of (int, int, float) or None
        ``(number of words containing either segment, number of those words
        that take part in an alternation, their ratio)``. The ratio is 0.0
        when no word contains either segment. ``None`` is returned when
        ``stop_check`` requests early termination.
    """

    list_seg1 = []
    list_seg2 = []
    all_words = set()
    if call_back is not None:
        call_back('Finding instances of segments...')
        call_back(0, len(corpus_context))
        cur = 0
    for w in corpus_context:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 1000 == 0:
                call_back(cur)
        tier = getattr(w, corpus_context.sequence_type)
        if seg1 in tier:
            list_seg1.append(w)
            all_words.add(w.spelling)
        if seg2 in tier:
            list_seg2.append(w)
            all_words.add(w.spelling)

    if call_back is not None:
        call_back('Calculating string similarities...')
        call_back(0, len(list_seg1) * len(list_seg2))
        cur = 0
    related_list = []
    if phono_align:
        al = pam.Aligner(features=corpus_context.specifier)
    for w1 in list_seg1:
        for w2 in list_seg2:
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                if cur % 1000 == 0:
                    call_back(cur)
            if w1 == w2:
                continue
            ss = string_similarity(corpus_context, (w1, w2), algorithm)
            # The similarity score is the last element of the first result.
            if min_rel is not None and ss[0][-1] < min_rel:
                continue
            if max_rel is not None and ss[0][-1] > max_rel:
                continue
            if not min_pairs_okay:
                # Skip minimal pairs: equal-length transcriptions that
                # differ in exactly one position.
                if len(w1.transcription) == len(w2.transcription):
                    count_diff = 0
                    for i in range(len(w1.transcription)):
                        if w1.transcription[i] != w2.transcription[i]:
                            count_diff += 1
                            if count_diff > 1:
                                break
                    if count_diff == 1:
                        continue
            if phono_align:
                alignment = al.align(w1.transcription, w2.transcription)
                if not al.morpho_related(alignment, seg1, seg2):
                    continue

            related_list.append(ss[0])

    words_with_alt = set()
    if call_back is not None:
        call_back('Calculating frequency of alternation...')
        call_back(0, len(related_list))
        cur = 0
    for w1, w2, score in related_list:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 100 == 0:
                call_back(cur)
        words_with_alt.add(w1.spelling)
        words_with_alt.add(w2.spelling)

    # Sets guarantee that a word containing both seg1 and seg2 is counted
    # only once. When neither segment occurs anywhere the denominator is
    # zero; report a frequency of 0.0 instead of raising ZeroDivisionError.
    if all_words:
        freq_of_alt = len(words_with_alt) / len(all_words)
    else:
        freq_of_alt = 0.0

    if output_filename:
        print_freqalt_results(output_filename, related_list)

    return len(all_words), len(words_with_alt), freq_of_alt
Ejemplo n.º 12
0
def test_mass_relate_transcription_token(unspecified_test_corpus):
    """Check Khorsi scores on transcriptions with token frequencies.

    For each query word ('atema', then 'sasi') the reference scores are
    ordered from highest to lowest and compared position by position with
    the third element of each entry returned by ``string_similarity``,
    using an absolute tolerance of 0.0001.
    """
    corpus = unspecified_test_corpus

    def check_ranking(query_spelling, reference_scores):
        # Every reference word is looked up in the corpus, so a word that is
        # missing from the fixture surfaces here, before any comparison.
        query_word = corpus.find(query_spelling)
        expected = sorted(
            [(query_word, corpus.find(spelling), score)
             for spelling, score in reference_scores],
            key=lambda entry: entry[2],
            reverse=True)
        with CanonicalVariantContext(corpus, 'transcription',
                                     'token') as context:
            calced = string_similarity(context, corpus.find(query_spelling),
                                       'khorsi')
        # Index into the result directly: a result shorter than the
        # reference list must raise instead of being silently truncated.
        for rank, entry in enumerate(expected):
            assert abs(calced[rank][2] - entry[2]) < 0.0001

    check_ranking('atema', [
        ('atema', 12.10974787),
        ('enuta', -15.29756722),
        ('mashomisi', -16.05808867),
        ('mata', -8.574032654),
        ('nata', -6.823215263),
        ('sasi', -14.77671518),
        ('shashi', -13.71767966),
        ('shisata', -11.34309371),
        ('shushoma', -11.19329949),
        ('ta', -9.205644162),
        ('tatomi', -13.74726148),
        ('tishenishu', -23.12247048),
        ('toni', -15.1191937),
        ('tusa', -13.79217439),
        ('ʃi', -15.68503325),
    ])

    check_ranking('sasi', [
        ('atema', -14.77671518),
        ('enuta', -15.43519993),
        ('mashomisi', -13.96361833),
        ('mata', -11.58324408),
        ('nata', -11.67727303),
        ('sasi', 8.126877557),
        ('shashi', -9.734809346),
        ('shisata', -7.840021077),
        ('shushoma', -15.95332831),
        ('ta', -6.848974285),
        ('tatomi', -16.85050186),
        ('tishenishu', -20.51761446),
        ('toni', -12.51433768),
        ('tusa', -4.829191506),
        ('ʃi', -5.994066536),
    ])
Ejemplo n.º 13
0
def test_mass_relate_transcription_type(unspecified_test_corpus):
    """Check Khorsi scores on transcriptions with type frequencies.

    For each query word ('atema', then 'sasi') the reference scores are
    ordered from highest to lowest and compared position by position with
    the third element of each entry returned by ``string_similarity``,
    using an absolute tolerance of 0.0001.
    """
    corpus = unspecified_test_corpus

    def check_ranking(query_spelling, reference_scores):
        # Every reference word is looked up in the corpus, so a word that is
        # missing from the fixture surfaces here, before any comparison.
        query_word = corpus.find(query_spelling)
        expected = sorted(
            [(query_word, corpus.find(spelling), score)
             for spelling, score in reference_scores],
            key=lambda entry: entry[2],
            reverse=True)
        with CanonicalVariantContext(corpus, 'transcription',
                                     'type') as context:
            calced = string_similarity(context, corpus.find(query_spelling),
                                       'khorsi')
        # Index into the result directly: a result shorter than the
        # reference list must raise instead of being silently truncated.
        for rank, entry in enumerate(expected):
            assert abs(calced[rank][2] - entry[2]) < 0.0001

    check_ranking('atema', [
        ('atema', 10.54988612),
        ('enuta', -13.35737022),
        ('mashomisi', -16.64202823),
        ('mata', -5.95476627),
        ('nata', -8.178638789),
        ('sasi', -14.85026877),
        ('shashi', -13.67469544),
        ('shisata', -12.0090178),
        ('shushoma', -12.51154463),
        ('ta', -8.296421824),
        ('tatomi', -13.01231991),
        ('tishenishu', -23.85818691),
        ('toni', -14.54716897),
        ('tusa', -13.85402179),
        ('ʃi', -14.60340869),
    ])

    check_ranking('sasi', [
        ('atema', -14.85026877),
        ('enuta', -16.64202823),
        ('mashomisi', -12.94778139),
        ('mata', -11.67221494),
        ('nata', -12.07768004),
        ('sasi', 8.812614836),
        ('shashi', -11.93742415),
        ('shisata', -7.90637444),
        ('shushoma', -18.22899329),
        ('ta', -7.683230889),
        ('tatomi', -16.91136117),
        ('tishenishu', -21.83498509),
        ('toni', -12.52396715),
        ('tusa', -5.239146233),
        ('ʃi', -6.943894326),
    ])
Ejemplo n.º 14
0
def test_mass_relate_spelling_token(unspecified_test_corpus):
    """Check Khorsi scores on spellings with token frequencies.

    For each query word ('atema', then 'sasi') the reference scores are
    ordered from highest to lowest and compared position by position with
    the third element of each entry returned by ``string_similarity``,
    using an absolute tolerance of 0.0001.
    """
    corpus = unspecified_test_corpus

    def check_ranking(query_spelling, reference_scores):
        # Every reference word is looked up in the corpus, so a word that is
        # missing from the fixture surfaces here, before any comparison.
        query_word = corpus.find(query_spelling)
        expected = sorted(
            [(query_word, corpus.find(spelling), score)
             for spelling, score in reference_scores],
            key=lambda entry: entry[2],
            reverse=True)
        with CanonicalVariantContext(corpus, 'spelling',
                                     'token') as context:
            calced = string_similarity(context, corpus.find(query_spelling),
                                       'khorsi')
        # Index into the result directly: a result shorter than the
        # reference list must raise instead of being silently truncated.
        for rank, entry in enumerate(expected):
            assert abs(calced[rank][2] - entry[2]) < 0.0001

    check_ranking('atema', [
        ('atema', 12.9671688),
        ('enuta', -16.49795651),
        ('mashomisi', -17.65533907),
        ('mata', -7.337667817),
        ('nata', -9.088485208),
        ('sasi', -13.8251823),
        ('shashi', -17.52074498),
        ('shisata', -12.59737574),
        ('shushoma', -14.82488063),
        ('ta', -9.8915809),
        ('tatomi', -14.6046824),
        ('tishenishu', -27.61147254),
        ('toni', -16.14809881),
        ('tusa', -13.8308605),
        ('ʃi', -22.4838445),
    ])

    check_ranking('sasi', [
        ('atema', -13.8251823),
        ('enuta', -14.48366705),
        ('mashomisi', -16.62778969),
        ('mata', -10.46022702),
        ('nata', -10.55425597),
        ('sasi', 6.832376308),
        ('shashi', -7.235843913),
        ('shisata', -9.913037922),
        ('shushoma', -19.77169406),
        ('ta', -5.382988852),
        ('tatomi', -16.07045316),
        ('tishenishu', -24.92713472),
        ('toni', -11.39132061),
        ('tusa', -5.172159875),
        ('ʃi', -10.12650306),
    ])
Ejemplo n.º 15
0
def test_mass_relate_spelling_type(unspecified_test_corpus):
    """Check Khorsi scores on spellings with type frequencies.

    For each query word ('atema', then 'sasi') the reference scores are
    ordered from highest to lowest and compared position by position with
    the third element of each entry returned by ``string_similarity``,
    using an absolute tolerance of 0.0001.
    """
    corpus = unspecified_test_corpus

    def check_ranking(query_spelling, reference_scores):
        # Every reference word is looked up in the corpus, so a word that is
        # missing from the fixture surfaces here, before any comparison.
        query_word = corpus.find(query_spelling)
        expected = sorted(
            [(query_word, corpus.find(spelling), score)
             for spelling, score in reference_scores],
            key=lambda entry: entry[2],
            reverse=True)
        with CanonicalVariantContext(corpus, 'spelling',
                                     'type') as context:
            calced = string_similarity(context, corpus.find(query_spelling),
                                       'khorsi')
        # Index into the result directly: a result shorter than the
        # reference list must raise instead of being silently truncated.
        for rank, entry in enumerate(expected):
            assert abs(calced[rank][2] - entry[2]) < 0.0001

    check_ranking('atema', [
        ('atema', 11.0766887),
        ('enuta', -14.09489383),
        ('mashomisi', -18.35890071),
        ('mata', -6.270847817),
        ('nata', -8.494720336),
        ('sasi', -13.57140897),
        ('shashi', -18.17657916),
        ('shisata', -13.51516925),
        ('shushoma', -16.90806783),
        ('ta', -8.717863887),
        ('tatomi', -13.53912249),
        ('tishenishu', -28.78151269),
        ('toni', -15.17933206),
        ('tusa', -13.53067344),
        ('ʃi', -17.53815687),
    ])

    check_ranking('sasi', [
        ('atema', -13.57140897),
        ('enuta', -15.36316844),
        ('mashomisi', -16.92481569),
        ('mata', -10.28799462),
        ('nata', -10.69345973),
        ('sasi', 7.323034009),
        ('shashi', -8.971692634),
        ('shisata', -10.26267682),
        ('shushoma', -20.30229654),
        ('ta', -6.088289546),
        ('tatomi', -15.73786189),
        ('tishenishu', -25.52902026),
        ('toni', -11.13974683),
        ('tusa', -5.449867265),
        ('ʃi', -7.54617756),
    ])