def run(self):
    if self.name == 'functional_load':
        try:
            results = minpair_fl(self.kwargs['corpus'],
                                 self.kwargs['segment_pair'],
                                 stop_check=self.kwargs['stop_check'],
                                 call_back=self.kwargs['call_back'])
            self.dataReady.emit(results)
        except Exception as e:
            message = '{}:{}'.format(self.name, e)
            self.errorEncountered.emit(message)
            return
    elif self.name == 'string_similarity':
        try:
            results = string_similarity(self.kwargs['corpus'],
                                        self.kwargs['query'],
                                        self.kwargs['algorithm'],
                                        stop_check=self.kwargs['stop_check'],
                                        call_back=self.kwargs['call_back'])
            self.dataReady.emit(results)
        except Exception as e:
            message = '{}:{}'.format(self.name, e)
            self.errorEncountered.emit(message)
            return
    elif self.name == 'phonotactic_probability':
        try:
            results = phonotactic_probability_vitevitch(self.kwargs['corpus'],
                                                        self.kwargs['query'],
                                                        self.kwargs['sequence_type'],
                                                        probability_type=self.kwargs['probability_type'],
                                                        stop_check=self.kwargs['stop_check'],
                                                        call_back=self.kwargs['call_back'])
            self.dataReady.emit(results)
        except Exception as e:
            message = '{}:{}'.format(self.name, e)
            self.errorEncountered.emit(message)
            return
    elif self.name == 'kullback_leibler':
        try:
            results = KullbackLeibler(self.kwargs['corpus'],
                                      self.kwargs['seg1'],
                                      self.kwargs['seg2'],
                                      self.kwargs['side'],
                                      stop_check=self.kwargs['stop_check'],
                                      call_back=self.kwargs['call_back'])
            self.dataReady.emit(results)
        except Exception as e:
            message = '{}:{}'.format(self.name, e)
            self.errorEncountered.emit(message)
            return
    else:
        raise UnLuckyException(
            'No analysis function called {} could be found'.format(self.name))
def run(self):
    time.sleep(0.1)
    kwargs = self.kwargs
    self.results = []
    context = kwargs.pop('context')
    if context == ContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == ContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    elif context == ContextWidget.separate_value:
        cm = SeparatedTokensVariantContext
    elif context == ContextWidget.relative_value:
        cm = WeightedVariantContext
    with cm(kwargs['corpus'], kwargs['sequence_type'], kwargs['type_token'],
            frequency_threshold=kwargs['frequency_cutoff']) as c:
        try:
            for pair in kwargs['segment_pairs']:
                res = KullbackLeibler(c, pair[0], pair[1],
                                      outfile=None,
                                      side=kwargs['side'],
                                      stop_check=kwargs['stop_check'],
                                      call_back=kwargs['call_back'])
                if self.stopped:
                    break
                self.results.append(res)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(self.results)
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(
        description='Phonological CorpusTools: Kullback-Leibler CL interface')
    parser.add_argument('corpus_file_name',
                        help='Path to corpus file. This can just be the file name if it\'s in the same directory as CorpusTools')
    parser.add_argument('seg1', help='First segment')
    parser.add_argument('seg2', help='Second segment')
    parser.add_argument('side',
                        help='Context to check. Options are \'right\', \'left\' and \'both\'. You can enter just the first letter.')
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help='The attribute of Words to calculate KL over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.')
    parser.add_argument('-t', '--type_or_token', default='token',
                        help='Specifies whether entropy is based on type or token frequency.')
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-o', '--outfile', help='Name of output file (optional)')

    args = parser.parse_args()
    ####

    # Look for the corpus in the default PCT directory first, then fall back to
    # the path as given (or relative to the current working directory).
    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools',
                                          'CORPUS', args.corpus_file_name))
    except FileNotFoundError:
        corpus_path = args.corpus_file_name
        if not os.path.isfile(corpus_path):
            corpus_path = os.path.join(os.getcwd(), corpus_path)
        corpus = load_binary(corpus_path)

    # Wrap the corpus in the requested variant context.
    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, args.type_or_token)

    results = KullbackLeibler(corpus, args.seg1, args.seg2, args.side, outfile=None)

    outfile = args.outfile
    if outfile is not None:
        if not os.path.isfile(outfile):
            outfile = os.path.join(os.getcwd(), outfile)
        if not outfile.endswith('.txt'):
            outfile += '.txt'
        with open(outfile, mode='w', encoding='utf-8-sig') as f:
            print('Seg1,Seg2,Seg1 entropy,Seg2 entropy,Possible UR,Spurious UR', file=f)
            print(','.join([str(r) for r in results]), file=f)
            # The per-context frequency table is computed inside KullbackLeibler and is
            # not part of the results returned here, so only the summary row is written.
        print('Done!')
    else:
        print(results)
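
# A minimal sketch of how this command-line interface might be invoked, assuming the
# module is saved as a script (the file name 'pct_kl.py' and the corpus name
# 'example.corpus' below are hypothetical, not taken from the source):
#
#     python pct_kl.py example.corpus s z both -s transcription -t token -o kl_results.txt
#
# Standard entry-point guard so the script runs main() when executed directly.
if __name__ == '__main__':
    main()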