def run_manifold_module(self, verbose=False):
    """
    Run the manifold module.

    Computes syntactic word neighbors. Only runs when a corpus file
    object is available (``self.corpus_file_object`` is truthy).

    :param verbose: if True, print a progress message.
    """
    vprint(verbose, 'Syntactic word neighbors...')
    if self.corpus_file_object:
        self._make_all_manifold_objects()
def run_ngram_module(self, verbose=False):
    """
    Run the ngram module.

    Ensures the wordlist exists before word ngrams are extracted.

    :param verbose: if True, print a progress message.
    """
    vprint('Extracting word ngrams...', verbose=verbose)
    wordlist_missing = self._wordlist is None
    if wordlist_missing:
        self._make_wordlist()
def run_ngram_module(self, verbose=False):
    """
    Run the ngram module.

    Ensures the wordlist exists before word ngrams are extracted.

    :param verbose: if True, print a progress message.
    """
    vprint(verbose, 'Extracting word ngrams...')
    wordlist_missing = self._wordlist is None
    if wordlist_missing:
        self._make_wordlist()
def run_manifold_module(self, verbose=False):
    """
    Run the manifold module.

    Computes syntactic word neighbors. Only runs when a corpus file
    object is available (``self.corpus_file_object`` is truthy).

    :param verbose: if True, print a progress message.
    """
    vprint('Syntactic word neighbors...', verbose=verbose)
    if self.corpus_file_object:
        self._make_all_manifold_objects()
def test_vprint():
    """vprint returns None regardless of the verbose flag."""
    for flag in (False, True):
        assert vprint('x', verbose=flag) is None
def output_all_results(self, directory=None, verbose=False, test=False):
    """
    Output all Linguistica results to *directory*.

    :param directory: output directory. If not specified, it defaults
        to the current directory given by ``os.getcwd()``.
    :param verbose: if True, print each output filename as it is written.
    :param test: passed through to ``output_latex`` (test mode).
    """
    if not directory:
        output_dir = os.getcwd()
    else:
        output_dir = os.path.abspath(directory)

    def write_output(obj, fname, **kwargs):
        # Write one result file via output_latex, supplying the keyword
        # arguments common to every output, then report the filename.
        output_latex(obj, os.path.join(output_dir, fname),
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath,
                     **kwargs)
        vprint(verbose, '\t' + fname)

    # ----------------------------------------------------------------------
    # ngram objects (only available when a corpus file was supplied)
    if self.corpus_file_object:
        vprint(verbose, 'ngram objects')

        write_output(
            double_sorted(self.word_bigram_counter().items(),
                          key=lambda x: x[1], reverse=True),
            'word_bigrams.txt',
            title='Word bigrams',
            headers=['Word bigram', 'Count'],
            row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
            column_widths=[50, 10])

        write_output(
            double_sorted(self.word_trigram_counter().items(),
                          key=lambda x: x[1], reverse=True),
            'word_trigrams.txt',
            title='Word trigrams',
            headers=['Word trigram', 'Count'],
            row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
            column_widths=[75, 10])

    # ----------------------------------------------------------------------
    vprint(verbose, 'morphological signature objects')

    write_output(
        double_sorted(self.stems_to_words().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'stems_to_words.txt',
        title='Stems to words (descending order of word count)',
        headers=['Stem', 'Word count', 'Words'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1]))],
        column_widths=[15, 15, 0])

    # BUG FIX: this table previously reused the filename
    # 'stems_to_words.txt' and silently overwrote the table written just
    # above; it now gets its own file. The words column is also truncated
    # to 10 items to match its '1st 10 words' header (it previously
    # printed every word).
    write_output(
        double_sorted(self.stems_to_words().items(),
                      key=lambda x: x[0], reverse=False),
        'stems_to_words_alphabetical.txt',
        title='Stems to words (alphabetical order of stems)',
        headers=['Stem', 'Word count', '1st 10 words'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1])[:10])],
        column_widths=[15, 15, 0])

    write_output(
        double_sorted(self.signatures_to_stems().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_stems.txt',
        title='Signatures to stems',
        headers=['Signature', 'Stem count', 'Stems'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1]))],
        column_widths=[30, 15, 0])

    write_output(
        double_sorted(self.signatures_to_stems().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_stems_truncated.txt',
        title='Signatures to stems (first 10 stems for each sig)',
        headers=['Signature', 'Stem count', '1st 10 stems'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ' '.join(sorted(x[1])[:10])],
        column_widths=[30, 15, 0])

    write_output(
        double_sorted(self.stems_to_signatures().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'stems_to_signatures.txt',
        title='Stems to signatures',
        headers=['Stems', 'Signatures'],
        row_functions=[lambda x: x[0],
                       lambda x: ', '.join(SEP_SIG.join(sig)
                                           for sig in sorted(x[1]))],
        column_widths=[15, 0])

    write_output(
        double_sorted(self.words_to_signatures().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'words_to_signatures.txt',
        title='Words to signatures',
        headers=['Word', 'Sig count', 'Signatures'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(SEP_SIG.join(sig)
                                           for sig in sorted(x[1]))],
        column_widths=[25, 15, 0])

    write_output(
        double_sorted(self.signatures_to_words().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_words.txt',
        title='Signatures to words',
        headers=['Signature', 'Word count', 'Words'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1]))],
        column_widths=[20, 15, 0])

    write_output(
        double_sorted(self.signatures_to_words().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_words_truncated.txt',
        title='Signatures to words (first 10 words for each sig)',
        headers=['Signature', 'Word count', '1st 10 words'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1])[:10])],
        column_widths=[20, 15, 0])

    write_output(
        double_sorted(self.words_to_sigtransforms().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'words_to_sigtransforms.txt',
        title='Words to sigtransforms',
        headers=['Word', 'Signature transforms'],
        row_functions=[lambda x: x[0],
                       lambda x: ', '.join(
                           SEP_SIG.join(sig) + SEP_SIGTRANSFORM + affix
                           for sig, affix in sorted(x[1]))],
        column_widths=[20, 0])

    write_output(
        double_sorted(self.affixes_to_signatures().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'affixes_to_signatures.txt',
        title='Affixes to signatures',
        headers=['Affix', 'Sig count', 'Signatures'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(SEP_SIG.join(sig)
                                           for sig in sorted(x[1]))],
        column_widths=[15, 15, 0])

    # ----------------------------------------------------------------------
    # manifold objects (only available when a corpus file was supplied)
    if self.corpus_file_object:
        vprint(verbose, 'manifold objects')

        # list of (word, list of neighbor words), limited to the
        # max_word_types most frequent words
        neighbors = [
            (word, self.words_to_neighbors()[word])
            for word in
            self.wordlist()[:self.parameters()['max_word_types']]]

        write_output(
            neighbors,
            'words_to_neighbors.txt',
            title='Words to neighbors',
            headers=['Word', 'Neighbors'],
            row_functions=[lambda x: x[0], lambda x: ' '.join(x[1])],
            column_widths=[25, 0])

    # ----------------------------------------------------------------------
    vprint(verbose, 'phon objects')

    # The three wordlist outputs share headers, row functions, and widths.
    phon_headers = ['Word', 'Count', 'Frequency', 'Phones',
                    'Unigram plog', 'Avg unigram plog',
                    'Bigram plog', 'Avg bigram plog']
    phon_row_functions = [
        lambda x: x[0],
        lambda x: x[1].count,
        lambda x: '%.6f' % x[1].frequency,
        lambda x: ' '.join(x[1].phones),
        lambda x: '%8.3f' % x[1].unigram_plog,
        lambda x: '%8.3f' % x[1].avg_unigram_plog,
        lambda x: '%8.3f' % x[1].bigram_plog,
        lambda x: '%8.3f' % x[1].avg_bigram_plog,
    ]

    def write_phon_words(obj_, fname_, title_):
        # Write one of the three per-word phonology tables.
        write_output(obj_, fname_, title=title_,
                     headers=phon_headers,
                     row_functions=phon_row_functions,
                     column_widths=[35, 10, 15, 60, 15, 15, 15, 15])

    # list of (word, phonology object for that word)
    obj_word_phon = [(word, self.word_phonology_dict()[word])
                     for word in self.wordlist()]

    write_phon_words(obj_word_phon, 'wordlist.txt',
                     'Wordlist sorted by word count')

    write_phon_words(
        double_sorted(obj_word_phon,
                      key=lambda x: x[1].avg_unigram_plog, reverse=False),
        'wordlist_by_avg_unigram_plog.txt',
        'Wordlist sorted by avg unigram plog')

    write_phon_words(
        double_sorted(obj_word_phon,
                      key=lambda x: x[1].avg_bigram_plog, reverse=False),
        'wordlist_by_avg_bigram_plog.txt',
        'Wordlist sorted by avg bigram plog')

    write_output(
        double_sorted(self.phone_dict().items(),
                      key=lambda x: x[1].count, reverse=True),
        'phones.txt',
        title='Phones',
        headers=['Phone', 'Count', 'Frequency', 'Plog'],
        row_functions=[lambda x: x[0],
                       lambda x: x[1].count,
                       lambda x: '%.6f' % x[1].frequency,
                       lambda x: '%8.3f' % x[1].plog],
        column_widths=[10, 10, 15, 15])

    write_output(
        double_sorted(self.biphone_dict().items(),
                      key=lambda x: x[1].count, reverse=True),
        'biphones.txt',
        title='Biphones',
        headers=['Biphone', 'Count', 'Frequency', 'MI', 'Weighted MI'],
        row_functions=[lambda x: ' '.join(x[0]),
                       lambda x: x[1].count,
                       lambda x: '%.6f' % x[1].frequency,
                       lambda x: '%8.3f' % x[1].MI,
                       lambda x: '%8.3f' % x[1].weighted_MI],
        column_widths=[10, 10, 15, 15, 15])

    write_output(
        double_sorted(self.phone_trigram_counter().items(),
                      key=lambda x: x[1], reverse=True),
        'triphones.txt',
        title='Triphones',
        headers=['Triphone', 'Count'],
        row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
        column_widths=[15, 10])

    # ----------------------------------------------------------------------
    vprint(verbose, 'trie objects')

    # list of (word, left-to-right trie pieces, right-to-left trie pieces)
    tries = [(word,
              self.broken_words_left_to_right()[word],
              self.broken_words_right_to_left()[word])
             for word in self.wordlist()]

    write_output(
        tries,
        'words_as_tries.txt',
        title='Words as tries',
        headers=['Word', 'Left-to-right trie', 'Right-to-left trie'],
        row_functions=[lambda x: x[0],
                       lambda x: ' '.join(x[1]),
                       lambda x: ' '.join(x[2])],
        column_widths=[35, 50, 50])

    write_output(
        double_sorted(self.successors().items(),
                      key=lambda x: len(x[1]), reverse=False),
        'successors.txt',
        title='Successors',
        headers=['String', 'Successors'],
        row_functions=[lambda x: x[0],
                       lambda x: ' '.join(sorted(x[1]))],
        column_widths=[35, 0])

    write_output(
        double_sorted(self.predecessors().items(),
                      key=lambda x: len(x[1]), reverse=False),
        'predecessors.txt',
        title='Predecessors',
        headers=['String', 'Predecessors'],
        row_functions=[lambda x: x[0],
                       lambda x: ' '.join(sorted(x[1]))],
        column_widths=[35, 0])
def run_trie_module(self, verbose=False):
    """
    Run the trie module.

    Constructs all trie-related objects.

    :param verbose: if True, print a progress message.
    """
    vprint(verbose, 'Tries...')
    self._make_all_trie_objects()
def run_phon_module(self, verbose=False):
    """
    Run the phon module.

    Constructs all phonology objects.

    :param verbose: if True, print a progress message.
    """
    vprint(verbose, 'Phonology...')
    self._make_all_phon_objects()
def run_signature_module(self, verbose=False):
    """
    Run the signature module.

    Constructs all morphological signature objects.

    :param verbose: if True, print a progress message.
    """
    vprint(verbose, 'Morphological signatures...')
    self._make_all_signature_objects()
def output_all_results(self, directory=None, verbose=False, test=False):
    """
    Output all Linguistica results to *directory*.

    :param directory: output directory. If not specified, it defaults
        to the current directory given by ``os.getcwd()``.
    :param verbose: if True, print each output filename as it is written.
    :param test: passed through to ``output_latex`` (test mode).
    """
    if not directory:
        output_dir = os.getcwd()
    else:
        output_dir = os.path.abspath(directory)

    def write_output(obj, fname, **kwargs):
        # Write one result file via output_latex, supplying the keyword
        # arguments common to every output, then report the filename.
        output_latex(obj, os.path.join(output_dir, fname),
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath,
                     **kwargs)
        vprint('\t' + fname, verbose=verbose)

    # ----------------------------------------------------------------------
    # ngram objects (only available when a corpus file was supplied)
    if self.corpus_file_object:
        vprint('ngram objects', verbose=verbose)

        write_output(
            double_sorted(self.word_bigram_counter().items(),
                          key=lambda x: x[1], reverse=True),
            'word_bigrams.txt',
            title='Word bigrams',
            headers=['Word bigram', 'Count'],
            row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
            column_widths=[50, 10])

        write_output(
            double_sorted(self.word_trigram_counter().items(),
                          key=lambda x: x[1], reverse=True),
            'word_trigrams.txt',
            title='Word trigrams',
            headers=['Word trigram', 'Count'],
            row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
            column_widths=[75, 10])

    # ----------------------------------------------------------------------
    vprint('morphological signature objects', verbose=verbose)

    write_output(
        double_sorted(self.stems_to_words().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'stems_to_words.txt',
        title='Stems to words (descending order of word count)',
        headers=['Stem', 'Word count', 'Words'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1]))],
        column_widths=[15, 15, 0])

    # BUG FIX: this table previously reused the filename
    # 'stems_to_words.txt' and silently overwrote the table written just
    # above; it now gets its own file. The words column is also truncated
    # to 10 items to match its '1st 10 words' header (it previously
    # printed every word).
    write_output(
        double_sorted(self.stems_to_words().items(),
                      key=lambda x: x[0], reverse=False),
        'stems_to_words_alphabetical.txt',
        title='Stems to words (alphabetical order of stems)',
        headers=['Stem', 'Word count', '1st 10 words'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1])[:10])],
        column_widths=[15, 15, 0])

    write_output(
        double_sorted(self.signatures_to_stems().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_stems.txt',
        title='Signatures to stems',
        headers=['Signature', 'Stem count', 'Stems'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1]))],
        column_widths=[30, 15, 0])

    write_output(
        double_sorted(self.signatures_to_stems().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_stems_truncated.txt',
        title='Signatures to stems (first 10 stems for each sig)',
        headers=['Signature', 'Stem count', '1st 10 stems'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ' '.join(sorted(x[1])[:10])],
        column_widths=[30, 15, 0])

    write_output(
        double_sorted(self.stems_to_signatures().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'stems_to_signatures.txt',
        title='Stems to signatures',
        headers=['Stems', 'Signatures'],
        row_functions=[lambda x: x[0],
                       lambda x: ', '.join(SEP_SIG.join(sig)
                                           for sig in sorted(x[1]))],
        column_widths=[15, 0])

    write_output(
        double_sorted(self.words_to_signatures().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'words_to_signatures.txt',
        title='Words to signatures',
        headers=['Word', 'Sig count', 'Signatures'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(SEP_SIG.join(sig)
                                           for sig in sorted(x[1]))],
        column_widths=[25, 15, 0])

    write_output(
        double_sorted(self.signatures_to_words().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_words.txt',
        title='Signatures to words',
        headers=['Signature', 'Word count', 'Words'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1]))],
        column_widths=[20, 15, 0])

    write_output(
        double_sorted(self.signatures_to_words().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'signatures_to_words_truncated.txt',
        title='Signatures to words (first 10 words for each sig)',
        headers=['Signature', 'Word count', '1st 10 words'],
        row_functions=[lambda x: SEP_SIG.join(x[0]),
                       lambda x: len(x[1]),
                       lambda x: ', '.join(sorted(x[1])[:10])],
        column_widths=[20, 15, 0])

    write_output(
        double_sorted(self.words_to_sigtransforms().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'words_to_sigtransforms.txt',
        title='Words to sigtransforms',
        headers=['Word', 'Signature transforms'],
        row_functions=[lambda x: x[0],
                       lambda x: ', '.join(
                           SEP_SIG.join(sig) + SEP_SIGTRANSFORM + affix
                           for sig, affix in sorted(x[1]))],
        column_widths=[20, 0])

    write_output(
        double_sorted(self.affixes_to_signatures().items(),
                      key=lambda x: len(x[1]), reverse=True),
        'affixes_to_signatures.txt',
        title='Affixes to signatures',
        headers=['Affix', 'Sig count', 'Signatures'],
        row_functions=[lambda x: x[0],
                       lambda x: len(x[1]),
                       lambda x: ', '.join(SEP_SIG.join(sig)
                                           for sig in sorted(x[1]))],
        column_widths=[15, 15, 0])

    # ----------------------------------------------------------------------
    # manifold objects (only available when a corpus file was supplied)
    if self.corpus_file_object:
        vprint('manifold objects', verbose=verbose)

        # list of (word, list of neighbor words), limited to the
        # max_word_types most frequent words
        neighbors = [
            (word, self.words_to_neighbors()[word])
            for word in
            self.wordlist()[:self.parameters()['max_word_types']]]

        write_output(
            neighbors,
            'words_to_neighbors.txt',
            title='Words to neighbors',
            headers=['Word', 'Neighbors'],
            row_functions=[lambda x: x[0], lambda x: ' '.join(x[1])],
            column_widths=[25, 0])

    # ----------------------------------------------------------------------
    vprint('phon objects', verbose=verbose)

    # The three wordlist outputs share headers, row functions, and widths.
    phon_headers = ['Word', 'Count', 'Frequency', 'Phones',
                    'Unigram plog', 'Avg unigram plog',
                    'Bigram plog', 'Avg bigram plog']
    phon_row_functions = [
        lambda x: x[0],
        lambda x: x[1].count,
        lambda x: '%.6f' % x[1].frequency,
        lambda x: ' '.join(x[1].phones),
        lambda x: '%8.3f' % x[1].unigram_plog,
        lambda x: '%8.3f' % x[1].avg_unigram_plog,
        lambda x: '%8.3f' % x[1].bigram_plog,
        lambda x: '%8.3f' % x[1].avg_bigram_plog,
    ]

    def write_phon_words(obj_, fname_, title_):
        # Write one of the three per-word phonology tables.
        write_output(obj_, fname_, title=title_,
                     headers=phon_headers,
                     row_functions=phon_row_functions,
                     column_widths=[35, 10, 15, 60, 15, 15, 15, 15])

    # list of (word, phonology object for that word)
    obj_word_phon = [(word, self.word_phonology_dict()[word])
                     for word in self.wordlist()]

    write_phon_words(obj_word_phon, 'wordlist.txt',
                     'Wordlist sorted by word count')

    write_phon_words(
        double_sorted(obj_word_phon,
                      key=lambda x: x[1].avg_unigram_plog, reverse=False),
        'wordlist_by_avg_unigram_plog.txt',
        'Wordlist sorted by avg unigram plog')

    write_phon_words(
        double_sorted(obj_word_phon,
                      key=lambda x: x[1].avg_bigram_plog, reverse=False),
        'wordlist_by_avg_bigram_plog.txt',
        'Wordlist sorted by avg bigram plog')

    write_output(
        double_sorted(self.phone_dict().items(),
                      key=lambda x: x[1].count, reverse=True),
        'phones.txt',
        title='Phones',
        headers=['Phone', 'Count', 'Frequency', 'Plog'],
        row_functions=[lambda x: x[0],
                       lambda x: x[1].count,
                       lambda x: '%.6f' % x[1].frequency,
                       lambda x: '%8.3f' % x[1].plog],
        column_widths=[10, 10, 15, 15])

    write_output(
        double_sorted(self.biphone_dict().items(),
                      key=lambda x: x[1].count, reverse=True),
        'biphones.txt',
        title='Biphones',
        headers=['Biphone', 'Count', 'Frequency', 'MI', 'Weighted MI'],
        row_functions=[lambda x: ' '.join(x[0]),
                       lambda x: x[1].count,
                       lambda x: '%.6f' % x[1].frequency,
                       lambda x: '%8.3f' % x[1].MI,
                       lambda x: '%8.3f' % x[1].weighted_MI],
        column_widths=[10, 10, 15, 15, 15])

    write_output(
        double_sorted(self.phone_trigram_counter().items(),
                      key=lambda x: x[1], reverse=True),
        'triphones.txt',
        title='Triphones',
        headers=['Triphone', 'Count'],
        row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
        column_widths=[15, 10])

    # ----------------------------------------------------------------------
    vprint('trie objects', verbose=verbose)

    # list of (word, left-to-right trie pieces, right-to-left trie pieces)
    tries = [(word,
              self.broken_words_left_to_right()[word],
              self.broken_words_right_to_left()[word])
             for word in self.wordlist()]

    write_output(
        tries,
        'words_as_tries.txt',
        title='Words as tries',
        headers=['Word', 'Left-to-right trie', 'Right-to-left trie'],
        row_functions=[lambda x: x[0],
                       lambda x: ' '.join(x[1]),
                       lambda x: ' '.join(x[2])],
        column_widths=[35, 50, 50])

    write_output(
        double_sorted(self.successors().items(),
                      key=lambda x: len(x[1]), reverse=False),
        'successors.txt',
        title='Successors',
        headers=['String', 'Successors'],
        row_functions=[lambda x: x[0],
                       lambda x: ' '.join(sorted(x[1]))],
        column_widths=[35, 0])

    write_output(
        double_sorted(self.predecessors().items(),
                      key=lambda x: len(x[1]), reverse=False),
        'predecessors.txt',
        title='Predecessors',
        headers=['String', 'Predecessors'],
        row_functions=[lambda x: x[0],
                       lambda x: ' '.join(sorted(x[1]))],
        column_widths=[35, 0])
def run_trie_module(self, verbose=False):
    """
    Run the trie module.

    Constructs all trie-related objects.

    :param verbose: if True, print a progress message.
    """
    vprint('Tries...', verbose=verbose)
    self._make_all_trie_objects()
def run_phon_module(self, verbose=False):
    """
    Run the phon module.

    Constructs all phonology objects.

    :param verbose: if True, print a progress message.
    """
    vprint('Phonology...', verbose=verbose)
    self._make_all_phon_objects()
def run_signature_module(self, verbose=False):
    """
    Run the signature module.

    Constructs all morphological signature objects.

    :param verbose: if True, print a progress message.
    """
    vprint('Morphological signatures...', verbose=verbose)
    self._make_all_signature_objects()
def test_vprint():
    """vprint returns None regardless of the verbose flag."""
    for flag in (False, True):
        assert vprint(flag, 'x') is None