Ejemplo n.º 1
0
 def run_manifold_module(self, verbose=False):
     """
     Run the manifold module.

     :param verbose: if True, print a progress message via ``vprint``.
     """
     vprint(verbose, 'Syntactic word neighbors...')
     # Manifold (syntactic word-neighbor) objects apparently require corpus
     # data; when there is no corpus_file_object (e.g. wordlist-only input)
     # the step is skipped silently -- TODO confirm against callers.
     if self.corpus_file_object:
         self._make_all_manifold_objects()
Ejemplo n.º 2
0
 def run_ngram_module(self, verbose=False):
     """Run the ngram module, creating the wordlist when needed."""
     vprint('Extracting word ngrams...', verbose=verbose)
     # The wordlist is built lazily; skip the work if it already exists.
     wordlist_missing = self._wordlist is None
     if wordlist_missing:
         self._make_wordlist()
Ejemplo n.º 3
0
 def run_ngram_module(self, verbose=False):
     """Run the ngram module, creating the wordlist when needed."""
     vprint(verbose, 'Extracting word ngrams...')
     # Guard clause: the wordlist is cached on the instance, so there is
     # nothing to do when it has already been constructed.
     if self._wordlist is not None:
         return
     self._make_wordlist()
Ejemplo n.º 4
0
 def run_manifold_module(self, verbose=False):
     """
     Run the manifold module.

     :param verbose: if True, print a progress message via ``vprint``.
     """
     vprint('Syntactic word neighbors...', verbose=verbose)
     # Manifold objects are only built when corpus data is available;
     # with no corpus_file_object the step is a silent no-op -- TODO confirm.
     if self.corpus_file_object:
         self._make_all_manifold_objects()
Ejemplo n.º 5
0
def test_vprint():
    """vprint is a side-effect-only helper: it must return None either way."""
    for flag in (False, True):
        assert vprint('x', verbose=flag) is None
Ejemplo n.º 6
0
    def output_all_results(self, directory=None, verbose=False, test=False):
        """
        Output all Linguistica results to *directory*.

        :param directory: output directory. If not specified, it defaults to
            the current directory given by ``os.getcwd()``.
        :param verbose: if True, print the name of each file as it is written.
        :param test: forwarded unchanged to every ``output_latex`` call --
            presumably switches on deterministic/test-friendly output;
            confirm against ``output_latex``.
        """
        # Resolve the output directory, defaulting to the current directory.
        if not directory:
            output_dir = os.getcwd()
        else:
            output_dir = os.path.abspath(directory)

        # ----------------------------------------------------------------------
        # Word-ngram outputs: only produced when a corpus file was supplied.
        if self.corpus_file_object:
            vprint(verbose, 'ngram objects')

            fname = 'word_bigrams.txt'
            obj = double_sorted(self.word_bigram_counter().items(),
                                key=lambda x: x[1],
                                reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(
                obj,
                f_path,
                title='Word bigrams',
                headers=['Word bigram', 'Count'],
                row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
                column_widths=[50, 10],
                lxa_parameters=self.parameters(),
                test=test,
                encoding=self.encoding,
                number_of_word_types=self.number_of_word_types(),
                number_of_word_tokens=self.number_of_word_tokens(),
                input_file_path=self.file_abspath)
            vprint(verbose, '\t' + fname)

            fname = 'word_trigrams.txt'
            obj = double_sorted(self.word_trigram_counter().items(),
                                key=lambda x: x[1],
                                reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(
                obj,
                f_path,
                title='Word trigrams',
                headers=['Word trigram', 'Count'],
                row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
                column_widths=[75, 10],
                lxa_parameters=self.parameters(),
                test=test,
                encoding=self.encoding,
                number_of_word_types=self.number_of_word_types(),
                number_of_word_tokens=self.number_of_word_tokens(),
                input_file_path=self.file_abspath)
            vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        # Morphological-signature outputs.
        vprint(verbose, 'morphological signature objects')

        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Stems to words '
                     '(descending order of word count)',
                     headers=['Stem', 'Word count', 'Words'],
                     row_functions=[
                         lambda x: x[0], lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        # NOTE(review): same filename as the block above -- this alphabetical
        # version overwrites the count-sorted 'stems_to_words.txt' just
        # written. Confirm whether a distinct filename was intended.
        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: x[0],
                            reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Stems to words '
                     '(alphabetical order of stems)',
                     headers=['Stem', 'Word count', '1st 10 words'],
                     # NOTE(review): header says '1st 10 words' but the row
                     # function joins *all* words -- confirm intended.
                     row_functions=[
                         lambda x: x[0], lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_stems.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to stems',
                     headers=['Signature', 'Stem count', 'Stems'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_stems_truncated.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to stems '
                     '(first 10 stems for each sig)',
                     headers=['Signature', 'Stem count', '1st 10 stems'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ' '.join(sorted(x[1])[:10])
                     ],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'stems_to_signatures.txt'
        obj = double_sorted(self.stems_to_signatures().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Stems to signatures',
            headers=['Stems', 'Signatures'],
            row_functions=[
                lambda x: x[0],
                lambda x: ', '.join(SEP_SIG.join(sig) for sig in sorted(x[1]))
            ],
            column_widths=[15, 0],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'words_to_signatures.txt'
        obj = double_sorted(self.words_to_signatures().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Words to signatures',
            headers=['Word', 'Sig count', 'Signatures'],
            row_functions=[
                lambda x: x[0], lambda x: len(x[1]),
                lambda x: ', '.join(SEP_SIG.join(sig) for sig in sorted(x[1]))
            ],
            column_widths=[25, 15, 0],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_words.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to words',
                     headers=['Signature', 'Word count', 'Words'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_words_truncated.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to words '
                     '(first 10 words for each sig)',
                     headers=['Signature', 'Word count', '1st 10 words'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1])[:10])
                     ],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'words_to_sigtransforms.txt'
        obj = double_sorted(self.words_to_sigtransforms().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Words to sigtransforms',
                     headers=['Word', 'Signature transforms'],
                     row_functions=[
                         lambda x: x[0], lambda x: ', '.join(
                             SEP_SIG.join(sig) + SEP_SIGTRANSFORM + affix
                             for sig, affix in sorted(x[1]))
                     ],
                     column_widths=[20, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'affixes_to_signatures.txt'
        obj = double_sorted(self.affixes_to_signatures().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Affixes to signatures',
            headers=['Affix', 'Sig count', 'Signatures'],
            row_functions=[
                lambda x: x[0], lambda x: len(x[1]),
                lambda x: ', '.join(SEP_SIG.join(sig) for sig in sorted(x[1]))
            ],
            column_widths=[15, 15, 0],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        # Manifold (word-neighbor) outputs: corpus data required, as above.
        if self.corpus_file_object:
            vprint(verbose, 'manifold objects')

            fname = 'words_to_neighbors.txt'
            obj = list()  # list of tuple(word, list of neighbor words)
            # Only the max_word_types most frequent words have neighbors.
            for word in self.wordlist()[:self.parameters()['max_word_types']]:
                obj.append((word, self.words_to_neighbors()[word]))
            f_path = os.path.join(output_dir, fname)
            output_latex(
                obj,
                f_path,
                title='Words to neighbors',
                headers=['Word', 'Neighbors'],
                row_functions=[lambda x: x[0], lambda x: ' '.join(x[1])],
                column_widths=[25, 0],
                lxa_parameters=self.parameters(),
                test=test,
                encoding=self.encoding,
                number_of_word_types=self.number_of_word_types(),
                number_of_word_tokens=self.number_of_word_tokens(),
                input_file_path=self.file_abspath)
            vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        # Phonology outputs.
        vprint(verbose, 'phon objects')

        # Local helper: the three wordlist files below share the same column
        # layout, differing only in data ordering and title.
        def output_latex_for_phon_words(obj_, f_path_, title_, lxa_parameters_,
                                        test_, encoding_,
                                        number_of_word_types_,
                                        number_of_word_tokens_,
                                        input_file_path_):
            output_latex(obj_,
                         f_path_,
                         title=title_,
                         headers=[
                             'Word', 'Count', 'Frequency', 'Phones',
                             'Unigram plog', 'Avg unigram plog', 'Bigram plog',
                             'Avg bigram plog'
                         ],
                         row_functions=[
                             lambda x: x[0],
                             lambda x: x[1].count,
                             lambda x: '%.6f' % x[1].frequency,
                             lambda x: ' '.join(x[1].phones),
                             lambda x: '%8.3f' % x[1].unigram_plog,
                             lambda x: '%8.3f' % x[1].avg_unigram_plog,
                             lambda x: '%8.3f' % x[1].bigram_plog,
                             lambda x: '%8.3f' % x[1].avg_bigram_plog,
                         ],
                         column_widths=[35, 10, 15, 60, 15, 15, 15, 15],
                         lxa_parameters=lxa_parameters_,
                         test=test_,
                         encoding=encoding_,
                         number_of_word_types=number_of_word_types_,
                         number_of_word_tokens=number_of_word_tokens_,
                         input_file_path=input_file_path_)

        fname = 'wordlist.txt'
        obj_word_phon = list()  # list of tuple(word, list of neighbor words)
        for word in self.wordlist():
            obj_word_phon.append((word, self.word_phonology_dict()[word]))
        f_path = os.path.join(output_dir, 'wordlist.txt')
        output_latex_for_phon_words(obj_word_phon, f_path,
                                    'Wordlist sorted by word count',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'wordlist_by_avg_unigram_plog.txt'
        obj_unigram_plog = double_sorted(obj_word_phon,
                                         key=lambda x: x[1].avg_unigram_plog,
                                         reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_unigram_plog, f_path,
                                    'Wordlist sorted by avg unigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'wordlist_by_avg_bigram_plog.txt'
        obj_bigram_plog = double_sorted(obj_word_phon,
                                        key=lambda x: x[1].avg_bigram_plog,
                                        reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_bigram_plog, f_path,
                                    'Wordlist sorted by avg bigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'phones.txt'
        obj = double_sorted(self.phone_dict().items(),
                            key=lambda x: x[1].count,
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Phones',
                     headers=['Phone', 'Count', 'Frequency', 'Plog'],
                     row_functions=[
                         lambda x: x[0],
                         lambda x: x[1].count,
                         lambda x: '%.6f' % x[1].frequency,
                         lambda x: '%8.3f' % x[1].plog,
                     ],
                     column_widths=[10, 10, 15, 15],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'biphones.txt'
        obj = double_sorted(self.biphone_dict().items(),
                            key=lambda x: x[1].count,
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Biphones',
            headers=['Biphone', 'Count', 'Frequency', 'MI', 'Weighted MI'],
            row_functions=[
                lambda x: ' '.join(x[0]),
                lambda x: x[1].count,
                lambda x: '%.6f' % x[1].frequency,
                lambda x: '%8.3f' % x[1].MI,
                lambda x: '%8.3f' % x[1].weighted_MI,
            ],
            column_widths=[10, 10, 15, 15, 15],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'triphones.txt'
        obj = double_sorted(self.phone_trigram_counter().items(),
                            key=lambda x: x[1],
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Triphones',
                     headers=['Triphone', 'Count'],
                     row_functions=[
                         lambda x: ' '.join(x[0]),
                         lambda x: x[1],
                     ],
                     column_widths=[15, 10],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        # Trie outputs.
        vprint(verbose, 'trie objects')

        fname = 'words_as_tries.txt'
        obj = list()
        for word in self.wordlist():
            obj.append((word, self.broken_words_left_to_right()[word],
                        self.broken_words_right_to_left()[word]))
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Words as tries',
            headers=['Word', 'Left-to-right trie', 'Right-to-left trie'],
            row_functions=[
                lambda x: x[0],
                lambda x: ' '.join(x[1]),
                lambda x: ' '.join(x[2]),
            ],
            column_widths=[35, 50, 50],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'successors.txt'
        obj = double_sorted(self.successors().items(),
                            key=lambda x: len(x[1]),
                            reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Successors',
                     headers=['String', 'Successors'],
                     row_functions=[
                         lambda x: x[0],
                         lambda x: ' '.join(sorted(x[1])),
                     ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'predecessors.txt'
        obj = double_sorted(self.predecessors().items(),
                            key=lambda x: len(x[1]),
                            reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Predecessors',
                     headers=['String', 'Predecessors'],
                     row_functions=[
                         lambda x: x[0],
                         lambda x: ' '.join(sorted(x[1])),
                     ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)
0
 def run_trie_module(self, verbose=False):
     """Run the trie module."""
     # Announce progress (shown only when verbose), then build every
     # trie-related object in one pass.
     vprint(verbose, 'Tries...')
     self._make_all_trie_objects()
Ejemplo n.º 8
0
 def run_phon_module(self, verbose=False):
     """Run the phon module."""
     # Report progress, then construct all phonology objects.
     vprint(verbose, 'Phonology...')
     self._make_all_phon_objects()
Ejemplo n.º 9
0
 def run_signature_module(self, verbose=False):
     """Run the signature module."""
     # Report progress, then build every morphological-signature object.
     vprint(verbose, 'Morphological signatures...')
     self._make_all_signature_objects()
Ejemplo n.º 10
0
    def output_all_results(self, directory=None, verbose=False, test=False):
        """
        Output all Linguistica results to *directory*.

        :param directory: output directory. If not specified, it defaults to
            the current directory given by ``os.getcwd()``.
        """
        if not directory:
            output_dir = os.getcwd()
        else:
            output_dir = os.path.abspath(directory)

        # ----------------------------------------------------------------------
        if self.corpus_file_object:
            vprint('ngram objects', verbose=verbose)

            fname = 'word_bigrams.txt'
            obj = double_sorted(self.word_bigram_counter().items(),
                                key=lambda x: x[1], reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(obj, f_path,
                         title='Word bigrams',
                         headers=['Word bigram', 'Count'],
                         row_functions=[lambda x: ' '.join(x[0]),
                                        lambda x: x[1]],
                         column_widths=[50, 10],
                         lxa_parameters=self.parameters(),
                         test=test, encoding=self.encoding,
                         number_of_word_types=self.number_of_word_types(),
                         number_of_word_tokens=self.number_of_word_tokens(),
                         input_file_path=self.file_abspath)
            vprint('\t' + fname, verbose=verbose)

            fname = 'word_trigrams.txt'
            obj = double_sorted(self.word_trigram_counter().items(),
                                key=lambda x: x[1], reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(obj, f_path,
                         title='Word trigrams',
                         headers=['Word trigram', 'Count'],
                         row_functions=[lambda x: ' '.join(x[0]),
                                        lambda x: x[1]],
                         column_widths=[75, 10],
                         lxa_parameters=self.parameters(),
                         test=test, encoding=self.encoding,
                         number_of_word_types=self.number_of_word_types(),
                         number_of_word_tokens=self.number_of_word_tokens(),
                         input_file_path=self.file_abspath)
            vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        vprint('morphological signature objects', verbose=verbose)

        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Stems to words '
                           '(descending order of word count)',
                     headers=['Stem', 'Word count', 'Words'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: x[0], reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Stems to words '
                           '(alphabetical order of stems)',
                     headers=['Stem', 'Word count', '1st 10 words'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_stems.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to stems',
                     headers=['Signature', 'Stem count', 'Stems'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_stems_truncated.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to stems '
                           '(first 10 stems for each sig)',
                     headers=['Signature', 'Stem count', '1st 10 stems'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ' '.join(sorted(x[1])[:10])],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'stems_to_signatures.txt'
        obj = double_sorted(self.stems_to_signatures().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Stems to signatures',
                     headers=['Stems', 'Signatures'],
                     row_functions=[lambda x: x[0],
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig)
                                              for sig in sorted(x[1]))],
                     column_widths=[15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'words_to_signatures.txt'
        obj = double_sorted(self.words_to_signatures().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Words to signatures',
                     headers=['Word', 'Sig count', 'Signatures'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig)
                                              for sig in sorted(x[1]))],
                     column_widths=[25, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_words.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to words',
                     headers=['Signature', 'Word count', 'Words'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_words_truncated.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to words '
                           '(first 10 words for each sig)',
                     headers=['Signature', 'Word count', '1st 10 words'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ', '.join(sorted(x[1])[:10])],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'words_to_sigtransforms.txt'
        obj = double_sorted(self.words_to_sigtransforms().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Words to sigtransforms',
                     headers=['Word', 'Signature transforms'],
                     row_functions=[lambda x: x[0],
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig) +
                                              SEP_SIGTRANSFORM + affix
                                              for sig, affix in sorted(x[1]))],
                     column_widths=[20, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'affixes_to_signatures.txt'
        obj = double_sorted(self.affixes_to_signatures().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Affixes to signatures',
                     headers=['Affix', 'Sig count', 'Signatures'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig)
                                              for sig in sorted(x[1]))],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        if self.corpus_file_object:
            vprint('manifold objects', verbose=verbose)

            fname = 'words_to_neighbors.txt'
            obj = list()  # list of tuple(word, list of neighbor words)
            for word in self.wordlist()[: self.parameters()['max_word_types']]:
                obj.append((word, self.words_to_neighbors()[word]))
            f_path = os.path.join(output_dir, fname)
            output_latex(obj, f_path,
                         title='Words to neighbors',
                         headers=['Word', 'Neighbors'],
                         row_functions=[lambda x: x[0],
                                        lambda x: ' '.join(x[1])],
                         column_widths=[25, 0],
                         lxa_parameters=self.parameters(),
                         test=test, encoding=self.encoding,
                         number_of_word_types=self.number_of_word_types(),
                         number_of_word_tokens=self.number_of_word_tokens(),
                         input_file_path=self.file_abspath)
            vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        vprint('phon objects', verbose=verbose)

        def output_latex_for_phon_words(obj_, f_path_, title_, lxa_parameters_,
                                        test_, encoding_, number_of_word_types_,
                                        number_of_word_tokens_,
                                        input_file_path_):
            output_latex(obj_, f_path_,
                         title=title_,
                         headers=['Word', 'Count', 'Frequency', 'Phones',
                                  'Unigram plog', 'Avg unigram plog',
                                  'Bigram plog', 'Avg bigram plog'],
                         row_functions=[lambda x: x[0],
                                        lambda x: x[1].count,
                                        lambda x:
                                        '%.6f' % x[1].frequency,
                                        lambda x:
                                        ' '.join(x[1].phones),
                                        lambda x:
                                        '%8.3f' % x[1].unigram_plog,
                                        lambda x:
                                        '%8.3f' % x[1].avg_unigram_plog,
                                        lambda x:
                                        '%8.3f' % x[1].bigram_plog,
                                        lambda x:
                                        '%8.3f' % x[1].avg_bigram_plog,
                                        ],
                         column_widths=[35, 10, 15, 60, 15, 15, 15, 15],
                         lxa_parameters=lxa_parameters_,
                         test=test_, encoding=encoding_,
                         number_of_word_types=number_of_word_types_,
                         number_of_word_tokens=number_of_word_tokens_,
                         input_file_path=input_file_path_)

        fname = 'wordlist.txt'
        obj_word_phon = list()  # list of tuple(word, list of neighbor words)
        for word in self.wordlist():
            obj_word_phon.append((word, self.word_phonology_dict()[word]))
        f_path = os.path.join(output_dir, 'wordlist.txt')
        output_latex_for_phon_words(obj_word_phon, f_path,
                                    'Wordlist sorted by word count',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'wordlist_by_avg_unigram_plog.txt'
        obj_unigram_plog = double_sorted(obj_word_phon,
                                         key=lambda x: x[1].avg_unigram_plog,
                                         reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_unigram_plog, f_path,
                                    'Wordlist sorted by avg unigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'wordlist_by_avg_bigram_plog.txt'
        obj_bigram_plog = double_sorted(obj_word_phon,
                                        key=lambda x: x[1].avg_bigram_plog,
                                        reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_bigram_plog, f_path,
                                    'Wordlist sorted by avg bigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'phones.txt'
        obj = double_sorted(self.phone_dict().items(),
                            key=lambda x: x[1].count, reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Phones',
                     headers=['Phone', 'Count', 'Frequency', 'Plog'],
                     row_functions=[lambda x: x[0],
                                    lambda x: x[1].count,
                                    lambda x: '%.6f' % x[1].frequency,
                                    lambda x: '%8.3f' % x[1].plog,
                                    ],
                     column_widths=[10, 10, 15, 15],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'biphones.txt'
        obj = double_sorted(self.biphone_dict().items(),
                            key=lambda x: x[1].count, reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Biphones',
                     headers=['Biphone', 'Count', 'Frequency',
                              'MI', 'Weighted MI'],
                     row_functions=[lambda x: ' '.join(x[0]),
                                    lambda x: x[1].count,
                                    lambda x:
                                    '%.6f' % x[1].frequency,
                                    lambda x:
                                    '%8.3f' % x[1].MI,
                                    lambda x:
                                    '%8.3f' % x[1].weighted_MI,
                                    ],
                     column_widths=[10, 10, 15, 15, 15],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'triphones.txt'
        obj = double_sorted(self.phone_trigram_counter().items(),
                            key=lambda x: x[1], reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Triphones',
                     headers=['Triphone', 'Count'],
                     row_functions=[lambda x: ' '.join(x[0]),
                                    lambda x: x[1],
                                    ],
                     column_widths=[15, 10],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        vprint('trie objects', verbose=verbose)

        fname = 'words_as_tries.txt'
        obj = list()
        for word in self.wordlist():
            obj.append((word,
                        self.broken_words_left_to_right()[word],
                        self.broken_words_right_to_left()[word]))
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Words as tries',
                     headers=['Word', 'Left-to-right trie',
                              'Right-to-left trie'],
                     row_functions=[lambda x: x[0],
                                    lambda x: ' '.join(x[1]),
                                    lambda x: ' '.join(x[2]),
                                    ],
                     column_widths=[35, 50, 50],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'successors.txt'
        obj = double_sorted(self.successors().items(),
                            key=lambda x: len(x[1]), reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Successors',
                     headers=['String', 'Successors'],
                     row_functions=[lambda x: x[0],
                                    lambda x: ' '.join(sorted(x[1])),
                                    ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'predecessors.txt'
        obj = double_sorted(self.predecessors().items(),
                            key=lambda x: len(x[1]), reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Predecessors',
                     headers=['String', 'Predecessors'],
                     row_functions=[lambda x: x[0],
                                    lambda x: ' '.join(sorted(x[1])),
                                    ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)
Ejemplo n.º 11
0
 def run_trie_module(self, verbose=False):
     """Build all trie-related objects for this corpus.

     :param verbose: when True, announce progress on stdout via ``vprint``.
     """
     progress_message = 'Tries...'
     vprint(progress_message, verbose=verbose)
     self._make_all_trie_objects()
Ejemplo n.º 12
0
 def run_phon_module(self, verbose=False):
     """Build all phonology-related objects for this corpus.

     :param verbose: when True, announce progress on stdout via ``vprint``.
     """
     progress_message = 'Phonology...'
     vprint(progress_message, verbose=verbose)
     self._make_all_phon_objects()
Ejemplo n.º 13
0
 def run_signature_module(self, verbose=False):
     """Build all morphological-signature objects for this corpus.

     :param verbose: when True, announce progress on stdout via ``vprint``.
     """
     progress_message = 'Morphological signatures...'
     vprint(progress_message, verbose=verbose)
     self._make_all_signature_objects()
Ejemplo n.º 14
0
def test_vprint():
    # vprint is expected to return None whether or not verbose output is on;
    # this fragment uses the positional (verbose-first) calling convention.
    for flag in (False, True):
        assert vprint(flag, 'x') is None
Ejemplo n.º 15
0
def test_vprint():
    # vprint is expected to return None whether or not verbose output is on;
    # this fragment uses the keyword (verbose=...) calling convention.
    for flag in (False, True):
        assert vprint('x', verbose=flag) is None