def run(unigram_counter=None, bigram_counter=None, trigram_counter=None,
        max_word_types=1000, n_neighbors=9, n_eigenvectors=11,
        min_context_count=3):

    word_freq_pairs = double_sorted(unigram_counter.items(),
                                    key=lambda x: x[1], reverse=True)

    if len(word_freq_pairs) > max_word_types:
        wordlist = [word for word, _ in word_freq_pairs[: max_word_types]]
    else:
        wordlist = [word for word, _ in word_freq_pairs]

    n_words = len(wordlist)

    # computing the context array
    # also words_to_contexts and contexts_to_words dicts
    context_array, words_to_contexts, contexts_to_words = get_array(
        wordlist, bigram_counter, trigram_counter, min_context_count)

    # computing shared context master matrix
    shared_context_matrix = context_array.dot(context_array.T).todense()
    del context_array

    # computing diameter
    diameter = normalize(n_words, shared_context_matrix)

    # computing incidence graph
    incidence_graph = compute_incidence_graph(n_words, diameter,
                                              shared_context_matrix)
    del shared_context_matrix

    # computing laplacian matrix
    laplacian_matrix = compute_laplacian(diameter, incidence_graph)
    del diameter
    del incidence_graph

    # computing eigenvectors and eigenvalues
    eigenvalues, eigenvectors = compute_eigenvectors(laplacian_matrix)
    del laplacian_matrix

    # computing distances between words
    # take first N columns of eigenvector matrix
    coordinates = eigenvectors[:, : n_eigenvectors]
    word_distances = compute_words_distance(coordinates)
    del coordinates
    del eigenvalues

    # computing nearest neighbors now
    nearest_neighbors = compute_closest_neighbors(word_distances, n_neighbors)

    words_to_neighbors = dict()

    for i in range(len(wordlist)):
        line = nearest_neighbors[i]
        word_idx, neighbors_idx = line[0], line[1:]
        word = wordlist[word_idx]
        neighbors = [wordlist[idx] for idx in neighbors_idx]
        words_to_neighbors[word] = neighbors

    return words_to_neighbors, words_to_contexts, contexts_to_words
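A minimal driver sketch for run() above. It assumes run() and its helpers (get_array, normalize, compute_incidence_graph, and so on) are importable from the same module; the toy corpus and the reduced parameter values are purely illustrative.

from collections import Counter

tokens = 'the cat sat on the mat and the dog sat on the rug'.split()

# n-gram counters in the shape run() expects: words map to counts,
# bigrams and trigrams are keyed by tuples of words
unigram_counter = Counter(tokens)
bigram_counter = Counter(zip(tokens, tokens[1:]))
trigram_counter = Counter(zip(tokens, tokens[1:], tokens[2:]))

words_to_neighbors, words_to_contexts, contexts_to_words = run(
    unigram_counter=unigram_counter,
    bigram_counter=bigram_counter,
    trigram_counter=trigram_counter,
    max_word_types=1000,
    n_neighbors=2,        # far below the default 9; the corpus is tiny
    n_eigenvectors=3,
    min_context_count=1)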
Example #2
    def _make_wordlist(self):
        """
        Return a wordlist sorted by word frequency in descending order.
        (So "the" will most likely be the first word for written English.)
        """
        word_counter = self.word_unigram_counter()
        word_counter_sorted = double_sorted(word_counter.items(),
                                            key=lambda x: x[1], reverse=True)
        self._wordlist = [word for word, _ in word_counter_sorted]
    def create_major_display_table(input_iterable,
                                   key=lambda x: x, reverse=False,
                                   headers=None, row_cell_functions=None,
                                   cutoff=0,
                                   set_text_alignment=None):
        """
        This is a general function for creating a tabular display for the
        major display.
        """

        if not input_iterable:
            print('Warning: input is empty', flush=True)
            return

        if not hasattr(input_iterable, '__iter__'):
            print('Warning: input is not an iterable', flush=True)
            return

        number_of_headers = len(headers)
        number_of_columns = len(row_cell_functions)

        if number_of_headers != number_of_columns:
            print('headers and cell functions don\'t match', flush=True)
            return

        len_input = len(input_iterable)

        table_widget = QTableWidget()
        table_widget.clear()
        table_widget.setSortingEnabled(False)

        # set up row count
        if cutoff and cutoff < len_input:
            actual_cutoff = cutoff
        else:
            actual_cutoff = len_input

        table_widget.setRowCount(actual_cutoff)

        # set up column count and table headers
        table_widget.setColumnCount(number_of_headers)
        table_widget.setHorizontalHeaderLabels(headers)

        # fill in the table
        for row, x in enumerate(double_sorted(input_iterable, key=key,
                                              reverse=reverse)):
            for col, fn in enumerate(row_cell_functions):
                cell = fn(x)

                if isinstance(cell, (int, float)):
                    # cell is numeric
                    item = QTableWidgetItem()
                    item.setData(Qt.EditRole, cell)
                else:
                    # cell is not numeric
                    item = QTableWidgetItem(cell)

                if set_text_alignment:
                    for align_col, alignment in set_text_alignment:
                        if col == align_col:
                            item.setTextAlignment(alignment)

                table_widget.setItem(row, col, item)

            if row >= actual_cutoff:
                break

        table_widget.setSortingEnabled(True)
        table_widget.resizeColumnsToContents()

        return table_widget
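A hypothetical call sketch for create_major_display_table(). It assumes PyQt5 (the source of the QTableWidget, QTableWidgetItem and Qt names used above) and the project's double_sorted helper are available; the word counts are made up.

import sys

from PyQt5.QtCore import Qt
from PyQt5.QtWidgets import QApplication

app = QApplication(sys.argv)    # a QTableWidget needs a running application

word_counter = {'the': 120, 'cat': 7, 'sat': 5}
table = create_major_display_table(
    word_counter.items(),
    key=lambda x: x[1], reverse=True,
    headers=['Word', 'Count'],
    row_cell_functions=[lambda x: x[0], lambda x: x[1]],
    cutoff=0,                                  # 0 means "show every row"
    set_text_alignment=[(1, Qt.AlignRight)])   # right-align the Count column
table.show()
app.exec_()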
def get_array(wordlist, bigram_to_freq, trigram_to_freq,
              min_context_count):
    # map each word to its position in wordlist
    worddict = {word: idx for idx, word in enumerate(wordlist)}

    # convert the bigram and trigram counter dicts into list and sort them
    # throw away bi/trigrams whose frequency is below min_context_count

    bigram_to_freq_sorted = [(bigram, freq) for bigram, freq in
                             double_sorted(bigram_to_freq.items(),
                                           key=lambda x: x[1],
                                           reverse=True) if
                             freq >= min_context_count]

    trigram_to_freq_sorted = [(trigram, freq) for trigram, freq in
                              double_sorted(trigram_to_freq.items(),
                                            key=lambda x: x[1],
                                            reverse=True) if
                              freq >= min_context_count]

    # This is necessary so we can reference variables from inner functions
    class Namespace:
        pass

    ns = Namespace()
    ns.n_contexts = 0

    # We use "n_contexts" to keep track of how many unique contexts there are.
    # Conveniently, n_contexts also serves to provide a unique context
    # index whenever the program encounters a new context. The dummy class
    # Namespace is to make it possible that we can refer to and update
    # n_contexts within inner functions
    # (both "contexts_increment" and "add_word")
    # inside this "get_array" function.
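    # (Under Python 3 alone, a plain integer together with the "nonlocal"
    # keyword inside contexts_increment would achieve the same thing; the
    # Namespace object presumably keeps the closure working under Python 2,
    # which has no "nonlocal".)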

    def contexts_increment():
        tmp = ns.n_contexts
        ns.n_contexts += 1
        return tmp

    contextdict = defaultdict(contexts_increment)
    # key: context (e.g., the tuple ('of', '_', 'cat') as a 3-gram context for 'the')
    # value: context index (int)
    # This dict is analogous to worddict, where each key is a word (str)
    # and each value is a word index (int).

    # entries for sparse matrix
    rows = []  # row numbers are word indices
    cols = []  # column numbers are context indices
    values = []

    words_to_contexts = dict()
    contexts_to_words = dict()

    for word in worddict.keys():
        words_to_contexts[word] = dict()

    def add_word(current_word, current_context, occurrence_count):
        word_no = worddict[current_word]
        context_no = contextdict[current_context]
        rows.append(word_no)
        cols.append(context_no)

        # appending 1 gives "type" counts: each word-context pair is counted
        # once; appending occurrence_count instead would give "token" counts.
        values.append(1)

        # update words_to_contexts and contexts_to_words
        if current_context not in words_to_contexts[current_word]:
            words_to_contexts[current_word][current_context] = 0

        if current_context not in contexts_to_words:
            contexts_to_words[current_context] = dict()
        if current_word not in contexts_to_words[current_context]:
            contexts_to_words[current_context][current_word] = 0

        words_to_contexts[current_word][current_context] += occurrence_count
        contexts_to_words[current_context][current_word] += occurrence_count

    for trigram, freq in trigram_to_freq_sorted:
        word1, word2, word3 = trigram

        context1 = ('_', word2, word3)
        context2 = (word1, '_', word3)
        context3 = (word1, word2, '_')

        if word1 in words_to_contexts:
            add_word(word1, context1, freq)
        if word2 in words_to_contexts:
            add_word(word2, context2, freq)
        if word3 in words_to_contexts:
            add_word(word3, context3, freq)

    for bigram, freq in bigram_to_freq_sorted:
        word1, word2 = bigram

        context1 = ('_', word2)
        context2 = (word1, '_')

        if word1 in words_to_contexts:
            add_word(word1, context1, freq)
        if word2 in words_to_contexts:
            add_word(word2, context2, freq)

    # csr_matrix is scipy.sparse's Compressed Sparse Row matrix format
    context_array = sparse.csr_matrix((values, (rows, cols)),
                                      shape=(len(worddict), ns.n_contexts + 1),
                                      dtype=np.int64)

    return context_array, words_to_contexts, contexts_to_words
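A tiny self-contained illustration of what get_array() builds, assuming its dependencies (double_sorted, collections.defaultdict, numpy as np, and scipy.sparse as sparse) are importable; the three-word wordlist and the counts are made up.

wordlist = ['the', 'cat', 'sat']
bigram_to_freq = {('the', 'cat'): 5, ('cat', 'sat'): 4}
trigram_to_freq = {('the', 'cat', 'sat'): 4}

context_array, words_to_contexts, contexts_to_words = get_array(
    wordlist, bigram_to_freq, trigram_to_freq, min_context_count=1)

# 'cat' ends up with three contexts: the trigram context ('the', '_', 'sat')
# plus the bigram contexts ('the', '_') and ('_', 'sat'), with token counts
# 4, 5 and 4 respectively.
print(words_to_contexts['cat'])

# context_array is a words-by-contexts sparse 0/1 ("type" count) matrix:
# one row per word, one column per distinct context, plus the one extra
# empty column that comes from the "ns.n_contexts + 1" above.
print(context_array.toarray())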
Example #5
    def __init__(self,
                 file_path=None,
                 wordlist_file=False,
                 corpus_object=None,
                 wordlist_object=None,
                 encoding=ENCODING,
                 **kwargs):
        self.file_abspath = self._check_file_path(file_path)

        if self.file_abspath is None:
            self.directory = None
        else:
            self.directory = os.path.dirname(self.file_abspath)

        self.file_is_wordlist = wordlist_file
        self.encoding = encoding
        self.corpus_object = corpus_object
        self.wordlist_object = wordlist_object
        self.parameters_ = self._determine_parameters(**kwargs)

        # number of word types and tokens
        self._number_of_word_types = None
        self._number_of_word_tokens = None

        # word ngrams
        self._word_unigram_counter = None
        self._word_bigram_counter = None
        self._word_trigram_counter = None

        # wordlist
        self._wordlist = None
        if self.wordlist_object is not None:
            # self.wordlist_object is
            # either an iterable or a dict of word-count pairs
            if type(self.wordlist_object) is dict:
                word_count_dict = dict()
                if self.parameters_['keep_case']:
                    word_count_dict = self.wordlist_object
                else:
                    for word, count in self.wordlist_object.items():
                        word = word.lower()
                        if word not in word_count_dict:
                            word_count_dict[word] = 0
                        word_count_dict[word] += count

                self._wordlist = [
                    word_
                    for word_, _ in double_sorted(word_count_dict.items(),
                                                  key=lambda x: x[1],
                                                  reverse=True)
                ]
                self._word_unigram_counter = word_count_dict

            elif hasattr(self.wordlist_object, '__iter__'):
                if self.parameters_['keep_case']:
                    self._wordlist = sorted(set(self.wordlist_object))
                else:
                    self._wordlist = sorted(
                        set(w.lower() for w in self.wordlist_object))
                self._word_unigram_counter = {w: 1 for w in self._wordlist}

            else:
                raise TypeError('wordlist object must be a dict of '
                                'word-count pairs or an iterable of words')

        # corpus file object
        if self.corpus_object is not None:
            # self.corpus_object is either a list of strings or a long str
            if type(self.corpus_object) is list:
                corpus_str = fix_punctuations(' '.join(self.corpus_object))
            elif type(self.corpus_object) is six.text_type:
                corpus_str = fix_punctuations(self.corpus_object)
            else:
                raise TypeError('corpus object must be either a str '
                                'or a list of strings')
            self.corpus_file_object = StringIO(corpus_str)
        elif self.file_abspath and not self.file_is_wordlist:
            self.corpus_file_object = open(self.file_abspath,
                                           encoding=self.encoding)
        else:
            self.corpus_file_object = None

        # wordlist file object
        if self.file_is_wordlist:
            self.wordlist_file_object = open(self.file_abspath,
                                             encoding=self.encoding)
        else:
            self.wordlist_file_object = StringIO()

        # manifold-related objects
        self._words_to_neighbors = None
        self._words_to_contexts = None
        self._contexts_to_words = None
        self._neighbor_graph = None

        # phon objects
        self._phone_unigram_counter = None
        self._phone_bigram_counter = None
        self._phone_trigram_counter = None

        self._phone_dict = None
        self._biphone_dict = None
        self._word_dict = None
        self._words_to_phones = None

        # trie objects
        self._broken_words_left_to_right = None
        self._broken_words_right_to_left = None
        self._successors = None
        self._predecessors = None

        Lexicon_BiSig.__init__(self, self.wordlist(),
                               self.parameters_['min_stem_length'],
                               self.parameters_['max_affix_length'],
                               self.parameters_['min_sig_count'],
                               self.parameters_['suffixing'])
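For reference, a self-contained sketch of the two wordlist_object shapes the constructor above accepts (with keep_case assumed False); the sample words are invented.

# Shape 1: a dict of word-count pairs. Case is folded and counts are merged,
# then the wordlist is sorted by descending count.
word_counts = {'The': 3, 'the': 5, 'Cat': 2}
word_count_dict = {}
for word, count in word_counts.items():
    word = word.lower()
    word_count_dict[word] = word_count_dict.get(word, 0) + count
# word_count_dict == {'the': 8, 'cat': 2}  ->  wordlist ['the', 'cat']

# Shape 2: a plain iterable of words. It is lowercased, deduplicated and
# sorted alphabetically, and every word gets a dummy unigram count of 1.
words = ['Cat', 'the', 'dog', 'THE']
wordlist = sorted(set(w.lower() for w in words))   # ['cat', 'dog', 'the']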
Example #6
    def output_all_results(self, directory=None, verbose=False, test=False):
        """
        Output all Linguistica results to *directory*.

        :param directory: output directory. If not specified, it defaults to
            the current directory given by ``os.getcwd()``.
        """
        if not directory:
            output_dir = os.getcwd()
        else:
            output_dir = os.path.abspath(directory)

        # ----------------------------------------------------------------------
        if self.corpus_file_object:
            vprint(verbose, 'ngram objects')

            fname = 'word_bigrams.txt'
            obj = double_sorted(self.word_bigram_counter().items(),
                                key=lambda x: x[1],
                                reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(
                obj,
                f_path,
                title='Word bigrams',
                headers=['Word bigram', 'Count'],
                row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
                column_widths=[50, 10],
                lxa_parameters=self.parameters(),
                test=test,
                encoding=self.encoding,
                number_of_word_types=self.number_of_word_types(),
                number_of_word_tokens=self.number_of_word_tokens(),
                input_file_path=self.file_abspath)
            vprint(verbose, '\t' + fname)

            fname = 'word_trigrams.txt'
            obj = double_sorted(self.word_trigram_counter().items(),
                                key=lambda x: x[1],
                                reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(
                obj,
                f_path,
                title='Word trigrams',
                headers=['Word trigram', 'Count'],
                row_functions=[lambda x: ' '.join(x[0]), lambda x: x[1]],
                column_widths=[75, 10],
                lxa_parameters=self.parameters(),
                test=test,
                encoding=self.encoding,
                number_of_word_types=self.number_of_word_types(),
                number_of_word_tokens=self.number_of_word_tokens(),
                input_file_path=self.file_abspath)
            vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        vprint(verbose, 'morphological signature objects')

        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Stems to words '
                     '(descending order of word count)',
                     headers=['Stem', 'Word count', 'Words'],
                     row_functions=[
                         lambda x: x[0], lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: x[0],
                            reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Stems to words '
                     '(alphabetical order of stems)',
                     headers=['Stem', 'Word count', '1st 10 words'],
                     row_functions=[
                         lambda x: x[0], lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_stems.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to stems',
                     headers=['Signature', 'Stem count', 'Stems'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_stems_truncated.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to stems '
                     '(first 10 stems for each sig)',
                     headers=['Signature', 'Stem count', '1st 10 stems'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ' '.join(sorted(x[1])[:10])
                     ],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'stems_to_signatures.txt'
        obj = double_sorted(self.stems_to_signatures().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Stems to signatures',
            headers=['Stems', 'Signatures'],
            row_functions=[
                lambda x: x[0],
                lambda x: ', '.join(SEP_SIG.join(sig) for sig in sorted(x[1]))
            ],
            column_widths=[15, 0],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'words_to_signatures.txt'
        obj = double_sorted(self.words_to_signatures().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Words to signatures',
            headers=['Word', 'Sig count', 'Signatures'],
            row_functions=[
                lambda x: x[0], lambda x: len(x[1]),
                lambda x: ', '.join(SEP_SIG.join(sig) for sig in sorted(x[1]))
            ],
            column_widths=[25, 15, 0],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_words.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to words',
                     headers=['Signature', 'Word count', 'Words'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1]))
                     ],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'signatures_to_words_truncated.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Signatures to words '
                     '(first 10 words for each sig)',
                     headers=['Signature', 'Word count', '1st 10 words'],
                     row_functions=[
                         lambda x: SEP_SIG.join(x[0]), lambda x: len(x[1]),
                         lambda x: ', '.join(sorted(x[1])[:10])
                     ],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'words_to_sigtransforms.txt'
        obj = double_sorted(self.words_to_sigtransforms().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Words to sigtransforms',
                     headers=['Word', 'Signature transforms'],
                     row_functions=[
                         lambda x: x[0], lambda x: ', '.join(
                             SEP_SIG.join(sig) + SEP_SIGTRANSFORM + affix
                             for sig, affix in sorted(x[1]))
                     ],
                     column_widths=[20, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'affixes_to_signatures.txt'
        obj = double_sorted(self.affixes_to_signatures().items(),
                            key=lambda x: len(x[1]),
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Affixes to signatures',
            headers=['Affix', 'Sig count', 'Signatures'],
            row_functions=[
                lambda x: x[0], lambda x: len(x[1]),
                lambda x: ', '.join(SEP_SIG.join(sig) for sig in sorted(x[1]))
            ],
            column_widths=[15, 15, 0],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        if self.corpus_file_object:
            vprint(verbose, 'manifold objects')

            fname = 'words_to_neighbors.txt'
            obj = list()  # list of tuple(word, list of neighbor words)
            for word in self.wordlist()[:self.parameters()['max_word_types']]:
                obj.append((word, self.words_to_neighbors()[word]))
            f_path = os.path.join(output_dir, fname)
            output_latex(
                obj,
                f_path,
                title='Words to neighbors',
                headers=['Word', 'Neighbors'],
                row_functions=[lambda x: x[0], lambda x: ' '.join(x[1])],
                column_widths=[25, 0],
                lxa_parameters=self.parameters(),
                test=test,
                encoding=self.encoding,
                number_of_word_types=self.number_of_word_types(),
                number_of_word_tokens=self.number_of_word_tokens(),
                input_file_path=self.file_abspath)
            vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        vprint(verbose, 'phon objects')

        def output_latex_for_phon_words(obj_, f_path_, title_, lxa_parameters_,
                                        test_, encoding_,
                                        number_of_word_types_,
                                        number_of_word_tokens_,
                                        input_file_path_):
            output_latex(obj_,
                         f_path_,
                         title=title_,
                         headers=[
                             'Word', 'Count', 'Frequency', 'Phones',
                             'Unigram plog', 'Avg unigram plog', 'Bigram plog',
                             'Avg bigram plog'
                         ],
                         row_functions=[
                             lambda x: x[0],
                             lambda x: x[1].count,
                             lambda x: '%.6f' % x[1].frequency,
                             lambda x: ' '.join(x[1].phones),
                             lambda x: '%8.3f' % x[1].unigram_plog,
                             lambda x: '%8.3f' % x[1].avg_unigram_plog,
                             lambda x: '%8.3f' % x[1].bigram_plog,
                             lambda x: '%8.3f' % x[1].avg_bigram_plog,
                         ],
                         column_widths=[35, 10, 15, 60, 15, 15, 15, 15],
                         lxa_parameters=lxa_parameters_,
                         test=test_,
                         encoding=encoding_,
                         number_of_word_types=number_of_word_types_,
                         number_of_word_tokens=number_of_word_tokens_,
                         input_file_path=input_file_path_)

        fname = 'wordlist.txt'
        obj_word_phon = list()  # list of tuple(word, phonology info for the word)
        for word in self.wordlist():
            obj_word_phon.append((word, self.word_phonology_dict()[word]))
        f_path = os.path.join(output_dir, 'wordlist.txt')
        output_latex_for_phon_words(obj_word_phon, f_path,
                                    'Wordlist sorted by word count',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'wordlist_by_avg_unigram_plog.txt'
        obj_unigram_plog = double_sorted(obj_word_phon,
                                         key=lambda x: x[1].avg_unigram_plog,
                                         reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_unigram_plog, f_path,
                                    'Wordlist sorted by avg unigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'wordlist_by_avg_bigram_plog.txt'
        obj_bigram_plog = double_sorted(obj_word_phon,
                                        key=lambda x: x[1].avg_bigram_plog,
                                        reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_bigram_plog, f_path,
                                    'Wordlist sorted by avg bigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'phones.txt'
        obj = double_sorted(self.phone_dict().items(),
                            key=lambda x: x[1].count,
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Phones',
                     headers=['Phone', 'Count', 'Frequency', 'Plog'],
                     row_functions=[
                         lambda x: x[0],
                         lambda x: x[1].count,
                         lambda x: '%.6f' % x[1].frequency,
                         lambda x: '%8.3f' % x[1].plog,
                     ],
                     column_widths=[10, 10, 15, 15],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'biphones.txt'
        obj = double_sorted(self.biphone_dict().items(),
                            key=lambda x: x[1].count,
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Biphones',
            headers=['Biphone', 'Count', 'Frequency', 'MI', 'Weighted MI'],
            row_functions=[
                lambda x: ' '.join(x[0]),
                lambda x: x[1].count,
                lambda x: '%.6f' % x[1].frequency,
                lambda x: '%8.3f' % x[1].MI,
                lambda x: '%8.3f' % x[1].weighted_MI,
            ],
            column_widths=[10, 10, 15, 15, 15],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'triphones.txt'
        obj = double_sorted(self.phone_trigram_counter().items(),
                            key=lambda x: x[1],
                            reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Triphones',
                     headers=['Triphone', 'Count'],
                     row_functions=[
                         lambda x: ' '.join(x[0]),
                         lambda x: x[1],
                     ],
                     column_widths=[15, 10],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        # ----------------------------------------------------------------------
        vprint(verbose, 'trie objects')

        fname = 'words_as_tries.txt'
        obj = list()
        for word in self.wordlist():
            obj.append((word, self.broken_words_left_to_right()[word],
                        self.broken_words_right_to_left()[word]))
        f_path = os.path.join(output_dir, fname)
        output_latex(
            obj,
            f_path,
            title='Words as tries',
            headers=['Word', 'Left-to-right trie', 'Right-to-left trie'],
            row_functions=[
                lambda x: x[0],
                lambda x: ' '.join(x[1]),
                lambda x: ' '.join(x[2]),
            ],
            column_widths=[35, 50, 50],
            lxa_parameters=self.parameters(),
            test=test,
            encoding=self.encoding,
            number_of_word_types=self.number_of_word_types(),
            number_of_word_tokens=self.number_of_word_tokens(),
            input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'successors.txt'
        obj = double_sorted(self.successors().items(),
                            key=lambda x: len(x[1]),
                            reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Successors',
                     headers=['String', 'Successors'],
                     row_functions=[
                         lambda x: x[0],
                         lambda x: ' '.join(sorted(x[1])),
                     ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'predecessors.txt'
        obj = double_sorted(self.predecessors().items(),
                            key=lambda x: len(x[1]),
                            reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj,
                     f_path,
                     title='Predecessors',
                     headers=['String', 'Predecessors'],
                     row_functions=[
                         lambda x: x[0],
                         lambda x: ' '.join(sorted(x[1])),
                     ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test,
                     encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)
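A hedged driver sketch for output_all_results(). It assumes these snippets come from the Linguistica 5 package, whose top-level read_corpus() returns the lexicon object this method belongs to; the file paths are placeholders.

import os

import linguistica as lxa    # assumption: the package these snippets are from

lexicon = lxa.read_corpus('corpus.txt')    # placeholder input file

out_dir = os.path.abspath('lxa_results')
os.makedirs(out_dir, exist_ok=True)        # output files are opened inside
                                           # an existing directory
lexicon.output_all_results(directory=out_dir, verbose=True)
# writes word_bigrams.txt, stems_to_words.txt, signatures_to_stems.txt, ...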
Example #7
def run(unigram_counter=None,
        bigram_counter=None,
        trigram_counter=None,
        max_word_types=1000,
        n_neighbors=9,
        n_eigenvectors=11,
        min_context_count=3):

    word_freq_pairs = double_sorted(unigram_counter.items(),
                                    key=lambda x: x[1],
                                    reverse=True)

    if len(word_freq_pairs) > max_word_types:
        wordlist = [word for word, _ in word_freq_pairs[:max_word_types]]
    else:
        wordlist = [word for word, _ in word_freq_pairs]

    n_words = len(wordlist)

    # computing the context array
    # also words_to_contexts and contexts_to_words dicts
    context_array, words_to_contexts, contexts_to_words = get_array(
        wordlist, bigram_counter, trigram_counter, min_context_count)

    # computing shared context master matrix
    shared_context_matrix = context_array.dot(context_array.T).todense()
    del context_array

    # computing diameter
    diameter = normalize(n_words, shared_context_matrix)

    # computing incidence graph
    incidence_graph = compute_incidence_graph(n_words, diameter,
                                              shared_context_matrix)
    del shared_context_matrix

    # computing laplacian matrix
    laplacian_matrix = compute_laplacian(diameter, incidence_graph)
    del diameter
    del incidence_graph

    # computing eigenvectors and eigenvalues
    eigenvalues, eigenvectors = compute_eigenvectors(laplacian_matrix)
    del laplacian_matrix

    # computing distances between words
    # take first N columns of eigenvector matrix
    coordinates = eigenvectors[:, :n_eigenvectors]
    word_distances = compute_words_distance(coordinates)
    del coordinates
    del eigenvalues

    # computing nearest neighbors now
    nearest_neighbors = compute_closest_neighbors(word_distances, n_neighbors)

    words_to_neighbors = dict()

    for i in range(len(wordlist)):
        line = nearest_neighbors[i]
        word_idx, neighbors_idx = line[0], line[1:]
        word = wordlist[word_idx]
        neighbors = [wordlist[idx] for idx in neighbors_idx]
        words_to_neighbors[word] = neighbors

    return words_to_neighbors, words_to_contexts, contexts_to_words
Example #8
def get_array(wordlist, bigram_to_freq, trigram_to_freq, min_context_count):
    # map each word to its position in wordlist
    worddict = {word: idx for idx, word in enumerate(wordlist)}

    # convert the bigram and trigram counter dicts into list and sort them
    # throw away bi/trigrams whose frequency is below min_context_count

    bigram_to_freq_sorted = [(bigram, freq) for bigram, freq in double_sorted(
        bigram_to_freq.items(), key=lambda x: x[1], reverse=True)
                             if freq >= min_context_count]

    trigram_to_freq_sorted = [
        (trigram, freq) for trigram, freq in double_sorted(
            trigram_to_freq.items(), key=lambda x: x[1], reverse=True)
        if freq >= min_context_count
    ]

    # This is necessary so we can reference variables from inner functions
    class Namespace:
        pass

    ns = Namespace()
    ns.n_contexts = 0

    # We use "n_contexts" to keep track of how many unique contexts there are.
    # Conveniently, n_contexts also serves to provide a unique context
    # index whenever the program encounters a new context. The dummy class
    # Namespace is to make it possible that we can refer to and update
    # n_contexts within inner functions
    # (both "contexts_increment" and "add_word")
    # inside this "get_array" function.

    def contexts_increment():
        tmp = ns.n_contexts
        ns.n_contexts += 1
        return tmp

    contextdict = defaultdict(contexts_increment)
    # key: context (e.g., the tuple ('of', '_', 'cat') as a 3-gram context for 'the')
    # value: context index (int)
    # This dict is analogous to worddict, where each key is a word (str)
    # and each value is a word index (int).

    # entries for sparse matrix
    rows = []  # row numbers are word indices
    cols = []  # column numbers are context indices
    values = []

    words_to_contexts = dict()
    contexts_to_words = dict()

    for word in worddict.keys():
        words_to_contexts[word] = dict()

    def add_word(current_word, current_context, occurrence_count):
        word_no = worddict[current_word]
        context_no = contextdict[current_context]
        rows.append(word_no)
        cols.append(context_no)

        # appending 1 gives "type" counts: each word-context pair is counted
        # once; appending occurrence_count instead would give "token" counts.
        values.append(1)

        # update words_to_contexts and contexts_to_words
        if current_context not in words_to_contexts[current_word]:
            words_to_contexts[current_word][current_context] = 0

        if current_context not in contexts_to_words:
            contexts_to_words[current_context] = dict()
        if current_word not in contexts_to_words[current_context]:
            contexts_to_words[current_context][current_word] = 0

        words_to_contexts[current_word][current_context] += occurrence_count
        contexts_to_words[current_context][current_word] += occurrence_count

    for trigram, freq in trigram_to_freq_sorted:
        word1, word2, word3 = trigram

        context1 = ('_', word2, word3)
        context2 = (word1, '_', word3)
        context3 = (word1, word2, '_')

        if word1 in words_to_contexts:
            add_word(word1, context1, freq)
        if word2 in words_to_contexts:
            add_word(word2, context2, freq)
        if word3 in words_to_contexts:
            add_word(word3, context3, freq)

    for bigram, freq in bigram_to_freq_sorted:
        word1, word2 = bigram

        context1 = ('_', word2)
        context2 = (word1, '_')

        if word1 in words_to_contexts:
            add_word(word1, context1, freq)
        if word2 in words_to_contexts:
            add_word(word2, context2, freq)

    # csr_matrix is scipy.sparse's Compressed Sparse Row matrix format
    context_array = sparse.csr_matrix((values, (rows, cols)),
                                      shape=(len(worddict), ns.n_contexts + 1),
                                      dtype=np.int64)

    return context_array, words_to_contexts, contexts_to_words
Example #9
    def output_all_results(self, directory=None, verbose=False, test=False):
        """
        Output all Linguistica results to *directory*.

        :param directory: output directory. If not specified, it defaults to
            the current directory given by ``os.getcwd()``.
        """
        if not directory:
            output_dir = os.getcwd()
        else:
            output_dir = os.path.abspath(directory)

        # ----------------------------------------------------------------------
        if self.corpus_file_object:
            vprint('ngram objects', verbose=verbose)

            fname = 'word_bigrams.txt'
            obj = double_sorted(self.word_bigram_counter().items(),
                                key=lambda x: x[1], reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(obj, f_path,
                         title='Word bigrams',
                         headers=['Word bigram', 'Count'],
                         row_functions=[lambda x: ' '.join(x[0]),
                                        lambda x: x[1]],
                         column_widths=[50, 10],
                         lxa_parameters=self.parameters(),
                         test=test, encoding=self.encoding,
                         number_of_word_types=self.number_of_word_types(),
                         number_of_word_tokens=self.number_of_word_tokens(),
                         input_file_path=self.file_abspath)
            vprint('\t' + fname, verbose=verbose)

            fname = 'word_trigrams.txt'
            obj = double_sorted(self.word_trigram_counter().items(),
                                key=lambda x: x[1], reverse=True)
            f_path = os.path.join(output_dir, fname)
            output_latex(obj, f_path,
                         title='Word trigrams',
                         headers=['Word trigram', 'Count'],
                         row_functions=[lambda x: ' '.join(x[0]),
                                        lambda x: x[1]],
                         column_widths=[75, 10],
                         lxa_parameters=self.parameters(),
                         test=test, encoding=self.encoding,
                         number_of_word_types=self.number_of_word_types(),
                         number_of_word_tokens=self.number_of_word_tokens(),
                         input_file_path=self.file_abspath)
            vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        vprint('morphological signature objects', verbose=verbose)

        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Stems to words '
                           '(descending order of word count)',
                     headers=['Stem', 'Word count', 'Words'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'stems_to_words.txt'
        obj = double_sorted(self.stems_to_words().items(),
                            key=lambda x: x[0], reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Stems to words '
                           '(alphabetical order of stems)',
                     headers=['Stem', 'Word count', '1st 10 words'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_stems.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to stems',
                     headers=['Signature', 'Stem count', 'Stems'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_stems_truncated.txt'
        obj = double_sorted(self.signatures_to_stems().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to stems '
                           '(first 10 stems for each sig)',
                     headers=['Signature', 'Stem count', '1st 10 stems'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ' '.join(sorted(x[1])[:10])],
                     column_widths=[30, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'stems_to_signatures.txt'
        obj = double_sorted(self.stems_to_signatures().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Stems to signatures',
                     headers=['Stems', 'Signatures'],
                     row_functions=[lambda x: x[0],
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig)
                                              for sig in sorted(x[1]))],
                     column_widths=[15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'words_to_signatures.txt'
        obj = double_sorted(self.words_to_signatures().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Words to signatures',
                     headers=['Word', 'Sig count', 'Signatures'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig)
                                              for sig in sorted(x[1]))],
                     column_widths=[25, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_words.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to words',
                     headers=['Signature', 'Word count', 'Words'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x: ', '.join(sorted(x[1]))],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'signatures_to_words_truncated.txt'
        obj = double_sorted(self.signatures_to_words().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Signatures to words '
                           '(first 10 words for each sig)',
                     headers=['Signature', 'Word count', '1st 10 words'],
                     row_functions=[lambda x: SEP_SIG.join(x[0]),
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ', '.join(sorted(x[1])[:10])],
                     column_widths=[20, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'words_to_sigtransforms.txt'
        obj = double_sorted(self.words_to_sigtransforms().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Words to sigtransforms',
                     headers=['Word', 'Signature transforms'],
                     row_functions=[lambda x: x[0],
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig) +
                                              SEP_SIGTRANSFORM + affix
                                              for sig, affix in sorted(x[1]))],
                     column_widths=[20, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'affixes_to_signatures.txt'
        obj = double_sorted(self.affixes_to_signatures().items(),
                            key=lambda x: len(x[1]), reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Affixes to signatures',
                     headers=['Affix', 'Sig count', 'Signatures'],
                     row_functions=[lambda x: x[0],
                                    lambda x: len(x[1]),
                                    lambda x:
                                    ', '.join(SEP_SIG.join(sig)
                                              for sig in sorted(x[1]))],
                     column_widths=[15, 15, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        if self.corpus_file_object:
            vprint('manifold objects', verbose=verbose)
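            # words_to_neighbors data exists only when a running corpus text
            # (rather than just a wordlist) was supplied, hence the guard on
            # self.corpus_file_object above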

            fname = 'words_to_neighbors.txt'
            obj = list()  # list of tuple(word, list of neighbor words)
            for word in self.wordlist()[: self.parameters()['max_word_types']]:
                obj.append((word, self.words_to_neighbors()[word]))
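            # obj now holds (word, neighbor list) pairs in descending word
            # frequency order, e.g. with hypothetical data:
            #   [('the', ['of', 'a', 'and', ...]), ('of', ['the', ...]), ...]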
            f_path = os.path.join(output_dir, fname)
            output_latex(obj, f_path,
                         title='Words to neighbors',
                         headers=['Word', 'Neighbors'],
                         row_functions=[lambda x: x[0],
                                        lambda x: ' '.join(x[1])],
                         column_widths=[25, 0],
                         lxa_parameters=self.parameters(),
                         test=test, encoding=self.encoding,
                         number_of_word_types=self.number_of_word_types(),
                         number_of_word_tokens=self.number_of_word_tokens(),
                         input_file_path=self.file_abspath)
            vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        vprint('phon objects', verbose=verbose)

        def output_latex_for_phon_words(obj_, f_path_, title_, lxa_parameters_,
                                        test_, encoding_, number_of_word_types_,
                                        number_of_word_tokens_,
                                        input_file_path_):
            output_latex(obj_, f_path_,
                         title=title_,
                         headers=['Word', 'Count', 'Frequency', 'Phones',
                                  'Unigram plog', 'Avg unigram plog',
                                  'Bigram plog', 'Avg bigram plog'],
                         row_functions=[lambda x: x[0],
                                        lambda x: x[1].count,
                                        lambda x:
                                        '%.6f' % x[1].frequency,
                                        lambda x:
                                        ' '.join(x[1].phones),
                                        lambda x:
                                        '%8.3f' % x[1].unigram_plog,
                                        lambda x:
                                        '%8.3f' % x[1].avg_unigram_plog,
                                        lambda x:
                                        '%8.3f' % x[1].bigram_plog,
                                        lambda x:
                                        '%8.3f' % x[1].avg_bigram_plog,
                                        ],
                         column_widths=[35, 10, 15, 60, 15, 15, 15, 15],
                         lxa_parameters=lxa_parameters_,
                         test=test_, encoding=encoding_,
                         number_of_word_types=number_of_word_types_,
                         number_of_word_tokens=number_of_word_tokens_,
                         input_file_path=input_file_path_)
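        # The helper above fixes the eight-column layout for the phon-word
        # tables; the three wordlist outputs below differ only in row order
        # and title.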

        fname = 'wordlist.txt'
        obj_word_phon = list()  # list of tuple(word, phonology record)
        for word in self.wordlist():
            obj_word_phon.append((word, self.word_phonology_dict()[word]))
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_word_phon, f_path,
                                    'Wordlist sorted by word count',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'wordlist_by_avg_unigram_plog.txt'
        obj_unigram_plog = double_sorted(obj_word_phon,
                                         key=lambda x: x[1].avg_unigram_plog,
                                         reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_unigram_plog, f_path,
                                    'Wordlist sorted by avg unigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'wordlist_by_avg_bigram_plog.txt'
        obj_bigram_plog = double_sorted(obj_word_phon,
                                        key=lambda x: x[1].avg_bigram_plog,
                                        reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex_for_phon_words(obj_bigram_plog, f_path,
                                    'Wordlist sorted by avg bigram plog',
                                    self.parameters(), test, self.encoding,
                                    self.number_of_word_types(),
                                    self.number_of_word_tokens(),
                                    self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'phones.txt'
        obj = double_sorted(self.phone_dict().items(),
                            key=lambda x: x[1].count, reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Phones',
                     headers=['Phone', 'Count', 'Frequency', 'Plog'],
                     row_functions=[lambda x: x[0],
                                    lambda x: x[1].count,
                                    lambda x: '%.6f' % x[1].frequency,
                                    lambda x: '%8.3f' % x[1].plog,
                                    ],
                     column_widths=[10, 10, 15, 15],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'biphones.txt'
        obj = double_sorted(self.biphone_dict().items(),
                            key=lambda x: x[1].count, reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Biphones',
                     headers=['Biphone', 'Count', 'Frequency',
                              'MI', 'Weighted MI'],
                     row_functions=[lambda x: ' '.join(x[0]),
                                    lambda x: x[1].count,
                                    lambda x:
                                    '%.6f' % x[1].frequency,
                                    lambda x:
                                    '%8.3f' % x[1].MI,
                                    lambda x:
                                    '%8.3f' % x[1].weighted_MI,
                                    ],
                     column_widths=[10, 10, 15, 15, 15],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'triphones.txt'
        obj = double_sorted(self.phone_trigram_counter().items(),
                            key=lambda x: x[1], reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Triphones',
                     headers=['Triphone', 'Count'],
                     row_functions=[lambda x: ' '.join(x[0]),
                                    lambda x: x[1],
                                    ],
                     column_widths=[15, 10],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        # ----------------------------------------------------------------------
        vprint('trie objects', verbose=verbose)

        fname = 'words_as_tries.txt'
        obj = list()
        for word in self.wordlist():
            obj.append((word,
                        self.broken_words_left_to_right()[word],
                        self.broken_words_right_to_left()[word]))
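        # each row is (word, left-to-right pieces, right-to-left pieces);
        # with a hypothetical entry ('jumping', ['jump', 'ing'], ['jumping'])
        # the rendered cells would read 'jumping', 'jump ing', 'jumping'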
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Words as tries',
                     headers=['Word', 'Left-to-right trie',
                              'Right-to-left trie'],
                     row_functions=[lambda x: x[0],
                                    lambda x: ' '.join(x[1]),
                                    lambda x: ' '.join(x[2]),
                                    ],
                     column_widths=[35, 50, 50],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'successors.txt'
        obj = double_sorted(self.successors().items(),
                            key=lambda x: len(x[1]), reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Successors',
                     headers=['String', 'Successors'],
                     row_functions=[lambda x: x[0],
                                    lambda x: ' '.join(sorted(x[1])),
                                    ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)

        fname = 'predecessors.txt'
        obj = double_sorted(self.predecessors().items(),
                            key=lambda x: len(x[1]), reverse=False)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Predecessors',
                     headers=['String', 'Predecessors'],
                     row_functions=[lambda x: x[0],
                                    lambda x: ' '.join(sorted(x[1])),
                                    ],
                     column_widths=[35, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint('\t' + fname, verbose=verbose)
Exemple #10
    def _initialize(self):
        # number of word types and tokens
        self._number_of_word_types = None
        self._number_of_word_tokens = None

        # word ngrams
        self._word_unigram_counter = None
        self._word_bigram_counter = None
        self._word_trigram_counter = None

        # wordlist
        self._wordlist = None
        if self.wordlist_object is not None:
            # self.wordlist_object is
            # either an iterable or a dict of word-count pairs
            if type(self.wordlist_object) is dict:
                word_count_dict = dict()
                if self.parameters_['keep_case']:
                    word_count_dict = self.wordlist_object
                else:
                    for word, count in self.wordlist_object.items():
                        word = word.lower()
                        if word not in word_count_dict:
                            word_count_dict[word] = 0
                        word_count_dict[word] += count

                self._wordlist = [word for word, _ in
                                  double_sorted(word_count_dict.items(),
                                                key=lambda x: x[1],
                                                reverse=True)]
                self._word_unigram_counter = word_count_dict

            elif hasattr(self.wordlist_object, '__iter__'):
                if self.parameters_['keep_case']:
                    self._wordlist = sorted(set(self.wordlist_object))
                else:
                    self._wordlist = sorted(
                        set(w.lower() for w in self.wordlist_object))
                self._word_unigram_counter = {w: 1 for w in self._wordlist}

            else:
                raise TypeError('wordlist object must be a dict of word-count '
                                'pairs or an iterable of words')
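            # Illustrative sketch with hypothetical input: when keep_case is
            # False, a wordlist_object such as {'The': 3, 'the': 5, 'cat': 2}
            # is merged into {'the': 8, 'cat': 2}, and self._wordlist becomes
            # ['the', 'cat'] (sorted by count in descending order).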

        # signature-related objects
        self._stems_to_words = None
        self._signatures_to_stems = None
        self._stems_to_signatures = None
        self._words_to_signatures = None
        self._signatures_to_words = None
        self._words_to_sigtransforms = None

        self._signatures = None
        self._affixes_to_signatures = None
        self._words_in_signatures = None
        self._affixes = None
        self._stems = None

        # corpus file object
        if self.corpus_object is not None:
            # self.corpus_object is either a list of strings or a long str
            if type(self.corpus_object) is list:
                corpus_str = fix_punctuations(' '.join(self.corpus_object))
            elif type(self.corpus_object) is str:
                corpus_str = fix_punctuations(self.corpus_object)
            else:
                raise TypeError('corpus object must be either a str or a list')
            self.corpus_file_object = StringIO(corpus_str)
        elif self.file_abspath and not self.file_is_wordlist:
            self.corpus_file_object = open(self.file_abspath,
                                           encoding=self.encoding)
        else:
            self.corpus_file_object = None
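        # all three branches leave self.corpus_file_object as a file-like
        # object (a StringIO for in-memory corpora, an open file handle for
        # corpus files on disk), or None when no corpus text is available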

        # wordlist file object
        if self.file_is_wordlist:
            self.wordlist_file_object = open(self.file_abspath,
                                             encoding=self.encoding)
        else:
            self.wordlist_file_object = StringIO()

        # manifold-related objects
        self._words_to_neighbors = None
        self._words_to_contexts = None
        self._contexts_to_words = None
        self._neighbor_graph = None

        # phon objects
        self._phone_unigram_counter = None
        self._phone_bigram_counter = None
        self._phone_trigram_counter = None

        self._phone_dict = None
        self._biphone_dict = None
        self._word_dict = None
        self._words_to_phones = None

        # trie objects
        self._broken_words_left_to_right = None
        self._broken_words_right_to_left = None
        self._successors = None
        self._predecessors = None
def run(unigram_counter=None,
        bigram_counter=None,
        trigram_counter=None,
        max_word_types=1000,
        n_neighbors=9,
        n_eigenvectors=11,
        min_context_count=3):

    word_freq_pairs = double_sorted(unigram_counter.items(),
                                    key=lambda x: x[1],
                                    reverse=True)

    # words = dict()
    # for word, _ in word_freq_pairs:
    #     words[word] = 0

    # word_pairs = dict()
    # for word, _ in word_freq_pairs:
    #     word_pairs[word] = words
    # print(word_pairs)

    if len(word_freq_pairs) > max_word_types:
        freqlist = [freq for _, freq in word_freq_pairs[:max_word_types]]
        wordlist = [word for word, _ in word_freq_pairs[:max_word_types]]
    else:
        freqlist = [freq for _, freq in word_freq_pairs]
        wordlist = [word for word, _ in word_freq_pairs]
    n_words = len(wordlist)
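    # wordlist and freqlist are parallel lists: wordlist[i] is the i-th most
    # frequent word type and freqlist[i] its count, e.g. with hypothetical
    # data wordlist = ['the', 'of', 'and', ...], freqlist = [1893, 920, ...]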

    # computing the context array
    # also words_to_contexts and contexts_to_words dicts
    context_array, words_to_contexts, contexts_to_words = get_array(
        wordlist, bigram_counter, trigram_counter, min_context_count)

    # computing shared context master matrix
    shared_context_matrix = context_array.dot(context_array.T).todense()
    # scale each cell by the inverse product of the two words' frequencies
    # (times a large scaling constant)
    for row in range(n_words):
        for col in range(n_words):
            shared_context_matrix[row, col] = (
                10000000000 * shared_context_matrix[row, col]
                / (freqlist[row] * freqlist[col]))
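    # worked example with hypothetical numbers: two words that share 4
    # contexts and have frequencies 2000 and 500 get the cell value
    # 10000000000 * 4 / (2000 * 500) = 40000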

    # computing sum of contexts shared with other words
    total_contexts = normalize(n_words, shared_context_matrix)

    # computing the adjusted shared-context matrix, with each diagonal
    # [x, x] entry replaced by that word's total shared-context count
    incidence = compute_incidence_graph(n_words, total_contexts,
                                        shared_context_matrix)

    # computing the closest neighbors to a given word
    result = find_closest(incidence, n_words, n_neighbors)
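    # each row of result is expected to begin with a word's own index,
    # followed by the indices of its n_neighbors nearest words; the loop
    # below unpacks it accordingly into word_idx and neighbors_idx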

    del context_array
    del shared_context_matrix

    # mapping neighbor indices back to words via wordlist
    words_to_neighbors = dict()

    for i in range(len(wordlist)):
        line = result[i].astype(np.int64)
        word_idx, neighbors_idx = line[0], line[1:]
        word = wordlist[word_idx]
        neighbors = [wordlist[idx] for idx in neighbors_idx]
        words_to_neighbors[word] = neighbors
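    # e.g. with hypothetical data: words_to_neighbors['jump'] ->
    # ['walk', 'run', ...], one list of n_neighbors words per word in wordlist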

    # computing graph of neighbors
    compute_graph(words_to_neighbors)

    return words_to_neighbors, words_to_contexts, contexts_to_words