Example #1
    def line2doc(self, line):
        words = self.line2words(line)

        if self.use_wordids:
            # get all distinct terms in this document, ignore unknown words
            uniq_words = set(words).intersection(iterkeys(self.word2id))

            # the following creates a unique list of words *in the same order*
            # as they were in the input. when iterating over the documents,
            # the (word, count) pairs will appear in the same order as they
            # were in the input (bar duplicates), which looks better.
            # if this was not needed, we might as well have used use_words = set(words)
            use_words, marker = [], set()
            for word in words:
                if (word in uniq_words) and (word not in marker):
                    use_words.append(word)
                    marker.add(word)
            # construct a list of (wordIndex, wordFrequency) 2-tuples
            doc = list(zip(map(self.word2id.get, use_words), map(words.count, use_words)))
        else:
            uniq_words = set(words)
            # construct a list of (word, wordFrequency) 2-tuples
            doc = list(zip(uniq_words, map(words.count, uniq_words)))

        # return the document, then forget it and move on to the next one
        # note that this way, only one doc is stored in memory at a time, not the whole corpus
        return doc
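
A minimal standalone sketch of the same (word, count) pairing logic, for running it outside the class: here `word2id` is passed in as a plain dict, `collections.Counter` stands in for the repeated `list.count` calls, and `line.lower().split()` is a hypothetical stand-in for `self.line2words(line)`.

from collections import Counter

def line2doc_sketch(line, word2id):
    # illustration only: same idea as the method above, without the class attributes
    words = line.lower().split()          # stand-in for self.line2words(line)
    counts = Counter(words)               # word -> frequency in this line
    seen, doc = set(), []
    for word in words:                    # keep first-appearance order, skip unknown words
        if word in word2id and word not in seen:
            seen.add(word)
            doc.append((word2id[word], counts[word]))
    return doc

print(line2doc_sketch("the cat saw the dog", {"cat": 0, "dog": 1, "the": 2}))
# -> [(2, 2), (0, 1), (1, 1)]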
Example #2
    def line2doc(self, line):
        words = self.line2words(line)

        if self.use_wordids:
            # get all distinct terms in this document, ignore unknown words
            uniq_words = set(words).intersection(iterkeys(self.word2id))

            # the following creates a unique list of words *in the same order*
            # as they were in the input. when iterating over the documents,
            # the (word, count) pairs will appear in the same order as they
            # were in the input (bar duplicates), which looks better.
            # if this was not needed, we might as well have used use_words = set(words)
            use_words, marker = [], set()
            for word in words:
                if (word in uniq_words) and (word not in marker):
                    use_words.append(word)
                    marker.add(word)
            # construct a list of (wordIndex, wordFrequency) 2-tuples
            # using list.count is suboptimal, but the speed of this whole function is irrelevant
            doc = list(zip(map(self.word2id.get, use_words), map(words.count, use_words)))
        else:
            uniq_words = set(words)
            # construct a list of (word, wordFrequency) 2-tuples
            # using list.count is suboptimal, but that's irrelevant at this point
            doc = list(zip(uniq_words, map(words.count, uniq_words)))

        # return the document, then forget it and move on to the next one
        # note that this way, only one doc is stored in memory at a time, not the whole corpus
        return doc
Example #3
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` is set, then also update dictionary in the process: create
        ids for new words. At the same time, update document frequencies -- for
        each word appearing in this document, increase its document frequency (`self.dfs`)
        by one.

        If `allow_update` is **not** set, this function is `const`, aka read-only.
        """
        result = {}
        missing = {}
        if isinstance(document, string_types):
            raise TypeError(
                "doc2bow expects an array of utf8 tokens on input, not a string"
            )
        document = sorted(utils.to_utf8(token) for token in document)
        # construct (word, frequency) mapping. in python3 this is done simply
        # using Counter(), but here i use itertools.groupby() for the job
        for word_norm, group in itertools.groupby(document):
            frequency = len(list(group))  # how many times does this word appear in the input document
            tokenid = self.token2id.get(word_norm, None)
            if tokenid is None:
                # first time we see this token (~normalized form)
                if return_missing:
                    missing[word_norm] = frequency
                if not allow_update:  # if we aren't allowed to create new tokens, continue with the next unique token
                    continue
                tokenid = len(self.token2id)
                # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!
                self.token2id[word_norm] = tokenid

            # update how many times a token appeared in the document
            result[tokenid] = frequency

        if allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            # increase document count for each unique token that appeared in the document
            for tokenid in iterkeys(result):
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
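
The method above has the shape of gensim's `corpora.Dictionary.doc2bow`. Assuming that class, a small usage sketch of the read-only path with `return_missing=True` (the exact token ids depend on the dictionary contents):

from gensim.corpora import Dictionary

# build a dictionary from two tokenized documents
dictionary = Dictionary([["human", "computer", "interface"], ["graph", "trees"]])

# read-only call: unknown tokens are dropped from the bag-of-words,
# but return_missing=True reports them along with their in-document frequency
bow, missing = dictionary.doc2bow(["human", "computer", "computer", "minors"], return_missing=True)
print(bow)      # e.g. [(0, 2), (1, 1)] -- (token_id, count), ids depend on assignment order
print(missing)  # out-of-vocabulary tokens and their counts, here 'minors' -> 1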
Example #4
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` is set, then also update dictionary in the process: create
        ids for new words. At the same time, update document frequencies -- for
        each word appearing in this document, increase its document frequency (`self.dfs`)
        by one.

        If `allow_update` is **not** set, this function is `const`, aka read-only.
        """
        result = {}
        missing = {}
        if isinstance(document, string_types):
            raise TypeError("doc2bow expects an array of utf8 tokens on input, not a string")
        document = sorted(utils.to_utf8(token) for token in document)
        # construct (word, frequency) mapping. in python3 this is done simply
        # using Counter(), but here i use itertools.groupby() for the job
        for word_norm, group in itertools.groupby(document):
            frequency = len(list(group)) # how many times does this word appear in the input document
            tokenid = self.token2id.get(word_norm, None)
            if tokenid is None:
                # first time we see this token (~normalized form)
                if return_missing:
                    missing[word_norm] = frequency
                if not allow_update: # if we aren't allowed to create new tokens, continue with the next unique token
                    continue
                tokenid = len(self.token2id)
                self.token2id[word_norm] = tokenid # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!

            # update how many times a token appeared in the document
            result[tokenid] = frequency

        if allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            # increase document count for each unique token that appeared in the document
            for tokenid in iterkeys(result):
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
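
The same method with `allow_update=True` grows the dictionary incrementally; a sketch, again assuming gensim's `corpora.Dictionary`:

from gensim.corpora import Dictionary

dictionary = Dictionary()  # start empty
for doc in (["cat", "dog", "cat"], ["dog", "mouse"]):
    # each call assigns ids to unseen tokens and updates num_docs, num_pos, num_nnz and dfs
    print(dictionary.doc2bow(doc, allow_update=True))

print(dictionary.token2id)  # accumulated token -> id mapping
print(dictionary.dfs)       # token id -> number of documents it appeared in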
Example #5
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` or `self.allow_update` is set, then also update dictionary
        in the process: update overall corpus statistics and document frequencies.
        For each id appearing in this document, increase its document frequency
        (`self.dfs`) by one.

        """
        result = {}
        missing = {}
        document = sorted(document)  # convert the input to plain list (needed below)
        for word_norm, group in itertools.groupby(document):
            frequency = len(list(group))  # how many times does this word appear in the input document
            tokenid = self.restricted_hash(word_norm)
            result[tokenid] = result.get(tokenid, 0) + frequency
            if self.debug:
                # increment document count for each unique token that appeared in the document
                self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1

        if allow_update or self.allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            if self.debug:
                # increment document count for each unique tokenid that appeared in the document
                # done here, because several words may map to the same tokenid
                for tokenid in iterkeys(result):
                    self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
Example #6
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` or `self.allow_update` is set, then also update dictionary
        in the process: update overall corpus statistics and document frequencies.
        For each id appearing in this document, increase its document frequency
        (`self.dfs`) by one.

        """
        result = {}
        missing = {}
        document = sorted(document) # convert the input to plain list (needed below)
        for word_norm, group in itertools.groupby(document):
            frequency = len(list(group)) # how many times does this word appear in the input document
            tokenid = self.restricted_hash(word_norm)
            result[tokenid] = result.get(tokenid, 0) + frequency
            if self.debug:
                # increment document count for each unique token that appeared in the document
                self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1

        if allow_update or self.allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            if self.debug:
                # increment document count for each unique tokenid that appeared in the document
                # done here, because several words may map to the same tokenid
                for tokenid in iterkeys(result):
                    self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
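
This variant hashes tokens straight to ids via `self.restricted_hash`, so no token-to-id table has to be stored; it matches gensim's `corpora.HashDictionary`. Assuming that class, a usage sketch (the printed ids are hash buckets and distinct words may collide):

from gensim.corpora import HashDictionary

hdict = HashDictionary(id_range=1000)  # ids live in the range [0, 1000)
print(hdict.doc2bow(["cat", "dog", "cat"], allow_update=True))
# e.g. [(17, 2), (831, 1)] -- (bucket_id, count), actual ids depend on the hash function
print(hdict.dfs_debug)  # per-token document frequencies, kept while debug=True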
Example #7
0
def print_debug(id2token, u, s, topics, num_words=10, num_neg=None):
    if num_neg is None:
        # by default, print half as many salient negative words as positive
        num_neg = num_words // 2

    logger.info('computing word-topic salience for %i topics' % len(topics))
    topics, result = set(topics), {}
    # TODO speed up by block computation
    for uvecno, uvec in enumerate(u):
        uvec = numpy.abs(numpy.asarray(uvec).flatten())
        udiff = uvec / numpy.sqrt(numpy.sum(numpy.dot(uvec, uvec)))
        for topic in topics:
            result.setdefault(topic, []).append((udiff[topic], uvecno))

    logger.debug("printing %i+%i salient words" % (num_words, num_neg))
    for topic in sorted(iterkeys(result)):
        weights = sorted(result[topic], key=lambda x: -abs(x[0]))
        _, most = weights[0]
        if u[most, topic] < 0.0: # the most significant word has a negative sign => flip sign of u[most]
            normalize = -1.0
        else:
            normalize = 1.0

        # order features according to salience; ignore near-zero entries in u
        pos, neg = [], []
        for weight, uvecno in weights:
            if normalize * u[uvecno, topic] > 0.0001:
                pos.append('%s(%.3f)' % (id2token[uvecno], u[uvecno, topic]))
                if len(pos) >= num_words:
                    break

        for weight, uvecno in weights:
            if normalize * u[uvecno, topic] < -0.0001:
                neg.append('%s(%.3f)' % (id2token[uvecno], u[uvecno, topic]))
                if len(neg) >= num_neg:
                    break

        logger.info('topic #%s(%.3f): %s, ..., %s' % (topic, s[topic], ', '.join(pos), ', '.join(neg)))
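
`print_debug` takes the term-topic factor `u` (one row per term, one column per topic), the singular values `s`, and an `id2token` mapping, and logs the most salient positive and negative terms per topic. A synthetic call, assuming the function above is importable from its module (which also provides `logger` and `iterkeys`):

import logging
import numpy

logging.basicConfig(level=logging.INFO)

# fake SVD factors: 6 terms x 3 topics, plus singular values
u = numpy.random.rand(6, 3) - 0.5
s = numpy.array([3.0, 2.0, 1.0])
id2token = {i: "term%i" % i for i in range(6)}

# log the 3 most salient positive and 2 most salient negative terms for topics 0 and 1
print_debug(id2token, u, s, topics=[0, 1], num_words=3, num_neg=2)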
Example #8
 def __str__(self):
     some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
     return "Dictionary(%i unique tokens: %s%s)" % (
         len(self), some_keys, '...' if len(self) > 5 else '')
Example #9
 def __str__(self):
     some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
     return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')