Example #1
0
    def prepare_document(self, num, language, authors):
        """
        First step in creating a protocol buffer for a document: fill in
        document-specific information and build a term-frequency
        distribution over the document's tokens.

        Args:
            num: Document sequence number (unused here; kept for
                interface compatibility with callers).
            language: Language code stored on the protocol buffer.
            authors: Mapping from author name to author id.

        Returns:
            (Document, FreqDist) pair: the partially filled protocol
            buffer and the token frequency distribution.
        """
        d = Document()

        # Only set the author field when one exists; otherwise the proto
        # field stays unset.
        if self.author:
            d.author = authors[self.author]
        d.language = language

        tf_token = nltk.FreqDist()
        for sentence in self.sentences():
            for token in sentence:
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works in both old (dict-based) and new (Counter-based)
                # FreqDist implementations.
                tf_token[token] += 1

        return d, tf_token
Example #2
0
    def prepare_document(self, num, language, authors):
        """
        First step in creating a protocol buffer for a document: fill in
        document-specific information and build a term-frequency
        distribution over the document's tokens.

        Args:
            num: Document sequence number (unused here; kept for
                interface compatibility with callers).
            language: Language code stored on the protocol buffer.
            authors: Mapping from author name to author id.

        Returns:
            (Document, FreqDist) pair: the partially filled protocol
            buffer and the token frequency distribution.
        """
        d = Document()

        # Only set the author field when one exists; otherwise the proto
        # field stays unset.
        if self.author:
            d.author = authors[self.author]
        d.language = language

        tf_token = nltk.FreqDist()
        for sentence in self.sentences():
            for token in sentence:
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works in both old (dict-based) and new (Counter-based)
                # FreqDist implementations.
                tf_token[token] += 1

        return d, tf_token
Example #3
0
    def proto(self, num, language, authors, token_vocab, token_df, lemma_vocab,
              pos, synsets, stemmer):
        """
        Build the complete protocol buffer for this document, including
        per-word token ids, lemma ids, and tf-idf scores.

        Args:
            num: Fallback document id when the document has no own id.
            language: Language code; must match this document's language.
            authors: Author mapping (unused here; kept for interface
                compatibility).
            token_vocab: Mapping from token string to token id.
            token_df: Document-frequency lookup providing
                ``compute_tfidf(token, relative_freq)``.
            lemma_vocab: Mapping from stemmed token to lemma id.
            pos: Part-of-speech data (unused here; kept for interface
                compatibility).
            synsets: Synset data (unused here; kept for interface
                compatibility).
            stemmer: Callable ``stemmer(language, token)`` returning the
                stemmed form.

        Returns:
            The filled Document protocol buffer.
        """
        d = Document()
        assert language == self.lang

        # Use the document's own ID if it has one; otherwise fall back
        # to the caller-supplied sequence number.
        if self._id:
            d.id = self._id
        else:
            d.id = num
        # Title is the last three path components of the source file.
        d.title = "/".join(self._filename.split("/")[-3:])

        d.language = language

        # First pass: collect term frequencies for the tf-idf scores.
        tf_token = nltk.FreqDist()
        for sentence in self.sentences():
            for token in sentence:
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works in both old and new FreqDist implementations.
                tf_token[token] += 1

        # Second pass: emit one proto sentence per source sentence.
        # (Previously a single sentence was created before the loop,
        # which flattened the entire document into one sentence.)
        for sentence in self.sentences():
            s = d.sentences.add()
            for token in sentence:
                w = s.words.add()
                w.token = token_vocab[token]
                w.lemma = lemma_vocab[stemmer(language, token)]
                w.tfidf = token_df.compute_tfidf(token, tf_token.freq(token))
        return d