Example #1
0
    def prepare_document(self, num, language, authors):
        """
        First step in creating a protocol buffer for a document: fill in
        document-specific information and build a term-frequency
        distribution over the document's tokens.

        Args:
            num: Document sequence number (unused here; kept for
                interface compatibility with callers).
            language: Language code stored on the protocol buffer.
            authors: Mapping from author name to author id.

        Returns:
            (Document, FreqDist) pair: the partially filled protocol
            buffer and the token frequency distribution.
        """
        d = Document()

        # Only set the author field when one exists; otherwise the proto
        # field stays unset.
        if self.author:
            d.author = authors[self.author]
        d.language = language

        tf_token = nltk.FreqDist()
        for sentence in self.sentences():
            for token in sentence:
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works in both old (dict-based) and new (Counter-based)
                # FreqDist implementations.
                tf_token[token] += 1

        return d, tf_token
Example #2
0
    def prepare_document(self, num, language, authors):
        """
        First step in creating a protocol buffer for a document: fill in
        document-specific information and build a term-frequency
        distribution over the document's tokens.

        Args:
            num: Document sequence number (unused here; kept for
                interface compatibility with callers).
            language: Language code stored on the protocol buffer.
            authors: Mapping from author name to author id.

        Returns:
            (Document, FreqDist) pair: the partially filled protocol
            buffer and the token frequency distribution.
        """
        d = Document()

        # Only set the author field when one exists; otherwise the proto
        # field stays unset.
        if self.author:
            d.author = authors[self.author]
        d.language = language

        tf_token = nltk.FreqDist()
        for sentence in self.sentences():
            for token in sentence:
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works in both old (dict-based) and new (Counter-based)
                # FreqDist implementations.
                tf_token[token] += 1

        return d, tf_token
Example #3
0
    def proto(self, num, language, authors, token_vocab, token_df, lemma_vocab,
              pos, synsets, stemmer):
        """
        Build the complete protocol buffer for this document, including
        per-word token ids, lemma ids, and tf-idf scores.

        Args:
            num: Fallback document id when the document has no own id.
            language: Language code; must match this document's language.
            authors: Author mapping (unused here; kept for interface
                compatibility).
            token_vocab: Mapping from token string to token id.
            token_df: Document-frequency lookup providing
                ``compute_tfidf(token, relative_freq)``.
            lemma_vocab: Mapping from stemmed token to lemma id.
            pos: Part-of-speech data (unused here; kept for interface
                compatibility).
            synsets: Synset data (unused here; kept for interface
                compatibility).
            stemmer: Callable ``stemmer(language, token)`` returning the
                stemmed form.

        Returns:
            The filled Document protocol buffer.
        """
        d = Document()
        assert language == self.lang

        # Use the document's own ID if it has one; otherwise fall back
        # to the caller-supplied sequence number.
        if self._id:
            d.id = self._id
        else:
            d.id = num
        # Title is the last three path components of the source file.
        d.title = "/".join(self._filename.split("/")[-3:])

        d.language = language

        # First pass: collect term frequencies for the tf-idf scores.
        tf_token = nltk.FreqDist()
        for sentence in self.sentences():
            for token in sentence:
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works in both old and new FreqDist implementations.
                tf_token[token] += 1

        # Second pass: emit one proto sentence per source sentence.
        # (Previously a single sentence was created before the loop,
        # which flattened the entire document into one sentence.)
        for sentence in self.sentences():
            s = d.sentences.add()
            for token in sentence:
                w = s.words.add()
                w.token = token_vocab[token]
                w.lemma = lemma_vocab[stemmer(language, token)]
                w.tfidf = token_df.compute_tfidf(token, tf_token.freq(token))
        return d