def proto(self, num, language, authors, token_vocab, token_df,
          lemma_vocab, pos, synsets, stemmer):
    """
    Build the complete protocol buffer for this document.

    Fills document identity (id, title, language), then makes two passes
    over the sentences: the first accumulates raw token frequencies, the
    second emits one word entry per token with its vocab index, stemmed
    lemma index, and tf-idf weight.

    Args:
        num: Fallback document id, used when the document has no own id.
        language: Language code; must match this document's ``self.lang``.
        authors: Unused here (kept for interface parity with callers).
        token_vocab: Mapping from token string to integer vocab index.
        token_df: Document-frequency store exposing ``compute_tfidf``.
        lemma_vocab: Mapping from stemmed token to integer lemma index.
        pos: Unused here (kept for interface parity with callers).
        synsets: Unused here (kept for interface parity with callers).
        stemmer: Callable ``stemmer(language, token)`` returning the stem.

    Returns:
        The populated ``Document`` protocol buffer.

    Raises:
        ValueError: If ``language`` does not match ``self.lang``.
    """
    d = Document()

    # Explicit check instead of assert: asserts vanish under ``python -O``.
    if language != self.lang:
        raise ValueError("Language mismatch: got %s, expected %s" %
                         (language, self.lang))

    # Prefer the document's own id; fall back to the caller-supplied number.
    if self._id:
        d.id = self._id
    else:
        d.id = num

    # Title is the last three path components of the source filename.
    d.title = "/".join(self._filename.split("/")[-3:])
    d.language = language

    # First pass: raw term frequencies over the whole document.
    tf_token = nltk.FreqDist()
    for ii in self.sentences():
        for jj in ii:
            tf_token.inc(jj)

    # NOTE(review): a single sentence proto receives every word in the
    # document (the add() is outside the sentence loop) — looks deliberate
    # (bag-of-words flattening) but confirm against consumers of d.sentences.
    s = d.sentences.add()
    for ii in self.sentences():
        for jj in ii:
            w = s.words.add()
            w.token = token_vocab[jj]
            w.lemma = lemma_vocab[stemmer(language, jj)]
            # freq() is the relative frequency of jj within this document.
            w.tfidf = token_df.compute_tfidf(jj, tf_token.freq(jj))
    return d
def prepare_document(self, num, language, authors):
    """
    The first step in creating a protocol buffer for a document; filling
    any document specific information.  Also creates a term frequency
    dictionary.
    """
    doc = Document()

    # Only attach an author index when this document actually has one.
    if self.author:
        doc.author = authors[self.author]
    doc.language = language

    # Count every token across all sentences of the document.
    counts = nltk.FreqDist()
    for token in (word for sentence in self.sentences() for word in sentence):
        counts.inc(token)

    return doc, counts