def build_ta_index(self):
    clock = Timer()
    ta_index = ""
    index = 0
    # build one "index_title_author" row per book in the data stream
    for book in iter_data_to_stream(self.fname):
        if index % 50 == 0:
            print "Processing bookindex #" + str(index)
            clock.print_lap()
        title = book[0]
        author = book[1]
        ta_row = str(index) + "_" + str(title) + "_" + str(author) + "\n"
        index += 1
        ta_index += ta_row
    print "\n Done!\n"
    clock.print_lap()
    clock = None
    print "\n Writing to file " + str(self.ta_indexfile) + " ...\n"
    with open(self.ta_indexfile, "w") as text_file:
        text_file.write(ta_index)
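# Timer and iter_data_to_stream() are defined elsewhere and not shown in
# this snippet. A minimal sketch of what Timer might look like, assuming
# it only tracks wall-clock time between calls (hypothetical):
import time

class Timer(object):
    def __init__(self):
        self.start = time.time()
        self.last = self.start

    def print_lap(self):
        # print seconds since the previous lap and since construction
        now = time.time()
        print " lap: %.2f s (total: %.2f s)" % (now - self.last, now - self.start)
        self.last = now

# iter_data_to_stream(fname) is likewise assumed to yield one
# (title, author, ...) record per book; its actual implementation
# depends on the data format and is not shown here.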
# testing
# print (tokenized_corpus.read_ta_index(12))

import gensim  # assumed to be imported earlier in the full script

print "\n Starting document stream."
doc_stream = tokenized_corpus.get_token_stream()
print " doc_stream type: " + str(type(doc_stream))
print "\n"

clock = Timer()
print "\n Creating dictionary."
id2word_pgfin = gensim.corpora.Dictionary(doc_stream)
print (id2word_pgfin)
clock.print_lap()

# filter extremes: discard tokens that appear in fewer than 10 documents
# or in more than 70% of all documents
# (with the 20-document test set this left roughly 10% of the vocabulary)
id2word_pgfin.filter_extremes(no_below=10, no_above=0.7)
print (id2word_pgfin)

print "\n Saving dictionary to disk.\n"
# id2word_pgfin.save('./data/pgfintestdata20.dictionary')
id2word_pgfin.save('./data/pgfin.dictionary')
clock.print_lap()

# create a stream of bag-of-words vectors
pgfin_bow_corpus = CorpusPGFinBOW(datafile, id2word_pgfin)
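# CorpusPGFinBOW is not defined in this snippet. A minimal sketch in the
# usual gensim streamed-corpus style (hypothetical; TokenizedCorpus stands
# in for whatever class produced tokenized_corpus above, and its
# get_token_stream() is assumed to yield one token list per document):
class CorpusPGFinBOW(object):
    def __init__(self, datafile, dictionary):
        self.tokenized_corpus = TokenizedCorpus(datafile)
        self.dictionary = dictionary

    def __iter__(self):
        # stream one sparse bag-of-words vector per document, so the
        # whole corpus never has to fit in memory at once
        for tokens in self.tokenized_corpus.get_token_stream():
            yield self.dictionary.doc2bow(tokens)

# The stream would typically be serialized to disk next, e.g. with
# gensim's Matrix Market format (the output path is hypothetical):
# gensim.corpora.MmCorpus.serialize('./data/pgfin_bow.mm', pgfin_bow_corpus)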