def test_ldac_conversion(self):
    """Round-trip check: dtm2ldac then ldac2dtm must reproduce the DTM."""
    dtm = self.dtm
    n_docs, n_terms = dtm.shape
    doclines = list(utils.dtm2ldac(self.dtm))
    # The first field of each LDA-C line is the number of unique terms
    # in that document; compare it against a direct count on the matrix.
    unique_terms_per_doc = np.sum(dtm > 0, axis=1)
    for expected_unique, docline in zip(unique_terms_per_doc, doclines):
        self.assertEqual(expected_unique, int(docline.split(' ')[0]))
    self.assertEqual(len(doclines), n_docs)
    # Feed the serialized form back through the parser and compare matrices.
    buffer = io.StringIO('\n'.join(doclines))
    roundtripped = utils.ldac2dtm(buffer)
    self.assertTrue(np.all(dtm == roundtripped))
from collections import Counter

print("Writing tokens into output file")
with open('output/data.tokens', 'w') as f:
    for token in tokens_set:
        print(token, file=f)

print("Generating Document-Term Matrix (DTM)")
# zeros (not empty) so vocabulary terms absent from a document read as 0
# without needing an explicit write.
dtm = np.zeros((len(documents_words), len(tokens_set)), dtype=np.intc)
tokens_count = len(tokens_set)
docs_count = len(documents_words)
# Column index for each vocabulary token, in the same iteration order used
# to write output/data.tokens above.
column_of = {token: i for i, token in enumerate(tokens_set)}
# One Counter pass per document instead of document.count(token) per
# (token, document) pair — the original rescanned every document once per
# vocabulary entry (accidentally quadratic); counts produced are identical.
for doc_index, document in enumerate(documents_words):
    print(str(doc_index), " of ", str(docs_count), end='\r')
    for token, count in Counter(document).items():
        col = column_of.get(token)
        if col is not None:  # tokens outside the vocabulary are ignored
            dtm[doc_index, col] = count

print("Generating LDAC data")
doclines = list(lda_utils.dtm2ldac(dtm))

print("Writing LDAC file")
with open('output/data.ldac', 'w') as f:
    for line in doclines:
        print(line, file=f)


def load_govuk_data():
    """Parse output/data.ldac back into a document-term matrix.

    Returns whatever ``lda.utils.ldac2dtm`` produces for the file.
    """
    ldac_fn = os.path.join('output', 'data.ldac')
    # Context manager closes the handle; the original passed a bare open()
    # and leaked the file object.
    with open(ldac_fn) as f:
        return lda.utils.ldac2dtm(f, offset=0)


def load_govuk_tokens():
    """Read the whitespace-separated vocabulary from output/data.tokens."""
    tokens_fn = os.path.join('output', 'data.tokens')
    with open(tokens_fn) as f:
        tokens = tuple(f.read().split())
    # NOTE(review): no return statement is visible in this chunk — the
    # function body may continue past the edge of this view; confirm the
    # original returns `tokens` before relying on this loader.