Example #1
0
 def test_ldac_conversion(self):
     """Round-trip the document-term matrix through the LDA-C text format."""
     matrix = self.dtm
     n_docs, _n_terms = matrix.shape
     lines = list(utils.dtm2ldac(self.dtm))
     # The first field of each LDA-C line is the number of distinct
     # terms appearing in that document.
     unique_counts = np.sum(matrix > 0, axis=1)
     for expected, line in zip(unique_counts, lines):
         self.assertEqual(expected, int(line.split(' ')[0]))
     self.assertEqual(len(lines), n_docs)
     # Parse the doclines back and check we recover the original matrix.
     stream = io.StringIO('\n'.join(lines))
     roundtripped = utils.ldac2dtm(stream)
     self.assertTrue(np.all(matrix == roundtripped))
Example #2
0
 def test_ldac_conversion(self):
     """Verify that ``dtm2ldac`` and ``ldac2dtm`` round-trip the matrix.

     Converts ``self.dtm`` to LDA-C doclines, checks each line's leading
     unique-term-count field and the total number of lines, then parses
     the lines back and asserts the reconstructed matrix equals the
     original.
     """
     dtm = self.dtm
     N, V = dtm.shape
     doclines = list(utils.dtm2ldac(self.dtm))
     # Distinct terms per document; the first space-separated field of
     # each LDA-C line must equal this count.
     nd_unique = np.sum(dtm > 0, axis=1)
     for n, docline in zip(nd_unique, doclines):
         self.assertEqual(n, int(docline.split(' ')[0]))
     self.assertEqual(len(doclines), N)
     # Round-trip: feed the doclines back through the parser.
     f = io.StringIO('\n'.join(doclines))
     dtm_new = utils.ldac2dtm(f)
     self.assertTrue(np.all(dtm == dtm_new))
print("Writing tokens into output file")
# One token per line, in the iteration order of tokens_set.
with open('output/data.tokens', 'w') as token_file:
    token_file.writelines(str(token) + '\n' for token in tokens_set)

print("Generating Document-Term Matrix (DTM)")
# Count each document's words ONCE up front. The original code called
# document.count(token) inside the token loop, rescanning every document
# for every vocabulary token — O(V * sum(len(doc))). A per-document
# Counter gives the same counts (missing keys yield 0, exactly like
# list.count) with a single pass over each document.
from collections import Counter

dtm = np.empty((len(documents_words), len(tokens_set)), dtype=np.intc)
tokens_count = len(tokens_set)

doc_counters = [Counter(document) for document in documents_words]

for token_index, token in enumerate(tokens_set):
    print(str(token_index), " of ", str(tokens_count), end='\r')
    for doc_index, counts in enumerate(doc_counters):
        dtm[doc_index, token_index] = counts[token]

print("Generating LDAC data")
doclines = list(lda_utils.dtm2ldac(dtm))

print("Writing LDAC file")
# Each docline becomes one newline-terminated line in the output file.
with open('output/data.ldac', 'w') as ldac_file:
    ldac_file.writelines('%s\n' % docline for docline in doclines)

def load_govuk_data():
    """Load the GOV.UK document-term matrix from the generated LDA-C file.

    Returns whatever ``lda.utils.ldac2dtm`` produces for
    ``output/data.ldac``; ``offset=0`` reads zero-based vocabulary
    indices as written by ``dtm2ldac``.
    """
    ldac_fn = os.path.join('output', 'data.ldac')
    # Context manager fixes the original resource leak: the bare
    # open() handle was never closed.
    # NOTE(review): earlier in this file the module is referenced as
    # ``lda_utils`` but here as ``lda.utils`` — confirm which import
    # actually exists at module level.
    with open(ldac_fn) as f:
        return lda.utils.ldac2dtm(f, offset=0)


def load_govuk_tokens():
    tokens_fn = os.path.join('output', 'data.tokens')
    with open(tokens_fn) as f:
        tokens = tuple(f.read().split())