Esempio n. 1
0
 def test_save(self):
     """Save the model to a temporary file, load it back and compare both."""
     with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as tmp:
         path = tmp.name
         self.model.save(path)
         restored = Topics().load(path)
         # Token lists must be equal after the round trip.
         self.assertEqual(self.model.tokens, restored.tokens)
         # The element-wise inequality must contain no stored entries.
         mismatches = self.model.matrix != restored.matrix
         self.assertEqual(mismatches.getnnz(), 0)
Esempio n. 2
0
def bigartm2asdf(args):
    """
    BigARTM "readable" model -> Topics -> Modelforge ASDF.

    The input is text with ";"-separated fields. The first line is a header
    and is skipped. In every following line the first field is the token, the
    second field is ignored and the remaining fields are the per-column
    values of that token. Values spelled exactly "0" are not stored. The
    resulting (tokens x columns) sparse matrix is transposed before being
    handed to ``Topics().construct`` and saved.

    :param args: Namespace with ``input`` (path to the text file, or "-" to \
                 read from stdin) and ``output`` (passed to ``save()``).
    :raises ValueError: if the input has no token rows after the header.
    """
    tokens = []
    data = []
    indices = []
    indptr = [0]
    num_columns = 0
    read_stdin = args.input == "-"
    fin = sys.stdin if read_stdin else open(args.input)
    try:
        # the first line is the header
        fin.readline()
        for line in fin:
            # Iterating a text stream keeps the line terminator; without
            # stripping it a trailing "0\n" would not compare equal to "0"
            # and would be stored as an explicit zero.
            line = line.rstrip("\r\n")
            if not line:
                # Blank lines carry no token and would corrupt the shape.
                continue
            items = line.split(";")
            tokens.append(items[0])
            # As before, the width is taken from the last row that was read.
            num_columns = len(items) - 2
            for i, v in enumerate(items[2:]):
                if v == "0":
                    continue
                data.append(float(v))
                indices.append(i)
            # CSR row pointer: running total of the stored values.
            indptr.append(len(data))
    finally:
        # Never close stdin, only the file opened here.
        if not read_stdin:
            fin.close()
    if not tokens:
        raise ValueError(
            "no token rows found in the BigARTM model: %s" % args.input)
    data = numpy.array(data, dtype=numpy.float32)
    indices = numpy.array(indices, dtype=numpy.int32)
    matrix = csr_matrix((data, indices, indptr),
                        shape=(len(tokens), num_columns)).T
    Topics().construct(tokens, None, matrix).save(args.output)
Esempio n. 3
0
 def test_bigartm2asdf(self):
     """Convert the BigARTM source fixture and check the loaded model sizes."""
     with tempfile.NamedTemporaryFile(prefix="sourced.ml-topics-test-") as tmp:
         here = os.path.dirname(__file__)
         source = os.path.join(here, paths.TOPICS_SRC)
         namespace = argparse.Namespace(input=source, output=tmp.name)
         bigartm2asdf(namespace)
         converted = Topics().load(tmp.name)
         self.assertEqual(len(converted), 320)
         self.assertEqual(len(converted.tokens), 1000)
Esempio n. 4
0
class TopicsTests(unittest.TestCase):
    """Tests for the Topics model and the BigARTM conversion entry point."""

    def setUp(self):
        """Load the Topics model from ``paths.TOPICS`` into ``self.model``."""
        topics = Topics()
        self.model = topics.load(source=paths.TOPICS)

    def test_dump(self):
        """The textual dump must match the expected summary exactly."""
        expected = """320 topics, 1000 tokens
First 10 tokens: ['ulcancel', 'domainlin', 'trudi', 'fncreateinstancedbaselin', 'wbnz', 'lmultiplicand', 'otronumero', 'qxln', 'gvgq', 'polaroidish']
Topics: unlabeled
non-zero elements: 6211  (0.019409)"""  # nopep8
        dumped = self.model.dump()
        self.assertEqual(dumped, expected)

    def test_props(self):
        """Check the model length, tokens, topics and the first item."""
        model = self.model
        self.assertEqual(len(model), 320)
        self.assertEqual(len(model.tokens), 1000)
        self.assertIsNone(model.topics)
        first = model[0]
        self.assertEqual(len(first), 8)
        head = first[0]
        self.assertEqual(head[0], "olcustom")
        # NOTE(review): places=6 is coarse for a value around 1e-06 -
        # TODO confirm the intended tolerance.
        self.assertAlmostEqual(head[1], 1.23752e-06, 6)

    def test_label(self):
        """label_topics() rejects bad input and accepts 320 string labels."""
        with self.assertRaises(ValueError):
            self.model.label_topics([1, 2, 3])
        with self.assertRaises(TypeError):
            self.model.label_topics(list(range(320)))
        labels = list(map(str, range(320)))
        self.model.label_topics(labels)
        self.assertEqual(self.model.topics[0], "0")

    def test_save(self):
        """Save the model to a temporary file, load it back and compare."""
        with tempfile.NamedTemporaryFile(
                prefix="sourced.ml-topics-test-") as tmp:
            path = tmp.name
            self.model.save(path)
            restored = Topics().load(path)
            self.assertEqual(self.model.tokens, restored.tokens)
            # The element-wise inequality must contain no stored entries.
            mismatches = self.model.matrix != restored.matrix
            self.assertEqual(mismatches.getnnz(), 0)

    def test_bigartm2asdf_entry(self):
        """Convert the BigARTM source fixture and check the model sizes."""
        with tempfile.NamedTemporaryFile(
                prefix="sourced.ml-topics-test-") as tmp:
            here = os.path.dirname(__file__)
            source = os.path.join(here, paths.TOPICS_SRC)
            namespace = argparse.Namespace(input=source, output=tmp.name)
            bigartm2asdf_entry(namespace)
            converted = Topics().load(tmp.name)
            self.assertEqual(len(converted), 320)
            self.assertEqual(len(converted.tokens), 1000)
Esempio n. 5
0
 def setUp(self):
     """Load the Topics model from ``paths.TOPICS`` into ``self.model``."""
     topics = Topics()
     self.model = topics.load(source=paths.TOPICS)