Esempio n. 1
0
    def testDocFreqAndToken2IdForSeveralDocsWithOneWord(self):
        """Check ``dfs`` and ``token2id`` for corpora that repeat a single word.

        Corpora of two, three and four one-word documents (always ``'human'``)
        are hashed with ``zlib.adler32``; ``dfs`` must report one hit per
        document and ``token2id`` must contain only that one token.
        """
        human_id = 31002  # id this test expects for 'human'
        # two docs, three docs, four docs
        for n_docs in (2, 3, 4):
            corpus = [['human'] for _ in range(n_docs)]
            d = HashDictionary(corpus, myhash=zlib.adler32)
            self.assertEqual(d.dfs, {human_id: n_docs})
            # only one token (human) should exist
            expected = {'human': human_id}
            self.assertEqual(d.token2id['human'], expected['human'])
            self.assertEqual(d.token2id.keys(), expected.keys())
Esempio n. 2
0
    def test_doc_freq_and_token2id_for_several_docs_with_one_word(self):
        """Verify document frequencies when every document is the same single token.

        Each case pairs a corpus with the ``dfs`` mapping expected for it; in
        all cases ``token2id`` must hold ``'human'`` and nothing else.
        """
        token2id_expected = {'human': 31002}
        cases = [
            # two docs
            ([['human'], ['human']], {31002: 2}),
            # three docs
            ([['human'], ['human'], ['human']], {31002: 3}),
            # four docs
            ([['human'], ['human'], ['human'], ['human']], {31002: 4}),
        ]
        for texts, dfs_expected in cases:
            d = HashDictionary(texts, myhash=zlib.adler32)
            self.assertEqual(d.dfs, dfs_expected)
            # only one token (human) should exist
            self.assertEqual(d.token2id['human'], token2id_expected['human'])
            self.assertEqual(d.token2id.keys(), token2id_expected.keys())
    def testRange(self):
        """Check the ids, ``dfs`` and mappings produced for ``id_range`` 1 and 2."""
        vocab = (
            'minors', 'graph', 'system', 'trees', 'eps', 'computer',
            'survey', 'user', 'human', 'time', 'interface', 'response',
        )

        # all words map to the same id
        d = HashDictionary(self.texts, id_range=1, debug=True)
        self.assertEqual(d.dfs, {0: 9})
        self.assertEqual(d.id2token, {0: set(vocab)})
        self.assertEqual(d.token2id, dict.fromkeys(vocab, 0))

        # 2 ids: 0/1 for even/odd number of bytes in the word
        even_words = {
            'minors', 'system', 'computer', 'survey', 'user', 'time',
            'response',
        }
        odd_words = {'interface', 'graph', 'trees', 'eps', 'human'}
        token2id = dict.fromkeys(even_words, 0)
        token2id.update(dict.fromkeys(odd_words, 1))

        d = HashDictionary(self.texts, id_range=2, myhash=len)
        self.assertEqual(d.dfs, {0: 7, 1: 7})
        self.assertEqual(d.id2token, {0: even_words, 1: odd_words})
        self.assertEqual(d.token2id, token2id)
Esempio n. 4
0
    def testDebugMode(self):
        """``id2token`` is filled with ``debug=True`` and stays empty with ``debug=False``."""
        # two words
        with_debug = HashDictionary(
            [['human', 'cat']], debug=True, myhash=zlib.adler32)
        self.assertEqual(with_debug.id2token, {9273: {'cat'}, 31002: {'human'}})

        # now the same thing, with debug off
        without_debug = HashDictionary(
            [['human', 'cat']], debug=False, myhash=zlib.adler32)
        self.assertEqual(without_debug.id2token, {})
Esempio n. 5
0
    def testDocFreqForOneDocWithSeveralWord(self):
        """Each distinct token of a single document must get a document frequency of 1."""
        cases = (
            # two words
            (['human', 'cat'], {9273: 1, 31002: 1}),
            # three words
            (['human', 'cat', 'minors'], {9273: 1, 15001: 1, 31002: 1}),
        )
        for document, expected in cases:
            d = HashDictionary([document], myhash=zlib.adler32)
            self.assertEqual(d.dfs, expected)
Esempio n. 6
0
    def test_doc_freq_for_one_doc_with_several_word(self):
        """A one-document corpus yields ``dfs == 1`` for every hashed token id."""
        cat_id, minors_id, human_id = 9273, 15001, 31002

        # two words
        two_words = HashDictionary([['human', 'cat']], myhash=zlib.adler32)
        self.assertEqual(two_words.dfs, {cat_id: 1, human_id: 1})

        # three words
        three_words = HashDictionary(
            [['human', 'cat', 'minors']], myhash=zlib.adler32)
        self.assertEqual(
            three_words.dfs, {cat_id: 1, minors_id: 1, human_id: 1})
Esempio n. 7
0
    def testBuild(self):
        """Build a dictionary from ``self.texts`` and check ``dfs`` and token ids."""
        d = HashDictionary(self.texts, myhash=zlib.adler32)

        # expected document frequency per hashed id
        dfs_expected = {
            5232: 2, 5798: 3, 10608: 2, 12466: 2,
            12736: 3, 15001: 2, 18451: 3, 23844: 3,
            28591: 2, 29104: 2, 31002: 2, 31049: 2,
        }
        self.assertEqual(d.dfs, dfs_expected)

        # expected id per token, compared entry by entry
        token2id_expected = {
            'minors': 15001,
            'graph': 18451,
            'system': 5798,
            'trees': 23844,
            'eps': 31049,
            'computer': 10608,
            'survey': 28591,
            'user': 12736,
            'human': 31002,
            'time': 29104,
            'interface': 12466,
            'response': 5232,
        }
        for token, token_id in token2id_expected.items():
            self.assertEqual(d.token2id[token], token_id)
Esempio n. 8
0
 def test_saveAsText(self):
     """`HashDictionary` can be written out as a text file."""
     target = get_tmpfile('dict_test.txt')
     # use some utf8 strings, to test encoding serialization
     documents = [
         'žloťoučký koníček'.split(),
         'Малйж обльйквюэ ат эжт'.split(),
     ]
     HashDictionary(documents).save_as_text(target)
     # the file must exist once the save call has returned
     self.assertTrue(os.path.exists(target))
Esempio n. 9
0
 def test_saveAsTextBz2(self):
     """`HashDictionary` keeps its length across a save/load round trip via a ``.bz2`` path."""
     target = get_tmpfile('dict_test.txt.bz2')
     # use some utf8 strings, to test encoding serialization
     documents = [
         'žloťoučký koníček'.split(),
         'Малйж обльйквюэ ат эжт'.split(),
     ]
     original = HashDictionary(documents)
     original.save(target)
     self.assertTrue(os.path.exists(target))

     # reload from disk and compare sizes
     restored = original.load(target)
     self.assertEqual(len(original), len(restored))
Esempio n. 10
0
    def testFilter(self):
        """Check the ``dfs`` left behind by ``filter_extremes`` for three settings."""

        def dfs_after_filter(**filter_kwargs):
            # Build a fresh dictionary each time so the filters do not compound.
            d = HashDictionary(self.texts, myhash=zlib.adler32)
            d.filter_extremes(**filter_kwargs)
            return d.dfs

        # default arguments: this test expects nothing to remain
        self.assertEqual(dfs_after_filter(), {})

        self.assertEqual(
            dfs_after_filter(no_below=0, no_above=0.3),
            {
                29104: 2,
                31049: 2,
                28591: 2,
                5232: 2,
                10608: 2,
                12466: 2,
                15001: 2,
                31002: 2,
            },
        )

        self.assertEqual(
            dfs_after_filter(no_below=3, no_above=1.0, keep_n=4),
            {5798: 3, 12736: 3, 18451: 3, 23844: 3},
        )
 def setUp(self):
     """Prepare the texts, hash dictionary, bag-of-words corpus and segmented topics."""
     self.texts = [
         ['human', 'interface', 'computer'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
     ]
     self.dictionary = HashDictionary(self.texts)
     # Token -> id mapping the hard-coded ids below rely on:
     # {'computer': 10608,
     #  'eps': 31049,
     #  'graph': 18451,
     #  'human': 31002,
     #  'interface': 12466,
     #  'response': 5232,
     #  'system': 5798,
     #  'time': 29104,
     #  'trees': 23844,
     #  'user': 12736}
     self.corpus = list(map(self.dictionary.doc2bow, self.texts))
     # Suppose the segmented topics from s_one_pre are:
     self.segmented_topics = [
         [(5798, 18451), (10608, 18451), (10608, 5798)],
         [(10608, 18451), (12736, 18451), (12736, 10608)],
     ]
Esempio n. 12
0
 def testDocFreqOneDoc(self):
     """All three tokens of a lone document must have a document frequency of 1."""
     lone_document = ['human', 'interface', 'computer']
     d = HashDictionary([lone_document], myhash=zlib.adler32)
     self.assertEqual(d.dfs, dict.fromkeys((10608, 12466, 31002), 1))
Esempio n. 13
0


from gensim.topic_coherence import probability_estimation
from gensim.corpora.hashdictionary import HashDictionary
from gensim.models import word2vec

# Toy corpus of six tokenized documents.
texts = [
    ['human', 'interface', 'computer'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
]
dictionary = HashDictionary(texts)
w2id = dictionary.token2id

# Each topic is a list of word pairs; translate every word to its hashed id.
segmented_topics = [
    [(w2id[first], w2id[second]) for first, second in topic]
    for topic in (
        (('system', 'graph'), ('computer', 'graph'), ('computer', 'system')),
        (('computer', 'graph'), ('user', 'graph'), ('user', 'computer')),
    )
]
# create corpus
corpus = list(map(dictionary.doc2bow, texts))
 def setup_dictionary(self):
     """Create ``self.dictionary`` as a `HashDictionary` built from ``self.texts``."""
     hashed = HashDictionary(self.texts)
     self.dictionary = hashed