Example #1
0
    def test_doc_freq_and_token2id_for_several_docs_with_one_word(self):
        """Document frequency for corpora of 2, 3 and 4 identical one-word
        documents: the adler32 id of 'human' (31002) must have a df equal
        to the number of documents, and token2id must contain only that
        token. The original body was copy-pasted three times; it is now a
        single parameterized loop.
        """
        for num_docs in (2, 3, 4):
            texts = [['human']] * num_docs
            d = HashDictionary(texts, myhash=zlib.adler32)
            # document frequency tracks the number of documents
            self.assertEqual(d.dfs, {31002: num_docs})
            # only one token (human) should exist
            expected = {'human': 31002}
            self.assertEqual(d.token2id['human'], expected['human'])
            self.assertEqual(d.token2id.keys(), expected.keys())
 def test_saveAsText(self):
     """`HashDictionary` can be serialized to a plain-text file."""
     out_path = get_tmpfile('dict_test.txt')
     # utf8 tokens exercise encoding during serialization
     corpus = ['žloťoučký koníček'.split(), 'Малйж обльйквюэ ат эжт'.split()]
     dictionary = HashDictionary(corpus)
     dictionary.save_as_text(out_path)
     self.assertTrue(os.path.exists(out_path))
Example #3
0
    def testDocFreqAndToken2IdForSeveralDocsWithOneWord(self):
        """For corpora of 2, 3 and 4 identical one-word documents, the
        adler32 id of 'human' (31002) must have df == number of documents
        and be the only entry in token2id. The triplicated original body
        is collapsed into one parameterized loop.
        """
        for num_docs in (2, 3, 4):
            texts = [['human']] * num_docs
            d = HashDictionary(texts, myhash=zlib.adler32)
            # document frequency tracks the number of documents
            self.assertEqual(d.dfs, {31002: num_docs})
            # only one token (human) should exist
            expected = {'human': 31002}
            self.assertEqual(d.token2id['human'], expected['human'])
            self.assertEqual(d.token2id.keys(), expected.keys())
Example #4
0
 def test_saveAsText(self):
     """`HashDictionary` built from utf8 tokens can be saved as a text file."""
     target = get_tmpfile('dict_test.txt')
     # non-ASCII words verify that encoding survives serialization
     docs = ['žloťoučký koníček'.split(), 'Малйж обльйквюэ ат эжт'.split()]
     HashDictionary(docs).save_as_text(target)
     self.assertTrue(os.path.exists(target))
    def testRange(self):
        """With id_range=1 every token collapses onto id 0; with a
        length-based hash and id_range=2 tokens split by the parity of
        their byte length."""
        # all words map to the same id
        d = HashDictionary(self.texts, id_range=1, debug=True)
        all_tokens = {
            'minors', 'graph', 'system', 'trees', 'eps', 'computer',
            'survey', 'user', 'human', 'time', 'interface', 'response',
        }
        self.assertEqual(d.dfs, {0: 9})
        self.assertEqual(d.id2token, {0: all_tokens})
        self.assertEqual(d.token2id, {token: 0 for token in all_tokens})

        # 2 ids: 0/1 for even/odd number of bytes in the word
        d = HashDictionary(self.texts, id_range=2, myhash=lambda key: len(key))
        even_length = {
            'minors', 'system', 'computer', 'survey', 'user', 'time',
            'response',
        }
        odd_length = {'interface', 'graph', 'trees', 'eps', 'human'}
        self.assertEqual(d.dfs, {0: 7, 1: 7})
        self.assertEqual(d.id2token, {0: even_length, 1: odd_length})
        expected_token2id = {token: 0 for token in even_length}
        expected_token2id.update({token: 1 for token in odd_length})
        self.assertEqual(d.token2id, expected_token2id)
 def test_saveAsTextBz2(self):
     """`HashDictionary` round-trips through a bz2-compressed pickle."""
     out_path = get_tmpfile('dict_test.txt.bz2')
     # utf8 tokens exercise encoding during serialization
     corpus = ['žloťoučký koníček'.split(), 'Малйж обльйквюэ ат эжт'.split()]
     dictionary = HashDictionary(corpus)
     dictionary.save(out_path)
     self.assertTrue(os.path.exists(out_path))
     loaded = dictionary.load(out_path)
     self.assertEqual(len(dictionary), len(loaded))
Example #7
0
 def test_saveAsTextBz2(self):
     """Saving then loading a compressed pickle preserves the dictionary size."""
     target = get_tmpfile('dict_test.txt.bz2')
     # non-ASCII words verify that encoding survives serialization
     docs = ['žloťoučký koníček'.split(), 'Малйж обльйквюэ ат эжт'.split()]
     original = HashDictionary(docs)
     original.save(target)
     self.assertTrue(os.path.exists(target))
     restored = original.load(target)
     self.assertEqual(len(original), len(restored))
Example #8
0
    def test_doc_freq_for_one_doc_with_several_word(self):
        """Each distinct word in a single document gets a df of 1."""
        # two words
        d = HashDictionary([['human', 'cat']], myhash=zlib.adler32)
        self.assertEqual(d.dfs, {9273: 1, 31002: 1})

        # three words
        d = HashDictionary([['human', 'cat', 'minors']], myhash=zlib.adler32)
        self.assertEqual(d.dfs, {9273: 1, 15001: 1, 31002: 1})
Example #9
0
    def testDocFreqForOneDocWithSeveralWord(self):
        """Every distinct token of a one-document corpus has df == 1."""
        # two words
        two_words = [['human', 'cat']]
        self.assertEqual(
            HashDictionary(two_words, myhash=zlib.adler32).dfs,
            {9273: 1, 31002: 1},
        )

        # three words
        three_words = [['human', 'cat', 'minors']]
        self.assertEqual(
            HashDictionary(three_words, myhash=zlib.adler32).dfs,
            {9273: 1, 15001: 1, 31002: 1},
        )
Example #10
0
    def testDebugMode(self):
        """The reverse id2token mapping is populated only when debug=True."""
        texts = [['human', 'cat']]

        # with debug on, each id maps to the set of its tokens
        d = HashDictionary(texts, debug=True, myhash=zlib.adler32)
        self.assertEqual(d.id2token, {9273: {'cat'}, 31002: {'human'}})

        # now the same thing, with debug off: the mapping stays empty
        d = HashDictionary(texts, debug=False, myhash=zlib.adler32)
        self.assertEqual(d.id2token, {})
Example #11
0
    def testBuild(self):
        """Building from the fixture corpus yields the known adler32 ids
        with their expected document frequencies and token ids."""
        d = HashDictionary(self.texts, myhash=zlib.adler32)

        expected_dfs = {
            5232: 2, 5798: 3, 10608: 2, 12466: 2, 12736: 3, 15001: 2,
            18451: 3, 23844: 3, 28591: 2, 29104: 2, 31002: 2, 31049: 2,
        }
        self.assertEqual(d.dfs, expected_dfs)

        expected_ids = {
            'minors': 15001, 'graph': 18451, 'system': 5798,
            'trees': 23844, 'eps': 31049, 'computer': 10608,
            'survey': 28591, 'user': 12736, 'human': 31002,
            'time': 29104, 'interface': 12466, 'response': 5232,
        }
        for token, token_id in expected_ids.items():
            self.assertEqual(d.token2id[token], token_id)
 def setUp(self):
     """Build the fixture corpus, its HashDictionary, the bag-of-words
     corpus and the segmented topics used by the probability tests."""
     self.texts = [
         ['human', 'interface', 'computer'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
     ]
     self.dictionary = HashDictionary(self.texts)
     # Token ids produced by the default hash:
     # {'computer': 10608, 'eps': 31049, 'graph': 18451, 'human': 31002,
     #  'interface': 12466, 'response': 5232, 'system': 5798,
     #  'time': 29104, 'trees': 23844, 'user': 12736}
     self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
     # Suppose the segmented topics from s_one_pre are:
     self.segmented_topics = [
         [(5798, 18451), (10608, 18451), (10608, 5798)],
         [(10608, 18451), (12736, 18451), (12736, 10608)],
     ]
    def testFilter(self):
        """filter_extremes prunes dfs according to no_below / no_above / keep_n."""
        # default thresholds remove everything from this tiny corpus
        d = HashDictionary(self.texts, myhash=zlib.adler32)
        d.filter_extremes()
        self.assertEqual(d.dfs, {})

        # keep only tokens appearing in under 30% of the documents
        d = HashDictionary(self.texts, myhash=zlib.adler32)
        d.filter_extremes(no_below=0, no_above=0.3)
        rare_only = {
            29104: 2, 31049: 2, 28591: 2, 5232: 2,
            10608: 2, 12466: 2, 15001: 2, 31002: 2,
        }
        self.assertEqual(d.dfs, rare_only)

        # keep at most 4 tokens that appear in at least 3 documents
        d = HashDictionary(self.texts, myhash=zlib.adler32)
        d.filter_extremes(no_below=3, no_above=1.0, keep_n=4)
        self.assertEqual(d.dfs, {5798: 3, 12736: 3, 18451: 3, 23844: 3})
 def setUp(self):
     """Prepare the shared corpus fixture: raw texts, HashDictionary,
     bag-of-words corpus and precomputed segmented topics."""
     self.texts = [
         ['human', 'interface', 'computer'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
     ]
     self.dictionary = HashDictionary(self.texts)
     # Token ids produced by the default hash:
     # {'computer': 10608, 'eps': 31049, 'graph': 18451, 'human': 31002,
     #  'interface': 12466, 'response': 5232, 'system': 5798,
     #  'time': 29104, 'trees': 23844, 'user': 12736}
     self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
     # Suppose the segmented topics from s_one_pre are:
     self.segmented_topics = [
         [(5798, 18451), (10608, 18451), (10608, 5798)],
         [(10608, 18451), (12736, 18451), (12736, 10608)],
     ]
class TestProbabilityEstimation(unittest.TestCase):
    """Tests for the boolean probability estimators in
    gensim.topic_coherence.probability_estimation."""

    def setUp(self):
        """Create the fixture texts, dictionary, corpus and segmented topics."""
        self.texts = [
            ['human', 'interface', 'computer'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
        ]
        self.dictionary = HashDictionary(self.texts)
        # Token ids produced by the default hash:
        # {'computer': 10608, 'eps': 31049, 'graph': 18451, 'human': 31002,
        #  'interface': 12466, 'response': 5232, 'system': 5798,
        #  'time': 29104, 'trees': 23844, 'user': 12736}
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        # Suppose the segmented topics from s_one_pre are:
        self.segmented_topics = [
            [(5798, 18451), (10608, 18451), (10608, 5798)],
            [(10608, 18451), (12736, 18451), (12736, 10608)],
        ]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # Unique topic ids are 5798, 10608, 12736 and 18451
        obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics)
        expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}}
        self.assertTrue(obtained == expected)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Test with window size as 2. window_id is zero indexed.
        obtained, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2)
        expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}}
        self.assertTrue(obtained == expected)
class TestProbabilityEstimation(unittest.TestCase):
    """Exercises p_boolean_document and p_boolean_sliding_window on a
    small fixed corpus with known hash ids."""

    def setUp(self):
        """Build the fixture corpus and precomputed segmented topics."""
        self.texts = [
            ['human', 'interface', 'computer'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
        ]
        self.dictionary = HashDictionary(self.texts)
        # Token ids produced by the default hash:
        # {'computer': 10608, 'eps': 31049, 'graph': 18451, 'human': 31002,
        #  'interface': 12466, 'response': 5232, 'system': 5798,
        #  'time': 29104, 'trees': 23844, 'user': 12736}
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        # Suppose the segmented topics from s_one_pre are:
        self.segmented_topics = [
            [(5798, 18451), (10608, 18451), (10608, 5798)],
            [(10608, 18451), (12736, 18451), (12736, 10608)],
        ]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # Unique topic ids are 5798, 10608, 12736 and 18451
        obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics)
        expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}}
        self.assertTrue(obtained == expected)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Test with window size as 2. window_id is zero indexed.
        obtained, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2)
        expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}}
        self.assertTrue(obtained == expected)
Example #17
0


from gensim.topic_coherence import probability_estimation
from gensim.corpora.hashdictionary import HashDictionary
from gensim.models import word2vec

# Fixture corpus and segmented topics for the probability-estimation demo.
# The original list literals had inconsistent indentation (scrape artifact);
# normalized here with no change to the data.
texts = [
    ['human', 'interface', 'computer'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
]
dictionary = HashDictionary(texts)
w2id = dictionary.token2id

# Word-pair segments (s_one_pre style), expressed through token ids.
segmented_topics = [
    [
        (w2id['system'], w2id['graph']),
        (w2id['computer'], w2id['graph']),
        (w2id['computer'], w2id['system']),
    ],
    [
        (w2id['computer'], w2id['graph']),
        (w2id['user'], w2id['graph']),
        (w2id['user'], w2id['computer']),
    ],
]
# create corpus
corpus = [dictionary.doc2bow(text) for text in texts]
Example #18
0
# NOTE(review): reload(sys)/setdefaultencoding is a Python 2-only hack and
# should be unnecessary, since the input file is decoded explicitly via
# codecs.open below; kept for compatibility with the original Python 2
# environment — confirm before removing.
reload(sys)
sys.setdefaultencoding('utf8')

# Get the command line arguments
inputFile = sys.argv[1]

# One document per input line; a document is the line's whitespace-split tokens.
texts = []
with codecs.open(inputFile, encoding='utf-8', mode='r',
                 errors='ignore') as inptFile:
    for line in inptFile:
        texts.append(line.split())

dictionary = HashDictionary(texts)
w2id = dictionary.token2id
corpus = [dictionary.doc2bow(doc) for doc in texts]
# Context managers guarantee the pickle files are flushed and closed
# (the original leaked the file handles returned by open()).
with open("wiki_dictionary.p", "wb") as out:
    pickle.dump(dictionary, out)
with open("wiki_w2id.p", "wb") as out:
    pickle.dump(w2id, out)
with open("wiki_corpus.p", "wb") as out:
    pickle.dump(corpus, out)
'''
dic = pickle.load( open( "wiki_dictionary.p", "rb" ) )
w2id = pickle.load( open( "wiki_w2id.p", "rb" ) )
data = pickle.load( open( "wiki_corpus.p", "rb" ) )
print w2id
print data
'''
 def setup_dictionary(self):
     """Build a ``HashDictionary`` from ``self.texts`` and store it on the fixture."""
     self.dictionary = HashDictionary(self.texts)
Example #20
0
 def testDocFreqOneDoc(self):
     """A single three-word document gives each token a df of 1."""
     corpus = [['human', 'interface', 'computer']]
     dictionary = HashDictionary(corpus, myhash=zlib.adler32)
     self.assertEqual(dictionary.dfs, {10608: 1, 12466: 1, 31002: 1})
Example #21
0
    def testFilter(self):
        """Pruning via filter_extremes respects no_below, no_above and keep_n."""
        # the default thresholds empty out this small corpus
        filtered = HashDictionary(self.texts, myhash=zlib.adler32)
        filtered.filter_extremes()
        self.assertEqual(filtered.dfs, {})

        # tokens present in fewer than 30% of documents survive
        filtered = HashDictionary(self.texts, myhash=zlib.adler32)
        filtered.filter_extremes(no_below=0, no_above=0.3)
        self.assertEqual(
            filtered.dfs,
            {
                29104: 2, 31049: 2, 28591: 2, 5232: 2,
                10608: 2, 12466: 2, 15001: 2, 31002: 2,
            },
        )

        # cap at the 4 most frequent tokens seen in >= 3 documents
        filtered = HashDictionary(self.texts, myhash=zlib.adler32)
        filtered.filter_extremes(no_below=3, no_above=1.0, keep_n=4)
        self.assertEqual(filtered.dfs, {5798: 3, 12736: 3, 18451: 3, 23844: 3})