    def test_simple_file(self):
        input = u"""#include GLFW_INCLUDE_GLU
                   #include <GLFW/glfw3.h>
                   #include <cstdio>
                   
                   /* Random function */
                   static void glfw_key_callback(int key, int scancode, int action, int mod){
                     if(glfw_key_callback){
                       // Comment here
                       input_event_queue->push(inputaction);   
                     }
                     printf("%s", "asciiじゃない文字");
                   }""".encode("utf-8")
        (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
        (file_hash,lines,LOC,SLOC) = final_stats
        (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens

        self.assertEqual(lines,12)
        self.assertEqual(LOC,11)
        self.assertEqual(SLOC,9)

        self.assertEqual(tokens_count_total,27)
        self.assertEqual(tokens_count_unique,21)
        self.assert_common_properties(tokens)

        hard_tokens = set(['int@@::@@4', 'void@@::@@1', 'cstdio@@::@@1', 'action@@::@@1',
                           'static@@::@@1', 'key@@::@@1', 'glfw_key_callback@@::@@1',
                           'mod@@::@@1', 'if@@::@@1', 'glfw3@@::@@1', 'scancode@@::@@1',
                           'h@@::@@1', 'GLFW_INCLUDE_GLU@@::@@1', 'input_event_queue@@::@@2',
                           'GLFW@@::@@1', 'push@@::@@1', 'inputaction@@::@@1', 'include@@::@@3'])
        this_tokens = set(tokens[3:].split(','))
        # Every expected token must appear in the tokenizer output.
        self.assertEqual(len(hard_tokens - this_tokens), 0)

        m = hashlib.md5()
        m.update(tokens[3:])
        self.assertEqual(m.hexdigest(),token_hash)

    def test_line_counts_1(self):
        input = """ line 1
                    line 2
                    line 3 """
        (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
        (file_hash,lines,LOC,SLOC) = final_stats

        self.assertEqual(lines,3)
        self.assertEqual(LOC,3)
        self.assertEqual(SLOC,3)

    def test_multiline_comment(self):
        input = '/* this is a \n comment */ /* Last one */ '
        (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
        (file_hash,lines,LOC,SLOC) = final_stats
        (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens


        self.assertEqual(lines,2)
        self.assertEqual(LOC,2)
        self.assertEqual(SLOC,0)

        self.assertEqual(tokens_count_total,0)
        self.assertEqual(tokens_count_unique,0)
        self.assert_common_properties(tokens)

    def test_comments(self):
        input = "// Hello\n // World"
        (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
        (file_hash,lines,LOC,SLOC) = final_stats
        (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens


        self.assertEqual(lines,2)
        self.assertEqual(LOC,2)
        self.assertEqual(SLOC,0)

        self.assertEqual(tokens_count_total,0)
        self.assertEqual(tokens_count_unique,0)
        self.assert_common_properties(tokens)
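
    # Editor's sketch, not from the original suite: the real assert_common_properties
    # helper is defined elsewhere in this test class.  The assertions above imply the
    # token string has a 3-character prefix followed by comma-separated
    # `name@@::@@count` entries, so a minimal version might check:
    def assert_common_properties_sketch(self, tokens_string):
        body = tokens_string[3:]              # skip the 3-character prefix
        for entry in (body.split(',') if body else []):
            self.assertIn('@@::@@', entry)    # every token is paired with its count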
Example #5
            # Fragment of the per-word collapsed Gibbs sampling update for LDA:
            # each word's topic is resampled from its conditional distribution
            # given all other current assignments.
            for index, word in enumerate(doc.text_no_stopwords):
                old_topic = doc.topic_words[index]
                doc.topic_counts[old_topic] -= 1  # remove the word from its old topic
                # Unnormalised posterior over topics: smoothed document-topic counts
                # times the smoothed probability of this word under each topic.
                distrib = ((alpha + doc.topic_counts) *
                           (gamma + words_given_topics[word]) /
                           (vocab_size * gamma + topic_word_assign))
                new_topic = sample_discrete(distrib)
                doc.topic_words[index] = new_topic
                doc.topic_counts[new_topic] += 1  # count the word under its new topic
    # Log the most frequent words of each learned topic.
    for index, topic in enumerate(topics):
        top_topic_words = sorted(topic.word_counts,
                                 key=lambda x: topic.word_counts[x],
                                 reverse=True)[:top_words]
        logging.info('{}: {}'.format(index, ' '.join(top_topic_words)))
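
# Editor's sketch, not part of the original example: `sample_discrete`, used in the
# sampling fragment above, is not shown in this excerpt.  A minimal stand-in, assuming
# it draws one index from an unnormalised weight vector:
import numpy as np  # the __main__ block below already relies on numpy as np


def sample_discrete_sketch(weights):
    # Normalise the weights and draw a single index from the resulting
    # categorical distribution.
    weights = np.asarray(weights, dtype=float)
    return int(np.random.choice(len(weights), p=weights / weights.sum()))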


if __name__ == '__main__':
    np.random.seed(1234)
    # POS test dataset is sci.space
    train_reviews = []
    test_reviews = []
    test_count = 50
    tokenizer.tokenize_files('tmp/POS', train_reviews)
    test_reviews = train_reviews[-test_count:]
    train_reviews = train_reviews[:-test_count]
    # NEG test dataset is sci.med
    tokenizer.tokenize_files('tmp/NEG', train_reviews)
    test_reviews.extend(train_reviews[-test_count:])
    train_reviews = train_reviews[:-test_count]
    run_lda(train_reviews, test_reviews, K=10, train_iters=10)
def get_review_files(review_dir, reviews):
    # Get all review files and complete vocab counts
    for review_type in (POS, NEG):
        search_dir = os.path.join(review_dir, review_type)
        tokenizer.tokenize_files(search_dir, reviews)
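
# Editor's sketch, hypothetical usage of get_review_files: POS and NEG are assumed to
# be module-level constants naming the two subdirectories under `review_dir`, so a
# call might look like:
#
#     all_reviews = []
#     get_review_files('tmp', all_reviews)   # tokenizes tmp/<POS> and tmp/<NEG>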