def test_simple_file(self):
    input = u"""#include GLFW_INCLUDE_GLU
#include <GLFW/glfw3.h>
#include <cstdio>

/* Random function */
static void glfw_key_callback(int key, int scancode, int action, int mod){
  if(glfw_key_callback){
    // Comment here
    input_event_queue->push(inputaction);
  }
  printf("%s", "asciiじゃない文字");
}""".encode("utf-8")
    (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
    (file_hash, lines, LOC, SLOC) = final_stats
    (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens

    # 12 physical lines, 11 non-blank lines, 9 non-comment lines
    self.assertEqual(lines, 12)
    self.assertEqual(LOC, 11)
    self.assertEqual(SLOC, 9)

    self.assertEqual(tokens_count_total, 27)
    self.assertEqual(tokens_count_unique, 21)
    self.assert_common_properties(tokens)

    hard_tokens = set(['int@@::@@4', 'void@@::@@1', 'cstdio@@::@@1', 'action@@::@@1', 'static@@::@@1',
                       'key@@::@@1', 'glfw_key_callback@@::@@1', 'mod@@::@@1', 'if@@::@@1', 'glfw3@@::@@1',
                       'scancode@@::@@1', 'h@@::@@1', 'GLFW_INCLUDE_GLU@@::@@1', 'input_event_queue@@::@@2',
                       'GLFW@@::@@1', 'push@@::@@1', 'inputaction@@::@@1', 'include@@::@@3'])
    this_tokens = set(tokens[3:].split(','))
    # Every expected token must appear in the tokenizer output
    # (the original assertTrue(len(...), 0) passed vacuously; assertEqual checks the intent).
    self.assertEqual(len(hard_tokens - this_tokens), 0)

    m = hashlib.md5()
    m.update(tokens[3:])
    self.assertEqual(m.hexdigest(), token_hash)
def test_line_counts_1(self):
    input = """ line 1
 line 2
 line 3 """
    (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
    (file_hash, lines, LOC, SLOC) = final_stats

    self.assertEqual(lines, 3)
    self.assertEqual(LOC, 3)
    self.assertEqual(SLOC, 3)
def test_multiline_comment(self):
    input = '/* this is a \n comment */ /* Last one */ '
    (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
    (file_hash, lines, LOC, SLOC) = final_stats
    (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens

    self.assertEqual(lines, 2)
    self.assertEqual(LOC, 2)
    self.assertEqual(SLOC, 0)

    self.assertEqual(tokens_count_total, 0)
    self.assertEqual(tokens_count_unique, 0)
    self.assert_common_properties(tokens)
def test_comments(self):
    input = "// Hello\n // World"
    (final_stats, final_tokens, file_times) = tokenizer.tokenize_files(input, comment_inline_pattern, comment_open_close_pattern, separators)
    (file_hash, lines, LOC, SLOC) = final_stats
    (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens

    self.assertEqual(lines, 2)
    self.assertEqual(LOC, 2)
    self.assertEqual(SLOC, 0)

    self.assertEqual(tokens_count_total, 0)
    self.assertEqual(tokens_count_unique, 0)
    self.assert_common_properties(tokens)
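# The tests above assume module-level fixtures that are not part of this fragment:
# the `tokenizer` module, `hashlib`, and the language configuration values
# (`comment_inline_pattern`, `comment_open_close_pattern`, `separators`).
# Below is a hypothetical sketch of the surrounding test class; the class name and
# the assert_common_properties body are placeholders inferred from the tests, not
# the project's actual definitions. The suite can be run with the standard runner,
# e.g. `python -m unittest <module name>`.
import hashlib
import unittest

class TokenizerTestsSketch(unittest.TestCase):
    def assert_common_properties(self, tokens):
        # Placeholder check: after the 3-character prefix, the token payload is a
        # comma-separated list of "name@@::@@count" entries (see test_simple_file).
        for entry in tokens[3:].split(','):
            if entry:
                self.assertIn('@@::@@', entry)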
for index, word in enumerate(doc.text_no_stopwords):
    # Remove the word's current topic assignment from the document's counts
    old_topic = doc.topic_words[index]
    doc.topic_counts[old_topic] -= 1
    # Collapsed Gibbs conditional: p(topic = k | word, rest) is proportional to
    # (doc.topic_counts[k] + alpha) * (words_given_topics[word][k] + gamma)
    #   / (topic_word_assign[k] + vocab_size * gamma)
    distrib = ((alpha + doc.topic_counts) * (gamma + words_given_topics[word])
               / (vocab_size * gamma + topic_word_assign))
    new_topic = sample_discrete(distrib)
    # Record the new assignment and restore the counts
    doc.topic_words[index] = new_topic
    doc.topic_counts[new_topic] += 1

for index, topic in enumerate(topics):
    # Log the most frequent words for each topic
    top_topic_words = sorted(topic.word_counts, key=lambda x: topic.word_counts[x], reverse=True)[:top_words]
    logging.info('{}: {}'.format(index, ' '.join(top_topic_words)))


if __name__ == '__main__':
    np.random.seed(1234)

    # POS test dataset is sci.space
    train_reviews = []
    test_reviews = []
    test_count = 50
    tokenizer.tokenize_files('tmp/POS', train_reviews)
    test_reviews = train_reviews[-test_count:]
    train_reviews = train_reviews[:-test_count]

    # NEG test dataset is sci.med
    tokenizer.tokenize_files('tmp/NEG', train_reviews)
    test_reviews.extend(train_reviews[-test_count:])
    train_reviews = train_reviews[:-test_count]

    run_lda(train_reviews, test_reviews, K=10, train_iters=10)
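# `sample_discrete` is called above but not defined in this fragment. The sketch
# below is an assumption inferred only from the call site: it draws a single index
# from an unnormalised, non-negative weight vector. It is a minimal illustration,
# not the project's actual implementation.
import numpy as np

def sample_discrete(weights):
    # Inverse-CDF sampling: pick index i with probability weights[i] / sum(weights).
    weights = np.asarray(weights, dtype=float)
    cdf = np.cumsum(weights)
    u = np.random.rand() * cdf[-1]
    return int(np.searchsorted(cdf, u))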
def get_review_files(review_dir, reviews):
    # Get all review files and complete vocab counts
    for review_type in (POS, NEG):
        search_dir = os.path.join(review_dir, review_type)
        tokenizer.tokenize_files(search_dir, reviews)
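# `POS` and `NEG` are module-level constants not shown in this fragment. A
# hypothetical usage sketch follows, assuming they hold the subdirectory names
# used by the __main__ block above ('POS' and 'NEG' under a root such as 'tmp');
# the values here are illustrative, not the project's definitions.
POS = 'POS'
NEG = 'NEG'

reviews = []
get_review_files('tmp', reviews)  # tokenize_files is assumed to append tokenized documents into `reviews`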