def main():
    """Classify every formatted document and report how often the guessed
    top topic sits in the correct (first) position against the known topics.

    Usage: ./test_doc_guess_algorithm.py <folder> <output> <num_topics>

    Side effects: sets the module-level NUM_TOP_TOPICS from final.other and
    prints progress snapshots plus a final accuracy summary to stdout.
    """
    global NUM_TOP_TOPICS
    check_start('./test_doc_guess_algorithm.py <folder> ' +
                '<output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    output = sys.argv[2]            # NOTE(review): read but unused in this version
    num_topics = int(sys.argv[3])   # NOTE(review): read but unused in this version
    topic_order_count = 0
    topic_feeling_count = 0         # NOTE(review): never incremented here
    topic_offset_count = 0          # NOTE(review): never incremented here

    # File allocation
    format_file = folder + '/initial.formatted'
    vocab_file = folder + '/initial.vocab'
    info_file = folder + '/out/final.other'

    with open(format_file) as data_file:
        format_lines = data_file.readlines()
    with open(info_file) as data_file:
        info_lines = data_file.readlines()

    # First line of final.other carries the top-level topic count as its
    # second space-separated token.
    NUM_TOP_TOPICS = int(info_lines[0].split(' ')[1])

    known_topics = get_known_topics(folder)
    print(str(known_topics[0]))
    word_dict = get_word_dictionary(folder, vocab_file)
    hierarchy_struct = get_hierarchy_struct(folder)

    # Cycle through top-level documents
    count = 0
    for index, raw_line in enumerate(format_lines):
        line = raw_line.strip()
        weights = get_document_values(index, hierarchy_struct)
        # Document text is the 4th '|~|'-delimited field.
        contents = clean_line(line.split('|~|')[3])
        classed_doc = classify(contents, weights, word_dict)
        classed_array = get_classed_array(classed_doc)

        # Periodic progress snapshot; index != 0 avoids dividing by zero.
        if index % 1000 == 0 and index != 0:
            print('\tTopics correct position ' +
                  '%d: %s' % (index, str(topic_order_count / index)))

        if compare_topics_order(classed_array, known_topics[index], 1):
            topic_order_count += 1
        count += 1

    # Fix: guard the summary so an empty input file cannot raise
    # ZeroDivisionError.
    if count:
        print('\tTopics correct position %d: %s' %
              (count, str(topic_order_count / count)))
def main():
    """Score classified documents against known topics at every rank position.

    Usage: ./test_doc_guess_algorithm.py <folder> <output> <num_topics>

    Tracks three per-position tallies (exact position, within range, within
    offset), prints a snapshot every 1000 documents, and writes the final
    per-position ratios to the <output> file.
    """
    check_start('./test_doc_guess_algorithm.py <folder> <output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    output = sys.argv[2]
    num_topics = int(sys.argv[3])
    topic_order_count = [0] * num_topics
    topic_feeling_count = [0] * num_topics
    topic_offset_count = [0] * num_topics

    # File allocation
    format_file = folder + '/initial.formatted'
    vocab_file = folder + '/initial.vocab'
    word_assignment_file = folder + '/out/word-assignments.dat'
    gamma_file = folder + '/out/final.gamma'

    # Read in required data
    with open(vocab_file) as data_file:
        vocab_lines = data_file.readlines()
    with open(word_assignment_file) as data_file:
        word_assignment_lines = data_file.readlines()
    with open(format_file) as data_file:
        format_lines = data_file.readlines()
    with open(gamma_file) as data_file:
        gamma_lines = data_file.readlines()

    vocab_details = get_vocab_details(vocab_lines, word_assignment_lines)
    topic_lines = get_document_topics(gamma_lines)
    vocab_index = get_vocab_index(vocab_lines)
    num_docs = len(format_lines)

    for index, raw_line in enumerate(format_lines):
        line = raw_line.strip()
        known_topics = topic_lines[index].strip().split(' ')
        # Document text is the 4th '|~|'-delimited field.
        document_contents = clean_line(line.split('|~|')[3])
        classed_doc = classify(document_contents, vocab_details, vocab_index)
        if len(classed_doc) == 0:
            # Nothing classifiable in this document; report it and move on.
            print(document_contents)
            continue

        # Periodic progress snapshot; index != 0 avoids dividing by zero.
        if index % 1000 == 0 and index != 0:
            print('Snapshot: ' + str(index))
            for i in range(1, num_topics):
                print('\tTopics correct position %d: %s' %
                      (i, str(topic_order_count[i] / index)))
                print('\tTopics within range %d: %s' %
                      (i, str(topic_feeling_count[i] / index)))
                print('\tTopics within offset %d: %s' %
                      (i, str(topic_offset_count[i] / index)))

        # Fix: clamp to num_topics so a classification longer than the
        # tally arrays cannot raise IndexError.
        for i in range(1, min(len(classed_doc), num_topics)):
            if compare_topics_order(classed_doc, known_topics, i):
                topic_order_count[i] += 1
            if compare_topic_slice(classed_doc, known_topics, i):
                topic_feeling_count[i] += 1
            if compare_position_offset(classed_doc, known_topics, i):
                topic_offset_count[i] += 1

    # Fix: use a context manager so the output file is always closed
    # (the original leaked the handle opened with open(output, 'w')).
    with open(output, 'w') as out_ptr:
        out_ptr.write('Number of documents: ' + str(num_docs) + '\n')
        for i in range(num_topics):
            out_ptr.write('Topics correct position %d: %s\n' %
                          (i, str(topic_order_count[i] / num_docs)))
            out_ptr.write('Topics within range %d: %s\n' %
                          (i, str(topic_feeling_count[i] / num_docs)))
            out_ptr.write('Topics within offset %d: %s\n' %
                          (i, str(topic_offset_count[i] / num_docs)))