コード例 #1
0
def main():
    """Report how often the classifier's first topic matches the known one.

    Command line: ``<folder> <output> <num_topics>``.

    Reads ``<folder>/initial.formatted`` and ``<folder>/out/final.other``,
    classifies the fourth ``|~|``-separated field of every formatted line and
    prints the fraction of lines for which
    ``compare_topics_order(classed_array, known_topics[index], 1)`` is true.
    A running fraction is printed every 1000 lines.

    Side effect: sets the module-level ``NUM_TOP_TOPICS`` from the second
    space-separated token of the first line of ``final.other``.
    """
    global NUM_TOP_TOPICS

    # presumably validates the argument count and prints the usage on failure;
    # verify against check_start's definition.
    check_start('./test_doc_guess_algorithm.py <folder> ' +
                '<output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    # <output> and <num_topics> are accepted but not used by this check.
    # num_topics is still parsed so a non-integer argument fails as before.
    int(sys.argv[3])

    # File allocation
    format_file = folder + '/initial.formatted'
    vocab_file = folder + '/initial.vocab'
    info_file = folder + '/out/final.other'

    with open(format_file) as data_file:
        format_lines = data_file.readlines()

    with open(info_file) as data_file:
        info_lines = data_file.readlines()

    NUM_TOP_TOPICS = int(info_lines[0].split(' ')[1])

    known_topics = get_known_topics(folder)
    print(str(known_topics[0]))
    word_dict = get_word_dictionary(folder, vocab_file)
    hierarchy_struct = get_hierarchy_struct(folder)

    # Cycle through top-level documents
    topic_order_count = 0
    for index, raw_line in enumerate(format_lines):
        line = raw_line.strip()

        weights = get_document_values(index, hierarchy_struct)
        contents = clean_line(line.split('|~|')[3])
        classed_doc = classify(contents, weights, word_dict)
        classed_array = get_classed_array(classed_doc)

        # Progress report covers the `index` documents scored so far; the
        # current document is compared only after the report is printed.
        if index % 1000 == 0 and index != 0:
            print('\tTopics correct position %d: %s'
                  % (index, str(topic_order_count / index)))

        if compare_topics_order(classed_array, known_topics[index], 1):
            topic_order_count += 1

    total = len(format_lines)
    # An empty formatted file would otherwise divide by zero here.
    accuracy = topic_order_count / total if total else 0.0
    print('\tTopics correct position %d: %s' % (total, str(accuracy)))
コード例 #2
0
def main():
    """Score classified documents against known topics and write a report.

    Command line: ``<folder> <output> <num_topics>``.

    Reads ``initial.vocab``, ``out/word-assignments.dat``,
    ``initial.formatted`` and ``out/final.gamma`` from ``<folder>``. For each
    formatted line, the fourth ``|~|``-separated field is cleaned and
    classified, then compared with that line's known topics using
    ``compare_topics_order``, ``compare_topic_slice`` and
    ``compare_position_offset`` for each position ``i >= 1``. Per-position
    hit fractions are printed every 1000 lines and written to ``<output>``
    at the end.
    """
    check_start('./test_doc_guess_algorithm.py <folder> <output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    output = sys.argv[2]
    num_topics = int(sys.argv[3])

    # One counter per position; slot i is bumped by the comparison called with i.
    # NOTE(review): the scoring loop starts at 1, so slot 0 is never incremented
    # and the final report's row 0 is always zero -- confirm whether intended.
    topic_order_count = [0] * num_topics
    topic_feeling_count = [0] * num_topics
    topic_offset_count = [0] * num_topics

    def read_lines(path):
        # Read a whole text file into a list of lines, closing it afterwards.
        with open(path) as data_file:
            return data_file.readlines()

    # Read in required data (same order as the files are needed below)
    vocab_lines = read_lines(folder + '/initial.vocab')
    word_assignment_lines = read_lines(folder + '/out/word-assignments.dat')
    format_lines = read_lines(folder + '/initial.formatted')
    gamma_lines = read_lines(folder + '/out/final.gamma')

    vocab_details = get_vocab_details(vocab_lines, word_assignment_lines)
    topic_lines = get_document_topics(gamma_lines)
    vocab_index = get_vocab_index(vocab_lines)

    num_docs = len(format_lines)

    for index, raw_line in enumerate(format_lines):
        line = raw_line.strip()

        known_topics = topic_lines[index].strip().split(' ')
        document_contents = clean_line(line.split('|~|')[3])
        classed_doc = classify(document_contents, vocab_details, vocab_index)

        # Unclassifiable documents are echoed and skipped; they still count
        # toward the denominators used below (index / num_docs).
        if len(classed_doc) == 0:
            print(document_contents)
            continue

        if index % 1000 == 0 and index != 0:
            print('Snapshot: ' + str(index))
            for i in range(1, num_topics):
                print('\tTopics correct position %d: %s' % (i, str(topic_order_count[i] / index)))
                print('\tTopics within range %d: %s' % (i, str(topic_feeling_count[i] / index)))
                print('\tTopics within offset %d: %s' % (i, str(topic_offset_count[i] / index)))

        # NOTE(review): assumes len(classed_doc) <= num_topics; a longer
        # result would index past the counters -- confirm against classify().
        for i in range(1, len(classed_doc)):
            if compare_topics_order(classed_doc, known_topics, i):
                topic_order_count[i] += 1

            if compare_topic_slice(classed_doc, known_topics, i):
                topic_feeling_count[i] += 1

            if compare_position_offset(classed_doc, known_topics, i):
                topic_offset_count[i] += 1

    def fraction(hits):
        # Avoid ZeroDivisionError when the formatted file has no lines.
        return hits / num_docs if num_docs else 0.0

    # The context manager guarantees the report is flushed and closed, even
    # if formatting a row raises.
    with open(output, 'w') as out_ptr:
        out_ptr.write('Number of documents: ' + str(num_docs) + '\n')
        for i in range(num_topics):
            out_ptr.write('Topics correct position %d: %s\n' % (i, str(fraction(topic_order_count[i]))))
            out_ptr.write('Topics within range %d: %s\n' % (i, str(fraction(topic_feeling_count[i]))))
            out_ptr.write('Topics within offset %d: %s\n' % (i, str(fraction(topic_offset_count[i]))))