def main():
    """Run the word-discovery pipeline over the loaded text.

    Steps, in order:

    1. Load the text with ``load()``.
    2. Build one tree over the text and one over the reversed text with
       ``construct_tree``.
    3. Build ``count_of_length``: for every index ``i`` below
       ``len(sentence_length_count)``, the value of
       ``count_all_possibilities(sentence_length_count, i)``. ``mark_words``
       uses these values as denominators.
    4. Call ``mark_words`` once per child of the forward tree's root;
       ``mark_words`` appends accepted candidates to the module-level
       ``dict_data``.
    5. Wrap ``dict_data`` in a ``pandas.DataFrame``.

    Progress is reported on stdout. Returns ``None``.
    """
    string = load()
    print("Building tree")
    tree = construct_tree(string)
    print("Building tree for reversed string")
    reversed_tree = construct_tree(string[::-1])

    print("Counting sentences")
    sentence_length_count = count_sentence_length(string)
    count_of_length = [
        count_all_possibilities(sentence_length_count, i)
        for i in range(len(sentence_length_count))
    ]

    print("Finding words")
    # Aim for roughly 20 progress ticks. The interval is clamped to 1 because
    # a root with fewer than 20 children would otherwise give 0 here and the
    # modulo below would raise ZeroDivisionError. With an interval of 1 the
    # `== 1` test never matches, so such small inputs simply print no ticks.
    process_update_interval = max(1, len(tree.root.next) // 20)
    for i, (key, child) in enumerate(tree.root.next.items()):
        if i % process_update_interval == 1:
            print("|", end="", flush=True)

        cursor = suffix_tree.Cursor(tree.root, key, len(child), tree.root)
        mark_words(tree, reversed_tree, count_of_length, cursor,
                   str(cursor.current_node))
    dict_df = pandas.DataFrame(data=dict_data)
    # NOTE(review): the preview returned by head() is discarded here;
    # presumably a leftover from interactive/notebook use -- confirm.
    dict_df.head()
    # Terminate the progress-tick line.
    print()
def _record_if_word(tree, reversed_tree, count_of_length, cursor, string):
    """Score ``string`` and append it to ``dict_data`` if it passes all filters.

    Only called for strings longer than one character. The candidate must
    pass the ``MIN_CO`` filter on its cohesion ratio, then the
    ``MIN_SCORE`` / ``MIN_ENTROPY`` filters on its score and entropies.
    """
    length = len(string)

    # Probability of the whole string, unsplit.
    whole_probability = (
        cursor.current_node.counter / count_of_length[length]
    )

    # Walk every two-way split string[:cut] | string[cut:]. The prefix cursor
    # grows from the first character; the suffix cursor starts as a copy of
    # the full-string cursor and sheds characters from its front.
    prefix_cursor = new_cursor(tree, string[0])
    suffix_cursor = copy.copy(cursor)
    suffix_cursor.move_front_forward(string[0])
    split_probabilities = []
    for cut in range(1, length):
        if cut > 1:
            # Shift the split point one character to the right.
            prefix_cursor.move_forward(string[cut - 1])
            suffix_cursor.move_front_forward(string[cut - 1])
        prefix_probability = (
            prefix_cursor.current_node.counter / count_of_length[cut]
        )
        suffix_probability = (
            suffix_cursor.current_node.counter / count_of_length[length - cut]
        )
        split_probabilities.append(prefix_probability * suffix_probability)

    # Cohesion: whole-string probability relative to the most likely split.
    co = whole_probability / max(split_probabilities)
    if not (co >= MIN_CO):  # Filter by co
        return

    # Left entropy comes from the reversed tree. The children are consulted
    # only when the check below holds (presumably: the reversed match ends
    # exactly at the end of its node -- confirm against suffix_tree.Cursor);
    # otherwise it is taken as 0.
    reverse_lookup = reversed_tree.query_cursor(string[::-1])
    left_entropy = (
        entropy_of_list(reverse_lookup.current_node.next.values())
        if reverse_lookup.length + 1 == len(reverse_lookup.current_node)
        else 0
    )
    right_entropy = entropy_of_list(cursor.current_node.next.values())

    entropy_sum = left_entropy + right_entropy
    score = co * entropy_sum

    # Filter by score and entropy
    accepted = (
        score > MIN_SCORE
        and left_entropy > MIN_ENTROPY
        and right_entropy > MIN_ENTROPY
    )
    if accepted:
        row = (
            ("词语", string),
            ("子节点数", cursor.current_node.counter),
            ("凝固度", co),
            ("左自由度", left_entropy),
            ("右自由度", right_entropy),
            ("自由度和", entropy_sum),
            ("权重", score),
        )
        for column, value in row:
            dict_data[column].append(value)


def mark_words(tree, reversed_tree, count_of_length, cursor, string):
    """Recursively collect word candidates below ``cursor`` into ``dict_data``.

    ``string`` is the text spelled out on the path to ``cursor``. The walk
    stops at any string containing ``split_mark`` and at any node whose
    counter is not at least ``MIN_COUNT``. Strings longer than one character
    are scored and, if they pass the filters, appended to the module-level
    ``dict_data``. Every child of the current node is then visited with the
    child's text appended to ``string``.

    Returns ``None``; results are delivered through ``dict_data``.
    """
    # Stop on a separator, or when the node is too rare (filter by count).
    if split_mark in string or not (
            cursor.current_node.counter >= MIN_COUNT):
        return

    if len(string) > 1:
        _record_if_word(tree, reversed_tree, count_of_length, cursor, string)

    # Recursively find vocabulary in child nodes
    for edge, subtree in cursor.current_node.next.items():
        child_cursor = suffix_tree.Cursor(cursor.current_node, edge,
                                          len(subtree), tree.root)
        mark_words(tree, reversed_tree, count_of_length, child_cursor,
                   string + str(child_cursor.current_node))
# Example no. 3
# 0
def new_cursor(tree, branch):
    """Return a fresh ``suffix_tree.Cursor`` rooted at ``tree.root``.

    The cursor is constructed as ``Cursor(root, branch, 0, root)``, i.e.
    ``tree.root`` is passed as both the first and the last argument and the
    third argument is 0 (presumably: positioned at the very start of the
    edge selected by ``branch`` -- confirm against ``suffix_tree.Cursor``).
    """
    root = tree.root
    return suffix_tree.Cursor(root, branch, 0, root)
# Example no. 4
# 0
                else:
                    left_entropy = 0
                right_entropy = entropy_of_list(cursor.current_node.next.values())
                
                # Calculate score
                score = co*(left_entropy+right_entropy)
                
                # Filter by score and entropy
                if score > MIN_SCORE and left_entropy > MIN_ENTROPY and right_entropy > MIN_ENTROPY:
                    output_file.write("%s,%d,%f,%f,%f,%f,%f\n" % (string, cursor.current_node.counter, co, left_entropy, right_entropy, left_entropy+right_entropy, score))
# End of if co >= MIN_CO
# End of if len(string) > 1

# Recursively find vocabulary in child nodes
for key, child in cursor.current_node.next.items():
    next_cursor = suffix_tree.Cursor(cursor.current_node, key, len(child), tree.root)
    mark_words(tree, reversed_tree, count_of_length, next_cursor, string + str(next_cursor.current_node))
# End of if cursor.current_node.counter >= MIN_COUNT
# End of def mark_words


def main():
    string = load()
    print("Building tree")
    tree = construct_tree(string)
    print("Building tree for reversed string")
    reversed_tree = construct_tree(string[::-1])
    
    print("Counting sentences")
    sentence_length_count = count_sentence_length(string)
    count_of_length = [count_all_possibilities(sentence_length_count, i)