def main():
    """Run the word-discovery pipeline over the loaded text.

    Steps, in order:

    1. Load the text with ``load()``.
    2. Build one tree for the text and one for the reversed text with
       ``construct_tree``.
    3. Build ``count_of_length`` by calling ``count_all_possibilities`` for
       every index of the result of ``count_sentence_length``.
    4. Call ``mark_words`` once per branch leaving the tree root; accepted
       candidates are appended to the module-level ``dict_data``.
    5. Wrap ``dict_data`` in a ``pandas.DataFrame``.

    Progress is reported on stdout as a row of ``|`` characters. The tick is
    printed roughly every 1/20th of the root branches; when the root has
    fewer than 40 branches no tick is printed at all.
    """
    string = load()

    print("Building tree")
    tree = construct_tree(string)

    print("Building tree for reversed string")
    reversed_tree = construct_tree(string[::-1])

    print("Counting sentences")
    sentence_length_count = count_sentence_length(string)
    count_of_length = [
        count_all_possibilities(sentence_length_count, i)
        for i in range(len(sentence_length_count))
    ]

    print("Finding words")
    # Clamp to at least 1: with fewer than 20 root branches the floor
    # division yields 0 and ``i % 0`` would raise ZeroDivisionError.
    process_update_interval = max(1, len(tree.root.next) // 20)
    for i, (key, child) in enumerate(tree.root.next.items()):
        if i % process_update_interval == 1:
            print("|", end="", flush=True)
        cursor = suffix_tree.Cursor(tree.root, key, len(child), tree.root)
        mark_words(tree, reversed_tree, count_of_length, cursor,
                   str(cursor.current_node))

    dict_df = pandas.DataFrame(data=dict_data)
    # NOTE(review): the result of head() is discarded here; presumably a
    # leftover from interactive display -- confirm before removing.
    dict_df.head()
    # Terminate the progress-bar line.
    print()
def mark_words(tree, reversed_tree, count_of_length, cursor, string):
    """Score ``string`` as a word candidate, then recurse into child nodes.

    The candidate is dropped immediately when it contains ``split_mark``.
    When the counter of the cursor's current node is below ``MIN_COUNT``
    nothing is scored and no child node is visited.

    For candidates longer than one character the function computes:

    * ``co``: ``counter / count_of_length[len(string)]`` divided by the
      largest ``p_left * p_right`` over every split position of ``string``;
    * left entropy: ``entropy_of_list`` over the children of the node found
      for the reversed string in ``reversed_tree`` (0 when the length check
      on that lookup fails);
    * right entropy: ``entropy_of_list`` over the children of the current
      node;
    * score: ``co * (left_entropy + right_entropy)``.

    A candidate passing ``MIN_CO``, ``MIN_SCORE`` and ``MIN_ENTROPY`` (on
    both sides) gets one value appended to each column of the module-level
    ``dict_data``.

    Args:
        tree: Tree built from the text.
        reversed_tree: Tree built from the reversed text.
        count_of_length: Sequence indexed by substring length, used as the
            denominator of every probability.
        cursor: Cursor whose current node corresponds to ``string``.
        string: The candidate text accumulated along the path so far.
    """
    # Reject any candidate that contains the split mark.
    if split_mark in string:
        return

    node = cursor.current_node
    # Filter by count: rare nodes are neither scored nor expanded.
    if node.counter < MIN_COUNT:
        return

    size = len(string)
    if size > 1:
        # Calculate co.
        whole_probability = node.counter / count_of_length[size]

        # prefix_cursor starts at the first character; suffix_cursor is a
        # copy of the caller's cursor with that first character dropped.
        prefix_cursor = new_cursor(tree, string[0])
        suffix_cursor = copy.copy(cursor)
        suffix_cursor.move_front_forward(string[0])

        split_probabilities = []
        final_split = size - 1
        for split_at in range(1, size):
            prefix_probability = (
                prefix_cursor.current_node.counter / count_of_length[split_at]
            )
            suffix_probability = (
                suffix_cursor.current_node.counter
                / count_of_length[size - split_at]
            )
            split_probabilities.append(prefix_probability * suffix_probability)
            # Shift the split point one character right, except after the
            # last split position.
            if split_at != final_split:
                prefix_cursor.move_forward(string[split_at])
                suffix_cursor.move_front_forward(string[split_at])

        co = whole_probability / max(split_probabilities)

        if co >= MIN_CO:  # Filter by co
            # Calculate entropy on both sides of the candidate.
            reverse_lookup = reversed_tree.query_cursor(string[::-1])
            lengths_match = (
                reverse_lookup.length + 1 == len(reverse_lookup.current_node)
            )
            left_entropy = (
                entropy_of_list(reverse_lookup.current_node.next.values())
                if lengths_match
                else 0
            )
            right_entropy = entropy_of_list(node.next.values())

            # Calculate score.
            entropy_sum = left_entropy + right_entropy
            score = co * entropy_sum

            # Filter by score and entropy.
            if (score > MIN_SCORE
                    and left_entropy > MIN_ENTROPY
                    and right_entropy > MIN_ENTROPY):
                row = (
                    ("词语", string),
                    ("子节点数", node.counter),
                    ("凝固度", co),
                    ("左自由度", left_entropy),
                    ("右自由度", right_entropy),
                    ("自由度和", entropy_sum),
                    ("权重", score),
                )
                for column, value in row:
                    dict_data[column].append(value)

    # Recursively find vocabulary in child nodes.
    for key, child in node.next.items():
        next_cursor = suffix_tree.Cursor(node, key, len(child), tree.root)
        mark_words(tree, reversed_tree, count_of_length, next_cursor,
                   string + str(next_cursor.current_node))
def new_cursor(tree, branch):
    """Create a ``suffix_tree.Cursor`` at the root of ``tree`` for ``branch``.

    The cursor is constructed with the tree root as both its first and last
    argument, ``branch`` as the second, and ``0`` as the third.

    Args:
        tree: Tree whose ``root`` attribute anchors the cursor.
        branch: Branch key passed through to the cursor.

    Returns:
        The newly constructed ``suffix_tree.Cursor``.
    """
    root = tree.root
    fresh_cursor = suffix_tree.Cursor(root, branch, 0, root)
    return fresh_cursor
else: left_entropy = 0 right_entropy = entropy_of_list(cursor.current_node.next.values()) # Calculate score score = co*(left_entropy+right_entropy) # Filter by score and entropy if score > MIN_SCORE and left_entropy > MIN_ENTROPY and right_entropy > MIN_ENTROPY: output_file.write("%s,%d,%f,%f,%f,%f,%f\n" % (string, cursor.current_node.counter, co, left_entropy, right_entropy, left_entropy+right_entropy, score)) # End of if co >= MIN_CO # End of if len(string) > 1 # Recursively find vocabulary in child nodes for key, child in cursor.current_node.next.items(): next_cursor = suffix_tree.Cursor(cursor.current_node, key, len(child), tree.root) mark_words(tree, reversed_tree, count_of_length, next_cursor, string + str(next_cursor.current_node)) # End of if cursor.current_node.counter >= MIN_COUNT # End of def mark_words def main(): string = load() print("Building tree") tree = construct_tree(string) print("Building tree for reversed string") reversed_tree = construct_tree(string[::-1]) print("Counting sentences") sentence_length_count = count_sentence_length(string) count_of_length = [count_all_possibilities(sentence_length_count, i)