def _c_stopwords(n, dict_path, output_path):
    """Write the words shared by every source list of length n as stop words.

    For each key of ``dict_path`` a word list is read from
    ``key + '/' + dict_path[key][n]``. The words present in *all* of those
    lists are written, one per row under the column ``word_segment``, to
    ``output_path + '/stopwords_<n>.txt'``.

    :param n: word length (character count); also the index used to pick the
        file name out of ``dict_path[key]``
    :param dict_path: mapping of source directory -> per-length file names
    :param output_path: directory that receives the result file
    :return: None
    """
    # list(...) keeps the indexing below valid whether keys() returns a list
    # or a view.
    keys = list(dict_path.keys())
    first = keys[0]

    wl = WordList.WordList()
    wl.read_data(first + '/' + dict_path[first][n])
    # Start from the first source's words and narrow them down source by
    # source. get_data() is presumably a pandas DataFrame indexed by word.
    words = wl.get_data().index

    for key in keys[1:]:
        path = key + '/' + dict_path[key][n]
        print(path)
        other = WordList.WordList()
        other.read_data(path)
        # Intersect with the running result; the previous code intersected a
        # list with itself, so nothing was ever filtered out.
        words = words.intersection(other.get_data().index)

    df = pd.DataFrame({'word_segment': list(words)})
    output_file = output_path + '/stopwords_' + str(n) + '.txt'
    df.to_csv(output_file, index=False)
    print(df)
def traverse(t):
    """Collect adjectives per noun from the subtrees of ``t``.

    If ``t`` has no ``label()`` method it is printed followed by a space and
    nothing else happens. Otherwise every direct child whose type is exactly
    ``tree.Tree`` is scanned: an ``"NN"`` item updates the module-level
    ``noun`` (via ``wordlist.getNoun``, falling back to ``"misc"`` when no
    noun has been set yet), and a ``"JJ"`` item is appended to
    ``ratings[noun]``.
    """
    global noun

    try:
        t.label()
    except AttributeError:
        print(t, end=" ")
        return

    for child in t:
        if type(child) != tree.Tree:
            continue
        # Each subtree starts with no current noun.
        noun = ""
        for words in child:
            tag = words[1]
            if tag == "NN":
                candidate = wordlist.getNoun(words[0])
                if candidate != "":
                    noun = candidate
                elif noun == "":
                    noun = "misc"
            elif tag == "JJ":
                if noun not in ratings:
                    ratings[noun] = ([words[0]])
                else:
                    adjectives = ratings[noun]
                    adjectives.append(words[0])
                    ratings[noun] = adjectives
count for s: 4095 because of ^[^s][^s][^s][^s][^s][^s][^s]$ count for t: 5528 because of ^[^t][^t][^t][^t][^t][^t][^t]$ count for u: 6782 because of ^[^u][^u][^u][^u][^u][^u][^u][^u]$ count for v: 8428 because of ^[^v][^v][^v][^v][^v][^v][^v][^v]$ count for w: 8347 because of ^[^w][^w][^w][^w][^w][^w][^w][^w]$ count for x: 8953 because of ^[^x][^x][^x][^x][^x][^x][^x][^x]$ count for y: 8132 because of ^[^y][^y][^y][^y][^y][^y][^y][^y]$ count for z: 9000 because of ^[^z][^z][^z][^z][^z][^z][^z][^z]$ ''' import WordList import re wl = WordList.WordList() def worst_case(): return worst_case_for_one('.') def worst_case_for_one(regex): # Worst case for the first guess. words = [word for word in wl.lines if re.search(regex, word) != None] best_regex = None best_count = 0 for c in range(ord('t'), ord('z') + 1):
'''
This is the actual executioner that gets run.
'''

import WordList
import random
import math

wordlist = WordList.WordList()


class Executioner:
    # Initialize an Executioner, ready to play. Please do not change anything already
    # in this method, but feel free to add more stuff.
    def __init__(self):
        self.iterator = 0

        # Load "word count" pairs, one per line. The file is read inside a
        # `with` block so the handle is closed even if parsing fails, and
        # blank lines are skipped instead of raising IndexError.
        self.alltime_hardest_words = {}
        with open("alltime_hardest_words.txt", "r") as hardest_file:
            for line in hardest_file:
                fields = line.split()
                if not fields:
                    continue
                self.alltime_hardest_words[fields[0]] = int(fields[1])

        self.random_games = 0
        self.max_random_games = 30
        self.win_count = 0
        self.lose_count = 0.001  # Avoid division by zero.
        self.new_win_or_loss_weight = 1.0 / 20
        self.average_ratio = 0
        self.tries_below_average = 0
        self.tries_before_change = 10
        self.choose_word_count = 2048
        # Three entries whose first elements are 16, 256 and 4096.
        self.choose_word_counts = [[16 ** i, 5, 5.0] for i in range(1, 4)]
        self.select_count_each = 0
def indexing(index_dict: dict, doc_list: ["Document"]) -> None:
    """Feed every document into ``index_dict`` and then compute TF-IDF.

    The document count is taken before the documents are processed and is
    handed to ``WordList.calculate_tfidf`` afterwards.
    """
    document_count = len(doc_list)

    for document in doc_list:
        WordList.update_index_dict(index_dict, document)

    WordList.calculate_tfidf(index_dict, document_count)
def get_word():
    """Return a random word from the module-level ``wl``, upper-cased."""
    return wl.get_random_word().upper()
def plot_times(filename="English.txt", start=500, stop=5500, step=500):
    """Plot build and search times for a LinkedList, a BST, and an AVL tree.

    The word list is loaded once with ``WordList.create_word_list`` and
    shuffled. For every n in ``range(start, stop, step)`` (``stop`` itself is
    excluded) the first n words are loaded into each structure, timing the
    load. Five words are then drawn at random from those n words, each
    lookup is timed, and the mean is recorded. The structure is cleared
    before the next n.

    One figure with two subplots is shown: the left plots n against build
    time, the right plots n against mean search time.

    Inputs:
        filename (str): the file to use in creating the data sets.
        start (int): the lower bound on the sample interval.
        stop (int): the upper bound on the sample interval (exclusive).
        step (int): the space between points in the sample interval.

    Returns:
        Show the plot, but do not return any values.
    """
    def time_once(func, *args):
        """Return the seconds taken by a single call of func(*args)."""
        return timeit.timeit(lambda: func(*args), number=1)

    def add_all(structure, words):
        for word in words:
            structure.add(word)

    def insert_all(structure, words):
        for word in words:
            structure.insert(word)

    def find_average(structure, words, samples=5):
        """Return the mean time to find `samples` randomly chosen words."""
        return np.mean([time_once(structure.find, random.choice(words))
                        for _ in range(samples)])

    def benchmark(structure, load):
        """Return (build_times, find_times) for `structure` over all sizes."""
        build_times, find_times = [], []
        for size in x_values:
            sample = word_list[:int(size)]
            build_times.append(time_once(load, structure, sample))
            find_times.append(find_average(structure, sample))
            # Start from an empty structure for the next sample size.
            structure.clear()
        return build_times, find_times

    # The file name was previously ignored. Passing it assumes
    # create_word_list takes the file name as its first argument, as this
    # function's documented contract implies.
    word_list = WordList.create_word_list(filename)
    word_list = np.random.permutation(word_list)
    x_values = range(start, stop, step)

    list_times, find_list = benchmark(LinkedList(), add_all)
    bst_times, find_bst = benchmark(BST(), insert_all)
    avl_times, find_avl = benchmark(AVL(), insert_all)

    series = (
        (121, ((list_times, 'Linked List'), (bst_times, 'BST'), (avl_times, 'AVL'))),
        (122, ((find_list, 'Linked List'), (find_bst, 'BST'), (find_avl, 'AVL'))),
    )
    for position, curves in series:
        plt.subplot(position)
        for y_values, label in curves:
            plt.plot(x_values, y_values, label=label)
        plt.legend(loc='upper left')
        plt.xlabel('data points')
        plt.ylabel('seconds')

    plt.show()
# play_hangman_both: interactive hangman session in which the human and the
# computer take turns (Python 2 syntax: raw_input and print statements).
#
# Each pass of the outer `while True` plays two rounds:
#   1. The human types a word. If Guesser.wordlist.find_words(word) is empty
#      the user is asked whether to add it; on 'y' the word is added and both
#      Guesser.wordlist and Executioner.wordlist are rebuilt. The computer
#      then plays via guesser.solve(executioner). A computer win counts as a
#      loss for the human.
#   2. executioner.reset() is called with no argument and the human guesses
#      one lowercase letter at a time while executioner.is_playing.
# After every round one line is appended to results.txt, with the word padded
# to 14 characters, and the running win/loss totals are printed.
#
# Entering '-q' at any prompt calls end_game(guesser, executioner) and
# returns None.
#
# NOTE(review): the '-q' paths return before myfile.close() is reached, and
# the outer loop has no other visible exit, so the file handle is apparently
# never closed explicitly -- TODO confirm and wrap in try/finally or `with`.
# NOTE(review): line breaks and indentation were lost in this copy; the
# nesting of the statements after the letter-validation loop (the guess()
# check, the `incorrect` update and the bare print) must be confirmed
# against the original before restructuring.
def play_hangman_both(): guesser = Guesser.Guesser() executioner = Executioner.Executioner() wins = 0 losses = 0 myfile = open('results.txt', 'a') while True: word = raw_input('What is your word? ').strip() if (word == '-q'): end_game(guesser, executioner) return None while len(Guesser.wordlist.find_words(word)) == 0: addp = raw_input( 'That is not a word. ' + 'Do you want to add it to the dictionary? (y/n) ').strip() if addp == 'y': print 'Adding...' Guesser.wordlist.add_word(word) Guesser.wordlist = WordList.WordList() Executioner.wordlist = WordList.WordList() print 'added.' word = raw_input('What is your word? ').strip() if (word == '-q'): end_game(guesser, executioner) return None executioner.reset(word) guesser.solve(executioner) if executioner.did_guesser_win: losses += 1 myfile.write("Computer | " + (executioner._word + ' ' * (14 - len(executioner._word))) + " | computer\n") else: myfile.write("Computer | " + (executioner._word + ' ' * (14 - len(executioner._word))) + " | human\n") wins += 1 print 'You have', wins, 'wins and', losses, 'losses' print executioner.reset() incorrect = '' while executioner.is_playing: print "word:", executioner.word print "guessed letters:", incorrect print "body parts remaining:", executioner.body_parts - executioner.man while True: letter = raw_input('What is your guess? ').strip() if (letter == '-q'): end_game(guesser, executioner) return None if len(letter) > 1: print "That is more than just one letter. Please try again." elif len(letter) == 0: print "You didn't enter anything. Please try again." elif ord(letter) < ord('a') or ord(letter) > ord('z'): print "You must enter a lowercase letter. Please try again." 
else: break if executioner.guess(letter) != None: break if not letter in executioner.not_incorrect and not letter in incorrect: incorrect += letter print if executioner.did_guesser_win: wins += 1 myfile.write("Human | " + (executioner._word + ' ' * (14 - len(executioner._word))) + " | human\n") else: myfile.write("Human | " + (executioner._word + ' ' * (14 - len(executioner._word))) + " | computer\n") losses += 1 print 'You have', wins, 'wins and', losses, 'losses' print myfile.close()
def test_adding_word(self):
    # After add('ha'), the last element of the WordList's `list` must be 'ha'.
    words = WordList()
    words.add('ha')
    newest_entry = words.list[-1]
    self.assertEqual(newest_entry, 'ha')
def test_generate_swear_word_dict(self):
    # Smoke test only: build a WordList and print it; nothing is asserted.
    print(WordList())