def main():
    """Build a dictionary from the word files named on the command line.

    Every word parsed from each input file is added to a fresh Dictionary,
    which is then written to 'words.dat'.
    """
    input_paths = sys.argv[1:]
    dictionary = Dictionary()
    for path in input_paths:
        for token in parseWords(path):
            dictionary.add_word(token)
    dictionary.save("words.dat")
def file_to_dict(path):
    """Build a Dictionary from a word-list file.

    Each line consisting solely of lowercase letters (a-z) is added as a
    word; all other lines are skipped. A progress message is printed every
    25000 lines.

    :param path: path to the word-list file, one word per line.
    :return: the populated Dictionary, with its word count updated.
    """
    # Hoisted out of the loop: compile the pattern once instead of per line.
    word_pattern = re.compile(r'^[a-z]+$')
    dictionary = Dictionary()
    counter = 0
    # 'with' guarantees the file is closed even if add_word raises
    # (the original leaked the handle on any exception).
    with open(path, 'r') as word_file:
        for line in word_file:
            # '$' matches just before the trailing newline, so plain
            # lowercase words pass without stripping first.
            if word_pattern.match(line) is not None:
                dictionary.add_word(line.strip())
            if counter % 25000 == 0:
                # Original used the Python 2 print statement, a syntax
                # error under Python 3 (which the rest of this file uses).
                print("Loading Dictionary...")
            counter += 1
    dictionary.update_word_count()
    return dictionary
def load_sentence_dict(data_file, max_len=0, remove_stop=False, encoding='utf8',
                       split_symbol='\t', low_case=False, language='english'):
    """Build a Dictionary from a labelled-sentence file.

    Each line has the form ``<label><split_symbol><sentence>``. The sentence
    is tokenised with generate_sentence_token and every token is added to
    the dictionary; labels are read but not stored.

    :param data_file: path to the input file.
    :param max_len: forwarded to generate_sentence_token (0 = no limit).
    :param remove_stop: whether to drop stop words during tokenisation.
    :param encoding: text encoding of data_file.
    :param split_symbol: separator between the label and the sentence.
    :param low_case: whether to lower-case tokens.
    :param language: language passed to the tokeniser.
    :return: the populated Dictionary.
    """
    d = Dictionary()
    # Bug fix: the original opened in text mode ('r') and then called
    # line.decode(encoding). In Python 3 text-mode lines are str, which has
    # no .decode(), so this raised AttributeError on the first line. Decode
    # at the I/O boundary instead by passing the encoding to open().
    with open(data_file, 'r', encoding=encoding) as fin:
        for line in fin:
            # Exactly two fields are expected; extra separators raise
            # ValueError, matching the original's strict unpacking.
            label, sentence = line.strip().split(split_symbol)
            sentence_token = generate_sentence_token(sentence, max_len=max_len,
                                                     remove_stop=remove_stop,
                                                     low_case=low_case,
                                                     language=language)
            for token in sentence_token:
                d.add_word(token)
    return d
def _load_dictionary(self, path):
    """Parse a dictionary definition file into a Dictionary.

    Blank lines and lines starting with '#' are skipped. Remaining lines
    are split on whitespace into cells; the last cell is a comma-separated
    list of definitions. Two cells mean a whole-word entry whose first cell
    is a '+'-joined tag chain; three cells mean a morpheme entry
    (text, tag, definitions). Any other shape raises.

    :param path: path to the UTF-8 dictionary file.
    :return: the populated Dictionary.
    :raises Exception: on a line with fewer than 2 or more than 3 cells.
    """
    dictionary = Dictionary(self)
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip blank lines and comment lines.
            if line.isspace() or line[0] == "#":
                continue
            # Normalise spaces to tabs, split, drop empty fields, strip each.
            # (Filtering happens before stripping, as in the original, so a
            # whitespace-only field strips down to an empty cell.)
            cells = [cell.strip()
                     for cell in line.replace(" ", "\t").split("\t") if cell]
            text = cells[0]
            definitions = [d.strip(" \n\t") for d in cells[-1].split(",")]
            if len(cells) == 2:
                # tag-chain + definitions -> whole word entry
                tags = text.split("+")
                dictionary.add_word(dictionary.to_word(tags, definitions))
            elif len(cells) == 3:
                # text + tag + definitions -> morpheme entry
                dictionary.add_morpheme(cells[1], text, definitions)
            else:
                raise Exception("Bad line '{}'".format(cells))
    return dictionary
class TestDictionary(unittest.TestCase):
    """Unit tests for the Dictionary class and its statistics helper."""

    DEFAULT_TABLE_SIZE = 250727
    DEFAULT_HASH_BASE = 31
    DEFAULT_TIMEOUT = 10
    FILENAMES = ['english_small.txt', 'english_large.txt', 'french.txt']
    RANDOM_STR = 'FIT1008 is the best subject!'

    def setUp(self) -> None:
        """ Used by our test cases """
        self.dictionary = Dictionary(TestDictionary.DEFAULT_HASH_BASE,
                                     TestDictionary.DEFAULT_TABLE_SIZE)

    def test_init(self) -> None:
        """ Testing type of our table and the length is 0 """
        self.assertEqual(type(self.dictionary.hash_table), LinearProbeHashTable)
        self.assertEqual(len(self.dictionary.hash_table), 0)

    def test_load_dictionary_statistics(self) -> None:
        """ For each file, doing some basic testing on the statistics generated """
        print("Testing load dictionary statistics method......")
        statistics = Statistics()
        for filename in TestDictionary.FILENAMES:
            # 'elapsed' rather than 'time' so we don't shadow the time module.
            (words, elapsed, collision_count, probe_total,
             probe_max, rehash_count) = statistics.load_statistics(
                TestDictionary.DEFAULT_HASH_BASE,
                TestDictionary.DEFAULT_TABLE_SIZE * 2,
                filename,
                TestDictionary.DEFAULT_TIMEOUT)
            self.assertGreater(words, 0)
            self.assertLess(elapsed, TestDictionary.DEFAULT_TIMEOUT)
            # test case 1: every count returned by load_statistics must be an
            # integer. Use assertTrue instead of a bare assert, which is
            # silently stripped when Python runs with -O.
            integers = [words, collision_count, probe_total, probe_max, rehash_count]
            self.assertTrue(all(isinstance(item, int) for item in integers))

    def test_load_dictionary(self) -> None:
        """ Reading a dictionary and ensuring the number of lines matches the number of words
        Also testing the various exceptions are raised correctly """
        for filename in TestDictionary.FILENAMES:
            self.dictionary = Dictionary(TestDictionary.DEFAULT_HASH_BASE,
                                         TestDictionary.DEFAULT_TABLE_SIZE)
            words = self.dictionary.load_dictionary(filename)
            lines = file_len(filename)
            self.assertEqual(words, lines, "Number of words should match number of lines")
        # test case 1: loading a missing file must not raise
        # FileNotFoundError and should report zero words loaded.
        print("Testing load dictionary method......work on it")
        filename_2 = 'engli.txt'
        bucket = Dictionary(TestDictionary.DEFAULT_HASH_BASE,
                            TestDictionary.DEFAULT_TABLE_SIZE)
        words = bucket.load_dictionary(filename_2)
        self.assertEqual(words, 0, "Number of words should be 0")

    def test_add_word(self) -> None:
        """ Testing the ability to add words """
        print("Testing add word......")
        # test case 1: inserting a single word grows the table to size 1
        self.dictionary.add_word("Hello")
        current_size = len(self.dictionary.hash_table)
        self.assertEqual(current_size, 1, "add word method not working properly")
        # test case 2: inserting several distinct words updates the count
        test_list_2 = ['to', 'customize', 'exception', 'parameters', 'while',
                       'giving', 'you', 'complete', 'control', 'of', 'the active']
        test_list_2_size = len(test_list_2)
        for item in test_list_2:
            self.dictionary.add_word(item)
        current_size = len(self.dictionary.hash_table)
        self.assertEqual(current_size, test_list_2_size + 1,
                         "add word method not working properly")

    def test_find_word(self) -> None:
        """ Ensuring both valid and invalid words """
        print("Testing find word......")
        # test case 1: words were stored lower-case by test_add_word; querying
        # in upper case should still find them (find_word is expected to
        # normalise the case and return True).
        self.test_add_word()
        test_list_2 = ['TO', 'CUSTOMIZE', 'EXCEPTION', 'PARAMETERS', 'WHILE',
                       'GIVING', 'YOU', 'COMPLETE', 'CONTROL', 'OF', 'THE ACTIVE']
        for item in test_list_2:
            result = self.dictionary.find_word(item)
            # Fixed copy-paste bug: the failure message previously blamed
            # the add word method.
            self.assertEqual(result, True, "find word method not working properly")
        # test case 2: a word not in the dictionary must return False
        word = "AMAKOHIA"
        result = self.dictionary.find_word(word)
        self.assertEqual(result, False, "find word method not working properly")

    def test_delete_word(self) -> None:
        """ Deleting valid words and ensuring we can't delete invalid words """
        print("Testing delete word......")
        self.dictionary.load_dictionary('english_small.txt')
        table_size = len(self.dictionary.hash_table)
        # Deleting a word that is not present must raise KeyError and leave
        # the table size unchanged.
        with self.assertRaises(KeyError):
            self.dictionary.delete_word(TestDictionary.RANDOM_STR)
        self.assertEqual(len(self.dictionary.hash_table), table_size)
        # Deleting an existing word shrinks the table by exactly one.
        self.dictionary.delete_word('test')
        self.assertEqual(len(self.dictionary.hash_table), table_size - 1)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generate a dictionary file from a plain word list.

Usage: generate_dictionary.py <input file> <output dictionary>
"""
from dictionary import Dictionary
import sys

if len(sys.argv) != 3:
    # Fixed typo in the usage message ("ouput" -> "output").
    sys.exit("Usage: generate_dictionary.py <input file> <output dictionary>")

with open(sys.argv[1], 'r') as input_file:
    d = Dictionary()
    d.open(sys.argv[2])
    # try/finally ensures the dictionary handle is closed even if a word
    # fails to load part-way through (the original leaked it on error).
    try:
        d.clear()
        for word in input_file:
            d.add_word(word.strip())
        d.save()
    finally:
        d.close()
"""Build a word Dictionary from a text file and pickle it to disk.

Words are ranked by descending frequency (ties broken alphabetically) and
at most --max_vocab entries are kept, counting words already present in
the fresh Dictionary.
"""
import argparse
import os
import pickle

from dictionary import Dictionary

parser = argparse.ArgumentParser()
parser.add_argument('data', help='text file to make dictionary from')
parser.add_argument('out', help='path to write dictionary pickle to')
parser.add_argument('--max_vocab', type=int, default=100000,
                    help='max_words in dictionary')
args = parser.parse_args()

# parser.error gives a clean usage message and exit code 2; the original
# used assert, which is silently stripped when Python runs with -O.
if not os.path.exists(args.data):
    parser.error("data file not found: {}".format(args.data))

dic = Dictionary()

# Count whitespace-separated word frequencies.
freq = {}
with open(args.data, 'r') as f:
    for line in f:
        for word in line.split():
            freq[word] = freq.get(word, 0) + 1

# Negating the count sorts by descending frequency while keeping the
# alphabetical tie-break from the tuple's second element. Budget leaves
# room for words the Dictionary already holds.
budget = args.max_vocab - len(dic.idx2word)
for _, word in sorted([(-count, w) for w, count in freq.items()])[:budget]:
    dic.add_word(word)

with open(args.out, 'wb') as out_file:
    pickle.dump(dic, out_file)