Example #1
# Dictionary and parseWords are provided by the surrounding project.
import sys


def main():
    files = sys.argv[1:]          # word-list files passed on the command line
    d = Dictionary()
    for f in files:
        for word in parseWords(f):
            d.add_word(word)
    d.save("words.dat")           # persist the accumulated dictionary
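A minimal way to run the example above as a standalone script, assuming Dictionary and parseWords are importable from the surrounding project:

if __name__ == '__main__':
    main()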
Example #2
import re


def file_to_dict(path):
    # Dictionary is provided by the surrounding project.
    dictionary = Dictionary()

    counter = 0
    with open(path, 'r') as word_file:
        for line in word_file:
            # Only keep lines that are a single lowercase word.
            if re.match('^[a-z]+$', line) is not None:
                dictionary.add_word(line.strip())
            if counter % 25000 == 0:
                print("Loading Dictionary...")
            counter += 1
    dictionary.update_word_count()
    return dictionary
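A hedged usage sketch for the loader above; 'words.txt' is a hypothetical path to a newline-delimited, lowercase word list:

# words.txt (hypothetical), one word per line:
#   apple
#   banana
d = file_to_dict('words.txt')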
Example #3
def load_sentence_dict(data_file,
                       max_len=0,
                       remove_stop=False,
                       encoding='utf8',
                       split_symbol='\t',
                       low_case=False,
                       language='english'):
    d = Dictionary()
    with open(data_file, 'r', encoding=encoding) as fin:
        for line in fin:
            # Each line holds a label and a sentence separated by split_symbol.
            label, sentence = line.strip().split(split_symbol)
            sentence_token = generate_sentence_token(sentence,
                                                     max_len=max_len,
                                                     remove_stop=remove_stop,
                                                     low_case=low_case,
                                                     language=language)
            for token in sentence_token:
                d.add_word(token)
    return d
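A minimal sketch of the input this loader expects: one label and one sentence per line, separated by split_symbol (tab by default). The file name and labels below are assumptions:

# sentiment.tsv (hypothetical), tab-separated:
#   pos\tI really enjoyed this film
#   neg\tThe plot made no sense
d = load_sentence_dict('sentiment.tsv', max_len=50, low_case=True)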
Example #4
    def _load_dictionary(self, path):
        dictionary = Dictionary(self)

        with open(path, encoding="utf-8") as f:
            for line in f:
                # Skip blank lines and comment lines.
                if line.isspace() or line[0] == "#":
                    continue

                # Normalise double spaces to tabs, split into cells, and strip each cell.
                cells = [c.strip() for c in line.replace("  ", "\t").split("\t") if c]

                text = cells[0]
                definitions = [s.strip(" \n\t") for s in cells[-1].split(",")]

                if len(cells) == 2:
                    tags = text.split("+")
                    word = dictionary.to_word(tags, definitions)
                    dictionary.add_word(word)
                elif len(cells) == 3:
                    tag = cells[1]
                    dictionary.add_morpheme(tag, text, definitions)
                else:
                    raise ValueError("Bad line '{}'".format(cells))

        return dictionary
Example #5
class TestDictionary(unittest.TestCase):
    DEFAULT_TABLE_SIZE = 250727
    DEFAULT_HASH_BASE = 31
    DEFAULT_TIMEOUT = 10
    FILENAMES = ['english_small.txt', 'english_large.txt', 'french.txt']
    RANDOM_STR = 'FIT1008 is the best subject!'
    
    def setUp(self) -> None:
        """ Used by our test cases """
        self.dictionary = Dictionary(TestDictionary.DEFAULT_HASH_BASE, TestDictionary.DEFAULT_TABLE_SIZE)
    
    def test_init(self) -> None:
        """ Testing type of our table and the length is 0 """
        self.assertEqual(type(self.dictionary.hash_table), LinearProbeHashTable)
        self.assertEqual(len(self.dictionary.hash_table), 0)
    
    def test_load_dictionary_statistics(self) -> None:
        """ For each file, doing some basic testing on the statistics generated """
        print("Testing load dictionary statistics method......")
        statistics = Statistics()
        for filename in TestDictionary.FILENAMES:
            words, time, collision_count, probe_total, probe_max, rehash_count = statistics.load_statistics(
                TestDictionary.DEFAULT_HASH_BASE, TestDictionary.DEFAULT_TABLE_SIZE * 2, filename,
                TestDictionary.DEFAULT_TIMEOUT)
            self.assertGreater(words, 0)
            self.assertLess(time, TestDictionary.DEFAULT_TIMEOUT)
            
            # TODO: Add your own test cases here

            # test case 1: checking the counts returned by load_statistics are all integers
            integers = [words, collision_count, probe_total, probe_max, rehash_count]
            self.assertTrue(all(isinstance(item, int) for item in integers))
    
    def test_load_dictionary(self) -> None:
        """ Reading a dictionary and ensuring the number of lines matches the number of words
            Also testing the various exceptions are raised correctly """
        for filename in TestDictionary.FILENAMES:
            self.dictionary = Dictionary(TestDictionary.DEFAULT_HASH_BASE, TestDictionary.DEFAULT_TABLE_SIZE)
            words = self.dictionary.load_dictionary(filename)
            lines = file_len(filename)
            self.assertEqual(words, lines, "Number of words should match number of lines")
        
        # TODO: Add your own test cases (consider testing exceptions being raised)
        # test case 1: checking that a missing file (FileNotFoundError) is handled without raising an error
        print("Testing load dictionary method......")
        filename_2 = 'engli.txt'
        bucket = Dictionary(TestDictionary.DEFAULT_HASH_BASE, TestDictionary.DEFAULT_TABLE_SIZE)
        words = bucket.load_dictionary(filename_2)
        self.assertEqual(words, 0, "Number of words should be 0")
    
    def test_add_word(self) -> None:
        """ Testing the ability to add words """
        # TODO: Add your own test cases
        print("Testing add word......")
        
        # test case 1: inserting hello into the hash table
        self.dictionary.add_word("Hello")
        current_size = len(self.dictionary.hash_table)
        self.assertEqual(current_size, 1, "add word method not working properly")
        
        # test case 2: inserting multiple items and checking the hash table count is updated
        test_list_2 = ['to', 'customize', 'exception', 'parameters', 'while', 'giving', 'you', 'complete', 'control',
                       'of', 'the active']
        test_list_2_size = len(test_list_2)
        for item in test_list_2:
            self.dictionary.add_word(item)
        current_size = len(self.dictionary.hash_table)
        self.assertEqual(current_size, test_list_2_size + 1, "add word method not working properly")
        
    def test_find_word(self) -> None:
        """ Ensuring both valid and invalid words """
        # TODO: Add your own test cases
        print("Testing find word......")

        # test case 1: words were added in lower case, so querying their upper-case forms should
        # still return True (find_word is expected to normalise the case of its input)
        self.test_add_word()
        test_list_2 = ['TO', 'CUSTOMIZE', 'EXCEPTION', 'PARAMETERS', 'WHILE', 'GIVING', 'YOU', 'COMPLETE', 'CONTROL',
                       'OF', 'THE ACTIVE']
        for item in test_list_2:
            result = self.dictionary.find_word(item)
            self.assertEqual(result, True, "find word method not working properly")

        # test case 2: a word not in the dictionary should return False
        word = "AMAKOHIA"
        result = self.dictionary.find_word(word)
        self.assertEqual(result, False, "find word method not working properly")
        
    def test_delete_word(self) -> None:
        """ Deleting valid words and ensuring we can't delete invalid words """
        print("Testing delete word......")
        self.dictionary.load_dictionary('english_small.txt')
        table_size = len(self.dictionary.hash_table)
        with self.assertRaises(KeyError):
            self.dictionary.delete_word(TestDictionary.RANDOM_STR)
        self.assertEqual(len(self.dictionary.hash_table), table_size)
        
        self.dictionary.delete_word('test')
        self.assertEqual(len(self.dictionary.hash_table), table_size - 1)
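To run this suite directly with the standard library test runner (assuming Dictionary, LinearProbeHashTable, Statistics and file_len are importable alongside it):

if __name__ == '__main__':
    unittest.main()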
Example #6
#!/usr/bin/env python3
#-*- coding: utf-8 -*-

from dictionary import Dictionary
import sys


if len(sys.argv) != 3:
    sys.exit("Usage: generate_dictionary.py <input file> <output dictionary>")

with open(sys.argv[1], 'r') as input_file:
    d = Dictionary()
    d.open(sys.argv[2])
    d.clear()

    for word in input_file:
        d.add_word(word.strip())

    d.save()
    d.close()
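Intended invocation, per the usage message above; both paths are placeholders:

# python3 generate_dictionary.py wordlist.txt words.dict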
Example #7
import argparse
import os
import pickle
from dictionary import Dictionary

parser = argparse.ArgumentParser()
parser.add_argument('data', help='text file to make dictionary from')
parser.add_argument('out', help='path to write dictionary pickle to')
parser.add_argument('--max_vocab',
                    type=int,
                    default=100000,
                    help='max_words in dictionary')
args = parser.parse_args()

assert os.path.exists(args.data)
dic = Dictionary()
freq = {}
with open(args.data, 'r') as f:
    for line in f:
        for word in line.split():
            freq[word] = freq.get(word, 0) + 1
# Sort by descending frequency and keep only the most frequent words,
# leaving room for any tokens the Dictionary already contains.
for _, word in sorted([(-f, w) for w, f in freq.items()
                       ])[:args.max_vocab - len(dic.idx2word)]:
    dic.add_word(word)

with open(args.out, 'wb') as out_file:
    pickle.dump(dic, out_file)
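A hedged sketch of reading the resulting pickle back in; it assumes the dictionary module is importable when unpickling, and 'dict.pkl' is a hypothetical output path:

import pickle
from dictionary import Dictionary  # class definition must be importable for pickle.load

with open('dict.pkl', 'rb') as f:  # hypothetical output path
    dic = pickle.load(f)
print(len(dic.idx2word))           # idx2word is the attribute referenced by the script above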