def testRun():
    """Interactive smoke run: index one book in two structures, then query both.

    Builds a WordReader for the Grimm's Fairy Tales material file, reads its
    words, and inserts every word into a Trie and a RedBlack tree while
    printing a running percentage. It then asks for a word (or its beginning)
    on standard input and prints the count, line count and positions that
    each structure's ``find`` returns for it.
    """
    def say(*parts):
        # Same output as a Python 2 print statement with several items:
        # each item is converted with str() and separated by one space.
        print(" ".join(str(part) for part in parts))

    reader = WordReader(["../Material/Grimm's Fairy Tales.txt"])
    reader.readWords()
    redBlack = RedBlack(reader)
    trie = Trie(reader)

    say("Adding words from selected material...")
    sinceLastReport = 0
    percentDone = 0
    for entry in reader.words:
        # Each structure receives its own entry[1:] slice, trie first.
        for index in (trie, redBlack):
            index.add(entry[0], entry[1:])
        sinceLastReport += 1
        # Report every time another hundredth of the words has been added.
        if sinceLastReport > reader.wordcount / 100.0:
            percentDone += 1
            say(percentDone, '% of words added')
            sinceLastReport = 0

    # NOTE(review): the banner also names Tom Sawyer, but only the Grimm file
    # is passed to the reader above - confirm which is intended.
    say("Searching for words in Grimm's Fairy tales and The Adventures of Tom Sawyer")
    query = raw_input("Find a word (or its beginning) in the text: ").rstrip('\n')
    for index in (trie, redBlack):
        positions, count, linecount = index.find(query)
        say("Found", count, "instances (", linecount, "lines) @", positions)
def setUp(self):
    """Create the WordReader fixture covering both sample books.

    The reader is constructed with the Grimm's Fairy Tales file and a
    non-default character configuration; the Tom Sawyer file is then
    registered through ``addFileName``.
    """
    grimmPath = "../../Material/Grimm's Fairy Tales.txt"
    sawyerPath = "../../Material/The Adventures of Tom Sawyer by Mark Twain.txt"
    readerOptions = dict(
        specialCharacters=["-", "'"],
        acceptNumerals=True,
        acceptUpperCase=True,
        acceptLowerCase=False,
    )
    self.lukija = WordReader([grimmPath], **readerOptions)
    # The second book is added separately so addFileName gets exercised too.
    self.lukija.addFileName(sawyerPath)
class PySearcherTestCases(unittest.TestCase):
    """Unit tests for Searcher working on a Trie that is fed by a WordReader."""

    def setUp(self):
        """Create a reader, a Trie built on it, and a Searcher over that Trie."""
        reader = WordReader()
        self.reader = reader
        finder = Trie(reader)
        self.finder = finder
        self.searcher = Searcher(finder, '')

    def tearDown(self):
        """Release the objects created in setUp."""
        self.reader = self.finder = self.searcher = None

    def testRandomWord(self):
        """ Two random words are each longer than one character and differ """
        first = self.searcher.randomWord()
        second = self.searcher.randomWord()
        for candidate in (first, second):
            self.assertTrue(len(candidate) > 1, 'Word length too short')
        self.assertNotEqual(first, second, 'Found the same word')

    def testRandomWords(self):
        """ A request for five random words yields five distinct words """
        words = self.searcher.randomWord(5)
        distinct = set(words)
        self.assertTrue(len(distinct) == 5, 'Did not find 5 unique words')

    def testBinaryOperationsAreWorking(self):
        """
        The operations give pairwise different hit counts, and every
        search with a known result returns the expected number of hits.
        """
        self.reader.addFileName(MaterialFilePath, readNow=True)
        self.finder.addFromReader()

        hitCounts = [
            self.searcher.search(operations[name], returnCount=True)
            for name in operations
        ]
        # Six distinct counts, i.e. the operations are not identical.
        self.assertTrue(len(set(hitCounts)) == 6,
                        'Searcher failed binary operation check')

        for term in binaryOperationsSearch:
            found = self.searcher.search(term, returnCount=True)
            expected = binaryOperationsSearch[term]
            self.assertEqual(found, expected,
                             'Searcher found wrong number of hits on some search')
def testWordCountAndClear(self):
    """ Check the word count of the UTF-8 sample file, then that clear() resets the reader """
    reader = WordReader(['../../Material/50words_in_UTF-8.txt'])
    self.lukija = reader
    reader.readWords()
    self.assertEqual(reader.wordcount, wordsInTestFile,
                     'Did not get the correct number of words')

    # Once cleared, the reader must hold no words and every counter must be zero.
    reader.clear()
    stateAfterClear = (reader.words, reader.wordcount,
                       reader.filecount, reader.linecount)
    self.assertEqual(stateAfterClear, ([], 0, 0, 0))
class PyWordReaderTestCases(unittest.TestCase):
    """Unit tests for WordReader: sanitizing, character maps and counters."""

    def setUp(self):
        """Create a WordReader for the two sample books with a custom character set."""
        self.lukija = WordReader(["../../Material/Grimm's Fairy Tales.txt"],
                                 specialCharacters=["-", "'"],
                                 acceptNumerals=True,
                                 acceptUpperCase=True,
                                 acceptLowerCase=False)
        # test addFileName
        self.lukija.addFileName("../../Material/The Adventures of Tom Sawyer by Mark Twain.txt")

    def tearDown(self):
        """Clear the reader with the 'all' option after every test."""
        self.lukija.clear('all')

    def testSanitize(self):
        """ Test whether word sanitizing works """
        cleaned = [self.lukija.sanitize(word) for word in unsanitizedWords]
        self.assertEqual(cleaned, sanitizedWords, 'Failed to sanitize words')

    def testCreateChrMap(self):
        """ Test whether index and character maps are okay """
        chrMap, idxMap = self.lukija._createChrMap()
        self.assertEqual(chrMap, properChrMap, 'Bad character map')
        self.assertEqual(idxMap, properIdxMap, 'Bad index map')

    def testInd2char(self):
        """ Test function ind2char """
        # Only the position is needed; the expected character is looked up
        # by that position in properChrMap.
        for index, _ in enumerate(properChrMap):
            self.assertEqual(self.lukija.ind2char(index), properChrMap[index],
                             'ind2char function failed to map indices to characters')

    def testChar2ind(self):
        """ Test function char2ind """
        for char in properIdxMap:
            self.assertEqual(self.lukija.char2ind(char), properIdxMap[char],
                             'char2ind function failed to map characters to indices')

    def testGetCharMapSize(self):
        """ Test whether getCharMapSize returns the correct value """
        self.assertEqual(self.lukija.getCharMapSize(), len(properChrMap),
                         'getCharMapSize returned wrong map size')

    def testLineCount(self):
        """ Test whether WordReader reads all lines in all files """
        self.lukija.readWords()
        self.assertEqual(self.lukija.linecount, linesIn2books,
                         'Did not read correct number of lines from file')
        # The file-count check gets its own message so a failure is not
        # reported as a line-count problem.
        self.assertEqual(self.lukija.filecount, noOfFiles,
                         'Did not read correct number of files')

    def testWordCountAndClear(self):
        """ Test if the reader finds the correct number of words """
        self.lukija = WordReader(['../../Material/50words_in_UTF-8.txt'])
        self.lukija.readWords()
        self.assertEqual(self.lukija.wordcount, wordsInTestFile,
                         'Did not get the correct number of words')
        # After clearing, we should not have any words in memory
        self.lukija.clear()
        self.assertEqual((self.lukija.words, self.lukija.wordcount,
                          self.lukija.filecount, self.lukija.linecount),
                         ([], 0, 0, 0))
def setUp(self):
    """Create the fixture: a WordReader, a Trie built on it, and a Searcher over the Trie."""
    reader = WordReader()
    self.reader = reader
    finder = Trie(reader)
    self.finder = finder
    # The Searcher's second argument is an empty string.
    self.searcher = Searcher(finder, '')
return average(runtimes) else: print string + '%20.3f ms' % (sum(runtimes) / repeats) if __name__ == "__main__": print "Hello World" trieAddFile = openFile('trieAddToEmpty', 'w') punamustaAddFile = openFile('punamustaAddToEmpty', 'w') trieFindLengthFile = openFile('trieFindWordLength', 'w') punamustaFindLengthFile = openFile('punamustaFindWordLength', 'w') trieFindWordCountFile = openFile('trieFindWordCount', 'w') punamustaFindWordCountFile = openFile('punamustaFindWordCoun', 'w') lukija = WordReader(["../Material/Grimm's Fairy Tales.txt"]) lukija.readWords() punamusta = RedBlack(lukija) trie = Trie(lukija) words = pickle.load( open( "randomWordList", "rb" ) ) # indexed by word len repeats = 100; runtimes = [] for i in range(2,17): runtime = addWordsToEmptyList(trie, 2**i, repeats, '%25s\t%10d\t' % ('trie:add', 2**i), False) print '%25s\t%10d\t\t%14.3f ms' % ('trie:addToEmpty', 2**i, runtime) trieAddFile.write('%10d\t%8.3f\n' % (2**i, runtime)) runtimes.append(findWords(trie, words[7], repeats, printout=False)) for index, runtime in enumerate(runtimes): print '%25s\t%10d\t\t%14.3f ms' % ('trie:findWordCount', 2**(index+2), runtime)