def get_dictionary():
    """
    Build a Dictionary based on the Diceware data.

    Reads the file named by Diceware.fname line by line and passes each
    line to Diceware.parse_line. Lines for which the parser returns None
    are skipped; for every other line a Word is created from
    tokens['word'] and added to a new Dictionary. A percentage progress
    indicator is written to stdout while parsing.

    Returns:
        The populated Dictionary instance.
    """
    dicto = Dictionary()
    print('Parsing Diceware data...')

    # Expected number of entries; only used to scale the progress display.
    n_lines = 7780
    # Refresh the display every 5% of the expected entries. The step is
    # computed on the integer counter because a float test such as
    # (progress % 0.05) < 1e-4 skips real boundaries: 0.15 % 0.05 evaluates
    # to 0.04999..., not 0.
    step = max(1, n_lines // 20)

    count = 0
    with open(Diceware.fname, 'r') as fid:
        for line in fid:
            tokens = Diceware.parse_line(line)
            if tokens is None:
                continue

            # The running count (starting at 0) is passed as the last
            # Word argument; the two -1 values are kept as in the original.
            dicto.add_word(Word(tokens['word'], -1, -1, count))

            count += 1
            if count % step == 0:
                progress = float(count) / float(n_lines)
                sys.stdout.write("\r%2.2f%%" % (progress * 100))
                sys.stdout.flush()

    print('\nDone.')
    return dicto
def preproc(self):
    """
    Normalize the dictionary contents before further processing.

    Every word from self.dicto has its digits stripped. Words that become
    empty, or that still contain a character matched by the regex class
    \\W, are dropped. The surviving words are updated in place with the
    stripped string and collected into a fresh Dictionary, which then
    replaces self.dicto.
    """
    print('Pre-processing dictionary...')

    cleaned = Dictionary()
    non_word_char = re.compile(r'[\W]')
    digit = re.compile(r'[0-9]')

    for entry in self.dicto.get_words_iter():
        # strip digits first; the remaining checks run on the stripped text
        stripped = digit.sub('', entry.string)

        # nothing left once the digits are gone
        if not stripped:
            continue
        # still contains a character outside the \w class
        if non_word_char.search(stripped):
            continue

        entry.set_string(stripped)
        cleaned.add_word(entry)

    print('Done.')
    self.dicto = cleaned