def __load_data(self, dictname, filename):
    """Replace the words of dictionary ``dictname`` with those in ``filename``.

    The dictionary row is fetched or created by name.  If it already
    existed, all ``Words`` rows attached to it are deleted first.  The file
    is read as UTF-8; every non-blank line must contain exactly two
    whitespace-separated tokens, ``word`` and ``frequency``.  Tokens made
    only of digits are skipped; every other word is lower-cased and stored
    together with its space-joined trigrams, its length and the frequency
    token exactly as read.

    Returns ``None`` when the whole file was processed, or an error message
    string when a ``ValueError`` (malformed line, undecodable bytes, ...)
    interrupted the load.  The line number in that message is the
    zero-based index produced by ``enumerate``.
    """
    dictionary, is_created = Dictionary.objects.get_or_create(name=dictname)
    if not is_created:
        print('Removing old data from dictionary ...')
        Words.objects.filter(dictionary=dictionary).delete()

    # Bound up front so the error message can always be formatted, even when
    # the failure happens before the first line has been yielded.
    line = ''
    line_count = 0
    # Tally of stored words; nothing in this function reads it back.
    added_word_count = 0
    try:
        in_file = codecs.open(filename, mode='r', encoding='utf-8')
        try:
            print('Loading new data ...')
            for line_count, line in enumerate(in_file):
                # Iterated lines keep their newline, so a plain truth test
                # never filters anything; strip to detect blank lines.
                if not line.strip():
                    continue
                # Raises ValueError unless the line has exactly two tokens.
                word, freq = line.split()
                if word.isdigit():
                    continue
                added_word_count += 1
                word = word.lower()
                trigrams = u' '.join(generate_trigram(word))
                Words.objects.create(word=word, trigrams=trigrams,
                                     length=len(word), frequency=freq,
                                     dictionary=dictionary)
        finally:
            # Release the file handle on success and on every error path.
            in_file.close()
    except ValueError as ex:
        return "Error in %s line: bad file format (%s)\n`%s'" % \
            (line_count, ex, line)
def __load_data(self, dictname, filename):
    """Replace the words of dictionary ``dictname`` with those in ``filename``.

    The dictionary row is fetched or created by name.  If it already
    existed, all ``Words`` rows attached to it are deleted and its
    ``indexed_status`` is reset to ``'n'``.  The file is read as UTF-8;
    every non-blank line must contain exactly two whitespace-separated
    tokens, ``word`` and ``frequency``.  Tokens made only of digits are
    skipped; every other word is lower-cased and stored together with its
    space-joined trigrams, its length and the frequency token as read.

    Always returns ``None``.  When a ``ValueError`` (malformed line,
    undecodable bytes, ...) interrupts the load, a message is printed to
    ``sys.stderr``; the line number in it is the zero-based index produced
    by ``enumerate``.
    """
    dictionary, is_created = Dictionary.objects.get_or_create(
        name=dictname)
    if not is_created:
        print('Removing old data from dictionary ...')
        Words.objects.filter(dictionary=dictionary).delete()
        # presumably 'n' means "not indexed" -- confirm against the
        # Dictionary model's indexed_status choices.
        dictionary.indexed_status = 'n'
        dictionary.save()

    # Bound up front so the error message can always be formatted, even when
    # the failure happens before the first line has been yielded.
    line = ''
    line_count = 0
    # Tally of stored words; nothing in this function reads it back.
    added_word_count = 0
    try:
        in_file = codecs.open(filename, mode='r', encoding='utf-8')
        try:
            print('Loading new data ...')
            for line_count, line in enumerate(in_file):
                # Iterated lines keep their newline, so a plain truth test
                # never filters anything; strip to detect blank lines.
                if not line.strip():
                    continue
                # Raises ValueError unless the line has exactly two tokens.
                word, freq = line.split()
                # TODO add process function, for checking word
                if word.isdigit():
                    continue
                added_word_count += 1
                word = word.lower()
                trigrams = u' '.join(generate_trigram(word))
                Words.objects.create(word=word, trigrams=trigrams,
                                     length=len(word), frequency=freq,
                                     dictionary=dictionary)
        finally:
            # Close on success and on every error path (the close used to be
            # skipped whenever an exception was raised).
            in_file.close()
    except ValueError as ex:
        # Python 2 print statement redirected to stderr.
        print >> sys.stderr, "Error in %s line: bad file format (%s)\n`%s'" % \
            (line_count, ex, line)
        return
def correct_word(self, word):
    """Return the closest stored word for ``word``, or ``word`` itself.

    The word's trigrams are joined with spaces and sent to ``Words.search``
    as a quoted phrase followed by ``/2``.  Hits are restricted to this
    object's dictionary (``dict_id == self.id``) and to words whose length
    is within two characters of the query's length.  The first hit's
    ``word`` is returned; when there are no hits, or ``word`` is empty, the
    input is returned unchanged.

    ``self`` is required because the body reads ``self.id``; without it in
    the signature the function could not run.
    """
    # Nothing to look up for an empty query; mirror the no-result fallback.
    if not word:
        return word

    trigrams = u' '.join(generate_trigram(word))
    query_length = len(word)
    # presumably the search backend's quorum syntax (match at least 2 of the
    # quoted trigrams) -- confirm against the backend in use.
    query_extended = '"%s"/2' % trigrams

    results = Words.search.query(query_extended)
    results = results.filter(dict_id=self.id)
    # Candidate lengths: query_length - 2 .. query_length + 2 inclusive.
    results = results.filter(length=range(query_length - 2, query_length + 3))
    # TODO: re-ranking the hits by cal_weight(query_length, hit) is disabled;
    # the first hit in the backend's own order is used.

    if results:
        return results[0].word
    return word