Example #1
0
    def __load_data(self, dictname, filename):
        dictionary, is_created = Dictionary.objects.get_or_create(name=dictname)
        if not is_created:
            print 'Removing old data from dictionary ...'
            Words.objects.filter(dictionary=dictionary).delete()

        line = ''
        added_word_count = 0
        try:
            in_file = codecs.open(filename, mode='r', encoding='utf-8')
            
            print 'Loading new data ...'

            for line_count, line in enumerate(in_file):
                if line:
                    word, freq = line.split()
                    if not word.isdigit():
                        added_word_count += 1
                        word = word.lower()
                        trigrams = u' '.join(generate_trigram(word))
                        Words.objects.create(word=word, trigrams=trigrams, \
                                             length=len(word), frequency=freq, \
                                             dictionary=dictionary)

        except ValueError, ex:
            return "Error in %s line: bad file format (%s)\n`%s'" % \
                   (line_count, ex, line)
Example #2
0
    def __load_data(self, dictname, filename):
        dictionary, is_created = Dictionary.objects.get_or_create(
            name=dictname)
        if not is_created:
            print 'Removing old data from dictionary ...'
            Words.objects.filter(dictionary=dictionary).delete()
            dictionary.indexed_status = 'n'
            dictionary.save()

        line = ''
        added_word_count = 0
        try:
            in_file = codecs.open(filename, mode='r', encoding='utf-8')

            print 'Loading new data ...'

            for line_count, line in enumerate(in_file):
                if line:
                    word, freq = line.split()
                    # TODO add process function, for checking word
                    if not word.isdigit():
                        added_word_count += 1
                        word = word.lower()
                        trigrams = u' '.join(generate_trigram(word))
                        Words.objects.create(word=word, trigrams=trigrams, \
                                             length=len(word), frequency=freq, \
                                             dictionary=dictionary)

            in_file.close()
        except ValueError, ex:
            print >> sys.stderr, "Error in %s line: bad file format (%s)\n`%s'" % \
                   (line_count, ex, line)
            return
Example #3
0
     def correct_word(word):
         """Return the first search hit for ``word``, or ``word`` if none.

         The query string is the space-joined output of
         ``generate_trigram(word)`` wrapped as ``"<trigrams>"/2``.  Hits are
         narrowed to ``dict_id == self.id`` and to words whose length is
         within two characters of ``len(word)``.
         """
         # NOTE(review): `self` is not a parameter of this function, so it
         # must come from an enclosing scope -- confirm against the caller.
         trigram_text = u' '.join(generate_trigram(word))
         word_len = len(word)

         # NOTE(review): the '"..."/2' form looks like a Sphinx extended-syntax
         # quorum query -- confirm against the search backend in use.
         matches = (
             Words.search.query('"%s"/2' % trigram_text)
             .filter(dict_id=self.id)
             .filter(length=range(word_len - 2, word_len + 3))
         )

         # No client-side re-ranking is applied (a cal_weight-based sort was
         # sketched here earlier but left disabled); the first hit is used
         # in the order the search returns it.
         if not matches:
             return word
         return matches[0].word