Exemple #1
0
def evaluateJamspell(modelFile, testText, alphabetFile, maxWords=50000):
    """Evaluate a Jamspell model on a text with synthetically injected typos.

    Loads the alphabet, builds a ``JamspellCorrector`` from ``modelFile``,
    seeds ``random`` with 42, corrupts the text loaded from ``testText`` via
    ``generateTypos`` and hands the sentence-split clean/corrupted pair to
    ``evaluateCorrector``.

    Args:
        modelFile: passed to ``JamspellCorrector``.
        testText: passed to ``loadText``.
        alphabetFile: passed to ``utils.loadAlphabet``.
        maxWords: forwarded to ``evaluateCorrector`` as its last argument.

    Returns:
        The tuple ``(errorsRate, fixRate, broken, topNerr, topNfix)``; the
        execution time reported by ``evaluateCorrector`` is discarded.
    """
    utils.loadAlphabet(alphabetFile)
    jamspell = JamspellCorrector(modelFile)

    # Fixed seed, set after the corrector is built and before typos are
    # generated, exactly as in the evaluation script.
    random.seed(42)

    cleanText = loadText(testText)
    noisyText = generateTypos(cleanText)
    assert len(cleanText) == len(noisyText)

    cleanSentences = generateSentences(cleanText)
    noisySentences = generateSentences(noisyText)

    report = evaluateCorrector(
        'jamspell', jamspell, cleanSentences, noisySentences, maxWords)
    # Six values come back; the trailing one (execution time) is not returned.
    errorsRate, fixRate, broken, topNerr, topNfix, _ = report
    return errorsRate, fixRate, broken, topNerr, topNfix
    def train(self, trainFile):
        """Accumulate 1-, 2- and 3-gram counts from a training text file.

        The file is read with ``loadText``, split by ``generateSentences`` and
        mapped through ``self.convertToIDs``. For every resulting sentence,
        each token increments ``self.gram1`` and ``self.totalWords``, each
        adjacent pair increments ``self.gram2`` and each adjacent triple
        increments ``self.gram3``. A progress line is printed at most once
        every 4 seconds.

        Written to run unchanged on Python 2 and Python 3 (``print`` is used
        with a single pre-formatted string, and ``range`` replaces ``xrange``).

        Args:
            trainFile: passed to ``loadText``.
        """
        # NOTE(review): ``+= 1`` on unseen keys presumably relies on the gram
        # tables being defaultdict/Counter-like -- confirm against __init__.
        print('[info] loading text')
        text = loadText(trainFile)
        sentences = generateSentences(text)
        sentences = self.convertToIDs(sentences)

        total = len(sentences)
        # Same output as the former ``print '...', len(sentences)``.
        print('[info] generating N-grams %d' % total)
        lastTime = time.time()
        for i, sentence in enumerate(sentences):
            for w in sentence:
                self.gram1[w] += 1
                self.totalWords += 1
            for j in range(len(sentence) - 1):
                self.gram2[(sentence[j], sentence[j + 1])] += 1
            for j in range(len(sentence) - 2):
                self.gram3[(sentence[j], sentence[j + 1], sentence[j + 2])] += 1

            # Throttle progress output to one line per 4 seconds.
            now = time.time()
            if now - lastTime >= 4.0:
                lastTime = now
                print('[info] processed %.2f%%' % (100.0 * i / total))

        print('[info] finished training')
Exemple #3
0
def main():
    """Command-line entry point: evaluate spelling correctors on one text.

    Parses the command line, optionally loads an alphabet, builds every
    corrector whose model path was supplied (plus a ``DummyCorrector``), then
    either runs ``testMode`` on the last corrector built or evaluates all
    correctors on the text with synthetic typos and prints a summary table.

    Returns:
        Whatever ``testMode`` returns when ``--test`` is given, otherwise
        ``None``.
    """
    parser = argparse.ArgumentParser(
        description='spelling correctors evaluation')
    parser.add_argument('file',
                        type=str,
                        help='text file to use for evaluation')
    parser.add_argument('-hs',
                        '--hunspell',
                        type=str,
                        help='path to hunspell model')
    parser.add_argument('-ns',
                        '--norvig',
                        type=str,
                        help='path to train file for Norvig spell corrector')
    parser.add_argument('-cs',
                        '--context',
                        type=str,
                        help='path to context spell model')
    parser.add_argument('-csp',
                        '--context_prototype',
                        type=str,
                        help='path to context spell prototype model')
    parser.add_argument('-jsp',
                        '--jamspell',
                        type=str,
                        help='path to jamspell model file')
    parser.add_argument('-t', '--test', action="store_true")
    parser.add_argument('-mx',
                        '--max_words',
                        type=int,
                        help='max words to evaluate')
    parser.add_argument('-a', '--alphabet', type=str, help='alphabet file')
    args = parser.parse_args()

    if args.alphabet:
        utils.loadAlphabet(args.alphabet)

    correctors = {
        'dummy': DummyCorrector(),
    }
    # Default for --test when no model option is given; without it the
    # testMode() call below would fail with UnboundLocalError.
    corrector = correctors['dummy']

    maxWords = args.max_words

    print('[info] loading models')

    # Each branch also rebinds ``corrector``, so test mode uses the last
    # corrector requested, in this fixed order. The classes are referenced
    # only inside their branch so that an unused corrector is never touched.
    if args.hunspell:
        corrector = correctors['hunspell'] = HunspellCorrector(args.hunspell)

    if args.norvig:
        corrector = correctors['norvig'] = NorvigCorrector(args.norvig)

    if args.context:
        corrector = correctors['context'] = ContextCorrector(args.context)

    if args.context_prototype:
        corrector = correctors['prototype'] = ContextPrototypeCorrector(
            args.context_prototype)

    if args.jamspell:
        corrector = correctors['jamspell'] = JamspellCorrector(args.jamspell)

    if args.test:
        return testMode(corrector)

    # Fixed seed, set before typos are generated.
    random.seed(42)
    print('[info] loading text')
    originalText = loadText(args.file)

    print('[info] generating typos')
    # Per the original note: randomly alters the original words and returns
    # them as a list of single words (not verifiable from this function).
    erroredText = generateTypos(originalText)

    # Typo injection must not change the number of items.
    assert len(originalText) == len(erroredText)

    # Per the original note: splits the text into sentences, dropping illegal
    # symbols and non-period punctuation; the period itself is not kept.
    originalSentences = generateSentences(originalText)
    erroredSentences = generateSentences(erroredText)

    assert len(originalSentences) == len(erroredSentences)

    print('[info] total words: %d' % len(originalText))
    print('[info] evaluating')

    results = {}

    for correctorName, corrector in correctors.items():
        errorsRate, fixRate, broken, topNerr, topNfix, execTime = \
            evaluateCorrector(correctorName, corrector, originalSentences, erroredSentences, maxWords)
        results[
            correctorName] = errorsRate, fixRate, broken, topNerr, topNfix, execTime

    print('')

    print('[info] %12s %8s  %8s  %8s  %8s  %8s  %8s' %
          ('', 'errRate', 'fixRate', 'broken', 'topNerr', 'topNfix', 'time'))
    # Rows are ordered by the whole result tuple, ascending: errorsRate
    # first, with the later fields only breaking ties.
    for name, row in sorted(results.items(), key=lambda x: x[1]):
        print('[info] %10s  %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2fs' % \
              (name,
               100.0 * row[0],
               100.0 * row[1],
               100.0 * row[2],
               100.0 * row[3],
               100.0 * row[4],
               row[5]))