Code Example #1
import os

from symspellpy import SymSpell


class SegmentText:
    def __init__(self,
                 dictionary_path=None,
                 bigram_path=None):

        self.name = "SegmentText"
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # dictionary_path = pkg_resources.resource_filename(
        #     "symspellpy", "frequency_dictionary_en_82_765.txt")
        if dictionary_path is not None:
            self.dictionary_path = dictionary_path
        else:
            self.dictionary_path = os.path.join("./symspellfre_", "frequency_dictionary_en_82_765.txt")
    
        if bigram_path is not None:
            self.bigram_path = bigram_path
        else:
            self.bigram_path = os.path.join("./symspellfre_", "frequency_bigramdictionary_en_243_342.txt")
            # self.bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    
    def split(self, sentence):
        # lookup suggestions for multi-word input strings (supports compound
        # splitting & merging)
        # input_term = ("in te dhird qarter oflast jear he hadlearned ofca sekretplan")
        # input_term = ("in te dhird qarter oflast jear he hadlearned ofca sekretplan eoy")
        # max edit distance per lookup (per single word, not per whole input string)
        suggestions = self.sym_spell.lookup_compound(sentence, max_edit_distance=2)
        # display suggestion term, edit distance, and term frequency
        for suggestion in suggestions:
            print(suggestion)
        return suggestions
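
A minimal usage sketch for the class above (not from the original project; it assumes the two frequency files resolve to existing paths):

# Hypothetical usage of SegmentText; the sample sentence is illustrative.
segmenter = SegmentText()
results = segmenter.split("thequickbrownfox jumpsover thelazydog")
if results:
    # lookup_compound returns SuggestItem objects; .term holds the corrected text
    print(results[0].term)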
Code Example #2
class SpellCorrect():
    def __init__(self,
                 dictionary_path=dictionary_path__,
                 bigram_path=bigram_path__):

        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        if self.is_valid_path(dictionary_path) and self.is_valid_path(
                bigram_path):
            self.sym_spell.load_dictionary(dictionary_path,
                                           term_index=0,
                                           count_index=1)
            self.sym_spell.load_bigram_dictionary(bigram_path,
                                                  term_index=0,
                                                  count_index=2)
            self.load_status = True
        else:
            self.load_status = False
        self.name = "Spell Corrector"

    def is_valid_path(self, path_file):
        if not os.path.exists(path_file):
            logging.error("{} does not exist".format(path_file))
            return False
        return True

    def correct(self, sentence):
        if self.load_status:
            # max edit distance per lookup (per single word, not per whole input string)
            suggestions = self.sym_spell.lookup_compound(sentence,
                                                         max_edit_distance=2)
            # display suggestion term, edit distance, and term frequency
            for suggestion in suggestions:
                return suggestion.term
        return self.load_status
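
A hedged usage sketch for SpellCorrect; the paths below are the standard symspellpy frequency files and are passed explicitly because dictionary_path__ and bigram_path__ are module-level names defined elsewhere in the original project:

# Hypothetical usage of SpellCorrect with explicit (illustrative) paths.
corrector = SpellCorrect(
    dictionary_path="frequency_dictionary_en_82_765.txt",
    bigram_path="frequency_bigramdictionary_en_243_342.txt")
if corrector.load_status:
    print(corrector.correct("whereis th elove"))  # expected: "where is the love"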
Code Example #3
import logging

from symspellpy import SymSpell, Verbosity


class SymSpellChecker(object):
    def __init__(self):
        self.checker = SymSpell(max_dictionary_edit_distance=2)
        self.checker.load_dictionary(
            '/home/citao/github/symspellpy/frequency_dictionary_en_82_765.txt',
            0, 1)
        self.checker.load_bigram_dictionary(
            '/home/citao/github/symspellpy/frequency_bigramdictionary_en_243_342.txt',
            0, 2)

    def correct(self, word):
        suggestions = self.checker.lookup(word,
                                          Verbosity.CLOSEST,
                                          max_edit_distance=2)
        for suggestion in suggestions:
            cor_word = suggestion.term
            logging.info('Spell check: [{}] -> [{}]'.format(word, cor_word))
            return cor_word
        return word

    def correct_text(self, text):
        cor_list = []
        for word in text.split(' '):
            suggestions = self.checker.lookup(word,
                                              Verbosity.CLOSEST,
                                              max_edit_distance=2)
            cor_flag = False
            for suggestion in suggestions:
                cor_word = suggestion.term
                cor_list.append(cor_word)
                cor_flag = True
                break
            if not cor_flag:
                cor_list.append(word)
        return ' '.join(cor_list)
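
Illustrative calls for the class above; they only work if the hard-coded dictionary paths exist on the machine:

# Hypothetical usage of SymSpellChecker.
checker = SymSpellChecker()
print(checker.correct("memebers"))                       # single-word lookup
print(checker.correct_text("tehre are fewer memebers"))  # word-by-word correction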
Code Example #4
def load_name_corection(dictionary_path, bigram_path):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary(dictionary_path,
                              term_index=0,
                              count_index=1,
                              encoding='utf-8')
    sym_spell.load_bigram_dictionary(bigram_path,
                                     term_index=0,
                                     count_index=2,
                                     encoding='utf-8')
    return sym_spell
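
A usage sketch for load_name_corection; the dictionary file names mirror the ones used elsewhere in this project and are assumptions here:

# Hypothetical usage of load_name_corection with assumed dictionary file names.
sym_spell = load_name_corection("freq_name_dic.txt", "freq_name_bigram.txt")
for suggestion in sym_spell.lookup_compound("Ngyễn tành nm", max_edit_distance=2):
    print(suggestion.term, suggestion.distance, suggestion.count)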
Code Example #5
def load_symspell():
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    return sym_spell
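
The helper returns a ready-to-use SymSpell instance; a minimal call might look like this:

# Hypothetical usage of load_symspell().
sym_spell = load_symspell()
for suggestion in sym_spell.lookup_compound("whereis th elove", max_edit_distance=2):
    print(suggestion)  # prints "term, distance, count"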
Code Example #6
File: test_symspellpy.py Project: youikim/symspellpy
    def test_lookup_compound_ignore_non_words(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

        typo = ("whereis th elove 123 hehad dated forImuch of THEPAST who "
                "couqdn'tread in SIXTHgrade and ins pired him")
        correction = ("where is the love 123 he had dated for much of THEPAST "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the DHIRD 1 quarter of last year he had learned "
                      "of a secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
                "of 12 funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with PLETY of 12 fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible 1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible 1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible AB1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible AB1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "PI on leave, arrange Co-I to do screening"
        correction = "PI on leave arrange co i to do screening"
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
Code Example #7
    def test_lookup_compound_transfer_casing(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

        typo = ("Whereis th elove hehaD Dated forImuch of thepast who "
                "couqdn'tread in sixthgrade AND ins pired him")
        correction = ("Where is the love he haD Dated for much of the past "
                      "who couldn't read in sixth grade AND inspired him")

        results = sym_spell.lookup_compound(typo,
                                            edit_distance_max,
                                            transfer_casing=True)
        self.assertEqual(correction, results[0].term)
Code Example #8
 def test_load_bigram_dictionary_invalid_path(self):
     edit_distance_max = 2
     prefix_length = 7
     sym_spell = SymSpell(edit_distance_max, prefix_length)
     self.assertEqual(
         False,
         sym_spell.load_bigram_dictionary("invalid/dictionary/path.txt", 0,
                                          2))
Code Example #9
 def test_load_bigram_dictionary_bad_dict(self):
     dictionary_path = os.path.join(self.fortests_path, "bad_dict.txt")
     edit_distance_max = 2
     prefix_length = 7
     sym_spell = SymSpell(edit_distance_max, prefix_length)
     self.assertEqual(
         True, sym_spell.load_bigram_dictionary(dictionary_path, 0, 2))
     self.assertEqual(2, len(sym_spell.bigrams))
     self.assertEqual(12, sym_spell.bigrams["rtyu tyui"])
     self.assertEqual(13, sym_spell.bigrams["yuio uiop"])
Code Example #10
class WordCorrector(LogicAdapter):

    def __init__(self, chatbot, **kwargs):
        super().__init__(chatbot, **kwargs)
        self.language = kwargs.get('language', languages.ENG)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
        # TODO: digits are missing from this dictionary; it needs modifying
        self.bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    def can_process(self, statement):
        try:
            if " " in statement.text.lower():
                return False
            else:
                response = self.process(statement)
                return response.confidence == 1
        except Exception:
            return False


    def process(self, statement, additional_response_selection_parameters=None):
        input_text = statement.text
        suggestions = self.sym_spell.lookup_compound(input_text, max_edit_distance=2)

        # default response in case lookup_compound returns no suggestions
        response = Statement(text="")
        response.confidence = 0
        for suggestion in suggestions:
            expression = "Do you mean \"" + suggestion.term + "\""
            if input_text == suggestion.term:
                expression = ""
            response = Statement(text=expression)
            response.confidence = 1
            # TODO: originally confidence was 0 when the correction matched the input
            # and 1 when it differed; left like this while working on something else
        return response
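
The adapter above is meant to be registered with a ChatterBot instance; a minimal, hypothetical wiring, assuming the class lives in an importable module named word_corrector:

# Hypothetical ChatterBot wiring; "word_corrector.WordCorrector" is an assumed
# import path for the class above.
from chatterbot import ChatBot

bot = ChatBot("demo", logic_adapters=["word_corrector.WordCorrector"])
print(bot.get_response("sekretplan").text)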
Code Example #11
class Spell_Checker():
    def __init__(self):
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
        self.bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    def Correct_It(self, data):
        suggestions = self.sym_spell.lookup_compound(data, max_edit_distance=2,
                                            transfer_casing=True)

        clean_data = list()
        for suggestion in suggestions:
            clean_data.append(str(suggestion.term))

        correct_data = " ".join(clean_data)

        return correct_data
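
Illustrative usage of the wrapper above (transfer_casing keeps the input casing in the suggestion):

# Hypothetical usage of Spell_Checker.
checker = Spell_Checker()
print(checker.Correct_It("Can yu readtHIS messa ge"))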
Code Example #12
 def test_load_bigram_dictionary_separator(self):
     dictionary_path = os.path.join(self.fortests_path,
                                    "separator_dict.txt")
     edit_distance_max = 2
     prefix_length = 7
     sym_spell = SymSpell(edit_distance_max, prefix_length)
     self.assertEqual(
         True, sym_spell.load_bigram_dictionary(dictionary_path, 0, 1, "$"))
     self.assertEqual(5, len(sym_spell.bigrams))
     self.assertEqual(23135851162, sym_spell.bigrams["the"])
     self.assertEqual(13151942776, sym_spell.bigrams["of"])
     self.assertEqual(10956800, sym_spell.bigrams["abcs of"])
     self.assertEqual(10721728, sym_spell.bigrams["aaron and"])
     self.assertEqual(12997637966, sym_spell.bigrams["and"])
Code Example #13
def init_symspell():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 1  # was previously 0 here
    prefix_length = 100
    # create object
    # sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(
            bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return

    return sym_spell
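
init_symspell returns None when a dictionary file is missing, so callers should check the result; an illustrative call:

# Hypothetical usage of init_symspell(); assumes Verbosity is imported from
# symspellpy, and uses max_edit_distance=1 to match the dictionary setting above.
sym_spell = init_symspell()
if sym_spell is not None:
    for suggestion in sym_spell.lookup("memebers", Verbosity.CLOSEST,
                                       max_edit_distance=1):
        print(suggestion.term)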
Code Example #14
File: symspellpy.py Project: NoerNova/pythainlp
from typing import List

from symspellpy import SymSpell, Verbosity
from pythainlp.corpus import get_corpus_path, path_pythainlp_corpus
from pythainlp.tokenize import word_tokenize

_UNIGRAM = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(path_pythainlp_corpus(_UNIGRAM),
                          0,
                          1,
                          separator='\t',
                          encoding="utf-8-sig")
sym_spell.load_bigram_dictionary(get_corpus_path(_BIGRAM),
                                 0,
                                 2,
                                 separator='\t',
                                 encoding="utf-8-sig")


def spell(text: str, max_edit_distance: int = 2) -> List[str]:
    return [
        suggestion.term for suggestion in sym_spell.lookup(
            text, Verbosity.CLOSEST, max_edit_distance=max_edit_distance)
    ]


def correct(text: str, max_edit_distance: int = 1) -> str:
    return spell(text, max_edit_distance=max_edit_distance)[0]
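
Both helpers operate on a single Thai token; a hedged example (the sample word is an assumption and should be replaced with the text to check):

# Hypothetical usage of the Thai spell helpers above.
word = "ความรัก"  # assumed to be present in the TNC frequency dictionary
candidates = spell(word)
print(candidates)
if candidates:
    print(correct(word))  # best candidate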
Code Example #15
    def test_lookup_compound_replaced_words(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        replacement_1 = {
            "whereis": "where is",
            "th": "the",
            "elove": "love",
            "hehad": "he had",
            "forimuch": "for much",
            "thepast": "the past",
            "couqdn'tread": "couldn't read",
            "sixthgrade": "sixth grade",
            "ins": "in"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
        for k, v in replacement_1.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        replacement_2 = {
            "te": "the",
            "dhird": "third",
            "qarter": "quarter",
            "oflast": "of last",
            "jear": "year",
            "hadlearned": "had learned",
            "ofca": "of a",
            "sekretplan": "secret plan"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2),
            len(sym_spell.replaced_words))
        for k, v in replacement_2.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        replacement_3 = {
            "bigjest": "biggest",
            "playrs": "players",
            "strogsommer": "strong summer",
            "slatew": "slate",
            "ith": "with",
            "plety": "plenty",
            "funn": "fun"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2) + len(replacement_3),
            len(sym_spell.replaced_words))
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        for k, v in replacement_3.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)
Code Example #16
def export():
    import os
    import torch
    import zipfile
    import torchaudio
    from glob import glob
    from flask import Response  # used for the HTTP response at the end

    device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
    model, decoder, utils = torch.hub.load('snakers4/silero-models',
                                        model='silero_stt',
                                        language='en')
    (read_batch, split_into_batches,
    read_audio, prepare_model_input) = utils  # see function signature for details
    
    
    os.system("ffmpeg -i 'video.mp4' -vn -acodec copy audio.aac")
    os.system("ffmpeg -i audio.aac audio.wav")


    # download a single file, any format compatible with TorchAudio (soundfile backend)
    # torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',
    #                                dst ='speech_orig.wav', progress=True)
    test_files = glob('audio.wav') 
    batches = split_into_batches(test_files, batch_size=10)
    input = prepare_model_input(read_batch(batches[0]))

    text = ""
    output = model(input)
    for example in output:
        pred = decoder(example.cpu())
        text = text + pred
        
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt")
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt")



    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    # input_term = ("whereis th elove hehad dated forImuch of thepast who "
    #              "couqdn'tread in sixtgrade and ins pired him")
    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print(suggestion)

    # keep only the corrected text; str(suggestion) would also include the
    # edit distance and count
    if suggestions:
        text = suggestions[0].term


    cnt = 0
    textlines = []
    while cnt < len(text.split(" ")):
        print(text.split(" ")[cnt:cnt+5])
        line = "\n" + " ".join(text.split(" ")[cnt:cnt+5])
        textlines.append(line)
        cnt += 5
        
        
    f = open("script_cleaned.txt", "a")
    f.writelines(textlines)
    f.close()


    os.system("python -m aeneas.tools.execute_task \
        audio.wav \
        script_cleaned.txt \
        'task_language=eng|os_task_file_format=srt|is_text_type=plain' \
        subtitles.srt")



    with open("subtitles.srt") as f:
        srt = f.read()
        
    return Response(
        srt,
        mimetype="text/srt",
        headers={
            "Content-disposition": "attachment; filename=subtitiles.srt"
        }
    )
Code Example #17
class MaskTextSpotter(object):
    def __init__(self,
                 cfg,
                 confidence_threshold=0.7,
                 min_image_size=224,
                 output_polygon=True,
                 spellfix=True):
        self.cfg = cfg.clone()
        self.model = build_detection_model(cfg)
        self.model.eval()
        self.device = torch.device(cfg.MODEL.DEVICE)
        self.model.to(self.device)
        self.min_image_size = min_image_size

        self.spellfix = spellfix

        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")

        bigram_dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

        self.sym_spell.load_dictionary(dictionary_path,
                                       term_index=0,
                                       count_index=1)

        self.sym_spell.load_bigram_dictionary(bigram_dictionary_path,
                                              term_index=0,
                                              count_index=2)

        checkpointer = DetectronCheckpointer(cfg, self.model)
        if len(cfg.MODEL.WEIGHT):
            import logging
            logging.info('loading MaskTextSpotter from %s' % cfg.MODEL.WEIGHT)
            _ = checkpointer.load(cfg.MODEL.WEIGHT)

        self.transforms = self.build_transform()
        self.cpu_device = torch.device("cpu")
        self.confidence_threshold = confidence_threshold
        self.output_polygon = output_polygon

    def build_transform(self):
        """
        Creates a basic transformation that was used to train the models
        """
        cfg = self.cfg
        # we are loading images with OpenCV, so we don't need to convert them
        # to BGR, they are already! So all we need to do is to normalize
        # by 255 if we want to convert to BGR255 format, or flip the channels
        # if we want it to be in RGB in [0-1] range.
        if cfg.INPUT.TO_BGR255:
            to_bgr_transform = T.Lambda(lambda x: x * 255)
        else:
            to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]])

        normalize_transform = T.Normalize(mean=cfg.INPUT.PIXEL_MEAN,
                                          std=cfg.INPUT.PIXEL_STD)

        transform = T.Compose([
            T.ToPILImage(),
            T.Resize(self.min_image_size),
            T.ToTensor(),
            to_bgr_transform,
            normalize_transform,
        ])
        return transform

    def run_on_opencv_image(self, image):
        """
        Arguments:
            image (np.ndarray): an image as returned by OpenCV
        Returns:
            result_polygons (list): detection results
            result_words (list): recognition results
        """
        result_polygons, result_words, result_dict = self.compute_prediction(
            image)
        return result_polygons, result_words, result_dict

    def run_on_pillow_image(self, image):
        arr = np.array(image, dtype=np.uint8)
        result_polygons, result_words, result_dict = self.run_on_opencv_image(
            arr)
        return result_polygons, result_words, result_dict

    def compute_prediction(self, original_image):
        def spell_fix(wd):
            if self.spellfix:
                new_word = [
                    s.term for s in self.sym_spell.lookup(wd,
                                                          Verbosity.CLOSEST,
                                                          max_edit_distance=2,
                                                          include_unknown=True)
                ][0]
            else:
                new_word = wd
            return new_word

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i + n]

        def mk_direction(char_polygons):
            def centroid(char_polygon):
                centroid = Polygon(list(chunks(char_polygon,
                                               2))).centroid.coords
                return list(centroid)[0]

            first, last = char_polygons[0], char_polygons[-1]
            start, end = centroid(first), centroid(last)
            if start[0] == end[0]:
                end = (end[0] + 1, end[1])
            return start, end

        def line_detection(dicts, char_ratio=1.5):
            # box  [x1, y1, x2, y2]
            sorted_res = sorted(dicts, key=lambda d: d["box"][0])
            lines = dict()

            def point_in_next_word(word):
                width = word["box"][2] - word["box"][0]  # width = x2 - x1
                avg_char_width = width / float(len(word["seq_word"]))
                last_right_border = word["box"][2]
                next_word_pos_x = last_right_border + char_ratio * avg_char_width
                next_word_pos_y = word["box"][1]
                direction = word["direction"]
                point = Point(next_word_pos_x, next_word_pos_y)
                line = LineString(direction)
                x = np.array(point.coords[0])
                u = np.array(line.coords[0])
                v = np.array(line.coords[len(line.coords) - 1])
                n = v - u
                n /= np.linalg.norm(n, 2)
                P = u + n * np.dot(x - u, n)
                return (int(P[0]), int(P[1]))

            def distance_to_mid(word_point, word_box):
                point = Point(word_point["next_point"])
                box = word_box["box"]
                return abs(point.y -
                           (box[1] + box[3]) / 2.0)  # abs( y - (y2+y1)/2 )

            def find_next_word(word, index, sorted_words):
                next_point = Point(word["next_point"])
                next_words = [
                    other for other in sorted_words[index + 1:] if Polygon(
                        chunks(other["polygon"], 2)).contains(next_point)
                ]
                if next_words:
                    return min(next_words,
                               key=lambda x: distance_to_mid(word, x))
                else:
                    return None

            def find_previous_word(prev, word):
                if "previous_word" not in word.keys():
                    return prev
                else:
                    return min(prev,
                               word["previous_word"],
                               key=lambda x: distance_to_mid(x, word))

            for w in sorted_res:
                w["next_point"] = point_in_next_word(w)

            for i, w in enumerate(sorted_res):
                next_word = find_next_word(w, i, sorted_res)
                w["next_word"] = None
                if next_word:
                    better_previous = find_previous_word(w, next_word)
                    if better_previous == w:
                        w["next_word"] = next_word
                        if "previous_word" in next_word.keys():
                            next_word["previous_word"]["next_word"] = None
                        next_word["previous_word"] = w

            for w in sorted_res:
                if "previous_word" not in w.keys():
                    a = w
                    key_y = a["box"][1]
                    while key_y in lines.keys():
                        key_y = key_y + 1
                    lines[key_y] = [a]
                    while a["next_word"]:
                        a = a["next_word"]
                        lines[key_y].append(a)

            sorted_lines = sorted(lines.items(), key=lambda x: x[0])
            return ",".join([
                " ".join([w["seq_word"] for w in line])
                for _, line in sorted_lines
            ]), sorted_lines

        # apply pre-processing to image
        import datetime, time
        start_time = time.time()
        # print('transform', datetime.datetime.now())
        image = self.transforms(original_image)
        # convert to an ImageList, padded so that it is divisible by
        # cfg.DATALOADER.SIZE_DIVISIBILITY
        # print('to image list', datetime.datetime.now())
        image_list = to_image_list(image,
                                   self.cfg.DATALOADER.SIZE_DIVISIBILITY)
        image_list = image_list.to(self.device)
        # compute predictions
        with torch.no_grad():
            # print('predict', datetime.datetime.now())
            self.model.eval()
            predictions, _, _ = self.model(image_list)
            if not predictions or len(predictions) < 1:
                # print('no text detected')
                return [], [], {'label': '', 'details': []}
        # print('post process', datetime.datetime.now())
        global_predictions = predictions[0]
        char_predictions = predictions[1]
        char_mask = char_predictions['char_mask']
        char_boxes = char_predictions['boxes']
        words, rec_scores, rec_char_scores, char_polygons = self.process_char_mask(
            char_mask, char_boxes)
        detailed_seq_scores = char_predictions['detailed_seq_scores']
        seq_words = char_predictions['seq_outputs']
        seq_scores = char_predictions['seq_scores']
        global_predictions = [
            o.to(self.cpu_device) for o in global_predictions
        ]

        # always single image is passed at a time
        global_prediction = global_predictions[0]

        # reshape prediction (a BoxList) into the original image size
        height, width = original_image.shape[:-1]
        test_image_width, test_image_height = global_prediction.size
        global_prediction = global_prediction.resize((width, height))
        resize_ratio = float(height) / test_image_height
        boxes = global_prediction.bbox.tolist()
        scores = global_prediction.get_field("scores").tolist()
        masks = global_prediction.get_field("mask").cpu().numpy()

        result_polygons = []
        result_words = []
        result_dicts = []

        for k, box in enumerate(boxes):
            score = scores[k]
            if score < self.confidence_threshold:
                continue
            box = list(map(int, box))
            mask = masks[k, 0, :, :]
            polygon = self.mask2polygon(mask,
                                        box,
                                        original_image.shape,
                                        threshold=0.5,
                                        output_polygon=self.output_polygon)

            if polygon is None:
                polygon = [
                    box[0], box[1], box[2], box[1], box[2], box[3], box[0],
                    box[3]
                ]
            result_polygons.append(polygon)
            word = words[k]
            rec_score = rec_scores[k]
            char_score = rec_char_scores[k]
            seq_word = seq_words[k]
            seq_char_scores = seq_scores[k]
            seq_score = sum(seq_char_scores) / float(len(seq_char_scores))
            # spell_fix = lambda word: \
            #     [s.term for s in sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)][
            #         0]
            detailed_seq_score = detailed_seq_scores[k]
            detailed_seq_score = np.squeeze(np.array(detailed_seq_score),
                                            axis=1)
            # if 'total_text' in output_folder or 'cute80' in output_folder:
            #     result_log = [int(x * 1.0) for x in box[:4]] + polygon + [word] + [seq_word] + [score] + [rec_score] + [
            #         seq_score] + [char_score] + [detailed_seq_score] + [len(polygon)]
            # else:
            result_log = [int(x * 1.0) for x in box[:4]] + polygon + [word] + [
                seq_word
            ] + [score] + [rec_score] + [seq_score] + [char_score] + [
                detailed_seq_score
            ]
            # result_logs.append(result_log)
            if len(seq_word) > 0 and len(char_polygons[k]) > 0:
                d = {
                    "seq_word":
                    seq_word if len(seq_word) < 4 else spell_fix(seq_word),
                    "seq_word_orig":
                    seq_word,
                    "direction":
                    mk_direction([[int(c * resize_ratio) for c in p]
                                  for p in char_polygons[k]]),
                    "word":
                    word if len(word) < 4 else spell_fix(word),
                    "word_orig":
                    word,
                    "box": [int(x * 1.0) for x in box[:4]],
                    "polygon":
                    polygon,
                    "prob":
                    score * seq_score
                }
                result_words.append(d['seq_word'])
                result_dicts.append(d)

        # default_logger.debug('done', datetime.datetime.now())
        label, details = line_detection(result_dicts)
        end_time = time.time()
        # default_logger.debug('cost time: %s' % (end_time - start_time))
        line_result = {'label': label, 'details': details}
        # line_result_words = []
        # line_result_polygons = []
        # for ocr_detail in line_result['details']:
        #     pass
        # line_result_words = [a[1][0]['seq_word'] for a in line_result['details']]
        # line_result_polygons = [a[1][0]['polygon'] for a in line_result['details']]
        line_result_words = [a['seq_word'] for a in result_dicts]
        line_result_polygons = [a['polygon'] for a in result_dicts]
        # return result_polygons, result_words, line_result
        return line_result_polygons, line_result_words, line_result

    # def process_char_mask(self, char_masks, boxes, threshold=192):
    #     texts, rec_scores = [], []
    #     for index in range(char_masks.shape[0]):
    #         box = list(boxes[index])
    #         box = list(map(int, box))
    #         text, rec_score, _, _ = getstr_grid(char_masks[index, :, :, :].copy(), box, threshold=threshold)
    #         texts.append(text)
    #         rec_scores.append(rec_score)
    #     return texts, rec_scores

    def process_char_mask(self, char_masks, boxes, threshold=192):
        texts, rec_scores, rec_char_scores, char_polygons = [], [], [], []
        for index in range(char_masks.shape[0]):
            box = list(boxes[index])
            box = list(map(int, box))
            text, rec_score, rec_char_score, char_polygon = getstr_grid(
                char_masks[index, :, :, :].copy(), box, threshold=threshold)
            texts.append(text)
            rec_scores.append(rec_score)
            rec_char_scores.append(rec_char_score)
            char_polygons.append(char_polygon)
            # segmss.append(segms)
        return texts, rec_scores, rec_char_scores, char_polygons

    def mask2polygon(self,
                     mask,
                     box,
                     im_size,
                     threshold=0.5,
                     output_polygon=True):
        # mask 32*128
        image_width, image_height = im_size[1], im_size[0]
        box_h = box[3] - box[1]
        box_w = box[2] - box[0]
        cls_polys = (mask * 255).astype(np.uint8)
        poly_map = np.array(Image.fromarray(cls_polys).resize((box_w, box_h)))
        poly_map = poly_map.astype(np.float32) / 255
        poly_map = cv2.GaussianBlur(poly_map, (3, 3), sigmaX=3)
        ret, poly_map = cv2.threshold(poly_map, 0.5, 1, cv2.THRESH_BINARY)
        if output_polygon:
            SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            poly_map = cv2.erode(poly_map, SE1)
            poly_map = cv2.dilate(poly_map, SE1)
            poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1)
            try:
                _, contours, _ = cv2.findContours(
                    (poly_map * 255).astype(np.uint8), cv2.RETR_LIST,
                    cv2.CHAIN_APPROX_NONE)
            except ValueError:  # OpenCV versions differ in findContours return values
                contours, _ = cv2.findContours(
                    (poly_map * 255).astype(np.uint8), cv2.RETR_LIST,
                    cv2.CHAIN_APPROX_NONE)
            if len(contours) == 0:
                print(contours)
                print(len(contours))
                return None
            max_area = 0
            max_cnt = contours[0]
            for cnt in contours:
                area = cv2.contourArea(cnt)
                if area > max_area:
                    max_area = area
                    max_cnt = cnt
            perimeter = cv2.arcLength(max_cnt, True)
            epsilon = 0.01 * perimeter
            approx = cv2.approxPolyDP(max_cnt, epsilon, True)
            pts = approx.reshape((-1, 2))
            pts[:, 0] = pts[:, 0] + box[0]
            pts[:, 1] = pts[:, 1] + box[1]
            polygon = list(pts.reshape((-1, )))
            polygon = list(map(int, polygon))
            if len(polygon) < 6:
                return None
        else:
            SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            poly_map = cv2.erode(poly_map, SE1)
            poly_map = cv2.dilate(poly_map, SE1)
            poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1)
            idy, idx = np.where(poly_map == 1)
            xy = np.vstack((idx, idy))
            xy = np.transpose(xy)
            hull = cv2.convexHull(xy, clockwise=True)
            # reverse order of points.
            if hull is None:
                return None
            hull = hull[::-1]
            # find minimum area bounding box.
            rect = cv2.minAreaRect(hull)
            corners = cv2.boxPoints(rect)
            corners = np.array(corners, dtype="int")
            pts = get_tight_rect(corners, box[0], box[1], image_height,
                                 image_width, 1)
            polygon = [x * 1.0 for x in pts]
            polygon = list(map(int, polygon))
        return polygon

    def visualization(self, img, polygons, words):
        cur_img = copy.deepcopy(img)
        for polygon, word in zip(polygons, words):
            pts = np.array(polygon, np.int32)
            pts = pts.reshape((-1, 1, 2))
            xmin = min(pts[:, 0, 0])
            ymin = min(pts[:, 0, 1])
            r = random.randint(0, 255)
            g = random.randint(0, 255)
            b = random.randint(0, 255)
            cv2.polylines(cur_img, [pts], True, (b, g, r))
            cv2.putText(cur_img, word, (xmin, ymin), cv2.FONT_HERSHEY_TRIPLEX,
                        0.5, (b, g, r), 1)
        return cur_img
Code Example #18
    def test_lookup_compound(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

        typo = "whereis th elove"
        correction = "where is the love"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(2, results[0].distance)
        self.assertEqual(585, results[0].count)

        typo = "the bigjest playrs"
        correction = "the biggest players"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(2, results[0].distance)
        self.assertEqual(34, results[0].count)

        typo = "Can yu readthis"
        correction = "can you read this"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(3, results[0].distance)
        self.assertEqual(11440, results[0].count)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(0, results[0].count)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(0, results[0].count)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(0, results[0].count)

        typo = ("Can yu readthis messa ge despite thehorible sppelingmsitakes")
        correction = ("can you read this message despite the horrible "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(10, results[0].distance)
        self.assertEqual(0, results[0].count)
Code Example #19
class spellchecker:
    def __init__(
        self,
        max_dictionary_edit_distance,
        prefix_length,
        unigram_freq_file,
        bigram_freq_file=None,
        pickle_file=None,
    ):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )

        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file, )
        else:
            self.sym_spell.load_dictionary(
                unigram_freq_file,
                term_index=0,
                count_index=1,
                encoding="utf-8",
            )

            if bigram_freq_file:
                self.sym_spell.load_bigram_dictionary(
                    bigram_freq_file,
                    term_index=0,
                    count_index=2,
                    encoding="utf-8",
                )

    def suggest(
        self,
        word,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        # defaults
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup(
            word,
            verbosity,
            max_edit_distance=max_edit_dist,
            include_unknown=include_unknown,
        )
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(
        self,
        phrase,
        max_edit_dist=None,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup_compound(
            phrase,
            max_edit_distance=max_edit_dist,
            # ignore_non_words=False,
            # split_phrase_by_space=True,
        )
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        return tokenize_sentence(phrases)

    # Tokenize into individual phrases and return a list of suggestions for each
    def suggest_tokenize(
        self,
        phrases,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        words = self.tokenize(phrases)

        sentence_suggestions = []
        for word in words:
            suggestions = self.sym_spell.lookup(
                word,
                verbosity,
                max_edit_distance=max_edit_dist,
                include_unknown=include_unknown,
            )
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })

        return sentence_suggestions
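
A sketch of how the wrapper might be instantiated; the frequency file names are the standard symspellpy ones and are assumptions here, and max_edit_dist is passed explicitly so the module-level DEFAULT_MAX_EDIT_DISTANCE is not needed:

# Hypothetical usage of the spellchecker wrapper above.
checker = spellchecker(
    max_dictionary_edit_distance=2,
    prefix_length=7,
    unigram_freq_file="frequency_dictionary_en_82_765.txt",
    bigram_freq_file="frequency_bigramdictionary_en_243_342.txt",
)
print(checker.suggest("memebers", max_edit_dist=2))
print(checker.suggest_compound("whereis th elove", max_edit_dist=2))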
Code Example #20
File: preprocessing.py Project: rockdrigoma/texthero
warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")

TROPICAL_PATH = "tropical_dic.json"
FREQ_DICT_PATH = "frequency_dictionary_es_82_765.txt"
BIGRAM_PATH = "frequency_bigramdictionary_es_1Mnplus.txt"

with open(TROPICAL_PATH, "r") as file:
    tropical_dic = json.load(file)

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(FREQ_DICT_PATH, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(BIGRAM_PATH, term_index=0, count_index=2)

FIRST_INT = 11111111111111
LAST_INT = 99999999999999

PLACEHOLDERS_DICT = {}


@InputSeries(TextSeries)
def fillna(s: TextSeries) -> TextSeries:
    """
    Replaces not-assigned values with an empty string.


    Examples
    --------
Code Example #21
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy",
                                                  "freq_name_dic.txt")
bigram_path = pkg_resources.resource_filename("symspellpy",
                                              "freq_name_bigram.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path,
                          term_index=0,
                          count_index=1,
                          encoding='utf-8')
# sym_spell.load_dictionary('C:/Users/nt.anh6/PycharmProjects/aicr_vn/nlp_model/spell_checker/dict/vi_full.txt', term_index=0, count_index=1, encoding='utf-8')
sym_spell.load_bigram_dictionary(bigram_path,
                                 term_index=0,
                                 count_index=2,
                                 encoding='utf-8')

# lookup suggestions for multi-word input strings (supports compound
# splitting & merging)
input_term = "Ngyễn tành nm"
# max edit distance per lookup (per single word, not per whole input string)
# suggestions = sym_spell.lookup(input_term, Verbosity.ALL, max_edit_distance=2, include_unknown=True)
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
# display suggestion term, edit distance, and term frequency
for suggestion in suggestions:
    print(suggestion)


def load_name_corection(dictionary_path, bigram_path):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
Code Example #22
import pytesseract
import cv2
import pkg_resources
import malaya
import main
from flask import Flask
from spellchecker import SpellChecker  # assumption: the pyspellchecker package
from symspellpy import SymSpell

spell = SpellChecker()
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
# prob_corrector = malaya.spell.symspell()
prob_corrector = malaya.spell.probability()

app = Flask(__name__)
app.config['DEBUG'] = True


class FilePaths:
    "filenames and paths to data"
    fnCharList = 'model/charList.txt'
    fnAccuracy = 'model/accuracy.txt'
    fnTrain = 'data/'
    fnCorpus = 'data/corpus.txt'

Code Example #23
File: preprocess.py Project: georgeyean/tm_analyzer
def test3():
    # from autocorrect import Speller
    # doc = docx.Document("Word docs_Peace/1_CTS_119_eng_text.docx")
    # result = [p.text for p in doc.paragraphs]
    #
    # spell = Speller(lang='en')
    #
    # for j in range(15):
    #     print(spell(result[j]))

    # import jamspell
    #
    # corrector = jamspell.TSpellCorrector()
    # corrector.LoadLangModel('en.bin')
    # text = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding all the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the Imperial Court, or in the Imperial Chamber,\n",
    #
    # text = corrector.FixFragment(text)
    # print(text)
    sys.path.append("treatyUtil")
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    from treatyUtil import spellcheck_keep_punctuation

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    input_term1 = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII.\
    According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred \
    Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir \
    Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the \
    account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either \
    Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, \
    or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, \
    both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding \
    all the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions \
    ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with \
    the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor \
    to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the\
    Imperial Court, or in the Imperial Chamber,\n"

    #input_term = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited "

    input_term = "God, and Safety of the Chriſtian World (the Electors,\nPrinces and States of the Sacred Roman Empire \
    being\npreſent, approving and conſenting) the Articles of Peace\nand Anity, whereof the Tenour follows.\n1. That \
    there be a Chriſtian, univerſal\nThe Re-efta. and perpetual Peace, and a true and ſincere\nbliſhment of Friendſhip and \
    Amity between his Sacred\nPeace and A. Imperial Majeſty, the Houſe of Austria,\nmity.\nand all his Allies and Adherents, \
    and the\nHeirs and Succeffors of each of them, chiefly the King\nof Spain, and the Electors, Princes and States of the En-\npire,\
    of the one ſide, and her Sacred Royal Majeſty,\nand the Kingdom of Sweden, her Allies and Adherents,\nand the Heirs and Succeſſors\
    of each of them, eſpecially\nthe moſt Chriſtian King, the reſpective Electors, Princes\nand States of the Empire, of the other ſide ; \
    and that this\nPeace be obſerv'd and cultivated ſincerely and ſeriouſly,\nſo that each Party may procure the Benefit, Honour and\nAdvantage \
    of one another, and thereby the Fruits of this\nPeace and Amity may be ſeen to grow up and fouriſh a-\nnew, by a ſure and reciprocal \
    maintaining of a good\nand faithful Neighbourhood between the Roman Empire\nand the Kingdom of Sweden reciprocally,\nII. That there be \
    on both ſides à perpe-\nAn Amneſty\ntua) Oblivion and Amneſty of all that has\nfrom all Hoffi- been done Since the beginning of theſe\nlity.\nTroubles, \
    in what Place or in what Man-\n"

    input_term2 = "God, and Safety of the Chriſtian World (the Electors,\nPrinces"
    input_term = re.sub("\n", " ", input_term)
    input_term = re.sub("- ", "", input_term)
    #input_term = re.sub("-", "", input_term)
    input_term = re.sub("ſ", "s", input_term)

    # word_split = re.compile(r"[^\W]+", re.U)
    # suggestions = sym_spell.lookup_compound((input_term), ignore_non_words=True, max_edit_distance=2)
    # for suggestion in suggestions:
    #    print(suggestion)
    #
    # corrected = suggestions[0].term
    # # This combined with split_phrase_by_space=True would be enough just to spell check
    # # but punctuation is lost.
    #
    # # The spell check is already done in 'corrected'. Now we just want to keep the punctuation.
    # in_list = word_split.findall(input_term)
    # chk_list = word_split.findall(corrected)
    # print(input_term)
    # print(corrected)
    # print(in_list)
    # print(chk_list)
    # pdb.set_trace()
    #
    # # To keep punctuation we take the original phrase and do word by word replacement
    # out_term = ""
    # outs  = input_term.split()
    # word_count = 0
    # for word in in_list:
    #     print(out_term)
    #     print(outs[word_count].lower(), word, chk_list[word_count])
    #     temp = outs[word_count].lower().replace(word, chk_list[word_count])
    #     word_count += 1
    #     out_term += temp+" "
    #
    # print(out_term)
    # return

    # max edit distance per lookup (per single word, not per whole input string)
    #pdb.set_trace()
    #print(spellcheck_keep_punctuation(input_term))
    suggestions = sym_spell.lookup_compound((input_term),
                                            transfer_casing=True,
                                            ignore_non_words=True,
                                            max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    #print(suggestions)
    for suggestion in suggestions:
        print(suggestion)