Example #1
    def __init__(self, modelFile):
        super(JamspellCorrector, self).__init__()
        import jamspell
        self.model = jamspell.TSpellCorrector()
        # self.model.SetPenalty(16.0, 0.0)
        if not self.model.LoadLangModel(modelFile):
            raise Exception('wrong model file: %s' % modelFile)
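A minimal usage sketch for the wrapper above, assuming the enclosing class is named JamspellCorrector (as the super() call suggests), that callers reach the loaded corrector through self.model, and that a local en.bin model file exists:

# Hedged sketch: the en.bin path is a placeholder, not part of the original example.
corrector = JamspellCorrector('en.bin')
fixed = corrector.model.FixFragment('I am the begt spell cherken')
print(fixed)  # prints the corrected fragment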
Example #2
import re

import jamspell
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import casual_tokenize


def spell_tokenizer(text):
    """
    Perform word tokenization using casual_tokenize after spelling correction
    :param text: string without punctuation
    :return: list of tokens
    """
    tokens = []
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('model_en.bin')

    # rm_punctuation is a project-level helper (not shown here)
    for word in casual_tokenize(rm_punctuation(text), preserve_case=False, reduce_len=True, strip_handles=True):
        # skip tokens that contain digits
        if not re.search(r'\d', word):
            corr_word = corrector.GetCandidates([word], 0)
            if len(corr_word) > 0 and word != corr_word[0]:
                # keep only the top-ranked correction candidate
                tokens.append(corr_word[0])
            else:
                tokens.append(word)

    wordnet_lemmatizer = WordNetLemmatizer()
    stems = [wordnet_lemmatizer.lemmatize(item) for item in tokens]
    # stemmer = PorterStemmer()
    # stems = [stemmer.stem(item) for item in tokens]

    return stems
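Note that the call above corrects each token in isolation (GetCandidates([word], 0)), which discards sentence context. jamspell's GetCandidates also accepts the full token list plus the index of the word to correct; a small sketch, assuming the same model_en.bin model:

import jamspell

corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('model_en.bin')
tokens = ['i', 'am', 'the', 'begt', 'spell', 'cherken']
# candidates for the token at index 3 ('begt'), best-ranked first
print(corrector.GetCandidates(tokens, 3))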
Example #3
def main():
    "main function"
    # optional command line args
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", help="train the NN", action="store_true")
    parser.add_argument("--validate",
                        help="validate the NN",
                        action="store_true")
    parser.add_argument("--beamsearch",
                        help="use beam search instead of best path decoding",
                        action="store_true")
    args = parser.parse_args()

    # train or validate on IAM dataset
    if args.train or args.validate:
        # load training data, create TF model
        loader = DataLoader(FilePaths.fnTrain, Model.batchSize, Model.imgSize,
                            Model.maxTextLen)

        # save characters of model for inference mode
        open(FilePaths.fnCharList, 'w').write(str().join(loader.charList))

        # execute training or validation
        if args.train:
            model = Model(loader.charList, args.beamsearch)
            train(model, loader)
        elif args.validate:
            model = Model(loader.charList, args.beamsearch, mustRestore=True)
            validate(model, loader)

    # infer text on test image
    else:
        sentence_list = []
        #print(open(FilePaths.fnAccuracy).read())
        model = Model(open(FilePaths.fnCharList).read(),
                      args.beamsearch,
                      mustRestore=True)
        imgFiles = os.listdir('../../WordSegmentation/out/1.png')
        for (i, f) in enumerate(imgFiles):
            print(' recognised the word %s' % f)

            # path to the segmented word image produced by the word-segmentation step
            img1 = '../../WordSegmentation/out/1.png/' + f
            #img = prepareImg(cv2.imread('11.png/%s'%f), 50)
            infer(model, img1, sentence_list, f)
            #infer(model, FilePaths.fnInfer)
        sentence_list = sorted(sentence_list, key=lambda entry: entry[0][0])
        sentence = ""
        for x, y in sentence_list:
            sentence = sentence + " " + y
        text_file = open("sentence.txt", "w")
        corrector = jamspell.TSpellCorrector()
        print("yo")
        corrector.LoadLangModel('en.bin')
        print("yo")
        sentence = corrector.FixFragment(sentence)
        print(sentence)
        text_file.write(sentence)
        text_file.close()
Example #4
def spell_correct_context(query_str):
    corrector = jamspell.TSpellCorrector()  # Create a corrector
    corrector.LoadLangModel('./en.bin')
    list_of_words = get_list(query_str)
    #PRINTING THE CANDIDATES
    # for i in range(len(list_of_words)):
    #     print(list_of_words[i]+" -> ", corrector.GetCandidates(list_of_words, i))
    # print("Did you mean " + "'"+corrector.FixFragment(query_str)+ "'"+"?")
    return corrector.FixFragment(query_str)
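A short usage sketch for spell_correct_context, assuming an ./en.bin model is present (get_list is a project helper not shown here):

# Hedged sketch: the query string is only an illustration.
query = 'whot is the wether today'
print(spell_correct_context(query))  # prints the jamspell-corrected fragment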
Example #5
    def __init__(self, f_name_jamspell_model=None):
        self.max_sequence_length = None
        self.corrector = jamspell.TSpellCorrector()

        if f_name_jamspell_model is not None:
            if not os.path.isfile(f_name_jamspell_model):
                print('[E] JamSpell language model %s not found.' %
                      f_name_jamspell_model)
                return
            self.corrector.LoadLangModel(f_name_jamspell_model)
Example #6
    def __loads(self):
        # load base model
        if not os.path.exists("model"):
            raise ValueError("model directory not found")
        self.corrector = jamspell.TSpellCorrector()
        if not self.corrector.LoadLangModel('ru_small.bin'):
            raise ValueError("failed to load jamspell model ru_small.bin")
        # trained model for Russian
        self.model = Model("model")
        self.morph = pymorphy2.MorphAnalyzer()
Example #7
    def __init__(self, tokenize=True, pretrained=False, device="cpu"):
        self.tokenize = tokenize
        self.pretrained = None
        self.device = None

        self.ckpt_path = None
        self.vocab_path, self.weights_path = "", ""
        self.model, self.vocab = None, None

        self.model = jamspell.TSpellCorrector()
        self.model.LoadLangModel('en.bin')
Example #8
    def __init__(self):
        self.stop = set(stopwords.words('english'))
        self.stop.update([
            '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{',
            '}'
        ])
        self.api_url = settings.W2V_API_URL
        self.candidate_sentence = None
        self.right_sentence = None
        self.similarity = None
        self.corrector = jamspell.TSpellCorrector()
        self.corrector.LoadLangModel('en.bin')
Example #9
    def __init__(self):
        self.simple_charectrs = set(
            list(
                u"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
            ))
        self.rus_charectrs = set(
            list(
                u"АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
            ))

        self.corrector = jamspell.TSpellCorrector()
        self.corrector.LoadLangModel("ru_small.bin")

        self.morph = pymorphy2.MorphAnalyzer()
Example #10
def normalize_queries():
    get_stopwords()
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('./jamspell_model/ru_small.bin')
    with open("queries_new.tsv",
              "r") as q_file_from, open("queries_norm.tsv", "w") as q_file_to:
        reader = csv.reader(q_file_from, delimiter='\t')
        writer = csv.writer(q_file_to, delimiter='\t')
        for row in tqdm(reader):
            i = int(row[0])
            query = row[1]
            query = get_google_spelling(query)
            query = normalize(query)
            # query = corrector.FixFragment(row[1])
            writer.writerow([row[0], query])
Example #11
    def __init__(self,
                 jamspell_model_path: str = DEFAULT_JAMSPELL_MODEL_PATH,
                 correction_mapping_path: str = DEFAULT_MAPPING_PATH,
                 stanza_model_dir_path=DEFAULT_STANZA_DIR_PATH,
                 use_gpu=False):
        """
        :param jamspell_model_path: Relative or absolute path to the Jamspell binary model file.
        :param correction_mapping_path: Relative or absolute path to the CSV file which conducts word replacements.
        :param stanza_model_dir_path: Relative or absolute path to the directory in which the Estonian Stanza models reside.
        :param use_gpu: Whether to use the GPU's CUDA support for Stanza operations.
        """
        self.stanza_pipeline = None
        self.corrector = jamspell.TSpellCorrector()
        self.ensure_model_folders()
        self.word_mapping = self.load_mapper_resources(correction_mapping_path)
        self._load_corrector_resources(jamspell_model_path)
        self._load_lemmatizer_resources(stanza_model_dir_path, use_gpu)
Example #12
def jamspell(model: str = 'wiki', **kwargs):
    """
    Load a jamspell Spell Corrector for Malay.

    Parameters
    ----------
    model: str, optional (default='wiki')
        Supported models. Allowed values:

        * ``'wiki+news'`` - Wikipedia + News, 337MB.
        * ``'wiki'`` - Wikipedia, 148MB.
        * ``'news'`` - local news, 215MB.

    Returns
    -------
    result: malaya.spell.JamSpell class
    """

    try:
        import jamspell as jamspellpy
    except BaseException:
        raise ModuleNotFoundError(
            'jamspell not installed. Please install it and try again.')

    model = model.lower()
    supported_models = ['wiki+news', 'wiki', 'news']
    if model not in supported_models:
        raise ValueError(
            f'model not supported, available models are {str(supported_models)}'
        )

    path = check_file(PATH_NGRAM['jamspell'][model],
                      S3_PATH_NGRAM['jamspell'][model], **kwargs)
    try:
        corrector = jamspellpy.TSpellCorrector()
        corrector.LoadLangModel(path['model'])
    except BaseException:
        raise Exception(
            f"failed to load jamspell model, please run `malaya.utils.delete_cache('preprocessing/jamspell/{model.replace('+', '-')}')`"
        )
    return JamSpell(corrector=corrector)
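A minimal loading sketch for the Malaya wrapper above; the malaya.spell module path is an assumption based on the docstring, and the model file is downloaded and cached on first use:

import malaya

# Hedged sketch: assumes this loader is exposed as malaya.spell.jamspell.
corrector = malaya.spell.jamspell(model='wiki')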
Example #13
    def __init__(self, stopwords, lang='ru', type_preproc=None, speech_2_vec_lemm=False, way_to_udp=None,
                 keep_pos_udp=True, keep_punct_udp=False,
                 model_checker='ru_small.bin', parallel_workers=20):
        """
        Очистка текста от спецсимволов и стоп-слов
            stopwords: list
                Список стоп-слов
            lang: {'ru', 'eng'}, default='ru'
                Язык текста
            type_preproc: {'stem', 'lemm', 'udp', None}, default=None
                Тип предобработки - стемминг, лемматизация, обработка UDPipe парсером или ничего.
                Стемминг осуществляется через библиотеку nltk (SnowballStemmer).
                Лемматизация осуществляется через библиотеку pymorphy2. К лемме также можно добавить часть речи из тэгов pymorphy2 через нижнее подчеркивание '_' - 'делать_VERB'.
                Обработка UDPipe парсером осуществляется через библиотеку ufal.udpipe. Обработка включает в себя лемматизацию с возможностью добавления части речи через нижнее подчеркивание (тэги частей речи соответствуют  формату Universal PoS Tags) '_' - 'делать_VERB', также существует возможность сохранения пунктуации.
            speech_2_vec_lemm: bool, default=False
                Флаг для добавления части речи в предобработке с лемматизацией (type_preproc='lemm')
            way_to_udp: str, default=None
                Путь до обученной модели UDPipe
            keep_pos_udp: bool, default=True
                Флаг добавления части речи к лемме
            keep_punct_udp: bool, default=False
                Флаг сохранения пунктуации
            model_checker: str, default='ru_small.bin'
                Путь до модели проверки орфографии jamspell
            parallel_workers: int, default=20
                Кол-во процессорорв для распараллеливания
        """
        self.lang = lang

        self.replace_by_space_re = re.compile('[/(){}\[\]\|@\.,:;!?-]|[\s]')  # pattern: these characters are replaced with a space
        self.bad_symbols_re = re.compile('[^0-9а-я #+_]')               # pattern: remove every character except these
        self.stopwords_re = r'\b' + r'\b|\b'.join(stopwords) + r'\b'    # pattern for stripping stop words
        
        self.type_preproc = type_preproc
        self.speech_2_vec_lemm = speech_2_vec_lemm
        self.way_to_udp = way_to_udp
        self.keep_pos_udp = keep_pos_udp
        self.keep_punct_udp = keep_punct_udp
        self.corrector = jamspell.TSpellCorrector()
        self.corrector.LoadLangModel(model_checker)
        self.parallel_workers = parallel_workers
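A hypothetical construction sketch for the preprocessor above; the class name TextCleaner is an assumption (the snippet omits the class definition), as is the stop-word list:

# Hedged sketch: TextCleaner is a placeholder name for the class this __init__ belongs to.
cleaner = TextCleaner(stopwords=['и', 'в', 'на'], lang='ru',
                      type_preproc='lemm', model_checker='ru_small.bin')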
Example #14
def corrector_data(audio_wav_path: str):
    # speech-to-text recognition on the audio file
    res = get_data_in_audio(audio_wav_path)
    import jamspell
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords
    from nltk.tokenize import sent_tokenize, word_tokenize
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('ru_small.bin')
    # data = [[[i.normal_form for i in morph.parse(j) if i.score == 1 or i.score == 0.5] for j in
    #          corrector.GetCandidates([i, ], 0)] for i in raw]
    # tokenizer = RegexpTokenizer(r'\w+')
    stop_words = stopwords.words("russian")
    mega_set = set()
    for l in res:
        for token in word_tokenize(l, language="russian"):
            if token in stop_words:
                continue
            token_norm_set = norm_dict(corrector, token)
            mega_set |= token_norm_set
    return mega_set
Example #15
                if v0 != v1:
                    cprint(
                        f"[{ext_name}] invalid value for enum attribute \"{p}\": {v0} != {v1}",
                        'magenta')
                    d0 = v0 - v1
                    d1 = v1 - v0
                    if len(d0):
                        cprint(f"[{ext_name}] missing elements are: {d0}",
                               'magenta')
                    if len(d1):
                        cprint(
                            f"[{ext_name}] invalid elements in doc are: {d1}",
                            'magenta')

            elif attr == HAVE_EXTERNAL:
                if ext_name.startswith("ui.") and p not in ("@fontname",):
                    cprint(f"DOC [{ext_name}] missing enum attribute \"{p}\"",
                           'magenta')
            elif attr == HAVE_PDDOC:
                cprint(
                    f"DOC [{ext_name}] no enum for attribute \"{p}\" (in external)",
                    'magenta')

    if args.spell:
        import jamspell
        corrector = jamspell.TSpellCorrector()

        corrector.LoadLangModel('ceammc.bin')
        cprint(f"checking [{ext_name}] ...", "blue")
        check_spell(root)
Example #16
    def spellCorrect(self):
        corrector = jamspell.TSpellCorrector()
        corrector.LoadLangModel('en.bin')

        output = corrector.FixFragment(self.fileContent)
        return output
Example #17
def permutative_solve(detected_bank, detected_puzzle=None):

    #detected_bank = [word.lower() for word in detected_bank]

    if detected_puzzle is None:
        detected_puzzle = []
        with open('test_searches/word_search.txt', 'r') as f:
            line = f.readline()
            while line:
                detected_puzzle.append(line[:-1].upper())
                detected_puzzle[-1] = detected_puzzle[-1].split(' ')
                line = f.readline()
    else:
        size = len(detected_puzzle)
        temp_puzzle = [['|' for _ in range(size)] for _ in range(size)]
        for i in range(len(detected_puzzle)):
            for j in range(len(detected_puzzle[i])):
                temp_puzzle[i][j] = detected_puzzle[i][j]
        print(temp_puzzle)

    solver = ps.PuzzleSolver(len(detected_puzzle[0]), len(detected_puzzle),
                             detected_puzzle, detected_bank)

    print('-------------------SOLVING PUZZLE----------------------')
    incorrect_words, found = solver.solve()

    if len(incorrect_words) == 0:
        print('\nALL WORDS FOUND!')
        return found

    print('-----------------RETRYING WITH SWAPPED CHARACTERS----------------')
    potential_words = []

    for word in incorrect_words:
        permutations = letter_swap(word)
        permutations.append(word)
        potential_words.append(permutations)

    print(potential_words)

    for word in solver.potential_words_solve(incorrect_words, potential_words):
        found.append(word)

    if len(incorrect_words) == 0:
        print('\nALL WORDS FOUND!')
        return found

    print('-----------------RETRYING WITH NEW BANK----------------')
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('protos_data/en.bin')

    potential_words = []

    # get word candidates that may be the correct target word, put them in potential_words
    lower_case_attempts = []
    for word in incorrect_words:
        lower_case_attempts.append(word.lower())

    for i in range(len(lower_case_attempts)):
        candidates = list(corrector.GetCandidates(lower_case_attempts, i))
        candidates = [word.upper() for word in candidates]
        candidates.append(incorrect_words[i])
        potential_words.append(candidates)

    if len(potential_words) == 0:
        print(
            '\nCould not find alternate spellings for the following words: ',
            incorrect_words,
        )
        return found
    else:
        print('Incorrect words', incorrect_words)

    for word in solver.potential_words_solve(incorrect_words, potential_words):
        found.append(word)

    # Found every word! Done
    if len(incorrect_words) == 0:
        print('\nALL WORDS FOUND!')
        return found

    print('SOME WORDS NOT FOUND:', incorrect_words)
    return found
Example #18
def trainLangModel(trainText, alphabetFile, modelFile):
    corrector = jamspell.TSpellCorrector()
    corrector.TrainLangModel(trainText, alphabetFile, modelFile)
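TrainLangModel takes a plain-text training corpus, an alphabet file, and an output path for the binary model; a hedged end-to-end sketch with placeholder file names, loading the freshly trained model afterwards:

import jamspell

# Placeholder file names: any UTF-8 text corpus and matching alphabet file will do.
trainLangModel('corpus.txt', 'alphabet_en.txt', 'model_en.bin')

corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('model_en.bin')
print(corrector.FixFragment('I am the begt spell cherken'))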
Example #19
def spell_corr():
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('model_en.bin')
    logger.info("jamspell corrector created")
    return corrector
Example #20
def get_spell_checker():
    spell_checker = jamspell.TSpellCorrector()
    spell_checker.LoadLangModel(get_jamspell_model_file_name())
    return spell_checker