import pickle

from symspellpy import SymSpell, Verbosity

# 'hf' is the project's helper module providing createFrequencyList and writeTXT.

refFileName = 'referenceText.txt'
# Set frequency list file name
freqFileName = 'frequency_list_aze.txt'
# Set vocabulary file name
vocabFileName = 'wordList.pickle'

# Read the wordList, which is the Azerbaijani vocabulary
with open(vocabFileName, 'rb') as handle:
    vocabulary = pickle.load(handle)

# Create the frequency list of the Azerbaijani language
frequencyList = hf.createFrequencyList(refFileName, vocabulary['Words'].tolist())

# Write the frequency list into a .txt file. Pickle can also be used for the same purpose.
# Disable if the .txt file has already been generated.
# hf.writeTXT(frequencyList, freqFileName)

# Create the SymSpell object and read the frequency list
symspell = SymSpell()
symspell.load_dictionary(freqFileName, 0, 1, encoding="utf-8")

word = "ilım"

# Use the documentation to perform custom edits
# https://symspellpy.readthedocs.io/en/latest/api/index.html
suggestions = symspell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2,
                              include_unknown=True)

# List the suggestions for the word, in descending order
for suggestion in suggestions:
    print(suggestion)
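# Hedged sketch (not part of the original script): load_dictionary(freqFileName, 0, 1) reads a
# plain-text file with one "term count" pair per line, so hf.writeTXT presumably emits that
# layout. Assuming frequencyList maps word -> count, a minimal writer could look like this:
def write_frequency_list(freq, path):
    """Write 'term count' lines in the layout load_dictionary(path, 0, 1) expects."""
    with open(path, "w", encoding="utf-8") as f:
        for term, count in sorted(freq.items(), key=lambda kv: kv[1], reverse=True):
            f.write(f"{term} {count}\n")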
class Speller: def __init__(self, **kwargs): if 'dev' in kwargs: print( "Spellchecker run in development mode...small sample of checking items loaded only" ) self.flagDev = True self.abkuerzung_path = "inject/Abkuerzungen.csv" self.word_freq_path = "inject/word_freq_test.txt" else: self.abkuerzung_path = "inject/Abkuerzungen.csv" self.word_freq_path = "inject/word_freq_list_overall.txt" print("Initializing spellchecker...") self.loadAbbreviations() self.loadSymSpell() print("Initializing spellchecker complete") def loadSymSpell(self): self.sym_spell = SymSpell() self.sym_spell.load_dictionary(self.word_freq_path, 0, 1, encoding="utf8") def loadAbbreviations(self): self.Abkuerzungen = pd.read_csv(self.abkuerzung_path, sep=";", header=0, encoding="latin-1") self.Abkuerzungen['lower_case'] = [ str(x).lower().strip(".") for x in list(self.Abkuerzungen['Abkuerzung']) ] self.Abkuerzungen = self.Abkuerzungen.dropna() self.Abkuerzungen = self.Abkuerzungen.reset_index() def dev(self, objId): self.db = DB() self.objId = objId self.process_obj(self.objId) def process_obj(self, obj): self.label_obj = self.db.mongo_db.labels.find_one( {"_id": ObjectId(self.objId)}) for p in self.label_obj["pages"]: if "read_text_raw" in p: s = p["read_text_raw"] c_s = self.check_string(s) p["read_text"] = c_s def store_obj(self): self.db.mongo_db.labels.update({"_id": ObjectId(self.objId)}, self.label_obj) def get_store_obj(self): return self.label_obj def check_string(self, in_string): # spell correction string_corr = [] string = in_string.replace("\n", ' break ') # problems with processing '\n' for word in string.split( ' ' ): # process word by word since the compound version deletes any special characters word = word.strip('\r\n') if (word == "AUF") | (word == "AU"): word = 'Arbeitsunfähigkeit' if (word == "P:"): word = 'Patient:' if word not in ('', ',', '!', '.', ';', ':', '?', '-', '') and word.lower().strip(".,") not in list( self.Abkuerzungen['lower_case']): input_term = word if input_term[-1] in ("?", "!", ".", ",", ":", ";"): input_term = input_term[:-1] # max edit distance per lookup (per single word, not per whole input string) suggestions = self.sym_spell.lookup( input_term, Verbosity.CLOSEST, max_edit_distance=2, transfer_casing=True, ignore_token=r".*[()].*", include_unknown=True ) # display suggestion term, edit distance, and term frequency if word[-1] in ("?", "!", ".", ",", ":", ";"): suggestion = str(suggestions[0].term) + word[-1] else: suggestion = str(suggestions[0].term) if input_term.lower() == str(suggestions[0].term): string_corr = string_corr + [str(word)] else: string_corr = string_corr + [suggestion] if word.lower().strip(".,:") in list( self.Abkuerzungen['lower_case']): string_corr = string_corr + [ self.Abkuerzungen['narrativ'][list( self.Abkuerzungen['lower_case']).index( word.lower().strip(".,:"))] ] string_corr = ' '.join(string_corr) string_corr = string_corr.replace("break", '\n') return string_corr
def test_lookup_compound_replaced_words(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)

    typo = ("whereis th elove hehad dated forImuch of thepast who "
            "couqdn'tread in sixthgrade and ins pired him")
    correction = ("where is the love he had dated for much of the past "
                  "who couldn't read in sixth grade and inspired him")
    replacement_1 = {
        "whereis": "where is",
        "th": "the",
        "elove": "love",
        "hehad": "he had",
        "forimuch": "for much",
        "thepast": "the past",
        "couqdn'tread": "couldn't read",
        "sixthgrade": "sixth grade",
        "ins": "in"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
    for k, v in replacement_1.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)

    typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
    correction = ("in the third quarter of last year he had learned of a "
                  "secret plan")
    replacement_2 = {
        "te": "the",
        "dhird": "third",
        "qarter": "quarter",
        "oflast": "of last",
        "jear": "year",
        "hadlearned": "had learned",
        "ofca": "of a",
        "sekretplan": "secret plan"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(
        len(replacement_1) + len(replacement_2),
        len(sym_spell.replaced_words))
    for k, v in replacement_2.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)

    typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
            "of funn")
    correction = ("the biggest players in the strong summer film slate "
                  "with plenty of fun")
    replacement_3 = {
        "bigjest": "biggest",
        "playrs": "players",
        "strogsommer": "strong summer",
        "slatew": "slate",
        "ith": "with",
        "plety": "plenty",
        "funn": "fun"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(
        len(replacement_1) + len(replacement_2) + len(replacement_3),
        len(sym_spell.replaced_words))
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    for k, v in replacement_3.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)
    encoding='utf-8-sig', float_precision='round_trip')  # (continuation of a pd.read_csv(...) call whose beginning is not included)

ukrainian_dataframe = dataframe1[dataframe1['predicted_language'] == 'Ukrainian']
ukrainian_dataframe = ukrainian_dataframe.drop_duplicates()

sym_spell = SymSpell()

with open(
        "/Users/lidiiamelnyk/Downloads/dss-plugin-nlp-preparation-main/resource/dictionaries/uk.txt",
        'r', encoding='utf-8-sig') as myfile:
    corpus = myfile.read()

corpus_path = "/Users/lidiiamelnyk/Downloads/dss-plugin-nlp-preparation-main/resource/dictionaries/uk.txt"
symspell_dictionary = sym_spell.load_dictionary(corpus_path,
                                                term_index=0,
                                                count_index=1,
                                                separator=None,
                                                encoding='utf-8-sig')

ukrainian_dataframe['comment'] = ukrainian_dataframe['comment'].astype(str)
ukrainian_dataframe['comments_corrected'] = ukrainian_dataframe['comment'].apply(
    lambda x: sym_spell.lookup(x,
                               Verbosity.CLOSEST,
                               max_edit_distance=0,
                               include_unknown=True,
                               transfer_casing=False,
                               ignore_token=r"\w+\d"))

# for i, row in ukrainian_dataframe.iterrows():
#     if len(row['comments_corrected']) > 0:
#         pass
def test_load_dictionary_invalid_path(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    self.assertEqual(
        False,
        sym_spell.load_dictionary("invalid/dictionary/path.txt", 0, 1))
class Spellchecker(object):
    """
    We use https://github.com/mammothb/symspellpy to do basic word / n-gram based
    spell checking. It is based on SymSpell: https://github.com/wolfgarbe/SymSpell

    TODO: We should be very cautious here and only "auto-correct" small one-off OCR
    errors (like "mnlicious -> malicious"), as otherwise we may change domain
    specific terms and abbreviations.

    TODO: consider optionally using an OCR cloud API, like:
    https://www.abbyy.com/cloud-ocr-sdk/
    https://cloud.ocrsdk.com/demo/
    This gives (much) better results, but it ain't free ;-)
    """

    sym_spell = None
    dictionary_path = None
    max_edit_distance = 1
    max_dict_edit_distance = 2
    prefix_length = 7
    count_threshold = 2

    def __init__(self, language="en"):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=self.max_dict_edit_distance,
            prefix_length=self.prefix_length,
            count_threshold=self.count_threshold,
        )
        # FIXME support non-english languages and custom models
        if language == "en":
            self.dictionary_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
            # term_index is the column of the term and count_index is the
            # column of the term frequency
            self.sym_spell.load_dictionary(self.dictionary_path,
                                           term_index=0,
                                           count_index=1)
        else:
            log.warning(
                f"No spell checking available for language '{language}'")
            self.sym_spell = None

    def suggestions(self, input_term):
        # No suggestions if no spell checker available
        if not self.sym_spell:
            return []
        # max edit distance per lookup
        # (max_edit_distance_lookup <= max_dictionary_edit_distance)
        suggestions = self.sym_spell.lookup(
            input_term,
            Verbosity.CLOSEST,
            max_edit_distance=self.max_edit_distance,
            include_unknown=False,
            transfer_casing=True,
        )
        return suggestions

    def correct_word(self, input_term):
        stripped = str(input_term).strip(string.punctuation)
        # Don't correct words shorter than 4 chars
        # (too risky: it might be a domain specific abbreviation)
        if len(stripped) < 4:
            return input_term
        # If there are no more alpha chars than non-alpha chars: ignore
        if len(re.findall(r"[A-Za-z]", stripped)) <= len(
                re.findall(r"[^A-Za-z]", stripped)):
            return input_term
        suggestions = self.suggestions(stripped)
        # No suggestions? Leave it as is
        if len(suggestions) == 0:
            return input_term
        # OCR tokenizer may leave trailing punctuation -> re-add it
        candidate = suggestions[0].term
        for i in [-2, -1]:
            if input_term[i] in string.punctuation:
                candidate += input_term[i]
        return candidate
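# Hedged usage sketch (added; not from the original module). It assumes the bundled English
# dictionary loads as above and exercises the OCR-error case mentioned in the docstring:
#
#   checker = Spellchecker(language="en")
#   checker.correct_word("mnlicious,")  # likely "malicious," (edit distance 1, trailing comma re-added)
#   checker.correct_word("AB1")         # returned unchanged: too short / too few alpha chars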
nltk.download("punkt")
warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")

TROPICAL_PATH = "tropical_dic.json"
FREQ_DICT_PATH = "frequency_dictionary_es_82_765.txt"
BIGRAM_PATH = "frequency_bigramdictionary_es_1Mnplus.txt"

with open(TROPICAL_PATH, "r") as file:
    tropical_dic = json.load(file)

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(FREQ_DICT_PATH, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(BIGRAM_PATH, term_index=0, count_index=2)

FIRST_INT = 11111111111111
LAST_INT = 99999999999999
PLACEHOLDERS_DICT = {}


@InputSeries(TextSeries)
def fillna(s: TextSeries) -> TextSeries:
    """
    Replaces unassigned values with an empty string.

    Examples
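# Hedged note (added for clarity; the file name and counts below are illustrative only): with
# term_index=0 and count_index=2, load_bigram_dictionary expects one "word1 word2 count" triple
# per line, for example:
#
#   de la 1250
#   en el 987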
from pathlib import Path

from fastapi import FastAPI
from symspellpy import SymSpell, Verbosity

from app.schemas import LookupRequest, LookupResponse

MAX_EDIT_DISTANCE = 2
DICTIONARY_PATH = Path('./data/kk.txt')

app = FastAPI()
symspell = SymSpell(max_dictionary_edit_distance=MAX_EDIT_DISTANCE)
symspell.load_dictionary(DICTIONARY_PATH, term_index=0, count_index=1, encoding='utf-8')


@app.get("/")
async def read_root():
    """Displays a greeting message on the homepage

    Returns:
        dict: a dictionary with the greeting message
    """
    return {"message": "✌"}


@app.post("/lookup", response_model=LookupResponse)
def symspell_edit_distance_load(dictionary_path, request):
    sym_spell = SymSpell(request.param)
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    return sym_spell, request.param
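# Hedged sketch (assumption: this helper is used as a parametrized pytest fixture, since it takes
# the special `request` argument). A possible declaration and consumer could look like:
#
#   @pytest.fixture(params=[1, 2, 3])
#   def symspell_edit_distance_load(dictionary_path, request):
#       ...
#
#   def test_lookup(symspell_edit_distance_load):
#       sym_spell, max_edit_distance = symspell_edit_distance_load
#       assert sym_spell.lookup("tst", Verbosity.TOP, max_edit_distance) is not None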
class SpellCheck():
    def __init__(self, init_path=None):
        """Spelling checker: symspellpy==6.5.2.

        https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage
        https://towardsdatascience.com/essential-text-correction-process-for-nlp-tasks-f731a025fcc3
        """
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.set_dictionary_path(init_path)
        self.set_dictionary()
        # self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)

    def set_dictionary_path(self, path):
        if path:
            self.path = path
        else:
            self.path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
        return self.path

    def set_df(self):
        self.df = pd.read_csv(self.path,
                              sep=' ',
                              header=None,
                              dtype={0: str, 1: int})
        return self.df

    def set_dict(self):
        self.set_df()
        self.dictionary = {
            self.df.loc[i, 0]: self.df.loc[i, 1]
            for i in self.df.index
        }
        return self.dictionary

    def set_dictionary(self):
        self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)
        self.set_dict()
        return None

    def find(self, term):
        return self.dictionary.get(term, 'nothing found')

    def append_dict(self, df_custom, cust_path='./data/cust_freq_dict_en.txt'):
        """Add a custom dictionary. df_custom: [term, freq]"""
        df_init = self.set_df()
        try:
            df_custom = df_custom.replace([np.inf, -np.inf, np.nan], 99)
            df_custom[1] = df_custom[1].astype(int)
            df = pd.concat([df_init, df_custom], ignore_index=True)
        except Exception as err:
            st.write('something went wrong', err)
            return -1
        # Remove duplicate terms and sort on frequency
        df.drop_duplicates(subset=[0], keep='first', inplace=True)
        df.sort_values(by=[1], ascending=False, inplace=True)
        # Save & load again after adding the custom dictionary
        self.set_dictionary_path(cust_path)
        df.to_csv(self.path, sep=' ', index=None, header=None)
        # self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)
        self.set_dictionary()
        return None

    def __call__(self, input_term, N=8):
        """Look up suggestions for single- and multi-word input strings."""
        # Check lone words (< N chars) for possible concatenation
        # https://symspellpy.readthedocs.io/en/latest/api/symspellpy.html#symspellpy.symspellpy.Verbosity
        if (len(input_term.split(' '))) == 1 or (len(input_term) < N):
            suggestions = self.sym_spell.lookup(input_term,
                                                Verbosity.TOP,
                                                max_edit_distance=2,
                                                transfer_casing=True,
                                                include_unknown=True)
        else:
            # Punctuation gets lost!
            suggestions = self.sym_spell.lookup_compound(input_term,
                                                         max_edit_distance=2,
                                                         transfer_casing=True)
        # Suggestion term, term frequency, and edit distance
        # return [(sug.term, sug.count, sug.distance) for sug in suggestions]
        return [sug.term for sug in suggestions][0]
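# Hedged usage sketch (added; the terms and counts are illustrative). append_dict expects a
# headerless two-column frame of [term, frequency], matching the "term count" dictionary layout:
#
#   import pandas as pd
#   custom = pd.DataFrame([["symspellpy", 1000], ["streamlit", 500]])
#   checker = SpellCheck()
#   checker.append_dict(custom)   # merges, dedupes, saves to ./data/cust_freq_dict_en.txt
#   checker("speling")            # -> likely "spelling"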
class SpellChecker: """ Spell check a string with max edit distance of 2. Only applied on alphabet character words (not alphanumeric). Spell check is applied independently to each token. Token of size <=2 are skipped. """ name = 'spell_checker' def __init__(self, vocab_files_path: List[str], tokenizer=str.split): """ Parameters ---------- vocab_files_path : List[str] List of file paths to vocabulary used by the spell checker for correction. Vocabulary files must be one token per line and the tokens should be in the first column left-to-right if there are multiple columns. tokenizer : Callable[str, List[str]], default=str.split Takes as input a string, outputs a list of tokens. """ self.tokenizer = tokenizer self.spellchecker = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) for fp in vocab_files_path: assert self.spellchecker.load_dictionary(fp, 0, 1) # # https://stackoverflow.com/questions/1528932/how-to-create-inline-objects-with-properties # self.spellchecker.update({ # 'dummy': type('_', (object,), dict(lookup=lambda w, *_, **__: w))() # }) def _correct(self, token: str) -> str: """ Parameters ---------- token : str Input string Returns ------- corrected : str corrected token """ # https://github.com/mammothb/symspellpy/issues/7 o = self.spellchecker\ .lookup( token, verbosity=Verbosity.TOP, max_edit_distance=2, ignore_token=r'\w{,2}', # ignore tokens of size 2 or less transfer_casing=True) if not o: return token word = o[0].term if token[0].isupper(): word = word[0].upper() + word[1:] # find start punctuation start_idx = 0 start_punct = '' while token[start_idx] in string.punctuation: start_punct += token[start_idx] if start_idx + 1 < len(token): start_idx += 1 else: break # find end punctuation end_idx = 1 end_punct = '' while token[-end_idx] in string.punctuation: end_punct += token[-end_idx] if end_idx - 1 > 0: end_idx -= 1 else: break return start_punct + word + end_punct def __call__(self, doc: str) -> str: """ Parameters ---------- doc : str Input string Returns ------- doc : str spell checked string """ return " ".join([ self._correct(w) if w.isalpha() else w for w in self.tokenizer(doc) ])
import nltk
from symspellpy import SymSpell, Verbosity

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

sym_spell = SymSpell()
sym_spell.load_dictionary('disease_dict', 0, 1, separator="$")


def recognize(query_text):
    sent = nltk.word_tokenize(query_text)
    pos_tag = nltk.pos_tag(sent)
    disease_pattern = r"""
        DISEASE: {(<NN.*><POS>){0,1}<JJ>*<NN.*>+(<IN><DT><NN.*>+){0,1}}
    """
    cp = nltk.RegexpParser(disease_pattern)
    ret = []
    for chunk in cp.parse(pos_tag).subtrees():
        if chunk.label() == 'DISEASE':
            buf = ""
            for word in chunk:
                if word[1] != 'POS':
                    buf += ' '
                buf += word[0]
            suggestions = sym_spell.lookup(buf.strip().lower(),
                                           Verbosity.CLOSEST,
                                           max_edit_distance=2)
            ret.extend([x.term for x in suggestions])
    return ret
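# Hedged note (added; 'disease_dict' is project-specific and its real contents are unknown):
# because load_dictionary is called with separator="$", each line of the file is expected to be
# a term and a count joined by "$", which also allows multi-word disease names, e.g.
#
#   diabetes mellitus$120
#   chronic kidney disease$85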
def export(): import os import torch import zipfile import torchaudio from glob import glob device = torch.device('cpu') # gpu also works, but our models are fast enough for CPU model, decoder, utils = torch.hub.load('snakers4/silero-models', model='silero_stt', language='en') (read_batch, split_into_batches, read_audio, prepare_model_input) = utils # see function signature for details os.system("ffmpeg -i 'video.mp4' -vn -acodec copy audio.aac") os.system("ffmpeg -i audio.aac audio.wav") # download a single file, any format compatible with TorchAudio (soundfile backend) # torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav', # dst ='speech_orig.wav', progress=True) test_files = glob('audio.wav') batches = split_into_batches(test_files, batch_size=10) input = prepare_model_input(read_batch(batches[0])) text = "" output = model(input) for example in output: pred = decoder(example.cpu()) text = text + pred os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt") os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt") import pkg_resources from symspellpy import SymSpell, Verbosity sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) dictionary_path = pkg_resources.resource_filename( "symspellpy", "frequency_dictionary_en_82_765.txt") bigram_path = pkg_resources.resource_filename( "symspellpy", "frequency_bigramdictionary_en_243_342.txt") # term_index is the column of the term and count_index is the # column of the term frequency sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2) # lookup suggestions for multi-word input strings (supports compound # splitting & merging) # input_term = ("whereis th elove hehad dated forImuch of thepast who " # "couqdn'tread in sixtgrade and ins pired him") # max edit distance per lookup (per single word, not per whole input string) suggestions = sym_spell.lookup_compound(text, max_edit_distance=2) # display suggestion term, edit distance, and term frequency for suggestion in suggestions: print(suggestion) text = str(suggestion) cnt = 0 textlines = [] while cnt < len(text.split(" ")): print(text.split(" ")[cnt:cnt+5]) line = "\n" + " ".join(text.split(" ")[cnt:cnt+5]) textlines.append(line) cnt += 5 f = open("script_cleaned.txt", "a") f.writelines(textlines) f.close() os.system("python -m aeneas.tools.execute_task \ audio.wav \ script_cleaned.txt \ 'task_language=eng|os_task_file_format=srt|is_text_type=plain' \ subtitles.srt") with open("subtitles.srt") as f: srt = f.read() return Response( srt, mimetype="text/srt", headers={ "Content-disposition": "attachment; filename=subtitiles.srt" } )
def test3(): # from autocorrect import Speller # doc = docx.Document("Word docs_Peace/1_CTS_119_eng_text.docx") # result = [p.text for p in doc.paragraphs] # # spell = Speller(lang='en') # # for j in range(15): # print(spell(result[j])) # import jamspell # # corrector = jamspell.TSpellCorrector() # corrector.LoadLangModel('en.bin') # text = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding all the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the Imperial Court, or in the Imperial Chamber,\n", # # text = corrector.FixFragment(text) # print(text) sys.path.append("treatyUtil") import pkg_resources from symspellpy import SymSpell, Verbosity from treatyUtil import spellcheck_keep_punctuation sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) dictionary_path = pkg_resources.resource_filename( "symspellpy", "frequency_dictionary_en_82_765.txt") bigram_path = pkg_resources.resource_filename( "symspellpy", "frequency_bigramdictionary_en_243_342.txt") # term_index is the column of the term and count_index is the # column of the term frequency sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2) # lookup suggestions for multi-word input strings (supports compound # splitting & merging) input_term1 = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII.\ According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred \ Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir \ Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the \ account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either \ Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, \ or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, \ both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding \ all 
the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions \ ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with \ the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor \ to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the\ Imperial Court, or in the Imperial Chamber,\n" #input_term = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited " input_term = "God, and Safety of the Chriſtian World (the Electors,\nPrinces and States of the Sacred Roman Empire \ being\npreſent, approving and conſenting) the Articles of Peace\nand Anity, whereof the Tenour follows.\n1. That \ there be a Chriſtian, univerſal\nThe Re-efta. and perpetual Peace, and a true and ſincere\nbliſhment of Friendſhip and \ Amity between his Sacred\nPeace and A. Imperial Majeſty, the Houſe of Austria,\nmity.\nand all his Allies and Adherents, \ and the\nHeirs and Succeffors of each of them, chiefly the King\nof Spain, and the Electors, Princes and States of the En-\npire,\ of the one ſide, and her Sacred Royal Majeſty,\nand the Kingdom of Sweden, her Allies and Adherents,\nand the Heirs and Succeſſors\ of each of them, eſpecially\nthe moſt Chriſtian King, the reſpective Electors, Princes\nand States of the Empire, of the other ſide ; \ and that this\nPeace be obſerv'd and cultivated ſincerely and ſeriouſly,\nſo that each Party may procure the Benefit, Honour and\nAdvantage \ of one another, and thereby the Fruits of this\nPeace and Amity may be ſeen to grow up and fouriſh a-\nnew, by a ſure and reciprocal \ maintaining of a good\nand faithful Neighbourhood between the Roman Empire\nand the Kingdom of Sweden reciprocally,\nII. That there be \ on both ſides à perpe-\nAn Amneſty\ntua) Oblivion and Amneſty of all that has\nfrom all Hoffi- been done Since the beginning of theſe\nlity.\nTroubles, \ in what Place or in what Man-\n" input_term2 = "God, and Safety of the Chriſtian World (the Electors,\nPrinces" input_term = re.sub("\n", " ", input_term) input_term = re.sub("- ", "", input_term) #input_term = re.sub("-", "", input_term) input_term = re.sub("ſ", "s", input_term) # word_split = re.compile(r"[^\W]+", re.U) # suggestions = sym_spell.lookup_compound((input_term), ignore_non_words=True, max_edit_distance=2) # for suggestion in suggestions: # print(suggestion) # # corrected = suggestions[0].term # # This combined with split_phrase_by_space=True would be enough just to spell check # # but punctuation is lost. # # # The spell check is already done in 'corrected'. Now we just want to keep the punctuation. 
# in_list = word_split.findall(input_term) # chk_list = word_split.findall(corrected) # print(input_term) # print(corrected) # print(in_list) # print(chk_list) # pdb.set_trace() # # # To keep punctuation we take the original phrase and do word by word replacement # out_term = "" # outs = input_term.split() # word_count = 0 # for word in in_list: # print(out_term) # print(outs[word_count].lower(), word, chk_list[word_count]) # temp = outs[word_count].lower().replace(word, chk_list[word_count]) # word_count += 1 # out_term += temp+" " # # print(out_term) # return # max edit distance per lookup (per single word, not per whole input string) #pdb.set_trace() #print(spellcheck_keep_punctuation(input_term)) suggestions = sym_spell.lookup_compound((input_term), transfer_casing=True, ignore_non_words=True, max_edit_distance=2) # display suggestion term, edit distance, and term frequency #print(suggestions) for suggestion in suggestions: print(suggestion)
class Trainer(Engine): r"""Wrapper class to train a model. See :class:`laia.engine.Engine` for more information. Args: model: model to train. criterion: used criterion to train the model. optimizer: optimizer object that will update the parameters of the model. data_loader: iterable object from which batches are read. batch_input_fn: function used to extract the input for the model (e.g. a ``torch.Tensor``), from the batch loaded by the ``data_loader``. If ``None``, the batch is fed as-is to the model. batch_target_fn: if given, this callable object is used to extract the targets from the batch, which are passed to the `ITER_START` and `ITER_END` hooks. batch_id_fn: if given, this callable object is used to extract the batch ids to be used in a possible exception. progress_bar: if ``True``, :mod:`tqdm` will be used to show a progress bar for each epoch. If a string is given, the content of the string will be shown before the progress bar. iterations_per_update: Number of successive mini-batch parameter gradients to accumulate before updating the parameters. cv_number: Display information to see which cross-validation we are doing use_baseline: Whether to perform the baseline (No CL nor SSL) use_cl: Whether to use curriculum learning (CL) use_transfer: Whether to use transfer learning (TL) use_semi_supervised: Whether to use semi-supervised learning (SSL) threshold_score_semi_supervised: Threshold on the rank of the samples for SSL data_semi_supervised_loader: unlabbeled dataset for SSL epoch_frequency_semi_supervision: Frequency of update of the SSL dataset B (the one which is not labelled), cf report syms: token-text table original_data_loader: Original dataset (dataset A) for SSL """ def __init__( self, model, # type: torch.nn.Module criterion, # type: Optional[Callable] optimizer, # type: torch.optim.Optimizer data_loader=None, # type: Optional[Iterable] batch_input_fn=None, # type: Optional[Callable] batch_target_fn=None, # type: Optional[Callable] batch_id_fn=None, # type: Optional[Callable] progress_bar=None, # type: Optional[Union[bool, str]] iterations_per_update=1, # type: int cv_number=None, use_baseline=None, use_cl=None, use_transfer=None, use_semi_supervised=None, threshold_score_semi_supervised=None, data_semi_supervised_loader=None, epoch_frequency_semi_supervision=None, syms=None, original_data_loader=None, ): # type: (...) 
-> None super(Trainer, self).__init__(model=model, data_loader=data_loader, batch_input_fn=batch_input_fn, batch_target_fn=batch_target_fn, batch_id_fn=batch_id_fn, progress_bar=progress_bar, use_baseline=use_baseline, use_cl=use_cl, use_transfer=use_transfer) self._criterion = criterion self._optimizer = optimizer self._iterations_per_update = iterations_per_update self._updates = 0 self._cv_number = cv_number self._progress_bar = progress_bar self.data_loader = data_loader self.use_semi_supervised = use_semi_supervised self.threshold_score_semi_supervised = threshold_score_semi_supervised self.data_semi_supervised_loader = data_semi_supervised_loader self.epoch_frequency_semi_supervision = epoch_frequency_semi_supervision self.counter_epoch_semi_supervision = 0 self.semi_supervision_started = False self.original_dataset = { 'ids': data_loader.dataset._ids, 'imgs': data_loader.dataset._imgs, 'txts': data_loader.dataset._txts } self.decoder = CTCGreedyDecoder() self.syms = syms self.original_data_loader = original_data_loader # Load Spell Checker self.sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7) dict_name = 'de_50k.txt' #"frequency_dictionary_en_82_765.txt" if not self.sym_spell.load_dictionary( dict_name, term_index=0, count_index=1, encoding='utf-8-sig'): print("error loading spell checker") @property def criterion(self): return self._criterion @criterion.setter def criterion(self, criterion): assert callable(criterion) self._criterion = criterion @property def optimizer(self): return self._optimizer def updates(self): return self._updates @property def logger(self): return _logger @property def iterations_per_update(self): return self._iterations_per_update @iterations_per_update.setter def iterations_per_update(self, num): if num is None: self._iterations_per_update = 1 else: assert isinstance(num, int) assert num > 0 self._iterations_per_update = num @action def start_semi_supervision(self): self.semi_supervision_started = True def score_semi_supervision(self, batch, mode='entropy'): """Compute the rank scores for SSL for the given batch depending on the mode: 'entropy' or 'diff' for 'diff-proba' metric""" batch_input, batch_target = self._prepare_input_and_target(batch) batch_ids = self.batch_id_fn(batch) if self.batch_id_fn else None # Batch timestep probabilities batch_output = self._model(batch_input) # Batch prediction that will be the label for the sample selected batch_decode = self.decoder(batch_output) # Compute the score of each sample: median of the difference between the top 2 probabilities per timestep x, xs = transform_output(batch_output) x = F.softmax(x, dim=2) xs = xs.numpy() if mode == 'diff': best_probas, _ = x.topk(dim=2, k=2) scores = best_probas[:, :, 0] - best_probas[:, :, 1] elif mode == 'entropy': sorted_probas, _ = x.topk(dim=2, k=x.shape[2]) best_probas_entropy = (sorted_probas[:, :, 0] * torch.log(sorted_probas[:, :, 0]))[:, :, None] all_probas_but_best_entropy = ( 1 - sorted_probas[:, :, 1:]) * torch.log(1 - sorted_probas[:, :, 1:]) #scores = minus real entropy scores = torch.cat( [best_probas_entropy, all_probas_but_best_entropy], dim=2).sum(dim=2) sizes = np.arange(scores.shape[0]) < xs[..., None] sizes = torch.tensor(sizes.T).cuda() semi_supervised_score = scores.sum(dim=0) / (sizes).float().sum(dim=0) semi_supervised_score = semi_supervised_score.cpu().detach().numpy() return semi_supervised_score, batch_ids, batch_decode, batch_target @action def compute_semi_supervision(self, epoch): if self.use_semi_supervised and 
self.semi_supervision_started: # Frequency of update if self.counter_epoch_semi_supervision == self.epoch_frequency_semi_supervision: self.counter_epoch_semi_supervision = 0 # Get the dataset B cf report new_ids, new_imgs, new_txts = self.compute_score_semi_supervision( epoch) # Fraction of the original dataset # To be changed on your usage nb_samples = 0 #len(self.original_dataset['ids'])//2 idx = np.random.randint(0, len(self.original_dataset['ids']), nb_samples) ori_ids = [self.original_dataset['ids'][i] for i in idx] ori_imgs = [self.original_dataset['imgs'][i] for i in idx] ori_txts = [self.original_dataset['txts'][i] for i in idx] self.data_loader.dataset._ids = ori_ids + new_ids self.data_loader.dataset._imgs = ori_imgs + new_imgs self.data_loader.dataset._txts = ori_txts + new_txts self.counter_epoch_semi_supervision += 1 def compute_score_semi_supervision(self, epoch): # Choose mode mode = 'entropy' # Batch iterator if self._progress_bar: batch_iterator = tqdm( self.data_semi_supervised_loader, desc=self._progress_bar if isinstance(self._progress_bar, string_classes) else None, ) else: batch_iterator = self.data_semi_supervised_loader # Compute metric on labelled dataset A original_scores = [] for it, batch in enumerate(self.original_data_loader, 1): semi_supervised_score, _, _, _ = self.score_semi_supervision( batch, mode) original_scores.append(semi_supervised_score) original_scores = np.concatenate(original_scores) print('Median Percentiles', np.percentile(original_scores, [25, 50, 75, 90, 95, 99])) # Median score on the dataset A score_threshold = np.percentile(original_scores, 50) # If the user has indicated a threshold on the rank score, we use it # If not we take the median of the score on the dataset A if self.threshold_score_semi_supervised != 0.0: score_threshold = self.threshold_score_semi_supervised print(score_threshold, "\n") # To store the samples to be added to the training set new_ids = [] new_txts = [] targets = [] for it, batch in enumerate(batch_iterator, 1): semi_supervised_score, batch_ids, batch_decode, batch_target = self.score_semi_supervision( batch, mode) print( 'Median Percentiles', np.percentile(semi_supervised_score, [25, 50, 75, 90, 95, 99])) # Compute the ids of samples to be added to the training set idx = np.argwhere( semi_supervised_score > score_threshold).reshape(-1) # Add ids and new labels batch_ids = np.array(batch_ids) new_ids += batch_ids[idx].reshape(-1).tolist() new_txts += [[str(self.syms[val]) for val in batch_decode[i]] for i in idx] targets += [[str(self.syms[val]) for val in batch_target[i]] for i in idx] # Compute the filenames of the new labelled samples unlabelled_imgs = self.data_semi_supervised_loader.dataset._imgs file_format = unlabelled_imgs[0].split('.')[-1] head, tail = os.path.split(unlabelled_imgs[0]) new_imgs = [os.path.join(head, i + "." 
+ file_format) for i in new_ids] # Correction with Spell Checker corrected_new_txts = [] for txt in new_txts: txt = ''.join(txt).split('@') correction = [] for word in txt: suggestion = self.sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=5, include_unknown=True)[0] correction.append(suggestion.term.split(',')[0]) corrected_new_txts.append(list('@'.join(correction))) # print few corrections # for txt, corr, target in zip(new_txts[:50],corrected_new_txts[:50],targets[:50]): # print('\n') # print('output',''.join(txt).split('@')) # print('corrected',''.join(corr).split('@')) # print('target',''.join(target).split('@')) return new_ids, new_imgs, corrected_new_txts def add_evaluator(self, evaluator, when=EPOCH_END, condition=None): r"""Add an evaluator to run at the end of each epoch.""" if evaluator is not None: self.add_hook( when, Hook(condition, evaluator.run) if condition is not None else evaluator.run, ) return self @action def run(self): r"""Run training """ assert callable( self._batch_input_fn ), "batch_input_fn (type: {!r}) is not callable".format( str(self._batch_target_fn)) assert callable( self._batch_target_fn ), "batch_target_fn (type: {!r}) is not callable".format( str(self._batch_target_fn)) while not self._must_stop: if self._cv_number != '': print("Cross_validation", self._cv_number) self._run_epoch() return self def _run_iteration(self, batch_n, batch, train_iterations=None): batch_input, batch_target = self._prepare_input_and_target(batch) action_kwargs = { "batch": batch, "batch_num": batch_n, "epoch": self._epochs, "iteration": self._iterations, "batch_input": batch_input, "batch_target": batch_target, } self._call_hooks(ITER_START, **action_kwargs) if self._must_stop: return # Make all parameter gradients equal to zero. # Note: IT % NIPU = the iteration after a step() if self._iterations % self.iterations_per_update == 0: self._optimizer.zero_grad() # Put model in training mode if hasattr(self._model, "train"): self._model.train() # Run model with self.exception_catcher(batch): batch_output = self._model(batch_input) # Note: These checks are only active when logging level <= DEBUG check_inf( tensor=batch_output, logger=__name__, msg="Found {abs_num} ({rel_num:.2%}) INF values in the " "model output at epoch {epoch}, batch {batch} (absolute " "iteration {iteration})", epoch=self._epochs, batch=self.batch_id_fn(batch) if self.batch_id_fn else batch, iteration=self._iterations, ) check_nan( tensor=batch_output, logger=__name__, msg="Found {abs_num} ({rel_num:.2%}) NAN values in the " "model output at epoch {epoch}, batch {batch} (absolute " "iteration {iteration})", epoch=self._epochs, batch=self.batch_id_fn(batch) if self.batch_id_fn else batch, iteration=self._iterations, ) batch_loss = self.compute_loss(batch, batch_output, batch_target) if batch_loss is None: return # Make the loss and gradients w.r.t. output independent of the number # of accumulated iterations. if self.iterations_per_update > 1: batch_loss /= self.iterations_per_update # Compute gradients w.r.t. parameters self.logger.debug( "Start backward at epoch {}, batch {} (absolute iteration {})", self._epochs, batch_n, self._iterations, ) with self.exception_catcher(batch): batch_loss.backward() self._iterations += 1 # Update model parameters. 
if self._iterations % self.iterations_per_update == 0: self._updates += 1 self.logger.debug( "Updating parameters at epoch {}, batch {} (absolute iteration {})", self._epochs, batch_n, self._iterations, ) self._optimizer.step() action_kwargs["train_iterations"] = self._iterations action_kwargs["batch_output"] = batch_output action_kwargs["batch_loss"] = batch_loss.item() action_kwargs["batch_id"] = self.batch_id_fn( batch) if self.batch_id_fn else None self._call_hooks(ITER_END, **action_kwargs) def compute_loss(self, batch, batch_output, batch_target): with self.exception_catcher(batch): kwargs = {} if isinstance(self._criterion, Loss) and self.batch_id_fn: kwargs = {"batch_ids": self.batch_id_fn(batch)} loss = self._criterion(batch_output, batch_target, **kwargs) if loss is not None: if torch.sum(torch.isnan(loss)).item() > 0: raise ValueError("The loss is NaN") if torch.sum(torch.isinf(loss)).item() > 0: raise ValueError("The loss is +/-Inf") return loss def state_dict(self): state = super(Trainer, self).state_dict() state["optimizer"] = self._optimizer.state_dict() state["updates"] = self._updates return state def load_state_dict(self, state): super(Trainer, self).load_state_dict(state) self._optimizer.load_state_dict(state["optimizer"]) self._updates = state["updates"]
class MaskTextSpotter(object): def __init__(self, cfg, confidence_threshold=0.7, min_image_size=224, output_polygon=True, spellfix=True): self.cfg = cfg.clone() self.model = build_detection_model(cfg) self.model.eval() self.device = torch.device(cfg.MODEL.DEVICE) self.model.to(self.device) self.min_image_size = min_image_size self.spellfix = spellfix self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) dictionary_path = pkg_resources.resource_filename( "symspellpy", "frequency_dictionary_en_82_765.txt") bigram_dictionary_path = pkg_resources.resource_filename( "symspellpy", "frequency_bigramdictionary_en_243_342.txt") self.sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) self.sym_spell.load_bigram_dictionary(bigram_dictionary_path, term_index=0, count_index=2) checkpointer = DetectronCheckpointer(cfg, self.model) if len(cfg.MODEL.WEIGHT): import logging logging.info('loading MaskTextSpotter from %s' % cfg.MODEL.WEIGHT) _ = checkpointer.load(cfg.MODEL.WEIGHT) self.transforms = self.build_transform() self.cpu_device = torch.device("cpu") self.confidence_threshold = confidence_threshold self.output_polygon = output_polygon def build_transform(self): """ Creates a basic transformation that was used to train the models """ cfg = self.cfg # we are loading images with OpenCV, so we don't need to convert them # to BGR, they are already! So all we need to do is to normalize # by 255 if we want to convert to BGR255 format, or flip the channels # if we want it to be in RGB in [0-1] range. if cfg.INPUT.TO_BGR255: to_bgr_transform = T.Lambda(lambda x: x * 255) else: to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) normalize_transform = T.Normalize(mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD) transform = T.Compose([ T.ToPILImage(), T.Resize(self.min_image_size), T.ToTensor(), to_bgr_transform, normalize_transform, ]) return transform def run_on_opencv_image(self, image): """ Arguments: image (np.ndarray): an image as returned by OpenCV Returns: result_polygons (list): detection results result_words (list): recognition results """ result_polygons, result_words, result_dict = self.compute_prediction( image) return result_polygons, result_words, result_dict def run_on_pillow_image(self, image): arr = np.array(image, dtype=np.uint8) result_polygons, result_words, result_dict = self.run_on_opencv_image( arr) return result_polygons, result_words, result_dict def compute_prediction(self, original_image): def spell_fix(wd): if self.spellfix: new_word = [ s.term for s in self.sym_spell.lookup(wd, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True) ][0] else: new_word = wd return new_word def chunks(l, n): for i in range(0, len(l), n): yield l[i:i + n] def mk_direction(char_polygons): def centroid(char_polygon): centroid = Polygon(list(chunks(char_polygon, 2))).centroid.coords return list(centroid)[0] first, last = char_polygons[0], char_polygons[-1] start, end = centroid(first), centroid(last) if start[0] == end[0]: end = (end[0] + 1, end[1]) return start, end def line_detection(dicts, char_ratio=1.5): # box [x1, y1, x2, y2] sorted_res = sorted(dicts, key=lambda d: d["box"][0]) lines = dict() def point_in_next_word(word): width = word["box"][2] - word["box"][0] # width = x2 - x1 avg_char_width = width / float(len(word["seq_word"])) last_right_border = word["box"][2] next_word_pos_x = last_right_border + char_ratio * avg_char_width next_word_pos_y = word["box"][1] direction = word["direction"] point = Point(next_word_pos_x, next_word_pos_y) line = 
LineString(direction) x = np.array(point.coords[0]) u = np.array(line.coords[0]) v = np.array(line.coords[len(line.coords) - 1]) n = v - u n /= np.linalg.norm(n, 2) P = u + n * np.dot(x - u, n) return (int(P[0]), int(P[1])) def distance_to_mid(word_point, word_box): point = Point(word_point["next_point"]) box = word_box["box"] return abs(point.y - (box[1] + box[3]) / 2.0) # abs( y - (y2+y1)/2 ) def find_next_word(word, index, sorted_words): next_point = Point(word["next_point"]) next_words = [ other for other in sorted_words[index + 1:] if Polygon( chunks(other["polygon"], 2)).contains(next_point) ] if next_words: return min(next_words, key=lambda x: distance_to_mid(word, x)) else: return None def find_previous_word(prev, word): if "previous_word" not in word.keys(): return prev else: return min(prev, word["previous_word"], key=lambda x: distance_to_mid(x, word)) for w in sorted_res: w["next_point"] = point_in_next_word(w) for i, w in enumerate(sorted_res): next_word = find_next_word(w, i, sorted_res) w["next_word"] = None if next_word: better_previous = find_previous_word(w, next_word) if better_previous == w: w["next_word"] = next_word if "previous_word" in next_word.keys(): next_word["previous_word"]["next_word"] = None next_word["previous_word"] = w for w in sorted_res: if "previous_word" not in w.keys(): a = w key_y = a["box"][1] while key_y in lines.keys(): key_y = key_y + 1 lines[key_y] = [a] while a["next_word"]: a = a["next_word"] lines[key_y].append(a) sorted_lines = sorted(lines.items(), key=lambda x: x[0]) return ",".join([ " ".join([w["seq_word"] for w in line]) for _, line in sorted_lines ]), sorted_lines # apply pre-processing to image import datetime, time start_time = time.time() # print('transform', datetime.datetime.now()) image = self.transforms(original_image) # convert to an ImageList, padded so that it is divisible by # cfg.DATALOADER.SIZE_DIVISIBILITY # print('to image list', datetime.datetime.now()) image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) image_list = image_list.to(self.device) # compute predictions with torch.no_grad(): # print('predict', datetime.datetime.now()) self.model.eval() predictions, _, _ = self.model(image_list) if not predictions or len(predictions) < 1: # print('no text detected') return [], [], {'label': '', 'details': []} # print('post process', datetime.datetime.now()) global_predictions = predictions[0] char_predictions = predictions[1] char_mask = char_predictions['char_mask'] char_boxes = char_predictions['boxes'] words, rec_scores, rec_char_scores, char_polygons = self.process_char_mask( char_mask, char_boxes) detailed_seq_scores = char_predictions['detailed_seq_scores'] seq_words = char_predictions['seq_outputs'] seq_scores = char_predictions['seq_scores'] global_predictions = [ o.to(self.cpu_device) for o in global_predictions ] # always single image is passed at a time global_prediction = global_predictions[0] # reshape prediction (a BoxList) into the original image size height, width = original_image.shape[:-1] test_image_width, test_image_height = global_prediction.size global_prediction = global_prediction.resize((width, height)) resize_ratio = float(height) / test_image_height boxes = global_prediction.bbox.tolist() scores = global_prediction.get_field("scores").tolist() masks = global_prediction.get_field("mask").cpu().numpy() result_polygons = [] result_words = [] result_dicts = [] for k, box in enumerate(boxes): score = scores[k] if score < self.confidence_threshold: continue box = list(map(int, box)) 
mask = masks[k, 0, :, :] polygon = self.mask2polygon(mask, box, original_image.shape, threshold=0.5, output_polygon=self.output_polygon) if polygon is None: polygon = [ box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3] ] result_polygons.append(polygon) word = words[k] rec_score = rec_scores[k] char_score = rec_char_scores[k] seq_word = seq_words[k] seq_char_scores = seq_scores[k] seq_score = sum(seq_char_scores) / float(len(seq_char_scores)) # spell_fix = lambda word: \ # [s.term for s in sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)][ # 0] detailed_seq_score = detailed_seq_scores[k] detailed_seq_score = np.squeeze(np.array(detailed_seq_score), axis=1) # if 'total_text' in output_folder or 'cute80' in output_folder: # result_log = [int(x * 1.0) for x in box[:4]] + polygon + [word] + [seq_word] + [score] + [rec_score] + [ # seq_score] + [char_score] + [detailed_seq_score] + [len(polygon)] # else: result_log = [int(x * 1.0) for x in box[:4]] + polygon + [word] + [ seq_word ] + [score] + [rec_score] + [seq_score] + [char_score] + [ detailed_seq_score ] # result_logs.append(result_log) if len(seq_word) > 0 and len(char_polygons[k]) > 0: d = { "seq_word": seq_word if len(seq_word) < 4 else spell_fix(seq_word), "seq_word_orig": seq_word, "direction": mk_direction([[int(c * resize_ratio) for c in p] for p in char_polygons[k]]), "word": word if len(word) < 4 else spell_fix(word), "word_orig": word, "box": [int(x * 1.0) for x in box[:4]], "polygon": polygon, "prob": score * seq_score } result_words.append(d['seq_word']) result_dicts.append(d) # default_logger.debug('done', datetime.datetime.now()) label, details = line_detection(result_dicts) end_time = time.time() # default_logger.debug('cost time: %s' % (end_time - start_time)) line_result = {'label': label, 'details': details} # line_result_words = [] # line_result_polygons = [] # for ocr_detail in line_result['details']: # pass # line_result_words = [a[1][0]['seq_word'] for a in line_result['details']] # line_result_polygons = [a[1][0]['polygon'] for a in line_result['details']] line_result_words = [a['seq_word'] for a in result_dicts] line_result_polygons = [a['polygon'] for a in result_dicts] # return result_polygons, result_words, line_result return line_result_polygons, line_result_words, line_result # def process_char_mask(self, char_masks, boxes, threshold=192): # texts, rec_scores = [], [] # for index in range(char_masks.shape[0]): # box = list(boxes[index]) # box = list(map(int, box)) # text, rec_score, _, _ = getstr_grid(char_masks[index, :, :, :].copy(), box, threshold=threshold) # texts.append(text) # rec_scores.append(rec_score) # return texts, rec_scores def process_char_mask(self, char_masks, boxes, threshold=192): texts, rec_scores, rec_char_scores, char_polygons = [], [], [], [] for index in range(char_masks.shape[0]): box = list(boxes[index]) box = list(map(int, box)) text, rec_score, rec_char_score, char_polygon = getstr_grid( char_masks[index, :, :, :].copy(), box, threshold=threshold) texts.append(text) rec_scores.append(rec_score) rec_char_scores.append(rec_char_score) char_polygons.append(char_polygon) # segmss.append(segms) return texts, rec_scores, rec_char_scores, char_polygons def mask2polygon(self, mask, box, im_size, threshold=0.5, output_polygon=True): # mask 32*128 image_width, image_height = im_size[1], im_size[0] box_h = box[3] - box[1] box_w = box[2] - box[0] cls_polys = (mask * 255).astype(np.uint8) poly_map = np.array(Image.fromarray(cls_polys).resize((box_w, 
box_h))) poly_map = poly_map.astype(np.float32) / 255 poly_map = cv2.GaussianBlur(poly_map, (3, 3), sigmaX=3) ret, poly_map = cv2.threshold(poly_map, 0.5, 1, cv2.THRESH_BINARY) if output_polygon: SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) poly_map = cv2.erode(poly_map, SE1) poly_map = cv2.dilate(poly_map, SE1) poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1) try: _, contours, _ = cv2.findContours( (poly_map * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) except: contours, _ = cv2.findContours( (poly_map * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) if len(contours) == 0: print(contours) print(len(contours)) return None max_area = 0 max_cnt = contours[0] for cnt in contours: area = cv2.contourArea(cnt) if area > max_area: max_area = area max_cnt = cnt perimeter = cv2.arcLength(max_cnt, True) epsilon = 0.01 * cv2.arcLength(max_cnt, True) approx = cv2.approxPolyDP(max_cnt, epsilon, True) pts = approx.reshape((-1, 2)) pts[:, 0] = pts[:, 0] + box[0] pts[:, 1] = pts[:, 1] + box[1] polygon = list(pts.reshape((-1, ))) polygon = list(map(int, polygon)) if len(polygon) < 6: return None else: SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) poly_map = cv2.erode(poly_map, SE1) poly_map = cv2.dilate(poly_map, SE1) poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1) idy, idx = np.where(poly_map == 1) xy = np.vstack((idx, idy)) xy = np.transpose(xy) hull = cv2.convexHull(xy, clockwise=True) # reverse order of points. if hull is None: return None hull = hull[::-1] # find minimum area bounding box. rect = cv2.minAreaRect(hull) corners = cv2.boxPoints(rect) corners = np.array(corners, dtype="int") pts = get_tight_rect(corners, box[0], box[1], image_height, image_width, 1) polygon = [x * 1.0 for x in pts] polygon = list(map(int, polygon)) return polygon def visualization(self, img, polygons, words): cur_img = copy.deepcopy(img) for polygon, word in zip(polygons, words): pts = np.array(polygon, np.int32) pts = pts.reshape((-1, 1, 2)) xmin = min(pts[:, 0, 0]) ymin = min(pts[:, 0, 1]) r = random.randint(0, 255) g = random.randint(0, 255) b = random.randint(0, 255) cv2.polylines(cur_img, [pts], True, (b, g, r)) cv2.putText(cur_img, word, (xmin, ymin), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (b, g, r), 1) return cur_img
class MagicRecognition: def __init__(self, file_all_cards: str, file_keywords: str, languages=tuple("English"), max_ratio_diff=0.3, max_ratio_diff_keyword=0.2) -> None: """Load dictionnaries of cards and keywords Parameters ---------- file_all_cards: str Path to the file containing all cards. If the file does not exist, it is downloaded from mtgjson. file_keywords: str Path to the file containing all keywords. If the file does not exist, it is downloaded from mtgjson. max_ratio_diff : float, optional Maximum ratio (distance/length) for a text to be considered as a card name, by default 0.3 max_ratio_diff_keyword : float, optional Maximum ratio (distance/length) for a text to be considered as a (ignored) keyword, by default 0.2 """ self.max_ratio_diff = max_ratio_diff self.max_ratio_diff_keyword = max_ratio_diff_keyword if not Path(file_all_cards).is_file(): def write_card(f, card): i = card.find(" //") if i != -1: card = card[:i] f.write(card + "$1\n") # required for SymSpell all_cards_json = load_json(URL_ALL_CARDS) with Path(file_all_cards).open("a") as f: for card, l in all_cards_json["data"].items(): if "English" in languages: write_card(f, card) for e in l[0]["foreignData"]: if e["language"] in languages: write_card(f, e["name"]) self.sym_all_cards = SymSpell(max_dictionary_edit_distance=6) self.sym_all_cards._distance_algorithm = editdistance.DistanceAlgorithm.LEVENSHTEIN self.sym_all_cards.load_dictionary(file_all_cards, 0, 1, separator="$") self.all_cards = self.sym_all_cards._words print(f"Loaded {file_all_cards}: {len(self.all_cards)} cards") self.edit_dist = editdistance.EditDistance(editdistance.DistanceAlgorithm.LEVENSHTEIN) if not Path(file_keywords).is_file(): keywords = load_json(URL_KEYWORDS) json.dump(keywords, Path(file_keywords).open("w")) def concat_lists(LL): res = [] for L in LL: res.extend(L) return res keywords_json = json.load(Path(file_keywords).open()) keywords = concat_lists(keywords_json["data"].values()) keywords.extend(["Display", "Land", "Search", "Profile"]) self.sym_keywords = SymSpell(max_dictionary_edit_distance=3) for k in keywords: self.sym_keywords.create_dictionary_entry(k, 1) print(f"Loaded {file_keywords}: {len(keywords)} cards") def _preprocess(self, text: str) -> str: """Remove characters which can't appear on a Magic card (OCR error)""" return re.sub("[^a-zA-Z',. ]", '', text).rstrip(' ') def _preprocess_texts(self, box_texts: BoxTextList) -> None: """Apply `preprocess` on each text""" for box_text in box_texts: box_text.text = self._preprocess(box_text.text) def box_texts_to_cards(self, box_texts: BoxTextList) -> BoxTextList: """Recognize cards from raw texts""" box_texts.sort() box_cards = BoxTextList() for box, text, _ in box_texts: sug = self.sym_keywords.lookup(text, Verbosity.CLOSEST, max_edit_distance=min(3, int(self.max_ratio_diff_keyword * len(text)))) if sug != []: logging.info(f"Keyword rejected: {text} {sug[0].distance/len(text)} {sug[0].term}") else: card = self._search(self._preprocess(text)) if card is not None: box_cards.add(box, card) return box_cards def _assign_stacked(self, box_texts: BoxTextList, box_cards: BoxTextList) -> None: """Set multipliers (e.g. 
x4) for each (stacked) card in `box_cards` Parameters ---------- box_texts : BoxTextList BoxTextList containing potential multipliers box_cards : BoxTextList BoxTextList containing recognized cards """ def _assign_stacked_one(box_cards: BoxTextList, m: int, comp) -> None: i_min = 0 for i, box_card in enumerate(box_cards): if comp(box_card.box, box_cards[i_min].box): i_min = i box_cards[i_min].n = m logging.info(f"{box_cards[i_min].text} assigned to x{m}") def dist(p: tuple, q: tuple) -> float: return (p[0] - q[0])**2 + (p[1] - q[1])**2 def comp_md(box1: tuple, box2: tuple, box: tuple) -> float: if box1[0] > box[0] or box1[1] > box[1]: return False return dist(box, box1) < dist(box, box2) def comp_sb(box1: tuple, box2: tuple, box: tuple) -> float: return dist(box, box1) < dist(box, box2) comp = (comp_md, comp_sb) for box, text, _ in box_texts: if len(text) == 2: for i in [0, 1]: if text[i] in '×xX' and text[1 - i].isnumeric(): _assign_stacked_one(box_cards, int(text[1 - i]), partial(comp[i], box=box)) def _box_cards_to_deck(self, box_cards: BoxTextList) -> Deck: """Convert recognized cards to decklist""" maindeck, sideboard = Pile(), Pile() n_cards = sum(c.n for c in box_cards) n_added = 0 last_main_card = max(60, n_cards - 15) for _, card, n in box_cards: def add_cards(c, deck, p): if c in deck.cards: deck.cards[c] += p elif p > 0: deck.cards[c] = p n_added_main = max(min(n, last_main_card - n_added), 0) add_cards(card, maindeck, n_added_main) add_cards(card, sideboard, n - n_added_main) n_added += n deck = Deck() deck.maindeck = maindeck deck.sideboard = sideboard return deck def box_texts_to_deck(self, box_texts: BoxTextList) -> Deck: """Convert raw texts to decklist Parameters ---------- box_texts : BoxTextList Raw texts given by an OCR Returns ------- Deck Decklist obtained from `box_texts` """ box_cards = self.box_texts_to_cards(box_texts) self._assign_stacked(box_texts, box_cards) return self._box_cards_to_deck(box_cards) def _search(self, text): """If `text` can be recognized as a Magic card, return that card. Otherwise, return None.""" if len(text) < 3: # a card name is never that short return None if len(text) > 30: # a card name is never that long logging.info(f"Too long: {text}") return None if text in self.all_cards: return text i = text.find("..") # search for truncated card name if i != -1: dist = int(self.max_ratio_diff * i) card = None for c in self.all_cards: d = self.edit_dist.compare(text[:i], c[:i], dist) if d != -1 and d < dist: card = c dist = d if card is None: logging.info(f"Not prefix: {text}") else: logging.info(f"Found prefix: {text} {dist/i} {card}") return card else: text = text.replace('.', '').rstrip(' ') sug = self.sym_all_cards.lookup(text, Verbosity.CLOSEST, max_edit_distance=min(6, int(self.max_ratio_diff * len(text)))) if sug != []: card = sug[0].term ratio = sug[0].distance / len(text) if len(text) < len(card) + 7: logging.info(f"Corrected: {text} {ratio} {card}") return card logging.info(f"Not corrected (too long): {text} {ratio} {card}") else: logging.info(f"Not found: {text}") return None
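# Hedged illustration (added; the terms, counts and query below are examples only): the keyword
# filter above is simply a second SymSpell index built in memory with create_dictionary_entry,
# so near-matches to interface words such as "Land" or "Profile" can be rejected before the
# card-name lookup runs:
#
#   sym = SymSpell(max_dictionary_edit_distance=3)
#   for kw in ["Display", "Land", "Search", "Profile"]:
#       sym.create_dictionary_entry(kw, 1)
#   sym.lookup("Lnd", Verbosity.CLOSEST, max_edit_distance=3)[0].term   # -> "Land"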
def SymSpell() -> SymSpellPy:
    symspell = SymSpellPy()
    symspell.load_dictionary('data/dictionary.txt', 0, 1)
    return symspell
class spellchecker:
    def __init__(
        self,
        max_dictionary_edit_distance,
        prefix_length,
        unigram_freq_file,
        bigram_freq_file=None,
        pickle_file=None,
    ):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )
        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file)
        else:
            self.sym_spell.load_dictionary(
                unigram_freq_file,
                term_index=0,
                count_index=1,
                encoding="utf-8",
            )
        if bigram_freq_file:
            self.sym_spell.load_bigram_dictionary(
                bigram_freq_file,
                term_index=0,
                count_index=2,
                encoding="utf-8",
            )

    def suggest(
        self,
        word,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        # defaults
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        # spellcheck
        suggestions = self.sym_spell.lookup(
            word,
            verbosity,
            max_edit_distance=max_edit_dist,
            include_unknown=include_unknown,
        )
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(
        self,
        phrase,
        max_edit_dist=None,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        # spellcheck
        suggestions = self.sym_spell.lookup_compound(
            phrase,
            max_edit_distance=max_edit_dist,
            # ignore_non_words=False,
            # split_phrase_by_space=True,
        )
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        return tokenize_sentence(phrases)

    # Tokenize into individual words and return a list of suggestions for each
    def suggest_tokenize(
        self,
        phrases,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        words = self.tokenize(phrases)
        sentence_suggestions = []
        for word in words:
            suggestions = self.sym_spell.lookup(
                word,
                verbosity,
                max_edit_distance=max_edit_dist,
                include_unknown=include_unknown,
            )
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })
        return sentence_suggestions
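# A small usage sketch of the wrapper above (not from the original source): the
# frequency-list path is hypothetical and DEFAULT_MAX_EDIT_DISTANCE is assumed to be
# defined at module level, as the methods above expect.
checker = spellchecker(max_dictionary_edit_distance=2,
                       prefix_length=7,
                       unigram_freq_file="frequency_dictionary_en_82_765.txt")
print(checker.suggest("helo"))                       # single-word lookup
print(checker.suggest_compound("whereis th elove"))  # multi-word lookup_compound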
def pipeline_ocr_export_no_binarization(SHAPE_ROTATION_RESULTS_PATH,
                                        EXPORT_RESULTS_PATH,
                                        tess_config,
                                        batch_size,
                                        limit=None):
    '''
    Run this function if binarization has already been applied or is not needed.
    SHAPE_ROTATION_RESULTS_PATH - source (files from rotation output)
    EXPORT_RESULTS_PATH - export
    '''
    # load and init symspell library
    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    # load gridsearch data
    df_valid_files = pd.read_csv(SHAPE_ROTATION_RESULTS_PATH)
    valid_files_list = list(df_valid_files.file)
    print('total files:\t', len(valid_files_list))

    # get new files
    col_names = [
        'row_nr', 'level', 'page_num', 'block_num', 'par_num', 'line_num',
        'word_num', 'left', 'top', 'width', 'height', 'conf', 'text',
        'text_low', 'symspell_sc', 'symspell_ws', 'file'
    ]
    new_files = get_new_files_to_be_processed(path=EXPORT_RESULTS_PATH,
                                              col_names=col_names,
                                              index_col='file',
                                              all_files=valid_files_list)

    # keep only the filename column for the new files
    new_files = df_valid_files[df_valid_files.file.isin(new_files)].file.to_list()
    if limit is not None:
        new_files = new_files[:limit]
    new_files_batches = list(divide_chunks(new_files, batch_size))
    print('total batches', len(new_files_batches), 'total files', len(new_files))

    conf = tess_config
    if len(new_files_batches) > 0:
        for batch in new_files_batches[:]:
            args_1 = batch
            args_2 = len(args_1) * [sym_spell]
            args_3 = len(args_1) * [conf]
            args_4 = len(args_1) * [None]
            args_5 = len(args_1) * [True]
            args_6 = len(args_1) * [False]
            args_7 = len(args_1) * [0]
            args_8 = len(args_1) * [False]
            all_args = list(
                zip(args_1, args_2, args_3, args_4, args_5, args_6, args_7, args_8))
            print("Starting Batch..")
            pool = multiprocessing.Pool()
            results = pool.starmap(extract_text_from_image, all_args)
            df_tmp_results = pd.concat(results).to_records()
            with open(EXPORT_RESULTS_PATH, 'a', newline='') as outcsv:
                writer = csv.writer(outcsv)
                writer.writerows(df_tmp_results)
            cv2.destroyAllWindows()
            pool.close()
            pool.terminate()
class WordCorrection:
    """
    Provides the ability to correct input.
    """
    __instance = None

    @staticmethod
    def get_instance(edit_distance=None):
        """Static access method."""
        if WordCorrection.__instance is None:
            WordCorrection(edit_distance)
        return WordCorrection.__instance

    def __init__(self, edit_distance=None):
        """
        Creates the SpellChecker object to be used for word correction.
        Language frequency lists are downloaded from a Git dictionary.

        Args:
            edit_distance (int): The maximum edit distance used for the correction (default is 2).
        """
        if WordCorrection.__instance is not None:
            raise Exception("An instance of this class already exists")
        WordCorrection.__instance = self
        self.settings_manager = SettingsManager.get_instance()
        # attach the object so it gets notified when the language changes
        self.settings_manager.attach(self)
        self.language = self.settings_manager.get_language()
        self.spell = SymSpell(max_dictionary_edit_distance=edit_distance)
        self.spell.load_dictionary(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "dictionaries", "frequency_lists", self.language + ".txt"),
            0, 1, encoding="utf-8")

    def update(self):
        """Updates the language in use if it has been changed."""
        new_lang = self.settings_manager.get_language()
        if self.language == new_lang:
            return
        self.language = new_lang
        self.spell.load_dictionary(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "dictionaries", "frequency_lists", self.language + ".txt"),
            0, 1, encoding="utf-8")

    def correct(self, word):
        """
        Returns a correction of the word.

        Args:
            word (string): The word to be corrected.

        Returns:
            string: Word as predicted by the module.
        """
        suggestions = self.spell.lookup(word,
                                        verbosity=Verbosity.CLOSEST,
                                        include_unknown=True,
                                        transfer_casing=True)
        suggestion = str(suggestions[0]).split(", ")[0]
        return suggestion
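# A usage sketch for the singleton above (not from the original source); it assumes the
# SettingsManager and the per-language frequency lists under dictionaries/frequency_lists
# are already in place.
corrector = WordCorrection.get_instance(edit_distance=2)
print(corrector.correct("exampel"))  # returns the top SymSpell suggestion as a string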
def extract_for_hocs_showcase(GS_PATH, HOCR_RESULTS_PATH, HOCR_DIR, conf, limit,
                              N_CPU, batch_size):
    '''
    df_gs - input dataframe from gridsearch
    '''
    # init symspell library
    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    # index list of filenames
    filename_path = HOCR_DIR + 'filenames_index.csv'
    files = ['choose file>']
    language = 'eng+deu+fra'

    # filter
    df_valid_files = pd.read_csv(GS_PATH)
    df_valid_files = df_valid_files[df_valid_files.length > 100]
    df_valid_files = df_valid_files.sort_values(by=["measure"], ascending=False)
    valid_files_list = list(df_valid_files.file)
    print('total files:\t', len(valid_files_list))

    # get new files
    col_names = ['file', 'name']
    new_files = get_new_files_to_be_processed(path=HOCR_RESULTS_PATH,
                                              col_names=col_names,
                                              index_col='file',
                                              all_files=valid_files_list)

    # get array-shaped input parameters
    new_files = df_valid_files[df_valid_files.file.isin(new_files)].values
    if limit is not None:
        new_files = new_files[:limit]
    new_files_batches = list(divide_chunks(new_files, batch_size))
    print('total batches', len(new_files_batches), 'total files', len(new_files))

    files = []
    cv2.destroyAllWindows()
    if len(new_files_batches) > 0:
        for batch in new_files_batches[:]:
            args_1 = batch
            args_2 = len(args_1) * [HOCR_DIR]
            args_3 = len(args_1) * [sym_spell]
            args_4 = len(args_1) * [conf]
            all_args = list(zip(args_1, args_2, args_3, args_4))
            print("Starting Batch..")
            pool = multiprocessing.Pool(processes=N_CPU)
            results = pool.starmap(single_hocr_extract, all_args)
            files.extend(results)
            with open(HOCR_RESULTS_PATH, 'a', newline='') as outcsv:
                writer = csv.writer(outcsv)
                writer.writerows(results)
            cv2.destroyAllWindows()
            pool.close()
            pool.terminate()

    filename_path = HOCR_DIR + 'filenames_index.csv'
    files = ['data/' + i[0] for i in files]
    pd.DataFrame({'filename': files}).to_csv(filename_path, index=False)
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a dict of values.
    Keys are `review`, `emojis`, and `emoji_sentiment`.
    """
    def __init__(self, lang='ta'):
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module. Create the
        # file alltextslang.txt by calling detect_lang_and_store in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
        self.ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0,
            count_index=1,
            separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        lmap = {}
        with open(mapfile, 'r') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)
            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment
            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word,
                                                        Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if len(suggestions) > 0 and suggestions[0].distance < 3:
                        print(word, suggestions[0].term)
                    # otherwise no match with the dictionary: we need a more
                    # comprehensive dictionary plus phonetic similarity
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correction here for added normalisation
            # print(lang, review_trans)
            features['soundexes'][i] = ' '.join(
                [self.soundexer.soundex(word) for word in review_trans.split()])
        return features
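# A hedged sketch (not from the original source) of how the transformer above could be
# driven on its own; `reviews` is a placeholder list of strings and the resource files
# referenced in __init__ are assumed to exist.
extractor = FeatureExtractor(lang='ta')
features = extractor.fit(reviews).transform(reviews)
print(features['lang_tag'][0], features['soundexes'][0])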
class Spellchecker:
    """The class responsible for token spellchecking"""

    MAX_EDIT_DISTANCE = 3
    MAX_LEVENSHTEIN_DISTANCE = 4

    def __init__(self, dictionary_path):
        """
        Args:
            dictionary_path (str): the path of the token frequency dict
        """
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=Spellchecker.MAX_EDIT_DISTANCE,
        )
        loaded = self.sym_spell.load_dictionary(dictionary_path,
                                                term_index=0,
                                                count_index=1,
                                                separator=" ")
        assert loaded
        print("Loaded SymSpell dictionary.")

    def _replace_in_df(self, df, to_replace, value):
        df["TOKEN"] = df["TOKEN"].str.replace(to_replace, value)

    def manual_spellcheck(self, df):
        """
        Manual spellcheck that looks for KNOWN and repeatable patterns in the data
        that can be replaced by a correct token/word.

        Args:
            df (pandas.Dataframe): the dataframe that will get spellchecked
        """
        self._replace_in_df(df, "jebesmal", "jedesmal")
        self._replace_in_df(df, "Thatsache", "tat")
        self._replace_in_df(df, "Giengen", "gingen")
        self._replace_in_df(df, "Gcgenthcrl", "Gegenteil")
        self._replace_in_df(df, "\? unastie", "Dynastie")
        self._replace_in_df(df, "L rnnburg", "Luxemburg")
        self._replace_in_df(df, "Jstzt", "jetzt")
        self._replace_in_df(df, "Glaubensbekcnntniß", "Glaubenbekenntniss")
        self._replace_in_df(df, "T u r i n", "Turin")
        self._replace_in_df(df, "nöthicM", "nötigen")

    def automated_spellcheck(self, df):
        """
        Automated spellcheck that looks for unpredictable OCR errors in the data
        and replaces them with hopefully correct tokens.

        Args:
            df (pandas.Dataframe): the dataframe that will get spellchecked
        """
        wikidata_client = Client()
        tokenizer = Tokenizer(replace_not_contraction=False)
        ignore_regex_str = "^[0-9.!?*„_\-\—,;:<>='|\[\]\"()^«»/°•©>]+"
        ignore_regex = re.compile(ignore_regex_str)
        for index, row in df.iterrows():
            # Get the token
            token = row["TOKEN"]
            wiki_metadata = row["NEL-LIT"]
            # Autocorrect
            suggestions = self.sym_spell.lookup(
                token,
                Verbosity.TOP,
                transfer_casing=True,
                include_unknown=True,
                ignore_token=ignore_regex_str,
                max_edit_distance=Spellchecker.MAX_EDIT_DISTANCE)
            # Save the first suggestion if we have one
            if suggestions and suggestions[0].term != token.lower():
                if wiki_metadata.startswith('Q'):
                    # 1. 'Qxxxx' - Use the Wikidata column value to spellcheck
                    if ignore_regex.match(token):
                        # token should be ignored
                        continue
                    wikidata_entity = wikidata_client.get(wiki_metadata)
                    try:
                        wikidata_label = wikidata_entity.attributes['labels']['de']['value']
                    except KeyError:
                        # the wikidata entity has no 'de' label, skip spell correction
                        continue
                    wikidata_labels = tokenizer.tokenize(wikidata_label)
                    wikidata_labels = map(lambda t: t.value, wikidata_labels)
                    wikidata_labels = filter(lambda t: not ignore_regex.match(t),
                                             wikidata_labels)
                    wikidata_labels = list(wikidata_labels)
                    # Check if the token is not an abbreviation
                    is_abbreviation = False
                    for sublabel in wikidata_labels:
                        if sublabel.startswith(token):
                            print(token, "(abbrev) ->", sublabel, " | ", wiki_metadata)
                            df.at[index, 'TOKEN'] = sublabel
                            is_abbreviation = True
                            break
                    if is_abbreviation:
                        continue
                    try:
                        best_match = sorted(wikidata_labels,
                                            key=lambda t: distance(t, token))[0]
                    except IndexError:
                        continue
                    if distance(best_match, token) <= Spellchecker.MAX_LEVENSHTEIN_DISTANCE:
                        print(token, "(best_match) ->", best_match, " | ", wiki_metadata)
                        df.at[index, 'TOKEN'] = best_match
                else:
                    # 2. 'NIL' / '_' - Use the SymSpell suggestion
                    suggestion = suggestions[0].term
                    print(token, "(symspell) ->", suggestion, " | ", wiki_metadata)
                    df.at[index, 'TOKEN'] = suggestion
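# A minimal driver for the class above (not from the original source): the dictionary
# path is hypothetical and the dataframe only needs the TOKEN and NEL-LIT columns that
# the two spellcheck methods read.
import pandas as pd

checker = Spellchecker("word_freq_list_de.txt")
df = pd.DataFrame({"TOKEN": ["Thatsache", "Jstzt"], "NEL-LIT": ["_", "_"]})
checker.manual_spellcheck(df)
checker.automated_spellcheck(df)
print(df["TOKEN"].tolist())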
STOPWORDS_NLTK = set(stopwords.words('english'))
STOPWORDS_SPACY = sp.Defaults.stop_words
lemmatiser_nltk = WordNetLemmatizer()
nlp = spacy.load(name='en_core_web_sm', disable=['parser', 'ner'])

# set max_dictionary_edit_distance=0 to avoid spelling correction
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)
# term_index is the column of the term and
# count_index is the column of the term frequency
sym_spell.load_dictionary(corpus=dictionary_path, term_index=0, count_index=1)
sym_spell.load_dictionary(corpus=bigram_path, term_index=0, count_index=1)


def get_n_word_strings(terms: list, n: int) -> list:
    """
    Extract all n-word strings from a list of varying n-word strings.

    :param terms: List of strings to extract from.
    :param n: Integer of words in a string to extract by.
    :return: List of n-word strings.
    """
    try:
        if isinstance(terms, str):
            terms = list(terms)
            return get_n_word_strings(terms, n)
from itertools import islice

import pkg_resources
from symspellpy import SymSpell

sym_spell = SymSpell()
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, 0, 1)
print(list(islice(sym_spell.words.items(), 5)))
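# A short companion example (not from the original source) showing a plain lookup on
# the same bundled English dictionary; term, distance and count are attributes of the
# SuggestItem objects returned by lookup.
from symspellpy import Verbosity

for suggestion in sym_spell.lookup("memebers", Verbosity.CLOSEST, max_edit_distance=2):
    print(suggestion.term, suggestion.distance, suggestion.count)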
def test_lookup_compound_no_bigram(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)

    typo = "whereis th elove"
    correction = "whereas the love"
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(2, results[0].distance)
    self.assertEqual(64, results[0].count)

    typo = "the bigjest playrs"
    correction = "the biggest players"
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(2, results[0].distance)
    self.assertEqual(34, results[0].count)

    typo = "Can yu readthis"
    correction = "can you read this"
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(3, results[0].distance)
    self.assertEqual(3, results[0].count)

    typo = ("whereis th elove hehad dated forImuch of thepast who "
            "couqdn'tread in sixthgrade and ins pired him")
    correction = ("whereas the love head dated for much of the past who "
                  "couldn't read in sixth grade and inspired him")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(9, results[0].distance)
    self.assertEqual(0, results[0].count)

    typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
    correction = ("in the third quarter of last year he had learned of "
                  "a secret plan")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(9, results[0].distance)
    self.assertEqual(0, results[0].count)

    typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
            "of funn")
    correction = ("the biggest players in the strong summer film slate "
                  "with plenty of fun")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(9, results[0].distance)
    self.assertEqual(0, results[0].count)

    typo = "Can yu readthis messa ge despite thehorible sppelingmsitakes"
    correction = ("can you read this message despite the horrible "
                  "spelling mistakes")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(10, results[0].distance)
    self.assertEqual(0, results[0].count)
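# A hedged counterpart to the test above (not from the original source): loading the
# bundled bigram dictionary as well (count in column 2) generally lets lookup_compound
# choose better word segmentations; the exact corrections are not asserted here.
import pkg_resources
from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
unigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell.load_dictionary(unigram_path, 0, 1)
sym_spell.load_bigram_dictionary(bigram_path, 0, 2)
print(sym_spell.lookup_compound("whereis th elove", max_edit_distance=2)[0].term)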
def preprocess(tweets, params=None):
    """
    INPUT:
        list: tweets (e.g. pos_train, neg_train) -> a list of raw tweets.
        dict: params (to pass options to the preprocess) -> e.g.
            params={'stemmerize': True, 'stop_words_removal': True, 'lemmatize': True,
                    'emoticons_2_str': True, 'expand_not': True, 'autocorr': True,
                    'viterbi_segmenter': True, 'del_deplicates': True}
    OUTPUT:
        list: a list of lists of tokens -> each tweet is a list composed of tokens
    DESCRIPTION:
        Implements a series of standard text preprocessing steps on the input tweets,
        including: remove all non-alpha words, remove all stopwords, remove hashtags,
        lemmatize and stemmize words with NLTK, etc.
    """
    if params is None:
        return tweets

    stemmerize = params.get('stemmerize')
    lemmatize = params.get('lemmatize')
    stop_words_removal = params.get('stop_words_removal')
    emoticons_2_str = params.get('emoticons_2_str')
    expand_not = params.get('expand_not')
    autocorr = params.get('autocorr')
    viterbi_segmenter = params.get('viterbi_segmenter')
    del_deplicates = params.get('del_deplicates')

    # Delete duplicates in the input datasets, respectively 9.77% (9777) and 8.91% (8912).
    # Should be False when processing test_tweets.
    if del_deplicates:
        tweets = delete_deplicates(tweets)

    # tokenize tweets using the tokenizer tool from nltk.TweetTokenizer
    tweets = tokenize(tweets)

    # remove stopwords
    if stop_words_removal:
        for index in range(len(tweets)):
            tweets[index] = list(
                filter(lambda x: x not in invalid_string and x not in stop_words,
                       tweets[index]))

    if emoticons_2_str:
        # turn emoticons into strings
        for index in range(len(tweets)):
            for i in range(len(tweets[index])):
                if tweets[index][i] in emoticonsHappySet:
                    tweets[index][i] = 'happy'
                if tweets[index][i] in emoticonsSadSet:
                    tweets[index][i] = 'sad'

    if stemmerize:
        # lemmatize/stemmize words with NLTK (Lemmatization == Stemming++)
        for index in range(len(tweets)):
            for i in range(len(tweets[index])):
                tweets[index][i] = stemmer.stem(tweets[index][i])

    if lemmatize:
        for index in range(len(tweets)):
            for i in range(len(tweets[index])):
                tweets[index][i] = lemmatize_single(tweets[index][i])

    if lemmatize or stemmerize or expand_not or emoticons_2_str:
        for index in range(len(tweets)):
            # again remove the empty strings
            tweets[index] = list(filter(lambda x: x, tweets[index]))
            # join and resplit so that 'you re' becomes 'you', 'are', etc.
            tweets[index] = (" ".join(tweets[index])).strip().split()
            # again remove stopwords
            tweets[index] = list(
                filter(lambda x: x not in invalid_string and x not in stop_words,
                       tweets[index]))

    if autocorr:
        symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        symspell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        pos_train = autocorrect(tweets, symspell)

    if viterbi_segmenter:
        pos_train = join_resplit(tweets_viterbi_segment(pos_train))
        pos_train = filter_one_letter_word(pos_train)

    for index in range(len(tweets)):
        tweets[index] = ' '.join(tweets[index])
    return tweets
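# A hedged call of the function above (not from the original source), using the option
# names listed in its docstring; the tweet string is a placeholder.
params = {'stop_words_removal': True, 'lemmatize': True,
          'emoticons_2_str': True, 'autocorr': True}
cleaned = preprocess(["I am sooo happpy today :)"], params=params)
print(cleaned)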
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.tokenize import word_tokenize
from language_detector import detect_language
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
if sym_spell.word_count:
    pass
else:
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

###################################
#### sentence level preprocess ####
###################################


# lowercase + base filter
# some basic normalization
def f_base(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # normalization 1: xxxThis is a --> xxx. This is a (missing delimiter)
    s = re.sub(r'([a-z])([A-Z])', r'\1\. \2', s)  # before lower case
* \
    https://github.com/mammothb/symspellpy
"""
from typing import List

from symspellpy import SymSpell, Verbosity

from pythainlp.corpus import get_corpus_path
from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.tokenize import word_tokenize

_UNIGRAM = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(path_pythainlp_corpus(_UNIGRAM),
                          0,
                          1,
                          separator='\t',
                          encoding="utf-8-sig")
sym_spell.load_bigram_dictionary(get_corpus_path(_BIGRAM),
                                 0,
                                 2,
                                 separator='\t',
                                 encoding="utf-8-sig")


def spell(text: str, max_edit_distance: int = 2) -> List[str]:
    return [
        str(i).split(',')[0] for i in list(
            sym_spell.lookup(text,
                             Verbosity.CLOSEST,
                             max_edit_distance=max_edit_distance))
    ]
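# A small usage sketch (not from the original source): tokenize a sentence with the
# word_tokenize imported above and ask for candidate corrections of each token; the
# Thai input string is a placeholder and the output depends on the TNC frequency lists.
for token in word_tokenize("ทดสอบการสะกดคำ"):
    print(token, spell(token))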