refFileName = 'referenceText.txt'
# Set frequency list file name
freqFileName = 'frequency_list_aze.txt'
# Set vocabulary file name
vocabFileName = 'wordList.pickle'

# Read the wordList which is Azerbaijani vocabulary
with open(vocabFileName, 'rb') as handle:
    vocabulary = pickle.load(handle)

# Create frequency list of Azerbaijani language
frequencyList = hf.createFrequencyList(refFileName, vocabulary['Words'].tolist())

# Write frequency list into .txt file. Pickle can also be used for the same purpose
# Leave this disabled if the .txt file has already been generated
# hf.writeTXT(frequencyList, freqFileName)
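
# A minimal sketch of the pickle alternative mentioned above (the .pickle file name is hypothetical):
with open('frequency_list_aze.pickle', 'wb') as handle:
    pickle.dump(frequencyList, handle, protocol=pickle.HIGHEST_PROTOCOL)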

# Create SymSpell object and read the frequency list
symspell = SymSpell()
symspell.load_dictionary(freqFileName, 0, 1, encoding="utf-8")

word = "ilım"
# Use documentation to perform custom edits
# https://symspellpy.readthedocs.io/en/latest/api/index.html
suggestions = symspell.lookup(word, Verbosity.CLOSEST,
                               max_edit_distance=2, include_unknown=True)

# Print the suggestions for the word, ordered by edit distance and then term frequency
for suggestion in suggestions:
    print(suggestion)
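
# A hedged variation on the lookup above: Verbosity.ALL returns every dictionary candidate
# within max_edit_distance, not only the closest ones (see the docs linked above).
for suggestion in symspell.lookup(word, Verbosity.ALL, max_edit_distance=2):
    print(suggestion)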
Example #2
class Speller:
    def __init__(self, **kwargs):

        if 'dev' in kwargs:

            print(
                "Spellchecker run in development mode...small sample of checking items loaded only"
            )
            self.flagDev = True

            self.abkuerzung_path = "inject/Abkuerzungen.csv"
            self.word_freq_path = "inject/word_freq_test.txt"

        else:

            self.abkuerzung_path = "inject/Abkuerzungen.csv"
            self.word_freq_path = "inject/word_freq_list_overall.txt"

        print("Initializing spellchecker...")
        self.loadAbbreviations()
        self.loadSymSpell()
        print("Initializing spellchecker complete")

    def loadSymSpell(self):
        self.sym_spell = SymSpell()
        self.sym_spell.load_dictionary(self.word_freq_path,
                                       0,
                                       1,
                                       encoding="utf8")

    def loadAbbreviations(self):
        self.Abkuerzungen = pd.read_csv(self.abkuerzung_path,
                                        sep=";",
                                        header=0,
                                        encoding="latin-1")
        self.Abkuerzungen['lower_case'] = [
            str(x).lower().strip(".")
            for x in list(self.Abkuerzungen['Abkuerzung'])
        ]
        self.Abkuerzungen = self.Abkuerzungen.dropna()
        self.Abkuerzungen = self.Abkuerzungen.reset_index()

    def dev(self, objId):
        self.db = DB()
        self.objId = objId
        self.process_obj(self.objId)

    def process_obj(self, obj):

        self.label_obj = self.db.mongo_db.labels.find_one(
            {"_id": ObjectId(self.objId)})

        for p in self.label_obj["pages"]:
            if "read_text_raw" in p:
                s = p["read_text_raw"]
                c_s = self.check_string(s)
                p["read_text"] = c_s

    def store_obj(self):
        self.db.mongo_db.labels.replace_one({"_id": ObjectId(self.objId)},
                                            self.label_obj)

    def get_store_obj(self):
        return self.label_obj

    def check_string(self, in_string):
        # spell correction
        string_corr = []
        string = in_string.replace("\n",
                                   ' break ')  # problems with processing '\n'
        for word in string.split(
                ' '
        ):  # process word by word since the compound version deletes any special characters
            word = word.strip('\r\n')
            if (word == "AUF") | (word == "AU"):
                word = 'Arbeitsunfähigkeit'
            if (word == "P:"):
                word = 'Patient:'
            if word not in ('', ',', '!', '.', ';', ':', '?', '-',
                            '') and word.lower().strip(".,") not in list(
                                self.Abkuerzungen['lower_case']):
                input_term = word
                if input_term[-1] in ("?", "!", ".", ",", ":", ";"):
                    input_term = input_term[:-1]
                # max edit distance per lookup (per single word, not per whole input string)
                suggestions = self.sym_spell.lookup(
                    input_term,
                    Verbosity.CLOSEST,
                    max_edit_distance=2,
                    transfer_casing=True,
                    ignore_token=r".*[()].*",
                    include_unknown=True
                )  # display suggestion term, edit distance, and term frequency

                if word[-1] in ("?", "!", ".", ",", ":", ";"):
                    suggestion = str(suggestions[0].term) + word[-1]
                else:
                    suggestion = str(suggestions[0].term)
                if input_term.lower() == str(suggestions[0].term):
                    string_corr = string_corr + [str(word)]
                else:
                    string_corr = string_corr + [suggestion]
            if word.lower().strip(".,:") in list(
                    self.Abkuerzungen['lower_case']):
                string_corr = string_corr + [
                    self.Abkuerzungen['narrativ'][list(
                        self.Abkuerzungen['lower_case']).index(
                            word.lower().strip(".,:"))]
                ]
        string_corr = ' '.join(string_corr)

        string_corr = string_corr.replace("break", '\n')

        return string_corr
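
# A minimal usage sketch for the Speller above (the input text is hypothetical; it assumes the
# inject/ CSV and frequency-list files referenced in __init__ are present):
speller = Speller(dev=True)
print(speller.check_string("Der Patient hat AU wegen einer Gripe\nP: kommt am Montag wieder"))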
Example #3
    def test_lookup_compound_replaced_words(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        replacement_1 = {
            "whereis": "where is",
            "th": "the",
            "elove": "love",
            "hehad": "he had",
            "forimuch": "for much",
            "thepast": "the past",
            "couqdn'tread": "couldn't read",
            "sixthgrade": "sixth grade",
            "ins": "in"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
        for k, v in replacement_1.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        replacement_2 = {
            "te": "the",
            "dhird": "third",
            "qarter": "quarter",
            "oflast": "of last",
            "jear": "year",
            "hadlearned": "had learned",
            "ofca": "of a",
            "sekretplan": "secret plan"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2),
            len(sym_spell.replaced_words))
        for k, v in replacement_2.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        replacement_3 = {
            "bigjest": "biggest",
            "playrs": "players",
            "strogsommer": "strong summer",
            "slatew": "slate",
            "ith": "with",
            "plety": "plenty",
            "funn": "fun"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2) + len(replacement_3),
            len(sym_spell.replaced_words))
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        for k, v in replacement_3.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)
Example #4
                         encoding='utf-8-sig',
                         float_precision='round_trip')
ukrainian_dataframe = dataframe1[dataframe1['predicted_language'] ==
                                 'Ukrainian']
ukrainian_dataframe = ukrainian_dataframe.drop_duplicates()

sym_spell = SymSpell()
with open(
        "/Users/lidiiamelnyk/Downloads/dss-plugin-nlp-preparation-main/resource/dictionaries/uk.txt",
        'r',
        encoding='utf-8-sig') as myfile:
    corpus = myfile.read()
corpus_path = "/Users/lidiiamelnyk/Downloads/dss-plugin-nlp-preparation-main/resource/dictionaries/uk.txt"
symspell_dictionary = sym_spell.load_dictionary(corpus_path,
                                                term_index=0,
                                                count_index=1,
                                                separator=None,
                                                encoding='utf-8-sig')

ukrainian_dataframe['comment'] = ukrainian_dataframe['comment'].astype(str)
ukrainian_dataframe['comments_corrected'] = ukrainian_dataframe[
    'comment'].apply(lambda x: (sym_spell.lookup(x,
                                                 Verbosity.CLOSEST,
                                                 max_edit_distance=0,
                                                 include_unknown=True,
                                                 transfer_casing=False,
                                                 ignore_token=r"\w+\d")))

#for i, row in ukrainian_dataframe.iterrows():
#   if len(row['comments_corrected']) > 0:
#      pass
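
# A hedged sketch of one way to unpack the SuggestItem lists produced above into plain text
# (the new column name and the fallback to an empty string are assumptions):
ukrainian_dataframe['comments_top_suggestion'] = ukrainian_dataframe[
    'comments_corrected'].apply(lambda suggs: suggs[0].term if suggs else '')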
Example #5
    def test_load_dictionary_invalid_path(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        self.assertEqual(False, sym_spell.load_dictionary(
            "invalid/dictionary/path.txt", 0, 1))
Example #6
class Spellchecker(object):
    """
    We use https://github.com/mammothb/symspellpy to do basic word / n-gram based spell checking.
    It is based on SymSpell:
    https://github.com/wolfgarbe/SymSpell

    TODO: We should be very cautious here and only "auto-correct" small one-off OCR errors (like "mnlicious -> malicious"),
    as otherwise we may change domain specific terms and abbreviations.

    TODO: consider optionally using a OCR cloud API,
    like: https://www.abbyy.com/cloud-ocr-sdk/
    https://cloud.ocrsdk.com/demo/

    This gives (much) better results, but it ain't free ;-)
    """

    sym_spell = None
    dictionary_path = None
    max_edit_distance = 1
    max_dict_edit_distance = 2
    prefix_length = 7
    count_threshold = 2

    def __init__(self, language="en"):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=self.max_dict_edit_distance,
            prefix_length=self.prefix_length,
            count_threshold=self.count_threshold,
        )
        # FIXME support non-english languages and custom models
        if language == "en":
            self.dictionary_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
            # term_index is the column of the term and count_index is the
            # column of the term frequency
            self.sym_spell.load_dictionary(self.dictionary_path,
                                           term_index=0,
                                           count_index=1)
        else:
            log.warning(
                f"No spell checking available for language '{language}'")
            self.sym_spell = None

    def suggestions(self, input_term):
        # No suggestions if no spell checker available
        if not self.sym_spell:
            return []

        # max edit distance per lookup
        # (max_edit_distance_lookup <= max_dictionary_edit_distance)
        suggestions = self.sym_spell.lookup(
            input_term,
            Verbosity.CLOSEST,
            max_edit_distance=self.max_edit_distance,
            include_unknown=False,
            transfer_casing=True,
        )
        return suggestions

    def correct_word(self, input_term):
        stripped = str(input_term).strip(string.punctuation)

        # Don't correct words shorter than 4 chars (too risky: it might be a domain-specific abbreviation)
        if len(stripped) < 4:
            return input_term

        # if no more alpha chars than non-alpha chars: ignore
        if len(re.findall(r"[A-Za-z]", stripped)) <= len(
                re.findall(r"[^A-Za-z]", stripped)):
            return input_term

        suggestions = self.suggestions(stripped)

        # No suggestions? Leave it as is
        if len(suggestions) == 0:
            return input_term

        # OCR tokenizer may leave trailing punctuation -> re-add it
        candidate = suggestions[0].term
        for i in [-2, -1]:
            if input_term[i] in string.punctuation:
                candidate += input_term[i]

        return candidate
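
# A minimal usage sketch (the misspelling mirrors the "mnlicious -> malicious" example from the
# docstring above; the exact output depends on the bundled English frequency dictionary):
checker = Spellchecker(language="en")
print(checker.correct_word("mnlicious"))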
nltk.download("punkt")

warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")

TROPICAL_PATH = "tropical_dic.json"
FREQ_DICT_PATH = "frequency_dictionary_es_82_765.txt"
BIGRAM_PATH = "frequency_bigramdictionary_es_1Mnplus.txt"

with open(TROPICAL_PATH, "r") as file:
    tropical_dic = json.load(file)

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(FREQ_DICT_PATH, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(BIGRAM_PATH, term_index=0, count_index=2)

FIRST_INT = 11111111111111
LAST_INT = 99999999999999

PLACEHOLDERS_DICT = {}


@InputSeries(TextSeries)
def fillna(s: TextSeries) -> TextSeries:
    """
    Replaces not assigned values with empty string.


    Examples
Example #8
from pathlib import Path

from fastapi import FastAPI
from symspellpy import SymSpell, Verbosity

from app.schemas import LookupRequest, LookupResponse

MAX_EDIT_DISTANCE = 2
DICTIONARY_PATH = Path('./data/kk.txt')

app = FastAPI()

symspell = SymSpell(max_dictionary_edit_distance=MAX_EDIT_DISTANCE)
symspell.load_dictionary(DICTIONARY_PATH,
                         term_index=0,
                         count_index=1,
                         encoding='utf-8')


@app.get("/")
async def read_root():
    """Displays greeting message in homepage

    Returns:
        dict: a dictionary with greeting message
    """

    return {"message": "✌"}


@app.post("/lookup", response_model=LookupResponse)
Example #9
def symspell_edit_distance_load(dictionary_path, request):
    sym_spell = SymSpell(request.param)
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    return sym_spell, request.param
Example #10
class SpellCheck():
    def __init__(self, init_path=None):
        """Spelling checker: symspellpy==6.5.2.

        https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage.
        https://towardsdatascience.com/essential-text-correction-process-for-nlp-tasks-f731a025fcc3."""
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.set_dictionary_path(init_path)
        self.set_dictionary()
        # self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)

    def set_dictionary_path(self, path):
        if path:
            self.path = path
        else:
            self.path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
        return self.path

    def set_df(self):
        self.df = pd.read_csv(self.path,
                              sep=' ',
                              header=None,
                              dtype={
                                  0: str,
                                  1: int
                              })
        return self.df

    def set_dict(self):
        self.set_df()
        self.dictionary = {
            self.df.loc[i, 0]: self.df.loc[i, 1]
            for i in self.df.index
        }
        return self.dictionary

    def set_dictionary(self):
        self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)
        self.set_dict()
        return None

    def find(self, term):
        return self.dictionary.get(term, 'nothing found')

    def append_dict(self, df_custom, cust_path='./data/cust_freq_dict_en.txt'):
        """Add custom dictionary.

        df: [term, freq]"""
        df_init = self.set_df()
        try:
            df_custom = df_custom.replace([np.inf, -np.inf, np.nan], 99)
            df_custom[1] = df_custom[1].astype(int)
            df = pd.concat([df_init, df_custom], ignore_index=True)
        except Exception as err:
            st.write('something went wrong', err)
            return -1

        # Remove duplicate terms and sort on frequency
        df.drop_duplicates(subset=[0], keep='first', inplace=True)
        df.sort_values(by=[1], ascending=False, inplace=True)

        # Save & Load after adding custom dictionary
        self.set_dictionary_path(cust_path)
        df.to_csv(self.path, sep=' ', index=None, header=None)
        # self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)
        self.set_dictionary()
        return None

    def __call__(self, input_term, N=8):
        """lookup suggestions for single- and multi-word input strings"""
        # Single words or short inputs (< N chars) go through lookup; longer multi-word inputs use lookup_compound
        # https://symspellpy.readthedocs.io/en/latest/api/symspellpy.html#symspellpy.symspellpy.Verbosity
        if (len(input_term.split(' '))) == 1 or (len(input_term) < N):
            suggestions = self.sym_spell.lookup(input_term,
                                                Verbosity.TOP,
                                                max_edit_distance=2,
                                                transfer_casing=True,
                                                include_unknown=True)
        else:
            # Punctuation gets lost!
            suggestions = self.sym_spell.lookup_compound(input_term,
                                                         max_edit_distance=2,
                                                         transfer_casing=True)
        # Suggestion term, term frequency, and edit distance
        # return [(sug.term, sug.count, sug.distance) for sug in suggestions]
        return [sug.term for sug in suggestions][0]
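
# A minimal usage sketch (the input strings are hypothetical; results depend on the loaded dictionary):
sc = SpellCheck()
print(sc('speling'))                       # single word -> lookup
print(sc('thequick brownfox jumpedover'))  # longer multi-word input -> lookup_compound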
Example #11
class SpellChecker:
    """
    Spell check a string with a max edit distance of 2. Correction is
    only applied to purely alphabetic tokens (not alphanumeric ones),
    independently for each token. Tokens of size <= 2 are skipped.
    """

    name = 'spell_checker'

    def __init__(self, vocab_files_path: List[str], tokenizer=str.split):
        """
        Parameters
        ----------
        vocab_files_path : List[str]
            List of file paths to vocabulary used by the spell
            checker for correction. Vocabulary files must be
            one token per line and the tokens should be in the
            first column left-to-right if there are multiple
            columns.

        tokenizer : Callable[str, List[str]], default=str.split
            Takes as input a string, outputs a list of tokens.
        """
        self.tokenizer = tokenizer
        self.spellchecker = SymSpell(max_dictionary_edit_distance=2,
                                     prefix_length=7)
        for fp in vocab_files_path:
            assert self.spellchecker.load_dictionary(fp, 0, 1)


#         # https://stackoverflow.com/questions/1528932/how-to-create-inline-objects-with-properties
#         self.spellchecker.update({
#             'dummy': type('_', (object,), dict(lookup=lambda w, *_, **__: w))()
#         })

    def _correct(self, token: str) -> str:
        """
        Parameters
        ----------
        token : str
            Input string

        Returns
        -------
        corrected : str
            corrected token
        """
        # https://github.com/mammothb/symspellpy/issues/7
        o = self.spellchecker.lookup(
            token,
            verbosity=Verbosity.TOP,
            max_edit_distance=2,
            ignore_token=r'\w{,2}',  # ignore tokens of size 2 or less
            transfer_casing=True)
        if not o:
            return token

        word = o[0].term
        if token[0].isupper():
            word = word[0].upper() + word[1:]

        # find start punctuation
        start_idx = 0
        start_punct = ''
        while token[start_idx] in string.punctuation:
            start_punct += token[start_idx]
            if start_idx + 1 < len(token):
                start_idx += 1
            else:
                break

        # find end punctuation
        end_idx = 1
        end_punct = ''
        while token[-end_idx] in string.punctuation:
            end_punct += token[-end_idx]
            if end_idx - 1 > 0:
                end_idx -= 1
            else:
                break

        return start_punct + word + end_punct

    def __call__(self, doc: str) -> str:
        """
        Parameters
        ----------
        doc : str
            Input string

        Returns
        -------
        doc : str
            spell checked string
        """
        return " ".join([
            self._correct(w) if w.isalpha() else w for w in self.tokenizer(doc)
        ])
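
# A minimal usage sketch (the vocabulary file path is hypothetical; it must contain one token per line):
checker = SpellChecker(vocab_files_path=['vocab_en.txt'])
print(checker("Thiss sentense has a coupple of speling errors"))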
Example #12
import nltk
from symspellpy import SymSpell, Verbosity
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sym_spell = SymSpell()
sym_spell.load_dictionary('disease_dict', 0, 1, separator="$")
def recognize(query_text):
  sent = nltk.word_tokenize(query_text)
  pos_tag = nltk.pos_tag(sent)
  disease_pattern = r"""
                DISEASE: {(<NN.*><POS>){0,1}<JJ>*<NN.*>+(<IN><DT><NN.*>+){0,1}}                           
              """
  cp = nltk.RegexpParser(disease_pattern)
  ret = []
  for chunk in cp.parse(pos_tag).subtrees():
    if chunk.label() == 'DISEASE':
      buf = ""
      for word in chunk:
        if word[1] != 'POS':
          buf += ' '
        buf += word[0]
      suggestions = sym_spell.lookup(buf.strip().lower(), Verbosity.CLOSEST,
                               max_edit_distance=2)
      ret.extend([x.term for x in suggestions])
  return ret
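
# A hedged usage sketch (the query string is made up; results depend on the contents of 'disease_dict'):
print(recognize("patient complains about chronic migrane and high blood presure"))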
Example #13
def export():
    import os
    import torch
    import zipfile
    import torchaudio
    from glob import glob

    device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
    model, decoder, utils = torch.hub.load('snakers4/silero-models',
                                        model='silero_stt',
                                        language='en')
    (read_batch, split_into_batches,
    read_audio, prepare_model_input) = utils  # see function signature for details
    
    
    os.system("ffmpeg -i 'video.mp4' -vn -acodec copy audio.aac")
    os.system("ffmpeg -i audio.aac audio.wav")


    # download a single file, any format compatible with TorchAudio (soundfile backend)
    # torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',
    #                                dst ='speech_orig.wav', progress=True)
    test_files = glob('audio.wav') 
    batches = split_into_batches(test_files, batch_size=10)
    input = prepare_model_input(read_batch(batches[0]))

    text = ""
    output = model(input)
    for example in output:
        pred = decoder(example.cpu())
        text = text + pred
        
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt")
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt")



    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    # input_term = ("whereis th elove hehad dated forImuch of thepast who "
    #              "couqdn'tread in sixtgrade and ins pired him")
    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print(suggestion)
        
        
    text = suggestions[0].term  # keep only the corrected text (str(suggestion) would also append distance and count)


    cnt = 0
    textlines = []
    while cnt < len(text.split(" ")):
        print(text.split(" ")[cnt:cnt+5])
        line = "\n" + " ".join(text.split(" ")[cnt:cnt+5])
        textlines.append(line)
        cnt += 5
        
        
    f = open("script_cleaned.txt", "a")
    f.writelines(textlines)
    f.close()


    os.system("python -m aeneas.tools.execute_task \
        audio.wav \
        script_cleaned.txt \
        'task_language=eng|os_task_file_format=srt|is_text_type=plain' \
        subtitles.srt")



    with open("subtitles.srt") as f:
        srt = f.read()
        
    return Response(
        srt,
        mimetype="text/srt",
        headers={
            "Content-disposition": "attachment; filename=subtitiles.srt"
        }
    )
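
# export() builds a Flask-style Response, so a hedged sketch of exposing it as a route could be
# (the Flask app object and the route path are assumptions, not part of the original snippet):
from flask import Flask
app = Flask(__name__)
app.add_url_rule("/export", view_func=export, methods=["GET"])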
Example #14
def test3():
    # from autocorrect import Speller
    # doc = docx.Document("Word docs_Peace/1_CTS_119_eng_text.docx")
    # result = [p.text for p in doc.paragraphs]
    #
    # spell = Speller(lang='en')
    #
    # for j in range(15):
    #     print(spell(result[j]))

    # import jamspell
    #
    # corrector = jamspell.TSpellCorrector()
    # corrector.LoadLangModel('en.bin')
    # text = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding all the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the Imperial Court, or in the Imperial Chamber,\n",
    #
    # text = corrector.FixFragment(text)
    # print(text)
    sys.path.append("treatyUtil")
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    from treatyUtil import spellcheck_keep_punctuation

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    input_term1 = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII.\
    According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred \
    Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir \
    Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the \
    account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either \
    Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, \
    or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, \
    both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding \
    all the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions \
    ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with \
    the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor \
    to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the\
    Imperial Court, or in the Imperial Chamber,\n"

    #input_term = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited "

    input_term = "God, and Safety of the Chriſtian World (the Electors,\nPrinces and States of the Sacred Roman Empire \
    being\npreſent, approving and conſenting) the Articles of Peace\nand Anity, whereof the Tenour follows.\n1. That \
    there be a Chriſtian, univerſal\nThe Re-efta. and perpetual Peace, and a true and ſincere\nbliſhment of Friendſhip and \
    Amity between his Sacred\nPeace and A. Imperial Majeſty, the Houſe of Austria,\nmity.\nand all his Allies and Adherents, \
    and the\nHeirs and Succeffors of each of them, chiefly the King\nof Spain, and the Electors, Princes and States of the En-\npire,\
    of the one ſide, and her Sacred Royal Majeſty,\nand the Kingdom of Sweden, her Allies and Adherents,\nand the Heirs and Succeſſors\
    of each of them, eſpecially\nthe moſt Chriſtian King, the reſpective Electors, Princes\nand States of the Empire, of the other ſide ; \
    and that this\nPeace be obſerv'd and cultivated ſincerely and ſeriouſly,\nſo that each Party may procure the Benefit, Honour and\nAdvantage \
    of one another, and thereby the Fruits of this\nPeace and Amity may be ſeen to grow up and fouriſh a-\nnew, by a ſure and reciprocal \
    maintaining of a good\nand faithful Neighbourhood between the Roman Empire\nand the Kingdom of Sweden reciprocally,\nII. That there be \
    on both ſides à perpe-\nAn Amneſty\ntua) Oblivion and Amneſty of all that has\nfrom all Hoffi- been done Since the beginning of theſe\nlity.\nTroubles, \
    in what Place or in what Man-\n"

    input_term2 = "God, and Safety of the Chriſtian World (the Electors,\nPrinces"
    input_term = re.sub("\n", " ", input_term)
    input_term = re.sub("- ", "", input_term)
    #input_term = re.sub("-", "", input_term)
    input_term = re.sub("ſ", "s", input_term)

    # word_split = re.compile(r"[^\W]+", re.U)
    # suggestions = sym_spell.lookup_compound((input_term), ignore_non_words=True, max_edit_distance=2)
    # for suggestion in suggestions:
    #    print(suggestion)
    #
    # corrected = suggestions[0].term
    # # This combined with split_phrase_by_space=True would be enough just to spell check
    # # but punctuation is lost.
    #
    # # The spell check is already done in 'corrected'. Now we just want to keep the punctuation.
    # in_list = word_split.findall(input_term)
    # chk_list = word_split.findall(corrected)
    # print(input_term)
    # print(corrected)
    # print(in_list)
    # print(chk_list)
    # pdb.set_trace()
    #
    # # To keep punctuation we take the original phrase and do word by word replacement
    # out_term = ""
    # outs  = input_term.split()
    # word_count = 0
    # for word in in_list:
    #     print(out_term)
    #     print(outs[word_count].lower(), word, chk_list[word_count])
    #     temp = outs[word_count].lower().replace(word, chk_list[word_count])
    #     word_count += 1
    #     out_term += temp+" "
    #
    # print(out_term)
    # return

    # max edit distance per lookup (per single word, not per whole input string)
    #pdb.set_trace()
    #print(spellcheck_keep_punctuation(input_term))
    suggestions = sym_spell.lookup_compound((input_term),
                                            transfer_casing=True,
                                            ignore_non_words=True,
                                            max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    #print(suggestions)
    for suggestion in suggestions:
        print(suggestion)
Example #15
class Trainer(Engine):
    r"""Wrapper class to train a model.

    See :class:`laia.engine.Engine` for more information.

    Args:
        model: model to train.
        criterion: used criterion to train the model.
        optimizer: optimizer object that will update the parameters of the model.
        data_loader: iterable object from which batches are read.
        batch_input_fn: function used to extract the input
            for the model (e.g. a ``torch.Tensor``), from the batch loaded by
            the ``data_loader``. If ``None``, the batch is fed as-is to the
            model.
        batch_target_fn: if given, this callable object
            is used to extract the targets from the batch, which are
            passed to the `ITER_START` and `ITER_END` hooks.
        batch_id_fn: if given, this callable object is
            used to extract the batch ids to be used in a possible exception.
        progress_bar: if ``True``, :mod:`tqdm` will be
            used to show a progress bar for each epoch. If a string is given,
            the content of the string will be shown before the progress bar.
        iterations_per_update: Number of successive mini-batch
            parameter gradients to accumulate before updating the parameters.
        cv_number: identifier of the cross-validation fold currently being run (used for display)
        use_baseline: Whether to perform the baseline (No CL nor SSL)
        use_cl: Whether to use curriculum learning (CL)
        use_transfer: Whether to use transfer learning (TL)
        use_semi_supervised: Whether to use semi-supervised learning (SSL)
        threshold_score_semi_supervised: Threshold on the rank of the samples for SSL
        data_semi_supervised_loader: unlabelled dataset for SSL
        epoch_frequency_semi_supervision: frequency (in epochs) at which the SSL dataset B (the unlabelled one) is updated, cf. report
        syms: token-text table
        original_data_loader: Original dataset (dataset A) for SSL
    """
    def __init__(
        self,
        model,  # type: torch.nn.Module
        criterion,  # type: Optional[Callable]
        optimizer,  # type: torch.optim.Optimizer
        data_loader=None,  # type: Optional[Iterable]
        batch_input_fn=None,  # type: Optional[Callable]
        batch_target_fn=None,  # type: Optional[Callable]
        batch_id_fn=None,  # type: Optional[Callable]
        progress_bar=None,  # type: Optional[Union[bool, str]]
        iterations_per_update=1,  # type: int
        cv_number=None,
        use_baseline=None,
        use_cl=None,
        use_transfer=None,
        use_semi_supervised=None,
        threshold_score_semi_supervised=None,
        data_semi_supervised_loader=None,
        epoch_frequency_semi_supervision=None,
        syms=None,
        original_data_loader=None,
    ):
        # type: (...) -> None
        super(Trainer, self).__init__(model=model,
                                      data_loader=data_loader,
                                      batch_input_fn=batch_input_fn,
                                      batch_target_fn=batch_target_fn,
                                      batch_id_fn=batch_id_fn,
                                      progress_bar=progress_bar,
                                      use_baseline=use_baseline,
                                      use_cl=use_cl,
                                      use_transfer=use_transfer)
        self._criterion = criterion
        self._optimizer = optimizer
        self._iterations_per_update = iterations_per_update
        self._updates = 0
        self._cv_number = cv_number
        self._progress_bar = progress_bar

        self.data_loader = data_loader

        self.use_semi_supervised = use_semi_supervised
        self.threshold_score_semi_supervised = threshold_score_semi_supervised
        self.data_semi_supervised_loader = data_semi_supervised_loader
        self.epoch_frequency_semi_supervision = epoch_frequency_semi_supervision
        self.counter_epoch_semi_supervision = 0
        self.semi_supervision_started = False
        self.original_dataset = {
            'ids': data_loader.dataset._ids,
            'imgs': data_loader.dataset._imgs,
            'txts': data_loader.dataset._txts
        }
        self.decoder = CTCGreedyDecoder()
        self.syms = syms
        self.original_data_loader = original_data_loader

        # Load Spell Checker
        self.sym_spell = SymSpell(max_dictionary_edit_distance=5,
                                  prefix_length=7)
        dict_name = 'de_50k.txt'  #"frequency_dictionary_en_82_765.txt"
        if not self.sym_spell.load_dictionary(
                dict_name, term_index=0, count_index=1, encoding='utf-8-sig'):
            print("error loading spell checker")

    @property
    def criterion(self):
        return self._criterion

    @criterion.setter
    def criterion(self, criterion):
        assert callable(criterion)
        self._criterion = criterion

    @property
    def optimizer(self):
        return self._optimizer

    def updates(self):
        return self._updates

    @property
    def logger(self):
        return _logger

    @property
    def iterations_per_update(self):
        return self._iterations_per_update

    @iterations_per_update.setter
    def iterations_per_update(self, num):
        if num is None:
            self._iterations_per_update = 1
        else:
            assert isinstance(num, int)
            assert num > 0
            self._iterations_per_update = num

    @action
    def start_semi_supervision(self):
        self.semi_supervision_started = True

    def score_semi_supervision(self, batch, mode='entropy'):
        """Compute the rank scores for SSL for the given batch depending on the mode: 'entropy' or 'diff' for 'diff-proba' metric"""
        batch_input, batch_target = self._prepare_input_and_target(batch)
        batch_ids = self.batch_id_fn(batch) if self.batch_id_fn else None

        # Batch timestep probabilities
        batch_output = self._model(batch_input)
        # Batch prediction that will be the label for the sample selected
        batch_decode = self.decoder(batch_output)

        # Compute the score of each sample, averaged over timesteps: the difference between the
        # top-2 probabilities ('diff') or a negative-entropy term ('entropy')
        x, xs = transform_output(batch_output)
        x = F.softmax(x, dim=2)
        xs = xs.numpy()

        if mode == 'diff':
            best_probas, _ = x.topk(dim=2, k=2)
            scores = best_probas[:, :, 0] - best_probas[:, :, 1]

        elif mode == 'entropy':
            sorted_probas, _ = x.topk(dim=2, k=x.shape[2])
            best_probas_entropy = (sorted_probas[:, :, 0] *
                                   torch.log(sorted_probas[:, :, 0]))[:, :, None]
            all_probas_but_best_entropy = ((1 - sorted_probas[:, :, 1:]) *
                                           torch.log(1 - sorted_probas[:, :, 1:]))
            # scores = minus an entropy-like term (higher score = more confident prediction)
            scores = torch.cat(
                [best_probas_entropy, all_probas_but_best_entropy],
                dim=2).sum(dim=2)

        sizes = np.arange(scores.shape[0]) < xs[..., None]
        sizes = torch.tensor(sizes.T).cuda()
        semi_supervised_score = scores.sum(dim=0) / (sizes).float().sum(dim=0)
        semi_supervised_score = semi_supervised_score.cpu().detach().numpy()

        return semi_supervised_score, batch_ids, batch_decode, batch_target

    @action
    def compute_semi_supervision(self, epoch):
        if self.use_semi_supervised and self.semi_supervision_started:

            # Frequency of update
            if self.counter_epoch_semi_supervision == self.epoch_frequency_semi_supervision:
                self.counter_epoch_semi_supervision = 0

                # Get the dataset B cf report
                new_ids, new_imgs, new_txts = self.compute_score_semi_supervision(
                    epoch)

                # Fraction of the original dataset
                # Adjust this depending on your use case
                nb_samples = 0  #len(self.original_dataset['ids'])//2
                idx = np.random.randint(0, len(self.original_dataset['ids']),
                                        nb_samples)
                ori_ids = [self.original_dataset['ids'][i] for i in idx]
                ori_imgs = [self.original_dataset['imgs'][i] for i in idx]
                ori_txts = [self.original_dataset['txts'][i] for i in idx]

                self.data_loader.dataset._ids = ori_ids + new_ids
                self.data_loader.dataset._imgs = ori_imgs + new_imgs
                self.data_loader.dataset._txts = ori_txts + new_txts

            self.counter_epoch_semi_supervision += 1

    def compute_score_semi_supervision(self, epoch):
        # Choose mode
        mode = 'entropy'
        # Batch iterator
        if self._progress_bar:
            batch_iterator = tqdm(
                self.data_semi_supervised_loader,
                desc=self._progress_bar
                if isinstance(self._progress_bar, string_classes) else None,
            )
        else:
            batch_iterator = self.data_semi_supervised_loader

        # Compute metric on labelled dataset A
        original_scores = []
        for it, batch in enumerate(self.original_data_loader, 1):
            semi_supervised_score, _, _, _ = self.score_semi_supervision(
                batch, mode)
            original_scores.append(semi_supervised_score)

        original_scores = np.concatenate(original_scores)
        print('Median Percentiles',
              np.percentile(original_scores, [25, 50, 75, 90, 95, 99]))
        # Median score on the dataset A
        score_threshold = np.percentile(original_scores, 50)

        # If the user has indicated a threshold on the rank score, we use it
        # If not we take the median of the score on the dataset A
        if self.threshold_score_semi_supervised != 0.0:
            score_threshold = self.threshold_score_semi_supervised
        print(score_threshold, "\n")

        # To store the samples to be added to the training set
        new_ids = []
        new_txts = []
        targets = []
        for it, batch in enumerate(batch_iterator, 1):
            semi_supervised_score, batch_ids, batch_decode, batch_target = self.score_semi_supervision(
                batch, mode)

            print(
                'Median Percentiles',
                np.percentile(semi_supervised_score, [25, 50, 75, 90, 95, 99]))

            # Compute the ids of samples to be added to the training set
            idx = np.argwhere(
                semi_supervised_score > score_threshold).reshape(-1)
            # Add ids and new labels
            batch_ids = np.array(batch_ids)
            new_ids += batch_ids[idx].reshape(-1).tolist()
            new_txts += [[str(self.syms[val]) for val in batch_decode[i]]
                         for i in idx]
            targets += [[str(self.syms[val]) for val in batch_target[i]]
                        for i in idx]

        # Compute the filenames of the new labelled samples
        unlabelled_imgs = self.data_semi_supervised_loader.dataset._imgs
        file_format = unlabelled_imgs[0].split('.')[-1]
        head, tail = os.path.split(unlabelled_imgs[0])
        new_imgs = [os.path.join(head, i + "." + file_format) for i in new_ids]

        # Correction with Spell Checker
        corrected_new_txts = []
        for txt in new_txts:
            txt = ''.join(txt).split('@')
            correction = []
            for word in txt:
                suggestion = self.sym_spell.lookup(word,
                                                   Verbosity.CLOSEST,
                                                   max_edit_distance=5,
                                                   include_unknown=True)[0]
                correction.append(suggestion.term.split(',')[0])
            corrected_new_txts.append(list('@'.join(correction)))

        # print a few corrections
        # for txt, corr, target  in zip(new_txts[:50],corrected_new_txts[:50],targets[:50]):
        #     print('\n')
        #     print('output',''.join(txt).split('@'))
        #     print('corrected',''.join(corr).split('@'))
        #     print('target',''.join(target).split('@'))

        return new_ids, new_imgs, corrected_new_txts

    def add_evaluator(self, evaluator, when=EPOCH_END, condition=None):
        r"""Add an evaluator to run at the end of each epoch."""
        if evaluator is not None:
            self.add_hook(
                when,
                Hook(condition, evaluator.run)
                if condition is not None else evaluator.run,
            )
        return self

    @action
    def run(self):
        r"""Run training """
        assert callable(
            self._batch_input_fn
        ), "batch_input_fn (type: {!r}) is not callable".format(
            str(self._batch_input_fn))
        assert callable(
            self._batch_target_fn
        ), "batch_target_fn (type: {!r}) is not callable".format(
            str(self._batch_target_fn))
        while not self._must_stop:
            if self._cv_number != '':
                print("Cross_validation", self._cv_number)
            self._run_epoch()
        return self

    def _run_iteration(self, batch_n, batch, train_iterations=None):
        batch_input, batch_target = self._prepare_input_and_target(batch)

        action_kwargs = {
            "batch": batch,
            "batch_num": batch_n,
            "epoch": self._epochs,
            "iteration": self._iterations,
            "batch_input": batch_input,
            "batch_target": batch_target,
        }
        self._call_hooks(ITER_START, **action_kwargs)

        if self._must_stop:
            return

        # Make all parameter gradients equal to zero.
        # Note: IT % NIPU = the iteration after a step()
        if self._iterations % self.iterations_per_update == 0:
            self._optimizer.zero_grad()

        # Put model in training mode
        if hasattr(self._model, "train"):
            self._model.train()

        # Run model
        with self.exception_catcher(batch):
            batch_output = self._model(batch_input)

        # Note: These checks are only active when logging level <= DEBUG
        check_inf(
            tensor=batch_output,
            logger=__name__,
            msg="Found {abs_num} ({rel_num:.2%}) INF values in the "
            "model output at epoch {epoch}, batch {batch} (absolute "
            "iteration {iteration})",
            epoch=self._epochs,
            batch=self.batch_id_fn(batch) if self.batch_id_fn else batch,
            iteration=self._iterations,
        )
        check_nan(
            tensor=batch_output,
            logger=__name__,
            msg="Found {abs_num} ({rel_num:.2%}) NAN values in the "
            "model output at epoch {epoch}, batch {batch} (absolute "
            "iteration {iteration})",
            epoch=self._epochs,
            batch=self.batch_id_fn(batch) if self.batch_id_fn else batch,
            iteration=self._iterations,
        )

        batch_loss = self.compute_loss(batch, batch_output, batch_target)
        if batch_loss is None:
            return

        # Make the loss and gradients w.r.t. output independent of the number
        # of accumulated iterations.
        if self.iterations_per_update > 1:
            batch_loss /= self.iterations_per_update

        # Compute gradients w.r.t. parameters
        self.logger.debug(
            "Start backward at epoch {}, batch {} (absolute iteration {})",
            self._epochs,
            batch_n,
            self._iterations,
        )
        with self.exception_catcher(batch):
            batch_loss.backward()

        self._iterations += 1

        # Update model parameters.
        if self._iterations % self.iterations_per_update == 0:
            self._updates += 1
            self.logger.debug(
                "Updating parameters at epoch {}, batch {} (absolute iteration {})",
                self._epochs,
                batch_n,
                self._iterations,
            )
            self._optimizer.step()

        action_kwargs["train_iterations"] = self._iterations
        action_kwargs["batch_output"] = batch_output
        action_kwargs["batch_loss"] = batch_loss.item()
        action_kwargs["batch_id"] = self.batch_id_fn(
            batch) if self.batch_id_fn else None

        self._call_hooks(ITER_END, **action_kwargs)

    def compute_loss(self, batch, batch_output, batch_target):
        with self.exception_catcher(batch):
            kwargs = {}
            if isinstance(self._criterion, Loss) and self.batch_id_fn:
                kwargs = {"batch_ids": self.batch_id_fn(batch)}
            loss = self._criterion(batch_output, batch_target, **kwargs)
            if loss is not None:
                if torch.sum(torch.isnan(loss)).item() > 0:
                    raise ValueError("The loss is NaN")
                if torch.sum(torch.isinf(loss)).item() > 0:
                    raise ValueError("The loss is +/-Inf")
            return loss

    def state_dict(self):
        state = super(Trainer, self).state_dict()
        state["optimizer"] = self._optimizer.state_dict()
        state["updates"] = self._updates
        return state

    def load_state_dict(self, state):
        super(Trainer, self).load_state_dict(state)
        self._optimizer.load_state_dict(state["optimizer"])
        self._updates = state["updates"]
Example #16
class MaskTextSpotter(object):
    def __init__(self,
                 cfg,
                 confidence_threshold=0.7,
                 min_image_size=224,
                 output_polygon=True,
                 spellfix=True):
        self.cfg = cfg.clone()
        self.model = build_detection_model(cfg)
        self.model.eval()
        self.device = torch.device(cfg.MODEL.DEVICE)
        self.model.to(self.device)
        self.min_image_size = min_image_size

        self.spellfix = spellfix

        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")

        bigram_dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

        self.sym_spell.load_dictionary(dictionary_path,
                                       term_index=0,
                                       count_index=1)

        self.sym_spell.load_bigram_dictionary(bigram_dictionary_path,
                                              term_index=0,
                                              count_index=2)

        checkpointer = DetectronCheckpointer(cfg, self.model)
        if len(cfg.MODEL.WEIGHT):
            import logging
            logging.info('loading MaskTextSpotter from %s' % cfg.MODEL.WEIGHT)
            _ = checkpointer.load(cfg.MODEL.WEIGHT)

        self.transforms = self.build_transform()
        self.cpu_device = torch.device("cpu")
        self.confidence_threshold = confidence_threshold
        self.output_polygon = output_polygon

    def build_transform(self):
        """
        Creates a basic transformation that was used to train the models
        """
        cfg = self.cfg
        # we are loading images with OpenCV, so we don't need to convert them
        # to BGR, they are already! So all we need to do is to normalize
        # by 255 if we want to convert to BGR255 format, or flip the channels
        # if we want it to be in RGB in [0-1] range.
        if cfg.INPUT.TO_BGR255:
            to_bgr_transform = T.Lambda(lambda x: x * 255)
        else:
            to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]])

        normalize_transform = T.Normalize(mean=cfg.INPUT.PIXEL_MEAN,
                                          std=cfg.INPUT.PIXEL_STD)

        transform = T.Compose([
            T.ToPILImage(),
            T.Resize(self.min_image_size),
            T.ToTensor(),
            to_bgr_transform,
            normalize_transform,
        ])
        return transform

    def run_on_opencv_image(self, image):
        """
        Arguments:
            image (np.ndarray): an image as returned by OpenCV
        Returns:
            result_polygons (list): detection results
            result_words (list): recognition results
        """
        result_polygons, result_words, result_dict = self.compute_prediction(
            image)
        return result_polygons, result_words, result_dict

    def run_on_pillow_image(self, image):
        arr = np.array(image, dtype=np.uint8)
        result_polygons, result_words, result_dict = self.run_on_opencv_image(
            arr)
        return result_polygons, result_words, result_dict

    def compute_prediction(self, original_image):
        def spell_fix(wd):
            if self.spellfix:
                new_word = [
                    s.term for s in self.sym_spell.lookup(wd,
                                                          Verbosity.CLOSEST,
                                                          max_edit_distance=2,
                                                          include_unknown=True)
                ][0]
            else:
                new_word = wd
            return new_word

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i + n]

        def mk_direction(char_polygons):
            def centroid(char_polygon):
                centroid = Polygon(list(chunks(char_polygon,
                                               2))).centroid.coords
                return list(centroid)[0]

            first, last = char_polygons[0], char_polygons[-1]
            start, end = centroid(first), centroid(last)
            if start[0] == end[0]:
                end = (end[0] + 1, end[1])
            return start, end

        def line_detection(dicts, char_ratio=1.5):
            # box  [x1, y1, x2, y2]
            sorted_res = sorted(dicts, key=lambda d: d["box"][0])
            lines = dict()

            def point_in_next_word(word):
                width = word["box"][2] - word["box"][0]  # width = x2 - x1
                avg_char_width = width / float(len(word["seq_word"]))
                last_right_border = word["box"][2]
                next_word_pos_x = last_right_border + char_ratio * avg_char_width
                next_word_pos_y = word["box"][1]
                direction = word["direction"]
                point = Point(next_word_pos_x, next_word_pos_y)
                line = LineString(direction)
                x = np.array(point.coords[0])
                u = np.array(line.coords[0])
                v = np.array(line.coords[len(line.coords) - 1])
                n = v - u
                n /= np.linalg.norm(n, 2)
                P = u + n * np.dot(x - u, n)
                return (int(P[0]), int(P[1]))

            def distance_to_mid(word_point, word_box):
                point = Point(word_point["next_point"])
                box = word_box["box"]
                return abs(point.y -
                           (box[1] + box[3]) / 2.0)  # abs( y - (y2+y1)/2 )

            def find_next_word(word, index, sorted_words):
                next_point = Point(word["next_point"])
                next_words = [
                    other for other in sorted_words[index + 1:] if Polygon(
                        chunks(other["polygon"], 2)).contains(next_point)
                ]
                if next_words:
                    return min(next_words,
                               key=lambda x: distance_to_mid(word, x))
                else:
                    return None

            def find_previous_word(prev, word):
                if "previous_word" not in word.keys():
                    return prev
                else:
                    return min(prev,
                               word["previous_word"],
                               key=lambda x: distance_to_mid(x, word))

            for w in sorted_res:
                w["next_point"] = point_in_next_word(w)

            for i, w in enumerate(sorted_res):
                next_word = find_next_word(w, i, sorted_res)
                w["next_word"] = None
                if next_word:
                    better_previous = find_previous_word(w, next_word)
                    if better_previous == w:
                        w["next_word"] = next_word
                        if "previous_word" in next_word.keys():
                            next_word["previous_word"]["next_word"] = None
                        next_word["previous_word"] = w

            for w in sorted_res:
                if "previous_word" not in w.keys():
                    a = w
                    key_y = a["box"][1]
                    while key_y in lines.keys():
                        key_y = key_y + 1
                    lines[key_y] = [a]
                    while a["next_word"]:
                        a = a["next_word"]
                        lines[key_y].append(a)

            sorted_lines = sorted(lines.items(), key=lambda x: x[0])
            return ",".join([
                " ".join([w["seq_word"] for w in line])
                for _, line in sorted_lines
            ]), sorted_lines

        # apply pre-processing to image
        import datetime, time
        start_time = time.time()
        # print('transform', datetime.datetime.now())
        image = self.transforms(original_image)
        # convert to an ImageList, padded so that it is divisible by
        # cfg.DATALOADER.SIZE_DIVISIBILITY
        # print('to image list', datetime.datetime.now())
        image_list = to_image_list(image,
                                   self.cfg.DATALOADER.SIZE_DIVISIBILITY)
        image_list = image_list.to(self.device)
        # compute predictions
        with torch.no_grad():
            # print('predict', datetime.datetime.now())
            self.model.eval()
            predictions, _, _ = self.model(image_list)
            if not predictions or len(predictions) < 1:
                # print('no text detected')
                return [], [], {'label': '', 'details': []}
        # print('post process', datetime.datetime.now())
        global_predictions = predictions[0]
        char_predictions = predictions[1]
        char_mask = char_predictions['char_mask']
        char_boxes = char_predictions['boxes']
        words, rec_scores, rec_char_scores, char_polygons = self.process_char_mask(
            char_mask, char_boxes)
        detailed_seq_scores = char_predictions['detailed_seq_scores']
        seq_words = char_predictions['seq_outputs']
        seq_scores = char_predictions['seq_scores']
        global_predictions = [
            o.to(self.cpu_device) for o in global_predictions
        ]

        # always single image is passed at a time
        global_prediction = global_predictions[0]

        # reshape prediction (a BoxList) into the original image size
        height, width = original_image.shape[:-1]
        test_image_width, test_image_height = global_prediction.size
        global_prediction = global_prediction.resize((width, height))
        resize_ratio = float(height) / test_image_height
        boxes = global_prediction.bbox.tolist()
        scores = global_prediction.get_field("scores").tolist()
        masks = global_prediction.get_field("mask").cpu().numpy()

        result_polygons = []
        result_words = []
        result_dicts = []

        for k, box in enumerate(boxes):
            score = scores[k]
            if score < self.confidence_threshold:
                continue
            box = list(map(int, box))
            mask = masks[k, 0, :, :]
            polygon = self.mask2polygon(mask,
                                        box,
                                        original_image.shape,
                                        threshold=0.5,
                                        output_polygon=self.output_polygon)

            if polygon is None:
                polygon = [
                    box[0], box[1], box[2], box[1], box[2], box[3], box[0],
                    box[3]
                ]
            result_polygons.append(polygon)
            word = words[k]
            rec_score = rec_scores[k]
            char_score = rec_char_scores[k]
            seq_word = seq_words[k]
            seq_char_scores = seq_scores[k]
            seq_score = sum(seq_char_scores) / float(len(seq_char_scores))
            # spell_fix = lambda word: \
            #     [s.term for s in sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)][
            #         0]
            detailed_seq_score = detailed_seq_scores[k]
            detailed_seq_score = np.squeeze(np.array(detailed_seq_score),
                                            axis=1)
            # if 'total_text' in output_folder or 'cute80' in output_folder:
            #     result_log = [int(x * 1.0) for x in box[:4]] + polygon + [word] + [seq_word] + [score] + [rec_score] + [
            #         seq_score] + [char_score] + [detailed_seq_score] + [len(polygon)]
            # else:
            result_log = [int(x * 1.0) for x in box[:4]] + polygon + [word] + [
                seq_word
            ] + [score] + [rec_score] + [seq_score] + [char_score] + [
                detailed_seq_score
            ]
            # result_logs.append(result_log)
            if len(seq_word) > 0 and len(char_polygons[k]) > 0:
                d = {
                    "seq_word":
                    seq_word if len(seq_word) < 4 else spell_fix(seq_word),
                    "seq_word_orig":
                    seq_word,
                    "direction":
                    mk_direction([[int(c * resize_ratio) for c in p]
                                  for p in char_polygons[k]]),
                    "word":
                    word if len(word) < 4 else spell_fix(word),
                    "word_orig":
                    word,
                    "box": [int(x * 1.0) for x in box[:4]],
                    "polygon":
                    polygon,
                    "prob":
                    score * seq_score
                }
                result_words.append(d['seq_word'])
                result_dicts.append(d)

        # default_logger.debug('done', datetime.datetime.now())
        label, details = line_detection(result_dicts)
        end_time = time.time()
        # default_logger.debug('cost time: %s' % (end_time - start_time))
        line_result = {'label': label, 'details': details}
        # line_result_words = []
        # line_result_polygons = []
        # for ocr_detail in line_result['details']:
        #     pass
        # line_result_words = [a[1][0]['seq_word'] for a in line_result['details']]
        # line_result_polygons = [a[1][0]['polygon'] for a in line_result['details']]
        line_result_words = [a['seq_word'] for a in result_dicts]
        line_result_polygons = [a['polygon'] for a in result_dicts]
        # return result_polygons, result_words, line_result
        return line_result_polygons, line_result_words, line_result

    # def process_char_mask(self, char_masks, boxes, threshold=192):
    #     texts, rec_scores = [], []
    #     for index in range(char_masks.shape[0]):
    #         box = list(boxes[index])
    #         box = list(map(int, box))
    #         text, rec_score, _, _ = getstr_grid(char_masks[index, :, :, :].copy(), box, threshold=threshold)
    #         texts.append(text)
    #         rec_scores.append(rec_score)
    #     return texts, rec_scores

    def process_char_mask(self, char_masks, boxes, threshold=192):
        texts, rec_scores, rec_char_scores, char_polygons = [], [], [], []
        for index in range(char_masks.shape[0]):
            box = list(boxes[index])
            box = list(map(int, box))
            text, rec_score, rec_char_score, char_polygon = getstr_grid(
                char_masks[index, :, :, :].copy(), box, threshold=threshold)
            texts.append(text)
            rec_scores.append(rec_score)
            rec_char_scores.append(rec_char_score)
            char_polygons.append(char_polygon)
            # segmss.append(segms)
        return texts, rec_scores, rec_char_scores, char_polygons

    def mask2polygon(self,
                     mask,
                     box,
                     im_size,
                     threshold=0.5,
                     output_polygon=True):
        # mask 32*128
        image_width, image_height = im_size[1], im_size[0]
        box_h = box[3] - box[1]
        box_w = box[2] - box[0]
        cls_polys = (mask * 255).astype(np.uint8)
        poly_map = np.array(Image.fromarray(cls_polys).resize((box_w, box_h)))
        poly_map = poly_map.astype(np.float32) / 255
        poly_map = cv2.GaussianBlur(poly_map, (3, 3), sigmaX=3)
        ret, poly_map = cv2.threshold(poly_map, 0.5, 1, cv2.THRESH_BINARY)
        if output_polygon:
            SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            poly_map = cv2.erode(poly_map, SE1)
            poly_map = cv2.dilate(poly_map, SE1)
            poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1)
            try:
                # OpenCV 3.x returns (image, contours, hierarchy)
                _, contours, _ = cv2.findContours(
                    (poly_map * 255).astype(np.uint8), cv2.RETR_LIST,
                    cv2.CHAIN_APPROX_NONE)
            except ValueError:
                # OpenCV 4.x returns (contours, hierarchy)
                contours, _ = cv2.findContours(
                    (poly_map * 255).astype(np.uint8), cv2.RETR_LIST,
                    cv2.CHAIN_APPROX_NONE)
            if len(contours) == 0:
                print(contours)
                print(len(contours))
                return None
            max_area = 0
            max_cnt = contours[0]
            for cnt in contours:
                area = cv2.contourArea(cnt)
                if area > max_area:
                    max_area = area
                    max_cnt = cnt
            perimeter = cv2.arcLength(max_cnt, True)
            epsilon = 0.01 * perimeter
            approx = cv2.approxPolyDP(max_cnt, epsilon, True)
            pts = approx.reshape((-1, 2))
            pts[:, 0] = pts[:, 0] + box[0]
            pts[:, 1] = pts[:, 1] + box[1]
            polygon = list(pts.reshape((-1, )))
            polygon = list(map(int, polygon))
            if len(polygon) < 6:
                return None
        else:
            SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            poly_map = cv2.erode(poly_map, SE1)
            poly_map = cv2.dilate(poly_map, SE1)
            poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1)
            idy, idx = np.where(poly_map == 1)
            xy = np.vstack((idx, idy))
            xy = np.transpose(xy)
            hull = cv2.convexHull(xy, clockwise=True)
            # reverse order of points.
            if hull is None:
                return None
            hull = hull[::-1]
            # find minimum area bounding box.
            rect = cv2.minAreaRect(hull)
            corners = cv2.boxPoints(rect)
            corners = np.array(corners, dtype="int")
            pts = get_tight_rect(corners, box[0], box[1], image_height,
                                 image_width, 1)
            polygon = [x * 1.0 for x in pts]
            polygon = list(map(int, polygon))
        return polygon

    def visualization(self, img, polygons, words):
        cur_img = copy.deepcopy(img)
        for polygon, word in zip(polygons, words):
            pts = np.array(polygon, np.int32)
            pts = pts.reshape((-1, 1, 2))
            xmin = min(pts[:, 0, 0])
            ymin = min(pts[:, 0, 1])
            r = random.randint(0, 255)
            g = random.randint(0, 255)
            b = random.randint(0, 255)
            cv2.polylines(cur_img, [pts], True, (b, g, r))
            cv2.putText(cur_img, word, (xmin, ymin), cv2.FONT_HERSHEY_TRIPLEX,
                        0.5, (b, g, r), 1)
        return cur_img
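
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes the predictor class above has already been instantiated as
# `predictor` with a valid maskrcnn-benchmark config and trained weights;
# the file names below are illustrative.
if __name__ == "__main__":
    import cv2

    img = cv2.imread("sample.jpg")  # BGR image, as expected by run_on_opencv_image
    polygons, words, line_result = predictor.run_on_opencv_image(img)
    print(line_result["label"])     # comma-separated, line-ordered transcription
    vis = predictor.visualization(img, polygons, words)
    cv2.imwrite("sample_vis.jpg", vis)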
Exemple #17
0
class MagicRecognition:
    def __init__(self, file_all_cards: str, file_keywords: str, languages=("English",), max_ratio_diff=0.3, max_ratio_diff_keyword=0.2) -> None:
        """Load dictionaries of cards and keywords

        Parameters
        ----------
        file_all_cards: str
            Path to the file containing all cards. If the file does not exist, it is downloaded from mtgjson.
        file_keywords: str
            Path to the file containing all keywords. If the file does not exist, it is downloaded from mtgjson.
        languages : tuple, optional
            Languages whose card names are loaded into the dictionary, by default ("English",)
        max_ratio_diff : float, optional
            Maximum ratio (distance/length) for a text to be considered as a card name, by default 0.3
        max_ratio_diff_keyword : float, optional
            Maximum ratio (distance/length) for a text to be considered as a (ignored) keyword, by default 0.2
        """
        self.max_ratio_diff = max_ratio_diff
        self.max_ratio_diff_keyword = max_ratio_diff_keyword
        
        if not Path(file_all_cards).is_file():
            def write_card(f, card):
                i = card.find(" //")
                if i != -1:
                    card = card[:i]
                f.write(card + "$1\n")  # required for SymSpell

            all_cards_json = load_json(URL_ALL_CARDS)
            with Path(file_all_cards).open("a") as f:
                for card, l in all_cards_json["data"].items():
                    if "English" in languages:
                        write_card(f, card)
                    for e in l[0]["foreignData"]:
                        if e["language"] in languages:
                            write_card(f, e["name"])

        self.sym_all_cards = SymSpell(max_dictionary_edit_distance=6)
        self.sym_all_cards._distance_algorithm = editdistance.DistanceAlgorithm.LEVENSHTEIN
        self.sym_all_cards.load_dictionary(file_all_cards, 0, 1, separator="$")
        self.all_cards = self.sym_all_cards._words
        print(f"Loaded {file_all_cards}: {len(self.all_cards)} cards")
        self.edit_dist = editdistance.EditDistance(editdistance.DistanceAlgorithm.LEVENSHTEIN)

        if not Path(file_keywords).is_file():
            keywords = load_json(URL_KEYWORDS)
            json.dump(keywords, Path(file_keywords).open("w"))

        def concat_lists(LL):
            res = []
            for L in LL:
                res.extend(L)
            return res

        keywords_json = json.load(Path(file_keywords).open())
        keywords = concat_lists(keywords_json["data"].values())
        keywords.extend(["Display", "Land", "Search", "Profile"])
        self.sym_keywords = SymSpell(max_dictionary_edit_distance=3)
        for k in keywords:
            self.sym_keywords.create_dictionary_entry(k, 1)
        print(f"Loaded {file_keywords}: {len(keywords)} cards")

    def _preprocess(self, text: str) -> str:
        """Remove characters which can't appear on a Magic card (OCR error)"""
        return re.sub("[^a-zA-Z',. ]", '', text).rstrip(' ')

    def _preprocess_texts(self, box_texts: BoxTextList) -> None:
        """Apply `preprocess` on each text"""
        for box_text in box_texts:
            box_text.text = self._preprocess(box_text.text)

    def box_texts_to_cards(self, box_texts: BoxTextList) -> BoxTextList:
        """Recognize cards from raw texts"""
        box_texts.sort()
        box_cards = BoxTextList()
        for box, text, _ in box_texts:
            sug = self.sym_keywords.lookup(text,
                                           Verbosity.CLOSEST,
                                           max_edit_distance=min(3, int(self.max_ratio_diff_keyword * len(text))))
            if sug != []:
                logging.info(f"Keyword rejected: {text} {sug[0].distance/len(text)} {sug[0].term}")
            else:
                card = self._search(self._preprocess(text))
                if card is not None:
                    box_cards.add(box, card)
        return box_cards

    def _assign_stacked(self, box_texts: BoxTextList, box_cards: BoxTextList) -> None:
        """Set multipliers (e.g. x4) for each (stacked) card in `box_cards`

        Parameters
        ----------
        box_texts : BoxTextList
            BoxTextList containing potential multipliers
        box_cards : BoxTextList
            BoxTextList containing recognized cards
        """
        def _assign_stacked_one(box_cards: BoxTextList, m: int, comp) -> None:
            i_min = 0
            for i, box_card in enumerate(box_cards):
                if comp(box_card.box, box_cards[i_min].box):
                    i_min = i
            box_cards[i_min].n = m
            logging.info(f"{box_cards[i_min].text} assigned to x{m}")

        def dist(p: tuple, q: tuple) -> float:
            return (p[0] - q[0])**2 + (p[1] - q[1])**2

        def comp_md(box1: tuple, box2: tuple, box: tuple) -> bool:
            if box1[0] > box[0] or box1[1] > box[1]:
                return False
            return dist(box, box1) < dist(box, box2)

        def comp_sb(box1: tuple, box2: tuple, box: tuple) -> bool:
            return dist(box, box1) < dist(box, box2)

        comp = (comp_md, comp_sb)
        for box, text, _ in box_texts:
            if len(text) == 2:
                for i in [0, 1]:
                    if text[i] in '×xX' and text[1 - i].isnumeric():
                        _assign_stacked_one(box_cards, int(text[1 - i]), partial(comp[i], box=box))

    def _box_cards_to_deck(self, box_cards: BoxTextList) -> Deck:
        """Convert recognized cards to decklist"""
        maindeck, sideboard = Pile(), Pile()
        n_cards = sum(c.n for c in box_cards)
        n_added = 0
        last_main_card = max(60, n_cards - 15)
        for _, card, n in box_cards:

            def add_cards(c, deck, p):
                if c in deck.cards:
                    deck.cards[c] += p
                elif p > 0:
                    deck.cards[c] = p

            n_added_main = max(min(n, last_main_card - n_added), 0)
            add_cards(card, maindeck, n_added_main)
            add_cards(card, sideboard, n - n_added_main)
            n_added += n
        deck = Deck()
        deck.maindeck = maindeck
        deck.sideboard = sideboard
        return deck

    def box_texts_to_deck(self, box_texts: BoxTextList) -> Deck:
        """Convert raw texts to decklist

        Parameters
        ----------
        box_texts : BoxTextList
            Raw texts given by an OCR

        Returns
        -------
        Deck
            Decklist obtained from `box_texts`
        """
        box_cards = self.box_texts_to_cards(box_texts)
        self._assign_stacked(box_texts, box_cards)
        return self._box_cards_to_deck(box_cards)

    def _search(self, text):
        """If `text` can be recognized as a Magic card, return that card. Otherwise, return None."""
        if len(text) < 3:  # a card name is never that short
            return None
        if len(text) > 30:  # a card name is never that long
            logging.info(f"Too long: {text}")
            return None
        if text in self.all_cards:
            return text
        i = text.find("..")  # search for truncated card name
        if i != -1:
            dist = int(self.max_ratio_diff * i)
            card = None
            for c in self.all_cards:
                d = self.edit_dist.compare(text[:i], c[:i], dist)
                if d != -1 and d < dist:
                    card = c
                    dist = d
            if card is None:
                logging.info(f"Not prefix: {text}")
            else:
                logging.info(f"Found prefix: {text} {dist/i} {card}")
                return card
        else:
            text = text.replace('.', '').rstrip(' ')
            sug = self.sym_all_cards.lookup(text,
                                            Verbosity.CLOSEST,
                                            max_edit_distance=min(6, int(self.max_ratio_diff * len(text))))
            if sug != []:
                card = sug[0].term
                ratio = sug[0].distance / len(text)
                if len(text) < len(card) + 7:
                    logging.info(f"Corrected: {text} {ratio} {card}")
                    return card
                logging.info(f"Not corrected (too long): {text} {ratio} {card}")
            else:
                logging.info(f"Not found: {text}")
        return None
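
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes `box_texts` is a BoxTextList produced by the project's OCR step;
# the two cache files are created from mtgjson on first use, and the file
# names / languages below are illustrative.
rec = MagicRecognition(file_all_cards="all_cards.txt",
                       file_keywords="keywords.json",
                       languages=("English", "French"))
deck = rec.box_texts_to_deck(box_texts)  # raw OCR boxes -> Deck
print(deck.maindeck.cards)               # {card name: count, ...}
print(deck.sideboard.cards)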
Exemple #18
0
def SymSpell() -> SymSpellPy:
    symspell = SymSpellPy()

    symspell.load_dictionary('data/dictionary.txt', 0, 1)

    return symspell
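
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes the module imports the library as
# `from symspellpy import SymSpell as SymSpellPy, Verbosity` and that
# data/dictionary.txt is a "term count" frequency list.
checker = SymSpell()
for s in checker.lookup("helo", Verbosity.CLOSEST, max_edit_distance=2):
    print(s.term, s.distance, s.count)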
Exemple #19
0
class spellchecker:
    def __init__(
        self,
        max_dictionary_edit_distance,
        prefix_length,
        unigram_freq_file,
        bigram_freq_file=None,
        pickle_file=None,
    ):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )

        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file, )
        else:
            self.sym_spell.load_dictionary(
                unigram_freq_file,
                term_index=0,
                count_index=1,
                encoding="utf-8",
            )

            if bigram_freq_file:
                self.sym_spell.load_bigram_dictionary(
                    bigram_freq_file,
                    term_index=0,
                    count_index=2,
                    encoding="utf-8",
                )

    def suggest(
        self,
        word,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        # defaults
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup(
            word,
            verbosity,
            max_edit_distance=max_edit_dist,
            include_unknown=include_unknown,
        )
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(
        self,
        phrase,
        max_edit_dist=None,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup_compound(
            phrase,
            max_edit_distance=max_edit_dist,
            # ignore_non_words=False,
            # split_phrase_by_space=True,
        )
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        return tokenize_sentence(phrases)

    # Tokenize into individual phrases and return a list of suggestions for each
    def suggest_tokenize(
        self,
        phrases,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        words = self.tokenize(phrases)

        sentence_suggestions = []
        for word in words:
            suggestions = self.sym_spell.lookup(
                word,
                verbosity,
                max_edit_distance=max_edit_dist,
                include_unknown=include_unknown,
            )
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })

        return sentence_suggestions
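
# --- Hypothetical usage sketch (not part of the original snippet) ---
# The dictionary files are the ones bundled with symspellpy; the edit
# distances and test phrases are illustrative.
import pkg_resources

unigrams = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigrams = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sc = spellchecker(max_dictionary_edit_distance=2,
                  prefix_length=7,
                  unigram_freq_file=unigrams,
                  bigram_freq_file=bigrams)
print(sc.suggest("memebers", max_edit_dist=2)["suggestions"][0].term)  # e.g. "members"
print(sc.suggest_compound("whereis th elove", max_edit_dist=2)["suggestions"][0].term)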
Exemple #20
0
def pipeline_ocr_export_no_binarization(SHAPE_ROTATION_RESULTS_PATH,
                                        EXPORT_RESULTS_PATH,
                                        tess_config,
                                        batch_size,
                                        limit=None):
    '''
    run this function if binarization has already been applied or is not needed
        SHAPE_ROTATION_RESULTS_PATH - source (files from rotation output)
        EXPORT_RESULTS_PATH - export
    '''

    # load and init symspell library
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    # load gridsearch data
    df_valid_files = pd.read_csv(SHAPE_ROTATION_RESULTS_PATH)
    valid_files_list = list(df_valid_files.file)
    print('total files:\t', len(valid_files_list))

    # get new files
    col_names = [
        'row_nr', 'level', 'page_num', 'block_num', 'par_num', 'line_num',
        'word_num', 'left', 'top', 'width', 'height', 'conf', 'text',
        'text_low', 'symspell_sc', 'symspell_ws', 'file'
    ]
    new_files = get_new_files_to_be_processed(path=EXPORT_RESULTS_PATH,
                                              col_names=col_names,
                                              index_col='file',
                                              all_files=valid_files_list)

    # get list shaped input only filename
    new_files = df_valid_files[df_valid_files.file.isin(
        new_files)].file.to_list()
    if limit is not None:
        new_files = new_files[:limit]

    new_files_batches = list(divide_chunks(new_files, batch_size))
    print('total batches', len(new_files_batches), 'total files',
          len(new_files))

    conf = tess_config
    if len(new_files_batches) > 0:
        for batch in new_files_batches[:]:
            args_1 = batch
            args_2 = len(args_1) * [sym_spell]
            args_3 = len(args_1) * [conf]
            args_4 = len(args_1) * [None]
            args_5 = len(args_1) * [True]
            args_6 = len(args_1) * [False]
            args_7 = len(args_1) * [0]
            args_8 = len(args_1) * [False]
            all_args = list(
                zip(args_1, args_2, args_3, args_4, args_5, args_6, args_7,
                    args_8))

            print("Starting Batch..")
            pool = multiprocessing.Pool()
            results = pool.starmap(extract_text_from_image, all_args)

            df_tmp_results = pd.concat(results).to_records()

            with open(EXPORT_RESULTS_PATH, 'a', newline='') as outcsv:
                writer = csv.writer(outcsv)
                writer.writerows(df_tmp_results)

            cv2.destroyAllWindows()
            pool.close()
            pool.terminate()
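
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Paths and the Tesseract config string are illustrative; the multiprocessing
# pool used inside requires the __main__ guard.
if __name__ == "__main__":
    pipeline_ocr_export_no_binarization(
        SHAPE_ROTATION_RESULTS_PATH="results/rotation_results.csv",
        EXPORT_RESULTS_PATH="results/ocr_export.csv",
        tess_config="--oem 3 --psm 6",
        batch_size=8,
        limit=10)
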
class WordCorrection:
    """
    Provides the ability to correct input.
    """

    __instance = None

    @staticmethod
    def get_instance(edit_distance=None):
        """ Static access method. """
        if WordCorrection.__instance is None:
            WordCorrection(edit_distance)
        return WordCorrection.__instance

    def __init__(self, edit_distance=None):
        """
            Creates the SymSpell object to be used for word correction.
            Language frequency lists are loaded from files originally downloaded from a Git repository of frequency dictionaries.

            Args:
                edit_distance (int): The maximum edit distance used for the correction (default is 2).
        """
        if WordCorrection.__instance is not None:
            raise Exception("An instance of this class already exists")
        else:
            WordCorrection.__instance = self

            self.settings_manager = SettingsManager.get_instance()
            self.settings_manager.attach(self)  # attach the object so it gets observed when the language changes

            self.language = self.settings_manager.get_language()

            self.spell = SymSpell(max_dictionary_edit_distance=edit_distance)
            self.spell.load_dictionary(os.path.join(os.path.dirname(os.path.abspath(__file__)),"dictionaries", "frequency_lists", self.language + ".txt"),
                                       0, 1, encoding="utf-8")

    def update(self):
        """Updates the language at use if it has been changed."""

        new_lang = self.settings_manager.get_language()
        if self.language == new_lang:
            return

        self.language = new_lang
        self.spell.load_dictionary(os.path.join(os.path.dirname(os.path.abspath(__file__)),"dictionaries", "frequency_lists", self.language + ".txt"),
                                   0, 1, encoding="utf-8")

    def correct(self, word):
        """
        Returns a correction of the word.

         Args:
             word (string): The word to be corrected.

        Returns:
            string: Word as predicted by the module.
        """

        suggestions = self.spell.lookup(word,
                                        verbosity=Verbosity.CLOSEST,
                                        include_unknown=True,
                                        transfer_casing=True)

        suggestion = str(suggestions[0]).split(", ")[0]

        return suggestion
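
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes a frequency list for the configured language exists under
# dictionaries/frequency_lists/ next to this module.
corrector = WordCorrection.get_instance(edit_distance=2)
print(corrector.correct("helo"))  # e.g. "hello" with an English frequency list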
Exemple #22
0
def extract_for_hocs_showcase(GS_PATH, HOCR_RESULTS_PATH, HOCR_DIR, conf,
                              limit, N_CPU, batch_size):
    '''
    df_gs - input dataframe from gridsearch
    '''
    # init symspell library
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    # index list of filenames
    filename_path = HOCR_DIR + 'filenames_index.csv'
    files = ['choose file>']
    language = 'eng+deu+fra'

    # filter
    df_valid_files = pd.read_csv(GS_PATH)
    df_valid_files = df_valid_files[df_valid_files.length > 100]
    df_valid_files = df_valid_files.sort_values(by=["measure"],
                                                ascending=False)

    valid_files_list = list(df_valid_files.file)
    print('total files:\t', len(valid_files_list))

    # get new files
    col_names = ['file', 'name']
    new_files = get_new_files_to_be_processed(path=HOCR_RESULTS_PATH,
                                              col_names=col_names,
                                              index_col='file',
                                              all_files=valid_files_list)

    # get array shaped input parameters
    new_files = df_valid_files[df_valid_files.file.isin(new_files)].values
    if limit is not None:
        new_files = new_files[:limit]

    new_files_batches = list(divide_chunks(new_files, batch_size))
    print('total batches', len(new_files_batches), 'total files',
          len(new_files))

    files = []
    cv2.destroyAllWindows()
    if len(new_files_batches) > 0:
        for batch in new_files_batches[:]:
            args_1 = batch
            args_2 = len(args_1) * [HOCR_DIR]
            args_3 = len(args_1) * [sym_spell]
            args_4 = len(args_1) * [conf]
            all_args = list(zip(args_1, args_2, args_3, args_4))

            print("Starting Batch..")
            pool = multiprocessing.Pool(processes=N_CPU)
            results = pool.starmap(single_hocr_extract, all_args)

            files.extend(results)
            with open(HOCR_RESULTS_PATH, 'a', newline='') as outcsv:
                writer = csv.writer(outcsv)
                writer.writerows(results)

            cv2.destroyAllWindows()
            pool.close()
            pool.terminate()

        filename_path = HOCR_DIR + 'filenames_index.csv'
        files = ['data/' + i[0] for i in files]
        pd.DataFrame({'filename': files}).to_csv(filename_path, index=False)
Exemple #23
0
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a dict of values.  Keys are
    `review`, `emojis`, and `emoji-sentiment`.
    """
    def __init__(self, lang='ta'):
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module. Create the file alltextlang.txt by calling
        # detect_lang_and_store in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng',
                                       target='tam',
                                       build_lookup=True)
        self.ml_trans = Transliterator(source='eng',
                                       target='mal',
                                       build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0,
            count_index=1,
            separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        lmap = {}
        with open(mapfile, 'r') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)

            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment

            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word,
                                                        Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if len(suggestions) > 0 and suggestions[0].distance < 3:
                        print(word, suggestions[0].term)
                        # no match with dictionary, we need a more comprehensive dictionary plus phonetic similarity
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correct here for added normalisation
            # print(lang, review_trans)
            features['soundexes'][i] = ' '.join([
                self.soundexer.soundex(word) for word in review_trans.split()
            ])
        return features
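
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes the module-level helpers and the resource files referenced in
# __init__ (language map, Tamil vocabulary) are available; the review
# strings are illustrative.
fx = FeatureExtractor(lang='ta')
reviews = ["padam semma mass", "worst movie"]
feats = fx.fit(reviews).transform(reviews)
print(feats['lang_tag'][0], feats['len_range'][0], feats['soundexes'][0])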
class Spellchecker:
    """The class responsible for token spellchecking"""
    MAX_EDIT_DISTANCE = 3
    MAX_LEVENSHTEIN_DISTANCE = 4

    def __init__(self, dictionary_path):
        """
        Args:
            dictionary_path (str): the path of the token frequency dict
        """
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=Spellchecker.MAX_EDIT_DISTANCE, )

        loaded = self.sym_spell.load_dictionary(dictionary_path,
                                                term_index=0,
                                                count_index=1,
                                                separator=" ")
        assert loaded
        print("Loaded SymSpell dictionary.")

    def _replace_in_df(self, df, to_replace, value):
        df["TOKEN"] = df["TOKEN"].str.replace(to_replace, value)

    def manual_spellcheck(self, df):
        """
        Manual spellcheck that looks for KNOWN and repeatable patterns 
        in the data that can be replaced by a correct token/word.

        Args:
            df (pandas.Dataframe): the dataframe that will get spellchecked
        """
        self._replace_in_df(df, "jebesmal", "jedesmal")
        self._replace_in_df(df, "Thatsache", "tat")
        self._replace_in_df(df, "Giengen", "gingen")
        self._replace_in_df(df, "Gcgenthcrl", "Gegenteil")
        self._replace_in_df(df, "\? unastie", "Dynastie")
        self._replace_in_df(df, "L rnnburg", "Luxemburg")
        self._replace_in_df(df, "Jstzt", "jetzt")
        self._replace_in_df(df, "Glaubensbekcnntniß", "Glaubenbekenntniss")
        self._replace_in_df(df, "T u r i n", "Turin")
        self._replace_in_df(df, "nöthicM", "nötigen")

    def automated_spellcheck(self, df):
        """
        Automated spellcheck that looks for unpredictable OCR errors in the
        data and replaces them with hopefully correct tokens.

        Args:
            df (pandas.Dataframe): the dataframe that will get spellchecked
        """
        wikidata_client = Client()
        tokenizer = Tokenizer(replace_not_contraction=False)
        ignore_regex_str = "^[0-9.!?*„_\-\—,;:<>='|\[\]\"()^«»/°•©>]+"
        ignore_regex = re.compile(ignore_regex_str)

        for index, row in df.iterrows():
            # Get the token
            token = row["TOKEN"]
            wiki_metadata = row["NEL-LIT"]

            # Autocorrect
            suggestions = self.sym_spell.lookup(
                token,
                Verbosity.TOP,
                transfer_casing=True,
                include_unknown=True,
                ignore_token=ignore_regex_str,
                max_edit_distance=Spellchecker.MAX_EDIT_DISTANCE)
            # Save the first suggestion if we have one
            if suggestions and suggestions[0].term != token.lower():
                if wiki_metadata.startswith('Q'):
                    # 1. 'Qxxxx' - Use the Wikidata column value to spellcheck
                    if ignore_regex.match(token):
                        # token should be ignored
                        continue

                    wikidata_entity = wikidata_client.get(wiki_metadata)
                    try:
                        wikidata_label = wikidata_entity.attributes['labels'][
                            'de']['value']
                    except KeyError:
                        # the wikidata has no 'de' entry for the label, ignore spellcorrection
                        continue

                    wikidata_labels = tokenizer.tokenize(wikidata_label)
                    wikidata_labels = map(lambda t: t.value, wikidata_labels)
                    wikidata_labels = filter(
                        lambda t: not ignore_regex.match(t), wikidata_labels)
                    wikidata_labels = list(wikidata_labels)

                    # Check if the token is not an abbreviation
                    is_abbreviation = False
                    for sublabel in wikidata_labels:
                        if sublabel.startswith(token):
                            print(token, "(abbrev) ->", sublabel, " | ",
                                  wiki_metadata)
                            df.at[index, 'TOKEN'] = sublabel
                            is_abbreviation = True
                            break

                    if is_abbreviation:
                        continue

                    try:
                        best_match = sorted(
                            wikidata_labels,
                            key=lambda t: distance(t, token))[0]
                    except IndexError:
                        continue

                    if distance(
                            best_match,
                            token) <= Spellchecker.MAX_LEVENSHTEIN_DISTANCE:
                        print(token, "(best_match) ->", best_match, " | ",
                              wiki_metadata)
                        df.at[index, 'TOKEN'] = best_match
                else:
                    # 2. 'NIL' / '_' - Use symspell
                    suggestion = suggestions[0].term
                    print(token, "(symspell) ->", suggestion, " | ",
                          wiki_metadata)
                    df.at[index, 'TOKEN'] = suggestion
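
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes a whitespace-separated "token count" frequency file; the dataframe
# mimics the TOKEN / NEL-LIT columns expected by the methods above, and the
# file name is illustrative.
import pandas as pd

checker = Spellchecker("word_freq_list_de.txt")
df = pd.DataFrame({"TOKEN": ["Thatsache", "Jstzt", "Berlln"],
                   "NEL-LIT": ["_", "_", "NIL"]})
checker.manual_spellcheck(df)     # fixes known, repeatable OCR patterns in place
checker.automated_spellcheck(df)  # SymSpell / Wikidata based corrections
print(df["TOKEN"].tolist())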
STOPWORDS_NLTK = set(stopwords.words('english'))
STOPWORDS_SPACY = sp.Defaults.stop_words

lemmatiser_nltk = WordNetLemmatizer()
nlp = spacy.load(name='en_core_web_sm', disable=['parser', 'ner'])

# load symspellpy's bundled English unigram and bigram frequency dictionaries
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)

# term_index is the column of the term and
# count_index is the column of the term frequency
sym_spell.load_dictionary(corpus=dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(corpus=bigram_path, term_index=0, count_index=2)


def get_n_word_strings(terms: list, n: int) -> list:
    """
    Extract all n-word strings from a list of varying n-word strings.

    :param terms: List of strings to extract from.
    :param n: Integer of words in a string to extract by.
    :return: List of n-word strings.
    """
    try:
        if isinstance(terms, str):
            terms = list(terms)
            return get_n_word_strings(terms, n)
from itertools import islice
import pkg_resources
from symspellpy import SymSpell

sym_spell = SymSpell()

dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, 0, 1)
print(list(islice(sym_spell.words.items(), 5)))
Exemple #27
0
    def test_lookup_compound_no_bigram(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = "whereis th elove"
        correction = "whereas the love"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(2, results[0].distance)
        self.assertEqual(64, results[0].count)

        typo = "the bigjest playrs"
        correction = "the biggest players"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(2, results[0].distance)
        self.assertEqual(34, results[0].count)

        typo = "Can yu readthis"
        correction = "can you read this"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(3, results[0].distance)
        self.assertEqual(3, results[0].count)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("whereas the love head dated for much of the past who "
                      "couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(0, results[0].count)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of "
                      "a secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(0, results[0].count)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(0, results[0].count)

        typo = ("Can yu readthis messa ge despite thehorible sppelingmsitakes")
        correction = ("can you read this message despite the horrible "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(10, results[0].distance)
        self.assertEqual(0, results[0].count)
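
# --- Hypothetical companion sketch (not part of the original test) ---
# The same lookup_compound call, but with the bundled bigram dictionary
# loaded as well, which generally improves segmentation of run-together words.
import pkg_resources
from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary(pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"), 0, 1)
sym_spell.load_bigram_dictionary(pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt"), 0, 2)
print(sym_spell.lookup_compound("whereis th elove", max_edit_distance=2)[0].term)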
def preprocess(tweets, params=None):
    """
    INPUT:
        list:tweets (e.g pos_train,neg_train) --->a list of raw tweets.
        dict:params (to pass options to the preprocess)--> e.g. params={'stemmeriz':True,'stop_words_removal':True,'lemmatize':True,'emoticons_2_str':True,'expand_not':True,'autocorr':True,'viterbi_segmenter':True,'del_deplicates':True}
    OUTPUT:
        list, a list of list of tokens.--->each tweet is a list composed of tokens
    
    DESCRIPTION:
        Implement a series of standard text preprocessing to input tweets, including:

        remove all nonalpha words,
        remov all stopwords,
        remove hashtag and lemmetize, stmmize words with NLTK, etc
    
    
    
    """
    if params == None:
        return tweets

    else:

        stemmerize = params.get('stemmerize')
        lemmatize = params.get('lemmatize')
        stop_words_removal = params.get('stop_words_removal')
        emoticons_2_str = params.get('emoticons_2_str')
        expand_not = params.get('expand_not')
        autocorr = params.get('autocorr')
        viterbi_segmenter = params.get('viterbi_segmenter')
        del_deplicates = params.get('del_deplicates')
    """tokenize tweets using tokenizer tool from nltk.TweetTokenizer"""
    """
    Delete duplicates in the input datasets,respectively 9.77%(9777) and 8.91%(8912)
    Should be False if process test_tweets
    """
    if del_deplicates:
        tweets = delete_deplicates(tweets)

    tweets = tokenize(tweets)
    """remove stopwords"""
    if stop_words_removal:
        for index in range(len(tweets)):
            tweets[index] = list(
                filter(
                    lambda x: x not in invalid_string and x not in stop_words,
                    tweets[index]))

    if emoticons_2_str:
        """turn emoticons into string"""
        for index in range(len(tweets)):
            for i in range(len(tweets[index])):
                if tweets[index][i] in emoticonsHappySet:
                    tweets[index][i] = 'happy'
                if tweets[index][i] in emoticonsSadSet:
                    tweets[index][i] = 'sad'

    if stemmerize:
        """lemmetize, stmmize words with NLTK. Lemmatization == Stemming++ """
        for index in range(len(tweets)):
            for i in range(len(tweets[index])):

                tweets[index][i] = stemmer.stem(tweets[index][i])

    if lemmatize:
        for index in range(len(tweets)):
            for i in range(len(tweets[index])):
                tweets[index][i] = lemmatize_single(tweets[index][i])

    if lemmatize or stemmerize or expand_not or emoticons_2_str:
        for index in range(len(tweets)):
            # again remove the empty string
            tweets[index] = list(filter(lambda x: x, tweets[index]))
            # join and resplit so that 'you re' becomes 'you', 'are' etc
            tweets[index] = (" ".join(tweets[index])).strip().split()
            # again remove stopwords
            tweets[index] = list(
                filter(lambda x: x not in invalid_string and x not in stop_words,
                       tweets[index]))

    if autocorr:
        symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        symspell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        tweets = autocorrect(tweets, symspell)

    if viterbi_segmenter:
        tweets = join_resplit(tweets_viterbi_segment(tweets))
        tweets = filter_one_letter_word(tweets)

    for index in range(len(tweets)):
        tweets[index] = ' '.join(tweets[index])

    return tweets
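
# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes the module-level helpers used above (tokenize, stop_words, stemmer,
# emoticon sets, ...) are available; the tweets and flags are illustrative.
sample = ["I loooove this moviiie :)", "worst film everrr :("]
cleaned = preprocess(sample,
                     params={'stemmerize': True, 'stop_words_removal': True,
                             'lemmatize': False, 'emoticons_2_str': True,
                             'expand_not': False, 'autocorr': False,
                             'viterbi_segmenter': False, 'del_deplicates': False})
print(cleaned)  # list of whitespace-joined, preprocessed tweets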
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.tokenize import word_tokenize
from language_detector import detect_language

import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
if sym_spell.word_count:
    pass
else:
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)


###################################
#### sentence level preprocess ####
###################################

# lowercase + base filter
# some basic normalization
def f_base(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # normalization 1: xxxThis is a --> xxx. This is a (missing delimiter)
    s = re.sub(r'([a-z])([A-Z])', r'\1. \2', s)  # before lower case
Exemple #30
0
    * \
        https://github.com/mammothb/symspellpy
"""
from typing import List
from symspellpy import SymSpell, Verbosity
from pythainlp.corpus import get_corpus_path
from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.tokenize import word_tokenize

_UNIGRAM = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(path_pythainlp_corpus(_UNIGRAM),
                          0,
                          1,
                          separator='\t',
                          encoding="utf-8-sig")
sym_spell.load_bigram_dictionary(get_corpus_path(_BIGRAM),
                                 0,
                                 2,
                                 separator='\t',
                                 encoding="utf-8-sig")


def spell(text: str, max_edit_distance: int = 2) -> List[str]:
    return [
        str(i).split(',')[0] for i in list(
            sym_spell.lookup(
                text, Verbosity.CLOSEST, max_edit_distance=max_edit_distance))
    ]
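
# --- Hypothetical usage sketch (not part of the original module) ---
# The Thai word below is an illustrative misspelling; suggestions come from
# the TNC unigram frequency list loaded above.
print(spell("สังเกตุ", max_edit_distance=1))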