Example #1

import csv
import glob
import os

from enchant.checker import SpellChecker

def corriger_dossier(chemin_dossier_corpus, chemin_dossier_sortie):

    # Create the output directory if it does not exist
    if not os.path.exists(chemin_dossier_sortie):
        os.makedirs(chemin_dossier_sortie)

    # Spell-checker instance (the language code is the argument)
    chkr = SpellChecker("fr")

    # Load the custom error list
    charger_liste = True

    if charger_liste:
        # Path to the CSV file, with rows of the form error,correction
        chemin_liste = "Correction_automatique.csv"

        # Does the file have a header row? True if so, False otherwise
        presence_entetes = True

        # If the file has a header row (presence_entetes = True), define it here
        # The first value is the column name of the erroneous forms, the second that of the correct forms
        liste_entetes = ["Erreur observée", "Correction proposée"]

        delimiteur = "\t"

        with open(chemin_liste, "r", encoding="utf-8", newline='') as csvfile:
            if not presence_entetes:
                reader = csv.reader(csvfile, delimiter=delimiteur)
                for row in reader:
                    chkr.replace_always(row[0], row[1])
                    # add the new word to the dictionary
                    chkr.add(row[1])
            else:
                reader = csv.DictReader(csvfile, delimiter=delimiteur)
                for row in reader:
                    chkr.replace_always(row[liste_entetes[0]],
                                        row[liste_entetes[1]])
                    # add the new word to the dictionary
                    chkr.add(row[liste_entetes[1]])

    # Path to the tab-separated file listing each error with its proposed correction and source file
    fichier_erreurs = "%s/erreurs.csv" % chemin_dossier_sortie

    with open(fichier_erreurs, "w", encoding="utf-8", newline='') as csvfile:
        fieldnames = [
            "Erreur_détectée", "Correction_proposée", "Contexte", "Fichier"
        ]
        spamwriter = csv.writer(csvfile, delimiter="\t")
        spamwriter.writerow(fieldnames)

        for fichier in glob.glob("%s/*" % chemin_dossier_corpus):
            extension = fichier.split(".")[-1]
            if extension == "txt" or extension == "xml":
                print("Processing file %s" % fichier)
                nom_fichier = os.path.basename(fichier)
                sans_extension = nom_fichier.split(".")[0]
                if extension == "xml":
                    contenu = lire_TEI_XML(fichier)
                else:
                    with open(fichier, 'r', encoding="utf-8") as fin:
                        contenu = fin.read()

                correction_contenu, lignes_erreurs = correction(
                    contenu, chkr, sans_extension)
                for ligne in lignes_erreurs:
                    spamwriter.writerow(ligne)
                with open("%s/%s_correction.txt" %
                          (chemin_dossier_sortie, sans_extension),
                          'w',
                          encoding="utf-8") as fout:
                    fout.write(correction_contenu)
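
# Note: lire_TEI_XML() and correction() are helpers defined elsewhere in the
# original file. A minimal sketch of correction(), inferred from the call
# above (an assumption, not the original code): walk the checker's errors,
# apply the first suggestion, and collect one row per error matching the
# fieldnames written to erreurs.csv.
def correction(contenu, chkr, nom_fichier):
    chkr.set_text(contenu)
    lignes_erreurs = []
    for err in chkr:
        suggestions = err.suggest()
        proposition = suggestions[0] if suggestions else err.word
        contexte = err.leading_context(30) + err.word + err.trailing_context(30)
        lignes_erreurs.append([err.word, proposition, contexte, nom_fichier])
        err.replace(proposition)
    return chkr.get_text(), lignes_erreurs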
Example #2
 def __init__(self):
     self.checker = SpellChecker("en")
Example #3
from DictionaryOfNewZealandEnglish.headword.citation.models import Citation
from DictionaryOfNewZealandEnglish.headword.models import Headword
from DictionaryOfNewZealandEnglish.settings import Config
from enchant.checker import SpellChecker
from DictionaryOfNewZealandEnglish.database import db
import multiprocessing as mp
from functools import partial
from collections import deque

from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from DictionaryOfNewZealandEnglish.settings import ProdConfig

# Initialise dictionary
checker = SpellChecker(Config.SPELLCHECK_LANG)

# Set multiprocessing
MUTLIP = True


def fixline(text):
    '''
    Takes text with concatenated words and does its best to put the
    spaces back in using the pyenchant SPELLCHECK_LANG dictionary.
    Derived from:
    http://stackoverflow.com/questions/23314834/tokenizing-unsplit-words-from-ocr-using-nltk
    '''
    checker.set_text(text)
    for error in checker:
        for suggestion in error.suggest():
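            # --- The snippet is truncated here. A plausible completion,
            # following the Stack Overflow approach cited above (an
            # assumption, not the original code): accept a suggestion that
            # is the error word with the spaces put back in.
            if suggestion.replace(' ', '') == error.word:
                error.replace(suggestion)
                break
    return checker.get_text()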
Example #4
def check_spellings(tweet):
    checker = SpellChecker("en_US",
                           filters=[EmailFilter, URLFilter, HashTagFilter])
    checker.set_text(tweet.text)
    return checker
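
# Usage sketch (illustrative; assumes `tweet` is any object with a `.text`
# attribute, as in the function above):
#   checker = check_spellings(tweet)
#   misspelled = [err.word for err in checker]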
Example #5
from enchant.tokenize import Filter

# `recrypto` is a compiled regular expression defined earlier in the original
# file (not shown); AcronymFilter, OptionFilter, PackageFilter and PathFilter
# are custom Filter subclasses defined there as well.

class CryptoAddressFilter(Filter):
    def _skip(self, word):
        return recrypto.match(word)


if __name__ == '__main__':
    import os
    import sys
    from enchant.checker import SpellChecker

    checker = SpellChecker('en_US',
                           sys.stdin.read(),
                           filters=[
                               AcronymFilter,
                               OptionFilter,
                               PackageFilter,
                               PathFilter,
                           ])

    with open(os.path.join(os.path.dirname(__file__), 'words.txt')) as f:
        for line in f:
            checker.add(line.strip())

    has_error = False
    for error in checker:
        print(f'Spelling error: {error.word}')
        print(
            f'Context: {error.leading_context(30)}{error.word}{error.trailing_context(30)}'
        )
        has_error = True
Example #6
     
  # check the letters-to-length ratio (txlen is computed earlier, not shown)
  Nchar = 0
  for char in text:
      if char.isalpha() or char == u" ":
          Nchar += 1
  lr = Nchar / float(txlen)
  if lr < 0.7:
      letter_ratio += 1
      continue
  
  # remove all special characters apart from the character set below
  text = re.sub('[^A-Za-z0-9.,!?\s\\xf6\\xfc\\xe4\\xdf\\u00DF]', '', text)

  # spell checking
  chkr = SpellChecker("de_CH", text)
  lasterror = ''
  lasterror2 = ''
  lasterror3 = ''
  Nerr = 0
  for err in chkr:
      repl = ''
      if lasterror == err.word or lasterror2 == err.word or lasterror3 == err.word:
          continue
      lasterror3 = lasterror2
      lasterror2 = lasterror
      lasterror = err.word
      Nerr += 1
      repl = err.suggest()
      try:
          if repl[0] == err.word:
Example #7
 def __init__(self, dictionary):
     self.dictionary = dictionary
     self.check = SpellChecker("en_US")
     self.stemmer = PorterStemmer()
Example #8
# setup paths
env = Path('.') / '.env'
load_dotenv(dotenv_path=env)

# setup flask
app = Flask(__name__)
slack_event_adapter = SlackEventAdapter(os.environ['SIGNING_SECRET'],
                                        '/slack/events', app)

# setup slack
client = slack.WebClient(token=os.environ['SLACK_TOKEN'])
bot_id = client.api_call('auth.test')['user_id']

# setup nlu
spell_check = SpellChecker('en_US')
nlp = spacy.load('en_core_web_sm')
gc = geonamescache.GeonamesCache()


def correct(message: str) -> str:
    '''
    corrects any spelling mistakes with most likely suggested word

    Arguments:
    - message: the message provided

    Returns:
    - the corrected string
    '''
    spell_check.set_text(message)
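    # --- The snippet is truncated here. A plausible completion (an
    # assumption, not the original code): replace every error with its top
    # suggestion and return the corrected text.
    for err in spell_check:
        suggestions = err.suggest()
        if suggestions:
            err.replace(suggestions[0])
    return spell_check.get_text()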
Example #9
from PIL import Image 
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from pdf2image import convert_from_path 

from enchant.checker import SpellChecker
special_words = ['coronavirus','covid','ncov','cov','wuhan','hubei','covax',
                 'biontech','lgus','lgu','comorbidities','Listahanan','sars',
                 'dfa','dilg','doj','dotr','doh','pcr','roa','duterte']

custom_stop_words = ['whereas','shall','via']
stop_words = set(stopwords.words('english'))
stop_words.update(custom_stop_words)

english_spell_checker = SpellChecker("en_US")
for sp in special_words: english_spell_checker.add(sp)

def is_in_english(quote, max_error_count = 3, min_text_length = 3):
  english_spell_checker.set_text(quote)
  errors = [err.word for err in english_spell_checker]
  errors = list(set(errors)) 
  return (len(errors) <= max_error_count and
          len(quote.split()) >= min_text_length)

def remove_header_from_image(imageFilePath, threshold=0.06):
    template_files = [f for f in glob.glob("images/header_templates/*.jpg")]
    
    for headerFilePath in template_files:
        # Read the images from the file
        header_image = cv2.imread(headerFilePath)
Example #10
		contenu = contenu.replace("\n", " ")
	return contenu


# Path to the folder containing the text or XML-TEI files to correct
dossier_corpus = "Evaluation/Correction/Corriger"

# Path to the folder where the outputs will be saved
dossier_sortie = "%s_script_correction" % dossier_corpus

# Create the output directory if it does not exist
if not os.path.exists(dossier_sortie):
	os.makedirs(dossier_sortie)

# Spell-checker instance (the language code is the argument)
chkr = SpellChecker("fr")

# Load the custom error list
charger_liste = False


if charger_liste:
	# Path to the CSV file, with rows of the form error,correction
	chemin_liste = "Liste_correction/liste.csv"

	# Does the file have a header row? True if so, False otherwise
	presence_entetes = False

	# If the file has a header row (presence_entetes = True), define it here
	# The first value is the column name of the erroneous forms, the second that of the correct forms
	liste_entetes = ["forme_erronée", "forme_correcte"]
Example #11
def getAspellChecker():
    return SpellChecker("en_US")
Example #12
import enchant
from enchant.checker import SpellChecker

checker = SpellChecker('ru_RU')
pwl = enchant.request_pwl_dict('NewWords.txt')
new_dictionary = enchant.DictWithPWL('ru_RU', 'NewWords.txt')


def CheckText(text):
    checker.set_text(text)
    answer = ''
    for i in checker:
        word = i.word
        if not new_dictionary.check(word):
            if answer == '':
                answer = answer + word
            else:
                answer = answer + ', ' + word
    return answer


def AddNewWord(word):
    with open('NewWords.txt', 'a', encoding='utf-8') as file:
        if not new_dictionary.check(word):
            file.write('\n' + word)
            return "Добавлено!"  # "Added!"
        else:
            return "Слово уже есть в словаре!"  # "The word is already in the dictionary!"
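
# Usage sketch (illustrative, not from the original source):
#   print(CheckText("превед медвед"))   # lists words found in neither ru_RU nor NewWords.txt
#   print(AddNewWord("медвед"))         # appends to NewWords.txt if the word is unknown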
Example #13
def parse():
    src_dir = '.'
    app_dir = '..'

    export_path = '{}/src/assets/data/jokes.json'.format(app_dir)

    original = {}
    with open(export_path, 'r') as data_file:
        original = json.load(data_file)
    original = {o['id']: o for o in original}

    base_url = 'http://perelki.net/'
    response = requests.get(base_url,
                            headers={'User-Agent': generate_user_agent()})
    btree = etree.HTML(response.content)

    first_page = 1
    last_page = int(btree.xpath('/html/body/div[6]/a[last()]')[0].text)

    jokes = []
    for page_nr in tqdm(range(first_page, last_page + 1), 'Parsing jokes'):
        page_url = base_url + '?ps={}'.format(page_nr)
        page_response = requests.get(
            page_url, headers={'User-Agent': generate_user_agent()})
        ptree = etree.HTML(page_response.content)

        nodes = ptree.xpath(
            '/html/body/div[5]/div[@class="container" and @class!="ad"]')

        for node in nodes:
            meta_node = node.find('div[@class="about"]')

            # Many elements on the page have the "container" class; skip them
            # when no joke metadata is available
            if meta_node is None:
                continue

            node.remove(meta_node)

            pq_detail = pq(etree.tostring(node, encoding='unicode'))
            pq_meta = pq(etree.tostring(meta_node, encoding='unicode'))

            joke_text = (pq_detail.html().replace('<br/>', '\n').replace(
                '&#13;', '').replace(' ? ', ' - ').replace('\r\n',
                                                           '\n').strip())
            joke_id = pq_meta.find('a')[1].text.strip()
            joke_rate = int(pq_meta.find('span')[1].text.strip())
            joke_date = pq_meta.find('span')[2].text.strip()

            try:
                # Some jokes don't have any author
                joke_author = pq_meta.find('span')[3].text.strip()
            except IndexError:
                joke_author = None

            joke = original.get(joke_id)
            if joke:
                joke.update({'rate': joke_rate})
            else:
                checker = SpellChecker("pl")
                checker.set_text(joke_text)

                is_valid = True
                for error in checker:
                    is_valid = False
                    word = error.word

                    suggestions = checker.suggest(word)
                    try:
                        suggestion = suggestions[0]
                    except IndexError:
                        pass
                    else:
                        error.replace(suggestion)

                if not is_valid:
                    joke_text = checker.get_text()

                joke = {
                    'id': joke_id,
                    'is_checked': False,
                    'content': joke_text,
                    'rate': joke_rate,
                    'date': joke_date,
                    'author': joke_author
                }
            jokes.append(joke)

    # Copy parsed file
    with open(export_path, 'w') as outfile:
        json.dump(jokes,
                  outfile,
                  sort_keys=False,
                  indent=2,
                  ensure_ascii=False)
Example #14
def evaluate_pyenchant_builtin(input, lang_code):
    import enchant
    from enchant.checker import SpellChecker

    input = build_article_information(input)

    result_content = "{ \"predictions\": [\n"

    for aidx, article in enumerate(input):
        for sidx, sentence in enumerate(article.sentences):
            chkr = SpellChecker(lang_code)

            chkr.set_text(sentence)
            suggestions = {}

            # We will need this to restructure the sentence
            dummy_tokens, dummy_spaces = call_regex(sentence)
            tokens, spaces = call_regex(sentence)

            def wordpos2token(word_pos, tkns, spcs):
                char_counter = 0
                for tidx, t in enumerate(tkns):
                    if word_pos == char_counter:
                        return tidx
                    char_counter += len(t)
                    if spcs[tidx]:
                        char_counter += 1
                    if tidx == len(tkns):
                        return tidx
                return (len(tkns) - 1)

            for err in chkr:
                word_pos = err.wordpos
                tidx = wordpos2token(word_pos, dummy_tokens, dummy_spaces)
                suggests = err.suggest()
                if len(suggests) == 1:
                    tokens[tidx] = suggests[0].replace("\\", "\\\\").replace(
                        "\"", "\\\\\"")
                elif len(suggests) > 1:
                    tokens[tidx] = suggests[0].replace("\\", "\\\\").replace(
                        "\"", "\\\\\"")
                    suggestions[tidx] = [
                        sugg.replace("\\", "\\\\").replace("\"", "\\\\\"")
                        for sugg in suggests[1:]
                    ]
                elif len(suggests) == 0:
                    tokens[tidx] = err.word.replace("\\", "\\\\").replace(
                        "\"", "\\\\\"")
                    word_pos = err.wordpos
                if tokens[tidx] == "\\":
                    tokens[tidx] = "\\\\"

            #tokens, spaces = call_regex(response)
            realNumTokens = len(tokens)
            shift = 0
            for tidx, token in enumerate(tokens):
                token = token.strip()

                if " " in token and ((len(token.split(" ")[0]) != 0) and
                                     (len(token.split(" ")[-1]) != 0)):
                    realNumTokens += len(token.split(" ")) - 1
                    for tt in token.split(" "):
                        result_content += generate_token_information(
                            aidx, sidx, tidx + shift, tt,
                            suggestions[tidx] if tidx in suggestions else [],
                            dummy_spaces[tidx], tidx + shift <
                            (realNumTokens - 1))
                        shift += 1
                    shift -= 1
                else:
                    result_content += generate_token_information(
                        aidx, sidx, tidx + shift, token,
                        suggestions[tidx] if tidx in suggestions else [],
                        dummy_spaces[tidx], tidx + shift < (realNumTokens - 1))
            if (aidx <
                (len(input) - 1)) or (sidx < len(article.sentences) - 1):
                result_content += ",\n"

    result_content += "  ]\n}"

    return result_content
Example #15
def main():
    config = configparser.ConfigParser()
    config.read('setup.cfg')
    conf = config['potypo']

    chunker_list = []
    for chunker in conf['chunkers'].strip().split(","):
        if "." in chunker:
            components = chunker.rsplit('.', 1)
            mod = __import__(components[0], fromlist=[components[1]])
            class_object = getattr(mod, components[1])
        else:
            class_object = getattr(chunkers, chunker)

        chunker_list.append(class_object)

    filter_list = []
    for f in conf['filters'].strip().split(","):
        if "." in f:
            components = f.rsplit('.', 1)
            mod = __import__(components[0], fromlist=[components[1]])
            class_object = getattr(mod, components[1])
        else:
            class_object = getattr(filters, f)

        filter_list.append(class_object)

    if 'phrases' in conf:
        phrases = conf['phrases'].strip().split('\n')
        chunker_list.append(chunkers.make_PhraseChunker(phrases))

    if 'edgecase_words' in conf:
        words = conf['edgecase_words'].strip().split('\n')
        filter_list.append(filters.make_EdgecaseFilter(words))

    def errmsg(path, linenum, word):
        print("ERROR: {}:{}: {}".format(path, linenum, word))

    # `checks` contains one Check object for every .po file
    checks = []

    for root, dirs, files in os.walk(conf['locales_dir']):
        for f in files:
            if f.endswith(".po"):
                try:
                    checks.append(
                        Check(os.path.join(root, f), conf['wl_dir'],
                              chunker_list, filter_list))
                except errors.DictNotFoundError as err:
                    print(
                        err,
                        "Potypo will not check for spelling errors in this language."
                    )

    en_wordlist = Check.get_wordlist(conf['default_language'], conf['wl_dir'],
                                     conf['locales_dir'])
    en_dict = DictWithPWL(conf['default_language'], pwl=en_wordlist)
    en_ckr = SpellChecker(en_dict, chunkers=chunker_list, filters=filter_list)

    fail = False  # used for tracking whether failing errors occurred
    for c in checks:
        print("Checking Errors in file", c.popath, "for lang", c.lang)
        for entry in c.po:
            if entry.obsolete:
                continue

            en_ckr.set_text(entry.msgid)
            for err in en_ckr:
                fail = True
                path = os.path.relpath(c.popath,
                                       start=config['potypo']['locales_dir'])
                errmsg(path, entry.linenum, err.word)

            c.checker.set_text(entry.msgstr)
            for err in c.checker:
                if c.lang not in conf['no_fail']:
                    fail = True
                path = os.path.relpath(c.popath,
                                       start=config['potypo']['locales_dir'])
                errmsg(path, entry.linenum, err.word)

    print("Spell-checking done.")

    if fail:
        sys.exit(1)
    sys.exit(0)
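
# A plausible [potypo] section for the setup.cfg read by main() above. The
# option names come from the code; the values are illustrative assumptions
# (the dotted names exercise the dynamic-import branch and do exist in
# pyenchant):
#
#   [potypo]
#   default_language = en_US
#   locales_dir = locale
#   wl_dir = wordlists
#   chunkers = enchant.tokenize.HTMLChunker
#   filters = enchant.tokenize.URLFilter
#   no_fail = ja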
Example #16
        return 0
    return math.exp(score)


if __name__ == '__main__':
    if '-h' in sys.argv or len(sys.argv) != 2:
        print('cat text | python scripts/correct_spelling.py path/to/lm')
        sys.exit()

    # load LM
    print('Loading lm', file=sys.stderr)
    lm = load_lm(sys.argv[1])
    unk = lm.score('<unk>', bos=False, eos=False)
    print('done', file=sys.stderr)

    # check each line
    for line in sys.stdin:
        chkr = SpellChecker('en', line.strip())
        # for each error, replace with the suggestion with the highest ll
        for err in chkr:
            best_option = None
            best_score = float('-inf')
            for sug in err.suggest():
                score = get_weight(sug)
                if score > best_score:
                    best_score = score
                    best_option = sug
            if best_option is not None:
                err.replace(best_option)
        print(chkr.get_text())
Example #17
 def __init__(self, in_fname, out_fname):
     self.out_fname = out_fname
     self.load(in_fname)
     self._spell_checker = SpellChecker('en_US')
     self._wordnet = WordNetLemmatizer()
     self.rule_corrector = Rule()
Example #18
 def __init__(self, language=config.LANGUAGE):
     self.checker = SpellChecker(language)
Example #19
for line in open("all-lan.json"):
    # parse the JSON record on this line and extract the fields we need
    hjson = json.loads(line)
    ip = hjson['ip']
    body = hjson['text']

    # chkr = SpellChecker("en_US")

    # pwl = enchant.request_pwl_dict("mywords.txt")
    # d2 = enchant.DictWithPWL("en_US","mywords.txt")

    d2 = enchant.DictWithPWL("en_US", "mywords.txt")
    chkr = SpellChecker(d2)

    chkr.set_text(body)
    for err in chkr:
        # newwords.append(err.word)
        body = body.replace(err.word, "")

    body = re.sub(' +', ' ', body)
    body = body.strip()

    if len(body) < 3:
        continue
    else:
        wjson = generatejson(ip, body)
        Outfile.write(wjson)
        Outfile.flush()
Example #20
def get_spelling_errors(text):
    return " ".join([err.word for err in SpellChecker("en_US", text)]) or ""
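# Illustrative call (not from the original source):
#   get_spelling_errors("Ths sentense has a few erors")  # -> "Ths sentense erors"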

Example #21
import wx

from enchant.checker import SpellChecker
from enchant.checker.wxSpellCheckerDialog import wxSpellCheckerDialog

# Retrieve the text to be checked
text = "this is some smple text with a few erors in it"
print("[INITIAL TEXT:]", text)

# Need to have an App before any windows will be shown
app = wx.App(False)  # the original used wx.PySimpleApp, removed in wxPython 4

# Construct the dialog, and the SpellChecker it is to use
dlg = wxSpellCheckerDialog(None)
chkr = SpellChecker("en_US", text)
dlg.SetSpellChecker(chkr)

# Display the dialog, allowing user interaction
if dlg.ShowModal() == wx.ID_OK:
    # Checking completed successfully
    # Retrieve the modified text
    print("[FINAL TEXT:]", chkr.get_text())
else:
    # Checking was cancelled
    print("[CHECKING CANCELLED]")
Example #22
# -*- encoding: utf-8 -*-
import freeling
import os
from enchant import DictWithPWL
from enchant.checker import SpellChecker
from difflib import get_close_matches, SequenceMatcher

DATA = "/usr/local/share/freeling/"
LANG = "es"

assert os.path.getsize('../utilities/es-lat') > 0
my_dict = DictWithPWL('es', '../utilities/es-lat')
assert my_dict.provider.name == 'aspell'
chkr = SpellChecker(my_dict)


class Analyzer:
    def __init__(self):

        freeling.util_init_locale("default")

        # Create options set for maco analyzer
        op = freeling.maco_options(LANG)
        op.PunctuationFile = DATA + "common/punct.dat"
        op.DictionaryFile = DATA + LANG + "/es-ar/dicc.src"
        op.AffixFile = DATA + LANG + "/afixos.dat"
        op.LocutionsFile = DATA + LANG + "/locucions.dat"
        op.NPdataFile = DATA + LANG + "/np.dat"
        op.QuantitiesFile = DATA + LANG + "/quantities.dat"
        op.ProbabilityFile = DATA + LANG + "/probabilitats.dat"
Example #23
    def static_analysis(self, path):
        """
        Perform static analysis of the notebook.
        Read the notebook and check that there is no output and that the links
        in the markdown cells are not broken.
        Args:
            path (string): Name of notebook.
        Return:
            boolean: True if static analysis succeeded, otherwise False.
        """

        nb = nbformat.read(path, nbformat.current_nbformat)

        #######################
        # Check that the notebook does not contain output from code cells
        # (should not be in the repository, but well...).
        #######################
        no_unexpected_output = True

        # Check that the cell dictionary has an 'outputs' key and that it is
        # empty, relies on Python using short circuit evaluation so that we
        # don't get KeyError when retrieving the 'outputs' entry.
        cells_with_output = [
            c.source for c in nb.cells if 'outputs' in c and c.outputs
        ]
        if cells_with_output:
            no_unexpected_output = False
            print(
                'Cells with unexpected output:\n_____________________________')
            for cell in cells_with_output:
                print(cell + '\n---')
        else:
            print('no unexpected output')

        #######################
        # Check that all the links in the markdown cells are valid/accessible.
        #######################
        no_broken_links = True

        cells_and_broken_links = []
        for c in nb.cells:
            if c.cell_type == 'markdown':
                html_tree = document_fromstring(markdown.markdown(c.source))
                broken_links = []
                #iterlinks() returns tuples of the form (element, attribute, link, pos)
                for document_link in html_tree.iterlinks():
                    try:
                        if 'http' not in document_link[2]:  # Local file.
                            url = 'file://' + os.path.abspath(document_link[2])
                        else:  # Remote file.
                            url = document_link[2]
                        urlopen(url)
                    except URLError:
                        broken_links.append(url)
                if broken_links:
                    cells_and_broken_links.append((broken_links, c.source))
        if cells_and_broken_links:
            no_broken_links = False
            print('Cells with broken links:\n________________________')
            for links, cell in cells_and_broken_links:
                print(cell + '\n')
                print('\tBroken links:')
                print('\t' + '\n\t'.join(links) + '\n---')
        else:
            print('no broken links')

        #######################
        # Spell check all markdown cells and comments in code cells using the pyenchant spell checker.
        #######################
        no_spelling_mistakes = True
        simpleitk_notebooks_dictionary = DictWithPWL(
            'en_US',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'additional_dictionary.txt'))
        spell_checker = SpellChecker(simpleitk_notebooks_dictionary,
                                     filters=[EmailFilter, URLFilter])
        cells_and_spelling_mistakes = []
        for c in nb.cells:
            spelling_mistakes = []
            if c.cell_type == 'markdown':
                # Get the text as a string from the html without the markup which is replaced by space.
                spell_checker.set_text(' '.join(
                    etree.XPath('//text()')(document_fromstring(
                        markdown.markdown(c.source)))))
            elif c.cell_type == 'code':
                # Get all the comments and concatenate them into a single string separated by newlines.
                comment_lines = re.findall('#+.*', c.source)
                spell_checker.set_text('\n'.join(comment_lines))
            for error in spell_checker:
                error_message = 'error: ' + '\'' + error.word + '\', ' + 'suggestions: ' + str(
                    spell_checker.suggest())
                spelling_mistakes.append(error_message)
            if spelling_mistakes:
                cells_and_spelling_mistakes.append(
                    (spelling_mistakes, c.source))
        if cells_and_spelling_mistakes:
            no_spelling_mistakes = False
            print('Cells with spelling mistakes:\n________________________')
            for misspelled_words, cell in cells_and_spelling_mistakes:
                print(cell + '\n')
                print('\tMisspelled words and suggestions:')
                print('\t' + '\n\t'.join(misspelled_words) + '\n---')
        else:
            print('no spelling mistakes')

        return (no_unexpected_output and no_broken_links
                and no_spelling_mistakes)
from enchant import DictWithPWL
from enchant.checker import SpellChecker

my_dict = DictWithPWL("en_US", "myDict.txt")
my_checker = SpellChecker(my_dict)
with open('test_copy.txt', 'r', encoding='utf-8-sig') as f:
    f_contents = f.read()  # utf-8-sig transparently strips any BOM
    my_checker.set_text(f_contents)
    e = 0
    for error in my_checker:
        print("ERROR:", error.word)
        e = e + 1
    print('No. of errors: ', e)
import enchant
import wx
from enchant.checker import SpellChecker
from enchant.checker.wxSpellCheckerDialog import wxSpellCheckerDialog
from enchant.checker.CmdLineChecker import CmdLineChecker


a = "Cats are animalss. " \
    "They are violenttt."
chkr = enchant.checker.SpellChecker("en_US")
chkr.set_text(a)
for err in chkr:
    print(err.word)
    sug = err.suggest()[0]
    err.replace(sug)
 def set_language(self, lang):
     dict = enchant.DictWithPWL(lang, "data/dict/enchant.txt")
     self.checker = SpellChecker(dict, chunkers=(HTMLChunker, ))
Example #26
import enchant
from enchant.checker import SpellChecker
# A single word:
dictionary = enchant.Dict("ru_RU")
# print(dictionary.check("раpчет"))
print(dictionary.check("УТС"))

# With a custom word list
pwl = enchant.request_pwl_dict("NewWords.txt")
dictionary = enchant.DictWithPWL("ru_RU", "NewWords.txt")
print(dictionary.check("УТС"))

# Replacement suggestions:
# print(dictionary.suggest(u"раpчет"))

# Checking sentences
checker = SpellChecker("ru_RU")
checker.set_text("рассчет себестоимости")
print([i.word for i in checker])

# With filters
from enchant.tokenize import WikiWordFilter
checker = SpellChecker("ru_RU", filters=[WikiWordFilter])
checker.set_text("Разработка по УТС")
print([i.word for i in checker])
Example #27
def correct(token):
    suggestions = SpellChecker('en_US').suggest(token)
    if suggestions:
        return suggestions[0]
    else:
        return token
Example #28
    def analyze(self, filename):
        """
        DESCRIPTION:
            Spell-check an XHTML document and build a status report.

        PARAMETERS:
            filename -- path of the XHTML file to check

        RETURN:
            dict with the spellcheck status, error list and report details
        """

        typo = None
        report = list()
        supported = True

        if self.language is None:
            print("[ERROR] unknown language, specify with --language")
            sys.exit(1)

        # set preferred spellcheck method
        #b = Broker()
        #b.set_ordering("nb","myspell,aspell")

        # create a SpellChecker based on book's language
        try:
            checker = SpellChecker(self.language)
        except DictNotFoundError as err:
            print("language not supported by installed dictionaries")
            supported = False

        print("(lang: " + self.language + ")")

        # check if document is valid XHTML
        if len(self.markup_status["errors"]) > 0 or not supported:
            invalid = "N/A"
            self.spelling_status = {
                "title": "Spellcheck",
                "errors": list(),
                "status": invalid,
                "details": None
            }

            return self.spelling_status

        # get a list of words
        parser = etree.XMLParser()
        xhtml = self.file_operations.read_file(filename)
        tree = etree.parse(io.StringIO(xhtml), parser)  # io.StringIO replaces the Python 2 StringIO module
        root = tree.getroot()

        # extract plain text
        words = root.xpath("//text()")

        # convert to text
        text = ''.join(words)

        # perform a spellcheck using chosen tool
        try:

            # set text to check
            checker.set_text(text)

            for err in checker:
                typo = err.word
                report.append(typo)

            # make list distinct
            report_dist = list(set(report))
            report_sorted = sorted(report_dist)

            # add to report
            if len(report_sorted) == 0:
                status = "ok"
                detailed_report = None
            else:
                status = "error"
                filename_only = self.file_operations.filename_only_final(
                    filename)
                detailed_report = "spellcheck/" + filename_only

            self.spelling_status = {
                "title": "Spellcheck",
                "errors": report_sorted,
                "status": status,
                "details": detailed_report
            }

            if len(self.spelling_status["errors"]) > 0:
                self.build_detailed_report(filename)

        except IOError as err:
            print(err)

        # print("done.")
        return self.spelling_status
Example #29
# `str` below is a string variable assigned earlier in the original file
# (it shadows the built-in str)
print(str.upper())  # convert all lowercase letters to uppercase
print(str.lower())  # convert all uppercase letters to lowercase
print(str.capitalize())  # capitalize the first letter, lowercase the rest
print(str.title())  # capitalize the first letter of each word, lowercase the rest

# Spelling correction

from enchant.checker import SpellChecker
chkr = SpellChecker("en_US")
chkr.set_text("Many peope likee to watch In the Name of People.")
for err in chkr:
    print("ERROR:", err.word)
# Output:
# ERROR: peope
# ERROR: likee

# Lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
print(wnl.lemmatize('countries'))
Example #30
from enchant.checker import SpellChecker
from enchant.tokenize import EmailFilter, URLFilter
spell = SpellChecker("en_US", filters=[EmailFilter, URLFilter])
filepath = input('Enter file location: ')
fileopen = open(filepath)
#print(fileopen.read())
content = fileopen.read()
spell.set_text(content)
for err in spell:
    print("Misspelled word is: " + err.word + ' at position ' +
          str(err.wordpos))