Example No. 1
 def __init__(self, dictionary_path):
     from spellchecker import SpellChecker
     self.spell = SpellChecker(local_dictionary=dictionary_path)
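For context, `local_dictionary` expects a JSON word-frequency file. A minimal usage sketch of a checker built this way (the path and sample words below are illustrative, not from the original project):

from spellchecker import SpellChecker

spell = SpellChecker(local_dictionary='my_words.json')  # hypothetical frequency file
print(spell.unknown(['acommodate', 'receive']))  # words missing from the dictionary
print(spell.correction('acommodate'))            # single most likely fix (may be None if nothing matches)
print(spell.candidates('acommodate'))            # plausible fixes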
Example No. 2
def predict_for_sentence(sentences, wordList):
    # Meant to serve as a backend API endpoint.
    # Assumes the surrounding module provides `time`, `SpellChecker` from
    # pyspellchecker, and `GecBERTModel` from the GECToR project.
    model_path = ['E://pycharm//gector//models//pretrained_gectors//xlnet_0_gector.th',
                  'E://pycharm//gector//models//pretrained_gectors//bert_0_gector.th',
                  'E://pycharm//gector//models//pretrained_gectors//roberta_1_gector.th'
                  ]
    vocab_path = 'E://pycharm//gector//data//output_vocabulary'
    model = GecBERTModel(vocab_path=vocab_path,
                         model_paths=model_path,
                         max_len=50, min_len=3,
                         iterations=5,
                         min_error_probability=0.0,
                         min_probability=0.0,
                         lowercase_tokens=0,
                         model_name='xlnet',
                         special_tokens_fix=0,
                         log=False,
                         confidence=0,
                         is_ensemble=1,
                         )
    spell = SpellChecker()
    for word in wordList:
        spell.word_frequency.add(word)
    error_labels = set()
    batch = []
    notes = set()
    correctList = []
    for sentence in sentences:
        tokens = sentence.split()
        batch.append(tokens)
    st = time.time()
    preds, cnt, labels, dics = model.handle_batch(batch, spell)
    for i in labels:
        error_labels.add(i)
    ed = time.time()
    for idx in range(len(preds)):
        corr = " ".join(preds[idx])
        print("after correct: ", corr)
        print("correct errors: ", cnt)
        correctList.append(corr)
        print(f'inference time: {ed - st}')
    for i in error_labels:
        if i.startswith('$REPLACE'):
            notes.add("替换")  # "replacement"
        elif i.startswith('$DELETE'):
            notes.add("删除")  # "deletion"
        elif i.startswith('$APPEND'):
            notes.add("插入")  # "insertion"
        elif i.startswith('$TRANSFORM'):
            label = i.split('_', 1)[1]
            if label.startswith('VERB'):
                notes.add("动词形式有误")  # "incorrect verb form"
            elif label.startswith('AGREEMENT'):
                notes.add("请注意单复数问题")  # "mind singular/plural agreement"
            elif label.startswith('CASE'):
                notes.add("注意大小写")  # "mind capitalization"
    if 'Spell' in dics:
        notes.add('拼写')  # "spelling"
    for note in notes:
        print(note)
    if 'Spell' in dics:
        dics['Spell'] = list(set(dics['Spell']))  # de-duplicate spelling corrections
    print(dics)
    ed1 = time.time()
    print(f'total time: {ed1 - st}')
    return correctList, list(notes), dics, cnt
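A hedged sketch of how this helper might be called once the GECToR checkpoints and vocabulary referenced above are in place (the sentences and extra words are illustrative):

sentences = ["She go to school yesterday .", "I has an new phone ."]
extra_words = ["gector", "xlnet"]  # domain terms the spell checker should accept
corrected, notes, spell_fixes, edit_count = predict_for_sentence(sentences, extra_words)
for before, after in zip(sentences, corrected):
    print(before, "->", after)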
Example No. 3
 def __init__(self):
     self.spell = SpellChecker(language=None, case_sensitive=False)
     self.spell.word_frequency.load_text_file('./varietal_dictionary.txt')

 def specll_check(self, text):
     spell = SpellChecker()
     text = spell.split_words(text)

     return " ".join([spell.correction(word) for word in text])
Example No. 5
def runSpellChecker(word):
    spell = SpellChecker()
    if len(spell.unknown([word])) >= 1:
        return spell.correction(word)
    return word
Example No. 6
from spellchecker import SpellChecker
from scrap import update_vault_list
from ocr import OcrCheck
from db_operations import relic_from_screen_overwrite
import numpy as np
import cv2
import requests

# Initialize ##################################################################################

v_relic_list = update_vault_list()

# Define reference file for Spellchecking
spell_check = SpellChecker(distance=1)
spell_check.word_frequency.load_text_file('ref/other_ref/ref_words.txt')

# Define reference files to use for Warframe data
Era_file = 'ref/other_ref/ref_era.txt'
Lith_file = 'ref/other_ref/ref_lith.txt'
Meso_file = 'ref/other_ref/ref_meso.txt'
Neo_file = 'ref/other_ref/ref_neo.txt'
Axi_file = 'ref/other_ref/ref_axi.txt'
Quality_file = 'ref/other_ref/ref_quality.txt'
Ressources_file = 'ref/other_ref/ref_ressources.txt'


# Parse reference files into lists
def parse_ref_files(file):
    ref_list = []
    with open(file, "r") as fileHandler:
        for line in fileHandler:
            # Assumed completion: collect one reference entry per line.
            ref_list.append(line.strip())
    return ref_list


def get_text_layer(original_image):
    # Assumes the surrounding module also provides pytesseract, imutils'
    # non_max_suppression, a `predictions()` helper and an `args` dict with
    # "width", "height" and "east" keys.
    alphabets = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    spell = SpellChecker()

    a, b, c = original_image.shape
    blank_img = np.ones([a, b, c], dtype=np.uint8)
    blank_img.fill(255)

    a, b, c = original_image.shape
    blank_img2 = np.ones([a, b, c], dtype=np.uint8)
    blank_img2.fill(255)

    # Save a copy of the original image and its shape
    orig = original_image.copy()
    (origH, origW) = original_image.shape[:2]

    # Set the new height and width (default 320) from the args dictionary.
    (newW, newH) = (args["width"], args["height"])

    # Calculate the ratio between the original and resized image for both height and width.
    # This ratio is used to translate bounding-box locations back onto the original image.
    rW = origW / float(newW)
    rH = origH / float(newH)

    # resize the original image to new dimensions
    image = cv2.resize(original_image, (newW, newH))
    (H, W) = image.shape[:2]

    # construct a blob from the image to forward pass it to EAST model
    blob = cv2.dnn.blobFromImage(image,
                                 1.0, (W, H), (123.68, 116.78, 103.94),
                                 swapRB=True,
                                 crop=False)

    # load the pre-trained EAST model for text detection
    net = cv2.dnn.readNet(args["east"])

    # The following two output layers need to be pulled from the EAST model.
    layerNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]

    # Forward pass the blob from the image to get the desired output layers
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    # Find predictions and apply non-maxima suppression
    (boxes, confidence_val) = predictions(scores, geometry)
    boxes = non_max_suppression(np.array(boxes), probs=confidence_val)

    # initialize the list of results
    results = []

    # loop over the bounding boxes to find the coordinate of bounding boxes
    for (startX, startY, endX, endY) in boxes:
        # scale the coordinates based on the respective ratios in order to reflect bounding box on the original image
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # extract the region of interest
        r = orig[startY:endY, startX:endX]

        # display regions of interests on blank image
        blank_img[startY:endY, startX:endX] = orig[startY:endY, startX:endX]

        # Tesseract configuration used to convert the image crop to a string.
        configuration = ("-l eng --oem 1 --psm 8")
        # TEXT RECOGNITION
        # Recognize the text inside the bounding-box crop.
        text = pytesseract.image_to_string(r, config=configuration)

        # append bbox coordinate and associated text to the list of results
        results.append(((startX, startY, endX, endY), text))

    # Display the image with bounding box and recognized text
    orig_image = orig.copy()

    # cv2.imshow("blank_img", blank_img)

    text_list = []
    # Loop over the results and display them on the image
    for ((start_X, start_Y, end_X, end_Y), text) in results:
        # display the text detected by Tesseract
        misspelled_word = ''.join(filter(alphabets.__contains__, text))
        final_word = spell.correction(misspelled_word)
        # print("{}\n".format(text))
        text_list.append(final_word)

        # Displaying text
        text = "".join([x if ord(x) < 128 else "" for x in final_word]).strip()
        cv2.rectangle(orig_image, (start_X, start_Y), (end_X, end_Y),
                      (0, 0, 255), 1)
        cv2.putText(orig_image, text, (start_X, start_Y - 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 1)

        cv2.putText(blank_img2, text, (start_X, start_Y - 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 1)

    return text_list, orig_image, blank_img, blank_img2
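A hedged usage sketch, assuming `args` carries "width", "height" and "east" (path to a frozen EAST detector) and that the `predictions` and `non_max_suppression` helpers noted above exist:

img = cv2.imread('relic_reward_screen.png')  # hypothetical screenshot
words, annotated, roi_layer, text_layer = get_text_layer(img)
print(words)                             # spell-corrected words found on screen
cv2.imwrite('annotated.png', annotated)  # image with boxes and recognized text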
Example No. 8
    def load_spell_check(self):
        if self.spell_check is not None:
            return

        self.spell_check = SpellChecker()
Example No. 9
def expand_query(query):
    ''' Attempts to expand the given query by using synonyms from WordNet. As
    a consequence of this process, the query is also tokenized and lemmatized.
    '''
    spell = SpellChecker(local_dictionary='chatbot/nlp/statics/no_50k.json')

    # Tokenize, tag and filter query using Spacy
    tokens = [
        # Store both token text and POS tag
        (token.text, token.pos_) for token in nb(query)
        # Filter away punctuation.
        if token.text not in string.punctuation
    ]

    # Add possible spelling corrections, without duplicates.
    # We also want to keep the original token, since the detected misspelling
    # might be intentional - power to the user!
    tokens += [
        (spell.correction(token[0]), token[1]) for token in tokens
        if not spell.correction(token[0]) in [token[0] for token in tokens]
    ]

    # Lemmatize tokens
    tokens = [
        # Store tuples of lemmatized tokens and their corresponding POS tags.
        (lemmatize(token[0], token[1])[0], token[0]) for token in tokens
    ]

    # Filter away stopwords as we do not want to expand them.
    tokens = [token for token in tokens if token not in get_stopwords()]

    # Store synonyms in a set, so duplicates are not added multiple times.
    synonyms = set()

    # The tokens in the expanded query.
    result = []

    for token in tokens:
        # Convert POS tags from Spacy to WordNet.
        pos = getattr(wn, token[1], None)

        # Find all synsets for the word, using the Norwegian language.
        synsets = wn.synsets(token[0], lang='nob', pos=pos)

        # Get a custom synset wrapper.
        custom_synsets = SynsetWrapper.get_instance()

        # Get the synset for this token.
        custom_synset = custom_synsets.get_synset(token[0])

        if custom_synset:
            # Remove the token itself to avoid duplication.
            custom_synset.remove(token[0])
            synonyms.update(custom_synset)

        if synsets:
            for synset in synsets:
                # Find all lemmas in the synset.
                for name in synset.lemma_names(lang='nob'):
                    # Some lemmas contain underscores, which we remove.
                    synonyms.add(name.replace('_', ' '))

            # If we found synonyms, we only add the synonyms. This is because
            # the original word is already included in the synset, so this
            # avoids adding it to the result list twice.
            continue

        # Add the original token to the full query.
        result.append(token[0])

    # Add custom synset to the query
    result += synonyms

    return ' '.join(result)
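A hedged usage sketch; the `nb` spaCy pipeline, `lemmatize`, `get_stopwords`, `SynsetWrapper`, and the `no_50k.json` frequency file are all assumed to be provided by the surrounding module:

query = "anbefal en bok om fotball"  # illustrative Norwegian query
print(expand_query(query))           # original tokens plus synonyms from WordNet and the custom synsets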
Example No. 10
"""

import string
import typing as t
from datetime import date
from enum import Enum, IntEnum
from pathlib import Path
from typing import List, Optional

from pyaml import yaml
from pydantic import BaseModel, EmailStr, HttpUrl, ValidationError
from pydantic.color import Color
from spellchecker import SpellChecker
from typing_extensions import TypedDict

GLOBAL_CHECKER = SpellChecker()


class VersionNumber(IntEnum):
    """
    Contains the different possible versions of `PortfolioEntry` items.
    """

    version_0 = 0


class EntrySize(str, Enum):
    """
    Describes the scope of a portfolio item.
    Note, this has a direct impact on how the piece of media is displayed to the reader
    when rendered in the portfolio. `large` items will be visually larger than smaller-sized items.
import numpy as np
import pandas as pd
import re
import demoji
from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
from city_state_dict import city_to_state_dict
demoji.download_codes()

# Initializes the spell checker, tokenizer and lemmatizer.
check = SpellChecker()
tokenizer = RegexpTokenizer(r'\w+')
lemma = WordNetLemmatizer()

# Create a set of stopwords
stop_words = set(stopwords.words('english'))

# Reads in the data.
data = pd.read_json('data.json')


########################### FUNCTIONS ###################################
def correct_text(text):
    # text needs to be a list of clean word tokens without other characters.
    misspelled = check.unknown(text)
    for word in misspelled:
        text[text.index(word)] = check.correction(word)
    return list(set(text) - misspelled)
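Note that the return value is `list(set(text) - misspelled)`, so the output is de-duplicated and unordered. A short hedged sketch of a call (the sample sentence is illustrative):

tokens = tokenizer.tokenize("thiss restaraunt was fine".lower())
print(correct_text(tokens))  # e.g. a unique, unordered list of corrected tokens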
Example No. 12
 def transform(self, X: dt.Frame):
     from spellchecker import SpellChecker
     self.spell = SpellChecker()
     return X.to_pandas().astype(str).iloc[:, 0].apply(lambda x: self.correction(x))
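The `self.correction` helper is not shown in this snippet; a minimal hypothetical sketch of what it could look like, reusing the checker created in `transform`:

 def correction(self, text):
     # Hypothetical helper: correct each whitespace-separated token,
     # keeping the original token when no correction is found.
     return " ".join(self.spell.correction(tok) or tok for tok in text.split())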
Example No. 13
from autocorrect import Speller
from nltk import word_tokenize
import unidecode
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from emoticons_list import EMOTICONS
from emoticons_list import EMO_UNICODE
from string import punctuation
from nltk.corpus import stopwords
import spacy
import gensim
from collections import Counter
from contraction import CONTRACTION_MAP
from progress.bar import Bar
import re
from spellchecker import SpellChecker  # needed for the SpellChecker instance below

spell_corrector = SpellChecker()
# expanding contractions
contraction_mapping = CONTRACTION_MAP
# take all key values from contraction_mapping
contractions_pattern = re.compile('({})'.format('|'.join(
    contraction_mapping.keys())),
                                  flags=re.IGNORECASE | re.DOTALL)

# initialize porter stemmer object
stemmer = PorterStemmer()
# initialize lemmatizer object
lemma = WordNetLemmatizer()
# convert emo_unicode to unicode_emo
UNICODE_EMO = {v: k for k, v in EMO_UNICODE.items()}
# list of stopwords from nltk
stopwords_nltk = list(stopwords.words('english'))
Example No. 14
 def test_words(self):
     ''' test the parsing of words '''
     spell = SpellChecker(language='en')
     res = ['this', 'is', 'a', 'test', 'of', 'this']
     self.assertEqual(spell.split_words('This is a test of this'), res)
Example No. 15
artikels = {
    'NM': ['der', 'ein'],
    'NF': ['die', 'eine'],
    'NN': ['das', 'ein'],
    'AM': ['den', 'einen'],
    'AF': ['die', 'eine'],
    'AN': ['das', 'ein'],
    'DM': ['dem', 'einem'],
    'DF': ['der', 'einer'],
    'DN': ['dem', 'einem']
}

spell = SpellChecker(language='de')


def check_spell(doc):
    # SpellChecker: collect the tokens and flag those not in the German dictionary
    words = [token.text for token in doc]
    misspelled = spell.unknown(words)

    errors = []

    if misspelled:
        for misspell in misspelled:
            correct = spell.correction(misspell)
            tip = spell.candidates(misspell)
            error = {
Example No. 16
from nltk import word_tokenize
from nltk.corpus import stopwords
import pandas as pn
import numpy as np
import nltk
from nltk.stem.snowball import SpanishStemmer
from spellchecker import SpellChecker
import spacy
import time

nlp = spacy.load('es_core_news_sm')
stoplist = stopwords.words('spanish')
spanishStem=SpanishStemmer('spanish')
spell = SpellChecker(language='es')

def Lematizar(preguntas):  # Receives the matrix of questions/answers
    t = time.time()
    for i in range(preguntas.shape[0]):
        oracion = ''
        for token in nlp(preguntas[i][1]):
            oracion = oracion + token.lemma_ + ' '
            # print(token.text, token.lemma_, token.pos_)
        preguntas[i][1] = oracion
    print('Elapsed in lematizar: ', (time.time() - t))
    return (preguntas)

def LematizarOracion(sentence):  # Receives a string
    oracion = ''
    for token in nlp(sentence):
        oracion = oracion + token.lemma_ + ' '
    return (oracion)
Example No. 17
a = len(pd_speeches.speaker.unique().tolist())

# Assign the most similar correct name to each speaker in the scanned transcripts (takes a bit):
mask = ((pd_speeches.session < 42) | (pd_speeches.session == 92)) & (pd_speeches.wp == 15)
pd_speeches.loc[mask, 'speaker'] = pd_speeches.loc[mask, 'speaker'].apply(
    lambda x: difflib.get_close_matches(x, names_right, n=1)[0]
    if difflib.get_close_matches(x, names_right, n=1) else x)

b = len(pd_speeches.speaker.unique().tolist())

print('previous n of distinct speakers: {}; after managing typos: {}'.format(a,b))

#discontinued; not that easy to solve
if False:
    
    #spellchecker
    from spellchecker import SpellChecker
    import string
    german = SpellChecker(language='de')

    # remove punct from text; create list of words and find those misspelled
    t_np = t.translate(str.maketrans('', '', string.punctuation))
    w = [w for w in t_np.split(' ') if w != '']
    misspelled = german.unknown(w)

    correct = []
    misp = []
    for word in misspelled:
        # Get the one `most likely` answer
        print(german.correction(word), word)
        correct.append(german.correction(word))
        misp.append(word)

    # implement algorithm that substitutes word (something like this)
Example No. 18
 def __init__(self, parser, indexer, model=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker()
     self._model = model
     self.spell = SpellChecker()
Example No. 19
def is_spelling_correct(word_list):
	ret = True
	if len(SpellChecker().unknown(word_list)) > 0: ret = False
	if not ret: g_vars.get('logger').warning('Spell Check failed for {}'.format(word_list))
	return ret
Example No. 20
from nltk.corpus import wordnet
from spellchecker import SpellChecker

dictionary = SpellChecker()


def get_suggestions(word):
    candidates = dictionary.candidates(word)
    candidates = [w for w in candidates if wordnet.synsets(w)]
    return candidates


def web_get_records(word):
    resp = ""
    syn = wordnet.synsets(word)
    if not syn:
        return None

    dform = {
        "n": "noun",
        "v": "verb",
        "a": "adjective",
        "r": "adverb",
        "s": "adjective satellite",
    }
    ctr1 = 1
    ctr2 = 97
    for i in syn[:10]:
        ctr2 = 97
        definition, examples, form = i.definition(), i.examples(), i.pos()
        resp = resp + str(ctr1) + "." + "\n"
Example No. 21
from django.contrib import messages
from django.utils import timezone
from django.db.models import Sum
from .models import Grocery_List
from colorama import init, Fore
from operator import itemgetter
import numpy as np
import logging
import math
import os
import re

# ______INITIALIZATIONS________
init(autoreset=True)  # Colorama for printing colored text in the terminal
dirpath = os.getcwd()  # path of the project
spellchecker = SpellChecker(
    local_dictionary=dirpath + "\\INEZ\\static\\INEZ\\json\\spellings.json",
    case_sensitive=True)  # Spellchecker

# loading the ~14000 products from cache if possible, else loading them from
# file and caching them
if cache.get("products") is not None:
    products = cache.get("products")
else:
    products = {}
    with open(os.getcwd() + "\\INEZ\\products.txt", encoding="utf-8-sig") as f:
        for line in f:
            values = line.split("|")
            products[values[0]] = [
                values[1].replace(",", "."), values[2].replace("\n", "")
            ]
    print(Fore.GREEN + 'Loaded %s products\n' % len(products))
    cache.set("products", products, (60 * 60))
def main():
    home_directory = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))
    with open(
            os.path.join(home_directory,
                         'extracted_ocr\\NY_Mercantile_Lib_1825.html'),
            'r') as f:
        contents = f.read()
        file = html.unescape(bs(contents, 'lxml'))
        catalog = []
        final_catalog = []
        sum_of_heights = 0
        #Counting the words that go into the denominator for average word height
        counted_words = 0
        for index, page in enumerate(file.find_all('page')):
            words = page.find_all('word')
            if len(words) == 0:
                continue
            words.sort(key=lambda x: float(x.get('xmin')))
            # First we sorted all of the words on the page from furthest left to furthest right.
            # Now we sort them all again by highest to lowest. Assuming one column of text per page,
            # this does an excellent job of putting all of the words in normal reading order.
            # Will need a different method for catalogs with two columns per page.
            words.sort(key=lambda x: float(x.get('ymin')))
            old_line_y = float(words[0].get('ymin'))
            line = []
            for word in words:
                # Ignoring empty "words." Not sure where they're coming from; this started happening
                # when I switched to ASCII encoding from the problematic-for-other-reasons UTF-8
                if word.text == '':
                    continue
                # Encoding error that's pervasive in NY Mercantile 1825
                if word.text == 'â€' or word.text == 'â€':
                    continue
                # Ignoring single lower case letters. These tend to be OCR artifacts that aren't useful
                if re.match("[a-z]{1}$", word.text):
                    continue
                # Ignoring et cetera
                if word.text == '&c':
                    continue
                # Ignoring things in all caps because they're typically headers
                # Even the catalogs where they're not necessarily (e.g., Ladies' Lib
                # of Kalamazoo), the OCR usually makes the all caps in running text
                # into normal text.
                if re.match("[A-Z]{2}", word.text):
                    continue
                # Needed for Lib Co Boston 1830, where a lot of 1s got turned into Is in
                # the shelf number column
                # if word.text == 'I':
                #    continue
                # Ignoring random flecks on the page that get turned into punctuation by OCR
                # Combined with number screen below.
                # if re.match("[_.,*:|'\"\^\-º]$", word.text):
                #continue
                # Ignoring common column headings. "Mo." is a common OCR error for an italicized "No."
                # For NY Mercantile 1825 we need "Vol" to identify contents of multi-volume sets.
                # if re.match("Vol", word.text) or
                if re.match("No.", word.text) or re.match(
                        "Mo.", word.text) or re.match(
                            "Shelf", word.text) or re.match("Size", word.text):
                    continue
                # Ignoring page numbers -- 1, 2, or 3-digit numbers not followed by "nd", "rd", "th", etc.
                # Can also adjust to ignore shelf numbers when needed (e.g., NY Mechanics 1844)
                # And a tweak to ignore OCR-eaten numbers.
                # I swear that not all of this punctuation is in the ASCII code space, but I've seen
                # all of it in the OCR extracted by pdftotext with ASCII7 encoding....
                #if re.match("[0-9 =%_.,*:#|'\"\^\-º•§]{1,4}$", word.text) or re.search("[0-9]{3,4}", word.text):
                if re.match("[ —=%_.,:#|'\"\^\-º•§]{1,4}$", word.text):
                    # if re.match("[0-9]{1,4}$", word.text):
                    continue
                counted_words += 1
                sum_of_heights = sum_of_heights + (float(word.get('ymax')) -
                                                   float(word.get('ymin')))
                line_y = float(word.get('ymin'))
                # We know we're on a new line of text when the ymin increases more than 7 pixels.
                # 7 pixels was selected empirically based on first several catalogs processed.
                # This may be too large of a number for very small-type catalogs.
                # Changed to 8 because found cases in Milwaukee YMA where 7 was too small.
                # Changed to 12 because of non-straight lines in Charleston Lib Co.
                if (line_y - 8) > old_line_y:
                    old_line_y = line_y
                    if line:
                        line.sort(key=lambda x: float(x.get('xmin')))
                        catalog.append(line)
                    line = [word]
                else:
                    line.append(word)
            #Append the last one on the page
            if len(line) > 0:
                line.sort(key=lambda x: float(x.get('xmin')))
                catalog.append(line)
            #Process the page, putting together split lines into single entries
            previous_line_xmin = None
            for entry in catalog:
                # Is it contents for a multi-volume set? We need to flag these for removal, because the TOCs aren't in the HT metadata
                if re.match(r'\s*Vols?\.\s*', entry[0].text) and re.match(
                        r'\s*\d{1,2}(\.|,)\s*', entry[1].text):
                    entry[0].string = 'DELETE_ME'
                # Is the new line indented further than the old line? If so,
                # it needs some special handling.
                # If it's the first line on the page, the question is moot.
                if previous_line_xmin == None:
                    indent = 0
                else:
                    first_real_word = next(
                        (y for y in entry
                         if y.text.rstrip('.,?!').casefold() != "do"), None)
                    if first_real_word:
                        this_line_xmin = float(first_real_word.get('xmin'))
                        indent = previous_line_xmin - this_line_xmin
                    else:
                        continue
                # Allowing 10 pixels of slop to account for skewed scans etc.
                # Most indents are well over 10 pixels; could increase if needed.
                if (indent + 10) < 0 and (entry[0].string.rstrip('.,?!')
                                          == 'do'):
                    # If this line is indented and not continuing previous line,
                    # we want to append to this line everything from
                    # the previous line with an xmin smaller than the xmin of this word.
                    # Since this will carry down all relevant information from the words on the
                    # previous lines, this *should* work even in catalogs with multiple
                    # levels of indents.
                    # 'vocable' because 'word' was already taken in this script
                    # 10 pixels for slop again
                    for vocable in final_catalog[-1]:
                        if (float(vocable.get('xmin')) + 10) < this_line_xmin:
                            entry.append(vocable)
                    # sort it again because we screwed up the sort appending more words to it
                    entry = sorted(entry, key=lambda x: float(x.get('xmin')))
                elif (indent + 10) < 0 and entry[0].string != "DELETE_ME":
                    # If it's indented but the first word isn't "do", then it's continuing
                    # the previous line.
                    # If it's a TOC, don't append it to the previous line. We want to keep the previous
                    # line if it's the title of the series/set and only remove TOCs
                    # Try to reassemble hyphenated words. OCR process ate
                    # the hyphens at the end of lines, so we have to guess
                    # if two words go together or not. We'll assume that if the
                    # first word on the second line is not capitalized and is not
                    # recognized by the spellchecker then it should be concatenated
                    # with the last word on the previous line.
                    last_word_of_carryover = final_catalog[-1][-1].string
                    first_word_of_line = entry[0].string
                    if first_word_of_line and first_word_of_line != "DELETE_ME" and (
                            re.match("[A-Z]", first_word_of_line)
                            or re.match("[0-9 \-.,]+", first_word_of_line) or
                        (first_word_of_line.rstrip('.,?!').casefold()
                         in SpellChecker()
                         and last_word_of_carryover.rstrip('.,?!').casefold()
                         in SpellChecker())):
                        final_catalog[-1] += entry
                        final_catalog_sorted = sorted(
                            final_catalog[-1],
                            key=lambda x: float(x.get('xmin')))
                        previous_line_xmin = float(
                            final_catalog_sorted[0].get('xmin'))
                        entry = None
                    else:
                        final_catalog[-1][-1].string = final_catalog[-1][
                            -1].text + entry[0].text
                        del entry[0]
                        final_catalog[-1] += entry
                        final_catalog_sorted = sorted(
                            final_catalog[-1],
                            key=lambda x: float(x.get('xmin')))
                        previous_line_xmin = float(
                            final_catalog_sorted[0].get('xmin'))
                        entry = None
                if entry and entry[0] != "DELETE_ME":
                    entry_sorted = sorted(entry,
                                          key=lambda x: float(x.get('xmin')))
                    previous_line_xmin = float(entry_sorted[0].get('xmin'))
                    final_catalog.append(entry)
            catalog = []
    average_line_height = sum_of_heights / counted_words
    #average_line_height = sum_of_heights / len(file.find_all('word'))
    with open(
            os.path.join(home_directory,
                         'replication\\NY_Mercantile_Lib_1825.csv'),
            'wb+') as outfile:
        csvwriter = unicodecsv.writer(outfile, encoding='utf-8')
        for item in final_catalog:
            # Write catalog entries out to a CSV, omitting shelf numbers and sizes
            # Also omit headers, to the extent we can identify them by
            # having a line-height more than 20% larger than average
            # (testing this just on 1st word in line)
            if (float(item[0].get('ymax')) - float(item[0].get('ymin')) <
                    average_line_height * 1.2):
                final_entry = ''
                for vocable in item:
                    # Don't want to include shelf numbers etc. in output.
                    # Do want to include "1st", 12th", etc.
                    # Also want to get rid of "do"s now that we're done w/them
                    # Get rid of punctuation that confuses Solr (including commas and periods, which interact badly with the
                    # fuzzy search ~ when they trail a word). This includes "'s" on the end of words.
                    stripped_vocable = re.sub(
                        r'[\+ \- & \| ! , \. ( ) \{ \} \[ \] \^ " ~ \* \? : \\ #”`]',
                        ' ', vocable.text)
                    if stripped_vocable == '':
                        continue
                    #if we filter out numbers here, then can't use them to identify tables of contents
                    #in the next step
                    #if stripped_vocable[0].isalpha() or stripped_vocable[-1] == 'h' or stripped_vocable[-1] == 'd' or stripped_vocable[-1] == 't':
                    stripped_vocable = stripped_vocable.replace("'s", '')
                    #stripped_vocable = stripped_vocable.replace("'", '')
                    if stripped_vocable.casefold(
                    ) != 'do' and stripped_vocable.casefold() != 'ditto':
                        final_entry += ' ' + stripped_vocable
                # Catching TOCs that slipped through the first filter for whatever reason (often missing punctuation)
                if re.match(r'\s*Vols?\s*\d{1,2}', final_entry):
                    continue
                for vocable in final_entry.split():
                    if vocable[0].isdigit() and vocable[-1] != 'h' and vocable[
                            -1] != 'd' and stripped_vocable[-1] != 't':
                        final_entry = re.sub(vocable, '', final_entry)
                final_entry = re.sub(r'Presented(.*)by(.*)$', '', final_entry)
                final_entry = re.sub(r'Gift(.*)of(.*)$', '', final_entry)
                # This is for NY Society 1813 and NY Mercantile 1825. These translation notes break matching every time.
                # final_entry = final_entry.replace("translated from the Latin","")
                final_entry = final_entry.replace("translated from the French",
                                                  "")
                final_entry = final_entry.replace("translated from the German",
                                                  "")
                if final_entry != '':
                    if final_entry.split() and final_entry.split(
                    )[0] and final_entry.split()[0] != "DELETE_ME":
                        csvwriter.writerow([final_entry])
Example No. 23
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.print_usage()
        sys.exit(1)

    if options.language not in ['fr', 'en']:
        print('language should be set to fr or en')
        sys.exit(1)

    notebook = None
    input_fname = args[0]
    with open(input_fname) as finput:
        notebook = json.load(finput)

    spell = SpellChecker(options.language)
    tokenizer = RegexpTokenizer(r'\w+')
    if options.language == 'en':
        dict_fname = os.path.expandvars('$HOME/.spellchecker_en.txt')
    else:
        dict_fname = os.path.expandvars('$HOME/.spellchecker_fr.txt')
    with open(dict_fname, 'a'):
        os.utime(dict_fname, None)
    spell.word_frequency.load_text_file(dict_fname)

    learnt_words = []
    ignored_words = []
    notebook_updated = False

    cells = notebook['cells']
    cells_md = [cell for cell in cells if cell['cell_type'] == 'markdown']
Example No. 24
    def __init__(self, bot):
        self.bot = bot
        self.spell = SpellChecker()
        self.spell.distance = 1

        self.wordfile = pathlib.Path().home() / "wordlist.txt"
        self.spell.word_frequency.load_text_file(str(self.wordfile))

        async def grammar_module(message):
            if message.guild is None or message.guild.name.lower() != "cortex":
                return
            clean_message = message.clean_content.lower()
            # MM: Split so we work with a list instead of a string
            message_split = clean_message.split(" ")

            # BLACKLIST CHANNELS
            blacklist = [
                "news",
                "rpg",
                "events",
                "recommends",
                "politisophy",
                "eyebleach",
                "weeb-lyfe",
                "out-of-context",
                "jokes",
                "anime-club",
            ]

            message_channel = message.channel.name.lower()

            if (
                # DO NOT RESPOND TO SELF MESSAGES
                (bot.user.id == message.author.id or message.content.startswith("."))
                or (message.channel.name is None)
                or (
                    reduce(
                        lambda acc, n: acc or (n == message_channel), blacklist, False
                    )
                )
                or ("thank" in clean_message)
                or ("http" in clean_message)
            ):
                return

            ctx = await bot.get_context(message)

            new_message = re.sub(
                "n?'[A-Za-z]+|[^A-Za-z ]", "", message.content.lower()
            ).split(" ")

            new_message = [w for w in new_message if w != ""]

            if len(new_message) == 0:
                return

            mispelled = self.spell.unknown(new_message)

            if len(mispelled) == 0:
                return

            message_changed = False
            for word in mispelled:
                correction = self.spell.correction(word)

                if correction != word:
                    new_message = [w if w != word else correction for w in new_message]
                    message_changed = True

            new_message = " ".join(new_message)

            if message_changed:
                await ctx.send('I think you meant to say, "{}"'.format(new_message))

            return

        self.bot.add_listener(grammar_module, "on_message")
Example No. 25
    def set_language(self, entry='en'):
        """sets language. Options are 'en' 'de' 'fr' es'"""

        if entry in ['en', 'de', 'fr', 'es']:
            self.language = entry
        self.spell = SpellChecker(language=self.language)
Example No. 26
!pip install pyspellchecker
from spellchecker import SpellChecker

spell = SpellChecker(distance=1)

# find those words that may be misspelled
misspelled = spell.unknown(['tt'])

for word in misspelled:
    # Get the one `most likely` answer
    print(spell.correction(word))

    # Get a list of `likely` options
    print(spell.candidates(word))
def spell_check(df):
  for i in range(df.shape[0]):
    if i%100000==0:
      print("Reached {0}, percent {1}".format(i,float(i/df.shape[0])*100))
    words=df['text'][i].split()
    misspelled = spell.unknown(words)
    l=[]
    for word in words:
      if word in misspelled:
        word=spell.correction(word)
      else:
        word=word
      l.append(word)
    #words=[spell.correction(word) for word in words]
    df['text'][i]=' '.join(word for word in l)
  return df
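A hedged usage sketch for the function above; it assumes the DataFrame has a `text` column of strings (the sample frame is illustrative):

import pandas as pd

df = pd.DataFrame({'text': ['thiss is a tst', 'all good here']})
df = spell_check(df)
print(df['text'].tolist())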
 def spell_collection(df):
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
from constant import CONTRACTION_MAPPING, PUNCT_MAPPING

# Constant
ENG_STOPWORDS = set(stopwords.words("english"))
WORDNET_MAP = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV
}

# Instance
SpellCheckerInstance = SpellChecker()
LemmatizerInstance = WordNetLemmatizer()


def clean_html_tag(text):
    """HTMLの削除"""
    return BeautifulSoup(text, "lxml").text


def clean_url(text):
    """URLの削除"""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


def clean_number(text):
Example No. 28
def token_spellchecker(tokens):
    spell = SpellChecker()
    correct_spelling = [spell.correction(word) for word in tokens]
    return correct_spelling
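A hedged usage sketch (the tokens are illustrative; with recent pyspellchecker releases `spell.correction` can return `None` for words it cannot fix):

print(token_spellchecker(['speling', 'korrect', 'python']))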
from nltk.corpus import words
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import cmudict
nltk.download('words')
nltk.download('cmudict')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
import math
import re
import string
import syllables
d = cmudict.dict()
tool = language_tool_python.LanguageTool('en-US')
from spellchecker import SpellChecker
spell = SpellChecker() 
! pip install langdetect
from langdetect import detect
import requests, time
url = 'https://farasa-api.qcri.org'
import ast

"""# All Functions"""

"""
#my source is: https://readabilityformulas.com/the-LIX-readability-formula.php
def w_g_4(txt):
    count =0
    words=word_tokenize(txt)
    for x in words:
        #print(len(x))
Example No. 30
 def setUp(self):
     self.spellChecker = SpellChecker()
     self.spellChecker.load_words('spellwords.txt')