def __init__(self, sample_path, words_path, elites, children, randoms, tournament_size, cross_chance, mutation_chance, shock_value, max_cycles):
		#Parameters:
			#	sample_path: A path to a samples source file containing all training data to be fed to the incubator
			#	words_path: A path to all words which the cipher_breaker should consider valid in addition
			#		to those already in pyspellchecker.
			#	elites: How many elites are carried over for each generation
			#	children: How many children are created for each generation
			#	randoms: How many random chromosomes are added each generation
			#	tournament_size: How many chromosomes are considered in a tournament
			#	cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
			#	mutation_chance: Chance of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
			#	shock_value: 0 if genetic shock disabled. Otherwise shock is enabled and shock_threshold is set to shock_value
			#	max_cycles: Cycle # at which the simulation terminates

			#Initializes sample_block_tables
			self.sample_block_table = self.getSampleBlockTable(sample_path)

			#Initializes spellchecker
			self.spellchecker = SpellChecker()
			self.spellchecker.word_frequency.load_text_file(words_path)

			#Checks cross_chance and mutation_chance are valid
			assert (cross_chance + mutation_chance) == 1

			#Loads all incubator parameters
			self.elites = elites
			self.children = children
			self.randoms = randoms
			self.population = self.elites + self.children + self.randoms

			self.tournament_size = tournament_size

			self.cross_chance = cross_chance
			self.mutation_chance = mutation_chance

			#Handles shock_value
			if shock_value <= 0:
				self.shock_enabled = False
				self.shock_threshold = 0
			else:
				self.shock_enabled = True
				self.shock_threshold = shock_value

			self.max_cycles = max_cycles

			#Prints incubator summary if verbose is enabled
			if __VERBOSE__:
				print("Incubator Summary:")
				print("sample_path: " + sample_path + "  words_path: " + words_path)
				print("Total population: " + str(self.population))
				print("Elites: " + str(self.elites) + "  Children: " + str(self.children) + "  Randoms: " + str(self.randoms))
				print("Tournament size: " + str(self.tournament_size) + "  Cross chance: " + str(self.cross_chance) + "  Mutation chance: " + str(self.mutation_chance))
				print("Shock enabled: " + str(self.shock_enabled) + "  Shock threshold: " + str(self.shock_threshold))
				print("Max cycles: " + str(self.max_cycles))
				print("\n")
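# A minimal instantiation sketch for the constructor above (the Incubator class is
# shown in full further down). The paths are placeholders and the values are
# illustrative; the only hard constraint documented above is that
# cross_chance + mutation_chance must equal 1.
incubator = Incubator(
    sample_path="samples.txt",
    words_path="extra_words.txt",
    elites=2,
    children=20,
    randoms=3,
    tournament_size=4,
    cross_chance=0.8,
    mutation_chance=0.2,
    shock_value=10,   # > 0 enables genetic shock after 10 stagnant cycles
    max_cycles=500,
)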
Example #2
'''
Created on Feb 8, 2019
Work in Progress
@author: Gordon
'''
from spellchecker import SpellChecker

if __name__ == '__main__':
    pass

spell = SpellChecker(distance=1)


# analyze the cipher text to break the substitution cipher
# return (plain text, cipher text alphabet)
# where cipher text alphabet(CA) is of the form {c1, c2, c3, c4, _, c6, _, ...}
# e.g. c3 is whatever character in the CA corresponds to 'c' in the plain alphabet (PA)
# _ is used to denote any letter about which you are uncertain
def decode(ciphertext):
    '''initializing average frequencies for letters in English alphabet
    based on graph in assignment document, first index left for spaces (most freq. char)'''
    avg_frequencies = [
        ' ', 'e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd', 'l', 'c', 'u',
        'm', 'w', 'f', 'g', 'y', 'p', 'b', 'v', 'k', 'j', 'x', 'q', 'z'
    ]
    #print(avg_frequencies)
    '''initializing array to mark frequencies of each letter, then initializing cipher text alphabet to have length of 27'''
    frequency_check = [[0 for x in range(2)] for y in range(27)]
    frequency_check[0][0] = ' '
    for i in range(26):
        frequency_check[i + 1][0] = chr((i) + 97)
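    # (The example is truncated at this point on the source page. A hedged sketch of
    # the step it sets up: tally how often each character occurs in the ciphertext,
    # rank the characters by frequency, and pair that ranking with avg_frequencies
    # to build a first guess at the cipher text alphabet.)
    # for ch in ciphertext:
    #     for entry in frequency_check:
    #         if entry[0] == ch:
    #             entry[1] += 1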
Example #3
from spellchecker import SpellChecker
import pickle

spell = SpellChecker()
target_words = pickle.load(open('target_words_all.txt', 'rb'))
misspelled = spell.unknown(target_words)
spell.word_frequency.load_words(misspelled)

Example #4
import os
import sys

from bs4 import BeautifulSoup
from spellchecker import SpellChecker

def suppress_punctuation(text):
    """ Suppress punctuation in a text
    
    :param text str: Text to clean up
    :returns: Text without punctuation
    :rtype: str
    """
    punctuation = "!:;\",?'’."
    for sign in punctuation:
        text = text.replace(sign, " ")
    return text

spell = SpellChecker(language=None, local_dictionary=sys.argv[3], case_sensitive=True)
#With 'case_sensitive=True', we specify that all the words are processed exactly as they are written in the text
#This means that all the uppercase words will be considered wrong, but that helps correct them
#To use that technique, we have to load a local dictionary
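# Hedged illustration of the point above: pyspellchecker only honours case_sensitive=True
# when no bundled language is loaded, which is why the call above combines language=None
# with a local dictionary. The tiny JSON dictionary written here is purely illustrative.
import json
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
    json.dump({"Paris": 1, "rue": 1}, tmp)
demo_checker = SpellChecker(language=None, local_dictionary=tmp.name, case_sensitive=True)
print(demo_checker.unknown(["Paris", "paris"]))  # {'paris'}: the lowercase form is not in the dictionary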

for root, dirs, files in os.walk(sys.argv[1]):
    for filename in files:
        dictionary = {}
        with open(sys.argv[1] + filename, 'r') as xml_file:
            print("reading from "+sys.argv[1] + filename)
            soup = BeautifulSoup(xml_file, 'xml')
        for unicode in soup.find_all('Unicode'):
            content = unicode.string
            content = suppress_punctuation(content)
            words = content.split()
            misspelled = spell.unknown(words)
Example #5
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant 
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        self._parser.suspectedEntityDict = {}

        query_as_list = self._parser.parse_sentence(query)
        # add entities to query - entities aren't added to query_as_list by parse_sentence
        # suspectedEntityDict holds only entities from original query
        for entity in self._parser.suspectedEntityDict:
            query_as_list.append(entity)

        # Clear query from Entities parts
        query_as_list = self.clearEntitiesParts(query_as_list)
        ######################################################################################################
        #### INIT SPELL CORRECTION, ADD COVID AND FIND UNKNOWN WORDS IN ORIGINAL QUERY - BEFORE EXPANDING ####
        ######################################################################################################
        # spell checker part
        spellFixer = SpellChecker()
        # add words to known word list
        spellFixer.word_frequency.load_words(['covid'])
        # find unknown words - those words will need spell correction
        missSpell = spellFixer.unknown(query_as_list)

        ####################################
        ######## WordNet expansion ########
        ####################################
        extendedQ = copy.deepcopy(query_as_list)
        for term in query_as_list:
            synset = wordnet.synsets(term)
            try:
                for i in range(2):
                    Synonym = synset[i].lemmas()[0].name()
                    if term.lower() != Synonym.lower(
                    ) and Synonym + "~" not in extendedQ:
                        Synonym += "~"
                        extendedQ.append(Synonym)
            except:
                continue
        query_as_list = extendedQ

        #####################################
        ######## Spelling correction ########
        #####################################
        # add fixed words
        fixedQuery = copy.deepcopy(query_as_list)
        for word in missSpell:
            candidates = list(spellFixer.candidates(word))
            for i in range(2):
                try:
                    if candidates[i] not in fixedQuery:
                        fixedQuery.append(candidates[i] + '~')
                except:
                    break

        numberOFresults, relevantDocIdList = searcher.search(
            fixedQuery)  # returns tuple (numberOFresults,relevantDocIdList)
        return numberOFresults, relevantDocIdList
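# A standalone, hedged illustration of the WordNet expansion step used in search()
# above: take up to two synonyms per query term and mark them with a trailing '~',
# as the method does (requires the NLTK wordnet corpus to be downloaded).
from nltk.corpus import wordnet

def expand_with_synonyms(terms, per_term=2):
    expanded = list(terms)
    for term in terms:
        for syn in wordnet.synsets(term)[:per_term]:
            synonym = syn.lemmas()[0].name()
            if synonym.lower() != term.lower() and synonym + "~" not in expanded:
                expanded.append(synonym + "~")
    return expanded

# e.g. expand_with_synonyms(['mask']) might add 'masquerade~' to the query terms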
Example #6
def cleanTweet(text, appostrophes=True, emojis=True, html=True, url=True, misspellings=True, punctuation=True, lemming=True,\
               stop=True):
    """ 
    Function to clean text
    Input text: string of text
    Input appostrophes: default True, boolean to clean for apostrophes
    Input emojis: default True, boolean to clean for emojis
    Input html: default True, boolean to clean for html tags
    Input url: default True, boolean to clean for urls
    Input misspellings: default True, boolean to clean for misspellings
    Input punctuation: default True, boolean to clean for punctuation
    Input lemming: default True, boolean to clean with lemmatization
    Input stop: default True, boolean to clean for stop words
    Return filtered_string: filtered string of input text
    """
    if appostrophes:
        #convert appostrophes
        filtered_string = decontracted(text)
    if emojis:
        #decoding, removing emojis
        filtered_string = filtered_string.encode("utf-8").decode('ascii','ignore')
    if html:
        #cleaning of html tags
        htmltags = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        filtered_string = re.sub(htmltags, '', filtered_string)
    if url:
        #cleaning of url
        url = re.compile(r'https?://\S+|www\.\S+')
        filtered_string = re.sub(url, '', filtered_string)
    if misspellings:
        #cleaning of misspellings
        spell = SpellChecker()
        corrected_text = []
        misspelled_words = spell.unknown(filtered_string.split())
        for word in filtered_string.split():
            if word in misspelled_words:
                corrected_text.append(spell.correction(word))
            else:
                corrected_text.append(word)
        filtered_string =  " ".join(corrected_text)
    if punctuation:
        word_tokens = word_tokenize(filtered_string)
        #remove punctuations
        table=str.maketrans('','',string.punctuation)
        filtered_string = [word.translate(table) for word in word_tokens]
        filtered_string = " ".join(filtered_string)
    if lemming:
        #lemming of words
        word_tokens = word_tokenize(filtered_string)
        lemmatizer = WordNetLemmatizer() 
        filtered_string = [lemmatizer.lemmatize(word) for word in word_tokens]
    if stop:
        # cleaning from stopwords
        stop_words=set(stopwords.words('english'))
        stop_word_drop = [] 
        for word in filtered_string: 
            if word not in stop_words: 
                stop_word_drop.append(word) 
        filtered_string = " ".join(stop_word_drop)
    
    #toDos
    #cleaning of rare words
    # tokens is a list of all tokens in corpus
    # freq_dist = nltk.FreqDist(token)
    # rarewords = freq_dist.keys()[-50:]
    # after_rare_words = [ word for word in token not in rarewords]
    #cleaning of slang words
    #split attached words, not working and questionable because of all capital words
    # filtered_string =  " ".join(re.findall('[A-Z][^A-Z]*', filtered_string))
    return filtered_string
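# Hedged usage sketch for cleanTweet; it assumes the decontracted() helper referenced
# above and the usual NLTK resources (punkt, wordnet, stopwords) are available, and the
# exact output depends on pyspellchecker's dictionary.
sample_tweet = "I can't beleive this <b>awesome</b> tool :-) https://example.com"
print(cleanTweet(sample_tweet))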
import pyspelling
from spellchecker import SpellChecker
from nltk.tokenize import TweetTokenizer

token = TweetTokenizer()
s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
token.tokenize(s0)
spell = SpellChecker(language = "en", case_sensitive = False)
misspelled1 = token.tokenize(s0)
# Drop non-alphabetic tokens (emoticons, arrows, hashtags) before spell checking.
# A list comprehension avoids the classic pitfall of calling remove() while
# iterating, which is why the original loop had to be repeated several times.
for word in misspelled1:
    if not word.isalpha():
        print(word)
misspelled1 = [word for word in misspelled1 if word.isalpha()]
print(len(misspelled1))
 def __init__(self, dbConnection, redis_client):
     self.db = dbConnection
     self.spell_checker = SpellChecker()
     self.lemmatizer = WordNetLemmatizer()
     self.redis_client = redis_client
Example #9
corpus_trainer = ChatterBotCorpusTrainer(hal)
corpus_trainer.train("chatterbot.corpus.english.ai")
corpus_trainer.train("chatterbot.corpus.english.conversations")
corpus_trainer.train("chatterbot.corpus.english.computers")
corpus_trainer.train("chatterbot.corpus.english.emotion")
corpus_trainer.train("chatterbot.corpus.english.greetings")
corpus_trainer.train("chatterbot.corpus.english.movies")

#trains HAL using the training data defined in trainingData.py
conversationTrainer = ListTrainer(hal)
conversationTrainer.train(trainingData.casualConversation)
conversationTrainer.train(trainingData.basicAdvice)
conversationTrainer.train(trainingData.advisor)
conversationTrainer.train(trainingData.gpaToTransfer)

correctTypos = SpellChecker()

tag_list = [
    'cs 149', 'ise 164', 'cs 146', 'cmpe 131', 'cmpe 120', 'cmpe 102',
    'cmpe 133', 'cmpe 148', 'cmpe 165', 'cmpe 172', 'cmpe 187', 'cmpe 195a',
    'cmpe 195b', 'engr 195a', 'engr 195b', 'engr 195', 'cmpe 195', 'cmpe195',
    'engr195', 'cs 151', 'cs 157a', 'cs 166', 'cs149', 'ise164', 'cs146',
    'cmpe131', 'cmpe120', 'cmpe102', 'cmpe133', 'cmpe148', 'cmpe165',
    'cmpe172', 'cmpe187', 'cmpe195a', 'cmpe195b', 'engr195a', 'engr195b',
    'engr195', 'cmpe195', 'cs151', 'cs157a', 'cs166',
    'how many units should i take', 'cmpe 137', 'cmpe137', 'cmpe 139',
    'cmpe139', 'cmpe 152', 'cmpe152', 'cmpe 185', 'cmpe185', 'cmpe 181',
    'cmpe181', 'cmpe 182', 'cmpe182', 'cmpe 183', 'cmpe183', 'cmpe 185',
    'cmpe185', 'cmpe 188', 'cmpe188', 'cmpe 189', 'cmpe189', 'cs 116a',
    'cs116a', 'cs 134', 'cs134', 'cs 152', 'cs152'
]
Example #10
import pandas as pd
import unidecode

from deep_translator import GoogleTranslator
from spellchecker import SpellChecker

from bamboo_lib.logger import logger
from bamboo_lib.connectors.models import Connector
from bamboo_lib.steps import WildcardDownloadStep, LoadStep
from bamboo_lib.models import EasyPipeline, PipelineStep
from bamboo_lib.helpers import grab_connector

from util import *

speller = SpellChecker(language='es')


class ReadStep(PipelineStep):
    def run_step(self, prev_result, params):
        logger.info('Running read step...')
        files_list = prev_result
        df = pd.DataFrame()

        for file_ in files_list:
            temp_df = pd.read_excel(file_[0])
            temp_df.columns = temp_df.columns.str.lower()
            if file_[1]['filename'] in [
                    'pef_ac01_avance_2t_2020', 'pef_ac01_avance_2t_2021',
                    'pef_ac01_avance_3t_2020', 'pef_ac01_avance_1t_2021',
                    'pef_ac01_avance_1t_2020'
            ]:
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['something', 'beana', 'hapenning', 'here'])

for word in misspelled:
    print(word)
    # Get the one `most likely` answer
    print(spell.correction(word))

    # Get a list of `likely` options
    print(spell.candidates(word))
Example #12
 def __init__(self):
     self.spell = SpellChecker()
Example #13
 def Tokenize_word(self,text):
     
     ######## Thai word segment ######## ver1
     '''sent = text[0].replace("'","")
     word = word_tokenize(sent, engine='deepcut') # use this method
     wword = [x.replace('.',' ').replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").replace('สำหรับ',' ').replace('%',' ').strip(' ') for x in word]
     words =[]
     for w in wword:
         if w not in common.thai_stopwords():
             words = [str for str in words if str]
             words.append(w)
     return words'''
 
     ######## Thai word segment ######## ver2 -> stopwords, type of words, check spell(Eng & Thai)
     sent = text[0].replace("'","")    
     word = word_tokenize(sent, engine='attacut') # use this method
     #wword = [x.replace('=',' ').replace('-',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(' ') for x in word]
     th_no_stopwords =[]
     all_no_stopwords =[]
     th_correct_words =[]
     eng_correct_words =[]
     mix_correct_words =[]
     mix1_correct_words =[]
     all_correct_words =[]
     all_correct_words_final =[]
     check_thai_list = []
     #for tw in wword:
     for tw in word:
         if tw not in common.thai_stopwords():
             th_no_stopwords = [str for str in th_no_stopwords if str]
             th_no_stopwords.append(tw)
     #print("th_no_stopwords = ", th_no_stopwords)
     for ew in th_no_stopwords:
         if ew not in stopwords.words('english'):
             all_no_stopwords = [str for str in all_no_stopwords if str]        
             all_no_stopwords.append(ew)
     #print("all_no_stopwords = ", all_no_stopwords)
     for c in all_no_stopwords:
         thai = isthai(c)
         number = c.isnumeric()
         if not thai:
             no_num = c.isalpha()
             match1 = re.findall(r'\D', c) #matches if the string contains characters that are not digits 0-9
             if no_num:
                 spell = SpellChecker()
                 eng_correct = spell.correction(c) #pn
                 eng_correct_words.append(eng_correct)
                 #print("eng = ", eng_correct)
             elif match1:
                 mix = c
                 mix_correct_words.append(mix)
                 #print("mix = ", mix)
             else:
                 num = c #No return
                 #print("num = ", num)
         elif thai:
             checker = NorvigSpellChecker(custom_dict=tnc.word_freqs()) #pn
             th_correct = checker.correct(c)
             th_correct_words.append(th_correct)
             #print("thai = ", th_correct)
           
     all_correct_words = th_correct_words + eng_correct_words + mix_correct_words
     all_correct_words = [x.replace('น.','').replace(':',' ').replace('=',' ').replace('–',' ').replace("("," ").replace(")"," ").replace("/"," ").strip(" ") for x in all_correct_words]
     all_correct_words_final = list(filter(None, all_correct_words))
     #print("words = ", all_correct_words_final)  
     return all_correct_words_final
 
     
     ######## Eng word segment ########
     '''word = text[0]
 def test_word_contains(self):
     ''' test the contains functionality '''
     spell = SpellChecker()
     self.assertEqual(spell['the'], 6187925)
 def test_word_in(self):
     ''' test the use of the `in` operator '''
     spell = SpellChecker()
     self.assertTrue('key' in spell)
     self.assertFalse('rando' in spell)
Example #16
import pycountry
import os
import os.path
import wikipedia
from weather import Weather
import re
import pyjokes
import requests
import wolframalpha
import webbrowser
import glob
from PyDictionary import PyDictionary
import tmdbsimple as tmdb
import nltk
from nltk.corpus import wordnet
from spellchecker import SpellChecker
spell = SpellChecker()
st1=[]
hi = []
thanks=[]
yes=[]
for syn in wordnet.synsets("hi"):
    for l in syn.lemmas():
        hi.append(l.name())      
app_id='XPAQWX-W5LAG5ELYL'
client=wolframalpha.Client(app_id)
tmdb.API_KEY = '60222ace6396c345f94cc42eaac5dae5'
doss = os.getcwd()
i=0
n=0
flag=0
dictionary=PyDictionary()
Example #17
import csv
import string

import pandas as pd
import numpy as np

from spellnCorrection import *
from preprocessing import *

from spellchecker import SpellChecker
spell = SpellChecker(language='fr')
""" Step 1: Extract data from file """

dataFile = './collaborativeActs.csv'
df = pd.read_csv(dataFile,
                 delimiter="\t",
                 header=None,
                 error_bad_lines=False,
                 encoding="utf8")

X = np.array(df)
""" Step 2: Spell and Correction """

sentence = []
WRONG = []
for i in range(X.shape[0]):
    if i != 0:
        utterance = X[i][7]
        tokens = normalization(utterance)
        tokens = tokenization(tokens)
        tokens = [token.text for token in tokens]
Example #18
 def __init__(self):
     self.tokenizer = RegexpTokenizer(r'\w+')
     self.spell = SpellChecker()
Example #19
def correct_spelling(text):
    spell_checker = SpellChecker()
    return " ".join([spell_checker.correction(w) for w in text.split(" ")])
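# Quick usage sketch. Note that in recent pyspellchecker releases correction() can
# return None for words it cannot fix, so production code may want to fall back to
# the original word instead of joining None into the result.
print(correct_spelling("speling mistake"))  # e.g. "spelling mistake"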
Example #20
from spellchecker import SpellChecker
from pynput.keyboard import Key, Listener
import logging
from pyautogui import typewrite, hotkey

spell = SpellChecker('en')

Elissa_dict = './dictionary.txt'
spell.word_frequency.load_text_file(Elissa_dict, 'utf-8')

dirty = open('./bad_words.txt', 'r', encoding='utf-8-sig')
junk = dirty.read().splitlines()
dirty.close()
spell.word_frequency.remove_words(junk)


def autocorrect():
    current_string = open('./keyLog.txt', 'r+', encoding='utf-8-sig')
    string = current_string.read()
    current_string.truncate(0)
    correct = spell.correction(string)
    if len(string) != 0 and string != correct:
        return correct


log_dir = "C:/Users/Cameron/Desktop/1P03/LawtoCorrect/Backend"
logging.basicConfig(filename=(log_dir + "keyLog.txt"),
                    level=logging.DEBUG,
                    format='%(message)s')

keys = []
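# Hedged sketch of how the pieces above might be wired together (the original example
# is truncated here): log each key press and attempt an autocorrect pass on every space.
# The Listener call is left commented out so the snippet does not block when imported.
def on_press(key):
    logging.info(str(key))
    if key == Key.space:
        corrected = autocorrect()
        if corrected:
            typewrite(corrected)

# with Listener(on_press=on_press) as listener:
#     listener.join()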
Example #21
 def __init__(self, data: pd.DataFrame, column_name: str, language: str, local_dictionary=None, distance=2):
     self.__strings = data[column_name]
     self.__spell_checker = SpellChecker(language=language, local_dictionary=local_dictionary, distance=distance)
class StoresManagerInterface:
    def __init__(self, users_manager):
        self.stores_manager = StoresManager()
        self.spell_checker = SpellChecker()
        self.users_manager = users_manager

    def search_product(self, search_term: str = "", categories: [str] = [], key_words: [str] = []) \
            -> {int: [Product]}:
        """

        Args:
            search_term:
            categories:
            key_words:

        Returns:

        """
        logger.log(
            "called search with search term:%s, categories:%s, key words:%s",
            search_term, categories, key_words)
        return self.stores_manager.search(
            self.spell_checker.correction(search_term),
            [self.spell_checker.correction(word) for word in categories],
            [self.spell_checker.correction(word) for word in key_words])

    def add_purchase_to_store(self, store_id: int, purchase: Purchase):

        return self.stores_manager.add_purchase_to_store(store_id, purchase)

    # def search(self, search_term: str = "", categories: [str] = None, key_words: [str] = None) -> {Store: [Product]}:
    #     return self.stores_manager.search(search_term, categories, key_words)

    def add_product_to_store(self, store_id: int, user_name: str,
                             product_name: str, product_price: int,
                             product_categories: [str], key_words: [str],
                             amount) -> bool:
        logger.log(
            "user %s called add product to store no.%d. product name:%s"
            " product price:%d product categories:%s,key words:%s, amount:%d",
            user_name, store_id, product_name, product_price,
            product_categories, key_words, amount)

        managed_stores = jsonpickle.decode(
            self.users_manager.get_managed_stores(user_name))

        if store_id in managed_stores:
            return self.stores_manager.add_product_to_store(
                store_id, user_name, product_name, product_price,
                product_categories, key_words, amount)
        return False

    def appoint_manager_to_store(self, store_id, owner, to_appoint):
        logger.log("user %s call appoint manager %s to store no.%d", owner,
                   to_appoint, store_id)
        if store_id in self.users_manager.get_managed_stores(
                owner) and self.users_manager.check_if_registered(to_appoint):
            if self.stores_manager.appoint_manager_to_store(
                    store_id, owner, to_appoint):
                self.users_manager.add_managed_store(to_appoint, store_id)
                return True

        return False

    def appoint_owner_to_store(self, store_id, owner, to_appoint):
        logger.log("user %s call appoint owner %s to store no.%d", owner,
                   to_appoint, store_id)
        if str(store_id) in self.users_manager.get_managed_stores(
                owner) and self.users_manager.check_if_registered(to_appoint):
            if self.stores_manager.appoint_owner_to_store(
                    store_id, owner, to_appoint):
                self.users_manager.add_managed_store(to_appoint, store_id)
                return True

        return False

    def add_permission_to_manager_in_store(self, store_id, owner, manager,
                                           permission: str):
        logger.log("user %s add %s permission to %s in store no.%d", owner,
                   permission, manager, store_id)
        if str(store_id
               ) in self.users_manager.get_managed_stores(owner) and str(
                   store_id) in self.users_manager.get_managed_stores(manager):
            return self.stores_manager.add_permission_to_manager_in_store(
                store_id, owner, manager, permission)
        return False

    def remove_permission_from_manager_in_store(self, store_id, owner, manager,
                                                permission: str):
        logger.log("user %s remove %s permission to %s in store no.%d", owner,
                   permission, manager, store_id)
        if store_id in self.users_manager.get_managed_stores(
                owner) and store_id in self.users_manager.get_managed_stores(
                    manager):
            self.stores_manager.remove_permission_from_manager_in_store(
                store_id, owner, manager, permission)

    def open_store(self, owner: str, store_name):
        logger.log("user %s open %s store", owner, store_name)
        if self.users_manager.check_if_registered(owner):
            store_id = self.stores_manager.open_store(owner, store_name)
            self.users_manager.add_managed_store(owner, store_id)
            return store_id
        return -1

    def buy(self, cart):
        self.stores_manager.buy(cart)

    def get_sales_history(self, store_id, user) -> [Purchase]:
        logger.log("user %s get sales history of store no.%d", user, store_id)
        if self.users_manager.check_if_registered(user) and (
                str(store_id) in self.users_manager.get_managed_stores(user)
                or self.users_manager.is_admin(user)):
            return self.stores_manager.get_sales_history(
                store_id, user, self.users_manager.is_admin(user))

    def remove_product(self, store_id, product_name, username):
        return self.stores_manager.remove_produce_from_store(
            store_id, product_name, username)

    def add_discount_to_product(self, store_id, product_name, username,
                                start_date, end_date, percent):
        return self.stores_manager.add_visible_discount_to_product(
            store_id, product_name, username, start_date, end_date, percent)

    def update_product(self, store_id, username, product_name, attribute,
                       updated):
        return self.stores_manager.update_product(store_id, username,
                                                  product_name, attribute,
                                                  updated)

    def remove_manager(self, store_id, owner, to_remove):
        if self.stores_manager.remove_manager(store_id, owner, to_remove):
            self.users_manager.remove_managed_store(to_remove, store_id)
            return True
        return False

    def remove_owner(self, store_id, owner, to_remove):
        return self.stores_manager.remove_owner(store_id, owner, to_remove)
Example #23
import json
from spellchecker import SpellChecker

sp = SpellChecker()

data = json.load(open("data.json"))
word = input("enter the word:")

print(sp)


def defi(w):
    w = w.lower()
    if w in data:
        print(data[w])

    else:
        msp = sp.correction(w)
        if (msp != w):
            print("did you mean the word: {}".format(msp))
        else:
            print("the word doesn't exist in the data")


defi(word)
 def __init__(self, users_manager):
     self.stores_manager = StoresManager()
     self.spell_checker = SpellChecker()
     self.users_manager = users_manager
Example #25
from sklearn.metrics.pairwise import linear_kernel
from sklearn.externals import joblib
import re
import os
from ChatbotBackend.settings import BASE_DIR
import spacy
import sqlite3
from spellchecker import SpellChecker
from pattern.de import parse


nlp = spacy.load("de_core_news_sm")
DATABASE_PATH = os.path.join( BASE_DIR, 'requirements/openthe.db' )
stmt = "SELECT term.word FROM term, synset, term term2 WHERE synset.id = term.synset_id AND term2.synset_id = synset.id AND term2.word = ?"
spell = SpellChecker(language='de')

def process_text( text ):
    annotated = []
    parsed_text = parse(text, lemmata=True)
    doc = parsed_text.split(" ")
    for token in doc:
        pos_tag = token.split("/")[1]
        lemma = token.split("/")[4]
        if pos_tag == ".":
            continue
        current_token = token.split("/")[0]

        if current_token not in spell:
            current_token = spell.correction(current_token)
Example #26
class StyloDocument(object):

    DEFAULT_AUTHOR = "Unknown"

    def __init__(self, file_content, author=DEFAULT_AUTHOR):
        self.author = author.strip()
        self.raw_content = file_content
        self.file_content = file_content.lower()
        self.tokens = PortugueseTextualProcessing.tokenize(self.file_content)
        self.text = Text(self.tokens)
        self.fdist = FreqDist(self.text)
        self.sentences = sent_tokenize(self.file_content, language='portuguese')
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
        self.paragraphs = [p for p in self.file_content.split("\n\n") if len(p) > 0 and not p.isspace()]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
        self.punctuation = [".", ",", ";", "-", ":"]
        self.ner_entities = ['ABSTRACCAO', 'ACONTECIMENTO', 'COISA', 'LOCAL',
                             'ORGANIZACAO', 'OBRA', 'OUTRO', 'PESSOA', 'TEMPO', 'VALOR']
        self.white_spaces = len(self.file_content.split(' '))

        self.rich_tags = RichTags(PortugueseTextualProcessing.get_rich_tags(self.file_content), len(self.text))
        self.tagged_sentences = PortugueseTextualProcessing.postag(self.tokens)
        self.tagfdist = FreqDist([b for [(a, b)] in self.tagged_sentences])
        self.ner_tags = PortugueseTextualProcessing.ner_chunks(self.tokens)
        self.ner_ftags = FreqDist(self.ner_tags)
        self.spell = SpellChecker(language='pt')
        self.ROUNDING_FACTOR = 4
        self.LINE_BREAKS = ['\n', '\t', '\r']

    def get_tag_count_by_start(self, tag_start):
        count = 0
        for tag in self.tagfdist.keys():
            if tag.startswith(tag_start):
                count += self.tagfdist[tag]
        return count

    def get_class_frequency_by_start(self, tag_start):
        return self.get_tag_count_by_start(tag_start)/self.tagfdist.N()

    def get_total_not_found(self):
        """The wordnet (wn) lookup is not reliable so far"""
        nf_tokens = self.get_tokens_by_tag('notfound')
        return len([i for i in nf_tokens if len(wn.synsets(i, lang='por')) == 0])

    def tag_frequency(self, tag):
        return self.tagfdist.freq(tag)

    def entity_frequency(self, tag):
        return self.ner_ftags.freq(tag)

    def get_tokens_by_tag(self, tag):
        return [i[0][0] for i in self.tagged_sentences if i[0][1] == tag]

    def get_long_sentence_freq(self):
        return (len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE]))/len(self.sentences)

    def get_short_sentence_freq(self):
        return (len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE]))/len(self.sentences)

    def get_long_short_sentence_ratio(self):
        """RF for PAN 15"""
        return len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE])/(len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE]))

    def get_sentence_starting_tags_ratio(self, tag):
        count = [i[0][1] for i in self.tagged_sentences].count(tag)
        return count/len(self.sentences)

    def term_per_hundred(self, term):
        """
        term       X
        -----  = ------
          N       100
        """
        return (self.fdist[term] * 100) / self.fdist.N()

    def mean_sentence_len(self):
        return np.mean(self.sentence_word_length)

    def std_sentence_len(self):
        return np.std(self.sentence_word_length)

    def mean_paragraph_len(self):
        return np.mean(self.paragraph_word_length)

    def std_paragraph_len(self):
        return np.std(self.paragraph_word_length)

    def flesh_index(self):
        idx, value = PortugueseTextualProcessing().get_ptBR_flesch_index(self.tokens, self.get_phrases())
        return idx

    def vocabulary(self):
        return [v for v in sorted(set(self.sentences)) if v not in self.punctuation]

    def mean_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        word_chars = [len(word) for word in words]
        return sum(word_chars) / float(len(word_chars))

    def max_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        return max([len(word) for word in words])

    def type_token_ratio(self):
        return (len(set(self.text)) / len(self.text)) * 100

    def unique_words_per_hundred(self):
        return self.type_token_ratio() / 100.0 * 100.0 / len(self.text)

    def document_len(self):
        return sum(self.sentence_chars)

    def get_phrases(self):
        return [i for i in self.file_content.split('.') if i != '']

    def mean_syllables_per_word(self):
        _, syllable_count = PortugueseTextualProcessing().get_syllable_counts(self.tokens)
        return syllable_count/len(self.tokens)

    def characters_frequency(self, character_list):
        return self.frequency([word for word in self.file_content if word in character_list])

    def digits_frequency(self):
        return self.frequency([word for word in self.file_content if word.isdigit()])

    def line_breaks_frequency(self):
        return self.frequency([word for word in self.file_content if word in self.LINE_BREAKS])

    def count_consonant_frequency(self):
        character_list = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w',
                          'y', 'x', 'z']
        return self.frequency([word for word in self.file_content if word in character_list])

    def camel_case_frequency(self):
        return self.frequency([word for word in self.raw_content.split(' ') if word and word[0].isupper() and (len(word) == 1 or word[1].islower())])

    def local_hapax_legommena_frequency(self):
        return (len(self.fdist.hapaxes()))/len(self.text.tokens)

    def collocations_frequency(self, size):
        """words that often appear consecutively in the window_size"""
        return (len(self.text.collocation_list(window_size=size)))/len(self.text.tokens)

    def most_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).max()

    def mean_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).most_common(3)[1][0]

    def guiraud_R_measure(self):
        return (len(set(self.text)))/math.sqrt(len(self.text))

    def herdan_C_measure(self):
        # log V(N)/log N
        return (math.log2(len(set(self.text))))/math.log2(len(self.text))

    def herdan_V_measure(self):
        # N ^ C
        return math.pow(len(self.text), self.herdan_C_measure())

    def K_measure(self):
        # log V(N)/log(log(N))
        return (math.log2(len(set(self.text)))) / math.log2(math.log2(len(self.text)))

    def dugast_U_measure(self):
        # log ^ 2 N/log(N) - log V(N)
        return (math.pow(math.log2(len(self.text)), 2)) / (math.log2(len(self.text)) - math.log2(len(set(self.text))))

    def maas_A_measure(self):
        #a ^ 2 = logN - logV(N)/log ^ 2 N
        return math.sqrt((math.log2(len(self.text)) - math.log2(len(set(self.text))))
                          / math.pow(math.log2(len(self.text)), 2))

    def LN_measure(self):
        # 1 - V(N) ^ 2/ V(N) ^ 2 log N
        return (1 - math.pow(len(set(self.text)),2)) / (math.pow(len(set(self.text)), 2) * math.log2(len(self.text)))

    def honores_H_measure(self):
        return (len(self.fdist.hapaxes()))/len(set(self.text))

    def spell_miss_check_frequency(self):
        return self.frequency(self.spell.unknown(self.text))

    def noun_phrases(self):
        return PortugueseTextualProcessing().get_number_of_noun_phrases(self.tokens) / len(self.text)

    def verb_phrases(self):
        return self.frequency(PortugueseTextualProcessing().get_number_of_verb_phrases(self.file_content))

    def monosyllables(self):
        return PortugueseTextualProcessing().get_monosyllable_counts(self.tokens) / len(self.text)

    def repeated_words_frequency(self):
        repeated_words = list(filter(lambda x: x[1] >= 2, FreqDist(PortugueseTextualProcessing().remove_stopwords(self.tokens)).items()))
        return self.frequency(repeated_words)

    def stop_word_freq(self):
        clean_words = PortugueseTextualProcessing().remove_stopwords(self.tokens)
        return (len(self.tokens) - len(clean_words)) / len(self.text)

    def get_logical_operator_frequency(self):
        return self.frequency([token for token in self.tokens if token in PortugueseTextualProcessing.LOGICAL_OPERATORS])

    def get_tags_freq(self, tags):
        count = 0
        for tag in tags:
            count += self.get_tag_count_by_start(tag)
        return count/len(self.tokens)

    def find_quotes(self):
        """Improve this method to retrieve quotes based on Patterns and special words
        egs: p.43;  segundo (autor, ano)
        """
        return self.characters_frequency(['“', '”'])

    def frequency(self, input_values):
        return len(input_values) / len(self.text)

    @classmethod
    def csv_header(cls):
        return (
            ['DiversidadeLexica', 'TamanhoMedioDasPalavras', 'TamanhoMedioSentencas', 'StdevSentencas', 'TamanhoMedioParagrafos',
             'StdevTamParagrafos', 'FrequenciaDeParagrafos','FrequenciaPalavrasDuplicadas', 'MediaSilabasPorPalavra',

             'Monossilabas',

             'Ponto','Virgulas', 'Exclamacoes', 'DoisPontos', 'Citacoes', 'QuebrasDeLinha', 'Digitos',

             'Adjetivos', 'Adverbios','Artigos', 'Substantivos', 'Preposicoes', 'Verbos','VerbosPtcp', 'Conjuncoes',
             'Pronomes', 'PronomesPorPreposicao','TermosNaoTageados', 'PalavrasDeConteudo', 'PalavrasFuncionais',
             'FrasesNominais', 'FrasesVerbais', 'GenMasc', 'GenFem', 'SemGenero', 'Singular', 'Plural',

             'PrimeiraPessoa', 'TerceiraPessoa','Passado','Presente','Futuro',

             'TotalEntidadesNomeadas', 'EntAbstracao', 'EntAcontecimento', 'EntCoisa', 'EntLocal', 'EntOrganizacao',
             'EntObra', 'EntOutro', 'EntPessoa', 'EntTempo', 'EntValor',

             'GuiraudR', 'HerdanC', 'HerdanV', 'MedidaK', 'DugastU', 'MaasA', 'HonoresH',

             'PalavrasErroOrtografico', 'HapaxLegomenaLocal', 'PalavrasComunsTam2', 'PalavrasComunsTam3', 'PalavrasComunsTam4',
             'StopWords', 'BRFleshIndex', 'OperadoresLogicos', 'PalavrasCapitalizadas',

             'Author']
        )

    def csv_output(self):
        # TODO: Separate features into syntactical, lexical and so on..
        # 69 features + 1 class
        return "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},'{}'".format(

            # Text style features - 10
            round(self.type_token_ratio(), self.ROUNDING_FACTOR),
            round(self.mean_word_len(), self.ROUNDING_FACTOR),
            round(self.mean_sentence_len(), self.ROUNDING_FACTOR),
            round(self.std_sentence_len(), self.ROUNDING_FACTOR),
            round(self.mean_paragraph_len(), self.ROUNDING_FACTOR),
            round(self.std_paragraph_len(), self.ROUNDING_FACTOR),
            len(self.paragraphs) / len(self.text),
            round(self.repeated_words_frequency(), self.ROUNDING_FACTOR),
            self.mean_syllables_per_word(),
            self.monosyllables(),

            # Term count features - 7
            self.term_per_hundred('.'),
            self.term_per_hundred(','),
            self.term_per_hundred('!'),
            self.term_per_hundred(':'),
            self.find_quotes(),
            self.line_breaks_frequency(),
            self.digits_frequency(),

            #POSTAG Features - 24
            self.tag_frequency('ADJ'),
            self.tag_frequency('ADV'),
            self.tag_frequency('ART'),
            self.tag_frequency('N'),
            self.tag_frequency('PREP'),
            self.tag_frequency('PCP'),  # participle verb
            self.get_class_frequency_by_start('V'),
            self.get_class_frequency_by_start('K'),  # conjunctions
            self.get_class_frequency_by_start('PRO'),
            self.get_class_frequency_by_start('PRO')/self.tag_frequency('PREP'),  # used in French texts
            self.tag_frequency('notfound'),
            self.get_tags_freq(PortugueseTextualProcessing.CONTENT_TAGS),
            self.get_tags_freq(PortugueseTextualProcessing.FUNCTIONAL_TAGS),
            round(self.noun_phrases(), self.ROUNDING_FACTOR),
            round(self.verb_phrases(), self.ROUNDING_FACTOR),
            self.rich_tags.get_male(),
            self.rich_tags.get_female(),
            self.rich_tags.get_unspecified_gender(),
            self.rich_tags.get_singular(),
            self.rich_tags.get_plural(),
            self.rich_tags.get_first_person(),
            self.rich_tags.get_third_person(),
            self.rich_tags.get_past_tense(),
            self.rich_tags.get_present_tense(),
            self.rich_tags.get_future_tense(),


            #NER Features - 11
            round(len(self.ner_tags) / len(self.tokens), self.ROUNDING_FACTOR),
            self.entity_frequency('ABSTRACCAO'),
            self.entity_frequency('ACONTECIMENTO'),
            self.entity_frequency('COISA'),
            self.entity_frequency('LOCAL'),
            self.entity_frequency('ORGANIZACAO'),
            self.entity_frequency('OBRA'),
            self.entity_frequency('OUTRO'),
            self.entity_frequency('PESSOA'),
            self.entity_frequency('TEMPO'),
            self.entity_frequency('VALOR'),

            # Vocabulary diversity features - 7
            round(self.guiraud_R_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_C_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_V_measure(), self.ROUNDING_FACTOR),
            round(self.K_measure(), self.ROUNDING_FACTOR),
            round(self.dugast_U_measure(), self.ROUNDING_FACTOR),
            round(self.maas_A_measure(), self.ROUNDING_FACTOR),
            round(self.honores_H_measure(), self.ROUNDING_FACTOR),

            # Misc Features - 9
            self.spell_miss_check_frequency(),
            round(self.local_hapax_legommena_frequency(), self.ROUNDING_FACTOR),
            self.collocations_frequency(2),
            self.collocations_frequency(3),
            self.collocations_frequency(4),
            round(self.stop_word_freq(), self.ROUNDING_FACTOR),
            self.flesh_index(),
            self.get_logical_operator_frequency(),
            self.camel_case_frequency(),

            self.author,
        )

    def legacy_features(self):
        """Remove features that are here for future reference"""
        # self.count_characters_frequency(['a']),
        # self.count_characters_frequency(['e']),
        # self.count_characters_frequency(['i']),
        # self.count_characters_frequency(['o']),
        # self.count_characters_frequency(['u']),
        # self.count_consonant_frequency(),
        # self.mean_frequent_word_size(),
        # self.max_word_len(),
        # self.document_len(),
        # round(self.LN_measure(), 8)
        pass
Example #27
import tensorflow as tf
import tensorflowjs as tfjs
import numpy as np
from spellchecker import SpellChecker

spell = SpellChecker()

skills = ['Research', 'Engineer', 'Developer', 'Python', 'Programming', 'PhD', 'Publications']

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

text = open('../../data/sentences.csv', 'rb').read().decode(encoding='utf-8').replace(',',' ')
text = text.replace('\n','. ')
text = text.replace('\r','')
text = text.replace('asa','as a')

vocab = sorted(set(text))

# Map from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

forwards_dir = '../training/training_checkpoints3'
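# Hedged sketch of how the char2idx mapping above is typically used for a
# character-level model: encode the corpus as integer ids before batching.
text_as_int = np.array([char2idx[c] for c in text])
print(text[:20], '->', text_as_int[:20])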
Example #28
def main():

    print('Loading models ...')

    inputFolder1 = './input/'

    spellchecker = SpellChecker()
    embeddings = WordSimilarity(spellchecker)

    resultFile1 = open("result/result_top1.txt", "w")
    resultFile2 = open("result/result_top5.txt", "w")
    resultFile3 = open("result/result_visual.txt", "w")
    resultFile4 = open("result/result_text.txt", "w")
    start_time = time.time()
    for classDir in os.listdir(inputFolder1):
        predicted1 = []
        predicted5 = []
        vis_predict = []
        text_predict = []
        inputFolder = inputFolder1 + classDir + '/'
        for filename in os.listdir(inputFolder):
            if filename.endswith(".jpg") or filename.endswith(".jpeg"):
                try:
                    sceneRecognizer = SceneRecognizer()
                    txtDetector = SceneTextDetector()
                    txtRecognizer = SceneTextRecognizer()
                    print(classDir + '->' + filename)
                    print('\n')

                    imgPath = rescaleImage(inputFolder, filename)

                    inds, output_prob, labels, sub_labels = sceneRecognizer.recognize(
                        imgPath)
                    subClass = []
                    for sbl in sub_labels:
                        subClass.append(str(sbl.split(' ')[0]))

                    print("Getting visual features...")
                    visulaScores = {}
                    totalVisualScore = 0
                    for iterating_var in inds:
                        className = labels[iterating_var].split(' ')[0]
                        if className in subClass:
                            score = float(output_prob[iterating_var])
                            visulaScores[className] = score
                            totalVisualScore = score + totalVisualScore

                    for tmp in subClass:
                        tempScore = float(
                            visulaScores[tmp]) / float(totalVisualScore)
                        visulaScores[tmp] = tempScore

                    print("Getting textual features...")
                    # scene text recognition phase
                    outputName = txtDetector.detect(imgPath)

                    # scene text recognition phase
                    words = txtRecognizer.recognize(outputName)

                    textualScores = embeddings.checkSemanticSimilarity(
                        subClass, words)

                    print("fusing scores...")
                    finalScore = LBF(subClass, visulaScores, textualScores,
                                     0.4, 0.6)
                    finalScore = sorted(finalScore.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
                    finalScore = finalScore[0:5]

                    actual = subClass.index(classDir)
                    for item in finalScore:
                        index = subClass.index(item[0])
                        if finalScore.index(item) == 0:
                            value = str(index) + '|' + str(item[1])
                            predicted1.append(value)
                        if index == actual:
                            predicted5.append(actual)
                            break
                        elif finalScore.index(item) == 4:
                            predicted5.append(subClass.index(finalScore[0][0]))

                    visulaScores = sorted(visulaScores.items(),
                                          key=operator.itemgetter(1),
                                          reverse=True)
                    textualScores = sorted(textualScores.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True)
                    visulaScores = visulaScores[0:5]
                    textualScores = textualScores[0:5]

                    for item in visulaScores:
                        index = subClass.index(item[0])
                        if visulaScores.index(item) == 0:
                            value = str(index) + '|' + str(item[1])
                            vis_predict.append(value)
                            break

                    for item in textualScores:
                        index = subClass.index(item[0])
                        if textualScores.index(item) == 0:
                            value = str(index) + '|' + str(item[1])
                            text_predict.append(value)
                            break

                except Exception as err:
                    print(
                        colored(
                            '############ Classifying ' + str(filename) +
                            ' has thrown error due to ' + str(err), 'green'))
                    print('\n')

        resultFile1.write(
            str(subClass.index(classDir)) + ':' + toString(predicted1) + '\n')
        resultFile2.write(
            str(subClass.index(classDir)) + ':' + toString(predicted5) + '\n')
        resultFile3.write(
            str(subClass.index(classDir)) + ':' + toString(vis_predict) + '\n')
        resultFile4.write(
            str(subClass.index(classDir)) + ':' + toString(text_predict) +
            '\n')

    resultFile1.close()
    resultFile2.close()
    resultFile3.close()
    resultFile4.close()

    print(
        colored(
            '############ Testing in %s seconds ################' %
            (time.time() - start_time), 'green'))
Example #29
import sys
import uuid

import boto3
from psycopg2 import sql
from spellchecker import SpellChecker

sys.path.append('.')
from rule_processing import postgresql

def queryTable(conn, table):
    cmd = """
    SELECT * FROM {}
    """
    with conn.cursor() as cur: 
        cur.execute(sql.SQL(cmd).format(sql.Identifier(table)))
        return cur.fetchall()

compr = boto3.client(service_name='comprehend')
compr_m = boto3.client(service_name='comprehendmedical')
spell = SpellChecker() 
conn = postgresql.connect()
spelling_list = [x[0] for x in queryTable(conn, 'spellchecker')]
conn.close()
# Add words to spell list 
spell.word_frequency.load_words(spelling_list)

def findId(val):
    if val == '-1': 
        return str(uuid.uuid4())
    return val

def findUnidentified(val):
    if val.lower() == 'unidentified':
        return 'U/I'
    return val 
Example #30
from spellchecker import SpellChecker

spell = SpellChecker()
# find those words that may be misspelled
misspelled = spell.unknown(['statment', 'is', 'wrong'])
for word in misspelled:
    print(spell.correction(word))
 def get_spelling_errors(words):
     checker = SpellChecker()
     return checker.unknown(words)
Example #32
def process_txt(path_to_txt, pre_text_path, empty_output):
    # check if file exists here and exist if not
    try:
        f = open(path_to_txt)
        f.close()
    except FileNotFoundError:
        logging.critical('Given text file does not exist')
        sys.exit(0)

    logging.info(f"Processing {path_to_txt}")

    # nested withs https://stackoverflow.com/a/9283052/754432
    with open(pre_text_path, 'w') as outfile, open(path_to_txt,
                                                   'r',
                                                   encoding='utf-8') as infile:
        infile = infile.read().lower()

        infile = re.sub(r'-\n', "", infile)  # conjoin hyphenated words split across a newline
        # infile = re.sub(r'\n\r-,\.', "", infile)
        infile = re.sub(r'\n', " ", infile)
        infile = re.sub(
            r'[\:\(\)\|\]\[\*©]', "", infile
        )  # discard non-spellable items, don't carry any significance towards misspelling
        infile = re.sub(
            r'(\w)\.(\w)', r"\1i\2", infile
        )  # a period inside a word is usually closer to 'i' than any other letter for spelling-correction purposes, so keep the surrounding characters and swap the '.' for 'i'

        WORDS = Counter(words(infile))
        WORDS = [word for word in WORDS if len(word) < 20
                 ]  # Fix script killer in 1974/ds-89171-page-8-article-17.txt

        spell = SpellChecker()  # loads default word frequency list

        # find those words that may be misspelled
        misspelled = spell.unknown(WORDS)

        auto_corrects = {}
        for i, word in enumerate(misspelled):
            auto_corrects[word] = spell.correction(word)
            # Get the one `most likely` answer
            # if logging.getLogger().level == logging.DEBUG: print("#{}".format(i), word, spell.correction(word))
            # Get a list of `likely` options
            # if logging.getLogger().level == logging.DEBUG: print(spell.candidates(word))
        if logging.getLogger().level == logging.DEBUG:
            pprint(auto_corrects, indent=2)

        try:
            str_out = re.sub(
                r'\b(%s)\b' % '|'.join(auto_corrects.keys()),
                lambda m: auto_corrects.get(m.group(1), m.group(1)), infile)
        except sre_constants.error as err:
            print("SRE ERROR:", err)
            str_out = infile
        except Exception as err:
            print("GENERAL ERROR:", err, type(err))
            str_out = infile
        finally:
            # SIGH! Just return the lightly cleaned text for this one
            pass

        #  str_out = re.sub(r'\b(\w+)\b', lambda m:auto_corrects.get(m.group(1), m.group(1)), f_read)
        if logging.getLogger().level == logging.DEBUG: print(str_out)
        outfile.write(str_out)
    logging.info(f"Processed {pre_text_path}")
    return True
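# Hedged usage sketch (paths are placeholders, not files from the original project):
# logging.basicConfig(level=logging.INFO)
# process_txt('input/article.txt', 'output/article_precleaned.txt', empty_output=False)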
Example #33
#!/usr/bin/env python

'''
pip install pyspellchecker
'''

from spellchecker import SpellChecker
spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['let', 'us', 'wlak','on','the','groun'])

for word in misspelled:
    # Get the one `most likely` answer
    print(spell.correction(word))

    # Get a list of `likely` options
    print(spell.candidates(word))
class Incubator:
	#Class variables:
	#	sample_block_table: A dictionary containing all blocks in sample_path and their frequency
	#	spellchecker: A pyspellchecker instance with all the words in words_path added
	#	population: Total population of the incubator, indicating how many chromosomes exist at one time
	#	elites: How many elites are carried over for each generation
	#	children: How many children are created for each generation
	#	randoms: How many random chromosomes are added each generation
	#	tournament_size: How many chromosomes are considered in a tournament
	#	cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
	#	mutation_chance: Chance of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
	#	shock_enabled: True if genetic shock enabled, false otherwise
	#	shock_threshold: Number of cycles of fitness stagnation before genetic shock is triggered.
	#	max_cycles: Cycle # at which the simulation terminates
	def __init__(self, sample_path, words_path, elites, children, randoms, tournament_size, cross_chance, mutation_chance, shock_value, max_cycles):
		#Parameters:
			#	sample_path: A path to a samples source file containing all training data to be fed to the incubator
			#	words_path: A path to all words which the cipher_breaker should consider valid in addition
			#		to those already in pyspellchecker.
			#	elites: How many elites are carried over for each generation
			#	children: How many children are created for each generation
			#	randoms: How many random chromosomes are added each generation
			#	tournament_size: How many chromosomes are considered in a tournament
			#	cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
			#	mutation_chance: Chance of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
			#	shock_value: 0 if genetic shock disabled. Otherwise shock is enabled and shock_threshold is set to shock_value
			#	max_cycles: Cycle # at which the simulation terminates

			#Initializes sample_block_tables
			self.sample_block_table = self.getSampleBlockTable(sample_path)

			#Initializes spellchecker
			self.spellchecker = SpellChecker()
			self.spellchecker.word_frequency.load_text_file(words_path)

			#Checks cross_chance and mutation_chance are valid
			assert (cross_chance + mutation_chance) == 1

			#Loads all incubator parameters
			self.elites = elites
			self.children = children
			self.randoms = randoms
			self.population = self.elites + self.children + self.randoms

			self.tournament_size = tournament_size

			self.cross_chance = cross_chance
			self.mutation_chance = mutation_chance

			#Handles shock_value
			if shock_value <= 0:
				self.shock_enabled = False
				self.shock_threshold = 0
			else:
				self.shock_enabled = True
				self.shock_threshold = shock_value

			self.max_cycles = max_cycles

			#Prints incubator summary if verbose is enabled
			if __VERBOSE__:
				print("Incubator Summary:")
				print("sample_path: " + sample_path + "  words_path: " + words_path)
				print("Total population: " + str(self.population))
				print("Elites: " + str(self.elites) + "  Children: " + str(self.children) + "  Randoms: " + str(self.randoms))
				print("Tournament size: " + str(self.tournament_size) + "  Cross chance: " + str(self.cross_chance) + "  Mutation chance: " + str(self.mutation_chance))
				print("Shock enabled: " + str(self.shock_enabled) + "  Shock threshold: " + str(self.shock_threshold))
				print("Max cycles: " + str(self.max_cycles))
				print("\n")

	"""TRAINING FUNCTIONS"""
	#Takes ciphertext, returns a (chromosome, cycles) tuple where the chromosome should decrypt the ciphertext
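	#Each training cycle:
	#	1. Checks every chromosome for a decrypt containing no unknown words (success case)
	#	2. Scores the pool with getFitness and sorts it from fittest to least fit
	#	3. Returns the fittest chromosome so far if max_cycles has been reached
	#	4. Copies the top elites, breeds children from tournament winners via crossover or
	#		mutation, and appends fresh random chromosomes
	#	5. Triggers genetic shock if the best fitness has stagnated for shock_threshold cycles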
	def train(self, cipher_text):
		#Initializes cycle counter
		cycles = 0

		#Generates pool of chromosomes
		chromosomes = []

		for chromosome_iter in range(self.population):
			chromosomes.append(self.getRandomChromosome())

		#Genetic shock trigger variables. Triggers if fitness is stagnant for shock_threshold cycles
		best_fitness = 0
		shock_ticker = 0

		#Starts timer
		start_time = time.time()

		while True:
			#Increments cycle counter
			cycles += 1

			#Creates list of (chromosome, fitness) tuples in order of increasing fitness
			chromosome_fitness = []

			#Checks all chromosomes to see if the correct one has been found
			for chromosome in chromosomes:
				if len(self.spellchecker.unknown((chromosome.convertText(cipher_text)).split(" "))) == 0:
					if __VERBOSE__:
						print("Found key! " + str(chromosome))
						print("Decrypted text:  " + chromosome.convertText(cipher_text))
						print("")

					return (chromosome, cycles)

			#Gets fitness of each chromosome and sorts them according to fitness
			for chromosome in chromosomes:
				chromosome_fitness.append((chromosome, self.getFitness(chromosome, cipher_text)))

			chromosome_fitness.sort(key=lambda x: x[1])
			chromosome_fitness.reverse()

			#Checks if max_cycles exceeded. If so, returns the fittest chromosome
			if cycles >= self.max_cycles:
				print("Best Key: " + str(chromosome_fitness[0][0]))
				print("Decrypted text:  " + chromosome_fitness[0][0].convertText(cipher_text))
				print("")
				return (chromosome_fitness[0][0], cycles)

			#Checks if fitness is stagnant
			if chromosome_fitness[0][1] <= best_fitness:
				shock_ticker += 1
			else:
				best_fitness = max(chromosome_fitness[0][1], best_fitness)
				shock_ticker = 0

			#If __VERBOSE__, provide report on most fit chromosome
			if __VERBOSE__:
				converted_text = chromosome_fitness[0][0].convertText(cipher_text)
				print("Cycle# " + str(cycles))
				print("Best Chromosome: " + str(chromosome_fitness[0][0]))
				print("Fitness: " + str(chromosome_fitness[0][1]))
				print("Shock Ticker: " + str(shock_ticker))
				print("Cycle Time: " + str(time.time()-start_time))
				print("Attempted Decrypt: " + converted_text)
				print("Known words: " + str(self.spellchecker.known((chromosome_fitness[0][0].convertText(cipher_text).split(" ")))))
				print("Unknown words: " + str(self.spellchecker.unknown((chromosome_fitness[0][0].convertText(cipher_text).split(" ")))))
				print("")

			start_time = time.time()

			#Creates a new chromosomes list
			new_chromosomes = []

			#Copies over elite to new chromosomes
			for chromosome_iter in range(self.elites):
				new_chromosomes.append(chromosome_fitness[chromosome_iter][0].clone())

			#Creates children in new_chromosomes

			#Performs tournament process to select breeding candidates
			tournament_selections = []
			while len(tournament_selections) < (self.children):
				tournament_selections.append(self.tournament(chromosome_fitness))

			#Breeds selected candidates
			while len(tournament_selections)>0:
				chance = random.random()
				if chance < self.cross_chance and len(tournament_selections) > 1:
					chromosome_one = tournament_selections.pop()
					chromosome_two = tournament_selections.pop()

					crossed_chromosomes = self.crossChromosomes(chromosome_one, chromosome_two)

					new_chromosomes.append(crossed_chromosomes[0])
					new_chromosomes.append(crossed_chromosomes[1])
				elif chance < (self.mutation_chance + self.cross_chance):
					new_chromosomes.append(self.mutateChromosome(tournament_selections.pop()))
				else:
					new_chromosomes.append(self.getRandomChromosome())

			#Adds random chromosomes to new_chromosomes
			for random_iter in range(self.randoms):
				new_chromosomes.append(self.getRandomChromosome())

			#Checks if genetic shock should be triggered
			if shock_ticker >= self.shock_threshold and self.shock_enabled:
				if __VERBOSE__:
					print("Triggering genetic shock...\n")

				#Performs genetic shock, replacing chromosomes whose fitness exceeds 90% of the best fitness with random ones and mutating all others
				for chromosome_iter in range(len(new_chromosomes)):
					if self.getFitness(new_chromosomes[chromosome_iter], cipher_text) > .9 * best_fitness:
						new_chromosomes[chromosome_iter] = self.getRandomChromosome()
					else:
						new_chromosomes[chromosome_iter] = self.mutateChromosome(new_chromosomes[chromosome_iter])

				#Resets shock tickers and trackers
				shock_ticker = 0
				best_fitness = 0

			#Shifts new_chromosomes into gene pool
			chromosomes = new_chromosomes

	#Returns a mutated chromosome
	def mutateChromosome(self, chromosome):
		new_chromosome = chromosome.clone()

		#Chooses two mappings to swap
		mutation_one_index = random.randint(0,25)
		mutation_two_index = random.randint(0,25)

		while mutation_two_index == mutation_one_index:
			mutation_two_index = random.randint(0,25)

		mutation_one = new_chromosome.mappings[mutation_one_index]
		mutation_two = new_chromosome.mappings[mutation_two_index]

		new_chromosome.removeMapping(mutation_one)
		new_chromosome.removeMapping(mutation_two)

		mapping_one = (mutation_one[0], mutation_two[1])
		mapping_two = (mutation_two[0], mutation_one[1])

		new_chromosome.addMapping(mapping_one)
		new_chromosome.addMapping(mapping_two)

		return new_chromosome

	#Takes two chromosomes and returns two crosses of those chromosomes in the format (new_chromosome_one, new_chromosome_two)
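	#At each position chosen for crossing, the target letter is copied from the other parent and
	#	the mapping that previously produced that letter is redirected to the displaced target,
	#	so both children remain valid one-to-one substitution alphabets.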
	def crossChromosomes(self, chromosome_one, chromosome_two):
		new_chromosome_one = chromosome_one.clone()
		new_chromosome_two = chromosome_two.clone()

		for chromosome_iter in range(26):
			if(random.random() > .5):
				old_mapping_one = new_chromosome_one.mappings[chromosome_iter]
				old_mapping_two = new_chromosome_two.mappings[chromosome_iter]

				if old_mapping_one != old_mapping_two:
					complement_mapping_one = new_chromosome_one.getMappingTarget(old_mapping_two[1])
					complement_mapping_two = new_chromosome_two.getMappingTarget(old_mapping_one[1])

					old_origin_one = complement_mapping_one[0]
					old_origin_two = complement_mapping_two[0]

					new_chromosome_one.removeMapping(complement_mapping_one)
					new_chromosome_two.removeMapping(complement_mapping_two)

					new_chromosome_one.removeMapping(old_mapping_one)
					new_chromosome_two.removeMapping(old_mapping_two)

					complement_mapping_one = (old_origin_two, complement_mapping_one[1])
					complement_mapping_two = (old_origin_one, complement_mapping_two[1])

					new_chromosome_one.addMapping(old_mapping_two)
					new_chromosome_one.addMapping(complement_mapping_two)
					new_chromosome_two.addMapping(old_mapping_one)
					new_chromosome_two.addMapping(complement_mapping_one)

		return (new_chromosome_one, new_chromosome_two)

	#Returns a new random chromosome
	def getRandomChromosome(self):
		new_chromosome = Chromosome()

		origin = []
		destination = []

		for letterIter in range(26):
			origin.append(chr(letterIter+97))
			destination.append(chr(letterIter+97))

		random.shuffle(destination)

		for mappingIter in range(26):
			new_chromosome.addMapping((origin[mappingIter], destination[mappingIter]))

		return new_chromosome

	#Performs a tournament selection of chromosomes based on tournament_size
	def tournament(self, chromosome_fitness):
		tournament_pool = []

		for tournament_iter in range(self.tournament_size):
			tournament_pool.append(chromosome_fitness[random.randint(0, self.population-1)])

		return (max(tournament_pool, key=lambda x: x[1]))[0].clone()

	#Takes a chromosome and cipher_text and evaluates the chromosome's fitness
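	#Fitness is the sum, over every block of the decrypted text that also appears in
	#	sample_block_table, of log2(sample frequency) * (occurrences in the decrypted text).
	#Worked example with hypothetical numbers: if "th" occurs 3 times in the decrypted text and
	#	self.sample_block_table["th"] == 256, that block contributes log2(256) * 3 = 24.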
	def getFitness(self, chromosome, cipher_text):
		total_fitness = 0
		parsed_block_table = self.getBlockTable(chromosome.convertText(cipher_text))

		for block in parsed_block_table.keys():
			if block in self.sample_block_table.keys():
				total_fitness += math.log(self.sample_block_table[block],2)*(parsed_block_table[block])

		return total_fitness

	"""
	BLOCK FUNCTIONS
	"""
	#Returns a dictionary mapping each block in the passed samples file to its frequency.
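	#The expected file format (inferred from the parsing below, not documented in the source) is
	#	one block and its integer count per line, e.g. "th 1500".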
	def getSampleBlockTable(self, sample_path):
		#Opens input file
		input_file = open(sample_path)
		block_table = {}

		for line in input_file:
			components = line.split(" ")

			components[1] = int(components[1].strip())  # drop the trailing newline before converting to int

			block_table[components[0]] = components[1]

		input_file.close()

		return block_table

	#Takes a string and returns a hash table of blocks
	def getBlockTable(self, input_string):
		block_table = {}
		input_words = input_string.split(" ")

		#Hashes blocks in dictionary to count them
		for word in input_words:
			word_blocks = self.getBlocks(word)

			for block in word_blocks:
				if block in block_table:
					block_table[block] += 1
				else:
					block_table[block] = 1

		return block_table

	#Returns all substrings of a passed string
	def getBlocks(self, input_string):
		blocks = []

		for block_len in range(len(input_string)):
			start_point = 0
			end_point = block_len+1

			while end_point <= len(input_string):
				blocks.append(input_string[start_point:end_point])
				end_point+=1
				start_point+=1

		return blocks
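
#A minimal usage sketch for the Incubator above. The file paths, ciphertext and parameter
#values are hypothetical and for illustration only; cross_chance and mutation_chance must
#sum to one, and a shock_value of 0 would disable genetic shock.
incubator = Incubator("samples.txt", "words.txt",
                      elites=2, children=40, randoms=8, tournament_size=5,
                      cross_chance=0.7, mutation_chance=0.3, shock_value=20, max_cycles=500)
best_chromosome, cycles_used = incubator.train("wkh txlfn eurzq ira")  # "the quick brown fox" shifted by three
print(best_chromosome.convertText("wkh txlfn eurzq ira"))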