'''
Created on Feb 8, 2019
Work in Progress
@author: Gordon
'''
from spellchecker import SpellChecker

if __name__ == '__main__':
    pass

spell = SpellChecker(distance=1)

# analyze the cipher text to break the substitution cipher
# return (plain text, cipher text alphabet)
# where cipher text alphabet (CA) is of the form {c1, c2, c3, c4, _, c6, _, ...}
# replace c3 with whatever character in the CA corresponds to c in the PA
# _ denotes any letter about which you are uncertain
def decode(ciphertext):
    '''initializing average frequencies for letters in the English alphabet,
    based on the graph in the assignment document; first index left for spaces
    (most frequent char)'''
    avg_frequencies = [' ', 'e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd', 'l', 'c',
                       'u', 'm', 'w', 'f', 'g', 'y', 'p', 'b', 'v', 'k', 'j', 'x', 'q', 'z']
    #print(avg_frequencies)

    '''initializing array to mark frequencies of each letter, then initializing
    cipher text alphabet to have length of 27'''
    frequency_check = [[0 for x in range(2)] for y in range(27)]
    frequency_check[0][0] = ' '
    for i in range(26):
        frequency_check[i + 1][0] = chr(i + 97)
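# The decode() snippet above stops after building its tables. As a sketch of where the
# frequency-analysis approach heads (a hypothetical helper, not the author's finished
# decoder), the ciphertext's characters can be ranked by frequency and mapped onto the
# avg_frequencies ranking, with '_' for anything outside the top 27:
from collections import Counter

def decode_by_frequency(ciphertext):
    avg_frequencies = [' ', 'e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd', 'l', 'c',
                       'u', 'm', 'w', 'f', 'g', 'y', 'p', 'b', 'v', 'k', 'j', 'x', 'q', 'z']
    # Rank distinct ciphertext characters by descending frequency.
    ranked = [ch for ch, _ in Counter(ciphertext).most_common()]
    # i-th most frequent cipher character -> i-th most frequent English character.
    mapping = {c: avg_frequencies[i] for i, c in enumerate(ranked) if i < len(avg_frequencies)}
    return ''.join(mapping.get(ch, '_') for ch in ciphertext)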
from spellchecker import SpellChecker
import pickle

spell = SpellChecker()
target_words = pickle.load(open('target_words_all.txt', 'rb'))
misspelled = spell.unknown(target_words)
spell.word_frequency.load_words(misspelled)
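# If the augmented frequency list should outlive this session, pyspellchecker can dump it
# to a gzipped dictionary and reload it later. A sketch, assuming a pyspellchecker release
# that supports export() and gzipped local dictionaries (the file name is made up here):
spell.export('augmented_dictionary.gz', gzipped=True)

# Later runs can start from the saved dictionary instead of re-learning the words:
spell_reloaded = SpellChecker(local_dictionary='augmented_dictionary.gz')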
import os
import sys

from bs4 import BeautifulSoup
from spellchecker import SpellChecker

def suppress_punctuation(text):
    """
    Suppress punctuation in a text

    :param text str: Text to clean up
    :returns: Text without punctuation
    :rtype: str
    """
    punctuation = "!:;\",?'’."
    for sign in punctuation:
        text = text.replace(sign, " ")
    return text

# With 'case_sensitive=True', we specify that all words are processed exactly as they are
# written in the text. This means that uppercase words will be flagged as wrong, which
# helps correct them. To use this technique, we have to supply a local dictionary.
spell = SpellChecker(language=None, local_dictionary=sys.argv[3], case_sensitive=True)

for root, dirs, files in os.walk(sys.argv[1]):
    for filename in files:
        dictionary = {}
        with open(sys.argv[1] + filename, 'r') as xml_file:
            print("reading from " + sys.argv[1] + filename)
            soup = BeautifulSoup(xml_file, 'xml')
            for unicode in soup.find_all('Unicode'):
                content = unicode.string
                content = suppress_punctuation(content)
                words = content.split()
                misspelled = spell.unknown(words)
def search(self, query):
    """
    Executes a query over an existing index and returns the number of relevant docs
    and an ordered list of search results.
    Input:
        query - string.
    Output:
        A tuple containing the number of relevant search results, and a list of
        tweet_ids where the first element is the most relevant and the last is the
        least relevant result.
    """
    searcher = Searcher(self._parser, self._indexer, model=self._model)
    self._parser.suspectedEntityDict = {}
    query_as_list = self._parser.parse_sentence(query)

    # Add entities to the query - entities are not added to query_as_list in
    # parse_sentence; suspectedEntityDict holds only entities from the original query.
    for entity in self._parser.suspectedEntityDict:
        query_as_list.append(entity)

    # Clear query from entity parts
    query_as_list = self.clearEntitiesParts(query_as_list)

    ######################################################################################################
    #### INIT SPELL CORRECTION, ADD COVID AND FIND UNKNOWN WORDS IN ORIGINAL QUERY - BEFORE EXPANDING ####
    ######################################################################################################
    # spell checker part
    spellFixer = SpellChecker()
    # add words to the known-word list
    spellFixer.word_frequency.load_words(['covid'])
    # find unknown words - those words will need spell correction
    missSpell = spellFixer.unknown(query_as_list)

    ####################################
    ######## WordNet expansion  ########
    ####################################
    extendedQ = copy.deepcopy(query_as_list)
    for term in query_as_list:
        synset = wordnet.synsets(term)
        try:
            for i in range(2):
                Synonym = synset[i].lemmas()[0].name()
                if term.lower() != Synonym.lower() and Synonym + "~" not in extendedQ:
                    Synonym += "~"
                    extendedQ.append(Synonym)
        except IndexError:
            continue
    query_as_list = extendedQ

    #####################################
    ######## Spelling correction ########
    #####################################
    # add fixed words
    fixedQuery = copy.deepcopy(query_as_list)
    for word in missSpell:
        candidates = list(spellFixer.candidates(word))
        for i in range(2):
            try:
                if candidates[i] not in fixedQuery:
                    fixedQuery.append(candidates[i] + '~')
            except IndexError:
                break

    # returns tuple (numberOFresults, relevantDocIdList)
    numberOFresults, relevantDocIdList = searcher.search(fixedQuery)
    return numberOFresults, relevantDocIdList
def cleanTweet(text, appostrophes=True, emojis=True, html=True, url=True, misspellings=True,
               punctuation=True, lemming=True, stop=True):
    """
    Function to clean text
    Input text: string of text
    Input appostrophes: default True, boolean to clean for apostrophes
    Input emojis: default True, boolean to clean for emojis
    Input html: default True, boolean to clean for html tags
    Input url: default True, boolean to clean for urls
    Input misspellings: default True, boolean to clean for misspellings
    Input punctuation: default True, boolean to clean for punctuation
    Input lemming: default True, boolean to clean with lemming technique
    Input stop: default True, boolean to clean for stop words
    Return filtered_string: filtered string of input text
    """
    if appostrophes:
        # convert apostrophes
        filtered_string = decontracted(text)
    if emojis:
        # decoding, removing emojis
        filtered_string = filtered_string.encode("utf-8").decode('ascii', 'ignore')
    if html:
        # cleaning of html tags
        htmltags = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        filtered_string = re.sub(htmltags, '', filtered_string)
    if url:
        # cleaning of urls (applied to filtered_string, not the raw text, so the
        # earlier cleaning steps are preserved)
        url_re = re.compile(r'https?://\S+|www\.\S+')
        filtered_string = re.sub(url_re, '', filtered_string)
    if misspellings:
        # cleaning of misspellings
        spell = SpellChecker()
        corrected_text = []
        misspelled_words = spell.unknown(filtered_string.split())
        for word in filtered_string.split():
            if word in misspelled_words:
                corrected_text.append(spell.correction(word))
            else:
                corrected_text.append(word)
        filtered_string = " ".join(corrected_text)
    if punctuation:
        word_tokens = word_tokenize(filtered_string)
        # remove punctuation
        table = str.maketrans('', '', string.punctuation)
        filtered_string = [word.translate(table) for word in word_tokens]
        filtered_string = " ".join(filtered_string)
    if lemming:
        # lemming of words
        word_tokens = word_tokenize(filtered_string)
        lemmatizer = WordNetLemmatizer()
        filtered_string = [lemmatizer.lemmatize(word) for word in word_tokens]
    if stop:
        # cleaning of stop words
        stop_words = set(stopwords.words('english'))
        stop_word_drop = []
        for word in filtered_string:
            if word not in stop_words:
                stop_word_drop.append(word)
        filtered_string = " ".join(stop_word_drop)

    # toDos
    # cleaning of rare words
    # tokens is a list of all tokens in corpus
    # freq_dist = nltk.FreqDist(token)
    # rarewords = freq_dist.keys()[-50:]
    # after_rare_words = [word for word in token not in rarewords]
    # cleaning of slang words
    # split attached words; not working and questionable because of all-capital words
    # filtered_string = " ".join(re.findall('[A-Z][^A-Z]*', filtered_string))
    return filtered_string
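# A minimal usage sketch for cleanTweet, assuming the helpers it calls (decontracted, the
# NLTK tokenizer/lemmatizer/stopwords, re, string, SpellChecker) are imported in the module.
# The tweet text is invented for illustration:
tweet = "I'm sooo happpy todayy!! :) check http://example.com <b>wow</b>"
print(cleanTweet(tweet))
# Exact output depends on the spell checker's corrections, e.g. something like
# "i so happy today check wow"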
import pyspelling
from spellchecker import SpellChecker
from nltk.tokenize import TweetTokenizer

token = TweetTokenizer()
s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
token.tokenize(s0)

spell = SpellChecker(language="en", case_sensitive=False)

misspelled1 = token.tokenize(s0)
# Removing items from a list while iterating over it skips elements, which is why the
# original repeated the same removal loop four times. Print the non-alphabetic tokens,
# then drop them all in a single pass with a comprehension.
for word in misspelled1:
    if not word.isalpha():
        print(word)
misspelled1 = [word for word in misspelled1 if word.isalpha()]
print(len(misspelled1))

# for word in misspelled:
#     # Get the one `most likely` answer
def __init__(self, dbConnection, redis_client):
    self.db = dbConnection
    self.spell_checker = SpellChecker()
    self.lemmatizer = WordNetLemmatizer()
    self.redis_client = redis_client
corpus_trainer = ChatterBotCorpusTrainer(hal)
corpus_trainer.train("chatterbot.corpus.english.ai")
corpus_trainer.train("chatterbot.corpus.english.conversations")
corpus_trainer.train("chatterbot.corpus.english.computers")
corpus_trainer.train("chatterbot.corpus.english.emotion")
corpus_trainer.train("chatterbot.corpus.english.greetings")
corpus_trainer.train("chatterbot.corpus.english.movies")

# trains HAL using the training data defined in trainingData.py
conversationTrainer = ListTrainer(hal)
conversationTrainer.train(trainingData.casualConversation)
conversationTrainer.train(trainingData.basicAdvice)
conversationTrainer.train(trainingData.advisor)
conversationTrainer.train(trainingData.gpaToTransfer)

correctTypos = SpellChecker()
tag_list = [
    'cs 149', 'ise 164', 'cs 146', 'cmpe 131', 'cmpe 120', 'cmpe 102', 'cmpe 133', 'cmpe 148',
    'cmpe 165', 'cmpe 172', 'cmpe 187', 'cmpe 195a', 'cmpe 195b', 'engr 195a', 'engr 195b',
    'engr 195', 'cmpe 195', 'cmpe195', 'engr195', 'cs 151', 'cs 157a', 'cs 166', 'cs149',
    'ise164', 'cs146', 'cmpe131', 'cmpe120', 'cmpe102', 'cmpe133', 'cmpe148', 'cmpe165',
    'cmpe172', 'cmpe187', 'cmpe195a', 'cmpe195b', 'engr195a', 'engr195b', 'engr195', 'cmpe195',
    'cs151', 'cs157a', 'cs166', 'how many units should i take', 'cmpe 137', 'cmpe137',
    'cmpe 139', 'cmpe139', 'cmpe 152', 'cmpe152', 'cmpe 185', 'cmpe185', 'cmpe 181', 'cmpe181',
    'cmpe 182', 'cmpe182', 'cmpe 183', 'cmpe183', 'cmpe 185', 'cmpe185', 'cmpe 188', 'cmpe188',
    'cmpe 189', 'cmpe189', 'cs 116a', 'cs116a', 'cs 134', 'cs134', 'cs 152', 'cs152'
]
import pandas as pd
import unidecode
from deep_translator import GoogleTranslator
from spellchecker import SpellChecker

from bamboo_lib.logger import logger
from bamboo_lib.connectors.models import Connector
from bamboo_lib.steps import WildcardDownloadStep, LoadStep
from bamboo_lib.models import EasyPipeline, PipelineStep
from bamboo_lib.helpers import grab_connector
from util import *

speller = SpellChecker(language='es')

class ReadStep(PipelineStep):
    def run_step(self, prev_result, params):
        logger.info('Running read step...')
        files_list = prev_result
        df = pd.DataFrame()
        for file_ in files_list:
            temp_df = pd.read_excel(file_[0])
            temp_df.columns = temp_df.columns.str.lower()
            if file_[1]['filename'] in ['pef_ac01_avance_2t_2020', 'pef_ac01_avance_2t_2021',
                                        'pef_ac01_avance_3t_2020', 'pef_ac01_avance_1t_2021',
                                        'pef_ac01_avance_1t_2020']:
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['something', 'beana', 'hapenning', 'here'])

for word in misspelled:
    print(word)
    # Get the one `most likely` answer
    print(spell.correction(word))
    # Get a list of `likely` options
    print(spell.candidates(word))
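# The defaults can be tuned: the snippet above uses edit distance 2 (the default); a
# smaller distance is faster on long words, and domain words can be taught up front so
# they stop showing up as unknown. A short sketch using the documented API:
from spellchecker import SpellChecker

spell = SpellChecker(distance=1)  # edit distance 1: faster, less aggressive
spell.word_frequency.load_words(['pyspellchecker', 'chatterbot'])

print(spell.known(['pyspellchecker']))    # {'pyspellchecker'} - now a known word
print(spell.unknown(['pyspellchecker']))  # set()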
def __init__(self):
    self.spell = SpellChecker()
def Tokenize_word(self, text):
    ######## Thai word segment ######## ver1
    '''sent = text[0].replace("'", "")
    word = word_tokenize(sent, engine='deepcut')  # use this method
    wword = [x.replace('.', ' ').replace('=', ' ').replace('-', ' ').replace("(", " ").replace(")", " ").replace("/", " ").replace('สำหรับ', ' ').replace('%', ' ').strip(' ') for x in word]
    words = []
    for w in wword:
        if w not in common.thai_stopwords():
            words = [str for str in words if str]
            words.append(w)
    return words'''

    ######## Thai word segment ######## ver2 -> stopwords, type of words, check spell (Eng & Thai)
    sent = text[0].replace("'", "")
    word = word_tokenize(sent, engine='attacut')  # use this method
    #wword = [x.replace('=', ' ').replace('-', ' ').replace("(", " ").replace(")", " ").replace("/", " ").strip(' ') for x in word]
    th_no_stopwords = []
    all_no_stopwords = []
    th_correct_words = []
    eng_correct_words = []
    mix_correct_words = []
    mix1_correct_words = []
    all_correct_words = []
    all_correct_words_final = []
    check_thai_list = []
    #for tw in wword:
    for tw in word:
        if tw not in common.thai_stopwords():
            th_no_stopwords = [str for str in th_no_stopwords if str]
            th_no_stopwords.append(tw)
    #print("th_no_stopwords = ", th_no_stopwords)
    for ew in th_no_stopwords:
        if ew not in stopwords.words('english'):
            all_no_stopwords = [str for str in all_no_stopwords if str]
            all_no_stopwords.append(ew)
    #print("all_no_stopwords = ", all_no_stopwords)
    for c in all_no_stopwords:
        thai = isthai(c)
        number = c.isnumeric()
        if not thai:
            no_num = c.isalpha()
            match1 = re.findall('\D', c)  # matches when the string contains characters other than digits 0-9
            if no_num:
                spell = SpellChecker()
                eng_correct = spell.correction(c)  #pn
                eng_correct_words.append(eng_correct)
                #print("eng = ", eng_correct)
            elif match1:
                mix = c
                mix_correct_words.append(mix)
                #print("mix = ", mix)
            else:
                num = c  # no return
                #print("num = ", num)
        elif thai:
            checker = NorvigSpellChecker(custom_dict=tnc.word_freqs())  #pn
            th_correct = checker.correct(c)
            th_correct_words.append(th_correct)
            #print("thai = ", th_correct)
    all_correct_words = th_correct_words + eng_correct_words + mix_correct_words
    all_correct_words = [x.replace('น.', '').replace(':', ' ').replace('=', ' ').replace('–', ' ').replace("(", " ").replace(")", " ").replace("/", " ").strip(" ") for x in all_correct_words]
    all_correct_words_final = list(filter(None, all_correct_words))
    #print("words = ", all_correct_words_final)
    return all_correct_words_final

    ######## Eng word segment ########
    '''word = text[0]
def test_word_contains(self):
    ''' test the contains functionality '''
    spell = SpellChecker()
    self.assertEqual(spell['the'], 6187925)
def test_word_in(self):
    ''' test the use of the `in` operator '''
    spell = SpellChecker()
    self.assertTrue('key' in spell)
    self.assertFalse('rando' in spell)
import pycountry
import os
import os.path
import wikipedia
from weather import Weather
import re
import pyjokes
import requests
import wolframalpha
import webbrowser
import glob
from PyDictionary import PyDictionary
import tmdbsimple as tmdb
import nltk
from nltk.corpus import wordnet
from spellchecker import SpellChecker  # needed for SpellChecker() below

spell = SpellChecker()
st1 = []
hi = []
thanks = []
yes = []
for syn in wordnet.synsets("hi"):
    for l in syn.lemmas():
        hi.append(l.name())

app_id = 'XPAQWX-W5LAG5ELYL'
client = wolframalpha.Client(app_id)
tmdb.API_KEY = '60222ace6396c345f94cc42eaac5dae5'
doss = os.getcwd()
i = 0
n = 0
flag = 0
dictionary = PyDictionary()
import csv
import string
import pandas as pd
import numpy as np
from spellnCorrection import *
from preprocessing import *
from spellchecker import SpellChecker

spell = SpellChecker(language='fr')

""" Step 1: Extract data from file """
dataFile = './collaborativeActs.csv'
df = pd.read_csv(dataFile, delimiter="\t", header=None, error_bad_lines=False, encoding="utf8")
X = np.array(df)

""" Step 2: Spell and Correction """
sentence = []
WRONG = []
for i in range(X.shape[0]):
    if i != 0:
        utterance = X[i][7]
        tokens = normalization(utterance)
        tokens = tokenization(tokens)
        tokens = [token.text for token in tokens]
def __init__(self):
    self.tokenizer = RegexpTokenizer(r'\w+')
    self.spell = SpellChecker()
def correct_spelling(text):
    spell_checker = SpellChecker()
    return " ".join([spell_checker.correction(w) for w in text.split(" ")])
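# Caveat: recent pyspellchecker releases return None from correction() when no candidate
# is found, which would make the join above raise a TypeError. A defensive variant
# (a sketch, assuming that None-returning behavior):
def correct_spelling_safe(text):
    spell_checker = SpellChecker()
    # Fall back to the original word whenever correction() has no suggestion.
    return " ".join(spell_checker.correction(w) or w for w in text.split(" "))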
from spellchecker import SpellChecker
from pynput.keyboard import Key, Listener
import logging
from pyautogui import typewrite, hotkey

spell = SpellChecker('en')
Elissa_dict = './dictionary.txt'
spell.word_frequency.load_text_file(Elissa_dict, 'utf-8')

dirty = open('./bad_words.txt', 'r', encoding='utf-8-sig')
junk = dirty.read().splitlines()
dirty.close()
spell.word_frequency.remove_words(junk)

def autocorrect():
    current_string = open('./keyLog.txt', 'r+', encoding='utf-8-sig')
    string = current_string.read()
    current_string.truncate(0)
    correct = spell.correction(string)
    if len(string) != 0 and string != correct:
        return correct

log_dir = "C:/Users/Cameron/Desktop/1P03/LawtoCorrect/Backend"
# note the path separator so the log file does not land next to, and fused with, the dir name
logging.basicConfig(filename=(log_dir + "/keyLog.txt"), level=logging.DEBUG, format='%(message)s')
keys = []
def __init__(self, data: pd.DataFrame, column_name: str, language: str, local_dictionary=None, distance=2):
    self.__strings = data[column_name]
    self.__spell_checker = SpellChecker(language=language, local_dictionary=local_dictionary, distance=distance)
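# A hypothetical instantiation of the class this __init__ belongs to (called
# ColumnSpellChecker here only for illustration; the real class name is not shown above):
import pandas as pd

df = pd.DataFrame({'comment': ['helo world', 'speling is hard']})
checker = ColumnSpellChecker(df, column_name='comment', language='en')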
class StoresManagerInterface:
    def __init__(self, users_manager):
        self.stores_manager = StoresManager()
        self.spell_checker = SpellChecker()
        self.users_manager = users_manager

    def search_product(self, search_term: str = "", categories: [str] = [], key_words: [str] = []) \
            -> {int: [Product]}:
        """
        Args:
            search_term:
            categories:
            key_words:
        Returns:
        """
        logger.log("called search with search term:%s, categories:%s, key words:%s",
                   search_term, categories, key_words)
        return self.stores_manager.search(
            self.spell_checker.correction(search_term),
            [self.spell_checker.correction(word) for word in categories],
            [self.spell_checker.correction(word) for word in key_words])

    def add_purchase_to_store(self, store_id: int, purchase: Purchase):
        return self.stores_manager.add_purchase_to_store(store_id, purchase)

    # def search(self, search_term: str = "", categories: [str] = None, key_words: [str] = None) -> {Store: [Product]}:
    #     return self.stores_manager.search(search_term, categories, key_words)

    def add_product_to_store(self, store_id: int, user_name: str, product_name: str, product_price: int,
                             product_categories: [str], key_words: [str], amount) -> bool:
        logger.log("user %s called add product to store no.%d. product name:%s"
                   " product price:%d product categories:%s,key words:%s, amount:%d",
                   user_name, store_id, product_name, product_price, product_categories, key_words, amount)
        managed_stores = jsonpickle.decode(self.users_manager.get_managed_stores(user_name))
        if store_id in managed_stores:
            return self.stores_manager.add_product_to_store(store_id, user_name, product_name, product_price,
                                                            product_categories, key_words, amount)
        return False

    def appoint_manager_to_store(self, store_id, owner, to_appoint):
        logger.log("user %s call appoint manager %s to store no.%d", owner, to_appoint, store_id)
        if store_id in self.users_manager.get_managed_stores(owner) and \
                self.users_manager.check_if_registered(to_appoint):
            if self.stores_manager.appoint_manager_to_store(store_id, owner, to_appoint):
                self.users_manager.add_managed_store(to_appoint, store_id)
                return True
        return False

    def appoint_owner_to_store(self, store_id, owner, to_appoint):
        logger.log("user %s call appoint owner %s to store no.%d", owner, to_appoint, store_id)
        if str(store_id) in self.users_manager.get_managed_stores(owner) and \
                self.users_manager.check_if_registered(to_appoint):
            if self.stores_manager.appoint_owner_to_store(store_id, owner, to_appoint):
                self.users_manager.add_managed_store(to_appoint, store_id)
                return True
        return False

    def add_permission_to_manager_in_store(self, store_id, owner, manager, permission: str):
        logger.log("user %s add %s permission to %s in store no.%d", owner, permission, manager, store_id)
        if str(store_id) in self.users_manager.get_managed_stores(owner) and \
                str(store_id) in self.users_manager.get_managed_stores(manager):
            return self.stores_manager.add_permission_to_manager_in_store(store_id, owner, manager, permission)
        return False

    def remove_permission_from_manager_in_store(self, store_id, owner, manager, permission: str):
        logger.log("user %s remove %s permission to %s in store no.%d", owner, permission, manager, store_id)
        if store_id in self.users_manager.get_managed_stores(owner) and \
                store_id in self.users_manager.get_managed_stores(manager):
            self.stores_manager.remove_permission_from_manager_in_store(store_id, owner, manager, permission)

    def open_store(self, owner: str, store_name):
        logger.log("user %s open %s store", owner, store_name)
        if self.users_manager.check_if_registered(owner):
            store_id = self.stores_manager.open_store(owner, store_name)
            self.users_manager.add_managed_store(owner, store_id)
            return store_id
        return -1

    def buy(self, cart):
        self.stores_manager.buy(cart)

    def get_sales_history(self, store_id, user) -> [Purchase]:
        logger.log("user %s get sales history of store no.%d", user, store_id)
        if self.users_manager.check_if_registered(user) and (
                str(store_id) in self.users_manager.get_managed_stores(user) or
                self.users_manager.is_admin(user)):
            return self.stores_manager.get_sales_history(store_id, user, self.users_manager.is_admin(user))

    def remove_product(self, store_id, product_name, username):
        return self.stores_manager.remove_produce_from_store(store_id, product_name, username)

    def add_discount_to_product(self, store_id, product_name, username, start_date, end_date, percent):
        return self.stores_manager.add_visible_discount_to_product(store_id, product_name, username,
                                                                   start_date, end_date, percent)

    def update_product(self, store_id, username, product_name, attribute, updated):
        return self.stores_manager.update_product(store_id, username, product_name, attribute, updated)

    def remove_manager(self, store_id, owner, to_remove):
        if self.stores_manager.remove_manager(store_id, owner, to_remove):
            self.users_manager.remove_managed_store(to_remove, store_id)
            return True
        return False

    def remove_owner(self, store_id, owner, to_remove):
        return self.stores_manager.remove_owner(store_id, owner, to_remove)
import json
from spellchecker import SpellChecker

sp = SpellChecker()
data = json.load(open("data.json"))
word = input("enter the word:")
# print(sp)  # debug

def defi(w):
    w = w.lower()
    if w in data:
        print(data[w])
    else:
        msp = sp.correction(w)
        if msp != w:
            print("did you mean the word: {}".format(msp))
        else:
            print("the word doesn't exist in the data")

defi(word)
def __init__(self, users_manager):
    self.stores_manager = StoresManager()
    self.spell_checker = SpellChecker()
    self.users_manager = users_manager
from sklearn.metrics.pairwise import linear_kernel
from sklearn.externals import joblib
import re
import os
from ChatbotBackend.settings import BASE_DIR
import spacy
import sqlite3
from spellchecker import SpellChecker
from pattern.de import parse

nlp = spacy.load("de_core_news_sm")

DATABASE_PATH = os.path.join(BASE_DIR, 'requirements/openthe.db')
stmt = ("SELECT term.word FROM term, synset, term term2 "
        "WHERE synset.id = term.synset_id AND term2.synset_id = synset.id AND term2.word = ?")

spell = SpellChecker(language='de')

def process_text(text):
    annotated = []
    parsed_text = parse(text, lemmata=True)
    doc = parsed_text.split(" ")
    for token in doc:
        pos_tag = token.split("/")[1]
        lemma = token.split("/")[4]
        if pos_tag == ".":
            continue
        current_token = token.split("/")[0]
        if current_token not in spell:
            current_token = spell.correction(current_token)
class StyloDocument(object):

    DEFAULT_AUTHOR = "Unknown"

    def __init__(self, file_content, author=DEFAULT_AUTHOR):
        self.author = author.strip()
        self.raw_content = file_content
        self.file_content = file_content.lower()
        self.tokens = PortugueseTextualProcessing.tokenize(self.file_content)
        self.text = Text(self.tokens)
        self.fdist = FreqDist(self.text)
        self.sentences = sent_tokenize(self.file_content, language='portuguese')
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
        self.paragraphs = [p for p in self.file_content.split("\n\n") if len(p) > 0 and not p.isspace()]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
        self.punctuation = [".", ",", ";", "-", ":"]
        self.ner_entities = ['ABSTRACCAO', 'ACONTECIMENTO', 'COISA', 'LOCAL', 'ORGANIZACAO', 'OBRA',
                             'OUTRO', 'PESSOA', 'TEMPO', 'VALOR']
        self.white_spaces = len(self.file_content.split(' '))
        self.rich_tags = RichTags(PortugueseTextualProcessing.get_rich_tags(self.file_content), len(self.text))
        self.tagged_sentences = PortugueseTextualProcessing.postag(self.tokens)
        self.tagfdist = FreqDist([b for [(a, b)] in self.tagged_sentences])
        self.ner_tags = PortugueseTextualProcessing.ner_chunks(self.tokens)
        self.ner_ftags = FreqDist(self.ner_tags)
        self.spell = SpellChecker(language='pt')
        self.ROUNDING_FACTOR = 4
        self.LINE_BREAKS = ['\n', '\t', '\r']

    def get_tag_count_by_start(self, tag_start):
        count = 0
        for tag in self.tagfdist.keys():
            if tag.startswith(tag_start):
                count += self.tagfdist[tag]
        return count

    def get_class_frequency_by_start(self, tag_start):
        return self.get_tag_count_by_start(tag_start) / self.tagfdist.N()

    def get_total_not_found(self):
        """The wn is not being reliable so far"""
        nf_tokens = self.get_tokens_by_tag('notfound')
        return len([i for i in nf_tokens if len(wn.synsets(i, lang='por')) == 0])

    def tag_frequency(self, tag):
        return self.tagfdist.freq(tag)

    def entity_frequency(self, tag):
        return self.ner_ftags.freq(tag)

    def get_tokens_by_tag(self, tag):
        return [i[0][0] for i in self.tagged_sentences if i[0][1] == tag]

    def get_long_sentence_freq(self):
        return len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE]) / len(self.sentences)

    def get_short_sentence_freq(self):
        return len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE]) / len(self.sentences)

    def get_long_short_sentence_ratio(self):
        """RF FOR PAN 15"""
        return len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE]) / \
            (len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE]))

    def get_sentence_starting_tags_ratio(self, tag):
        count = [i[0][1] for i in self.tagged_sentences].count(tag)
        return count / len(self.sentences)

    def term_per_hundred(self, term):
        """
        term       X
        ----- = ------
          N      100
        """
        return (self.fdist[term] * 100) / self.fdist.N()

    def mean_sentence_len(self):
        return np.mean(self.sentence_word_length)

    def std_sentence_len(self):
        return np.std(self.sentence_word_length)

    def mean_paragraph_len(self):
        return np.mean(self.paragraph_word_length)

    def std_paragraph_len(self):
        return np.std(self.paragraph_word_length)

    def flesh_index(self):
        idx, value = PortugueseTextualProcessing().get_ptBR_flesch_index(self.tokens, self.get_phrases())
        return idx

    def vocabulary(self):
        return [v for v in sorted(set(self.sentences)) if v not in self.punctuation]

    def mean_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        word_chars = [len(word) for word in words]
        return sum(word_chars) / float(len(word_chars))

    def max_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        return max([len(word) for word in words])

    def type_token_ratio(self):
        return (len(set(self.text)) / len(self.text)) * 100

    def unique_words_per_hundred(self):
        return self.type_token_ratio() / 100.0 * 100.0 / len(self.text)

    def document_len(self):
        return sum(self.sentence_chars)

    def get_phrases(self):
        return [i for i in self.file_content.split('.') if i != '']

    def mean_syllables_per_word(self):
        _, syllable_count = PortugueseTextualProcessing().get_syllable_counts(self.tokens)
        return syllable_count / len(self.tokens)

    def characters_frequency(self, character_list):
        return self.frequency([word for word in self.file_content if word in character_list])

    def digits_frequency(self):
        return self.frequency([word for word in self.file_content if word.isdigit()])

    def line_breaks_frequency(self):
        return self.frequency([word for word in self.file_content if word in self.LINE_BREAKS])

    def count_consonant_frequency(self):
        character_list = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's',
                          't', 'v', 'w', 'y', 'x', 'z']
        return self.frequency([word for word in self.file_content if word in character_list])

    def camel_case_frequency(self):
        return self.frequency([word for word in self.raw_content.split(' ')
                               if word and word[0].isupper() and (len(word) == 1 or word[1].islower())])

    def local_hapax_legommena_frequency(self):
        return len(self.fdist.hapaxes()) / len(self.text.tokens)

    def collocations_frequency(self, size):
        """words that often appear consecutively in the window_size"""
        return len(self.text.collocation_list(window_size=size)) / len(self.text.tokens)

    def most_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).max()

    def mean_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).most_common(3)[1][0]

    def guiraud_R_measure(self):
        return len(set(self.text)) / math.sqrt(len(self.text))

    def herdan_C_measure(self):
        # log V(N) / log N
        return math.log2(len(set(self.text))) / math.log2(len(self.text))

    def herdan_V_measure(self):
        # N ^ C
        return math.pow(len(self.text), self.herdan_C_measure())

    def K_measure(self):
        # log V(N) / log(log(N))
        return math.log2(len(set(self.text))) / math.log2(math.log2(len(self.text)))

    def dugast_U_measure(self):
        # log ^ 2 N / (log N - log V(N))
        return math.pow(math.log2(len(self.text)), 2) / (math.log2(len(self.text)) - math.log2(len(set(self.text))))

    def maas_A_measure(self):
        # a ^ 2 = (log N - log V(N)) / log ^ 2 N
        return math.sqrt((math.log2(len(self.text)) - math.log2(len(set(self.text)))) / math.pow(math.log2(len(self.text)), 2))

    def LN_measure(self):
        # (1 - V(N) ^ 2) / (V(N) ^ 2 * log N)
        return (1 - math.pow(len(set(self.text)), 2)) / (math.pow(len(set(self.text)), 2) * math.log2(len(self.text)))

    def honores_H_measure(self):
        return len(self.fdist.hapaxes()) / len(set(self.text))

    def spell_miss_check_frequency(self):
        return self.frequency(self.spell.unknown(self.text))

    def noun_phrases(self):
        return PortugueseTextualProcessing().get_number_of_noun_phrases(self.tokens) / len(self.text)

    def verb_phrases(self):
        return self.frequency(PortugueseTextualProcessing().get_number_of_verb_phrases(self.file_content))

    def monosyllables(self):
        return PortugueseTextualProcessing().get_monosyllable_counts(self.tokens) / len(self.text)

    def repeated_words_frequency(self):
        repeated_words = list(filter(lambda x: x[1] >= 2,
                                     FreqDist(PortugueseTextualProcessing().remove_stopwords(self.tokens)).items()))
        return self.frequency(repeated_words)

    def stop_word_freq(self):
        clean_words = PortugueseTextualProcessing().remove_stopwords(self.tokens)
        return (len(self.tokens) - len(clean_words)) / len(self.text)

    def get_logical_operator_frequency(self):
        return self.frequency([token for token in self.tokens
                               if token in PortugueseTextualProcessing.LOGICAL_OPERATORS])

    def get_tags_freq(self, tags):
        count = 0
        for tag in tags:
            count += self.get_tag_count_by_start(tag)
        return count / len(self.tokens)

    def find_quotes(self):
        """Improve this method to retrieve quotes based on patterns and special words
        egs: p.43; segundo (autor, ano)
        """
        return self.characters_frequency(['“', '”'])

    def frequency(self, input_values):
        return len(input_values) / len(self.text)

    @classmethod
    def csv_header(cls):
        return (
            ['DiversidadeLexica', 'TamanhoMedioDasPalavras', 'TamanhoMedioSentencas', 'StdevSentencas',
             'TamanhoMedioParagrafos', 'StdevTamParagrafos', 'FrequenciaDeParagrafos',
             'FrequenciaPalavrasDuplicadas', 'MediaSilabasPorPalavra', 'Monossilabas',
             'Ponto', 'Virgulas', 'Exclamacoes', 'DoisPontos', 'Citacoes', 'QuebrasDeLinha', 'Digitos',
             'Adjetivos', 'Adverbios', 'Artigos', 'Substantivos', 'Preposicoes', 'Verbos', 'VerbosPtcp',
             'Conjuncoes', 'Pronomes', 'PronomesPorPreposicao', 'TermosNaoTageados', 'PalavrasDeConteudo',
             'PalavrasFuncionais', 'FrasesNominais', 'FrasesVerbais', 'GenMasc', 'GenFem', 'SemGenero',
             'Singular', 'Plural', 'PrimeiraPessoa', 'TerceiraPessoa', 'Passado', 'Presente', 'Futuro',
             'TotalEntidadesNomeadas', 'EntAbstracao', 'EntAcontecimento', 'EntCoisa', 'EntLocal',
             'EntOrganizacao', 'EntObra', 'EntOutro', 'EntPessoa', 'EntTempo', 'EntValor',
             'GuiraudR', 'HerdanC', 'HerdanV', 'MedidaK', 'DugastU', 'MaasA', 'HonoresH',
             'PalavrasErroOrtografico', 'HapaxLegomenaLocal', 'PalavrasComunsTam2', 'PalavrasComunsTam3',
             'PalavrasComunsTam4', 'StopWords', 'BRFleshIndex', 'OperadoresLogicos', 'PalavrasCapitalizadas',
             'Author']
        )

    def csv_output(self):
        # TODO: Separate features into syntactical, lexical and so on..
        # 69 features + 1 class
        return "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},'{}'".format(
            # Text style features - 10
            round(self.type_token_ratio(), self.ROUNDING_FACTOR),
            round(self.mean_word_len(), self.ROUNDING_FACTOR),
            round(self.mean_sentence_len(), self.ROUNDING_FACTOR),
            round(self.std_sentence_len(), self.ROUNDING_FACTOR),
            round(self.mean_paragraph_len(), self.ROUNDING_FACTOR),
            round(self.std_paragraph_len(), self.ROUNDING_FACTOR),
            len(self.paragraphs) / len(self.text),
            round(self.repeated_words_frequency(), self.ROUNDING_FACTOR),
            self.mean_syllables_per_word(),
            self.monosyllables(),
            # Term count features - 7
            self.term_per_hundred('.'),
            self.term_per_hundred(','),
            self.term_per_hundred('!'),
            self.term_per_hundred(':'),
            self.find_quotes(),
            self.line_breaks_frequency(),
            self.digits_frequency(),
            # POSTAG features - 24
            self.tag_frequency('ADJ'),
            self.tag_frequency('ADV'),
            self.tag_frequency('ART'),
            self.tag_frequency('N'),
            self.tag_frequency('PREP'),
            self.tag_frequency('PCP'),  # verbo no participio
            self.get_class_frequency_by_start('V'),
            self.get_class_frequency_by_start('K'),  # conjunções
            self.get_class_frequency_by_start('PRO'),
            self.get_class_frequency_by_start('PRO') / self.tag_frequency('PREP'),  # used in french texts
            self.tag_frequency('notfound'),
            self.get_tags_freq(PortugueseTextualProcessing.CONTENT_TAGS),
            self.get_tags_freq(PortugueseTextualProcessing.FUNCTIONAL_TAGS),
            round(self.noun_phrases(), self.ROUNDING_FACTOR),
            round(self.verb_phrases(), self.ROUNDING_FACTOR),
            self.rich_tags.get_male(),
            self.rich_tags.get_female(),
            self.rich_tags.get_unspecified_gender(),
            self.rich_tags.get_singular(),
            self.rich_tags.get_plural(),
            self.rich_tags.get_first_person(),
            self.rich_tags.get_third_person(),
            self.rich_tags.get_past_tense(),
            self.rich_tags.get_present_tense(),
            self.rich_tags.get_future_tense(),
            # NER features - 11
            round(len(self.ner_tags) / len(self.tokens), self.ROUNDING_FACTOR),
            self.entity_frequency('ABSTRACCAO'),
            self.entity_frequency('ACONTECIMENTO'),
            self.entity_frequency('COISA'),
            self.entity_frequency('LOCAL'),
            self.entity_frequency('ORGANIZACAO'),
            self.entity_frequency('OBRA'),
            self.entity_frequency('OUTRO'),
            self.entity_frequency('PESSOA'),
            self.entity_frequency('TEMPO'),
            self.entity_frequency('VALOR'),
            # Vocabulary diversity features - 7
            round(self.guiraud_R_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_C_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_V_measure(), self.ROUNDING_FACTOR),
            round(self.K_measure(), self.ROUNDING_FACTOR),
            round(self.dugast_U_measure(), self.ROUNDING_FACTOR),
            round(self.maas_A_measure(), self.ROUNDING_FACTOR),
            round(self.honores_H_measure(), self.ROUNDING_FACTOR),
            # Misc features - 9
            self.spell_miss_check_frequency(),
            round(self.local_hapax_legommena_frequency(), self.ROUNDING_FACTOR),
            self.collocations_frequency(2),
            self.collocations_frequency(3),
            self.collocations_frequency(4),
            round(self.stop_word_freq(), self.ROUNDING_FACTOR),
            self.flesh_index(),
            self.get_logical_operator_frequency(),
            self.camel_case_frequency(),
            self.author,
        )

    def legacy_features(self):
        """Removed features that are kept here for future reference"""
        # self.count_characters_frequency(['a']),
        # self.count_characters_frequency(['e']),
        # self.count_characters_frequency(['i']),
        # self.count_characters_frequency(['o']),
        # self.count_characters_frequency(['u']),
        # self.count_consonant_frequency(),
        # self.mean_frequent_word_size(),
        # self.max_word_len(),
        # self.document_len(),
        # round(self.LN_measure(), 8)
        pass
import tensorflow as tf
import tensorflowjs as tfjs
import numpy as np
from spellchecker import SpellChecker

spell = SpellChecker()
skills = ['Research', 'Engineer', 'Developer', 'Python', 'Programming', 'PhD', 'Publications']

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

text = open('../../data/sentences.csv', 'rb').read().decode(encoding='utf-8').replace(',', ' ')
text = text.replace('\n', '. ')
text = text.replace('\r', '')
text = text.replace('asa', 'as a')

vocab = sorted(set(text))
# Map from unique characters to indices
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

forwards_dir = '../training/training_checkpoints3'
def main():
    print('Loading models ...')
    inputFolder1 = './input/'
    spellchecker = SpellChecker()
    embeddings = WordSimilarity(spellchecker)
    resultFile1 = open("result/result_top1.txt", "w")
    resultFile2 = open("result/result_top5.txt", "w")
    resultFile3 = open("result/result_visual.txt", "w")
    resultFile4 = open("result/result_text.txt", "w")
    start_time = time.time()
    for classDir in os.listdir(inputFolder1):
        predicted1 = []
        predicted5 = []
        vis_predict = []
        text_predict = []
        inputFolder = inputFolder1 + classDir + '/'
        for filename in os.listdir(inputFolder):
            if filename.endswith(".jpg") or filename.endswith(".jpeg"):
                try:
                    sceneRecognizer = SceneRecognizer()
                    txtDetector = SceneTextDetector()
                    txtRecognizer = SceneTextRecognizer()
                    print(classDir + '->' + filename)
                    print('\n')
                    imgPath = rescaleImage(inputFolder, filename)
                    inds, output_prob, labels, sub_labels = sceneRecognizer.recognize(imgPath)
                    subClass = []
                    for sbl in sub_labels:
                        subClass.append(str(sbl.split(' ')[0]))
                    print("Getting visual features...")
                    visulaScores = {}
                    totalVisualScore = 0
                    for iterating_var in inds:
                        className = labels[iterating_var].split(' ')[0]
                        if className in subClass:
                            score = float(output_prob[iterating_var])
                            visulaScores[className] = score
                            totalVisualScore = score + totalVisualScore
                    for tmp in subClass:
                        tempScore = float(visulaScores[tmp]) / float(totalVisualScore)
                        visulaScores[tmp] = tempScore
                    print("Getting textual features...")
                    # scene text detection phase
                    outputName = txtDetector.detect(imgPath)
                    # scene text recognition phase
                    words = txtRecognizer.recognize(outputName)
                    textualScores = embeddings.checkSemanticSimilarity(subClass, words)
                    print("fusing scores...")
                    finalScore = LBF(subClass, visulaScores, textualScores, 0.4, 0.6)
                    finalScore = sorted(finalScore.items(), key=operator.itemgetter(1), reverse=True)
                    finalScore = finalScore[0:5]
                    actual = subClass.index(classDir)
                    for item in finalScore:
                        index = subClass.index(item[0])
                        if finalScore.index(item) == 0:
                            value = str(index) + '|' + str(item[1])
                            predicted1.append(value)
                        if index == actual:
                            predicted5.append(actual)
                            break
                        elif finalScore.index(item) == 4:
                            predicted5.append(subClass.index(finalScore[0][0]))
                    visulaScores = sorted(visulaScores.items(), key=operator.itemgetter(1), reverse=True)
                    textualScores = sorted(textualScores.items(), key=operator.itemgetter(1), reverse=True)
                    visulaScores = visulaScores[0:5]
                    textualScores = textualScores[0:5]
                    for item in visulaScores:
                        index = subClass.index(item[0])
                        if visulaScores.index(item) == 0:
                            value = str(index) + '|' + str(item[1])
                            vis_predict.append(value)
                        break
                    for item in textualScores:
                        index = subClass.index(item[0])
                        if textualScores.index(item) == 0:
                            value = str(index) + '|' + str(item[1])
                            text_predict.append(value)
                        break
                except Exception as err:
                    # bind the caught exception; the original referenced Exception.message,
                    # which does not exist on the Exception class
                    print(colored('############ Classifying ' + str(filename) +
                                  ' has thrown error due to ' + str(err), 'green'))
                    print('\n')
        resultFile1.write(str(subClass.index(classDir)) + ':' + toString(predicted1) + '\n')
        resultFile2.write(str(subClass.index(classDir)) + ':' + toString(predicted5) + '\n')
        resultFile3.write(str(subClass.index(classDir)) + ':' + toString(vis_predict) + '\n')
        resultFile4.write(str(subClass.index(classDir)) + ':' + toString(text_predict) + '\n')
    resultFile1.close()
    resultFile2.close()
    resultFile3.close()
    resultFile4.close()
    print(colored('############ Testing in %s seconds ################' % (time.time() - start_time), 'green'))
import sys
import uuid

import boto3
from psycopg2 import sql
from spellchecker import SpellChecker

sys.path.append('.')
from rule_processing import postgresql

def queryTable(conn, table):
    cmd = """
    SELECT * FROM {}
    """
    with conn.cursor() as cur:
        cur.execute(sql.SQL(cmd).format(sql.Identifier(table)))
        return cur.fetchall()

compr = boto3.client(service_name='comprehend')
compr_m = boto3.client(service_name='comprehendmedical')
spell = SpellChecker()
conn = postgresql.connect()
spelling_list = [x[0] for x in queryTable(conn, 'spellchecker')]
conn.close()
# Add words to spell list
spell.word_frequency.load_words(spelling_list)

def findId(val):
    if val == '-1':
        return str(uuid.uuid4())
    return val

def findUnidentified(val):
    if val.lower() == 'unidentified':
        return 'U/I'
    return val
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['statment', 'is', 'wrong'])

for word in misspelled:
    print(spell.correction(word))
def get_spelling_errors(words):
    checker = SpellChecker()
    return checker.unknown(words)
def process_txt(path_to_txt, pre_text_path, empty_output):
    # check if file exists here and exit if not
    try:
        f = open(path_to_txt)
        f.close()
    except FileNotFoundError:
        logging.critical('Given text file does not exist')
        sys.exit(0)

    logging.info(f"Processing {path_to_txt}")
    # nested withs https://stackoverflow.com/a/9283052/754432
    with open(pre_text_path, 'w') as outfile, open(path_to_txt, 'r', encoding='utf-8') as infile:
        infile = infile.read().lower()
        infile = re.sub(r'-\n', "", infile)  # conjoin hyphenated newline words
        # infile = re.sub(r'\n\r-,\.', "", infile)
        infile = re.sub(r'\n', " ", infile)
        # discard non-spellable items; they don't carry any significance towards misspelling
        infile = re.sub(r'[\:\(\)\|\]\[\*©]', "", infile)
        # a period character inside a word is often closer to 'i' than any other alphabet
        # character for purposes of spelling correction (keep the surrounding letters; the
        # original replaced the whole match with a bare "i")
        infile = re.sub(r'(\w)\.(\w)', r"\1i\2", infile)

        WORDS = Counter(words(infile))
        # Fix script killer in 1974/ds-89171-page-8-article-17.txt
        WORDS = [word for word in WORDS if len(word) < 20]

        spell = SpellChecker()  # loads default word frequency list
        # find those words that may be misspelled
        misspelled = spell.unknown(WORDS)
        auto_corrects = {}
        for i, word in enumerate(misspelled):
            # Get the one `most likely` answer
            auto_corrects[word] = spell.correction(word)
            # if logging.getLogger().level == logging.DEBUG:
            print("#{}".format(i), word, spell.correction(word))
            # Get a list of `likely` options
            # if logging.getLogger().level == logging.DEBUG: print(spell.candidates(word))
        if logging.getLogger().level == logging.DEBUG:
            pprint(auto_corrects, indent=2)
        try:
            str_out = re.sub(r'\b(%s)\b' % '|'.join(auto_corrects.keys()),
                             lambda m: auto_corrects.get(m.group(1), m.group(1)),
                             infile)
        except sre_constants.error as err:
            print("SRE ERROR:", err)
            str_out = infile
        except Exception as err:
            print("GENERAL ERROR:", err, type(err))
            str_out = infile
        finally:
            # SIGH! Just return the lightly cleaned text for this one
            pass
        # str_out = re.sub(r'\b(\w+)\b', lambda m: auto_corrects.get(m.group(1), m.group(1)), f_read)
        if logging.getLogger().level == logging.DEBUG:
            print(str_out)
        outfile.write(str_out)
    logging.info(f"Processed {pre_text_path}")
    return True
#!/usr/bin/env python
'''
pip install pyspellchecker
'''
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['let', 'us', 'wlak', 'on', 'the', 'groun'])

for word in misspelled:
    # Get the one `most likely` answer
    print(spell.correction(word))
    # Get a list of `likely` options
    print(spell.candidates(word))
class Incubator:
    #Class variables:
    #  sample_block_table: A dictionary containing all blocks in sample_path and their frequency
    #  spellchecker: A pyspellchecker instance with all the words in words_path added
    #  population: Total population of the incubator, indicating how many chromosomes exist at one time
    #  elites: How many elites are carried over for each generation
    #  children: How many children are created for each generation
    #  randoms: How many random chromosomes are added each generation
    #  tournament_size: How many chromosomes are considered in a tournament
    #  cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
    #  mutation_chance: Chance of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
    #  shock_enabled: True if genetic shock enabled, false otherwise
    #  shock_threshold: Number of cycles of fitness stagnation before genetic shock is triggered.
    #  max_cycles: Cycle # at which the simulation terminates

    def __init__(self, sample_path, words_path, elites, children, randoms, tournament_size,
                 cross_chance, mutation_chance, shock_value, max_cycles):
        #Parameters:
        #  sample_path: A path to a samples source file containing all training data to be fed to the incubator
        #  words_path: A path to all words which the cipher_breaker should consider valid in addition
        #              to those already in pyspellchecker.
        #  elites: How many elites are carried over for each generation
        #  children: How many children are created for each generation
        #  randoms: How many random chromosomes are added each generation
        #  tournament_size: How many chromosomes are considered in a tournament
        #  cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
        #  mutation_chance: Chance of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
        #  shock_value: 0 if genetic shock disabled. Otherwise shock is enabled and shock_threshold is set to shock_value
        #  max_cycles: Cycle # at which the simulation terminates

        #Initializes sample_block_table
        self.sample_block_table = self.getSampleBlockTable(sample_path)

        #Initializes spellchecker
        self.spellchecker = SpellChecker()
        self.spellchecker.word_frequency.load_text_file(words_path)

        #Checks cross_chance and mutation_chance are valid
        assert (cross_chance + mutation_chance) == 1

        #Loads all incubator parameters
        self.elites = elites
        self.children = children
        self.randoms = randoms
        self.population = self.elites + self.children + self.randoms
        self.tournament_size = tournament_size
        self.cross_chance = cross_chance
        self.mutation_chance = mutation_chance

        #Handles shock_value
        if shock_value <= 0:
            self.shock_enabled = False
            self.shock_threshold = 0
        else:
            self.shock_enabled = True
            self.shock_threshold = shock_value

        self.max_cycles = max_cycles

        #Prints incubator summary if verbose is enabled
        if __VERBOSE__:
            print("Incubator Summary:")
            print("sample_path: " + sample_path + " words_path: " + words_path)
            print("Total population: " + str(self.population))
            print("Elites: " + str(self.elites) + " Children: " + str(self.children) + " Randoms: " + str(self.randoms))
            print("Tournament size: " + str(self.tournament_size) + " Cross chance: " + str(self.cross_chance) + " Mutation chance: " + str(self.mutation_chance))
            print("Shock enabled: " + str(self.shock_enabled) + " Shock threshold: " + str(self.shock_threshold))
            print("Max cycles: " + str(self.max_cycles))
            print("\n")

    """TRAINING FUNCTIONS"""

    #Takes ciphertext, returns a chromosome that should decrypt ciphertext
    def train(self, cipher_text):
        #Initializes cycle counter
        cycles = 0

        #Generates pool of chromosomes
        chromosomes = []
        for chromosome_iter in range(self.population):
            chromosomes.append(self.getRandomChromosome())

        #Genetic shock trigger variables. Triggers if fitness is stagnant for shock_threshold cycles
        best_fitness = 0
        shock_ticker = 0

        #Starts timer
        start_time = time.time()

        while True:
            #Increments cycle counter
            cycles += 1

            #Creates list of (chromosome, fitness) tuples in order of decreasing fitness
            chromosome_fitness = []

            #Checks all chromosomes to see if the correct one has been found
            for chromosome in chromosomes:
                if len(self.spellchecker.unknown((chromosome.convertText(cipher_text)).split(" "))) == 0:
                    if __VERBOSE__:
                        print("Found key! " + str(chromosome))
                        print("Decrypted text: " + chromosome.convertText(cipher_text))
                        print("")
                    return (chromosome, cycles)

            #Gets fitness of each chromosome and sorts them according to fitness
            for chromosome in chromosomes:
                chromosome_fitness.append((chromosome, self.getFitness(chromosome, cipher_text)))
            chromosome_fitness.sort(key=lambda x: x[1])
            chromosome_fitness.reverse()

            #Checks if max_cycles exceeded. If so, returns the fittest chromosome
            if cycles >= self.max_cycles:
                print("Best Key: " + str(chromosome_fitness[0][0]))
                print("Decrypted text: " + chromosome_fitness[0][0].convertText(cipher_text))
                print("")
                return (chromosome_fitness[0][0], cycles)

            #Checks if fitness is stagnant
            if chromosome_fitness[0][1] <= best_fitness:
                shock_ticker += 1
            else:
                best_fitness = max(chromosome_fitness[0][1], best_fitness)
                shock_ticker = 0

            #If __VERBOSE__, provide report on most fit chromosome
            if __VERBOSE__:
                converted_text = chromosome_fitness[0][0].convertText(cipher_text)
                print("Cycle# " + str(cycles))
                print("Best Chromosome: " + str(chromosome_fitness[0][0]))
                print("Fitness: " + str(chromosome_fitness[0][1]))
                print("Shock Ticker: " + str(shock_ticker))
                print("Cycle Time: " + str(time.time() - start_time))
                print("Attempted Decrypt: " + converted_text)
                print("Known words: " + str(self.spellchecker.known((chromosome_fitness[0][0].convertText(cipher_text).split(" ")))))
                print("Unknown words: " + str(self.spellchecker.unknown((chromosome_fitness[0][0].convertText(cipher_text).split(" ")))))
                print("")
                start_time = time.time()

            #Creates a new chromosomes list
            new_chromosomes = []

            #Copies over elites to new_chromosomes
            for chromosome_iter in range(self.elites):
                new_chromosomes.append(chromosome_fitness[chromosome_iter][0].clone())

            #Creates children in new_chromosomes
            #Performs tournament process to select breeding candidates
            tournament_selections = []
            while len(tournament_selections) < (self.children):
                tournament_selections.append(self.tournament(chromosome_fitness))

            #Breeds selected candidates
            while len(tournament_selections) > 0:
                chance = random.random()
                if chance < self.cross_chance and len(tournament_selections) > 1:
                    chromosome_one = tournament_selections.pop()
                    chromosome_two = tournament_selections.pop()
                    crossed_chromosomes = self.crossChromosomes(chromosome_one, chromosome_two)
                    new_chromosomes.append(crossed_chromosomes[0])
                    new_chromosomes.append(crossed_chromosomes[1])
                elif chance < (self.mutation_chance + self.cross_chance):
                    new_chromosomes.append(self.mutateChromosome(tournament_selections.pop()))
                else:
                    new_chromosomes.append(self.getRandomChromosome())

            #Adds random chromosomes to new_chromosomes
            for random_iter in range(self.randoms):
                new_chromosomes.append(self.getRandomChromosome())

            #Checks if genetic shock should be triggered
            if shock_ticker >= self.shock_threshold and self.shock_enabled:
                if __VERBOSE__:
                    print("Triggering genetic shock...\n")

                #Performs genetic shock, replacing chromosomes within 10% of the best fitness
                #with random ones and mutating all others
                for chromosome_iter in range(len(new_chromosomes)):
                    if self.getFitness(new_chromosomes[chromosome_iter], cipher_text) > .9 * best_fitness:
                        new_chromosomes[chromosome_iter] = self.getRandomChromosome()
                    else:
                        new_chromosomes[chromosome_iter] = self.mutateChromosome(new_chromosomes[chromosome_iter])

                #Resets shock tickers and trackers
                shock_ticker = 0
                best_fitness = 0

            #Shifts new_chromosomes into gene pool
            chromosomes = new_chromosomes

    #Returns a mutated chromosome
    def mutateChromosome(self, chromosome):
        new_chromosome = chromosome.clone()

        #Chooses two mappings to swap
        mutation_one_index = random.randint(0, 25)
        mutation_two_index = random.randint(0, 25)
        while mutation_two_index == mutation_one_index:
            mutation_two_index = random.randint(0, 25)

        mutation_one = new_chromosome.mappings[mutation_one_index]
        mutation_two = new_chromosome.mappings[mutation_two_index]

        new_chromosome.removeMapping(mutation_one)
        new_chromosome.removeMapping(mutation_two)

        mapping_one = (mutation_one[0], mutation_two[1])
        mapping_two = (mutation_two[0], mutation_one[1])

        new_chromosome.addMapping(mapping_one)
        new_chromosome.addMapping(mapping_two)

        return new_chromosome

    #Takes two chromosomes and returns two crosses of those chromosomes in the format (new_chromosome_one, new_chromosome_two)
    def crossChromosomes(self, chromosome_one, chromosome_two):
        new_chromosome_one = chromosome_one.clone()
        new_chromosome_two = chromosome_two.clone()

        for chromosome_iter in range(26):
            if random.random() > .5:
                old_mapping_one = new_chromosome_one.mappings[chromosome_iter]
                old_mapping_two = new_chromosome_two.mappings[chromosome_iter]

                if old_mapping_one != old_mapping_two:
                    complement_mapping_one = new_chromosome_one.getMappingTarget(old_mapping_two[1])
                    complement_mapping_two = new_chromosome_two.getMappingTarget(old_mapping_one[1])

                    old_origin_one = complement_mapping_one[0]
                    old_origin_two = complement_mapping_two[0]

                    new_chromosome_one.removeMapping(complement_mapping_one)
                    new_chromosome_two.removeMapping(complement_mapping_two)
                    new_chromosome_one.removeMapping(old_mapping_one)
                    new_chromosome_two.removeMapping(old_mapping_two)

                    complement_mapping_one = (old_origin_two, complement_mapping_one[1])
                    complement_mapping_two = (old_origin_one, complement_mapping_two[1])

                    new_chromosome_one.addMapping(old_mapping_two)
                    new_chromosome_one.addMapping(complement_mapping_two)
                    new_chromosome_two.addMapping(old_mapping_one)
                    new_chromosome_two.addMapping(complement_mapping_one)

        return (new_chromosome_one, new_chromosome_two)

    #Returns a new random chromosome
    def getRandomChromosome(self):
        new_chromosome = Chromosome()

        origin = []
        destination = []
        for letterIter in range(26):
            origin.append(chr(letterIter + 97))
            destination.append(chr(letterIter + 97))
        random.shuffle(destination)

        for mappingIter in range(26):
            new_chromosome.addMapping((origin[mappingIter], destination[mappingIter]))

        return new_chromosome

    #Performs a tournament selection of chromosomes based on tournament_size
    def tournament(self, chromosome_fitness):
        tournament_pool = []
        for tournament_iter in range(self.tournament_size):
            tournament_pool.append(chromosome_fitness[random.randint(0, self.population - 1)])
        return (max(tournament_pool, key=lambda x: x[1]))[0].clone()

    #Takes a chromosome and cipher_text and evaluates the chromosome's fitness
    def getFitness(self, chromosome, cipher_text):
        total_fitness = 0
        parsed_block_table = self.getBlockTable(chromosome.convertText(cipher_text))

        for block in parsed_block_table.keys():
            if block in self.sample_block_table.keys():
                total_fitness += math.log(self.sample_block_table[block], 2) * (parsed_block_table[block])

        return total_fitness

    """BLOCK FUNCTIONS"""

    #Returns the blocks located in the passed samples path.
    def getSampleBlockTable(self, sample_path):
        #Opens input file
        input_file = open(sample_path)
        block_table = {}

        for line in input_file:
            components = line.split(" ")
            components[1] = int(components[1][0:len(components[1]) - 1])
            block_table[components[0]] = components[1]

        input_file.close()
        return block_table

    #Takes a string and returns a hash table of blocks
    def getBlockTable(self, input_string):
        block_table = {}
        input_words = input_string.split(" ")

        #Hashes blocks in dictionary to count them
        for word in input_words:
            word_blocks = self.getBlocks(word)
            for block in word_blocks:
                if block in block_table:
                    block_table[block] += 1
                else:
                    block_table[block] = 1

        return block_table

    #Returns all substrings of a passed string
    def getBlocks(self, input_string):
        blocks = []
        for block_len in range(len(input_string)):
            start_point = 0
            end_point = block_len + 1
            while end_point <= len(input_string):
                blocks.append(input_string[start_point:end_point])
                end_point += 1
                start_point += 1
        return blocks
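# For intuition about the fitness function: getBlocks enumerates every contiguous
# substring of a word, shortest first, and getFitness scores each one against the sample
# frequencies. A standalone rewrite with the same output order:
def get_blocks(s):
    return [s[i:i + n] for n in range(1, len(s) + 1) for i in range(len(s) - n + 1)]

print(get_blocks("abc"))  # ['a', 'b', 'c', 'ab', 'bc', 'abc']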