def fitness(self, w):
    """Score candidate string `w` by how frequent its character n-grams are.

    Each key of `chngrams(w, 4)` that is present in the module-level
    `lexicon` contributes 0.2 * log(lexicon[key]) / log(lexicon[mostCommon]);
    each key of `chngrams(w, 3)` contributes 0.1 times the same ratio.
    Both sums are multiplied by 8 and divided by len(w) before being added.

    Returns 0.0 for an empty `w`; without this guard the division by
    len(w) would raise ZeroDivisionError.
    """
    if not w:
        return 0.0

    # Loop-invariant normaliser: log-frequency of the most common n-gram.
    norm = log(lexicon[mostCommon])

    def weighted(n, weight):
        # Length-scaled, log-weighted score of the size-n grams found in lexicon.
        return sum(
            weight * log(lexicon[ch]) / norm
            for ch in chngrams(w, n)
            if ch in lexicon
        ) * 8 / len(w)

    return weighted(4, 0.2) + weighted(3, 0.1)
def vector(self, name):
    """ Builds the feature dictionary for a name: the counted character
        bigrams, the last two characters followed by "$", and the length.
        For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
    """
    features = count(chngrams(name, n=2))
    suffix = name[-2:] + "$"
    features[suffix] = 1
    # The length itself is used as a key, so names of equal length share it.
    features[len(name)] = 1
    return features
def vector(self, name):
    """ Maps a name to a dictionary of features: character bigrams, a
        "$"-terminated two-character suffix, and the name length.
        For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
    """
    bigrams = chngrams(name, n=2)
    v = count(bigrams)
    # Two extra keys on top of the bigrams: the suffix marker and the length.
    for key in (name[-2:] + "$", len(name)):
        v[key] = 1
    return v
def fitness(self, w):
    """Rate string `w` by the log-frequency of its character n-grams.

    Keys of `chngrams(w, 4)` found in the module-level `lexicon` each add
    0.2 * log(lexicon[key]) / log(lexicon[mostCommon]); keys of
    `chngrams(w, 3)` each add 0.1 times that ratio. Each of the two sums
    is multiplied by 8 and divided by len(w), and the results are added.

    An empty `w` scores 0.0 instead of raising ZeroDivisionError from the
    division by len(w).
    """
    if not w:
        return 0.0

    # Evaluate the normaliser once rather than once per n-gram.
    norm = log(lexicon[mostCommon])

    four_gram_score = sum(
        0.2 * log(lexicon[ch]) / norm for ch in chngrams(w, 4) if ch in lexicon
    ) * 8 / len(w)
    three_gram_score = sum(
        0.1 * log(lexicon[ch]) / norm for ch in chngrams(w, 3) if ch in lexicon
    ) * 8 / len(w)
    return four_gram_score + three_gram_score
from pattern.vector import GA, chngrams # from pattern.en import lexicon import json from random import choice from random import randint as ri from collections import Counter from math import log with open('words.json', 'r') as infile: words = json.load(infile) allgrams = list() for w in words: char3grams = chngrams(w, 3).items() char4grams = chngrams(w, 4).items() allgrams.extend(char3grams + char4grams) lexicon = Counter() for gram in allgrams: lexicon[gram[0]] += gram[1] # print 'lexicon length:', len(lexicon) mostCommon = max(lexicon.keys(), key=lambda k: lexicon[k]) # print mostCommon, lexicon[mostCommon] def chseq(length=4, chars='abcdefghijklmnopqrstuvwxyz'): # Returns a string of random characters. return ''.join(choice(chars) for i in range(length)) class Jabberwocky(GA):
def fitness(self, w):
    """Score `w` by how many of its character n-grams occur in `lexicon`.

    Every key of `chngrams(w, 4)` present in the module-level `lexicon`
    adds 0.2 and every key of `chngrams(w, 3)` present adds 0.1.
    """
    four_gram_score = sum(0.2 for gram in chngrams(w, 4) if gram in lexicon)
    three_gram_score = sum(0.1 for gram in chngrams(w, 3) if gram in lexicon)
    return four_gram_score + three_gram_score
exclude=[], # Filter words in the exclude list. stopwords=False, # Include stop words? language='en') # en, es, de, fr, it, nl for k, v in freq_dic.iteritems(): print k, v # stop words and stemming print stem('spies', stemmer=PORTER) print stem('spies', stemmer=LEMMA) s = 'The black cat was spying on the white cat.' print count(words(s), stemmer=PORTER) print count(words(s), stemmer=LEMMA) s = 'The black cat was spying on the white cat.' s = Sentence(parse(s)) print count(s, stemmer=LEMMA) # character n-grams print chngrams('The cat sat on the mat.'.lower(), n=3) # document text = "The shuttle Discovery, already delayed three times by technical problems and bad weather, was grounded again" \ "Friday, this time by a potentially dangerous gaseous hydrogen leak in a vent line attached to the shipʼs" \ "external tank. The Discovery was initially scheduled to make its 39th and final flight last Monday, bearing" \ "fresh supplies and an intelligent robot for the International Space Station. But complications delayed the" \ "flight from Monday to Friday, when the hydrogen leak led NASA to conclude that the shuttle would not be ready" \ "to launch before its flight window closed this Monday." doc = Document(text, threshold=1) print doc.keywords(top=6) document = Document( text, filter=lambda w: w.lstrip("'").isalnum(), punctuation='.,;:!?()[]{}\'`"@#$*+-|=~_', top=None, # Filter words not in the top most frequent. threshold=0, # Filter words whose count falls below threshold.
def ngram_vector(s, n=3):
    """Return a new dict holding the items of `chngrams(s.lower(), n)`.

    `s` is lower-cased before being passed to `chngrams`; `n` is forwarded
    unchanged as its second argument.
    """
    return dict(chngrams(s.lower(), n))
from pattern.vector import GA, chngrams
# from pattern.en import lexicon
import json
from random import choice
from random import randint as ri
from collections import Counter
from math import log

# Load the word list that the n-gram frequency table is built from.
with open("words.json", "r") as infile:
    words = json.load(infile)

# Gather the (key, value) pairs that chngrams() yields for sizes 3 and 4,
# for every word, in the same order as before (3-grams, then 4-grams).
allgrams = list()
for w in words:
    char3grams = chngrams(w, 3).items()
    char4grams = chngrams(w, 4).items()
    # Extend twice instead of concatenating with `+`: the result is the same
    # list, but this also works when .items() returns a view (Python 3 dicts),
    # where `+` raises TypeError.
    allgrams.extend(char3grams)
    allgrams.extend(char4grams)

# Sum the per-word values into one table keyed by the first pair element.
lexicon = Counter()
for gram in allgrams:
    lexicon[gram[0]] += gram[1]

# Key with the largest accumulated value; ties resolve to the first key in
# iteration order, as with the previous lambda-based key function.
mostCommon = max(lexicon, key=lexicon.get)


def chseq(length=4, chars="abcdefghijklmnopqrstuvwxyz"):
    """Return a string of `length` characters, each picked from `chars`
    with random.choice."""
    return "".join(choice(chars) for _ in range(length))