Example #1
0
 def fitness(self, w):
     """Score word w by how English-like its character n-grams are.

     Each 4-gram of w found in the lexicon contributes 0.2 and each
     3-gram 0.1, weighted by its log-frequency relative to the most
     common n-gram, then scaled by 8 / len(w).
     """
     score4 = sum(
         0.2 * log(lexicon[ch]) / log(lexicon[mostCommon])
         for ch in chngrams(w, 4) if ch in lexicon
     )
     score3 = sum(
         0.1 * log(lexicon[ch]) / log(lexicon[mostCommon])
         for ch in chngrams(w, 3) if ch in lexicon
     )
     return score4 * 8 / len(w) + score3 * 8 / len(w)
Example #2
0
 def vector(self, name): 
     """ Returns a dictionary with character bigrams and suffix.
         For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
     """
     features = count(chngrams(name, n=2))
     # Mark the final bigram as a suffix feature and add the word length.
     features[name[-2:]+"$"] = 1
     features[len(name)] = 1
     return features
 def vector(self, name):
     """ Returns a dictionary with character bigrams and suffix.
         For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
     """
     bigrams = chngrams(name, n=2)
     counts = count(bigrams)
     # Two extra features: the last bigram tagged as a suffix, and the length.
     suffix_key = name[-2:] + "$"
     counts[suffix_key] = 1
     counts[len(name)] = 1
     return counts
Example #4
0
 def fitness(self, w):
     # Weighted, normalized log-frequency of w's in-lexicon n-grams,
     # scaled by 8 / len(w): 4-grams weigh 0.2 each, 3-grams 0.1 each.
     def gram_score(n, weight):
         # Contribution of all character n-grams of w present in the lexicon.
         return sum(weight*log(lexicon[ch])/log(lexicon[mostCommon])
                    for ch in chngrams(w, n) if ch in lexicon)
     return gram_score(4, 0.2)*8/len(w) + gram_score(3, 0.1)*8/len(w)
Example #5
0
from pattern.vector import GA, chngrams
# from pattern.en import lexicon
import json
from random import choice
from random import randint as ri
from collections import Counter
from math import log

# Build a character n-gram frequency lexicon from a word list on disk.
with open('words.json', 'r') as infile:
    words = json.load(infile)

# Collect every character 3-gram and 4-gram (with per-word counts).
# NOTE(review): assumes pattern's chngrams() returns a gram -> count mapping
# and that .items() supports '+' (Python 2 list semantics) -- confirm.
allgrams = list()
for w in words:
    char3grams = chngrams(w, 3).items()
    char4grams = chngrams(w, 4).items()
    allgrams.extend(char3grams + char4grams)

# Aggregate per-gram counts across the whole corpus.
lexicon = Counter()
for gram in allgrams:
    lexicon[gram[0]] += gram[1]

# print 'lexicon length:', len(lexicon)

# The single most frequent n-gram; fitness() uses its log-count to normalize.
mostCommon = max(lexicon.keys(), key=lambda k: lexicon[k])
# print mostCommon, lexicon[mostCommon]

def chseq(length=4, chars='abcdefghijklmnopqrstuvwxyz'):
    # Draw `length` random characters from `chars` and join them into a string.
    picked = []
    for _ in range(length):
        picked.append(choice(chars))
    return ''.join(picked)

class Jabberwocky(GA):
 def fitness(self, w):
     # Reward each character n-gram of w that appears in the lexicon:
     # 0.2 per known 4-gram plus 0.1 per known 3-gram.
     score = sum(0.2 for ch in chngrams(w, 4) if ch in lexicon)
     score += sum(0.1 for ch in chngrams(w, 3) if ch in lexicon)
     return score
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')  # en, es, de, fr, it, nl
# Print word frequencies (Python 2 dict iteration).
# NOTE(review): freq_dic is not defined in this excerpt -- presumably built
# by an earlier count()/words() call; confirm against the full tutorial.
for k, v in freq_dic.iteritems():
    print k, v
# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
print count(words(s), stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
s = Sentence(parse(s))
print count(s, stemmer=LEMMA)
# character n-grams
print chngrams('The cat sat on the mat.'.lower(), n=3)
# document
# NOTE(review): the continuation lines below use implicit string
# concatenation; there is no space at each join ("again" + "Friday"),
# so words run together in the resulting text -- likely unintended.
text = "The shuttle Discovery, already delayed three times by technical problems and bad weather, was grounded again" \
    "Friday, this time by a potentially dangerous gaseous hydrogen leak in a vent line attached to the shipʼs" \
    "external tank. The Discovery was initially scheduled to make its 39th and final flight last Monday, bearing" \
    "fresh supplies and an intelligent robot for the International Space Station. But complications delayed the" \
    "flight from Monday to Friday, when the hydrogen leak led NASA to conclude that the shuttle would not be ready" \
    "to launch before its flight window closed this Monday."
doc = Document(text, threshold=1)
print doc.keywords(top=6)
document = Document(
    text,
    filter=lambda w: w.lstrip("'").isalnum(),
    punctuation='.,;:!?()[]{}\'`"@#$*+-|=~_',
    top=None,  # Filter words not in the top most frequent.
    threshold=0,  # Filter words whose count falls below threshold.
Example #8
0
def ngram_vector(s, n=3):
    """Return a dict of character n-gram counts for the lowercased string s."""
    return dict(chngrams(s.lower(), n))
Example #9
0
def ngram_vector(s, n=3):
    # Lowercase first so n-grams are case-insensitive, then copy the
    # gram -> count mapping into a plain dict.
    grams = chngrams(s.lower(), n)
    vec = {}
    vec.update(grams)
    return vec
Example #10
0
from pattern.vector import GA, chngrams

# from pattern.en import lexicon
import json
from random import choice
from random import randint as ri
from collections import Counter
from math import log

with open("words.json", "r") as infile:
    words = json.load(infile)

allgrams = list()
for w in words:
    char3grams = chngrams(w, 3).items()
    char4grams = chngrams(w, 4).items()
    allgrams.extend(char3grams + char4grams)

lexicon = Counter()
for gram in allgrams:
    lexicon[gram[0]] += gram[1]

# print 'lexicon length:', len(lexicon)

mostCommon = max(lexicon.keys(), key=lambda k: lexicon[k])
# print mostCommon, lexicon[mostCommon]


def chseq(length=4, chars="abcdefghijklmnopqrstuvwxyz"):
    # Returns a string of random characters.
    return "".join(choice(chars) for i in range(length))