def __init__(self, paragraphs, normalizer, words=None):
    """Build a token -> paragraph-index conditional frequency distribution.

    Args:
        paragraphs: iterable of paragraphs, each an iterable of tokens.
        normalizer: object with ``ok(token)`` (keep this token?) and
            ``normalize(token)`` (canonical form) methods.
        words: optional collection of normalized words; when truthy, only
            tokens whose normalized form is in ``words`` are indexed.
    """
    if words:
        # Build the membership set once: O(1) lookups instead of O(len(words))
        # per token when callers pass a list.
        wordset = set(words)
        pairs = (
            (norm, i)
            for i, para in enumerate(paragraphs)
            # Normalize each accepted token exactly once (the original
            # normalized twice: once for the filter, once for the key).
            for norm in (normalizer.normalize(tok) for tok in para if normalizer.ok(tok))
            if norm in wordset
        )
    else:
        pairs = (
            (normalizer.normalize(tok), i)
            for i, para in enumerate(paragraphs)
            for tok in para
            if normalizer.ok(tok)
        )
    # Maps normalized token -> FreqDist over paragraph indices.
    self._tk2p = CFD(pairs)
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
    """Build bidirectional word <-> context conditional frequency distributions.

    Args:
        tokens: sequence of tokens to index.
        context_func: callable ``(tokens, i) -> context`` for token ``i``;
            defaults to ``self._default_context`` (defined on the class).
        filter: optional predicate; when truthy, only tokens passing it are
            indexed (``self._tokens`` still keeps the unfiltered sequence).
        key: normalization applied to each word before indexing.
    """
    self._key = key
    # NOTE: deliberately stores the ORIGINAL, unfiltered tokens.
    self._tokens = tokens
    self._context_func = context_func if context_func else self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    # Compute each (word-key, context) pair exactly once; the original
    # evaluated context_func twice per token (once for each CFD below),
    # which is wasteful when the context function is expensive.
    pairs = [
        (self._key(w), self._context_func(tokens, i))
        for i, w in enumerate(tokens)
    ]
    self._word_to_contexts = CFD(pairs)
    self._context_to_words = CFD((c, k) for k, c in pairs)
import sys
from collections import Counter, defaultdict
from math import log
from pprint import pprint

from nltk.corpus import treebank
from nltk.probability import ConditionalFreqDist as CFD
from nltk.tokenize import TreebankWordTokenizer

# Transition counts: T[prev_tag][tag]. The integer 0 is the
# start-of-sentence sentinel (it cannot collide with string tags).
T = CFD()
# Emission counts: L[tag][word].
L = CFD()


def extractTransitions(tagged_sents=None):
    """Accumulate tag-transition counts into T and tag->word counts into L.

    Args:
        tagged_sents: iterable of sentences, each a list of (token, tag)
            pairs. Defaults to the Universal-tagset Penn Treebank sample.

    The original default argument called ``treebank.tagged_sents(...)`` at
    function-definition time, loading the corpus on module import; the
    ``None`` sentinel defers that load to the first call.
    """
    if tagged_sents is None:
        tagged_sents = treebank.tagged_sents(tagset='universal')
    for s in tagged_sents:
        lasttag = 0  # start-of-sentence sentinel
        for token, tag in s:
            T[lasttag][tag] += 1
            L[tag][token] += 1
            lasttag = tag


def Pt(lasttag, tag):
    """Log transition probability log P(tag | lasttag).

    Unseen transitions are smoothed with a floor of 0.005 so the log is
    always defined.
    """
    p = T[lasttag].freq(tag)
    p = 0.005 if p == 0 else p
    return log(p)


def Pl(word, tag):
    """Log emission likelihood log P(word | tag).

    Unseen (word, tag) pairs are smoothed with a floor of 1e-6 so the log
    is always defined.
    """
    p = L[tag].freq(word)
    p = 0.000001 if p == 0 else p
    return log(p)