Example #1
0
 def __init__(self, paragraphs, normalizer, words=None):
     """Index paragraphs by normalized token.

     Builds self._tk2p, a CFD mapping each normalized token to the indices
     of the paragraphs it occurs in. When a truthy ``words`` collection is
     given, only tokens whose normalized form appears in ``words`` are
     indexed.
     """
     def _keep(tok):
         # Token must pass the normalizer's filter and, if a word list was
         # supplied, its normalized form must be in that list.
         if not normalizer.ok(tok):
             return False
         return not words or normalizer.normalize(tok) in words
     self._tk2p = CFD(
         (normalizer.normalize(tok), idx)
         for idx, para in enumerate(paragraphs)
         for tok in para
         if _keep(tok)
     )
Example #2
0
 def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
     """Build word<->context conditional frequency distributions.

     tokens: sequence of tokens to index.
     context_func: optional callable ``(tokens, i) -> context``; defaults
         to ``self._default_context``.
     filter: optional predicate; when truthy, only tokens it accepts are
         indexed (``self._tokens`` still keeps the unfiltered sequence).
     key: normalization applied to each token before indexing.
     """
     self._key = key
     self._tokens = tokens
     self._context_func = context_func if context_func else self._default_context
     if filter:  # NOTE: shadows the builtin, but the name is caller-visible API
         tokens = [tok for tok in tokens if filter(tok)]
     self._word_to_contexts = CFD(
         (self._key(tok), self._context_func(tokens, pos))
         for pos, tok in enumerate(tokens)
     )
     self._context_to_words = CFD(
         (self._context_func(tokens, pos), self._key(tok))
         for pos, tok in enumerate(tokens)
     )
import sys
from collections import Counter, defaultdict
from nltk.probability import ConditionalFreqDist as CFD
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import treebank
from pprint import pprint
from math import log


# Global conditional frequency distributions populated by extractTransitions():
#   T[prev_tag][tag] -- tag-bigram transition counts (prev_tag 0 marks sentence start)
#   L[tag][token]    -- counts of tokens emitted under each tag
T=CFD()
L=CFD()

def extractTransitions(tagged_sents=None):
	"""Accumulate tag-transition counts into T and tag->token counts into L.

	tagged_sents: iterable of sentences, each a sequence of (token, tag)
	pairs; defaults to the universal-tagset Penn Treebank sample.

	The default is resolved lazily here: the original signature evaluated
	``treebank.tagged_sents(tagset='universal')`` at import time, forcing a
	corpus load even if this function was never called.
	"""
	if tagged_sents is None:
		tagged_sents = treebank.tagged_sents(tagset='universal')
	for sent in tagged_sents:
		# 0 is the sentence-start sentinel key in T (kept for compatibility
		# with existing entries / callers of Pt).
		lasttag = 0
		for token, tag in sent:
			T[lasttag][tag] += 1
			L[tag][token] += 1
			lasttag = tag

#Transition probability
def Pt(lasttag, tag, floor=0.005):
	"""Return log P(tag | lasttag) estimated from the global CFD T.

	floor: smoothing probability substituted when the transition was never
	observed, so ``log`` is never handed a zero. Default preserves the
	original hard-coded 0.005.
	"""
	p = T[lasttag].freq(tag)
	if p == 0:
		p = floor
	return log(p)

#Likelihood of word belonging to tag
def Pl(word, tag, floor=0.000001):
	"""Return log P(word | tag) estimated from the global CFD L.

	floor: smoothing probability substituted for unseen (tag, word) pairs
	so ``log(0)`` is avoided. Default preserves the original hard-coded
	1e-6.
	"""
	p = L[tag].freq(word)
	if p == 0:
		p = floor
	return log(p)