Esempio n. 1
0
 def __init__(self, fd, *args, **kwargs):
     """
     Initialise a Lidstone distribution with gamma=0.01 and precompute the
     probability and log-probability of every sample in ``fd``.

     fd       -- frequency distribution; must provide ``samples()``.
     *args    -- the last positional argument, if any, is forwarded to
                 ``LidstoneProbDist.__init__`` as the bin count.
     **kwargs -- ``bins`` is used as the bin count when no positional
                 arguments are given; otherwise ignored.
     """
     # Previously ``args[-1]`` raised IndexError when called without extra
     # positional arguments; fall back to an optional ``bins`` keyword.
     bins = args[-1] if args else kwargs.get('bins')
     LidstoneProbDist.__init__(self, fd, 0.01, bins)

     # Start from empty caches. The old ``dict(zip([0]*n, samples))``
     # initialiser collapsed to ``{0: <last sample>}`` and left a bogus
     # entry under key 0 in both dictionaries.
     self._probs = {}
     self._logprobs = {}
     for sample in fd.samples():
         self._logprobs[sample] = LidstoneProbDist.logprob(self, sample)
         self._probs[sample] = LidstoneProbDist.prob(self, sample)
Esempio n. 2
0
def lidstoneProbDist(olddf, pause=True,
                     out_path="../stumbled_upon/data/lidstone.csv"):
    """
    Tokenize each row's 'body' text, build a Lidstone-smoothed probability
    distribution (gamma=0.1) per row, and write a per-row token table to CSV.

    olddf    -- DataFrame with a 'body' column holding the text of each row.
    pause    -- when True (the default, matching the original behaviour) wait
                for a key press after every row; pass False for batch runs.
    out_path -- destination of the CSV file that is written at the end.

    Returns None. Progress and summary information is printed to stdout.

    NOTE(review): ``taglist`` is not defined in this function and is assumed
    to be a module-level sequence; pandas requires its length to equal the
    number of token columns (the longest token list) -- confirm.
    """
    # References:
    # http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
    # https://github.com/tuzzeg/detect_insults/blob/master/README.md
    print("Creating LidStone Probdist... %s" % (nltk.__version__,))
    rows = []

    bodies = pd.DataFrame(olddf['body'])

    print(type(bodies))
    for ind in bodies.index:
        print(ind)
        # ``.loc`` is the label-based replacement for the deprecated ``.ix``.
        text = bodies.loc[ind, 'body']
        tokens = word_tokenize(text)

        t_fd = FreqDist(tokens)
        pdist = LidstoneProbDist(t_fd, 0.1)
        print(pdist.samples())
        if pause:
            raw_input("HITKEY")

        # Keep the row label in column 0 so that ``set_index(0)`` below
        # indexes by it. The previous code built ``[ind]`` and then threw it
        # away (``row = tokens``), which made the first token the index.
        rows.append([ind] + list(tokens))

    newdf = pd.DataFrame(rows).set_index(0)
    newdf.columns = taglist
    print(newdf.head(20))
    print(newdf.describe())
    newdf.to_csv(out_path)
Esempio n. 3
0
 def prob(self, sample):
     """
     Return the probability of ``sample``, memoised in ``self._probs``.

     On a cache miss the value is computed with ``LidstoneProbDist.prob``,
     stored, and then returned.
     """
     cache = self._probs
     if sample in cache:
         return cache[sample]
     computed = LidstoneProbDist.prob(self, sample)
     cache[sample] = computed
     return computed
Esempio n. 4
0
 def __init__(self, fd, bins, *factory_args):
     """
     Initialise the underlying LidstoneProbDist with a fixed gamma of 0.1.

     fd            -- frequency distribution passed through unchanged.
     bins          -- bin count passed through unchanged.
     *factory_args -- accepted but not used.
     """
     gamma = 0.1
     LidstoneProbDist.__init__(self, fd, gamma, bins)