Esempio n. 1
0
def lidstoneProbDist(olddf):
    """
    Tokenize each row's 'body' text, print its Lidstone distribution and
    dump the tokens to CSV.

    For every label in ``olddf.index`` the ``'body'`` text is tokenized with
    ``word_tokenize``, the token frequencies are wrapped in a
    ``LidstoneProbDist`` (second argument 0.1) and the distribution's samples
    are printed.  The function then blocks on ``raw_input`` until the user
    presses Enter, once per row.

    The probability distribution itself is only printed; what gets stored is
    one row per input row consisting of the row label followed by its tokens.
    The resulting frame is indexed by that label, its columns are renamed to
    ``taglist``, a preview and summary are printed, and the frame is written
    to ``../stumbled_upon/data/lidstone.csv``.

    Parameters
    ----------
    olddf : object supporting ``olddf['body']``
        Source of the texts; only the ``'body'`` column is used.

    Returns
    -------
    None
    """
    # References:
    # http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
    # https://github.com/tuzzeg/detect_insults/blob/master/README.md

    # Single-argument parenthesized prints produce the same output as the
    # Python 2 print statement while staying parseable by Python 3.
    print("Creating LidStone Probdist... %s" % (nltk.__version__,))
    tutto = []

    olddf = pd.DataFrame(olddf['body'])

    print(type(olddf))
    for ind in olddf.index:
        print(ind)
        # `ind` is always an index label, so use the label-based accessor
        # (.ix is deprecated and no longer exists in current pandas).
        text = olddf.loc[ind, 'body']
        tokens = word_tokenize(text)

        t_fd = FreqDist(tokens)
        pdist = LidstoneProbDist(t_fd, 0.1)
        print(pdist.samples())
        # Interactive pause between rows.
        raw_input("HITKEY")

        # Keep the row label as element 0 so that set_index(0) below indexes
        # by label; previously the label was overwritten by the token list
        # and the first token ended up as the index.
        tutto.append([ind] + list(tokens))

    newdf = pd.DataFrame(tutto).set_index(0)
    # NOTE(review): `taglist` is not defined in this function; this assumes a
    # module-level sequence with exactly one name per remaining column --
    # confirm against the rest of the file.
    newdf.columns = taglist
    print(newdf.head(20))
    print(newdf.describe())
    newdf.to_csv("../stumbled_upon/data/lidstone.csv")
Esempio n. 2
0
def lidstoneProbDist(olddf):
    """
    Tokenize each row's 'body' text, print its Lidstone distribution and
    dump the tokens to CSV.

    For every label in ``olddf.index`` the ``'body'`` text is tokenized with
    ``word_tokenize``, the token frequencies are wrapped in a
    ``LidstoneProbDist`` (second argument 0.1) and the distribution's samples
    are printed.  The function then blocks on ``input`` until the user
    presses Enter, once per row.

    The probability distribution itself is only printed; what gets stored is
    one row per input row consisting of the row label followed by its tokens.
    The resulting frame is indexed by that label, its columns are renamed to
    ``taglist``, a preview and summary are printed, and the frame is written
    to ``../stumbled_upon/data/lidstone.csv``.

    Parameters
    ----------
    olddf : object supporting ``olddf['body']``
        Source of the texts; only the ``'body'`` column is used.

    Returns
    -------
    None
    """
    # References:
    # http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
    # https://github.com/tuzzeg/detect_insults/blob/master/README.md
    print("Creating LidStone Probdist...", nltk.__version__)
    tutto = []

    olddf = pd.DataFrame(olddf['body'])

    print(type(olddf))
    for ind in olddf.index:
        print(ind)
        # `ind` is always an index label, so use the label-based accessor
        # (.ix was removed from pandas and raises AttributeError there).
        text = olddf.loc[ind, 'body']
        tokens = word_tokenize(text)

        t_fd = FreqDist(tokens)
        pdist = LidstoneProbDist(t_fd, 0.1)
        print(pdist.samples())
        # Interactive pause between rows.
        input("HITKEY")

        # Keep the row label as element 0 so that set_index(0) below indexes
        # by label; previously the label was overwritten by the token list
        # and the first token ended up as the index.
        tutto.append([ind] + list(tokens))

    newdf = pd.DataFrame(tutto).set_index(0)
    # NOTE(review): `taglist` is not defined in this function; this assumes a
    # module-level sequence with exactly one name per remaining column --
    # confirm against the rest of the file.
    newdf.columns = taglist
    print(newdf.head(20))
    print(newdf.describe())
    newdf.to_csv("../stumbled_upon/data/lidstone.csv")