print t, c, mu, sigma, (mu/sigma) train() collocation('trendy', min_count=1) #---------Use NLTK collocations-------------- import nltk import re import HTMLParser import datastores.datastore as d df = d.solr_data_frame('Beauty_Crawl_RSS_Feeds') h = HTMLParser.HTMLParser() for i,d in enumerate(documents): documents[i] = h.unescape(documents[i]).encode('utf-8') def tokenize(documents): for i,doc in enumerate(documents): if i % 100 == 0: print '%d of %d' % (i, len(documents)) for sent in nltk.sent_tokenize(doc.lower()): for word in nltk.word_tokenize(sent): yield word
__author__ = 'sriWork'

import datastores.datastore as ds

COLLECTION = 'Health_Crawl_RSS_Feeds'
# BUG FIX: the original list contained `'content,' 'pubDate_dt'` -- implicit
# string-literal concatenation fused the two into one bogus field name
# 'content,pubDate_dt', so neither `content` nor `pubDate_dt` was actually
# requested from solr. Split into two proper elements.
FIELDS = ['id', 'title', 'content', 'pubDate_dt', 'tags_s', 'lang', 'author']
QUERY = None
CACHE = False

# ---- Read solr data into a 'dataframe' ----
dataframe = ds.solr_data_frame(COLLECTION, FIELDS, QUERY, CACHE)
length_dataframe = len(dataframe)

# Count the English and Spanish documents.
# NOTE(review): the `lang` field is compared against a one-element unicode
# list (e.g. [u'en']) -- presumably that is the exact shape solr returns;
# verify against the datastore layer.
cnt_eng = 0
cnt_es = 0
for i in range(length_dataframe):
    if dataframe['lang'][i] == [u'en']:
        cnt_eng += 1
    elif dataframe['lang'][i] == [u'es']:
        cnt_es += 1