import datetime

from flask import request  # rifu() reads its query parameters via Flask's request.args

# Project-local modules used below.
import scrape
import keywords_file
import getnearbywords_intokens
import search


def rifu():
    global pre_append
    global keywords
    global doc_freq
    global url

    # Log the start time.
    print "starting time:",
    now = datetime.datetime.now()
    print datetime.time(now.hour, now.minute, now.second)

    url = request.args.get('url', 0, type=str)
    # Earlier inline-scraping approach, kept for reference:
    # webpage = urllib2.urlopen(url).read()
    # para = re.compile('<p>(.*)</p>')  # collect data in <p> tags and store in para object
    # raw = re.findall(para, webpage)
    # rawstr = ' '.join(raw)
    # text = tokenize(rawstr)

    # SCRAPE
    print "URL:" + url
    text = scrape.scrapePage(url)
    print "\nScraping Done"
    print text

    # GET KEYWORDS
    keywords = keywords_file.get_keywords(url, text)
    print "my keywords:::"
    print keywords

    # GET CONTEXT
    nearbywordsObj = getnearbywords_intokens.getnearbywords()
    doc_freq = nearbywordsObj.get_words_from_proximity(keywords, text)
    # import pdb; pdb.set_trace()
    print "doc freq::::" + str(doc_freq)

    # GET SEARCH RESULTS
    results = ""
    results = search.search_web(doc_freq, results, 0, 4, url)
    pre_append = results
    tab_id = request.args.get('tab_id', 0, type=int)
    results += '{"tab_id":' + str(tab_id) + '}'  # dummy result so the comma after the last real result stays valid JSON
    results += "]}"
    print results
    return results
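
# rifu() reads 'url' and 'tab_id' from request.args, so it is presumably
# registered as a Flask view. A minimal sketch of the assumed wiring follows;
# the app object, the '/rifu' endpoint path, and the debug flag are
# illustrative assumptions, not taken from this code.
from flask import Flask

app = Flask(__name__)

@app.route('/rifu')  # hypothetical endpoint name
def rifu_view():
    return rifu()    # rifu() pulls its parameters from request.args itself

if __name__ == '__main__':
    app.run(debug=True)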
import sys
import itertools

import nltk
from BeautifulSoup import BeautifulSoup  # BS3-era import; on newer installs use `from bs4 import BeautifulSoup`

import doc_freq_class
import getnearbywords_intokens
# proper_noun (scraper / NNP n-gram extractor) and alchemyObj (a configured
# AlchemyAPI client) are assumed to be defined elsewhere in this project.


def main():
    obj = proper_noun()
    text = obj.scrape(sys.argv[1])

    # Fetch the page title via AlchemyAPI and pull the <title> text out of it.
    title = alchemyObj.URLGetTitle(sys.argv[1])
    soup = BeautifulSoup(title)
    raw = soup('title')
    tokens_title_first = [str(tag.text) for tag in raw]
    # tokens_title = ['Three', 'Musketeers']
    print "title::",
    print tokens_title_first

    ### Take proper nouns (NNP) from the title
    tokens_title_first = str(tokens_title_first[0])
    print tokens_title_first
    tokens_title_temp = nltk.word_tokenize(tokens_title_first)
    tokens_title_pos = nltk.pos_tag(tokens_title_temp)
    print "tokens_title_temp::",
    print tokens_title_temp

    # Keep alphabetic NNP tokens of length >= 3. enumerate() avoids the
    # wrong-index result that list.index() gives for duplicate tokens.
    tokens_title = []
    for index, t in enumerate(tokens_title_temp):
        print "t::" + t
        print "index::" + str(index)
        print "tag::" + tokens_title_pos[index][1]
        print "len::" + str(len(t))
        if t.isalpha() and tokens_title_pos[index][1] == "NNP" and len(t) >= 3:
            tokens_title.append(t)

    # Sort, then drop duplicates.
    tokens_title.sort()
    tokens_title = [k for k, _ in itertools.groupby(tokens_title)]
    print "title::",
    print tokens_title

    list_of_NNPs = obj.get_nnp_ngrams(text, 5, 0)
    # list_of_NNPs = [['Three', 'Musketeers'], ['Alexandre', 'Dumas']]
    print "list of NNPs: ",
    print list_of_NNPs
    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]

    # For each NNP n-gram, record its best co-occurrence document frequency
    # against any title word.
    doc_freq_obj = doc_freq_class.context()
    print "getting doc freq"
    max_df = []
    for n in list_of_NNPs:
        print "got n"
        max_freq = 0
        for t in tokens_title:
            print "got t"
            df = doc_freq_obj.get_together_DF(n, t)
            if df > max_freq:
                max_freq = df
            print "ngram:",
            print n
            print "title word:",
            print t
            print "df:",
            print df
        max_df.append(max_freq)

    # Bubble-sort both lists together, in descending order of document frequency.
    for df in max_df:
        for i in range(len(max_df) - 1):
            if max_df[i] < max_df[i + 1]:
                list_of_NNPs[i], list_of_NNPs[i + 1] = list_of_NNPs[i + 1], list_of_NNPs[i]
                max_df[i], max_df[i + 1] = max_df[i + 1], max_df[i]

    for i in range(len(list_of_NNPs)):
        print "keyword: ",
        print list_of_NNPs[i]
        print "df:",
        print max_df[i]

    # Keep only the top three keywords.
    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]
    print "\n\nfinal list:",
    print list_of_NNPs

    nearbywordsObj = getnearbywords_intokens.getnearbywords()
    nearbywordsObj.get_words_from_proximity(list_of_NNPs, text)
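
# The nested loop in main() is a hand-rolled descending bubble sort that keeps
# list_of_NNPs and max_df aligned. An equivalent, more idiomatic sketch is
# below; rank_by_df is a hypothetical helper, not part of the original code,
# and takes the same two lists main() builds.
def rank_by_df(list_of_NNPs, max_df):
    # Pair each n-gram with its best document frequency, sort on the
    # frequency alone (highest first), then unzip back into two lists.
    pairs = sorted(zip(max_df, list_of_NNPs), key=lambda p: p[0], reverse=True)
    return [ngram for _, ngram in pairs], [df for df, _ in pairs]

# Usage: list_of_NNPs, max_df = rank_by_df(list_of_NNPs, max_df)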