def __init__(self, word):
    self.word = word
    self.freq_word = 0        # frequency of the word in the document
    self.freq_together = 0    # frequency of the word occurring together with the keyword
    self.doc_freq_obj = doc_freq_class.context()
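# NOTE: the document_frequency container used by get_words_from_proximity below is not
# shown in these snippets. The class below is only a minimal sketch, inferred from how
# the code uses it (keyword, neighbours, cnt, addneighbour); the real class may differ.
# The record type holding the __init__ above is assumed here to be called neighbour.
class document_frequency:
    def __init__(self, keyword):
        self.keyword = keyword    # keyword as a list of words, e.g. ['Three', 'Musketeers']
        self.neighbours = []      # neighbour records collected around each occurrence
        self.cnt = 0              # number of neighbours collected so far

    def addneighbour(self, word):
        # hypothetical: wrap the word in the neighbour record sketched above
        # and keep the running count in sync with the neighbours list
        self.neighbours.append(neighbour(word))
        self.cnt = self.cnt + 1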
def get_words_from_proximity(self, keywords, text):
    #think of how to get nouns from sentence only... !!
    #create object of doc_frequency
    doc_freq_obj = doc_freq_class.context()
    tokens = nltk.word_tokenize(text)
    # drop non-alphanumeric tokens; filtering into a new list avoids the
    # skipped-element bug of removing from a list while iterating over it
    tokens = [t for t in tokens if t.isalnum()]
    c = nltk.ConcordanceIndex(tokens, key=lambda s: s.lower())
    tokens_pos = nltk.pos_tag(tokens)
    doc_freq = []
    df_cnt = 0
    print "keywords going to loop",
    print keywords
    for kw in keywords:
        print "keyword::::::::",
        print kw
        #splitting the keyword is not required, as kw is already a list of strings
        first_word = kw[0]          # 1st word in the keyword
        keyword_len = len(kw)
        print "offset",
        print c.offsets(first_word)
        # one document_frequency entry per keyword; appending inside the offset loop
        # would create extra entries and leave doc_freq[df_cnt] undefined when no
        # occurrence matches
        doc_freq.append(document_frequency(kw))
        for offset in c.offsets(first_word):
            print kw
            nomatch = 0
            j = 1
            while j < keyword_len:
                if tokens[offset + j].lower() != kw[j].lower():
                    nomatch = 1
                    break
                j = j + 1
            if nomatch == 0:
                # look at up to 5 tokens before the matched keyword
                i = 5
                while i > 0:
                    if (offset - i) < 0:
                        break
                    # check the word (index 0) against the stopword list, not the POS tag
                    if (tokens_pos[offset - i][1] in ["NN", "NNP"]) and \
                       (tokens_pos[offset - i][0].lower() not in nltk.corpus.stopwords.words('english')):
                        doc_freq[df_cnt].addneighbour(tokens_pos[offset - i][0])
                        print tokens_pos[offset - i][0],
                    i = i - 1
                print "\m/ ",
                print kw,
                print "\m/ ",
                # look at up to 5 tokens after the matched keyword
                i = 1
                while i < 5:
                    if (offset + i + (keyword_len - 1)) >= len(tokens):
                        break
                    if (tokens_pos[offset + i + (keyword_len - 1)][1] in ["NN", "NNP"]) and \
                       (tokens_pos[offset + i + (keyword_len - 1)][0].lower() not in nltk.corpus.stopwords.words('english')):
                        doc_freq[df_cnt].addneighbour(tokens[offset + i + (keyword_len - 1)])
                        print tokens_pos[offset + i + (keyword_len - 1)][0],
                    i = i + 1
        print "\n\n"
        k = 0
        while k < doc_freq[df_cnt].cnt:
            doc_freq[df_cnt].neighbours[k].find_doc_freq(doc_freq[df_cnt].keyword)
            k = k + 1
        doc_freq[df_cnt].neighbours.sort(key=lambda x: x.freq_together, reverse=True)
        if doc_freq[df_cnt].cnt > 5:
            # keep the 5 neighbours with the highest co-occurrence weight
            doc_freq[df_cnt].neighbours = doc_freq[df_cnt].neighbours[:5]
            doc_freq[df_cnt].cnt = 5
        print "keyword: ",
        for l in doc_freq[df_cnt].keyword:
            print l,
        print "\n"
        print "neighbours: ",
        for m in doc_freq[df_cnt].neighbours:
            print m.word,
        print "\n"
        df_cnt = df_cnt + 1
    results = search_web(doc_freq)
    return results
def get_words_from_proximity(self, keyword_list, text):
    #think of how to get nouns from sentence only... !!
    #create object of doc_frequency
    doc_freq_obj = doc_freq_class.context()
    tokens = nltk.word_tokenize(text)
    # drop non-alphanumeric tokens; filtering into a new list avoids the
    # skipped-element bug of removing from a list while iterating over it
    tokens = [t for t in tokens if t.isalnum()]
    c = nltk.ConcordanceIndex(tokens, key=lambda s: s.lower())
    tokens_pos = nltk.pos_tag(tokens)
    doc_freq = []
    df_cnt = 0
    print "keywords going to loop",
    print keyword_list
    # split each keyword string into its component words
    keywords = []
    for k in keyword_list:
        kw = nltk.word_tokenize(k)
        keywords.append(kw)
    print "keywords"
    print keywords
    for kw in keywords:
        print "keyword::::::::",
        print kw
        first_word = kw[0]          # 1st word in the keyword
        keyword_len = len(kw)
        print "offset",
        print c.offsets(first_word)
        doc_freq.append(document_frequency(kw))
        no_of_times = 0
        for offset in c.offsets(first_word):
            print kw
            nomatch = 0
            j = 1
            while j < keyword_len:
                if tokens[offset + j].lower() != kw[j].lower():
                    nomatch = 1
                    break
                j = j + 1
            if nomatch == 0:
                # look at up to 5 tokens before the matched keyword
                i = 5
                while i > 0:
                    if (offset - i) < 0:
                        break
                    # check the word (index 0) against the stopword list, not the POS tag
                    if (tokens_pos[offset - i][1] in ["NN", "NNP"]) and \
                       (tokens_pos[offset - i][0].lower() not in nltk.corpus.stopwords.words('english')):
                        doc_freq[df_cnt].addneighbour(tokens_pos[offset - i][0])
                        print tokens_pos[offset - i][0],
                    i = i - 1
                print "\m/ ",
                print kw,
                print "\m/ ",
                # look at up to 5 tokens after the matched keyword
                i = 1
                while i < 5:
                    if (offset + i + (keyword_len - 1)) >= len(tokens):
                        break
                    if (tokens_pos[offset + i + (keyword_len - 1)][1] in ["NN", "NNP"]) and \
                       (tokens_pos[offset + i + (keyword_len - 1)][0].lower() not in nltk.corpus.stopwords.words('english')):
                        doc_freq[df_cnt].addneighbour(tokens[offset + i + (keyword_len - 1)])
                        print tokens_pos[offset + i + (keyword_len - 1)][0],
                    i = i + 1
                print "\n\n"
                k = 0
                while k < doc_freq[df_cnt].cnt:
                    doc_freq[df_cnt].neighbours[k].find_doc_freq(doc_freq[df_cnt].keyword)
                    k = k + 1
                doc_freq[df_cnt].neighbours.sort(key=lambda x: x.freq_together, reverse=True)
                if doc_freq[df_cnt].cnt > 5:
                    # keep the 5 neighbours with the highest co-occurrence weight
                    doc_freq[df_cnt].neighbours = doc_freq[df_cnt].neighbours[:5]
                    doc_freq[df_cnt].cnt = 5
                print "keyword: ",
                for l in doc_freq[df_cnt].keyword:
                    print l,
                print "\n"
                print "neighbours: ",
                for m in doc_freq[df_cnt].neighbours:
                    print m.word,
                print "\n"
                # only process the first two matched occurrences of this keyword
                no_of_times = no_of_times + 1
                if no_of_times >= 2:
                    break
        df_cnt = df_cnt + 1
    #results = search_web(doc_freq)
    print doc_freq
    return doc_freq
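# A minimal usage sketch for the revised get_words_from_proximity above. The
# getnearbywords_intokens.getnearbywords class is taken from main() further down;
# the keyword strings mirror the test data there and are purely illustrative. This
# revision expects plain keyword strings and tokenizes them itself.
def _example_proximity_usage(text):
    nearbywordsObj = getnearbywords_intokens.getnearbywords()
    doc_freq = nearbywordsObj.get_words_from_proximity(
        ['Three Musketeers', 'Alexandre Dumas'],  # keyword strings, split internally
        text)                                     # full document text scraped earlier
    for entry in doc_freq:
        # each entry pairs a keyword with its highest-weighted neighbouring nouns
        print entry.keyword,
        print [n.word for n in entry.neighbours]
    return doc_freq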
def main():
    obj = proper_noun()
    text = obj.scrape(sys.argv[1])
    title = alchemyObj.URLGetTitle(sys.argv[1])
    soup = BeautifulSoup(title)
    raw = soup('title')
    tokens_title_first = [str(title.text) for title in raw]
    print "title::",
    print tokens_title_first

    ### Take the proper nouns in the title
    tokens_title_first = str(tokens_title_first[0])
    print tokens_title_first
    tokens_title_temp = nltk.word_tokenize(tokens_title_first)
    tokens_title_pos = nltk.pos_tag(tokens_title_temp)
    print "tokens_title_temp::",
    print tokens_title_temp
    tokens_title = []
    # enumerate keeps the right POS tag even when a token occurs more than once
    # (list.index() would always return the first occurrence)
    for index, t in enumerate(tokens_title_temp):
        print "t::" + t
        print "index::" + str(index)
        print "tag::" + tokens_title_pos[index][1]
        print "len" + str(len(t))
        if t.isalpha() and (tokens_title_pos[index][1] == "NNP") and (len(t) >= 3):
            tokens_title.append(t)
    # sort and drop consecutive duplicates
    tokens_title.sort()
    tokens_title = list(tokens_title for tokens_title, _ in itertools.groupby(tokens_title))
    print "title::",
    print tokens_title

    list_of_NNPs = obj.get_nnp_ngrams(text, 5, 0)
    print "list of NNPs: ",
    print list_of_NNPs
    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]

    # for each NNP n-gram, record its highest document frequency with any title word
    doc_freq_obj = doc_freq_class.context()
    print "getting doc freq"
    max_df = []
    for n in list_of_NNPs:
        print "got n"
        max_freq = 0
        for t in tokens_title:
            print "got t"
            df = doc_freq_obj.get_together_DF(n, t)
            if df > max_freq:
                max_freq = df
            print "ngram:",
            print n
            print "title word:",
            print t
            print "df:",
            print df
        max_df.append(max_freq)

    # simple bubble sort: order the n-grams by descending max_df
    for df in max_df:
        for i in range(len(max_df) - 1):
            if max_df[i] < max_df[i + 1]:
                t = list_of_NNPs[i]
                list_of_NNPs[i] = list_of_NNPs[i + 1]
                list_of_NNPs[i + 1] = t
                t1 = max_df[i]
                max_df[i] = max_df[i + 1]
                max_df[i + 1] = t1

    for i in range(len(list_of_NNPs)):
        print "keyword: ",
        print list_of_NNPs[i]
        print "df:",
        print max_df[i]

    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]
    print "\n\nfinal list:",
    print list_of_NNPs

    nearbywordsObj = getnearbywords_intokens.getnearbywords()
    nearbywordsObj.get_words_from_proximity(list_of_NNPs, text)
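# These snippets do not show how main() is invoked; a standard entry-point guard is
# assumed here, with the target URL passed as the only command-line argument.
if __name__ == "__main__":
    main()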
def keywords(self, url, text):
    # import pdb;pdb.set_trace();
    title = alchemyObj.URLGetTitle(url)
    soup = BeautifulSoup(title)
    raw = soup('title')
    tokens_title_first = [str(title.text) for title in raw]
    print "title::",
    print tokens_title_first

    ### Take the proper nouns in the title
    tokens_title_first = str(tokens_title_first[0])
    print tokens_title_first
    tokens_title_temp = nltk.word_tokenize(tokens_title_first)
    tokens_title_pos = nltk.pos_tag(tokens_title_temp)
    print "tokens_title_temp::",
    print tokens_title_temp
    tokens_title = []
    # enumerate keeps the right POS tag even when a token occurs more than once
    # (list.index() would always return the first occurrence)
    for index, t in enumerate(tokens_title_temp):
        print "t::" + t
        print "index::" + str(index)
        print "tag::" + tokens_title_pos[index][1]
        print "len" + str(len(t))
        if t.isalpha() and (tokens_title_pos[index][1] == "NNP") and (len(t) >= 3):
            tokens_title.append(t)
    # sort and drop consecutive duplicates
    tokens_title.sort()
    tokens_title = list(tokens_title for tokens_title, _ in itertools.groupby(tokens_title))
    print "title::",
    print tokens_title

    list_of_NNPs = self.get_nnp_ngrams(text, 5, 0)
    print "list of NNPs: ",
    print list_of_NNPs
    doc_freq_obj = doc_freq_class.context()
    print "getting doc freq"
    max_df = []
    # The document-frequency ranking of these n-grams against the title words
    # (the same logic as in main() above) is currently disabled, so the unranked
    # n-gram list is returned as-is.
    return list_of_NNPs
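# Usage sketch for keywords() above. The host class is not shown in these snippets;
# it is assumed here to be the same proper_noun helper that provides scrape() and
# get_nnp_ngrams() in main(), and the URL is only a placeholder.
def _example_keywords_usage():
    obj = proper_noun()
    url = "http://example.com/article"       # placeholder URL
    text = obj.scrape(url)                   # page text, as in main()
    list_of_NNPs = obj.keywords(url, text)   # NNP n-grams found in the text
    print list_of_NNPs
    return list_of_NNPs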