def hello(self, sentence):
    """Extract keyword phrases from *sentence* using RAKE.

    :param sentence: free text to analyze
    :return: JSON string — a list of ``{"name": phrase, "weight": score}``
        objects, one per RAKE keyword, in RAKE's own ranking order.
    """
    # NOTE(review): an earlier NLTK pipeline (tokenize, strip punctuation,
    # drop stopwords, POS-tag, join) lived here as dead code inside the
    # docstring; it and the unused `extracted` list have been removed.
    rake = Rake("SmartStoplist.txt")  # RAKE with the SMART stopword list
    keywords = rake.run(sentence)     # [(phrase, score), ...]
    return json.dumps(
        [dict(name=keyword[0], weight=keyword[1]) for keyword in keywords])
def get_phrases(sents, search_text, res_ind):
    """Run RAKE over the query text plus its best-matching sentences.

    :param sents: list of sentences for search
    :param search_text: search text
    :param res_ind: indices of best matching sents
    :return: (phrase, score) tuples from RAKE over the combined text
    """
    full_text = ' . '.join([sents[i] for i in res_ind])
    full_text = full_text + ' . ' + search_text
    rake = Rake()
    keys = rake.run(full_text)
    print(keys)  # debug output; print() form is Python 2/3 compatible
    query_words = word_tokenize(search_text)
    # Keep only phrases built entirely from words of the query itself
    # (replaces the original manual flag_present loop).
    query_phrases = [
        (phr, score) for phr, score in keys
        if all(word in query_words for word in word_tokenize(phr))
    ]
    print(query_phrases)
    ###change the phrase to all possible synonyms, find the phrase with maximum match
    ###look for the nearest answer type to that phrase
    return keys
def hello(self, sentence):
    """Return the RAKE keywords of *sentence* as a JSON array.

    :param sentence: free text to analyze
    :return: JSON string of ``{"name": phrase, "weight": score}`` objects,
        one per RAKE keyword, in RAKE's ranking order.
    """
    # NOTE(review): the docstring previously contained a dead, commented-out
    # NLTK implementation (tokenize / strip punctuation / stopword filter /
    # POS-tag); it and the unused `extracted` local were removed.
    rake = Rake("SmartStoplist.txt")  # RAKE with the SMART stopword list
    keywords = rake.run(sentence)     # [(phrase, score), ...]
    return json.dumps(
        [dict(name=keyword[0], weight=keyword[1]) for keyword in keywords])
def get_keywords(text):
    """Gets main keywords using RAKE Algorithm."""
    extractor = Rake("SmartStoplist.txt")
    ranked = extractor.run(text)
    # Keep short (<= 2 word) phrases whose RAKE score exceeds 1.
    keywords = []
    for phrase, score in ranked:
        if len(phrase.split(" ")) <= 2 and score > 1:
            keywords.append(phrase)
    return keywords
def get_keywords(text, stopwords="SmartStoplist.txt"):
    """Return RAKE (phrase, score) keyword tuples for *text*.

    :param text: text to analyze
    :param stopwords: path to the stoplist file handed to Rake
    """
    # The rake-tutorial fork (https://github.com/zelandiya/RAKE-tutorial)
    # exposes extra knobs, e.g.
    #   Rake(stopwords, min_word_chars=5, phrase_max_words=3,
    #        min_kw_repeat_rate=4)
    # and reads better, but is less recently updated — so the plain
    # constructor is used here.
    extractor = Rake(stopwords)
    return extractor.run(text)
def get_rake_kp(file_name, topk):
    """Extract the top-k RAKE keyphrases for every record in a JSONL file.

    :param file_name: path to a JSON-lines file whose records carry
        'title' and 'abstract' fields
    :param topk: number of top-ranked RAKE keywords to keep per record
    :return: list (one entry per input line) of keyphrase lists from get_kp()
    """
    rake = Rake()  # hoisted: one extractor serves every record
    rake_kp = []
    # 'with' guarantees the file is closed even if a line fails to parse;
    # iterating the handle streams lines instead of readlines()-ing them all.
    with open(file_name, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            json_data = json.loads(line)
            cur_content = (json_data['title'].strip().lower()
                           + ' ' + json_data['abstract'].strip().lower())
            content_list = nltk.word_tokenize(cur_content)
            keywords_dict = rake.run(cur_content)
            # NOTE(review): assumes rake.run returns a ranked dict-like
            # mapping, so the first *topk* keys are the best keywords —
            # confirm against the Rake implementation in use.
            keywords_list = list(keywords_dict.keys())[:topk]
            rake_kp.append(get_kp(content_list, keywords_list))
    return rake_kp
def abstract_analyze(pdf, abstract):
    """Classify *pdf* by whether its abstract's strong RAKE keywords hit
    the match-word list; appends to the module-level list3 / list4.

    :param pdf: identifier appended to list3 (matched) or list4 (unmatched)
    :param abstract: abstract text to run RAKE over
    """
    match_word_file = "Matchlist.txt"
    match = load_match_words(match_word_file)
    stop_words_path = "SmartStoplist.txt"
    r = Rake(stop_words_path)
    temp = r.run(abstract)
    # Keep only keywords scoring >= 3 (split at a RAKE score threshold of 3).
    matched = []
    for item in temp:
        if item[1] >= 3:
            matched.append(item)
    # BUG FIX: the original then executed `matched = temp`, silently
    # discarding the threshold filter above; that override is removed so
    # the score cut-off actually takes effect.
    flag = False
    for item in matched:
        if item[0] in match:
            list3.append(pdf)
            flag = True
            break
    if not flag:
        list4.append(pdf)
def index():
    """Render the search form; on POST, analyze the submitted job posting."""
    if request.method != "POST":
        return render_template('index.html')
    description = request.form["description"]
    title = request.form["title"]
    # RAKE keywords of the description, reshaped into a dict for the template.
    extractor = Rake("all_stop_words.txt")
    keyword_dict = turn_tuple_to_dict(extractor.run(description))
    important_sentences = summarize(title, description)
    common_words = get_common_words(keyword_dict, important_sentences)
    return render_template("results.html",
                           keywords=keyword_dict,
                           summaries=important_sentences,
                           common_words=common_words)
def __getMainWords__(self, userInput):
    """Run RAKE (SMART stoplist) over *userInput* and return its
    (phrase, score) keyword tuples."""
    return Rake("SmartStoplist.txt").run(userInput)
re.sub('\n[\s]*', '\n', section_text.lower(), flags=re.M | re.DOTALL), flags=re.M | re.DOTALL), flags=re.M | re.DOTALL), flags=re.M | re.DOTALL) with open(str(cur_topic['pno']) + ' ' + cur_topic['title'] + '.txt', 'w') as f: f.write(section_text) preprocessed_sections.append(section_text) keywords = keywords.union( map(lambda x: x[0], unigram_rake.run(section_text))).union( map(lambda x: x[0], bigram_rake.run(section_text))).union( map(lambda x: x[0], trigram_rake.run(section_text))) print >> open('__Keywords.txt', 'w'), '\n'.join(list(keywords)) with open('__Sections.csv', 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(['Id', 'Section No.', 'Level', 'Section', 'Page No.']) writer.writerows([[ i, chapter_tree[i]['sno'], chapter_tree[i]['level'], chapter_tree[i]['title'], chapter_tree[i]['pno'] ] for i in range(len(chapter_tree[:-1])) if i not in skip]) with open('__OS-train.txt', 'w') as f:
def get_keyword(text):
    """Return the single highest-ranked RAKE phrase for *text*.

    :param text: text to analyze; blank/empty input yields "".
    :return: the top keyword phrase, or "" when nothing is extracted.
    """
    if not text:
        # Guard first — don't build an extractor for empty input.
        return ""
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    if not keywords:
        # Robustness: the original raised IndexError when RAKE found
        # nothing (e.g. stopword-only text).
        return ""
    return keywords[0][0]
def extract_product(html_content, url):
    """Scrape an Amazon-style product page into a Product object.

    :param html_content: raw HTML of the page
    :param url: the page's URL, stored on the product
    :return: (Product, errs) on success, (False, errs) when the page is
        not a product or a mandatory field (title) is missing; *errs* is
        a list of human-readable extraction problems.
    """
    #String Buffer
    string_buffer = ""
    errs = list()
    #Read page and read to extract product infomation
    parser = BeautifulSoup(html_content, "html.parser")
    #Check if the page is a product, if not skip page.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)
    #New Product as a object
    product = Product()
    #New Keyword rank
    keyword = Rake(SmartStopList.words())
    #Find URL
    product.SetUrl(url)
    #Find Brand: Note: Some products have an image for the brand
    truth, string_buffer = search_table(
        parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    if truth:
        product.SetBrand(string_buffer)
    else:
        # Fall back to the <a id="brand"> link when the spec table misses.
        string_buffer = parser.find("a", attrs={"id": "brand"})
        if string_buffer != None:
            product.SetBrand(string_buffer.get_text().strip())
        else:
            errs.append("Could not find Brand")
    #Find Title
    string_buffer = parser.find("span", attrs={"id": "productTitle"})
    if string_buffer != None:
        product.SetTitle(string_buffer.get_text().strip())
    else:
        # Title is mandatory — bail out without a product.
        errs.append("Could not find Title")
        return (False, errs)
    #Find Image
    string_buffer = parser.find("img", attrs={"id": "landingImage"})
    if string_buffer != None:
        string_buffer = string_buffer.get("data-old-hires")
        if len(string_buffer) < 2:
            # No hi-res URL; pull the first https://....jpg out of the
            # dynamic-image JSON attribute instead.
            string_buffer = parser.find("img", attrs={
                "id": "landingImage"
            }).get("data-a-dynamic-image")
            m = re.search('https://(.+?).jpg', string_buffer)
            if m:
                string_buffer = m.group(1)
                string_buffer = "https://{}.jpg".format(string_buffer)
        #print ("Img Url: "+string_buffer)
        product.SetImage(string_buffer)
    else:
        errs.append("Could not find Image")
    #Find Small Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "feature-bullets"})
    if string_buffer != None:
        string_buffer = string_buffer.find("ul")
        try:
            string_buffer = string_buffer.find_all("li")
            if string_buffer != None:
                # Concatenate every bullet's <span> text into one blob.
                string_buffer_2 = ""
                for span in string_buffer:
                    string_buffer_3 = span.find("span")
                    if string_buffer_3 != None:
                        string_buffer_3 = string_buffer_3.get_text()
                        try:
                            string_buffer_2 = "{} {}".format(
                                string_buffer_2, string_buffer_3.strip())
                        except:
                            pass
                saved_buffer = string_buffer_2.strip()
                #Calculating Key Words
                keywords_1 = keyword.run(saved_buffer)
                product.SetSmallBlog(keywords_1)
        except:
            errs.append("Error finding li")
    else:
        errs.append("Could not find small section keywords")
    #Find Large Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "productDescription"})
    if string_buffer != None:
        string_buffer = string_buffer.find("p")
        if string_buffer != None:
            string_buffer = string_buffer.get_text()
            saved_buffer = string_buffer.strip()
            #Calculating Key Words
            keywords_2 = keyword.run(saved_buffer)
            product.SetLargeBlob(keywords_2)
    else:
        errs.append("Could not find large section keywords")
    #Find ASIN
    product.SetSourceID(asin)
    #TODO: Perform price save!
    #Append the product to large list of products
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
def getRakeKeywords(doc):
    """Return up to the 300 top RAKE (phrase, score) tuples for file *doc*.

    :param doc: path of the text file to analyze
    """
    # path.join('', cur_dir + '/...') was a no-op join; build the path
    # from its two components instead.
    r = Rake(path.join(cur_dir, 'SmartStoplist.txt'))
    # 'with' closes the handle; the original open(doc).read() leaked it.
    with open(doc) as fh:
        text = fh.read().replace('\n', ' ')
    candidates = r.run(text)
    return candidates[:300]
def get_key_phrases(document, stop_list):
    """Return RAKE phrases of at most three words from *document*.

    :param document: text to analyze (lower-cased before extraction)
    :param stop_list: stoplist handed to the Rake constructor
    """
    extractor = Rake(stop_list)
    ranked = extractor.run(document.lower())
    phrases = []
    for entry in ranked:
        phrase = entry[0]
        if len(phrase.split(" ")) < 4:
            phrases.append(phrase)
    return phrases
# NOTE(review): fragment of a larger routine — `tagged`, `extracted` and
# `sentence` are defined outside this view; indentation reconstructed.
if (tagged[0] not in extracted and tagged[0] != ""):
    extracted.append(tagged[0])
importantwords = ', '.join(extracted)
# print (importantwords)
fdist = FreqDist(extracted)  # frequency distribution of extracted words
# print (fdist)
# print (fdist.most_common(50))
rake = Rake("SmartStoplist.txt")
keywords = rake.run(sentence)  # [(phrase, score), ...]
# print (keywords)
for keyword in keywords:
    word = keyword[0]
    # print (word)
# Fetch a fixed Wikipedia page and rebuild `sentence` from its body text.
response = requests.get('http://en.wikipedia.org/wiki/Led_Zeppelin')
soup = BeautifulSoup(response.text)
content = soup.find(id='mw-content-text')
sentence = ' '.join(content.text.split())
# NOTE(review): fragment of a larger script — `parser`, `rake` and
# `results` are defined outside this view, and it ends mid-computation;
# indentation reconstructed. Python 2 (`iteritems`).
users = parser.getUsers()
with open("data/edinburgh_restaurant_reviews.json") as f:
    data = json.loads(f.readline())  # one JSON document on the first line
pos_polarity = 0
neg_polarity = 0
for business_id in data:
    results[business_id] = {}
    for review in data[business_id]:
        b = TextBlob(review["text"])
        # Accumulate positive and negative sentiment separately.
        if b.sentiment.polarity >= 0:
            pos_polarity += b.sentiment.polarity
        else:
            neg_polarity += b.sentiment.polarity
        # Count how often each RAKE topic appears per business.
        keywords = rake.run(b)
        for topic in keywords:
            if topic[0] not in results[business_id]:
                results[business_id][topic[0]] = {'count': 0}
            results[business_id][topic[0]]['count'] += 1
avg_pos_polarity = float(pos_polarity) / len(data.keys())
avg_neg_polarity = float(neg_polarity) / len(data.keys())
effective_keywords = []
for business_id in results:
    if len(results[business_id].keys()) != 0:
        count = 0
        for key, value in results[business_id].iteritems():
            count += int(value['count'])
        # Mean topic count for this business (fragment ends here).
        avg_count = count / len(results[business_id].keys())
# NOTE(review): fragment of a larger routine (near-duplicate of an earlier
# chunk) — `tagged`, `extracted` and `sentence` come from outside this
# view; indentation reconstructed.
if (tagged[0] not in extracted and tagged[0] != ""):
    extracted.append(tagged[0])
importantwords = ', '.join(extracted)
# print (importantwords)
fdist = FreqDist(extracted)  # frequency distribution of extracted words
# print (fdist)
# print (fdist.most_common(50))
rake = Rake("SmartStoplist.txt")
keywords = rake.run(sentence)  # [(phrase, score), ...]
# print (keywords)
for keyword in keywords:
    word = keyword[0]
    # print (word)
# Fetch a fixed Wikipedia page and rebuild `sentence` from its body text.
response = requests.get('http://en.wikipedia.org/wiki/Led_Zeppelin');
soup = BeautifulSoup(response.text)
content = soup.find(id='mw-content-text')
sentence = ' '.join(content.text.split())