Example #1
    def hello(self, sentence):
        '''
        Earlier approach, kept for reference: tokenize, strip punctuation,
        drop stop words, then add nouns and verbs picked out by POS tags.

        tokenized_sentence = word_tokenize(sentence)
        punctuation = re.compile(r'[-.?!,":;()|0-9]')
        tokenized_sentence = list(filter(None, tokenized_sentence))
        tokenized_sentence = [punctuation.sub("", word) for word in tokenized_sentence]
        extracted = []
        for w in tokenized_sentence:
            if w.lower() not in stopwords.words('english') and w != "":
                extracted.append(w)
        tagged_sent = pos_tag(extracted)
        interest_types = ["NN", "NNP", "NNS", "VBG", "VB"]
        for tagged in tagged_sent:
            word_type = tagged[1]
            if word_type in interest_types:
                if tagged[0] not in extracted and tagged[0] != "":
                    extracted.append(tagged[0])
        importantwords = ', '.join(extracted)
        '''
        rake = Rake("SmartStoplist.txt")
        keywords = rake.run(sentence)
        return json.dumps(
            [dict(name=keyword[0], weight=keyword[1]) for keyword in keywords])
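The examples on this page share the same RAKE convention: the constructor takes a stop-word file and run() returns (phrase, score) tuples sorted by descending score. A minimal standalone driver, assuming the rake module from https://github.com/zelandiya/RAKE-tutorial (or a compatible port) plus a local SmartStoplist.txt:

import json
from rake import Rake  # adjust to wherever your RAKE port lives

rake = Rake("SmartStoplist.txt")
keywords = rake.run("Compatibility of systems of linear constraints "
                    "over the set of natural numbers")
# each entry is a (phrase, score) tuple, highest-scoring first
print(json.dumps([dict(name=kw[0], weight=kw[1]) for kw in keywords]))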
Example #2
def get_phrases(sents, search_text, res_ind):
    '''
    :param sents: list of sentences for search
    :param search_text: search text
    :param res_ind: indices of best matching sents
    :return: phrases from query and top results
    '''
    full_text = ' . '.join([sents[i] for i in res_ind])
    full_text = full_text + ' . ' + search_text
    rake = Rake()
    keys = rake.run(full_text)
    print(keys)
    query_phrases = []
    query_words = word_tokenize(search_text)
    for phr, score in keys:
        words = word_tokenize(phr)
        # keep only phrases whose words all occur in the query
        if all(word in query_words for word in words):
            query_phrases.append((phr, score))
    print(query_phrases)
    # TODO: expand each phrase to all possible synonyms, find the phrase
    # with maximum match, then look for the nearest answer type to it
    return keys
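The TODO at the end of get_phrases hints at expanding each phrase to its synonyms before matching. A rough sketch of that step using NLTK's WordNet interface (the WordNet corpus must be downloaded first); expand_phrase is a hypothetical helper, not part of the original code:

from nltk.corpus import wordnet  # requires nltk.download('wordnet')

def expand_phrase(phrase):
    # collect each word plus its WordNet lemma names as match candidates
    variants = set()
    for word in phrase.split():
        variants.add(word)
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                variants.add(lemma.name().replace('_', ' '))
    return variants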
Example #4
def get_keywords(text):
    """
    Gets main keywords using RAKE Algorithm

    """
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    return [k[0] for k in keywords if len(k[0].split(" ")) <= 2 and k[1] > 1]
Example #5
def get_keywords(text, stopwords="SmartStoplist.txt"):
    #commented out text below uses the rake-tutorial code, which I like better, but is less recently updated
    #https://github.com/zelandiya/RAKE-tutorial
    #phrase_max_words = 3
    #min_word_chars = 5
    #min_kw_repeat_rate = 4
    #rake = Rake(stopwords, min_word_chars, phrase_max_words, min_kw_repeat_rate)
    rake = Rake(stopwords)
    keywords = rake.run(text)
    return keywords
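If you do switch to the RAKE-tutorial fork referenced in the comments above, its constructor takes the filtering thresholds directly; per that repo's README the positional order is stop words, minimum characters per word, maximum words per phrase, minimum keyword frequency:

import rake  # RAKE-tutorial's rake.py on the path

rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 4)
keywords = rake_object.run("some document text")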
Example #7
def get_rake_kp(file_name, topk):
    rake = Rake()
    rake_kp = []
    with open(file_name, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            json_data = json.loads(line)
            cur_content = json_data['title'].strip().lower() + ' ' + \
                json_data['abstract'].strip().lower()
            content_list = nltk.word_tokenize(cur_content)
            # rake.run() returns (phrase, score) tuples sorted by score,
            # so take the phrase from each of the top-k entries
            keywords = rake.run(cur_content)
            keywords_list = [kw[0] for kw in keywords[:topk]]
            kp_list = get_kp(content_list, keywords_list)
            rake_kp.append(kp_list)
    return rake_kp
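get_kp() is not shown in this example. A minimal sketch, assuming it filters the ranked phrases down to those whose words all occur in the tokenized document:

def get_kp(content_list, keywords_list):
    # hypothetical helper: keep keyphrases fully covered by the document tokens
    token_set = set(content_list)
    return [kp for kp in keywords_list
            if all(w in token_set for w in kp.split())]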
Example #8
def abstract_analyze(pdf, abstract):
    match_word_file = "Matchlist.txt"
    match = load_match_words(match_word_file)
    stop_words_path = "SmartStoplist.txt"
    r = Rake(stop_words_path)
    temp = r.run(abstract)
    matched = []
    for item in temp:
        if item[1] >= 3:  # keep only phrases scoring at least 3
            matched.append(item)
    flag = False
    for item in matched:
        if item[0] in match:
            list3.append(pdf)
            flag = True
            break
    if not flag:
        list4.append(pdf)
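load_match_words() is defined elsewhere in that project. A minimal sketch, assuming Matchlist.txt holds one match term per line:

def load_match_words(path):
    # read the match list into a set for fast membership tests
    with open(path, encoding="utf-8") as f:
        return set(line.strip() for line in f if line.strip())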
Example #9
def index():
    if request.method == "POST":
        job_description = request.form["description"]
        job_title = request.form["title"]

        rake = Rake("all_stop_words.txt")
        keyword_tuples = rake.run(job_description)
        keyword_dict = turn_tuple_to_dict(keyword_tuples)

        important_sentences = summarize(job_title, job_description)

        common_words = get_common_words(keyword_dict, important_sentences)

        return render_template("results.html",
                               keywords=keyword_dict,
                               summaries=important_sentences,
                               common_words=common_words)

    return render_template('index.html')
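turn_tuple_to_dict() is not shown. Under the (phrase, score) tuple convention used throughout these examples, it is most likely just a dict conversion:

def turn_tuple_to_dict(keyword_tuples):
    # map each extracted phrase to its RAKE score
    return {phrase: score for phrase, score in keyword_tuples}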
Example #10
    def __getMainWords__(self, userInput):
        rake = Rake("SmartStoplist.txt")
        keywords = rake.run(userInput)
        return keywords
Example #11
                                        re.sub('\n[\s]*',
                                               '\n',
                                               section_text.lower(),
                                               flags=re.M | re.DOTALL),
                                        flags=re.M | re.DOTALL),
                                 flags=re.M | re.DOTALL),
                          flags=re.M | re.DOTALL)

    with open(str(cur_topic['pno']) + ' ' + cur_topic['title'] + '.txt',
              'w') as f:
        f.write(section_text)

    preprocessed_sections.append(section_text)

    keywords = keywords.union(
        map(lambda x: x[0], unigram_rake.run(section_text))).union(
            map(lambda x: x[0], bigram_rake.run(section_text))).union(
                map(lambda x: x[0], trigram_rake.run(section_text)))

with open('__Keywords.txt', 'w') as f:
    f.write('\n'.join(list(keywords)))

with open('__Sections.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Id', 'Section No.', 'Level', 'Section', 'Page No.'])

    writer.writerows([[
        i, chapter_tree[i]['sno'], chapter_tree[i]['level'],
        chapter_tree[i]['title'], chapter_tree[i]['pno']
    ] for i in range(len(chapter_tree[:-1])) if i not in skip])

with open('__OS-train.txt', 'w') as f:
Example #12
def get_keyword(text):
    rake = Rake("SmartStoplist.txt")
    if text == "":
        return ""
    keywords = rake.run(text)
    # guard against RAKE returning no phrases at all
    return keywords[0][0] if keywords else ""
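Example call, assuming SmartStoplist.txt is available locally; the function returns only the single top-ranked phrase, or an empty string when the input is empty (or, with the guard above, when RAKE finds nothing):

print(get_keyword("deep learning methods for automatic keyword extraction"))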
Example #13
def extract_product(html_content, url):
    #String buffer reused while parsing
    string_buffer = ""
    errs = list()

    #Read page to extract product information
    parser = BeautifulSoup(html_content, "html.parser")

    #Check if the page is a product, if not skip page.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    #New Product as a object
    product = Product()
    #New Keyword rank
    keyword = Rake(SmartStopList.words())

    #Find URL
    product.SetUrl(url)

    #Find Brand: Note: Some products have an image for the brand
    truth, string_buffer = search_table(
        parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    if truth:
        product.SetBrand(string_buffer)
    else:
        string_buffer = parser.find("a", attrs={"id": "brand"})
        if string_buffer is not None:
            product.SetBrand(string_buffer.get_text().strip())
        else:
            errs.append("Could not find Brand")

    #Find Title
    string_buffer = parser.find("span", attrs={"id": "productTitle"})
    if string_buffer is not None:
        product.SetTitle(string_buffer.get_text().strip())
    else:
        errs.append("Could not find Title")
        return (False, errs)

    #Find Image
    string_buffer = parser.find("img", attrs={"id": "landingImage"})
    if string_buffer is not None:
        string_buffer = string_buffer.get("data-old-hires")
        if string_buffer is None or len(string_buffer) < 2:
            string_buffer = parser.find("img", attrs={
                "id": "landingImage"
            }).get("data-a-dynamic-image")
            m = re.search(r'https://(.+?)\.jpg', string_buffer)
            if m:
                string_buffer = "https://{}.jpg".format(m.group(1))
        #print ("Img Url: "+string_buffer)
        product.SetImage(string_buffer)
    else:
        errs.append("Could not find Image")

    #Find Small Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "feature-bullets"})
    if string_buffer is not None:
        try:
            bullet_items = string_buffer.find("ul").find_all("li")
            string_buffer_2 = ""
            for span in bullet_items:
                string_buffer_3 = span.find("span")
                if string_buffer_3 is not None:
                    string_buffer_2 = "{} {}".format(
                        string_buffer_2, string_buffer_3.get_text().strip())
            saved_buffer = string_buffer_2.strip()
            #Calculating Key Words
            keywords_1 = keyword.run(saved_buffer)
            product.SetSmallBlog(keywords_1)
        except Exception:
            errs.append("Error finding li")
    else:
        errs.append("Could not find small section keywords")

    #Find Large Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "productDescription"})
    if string_buffer is not None:
        string_buffer = string_buffer.find("p")
    if string_buffer is not None:
        string_buffer = string_buffer.get_text()
        saved_buffer = string_buffer.strip()
        #Calculating Key Words
        keywords_2 = keyword.run(saved_buffer)
        product.SetLargeBlob(keywords_2)
    else:
        errs.append("Could not find large section keywords")

    #Find ASIN
    product.SetSourceID(asin)

    #TODO: Perform price save!

    #Append the product to large list of products
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
Example #14
def getRakeKeywords(doc):
    r = Rake(path.join(cur_dir, 'SmartStoplist.txt'))
    # read the document as a single line of text
    with open(doc) as f:
        candidates = r.run(f.read().replace('\n', ' '))
    return candidates[:300]
Example #15
def get_key_phrases(document, stop_list):
    r = Rake(stop_list)
    keywords = r.run(document.lower())

    phrase_list = [word[0] for word in keywords if len(word[0].split(" ")) < 4]
    return phrase_list
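Example call, assuming a local stop-word file; the comprehension above keeps only phrases shorter than four words:

phrases = get_key_phrases(
    "RAKE ranks candidate phrases by word degree over word frequency.",
    "SmartStoplist.txt")
print(phrases)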
Example #16
        if (tagged[0] not in extracted and tagged[0] != ""):
            extracted.append(tagged[0])

importantwords = ', '.join(extracted)

# print (importantwords)

fdist = FreqDist(extracted)

# print (fdist)

# print (fdist.most_common(50))

rake = Rake("SmartStoplist.txt")

keywords = rake.run(sentence)

# print (keywords)

for keyword in keywords:
    word = keyword[0]
    # print (word)

response = requests.get('http://en.wikipedia.org/wiki/Led_Zeppelin')

soup = BeautifulSoup(response.text, "html.parser")

content = soup.find(id='mw-content-text')

sentence = ' '.join(content.text.split())
Example #17
users = parser.getUsers()

with open("data/edinburgh_restaurant_reviews.json") as f:
    data = json.loads(f.readline())

results = {}  # per-business keyword counts; must exist before the loop
pos_polarity = 0
neg_polarity = 0
for business_id in data:
    results[business_id] = {}
    for review in data[business_id]:
        b = TextBlob(review["text"])
        if b.sentiment.polarity >= 0:
            pos_polarity += b.sentiment.polarity
        else:
            neg_polarity += b.sentiment.polarity
        # RAKE expects a plain string, not a TextBlob
        keywords = rake.run(review["text"])
        for topic in keywords:
            if topic[0] not in results[business_id]:
                results[business_id][topic[0]] = {'count': 0}
            results[business_id][topic[0]]['count'] += 1

avg_pos_polarity = float(pos_polarity) / len(data.keys())
avg_neg_polarity = float(neg_polarity) / len(data.keys())

effective_keywords = []
for business_id in results:
    if len(results[business_id].keys()) != 0:
        count = 0
        for key, value in results[business_id].items():
            count += int(value['count'])
        avg_count = count / len(results[business_id].keys())