Example #1
    def hello(self, sentence):
        # Earlier NLTK-based extraction, kept commented out for reference:
        '''
        tokenized_sentence = word_tokenize(sentence)
        punctuation = re.compile(r'[-.?!,":;()|0-9]')
        tokenized_sentence = list(filter(None, tokenized_sentence))
        tokenized_sentence = [punctuation.sub("", word) for word in tokenized_sentence]
        extracted = []
        for w in tokenized_sentence:
            if (w.lower() not in stopwords.words('english') and w != ""):
                extracted.append(w)
        tagged_sent = pos_tag(extracted)
        interest_types = ["NN", "NNP", "NNS", "VBG", "VB"]
        for tagged in tagged_sent:
            word_type = tagged[1]
            if word_type in interest_types:
                if (tagged[0] not in extracted and tagged[0] != ""):
                    extracted.append(tagged[0])
        importantwords = ', '.join(extracted)
        '''
        extracted = []

        rake = Rake("SmartStoplist.txt")

        keywords = rake.run(sentence)

        return json.dumps([dict(name=keyword[0], weight=keyword[1]) for keyword in keywords])
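For reference, the JSON shape this method produces can be demonstrated without the web-service context. A minimal sketch, assuming the aneesha/RAKE-style API used above, where Rake(stoplist_path).run(text) returns (phrase, score) tuples sorted by descending score:

import json

sample_keywords = [("keyword extraction", 4.0), ("stop words", 2.0)]  # illustrative values
print(json.dumps([dict(name=k[0], weight=k[1]) for k in sample_keywords]))
# -> [{"name": "keyword extraction", "weight": 4.0}, {"name": "stop words", "weight": 2.0}]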
Example #3
    def generate_rake_keywords(self):
        if os.path.exists(self.keyword_file) and os.path.exists(self.rake_score_file):
            print('Already have the files [' + self.keyword_file + ', ' +
                  self.rake_score_file + '], directly load them.')
            self.load()
            return
        self.dataset.load()
        if self.path is None:
            self.path = self.dataset.default_path
        ger_stop_words = stopwords.words('german')
        stop_words = stopwords.words('english')
        stop_words.extend(ger_stop_words)
        stop_words.extend(['via', 'using', 'fr'])
        r = Rake(stop_words)
        r.extract_keywords_from_sentences(self.dataset.titles)
        path = os.path.join(self.path, 'keyword.dat')
        print('generate keywords', end='', flush=True)
        with open(path, 'wb') as f:
            i = 0
            for title in self.dataset.titles:
                i += 1
                if i % 100000 == 0:
                    print('.', end='', flush=True)
                phrases = r.generate_phrases(title)
                phrases = [' '.join(phrase) for phrase in phrases]
                self.keywords.append(phrases)
            pickle.dump(self.keywords, f)
        self.rake_scores = r.phrase_score
        path = os.path.join(self.path, 'rake_score.dat')
        with open(path, 'wb') as f:
            pickle.dump(self.rake_scores, f)
        print('done')
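The load() called above is not shown; given that this method pickles self.keywords into keyword.dat and self.rake_scores into rake_score.dat, a minimal counterpart sketch (load_rake_artifacts is a hypothetical helper, not the project's actual code) would be:

import pickle

def load_rake_artifacts(keyword_file, rake_score_file):
    # hypothetical counterpart to the pickle.dump calls in generate_rake_keywords
    with open(keyword_file, 'rb') as f:
        keywords = pickle.load(f)       # one list of phrases per title
    with open(rake_score_file, 'rb') as f:
        rake_scores = pickle.load(f)    # mapping: phrase -> RAKE score
    return keywords, rake_scores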
Example #4
def get_phrases(sents, search_text, res_ind):
    '''
    :param sents: list of sentences to search
    :param search_text: the search text
    :param res_ind: indices of the best-matching sentences
    :return: phrases from the query and top results
    '''
    full_text = ' . '.join([sents[i] for i in res_ind])
    full_text = full_text + ' . ' + search_text
    rake = Rake()
    keys = rake.run(full_text)
    print(keys)
    query_phrases = []
    query_words = word_tokenize(search_text)
    for phr, score in keys:
        words = word_tokenize(phr)
        flag_present = 1
        for word in words:
            if word not in query_words:
                flag_present = 0
        if flag_present == 1:
            query_phrases.append((phr, score))
    print(query_phrases)
    # TODO: change the phrase to all possible synonyms, find the phrase with the maximum match
    # TODO: look for the nearest answer type to that phrase
    return keys
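The trailing comments plan to expand each phrase into all possible synonyms. One way to do that with NLTK's WordNet, shown only as an illustration (expand_phrase_synonyms is not part of the original code, requires the NLTK 'wordnet' corpus, and the number of combinations grows quickly for long phrases):

from itertools import product
from nltk import word_tokenize
from nltk.corpus import wordnet as wn

def expand_phrase_synonyms(phrase):
    options = []
    for word in word_tokenize(phrase):
        synonyms = {word}
        for synset in wn.synsets(word):
            synonyms.update(lemma.name().replace('_', ' ') for lemma in synset.lemmas())
        options.append(sorted(synonyms))
    # every combination of per-word synonyms is a candidate variant of the phrase
    return [' '.join(combo) for combo in product(*options)]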
Example #5
def get_keywords(text):
    """
    Gets the main keywords using the RAKE algorithm.
    """
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    return [k[0] for k in keywords if len(k[0].split(" ")) <= 2 and k[1] > 1]
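The comprehension keeps only short, reasonably scored phrases. With made-up tuples of the shape run() returns, the filter behaves like this:

sample = [("rapid automatic keyword extraction", 8.5),
          ("keyword extraction", 4.0),
          ("text", 1.0)]
print([k[0] for k in sample if len(k[0].split(" ")) <= 2 and k[1] > 1])
# -> ['keyword extraction']  ("text" fails the score test, the long phrase fails the length test)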
Example #6
def get_keywords(text, stopwords="SmartStoplist.txt"):
    # The commented-out code below uses the rake-tutorial implementation, which I
    # like better, but it is less recently updated:
    # https://github.com/zelandiya/RAKE-tutorial
    # phrase_max_words = 3
    # min_word_chars = 5
    # min_kw_repeat_rate = 4
    # rake = Rake(stopwords, min_word_chars, phrase_max_words, min_kw_repeat_rate)
    rake = Rake(stopwords)
    keywords = rake.run(text)
    return keywords
Example #8
def get_rake_kp(file_name, topk):
    json_file = open(file_name, 'r', encoding='utf-8')
    rake_kp = []
    for line in json_file.readlines():
        json_data = json.loads(line)
        cur_content = (json_data['title'].strip().lower() + ' ' +
                       json_data['abstract'].strip().lower())
        content_list = nltk.word_tokenize(cur_content)
        rake = Rake()
        keywords_dict = rake.run(cur_content)  # this Rake variant returns a phrase -> score mapping
        keywords_list = list(keywords_dict.keys())[:topk]
        kp_list = get_kp(content_list, keywords_list)
        rake_kp.append(kp_list)
    json_file.close()
    return rake_kp
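Unlike the tuple-returning examples above, this snippet treats rake.run() as returning a mapping from phrase to score, and relies on its iteration order for the top-k cut. With an illustrative mapping:

keywords_dict = {"keyword extraction": 4.0, "stop words": 2.0, "text": 1.0}  # assumed rank order
print(list(keywords_dict.keys())[:2])
# -> ['keyword extraction', 'stop words']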
Example #9
def get_sentence_keyword_score(document, num_sentences):
    rake = Rake()
    keywords = rake.get_keywords(document)
    ranked_keywords = rake.generate_keyword_rank(keywords)
    sufficient_keywords_length = int(math.ceil(len(ranked_keywords) / 4.0))
    sufficient_keywords = ranked_keywords[:sufficient_keywords_length]
    total_keyword_score = 0.0
    # the value of a keyword is its score divided by the total score of all kept keywords
    sentence_keyword_score = [0.0] * num_sentences
    for keyword in sufficient_keywords:
        total_keyword_score += keyword['score']
    for keyword in sufficient_keywords:
        sentence_keyword_score[keyword['sentence_num']] += \
            keyword['score'] / total_keyword_score
    return sentence_keyword_score
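A tiny worked case of the normalisation (hypothetical keyword records in the shape this code expects): two keywords with scores 3.0 and 1.0 give a total of 4.0, so their sentences accumulate 0.75 and 0.25 respectively.

sufficient_keywords = [{'score': 3.0, 'sentence_num': 0},
                       {'score': 1.0, 'sentence_num': 1}]
total = sum(k['score'] for k in sufficient_keywords)  # 4.0
scores = [0.0, 0.0]
for k in sufficient_keywords:
    scores[k['sentence_num']] += k['score'] / total
print(scores)  # [0.75, 0.25]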
Example #10
def extract_sentiment_nltk(reviews):
    rake = Rake("SmartStoplist.txt")
    for item_id, review in reviews:
        pos_keyword_list, neg_keyword_list = extract_keywords_with_sentiment(review, rake)
        print("pos: {}".format(pos_keyword_list))
        print("neg: {}".format(neg_keyword_list))
        print("\n")
Example #11
def extract_titles_rake(reviews):

    rake = Rake("SmartStoplist.txt")
    for item_id, review in reviews:
        print("Review: {}".format(review))
        title = extract_title_rake(review, rake)
        print(title)
        print("\n")
Example #12
def abstract_analyze(pdf, abstract):
    match_word_file = "Matchlist.txt"
    match = load_match_words(match_word_file)
    stop_words_path = "SmartStoplist.txt"
    r = Rake(stop_words_path)
    temp = r.run(abstract)
    matched = []
    for item in temp:
        if item[1] >= 3:  # keep only phrases scoring at least 3
            matched.append(item)
    flag = False
    for item in matched:
        if item[0] in match:
            list3.append(pdf)
            flag = True
            break
    if not flag:
        list4.append(pdf)
Example #13
def keyword_title(title_corpus):

    ## here we need NLTK stopwords and punkt, which will be stored in /usr/share/nltk_data
    # uncomment to download

    # nltk.download('stopwords')
    nltk.download('punkt')

    title_dict = {}
    for t in title_corpus:
        key = (t[3], t[4])
        if key in title_dict:
            title_dict[key].append(t[1])
        else:
            title_dict[key] = []
            title_dict[key].append(t[1])

    # extract keywords with year span
    title_years = {}
    for k, v in title_dict.items():
        key = (k[0], )  # year index
        if key in title_years.keys():
            title_years[key].append(v)
        else:
            title_years[key] = []
            title_years[key].append(v)

    for k, v in title_years.items():
        r = Rake()
        vs = [item.rstrip('\n') for sublist in v for item in sublist]
        # a list of strings where each string is a sentence
        # r.extract_keywords_from_sentences(vs)
        # print('The keywords for year:{}'.format(str(k[0])))
        # print(r.get_ranked_phrases_with_scores()[0:10])

        title_txt = '.'.join(vs)
        title_txt = title_txt.strip('\n')
        r.extract_keywords_from_text(title_txt)
        print('The keywords for year:{}'.format(str(k[0])))
        # to get keyword phrases ranked from highest to lowest with scores
        print(r.get_ranked_phrases_with_scores()[0:10])
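This example uses the rake-nltk flavour of the API (extract_keywords_from_text / get_ranked_phrases_with_scores) rather than the run()-style seen earlier. A minimal standalone sketch, assuming the rake-nltk package is installed:

from rake_nltk import Rake

r = Rake()  # defaults to NLTK English stopwords and punctuation
r.extract_keywords_from_text("RAKE scores candidate phrases by word degree over word frequency.")
print(r.get_ranked_phrases_with_scores())  # [(score, phrase), ...], highest score first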
Example #14
def index():
    if request.method == "POST":

        job_description = request.form["description"]
        job_title = request.form["title"]

        rake = Rake("all_stop_words.txt")
        keyword_tuples = rake.run(job_description)
        keyword_dict = turn_tuple_to_dict(keyword_tuples)

        important_sentences = summarize(job_title, job_description)

        common_words = get_common_words(keyword_dict, important_sentences)

        return render_template("results.html",
                               keywords=keyword_dict,
                               summaries=important_sentences,
                               common_words=common_words)

    return render_template('index.html')
Example #15
def get_keyword(text):
    rake = Rake("SmartStoplist.txt")
    if text == "":
        return ""
    keywords = rake.run(text)
    if not keywords:  # run() may find no phrases at all
        return ""
    return keywords[0][0]
Example #16
from pandas import DataFrame

# the list of stopwords from nltk.corpus.stopwords.words, plus some more added based on the sample text
stopwords=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",'across',
 'needs','called','together','creates','tells','yet','1996','shows','following','discussed']

# the list of punctuation marks based on string.punctuation, plus some more added based on the text
punctuations=['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>','."',';}', '(...);',
 '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '<>', '[]', '()', '/*', '*/', '("', '")', ');', '//', '...','!"','•',"''",'",','""','','[])','".',
 '<<','>>','<<<','!)','(/*','*)','().','();','==']


file = open("jbn.txt", "r")  # the text used as data
text = file.read()
r = Rake(punctuations=punctuations, stopwords=stopwords, ranking_metric=2)
r.extract_keywords_from_text(text)

# if the words are to be stored in a text file
# file1 = open("keywords.txt", "w")
# for i in range(len(r.rank_list)//2):  # half of the phrases found are treated as keywords
#     file1.write(str(r.rank_list[i][0]) + "  " + r.rank_list[i][1])
#     file1.write("\n")
# file1.close()

# if the words are to be stored in an excel sheet

keywords = []
weights = []
for i in range(len(r.rank_list)//2):  # half of the phrases found are treated as keywords
    keywords.append(r.rank_list[i][1])
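The snippet is cut off mid-loop. Matching the layout in the commented-out file writer above (rank_list[i][0] is the score, rank_list[i][1] the phrase), a plausible completion of the loop and export might be:

    weights.append(r.rank_list[i][0])  # still inside the loop above

df = DataFrame({'keyword': keywords, 'weight': weights})
df.to_excel('keywords.xlsx', index=False)  # hypothetical file name; requires openpyxl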
Example #17
def get_key_phrases(document, stop_list):
    r = Rake(stop_list)
    keywords = r.run(document.lower())

    phrase_list = [word[0] for word in keywords if len(word[0].split(" ")) < 4]
    return phrase_list
Example #18
    'lol': ['league of legends', 'lol', 'riot'],
    'dota2': [
        'dota', 'dota2', 'defense of the ancients',
        'defense of the ancients 2', 'the international', 'ti7'
    ],
    'csgo': [
        'csgo', 'counter-strike', 'counter strike', 'cs-go',
        'counter-strike:global offensive'
    ],
    'overwatch': ['overwatch'],
    'wow': ['wow', 'world of warcraft'],
    'hots': ['hots', 'heroes of the storm'],
    'sc': ['starcraft 2', 'starcraft', 'sc2']
}

rake = Rake("SmartStoplist.txt", max_words_length=MAX_WORD_LENGTH)


def similar(a, b):
    if type(a) is list:
        similar_keywords = 0
        for a_val in a:
            for b_val in b:
                if Levenshtein.ratio(a_val, b_val) > KEYWORD_SIMILARITY_RATIO:
                    similar_keywords += 1
        return similar_keywords >= SIMILAR_KEYWORD_REQ
    return Levenshtein.ratio(a, b) > KEYWORD_SIMILARITY_RATIO
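KEYWORD_SIMILARITY_RATIO and SIMILAR_KEYWORD_REQ are defined elsewhere in the module; the matching itself rests on python-Levenshtein's normalised similarity, e.g.:

import Levenshtein

print(Levenshtein.ratio('counter strike', 'counter-strike'))  # ~0.93, so these two would match at a 0.8 threshold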


def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('', text)  # strip HTML tags
Example #19
def extract_product(html_content, url):
    # String buffer
    string_buffer = ""
    errs = list()

    # Read the page to extract product information
    parser = BeautifulSoup(html_content, "html.parser")

    #Check if the page is a product, if not skip page.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    # New Product object
    product = Product()
    # New keyword ranker
    keyword = Rake(SmartStopList.words())

    #Find URL
    product.SetUrl(url)

    #Find Brand: Note: Some products have an image for the brand
    truth, string_buffer = search_table(
        parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    if truth:
        product.SetBrand(string_buffer)
    else:
        string_buffer = parser.find("a", attrs={"id": "brand"})
        if string_buffer != None:
            product.SetBrand(string_buffer.get_text().strip())
        else:
            errs.append("Could not find Brand")

    #Find Title
    string_buffer = parser.find("span", attrs={"id": "productTitle"})
    if string_buffer != None:
        product.SetTitle(string_buffer.get_text().strip())
    else:
        errs.append("Could not find Title")
        return (False, errs)

    #Find Image
    string_buffer = parser.find("img", attrs={"id": "landingImage"})
    if string_buffer != None:
        string_buffer = string_buffer.get("data-old-hires")
        if len(string_buffer) < 2:
            string_buffer = parser.find("img", attrs={
                "id": "landingImage"
            }).get("data-a-dynamic-image")
            m = re.search('https://(.+?).jpg', string_buffer)
            if m:
                string_buffer = m.group(1)
                string_buffer = "https://{}.jpg".format(string_buffer)
        #print ("Img Url: "+string_buffer)
        product.SetImage(string_buffer)
    else:
        errs.append("Could not find Image")

    #Find Small Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "feature-bullets"})
    if string_buffer != None:
        string_buffer = string_buffer.find("ul")
    try:
        string_buffer = string_buffer.find_all("li")
        if string_buffer != None:
            string_buffer_2 = ""
            for span in string_buffer:
                string_buffer_3 = span.find("span")
                if string_buffer_3 != None:
                    string_buffer_3 = string_buffer_3.get_text()
                    try:
                        string_buffer_2 = "{} {}".format(
                            string_buffer_2, string_buffer_3.strip())
                    except:
                        pass
            saved_buffer = string_buffer_2.strip()
            #Calculating Key Words
            keywords_1 = keyword.run(saved_buffer)
            product.SetSmallBlog(keywords_1)
    except:
        errs.append("Error finding li")

    else:
        errs.append("Could not find small section keywords")

    #Find Large Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "productDescription"})
    if string_buffer != None:
        string_buffer = string_buffer.find("p")
    if string_buffer != None:
        string_buffer = string_buffer.get_text()
        saved_buffer = string_buffer.strip()
        #Calculating Key Words
        keywords_2 = keyword.run(saved_buffer)
        product.SetLargeBlob(keywords_2)
    else:
        errs.append("Could not find large section keywords")

    #Find ASIN
    product.SetSourceID(asin)

    #TODO: Perform price save!

    #Append the product to large list of products
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
Example #20
import datetime

from rake import Rake
from nltk.corpus import stopwords

starttime = datetime.datetime.now()
ger_stop_words = stopwords.words('german')
stop_words = stopwords.words('english')
stop_words.extend(ger_stop_words)
stop_words.extend(['via', 'using', 'fr'])
r = Rake(stop_words)
with open(r'dblp_index/title.dat', encoding='utf-8') as f_title:
    titles = []
    for line in f_title:
        titles.append(line)
r.extract_keywords_from_sentences(titles)
print('generate keywords', end='', flush=True)
with open(r'dblp_index/keywords.dat', 'a', encoding='utf-8') as keywords:
    with open(r'dblp_index/title.dat', encoding='utf-8') as titles:
        i = 0
        for line in titles:
            i += 1
            if i % 10000 == 0:
                print('.', end='', flush=True)
            phrases = r._generate_phrases(line)
            phrases_scores = []
            for phrase in phrases:
                true_phrase = '_'.join(phrase)
                score = r.phrase_score[true_phrase]
                phrases_scores.append(true_phrase + ":" + str(score))
            keywords.write(','.join(phrases_scores) + '\n')
print('done')
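Each line written to keywords.dat is therefore a comma-joined list of phrase:score pairs, one line per title; for illustration (made-up values):

phrases_scores = ['keyword_extraction:4.0', 'stop_words:2.0']
print(','.join(phrases_scores))
# -> keyword_extraction:4.0,stop_words:2.0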
Example #21
from rake import Rake

rake = Rake()
text = "杉山古墳(すぎやまこふん)は、奈良県奈良市大安寺にある古墳。形状は前方後円墳。大安寺古墳群を構成する古墳の1つ。国の史跡に指定されている(史跡「大安寺旧境内 附 石橋瓦窯跡」のうち)。"
print(rake.get_keywords(text, 3))
"""
Output Keyword List

['ぎやまこふん', '前方後円墳', '大安寺']
"""
Example #22
    def __getMainWords__(self, userInput):
        rake = Rake("SmartStoplist.txt")
        keywords = rake.run(userInput)
        return keywords
Example #23
#coding: utf-8

#util
from read_conf import config
import csv
from optparse import OptionParser
import cPickle as pickle
import os
import re
from itertools import combinations

#rake
from rake import Rake
rake = Rake()

#nltk
import nltk
from nltk.util import clean_html
from nltk.util import clean_url

#nlp
from nlp import nlp
mnlp = nlp()

tag_re = re.compile(r"<p>(.+?)</p>", re.DOTALL)

dp = config("../conf/dp.conf")


Example #24
# The purpose of this script is de-duplication:
# first read the titles, then compare them against the test titles to see whether any are duplicated.
import sys
import json
import csv_io
import sets

import parser

reload(sys)
sys.setdefaultencoding("utf-8")

from textblob import TextBlob
from collections import Counter
from rake import Rake

results = {}

rake = Rake("SmartStoplist.txt")

users = parser.getUsers()

with open("data/edinburgh_restaurant_reviews.json") as f:
    data = json.loads(f.readline())

pos_polarity = 0
neg_polarity = 0
for business_id in data:
    results[business_id] = {}
    for review in data[business_id]:
        b = TextBlob(review["text"])
        if b.sentiment.polarity >= 0:
            pos_polarity += b.sentiment.polarity
        else:
Example #25
    word_type = tagged[1]
    if word_type in interest_types:
        if (tagged[0] not in extracted and tagged[0] != ""):
            extracted.append(tagged[0])

importantwords = ', '.join(extracted)

# print (importantwords)

fdist = FreqDist(extracted)

# print (fdist)

# print (fdist.most_common(50))

rake = Rake("SmartStoplist.txt")

keywords = rake.run(sentence)

# print (keywords)

for keyword in keywords:
    word = keyword[0]
    # print (word)

response = requests.get('http://en.wikipedia.org/wiki/Led_Zeppelin')

soup = BeautifulSoup(response.text, 'html.parser')

content = soup.find(id='mw-content-text')
Example #26
chapter_tree += [{
    'sno': r'[\d]*',
    'level': -1,
    'title': 'Exercises',
    'pno': chapter_tree[-1]['pno'] + 1
}]  # ensures the last section is processed as per the current logic

try:
    os.stat('resources/' + pdfname)
except:
    os.mkdir('resources/' + pdfname)
finally:
    os.chdir('resources/' + pdfname)

# presumably the RAKE-tutorial constructor referenced in Example #6:
# Rake(stoplist_path, min_char_length, max_words_length, min_keyword_frequency)
unigram_rake = Rake('../stopwords.txt', 3, 1, 3)
bigram_rake = Rake('../stopwords.txt', 3, 2, 3)
trigram_rake = Rake('../stopwords.txt', 3, 3, 2)

keywords = set()
if split_mode == CHAPTER_MODE:
    chapter_tree = filter(lambda x: x['level'] == 1, chapter_tree)

pages = pages[40:]

skip = []

preprocessed_sections = []
for i, (cur_topic,
        next_topic) in enumerate(zip(chapter_tree[:-1], chapter_tree[1:])):
    if next_topic['level'] != -1:
Example #27
    dirname = pdfname

try:
    os.stat('../resources/' + dirname)
except:
    os.mkdir('../resources/' + dirname)
finally:
    os.chdir('../resources/' + dirname)

with open('__Sections.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Section No.', 'Level', 'Section', 'Page No.'])
    writer.writerows([[c['sno'], c['level'], c['title'], c['pno']]
                      for c in chapter_tree])

unigram_rake = Rake('../stopwords.txt', 3, 1, 3)
bigram_rake = Rake('../stopwords.txt', 3, 2, 3)
trigram_rake = Rake('../stopwords.txt', 3, 3, 2)

keywords = set()
if split_mode == CHAPTER_MODE:
    chapter_tree = list(
        filter(lambda x: int(x['level']) in [1, -1], chapter_tree))
else:
    chapter_tree = list(
        filter(lambda x: int(x['level']) in [1, 2, -1], chapter_tree))

preprocessed_sections = []
for i, (cur_topic,
        next_topic) in enumerate(zip(chapter_tree[:-1], chapter_tree[1:])):
    if next_topic['level'] != -1:
Example #28
def getRakeKeywords(doc):
    r = Rake(path.join(cur_dir, 'SmartStoplist.txt'))
    candidates = r.run(open(doc).read().replace('\n', ' '))
    return candidates[:300]
Example #30
import multiprocessing

from ensemble import apply_filters

# from ensemble import ensemble_predictions
num_cores = multiprocessing.cpu_count()
chunk_size = 11000
core_id = 4
offset = 5230

preprocessed_lines = open("resources/data/social_text_tokenized_pos",
                          "r").readlines()
rake_data_lines = open("resources/data/social_text_tokenized", "r") \
    .readlines()
lemma_lines = open("resources/data/social_text_tokenized_lemma",
                   "r").readlines()

rake = Rake("resources/CombinedStopList")
#rake_out_f = open("resources/data/social_text_tokenized_rake_filtered_" + str(core_id + 1), "a")
topic_rank_out_f = open(
    "resources/data/social_text_tokenized_pos_topicrank_filtered_" +
    str(core_id + 1), "a")
kpminer_out_f = open(
    "resources/data/social_text_tokenized_pos_kpminer_filtered_" +
    str(core_id + 1), "a")

input_file_path = "resources/sample_jd_preprocessed"
meta_file_path = "resources/sample_jd_meta_data"

freq_unigrams = get_freq_words()

new_preprocessed_lines = preprocessed_lines[
    chunk_size * core_id +
Example #31
    def __init__(self):
        self.model = Rake()
Example #32
import pandas as pd
import numpy as np
import pdb
import re
from nltk import RegexpTokenizer
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tag.stanford import NERTagger
from Tokenizers import SynonymTokenizer
from Tokenizers import SynonymStemTokenizer
from Tokenizers import StemTokenizer
from rake import Rake

rr = Rake()
class AnswerProcessor(object):
    def __init__(self, query, answers, score=None):
        '''answers are a list of strings, query is a string, score is a list of scores for each answer'''
        self.query = query
        self.answers = answers
        self.score = score
        # self.question_types = {'who': 'PERSON', 'whom': 'PERSON', 'whose': 'PERSON', 'where': 'LOCATION',
        #                        'when': ('DURATION', 'DATE'), 'how+adj/adv': 'NUMBER', 'how long': 'DURATION',
        #                        'how many': 'NUMBER', 'how much': 'NUMBER'}
        self.question_types = {'who': 'PERSON', 'whom': 'PERSON', 'whose': 'PERSON',
                               'where': 'LOCATION', 'when': 'CD', 'how+adj/adv': 'CD',
                               'how long': 'CD', 'how many': 'CD', 'how much': 'CD'}
        # for 'what', the next noun will be the thing we are searching for
        self.question_type = None
        self.query_tag = None
        self.answers_tag = None

    def stringProcessing(self, only_query=1):
        '''query is a string, answers is a list of strings; returns tuples with tags, with a list covering'''