def process_text(self):
    """Extract RAKE keyphrases from ``self.text`` and derive noun-only phrases.

    Reads ``self.text`` and ``self.text_type`` and stores each intermediate
    result on the instance:

    * ``all_phrases`` -- ``get_ranked_phrases_with_scores()`` output
    * ``article_text_tokenized`` -- ``word_tokenize`` of the lower-cased text
    * ``all_phrases_tokenized`` -- result of ``self.tokenize_phrases()``
    * ``all_phrases_tagged`` -- result of ``pos_tag_phrase_pairs``
    * ``all_phrases_tagged_nouns`` -- ``filter_pos(..., "nouns")`` of the above
    * ``string_phrases_nouns`` -- ``self.tuple_list_to_string_list()``
    """
    # Lower-case once. The flattened copy (new lines replaced by spaces) is
    # what gets tokenized and what the 'article' branch hands to RAKE.
    lowered = self.text.lower()
    text = re.sub('\n', ' ', lowered)

    # Extract keyphrases using Rake
    rake = Rake()
    if self.text_type == 'article':
        rake.extract_keywords_from_text(text)
    elif self.text_type == 'social':
        # extract_keywords_from_sentences() takes a list of strings; passing
        # the bare string made RAKE treat every character as a sentence.
        # Each non-blank line is treated as one sentence here.
        # NOTE(review): assumes one post/sentence per line -- confirm.
        sentences = [line for line in lowered.split('\n') if line.strip()]
        rake.extract_keywords_from_sentences(sentences)
    # NOTE(review): any other text_type reaches the next call without an
    # extraction having been run -- confirm whether that can happen.
    self.all_phrases = rake.get_ranked_phrases_with_scores()

    # Tokenize text
    self.article_text_tokenized = word_tokenize(text)

    # Tokenize phrases
    self.all_phrases_tokenized = self.tokenize_phrases()

    # Tag all phrases and remove all but noun words
    self.all_phrases_tagged = pos_tag_phrase_pairs(self.all_phrases)
    self.all_phrases_tagged_nouns = filter_pos(self.all_phrases_tagged, "nouns")

    # Convert list of tagged nouns back to a string phrase
    self.string_phrases_nouns = self.tuple_list_to_string_list()
def keywords_by_rake(
        texts_by_chapters: Sequence[Tuple[str, str]],
        word_count: int,
        filter_words: Optional[Sequence[str]] = None) -> Sequence[str]:
    """Return the ``word_count`` best RAKE keywords of the chapter texts.

    The text block of every ``(header, text)`` pair is handed to RAKE as one
    sentence entry. The ranked keywords are preprocessed, de-duplicated and,
    when ``filter_words`` is given, stripped of those words before the list
    is cut to ``word_count`` entries.
    """
    # Only the text part of each pair is used; headers are ignored.
    text_blocks = [block for _header, block in texts_by_chapters]

    extractor = Rake(
        stopwords=mod_config.STOP_WORDS,
        punctuations=mod_config.PUNCTUATION,
        max_length=1,
        min_length=1,
    )
    extractor.extract_keywords_from_sentences(text_blocks)

    # Ranked highest to lowest.
    ranked = extractor.ranked_phrases
    _logger.debug("Raw RAKE keywords: {}...".format(ranked[:20]))

    ranked = _remove_duplicates(_preprocess_words(ranked))

    if filter_words is not None:
        ranked = [word for word in ranked if word not in filter_words]

    return ranked[:word_count]
def rake(self):
    """Return lower-cased keyword words that two RAKE metrics agree on.

    Pass 1 ranks ``self.article.text`` with the WORD_DEGREE and
    WORD_FREQUENCY metrics and keeps the phrases found in both top-10 lists.
    Pass 2 re-ranks those shared phrases with both metrics and again keeps
    the phrases present in both top-10 lists. The surviving phrases are
    split into words; words containing a hyphen are dropped.

    Returns:
        list of lower-cased words, in WORD_DEGREE rank order of their phrase
        (the previous implementation returned them in arbitrary set order).
    """
    metrics = (Metric.WORD_DEGREE, Metric.WORD_FREQUENCY)

    # Pass 1: top-10 phrases of the article under each metric.
    first_pass = []
    for metric in metrics:
        extractor = Rake(ranking_metric=metric)
        extractor.extract_keywords_from_text(self.article.text)
        first_pass.append(extractor.get_ranked_phrases()[:10])
    shared = set(first_pass[0]) & set(first_pass[1])

    # Pass 2: re-rank the shared phrases. A sorted list is passed so the
    # input order no longer depends on set iteration.
    shared_sentences = sorted(shared)
    second_pass = []
    for metric in metrics:
        extractor = Rake(ranking_metric=metric)
        extractor.extract_keywords_from_sentences(shared_sentences)
        second_pass.append(extractor.get_ranked_phrases()[:10])

    # Keep phrases present in both lists, de-duplicated, in rank order.
    in_frequency_list = set(second_pass[1])
    agreed = [
        phrase for phrase in dict.fromkeys(second_pass[0])
        if phrase in in_frequency_list
    ]

    return [
        word.lower()
        for phrase in agreed
        for word in phrase.split()
        if "-" not in word
    ]
def get_keywords(tweets):
    """Return RAKE phrases from ``tweets`` that look like clean 2-3 word keywords.

    A ranked phrase is kept when it has two or three words, does not start
    with ``rt``, does not contain ``http`` and is purely alphabetic once
    spaces are removed. Kept phrases are stripped of surrounding
    punctuation and returned in RAKE rank order.
    """
    extractor = Rake()
    extractor.extract_keywords_from_sentences(tweets)

    def is_candidate(phrase):
        # Two or three words only.
        if not 1 < len(phrase.split()) < 4:
            return False
        # Drop retweet markers and anything carrying a link.
        if phrase[:2].lower() == 'rt' or 'http' in phrase:
            return False
        return phrase.replace(" ", "").isalpha()

    return [
        phrase.strip(string.punctuation)
        for phrase in extractor.get_ranked_phrases()
        if is_candidate(phrase)
    ]
def run(self, text, val):
    """Extract keyphrases from ``text`` and print link suggestions as JSON.

    Steps: RAKE keyphrase extraction, tokenization, POS tagging and noun
    filtering of the phrases, lookup of the phrases' surrounding tokens in
    the text, then Wikipedia and Medium lookups whose mappings are combined
    and printed with ``json.dumps``.

    Args:
        text: raw input text.
        val: ``'article'`` (RAKE splits the text into sentences itself) or
            ``'social'`` (each non-blank line is treated as one sentence).
            Also stored on ``self.val``.

    TODO Improvements:
        1. casual_tokenize can't handle 'words-with-hyphens-like-this' &
           reduces coverage
    """
    # TODO what if only wanting to read first x lines, but that should only
    # be for purposes of ML
    self.val = val

    # Lower-case once; the flattened copy (new lines replaced by spaces) is
    # what gets tokenized and what the 'article' branch hands to RAKE.
    lowered = text.lower()
    text = re.sub('\n', ' ', lowered)

    # Extract keyphrases using Rake
    rake = Rake()
    if val == 'article':
        rake.extract_keywords_from_text(text)
    elif val == 'social':
        # extract_keywords_from_sentences() takes a list of strings; passing
        # the bare string made RAKE treat every character as a sentence.
        # NOTE(review): assumes one post/sentence per line -- confirm.
        sentences = [line for line in lowered.split('\n') if line.strip()]
        rake.extract_keywords_from_sentences(sentences)
    all_phrases = rake.get_ranked_phrases_with_scores()

    # Tokenize text
    article_text_tokenized = casual_tokenize(text)

    # Tokenize phrases
    all_phrases_tokenized = self.tokenize_phrases(all_phrases)

    # Tag all phrases and remove all but noun words
    all_phrases_tagged = self.pos_tag_phrase_pairs(all_phrases)
    all_phrases_tagged_nouns = self.filter_nouns(all_phrases_tagged)

    # Convert list of tagged nouns back to a string phrase
    string_phrases_nouns = self.tuple_list_to_string_list(
        all_phrases_tagged_nouns)

    # Get the indexes from the non-filtered suggested phrases in the
    # original text
    all_surrounding_tokens, all_context_tokens = self.get_all_surrounding_tokens(
        all_phrases_tokenized, article_text_tokenized)

    # Get wikipedia urls for the top 10 phrases
    mapping_list = self.get_wiki_urls_top_n_phrases(
        string_phrases_nouns, all_surrounding_tokens, 10)
    wiki_mapping = self.write_suggestions_to_json(mapping_list)

    # Get page links on medium by phrase
    medium_mapping = self.get_n_listed_medium_posts(
        string_phrases_nouns, 2)

    # Combine both mappings and write the result to the console
    mapping = self.combine_mappings(wiki_mapping, medium_mapping)
    print(json.dumps(mapping))
def rake_keyword_extractor_raw_text(filename):
    """Return the filtered RAKE phrases of a UTF-8 text file.

    Every line of ``filename`` is passed to RAKE as one sentence and the
    ranked phrases go through ``filtering_condition_for_words``. If decoding
    fails, a message is printed and an empty list is returned.
    """
    extractor = Rake()
    keywords = []
    with open(filename, "r", encoding="utf-8") as handle:
        try:
            raw_lines = handle.readlines()
            extractor.extract_keywords_from_sentences(raw_lines)
            ranked = extractor.get_ranked_phrases()
            keywords = filtering_condition_for_words(ranked)
        except UnicodeDecodeError:
            print("Cant extract data from file: " + filename)
    return keywords
def extract_terms(self, DataFrame, min_len=2, max_len=4):
    """Combine RAKE keywords of the ``text`` column with author keywords.

    RAKE (WORD_DEGREE metric) is run over ``DataFrame['text']``, one cell
    per sentence entry. Author keywords are read from
    ``DataFrame['keywords']``: ``nan`` cells are skipped, the rest are
    lower-cased and split on ``;``. Keywords containing a digit or any
    character of the symbol blacklist are dropped; the remainder is
    stripped, de-duplicated and sorted ascending.

    :param DataFrame: frame with a ``text`` and a ``keywords`` column
    :param min_len: minimum keyword length
    :param max_len: maximum keyword length
    :return: sorted list of unique extracted and author keywords

    NOTE(review): ``min_len`` and ``max_len`` are accepted but not used by
    the body -- confirm whether they should limit RAKE's phrase length.
    """
    r = Rake(language='english',
             punctuations='!"#$%&\'()*+,-),./“:;≥≤<=|‘>©?@[\\]^_`{|}~',
             ranking_metric=Metric.WORD_DEGREE)

    # Extraction using the text column.
    r.extract_keywords_from_sentences(list(DataFrame['text']))
    raked_keywords = r.get_ranked_phrases()

    # Split every row's author keywords on its own. Concatenating the rows
    # with "" first glued the last keyword of one row to the first keyword
    # of the next one.
    author_keywords = []
    for entry in DataFrame['keywords']:
        if str(entry) == 'nan':  # blank cell
            continue
        author_keywords.extend(entry.lower().split(';'))

    keywords = raked_keywords + author_keywords

    # Drop every keyword that contains a digit.
    without_digits = [
        x for x in keywords if not any(char.isdigit() for char in x)
    ]

    # Drop every keyword that contains a blacklisted symbol.
    regex = re.compile('[@_!#$%^&""*..,≈·ακ⩽(∼苔草沼泽的no排放量天)<>?•η°/|}{~:]')
    cleaned = (x.strip() for x in without_digits if regex.search(x) is None)

    # De-duplicate, skip keywords that are empty after stripping (e.g. from
    # a trailing ';'), and return in ascending order.
    return sorted({x for x in cleaned if x})
def keywords_rake_nltk(self, texts=None, words=10, **kwargs):
    """Return the ``words`` top-ranked RAKE phrases using rake_nltk.

    ``texts`` defaults to ``self.contents(**kwargs)``. A ``list`` is treated
    as a list of sentences; anything else is passed to RAKE as one text.
    """
    extractor = Rake()
    if texts is None:
        texts = self.contents(**kwargs)

    # Pick the extraction entry point that matches the input shape.
    extract = (
        extractor.extract_keywords_from_sentences
        if isinstance(texts, list)
        else extractor.extract_keywords_from_text
    )
    extract(texts)

    return extractor.get_ranked_phrases()[:words]
def detect_and_translate(text):
    """Detect the language of ``text``, translate to English, print RAKE phrases.

    The detected language is printed first. Non-English text is printed and
    translated with ``lang_tgt='en'``; English text is used as is. The
    resulting text is split on new lines, each line is handed to RAKE as one
    sentence, and the text plus the scored phrase ranking are printed.

    Returns:
        None; all output goes to stdout.
    """
    translator = google_translator()
    r = Rake()

    # detect() result is indexed as [0] = language code, [1] = display
    # name. NOTE(review): inferred from how it is used here -- confirm.
    original_lang = translator.detect(text)
    print("\nSource Language was : ", original_lang[1])

    if original_lang[0] != 'en':
        print(text)
        translate_text = translator.translate(text, lang_tgt='en')
        print("\nEnglish Translation \n")
    else:
        # Already English: nothing to translate. Without this branch
        # translate_text was never assigned for English input.
        translate_text = text

    r.extract_keywords_from_sentences(translate_text.split('\n'))
    print(translate_text)

    ranked = r.get_ranked_phrases_with_scores()
    print("\nPhrases with Scores")
    print(ranked)
def rake_keyword_extractor(filename):
    """Return filtered RAKE phrases per category for a JSON data file.

    The records loaded by ``load_as_json(filename)`` are grouped by their
    ``'category'`` value. For each category the ``'text'`` values are given
    to RAKE as sentences and the ranked phrases are passed through
    ``filtering_condition_for_words``. The category name and the number of
    kept phrases are printed along the way.

    Returns:
        dict mapping category -> filtered phrase list.
    """
    extractor = Rake()

    # Group the texts by category, keeping first-seen category order.
    texts_by_category = {}
    for record in load_as_json(filename):
        texts_by_category.setdefault(record['category'], []).append(
            record['text'])

    keywords_by_category = {}
    for category, texts in texts_by_category.items():
        print(category)
        extractor.extract_keywords_from_sentences(texts)
        kept = filtering_condition_for_words(extractor.get_ranked_phrases())
        print(len(kept))
        keywords_by_category[category] = kept
    return keywords_by_category
def extract_keywords_from_doc(doc, phrases=True, return_scores=False):
    """Extract keywords from ``doc`` as RAKE phrases or as plain tokens.

    With ``phrases`` true, RAKE is used: a list/tuple ``doc`` is treated as
    sentences, anything else as one text. The ranked phrases are returned,
    or ``(phrase, score)`` pairs when ``return_scores`` is true.

    With ``phrases`` false, every entry of ``doc`` (a single value is
    wrapped in a list) is tokenized with ``nltk.word_tokenize`` and the
    tokens whose lower-case form is not in ``stop_words`` are returned.
    """
    if not phrases:
        entries = doc if isinstance(doc, (list, tuple)) else [doc]
        return [
            token
            for entry in entries
            for token in nltk.word_tokenize(entry)
            if token.lower() not in stop_words
        ]

    extractor = Rake()
    if isinstance(doc, (list, tuple)):
        extractor.extract_keywords_from_sentences(doc)
    else:
        extractor.extract_keywords_from_text(doc)

    if not return_scores:
        return extractor.get_ranked_phrases()
    # RAKE yields (score, phrase); callers get (phrase, score).
    return [
        (phrase, score)
        for score, phrase in extractor.get_ranked_phrases_with_scores()
    ]
def parse_keywords(self):
    """Return RAKE keywords of ``self.sentence``.

    With ``self.keyword_limit == 0`` every phrase scoring above 1 is
    appended to ``self.keywords`` and that list is returned. Otherwise the
    sentence is ranked as a single-sentence list and the first
    ``self.keyword_limit`` phrases are returned; ``self.keywords`` is left
    untouched.
    """
    extractor = Rake()

    if self.keyword_limit != 0:
        extractor.extract_keywords_from_sentences([self.sentence])
        return extractor.ranked_phrases[0:self.keyword_limit]

    extractor.extract_keywords_from_text(self.sentence)
    # Entries are (score, phrase); keep phrases scoring above 1.
    self.keywords.extend(
        phrase
        for score, phrase in extractor.get_ranked_phrases_with_scores()
        if score > 1
    )
    return self.keywords
def do_keyword_extraction(words):
    """Return the word of ``words`` with the highest corpus-wide RAKE degree.

    Word degrees are computed over the distinct values of ``_t["context"]``
    and over ``words`` itself. Each word found in ``words`` is re-scored
    with its corpus degree, and the top ``(word, degree)`` pair is returned.
    Intermediate values are printed when the module-level ``debug`` flag is
    set.
    """
    if debug:
        print("---\n", words)

    # Degrees over the whole corpus of distinct context strings.
    corpus_rake = Rake()
    corpus_rake.extract_keywords_from_sentences(
        _t["context"].value_counts().index.values)
    corpus_degrees = dict(corpus_rake.get_word_degrees())

    # Degrees over the given text only; just the words matter from here on.
    local_rake = Rake()
    local_rake.extract_keywords_from_text(words)
    keywords = dict(local_rake.get_word_degrees())
    if debug:
        print(keywords)

    # Swap each local degree for the corpus-wide one.
    keywords = {word: corpus_degrees[word] for word in keywords}
    if debug:
        print(keywords)

    return Counter(keywords).most_common(1)[0]
def lyrics_preprocessing(folder_path, tags_csv_path, output_csv_path):
    """Build a tab-separated tags/keywords/lyrics table from lyric files.

    For every file in ``folder_path`` the stripped lines are joined with
    ``;`` into one lyrics string. Files whose lyrics are shorter than 50
    characters, or whose first 100 characters are not detected as English,
    are skipped. For the rest, the top three RAKE phrases (lines as
    sentences) and the ``tags`` value of ``tags_csv_path`` for the file id
    (file name up to the first ``.``) are collected.

    Args:
        folder_path: directory holding one lyrics file per song.
        tags_csv_path: tab-separated file with an ``id`` index column and a
            ``tags`` column.
        output_csv_path: destination; rows are ``tags, keywords, lyrics``,
            tab-delimited, written without a header row.
    """
    collection = []

    # genre
    tags_table = pd.read_csv(tags_csv_path, sep='\t', index_col='id')

    # lyrics
    for i, file_name in enumerate(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        file_id = file_name.split('.')[0]
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f]
        lyrics = ';'.join(lines)

        # clean data -- run the cheap filters first so RAKE and the tag
        # lookup are never done for files that get discarded anyway
        if len(lyrics) < 50:
            continue
        if detect(lyrics[:100]) != 'en':
            continue

        # keywords
        r = Rake()
        r.extract_keywords_from_sentences(lines)
        keywords_str = ','.join(r.get_ranked_phrases()[:3])

        tags = tags_table.loc[file_id, 'tags']

        # add line
        collection.append([tags, keywords_str, lyrics])

        # progress output (only reached for files that were kept)
        if i % 100 == 0:
            print(i)

    with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(collection)
def get_keywords(messages, topics):
    """Return unique noun bi-gram keywords of ``messages`` plus the topics.

    RAKE ranks the ``'text'`` of every message. Only two-word phrases are
    considered, and a phrase is kept when the first word's POS tag contains
    ``NN``, the second word's tag equals ``NN`` and the first WordNet synset
    of each word is a noun. The first ``EXTRACT_KEYWORDS_COUNT`` kept
    phrases are merged with the lower-cased ``'text'`` of every topic and
    returned without duplicates (order not defined).
    """
    rake = Rake()
    rake.extract_keywords_from_sentences(
        [message['text'] for message in messages])

    def is_noun_bigram(words):
        tags = pos_tag(words)
        keep = 'NN' in tags[0][1] and 'NN' == tags[1][1]
        for word in words:
            synsets = wn.synsets(word)
            if not synsets:
                # Unknown to WordNet: reject immediately.
                return False
            if synsets[0].pos() != 'n':
                keep = False
        return keep

    # Only bi-grams
    bigrams = [
        item for item in rake.get_ranked_phrases_with_scores()
        if len(item[1].split()) == 2
    ]

    # Only noun bi-grams; entries stay (score, phrase) in rank order
    noun_bigrams = [
        item for item in bigrams if is_noun_bigram(item[1].split())
    ]

    extracted = [
        phrase for _score, phrase in noun_bigrams[:EXTRACT_KEYWORDS_COUNT]
    ]
    extracted.extend(topic['text'].lower() for topic in topics)
    return list(set(extracted))
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import gensim
import pdfminer3
import rake_nltk
from rake_nltk import Rake

txt = "C:/RandD/Ex1/test-EVD-SEARO.txt"

# Read all lines up front; the context manager closes the file handle,
# which the previous bare open() never did.
with open(txt, "r") as openfile:
    lines = openfile.readlines()

# Uses stopwords for english from NLTK, and all punctuation characters.
r = Rake()

# Every line of the file is treated as one sentence.
r.extract_keywords_from_sentences(lines)

# Keyword phrases ranked highest to lowest, as (score, phrase) tuples.
phraselist = r.get_ranked_phrases_with_scores()

# Show the five best phrases.
for i in phraselist[:5]:
    print("Line: ", i[1], " score: ", i[0])
def _average_word_vectors(tokens, wordVectors, words, unknown_index):
    """Return the mean word vector of ``words`` (zero vector when empty)."""
    total = np.zeros((wordVectors.shape[1],))
    if len(words) == 0:
        # Nothing to average: return zeros instead of dividing by zero.
        return total
    for word in words:
        total += wordVectors[tokens.get(word, unknown_index)]
    total /= len(words)
    return total


def getSentenceFeature(tokens, wordVectors, sentence, keyword='off', postag='off'):
    """Obtain the sentence feature for sentiment analysis by averaging word vectors.

    Args:
        tokens: dictionary that maps words to their row index in
            ``wordVectors``.
        wordVectors: word vectors, one row per token.
        sentence: list of words in the sentence of interest.
        keyword: ``'on'`` averages only the RAKE phrases obtained by passing
            every word of ``sentence`` to RAKE as its own sentence.
        postag: ``'on'`` (checked only when ``keyword`` is not ``'on'``)
            averages only the words whose ``nltk.pos_tag`` tag is in a fixed
            adverb/verb/adjective/noun tag list.

    Returns:
        sentVector: feature vector for the sentence. If the keyword or POS
        filter keeps nothing, all words of ``sentence`` are averaged
        instead. An empty ``sentence`` yields a zero vector.
    """
    # Row used for words missing from ``tokens``.
    # NOTE(review): presumably the index of an unknown-word row -- confirm.
    unknown_index = 19536

    if keyword == 'on':
        r = Rake()
        r.extract_keywords_from_sentences(sentence)
        selected = r.get_ranked_phrases()
    elif postag == 'on':
        kept_tags = ['RB', 'RBR', 'RBS', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
                     'VBP', 'VBZ', 'WRB', 'JJ', 'JJR', 'JJS', 'NN']
        selected = [
            word for word, tag in nltk.pos_tag(sentence) if tag in kept_tags
        ]
    else:
        selected = sentence

    # Fall back to the full sentence when the filter removed everything.
    words = selected if len(selected) != 0 else sentence
    return _average_word_vectors(tokens, wordVectors, words, unknown_index)
#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize

# Uses stopwords for english from NLTK, and all punctuation characters by default
r = Rake()


def _print_rankings(extractor):
    """Print the ranked phrases, then the ranked phrases with scores."""
    # Keyword phrases ranked highest to lowest.
    print(extractor.get_ranked_phrases())
    # Keyword phrases ranked highest to lowest with scores.
    print(extractor.get_ranked_phrases_with_scores())


##################################################################
## Extraction given the text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''
r.extract_keywords_from_text(mytext)
_print_rankings(r)
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]

##################################################################
## Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext))
_print_rankings(r)
# Recorded output: the same two lists as for the text-based extraction above.
from data_helper import get_reviews_from_database
from pre_processing import pre_process, get_stop_words
from rake_nltk import Rake

# Load the reviews and pre-process the 'content' field of each one.
sentences = get_reviews_from_database(4536405)
formated_sentences = [pre_process(review['content']) for review in sentences]

# RAKE with the project's own stop word list instead of the NLTK default.
stop_words = get_stop_words("data/stopwords.txt")
r = Rake(stopwords=stop_words, language="vietnamese")

# Extraction given the pre-processed reviews, one review per sentence entry.
r.extract_keywords_from_sentences(formated_sentences)

# Print the phrases ranked highest to lowest, one per line.
phrases = r.get_ranked_phrases()
for phrase in phrases:
    print(phrase)
def extractKeywords(document):
    """Return the RAKE keyword phrases of ``document``, ranked highest first.

    Args:
        document: list of strings, each one treated as a sentence.

    Returns:
        list of keyword phrases ranked from highest to lowest score.
    """
    r = Rake()
    # extract_keywords_from_sentences() only populates the extractor and
    # returns None, so the ranking has to be read back explicitly.
    r.extract_keywords_from_sentences(document)
    return r.get_ranked_phrases()
) title_id = 0 for l in range(len(lines)): if lines[l].strip().startswith("<EOS>"): continue title = lines_title[title_id].strip() title_id += 1 document = lines[l].replace('t outline . <s>', '').replace( ' <p> ', ' ').replace(' ', ' ').strip().replace(' <s> ', '\n').split('\n') body = lines[l].replace('t outline . <s>', '').strip() try: r = Rake() r.extract_keywords_from_sentences(document) top_features = r.get_ranked_phrases() top_features = clean_top_features(top_features, topK) except Exception: print(document) continue keywordsSTR = convert_keys_to_str(top_features) if len(title) > 2: title = title.lower().replace("paid notice :", "").replace("paid notice:", "").replace("journal;", "").strip() keywordsSTR = title + '[SEP]' + keywordsSTR if len(keywordsSTR.split(' ')) > 100:
def generate_questions(self, data):
    """Generate fill-in-the-gap question/answer pairs from ``data['text']``.

    Candidate answer phrases come from ``self.generate_keywords(text)``; if
    that returns ``None``, the top half of the RAKE ranking of the
    document's sentences is used instead. Every occurrence of a candidate
    phrase in the parsed document is turned into a gap, subject to two
    limits: at most three gaps per sentence, and the phrase's maximum
    similarity to already used phrases must stay below
    ``self.title_similarity``.

    Args:
        data: mapping with the keys ``'text'``, ``'title'`` and
            ``'subject'``.

    Returns:
        list of dicts with the keys ``"question"`` (sentence with every
        answer token replaced by underscores), ``"answer"``, ``"sentence"``
        (unmodified sentence text) and ``"short_gap"`` (sentence with the
        whole answer replaced by a single ``_____``).

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    text = data['text']
    # self.nlp is presumably a spaCy pipeline (uses .sents, .similarity,
    # .text_with_ws below) -- TODO confirm.
    title = self.nlp(data['title'])
    subject = self.nlp(data['subject'])
    doc = self.nlp(text)
    generated = []
    ranking_metrics = [Metric.WORD_DEGREE]
    sentences = [s.text for s in doc.sents]
    phrases_swisscom = self.generate_keywords(text)
    phrases = []
    # Example of a generate_keywords() result -- [phrases, scores, third list]:
    # EG: [['egyptian president gamal abdel nasser', 'suez canal', 'israeli war', 'arab world', 'egypt', 'suez crisis', 'soviet union', 'nasser', 'tripartite aggression', 'israel'], [1.0, 0.8614248633384705, 0.8030354976654053, 0.7896698713302612, 0.811191737651825, 0.8514521718025208, 0.6438262462615967, 0.8813737034797668, 0.5584405660629272, 0.7795075178146362], [['nasser'], ['canal'], [], [], [], [], [], ['egyptian president gamal abdel nasser'], [], []]]
    if phrases_swisscom == None:
        # fallback option just in case swisscom isn't working
        for metric in ranking_metrics:
            r = Rake(ranking_metric=metric, min_length=1, max_length=5)
            # Extraction given the sentences as a list of strings.
            r.extract_keywords_from_sentences(sentences)
            # To get keyword phrases ranked highest to lowest and strip out the last half.
            keywords = r.get_ranked_phrases()
            keywords = keywords[0:round(len(keywords) * 0.5)]
            phrases.extend(keywords)
    else:
        print(phrases_swisscom)
        # Pair each phrase with its score, drop phrases too similar to the
        # title or the subject, and order by score (highest first).
        phrases = sorted(
            [(p, phrases_swisscom[1][i])
             for i, p in enumerate(phrases_swisscom[0])
             if title.similarity(self.nlp(p)) < self.title_similarity
             and subject.similarity(self.nlp(p)) < self.title_similarity],
            key=lambda x: x[1],
            reverse=True)
        # Keep only the phrases scoring above 0.5.
        phrases = [p for p, s in phrases if s > 0.5]
    generated = []
    # Number of gaps already generated per sentence.
    sentences_used = {s: 0 for s in doc.sents}
    # Parsed phrases that already produced a question.
    phrases_used = []
    for tok in doc:
        for phrase in phrases:
            # Position of the token inside its sentence, and the end of a
            # span with as many tokens as the phrase has words.
            tok_sent_i = tok.i - tok.sent.start
            tok_sent_end = tok_sent_i + len(phrase.split())
            same_len = tok.sent[tok_sent_i:tok_sent_end]
            # Case-insensitive match of the span against the phrase words.
            if [t.lower_ for t in same_len] == phrase.lower().split():
                similarity = [
                    self.nlp(phrase).similarity(p) for p in phrases_used
                ]
                # figure out empty vector
                if sentences_used[tok.sent] < 3 and max(
                        similarity, default=0.0) < self.title_similarity:
                    toks_with_ws = [
                        token.text_with_ws for token in tok.sent
                    ]
                    long_gap_toks = deepcopy(toks_with_ws)
                    for i in range(tok_sent_i, tok_sent_end):
                        # Long gap: one underscore run per answer token.
                        long_gap_toks[i] = '_' * len(tok.sent[i])
                        # Short gap: a single fixed-width blank in place of
                        # the first answer token, nothing for the rest.
                        if i == tok_sent_i:
                            toks_with_ws[i] = '_____'
                        else:
                            toks_with_ws[i] = ''
                    # Restore the whitespace that followed the last answer
                    # token so the text after the gap stays separated.
                    long_gap_toks[tok_sent_end - 1] = long_gap_toks[
                        tok_sent_end - 1] + tok.sent[tok_sent_end - 1].whitespace_
                    toks_with_ws[tok_sent_end - 1] = toks_with_ws[
                        tok_sent_end - 1] + tok.sent[tok_sent_end - 1].whitespace_
                    pair = {
                        "question": "".join(long_gap_toks),
                        "answer": "".join([t.text_with_ws for t in same_len]),
                        "sentence": tok.sent.text,
                        "short_gap": "".join(toks_with_ws)
                    }
                    sentences_used[tok.sent] += 1
                    phrases_used.append(self.nlp(phrase))
                    generated.append(pair)
    return generated
from rake_nltk import Rake # Uses stopwords for english from NLTK, and all puntuation characters by # default r = Rake() # Extraction given the text. #r.extract_keywords_from_text("keyword_tests.txt") x = [] with open("question_keyword_tests.txt") as f: for line in f: x.append (line) r.extract_keywords_from_text(line) #print(r.get_ranked_phrases_with_scores()) print(r.get_ranked_phrases()) # Extraction given the list of strings where each string is a sentence. r.extract_keywords_from_sentences(x) # To get keyword phrases ranked highest to lowest. phrases = r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest with scores. phrases_with_text = r.get_ranked_phrases_with_scores() # print(phrases) # print(phrases_with_text)
for entry in patterns: print('{}: {}'.format(entry[0], entry[1])) ####################### Question 2.1 ############################## # Get keywords using RAKE titles = get_values(data, 'title') titles = sorted(titles) stopwords = nltk.corpus.stopwords.words('english') extra_stopwords = ['using', 'via', 'without', 'towards', 'toward', 'based'] stopwords += extra_stopwords extractor = Rake(stopwords=stopwords) extractor.extract_keywords_from_sentences(titles) keywords = extractor.get_ranked_phrases() # Only using keywords with less than 5 words keywords = [k for k in keywords if len(k.split()) <= 5] # Get 20000 keywords keywords = keywords[:20000] # Get keyword of each team's research print_divider('=', 'Question 2.1: Team Insterests') for team in teams: team_data = get_team_data(data, team) team_titles = get_values(team_data, 'title') team_titles = sorted(team_titles) team_freq = get_ranked_keyword_frequency(team_titles, keywords)
# Project 3.1 - Extract Keywords Article
# rake-nltk is short for Rapid Automatic Keyword Extraction algorithm
from rake_nltk import Rake

r = Rake()

# Article to analyse. The backslashes at the line ends are line
# continuations inside the string literal.
# NOTE(review): line breaks inside the literal were reconstructed from a
# whitespace-collapsed source -- confirm against the original file.
text = " The Platform of the Future? \
\
The survival of any organization depends on its ability to outperform competitors and marketplaces in attracting and rewarding talent, ideas and capital. As communication and transaction costs have drastically declined because of the internet, new platforms have emerged, delivering goods and services at a speed and efficiency previously unimaginable. These new digital players took advantage of the changes in the underlying technology to challenge established business models and rethink pre-existing value chains. The ones that succeeded did so because they achieved a level of efficiency that their brick and mortar counterparts had trouble replicating. Through online reputation and feedback systems, digital players were able to create global marketplaces where individuals, products and services could be matched more effectively than ever before. By providing curation and ensuring the safety of transactions, these new types of intermediaries were able to reap the returns of this first wave of digitization. \
\
A similar transformation is about to happen as blockchain technology and cryptocurrencies mature and mainstream applications emerge. Under this new wave of technological change, intermediaries will still be able to add value to transactions, but the nature of intermediation will fundamentally change. Whereas some established players will be able to use this opportunity to further scale their operations, others will be challenged by new entrants proposing entirely new approaches to value creation and value capture.\
\
Complementing Artificial Intelligence with Human Intelligence "

# Extraction given a text
r.extract_keywords_from_text(text)

# Extraction given a list of strings where each string is a sentence.
# NOTE(review): this second extraction presumably replaces the results of
# the text-based one above, so only myList is reflected in the output --
# confirm.
myList = ["ability to outperform", "blockchain", "suvival organization", "cryptocurrencies emerge", "artifical intelligence blockchain"]
r.extract_keywords_from_sentences(myList)

# To get keyword phrases ranked highest to lowest (result is not used)
r.get_ranked_phrases()

# To get keyword phrases ranked highest to lowest with scores
print(r.get_ranked_phrases_with_scores())
def menu_response(reviews):
    """Score food-related key phrases found in ``reviews``.

    Each review is split on ``.`` and pre-processed; empty sentences and
    empty reviews are dropped. Per review, the five RAKE phrases most
    similar to food are collected. All pairs of collected phrases are
    compared with the sentence model. Walking the pairs from most to least
    similar (down to 0.75), the shorter phrase of a pair is accepted and the
    longer rejected, once either phrase has already been seen in an earlier
    pair.

    Args:
        reviews: iterable of review strings.

    Returns:
        ``([], 400)`` when ``reviews`` is empty, otherwise
        ``(scored_phrases, 200)`` where ``scored_phrases`` is the output of
        ``score_phrases`` sorted by its second element, highest first.
    """
    if not reviews:
        return [], 400

    # Build real lists. The previous nested filter() calls are lazy
    # iterators on Python 3: len() of the inner one raises TypeError and the
    # outer one would be exhausted before score_phrases() sees it.
    reviews = [
        review for review in (
            [tokens
             for tokens in map(utils.simple_preprocess, raw.split('.'))
             if len(tokens)]
            for raw in reviews
        ) if len(review)
    ]

    r = Rake()
    key_phrases = []
    for review in reviews:
        r.extract_keywords_from_sentences(sentences(review))
        ranked = sorted(
            r.get_ranked_phrases(),
            key=lambda s: similarity_to_food(ml.sentence_model, s),
            reverse=True)
        key_phrases.extend(ranked[:5])

    # Pairwise similarity between all collected phrases.
    # NOTE(review): this uses the bare name `sentence_model` while the
    # ranking above uses `ml.sentence_model` -- confirm both are intended.
    similar_phrases = []
    for first, second in itertools.combinations(key_phrases, 2):
        p1, p2 = first.split(), second.split()
        try:
            similar_phrases.append(
                (p1, p2, sentence_model.wv.n_similarity(p1, p2)))
        except KeyError:
            # Deliberate best-effort: pairs that raise KeyError are skipped
            # (presumably words missing from the model vocabulary).
            pass
    similar_phrases.sort(key=lambda t: t[2], reverse=True)

    food_phrases, rejected, queue = set(), set(), set()
    for p1, p2, similarity in similar_phrases:
        if similarity < 0.75:
            break
        p1 = ' '.join(p1)
        p2 = ' '.join(p2)
        settled = food_phrases | rejected
        if p1 in settled or p2 in settled:
            continue
        if p1 in queue or p2 in queue:
            # One of the two was seen before: keep the shorter phrase.
            if len(p1) < len(p2):
                food_phrases.add(p1)
                rejected.add(p2)
            else:
                food_phrases.add(p2)
                rejected.add(p1)
        else:
            queue.add(p1)
            queue.add(p2)

    return sorted(score_phrases(food_phrases, reviews),
                  key=lambda t: t[1],
                  reverse=True), 200
def get_key_words(comments):
    """Return the RAKE phrases of ``comments`` with their scores.

    Every entry of ``comments`` is handed to RAKE as one sentence; the
    result is whatever ``get_ranked_phrases_with_scores()`` yields.
    """
    extractor = Rake()
    extractor.extract_keywords_from_sentences(comments)
    scored_phrases = extractor.get_ranked_phrases_with_scores()
    return scored_phrases