def GetRelevanteKeyWords(text, quantity, language):
    """Extract the `quantity` most relevant single words from `text` via RAKE.

    Words are ranked by their RAKE degree (linguistic co-occurrence score),
    highest first.

    Parameters:
        text: document to analyse.
        quantity: number of top-ranked words to return.
        language: language name forwarded to Rake for stopword selection.

    Returns:
        dict with:
            'keywords': list of the top `quantity` words,
            'result':   the full [(word, degree), ...] list, sorted descending.
    """
    r = Rake(language=language)
    r.extract_keywords_from_text(text)
    # NOTE: the original also called get_ranked_phrases() and
    # get_word_frequency_distribution() and discarded the results; those are
    # pure accessors, so the dead calls were removed.
    degrees = r.get_word_degrees()
    key_words_sorted = sorted(degrees.items(), key=lambda item: item[1],
                              reverse=True)
    key_words_relevant = [word for word, _degree in key_words_sorted[:quantity]]
    return {'keywords': key_words_relevant, 'result': key_words_sorted}
def extract_and_update_video_keywords(video_id, video_content, top_n=5):
    """Extract the `top_n` most frequent keywords from a video's content and
    persist them in the video's metadata.

    Parameters:
        video_id: identifier passed through to update_videos_meta_data.
        video_content: raw text to run RAKE over.
        top_n: how many keywords to keep (default 5, matching the original
               hard-coded value, so existing callers are unaffected).

    Side effects:
        Calls update_videos_meta_data(video_id, "Keywords", <comma-joined words>).
    """
    rake = Rake(stopwords=stop_words)
    rake.extract_keywords_from_text(video_content)
    # List of (word, count) tuples ordered by count, descending.
    top_keywords = rake.get_word_frequency_distribution().most_common(top_n)
    top_keywords_str = ", ".join(word for word, _count in top_keywords)
    update_videos_meta_data(video_id, "Keywords", top_keywords_str)
def run(self, text, val):
    """Extract key phrases from `text` and print link suggestions as JSON.

    Pipeline: RAKE phrase extraction -> tokenization -> POS tagging ->
    noun filtering -> Wikipedia + Medium lookup -> combined mapping printed
    to stdout.

    TODO Improvements:
    1. casual_tokenize can't handle 'words-with-hyphens-like-this' & reduces
       coverage

    Parameters:
        text: raw input text.
        val: source kind; 'article' feeds the whole text to RAKE,
             'social' feeds it sentence-wise.

    Side effects:
        Stores `val` on self and prints the combined mapping JSON.
    """
    self.val = val
    # Remove new lines and normalise case before extraction.
    # TODO what if only wanting to read first x lines, but that should only
    # be for purposes of ML
    text = re.sub('\n', ' ', text).lower()

    # Extract keyphrases using Rake.
    # TODO also possible to extract keywords from sentence
    rake = Rake()
    if val == 'article':
        rake.extract_keywords_from_text(text)
    elif val == 'social':
        rake.extract_keywords_from_sentences(text)
    all_phrases = rake.get_ranked_phrases_with_scores()
    # NOTE: a dead get_word_frequency_distribution() call (result unused,
    # pure accessor) was removed here.

    # Tokenize the full text and each suggested phrase.
    article_text_tokenized = casual_tokenize(text)
    all_phrases_tokenized = self.tokenize_phrases(all_phrases)

    # Tag all phrases and remove all but noun words.
    all_phrases_tagged = self.pos_tag_phrase_pairs(all_phrases)
    all_phrases_tagged_nouns = self.filter_nouns(all_phrases_tagged)

    # Convert list of tagged nouns back to a string phrase.
    string_phrases_nouns = self.tuple_list_to_string_list(
        all_phrases_tagged_nouns)

    # Get the indexes from the non-filtered suggested phrases in the
    # original text.
    all_surrounding_tokens, all_context_tokens = self.get_all_surrounding_tokens(
        all_phrases_tokenized, article_text_tokenized)

    # Get wikipedia urls for the top phrases.
    mapping_list = self.get_wiki_urls_top_n_phrases(
        string_phrases_nouns, all_surrounding_tokens, 10)
    wiki_mapping = self.write_suggestions_to_json(mapping_list)

    # Get page links on medium by phrase.
    medium_mapping = self.get_n_listed_medium_posts(
        string_phrases_nouns, 2)

    # Combine both mappings and emit the result to the console.
    mapping = self.combine_mappings(wiki_mapping, medium_mapping)
    print(json.dumps(mapping))
def rakeResult(self):
    """Run RAKE over this object's document and return its four result views.

    Returns:
        tuple of (phrases with scores, ranked phrases, word degrees,
        word frequency distribution).
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(self.getDoc())
    return (
        extractor.get_ranked_phrases_with_scores(),
        extractor.get_ranked_phrases(),
        extractor.get_word_degrees(),
        extractor.get_word_frequency_distribution(),
    )
def calculate_rake_ranking(just_words):
    """Score each input word with RAKE's degree/frequency metric, scaled to [0, 2].

    Parameters:
        just_words: sequence of word arrays where element 0 is the word string.

    Returns:
        list of floats (one per input word), min-max scaled to [0, 2].
    """
    r = Rake()
    # Concatenate the bare words into one string (trailing space, as before).
    words_string = "".join(word_array[0] + " " for word_array in just_words)

    # Rank all the words in the string.
    r.extract_keywords_from_text(words_string)
    # word -> frequency (number of occurrences) and word -> degree
    # (linguistic co-occurrence); both behave as plain mappings.
    frequency_distribution = r.get_word_frequency_distribution()
    word_degrees = r.get_word_degrees()

    rake_not_scaled = []
    for word_array in just_words:
        word = word_array[0]
        # O(1) dict lookups replace the original linear scans over both
        # mappings; words RAKE dropped default to 1, exactly as before,
        # so the ratio degenerates to 1.0.
        word_frequency = frequency_distribution.get(word, 1)
        word_degree = word_degrees.get(word, 1)
        # Formula in accordance with the chosen metric.
        rake_not_scaled.append(word_degree / word_frequency)

    # Scale the RAKE rankings to [0, 2].
    scaler = MinMaxScaler(feature_range=(0, 2))
    rake_scaled = scaler.fit_transform(
        np.asarray(rake_not_scaled).reshape(-1, 1))
    return [float(ranking) for ranking in rake_scaled]
def test_build_frequency_dist(self):
    """_build_frequency_dist must count each word across all phrases."""
    rake = Rake()
    phrases = [['red', 'apples'], ['good'], ['red'], ['flavour']]
    expected = defaultdict(lambda: 0)
    for single in ('apples', 'good', 'flavour'):
        expected[single] = 1
    expected['red'] = 2
    rake._build_frequency_dist(phrases)
    self.assertEqual(rake.get_word_frequency_distribution(), expected)
def test_build_frequency_dist(self):
    """Word frequencies should match the per-word counts over the phrase list."""
    r = Rake()
    phrase_list = [["red", "apples"], ["good"], ["red"], ["flavour"]]
    freq = defaultdict(lambda: 0)
    freq.update({"apples": 1, "good": 1, "flavour": 1, "red": 2})
    r._build_frequency_dist(phrase_list)
    self.assertEqual(r.get_word_frequency_distribution(), freq)
def extract_lemmatize_dictionary(file_list): with open(stopwords_file_path + 'stopwords.json') as fp: stopwords = json.load(fp) r = Rake(stopwords=stopwords) file_to_words = {} lemmatizer = WordNetLemmatizer() file_list = sorted(file_list) i = 0 for f in file_list: counter = collections.Counter() i = i + 1 print "Processed ", (i * 100.0) / len(file_list), "%" lines = [line.rstrip('\n') for line in open(file_path + f)] for line in lines[6:-2]: r.extract_keywords_from_text(line) d = dict(r.get_word_frequency_distribution()) lemmatized = {} for k, v in d.items(): if not is_alpha(k) or len(k) < 3: d.pop(k) for k, v in d.items(): w = [] w.append(k) tag = get_wordnet_pos(nltk.pos_tag(w)[0][1]) if tag != '': lem = lemmatizer.lemmatize(k, tag) if len(lem) < 3: continue if lem in lemmatized: lemmatized[lem] = lemmatized[lem] + v else: lemmatized[lem] = v counter.update(lemmatized) file_to_words[f] = dict(counter) with open('file_to_words.json', 'w') as fp: json.dump(file_to_words, fp, sort_keys=True, indent=3)
def tagging(filename):
    """Tag every column of a spreadsheet with its top-5 keywords.

    For each column: joins the non-zero cell values into one text blob, runs
    RAKE over it, and keeps the five words longer than three characters with
    the highest frequency and the highest degree, respectively. Prints both
    lists per column and writes the result to datafile/tagged.xlsx.
    """
    frame = pd.read_excel(filename)
    rake = Rake()
    tagged = pd.DataFrame()
    for column in frame.columns.values:
        # All non-zero cells of this column, stringified and space-joined.
        sentences = [str(cell) for cell in frame[column].values if cell != 0]
        rake.extract_keywords_from_text(" ".join(sentences))

        freq_dist = rake.get_word_frequency_distribution()
        degrees = rake.get_word_degrees()

        # Words (>3 chars) ordered by frequency, descending.
        by_frequency = [word for word, _count in freq_dist.most_common()
                        if len(word) > 3]
        # Words (>3 chars) ordered by degree, descending.
        degree_order = sorted(degrees.items(), key=lambda kv: kv[1],
                              reverse=True)
        by_degree = [word for word, _deg in degree_order if len(word) > 3]

        print(by_frequency[:5])
        print(by_degree[:5])
        tagged[column] = [by_frequency[:5], by_degree[:5]]
    tagged.to_excel("datafile/tagged.xlsx")
#formatted_string = re.sub(r' [0-9]{1,3} ', ' ', formatted_string) formatted_string = re.sub(r' [a-z]{1,2} ', ' ', formatted_string) print("FINAL", formatted_string) return formatted_string def get_email(html_string): email_words = whitespace_wt.tokenize(html_string.lower()) for element in email_words: #element = "*****@*****.**" if re.match(r".*@.*\.(de|com|net)", element) is not None: print("Email found:" + element) browser.get( "https://www.schwaebische.de/landkreis/bodenseekreis/tettnang_artikel,-junge-union-will-partty-bus-verwirklichen-_arid,10701303.html" ) html_of_search = browser.page_source html = BeautifulSoup(html_of_search, "html.parser") html_string = str(html.body.text) html_string = formate_input_text(html_string) #html_string = test(html_string) r.extract_keywords_from_text(html_string) print(r.get_ranked_phrases()) print(r.get_ranked_phrases_with_scores()) print(r.get_word_degrees()) print(r.get_word_frequency_distribution())
import json

# Accumulators for the review phrases. The "cons" pipeline below is currently
# disabled, so only the "pros" side gets populated.
cons = []
pros = []
conString = ''
proString = ''

# Disabled: the equivalent pipeline over the "cons" file, kept for reference.
# with open('../data/cons.csv') as csvFile:
#     pros = list(filter(lambda row: len(row) > 0, csv.reader(csvFile, delimiter='\n')))
#     pros = list(map(lambda row: row[0], pros))
#     proString = ''.join(pros)
#     r = Rake(min_length=3, max_length=4)
#     r.extract_keywords_from_text(proString)
#     conFrequency = r.get_word_frequency_distribution()
#     with open('../data/confrequency.json', 'w') as jsonfile:
#         json.dump(conFrequency, jsonfile)

with open('../data/pros.csv') as csvFile:
    # Keep the first field of every non-empty row.
    pros = [row[0] for row in csv.reader(csvFile, delimiter='\n') if len(row) > 0]
proString = ''.join(pros)

# RAKE over the concatenated "pro" phrases, keeping 3-4 word keyphrases.
r = Rake(min_length=3, max_length=4)
r.extract_keywords_from_text(proString)
proFrequency = r.get_word_frequency_distribution()

with open('../data/profrequency.json', 'w') as jsonfile:
    json.dump(proFrequency, jsonfile)