def generate_recommendations(data, title):
    """Recommend NGOs similar to `title` using keyword bags and cosine similarity.

    Args:
        data: DataFrame with 'Name', 'Cause', 'City', 'Details' columns.
        title: Name of the NGO to find recommendations for.

    Returns:
        list: Names of the 3 most similar NGOs (excluding `title` itself).
    """
    # Work on an explicit copy to avoid SettingWithCopy issues on the slice.
    df = data[['Name', 'Cause', 'City', 'Details']].copy()
    df.fillna("null", inplace=True)
    count = CountVectorizer()
    df['Keywords'] = ""
    df['bag_of_words'] = ""
    for index, row in df.iterrows():
        details = row['Details']
        causes = row['Cause']

        # RAKE keywords from the free-text details.
        r = Rake()
        r.extract_keywords_from_text(details)
        keyword_string = " ".join(r.get_word_degrees().keys())

        # RAKE keywords from the cause description.
        x = Rake()
        x.extract_keywords_from_text(causes)
        cause_string = " ".join(x.get_word_degrees().keys())

        keyword_string = keyword_string + " " + cause_string
        # BUG FIX: assigning through `row` inside iterrows() mutates a copy;
        # write via df.at so the values actually land in the DataFrame.
        df.at[index, 'Keywords'] = keyword_string

        city_string = ""
        cities = row['City']
        # BUG FIX: original used `is not "null"` (identity test), which is not
        # a reliable string comparison; use inequality.
        if cities != "null":
            city_string = cities.lower()
        df.at[index, 'bag_of_words'] = city_string + keyword_string

    df.drop(columns=['Cause', 'City', 'Details', 'Keywords'], inplace=True)
    df.set_index('Name', inplace=True)

    count_matrix = count.fit_transform(df['bag_of_words'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    indices = pd.Series(df.index)

    recommended_ngos = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # Skip position 0 (the item itself); take the next 3 best matches.
    top_indexes = list(score_series.iloc[1:4].index)
    for i in top_indexes:
        recommended_ngos.append(list(df.index)[i])
    return recommended_ngos
def rec_events(data, title):
    """Recommend the 3 events most similar to `title` by purpose keywords.

    Args:
        data: DataFrame with 'Name' and 'Purpose' columns.
        title: Event name to find similar events for.

    Returns:
        list: Names of the 3 most similar events (excluding `title` itself).
    """
    df = data[['Name', 'Purpose']].copy()
    count = CountVectorizer()
    df['bag_of_words'] = ""
    for index, row in df.iterrows():
        r = Rake()
        r.extract_keywords_from_text(row["Purpose"])
        # BUG FIX: iterrows() yields copies, so the original assignment
        # through `row` left every bag_of_words empty; write via df.at.
        df.at[index, 'bag_of_words'] = " ".join(r.get_word_degrees().keys())
    count_matrix = count.fit_transform(df['bag_of_words'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    df.drop(columns=["Purpose"], inplace=True)
    df.set_index("Name", inplace=True)
    indices = pd.Series(df.index)
    recommended_events = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # Positions 1..3 skip the event itself at position 0.
    top_3_indexes = list(score_series.iloc[1:4].index)
    for i in top_3_indexes:
        recommended_events.append(list(df.index)[i])
    return recommended_events
def clean(data):
    """Condense Plot/Actors/Director/Genre into a single 'Key_words' column.

    Mutates `data` in place (drops the four source columns) and returns the
    frame re-indexed by 'Title'.

    Args:
        data: DataFrame with 'Title', 'Plot', 'Actors', 'Director', 'Genre'.

    Returns:
        DataFrame indexed by 'Title' with a populated 'Key_words' column.
    """
    # BUG FIX: the column must exist before '+=' can append to it; the
    # original raised KeyError on `row['Key_words'] +=`.
    data['Key_words'] = ''
    for index, row in data.iterrows():
        # Rake uses English stopwords from NLTK by default and discards
        # all punctuation characters.
        r = Rake()
        for item in [row['Plot'], row['Actors'], row['Director'], row['Genre']]:
            # Extract the keywords for this field.
            r.extract_keywords_from_text(item)
            key_words_dict_scores = r.get_word_degrees()
            # BUG FIX: iterrows() rows are copies; write via data.at so the
            # keywords actually persist in the DataFrame.
            data.at[index, 'Key_words'] += ' '.join(key_words_dict_scores.keys())
    data.drop(columns=['Plot', 'Actors', 'Director', 'Genre'], inplace=True)
    # BUG FIX: set_index returns a new frame — the original discarded it.
    return data.set_index('Title')
def preprocessing_overview(data):
    """Return the list of RAKE keywords extracted from the given overview text."""
    extractor = Rake()
    extractor.extract_keywords_from_text(data)
    # The degree map's keys are the extracted keywords.
    return list(extractor.get_word_degrees().keys())
def keyword(text):
    """Extract, clean and stem RAKE keywords from `text`.

    Strips a leading "xd<digit>" topic marker, blanks words of length <= 1,
    stems the remainder with PorterStemmer and returns the unique stems.

    Args:
        text: Input text to extract keywords from.

    Returns:
        list: Unique, stemmed, non-empty keywords.
    """
    # Rake removes stop words, punctuation, etc.
    # BUG FIX: the parameter was named `str`, shadowing the builtin.
    r = Rake()
    r.extract_keywords_from_text(text)
    rake_list = list(r.get_word_degrees().keys())

    # Special case for topic: remove a leading "xd<digit>" prefix, then blank
    # anything too short to be meaningful.
    for i in range(len(rake_list)):
        if re.search(r'^xd\d', rake_list[i]):
            rake_list[i] = rake_list[i][3:]
        if len(rake_list[i]) <= 1:
            rake_list[i] = ""
    # NOTE: the original had a second loop that rebound its loop variable
    # (`word = word[3:]`) — a no-op — which has been removed.

    # Stem the surviving words; drop empties and duplicates.
    ps = PorterStemmer()
    stemmed_list = [ps.stem(word) for word in rake_list]
    stemmed_list = list(filter(None, stemmed_list))
    return list(set(stemmed_list))
def recommend(item_id, num, df_1):
    """Return the `num` item ids most similar to `item_id`.

    Similarity is cosine similarity of TF-IDF vectors built over RAKE
    keywords of each item's description.

    Args:
        item_id: Id of the reference item (coerced to str).
        num: Number of recommendations to return.
        df_1: DataFrame with 'item_id' and 'description' columns (mutated).

    Returns:
        list: Up to `num` similar item ids, best match first.
    """
    item_id = str(item_id)
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), min_df=0,
                         stop_words='english')
    df_1['key_words'] = ''
    r = Rake()
    for index, row in df_1.iterrows():
        r.extract_keywords_from_text(row['description'])
        # BUG FIX: iterrows() rows are copies; assign via df_1.at so the
        # keywords actually persist.
        df_1.at[index, 'key_words'] = list(r.get_word_degrees().keys())
    columns = ['key_words']
    for index, row in df_1.iterrows():
        words = ''
        for col in columns:
            words += ' '.join(row[col]) + ' '
        df_1.at[index, 'description'] = words
    tfidf_matrix = tf.fit_transform(df_1['description'])
    cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
    results = {}
    for index, row in df_1.iterrows():
        # BUG FIX: plain argsort() is ascending (least similar first);
        # reverse it so the list is most-similar-first, then drop position 0
        # (the item itself).
        similar_index = cosine_similarities[index].argsort()[::-1]
        similar_items = [(cosine_similarities[index][i], df_1['item_id'][i])
                         for i in similar_index]
        results[row['item_id']] = similar_items[1:]
    recs = results[item_id][:num]
    return [rec[1] for rec in recs]
def extract_keywords(text):
    """Return RAKE word->degree scores for `text` using a custom stop-word file.

    Args:
        text: Text to extract keywords from.

    Returns:
        dict-like: Mapping of keyword -> degree score.
    """
    # BUG FIX: the original never closed the stop-word file; use a context
    # manager so the handle is released promptly.
    with open(STOP_WORDS, 'r') as stop_words_file:
        contents = stop_words_file.read().split('\n')
    rake_object = Rake(stopwords=contents)
    rake_object.extract_keywords_from_text(text)
    return rake_object.get_word_degrees()
def token_proc_thread(self):
    """Tokenize self.docs, keep known dictionary words, and return RAKE
    word degrees for words longer than 4 characters, highest degree first.

    Returns:
        list[tuple]: (word, degree) pairs sorted by descending degree.
    """
    # Keep tokens that start with at least two alphabetic characters.
    tokens = [t for t in nltk.word_tokenize(self.docs.lower())
              if re.match("[a-zA-Z]{2,}", t)]
    # BUG FIX: the original removed items while iterating a copy and hid any
    # failure behind a bare `except: pass`; a filter is equivalent and safe.
    tokens = [t for t in tokens if t.lower() in self.dict_of_words]
    text = ' '.join(map(str, tokens))
    r = Rake(stopwords=self.stop_words)
    r.extract_keywords_from_text(text)
    # Sort word -> degree pairs by descending degree.
    word_degrees = sorted(r.get_word_degrees().items(),
                          key=lambda item: item[1], reverse=True)
    print(word_degrees)
    # Drop short words (length <= 4).
    return [pair for pair in word_degrees if len(pair[0]) > 4]
def main(self, df):
    """Build a per-title bag of words and return the similarity artifacts.

    Args:
        df: DataFrame with 'Title', 'Short Summary', 'Genres', 'Director',
            'Writers', 'Cast' columns (mutated in place).

    Returns:
        tuple: (pd.Series of titles, cosine-similarity matrix).
    """
    self.df = df
    # BUG FIX throughout: iterrows() yields copies, so the original
    # assignments through `row` never reached the DataFrame; write via df.at.
    for index, row in df.iterrows():
        df.at[index, 'Director'] = ''.join(row['Director']).lower()
        df.at[index, 'Cast'] = ''.join(row['Cast']).lower()
        df.at[index, 'Writers'] = ''.join(row['Writers']).lower()
    df['Key_words'] = ""
    for index, row in df.iterrows():
        r = Rake()
        r.extract_keywords_from_text(row['Short Summary'])
        # Store the keywords as one string so the ' '.join aggregation
        # below receives only strings.
        df.at[index, 'Key_words'] = ' '.join(r.get_word_degrees().keys())
    df.drop(columns=['Short Summary'], inplace=True)
    df.set_index('Title', inplace=True)
    df['bag_of_words'] = df[['Genres', 'Director', 'Writers', 'Cast',
                             'Key_words']].agg(' '.join, axis=1)
    df.drop(columns=[col for col in df.columns if col != 'bag_of_words'],
            inplace=True)
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['bag_of_words'])
    return pd.Series(df.index), cosine_similarity(count_matrix, count_matrix)
def extract_keyword(text):
    """Collect RAKE keywords sentence by sentence from `text`.

    Returns:
        list: Keywords from every sentence, in document order.
    """
    keywords = []
    for sentence in splitSentence(text):
        extractor = Rake()
        extractor.extract_keywords_from_text(sentence)
        keywords.extend(extractor.get_word_degrees().keys())
    return keywords
def get_RAKE_keywords(sample, stoplist, T_RATIO=0.33):
    """Return the top T_RATIO fraction of RAKE ranked phrases from an abstract.

    Args:
        sample: Object with an `.abstract` text attribute.
        stoplist: Stop words passed to Rake.
        T_RATIO: Fraction of the word-degree vocabulary size to keep.
    """
    rake = Rake(stopwords=stoplist)
    rake.extract_keywords_from_text(sample.abstract)
    # Cutoff is proportional to the number of distinct scored words.
    cutoff = int(T_RATIO * len(rake.get_word_degrees()))
    return rake.get_ranked_phrases()[:cutoff]
def recomendacionChampion(request):
    """Django view: recommend the 10 champions most similar to the searched one.

    Builds a "lLEVEL POSITION wWINRATE" description per champion, extracts
    RAKE keywords, and ranks champions by cosine similarity of their
    CountVectorizer keyword bags.
    """
    formulario = ChampionBusquedaForm()
    campeones = None
    dat = []
    champDat = []
    datos = {}
    if request.method == 'POST':
        formulario = ChampionBusquedaForm(request.POST)
        if formulario.is_valid():
            campeones = Champion.objects.all()
            # One description string per champion; champions with several
            # tiers get their entries concatenated (newest first).
            for champ in campeones:
                tiers = Tier.objects.filter(idChampion=champ.idChampion)
                for tie in tiers:
                    position = Position.objects.get(name=tie.idPosition)
                    actual = 'l' + str(tie.level) + ' ' + \
                        position.name + ' ' + 'w' + str(tie.winrate)
                    name = champ.name
                    if name in champDat:
                        anterior = datos.get(name)
                        datos.update({name: actual + ' ' + anterior})
                        dat.append(actual)
                    else:
                        datos.update({name: actual})
                        champDat.append(name)
            values = list(datos.values())
            d = {'Nombre': champDat, 'Valores': values}
            df = pd.DataFrame(data=d, index=champDat)
            df = df[['Nombre', 'Valores']]
            df['Key_words'] = ''
            for index, row in df.iterrows():
                r = Rake()
                r.extract_keywords_from_text(row['Valores'])
                key_words_dict_scores = r.get_word_degrees()
                # BUG FIX: iterrows() rows are copies — the original left
                # 'Key_words' empty for every champion; write via df.at.
                df.at[index, 'Key_words'] = str(list(key_words_dict_scores.keys()))
            df.drop(columns=['Valores'], inplace=True)
            count = CountVectorizer()
            count_matrix = count.fit_transform(df['Key_words'])
            cosine_sim = cosine_similarity(count_matrix, count_matrix)
            indices = pd.Series(df.index)
            recommended_champs = []
            champion_name = formulario.cleaned_data['champion_name']
            idx = indices[indices == champion_name].index[0]
            score_series = pd.Series(
                cosine_sim[idx]).sort_values(ascending=False)
            # Positions 1..10 skip the champion itself at position 0.
            top_10_indexes = list(score_series.iloc[1:11].index)
            for i in top_10_indexes:
                recommended_champs.append(list(df.index)[i])
            campeones = [Champion.objects.get(name=name_c)
                         for name_c in recommended_champs]
    return render(request, 'campeones_recomendados.html', {
        'campeones': campeones,
        'STATIC_URL': settings.STATIC_URL
    })
def extract_keywords(df_work):
    """Populate a 'Key_words' column with RAKE keywords from 'description'.

    Args:
        df_work: DataFrame with a 'description' column; mutated in place.
    """
    df_work['Key_words'] = ""
    for index, row in df_work.iterrows():
        # Rake automatically applies stop words and strips punctuation.
        r = Rake()
        r.extract_keywords_from_text(row['description'])
        key_words_dict_scores = r.get_word_degrees()
        # BUG FIX: iterrows() rows are copies; write through df_work.at so
        # the keywords actually persist in the DataFrame.
        df_work.at[index, 'Key_words'] = list(key_words_dict_scores.keys())
def recomendarPeliculas(id_pelicula):
    """Recommend 3 movies similar to `id_pelicula` by genre keywords.

    Builds a comma-separated genre string per movie (target movie placed
    last), vectorizes the RAKE keywords with CountVectorizer, ranks by
    cosine similarity, and never returns the target movie itself.

    Args:
        id_pelicula: Primary key of the reference Pelicula.

    Returns:
        list: Titles of the 3 recommended movies.
    """
    pelicula = get_object_or_404(Pelicula, pk=id_pelicula)
    peliculas = Pelicula.objects.all()

    # Title column: every other movie first, the target movie last.
    movies_titles = [m.titulo for m in peliculas
                     if str(m.titulo) != str(pelicula.titulo)]
    movies_titles.append(pelicula.titulo)

    # Value column: one-element list with the movie's genres, same ordering.
    values = []
    for movie in peliculas:
        if str(movie.titulo) != str(pelicula.titulo):
            generos = ''
            for genero in movie.generos.all():
                generos += ', ' + genero.nombre
            values.append([generos[2:]])  # strip leading ", "
    generos = ''
    for genero in pelicula.generos.all():
        generos += ', ' + genero.nombre
    values.append([generos[2:]])

    d = {'titulo': movies_titles, 'valores': values}
    df = pd.DataFrame(data=d, index=movies_titles)
    df = df[['titulo', 'valores']]
    df['key_words'] = ''
    for index, row in df.iterrows():
        r = Rake()
        r.extract_keywords_from_text(row['valores'][0])
        key_words_dict_scores = r.get_word_degrees()
        # BUG FIX: iterrows() rows are copies — the original left
        # 'key_words' empty; write via df.at.
        df.at[index, 'key_words'] = str(list(key_words_dict_scores.keys()))
    df.drop(columns=['valores'], inplace=True)

    count = CountVectorizer()
    count_matrix = count.fit_transform(df['key_words'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    recommended_movies = []
    indices = pd.Series(df.index)
    idx = indices[indices == pelicula.titulo].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    top_3_indexes = list(score_series.iloc[1:4].index)
    for i in top_3_indexes:
        recommended_movies.append(list(df.index)[i])
    # If the target slipped into the top 3, swap in the next best match.
    if str(pelicula.titulo) in recommended_movies:
        recommended_movies.remove(str(pelicula.titulo))
        recommended_movies.append(
            list(df.index)[score_series.iloc[4:5].index[0]])
    return recommended_movies
def shorten_title(title, max_title_len, alpha_only=True):
    """ Shortens a title using important phrases and keywords in the title.

    Args:
        title (str): Title to shorten.
        max_title_len (int): Maximum length of the final title.
        alpha_only (bool): Whether to only use alphabetic characters.

    Returns:
        str: Shortened, all lower-case title with a length less than
        `max_title_len`.
    """
    title = title.lower()
    if len(title) <= max_title_len:
        # Title is already short enough.
        return title

    blob = TextBlob(title)
    if len(blob.sentences) == 1:
        first_blob = blob
    else:
        first_blob = blob.sentences[0]

    # Drop determiners ("the", "a", ...) from the first sentence.
    tags_kept = [(word, tag) for word, tag in first_blob.tags if tag != "DT"]
    new_title = _join_tags(tags_kept)
    if len(new_title) <= max_title_len:
        return new_title

    if alpha_only:
        # BUG FIX: renamed the local `filter` — it shadowed the builtin.
        pattern = re.compile("[^a-z ]")
        title = pattern.sub("", title)

    # Try using the highest ranked phrase from the title.
    r = Rake()
    r.extract_keywords_from_text(title)
    new_title = r.get_ranked_phrases()[0]
    if len(new_title) <= max_title_len:
        return new_title

    # Title is still too long. Use as many of the important words as will fit
    # within the max title length.
    # BUG FIX: `sorted(r.get_word_degrees())` sorted the words
    # alphabetically; sort by descending degree so the MOST important
    # words are used first, as the comment above intends.
    degrees = r.get_word_degrees()
    words = sorted(degrees, key=degrees.get, reverse=True)
    new_title = words[0]
    if len(new_title) > max_title_len:
        # Cut the single-word title short.
        return new_title[:max_title_len]
    for w in words[1:]:
        append_title = "{} {}".format(new_title, w)
        if len(append_title) > max_title_len:
            break
        new_title = append_title
    return new_title
def extract_keywords(text):
    """Return the RAKE keywords of `text` as one space-separated string."""
    rake = Rake()
    rake.extract_keywords_from_text(text)
    # Joining with spaces yields the same string the original built by
    # concatenation followed by lstrip().
    return " ".join(rake.get_word_degrees().keys())
def rakeResult(self):
    """Run RAKE over this object's document and return its result views.

    Returns:
        tuple: (ranked phrases with scores, ranked phrases, word degrees,
        word frequency distribution).
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(self.getDoc())
    return (extractor.get_ranked_phrases_with_scores(),
            extractor.get_ranked_phrases(),
            extractor.get_word_degrees(),
            extractor.get_word_frequency_distribution())
def clean_Data(self):
    """Replace each plot in self.plot with its space-joined RAKE keywords."""
    for i, text in enumerate(self.plot):
        # Rake uses English stopwords from NLTK and strips punctuation.
        extractor = Rake()
        extractor.extract_keywords_from_text(text)
        # The degree map's keys are the extracted keywords.
        self.plot[i] = " ".join(extractor.get_word_degrees().keys())
def main(query=None, serving=True):
    """Build a keyword index over 'Program list.csv' and print recommendations.

    Args:
        query: Search keyword (prompted interactively when not serving).
        serving: When True, only build the index; no interactive prompt.
    """
    df_main = pd.read_csv('./Program list.csv')
    df = df_main.copy()
    # Initialize the new column.
    df['Key_words'] = ""
    for index, row in df.iterrows():
        plot = str(row['Program Info'])
        # Rake uses English stopwords from NLTK by default and discards all
        # punctuation characters as well.
        r = Rake()
        r.extract_keywords_from_text(plot)
        key_words_dict_scores = r.get_word_degrees()
        # BUG FIX: iterrows() rows are copies; the original assignment
        # through `row` left 'Key_words' empty — write via df.at.
        df.at[index, 'Key_words'] = concatenate_list(
            list(key_words_dict_scores.keys()))
    df.drop(columns=['Program Info'], inplace=True)
    df.drop(columns=['Timing'], inplace=True)
    df['desc'] = df[['Dept/Domain', 'Key_words']].apply(
        lambda x: ''.join(x.map(str)).lower(), axis=1)
    df['Program Name'] = df['Program Name'].apply(lambda x: str(x).lower())
    df3 = df.copy().drop(['Dept/Domain', 'Key_words'], axis=1)
    df3.set_index(['Program Name'], inplace=True)
    # Count matrix and cosine similarity over the program descriptions.
    count = CountVectorizer()
    count_matrix = count.fit_transform(df3['desc'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    # Series of titles aligned with the similarity-matrix rows.
    indices = pd.Series(df3.index)
    if not serving:
        query = input("Enter search keyword: \n")
        print(get_recommendation(query, indices, cosine_sim))
def similar_items(item_id):
    """Return up to 50 item ids most similar to `item_id`.

    Similarity is cosine similarity of TF-IDF vectors over RAKE keywords
    extracted from each movie's title + plot.

    Args:
        item_id: Id of the reference item.

    Returns:
        list: Similar item ids, best match first.
    """
    movie = pd.read_csv(movies_utils.data_path)
    movie['item_data'] = movie['movie_title'] + ' ' + movie['movie_plot']
    movie['item_id'] = movie['movie_id']
    movie = movie.drop(['movie_id', 'movie_title', 'movie_genre', 'actors',
                        'movie_plot', 'imdb_rating', 'movie_link', 'director'],
                       axis=1)
    data = movie
    data['key_words'] = ""
    for index, row in data.iterrows():
        r = Rake()
        r.extract_keywords_from_text(row['item_data'])
        key_words_dict_scores = r.get_word_degrees()
        # BUG FIX: iterrows() rows are copies; the original assignment
        # through `row` left 'key_words' empty, so every bag_of_words
        # came out blank. Write via data.at instead.
        data.at[index, 'key_words'] = list(key_words_dict_scores.keys())
    data.drop(columns=['item_data'], inplace=True)
    data.set_index('item_id', inplace=True)
    data['bag_of_words'] = ''
    columns = data.columns
    for index, row in data.iterrows():
        words = ''
        for col in columns:
            words = words + ' '.join(row[col]) + ' '
        data.at[index, 'bag_of_words'] = words
    data.drop(columns=[col for col in data.columns if col != 'bag_of_words'],
              inplace=True)
    count = TfidfVectorizer()
    count_matrix = count.fit_transform(data['bag_of_words'])
    indices = pd.Series(data.index)
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    idx = indices[indices == item_id].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # Positions 1..50 skip the item itself at position 0 (the original name
    # `top_30_indexes` was misleading — the slice holds 50 entries).
    top_indexes = list(score_series.iloc[1:51].index)
    ans = []
    for i in top_indexes:
        ans.append(data.iloc[i].name)
    return ans
def calculate_rake_ranking(just_words):
    """Score each word with RAKE degree/frequency, scaled to [0, 2].

    Args:
        just_words: Sequence of arrays whose first element is the word string.

    Returns:
        list[float]: One scaled ranking per input word, in input order.
    """
    # Initializes the Rake object.
    r = Rake()
    # Concatenate the bare words into a single string for RAKE.
    words_string = ''
    for word_array in just_words:
        words_string += word_array[0] + " "
    # The Rake object ranks all the words in the string.
    r.extract_keywords_from_text(words_string)
    # Both maps are dictionaries keyed by word.
    frequency_distribution = r.get_word_frequency_distribution()  # word -> count
    word_degrees = r.get_word_degrees()  # word -> degree (co-occurrence)
    rake_not_scaled = []
    for word_array in just_words:
        # BUG FIX (perf): direct dict lookups replace the original O(n)
        # linear searches per word; unknown words keep the default of 1.
        word_frequency = frequency_distribution.get(word_array[0], 1)
        word_degree = word_degrees.get(word_array[0], 1)
        # Formula in accordance with the chosen metric.
        rake_not_scaled.append(word_degree / word_frequency)
    # Scale the RAKE rankings to [0, 2].
    scaler = MinMaxScaler(feature_range=(0, 2))
    rake_scaled = scaler.fit_transform(
        np.asarray(rake_not_scaled).reshape(-1, 1))
    return [float(ranking) for ranking in rake_scaled]
def test_build_word_co_occurance_graph(self):
    """The co-occurrence graph yields the expected word degrees."""
    r = Rake()
    phrase_list = [['red', 'apples'], ['good'], ['red'], ['flavour']]
    expected = defaultdict(int)
    expected.update({'apples': 2, 'good': 1, 'flavour': 1, 'red': 3})
    r._build_word_co_occurance_graph(phrase_list)
    self.assertEqual(r.get_word_degrees(), expected)
def extract_keywords(df, feature):
    """Return space-joined RAKE keyword strings for the first rows of df[feature].

    Args:
        df: Source DataFrame (assumed to have a 0..n-1 integer index).
        feature: Name of the text column to extract keywords from.

    Returns:
        list[str]: One keyword string per processed row (at most 1000).
    """
    r = Rake()
    keyword_lists = []
    # BUG FIX: the original iterated a hard-coded range(1000) and raised
    # KeyError for frames with fewer rows; cap at the actual row count.
    for i in range(min(1000, len(df))):
        r.extract_keywords_from_text(df[feature][i])
        key_words_dict_scores = r.get_word_degrees()
        keyword_lists.append(" ".join(key_words_dict_scores.keys()))
    return keyword_lists
def test_build_word_co_occurance_graph(self):
    """Word degrees computed from a phrase list match the expected counts."""
    rake = Rake()
    phrases = [["red", "apples"], ["good"], ["red"], ["flavour"]]
    rake._build_word_co_occurance_graph(phrases)
    expected = defaultdict(int,
                           {"apples": 2, "good": 1, "flavour": 1, "red": 3})
    self.assertEqual(rake.get_word_degrees(), expected)
def do_keyword_extraction(words):
    """Return the top keyword of `words`, weighted by corpus-wide word degrees.

    Corpus degrees come from every distinct context string in the global
    `_t['context']` column.

    Returns:
        tuple: (keyword, weight) pair with the highest weight.
    """
    if debug:
        print("---\n", words)
    # Degrees computed over every distinct context sentence in the corpus.
    rake_all = Rake()
    rake_all.extract_keywords_from_sentences(
        _t["context"].value_counts().index.values)
    word_degrees = dict(rake_all.get_word_degrees())
    # Keywords of just the given text.
    r = Rake()
    r.extract_keywords_from_text(words)
    keywords = dict(r.get_word_degrees())
    if debug:
        print(keywords)
    # Re-weight each local keyword by its corpus-wide degree.
    # BUG FIX: a keyword absent from the corpus degrees raised KeyError;
    # fall back to its local degree instead.
    for k, v in keywords.items():
        keywords[k] = word_degrees.get(k, v)
    if debug:
        print(keywords)
    return Counter(keywords).most_common(1)[0]
def recomendacionPlayer(request):
    """Django view: recommend the 10 players most similar to the searched one.

    Builds a description per player from their champions and winrate,
    extracts RAKE keywords, and ranks players by cosine similarity of their
    CountVectorizer keyword bags.
    """
    formulario = PlayerBusquedaForm()
    jugadores = None
    playerDat = []
    datos = {}
    if request.method == 'POST':
        formulario = PlayerBusquedaForm(request.POST)
        if formulario.is_valid():
            jugadores = Player.objects.all()
            # One "['champ'] wWINRATE" description string per player.
            for player in jugadores:
                idsChampions = player.idsChampion.all()
                name_p = player.name
                for champ in idsChampions:
                    actual = [champ.name]
                    datos.update(
                        {player.name: (str(actual) + ' w' + str(player.winrate))})
                    playerDat.append(name_p)
            values = list(datos.values())
            d = {'Nombre': playerDat, 'Valores': values}
            df = pd.DataFrame(data=d, index=playerDat)
            df = df[['Nombre', 'Valores']]
            df['Key_words'] = ''
            for index, row in df.iterrows():
                r = Rake()
                r.extract_keywords_from_text(row['Valores'])
                key_words_dict_scores = r.get_word_degrees()
                # BUG FIX: iterrows() rows are copies — the original left
                # 'Key_words' empty for every player; write via df.at.
                df.at[index, 'Key_words'] = str(list(key_words_dict_scores.keys()))
            df.drop(columns=['Valores'], inplace=True)
            count = CountVectorizer()
            count_matrix = count.fit_transform(df['Key_words'])
            cosine_sim = cosine_similarity(count_matrix, count_matrix)
            indices = pd.Series(df.index)
            recommended_player = []
            player_name = formulario.cleaned_data['player_name']
            idx = indices[indices == player_name].index[0]
            score_series = pd.Series(
                cosine_sim[idx]).sort_values(ascending=False)
            # Positions 1..10 skip the player itself at position 0.
            top_10_indexes = list(score_series.iloc[1:11].index)
            for i in top_10_indexes:
                recommended_player.append(list(df.index)[i])
            jugadores = [Player.objects.get(name=name_c)
                         for name_c in recommended_player]
    return render(request, 'jugadores_recomendados.html', {
        'jugadores': jugadores,
        'STATIC_URL': settings.STATIC_URL
    })
def key_words(content_id):
    """Return RAKE keywords of an IMDb movie's plot as one space-joined string.

    Args:
        content_id: IMDb movie id passed to `imdb.get_movie`.

    Returns:
        str: Space-separated keywords extracted from the first plot entry.
    """
    content = imdb.get_movie(content_id)
    plot = content['plot'][0]
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # Strip punctuation in a single translate() pass instead of the original
    # per-character replace() loop (same result, one O(n) scan).
    plot = plot.translate(str.maketrans('', '', punctuations))
    r = Rake()
    r.extract_keywords_from_text(plot)
    key_words_dict_scores = r.get_word_degrees()
    return " ".join(str(k) for k in key_words_dict_scores.keys())
def getKeywords(df, column_name, new_column_name):
    """Add `new_column_name` to df with RAKE keyword lists from `column_name`.

    Args:
        df: DataFrame mutated in place.
        column_name: Source text column.
        new_column_name: Destination column for the keyword lists.
    """
    # Add a new empty column to df.
    df[new_column_name] = ""
    for index, row in df.iterrows():
        column_data = row[column_name]
        # Rake automatically uses stop words and gets rid of punctuation.
        r = Rake()
        r.extract_keywords_from_text(column_data)
        # Word -> degree score mapping; the keys are the keywords.
        keywords_scores = r.get_word_degrees()
        # BUG FIX: iterrows() rows are copies — the original assignment
        # through `row` never populated the new column; write via df.at.
        df.at[index, new_column_name] = list(keywords_scores.keys())
def extract_bag_of_words(row):
    """Combine columns 1-3 (lower-cased) with RAKE keywords of column 4."""
    base = ' '.join(row[i].lower() for i in (1, 2, 3))
    # Keyword extraction from the overview (column 4) using Rake.
    rake = Rake()
    rake.extract_keywords_from_text(row[4].lower())
    overview_keywords = ' '.join(rake.get_word_degrees().keys())
    return base + ' ' + overview_keywords
def make_keywords(description):
    '''Makes keywords of description using "Rake" '''
    # Rake uses English stopwords from NLTK by default and discards all
    # punctuation characters.
    extractor = Rake()
    extractor.extract_keywords_from_text(description)
    # The degree map's keys are the extracted keywords.
    return list(extractor.get_word_degrees().keys())