def supreme(s):
    """Estimate how truthful a claim/topic *s* is by cross-checking Twitter
    sentiment against news coverage.

    Fetches up to 200 tweets about *s*, splits them by sentiment, pulls news
    articles for the same query, RAKE-extracts keyword phrases from both
    sides, and intersects them.

    Returns:
        tuple: (positive_tweets, pct_positive, negative_tweets, pct_negative,
        neutral_tweets, neutral_fraction, keyword_intersection,
        truthfulness_flag, best_matching_news_title).
        NOTE: pct_positive / pct_negative are 0-100 percentages while the
        neutral share is a 0-1 fraction, mirroring the original contract.
    """
    # SECURITY NOTE(review): these API credentials are hard-coded (and now
    # exposed); they should be rotated and loaded from the environment.
    consumer_key = '8960pswi0ALmad8bD27Bofh22'
    consumer_secret = 'hSFcDZUsfwSbn3eutUirambdqLK1dwMyZkL40BAuoYY4mcbLbE'
    access_token = '934833577803616257-mVf5WjNVNfT2eWmQ4T46N2T2BDFZ1tV'
    access_token_secret = '5xQVESFc6kGaQSbtdhvew1WPi73Yne1a9lTi62oPrkKba'
    try:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; behaviour on auth failure is unchanged.
        print("Error: Authentication Failed")

    tweets = get_tweets(query=s, count=200)
    # NOTE(review): an empty `tweets` result raises ZeroDivisionError below.
    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
    ppos = 100 * len(ptweets) / len(tweets)
    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
    pneg = 100 * len(ntweets) / len(tweets)
    # Neutral share is kept as a 0-1 fraction (callers multiply by 100).
    pneu = (len(tweets) - len(ntweets) - len(ptweets)) / len(tweets)
    neutweets = [tweet for tweet in tweets if tweet['sentiment'] == 'neutral']

    newsapi = NewsApiClient(api_key='bb0f664df41346a38b42d10e3682c915')
    all_news = newsapi.get_everything(q=s)
    articles = all_news.get('articles')
    newsl = []
    titles = []
    for article in articles:
        if article.get('content'):
            newsl.append(article.get('content'))
            titles.append(article.get('title').lower())
    # BUGFIX: the original ran a second, identical loop here that appended
    # every article's content to `newsl` again, doubling the RAKE work
    # (the later set() hid the duplicates from the final result).

    r = Rake()
    news_phrases = []
    for content in newsl:
        r.extract_keywords_from_text(content)
        news_phrases.extend(r.get_ranked_phrases())

    # If the negative share plus half the neutral share dominates, mine
    # keywords from the negative tweets only; otherwise use all tweets.
    if pneg + 0.5 * pneu > 50:
        tweets1 = ntweets[:]
    else:
        tweets1 = tweets[:]
    tweet_phrases = []
    for tweet in tweets1:
        r.extract_keywords_from_text(tweet.get('text'))
        tweet_phrases.extend(r.get_ranked_phrases())

    # PERF: membership test against a set instead of a list (was O(n^2)).
    tweet_phrase_set = set(tweet_phrases)
    intersection = list({phrase for phrase in news_phrases
                         if phrase in tweet_phrase_set and len(phrase) > 2})
    # Rank each news title by how many intersection words it contains.
    # NOTE(review): raises ValueError when `titles` is empty.
    titleRank = [len(set(t.split()) & set(intersection)) for t in titles]
    truthfulness = len(intersection) > 2
    return (ptweets, ppos, ntweets, pneg, neutweets, pneu, intersection,
            truthfulness, titles[titleRank.index(max(titleRank))])
from rake_nltk import Rake

# Demo: run RAKE keyword extraction over a Turkish quote and print the
# ranked phrases.
extractor = Rake()
quote = """İnsanın konuşacak kadar zekaya, ya da susacak kadar akla sahip olmaması büyük bir talihsizliktir. - Stefan Zweig"""
extractor.extract_keywords_from_text(quote)
ranked_phrases = extractor.get_ranked_phrases()
print(ranked_phrases)
# Read an Excel workbook with pyexcel_xls for per-row keyword processing;
# RAKE and TextBlob are imported for the (truncated) row loop below.
from rake_nltk import Rake
from pyexcel_xls import get_data
from pyexcel_xls import save_data
from textblob import TextBlob

r = Rake()
path = r"E:\Users\lockon\Desktop\\"
inputFileName = r"test.xlsx"
# NOTE(review): `ouputFileName` is a typo for `outputFileName` (kept as-is
# since other code may reference this name).
ouputFileName = r"result2.xls"


def getExcelData():
    """Load the input workbook as a dict of sheet-name -> list of rows."""
    xls_data = get_data(path + inputFileName)
    return xls_data


def saveExcelData(sheet1):
    """Store *sheet1* under "Sheet1" in the module-level `xls_data` and
    save the workbook to the output file.

    NOTE(review): relies on the global `xls_data` created below being
    defined before this is called.
    """
    xls_data.update({u"Sheet1": sheet1})
    save_data(path + ouputFileName, xls_data)


xls_data = getExcelData()
# Iterate the workbook's sheets; only the first sheet has content, so the
# loop breaks immediately after grabbing the first sheet name.
for sheet_n in xls_data.keys():
    break
sheet1 = xls_data[sheet_n]
# Iterate every row of sheet1.
# NOTE(review): the loop body is missing — this snippet appears truncated.
for rowData in sheet1:
def recommendations(title, val):
    """Build a bag-of-words similarity model from 'bcd.csv' and print a
    side-by-side comparison of sklearn's cosine_similarity against a
    hand-rolled cosine computation for the first movie row.

    Args:
        title: movie title to validate against the CSV.
        val: when '1', first verify that `title` exists in the CSV.

    Returns:
        -1 when val == '1' and the title is unknown; otherwise None
        (the function only prints the comparison).
    """
    df = pd.read_csv('bcd.csv')
    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
    if val == '1':
        # Linear scan for the title; -1 signals "not found" to the caller.
        flag = 0
        for i in range(len(list(df['Title']))):
            if title == str(df['Title'][i]):
                flag = 1
        if flag == 0:
            return -1
    # Normalise text columns: genres -> list, director -> one fused token,
    # first three actors with internal spaces removed.
    for i in range(len(df.index)):
        gen = df['Genre'][i].lower().split(", ")
        df.at[i, 'Genre'] = gen
        dct = df['Director'][i].lower().split(" ")
        st = ""
        for j in range(len(dct)):
            st = st + str(dct[j])
        df.at[i, 'Director'] = st
        act = df['Actors'][i].lower().split(", ")[:3]
        for j in range(len(act)):
            act[j] = act[j].replace(' ', '')
        df.at[i, 'Actors'] = act
    df['Key_words'] = ""
    # RAKE keyword extraction from each plot; the extractor is created once
    # instead of once per row (hoisted loop invariant).
    r = Rake()
    for index, row in df.iterrows():
        r.extract_keywords_from_text(row['Plot'])
        key_words_dict_scores = r.get_word_degrees()
        # BUGFIX: the original wrote into `row['Key_words']`; the row
        # yielded by iterrows() is a copy, so the assignment never reached
        # the DataFrame. Write through df.at instead.
        df.at[index, 'Key_words'] = list(key_words_dict_scores.keys())
    df['BOW'] = ""
    df.drop(columns=['Plot'], inplace=True)
    # Concatenate genre + director + actors + keywords into one string.
    for i in range(len(df.index)):
        s = ""
        lst = df['Genre'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        # BUGFIX: was df['Director'][0] — every row received row 0's
        # director, skewing all similarity scores.
        s = s + str(df['Director'][i])
        lst = df['Actors'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        lst = df['Key_words'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        df.at[i, 'BOW'] = s
    # Drop everything except the bag of words.
    df.drop(columns=[col for col in df.columns if col != 'BOW'], inplace=True)
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['BOW'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    # The dense matrix was computed twice in the original; once is enough.
    h = count_matrix.toarray()
    # Manual cosine similarity of row 0 against every row, verifying the
    # sklearn result; deliberately breaks after the first outer iteration.
    for i in range(len(df.index)):
        var = h[0]
        Innermatrix = []
        for j in range(len(df.index)):
            var2 = h[j]
            numerator = 0
            denom1 = 0
            denom2 = 0
            for o in range(len(h[0])):
                numerator += (var[o] * var2[o])
                denom1 += (var[o] ** 2)
                denom2 += (var2[o] ** 2)
            denom1 = denom1 ** 0.5
            denom2 = denom2 ** 0.5
            numerator = numerator / (denom1 * denom2)
            Innermatrix.append(numerator)
        print("COSINEFUNCTION\t \tMY LOOP\n")
        for k in range(0, 5):
            print(cosine_sim[0][k], "\t\t\t\t\t", Innermatrix[k])
        break
def jsonToTxt(jsonString):
    """Pull every `"text": "..."` field value out of a raw JSON string and
    return the values joined by single spaces (with a trailing space)."""
    pieces = jsonString.split('"text": ')
    collected = ""
    for piece in pieces:
        # Skip the opening quote, then keep everything up to the closing one.
        body = piece[1:]
        cut = 0
        for ch in body:
            if ch == '"':
                break
            cut += 1
        value = body[:cut]
        if value:
            collected += value + " "
    return collected


stank = jsonToTxt(jsonInput)

############
from rake_nltk import Rake

r = Rake()
# Uses stopwords for english from NLTK, and all puntuation characters.
# If you want to provide your own set of stop words and punctuations to
# r = Rake(<list of stopwords>, <string of puntuations to ignore>)
r.extract_keywords_from_text(stank)
r.get_ranked_phrases()  # To get keyword phrases ranked highest to lowest.
def get_top_k_docs(self, query, k=100):
    """
    Args:
        query: string
        k: int (default: 100)

    Returns:
        top_k_docs: dictionary
        keys: titles, abstracts, ids, links. Each element in dict[key]
        is a list, ordered user-marked-relevant first, then unrated (by
        descending retrieval relevance), then user-marked-not-relevant.
    """
    # Tokenise/normalise the raw query string.
    query_words = preprocess_query(query)
    if len(query_words) > 10:
        # long query search: compress the query to its RAKE keyword
        # phrases (1-4 words each) before retrieval.
        r = Rake(min_length=1, max_length=4)
        r.extract_keywords_from_text(query)
        phrases = list(set(' '.join(r.get_ranked_phrases()).split()))
        query_words = preprocess_query(' '.join(phrases))
    # Retrieve the k best-scoring docs from the corpus via the ranking
    # model (presumably BM25-style; depends on self.model — verify).
    top_k_docs = self.model.get_top_n(query_words, self.corpus, n=k)
    # One case-insensitive, escaped regex per query word, compiled once
    # so the highlighting loop below doesn't recompile per document.
    insensitive_comparers = {}
    for qw in query_words:
        insensitive_comparers[qw] = re.compile(re.escape(qw), re.IGNORECASE)
    results = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
    relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
    not_relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
    for i in top_k_docs:
        # Fall back to the introduction when the abstract is empty;
        # skip documents that have neither, or that lack a title.
        abstract = i['abstract'].replace('\n', '')
        if abstract == '':
            abstract = i['introduction'].replace('\n', '')
        if abstract == '':
            continue
        abstract = remove_punctuations(abstract)
        title = i['title'].replace('\n', '')
        if title == '':
            continue
        doc_text = title.lower() + ' ' + abstract.lower(
        ) + ' ' + i['introduction'].replace('\n', '').lower()
        # Keep only docs whose text literally contains >= 1 query word.
        query_words_found = False
        for qw in query_words:
            if qw in doc_text:
                query_words_found = True
                break
        if not query_words_found:
            continue
        # Bold mark query words in abstract
        for qw in query_words:
            abstract = insensitive_comparers[qw].sub(
                '<b>' + qw + '</b>', abstract)
        # Stored user feedback for this (query, doc) pair picks the bucket:
        # >0 relevant, <0 not relevant, 0 unrated.
        rel_score = self.relevance_scores[(tuple(query_words), i['id'])]
        if rel_score > 0:
            relevant['titles'].append(title.title())
            relevant['abstracts'].append(abstract)
            relevant['ids'].append(i['id'])
            relevant['links'].append(i['link'])
        elif rel_score < 0:
            not_relevant['titles'].append(title.title())
            not_relevant['abstracts'].append(abstract)
            not_relevant['ids'].append(i['id'])
            not_relevant['links'].append(i['link'])
        else:
            results['titles'].append(title.title())
            results['abstracts'].append(abstract)
            results['ids'].append(i['id'])
            results['links'].append(i['link'])
    # Final order: relevant first, unrated in the middle, not-relevant last.
    for key in ['abstracts', 'ids', 'titles', 'links']:
        results[key] = relevant[key] + results[key] + not_relevant[key]
    return results
# Input : wikiplots file ( plots, titles) ## Sample downloaded from https://github.com/markriedl/WikiPlots (there is a plots.zip folder ) ## Sample input: ## plots : Old Major, the old boar on the Manor Farm, summons the animals on the farm together for a meeting, during which he refers to humans as "enemies" and teaches the animals a revolutionary song called "Beasts of England". ## titles: Animal Farm # Output: wikiplot.kwRAKE.csv # Sample output: ## plot-1_0 K animal farm[SEP]happiest animals live simple lives .'[SEP]several men attack animal farm .'[SEP]napoleon educates young puppies[SEP]boxer continues working harder[SEP]irresponsible farmer mr jones[SEP]set aside special food items[SEP]frequently smears snowball[SEP]anthem glorifying napoleon[SEP]revolutionary song called[SEP]similar animal revolts .' I 4 Old Major, the old boar on the Manor Farm, summons the animals on the farm together for a meeting, during which he refers to humans as "enemies" and teaches the animals a revolutionary song called "Beasts of England". When Major dies, two young pigs, Snowball and Napoleon, assume command and consider it a duty to prepare for the Rebellion. The animals revolt and drive the drunken and irresponsible farmer mr Jones from the farm, renaming it "Animal Farm". They adopt the Seven Commandments of Animalism, the most important of which is, "All animals are equal". Snowball teaches the animals to read and write, while Napoleon educates young puppies on the principles of Animalism. Food is plentiful, and the farm runs smoothly. 
# Set-up for the wikiplots -> RAKE-keyword CSV pre-processing step.
# (The stray "NA" token that preceded `infile` in the original was
# extraction junk and made the line a syntax error; removed.)
infile = 'data/download/plots'
infile_title = 'data/download/titles'
outfile = 'data/generated/wikiplot.kwRAKE.csv'

# 2.2 - Execute
r = Rake()
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
topK = 10
# BUGFIX: the encoding argument was '"ISO-8859-1"' — the quote characters
# were part of the string, so open() raised LookupError (unknown encoding).
f = open(infile, 'r', encoding='ISO-8859-1')
f_title = open(infile_title, 'r', encoding='ISO-8859-1')
fout = open(outfile, 'a', encoding='ISO-8859-1')
lines = f.readlines()
lines_title = f_title.readlines()
abstract_lens = {}
print("Starting Pre-processing")
sentences_to_write = []
w = 0
def similar_items(item_id):
    """Recommend up to 50 catalogue items (books, movies, TV shows) most
    similar to *item_id*, using TF-IDF over RAKE keywords extracted from
    each item's title/author/plot text.

    Args:
        item_id: identifier present in one of the three catalogue CSVs.

    Returns:
        list of item ids, most similar first (the queried item excluded).
    """
    book = pd.read_csv(books_utils.data_path)
    movie = pd.read_csv(movies_utils.data_path)
    tvshow = pd.read_csv(shows_utils.data_path)

    # data cleaning: collapse each catalogue to (item_id, item_data) where
    # item_data is the concatenated searchable text.
    book['item_data'] = book['book_title'] + ' ' + book[
        'book_author'] + ' ' + book['book_plot']
    book['item_id'] = book['book_id']
    book = book.drop([
        'book_id', 'book_title', 'book_genre', 'book_author', 'book_plot',
        'book_rating', 'book_link'
    ], axis=1)
    movie['item_data'] = movie['movie_title'] + ' ' + movie['movie_plot']
    movie['item_id'] = movie['movie_id']
    movie = movie.drop([
        'movie_id', 'movie_title', 'movie_genre', 'actors', 'movie_plot',
        'imdb_rating', 'movie_link', 'director'
    ], axis=1)
    tvshow['item_data'] = tvshow['show_name'] + ' ' + tvshow['show_plot']
    tvshow['item_id'] = tvshow['show_id']
    tvshow = tvshow.drop([
        'show_id', 'show_name', 'show_genre', 'show_plot', 'show_rating',
        'show_link'
    ], axis=1)
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    data = pd.concat([book, movie, tvshow])

    data['key_words'] = ""
    # One RAKE extractor for all rows (hoisted out of the loop).
    r = Rake()
    for index, row in data.iterrows():
        r.extract_keywords_from_text(row['item_data'])
        key_words_dict_scores = r.get_word_degrees()
        # BUGFIX: the original assigned into the iterrows() row, which is a
        # copy — 'key_words' stayed empty in the DataFrame and the bag of
        # words below was built from empty strings. Write through data.at.
        data.at[index, 'key_words'] = list(key_words_dict_scores.keys())
    data.drop(columns=['item_data'], inplace=True)
    data.set_index('item_id', inplace=True)

    # Join every remaining column's tokens into one bag-of-words string.
    data['bag_of_words'] = ''
    columns = data.columns
    for index, row in data.iterrows():
        words = ''
        for col in columns:
            words = words + ' '.join(row[col]) + ' '
        data.at[index, 'bag_of_words'] = words
    data.drop(columns=[col for col in data.columns if col != 'bag_of_words'],
              inplace=True)

    count = TfidfVectorizer()
    count_matrix = count.fit_transform(data['bag_of_words'])
    indices = pd.Series(data.index)
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    idx = indices[indices == item_id].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # NOTE: despite the original name "top_30_indexes", this keeps the 50
    # closest items (positions 1..50; position 0 is the item itself).
    top_indexes = list(score_series.iloc[1:51].index)
    ans = []
    for i in top_indexes:
        ans.append(data.iloc[i].name)
    return ans
def rec(title, val):
    """Recommend 10 movies similar to *title* from 'bcd.csv' and return an
    HTML page showing each recommendation with its release year and poster
    (both fetched from the OMDb API).

    Args:
        title: movie title the user asked about.
        val: when '1', verify the title exists first.

    Returns:
        str HTML document, or -1 when val == '1' and the title is unknown.
    """
    df = pd.read_csv('bcd.csv')
    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
    # In case the movie is not found, -1 is returned to the caller.
    if val == '1':
        flag = 0
        for i in range(len(list(df['Title']))):
            if title == str(df['Title'][i]):
                flag = 1
        if flag == 0:
            return -1
    # Cleaning the dataset: genres -> list, director -> one fused token,
    # first three actors with internal spaces removed.
    for i in range(len(df.index)):
        gen = df['Genre'][i].lower().split(", ")
        df.at[i, 'Genre'] = gen
        dct = df['Director'][i].lower().split(" ")
        st = ""
        for j in range(len(dct)):
            st = st + str(dct[j])
        df.at[i, 'Director'] = st
        act = df['Actors'][i].lower().split(", ")[:3]
        for j in range(len(act)):
            act[j] = act[j].replace(' ', '')
        df.at[i, 'Actors'] = act
    df['Key_words'] = ""
    # Extract RAKE keywords from every plot into 'Key_words'; the extractor
    # is created once instead of once per row (hoisted loop invariant).
    r = Rake()
    for i in range(len(df.index)):
        tempstr = str(df['Plot'][i])
        r.extract_keywords_from_text(tempstr)
        key_words = r.get_word_degrees()
        df.at[i, 'Key_words'] = list(key_words.keys())
    # Concatenate genre + director + actors + keywords into a bag of words.
    df['BOW'] = ""
    df.drop(columns=['Plot'], inplace=True)
    for i in range(len(df.index)):
        s = ""
        lst = df['Genre'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        # BUGFIX: was df['Director'][0] — every row received row 0's
        # director, skewing all similarity scores.
        s = s + str(df['Director'][i])
        lst = df['Actors'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        lst = df['Key_words'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        df.at[i, 'BOW'] = s
    # Replace the default index by the movie titles.
    df.set_index('Title', inplace=True)
    # Drop every column except the bag of words.
    df.drop(columns=[col for col in df.columns if col != 'BOW'], inplace=True)
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['BOW'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    rec_mov = []
    indexes = []
    indices = pd.Series(df.index)
    # idx is the row of the movie the user asked about.
    idx = indices[indices == title].index[0]
    score = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # Top 10 closest movies, skipping the movie itself at rank 0.
    for i in range(1, 11):
        indexes.append(score.index[i])
    for i in indexes:
        rec_mov.append(list(df.index)[i])
    # Scrape poster URL and release year for each recommendation from the
    # Open Movie Database.
    lurl = list()
    year = list()
    i = 0
    while i < 10:
        movie = str(rec_mov[i])
        url = "http://www.omdbapi.com/?i=tt3896198&apikey=7eaeeaff&t=" + movie
        http = urllib3.PoolManager()
        response = http.request('GET', url)
        data = response.data
        values = json.loads(data)
        lurl.append(values['Poster'])
        year.append(values['Year'])
        i = i + 1
    # CSS for the output page.
    css = """table{ border-collapse: collapse; border:3px solid red;} th{ border:3px solid red; text-align:center; font-weight:bold; font-family:'Times New Roman', Times, serif; background-color:white; opacity:0.9;} td{ border:3px solid red; text-align:center; font-weight:bold; font-size:20px; font-family:'Franklin Gothic Medium', 'Arial Narrow', Arial, sans-serif; background-color:white; opacity:0.9;}"""
    # Build one <tr> per recommendation in a loop instead of ten
    # copy-pasted concatenation blocks.
    rows = ""
    for k in range(10):
        rows += ("<tr><td>" + str(rec_mov[k]) + '<br>' + str(year[k]) +
                 "</td><td><img src=" + str(lurl[k]) + "></td></tr> ")
    # BUGFIX: the original closed the document with '</html>>' (extra '>').
    message = ("""<html> <head> <title>Recommended Movies</title> <style> """
               + css +
               """ </style> </head> <body background="https://i.insider.com/5f578371e6ff30001d4e76be?width=1136&format=jpeg"> <table border='1'> <tr><th>Title</th><th>Poster</th></tr> """
               + rows + """</table> </body> </html> """)
    return message
from rake_nltk import Rake

# Demo: run RAKE over a sample abstract and print the extracted phrases,
# first plain and then paired with their scores.
rake = Rake()
sample = """Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."""
rake.extract_keywords_from_text(sample)
keyword_extracted = rake.get_ranked_phrases()
keyword_extracted_with_scores = rake.get_ranked_phrases_with_scores()
print("\n*******List of Keyword Extracted*******")
print(*keyword_extracted, sep="\n")
print("\n*******List of Keyword Extracted with Scores*******")
print(*keyword_extracted_with_scores, sep="\n")
def calculate_keywords(text):
    """Extract a ranked, deduplicated keyword list from *text* by merging
    RAKE phrases with TextRank keyphrases.

    Args:
        text: document text (e.g. joined titles and abstracts).

    Returns:
        list of [keyword, normalized_score] pairs, best first, with
        duplicates removed (first/highest-scored occurrence wins).
    """
    # RAKE with an explicit English stopword list (mirrors NLTK's), plus
    # all punctuation characters as phrase delimiters.
    r = Rake(stopwords=[
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down",
        "in", "out", "on", "off", "over", "under", "again", "further",
        "then", "once", "here", "there", "when", "where", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other",
        "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "s", "t", "can", "will", "just", "don",
        "should", "now"
    ])
    r.extract_keywords_from_text(text)
    kw_rake = r.get_ranked_phrases_with_scores()
    # Reorder (score, phrase) -> [phrase, score]; keep phrases > 3 chars,
    # at most two words, every word > 3 chars, no digits, and an accepted
    # part-of-speech pattern.
    kw_rake = [[x[1], x[0]] for x in kw_rake if len(x[1]) > 3]
    kw_rake = [x for x in kw_rake if len(x[0].split()) < 3]
    kw_rake = [x for x in kw_rake if min([len(i) for i in x[0].split()]) > 3]
    kw_rake = [x for x in kw_rake if not re.search(r'\d', x[0])]
    kw_rake = [x for x in kw_rake if check_pos(x[0]) is True]
    # ROBUSTNESS: the original called min()/max() unconditionally and
    # crashed with ValueError when every phrase had been filtered out.
    if kw_rake:
        kw_rake_scores = [x[1] for x in kw_rake]
        my_min = min(kw_rake_scores)
        my_max = max(kw_rake_scores)
        kw_rake = [[x[0], normalize(x[1], my_min, my_max)] for x in kw_rake]
        kw_rake = [x for x in kw_rake if x[1] > 0.1]
    # (A multi-page block of commented-out sample abstract text that sat
    # here in the original was removed as dead weight.)
    kw_text_rank = [
        list(x) for x in (score_keyphrases_by_text_rank(text, n_keywords=0.05))
    ]
    kw_text_rank = [
        x for x in kw_text_rank
        if not re.search(r'(study|studi|effect|relation)', x[0])
    ]
    kw_text_rank = [
        x for x in kw_text_rank if min([len(i) for i in x[0].split()]) > 3
    ]
    # Same empty-list guard as for the RAKE scores above.
    if kw_text_rank:
        kw_text_rank_scores = [x[1] for x in kw_text_rank]
        my_min = min(kw_text_rank_scores)
        my_max = max(kw_text_rank_scores)
        kw_text_rank = [[x[0], normalize(x[1], my_min, my_max)]
                        for x in kw_text_rank]
        kw_text_rank = [x for x in kw_text_rank if x[1] > 0.01]
    keywords = []
    keywords.extend(kw_rake)
    keywords.extend(kw_text_rank)
    keywords = sorted(keywords, key=lambda x: x[1], reverse=True)
    # Deduplicate keeping the first (highest-scored) occurrence; tracking
    # seen phrases in a set makes this O(n) instead of the original
    # O(n^2) rescans of the accumulated list.
    final_keyword_list = []
    seen = set()
    for kw in keywords:
        if kw[0] not in seen:
            seen.add(kw[0])
            final_keyword_list.append(kw)
    # Final ranked keyword list needs to be saved in a json file as well.
    return final_keyword_list
def getkeys(text):
    """Run RAKE over *text* and return its (score, phrase) pairs, best first."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases_with_scores()
def makeDescription(url):
    """Scrape the job posting at *url* and print an auto-generated course outline.

    NOTE(review): this is Python 2 code — `urllib.urlopen` and the bare
    `print actionList` statement do not run under Python 3.
    """
    lemmatizer = WordNetLemmatizer()
    r = Rake()
    # Fetch and parse the page; assumes the site exposes these element ids/classes.
    joblinkTarget = BeautifulSoup(urllib.urlopen(url), "html.parser")
    summaryElement = joblinkTarget.find('div', attrs={'id': 'jobdescSec'})
    text = summaryElement.get_text()
    topicFromHTML = joblinkTarget.find('h1', attrs={'class': 'jobTitle'}).text
    # The top-ranked RAKE phrase of the job title becomes the course topic.
    topicRake = Rake()
    topicRake.extract_keywords_from_text(topicFromHTML)
    topicExtractor = topicRake.get_ranked_phrases()
    topic = topicExtractor[0]
    # Normalize job-title suffixes into subject names
    # (engineer -> engineering, developer -> development, analyst -> analysis).
    if topic.endswith("ineer"):
        topic += "ing"
    elif topic.endswith("oper"):
        topic = topic[:-2] + "ment"
    elif topic.endswith("yst"):
        topic = topic[:-1] + "is"
    listedTech = ""
    r.extract_keywords_from_text(text)
    rankedPhrases = r.get_ranked_phrases()  # To get keyword phrases ranked highest to lowest.
    # Force every phrase to plain ASCII, replacing each item in place.
    for eachPhrase in rankedPhrases:
        toChange = eachPhrase
        eachPhrase = str(eachPhrase.encode('ascii', 'ignore'))
        rankedPhrases[rankedPhrases.index(toChange)] = eachPhrase
    actionList = ""
    # Phrases that start with a verb become "learning outcome" lines: the lead
    # verb is lemmatized and the phrase is moved out of rankedPhrases.
    # NOTE(review): removing from rankedPhrases while iterating it skips elements.
    for sent in rankedPhrases:
        content = ne_chunk(pos_tag(word_tokenize(sent)))
        if len(content) > 1 and content[0][1][0] == 'V':
            sentList = sent.split()
            sentList[0] = lemmatizer.lemmatize(content[0][0], 'v')
            rankedPhrases.remove(sent)
            sent = ""
            for i in sentList:
                sent = sent + " " + i
            actionList = actionList + sent + "\n"
    # The first five remaining phrases are listed as covered technologies.
    if len(rankedPhrases) > 6:
        for each in range(0, 4):
            listedTech = listedTech + rankedPhrases[each] + ", "
        listedTech += rankedPhrases[4]
    print ("\nTopic: -----------------------------------------------------------------------------------")
    print ("Topics in " + topic + "\n")
    print ("Course Description: ----------------------------------------------------------------------")
    print ("Introduction to topics in " + topic + " such as " + listedTech)
    print ("\nCourse Learning Outcomes: -----------------------------------------------------------------")
    print actionList
    keywordsText = []
    print ('Summary:')
    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(text)
    text = ""
    # Re-join the ASCII-cleaned sentences, then print an automatic summary.
    for each in sentences:
        text = text + " " + str(each.encode('ascii','ignore'))
    print (summarize(text))
    print ("\n")
# NOTE(review): stray triple-quote preserved from the original source — it
# appears to open/close a commented-out region that continues outside this chunk.
'''
def key_extract(mytext):
    """Return the RAKE keyphrases of *mytext*, ordered from best to worst."""
    rake = Rake()
    rake.extract_keywords_from_text(mytext)
    return rake.get_ranked_phrases()
def handle_article_rake_nltk(text, nb_to_display):
    """Pretty-print the top *nb_to_display* (score, phrase) pairs RAKE finds in *text*.

    Scoring uses the degree-to-frequency ratio metric; phrases come back ranked
    highest to lowest.
    """
    extractor = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
    extractor.extract_keywords_from_text(text)
    top_scored = extractor.get_ranked_phrases_with_scores()[:nb_to_display]
    pp.pprint(top_scored)
from rake_nltk import Rake
import json
import spacy
from scipy import stats

r = Rake()  # Used to extract keywords
nlp = spacy.load("en_core_web_lg")  # Word embedding dictionary


# Given a sentence, extracts keywords using rake
def get_keywords(sentence):
    """Return RAKE keyphrases for *sentence* (uses the shared module-level Rake)."""
    r.extract_keywords_from_text(sentence)
    keywords = r.get_ranked_phrases()
    return keywords


# Compares question to dictionary (previously read with read_book)
def compute_scores_from_index(question, data):
    """Score every index entry in *data* against *question*.

    Depends on `analyze` and `get_score`, which are defined elsewhere in the
    project.  NOTE(review): no `return` is visible here — the snippet may be
    truncated; `scores` is built but never returned.
    """
    keywords = get_keywords(question)
    analyzed = analyze(question)
    # Augment the RAKE phrases with the grammatical parts analyze() extracted.
    for d in analyzed:
        keywords.extend(d['Subjects'])
        keywords.extend(d['Verbs'])
        keywords.extend(d['Complements'])
    keywords = list(set(keywords))  # de-duplicate
    #keywords = [nlp(k) for k in keywords]
    scores = []
    for index in data:
        score = get_score(keywords, (index['title']))
        scores.append(score)
def categorize(tweet):
    """Return RAKE keyphrases for *tweet*, each at most MAX_PHRASE_LEN words."""
    extractor = Rake(max_length=MAX_PHRASE_LEN)
    extractor.extract_keywords_from_text(tweet)
    return extractor.get_ranked_phrases()
def recommend_config(request):
    """Recommend configurations matching a free-text search query.

    Reads data/master_configs.xlsx, builds a RAKE keyword bag-of-words per
    config, ranks configs by cosine similarity against the query, persists the
    ranking to data/config_op.csv and data/config_op.json, and returns the
    ranking as the response body.  Returns HTTP 400 when no query is supplied.
    """
    search_query = request.data.get('query', None)
    if not search_query:
        return Response("Invalid search query", status=400)

    corpus = pd.read_excel('data/master_configs.xlsx')
    print('reached...')

    # initializing the new column
    corpus['Key_words'] = ""
    for index, row in corpus.iterrows():
        info = row['Info']
        # instantiating Rake, by default it uses english stopwords from NLTK
        # and discards all puntuation characters as well
        r = Rake()
        # extracting the words by passing the text
        r.extract_keywords_from_text(info)
        # getting the dictionary whith key words as keys and their scores as values
        key_words_dict_scores = r.get_word_degrees()
        # assigning the key words to the new column for the corresponding config
        # NOTE(review): pandas does not guarantee that iterrows() rows are views,
        # so this write-back pattern is fragile — confirm on the target version.
        row['Key_words'] = list(key_words_dict_scores.keys())

    # dropping the raw text column now that keywords are extracted
    corpus.drop(columns=['Info'], inplace=True)
    corpus.set_index('Name', inplace=True)

    # one space-joined "bag of words" string per config
    corpus['bag_of_words'] = ''
    columns = corpus.columns
    for index, row in corpus.iterrows():
        words = ''
        for col in columns:
            if col != 'Key_words':
                words = words + row[col] + ' '
            else:
                words = words + ' '.join(row[col]) + ' '
        row['bag_of_words'] = words

    # pipe-separated identifier built from the key hardware columns
    corpus['config_id'] = ''
    columns = corpus.columns
    for index, row in corpus.iterrows():
        words = ''
        for col in columns:
            if col == 'OS' or col == 'Server' or col == 'Controller':
                words = words + row[col] + '|'
        row['config_id'] = words

    # cosine similarity between the cleaned query and each config's bag of words
    search_query = _remove_noise(search_query)
    vector1 = text_to_vector(search_query.lower())
    result = {'Rank': [], 'Heading': [], 'Config': [], 'Cosine': []}
    for i in corpus.index:
        config = corpus['bag_of_words'][i]
        config = _remove_noise(config)
        vector2 = text_to_vector(config.lower())
        cosine = get_cosine(vector1, vector2)
        if cosine > 0:
            result['Rank'].append(0)
            result['Heading'].append(i)
            result['Config'].append(corpus['config_id'][i])
            result['Cosine'].append(cosine)

    result_df = pd.DataFrame(data=result)
    result_df = result_df.sort_values('Cosine', ascending=False)
    result_df.to_csv('data/config_op.csv')
    print('Output saved!')

    # convert csv to json.  FIX: the original opened both files without ever
    # closing them; context managers guarantee the handles are released.
    csvfilename = 'data/config_op.csv'
    jsonfilename = csvfilename.split('.')[0] + '.json'
    fieldnames = ('Rank', 'Heading', 'Config', 'Cosine')
    output = []
    with open(csvfilename, 'r') as csvfile, open(jsonfilename, 'w') as jsonfile:
        reader = csv.DictReader(csvfile)
        for each in reader:
            row = {}
            for field in fieldnames:
                row[field] = each[field]
            output.append(row)
        json.dump(output, jsonfile, indent=2, sort_keys=True)
    # (the original duplicated the 'Output saved!' print here; dropped)
    return Response(output)
from flask import Flask, render_template, request from rake_nltk import Rake, Metric import requests import numpy as np import ast, os, nltk, re, db nltk.download('stopwords') nltk.download('punkt') # stop words: set of words to be excluded from consideration while generating keywords stopwords = nltk.corpus.stopwords.words('english') newStopWords = ['http', 'https', '://', '```', '~~~', '///'] stopwords.extend(newStopWords) rake = Rake(min_length=1, max_length=4, ranking_metric=Metric.WORD_DEGREE, stopwords=stopwords) cursor = db.connect() app = Flask(__name__, static_folder="../web/dist/static", template_folder="../web/dist/") search_dir = os.path.join(app.root_path, '../../_categories/') files = os.listdir(search_dir) files = [os.path.join(search_dir, f) for f in files] # add path to each file category_list = [] for file in files: slash_pos = file.rfind('/') category_string = file[slash_pos + 1:-3] if category_string != 'all_links':
import json
from rake_nltk import Rake
from nltk.corpus import stopwords

# BUG FIX: list.extend() returns None, so the original call
#   Rake(..., stopwords=stopwords.words('english').extend(['book', 'review']))
# passed stopwords=None and Rake silently fell back to its defaults — the
# 'book'/'review' words were never actually excluded.  Build the list first.
_rake_stopwords = stopwords.words('english') + ['book', 'review']
rake = Rake(min_length=2, max_length=4, stopwords=_rake_stopwords)


def find_missing_isbn(s):
    """Compute the ISBN-10 check digit for the digit string *s*.

    Weights run 2..10 over the reversed digits; the result is
    11 - (weighted sum mod 11).  NOTE(review): this can yield 10 or 11, which
    ISBN notation renders as 'X'/'0' — callers concatenate the raw int, so
    verify against real ISBNs.
    """
    print(s)
    rev_s = reversed(s)
    digits = (int(c) for c in rev_s)
    digits = (d * i for i, d in enumerate(digits, start=2))
    return 11 - sum(digits) % 11


def find_full_isbn(s):
    """Drop the EAN prefix and old check digit from *s*, append a recomputed one."""
    crop_s = s[3:-1]
    return crop_s + str(find_missing_isbn(crop_s))


def serialize_meta(meta):
    """Reduce a metadata mapping to just its title and description."""
    d = {'title': meta['title'], 'description': meta['description']}
    return d


def serialize_reviews(reviews):
    """Print each review.

    NOTE(review): the snippet appears truncated here — clean_reviews is built
    but never filled or returned.
    """
    clean_reviews = []
    for r in reviews:
        print(r)
def calc(self, mname):
    """Build a bag-of-words movie-profile table and write the top-10 titles most
    similar to *mname* (cosine similarity over CountVectorizer counts) to
    rname.txt, one per line.  Writes 'Movie not in dataset' when the title is
    unknown.
    """
    pd.set_option('display.max_columns', 100)
    df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
    df.head()
    df.shape
    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
    df.head()
    df.shape
    # discarding the commas between the actors' full names and getting only
    # the first three names
    df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])
    # putting the genres in a list of words
    df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))
    df['Director'] = df['Director'].map(lambda x: x.split(' '))
    # merging together first and last name for each actor and director, so it's
    # considered as one word and there is no mix up between people sharing a
    # first name
    for index, row in df.iterrows():
        row['Actors'] = [x.lower().replace(' ', '') for x in row['Actors']]
        row['Director'] = ''.join(row['Director']).lower()
    # initializing the new column
    df['Key_words'] = ""
    for index, row in df.iterrows():
        plot = row['Plot']
        # instantiating Rake, by default is uses english stopwords from NLTK
        # and discard all puntuation characters
        r = Rake()
        # extracting the words by passing the text
        r.extract_keywords_from_text(plot)
        # getting the dictionary whith key words and their scores
        key_words_dict_scores = r.get_word_degrees()
        # assigning the key words to the new column
        # NOTE(review): pandas does not guarantee iterrows() rows are views;
        # this write-back pattern is fragile — confirm on the target version.
        row['Key_words'] = list(key_words_dict_scores.keys())
    # dropping the Plot column
    df.drop(columns=['Plot'], inplace=True)
    df.set_index('Title', inplace=True)
    df.head()
    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = ''
        for col in columns:
            if col != 'Director':
                words = words + ' '.join(row[col]) + ' '
            else:
                words = words + row[col] + ' '
        row['bag_of_words'] = words
    df.drop(columns=[col for col in df.columns if col != 'bag_of_words'], inplace=True)
    df.head()
    # instantiating and generating the count matrix
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['bag_of_words'])
    # creating a Series for the movie titles so they are associated to an
    # ordered numerical list used later to match the indexes
    indices = pd.Series(df.index)
    indices[:5]
    # generating the cosine similarity matrix
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    cosine_sim

    # takes a movie title and writes the top 10 recommended titles to rname.txt
    def recommendations(title, cosine_sim=cosine_sim):
        rm = ""
        try:
            recommended_movies = []
            # gettin the index of the movie that matches the title
            idx = indices[indices == title].index[0]
            # creating a Series with the similarity scores in descending order
            score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
            # getting the indexes of the 10 most similar movies
            top_10_indexes = list(score_series.iloc[1:11].index)
            # populating the list with the titles of the best 10 matching movies
            for i in top_10_indexes:
                recommended_movies.append(list(df.index)[i])
            # BUG FIX: the original iterated range(len(recommended_movies) - 1),
            # silently dropping the last recommendation from the output.
            for i in range(len(recommended_movies)):
                rm = rm + "\n" + recommended_movies[i]
        except IndexError:
            rm = "Movie not in dataset"
        finally:
            with open("rname.txt", "w+") as file:
                file.seek(0)
                file.write(rm)

    recommendations(mname)
def dollarSA():
    """Sentiment-classify every line of dollar.txt with a Naive Bayes classifier.

    Trains the classifier on ../datasets/trainingData.csv, reports its accuracy
    on ../datasets/testingData.csv, then for each input line strips punctuation
    and stopwords, extracts RAKE keyphrases, and prints the keyphrase text
    followed by its predicted class.
    """
    r = Rake()
    # Opens file and reads in training data
    # NB classifier trains using the read in data
    with open("../datasets/trainingData.csv", 'r') as trainingdata:
        classifier = NaiveBayesClassifier(trainingdata, format="csv")
        print("Training Data")
        classifier.show_informative_features(5)
    # Opens file and reads in testing data
    # Prints testing data accuracy
    # Not needed for final product
    with open("../datasets/testingData.csv", 'r') as testingdata:
        print("Testing data accuracy", classifier.accuracy(testingdata))

    # FIX: hoisted out of the per-line loop — the original recompiled the regex
    # and rebuilt the stopword set for every line.
    regex = re.compile('[^a-zA-Z ]')
    stop_words = set(stopwords.words('english'))

    with open("dollar.txt", 'r', encoding='utf-8') as a_file:
        for line in a_file:
            userInput = line.strip()
            punctuationRemoved = regex.sub('', userInput)
            # Tokenize, then drop stopwords.  FIX: the original built this list
            # with a comprehension, threw the result away by reassigning [],
            # then rebuilt it with an equivalent loop — one pass suffices.
            word_tokens = word_tokenize(punctuationRemoved)
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            # Converts the filtered stop word sentence to string
            stringWithoutStopwords = ' '.join(str(elem) for elem in filtered_sentence)
            # Extracts keywords from the filtered sentence
            r.extract_keywords_from_text(stringWithoutStopwords)
            # Ranks the keywords that have been extracted
            ranked_phrases = r.get_ranked_phrases()
            # Converts extracted keywords list to string
            listToStr = ' '.join(str(elem) for elem in ranked_phrases)
            # Runs string through trained NB classifier
            finalString = TextBlob(listToStr, classifier=classifier)
            # Print string followed by classification
            print(finalString + "," + finalString.classify())
dict['long_description'].append(fields[3]) dict['id'].append(regex.sub(" ", fields[4])) df1 = pd.DataFrame.from_dict(dict) ############################################################################################################################################### """ TRANSLATING THE TEXTS INTO ENGLISH LANGUAGE. TAKING ONLY TITLE AND DESCRIPTION COLUMNS FOR EASYNESS. FOR TRANSLATION USING 'GOOGLETRANS' AND 'TEXTBLOB' PACKAGES FOR BETTER PERFORMANCE. THEN EXTRACTING KEY PHRASES FROM THEM USING 'RAKE'(NLTK package) AND STORED IN A NEW DATAFRAME COLUMN 'keyword_set'. """ columns = ['title', 'description'] temp = [] translator = Translator() r = Rake(stopwords=stop_words, ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO, min_length=2, max_length=3) n = 0 t = 500 dest_t = len(df1) while n < dest_t: for i in range(n, t): for j in columns: print("df1[j][i] = ", df1[j][i], "i==", i) if (len(df1[j][i]) >= 3): blob = TextBlob(str(df1[j][i]))
def key_word_analysis(text):
    """Extract 2-to-4-word RAKE keyphrases from *text*, ranked best first."""
    extractor = Rake(min_length=2, max_length=4)
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()
def recommend():
    """Flask view: build a food recommender from Food_Ingredients.csv plus the
    submitted form fields, then redirect to a page listing the 10 foods most
    similar (cosine similarity over a CountVectorizer bag of words) to the
    user's new entry.
    """
    ingredient_list = ""
    if request.method == 'POST':
        brand = request.form['brand']
        print(brand)
        category = request.form['category']
        manufacturer = request.form['manufacturer']
        message = request.form['message']
        #Appending new data
        ingredient_list += brand + " " + category + " " + manufacturer + " " + message
        print(ingredient_list)
    pd.set_option('display.max_columns', 10)
    df = pd.read_csv('Food_Ingredients.csv', encoding='latin-1')
    print('=' * 90)
    #Choosing the Vector features to base my recommendations on
    print(
        '---------------------------FEATURE VECTOR TABLE---------------------------'
    )
    df = df[[
        'brand', 'categories', 'features.value', 'manufacturer', 'name'
    ]]
    # Prepend the user's submission as a row named 'new'.
    df = pd.DataFrame(np.array([['new', ingredient_list]]),
                      columns=['name', 'features.value']).append(df,
                                                                 ignore_index=True,
                                                                 sort=True)
    # df.loc[0].replace(np.nan, "a")
    # BUG FIX: fillna() returns a new frame — the original discarded the
    # result, so NaNs survived and later became literal 'nan' strings via
    # astype('str').  Assign it back.
    df = df.fillna("")
    print(df.head())
    print(df.shape)
    print(
        '--------------------Feature Vector Information--------------------'
    )
    print(df.info())
    print()
    print('=' * 90)
    print(
        "--------------------LIST OF UNIQUE VALUES IN THE FEATURE VECTORS--------------------"
    )
    unique_values_brand = df['brand'].unique().tolist()
    unique_values_categories = df['categories'].unique().tolist()
    unique_values_ingredients = df['features.value'].unique().tolist()
    unique_values_manufacturer = df['manufacturer'].unique().tolist()
    print('=' * 90)
    print()
    print(
        "--------------------NUMBER OF UNIQUE VALUES FOR THE FEATURE VECTORS--------------------"
    )
    print('Brands: ', unique_values_brand)
    print('Categories: ', unique_values_categories)
    print('Ingredients: ', unique_values_ingredients)
    print('Manufacturers: ', unique_values_manufacturer)
    print('=' * 90)
    print()
    print(
        '--------------------FEATURE VECTORS UNIQUE VALUES(NO)--------------------'
    )
    print("Brand: ", len(unique_values_brand))
    print('Categories: ', len(unique_values_categories))
    print('Ingredients: ', len(unique_values_ingredients))
    print('manufacturers: ', len(unique_values_manufacturer))
    print('=' * 90)
    print()
    print(
        '--------------------FEATURE VECTORS UNIQUE VALUES COUNT(TOP 20)\n--------------------'
    )
    print("BRANDS: \n", df['brand'].value_counts().head(20))
    print()
    print('CATEGORIES: \n', df['categories'].value_counts().head(20))
    print()
    # print('INGREDIENTS: \n', df['features.value'].value_counts())
    print()
    print('MANUFACTURERS: \n', df['manufacturer'].value_counts().head(20))
    print()
    # print('NAMES: \n', df['name'].encode("utf-8").value_counts().head(20 ))
    print()
    print('=' * 90)
    #Transforming the values of the FV individual columns into single words so
    #they are considered as unique values.
    # discarding the commas and getting only the first three entries
    df['categories'] = df['categories'].astype('str')
    df['categories'] = df['categories'].map(lambda x: x.split(',')[:3])
    df['manufacturer'] = df['manufacturer'].astype('str')
    df['manufacturer'] = df['manufacturer'].map(lambda x: x.split(',')[:3])
    df['brand'] = df['brand'].astype('str')
    df['brand'] = df['brand'].map(lambda x: x.split(' '))
    # print('huh??')
    # print(df['categories'].head())
    print('=' * 90)
    print()
    # merging together the parts of each category/manufacturer name, so each is
    # considered as one word and there is no mix up between similar names
    # NOTE(review): pandas does not guarantee iterrows() rows are views; this
    # write-back pattern is fragile — confirm on the target version.
    for index, row in df.iterrows():
        row['categories'] = [
            x.lower().replace(' ', '') for x in row['categories']
        ]
        row['manufacturer'] = [
            x.lower().replace(' ', '') for x in row['manufacturer']
        ]
        row['brand'] = ''.join(row['brand']).lower()
    print(df['categories'].head())
    print(df['brand'].head())
    print(df['manufacturer'].head())
    print('=' * 90)
    # initializing the new column
    df['Key_words'] = ""
    df['features.value'] = df['features.value'].astype('str')
    for index, row in df.iterrows():
        ingredients = row['features.value']
        # instantiating Rake, by default is uses english stopwords from NLTK
        # and discard all puntuation characters
        r = Rake()
        # extracting the words by passing the text
        r.extract_keywords_from_text(ingredients)
        # getting the dictionary whith key words and their scores
        key_words_dict_scores = r.get_word_degrees()
        # assigning the key words to the new column
        row['Key_words'] = list(key_words_dict_scores.keys())
    # dropping the Features.value column
    df.drop(columns=['features.value'], inplace=True)
    df.set_index('name', inplace=True)
    print(df.head())
    print('=' * 90)
    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = ''
        for col in columns:
            if col != 'brand':
                words = words + ' '.join(row[col]) + ' '
            else:
                words = words + row[col] + ' '
        row['bag_of_words'] = words
    df.drop(columns=[col for col in df.columns if col != 'bag_of_words'],
            inplace=True)
    print('bag of word right?')
    # df = pd.DataFrame(np.array([['new','just a joke']]), columns=['name','bag_of_words']).append(df, ignore_index=True)
    # print('MY OWN RATINGS')
    # f = df.loc[0].dropna()
    # print(f.head())
    print(df.shape)
    print(df.head())
    # instantiating and generating the count matrix
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['bag_of_words'])
    # creating a Series for the Food names so they are associated to an ordered
    # numerical list I will use later to match the indexes
    indices = pd.Series(df.index)
    print('is thiss??')
    print(indices[:5])
    # generating the cosine similarity matrix
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    print("cosine simm")
    print(cosine_sim)

    # function that takes in food as input and returns the top 10 recommended foods
    def recommendations(title, cosine_sim=cosine_sim):
        recommended_food = []
        # gettin the index of the food that matches the name
        idx = indices[indices == title].index[0]
        # creating a Series with the similarity scores in descending order
        score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
        # getting the indexes of the 10 most similar foods
        top_10_indexes = list(score_series.iloc[1:11].index)
        # populating the list with the names of the best 10 matching foods
        for i in top_10_indexes:
            recommended_food.append(list(df.index)[i])
        print("wait")
        for r in range(len(recommended_food)):
            print(recommended_food[r])
        return recommended_food

    foods = recommendations('new')
    comma_separated = ','.join(foods)
    return redirect(url_for('display_recommendations', food=comma_separated))
def readfile(self, path):
    """Parse a PubMed-style text export at *path* into records and store them.

    Records are numbered starting at self.start; each is read block-by-block
    via self.readblock.  Records missing country info, missing an abstract, or
    with an abstract shorter than self.minimum_length are skipped.  Each kept
    record gets its top-2 RAKE keyphrases and is inserted into MongoDB.
    """
    with open(path) as f:
        n = self.start
        rake = Rake()
        while True:
            line = self.readblock(f)
            #check new record starts
            if line.startswith('%d. ' % n):
                data = {}
                data['seqid'] = n
                #get time
                data['time'] = line.split('.')[2].split(';')[0].strip()
                #get title
                data['title'] = self.readblock(f).replace('\n', ' ').strip()
                #get author
                data['author'] = self.readblock(f).replace('\n', ' ').strip()
                #detect affiliation
                line = self.readblock(f)
                if line.startswith('Author information:'):
                    data['affiliation'] = line.replace('\n', ' ').strip()
                    countries = self.findcountry(data['affiliation'])
                    if countries:
                        data['country'] = countries
                    else:
                        #could not find country info, skip it
                        LOGGER.info(
                            'could not find country info: of paper id: [%d], [%s]'
                            % (int(n), data['affiliation']))
                        n += 1
                        continue
                    # consume the affiliation block; next block is the abstract candidate
                    line = self.readblock(f)
                #abstract
                while line.startswith('Comment'):
                    line = self.readblock(f)
                #no abstract
                if line.find('DOI:') >= 0 or line.find(
                        'PMID:') >= 0 or line.find('PMCID:') >= 0:
                    # NOTE(review): rstrip() strips a *character set*, not the
                    # suffix string '[Indexed for MEDLINE]' — confirm intent.
                    data['DOI'] = line.replace(
                        '\n', ' ').rstrip('[Indexed for MEDLINE]').strip()
                    #skip this record as it does not have abstract
                    n += 1
                    continue
                else:
                    data['abstract'] = line.replace('\n', ' ').strip()
                    #abstract length is too short
                    if len(data['abstract']) < self.minimum_length:
                        n += 1
                        continue
                    line = self.readblock(f)
                    #skip all other elements until encountering DOI
                    while line.find('DOI:') == -1 and line.find(
                            'PMID:') == -1 and line.find('PMCID:') == -1:
                        line = self.readblock(f)
                    data['DOI'] = line.replace(
                        '\n', ' ').rstrip('[Indexed for MEDLINE]').strip()
                    n += 1
                    #here data is ready for processing
                    rake.extract_keywords_from_text(data['abstract'])
                    words = rake.get_ranked_phrases()
                    data['keywords'] = words[:2]
                    LOGGER.debug(data)
                    LOGGER.info('starts saving id: %d data into mongodb' %
                                int(n - 1))
                    self.mongo.insert([data], self.collection)
            #record ends
            if n > self.max_items:
                break
class bcolors:
    """ANSI escape sequences for colored/styled terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


# `parser` is an argparse parser defined elsewhere in this file.
args = parser.parse_args()
rake = Rake(min_length=args.min_keyw_length,
            max_length=args.max_keyw_length,
            ranking_metric=Metric.WORD_DEGREE)
# Walk the requested number of listing pages on answers.gazebosim.org,
# choosing the sort order from the CLI flags.
for page in range(int(args.num_pages)):
    if not args.default_sort and not args.answers_sort:
        req = requests.get(
            'http://answers.gazebosim.org/questions/scope:all/sort:votes-desc/page:'
            + str(page + 1) + '/')
        print("Entries sorted based on votes: ")
    elif args.answers_sort:
        print("Entries sorted based on most-answered: ")
        req = requests.get(
            'http://answers.gazebosim.org/questions/scope:all/sort:answers-desc/page:'
            + str(page + 1) + '/')
    elif args.default_sort:
        print("Entries sorted based on activity: ")
        # NOTE(review): this branch assigns no `req`; the loop body appears to
        # continue beyond this chunk.
# Aggregate review text (column 14) per two-letter code (column 18) from
# `content` (defined earlier in the file), then keep only each group's RAKE
# phrases scoring at least 20.
for row in content:
    # skip rows without a valid two-letter code
    if row[18] == '' or len(row[18]) != 2:
        continue
    elif row[18] not in reviews.keys():
        reviews[row[18]] = ''
        cnt[row[18]] = 0
    # BUG FIX: the original used `pass` here, so the 3000-review cap never took
    # effect; `continue` actually stops accumulating once the cap is reached.
    if cnt[row[18]] >= 3000:
        continue
    cnt[row[18]] += 1
    reviews[row[18]] = reviews[row[18]] + row[14]
print(reviews.keys())
for j in reviews.keys():
    # Uses stopwords for english from NLTK, and all puntuation characters.
    r = Rake(ranking_metric=Metric.WORD_DEGREE, max_length=4)
    # FIX: extract_keywords_from_text() returns None — the original bound that
    # None to an unused `keywords` variable.
    r.extract_keywords_from_text(reviews[j])
    phrases = r.get_ranked_phrases()  # keyword phrases ranked highest to lowest
    scores = r.get_ranked_phrases_with_scores()  # keyword phrases with scores
    # print(len(scores),len(phrases))
    # Truncate at the first phrase scoring below 20.
    # BUG FIX: the original deleted phrases[i:-1], which left the very last
    # (lowest-scoring) phrase in the list; [i:] drops the whole tail.
    i = 0
    for score in scores:
        if score[0] < 20:
            del phrases[i:]
            break
        i += 1
    # print(phrases)
def InitNLPRake():
    """Build and return the RAKE extractor used for keyword extraction."""
    #nlp=Rake(StopWords);
    extractor = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
                     max_length=6)
    return extractor
def find_keyword(terms):
    """Return one- and two-word RAKE keyphrases from *terms*, ranked best first."""
    rake_extractor = Rake(min_length=1, max_length=2)
    rake_extractor.extract_keywords_from_text(terms)
    return rake_extractor.get_ranked_phrases()