Example #1
def supreme(s):
    consumer_key = '8960pswi0ALmad8bD27Bofh22'
    consumer_secret = 'hSFcDZUsfwSbn3eutUirambdqLK1dwMyZkL40BAuoYY4mcbLbE'
    access_token = '934833577803616257-mVf5WjNVNfT2eWmQ4T46N2T2BDFZ1tV'
    access_token_secret = '5xQVESFc6kGaQSbtdhvew1WPi73Yne1a9lTi62oPrkKba'

    try:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    except Exception:
        print("Error: Authentication Failed")

    tweets = get_tweets(query=s, count=200)

    ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
    ppos = 100 * len(ptweets) / len(tweets)
    # print("Positive tweets percentage: {} %".format(ppos))

    ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
    pneg = 100 * len(ntweets) / len(tweets)
    # print("Negative tweets percentage: {} %".format(pneg))

    neutweets = [tweet for tweet in tweets if tweet['sentiment'] == 'neutral']
    # express the neutral share as a percentage, consistent with ppos and pneg
    pneu = 100 * (len(tweets) - len(ntweets) - len(ptweets)) / len(tweets)
    # print("Neutral tweets percentage: {} %".format(pneu))

    """
    print("\n\nPositive tweets:")
    for tweet in ptweets[:10]:
        print(tweet['text'])

    print("\n\nNegative tweets:")
    for tweet in ntweets[:10]:
        print(tweet['text'])
    print("\n\nNeutral tweets:")
    for tweet in neutweets[:10]:
        print(tweet['text'])
    """

    
    newsapi = NewsApiClient(api_key='bb0f664df41346a38b42d10e3682c915')

    all_news = newsapi.get_everything(q=s)
    l1 = all_news.get('articles')
    newsl = []

    titles = []
    for i in l1:
        if i.get('content'):
            newsl.append(i.get('content'))
            titles.append(i.get('title').lower())

    r = Rake()

    l1 = []
    for i in newsl:
        r.extract_keywords_from_text(i)
        for j in r.get_ranked_phrases():
            l1.append(j)

    

    l = []

    if pneg + 0.5 * pneu > 50:
        tweets1 = ntweets[:]
    else:
        tweets1 = tweets[:]
    for i in tweets1:
        l.append(i.get('text'))
    l2 = []
    for i in l:
        r.extract_keywords_from_text(i)
        for j in r.get_ranked_phrases():
            l2.append(j)

    

    intersection = list(set([value for value in l1 if value in l2 and len(value) > 2]))
    titleRank = []
    for i in titles:
        titleRank.append(len(set(i.split()) & set(intersection)))
    truthfulness = len(intersection) > 2

    return ptweets, ppos, ntweets, pneg, neutweets, pneu, intersection, truthfulness, titles[titleRank.index(max(titleRank))]
from rake_nltk import Rake
rake = Rake()

text = """İnsanın konuşacak kadar zekaya, ya da susacak kadar
akla sahip olmaması büyük bir talihsizliktir. - Stefan Zweig"""
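# English gloss of the Turkish quote above: "It is a great misfortune not to have enough
# intelligence to speak, or enough sense to keep silent." - Stefan Zweig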

rake.extract_keywords_from_text(text)
keyword = rake.get_ranked_phrases()
print(keyword)
from rake_nltk import Rake
from pyexcel_xls import get_data
from pyexcel_xls import save_data
from textblob import TextBlob

r = Rake()
path = r"E:\Users\lockon\Desktop\\"
inputFileName = r"test.xlsx"
outputFileName = r"result2.xls"


def getExcelData():
    xls_data = get_data(path + inputFileName)
    return xls_data


def saveExcelData(sheet1):
    xls_data.update({u"Sheet1": sheet1})
    save_data(path + outputFileName, xls_data)


xls_data = getExcelData()

# Iterate over the workbook's sheets; only the first one has content, so stop after the first key
for sheet_n in xls_data.keys():
    break
sheet1 = xls_data[sheet_n]

# Iterate over every row in sheet1
for rowData in sheet1:
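    # The snippet is cut off at this point in the source. A minimal sketch of a plausible loop
    # body, assuming the text to analyse sits in the first cell of each row (an assumption, not
    # the original code): extract RAKE keyphrases for the cell and append them to the row.
    if rowData and isinstance(rowData[0], str):
        r.extract_keywords_from_text(rowData[0])
        rowData.append(", ".join(r.get_ranked_phrases()))

# saveExcelData(sheet1)  # hypothetical call to write the augmented sheet back out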
Example #4
def recommendations(title, val):

    df = pd.read_csv('bcd.csv')
    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]

    if val == '1':
        flag = 0
        for i in range(len(list(df['Title']))):
            if title == str(df['Title'][i]):
                flag = 1
        if flag == 0:
            return -1

    for i in range(len(df.index)):
        gen = df['Genre'][i].lower().split(", ")
        df.at[i, 'Genre'] = gen
        dct = df['Director'][i].lower().split(" ")
        st = ""
        for j in range(len(dct)):
            st = st + str(dct[j])
        df.at[i, 'Director'] = st
        act = df['Actors'][i].lower().split(", ")[:3]
        for j in range(len(act)):
            act[j] = act[j].replace(' ', '')
        df.at[i, 'Actors'] = act

    df['Key_words'] = ""

    for index, row in df.iterrows():
        plot = row['Plot']
        """creating an object of the Rake class to extract keywords from the plot"""
        r = Rake()
        """
            extract_keywords_from_text finds the keywords in the passed string by
            removing common stop words like a, the, an, from, it, etc.
            """
        r.extract_keywords_from_text(plot)
        """
            getting the dictionary of the extracted words where the words act as the
            keys and they have a numeric value assigned to them
            """
        key_words_dict_scores = r.get_word_degrees()
        """
            assigning the list of keywords from the dictionary to the newly created
            'Key_words' column; df.at is used so the assignment actually persists
            (writing to the iterrows row would not modify the DataFrame)
            """
        df.at[index, 'Key_words'] = list(key_words_dict_scores.keys())

    df['BOW'] = ""
    df.drop(columns=['Plot'], inplace=True)

    for i in range(len(df.index)):
        s = ""
        lst = df['Genre'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        s = s + str(df['Director'][i]) + ' '
        lst = df['Actors'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        lst = df['Key_words'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        df.at[i, 'BOW'] = s
    '''dropping every column except for bag_of_words as they are not needed anymore'''
    df.drop(columns=[col for col in df.columns if col != 'BOW'], inplace=True)
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['BOW'])
    #indices = pd.Series(df.index)
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    h = count_matrix.toarray()
    #print(h)

    # sanity check: recompute the cosine similarity of row 0 against every row by hand
    # and compare it with sklearn's cosine_similarity output
    var = h[0]
    Innermatrix = []
    for j in range(len(df.index)):
        var2 = h[j]

        numerator = 0
        denom1 = 0
        denom2 = 0

        for o in range(len(h[0])):
            numerator += (var[o] * var2[o])
            denom1 += (var[o]**2)
            denom2 += (var2[o]**2)

        denom1 = denom1**0.5
        denom2 = denom2**0.5
        numerator = numerator / (denom1 * denom2)

        Innermatrix.append(numerator)

    print("COSINEFUNCTION\t \tMY LOOP\n")

    for k in range(0, 5):
        print(cosine_sim[0][k], "\t\t\t\t\t", Innermatrix[k])
Example #5
def jsonToTxt(jsonString):
    # crude extraction of every "text" field from a JSON string: split on the key,
    # then keep everything up to the next double quote
    substringListOne = jsonString.split('"text": ')
    stronk = ""
    for sub in substringListOne:
        sub = sub[1:]
        i = 0
        for s in sub:
            if s == '\"':
                break
            i += 1
        sub = sub[:i]
        if sub:
            stronk += sub + " "
    return stronk

stank = jsonToTxt(jsonInput)
############

from rake_nltk import Rake

r = Rake() # Uses stopwords for english from NLTK, and all punctuation characters.

# To provide your own set of stop words and punctuations, use
# r = Rake(<list of stopwords>, <string of punctuations to ignore>) -- see the short sketch after this snippet.
r.extract_keywords_from_text(stank)

r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest.
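
# A self-contained sketch of that customised form (the stop word list and punctuation string
# below are illustrative values, not taken from the original snippet):
import string
from rake_nltk import Rake

custom_stopwords = ["the", "a", "an", "and", "or", "of", "to", "in", "for"]
custom_rake = Rake(stopwords=custom_stopwords, punctuations=string.punctuation)
custom_rake.extract_keywords_from_text(stank)
print(custom_rake.get_ranked_phrases())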
Example #6
    def get_top_k_docs(self, query, k=100):
        """
        Args:
            query: string
            k: int (default: 100)

        Returns:
            top_k_docs: dictionary keys: titles, abstracts, ids. Each element in dict[key] is a list of k elements in descending order of relevance
        """
        query_words = preprocess_query(query)
        if len(query_words) > 10:  # long query search
            r = Rake(min_length=1, max_length=4)
            r.extract_keywords_from_text(query)
            phrases = list(set(' '.join(r.get_ranked_phrases()).split()))
            query_words = preprocess_query(' '.join(phrases))

        top_k_docs = self.model.get_top_n(query_words, self.corpus, n=k)

        insensitive_comparers = {}
        for qw in query_words:
            insensitive_comparers[qw] = re.compile(re.escape(qw),
                                                   re.IGNORECASE)

        results = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
        relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
        not_relevant = {'titles': [], 'abstracts': [], 'ids': [], 'links': []}
        for i in top_k_docs:
            abstract = i['abstract'].replace('\n', '')
            if abstract == '':
                abstract = i['introduction'].replace('\n', '')
            if abstract == '':
                continue

            abstract = remove_punctuations(abstract)

            title = i['title'].replace('\n', '')
            if title == '':
                continue

            doc_text = (title.lower() + ' ' + abstract.lower() + ' ' +
                        i['introduction'].replace('\n', '').lower())
            query_words_found = False
            for qw in query_words:
                if qw in doc_text:
                    query_words_found = True
                    break
            if not query_words_found:
                continue

            # Bold mark query words in abstract
            for qw in query_words:
                abstract = insensitive_comparers[qw].sub(
                    '<b>' + qw + '</b>', abstract)

            rel_score = self.relevance_scores[(tuple(query_words), i['id'])]
            if rel_score > 0:
                relevant['titles'].append(title.title())
                relevant['abstracts'].append(abstract)
                relevant['ids'].append(i['id'])
                relevant['links'].append(i['link'])
            elif rel_score < 0:
                not_relevant['titles'].append(title.title())
                not_relevant['abstracts'].append(abstract)
                not_relevant['ids'].append(i['id'])
                not_relevant['links'].append(i['link'])
            else:
                results['titles'].append(title.title())
                results['abstracts'].append(abstract)
                results['ids'].append(i['id'])
                results['links'].append(i['link'])

        for key in ['abstracts', 'ids', 'titles', 'links']:
            results[key] = relevant[key] + results[key] + not_relevant[key]

        return results
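
# Hypothetical usage of the method above, assuming `engine` is an instance of the surrounding
# search class (the class itself is not shown in this snippet):
#   results = engine.get_top_k_docs("covid transmission in schools", k=20)
#   results['titles'], results['abstracts'], results['ids'] and results['links'] are aligned
#   lists, ordered relevant -> unjudged -> not relevant.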
# Input : wikiplots file ( plots, titles)
## Sample downloaded from https://github.com/markriedl/WikiPlots (there is a plots.zip folder )
## Sample input:
## plots : Old Major, the old boar on the Manor Farm, summons the animals on the farm together for a meeting, during which he refers to humans as "enemies" and teaches the animals a revolutionary song called "Beasts of England".
## titles: Animal Farm
# Output: wikiplot.kwRAKE.csv
# Sample output:
## plot-1_0	K	animal farm[SEP]happiest animals live simple lives .'[SEP]several men attack animal farm .'[SEP]napoleon educates young puppies[SEP]boxer continues working harder[SEP]irresponsible farmer mr jones[SEP]set aside special food items[SEP]frequently smears snowball[SEP]anthem glorifying napoleon[SEP]revolutionary song called[SEP]similar animal revolts .'	I	4	Old Major, the old boar on the Manor Farm, summons the animals on the farm together for a meeting, during which he refers to humans as "enemies" and teaches the animals a revolutionary song called "Beasts of England". When Major dies, two young pigs, Snowball and Napoleon, assume command and consider it a duty to prepare for the Rebellion. The animals revolt and drive the drunken and irresponsible farmer mr Jones from the farm, renaming it "Animal Farm". They adopt the Seven Commandments of Animalism, the most important of which is, "All animals are equal". Snowball teaches the animals to read and write, while Napoleon educates young puppies on the principles of Animalism. Food is plentiful, and the farm runs smoothly.	NA

infile = 'data/download/plots'
infile_title = 'data/download/titles'
outfile = 'data/generated/wikiplot.kwRAKE.csv'

# 2.2 - Execute
r = Rake()
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
topK = 10

f = open(infile, 'r', encoding='ISO-8859-1')
f_title = open(infile_title, 'r', encoding='ISO-8859-1')
fout = open(outfile, 'a', encoding='ISO-8859-1')

lines = f.readlines()
lines_title = f_title.readlines()

abstract_lens = {}

print("Starting Pre-processing")
sentences_to_write = []
w = 0
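
# The pre-processing loop itself is cut off in the source. A rough sketch of what it might look
# like, assuming the plots and titles files are line-aligned and that the "[SEP]"-joined field in
# the sample output above is built from the top-K RAKE phrases of each plot (both are assumptions,
# not the original code):
for plot, title in zip(lines, lines_title):
    r.extract_keywords_from_text(plot)
    keyphrases = r.get_ranked_phrases()[:topK]
    kw_field = title.strip().lower() + "[SEP]" + "[SEP]".join(keyphrases)
    sentences_to_write.append(kw_field + "\t" + plot.strip())
    w += 1
# fout.write("\n".join(sentences_to_write))  # hypothetical final write to wikiplot.kwRAKE.csv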
Example #8
def similar_items(item_id):
    book = pd.read_csv(books_utils.data_path)
    movie = pd.read_csv(movies_utils.data_path)
    tvshow = pd.read_csv(shows_utils.data_path)

    # data cleaning
    book['item_data'] = book['book_title'] + ' ' + book[
        'book_author'] + ' ' + book['book_plot']
    book['item_id'] = book['book_id']
    book = book.drop([
        'book_id', 'book_title', 'book_genre', 'book_author', 'book_plot',
        'book_rating', 'book_link'
    ],
                     axis=1)
    movie['item_data'] = movie['movie_title'] + ' ' + movie['movie_plot']
    movie['item_id'] = movie['movie_id']
    movie = movie.drop([
        'movie_id', 'movie_title', 'movie_genre', 'actors', 'movie_plot',
        'imdb_rating', 'movie_link', 'director'
    ],
                       axis=1)
    tvshow['item_data'] = tvshow['show_name'] + ' ' + tvshow['show_plot']
    tvshow['item_id'] = tvshow['show_id']
    tvshow = tvshow.drop([
        'show_id', 'show_name', 'show_genre', 'show_plot', 'show_rating',
        'show_link'
    ],
                         axis=1)

    data = (book.append(movie)).append(tvshow)

    data['key_words'] = ""

    for index, row in data.iterrows():
        item_data = row['item_data']
        r = Rake()
        r.extract_keywords_from_text(item_data)
        key_words_dict_scores = r.get_word_degrees()
        # use .at so the extracted keywords are actually written back to the DataFrame
        data.at[index, 'key_words'] = list(key_words_dict_scores.keys())
    data.drop(columns=['item_data'], inplace=True)

    data.set_index('item_id', inplace=True)

    data['bag_of_words'] = ''
    columns = data.columns
    for index, row in data.iterrows():
        words = ''
        for col in columns:
            words = words + ' '.join(row[col]) + ' '
        data.at[index, 'bag_of_words'] = words

    data.drop(columns=[col for col in data.columns if col != 'bag_of_words'],
              inplace=True)

    count = TfidfVectorizer()
    count_matrix = count.fit_transform(data['bag_of_words'])
    indices = pd.Series(data.index)
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    idx = indices[indices == item_id].index[0]

    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)

    # take the 50 most similar items (skipping index 0, which is the item itself)
    top_indexes = list(score_series.iloc[1:51].index)

    ans = []
    for i in top_indexes:
        ans.append(data.iloc[i].name)
    return ans
Example #9
def rec(title, val):
    df = pd.read_csv('bcd.csv')
    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
    #df.sort_values('Title',inplace=True)
    #df.drop_duplicates(subset='Title',keep='first',inplace=False)
    '''in case the movie is not found then -1 is returned to the function'''
    if val == '1':
        flag = 0
        for i in range(len(list(df['Title']))):
            if title == str(df['Title'][i]):
                flag = 1
        if flag == 0:
            return -1
    '''cleaning the dataset'''
    for i in range(len(df.index)):
        gen = df['Genre'][i].lower().split(", ")
        df.at[i, 'Genre'] = gen
        dct = df['Director'][i].lower().split(" ")
        st = ""
        for j in range(len(dct)):
            st = st + str(dct[j])
        df.at[i, 'Director'] = st
        act = df['Actors'][i].lower().split(", ")[:3]
        for j in range(len(act)):
            act[j] = act[j].replace(' ', '')
        df.at[i, 'Actors'] = act

    df['Key_words'] = ""
    '''following loop stores the extracted keywords from the 'Plot' column in a new column Key_words'''
    for i in range(len(df.index)):
        tempstr = str(df['Plot'][i])
        """creating object of Rake class to extract keywords from plot"""
        r = Rake()
        """
            extract_keywords_from_plot function finds out the keywords from the passed
            string by removing common stop words like a,the,an,from,it..etc
            """
        r.extract_keywords_from_text(tempstr)
        """
            getting the dictionary of the extracted words where the words act as the
            keys and they have a numeric value assigned to them
            """
        key_words = r.get_word_degrees()
        """
            now we are assigning the list of the keywords from the dictionary to the
            newly created column called 'Key_words'
            """
        df.at[i, 'Key_words'] = list(key_words.keys())
    """Creating a new column for the concatenated strings to be stored as bag of words"""
    df['BOW'] = ""
    df.drop(columns=['Plot'], inplace=True)
    """The following loop creates the bag of strings for every row"""
    for i in range(len(df.index)):
        s = ""
        lst = df['Genre'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        s = s + str(df['Director'][i]) + ' '
        lst = df['Actors'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        lst = df['Key_words'][i]
        for j in range(len(lst)):
            s = s + str(lst[j]) + ' '
        df.at[i, 'BOW'] = s
    """Replacing the default indexing by the title of the movies"""
    df.set_index('Title', inplace=True)
    """Dropping every column except for bag of words because they aren't needed beyond here"""
    df.drop(columns=[col for col in df.columns if col != 'BOW'], inplace=True)
    """initializing an instance of CountVectorizer to create sparse matrix vectors"""
    count = CountVectorizer()
    count_matrix = count.fit_transform(df['BOW'])
    """calculating the similarity scores of the sparse matrix vectors and storing them in a 2d matrix"""
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    '''empty list to store the recommended movies'''
    rec_mov = []
    indexes = []
    indices = pd.Series(df.index)
    '''idx stores the index of the movie input by the user'''
    idx = indices[indices == title].index[0]
    '''creating a series of scores of the movies corresponding to the input'''
    score = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    '''getting the indices of the top 10 closest related movies'''
    for i in range(1, 11):
        indexes.append(score.index[i])
    '''Adding the recommended movies to the list '''
    for i in indexes:
        rec_mov.append(list(df.index)[i])

    i = 0
    '''lurl is a list to store the url's of the posters of the movie'''
    lurl = list()
    '''year is a list to store the year of the release of the movie'''
    year = list()
    '''the following loop scrapes values of url and year from the open source movie database'''
    while i < 10:
        movie = str(rec_mov[i])
        url = "http://www.omdbapi.com/?i=tt3896198&apikey=7eaeeaff&t=" + movie
        http = urllib3.PoolManager()
        response = http.request('GET', url)
        data = response.data
        values = json.loads(data)
        lurl.append(values['Poster'])
        year.append(values['Year'])
        i = i + 1
    '''css file for the output file'''
    css = """table{
    border-collapse: collapse;
    border:3px solid red;}
th{
    border:3px solid red;
    text-align:center;
    font-weight:bold;
    font-family:'Times New Roman', Times, serif;
    background-color:white;
    opacity:0.9;}
td{
    border:3px solid red;
    text-align:center;
    font-weight:bold;
    font-size:20px;
    font-family:'Franklin Gothic Medium', 'Arial Narrow', Arial, sans-serif;
    background-color:white;
    opacity:0.9;}"""

    message = """<html>
        <head>
    <title>Recommended Movies</title>
    <style>
     """ + css + """
    </style>
    </head>
    <body background="https://i.insider.com/5f578371e6ff30001d4e76be?width=1136&format=jpeg">
    <table border='1'>
    <tr><th>Title</th><th>Poster</th></tr>
    <tr><td>""" + str(rec_mov[0]) + '<br>' + str(
        year[0]) + """</td><td><img src=""" + str(lurl[0]) + """></td></tr>
    <tr><td>""" + str(rec_mov[1]) + '<br>' + str(
            year[1]) + """</td><td><img src=""" + str(lurl[1]) + """></td></tr>
    <tr><td>""" + str(rec_mov[2]) + '<br>' + str(
                year[2]
            ) + """</td><td><img src=""" + str(lurl[2]) + """></td></tr>
    <tr><td>""" + str(rec_mov[3]) + '<br>' + str(
                year[3]) + """</td><td><img src=""" + str(
                    lurl[3]) + """></td></tr>
    <tr><td>""" + str(rec_mov[4]) + '<br>' + str(
                        year[4]) + """</td><td><img src=""" + str(
                            lurl[4]) + """></td></tr>
    <tr><td>""" + str(rec_mov[5]) + '<br>' + str(
                                year[5]) + """</td><td><img src=""" + str(
                                    lurl[5]) + """></td></tr>
    <tr><td>""" + str(rec_mov[6]
                      ) + '<br>' + str(
                          year[6]) + """</td><td><img src=""" + str(
                              lurl[6]) + """></td></tr>
    <tr><td>""" + str(rec_mov[7]) + '<br>' + str(
                                  year[7]) + """</td><td><img src=""" + str(
                                      lurl[7]) + """></td></tr>
    <tr><td>""" + str(rec_mov[8]
                      ) + '<br>' + str(
                          year[8]) + """</td><td><img src=""" + str(
                              lurl[8]) + """></td></tr>
    <tr><td>""" + str(rec_mov[9]) + '<br>' + str(
                                  year[9]) + """</td><td><img src=""" + str(
                                      lurl[9]) + """></td></tr>
    </table>
    </body>
    </html>>
   """
    return message
from rake_nltk import Rake
rake_nltk_var = Rake()
 
text = """Compatibility of systems of linear constraints
        over the set of natural numbers. Criteria of compatibility of a system
        of linear Diophantine equations, strict inequations, and nonstrict
        inequations are considered. Upper bounds for components of a minimal
        set of solutions and algorithms of construction of minimal generating
        sets of solutions for all types of systems are given. These criteria
        and the corresponding algorithms for constructing a minimal supporting
        set of solutions can be used in solving all the considered types of
        systems and systems of mixed types."""
    
rake_nltk_var.extract_keywords_from_text(text)
keyword_extracted = rake_nltk_var.get_ranked_phrases()
keyword_extracted_with_scores = rake_nltk_var.get_ranked_phrases_with_scores()

print("\n*******List of Keyword Extracted*******")
print(*keyword_extracted, sep = "\n") 
print("\n*******List of Keyword Extracted with Scores*******")
print(*keyword_extracted_with_scores, sep = "\n") 
Example #11
def calculate_keywords(text):
    r = Rake(stopwords=[
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
        "t", "can", "will", "just", "don", "should", "now"
    ])  # Uses stopwords for english from NLTK, and all punctuation characters.

    #titles and abstracts parsed and saved in json file need to be joined here below
    #my_text=re.sub(r'\s\s+',' ',text.lower())

    r.extract_keywords_from_text(text)

    kw_rake = r.get_ranked_phrases_with_scores()
    kw_rake = [[x[1], x[0]] for x in kw_rake if len(x[1]) > 3]
    kw_rake = [x for x in kw_rake if len(x[0].split()) < 3]
    kw_rake = [x for x in kw_rake if min([len(i) for i in x[0].split()]) > 3]
    kw_rake = [x for x in kw_rake if not re.search(r'\d', x[0])]
    kw_rake = [x for x in kw_rake if check_pos(x[0]) is True]

    kw_rake_scores = [x[1] for x in kw_rake]
    my_min = min(kw_rake_scores)
    my_max = max(kw_rake_scores)

    kw_rake = [[x[0], normalize(x[1], my_min, my_max)] for x in kw_rake]
    kw_rake = [x for x in kw_rake if x[1] > 0.1]

    #text="History and evolution of the arctic flora: in the footsteps of Eric. A major contribution to our initial understanding of the origin, history and biogeography of the present-day arctic flora was made by Eric Hulten in his landmark book Outline of the History of Arctic and Boreal Biota during the Quarternary Period, published in 1937. Here we review recent molecular and fossil evidence that has tested some of Hulten's proposals. There is now excellent fossil, molecular and phytogeographical evidence to support Hulten's proposal that Beringia was a major northern refugium for arctic plants throughout the Quaternary. In contrast, most molecular evidence fails to support his proposal that contemporary east and west Atlantic populations of circumarctic and amphi-Atlantic species have been separated throughout the Quaternary. In fact, populations of these species from opposite sides of the Atlantic are normally genetically very similar, thus the North Atlantic does not appear to have been a strong barrier to their dispersal during the Quaternary. Hulten made no detailed proposals on mechanisms of speciation in the Arctic; however, molecular studies have confirmed that many arctic plants are allopolyploid, and some of them most probably originated during the Holocene. Recurrent formation of polyploids from differentiated diploid or more low-ploid populations provides one explanation for the intriguing taxonomic complexity of the arctic flora, also noted by Hulten. In addition, population fragmentation during glacial periods may have lead to the formation of new sibling species at the diploid level. Despite the progress made since Hulten wrote his book, there remain large gaps in our knowledge of the history of the arctic flora, especially about the origins of the founding stocks of this flora which first appeared in the Arctic at the end of the Pliocene (approximately 3 Ma). Comprehensive analyses of the molecular phylogeography of arctic taxa and their relatives together with detailed fossil studies are required to fill these gaps. Quantification of population sizes of large herbivores and their long-term functional role in ecosystems using dung fungal spores. The relationship between large herbivore numbers and landscape cover over time is poorly understood. There are two schools of thought: one views large herbivores as relatively passive elements upon the landscape and the other as ecosystem engineers driving vegetation succession. The latter relationship has been used as an argument to support reintroductions of large herbivores onto many landscapes in order to increase vegetation heterogeneity and biodiversity through local-scale disturbance regimes. Most of the research examining the relationship between large herbivores and their impact on landscapes has used extant studies. An alternative approach is to estimate the impact of variations in herbivore populations through time using fossil dung fungal spores and pollen in sedimentary sequences. However, to date, there has been little quantification of fossil dung fungal spore records and their relationship to herbivore numbers, leaving this method open to varied interpretations. In this study, we developed further the dung fungal spore method and determined the relationship between spore abundance in sediments (number cm(-2)year(-1)) and herbivore biomass densities (kgha(-1)). To establish this relationship, we used the following: (i) the abundance of Sporormiella spp., Sordaria spp. and Podospora spp. 
spores in modern sediments from ponds and (ii) weekly counts of contemporary wildlife over a period of 5years from the rewilded site, Oostvaardersplassen, in the Netherlands. Results from this study demonstrate that there is a highly significant relationship between spore abundance and local biomass densities of herbivores that can be used in the calibration of fossil records. Mammal biomass density (comprising Konik horses, Heck cattle and red deer) predicts in a highly significant way the abundance of all dung fungal spores amalgamated together. This relationship is apparent at a very local scale (<10m), when the characteristics of the sampled ponds are taken into account (surface area of pond, length of shoreline). In addition, we identify that dung fungal spores are principally transported into ponds by surface run-off from the shores. These results indicate that this method provides a robust quantitative measure of herbivore population size over time. Herbivory Network: An international, collaborative effort to study herbivory in Arctic and alpine ecosystems. Plant-herbivore interactions are central to the functioning of tundra ecosystems, but their outcomes vary over space and time. Accurate forecasting of ecosystem responses to ongoing environmental changes requires a better understanding of the processes responsible for this heterogeneity. To effectively address this complexity at a global scale, coordinated research efforts, including multi-site comparisons within and across disciplines, are needed. The Herbivory Network was established as a forum for researchers from Arctic and alpine regions to collaboratively investigate the multifunctional role of herbivores in these changing ecosystems. One of the priorities is to integrate sites, methodologies, and metrics used in previous work, to develop a set of common protocols and design long-term geographically-balanced, coordinated experiments. The implementation of these collaborative research efforts will also improve our understanding of traditional human-managed systems that encompass significant portions of the sub-Arctic and alpine areas worldwide. A deeper understanding of the role of herbivory in these systems under ongoing environmental changes will guide appropriate adaptive strategies to preserve their natural values and related ecosystem services. (C) 2016 Elsevier B.V. and NIPR. All rights reserved. Biomass allometry for alder, dwarf birch, and willow in boreal forest and tundra ecosystems of far northeastern Siberia and north-central Alaska. Shrubs play an important ecological role in the Arctic system, and there is evidence from many Arctic regions of deciduous shrubs increasing in size and expanding into previously forb or graminoid-dominated ecosystems. There is thus a pressing need to accurately quantify regional and temporal variation in shrub biomass in Arctic regions, yet allometric equations needed for deriving biomass estimates from field surveys are rare. We developed 66 allometric equations relating basal diameter (BD) to various aboveground plant characteristics for three tall, deciduous shrub genera growing in boreal and tundra ecoregions in far northeastern Siberia (Yakutia) and north-central Alaska. We related BD to plant height and stem, branch, new growth (leaves + new twigs), and total aboveground biomass for alder (Alms viridis subsp. crispa and Alms fruticosa), dwarf birch (Betula nana subsp. exilis and divaricata), and willow (Salix spp.). 
The equations were based on measurements of 358 shrubs harvested at 33 sites. Plant height (r(2) = 0.48-0.95), total aboveground biomass (r(2) = 0.46-0.99), and component biomass (r(2) = 0.13-0.99) were significantly (P < 0.01) related to shrub BD. Alder and willow populations exhibited differences in allometric relationships across ecoregions, but this was not the case for dwarf birch. The allometric relationships we developed provide a tool for researchers and land managers seeking to better quantify and monitor the form and function of shrubs across the Arctic landscape. (C) 2014 Elsevier B.V. All rights reserved. Shrub expansion may reduce summer permafrost thaw in Siberian tundra. Climate change is expected to cause extensive vegetation changes in the Arctic: deciduous shrubs are already expanding, in response to climate warming. The results from transect studies suggest that increasing shrub cover will impact significantly on the surface energy balance. However, little is known about the direct effects of shrub cover on permafrost thaw during summer. We experimentally quantified the influence of Betula nana cover on permafrost thaw in a moist tundra site in northeast Siberia with continuous permafrost. We measured the thaw depth of the soil, also called the active layer thickness (ALT), ground heat flux and net radiation in 10 m diameter plots with natural B. nana cover (control plots) and in plots in which B. nana was removed (removal plots). Removal of B. nana increased ALT by 9% on average late in the growing season, compared with control plots. Differences in ALT correlated well with differences in ground heat flux between the control plots and B. nana removal plots. In the undisturbed control plots, we found an inverse correlation between B. nana cover and late growing season ALT. These results suggest that the expected expansion of deciduous shrubs in the Arctic region, triggered by climate warming, may reduce summer permafrost thaw. Increased shrub growth may thus partially offset further permafrost degradation by future temperature increases. Permafrost models need to include a dynamic vegetation component to accurately predict future permafrost thaw. Global assessment of nitrogen deposition effects on terrestrial plant diversity: a synthesis. Atmospheric nitrogen (N) deposition is it recognized threat to plant diversity ill temperate and northern parts of Europe and North America. This paper assesses evidence from field experiments for N deposition effects and thresholds for terrestrial plant diversity protection across a latitudinal range of main categories of ecosystems. from arctic and boreal systems to tropical forests. Current thinking on the mechanisms of N deposition effects on plant diversity, the global distribution of G200 ecoregions, and current and future (2030) estimates of atmospheric N-deposition rates are then used to identify the risks to plant diversity in all major ecosystem types now and in the future. This synthesis paper clearly shows that N accumulation is the main driver of changes to species composition across the whole range of different ecosystem types by driving the competitive interactions that lead to composition change and/or making conditions unfavorable for some species. 
Other effects such its direct toxicity of nitrogen gases and aerosols long-term negative effects of increased ammonium and ammonia availability, soil-mediated effects of acidification, and secondary stress and disturbance are more ecosystem, and site-specific and often play a supporting role. N deposition effects in mediterranean ecosystems have now been identified, leading to a first estimate of an effect threshold. Importantly, ecosystems thought of as not N limited, such as tropical and subtropical systems, may be more vulnerable in the regeneration phase. in situations where heterogeneity in N availability is reduced by atmospheric N deposition, on sandy soils, or in montane areas. Critical loads are effect thresholds for N deposition. and the critical load concept has helped European governments make progress toward reducing N loads on sensitive ecosystems. More needs to be done in Europe and North America. especially for the more sensitive ecosystem types. including several ecosystems of high conservation importance. The results of this assessment Show that the Vulnerable regions outside Europe and North America which have not received enough attention are ecoregions in eastern and Southern Asia (China, India), an important part of the mediterranean ecoregion (California, southern Europe). and in the coming decades several subtropical and tropical parts of Latin America and Africa. Reductions in plant diversity by increased atmospheric N deposition may be more widespread than first thought, and more targeted Studies are required in low background areas, especially in the G200 ecoregions. Meta-analysis of high-latitude nitrogen-addition and warming studies implies ecological mechanisms overlooked by land models. Accurate representation of ecosystem processes in land models is crucial for reducing predictive uncertainty in energy and greenhouse gas feedbacks with the climate. Here we describe an observational and modeling meta-analysis approach to benchmark land models, and apply the method to the land model CLM4.5 with two versions of belowground biogeochemistry. We focused our analysis on the aboveground and belowground responses to warming and nitrogen addition in high-latitude ecosystems, and identified absent or poorly parameterized mechanisms in CLM4.5. While the two model versions predicted similar soil carbon stock trajectories following both warming and nitrogen addition, other predicted variables (e.g., belowground respiration) differed from observations in both magnitude and direction, indicating that CLM4.5 has inadequate underlying mechanisms for representing high-latitude ecosystems. On the basis of observational synthesis, we attribute the model-observation differences to missing representations of microbial dynamics, aboveground and belowground coupling, and nutrient cycling, and we use the observational meta-analysis to discuss potential approaches to improving the current models. However, we also urge caution concerning the selection of data sets and experiments for meta-analysis. For example, the concentrations of nitrogen applied in the synthesized field experiments (average = 72 kg ha(-1) yr(-1)) are many times higher than projected soil nitrogen concentrations (from nitrogen deposition and release during mineralization), which precludes a rigorous evaluation of the model responses to likely nitrogen perturbations. 
Overall, we demonstrate that elucidating ecological mechanisms via meta-analysis can identify deficiencies in ecosystem models and empirical experiments."

    kw_text_rank = [
        list(x) for x in (score_keyphrases_by_text_rank(text, n_keywords=0.05))
    ]
    kw_text_rank = [
        x for x in kw_text_rank
        if not re.search(r'(study|studi|effect|relation)', x[0])
    ]
    kw_text_rank = [
        x for x in kw_text_rank if min([len(i) for i in x[0].split()]) > 3
    ]
    kw_text_rank_scores = [x[1] for x in kw_text_rank]
    my_min = min(kw_text_rank_scores)
    my_max = max(kw_text_rank_scores)
    kw_text_rank = [[x[0], normalize(x[1], my_min, my_max)]
                    for x in kw_text_rank]

    kw_text_rank = [x for x in kw_text_rank if x[1] > 0.01]

    keywords = []
    keywords.extend(kw_rake)
    keywords.extend(kw_text_rank)
    keywords = sorted(keywords, key=lambda x: x[1], reverse=True)

    final_keyword_list = []
    for kw in keywords:

        if kw[0] not in [x[0] for x in final_keyword_list]:
            final_keyword_list.append(kw)

    #final ranked keyword list need to be saved in a json file as well

    return final_keyword_list
Example #12
def getkeys(text):
    r = Rake()
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases_with_scores()
def makeDescription(url):
    lemmatizer = WordNetLemmatizer()
    r = Rake()
    joblinkTarget = BeautifulSoup(urllib.urlopen(url), "html.parser")
    summaryElement = joblinkTarget.find('div', attrs={'id': 'jobdescSec'})
    text = summaryElement.get_text()

    topicFromHTML = joblinkTarget.find('h1', attrs={'class': 'jobTitle'}).text

    topicRake = Rake()
    topicRake.extract_keywords_from_text(topicFromHTML)
    topicExtractor = topicRake.get_ranked_phrases()
    topic = topicExtractor[0]
    if topic.endswith("ineer"):
        topic += "ing"
    elif topic.endswith("oper"):
        topic = topic[:-2] + "ment"
    elif topic.endswith("yst"):
        topic = topic[:-1] + "is"


    listedTech = ""
    r.extract_keywords_from_text(text)

    rankedPhrases = r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest.
    for eachPhrase in rankedPhrases:
        toChange = eachPhrase
        eachPhrase = str(eachPhrase.encode('ascii', 'ignore'))
        rankedPhrases[rankedPhrases.index(toChange)] = eachPhrase

    actionList = ""
    for sent in rankedPhrases:
        content = ne_chunk(pos_tag(word_tokenize(sent)))
        if len(content) > 1 and content[0][1][0] == 'V':
            sentList = sent.split()
            sentList[0] = lemmatizer.lemmatize(content[0][0], 'v')
            rankedPhrases.remove(sent)
            sent = ""
            for i in sentList:
                sent = sent + " " + i
            actionList = actionList + sent + "\n"

    if len(rankedPhrases) > 6:
        for each in range(0, 4):
            listedTech = listedTech + rankedPhrases[each] + ", "
        listedTech += rankedPhrases[4]

    print ("\nTopic: -----------------------------------------------------------------------------------")
    print ("Topics in " + topic + "\n")
    print ("Course Description: ----------------------------------------------------------------------")
    print ("Introduction to topics in " + topic + " such as " + listedTech)

    print ("\nCourse Learning Outcomes: -----------------------------------------------------------------")
    print (actionList)
    keywordsText = []
    print ('Summary:')

    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(text)

    text = ""

    for each in sentences:
        text = text + " " + str(each.encode('ascii','ignore'))

    print (summarize(text))
    print ("\n")

Example #14
def key_extract(mytext):
    r = Rake()
    # mytext = '''Electronic commerce, commonly written as e-commerce, is the trading in products or services using computer networks, such as the Internet. Electronic commerce draws on technologies such as mobile commerce, electronic funds transfer, supply chain management, Internet marketing, onlinetransactionprocessing, electronicdatainterchange (EDI), inv entory management systems, and automated data collection systems. Modern electronic commerce typically uses the World Wide Web for at least one part of the transaction's life cycle, although it may also use other technologies such as e-mail'''
    r.extract_keywords_from_text(mytext)
    return r.get_ranked_phrases()
def handle_article_rake_nltk(text, nb_to_display):
    r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
    r.extract_keywords_from_text(text)
    ranked_words = r.get_ranked_phrases_with_scores()
    pp.pprint(ranked_words[:nb_to_display])  # To get keyword phrases ranked highest to lowest.
Example #16
from rake_nltk import Rake
import json
import spacy
from scipy import stats

r = Rake()  # Used to extract keywords
nlp = spacy.load("en_core_web_lg")  # Word embedding dictionary


# Given a sentence, extracts keywords using rake
def get_keywords(sentence):
    r.extract_keywords_from_text(sentence)
    keywords = r.get_ranked_phrases()
    return keywords


# Compares question to dictionary (previously read with read_book)
def compute_scores_from_index(question, data):
    keywords = get_keywords(question)

    analyzed = analyze(question)
    for d in analyzed:
        keywords.extend(d['Subjects'])
        keywords.extend(d['Verbs'])
        keywords.extend(d['Complements'])
    keywords = list(set(keywords))
    #keywords = [nlp(k) for k in keywords]
    scores = []
    for index in data:
        score = get_score(keywords, (index['title']))
        scores.append(score)
Example #17
def categorize(tweet):
    r = Rake(max_length=MAX_PHRASE_LEN)
    r.extract_keywords_from_text(tweet)
    return r.get_ranked_phrases()
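
# MAX_PHRASE_LEN is assumed to be a module-level constant defined elsewhere; it is not part of
# this snippet. Hypothetical usage:
#   MAX_PHRASE_LEN = 3
#   print(categorize("Just tried the new coffee place downtown, absolutely loved it"))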
Example #18
def recommend_config(request):
    search_query = request.data.get('query', None)
    if not search_query:
        return Response("Invalid search query", status=400)

    corpus = pd.read_excel('data/master_configs.xlsx')
    print('reached...')
    # initializing the new column
    corpus['Key_words'] = ""

    for index, row in corpus.iterrows():
        info = row['Info']

        # instantiating Rake; by default it uses english stopwords from NLTK
        # and discards all punctuation characters as well
        r = Rake()

        # extracting the words by passing the text
        r.extract_keywords_from_text(info)

        # getting the dictionary with key words as keys and their scores as values
        key_words_dict_scores = r.get_word_degrees()

        # assigning the key words via .at so the value persists in the DataFrame
        corpus.at[index, 'Key_words'] = list(key_words_dict_scores.keys())

    # dropping the Plot column
    corpus.drop(columns=['Info'], inplace=True)

    corpus.set_index('Name', inplace=True)

    corpus['bag_of_words'] = ''
    columns = corpus.columns
    for index, row in corpus.iterrows():
        words = ''
        for col in columns:
            if col != 'Key_words':
                words = words + row[col] + ' '
            else:
                words = words + ' '.join(row[col]) + ' '
        corpus.at[index, 'bag_of_words'] = words

    corpus['config_id'] = ''
    columns = corpus.columns
    for index, row in corpus.iterrows():
        words = ''
        for col in columns:
            if col == 'OS' or col == 'Server' or col == 'Controller':
                words = words + row[col] + '|'
        corpus.at[index, 'config_id'] = words

    search_query = _remove_noise(search_query)
    vector1 = text_to_vector(search_query.lower())

    result = {'Rank': [], 'Heading': [], 'Config': [], 'Cosine': []}

    for i in corpus.index:
        config = corpus['bag_of_words'][i]
        config = _remove_noise(config)

        vector2 = text_to_vector(config.lower())

        cosine = get_cosine(vector1, vector2)
        if cosine > 0:
            result['Rank'].append(0)
            result['Heading'].append(i)
            result['Config'].append(corpus['config_id'][i])
            result['Cosine'].append(cosine)

    result_df = pd.DataFrame(data=result)

    result_df = result_df.sort_values('Cosine', ascending=False)

    result_df.to_csv('data/config_op.csv')
    #print(result_df)
    print('Output saved!')

    #convert csv to json
    csvfilename = 'data/config_op.csv'
    jsonfilename = csvfilename.split('.')[0] + '.json'
    csvfile = open(csvfilename, 'r')
    jsonfile = open(jsonfilename, 'w')
    reader = csv.DictReader(csvfile)

    fieldnames = ('Rank', 'Heading', 'Config', 'Cosine')

    output = []

    for each in reader:
        row = {}
        for field in fieldnames:
            row[field] = each[field]
        output.append(row)

    json.dump(output, jsonfile, indent=2, sort_keys=True)
    csvfile.close()
    jsonfile.close()

    #print(result_df)
    print('Output saved!')
    return Response(output)
Example #19
from flask import Flask, render_template, request
from rake_nltk import Rake, Metric
import requests
import numpy as np
import ast, os, nltk, re, db

nltk.download('stopwords')
nltk.download('punkt')
# stop words: set of words to be excluded from consideration while generating keywords
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['http', 'https', '://', '```', '~~~', '///']
stopwords.extend(newStopWords)

rake = Rake(min_length=1,
            max_length=4,
            ranking_metric=Metric.WORD_DEGREE,
            stopwords=stopwords)
cursor = db.connect()
app = Flask(__name__,
            static_folder="../web/dist/static",
            template_folder="../web/dist/")

search_dir = os.path.join(app.root_path, '../../_categories/')
files = os.listdir(search_dir)
files = [os.path.join(search_dir, f) for f in files]  # add path to each file

category_list = []
for file in files:
    slash_pos = file.rfind('/')
    category_string = file[slash_pos + 1:-3]
    if category_string != 'all_links':
Example #20
import json
from rake_nltk import Rake
from nltk.corpus import stopwords

# list.extend() returns None, so build the extended stop word list with + instead
rake = Rake(min_length=2,
            max_length=4,
            stopwords=stopwords.words('english') + ['book', 'review'])


def find_missing_isbn(s):
    # ISBN-10 check digit: weight the reversed digits 2..10 and take (11 - sum mod 11) mod 11;
    # a value of 10 is written as 'X'
    print(s)
    rev_s = reversed(s)
    digits = (int(c) for c in rev_s)
    digits = (d * i for i, d in enumerate(digits, start=2))
    check = (11 - sum(digits) % 11) % 11
    return 'X' if check == 10 else check


def find_full_isbn(s):
    crop_s = s[3:-1]
    return crop_s + str(find_missing_isbn(crop_s))
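
# For example: find_full_isbn("9780306406157") drops the "978" prefix and the ISBN-13 check
# digit, then recomputes the ISBN-10 check digit, giving "0306406152".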


def serialize_meta(meta):
    d = {'title': meta['title'], 'description': meta['description']}
    return d


def serialize_reviews(reviews):
    clean_reviews = []
    for r in reviews:
        print(r)
    def calc(self,mname):
        pd.set_option('display.max_columns', 100)
        df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
        df.head()
        df.shape
        df = df[['Title','Genre','Director','Actors','Plot']]
        df.head()
        df.shape
        # discarding the commas between the actors' full names and getting only the first three names
        df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])

        # putting the genres in a list of words
        df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))

        df['Director'] = df['Director'].map(lambda x: x.split(' '))

        # merging together first and last name for each actor and director, so it's considered as one word 
        # and there is no mix up between people sharing a first name
        for index, row in df.iterrows():
            df.at[index, 'Actors'] = [x.lower().replace(' ', '') for x in row['Actors']]
            df.at[index, 'Director'] = ''.join(row['Director']).lower()
        # initializing the new column
        df['Key_words'] = ""

        for index, row in df.iterrows():
            plot = row['Plot']

            # instantiating Rake; by default it uses english stopwords from NLTK
            # and discards all punctuation characters
            r = Rake()

            # extracting the words by passing the text
            r.extract_keywords_from_text(plot)

            # getting the dictionary with key words and their scores
            key_words_dict_scores = r.get_word_degrees()

            # assigning the key words via .at so the value persists in the DataFrame
            df.at[index, 'Key_words'] = list(key_words_dict_scores.keys())

        # dropping the Plot column
        df.drop(columns = ['Plot'], inplace = True)
        df.set_index('Title', inplace = True)
        df.head()
        df['bag_of_words'] = ''
        columns = df.columns
        for index, row in df.iterrows():
            words = ''
            for col in columns:
                if col != 'Director':
                    words = words + ' '.join(row[col]) + ' '
                else:
                    words = words + row[col] + ' '
            df.at[index, 'bag_of_words'] = words

        df.drop(columns = [col for col in df.columns if col!= 'bag_of_words'], inplace = True)
        df.head()
        # instantiating and generating the count matrix
        count = CountVectorizer()
        count_matrix = count.fit_transform(df['bag_of_words'])

        # creating a Series for the movie titles so they are associated to an ordered numerical
        # list I will use later to match the indexes
        indices = pd.Series(df.index)
        indices[:5]
        # generating the cosine similarity matrix
        cosine_sim = cosine_similarity(count_matrix, count_matrix)
        cosine_sim
        # function that takes in movie title as input and returns the top 10 recommended movies
        def recommendations(title, cosine_sim = cosine_sim):
            rm=""
            try:   
                recommended_movies = []

                # gettin the index of the movie that matches the title
                idx = indices[indices == title].index[0]

                # creating a Series with the similarity scores in descending order
                score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

                # getting the indexes of the 10 most similar movies
                top_10_indexes = list(score_series.iloc[1:11].index)

                # populating the list with the titles of the best 10 matching movies
                for i in top_10_indexes:
                    recommended_movies.append(list(df.index)[i])
                
                for i in range(len(recommended_movies)):
                    rm = rm + "\n" + recommended_movies[i]
            except IndexError:
                rm="Movie not in dataset"
            finally:
                with open("rname.txt","w+") as file:
                    file.seek(0)
                    file.write(rm)
        recommendations(mname)
Example #22
def dollarSA():
    r = Rake()
    # Opens file and reads in training data
    # NB classifier trains using the read in data
    with open("../datasets/trainingData.csv", 'r') as trainingdata:
        classifier = NaiveBayesClassifier(trainingdata, format="csv")
        print("Training Data")
        classifier.show_informative_features(5)

    # Opens file and reads in testing data
    # Prints testing data accuracy
    # Not needed for final product

    with open("../datasets/testingData.csv", 'r') as testingdata:
        print("Testing data accuracy", classifier.accuracy(testingdata))

    with open("dollar.txt", 'r', encoding='utf-8') as a_file:
        for line in a_file:
            userInput = line.strip()

            regex = re.compile('[^a-zA-Z ]')
            punctuationRemoved = regex.sub('', userInput)

            # Defines stopwords
            stop_words = set(stopwords.words('english'))

            # Takes user input, removes stopwords
            word_tokens = word_tokenize(punctuationRemoved)

            # Keeps only the words that are not stop words
            filtered_sentence = [w for w in word_tokens if w not in stop_words]

            # Converts the filtered stop word sentence to string
            stringWithoutStopwords = ' '.join(
                [str(elem) for elem in filtered_sentence])

            # Extracts keywords from the filtered sentence
            r.extract_keywords_from_text(stringWithoutStopwords)

            # Ranks the keywords that have been extracted
            ranked_phrases = r.get_ranked_phrases()

            # Converts extracted keywords list to string
            listToStr = ' '.join([str(elem) for elem in ranked_phrases])

            # Runs string through trained NB classifier
            finalString = TextBlob(listToStr, classifier=classifier)

            # Print string followed by classification
            print(finalString + "," + finalString.classify())
        dict['long_description'].append(fields[3])
        dict['id'].append(regex.sub(" ", fields[4]))

df1 = pd.DataFrame.from_dict(dict)
###############################################################################################################################################
"""
	TRANSLATING THE TEXTS INTO ENGLISH LANGUAGE. TAKING ONLY TITLE AND DESCRIPTION COLUMNS FOR
	EASYNESS. FOR TRANSLATION USING 'GOOGLETRANS' AND 'TEXTBLOB' PACKAGES FOR BETTER PERFORMANCE. THEN EXTRACTING
	KEY PHRASES FROM THEM USING 'RAKE'(NLTK package) AND STORED IN A NEW DATAFRAME COLUMN 'keyword_set'.
"""
columns = ['title', 'description']
temp = []

translator = Translator()
r = Rake(stopwords=stop_words,
         ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
         min_length=2,
         max_length=3)

n = 0
t = 500
dest_t = len(df1)

while n < dest_t:

    for i in range(n, t):
        for j in columns:

            print("df1[j][i] = ", df1[j][i], "i==", i)

            if (len(df1[j][i]) >= 3):
                blob = TextBlob(str(df1[j][i]))
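The translation loop above is cut off mid-body, but the docstring describes the intended flow: translate each title/description cell to English, run RAKE over it, and collect the phrases into a 'keyword_set' column. A minimal sketch of that per-cell step, reusing the translator and r created above; the helper name is illustrative, and translator.translate(...).text is assumed to behave as in typical googletrans releases:

def extract_keyword_set(text, translator, r):
    # Illustrative helper: translate one cell to English, then return its RAKE phrases
    translated = translator.translate(str(text), dest='en').text
    r.extract_keywords_from_text(translated)
    return r.get_ranked_phrases()

# e.g. building the new column from the description cells
# df1['keyword_set'] = df1['description'].apply(lambda x: extract_keyword_set(x, translator, r))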
Example #24
def key_word_analysis(text):
    r = Rake(min_length=2, max_length=4)
    r.extract_keywords_from_text(text)
    keywords = r.get_ranked_phrases()
    return keywords 
def recommend():
    ingredient_list = ""
    if request.method == 'POST':
        brand = request.form['brand']
        print(brand)
        category = request.form['category']
        manufacturer = request.form['manufacturer']
        message = request.form['message']

        #Appending new data
        ingredient_list += brand + " " + category + " " + manufacturer + " " + message
        print(ingredient_list)

        pd.set_option('display.max_columns', 10)
        df = pd.read_csv('Food_Ingredients.csv', encoding='latin-1')

        print('=' * 90)

        #Choosing the Vector features to base my recommendations on
        print(
            '---------------------------FEATURE VECTOR TABLE---------------------------'
        )
        df = df[[
            'brand', 'categories', 'features.value', 'manufacturer', 'name'
        ]]

        df = pd.DataFrame(np.array([['new', ingredient_list]]),
                          columns=['name',
                                   'features.value']).append(df,
                                                             ignore_index=True,
                                                             sort=True)
        # df.loc[0].replace(np.nan, "a")
        df = df.fillna("")  # fillna returns a new DataFrame, so assign it back
        print(df.head())
        print(df.shape)

        print(
            '--------------------Feature Vector Information--------------------'
        )
        print(df.info())
        print()

        print('=' * 90)

        print(
            "--------------------LIST OF UNIQUE VALUES IN THE FEATURE VECTORS--------------------"
        )
        unique_values_brand = df['brand'].unique().tolist()
        unique_values_categories = df['categories'].unique().tolist()
        unique_values_ingredients = df['features.value'].unique().tolist()
        unique_values_manufacturer = df['manufacturer'].unique().tolist()

        print('=' * 90)

        print()
        print(
            "--------------------UNIQUE VALUES FOR THE FEATURE VECTORS--------------------"
        )
        print('Brands: ', unique_values_brand)
        print('Categories: ', unique_values_categories)
        print('Ingredients: ', unique_values_ingredients)
        print('Manufacturers: ', unique_values_manufacturer)

        print('=' * 90)

        print()
        print(
            '--------------------NUMBER OF UNIQUE VALUES PER FEATURE VECTOR--------------------'
        )
        print("Brands: ", len(unique_values_brand))
        print('Categories: ', len(unique_values_categories))
        print('Ingredients: ', len(unique_values_ingredients))
        print('Manufacturers: ', len(unique_values_manufacturer))

        print('=' * 90)

        print()
        print(
            '--------------------FEATURE VECTORS UNIQUE VALUES COUNT(TOP 20)\n--------------------'
        )
        print("BRANDS: \n", df['brand'].value_counts().head(20))
        print()
        print('CATEGORIES: \n', df['categories'].value_counts().head(20))
        print()
        # print('INGREDIENTS: \n', df['features.value'].value_counts())
        print()
        print('MANUFACTURERS: \n', df['manufacturer'].value_counts().head(20))
        print()
        # print('NAMES: \n', df['name'].encode("utf-8").value_counts().head(20 ))
        print()

        print('=' * 90)

        #Transforming the values of the FV individual columns into single words so they are considered as unique values.

        # splitting the comma-separated categories and manufacturers and keeping only the first three entries
        df['categories'] = df['categories'].astype('str')
        df['categories'] = df['categories'].map(lambda x: x.split(',')[:3])
        df['manufacturer'] = df['manufacturer'].astype('str')
        df['manufacturer'] = df['manufacturer'].map(lambda x: x.split(',')[:3])
        df['brand'] = df['brand'].astype('str')
        df['brand'] = df['brand'].map(lambda x: x.split(' '))

        # print('huh??')
        # print(df['categories'].head())

        print('=' * 90)
        print()

        # lower-casing each category and manufacturer entry and removing internal spaces,
        # so multi-word values are treated as single tokens; brand is collapsed the same way

        for index, row in df.iterrows():
            row['categories'] = [
                x.lower().replace(' ', '') for x in row['categories']
            ]
            row['manufacturer'] = [
                x.lower().replace(' ', '') for x in row['manufacturer']
            ]
            row['brand'] = ''.join(row['brand']).lower()

        print(df['categories'].head())
        print(df['brand'].head())
        print(df['manufacturer'].head())

        print('=' * 90)

        # initializing the new column
        df['Key_words'] = ""
        df['features.value'] = df['features.value'].astype('str')

        for index, row in df.iterrows():
            ingredients = row['features.value']

            # instantiating Rake; by default it uses English stopwords from NLTK
            # and discards all punctuation characters
            r = Rake()

            # extracting the words by passing the text
            r.extract_keywords_from_text(ingredients)

            # getting the dictionary with key words and their degree scores
            key_words_dict_scores = r.get_word_degrees()

            # assigning the key words to the new column
            row['Key_words'] = list(key_words_dict_scores.keys())

        # dropping the Features.value column
        df.drop(columns=['features.value'], inplace=True)

        df.set_index('name', inplace=True)
        print(df.head())

        print('=' * 90)

        df['bag_of_words'] = ''
        columns = df.columns
        for index, row in df.iterrows():
            words = ''
            for col in columns:
                if col != 'brand':
                    words = words + ' '.join(row[col]) + ' '
                else:
                    words = words + row[col] + ' '
            row['bag_of_words'] = words

        df.drop(columns=[col for col in df.columns if col != 'bag_of_words'],
                inplace=True)
        print('bag of word right?')
        # df = pd.DataFrame(np.array([['new','just a joke']]), columns=['name','bag_of_words']).append(df, ignore_index=True)
        # print('MY OWN RATINGS')
        # f = df.loc[0].dropna()
        # print(f.head())

        print(df.shape)
        print(df.head())

        # instantiating and generating the count matrix
        count = CountVectorizer()
        count_matrix = count.fit_transform(df['bag_of_words'])

        # creating a Series of the food names so each is associated with an ordered numerical
        # index, used later to map similarity scores back to names
        indices = pd.Series(df.index)
        print('is thiss??')
        print(indices[:5])

        # generating the cosine similarity matrix
        cosine_sim = cosine_similarity(count_matrix, count_matrix)
        print("cosine simm")
        print(cosine_sim)

        # function that takes a food name as input and returns the top 10 recommended foods
        def recommendations(title, cosine_sim=cosine_sim):

            recommended_food = []

            # getting the index of the food that matches the name
            idx = indices[indices == title].index[0]

            # creating a Series with the similarity scores in descending order
            score_series = pd.Series(
                cosine_sim[idx]).sort_values(ascending=False)

            # getting the indexes of the 10 most similar foods
            top_10_indexes = list(score_series.iloc[1:11].index)

            # populating the list with the names of the best 10 matching foods
            for i in top_10_indexes:
                recommended_food.append(list(df.index)[i])
            print("wait")
            for r in range(len(recommended_food)):
                print(recommended_food[r])
            return recommended_food

        foods = recommendations('new')
        comma_separated = ','.join(foods)
        return redirect(url_for('display_recommendations', food=comma_separated))
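Stripped of the diagnostic prints, the recommendation core of the route above is: build one bag-of-words string per item, vectorize with CountVectorizer, and rank rows by cosine similarity against the query row. A self-contained sketch of just that core, with made-up items standing in for the 'bag_of_words' column (scikit-learn and pandas assumed available):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-in for the 'bag_of_words' column built above
items = pd.Series(
    ["oat flour honey almond", "wheat flour sugar cocoa", "almond honey oat bar"],
    index=["granola", "cookie", "snack bar"],
)

count_matrix = CountVectorizer().fit_transform(items)
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Rank everything against the first item and drop the item itself (similarity 1.0)
scores = pd.Series(cosine_sim[0], index=items.index).sort_values(ascending=False)
print(scores.iloc[1:])  # most similar items first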
Example #26
    def readfile(self, path):
        with open(path) as f:
            n = self.start
            rake = Rake()
            while True:
                line = self.readblock(f)
                #check new record starts
                if line.startswith('%d. ' % n):
                    data = {}
                    data['seqid'] = n
                    #get time
                    data['time'] = line.split('.')[2].split(';')[0].strip()
                    #get title
                    data['title'] = self.readblock(f).replace('\n',
                                                              ' ').strip()
                    #get author
                    data['author'] = self.readblock(f).replace('\n',
                                                               ' ').strip()
                    #detect affiliation
                    line = self.readblock(f)
                    if line.startswith('Author information:'):
                        data['affiliation'] = line.replace('\n', ' ').strip()
                        countries = self.findcountry(data['affiliation'])
                        if countries:
                            data['country'] = countries
                        else:
                            #could not find country info, skip it
                            LOGGER.info(
                                'could not find country info: of paper id: [%d], [%s]'
                                % (int(n), data['affiliation']))
                            n += 1
                            continue
                        line = self.readblock(f)
                    #abstract
                    while line.startswith('Comment'):
                        line = self.readblock(f)
                    #no abstract
                    if line.find('DOI:') >= 0 or line.find(
                            'PMID:') >= 0 or line.find('PMCID:') >= 0:
                        data['DOI'] = line.replace('\n', ' ').replace(
                            '[Indexed for MEDLINE]', '').strip()
                        #skip this record as it does not have abstract
                        n += 1
                        continue
                    else:
                        data['abstract'] = line.replace('\n', ' ').strip()
                        #abstract length is too short
                        if len(data['abstract']) < self.minimum_length:
                            n += 1
                            continue
                        line = self.readblock(f)
                        #skip all other elements until encountering DOI
                        while line.find('DOI:') == -1 and line.find(
                                'PMID:') == -1 and line.find('PMCID:') == -1:
                            line = self.readblock(f)
                        data['DOI'] = line.replace('\n', ' ').replace(
                            '[Indexed for MEDLINE]', '').strip()
                        n += 1

                        #here data is ready for processing
                        rake.extract_keywords_from_text(data['abstract'])
                        words = rake.get_ranked_phrases()
                        data['keywords'] = words[:2]

                        LOGGER.debug(data)
                        LOGGER.info('starts saving id: %d data into mongodb' %
                                    int(n - 1))
                        self.mongo.insert([data], self.collection)
                #record ends
                if n > self.max_items:
                    break
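Within the parser above, the keyword step itself is just two RAKE calls on the abstract text. A small standalone illustration of what data['keywords'] ends up holding, using a made-up abstract string:

from rake_nltk import Rake

rake = Rake()
abstract = ("We evaluate a machine learning model for early sepsis prediction "
            "using routinely collected vital signs from intensive care units.")
rake.extract_keywords_from_text(abstract)
print(rake.get_ranked_phrases()[:2])  # the two highest-ranked phrases, as stored in data['keywords']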
Example #27

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


args = parser.parse_args()
rake = Rake(min_length=args.min_keyw_length,
            max_length=args.max_keyw_length,
            ranking_metric=Metric.WORD_DEGREE)

for page in range(int(args.num_pages)):
    if not args.default_sort and not args.answers_sort:
        req = requests.get(
            'http://answers.gazebosim.org/questions/scope:all/sort:votes-desc/page:'
            + str(page + 1) + '/')
        print("Entries sorted based on votes: ")
    elif args.answers_sort:
        print("Entries sorted based on most-answered: ")
        req = requests.get(
            'http://answers.gazebosim.org/questions/scope:all/sort:answers-desc/page:'
            + str(page + 1) + '/')
    elif args.default_sort:
        print("Entries sorted based on activity: ")
        for row in content:
            if row[18] == '' or len(row[18]) != 2:
                continue
            elif row[18] not in reviews.keys():
                reviews[row[18]] = ''
                cnt[row[18]] = 0
            if cnt[row[18]] >= 3000:
                continue  # skip further rows once 3000 have been collected for this key
            cnt[row[18]] += 1
            reviews[row[18]] = reviews[row[18]] + row[14]

print(reviews.keys())

for j in reviews.keys():
    r = Rake(
        ranking_metric=Metric.WORD_DEGREE, max_length=4
    )  # Uses English stopwords from NLTK, and all punctuation characters.
    r.extract_keywords_from_text(reviews[j])  # returns None; ranked phrases are fetched below
    phrases = r.get_ranked_phrases(
    )  # To get keyword phrases ranked highest to lowest.
    scores = r.get_ranked_phrases_with_scores(
    )  # To get keyword phrases with scores

    # print(len(scores),len(phrases))
    i = 0
    for score in scores:
        if score[0] < 20:
            del phrases[i:]  # drop every phrase from the first one scoring below 20
            break
        i += 1
    # print(phrases)
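The index-and-delete loop above truncates the phrase list at the first score below 20. Because get_ranked_phrases_with_scores() returns (score, phrase) pairs sorted from highest to lowest, the same cut can be written as a single comprehension; a sketch with the same threshold, reusing the r from the loop body:

# Equivalent filtering: keep only phrases whose RAKE score is at least 20
phrases = [phrase for score, phrase in r.get_ranked_phrases_with_scores() if score >= 20]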
Example #29
def InitNLPRake():
    nlp = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO, max_length=6)
    #nlp=Rake(StopWords);
    return nlp
Example #30
def find_keyword(terms):
    r = Rake(min_length=1, max_length=2)
    r.extract_keywords_from_text(terms)
    result = r.get_ranked_phrases()
    return result
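A quick usage sketch for find_keyword with a made-up input string: because min_length=1 and max_length=2, candidate phrases longer than two words are discarded and only short phrases come back.

terms = "organic cold brew coffee with oat milk"
print(find_keyword(terms))  # e.g. ['oat milk']; the four-word phrase exceeds max_length and is dropped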