def initialize_knowledgebase():
    # entities is a dict
    global entities, all_mentions, most_freq
    entities = util.get_entities()
    print('Entities Loading!')
    entities = util.split_dataframe(entities, col='article')
    print('Entities Dictionary Built!')
    all_mentions = util.get_mentions()
    print('Mentions Dictionary Loaded!')
    most_freq = util.get_most_freq_entities()
    #most_freq.set_index('mention', inplace=True)
    print('Most Frequent Entities Dictionary Loaded!')
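

# Illustrative sketch (not part of the original module): based on how the globals above
# are used elsewhere in this repository, a lookup against the initialized knowledge base
# is assumed to work roughly as below. The shapes are assumptions: entities as a dict of
# per-article DataFrames keyed by the 'article' column, all_mentions as an
# entity -> list-of-surface-forms dict; lookup_article_mentions itself is a hypothetical
# helper, not an existing function.
def lookup_article_mentions(article_title):
    """Collect the known surface forms for all entities linked from one article."""
    article_df = entities.get(article_title.replace(' ', '%20'))
    if article_df is None:
        return []
    mentions = []
    for entity in article_df['entity']:
        mentions.extend(all_mentions.get(entity, []))
    # longest mentions first, so longer matches are preferred during annotation
    return sorted(set(mentions), key=len, reverse=True)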
def get_recommendation():
    try:
        api = dbp['api']
    except Exception as e:
        dbp.clear()
        # non-Twitter user: recommend the top stories in each category
        gen_user = db.news.find().sort("publishedAt", -1).limit(180)
        return dumps(gen_user)

    recom_scoredList = []
    try:
        friend_list = save_friendList(api, POST_USERNAME)  # get the user's friend list
        tweets = get_tweets(api, POST_USERNAME)  # get the user's tweets
        liked = get_likes(api, POST_USERNAME)  # get the user's likes
        tweets.extend(liked)  # make a single collection of likes, tweets and retweets
        document = " ".join(tweets)  # convert the list to a space-separated string
        document = nlp(document)  # pass the string to the spaCy nlp function for preprocessing
        most_common = get_mostCommon(document)  # get the most common keywords (spaCy)
        entities = get_entities(document)  # get the user's entities
        most_common.extend(entities)  # add the entities to the most common keywords
        processedWords = list(set(most_common))  # single unique list of all keywords
        save_uniqueWords(processedWords, POST_USERNAME)  # save the words in the database
        tweet_intrest = get_tweetIntrest(tweets)  # calculate the user's interests from tweets
        final_intrest_category, normCounts = get_normIntrest(tweet_intrest)  # counts dictionary of tweets
        final_intrest_category = list(set(final_intrest_category))  # unique list of calculated category interests
        save_tweetIntrest(final_intrest_category, POST_USERNAME)  # the user's interests are saved to the database
        profile_list_combined = final_intrest_category + processedWords  # combined list of categories and processed words
        save_profile(profile_list_combined, POST_USERNAME)  # the user profile is saved
        result_set = get_collKeywords(topN, POST_USERNAME)  # similar user profile keywords are extracted
        app_likes = get_appLikes(POST_USERNAME)  # keywords of articles the user liked in the prime application
        app_saved = get_appsaved(POST_USERNAME)  # keywords of articles the user saved in the prime application
        final_search_list = list(result_set) + processedWords + app_likes + app_saved  # final list to search in the database

        # cold-start user: the location and time zone help to update the search list
        if dbp['acc_location'] is not None and dbp['acc_location'] != "":
            final_search_list.extend(dbp['acc_location'].split(","))
        if dbp['time_zone'] is not None and dbp['time_zone'] != "":
            final_search_list.extend(dbp['time_zone'].split(","))

        search_kwlst = set([i.lower() for i in final_search_list])  # final keyword list searched in the database
        recom_list = db.news.find({"keywords": {"$in": list(search_kwlst)}})  # database search query
        recom_scoredList = assign_score(recom_list, normCounts)
        if not recom_scoredList:
            # if the cold-start problem is not solved by the location and time zone fields, give top stories
            recom_scoredList = db.news.find().sort("publishedAt", -1).limit(180)
    except Exception as e:
        print(e)
        return "Sorry We are not able to process your request this Time"  # in case of an exception
    return dumps(recom_hybridarticles(recom_scoredList, POST_USERNAME))
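

# Illustrative sketch (not from the original source): assign_score() is called above but
# defined elsewhere. Assuming it ranks the retrieved articles by the user's normalised
# category interest counts (normCounts, a category -> weight dict), a minimal version
# could look like the hypothetical function below; the 'category' field name and the
# fallback weight are assumptions.
def assign_score_sketch(recom_list, normCounts):
    scored = []
    for article in recom_list:
        # weight each candidate article by the user's interest in its category,
        # using a small default weight for categories the user never tweeted about
        weight = normCounts.get(article.get('category', ''), 0.1)
        scored.append((weight, article))
    # highest-weighted articles first
    return [article for weight, article in sorted(scored, key=lambda pair: pair[0], reverse=True)]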
import re
import util
import settings
import pandas as pd
import nlp_util as nlp
import read_data as data

# get all linked entities of Wikipedia
entities = util.get_entities(chunks=True)

# DataFrames to store stats
Graph = pd.DataFrame(columns=['article', 'entity', 'mention'], dtype='unicode')

i = 0

# Iterate over Wikipedia articles, find mentions & entities, and store stats
with open(settings.PATH_ARTICLES, 'rb') as a:
    for article in data.iter_annotations(a):
        # get the linked entities within the article
        article_entities = entities[entities.article == article.page_id]
        # get the article content
        article_body = article.to_string()
        # print article name
        #print(article.page_name, ' ', i, '\n')
        # save the article
# -*- coding: utf-8 -*-
"""
@author: Abdulrahman Bres
"""

import re
import util
import pickle
import settings
import nlp_util as nlp
import pandas as pd

# load articles & entities
entities = util.get_entities()
print('Entities Loaded!')
articles = pickle.load(open(settings.PATH_KB + 'gs_articles.pickle', 'rb'))

gold_standard = pd.DataFrame(columns=[
    'article', 'mention', 'used_entity', 'entity', 'entity_id', 'offset',
    'sentence', 'the_sentence'
], dtype='unicode', index=None)


def annotate(article):
    annotations = pd.DataFrame(columns=[
        'article', 'level', 'mention', 'used_entity', 'entity', 'entity_id',
        'offset', 'sentence', 'the_sentence'
def initialize_entities():
    global entities
    entities = util.get_entities()
    entities = util.split_dataframe(entities)
    print('Entities loaded!')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import read_data as data
import settings
import util

# counters
articles_count = 0
paragraphs_count = 0

# get all entities from Wikipedia into a DataFrame
entities = util.get_entities(True)

# get all paragraphs of Wikipedia into a DataFrame
paragraphs = util.get_paragraphs()

# data structures for output
Articles_Paragraphs_Entities = pd.DataFrame(columns=[
    'Article', 'Paragraph', 'Paragraph Linked Entities', 'Found Entities'
], index=None)
Articles_Recall_Values = pd.DataFrame(
    columns=['Article', 'Macro Recall', 'Micro Recall'], index=None)
Macro_Recall_st_errors = []
Micro_Recall_st_errors = []

s = 0

# cutoffs for calculating the standard error values
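

# Illustrative sketch (not part of the original script): the actual recall computation
# happens further down in this file. For a single article, macro recall is assumed to
# average the per-paragraph recall values, while micro recall pools all paragraphs
# before dividing; recall_sketch() is a hypothetical helper written only to make that
# distinction concrete.
def recall_sketch(paragraph_rows):
    """paragraph_rows: iterable of (linked_entities, found_entities) pairs per paragraph."""
    per_paragraph = []
    total_hits, total_linked = 0, 0
    for linked, found in paragraph_rows:
        linked, found = set(linked), set(found)
        if linked:
            hits = len(found & linked)
            per_paragraph.append(hits / len(linked))
            total_hits += hits
            total_linked += len(linked)
    macro = sum(per_paragraph) / len(per_paragraph) if per_paragraph else 0.0
    micro = total_hits / total_linked if total_linked else 0.0
    return macro, micro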
def data_initialize():
    global entities
    entities = util.get_entities(chunks=True)
    print('Entities loaded!')
def get_annotations(gold_standard):
    entities = util.get_entities()
    all_mentions = util.get_mentions()
    global most_freq
    most_freq = util.get_most_freq_entities()

    annotations = pd.DataFrame(columns=[
        'article', 'mention', 'entity', 'entity_id', 'offset', 'sentence',
        'the_sentence'
    ], dtype='unicode', index=None)

    for article in gold_standard.articles:
        print(article.title)
        anno = pd.DataFrame(columns=[
            'article', 'mention', 'entity', 'entity_id', 'offset', 'sentence',
            'the_sentence'
        ], dtype='unicode', index=None)

        # search for mentions of the article entities
        article_entities = entities.loc[
            entities.article == article.title.replace(' ', '%20'), 'entity']

        mentions = []
        try:
            mentions.extend(map(all_mentions.get, article_entities))
            mentions = filter(None, mentions)
            mentions = reduce(lambda x, y: x + y, mentions)
        except:
            pass

        mentions = sorted(mentions, key=len)[::-1]

        for mention in mentions:
            for match in re.finditer(r'\b{}\b'.format(re.escape(mention)),
                                     article.text):
                entity = disambiguate(None, match.group())
                anno.loc[len(anno.index)] = [
                    article.title,
                    match.group(), entity,
                    nlp.get_entity_id(entity),
                    match.start(), -1, None
                ]

        # map offsets to sentences
        sentences_spans = []
        tokenized_sents = nlp.get_sentences(article.text)
        for sentence in nlp.get_sentences_spans(article.text, tokenized_sents):
            sentences_spans.append(sentence)

        anno = util.sorted_dataframe(anno, anno.offset, True)
        anno[['sentence', 'the_sentence']] = pd.DataFrame(
            list(anno['offset'].map(
                lambda x: nlp.get_sentence_number(sentences_spans, x))))

        annotations = annotations.append(anno)

    return annotations
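

# Illustrative sketch (not from the original source): disambiguate() is called above with
# a None context, which suggests a most-frequent-entity fallback. most_freq is assumed to
# be a DataFrame with 'mention' and 'entity' columns (see the commented-out
# set_index('mention') call elsewhere in this repository); disambiguate_sketch() is a
# hypothetical stand-in, not the project's actual implementation.
def disambiguate_sketch(context, mention):
    if context is None:
        candidates = most_freq.loc[most_freq.mention == mention, 'entity']
        if not candidates.empty:
            return candidates.iloc[0]  # the entity most frequently linked from this surface form
    return mention  # fall back to the mention text itself when nothing better is known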