Code example #1
def initialize_knowledgebase():
    # entities is a dict
    global entities, all_mentions, most_freq
    entities = util.get_entities()
    print('Entities Loaded!')
    entities = util.split_dataframe(entities, col='article')
    print('Entities Dictionary Built!')
    all_mentions = util.get_mentions()
    print('Mentions Dictionary Loaded!')
    most_freq = util.get_most_freq_entities()
    #most_freq.set_index('mention', inplace=True)
    print('Most Frequent Entities Dictionary Loaded!')
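Note: the commented-out set_index call above suggests that most_freq maps each mention to its most frequently linked entity, i.e. the standard most-frequent-entity baseline for disambiguation. The snippet below is a minimal, hypothetical sketch of that lookup; the actual structure returned by util.get_most_freq_entities() is not shown in this excerpt.

# Hypothetical sketch: assumes most_freq behaves like {mention: entity};
# util.get_most_freq_entities()'s real return type is not shown here.
def most_frequent_entity(mention, most_freq):
    """Return the entity most often linked from `mention`, or None if unseen."""
    return most_freq.get(mention)

print(most_frequent_entity('Paris', {'Paris': 'Paris,_France'}))  # Paris,_France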
Code example #2
def get_recommendation():
    try:
        api = dbp['api']
    except Exception as e:
        dbp.clear()
        # non-Twitter user: recommend the top stories in each category
        gen_user = db.news.find().sort("publishedAt", -1).limit(180)
        return dumps(gen_user)
    recom_scoredList = []
    try:
        friend_list = save_friendList(api, POST_USERNAME)  # get the user's friend list
        tweets = get_tweets(api, POST_USERNAME)  # get the user's tweets
        liked = get_likes(api, POST_USERNAME)  # get the user's likes
        tweets.extend(liked)  # single collection of likes, tweets and retweets
        document = " ".join(tweets)  # join the list into one space-separated string
        document = nlp(document)  # run the spaCy pipeline for preprocessing
        most_common = get_mostCommon(document)  # most common keywords according to spaCy
        entities = get_entities(document)  # extract the user's entities
        most_common.extend(entities)  # add entities to the most common keywords
        processedWords = list(set(most_common))  # single unique list of all keywords
        save_uniqueWords(processedWords, POST_USERNAME)  # save the words in the database
        tweet_intrest = get_tweetIntrest(tweets)  # calculate user interest from tweets
        final_intrest_category, normCounts = get_normIntrest(tweet_intrest)  # normalized counts per category
        final_intrest_category = list(set(final_intrest_category))  # unique list of interest categories
        save_tweetIntrest(final_intrest_category, POST_USERNAME)  # save the user's interests to the database
        profile_list_combined = final_intrest_category + processedWords  # combined list of categories and processed words
        save_profile(profile_list_combined, POST_USERNAME)  # save the user profile
        result_set = get_collKeywords(topN, POST_USERNAME)  # keywords extracted from similar user profiles
        app_likes = get_appLikes(POST_USERNAME)  # keywords of articles the user liked in the prime application
        app_saved = get_appsaved(POST_USERNAME)  # keywords of articles the user saved in the prime application
        final_search_list = list(result_set) + processedWords + app_likes + app_saved  # final list to search in the database
        # cold-start user: location and time zone enrich the search list
        if dbp['acc_location'] is not None and dbp['acc_location'] != "":
            final_search_list.extend(dbp['acc_location'].split(","))
        if dbp['time_zone'] is not None and dbp['time_zone'] != "":
            final_search_list.extend(dbp['time_zone'].split(","))
        search_kwlst = set([i.lower() for i in final_search_list])  # final keyword set searched in the database
        recom_list = db.news.find({"keywords": {"$in": list(search_kwlst)}})  # database search query
        recom_scoredList = assign_score(recom_list, normCounts)
        if not recom_scoredList:  # if location and time zone do not solve the cold start, fall back to top stories
            recom_scoredList = db.news.find().sort("publishedAt", -1).limit(180)  # top stories
    except Exception as e:
        print(e)
        return "Sorry, we are not able to process your request at this time"  # in case of an exception
    return dumps(recom_hybridarticles(recom_scoredList, POST_USERNAME))
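Note: assign_score is not included in this excerpt, so its actual logic is unknown. The following is a purely hypothetical sketch of how matched articles could be ranked by weighting their keywords with the normalized interest counts (normCounts); the function name, data shapes, and demo data are illustrative, not the application's real API.

# Hypothetical sketch only: assign_score's real implementation is not shown above.
def score_articles(articles, norm_counts):
    """Return articles sorted by the summed weight of their matching keywords.

    articles    -- iterable of dicts, each with 'title' and 'keywords' keys
    norm_counts -- dict mapping a keyword/category to a normalized frequency
    """
    scored = []
    for article in articles:
        keywords = [k.lower() for k in article.get('keywords', [])]
        score = sum(norm_counts.get(k, 0.0) for k in keywords)
        if score > 0:
            scored.append((score, article))
    # highest score first; sort only on the score so tied dicts are never compared
    return [a for _, a in sorted(scored, key=lambda pair: pair[0], reverse=True)]


demo_articles = [
    {'title': 'Transfer rumours', 'keywords': ['sports', 'football']},
    {'title': 'Market update', 'keywords': ['business']},
]
demo_counts = {'sports': 0.6, 'business': 0.4}
for article in score_articles(demo_articles, demo_counts):
    print(article['title'])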
Code example #3
import re
import util
import settings
import pandas as pd
import nlp_util as nlp
import read_data as data

# get all linked entities of Wikipedia
entities = util.get_entities(chunks=True)

# DataFrame to store stats
Graph = pd.DataFrame(columns=['article', 'entity', 'mention'], dtype='unicode')

i = 0
# Iterate over Wikipedia articles
# and find mentions & entities
# and store stats
with open(settings.PATH_ARTICLES, 'rb') as a:

    for article in data.iter_annotations(a):

        # get the linked entities within the article
        article_entities = entities[entities.article == article.page_id]

        # get the article content
        article_body = article.to_string()

        # print article name
        #print(article.page_name, ' ', i, '\n')

        # save the article
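Note: this example is cut off before the mention search and the stats collection. The core matching step, which also appears in code example #8, is a word-boundary regex over the article text; the minimal, self-contained sketch below illustrates it with made-up mentions and article text.

import re

# Minimal illustration of the mention-matching step (cf. code example #8):
# each known surface form is searched with a word-boundary regex, so that
# 'Paris' does not match inside 'Parisian'; longer mentions are tried first.
mentions = ['Paris', 'France', 'Eiffel Tower']
article_body = 'The Eiffel Tower stands in Paris, the capital of France.'

for mention in sorted(mentions, key=len, reverse=True):
    for match in re.finditer(r'\b{}\b'.format(re.escape(mention)), article_body):
        print(mention, match.start())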
Code example #4
# -*- coding: utf-8 -*-
"""
@author: Abdulrahman Bres
"""

import re
import util
import pickle
import settings
import nlp_util as nlp
import pandas as pd

# load articles & entities
entities = util.get_entities()
print('Entities Loaded!')

articles = pickle.load(open(settings.PATH_KB + 'gs_articles.pickle', 'rb'))

gold_standard = pd.DataFrame(columns=[
    'article', 'mention', 'used_entity', 'entity', 'entity_id', 'offset',
    'sentence', 'the_sentence'
],
                             dtype='unicode',
                             index=None)


def annotate(article):

    annotations = pd.DataFrame(columns=[
        'article', 'level', 'mention', 'used_entity', 'entity', 'entity_id',
        'offset', 'sentence', 'the_sentence'
Code example #5
def initialize_entities():

    global entities
    entities = util.get_entities()
    entities = util.split_dataframe(entities)
    print('Entities loaded!')
Code example #6
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import read_data as data
import settings
import util

# counters
articles_count = 0
paragraphs_count = 0

# get all entities from Wikipedia into a DataFrame
entities = util.get_entities(True)

# get all paragraphs of Wikipedia into a DataFrame
paragraphs = util.get_paragraphs()

# data structures for output
Articles_Paragraphs_Entities = pd.DataFrame(columns=[
    'Article', 'Paragraph', 'Paragraph Linked Entities', 'Found Entities'
],
                                            index=None)
Articles_Recall_Values = pd.DataFrame(
    columns=['Article', 'Macro Recall', 'Micro Recall'], index=None)

Macro_Recall_st_errors = []
Micro_Recall_st_errors = []

s = 0
# cutoffs for calculating the standard error values
Code example #7
def data_initialize():
    global entities
    entities = util.get_entities(chunks=True)
    print('Entities loaded!')
Code example #8
def get_annotations(gold_standard):

    entities = util.get_entities()
    all_mentions = util.get_mentions()
    global most_freq
    most_freq = util.get_most_freq_entities()

    annotations = pd.DataFrame(columns=[
        'article', 'mention', 'entity', 'entity_id', 'offset', 'sentence',
        'the_sentence'
    ],
                               dtype='unicode',
                               index=None)

    for article in gold_standard.articles:

        print(article.title)

        anno = pd.DataFrame(columns=[
            'article', 'mention', 'entity', 'entity_id', 'offset', 'sentence',
            'the_sentence'
        ],
                            dtype='unicode',
                            index=None)

        # search for mentions of the article entities

        article_entities = entities.loc[
            entities.article == article.title.replace(' ', '%20'), 'entity']

        mentions = []

        try:
            # collect the known surface forms for each entity and flatten them
            # into one list (reduce comes from functools in Python 3)
            mentions.extend(map(all_mentions.get, article_entities))
            mentions = filter(None, mentions)
            mentions = reduce(lambda x, y: x + y, mentions)
        except Exception:
            pass

        # longest mentions first, so longer surface forms win over their substrings
        mentions = sorted(mentions, key=len, reverse=True)

        for mention in mentions:
            for match in re.finditer(r'\b{}\b'.format(re.escape(mention)),
                                     article.text):
                entity = disambiguate(None, match.group())
                anno.loc[len(anno.index)] = [
                    article.title,
                    match.group(), entity,
                    nlp.get_entity_id(entity),
                    match.start(), -1, None
                ]

        # map offsets to sentences
        sentences_spans = []
        tokenized_sents = nlp.get_sentences(article.text)
        for sentence in nlp.get_sentences_spans(article.text, tokenized_sents):
            sentences_spans.append(sentence)

        anno = util.sorted_dataframe(anno, anno.offset, True)
        anno[['sentence', 'the_sentence']] = pd.DataFrame(
            list(anno['offset'].map(
                lambda x: nlp.get_sentence_number(sentences_spans, x))))

        annotations = annotations.append(anno)

    return annotations
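Note: nlp_util's get_sentences_spans and get_sentence_number helpers are not included in this excerpt. The sketch below shows one plausible way to map a character offset back to the sentence that contains it, which is what the loop above needs when filling the sentence and the_sentence columns; it is an assumption about those helpers, not their actual implementation.

# Hypothetical sketch: the real nlp_util helpers are not shown in this excerpt.
def sentence_spans(text, sentences):
    """Yield (start, end, sentence) character spans for each sentence, in order."""
    cursor = 0
    for sentence in sentences:
        start = text.find(sentence, cursor)
        if start == -1:
            continue
        end = start + len(sentence)
        yield start, end, sentence
        cursor = end


def sentence_number(spans, offset):
    """Return (index, sentence) for the span containing offset, or (-1, None)."""
    for index, (start, end, sentence) in enumerate(spans):
        if start <= offset < end:
            return index, sentence
    return -1, None


text = "Paris is in France. Berlin is in Germany."
spans = list(sentence_spans(text, ["Paris is in France.", "Berlin is in Germany."]))
print(sentence_number(spans, text.index("Berlin")))  # (1, 'Berlin is in Germany.')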