Example #1
0
def rank_wiki_entities(wikipage):

    print wikipage

    # gets the current wiki innerlinks
    _wikipage_entities = wikipage_entities.get_current_wiki_links(wikipage)

    entities_rank = defaultdict(float)

    # 1- Rank Entities based on backlins
    # 2- Rank based on outt link overlap
    # 3- Mentions in the page - TF

    for entity in _wikipage_entities:
        # Edge based------------------------------------
        innerLink_outLinks = wikipage_entities.get_current_wiki_links(entity)

        if innerLink_outLinks is not None:

            if wikipage in innerLink_outLinks:
                entities_rank[entity] = entities_rank[entity] + 2.0

            else:
                entities_rank[entity] = entities_rank[entity] + 1.0


            # Concept overlap - using jaccard
            entities_rank[entity] = entities_rank[entity] + (
                similaritites.jaccard_similarity(
                    Counter(_wikipage_entities),
                    Counter(innerLink_outLinks)))


    # 3- TF +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    entity_mentions = defaultdict(int)


    frequencies = tf.get_frequencies_wikipage(wikipage)


    for x in frequencies.keys():
        for entity in _wikipage_entities:
            if x in entity:
                # This will make sure to get only one value or the other of an
                # entity of two terms
                if entity_mentions[entity] < frequencies[x]:
                    entity_mentions[entity] = frequencies[x]


    # sort dict
    entity_mentions = collections.OrderedDict(
        sorted(entity_mentions.items(),
                key=operator.itemgetter(1), reverse=True))

    # normalize between zero and one
    max_value = entity_mentions.values()[0]
    min_value = entity_mentions.values()[-1]

    for entity in entity_mentions.keys():

        # Normalize score
        score = (
                    (entity_mentions[entity] - min_value)/ (
                        float(max_value - min_value)
                    )
                )

        if entity in entities_rank.keys():
            entities_rank[entity] = entities_rank[entity] + score

    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    entities_rank = collections.OrderedDict(sorted(entities_rank.items(),
                        key=operator.itemgetter(1), reverse=True))


    return entities_rank
Example #2
0
def findWikiHashtags(wikipage, sessionID):

    # get wikipedia concepts ranks:
    #   gets the current wiki innerlinks and rank them
    _wikipage_entities = rank_wiki_entities(wikipage)

    # Get the list of cooc hashtags from the global variable
    coochashtags = get_coochashtags()

    # will contain the hashtag name and the score based on jaccard similarity
    recommended_hashtags_jaccard = defaultdict(float)

    # cosine scores
    recommended_hashtags_cosine = defaultdict(float)

    # The cooccurences of the coocurrences of the main hashtag
    recommended_hashtags_coocs = defaultdict(list)

    # Taxonomies of searched tweets
    recommended_hashtags_taxonomies = defaultdict(list)

    # 1- search twitter for each hashtag ---------------------------------------
    entities = []

    max_tweets = 20

    #TODO: this should be a thredhold instead
    # only the top x hashtags
    number_of_hashtags = 10

    for hashtag in coochashtags.keys():
        if number_of_hashtags >= 0:

            # TYPE: List
            hashtag_tweets = search_twitter.get_recentX_tweets(hashtag, max_tweets)

            # This will allow me to find other hashtags that did not coocuured
            # directly with my main hashtag (AND) find taxonomies of each tweet
            for tweet in hashtag_tweets:

                # This will get all hashtags++++++++++++++++++++++++++++++++++++

                hashtagsInTweet = re.findall(r"#(\w+)", tweet.lower())

                # To remove duplicate hashtags in same tweet
                hashtagsInTweet = list(set(hashtagsInTweet))

                for haTag in hashtagsInTweet:
                    recommended_hashtags_coocs[hashtag].append(haTag)

                #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

                if 'taxonomy' in annotate_tweets.alchemy(tweet):
                    for label in annotate_tweets.alchemy(tweet)['taxonomy']:
                        recommended_hashtags_taxonomies[hashtag].append(label)


            # Annotate tweets and get all entities
            # TYPE: List
            entities = annotate_tweets.annotate_return_entities(hashtag_tweets)

            # entity and Frequency in a hashtag tweets
            hashtag_all_entities = dict(Counter(entities))

            # Jaccard Similarity - section 3.3.1: without scores
            sim = similaritites.jaccard_similarity(
                    Counter(hashtag_all_entities.keys()),
                    Counter(_wikipage_entities.keys()))

            recommended_hashtags_jaccard[hashtag] = sim

            # Cosine Similarity - section 3.3.2: with scores

            #1: unify the two sets of entities from wikipage and tweets
            combined_entities = list(
                                set(entities).union(_wikipage_entities.keys()))

            #===================================================================

            # As in Pavan Paper
            hashtag_h = []

            for x in combined_entities:
                if x in entities:
                    hashtag_h.append(hashtag_all_entities[x])
                else:
                    hashtag_h.append(0)

            #===================================================================

            # As in Pavan Paper
            event_e = []

            for x in combined_entities:
                if x in _wikipage_entities.keys():
                    event_e.append(_wikipage_entities[x])
                else:
                    event_e.append(0)

            #===================================================================

            sim = similaritites.cosine_similarity(
                    Counter(hashtag_h), Counter(event_e))

            recommended_hashtags_cosine[hashtag] = sim

            #-------
            print number_of_hashtags

            number_of_hashtags -= 1

            #===================================================================
            #===================================================================
            #===================================================================

        else:
            break

    recommended_hashtags = {
        'jaccard_sim': recommended_hashtags_jaccard,
        'cosine_sim': recommended_hashtags_cosine,
        'hashtag-cooc': recommended_hashtags_coocs,
        'hashtag-taxonomies': recommended_hashtags_taxonomies
    }

    db_operations.insert_wikipage_recommended_hashtags(sessionID,
                    recommended_hashtags)