def rank_wiki_entities(wikipage):
    """Rank the entities (inner wiki links) of *wikipage*.

    The score of each linked entity combines three signals:
      1. back-link evidence: +2.0 if the entity's page links back to
         *wikipage*, otherwise +1.0 for merely being linked;
      2. out-link overlap: Jaccard similarity between the page's link set
         and the entity's link set;
      3. mention frequency: term frequency of the entity in the page text,
         min-max normalized to [0, 1].

    Returns:
        collections.OrderedDict mapping entity -> score, highest first.
    """
    # Inner links of the current wiki page (iterable of entity names).
    _wikipage_entities = wikipage_entities.get_current_wiki_links(wikipage)
    entities_rank = defaultdict(float)

    # 1 & 2 - edge-based evidence per linked entity.
    for entity in _wikipage_entities:
        entity_out_links = wikipage_entities.get_current_wiki_links(entity)
        if entity_out_links is not None:
            # A back-link (entity page pointing at us) is stronger evidence.
            if wikipage in entity_out_links:
                entities_rank[entity] = entities_rank[entity] + 2.0
            else:
                entities_rank[entity] = entities_rank[entity] + 1.0
            # Concept overlap between the two link sets (Jaccard).
            entities_rank[entity] = entities_rank[entity] + (
                similaritites.jaccard_similarity(
                    Counter(_wikipage_entities), Counter(entity_out_links)))

    # 3 - TF: highest frequency of any page term contained in the entity
    # name, so a two-term entity gets one value, not two.
    entity_mentions = defaultdict(int)
    frequencies = tf.get_frequencies_wikipage(wikipage)
    for term, freq in frequencies.items():
        for entity in _wikipage_entities:
            if term in entity and entity_mentions[entity] < freq:
                entity_mentions[entity] = freq

    # Min-max normalize mention counts and add them to the rank.
    # BUG FIX: the original read entity_mentions.values()[0] / [-1]
    # (Python-2-only, and required a prior full sort), raised IndexError on
    # an empty dict, and divided by zero when every count was equal.
    if entity_mentions:
        max_value = max(entity_mentions.values())
        min_value = min(entity_mentions.values())
        value_range = float(max_value - min_value)
        for entity, mentions in entity_mentions.items():
            # All counts equal -> no discriminative information: score 0.
            score = ((mentions - min_value) / value_range) if value_range else 0.0
            if entity in entities_rank:
                entities_rank[entity] = entities_rank[entity] + score

    # Highest-scoring entities first.
    return collections.OrderedDict(
        sorted(entities_rank.items(), key=operator.itemgetter(1), reverse=True))
def findWikiHashtags(wikipage, sessionID):
    """Recommend hashtags for *wikipage* and persist them under *sessionID*.

    For each of the top co-occurring hashtags: fetch recent tweets, collect
    co-occurring hashtags and tweet taxonomies, annotate the tweets with
    entities, and score the hashtag against the page's ranked entities with
    both Jaccard (unweighted, section 3.3.1) and cosine (weighted,
    section 3.3.2) similarity. Results are stored via db_operations; nothing
    is returned.
    """
    # Ranked wiki-page entities: OrderedDict entity -> score.
    _wikipage_entities = rank_wiki_entities(wikipage)
    # Co-occurring hashtags of the main hashtag (global state).
    coochashtags = get_coochashtags()

    recommended_hashtags_jaccard = defaultdict(float)   # hashtag -> Jaccard score
    recommended_hashtags_cosine = defaultdict(float)    # hashtag -> cosine score
    recommended_hashtags_coocs = defaultdict(list)      # hashtag -> co-occurring tags
    recommended_hashtags_taxonomies = defaultdict(list) # hashtag -> taxonomy labels

    max_tweets = 20  # tweets fetched per hashtag; TODO: should be a threshold
    number_of_hashtags = 10  # only process the top N hashtags

    for hashtag in coochashtags.keys():
        # BUG FIX: the original guard was `>= 0` on a counter starting at 10,
        # which processed 11 hashtags instead of the documented top 10.
        if number_of_hashtags <= 0:
            break
        number_of_hashtags -= 1

        # TYPE: list of tweet texts.
        hashtag_tweets = search_twitter.get_recentX_tweets(hashtag, max_tweets)
        for tweet in hashtag_tweets:
            # Collect every distinct hashtag in the tweet — this surfaces tags
            # that never co-occurred directly with the main hashtag.
            for co_tag in set(re.findall(r"#(\w+)", tweet.lower())):
                recommended_hashtags_coocs[hashtag].append(co_tag)
            # PERF FIX: the original called annotate_tweets.alchemy(tweet)
            # twice per tweet (membership test + lookup) — annotate once.
            annotation = annotate_tweets.alchemy(tweet)
            if 'taxonomy' in annotation:
                for label in annotation['taxonomy']:
                    recommended_hashtags_taxonomies[hashtag].append(label)

        # Annotate the batch of tweets and count entity occurrences.
        entities = annotate_tweets.annotate_return_entities(hashtag_tweets)
        hashtag_all_entities = dict(Counter(entities))

        # Jaccard similarity (section 3.3.1): entity sets, no scores.
        recommended_hashtags_jaccard[hashtag] = similaritites.jaccard_similarity(
            Counter(hashtag_all_entities.keys()),
            Counter(_wikipage_entities.keys()))

        # Cosine similarity (section 3.3.2): scored vectors over the union of
        # tweet entities and wiki-page entities (as in Pavan's paper).
        combined_entities = list(
            set(entities).union(_wikipage_entities.keys()))
        # Frequency vector of the hashtag's tweet entities.
        hashtag_h = [hashtag_all_entities.get(x, 0) for x in combined_entities]
        # Score vector of the wiki-page entities.
        event_e = [_wikipage_entities.get(x, 0) for x in combined_entities]
        recommended_hashtags_cosine[hashtag] = similaritites.cosine_similarity(
            Counter(hashtag_h), Counter(event_e))

    recommended_hashtags = {
        'jaccard_sim': recommended_hashtags_jaccard,
        'cosine_sim': recommended_hashtags_cosine,
        'hashtag-cooc': recommended_hashtags_coocs,
        'hashtag-taxonomies': recommended_hashtags_taxonomies
    }
    db_operations.insert_wikipage_recommended_hashtags(sessionID,
                                                      recommended_hashtags)