def test_everything(self):
    """Exercise get_cleaned_capitalised_word_sequences on representative tweets."""
    cases = [
        # tweet with scattered capitalised words among hashtags/mentions/URLs
        ("Me taking a hit of an #ipython acid tab at #pydata whilst Peter @norvig talks. I see multicolor lambda funcs @ivanov http://pic.twitter.com/pEERrxscZU",
         [["Me"], ["Peter"]]),
        # tweet with no qualifying capitalised sequences at all
        ("Stop talking about #pycon. Makes me sad.",
         []),
        # tweet with single words and a multi-word capitalised run
        ("Loving this Live KeyNote. Having fun at #PyData today looking at Lots Of Interesting Stuff",
         [["Loving"], ["Live", "KeyNote"], ["Having"], ["Lots", "Of", "Interesting", "Stuff"]]),
    ]
    for tweet, expected in cases:
        result = make_ngrams.get_cleaned_capitalised_word_sequences(tweet)
        self.assertEqual(result, expected)
def build_and_trim_network(json_cleaned_lines, remove_nodes,
                           remove_usernames_below, remove_hashtags_below,
                           remove_phrases_below):
    """Build a co-occurrence graph of hashtags, usernames and phrases, then trim it.

    Each tweet record contributes its hashtags, its @usernames, any multi-word
    capitalised sequences (lowercased) and any frequent collocations found in
    its text; every distinct pair of those terms gets an incremented edge.
    Nodes whose accumulated weight falls below the per-category thresholds are
    dropped, then the explicit ``remove_nodes`` are dropped, then singly
    connected nodes are trimmed until the graph stabilises.

    :param json_cleaned_lines: iterable of dicts with 'hashtags', 'users',
        'tweet' keys (schema inferred from usage — confirm against caller)
    :param remove_nodes: node labels to force-remove if present
    :param remove_usernames_below: weight threshold for '@' nodes
    :param remove_hashtags_below: weight threshold for '#' nodes
    :param remove_phrases_below: weight threshold for all other (phrase) nodes
    :returns: the trimmed ``nx.Graph``
    """
    items = json_cleaned_lines
    hashtag_net = nx.Graph()
    top_collocations = colloc_analysis.extract_top_collocations(items)
    for item in items:
        # combine hashtags and users into one list of things to pair up
        all_items = item['hashtags'] + item['users']
        word_sequences = make_ngrams.get_cleaned_capitalised_word_sequences(
            item['tweet'])
        for word_sequence in word_sequences:
            if len(word_sequence) > 1:
                capitalised_words = " ".join(word_sequence)
                # normalise e.g. Github GitHub GITHUB -> github
                capitalised_words = capitalised_words.lower()
                all_items.append(capitalised_words)
        # extract frequent collocations
        tweet_cleaned_lowercased = " ".join(
            colloc_analysis.tweet_as_terms(item['tweet']))
        for top_collocation in top_collocations:
            tc = " ".join(top_collocation)
            if tc in tweet_cleaned_lowercased:
                all_items.append(tc)  # add collocation phrase
        # add nodes with a default weight
        # BUG FIX: loop variable renamed from `item`, which shadowed the outer
        # tweet-record `item` of the enclosing loop
        for term in all_items:
            add_node(hashtag_net, term)
        # connect every distinct pair (both directions, so edge weights are
        # incremented twice per co-occurrence — consistent with original code)
        # NOTE(review): `is not` is an identity test; equal-but-distinct string
        # objects will still be paired while interned duplicates are skipped —
        # TODO confirm this is the intended semantics
        for t1 in all_items:
            for t2 in all_items:
                if t1 is not t2:
                    maksim_utils.add_or_inc_edge(hashtag_net, t1, t2)
    # BUG FIX: snapshot the node list before removing — mutating the graph
    # while iterating the live node view raises RuntimeError on networkx >= 2
    # (the `.node[...]` attribute access below is networkx 1.x API, kept as-is)
    for node in list(hashtag_net.nodes()):
        # the three prefix categories are mutually exclusive, so chain them
        if node.startswith('@'):
            if hashtag_net.node[node]['weight'] < remove_usernames_below:
                hashtag_net.remove_node(node)
        elif node.startswith('#'):
            if hashtag_net.node[node]['weight'] < remove_hashtags_below:
                hashtag_net.remove_node(node)
        else:
            # here if we have a phrase
            if hashtag_net.node[node]['weight'] < remove_phrases_below:
                hashtag_net.remove_node(node)
    # remove nodes that too many people might be connected to
    for removal in remove_nodes:
        try:
            hashtag_net.remove_node(removal)
        except nx.NetworkXError as err:
            # best-effort removal: absence is expected, just note it
            logging.warning("Node %r not in the graph (error==%r)", removal, err)
    # remove singularly connected nodes until none left
    while True:
        nbr_of_nodes = hashtag_net.number_of_nodes()
        # lazy %-style args: formatting only happens if the record is emitted
        logging.info("Trimming, currently we have %d nodes", nbr_of_nodes)
        hashtag_net = maksim_utils.trim_degrees(hashtag_net)
        if hashtag_net.number_of_nodes() == nbr_of_nodes:
            break
    return hashtag_net
def build_and_trim_network(json_cleaned_lines, remove_nodes,
                           remove_usernames_below, remove_hashtags_below,
                           remove_phrases_below):
    """Assemble a term co-occurrence graph from tweet records and prune it.

    Hashtags, @usernames, lowercased multi-word capitalised runs and frequent
    collocations from each tweet are linked pairwise; low-weight nodes,
    explicitly unwanted nodes and singly connected nodes are then removed.
    Returns the resulting ``nx.Graph``.
    """
    records = json_cleaned_lines
    graph = nx.Graph()
    frequent_collocations = colloc_analysis.extract_top_collocations(records)
    for record in records:
        # gather everything from this tweet worth pairing up
        terms = record['hashtags'] + record['users']
        sequences = make_ngrams.get_cleaned_capitalised_word_sequences(
            record['tweet'])
        for sequence in sequences:
            if len(sequence) > 1:
                # normalise e.g. Github GitHub GITHUB -> github
                terms.append(" ".join(sequence).lower())
        # pick up any frequent collocations present in the cleaned tweet text
        cleaned_tweet = " ".join(colloc_analysis.tweet_as_terms(record['tweet']))
        for collocation in frequent_collocations:
            phrase = " ".join(collocation)
            if phrase in cleaned_tweet:
                terms.append(phrase)  # add collocation phrase
        # register every term with a default weight
        for term in terms:
            add_node(graph, term)
        # bump an edge for every ordered pair of distinct term objects
        for left in terms:
            for right in terms:
                if left is not right:
                    maksim_utils.add_or_inc_edge(graph, left, right)
    # drop nodes whose weight is under the threshold for their category
    for node in graph.nodes():
        if node.startswith('@'):
            if graph.node[node]['weight'] < remove_usernames_below:
                graph.remove_node(node)
        if node.startswith('#'):
            if graph.node[node]['weight'] < remove_hashtags_below:
                graph.remove_node(node)
        if not node.startswith('#') and not node.startswith('@'):
            # neither prefix, so this node is a phrase
            if graph.node[node]['weight'] < remove_phrases_below:
                graph.remove_node(node)
    # force-remove over-connected nodes; absence is tolerated
    for unwanted in remove_nodes:
        try:
            graph.remove_node(unwanted)
        except nx.NetworkXError as err:
            logging.warning("Node %r not in the graph (error==%r)" % (unwanted, err))
    # keep trimming singly connected nodes until the size stops changing
    while True:
        node_count = graph.number_of_nodes()
        logging.info("Trimming, currently we have %d nodes" % (node_count))
        graph = maksim_utils.trim_degrees(graph)
        if graph.number_of_nodes() == node_count:
            break
    return graph