    def test_everything(self):
        """Exercise make_ngrams.get_cleaned_capitalised_word_sequences on representative tweets."""
        s = "Me taking a hit of an #ipython acid tab at #pydata whilst Peter @norvig talks. I see multicolor lambda funcs @ivanov http://pic.twitter.com/pEERrxscZU"
        e = [["Me"], ["Peter"]]
        r = make_ngrams.get_cleaned_capitalised_word_sequences(s)
        self.assertEqual(r, e)

        s = "Stop talking about #pycon. Makes me sad."
        e = []
        r = make_ngrams.get_cleaned_capitalised_word_sequences(s)
        self.assertEqual(r, e)

        s = "Loving this Live KeyNote. Having fun at #PyData today looking at Lots Of Interesting Stuff"
        e = [["Loving"], ["Live", "KeyNote"], ["Having"], ["Lots", "Of", "Interesting", "Stuff"]]
        r = make_ngrams.get_cleaned_capitalised_word_sequences(s)
        self.assertEqual(r, e)
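
# NOTE: the sketch below is illustrative only, not the project's make_ngrams
# implementation. It shows the general idea the tests above exercise: strip
# hashtags, @usernames and URLs, then group runs of consecutive capitalised
# words. The real get_cleaned_capitalised_word_sequences clearly applies extra
# filtering as well (e.g. "Stop", "Makes" and "I" are excluded in the expected
# results above), so this sketch would not pass all of these tests as-is.
import re


def capitalised_word_sequences_sketch(tweet):
    """Group runs of capitalised words after stripping hashtags, @users and URLs."""
    cleaned = re.sub(r"https?://\S+|[#@]\w+", "", tweet)  # drop URLs, #tags, @users
    sequences, current = [], []
    for token in re.findall(r"[A-Za-z']+", cleaned):
        if token[0].isupper():
            current.append(token)  # extend the current capitalised run
        else:
            if current:
                sequences.append(current)  # close off a completed run
            current = []
    if current:
        sequences.append(current)
    return sequences
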
def build_and_trim_network(json_cleaned_lines, remove_nodes,
                           remove_usernames_below, remove_hashtags_below,
                           remove_phrases_below):
    """Build a co-occurrence graph linking hashtags, @usernames, capitalised
    phrases and frequent collocations from cleaned tweets, then trim away
    low-weight and weakly connected nodes."""
    items = json_cleaned_lines

    hashtag_net = nx.Graph()

    top_collocations = colloc_analysis.extract_top_collocations(items)

    for item in items:
        # combine hashtags and users into one list of things to pair up
        all_items = item['hashtags'] + item['users']
        word_sequences = make_ngrams.get_cleaned_capitalised_word_sequences(item['tweet'])
        for word_sequence in word_sequences:
            if len(word_sequence) > 1:
                capitalised_words = " ".join(word_sequence)
                capitalised_words = capitalised_words.lower()  # normalise e.g. Github GitHub GITHUB -> github
                all_items.append(capitalised_words)

        # extract frequent collocations
        tweet_cleaned_lowercased = " ".join(colloc_analysis.tweet_as_terms(item['tweet']))
        for top_collocation in top_collocations:
            tc = " ".join(top_collocation)
            if tc in tweet_cleaned_lowercased:
                all_items.append(tc)  # add collocation phrase

        # add nodes with a default weight
        for term in all_items:
            add_node(hashtag_net, term)
        # link every pair of co-occurring items
        for t1 in all_items:
            for t2 in all_items:
                if t1 != t2:  # compare by value so duplicate strings do not create self-loops
                    maksim_utils.add_or_inc_edge(hashtag_net, t1, t2)

    # drop low-weight nodes; the threshold depends on the node type
    for node in list(hashtag_net.nodes()):  # iterate over a copy because we remove nodes
        if node.startswith('@'):
            if hashtag_net.node[node]['weight'] < remove_usernames_below:
                hashtag_net.remove_node(node)
        elif node.startswith('#'):
            if hashtag_net.node[node]['weight'] < remove_hashtags_below:
                hashtag_net.remove_node(node)
        else:
            # neither a hashtag nor a @username, so this is a phrase
            if hashtag_net.node[node]['weight'] < remove_phrases_below:
                hashtag_net.remove_node(node)

    # remove caller-specified nodes (e.g. ones that nearly everybody is connected to)
    for removal in remove_nodes:
        try:
            hashtag_net.remove_node(removal)
        except nx.NetworkXError as err:
            logging.warning("Node %r not in the graph (error==%r)", removal, err)

    # repeatedly strip weakly connected (degree-1) nodes until the graph stops shrinking
    while True:
        nbr_of_nodes = hashtag_net.number_of_nodes()
        logging.info("Trimming, currently we have %d nodes", nbr_of_nodes)
        hashtag_net = maksim_utils.trim_degrees(hashtag_net)
        if hashtag_net.number_of_nodes() == nbr_of_nodes:
            break

    return hashtag_net
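
# The helpers called above (add_node, maksim_utils.add_or_inc_edge,
# maksim_utils.trim_degrees) are defined elsewhere in the repo. The minimal
# sketches below show the behaviour build_and_trim_network assumes of them:
# nodes and edges carry a 'weight' attribute that is incremented on repeat
# sightings, and trim_degrees drops weakly connected nodes. These are
# illustrative stand-ins, not the project's actual implementations (attribute
# access follows the networkx 1.x style used above; use graph.nodes[...] on
# networkx >= 2.4).
def add_node_sketch(graph, node, weight=1):
    """Add node with a starting weight, or bump its weight if it already exists."""
    if node in graph:
        graph.node[node]['weight'] += weight
    else:
        graph.add_node(node, weight=weight)


def add_or_inc_edge_sketch(graph, n1, n2, weight=1):
    """Add an edge with a starting weight, or increment an existing edge's weight."""
    if graph.has_edge(n1, n2):
        graph[n1][n2]['weight'] += weight
    else:
        graph.add_edge(n1, n2, weight=weight)


def trim_degrees_sketch(graph, degree=1):
    """Return a copy of graph with every node of degree <= degree removed."""
    trimmed = graph.copy()
    low_degree = [n for n in trimmed.nodes() if trimmed.degree(n) <= degree]
    trimmed.remove_nodes_from(low_degree)
    return trimmed
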
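
# A rough end-to-end driver for build_and_trim_network. It assumes (as the
# loop above does) that each cleaned line is a dict with 'tweet', 'hashtags'
# and 'users' keys; the input filename, thresholds and removed node are
# illustrative values only.
if __name__ == "__main__":
    import json

    logging.basicConfig(level=logging.INFO)
    with open("tweets_cleaned.jsonl") as f:  # hypothetical cleaned-tweet file
        items = [json.loads(line) for line in f]

    graph = build_and_trim_network(
        items,
        remove_nodes=["#pydata"],   # e.g. a hashtag nearly every tweet shares
        remove_usernames_below=2,   # illustrative thresholds
        remove_hashtags_below=2,
        remove_phrases_below=2)

    logging.info("Final graph has %d nodes and %d edges",
                 graph.number_of_nodes(), graph.number_of_edges())
    nx.write_gexf(graph, "hashtag_network.gexf")  # e.g. for viewing in Gephi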