Code Example #1
File: test_suite.py    Project: jgors/cc_insight
    def test_update_graph_with_example_from_instructions_manually(self):
        '''exact example from the online instructions done manually here: 
        https://github.com/InsightDataScience/coding-challenge#building-the-twitter-hashtag-graph'''

        
        # First tweet added to the graph
        self.tweet_graph.update_graph(Tweet('Thu Oct 29 17:51:01 +0000 2015', hashtags=['Spark', 'Apache'])) 
        # each hashtag becomes a node, and the two are made neighbors of one another
        assert self.tweet_graph.graph == {'apache': set(['spark']), 
                                           'spark': set(['apache'])}
        # get the graph avg degree 
        assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '1.00'


        # Second tweet added to the graph
        self.tweet_graph.update_graph(Tweet('Thu Oct 29 17:51:30 +0000 2015', hashtags=['Apache', 'Hadoop', 'Storm'])) 
        # graph gets updated
        assert self.tweet_graph.graph == {'apache': set(['spark', 'hadoop', 'storm']), 
                                           'spark': set(['apache']), 
                                           'hadoop': set(['apache', 'storm']), 
                                           'storm': set(['apache', 'hadoop'])}  
        # get the graph avg degree 
        assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00' 
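        # why '2.00': degrees are apache=3, spark=1, hadoop=2, storm=2,
        # so the average is (3 + 1 + 2 + 2) / 4 nodes = 8 / 4 = 2.00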


        # Third tweet added to the graph
        self.tweet_graph.update_graph(Tweet('Thu Oct 29 17:51:55 +0000 2015', hashtags=['Apache'])) 
        # graph stays unchanged since there was only one hashtag passed in for this tweet
        assert self.tweet_graph.graph == {'apache': set(['spark', 'hadoop', 'storm']), 
                                           'spark': set(['apache']), 
                                           'hadoop': set(['apache', 'storm']), 
                                           'storm': set(['apache', 'hadoop'])}
        # get the graph avg degree 
        assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00' 


        # Fourth tweet added to the graph
        self.tweet_graph.update_graph(Tweet('Thu Oct 29 17:51:56 +0000 2015', hashtags=['Flink', 'Spark'])) 
        # graph gets updated accordingly
        assert self.tweet_graph.graph == {'apache': set(['spark', 'hadoop', 'storm']), 
                                           'spark': set(['apache', 'flink']), 
                                           'flink': set(['spark']), 
                                           'hadoop': set(['apache', 'storm']), 
                                           'storm': set(['apache', 'hadoop'])}
        # get the graph avg degree 
        assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00' 


        # Fifth tweet added to the graph
        self.tweet_graph.update_graph(Tweet('Thu Oct 29 17:51:59 +0000 2015', hashtags=['HBase', 'Spark'])) 
        # graph gets updated 
        assert self.tweet_graph.graph == {'flink': set(['spark']), 
                                           'hadoop': set(['apache', 'storm']), 
                                           'storm': set(['apache', 'hadoop']), 
                                           'apache': set(['spark', 'hadoop', 'storm']), 
                                           'hbase': set(['spark']), 
                                           'spark': set(['apache', 'hbase', 'flink'])}
        # get the graph avg degree 
        assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '2.00' 


        # Sixth and last tweet added to the graph
        self.tweet_graph.update_graph(Tweet('Thu Oct 29 17:52:05 +0000 2015', hashtags=['Apache'])) 
        # graph gets updated: the Spark-Apache edge is removed because the tweet that
        # introduced it (17:51:01) is 64 seconds older than this one, outside the 60s window
        assert self.tweet_graph.graph == {'flink': set(['spark']), 
                                           'hadoop': set(['apache', 'storm']), 
                                           'storm': set(['apache', 'hadoop']), 
                                           'apache': set(['hadoop', 'storm']), 
                                           'hbase': set(['spark']), 
                                           'spark': set(['hbase', 'flink'])}
        # get the graph avg degree 
        assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == '1.67' 
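        # why '1.67': degrees are flink=1, hbase=1, spark=2, hadoop=2, storm=2,
        # apache=2, so the average is (1 + 1 + 2 + 2 + 2 + 2) / 6 nodes = 10/6 ≈ 1.67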
        
       
        # now rebuild the graph from the same data loaded from file and compare
        # it against the one just built manually
        tweets_test_graph2 = TweetsGraph()
        testfile = os.path.join(tests_dir, 'test_data', 'data_for_building_hashtag_graph.txt')
        with open(testfile, 'r') as f: 
            for tweet in f:
                tweet_dict = json.loads(tweet)
                hashtags = [hashtag['text'] for hashtag in tweet_dict['entities']['hashtags']]
                tweets_test_graph2.update_graph(Tweet(tweet_dict['created_at'], hashtags))
        # the graph built from the file should match the graph built manually above
        assert tweets_test_graph2.graph == self.tweet_graph.graph 
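
The tests above exercise TweetsGraph without showing its implementation. For orientation, here is a minimal sketch consistent with the asserted behavior: hashtags are lowercased, each multi-hashtag tweet connects its hashtags pairwise, tweets older than the 60-second window are evicted, and the average degree comes back as a two-decimal string. Everything beyond the Tweet/TweetsGraph names and the two methods the tests call is an assumption, not the project's actual code.

from collections import deque
from datetime import datetime
from itertools import combinations


class Tweet(object):
    '''Hypothetical stand-in for the project's Tweet class.'''
    def __init__(self, created_at, hashtags=None):
        # Twitter timestamp format, e.g. 'Thu Oct 29 17:51:01 +0000 2015'
        self.timestamp = datetime.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
        # lowercase and de-duplicate, matching the keys asserted in the tests
        self.hashtags = set(h.lower() for h in (hashtags or []))


class TweetsGraph(object):
    '''Sketch: hashtag co-occurrence graph over a sliding time window.'''
    def __init__(self, time_window=60):
        self.graph = {}
        self.time_window = time_window
        self._latest = None
        self._window = deque()   # (timestamp, hashtags) pairs, oldest first

    def update_graph(self, tweet):
        # every tweet advances the window, even one with a single hashtag
        if self._latest is None or tweet.timestamp > self._latest:
            self._latest = tweet.timestamp
        # only tweets with two or more hashtags contribute edges
        if len(tweet.hashtags) >= 2:
            self._window.append((tweet.timestamp, tweet.hashtags))
        # evict tweets more than time_window seconds older than the latest seen
        while self._window and (self._latest - self._window[0][0]).total_seconds() > self.time_window:
            self._window.popleft()
        # rebuild the adjacency sets from the surviving tweets
        self.graph = {}
        for _, tags in self._window:
            for a, b in combinations(tags, 2):
                self.graph.setdefault(a, set()).add(b)
                self.graph.setdefault(b, set()).add(a)

    def get_graph_avg_degree_of_all_nodes(self):
        # average degree = sum of node degrees / number of nodes,
        # formatted as a string with two decimal places
        if not self.graph:
            return '0.00'
        total_degree = sum(len(neighbors) for neighbors in self.graph.values())
        return '{:.2f}'.format(total_degree / float(len(self.graph)))

Rebuilding the whole adjacency map on every update keeps the sketch short; the real implementation may instead remove only the edges contributed by evicted tweets.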
Code Example #2
File: tweet_processor.py    Project: jgors/cc_insight
    if len(sys.argv) > 1:   # presumed guard; the snippet begins mid-function
        tweets_data_incoming = sys.argv[1]
        assert path.isfile(tweets_data_incoming), "Error: need to pass in a data file that exists."
        tweets_incomming_path = path.abspath(tweets_data_incoming)
    else:
        # otherwise fall back to the default data set (must be run from the root of the repo!)
        tweets_incomming_path = path.abspath(path.join('tweet_input', 'tweets.txt'))
    tweet_output_path = path.abspath('tweet_output')

    if not path.isdir(tweet_output_path):
        os.makedirs(tweet_output_path)

    ft1 = open(path.abspath(path.join(tweet_output_path, 'ft1.txt')), 'w')
    ft2 = open(path.abspath(path.join(tweet_output_path, 'ft2.txt')), 'w')
    close_files = lambda l: [f.close() for f in l]    # helper to close the output files (presumably called later, in code not shown)

    tweet_graph = TweetsGraph(time_window=60)

    with open(tweets_incomming_path, 'r') as tweets_incomming:
        # all tweets from the api are utf-8 encoded:
        # https://dev.twitter.com/overview/api/counting-characters
        for cnt, tweet in enumerate(tweets_incomming, start=1):
            try:
                tweet_dict = json_loads(tweet)     # json.loads uses utf-8 decoding by default
                text = tweet_dict["text"]
                created_at = tweet_dict["created_at"]
                hashtags = [hashtag['text'] for hashtag in tweet_dict['entities']['hashtags']]
                tweet = Tweet(created_at, hashtags)
                tweet_graph.update_graph(tweet)

                cleaned_text = clean_text(text, count_unicode=True)
                # logging.debug('tweet_cnt: {}, num_graph_nodes: {}, avg_deg: {}'.format(
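
One side note on the file handling above: ft1 and ft2 are opened with bare open() calls and closed via the close_files helper, so an exception mid-stream could leave them open. A sketch of an alternative using Python 3's contextlib.ExitStack (a suggestion, not the project's code):

import contextlib
from os import path

tweet_output_path = path.abspath('tweet_output')
# ExitStack closes every registered file on exit, even if processing raises
with contextlib.ExitStack() as stack:
    ft1 = stack.enter_context(open(path.join(tweet_output_path, 'ft1.txt'), 'w'))
    ft2 = stack.enter_context(open(path.join(tweet_output_path, 'ft2.txt'), 'w'))
    # ... per-tweet processing writes to ft1 and ft2 here ...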
Code Example #3
File: test_suite.py    Project: jgors/cc_insight
    def setUp(self):
        self.tweet_graph = TweetsGraph()
        ok_(self.tweet_graph.graph == {})
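
ok_ here is nose's assertion helper (nose.tools.ok_), a thin wrapper around a bare assert. The module preamble is not shown in the snippet; it presumably looks something like the following (the class name and unittest base are assumptions):

import unittest
from nose.tools import ok_   # shorthand assert helper from the nose test library

class TestTweetsGraph(unittest.TestCase):   # hypothetical class name
    def setUp(self):
        # every test starts from a fresh, empty graph
        self.tweet_graph = TweetsGraph()
        ok_(self.tweet_graph.graph == {})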