def test_update_graph_with_example_from_instructions_manually(self):
    '''Replay the worked example from the online instructions step by step:
    https://github.com/InsightDataScience/coding-challenge#building-the-twitter-hashtag-graph

    Each tweet is pushed into the graph by hand; after every push both the
    adjacency mapping and the formatted average degree are checked. The same
    tweets are then loaded from the fixture file into a second graph, which
    must end up equal to the hand-built one.
    '''
    # Expected adjacency after the second tweet; the third tweet carries a
    # single hashtag, so the same expectation is reused for it.
    after_second = {
        'apache': {'spark', 'hadoop', 'storm'},
        'spark': {'apache'},
        'hadoop': {'apache', 'storm'},
        'storm': {'apache', 'hadoop'},
    }

    # (created_at, hashtags, expected graph, expected average degree)
    steps = (
        # First tweet: each hashtag becomes a node and a neighbor of the other.
        (
            'Thu Oct 29 17:51:01 +0000 2015',
            ['Spark', 'Apache'],
            {'apache': {'spark'}, 'spark': {'apache'}},
            '1.00',
        ),
        # Second tweet: the graph gets updated.
        (
            'Thu Oct 29 17:51:30 +0000 2015',
            ['Apache', 'Hadoop', 'Storm'],
            after_second,
            '2.00',
        ),
        # Third tweet: only one hashtag, so the graph stays unchanged.
        (
            'Thu Oct 29 17:51:55 +0000 2015',
            ['Apache'],
            after_second,
            '2.00',
        ),
        # Fourth tweet: the graph gets updated accordingly.
        (
            'Thu Oct 29 17:51:56 +0000 2015',
            ['Flink', 'Spark'],
            {
                'apache': {'spark', 'hadoop', 'storm'},
                'spark': {'apache', 'flink'},
                'flink': {'spark'},
                'hadoop': {'apache', 'storm'},
                'storm': {'apache', 'hadoop'},
            },
            '2.00',
        ),
        # Fifth tweet: the graph gets updated.
        (
            'Thu Oct 29 17:51:59 +0000 2015',
            ['HBase', 'Spark'],
            {
                'flink': {'spark'},
                'hadoop': {'apache', 'storm'},
                'storm': {'apache', 'hadoop'},
                'apache': {'spark', 'hadoop', 'storm'},
                'hbase': {'spark'},
                'spark': {'apache', 'hbase', 'flink'},
            },
            '2.00',
        ),
        # Last tweet: the Spark/Apache edge is expected to be gone because
        # the tweet that contained it is now older than 60s.
        (
            'Thu Oct 29 17:52:05 +0000 2015',
            ['Apache'],
            {
                'flink': {'spark'},
                'hadoop': {'apache', 'storm'},
                'storm': {'apache', 'hadoop'},
                'apache': {'hadoop', 'storm'},
                'hbase': {'spark'},
                'spark': {'hbase', 'flink'},
            },
            '1.67',
        ),
    )

    for stamp, tags, wanted_graph, wanted_avg in steps:
        self.tweet_graph.update_graph(Tweet(stamp, hashtags=tags))
        assert self.tweet_graph.graph == wanted_graph
        assert self.tweet_graph.get_graph_avg_degree_of_all_nodes() == wanted_avg

    # Now build a second graph from the same data stored on disk and compare
    # it with the graph that was just assembled manually.
    replayed = TweetsGraph()
    fixture = os.path.join(tests_dir, 'test_data', 'data_for_building_hashtag_graph.txt')
    with open(fixture, 'r') as handle:
        for line in handle:
            record = json.loads(line)
            tags = [entry['text'] for entry in record['entities']['hashtags']]
            replayed.update_graph(Tweet(record['created_at'], tags))

    # The file-driven graph must match the manual example.
    assert replayed.graph == self.tweet_graph.graph
tweets_data_incoming = sys.argv[1] assert path.isfile(tweets_data_incoming), "Error: need to pass in a data file that exists." tweets_incomming_path = path.abspath(tweets_data_incoming) else: # to run the actual test data set (needs to be run from root of repo!) tweets_incomming_path = path.abspath(path.join('tweet_input', 'tweets.txt')) tweet_output_path = path.abspath('tweet_output') if not path.isdir(tweet_output_path): os.makedirs(tweet_output_path) ft1 = open(path.abspath(path.join(tweet_output_path, 'ft1.txt')), 'w') ft2 = open(path.abspath(path.join(tweet_output_path, 'ft2.txt')), 'w') close_files = lambda l: [f.close() for f in l] tweet_graph = TweetsGraph(time_window=60) with open(tweets_incomming_path, 'r') as tweets_incomming: # all tweets from the api are utf-8 encoded: # https://dev.twitter.com/overview/api/counting-characters for cnt, tweet in enumerate(tweets_incomming, start=1): try: tweet_dict = json_loads(tweet) # json.loads uses utf-8 decoding by default text = tweet_dict["text"] created_at = tweet_dict["created_at"] hashtags = [hashtag['text'] for hashtag in tweet_dict['entities']['hashtags']] tweet = Tweet(created_at, hashtags) tweet_graph.update_graph(tweet) cleaned_text = clean_text(text, count_unicode=True) # logging.debug('tweet_cnt: {}, num_graph_nodes: {}, avg_deg: {}'.format(
def setUp(self):
    '''Create a fresh TweetsGraph for the test and check that its graph mapping starts out empty.'''
    fresh_graph = TweetsGraph()
    self.tweet_graph = fresh_graph
    # A newly constructed graph is expected to hold no nodes at all.
    ok_(fresh_graph.graph == {})