def describe_data(tweets):
    """Run conversation and audience analysis over the given Tweets.

    Returns the populated results object produced by the evaluation module.
    """
    logger.info('Describing Tweets')
    # build an empty results structure with both analysis families enabled
    analysis_results = tweet_evaluation_module.setup_analysis(
        conversation=True, audience=True)
    tweet_evaluation_module.analyze_tweets(tweets, analysis_results)
    return analysis_results
def test_hashtag_count(self):
    """
    Inject one instance of a known hashtag into every Tweet and verify
    that the analysis counts exactly one occurrence per Tweet processed.
    """
    # (removed a stray `pass` left over from the original stub)
    # configure results structure; conversation analysis tracks hashtags
    results = analysis.setup_analysis(do_conversation=True)
    # use counter for verification
    counter = 0
    for tweet in self.tweets:
        tweet['twitter_entities']['hashtags'].append({"text": "notarandomhashtag"})
        analysis.analyze_tweet(tweet, results)
        counter += 1
    self.assertEqual(results['hashtags']['notarandomhashtag'], counter)
def test_conversation_length(self):
    """Check the analysis Tweet count against the known generator length."""
    # empty results structure with conversation measurements enabled
    results = analysis.setup_analysis(do_conversation=True)
    # run the full analysis pipeline (conversation included) over the fixture
    analysis.analyze_tweets(self.tweets, results)
    # ground truth was captured in setUp()
    self.assertEqual(results['tweet_count'], self.generator_length_truth)
def test_conversation_length(self):
    """Verify that analyzed-Tweet count equals the ground truth from setUp()."""
    # configure an empty results object for conversation analysis
    results = analysis.setup_analysis(do_conversation=True)
    # analyze every fixture Tweet, conversation measurements included
    analysis.analyze_tweets(self.tweets, results)
    self.assertEqual(results['tweet_count'], self.generator_length_truth)
def test_hashtag_count(self):
    """
    Append a known hashtag to each Tweet and check that the resulting
    count equals the number of Tweets analyzed.
    """
    # (removed a stray `pass` left over from the original stub)
    # configure results structure
    results = analysis.setup_analysis(do_conversation=True)
    # use counter for verification
    counter = 0
    for tweet in self.tweets:
        tweet['twitter_entities']['hashtags'].append(
            {"text": "notarandomhashtag"})
        analysis.analyze_tweet(tweet, results)
        counter += 1
    self.assertEqual(results['hashtags']['notarandomhashtag'], counter)
def test_bio_term_count(self):
    """
    Append a growing number of a known token to each user bio and verify
    that its total count equals 1 + 2 + ... + len(tweets) occurrences.
    """
    # (removed a stray `pass` left over from the original stub)
    # configure results structure; audience analysis tracks bio terms
    results = analysis.setup_analysis(do_audience=True)
    # counter doubles as the number of tokens injected into this bio
    counter = 1
    for tweet in self.tweets:
        addition = " test_term" * counter
        tweet['actor']['summary'] += addition
        analysis.analyze_tweet(tweet, results)
        counter += 1
    # presumably the first token yielded is the injected term's count —
    # depends on get_tokens() ordering; verify against its implementation
    expected_test_count = int(
        next(results['bio_term_count'].get_tokens())[0])
    self.assertEqual(expected_test_count, sum(range(counter)))
def test_audience_length(self):
    """
    Verify that the number of unique user IDs found by the analysis
    matches an independent count taken directly from the test data file.
    """
    import json  # local import: only this test needs it
    # configure results structure
    results = analysis.setup_analysis(do_audience=True)
    # run analysis code (including audience) for user ids
    analysis.analyze_tweets(self.tweets, results)
    user_ids = results["tweets_per_user"].keys()
    # ground truth: count distinct actor ids straight from the input file.
    # Replaces the previous `cat | python -c` subprocess pipeline, which
    # spawned two child processes and was unportable (no `cat` on Windows)
    # for what is a one-line computation.
    with open(INPUT_FILE_NAME) as data_file:
        file_user_count = len(
            {json.loads(line)["actor"]["id"] for line in data_file})
    self.assertEqual(file_user_count, len(user_ids))
def test_body_term_count(self):
    """
    Append a growing number of a known token to each Tweet body and verify
    that the total count equals the triangular number of Tweets analyzed.
    """
    # (removed a stray `pass` left over from the original stub)
    # configure results structure; conversation analysis tracks body terms
    results = analysis.setup_analysis(do_conversation=True)
    # counter doubles as the number of tokens injected into this body
    counter = 1
    for tweet in self.tweets:
        addition = " test_term" * counter
        tweet['body'] += addition
        analysis.analyze_tweet(tweet, results)
        counter += 1
    # presumably the first token yielded is the injected term's count —
    # depends on get_tokens() ordering; verify against its implementation
    expected_test_count = int(
        next(results['body_term_count'].get_tokens())[0])
    self.assertEqual(expected_test_count, sum(range(counter)))
def test_audience_length(self):
    """
    Check the number of unique users seen by the analysis against a count
    computed independently from the raw test data file.
    """
    import json  # local import: only this test needs it
    # configure results structure
    results = analysis.setup_analysis(do_audience=True)
    # run analysis code (including audience) for user ids
    analysis.analyze_tweets(self.tweets, results)
    user_ids = results["tweets_per_user"].keys()
    # ground truth: unique actor ids read directly from the input file.
    # The old implementation shelled out to `cat` piped into `python -c`,
    # which was slow, unportable, and needless for a one-line computation.
    with open(INPUT_FILE_NAME) as data_file:
        unique_user_count = len(
            {json.loads(line)["actor"]["id"] for line in data_file})
    self.assertEqual(unique_user_count, len(user_ids))
splitting_config = None if args.splitting_config is not None: # if file not in local directory, temporarily extend path to its location config_file_full_path = args.config_file.split('/') if len(config_file_full_path) > 1: path = '/'.join(config_file_full_path[:-1]) sys.path.append(os.path.join(os.getcwd(), path)) else: sys.path.append(os.getcwd()) splitting_config = importlib.import_module( config_file_full_path[-1].rstrip('.py')).splitting_config sys.path.pop() results = analysis.setup_analysis( conversation=args.do_conversation_analysis, audience=args.do_audience_analysis, identifier='analyzed', input_results={}) results = analysis.setup_analysis( conversation=args.do_conversation_analysis, audience=args.do_audience_analysis, identifier='baseline', input_results=results) else: results = analysis.setup_analysis( conversation=args.do_conversation_analysis, audience=args.do_audience_analysis) # manage input sources, file opening, and deserialization if args.input_file_name is not None: tweet_generator = analysis.deserialize_tweets(
# output options: where results land and which baseline (if any) to compare
parser.add_argument('-o', '--output-dir', dest='output_directory',
                    default=os.environ['HOME'] + '/tweet_evaluation/',
                    help='directory for output files; default is %(default)s')
parser.add_argument('-b', '--baseline-input-file', dest='baseline_input_name',
                    default=None,
                    help='Tweets against which to run a relative analysis')
parser.add_argument('--no-insights', dest='use_insights',
                    action='store_false', default=True)
args = parser.parse_args()

# timestamp used to build the dated output path
time_now = datetime.datetime.now()
output_directory = '{0}/{1:04d}/{2:02d}/{3:02d}/'.format(
    args.output_directory.rstrip('/'),
    time_now.year,
    time_now.month,
    time_now.day)

# empty results objects define which measurements will be run; a second
# one is needed only when a baseline comparison was requested
results = analysis.setup_analysis(
    do_conversation=args.do_conversation_analysis,
    do_audience=args.do_audience_analysis)
baseline_results = None
if args.baseline_input_name is not None:
    baseline_results = analysis.setup_analysis(
        do_conversation=args.do_conversation_analysis,
        do_audience=args.do_audience_analysis)

# drop the audience-API measurement when insights are disabled
if not args.use_insights:
    results.pop('audience_api', None)
    if args.baseline_input_name is not None:
        baseline_results.pop('audience_api', None)

# manage input sources, file opening, and deserialization
if args.input_file_name is not None:
    tweet_generator = analysis.deserialize_tweets(open(args.input_file_name))
else:
    tweet_generator = analysis.deserialize_tweets(sys.stdin)
help="file containing tweets, tweet IDs, or user IDs; take input from stdin if not present") parser.add_argument('-o','--output-dir',dest='output_directory',default=os.environ['HOME'] + '/tweet_evaluation/', help='directory for output files; default is %(default)s') args = parser.parse_args() # get the time right now, to use in output naming time_now = datetime.datetime.now() time_string = time_now.isoformat().split(".")[0].translate(None,":") output_directory = '{0}/{1:04d}/{2:02d}/{3:02d}/'.format(args.output_directory.rstrip('/') ,time_now.year ,time_now.month ,time_now.day ) # create the output directory if it doesn't exist ### results = analysis.setup_analysis(conversation = args.do_conversation_analysis, audience = args.do_audience_analysis) # manage input source if args.input_file_name is not None: input_generator = open(args.input_file_name) else: input_generator = sys.stdin # run analysis run_analysis(input_generator, results) # dump the output output.dump_results(results, output_directory, args.unique_identifier)
help='directory for output files; default is %(default)s') parser.add_argument('-b', '--baseline-input-file', dest='baseline_input_name', default=None, help='Tweets against which to run a relative analysis') args = parser.parse_args() # get the time right now, to use in output naming time_now = datetime.datetime.now() output_directory = '{0}/{1:04d}/{2:02d}/{3:02d}/'.format( args.output_directory.rstrip('/'), time_now.year, time_now.month, time_now.day) # get the empty results object, which defines the measurements to be run results = analysis.setup_analysis( do_conversation=args.do_conversation_analysis, do_audience=args.do_audience_analysis) baseline_results = None if args.baseline_input_name is not None: baseline_results = analysis.setup_analysis( do_conversation=args.do_conversation_analysis, do_audience=args.do_audience_analysis) # manage input sources, file opening, and deserialization if args.input_file_name is not None: tweet_generator = analysis.deserialize_tweets( open(args.input_file_name)) else: tweet_generator = analysis.deserialize_tweets(sys.stdin)