import unittest

# Module names below are assumptions; adjust them to wherever CleanTweet and
# AverageDegree actually live in this repo.
from clean_tweet import CleanTweet
from average_degree import AverageDegree


class TestCase(unittest.TestCase):

    def setUp(self):
        self.clean_tweet = CleanTweet()
        self.avg_deg = AverageDegree()
        # Machine-specific path to the challenge repo; adjust as needed.
        self.base_dir = "/home/sesha/Interests/insightDataFella/coding-challenge/"

    def test_if_unicode_removed(self):
        tweet = "Spark \u009FSummit East this week! #Spark #Apache"
        clean_tweet = "Spark Summit East this week! #Spark #Apache"
        self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

    def test_if_unicode_spared(self):
        # Characters in the ASCII range U+0000..U+007F must be preserved.
        tweet = "Spark \u003cSummit East this week! #Spark #Apache"
        clean_tweet = "Spark <Summit East this week! #Spark #Apache"
        self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

    def test_if_double_backslash_handled(self):
        tweet = "This concert \\\\m/. Am enjoying!!"
        clean_tweet = "This concert \\m/. Am enjoying!!"
        self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

    def test_if_escape_url_handled(self):
        tweet = "PB https:\\/\\/t.co\\/HOl34REL1a hello"
        clean_tweet = "PB https://t.co/HOl34REL1a hello"
        self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

    def test_if_escape_sequences_handled(self):
        tweet = "Should \n\nclean\t\rall of \"this\""
        clean_tweet = 'Should clean all of "this"'
        self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

    def test_if_empty(self):
        # A tweet made entirely of non-ASCII characters cleans to nothing.
        tweet = "\u3084\u3070\u3044\u7709\u6d88\u3048\u305f\uff01\u30d6\u30ea\u30fc\u30c1\u6642\u9593\u30df\u30b9\u3063\u305f\u308f\u2026"
        clean_tweet = ""
        self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

    def test_unicode_tweet_count(self):
        # Integration-style test against fixture files. Should have mocked,
        # but did not.
        cleaned_tweet_list = [
            "Spark Summit East this week! #Spark #Apache (timestamp: Thu Oct 29 17:51:01 +0000 2015)",
            "Just saw a great post on Insight Data Engineering #Apache #Hadoop #Storm (timestamp: Thu Oct 29 17:51:30 +0000 2015)",
            "Doing great work #Apache (timestamp: Thu Oct 29 17:51:55 +0000 2015)",
            "Excellent post on #Flink and #Spark (timestamp: Thu Oct 29 17:51:56 +0000 2015)",
            "New and improved #HBase connector for #Spark (timestamp: Thu Oct 29 17:51:59 +0000 2015)",
            "New 2.7.1 version update for #Hadoop #Apache (timestamp: Thu Oct 29 17:52:05 +0000 2015)",
        ]
        self.clean_tweet = CleanTweet(
            self.base_dir + "tweet_input/sample_tweets.txt",
            self.base_dir + "tweet_output/test_output.txt",
        )
        cleaned_tweets = self.clean_tweet.process_tweets()
        self.assertEqual(set(cleaned_tweet_list), set(cleaned_tweets))
        self.assertEqual(4, self.clean_tweet.get_count_of_unicode_tweets())

    def test_average_degree_changes(self):
        self.avg_deg = AverageDegree(
            self.base_dir + "tweet_input/half_sample_tweets.txt",
            self.base_dir + "tweet_output/test_output.txt",
        )
        self.avg_deg.process_tweets()
        self.assertEqual(2.0, self.avg_deg.calculate_avg_degree())
        self.avg_deg = AverageDegree(
            self.base_dir + "tweet_input/sample_tweets.txt",
            self.base_dir + "tweet_output/test_output.txt",
        )
        self.avg_deg.process_tweets()
        self.assertEqual(1.67, self.avg_deg.calculate_avg_degree())

    def tearDown(self):
        # Replace the objects under test with fresh instances between tests.
        self.clean_tweet = CleanTweet()
        self.avg_deg = AverageDegree()
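# A minimal entry point, assuming this file is executed directly as a script;
# `python -m unittest` from the repo root would work as well.
if __name__ == "__main__":
    unittest.main()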
import re
import datetime

# Module name is an assumption; adjust to wherever CleanTweet lives.
from clean_tweet import CleanTweet


class AverageDegree(object):

    def __init__(self, inputfile="dummyInput", outputfile="dummyOutput"):
        # Captures the "created_at" timestamp and the "text" payload of a
        # raw JSON tweet in a single pass.
        self.regex = re.compile(
            r'"((Sun|Mon|Tue|Thu|Wed|Fri|Sat) [A-Za-z0-9\s:\+]*)".*"text":"(.*)"'
        )
        self.hashtag_regex = re.compile(r"#(\w+)")
        self.format_string = "%a %b %d %H:%M:%S +0000 %Y"
        self.time_key = {}        # timestamp -> hashtags seen at that second
        self.adjacency_list = {}  # hashtag -> set of co-occurring hashtags
        self.input_filepath = inputfile
        self.output_filepath = outputfile
        self.clean_tweet_object = CleanTweet()

    def remove_older_hashtags(self, timestamp):
        # Evict every hashtag that fell out of the 60-second sliding window.
        one_minute = datetime.timedelta(minutes=1)
        current_time = datetime.datetime.strptime(timestamp, self.format_string)
        # Materialize the list before popping, so the dict is not mutated
        # while its keys are being iterated.
        ts_to_remove = [
            old_ts
            for old_ts in self.time_key
            if current_time - datetime.datetime.strptime(old_ts, self.format_string) > one_minute
        ]
        hashtags = []
        for ts in ts_to_remove:
            hashtags += self.time_key.pop(ts)
        unique_hashtags = set(hashtags)
        for hashtag in unique_hashtags:
            self.adjacency_list[hashtag] -= unique_hashtags

    def add_nodes(self, tweet, timestamp):
        raw_hashtags = self.hashtag_regex.findall(tweet)
        hashtags = [tag.lower().strip() for tag in raw_hashtags]
        self.remove_older_hashtags(timestamp)
        if timestamp not in self.time_key:
            self.time_key[timestamp] = []
        unique_hashtags = set(hashtags)
        # A tweet only contributes edges when it carries at least two
        # distinct hashtags.
        if len(unique_hashtags) > 1:
            self.time_key[timestamp] += list(unique_hashtags)
            temp_hashtags = unique_hashtags.copy()
            for hashtag in unique_hashtags:
                if hashtag not in self.adjacency_list:
                    self.adjacency_list[hashtag] = set()
                # Connect this hashtag to every other hashtag in the tweet.
                temp_hashtags.remove(hashtag)
                self.adjacency_list[hashtag].update(temp_hashtags)
                temp_hashtags.add(hashtag)

    def calculate_avg_degree(self):
        # Average degree = (sum of node degrees) / (number of nodes),
        # rounded to two decimal places.
        avg_degree = 0
        no_of_nodes = len(self.adjacency_list)
        if no_of_nodes != 0:
            for links in self.adjacency_list.values():
                avg_degree += len(links)
            avg_degree = round(avg_degree * 1.0 / no_of_nodes, 2)
        return avg_degree

    def process_tweets(self):
        with open(self.output_filepath, "a+") as output, \
                open(self.input_filepath) as tweets:
            for raw_tweet in tweets:
                # Rate-limit notices and other control lines lack "source".
                if "source" in raw_tweet:
                    # Trim everything from "source" onward, so the regex does
                    # not match nested "text" fields later in the record.
                    source_word_idx = raw_tweet.index(',"source"')
                    trimmed_tweet = raw_tweet[:source_word_idx]
                    matches = self.regex.search(trimmed_tweet)
                    if matches is not None:
                        timestamp = matches.group(1)
                        content = matches.group(3)
                        clean_tweet = self.clean_tweet_object.get_clean_tweet(content)
                        self.add_nodes(clean_tweet, timestamp)
                        # Emit the rolling average degree after every tweet.
                        output.write(str(self.calculate_avg_degree()) + "\n")
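# Usage sketch with hypothetical paths; the real input is expected to be one
# raw JSON tweet per line, carrying the "created_at" and "text" fields that
# the regex above picks out.
if __name__ == "__main__":
    avg_deg = AverageDegree("tweet_input/tweets.txt", "tweet_output/output.txt")
    avg_deg.process_tweets()  # appends one rolling average degree per tweet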