import datetime
import re
import unittest

# CleanTweet is assumed to be importable from the project under test; its
# module path is not shown in the original snippets.


class TestCase(unittest.TestCase):
	def setUp(self):
		self.clean_tweet = CleanTweet()
		self.avg_deg = AverageDegree()
		self.base_dir = "/home/sesha/Interests/insightDataFella/coding-challenge/"

	def test_if_unicode_removed(self):
		tweet = "Spark \u009FSummit East this week! #Spark #Apache"
		clean_tweet = "Spark Summit East this week! #Spark #Apache"
		self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

	def test_if_unicode_spared(self):
		# Spare unicode between 0000 and 007F
		tweet = "Spark \u003cSummit East this week! #Spark #Apache"
		clean_tweet = "Spark <Summit East this week! #Spark #Apache"
		self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

	def test_if_double_backslash_handled(self):
		tweet = "This concert \\\\m/. Am enjoying!!"
		clean_tweet = "This concert \\m/. Am enjoying!!"
		self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

	def test_if_escape_url_handled(self):
		tweet = r"PB https:\/\/t.co\/HOl34REL1a hello"
		clean_tweet = "PB https://t.co/HOl34REL1a hello"
		self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

	def test_if_escape_sequences_handled(self):
		tweet = "Should \n\nclean\t\rall of \"this\""
		clean_tweet = 'Should   clean  all of "this"'
		self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

	def test_if_empty(self):
		tweet = "\u3084\u3070\u3044\u7709\u6d88\u3048\u305f\uff01\u30d6\u30ea\u30fc\u30c1\u6642\u9593\u30df\u30b9\u3063\u305f\u308f\u2026"
		clean_tweet = ""
		self.assertEqual(clean_tweet, self.clean_tweet.get_clean_tweet(tweet))

	def test_unicode_tweet_count(self):
		# Should have mocked. But did not.
		cleaned_tweet_list = [
			"Spark Summit East this week! #Spark #Apache (timestamp: Thu Oct 29 17:51:01 +0000 2015)",
			"Just saw a great post on Insight Data Engineering #Apache #Hadoop #Storm (timestamp: Thu Oct 29 17:51:30 +0000 2015)",
			"Doing great work #Apache (timestamp: Thu Oct 29 17:51:55 +0000 2015)",
			"Excellent post on #Flink and #Spark (timestamp: Thu Oct 29 17:51:56 +0000 2015)",
			"New and improved #HBase connector for #Spark (timestamp: Thu Oct 29 17:51:59 +0000 2015)",
			"New 2.7.1 version update for #Hadoop #Apache (timestamp: Thu Oct 29 17:52:05 +0000 2015)",
		]
		self.clean_tweet = CleanTweet(self.base_dir + "tweet_input/sample_tweets.txt", self.base_dir + "tweet_output/test_output.txt")
		cleaned_tweets = self.clean_tweet.process_tweets()
		self.assertEqual(set(cleaned_tweet_list), set(cleaned_tweets))
		self.assertEqual(4, self.clean_tweet.get_count_of_unicode_tweets())

	def test_average_degree_changes(self):
		self.avg_deg = AverageDegree(self.base_dir + "tweet_input/half_sample_tweets.txt", self.base_dir + "tweet_output/test_output.txt")
		self.avg_deg.process_tweets()
		self.assertEqual(2.0, self.avg_deg.calculate_avg_degree())
		self.avg_deg = AverageDegree(self.base_dir + "tweet_input/sample_tweets.txt", self.base_dir + "tweet_output/test_output.txt")
		self.avg_deg.process_tweets()
		self.assertEqual(1.67, self.avg_deg.calculate_avg_degree())

	def tearDown(self):
		self.clean_tweet = CleanTweet()
		self.avg_deg = AverageDegree()
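
# The tests above pin down the cleaning rules precisely, so here is a minimal
# sketch of a get_clean_tweet consistent with them. It is NOT the repository's
# actual CleanTweet (whose source is not shown here); CleanTweetSketch is a
# hypothetical stand-in for illustration only.
class CleanTweetSketch(object):
    def get_clean_tweet(self, tweet):
        tweet = tweet.replace("\\/", "/")        # unescape URL slashes
        tweet = tweet.replace("\\\\", "\\")      # collapse double backslashes
        tweet = re.sub(r"[\n\t\r]", " ", tweet)  # whitespace escapes -> spaces
        # Drop anything outside ASCII (U+0000 - U+007F), per the unicode tests.
        return "".join(ch for ch in tweet if ord(ch) <= 0x7F)
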
class AverageDegree(object):
    def __init__(self, inputfile="dummyInput", outputfile="dummyOutput"):
        # Group 1 captures the "created_at" timestamp and group 3 the tweet
        # text from one line of raw tweet JSON. Raw strings avoid the invalid
        # escape-sequence warnings that \s and \+ trigger in plain strings.
        self.regex = re.compile(
            r'"((Sun|Mon|Tue|Thu|Wed|Fri|Sat) [A-Za-z0-9\s:\+]*)".*"text":"(.*)"'
        )
        self.hashtag_regex = re.compile(r"#(\w+)")
        self.format_string = "%a %b %d %H:%M:%S +0000 %Y"
        self.time_key = {}        # timestamp -> hashtags seen at that time
        self.adjacency_list = {}  # hashtag -> set of co-occurring hashtags
        self.input_filepath = inputfile
        self.output_filepath = outputfile
        self.clean_tweet_object = CleanTweet()

    def remove_older_hashtags(self, timestamp):
        one_minute = datetime.timedelta(minutes=1)
        current_time = datetime.datetime.strptime(timestamp, self.format_string)
        # Materialise the expired keys before popping: mutating the dict while
        # a lazy filter iterates over its key view raises RuntimeError in
        # Python 3.
        ts_to_remove = [
            old_ts for old_ts in list(self.time_key)
            if current_time - datetime.datetime.strptime(old_ts, self.format_string) > one_minute
        ]
        hashtags = []
        for ts in ts_to_remove:
            hashtags += self.time_key.pop(ts)
        # Drop the evicted hashtags from each other's neighbour sets.
        unique_hashtags = set(hashtags)
        for hashtag in unique_hashtags:
            self.adjacency_list[hashtag] -= unique_hashtags

    def add_nodes(self, tweet, timestamp):
        # Normalise the tweet's hashtags and record an edge between every
        # pair of distinct hashtags that co-occur in it.
        raw_hashtags = self.hashtag_regex.findall(tweet)
        unique_hashtags = {tag.lower().strip() for tag in raw_hashtags}

        self.remove_older_hashtags(timestamp)
        if timestamp not in self.time_key:
            self.time_key[timestamp] = []

        # A lone hashtag adds no edges, so only multi-hashtag tweets count.
        if len(unique_hashtags) > 1:
            self.time_key[timestamp] += list(unique_hashtags)
            for hashtag in unique_hashtags:
                if hashtag not in self.adjacency_list:
                    self.adjacency_list[hashtag] = set()
                # Link this hashtag to every other hashtag in the tweet.
                self.adjacency_list[hashtag].update(unique_hashtags - {hashtag})

    def calculate_avg_degree(self):
        avg_degree = 0
        no_of_nodes = len(self.adjacency_list)
        if no_of_nodes != 0:
            # Average degree = total neighbour-set sizes / number of nodes.
            for links in self.adjacency_list.values():
                avg_degree += len(links)
            avg_degree = round(avg_degree / no_of_nodes, 2)
        return avg_degree

    def process_tweets(self):
        # Stream the input line by line, appending the rolling average degree
        # to the output file after every line ("a+" appends to any existing
        # output).
        with open(self.output_filepath, "a+") as output, \
                open(self.input_filepath) as tweets:
            for raw_tweet in tweets:
                # Guard on the exact substring we index on below; checking a
                # bare "source" could raise ValueError at index().
                if ',"source"' in raw_tweet:
                    # Trim everything from the "source" field onward so the
                    # greedy .* in the regex cannot overrun the tweet text.
                    source_word_idx = raw_tweet.index(',"source"')
                    trimmed_tweet = raw_tweet[0:source_word_idx]
                    matches = self.regex.search(trimmed_tweet)

                    if matches is not None:
                        timestamp = matches.group(1)
                        content = matches.group(3)

                        clean_tweet = self.clean_tweet_object.get_clean_tweet(content)
                        self.add_nodes(clean_tweet, timestamp)

                output.write(str(self.calculate_avg_degree()))
                output.write("\n")
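
# A minimal usage sketch (not part of the original snippets): the file paths
# below are hypothetical placeholders mirroring the tweet_input/tweet_output
# layout used in the tests above. As a quick sanity check on the math, the toy
# graph {"spark": {"apache", "hadoop"}, "apache": {"spark"}, "hadoop":
# {"spark"}} has degrees 2, 1, 1, so calculate_avg_degree() returns
# round(4 / 3, 2) == 1.33.
if __name__ == "__main__":
    avg_deg = AverageDegree("tweet_input/sample_tweets.txt",
                            "tweet_output/avg_degree.txt")
    avg_deg.process_tweets()  # appends one rolling average per input line
    print(avg_deg.calculate_avg_degree())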