Esempio n. 1
0
def main(argv):
    """Compute a rolling average hashtag-graph degree for a tweet stream.

    Reads a JSON-lines file of tweets, maintains a hashtag co-occurrence
    graph limited to a 60-second rolling window, and writes the average
    vertex degree after each tweet to the output file, one value per line.

    Args:
        argv: two-element sequence (input_path, output_path).
    """
    inp_file, out_file = argv

    graph = {}

    # Context managers guarantee both handles are closed even if an
    # unexpected error escapes the loop.
    with open(inp_file) as input_file, open(out_file, 'w') as out_put:
        for line in input_file:
            line_json = json.loads(line.rstrip())
            try:
                created_at = dtparser.parse(line_json["created_at"])
                clean, cleaned_line = _clean_string(line_json['text'])
                hash_tags = get_hashtag(cleaned_line)

                # An edge requires at least two hashtags in the same tweet.
                if hash_tags and len(hash_tags) > 1:
                    graph = update_or_build_graph(graph, hash_tags, created_at)

                # Drop edges created more than 60 seconds before this tweet.
                graph = shuffle_graph(graph, created_at)
                avg_degree = calculate_avg_degree(graph)
                out_put.write(str(avg_degree) + "\n")
            except KeyError:
                # Rate-limit records such as
                # {"limit":{"track":19,"timestamp_ms":"1446218985758"}}
                # carry no "created_at"/"text" keys; skip them. Narrowed
                # from a blanket `except Exception: pass`, which hid real
                # bugs as well.
                pass
Esempio n. 2
0
    def test_tweet_cleaning_and_formatting(self):
        """Check that _clean_string both flags unicode and reformats the tweet."""
        has_no_unicode, formatted_tweet = _clean_string(self.test_tweet_text)

        self.assertEqual(
            has_no_unicode, False, "incorrect testing of unicode and escape character presence")
        self.assertEqual(formatted_tweet, self.correct_unicode_tweet,
                         "incorrect escaping and formatting of tweet")
def main(argv):
    """Clean a JSON-lines tweet file and write the formatted tweets.

    Each tweet's text is cleaned, written with its timestamp, and a
    trailing summary reports how many tweets contained unicode.

    Args:
        argv: two-element sequence (input_path, output_path).
    """
    num_unicode = 0
    inp_file, out_file = argv

    # Context managers guarantee both handles are closed even if an
    # unexpected error escapes the loop.
    with open(inp_file) as input_file, open(out_file, 'w') as out_put:
        for line in input_file:
            line_json = json.loads(line.rstrip())
            try:
                clean, cleaned_line = _clean_string(line_json["text"])
                if not clean:
                    num_unicode += 1
                out_put.write(
                    cleaned_line +
                    " (timestamp: {0})".format(line_json["created_at"]) + "\n")
            except KeyError:
                # Rate-limit records such as
                # {"limit":{"track":19,"timestamp_ms":"1446218985758"}}
                # carry no "text"/"created_at" keys; skip them. Narrowed
                # from a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.
                pass

        out_put.write("\n {0} tweets contained unicode.".format(num_unicode))
def main(argv):
    """Compute a rolling average hashtag-graph degree for a tweet stream.

    Reads a JSON-lines file of tweets, maintains a hashtag co-occurrence
    graph limited to a 60-second rolling window, and writes the average
    vertex degree after each tweet to the output file, one value per line.

    Args:
        argv: two-element sequence (input_path, output_path).
    """
    inp_file, out_file = argv

    graph = {}

    # Context managers guarantee both handles are closed even if an
    # unexpected error escapes the loop.
    with open(inp_file) as input_file, open(out_file, 'w') as out_put:
        for line in input_file:
            line_json = json.loads(line.rstrip())
            try:
                created_at = dtparser.parse(line_json["created_at"])
                clean, cleaned_line = _clean_string(line_json['text'])
                hash_tags = get_hashtag(cleaned_line)

                # An edge requires at least two hashtags in the same tweet.
                if hash_tags and len(hash_tags) > 1:
                    graph = update_or_build_graph(graph, hash_tags, created_at)

                # Drop edges created more than 60 seconds before this tweet.
                graph = shuffle_graph(graph, created_at)
                avg_degree = calculate_avg_degree(graph)
                out_put.write(str(avg_degree) + "\n")
            except KeyError:
                # Rate-limit records such as
                # {"limit":{"track":19,"timestamp_ms":"1446218985758"}}
                # carry no "created_at"/"text" keys; skip them. Narrowed
                # from a blanket `except Exception: pass`, which hid real
                # bugs as well.
                pass
Esempio n. 5
0
    def test_tweet_cleaning_and_formatting(self):
        """Ensure _clean_string reports unicode presence and formats correctly."""
        unicode_free, rendered = _clean_string(self.test_tweet_text)

        self.assertEqual(
            unicode_free, False,
            "incorrect testing of unicode and escape character presence")
        self.assertEqual(rendered, self.correct_unicode_tweet,
                         "incorrect escaping and formatting of tweet")
Esempio n. 6
0
    def test_rolling_avg_degree(self):
        """Feed the fixture tweets through the graph pipeline and check
        that the sequence of rolling average degrees matches the known
        expected values."""
        for tweet in self.tweet_list:
            timestamp = dtparser.parse(tweet["created_at"])
            _, cleaned_text = _clean_string(tweet['text'])
            tags = get_hashtag(cleaned_text)

            # Only tweets with two or more hashtags add edges.
            if tags and len(tags) > 1:
                self.graph = update_or_build_graph(self.graph, tags, timestamp)

            # Expire edges outside the rolling window, then record the degree.
            self.graph = shuffle_graph(self.graph, timestamp)
            self.avg_degree_list.append(calculate_avg_degree(self.graph))

        self.assertEqual(self.avg_degree_list, [1.0, 2.0, 2.0, 2.0, 1.67],
                         'incorrect average degree')
Esempio n. 7
0
    def test_rolling_avg_degree(self):
        """Verify the rolling average degree sequence over the fixture tweets."""
        for record in self.tweet_list:
            when = dtparser.parse(record["created_at"])
            _, text = _clean_string(record['text'])
            found_tags = get_hashtag(text)

            # Edges only appear when a tweet carries multiple hashtags.
            if found_tags and len(found_tags) > 1:
                self.graph = update_or_build_graph(self.graph, found_tags, when)

            # Purge stale edges before sampling the average degree.
            self.graph = shuffle_graph(self.graph, when)
            self.avg_degree_list.append(calculate_avg_degree(self.graph))

        self.assertEqual(self.avg_degree_list, [1.0, 2.0, 2.0, 2.0, 1.67],
                         'incorrect average degree')
Esempio n. 8
0
def main(argv):
    """Clean a JSON-lines tweet file and write the formatted tweets.

    Each tweet's text is cleaned, written with its timestamp, and a
    trailing summary reports how many tweets contained unicode.

    Args:
        argv: two-element sequence (input_path, output_path).
    """
    num_unicode = 0
    inp_file, out_file = argv

    # Context managers guarantee both handles are closed even if an
    # unexpected error escapes the loop.
    with open(inp_file) as input_file, open(out_file, 'w') as out_put:
        for line in input_file:
            line_json = json.loads(line.rstrip())
            try:
                clean, cleaned_line = _clean_string(line_json["text"])
                if not clean:
                    num_unicode += 1
                out_put.write(cleaned_line + " (timestamp: {0})".format(line_json["created_at"]) + "\n")
            except KeyError:
                # Rate-limit records such as
                # {"limit":{"track":19,"timestamp_ms":"1446218985758"}}
                # carry no "text"/"created_at" keys; skip them. Narrowed
                # from a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.
                pass

        out_put.write("\n {0} tweets contained unicode.".format(num_unicode))