Example #1
def make_sqlite_db_json(output, input_file, fields):
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)

    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    json_col = JsonCollection(input_file)
    insert_list = []
    tp = TweetParser()

    for tweet in json_col.get_iterator():
        ret = tp.parse_columns_from_tweet(tweet, fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))

        # flush a full batch of 10,000 rows to keep memory bounded
        if len(insert_list) >= 10000:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []

    # flush any rows left over from the last partial batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()

    con.close()
    logger.info('Finished processing input: %s, output is: %s', input_file, output)
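A minimal usage sketch for the function above, with placeholder paths and a hypothetical field list; dots in field names become double underscores in the SQLite column names, and rows are committed in batches of 10,000:

# placeholder paths and fields, purely illustrative
fields = ['id_str', 'user.id', 'text']
make_sqlite_db_json('tweets.db', 'tweets.json', fields)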
Example #2
def write_files(collection):
    hashtags = ["#ivoted", "#myvote2016", "#myvote"]
    statements = ["i voted", "i will vote", "my vote", "vote for"]

    with open("/scratch/olympus/projects/hashtag_filtering/hashtags_{}.json".format(collection[0].split('/')[4]), 'w') as hashtag_file:
        with open("/scratch/olympus/projects/hashtag_filtering/statements_{}.json".format(collection[0].split('/')[4]), 'w') as statement_file:
            # both output files are opened for writing ('w'); the with
            # blocks close them automatically when this function returns
            for each_file in collection:
                hashtags_counter = 0
                statements_counter = 0
                json_col = JsonCollection(each_file, throw_error=False, verbose=1)
                for tweet in json_col.get_iterator():
                    if tweet and tweet.get("text"):
                        if any(hashtag in tweet["text"] for hashtag in hashtags):
                            hashtag_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                            hashtags_counter += 1
                        if any(statement in tweet["text"] for statement in statements):
                            statement_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                            statements_counter += 1
                    else:
                        logging.info("Skipping a tweet with no text field")
                logging.info("Extracted {} tweets to the statement file".format(statements_counter))
                logging.info("Extracted {} tweets to the hashtags file".format(hashtags_counter))
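write_files hands default=date_handler to json.dumps, but that helper is defined elsewhere in the repository. A minimal sketch of what such a handler typically does (an assumption, not the repository's actual code) is to serialize the datetime objects that json cannot encode on its own:

import datetime

def date_handler(obj):
    # json.dumps calls this for any object it cannot serialize natively;
    # turn datetimes into ISO-8601 strings and reject everything else
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    raise TypeError('{!r} is not JSON serializable'.format(obj))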
Example #3
	def test_strip_tweets_keeps_fields(self):
		tweet_parser = TweetParser()
		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
		self.maxDiff = None
		it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()
		def tweets_have_right_keys(iterator, fields):
			for tweet in iterator:
				keys = [key for key,value in tweet_parser.flatten_dict(tweet)]
				for elem in fields:
					if elem not in keys:
						return False
			return True		
		self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
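The assertion compares each field against key paths, which only works if flatten_dict yields (key_path, value) pairs for every leaf of the nested tweet dict. A self-contained sketch of that behavior, inferred from how the test uses it (dicts only, for illustration):

def flatten_dict_sketch(d, path=()):
    # yields ([...key path...], leaf_value) pairs, so
    # {'user': {'id': 1}} produces (['user', 'id'], 1)
    for key, value in d.items():
        if isinstance(value, dict):
            for pair in flatten_dict_sketch(value, path + (key,)):
                yield pair
        else:
            yield list(path + (key,)), value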
Example #4
	def test_strip_tweets_keeps_fields(self):
		tweet_parser = TweetParser()
		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=False)
		self.maxDiff = None
		it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()
		def tweets_have_right_keys(iterator, fields):
			for tweet in iterator:
				keys = [key for key,value in tweet_parser.flatten_dict(tweet)]
				for elem in fields:
					if elem not in keys:
						return False
			return True		
		self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
Example #5
    def test_clean_tweets_on_clean_data(self):
        self.setUp()
        clean_tweets(
            os.path.dirname(os.path.realpath(__file__)) + '/' +
            config['json']['valid'],
            os.path.dirname(os.path.abspath(__file__)) +
            '/../test/output.json',
            os.path.dirname(os.path.abspath(__file__)) +
            '/../test/output_err.json')

        col = JsonCollection(
            os.path.dirname(os.path.abspath(__file__)) +
            '/../test/output.json')

        with open(
                os.path.dirname(os.path.abspath(__file__)) +
                '/../test/output.json', 'r') as f:
            for line in f:
                try:
                    json.loads(line)
                except ValueError:
                    self.fail('output.json contains an invalid JSON line')

        excepted = False
        with open(
                os.path.dirname(os.path.abspath(__file__)) +
                '/../test/output_err.json', 'r') as f:
            for line in f:
                try:
                    json.loads(line)
                except ValueError:
                    excepted = True
        self.assertFalse(excepted)
        self.tearDown()
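The try/except around json.loads recurs in both cleaning tests; a small helper (hypothetical, not part of the repository) makes the intent explicit and catches only the error json.loads actually raises:

import json

def is_valid_json_line(line):
    # json.loads raises ValueError (JSONDecodeError is a subclass)
    # on malformed input; anything else should propagate
    try:
        json.loads(line)
        return True
    except ValueError:
        return False

With it, the loops above reduce to self.assertTrue(is_valid_json_line(line)) and a simple flag check.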
Example #6
    def test_clean_multiple_files(self):
        self.setUp()
        clean_tweets_multiple(
            os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/dirty*',
            os.path.dirname(os.path.abspath(__file__)) +
            '/../test/output.json',
            os.path.dirname(os.path.abspath(__file__)) +
            '/../test/output_err.json')

        col = JsonCollection(
            os.path.dirname(os.path.abspath(__file__)) +
            '/../test/output.json')

        with open(
                os.path.dirname(os.path.abspath(__file__)) +
                '/../test/output.json', 'r') as f:
            for line in f:
                try:
                    json.loads(line)
                except ValueError:
                    self.fail('output.json contains an invalid JSON line')

        excepted = False
        with open(
                os.path.dirname(os.path.abspath(__file__)) +
                '/../test/output_err.json', 'r') as f:
            for line in f:
                try:
                    json.loads(line)
                except ValueError:
                    excepted = True
        self.assertTrue(excepted)
        self.tearDown()
Example #7
	def test_json_collection_custom_filter_filters(self):
		collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
		full_collection_len = len(list(collectionone.get_iterator()))
		def is_tweet_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return True
			else:
				return False
		num_retweets = len(list(collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator()))

		collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
		def is_not_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return False
			else:
				return True
		num_non_retweets = len(list(collectiontwo.set_custom_filter(is_not_a_retweet).get_iterator()))

		# the numbers of retweets and non-retweets should add up to the whole collection
		self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
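Since set_custom_filter only needs a callable that returns a boolean, the two predicates above collapse to one-liners; a sketch assuming the same filter semantics:

def is_tweet_a_retweet(tweet):
    # dict.get returns None when the key is absent, so bool() covers
    # both the missing-key and falsy-value cases in one expression
    return bool(tweet.get('retweeted'))

def is_not_a_retweet(tweet):
    return not tweet.get('retweeted')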
Example #8
	def test_dump_to_csv_orders_and_encodes_properly(self):
		if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'):
			os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv')

		output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv'
		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid-single'])
		collection.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1', 'source', 'user.id', 'timestamp.$date', 'text'])
		with open(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv', 'rb') as filehandle:
			count = 0
			for line in unicodecsv.reader(filehandle):
				if count != 0:
					val_count = 0
					for csv_row_value in line:
						everything_in_order = True
						if val_count == 0:
							self.assertEqual(csv_row_value, '661275583813431296')
						elif val_count == 1:
							loaded_dict = json_util.loads(csv_row_value)
							if not (all(k in loaded_dict for k in ['text', 'indices']) and loaded_dict['text'] == 'jadehelm' and loaded_dict['indices'] == [74, 83]):
								everything_in_order = False
						elif val_count == 2:
							loaded_dict = json_util.loads(csv_row_value)
							if not (all(k in loaded_dict for k in ['text', 'indices']) and loaded_dict['text'] == 'newworldorder' and loaded_dict['indices'] == [84, 98]):
								everything_in_order = False
						elif val_count == 3:
							self.assertEqual(csv_row_value, '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>')
						elif val_count == 4:
							self.assertEqual(csv_row_value, '379851447')
						elif val_count == 5:
							self.assertEqual(csv_row_value, '1446495359000')
						elif val_count == 6:
							self.assertEqual(csv_row_value, 'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat')
						self.assertTrue(everything_in_order)
						val_count += 1
				else:
					count += 1
		
		if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'):
			os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv')
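Tracking columns with a running val_count index is fragile; because dump_to_csv writes a header row (which the tests skip via the count check), the same assertions can be keyed by column name instead. A sketch assuming the header cells match the dotted field names passed to dump_to_csv:

import csv

with open(output_path, 'r') as filehandle:
    for row in csv.DictReader(filehandle):
        # rows come back as dicts keyed by header name, so column
        # order no longer matters and a missing column fails loudly
        assert row['id_str'] == '661275583813431296'
        assert row['user.id'] == '379851447'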
Example #9
	def test_json_collection_custom_filter_filters(self):
		collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=False)
		full_collection_len = len(list(collectionone.get_iterator()))
		def is_tweet_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return True
			else:
				return False
		num_retweets = len(list(collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator()))

		collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=False)
		def is_not_a_retweet(tweet):
			if 'retweeted' in tweet and tweet['retweeted']:
				return False
			else:
				return True
		num_non_retweets = len(list(collectiontwo.set_custom_filter(is_not_a_retweet).get_iterator()))

		# the numbers of retweets and non-retweets should add up to the whole collection
		self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
Example #10
	def test_iterator_returns_tweets(self):
		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
		self.assertTrue(len(list(collection.get_iterator())) > 0)
Example #11
    def test_dump_to_csv_orders_and_encodes_properly(self):
        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv')

        output_path = os.path.dirname(
            os.path.realpath(__file__)) + '/' + 'data/output.csv'
        collection = JsonCollection(
            os.path.dirname(os.path.realpath(__file__)) + '/' +
            config['json']['valid-single'])
        collection.dump_to_csv(output_path, [
            'id_str', 'entities.hashtags.0.text', 'entities.hashtags.1.text',
            'source', 'user.id', 'timestamp.$date', 'text'
        ])
        with open(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv', 'r') as filehandle:
            count = 0
            for line in csv.reader(filehandle):
                if count != 0:
                    val_count = 0
                    for csv_row_value in line:
                        everything_in_order = True
                        if val_count == 0:
                            self.assertEqual(csv_row_value,
                                             '661275583813431296')
                        elif val_count == 1:
                            if csv_row_value != 'jadehelm':
                                everything_in_order = False
                        elif val_count == 2:
                            if csv_row_value != 'newworldorder':
                                everything_in_order = False
                        elif val_count == 3:
                            self.assertEqual(
                                csv_row_value,
                                '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>'
                            )
                        elif val_count == 4:
                            self.assertEqual(csv_row_value, '379851447')
                        elif val_count == 5:
                            self.assertEqual(csv_row_value,
                                             '2015-11-02 20:15:59+00:00')
                        elif val_count == 6:
                            self.assertEqual(
                                csv_row_value,
                                'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat'
                            )
                        self.assertTrue(everything_in_order)
                        val_count += 1
                else:
                    count += 1

        if os.path.exists(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv'):
            os.remove(
                os.path.dirname(os.path.abspath(__file__)) +
                '/data/output.csv')
Example #12
	def test_iterator_returns_tweets(self):
		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=False)
		self.assertTrue(len(list(collection.get_iterator())) > 0)