Python TweetCleaner Beispiele

Programmiersprache: Python

Namespace / Paketname: tweets_cleaned

Klasse / Typ: TweetCleaner

Beispiele auf hotexamples.com: 3

Python TweetCleaner - 3 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für tweets_cleaned.TweetCleaner, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

process_tweets_file(2)

check_unicode_chars(1)

clean_tweet(1)

read_created_at_field(1)

read_json_data(1)

read_text_field(1)

Beispiel #1

Datei anzeigen

Datei: test_tweets_cleaned.py Projekt: marisavas/DataEngineering

	def test_input_file_do_not_exist(self):
		# Processing a non-existent input file must raise IOError carrying
		# the "does not exist" message for that file name.
		filename = "foo.txt"
		cleaner1 = TweetCleaner(filename,self.test_output_file)
		with self.assertRaises(IOError) as context:
			cleaner1.process_tweets_file()
		msg = "Input file %s does not exist" % filename
		# Compare against the exception's text: an exception object is not a
		# searchable container (the bare `in` test raises TypeError on Python 3).
		self.assertTrue(msg in str(context.exception))
		print("Check program execution with a input file that does no exist [OK]")

Beispiel #2

Datei anzeigen

Datei: test_tweets_cleaned.py Projekt: marisavas/DataEngineering

class TestTweetCleaner(unittest.TestCase):
    """Unit tests for TweetCleaner: text cleaning, JSON field access and file processing.

    NOTE(review): the fixtures appear to rely on Python 2 byte-string literals
    (backslash-u and backslash-slash sequences staying literal, and
    ``str.decode`` in ``create_test_file``) -- confirm before running this
    suite on Python 3.
    """

    # Run before each test case
    def setUp(self):
        # Only drop a trailing "src" directory component. Replacing every
        # "src" substring would corrupt paths such as "/usr/src/project".
        # Presumably the suite is started either from the project root or
        # from its "src" folder -- TODO confirm.
        working_dir = os.getcwd()
        if os.path.basename(working_dir) == "src":
            working_dir = os.path.dirname(working_dir)
        self.current_path = working_dir
        self.test_input_file = self.current_path + "/tweet_input/test.txt"
        self.test_output_file = self.current_path + "/tweet_output/test_ouput.txt"
        self.cleaner = TweetCleaner(self.test_input_file, self.test_output_file)

    # Run after each test case
    def tearDown(self):
        del self.cleaner
        self.delete_test_files()

    def test_constructor_state(self):
        # Check if class was initialized correctly
        self.assertEqual(self.cleaner.input_filename, self.test_input_file)
        self.assertEqual(self.cleaner.output_filename, self.test_output_file)
        self.assertEqual(self.cleaner.num_tweets_w_unicode, 0)
        print("Check state of the attributes in the class constructor [OK]")

    def assertWarns(self, warning, msg, callable, *args, **kwds):
        # Call `callable(*args, **kwds)` and assert that it emitted at least
        # one warning whose category is `warning` (or a subclass of it) and
        # that the text of the last recorded warning equals `msg`.
        with warnings.catch_warnings(record=True) as warning_list:
            # Record every warning, even ones already reported once.
            warnings.simplefilter('always')
            callable(*args, **kwds)
            self.assertTrue(
                any(issubclass(item.category, warning) for item in warning_list)
            )
            self.assertEqual(msg, str(warning_list[-1].message))

    def create_test_file(self, data):
        # Write one JSON document per line to the test input file.
        with open(self.test_input_file, "w") as f:
            for line in data:
                f.write(json.dumps(line).decode('unicode-escape').encode('utf8'))
                f.write('\n')

    def delete_test_files(self):
        # Best-effort removal of the test files. Each file is removed
        # independently so that a missing input file does not leave the
        # output file behind.
        for path in (self.test_input_file, self.test_output_file):
            try:
                os.remove(path)
            except OSError:
                pass

    def _count_lines(self, path):
        # Return the number of lines in `path`, closing the file afterwards.
        with open(path) as f:
            return sum(1 for _ in f)

    def test_no_unicode_char(self):
        # Create string with no unicode chars
        text = "Spark Summit East this week! #Spark #Apache"
        cleaned_text = "Spark Summit East this week! #Spark #Apache"
        self.assertEqual(self.cleaner.clean_tweet(text), cleaned_text)
        print("Check clean_tweet function with string with no unicode chars [OK]")

    def test_has_unicode_char(self):
        # Create string with unicode chars
        text = "I'm at Terminal de Integra\u00e7\u00e3o do Varadouro in Jo\u00e3o Pessoa, PB https:\/\/t.co\/HOl34REL1a"
        cleaned_text = "I'm at Terminal de Integrao do Varadouro in Joo Pessoa, PB https://t.co/HOl34REL1a"
        self.assertEqual(self.cleaner.clean_tweet(text), cleaned_text)
        print("Check clean_tweet function with unicode string [OK]")

    def test_empty_string(self):
        # Create empty string
        text = ""
        # assertEqual, not assertFalse: the second positional argument of
        # assertFalse is the failure message, not an expected value.
        self.assertEqual(self.cleaner.clean_tweet(text), "")
        print("Check clean_tweet function with empty string [OK]")

    def test_has_escape_char(self):
        # Create string with escape chars
        text = '@KayKay121 dragged me to the library. Now I have to be\n\r productive https:\/\/t.co\/HjZR3d5QaQ\n'
        cleaned_text = '@KayKay121 dragged me to the library. Now I have to be productive https://t.co/HjZR3d5QaQ'
        self.assertEqual(self.cleaner.clean_tweet(text), cleaned_text)
        print("Check clean_tweet function with string with escape chars [OK]")

    def test_unicode_escape_char(self):
        # Create string with unicode and escape chars
        text = '@KayKay121 dragged me to the library. Now I have to be\n\r productive \ud83d\udc94 https:\/\/t.co\/HjZR3d5QaQ\n'
        cleaned_text = '@KayKay121 dragged me to the library. Now I have to be productive https://t.co/HjZR3d5QaQ'
        self.assertEqual(self.cleaner.clean_tweet(text), cleaned_text)
        print("Check clean_tweet function with string with unicode and escape chars [OK]")

    def test_input_file_do_not_exist(self):
        filename = "foo.txt"
        cleaner1 = TweetCleaner(filename, self.test_output_file)
        with self.assertRaises(IOError) as context:
            cleaner1.process_tweets_file()
        msg = "Input file %s does not exist" % filename
        # Search the exception's text; the exception object itself is not a
        # searchable container (TypeError on Python 3).
        self.assertTrue(msg in str(context.exception))
        print("Check program execution with a input file that does no exist [OK]")

    def test_incorrect_json_input_file(self):
        line = "Spark Summit East this week! #Spark #Apache"
        with self.assertRaises(ValueError) as context:
            self.cleaner.read_json_data(line)
        self.assertTrue("Error: No JSON object could be decoded" in str(context.exception))
        print("Read a JSON object with incorrect format [OK]")

    def test_no_field_text(self):
        # Create test data
        line = '{"created_at":"Thu Oct 29 17:51:01 +0000 2015","id":659789756637822976,"id_str":"659789756637822976"}'
        data = self.cleaner.read_json_data(line)
        self.assertEqual(self.cleaner.read_text_field(data), "")
        print("Read a JSON object without text field [OK]")

    def test_no_field_timestamp(self):
        line = '{"text":"Spark Summit East this week! #Spark #Apache","id":659789756637822976,"id_str":"659789756637822976"}'
        data = self.cleaner.read_json_data(line)
        self.assertEqual(self.cleaner.read_created_at_field(data), None)
        print("Read a JSON object without timestamp field [OK]")

    def test_correct_text_field(self):
        line = '{"text":"Spark Summit East this week! #Spark #Apache","created_at":"Thu Oct 29 17:51:01 +0000 2015","id":659789756637822976,"id_str":"659789756637822976"}'
        data = self.cleaner.read_json_data(line)
        self.assertEqual(self.cleaner.read_text_field(data), "Spark Summit East this week! #Spark #Apache")
        print("Read a JSON object with non empty text field [OK]")

    def test_correct_timestamp_field(self):
        line = '{"text":"Spark Summit East this week! #Spark #Apache","created_at":"Thu Oct 29 17:51:01 +0000 2015","id":659789756637822976,"id_str":"659789756637822976"}'
        data = self.cleaner.read_json_data(line)
        self.assertEqual(self.cleaner.read_created_at_field(data), "Thu Oct 29 17:51:01 +0000 2015")
        print("Read a JSON object with non empty timestamp format [OK]")

    def test_tweet_has_unicode(self):
        text = u"@KayKay121 dragged me to the library. Now I have to be productive \ud83d\udc94 https:\/\/t.co\/HjZR3d5QaQ"
        self.assertTrue(self.cleaner.check_unicode_chars(text))
        print("Check if a tweet has unicode chars [OK]")

    def test_tweet_is_not_unicode(self):
        # Create string
        # NOTE(review): the test name and the printed message say "no unicode",
        # yet the assertion expects True -- confirm the intended expectation
        # against check_unicode_chars.
        text = "@KayKay121 dragged me to the library. Now I have to be productive \ud83d\udc94 https:\/\/t.co\/HjZR3d5QaQ"
        self.assertTrue(self.cleaner.check_unicode_chars(text))
        print("Check if a tweet has no unicode chars [OK]")

    def test_empty_string_has_unicode(self):
        self.assertFalse(self.cleaner.check_unicode_chars(""))
        print("Check if a empty tweet has unicode chars [OK]")

    def test_clean_tweet_sucessfully(self):
        text = "I'm at Terminal de Integra\u00e7\u00e3o do Varadouro in Jo\u00e3o Pessoa\n\r, PB https:\/\/t.co\/HOl34REL1a"
        processed_text = "I'm at Terminal de Integrao do Varadouro in Joo Pessoa , PB https://t.co/HOl34REL1a"
        self.assertEqual(self.cleaner.clean_tweet(text), processed_text)
        self.assertEqual(self.cleaner.clean_tweet("This is a test"), "This is a test")
        print("Clean_tweet function with correct execution [OK]")

    def test_clean_empty_tweet(self):
        self.assertEqual(self.cleaner.clean_tweet(""), "")
        print("Clean_tweet function with empty text [OK]")

    def test_input_file_blank(self):
        # Create empty file
        open(self.test_input_file, 'w').close()
        self.cleaner.process_tweets_file()

        # Check empty output file
        self.assertEqual(self._count_lines(self.test_output_file), 0)
        print("Open empty file and check its output [OK]")

    def test_correct_output(self):
        # Create test data
        data = [
            {"created_at":"Thu Oct 29 17:51:01 +0000 2015","id":659789756637822976,"id_str":"659789756637822976","text":"Spark Summit East this week! #Spark #Apache"},
            {"created_at":"Thu Oct 29 18:10:49 +0000 2015","id":659794531844509700,"id_str":"659794531844509700","text":"I'm at Terminal de Integra\u00e7\u00e3o do Varadouro in Jo\u00e3o Pessoa, PB https:\/\/t.co\/HOl34REL1a"},
            {"created_at":"Thu Oct 29 17:51:50 +0000 2015","id":659789756637822976,"id_str":"659789756637822976","text":"@KayKay121 dragged me to the library. Now I have to be productive \ud83d\udc94 https:\/\/t.co\/HjZR3d5QaQ"},
        ]

        self.create_test_file(data)
        self.cleaner.process_tweets_file()

        # Expected content of the output file
        lines = "Spark Summit East this week! #Spark #Apache (timestamp: Thu Oct 29 17:51:01 +0000 2015)\nI'm at Terminal de Integrao do Varadouro in Joo Pessoa, PB https://t.co/HOl34REL1a (timestamp: Thu Oct 29 18:10:49 +0000 2015)\n@KayKay121 dragged me to the library. Now I have to be productive https://t.co/HjZR3d5QaQ (timestamp: Thu Oct 29 17:51:50 +0000 2015)\n\n2 tweets contained unicode"

        # Compare the produced file directly with the expected text. No
        # reference file is written, so nothing is left on disk when the
        # assertion fails; tearDown removes the input and output files.
        with open(self.test_output_file, 'r') as f:
            produced = f.read()
        self.assertEqual(produced, lines)
        print("Check correct execution of a tweet file [OK]")

    def test_process_tweets_file_empty_text_field(self):
        # First tweet has no "text" field
        data = [
            {"created_at":"Thu Oct 29 17:51:01 +0000 2015","id":659789756637822976,"id_str":"659789756637822976"},
            {"created_at":"Thu Oct 29 18:10:49 +0000 2015","id":659794531844509700,"id_str":"659794531844509700","text":"I'm at Terminal de Integra\u00e7\u00e3o do Varadouro in Jo\u00e3o Pessoa, PB https:\/\/t.co\/HOl34REL1a"},
            {"created_at":"Thu Oct 29 17:51:50 +0000 2015","id":659789756637822976,"id_str":"659789756637822976","text":"@KayKay121 dragged me to the library. Now I have to be productive \ud83d\udc94 https:\/\/t.co\/HjZR3d5QaQ"},
        ]

        self.create_test_file(data)
        self.cleaner.process_tweets_file()
        num_lines = self._count_lines(self.test_output_file)
        self.assertEqual(num_lines, 5)
        print("Check execution on tweet with no text field [OK]")

    def test_process_tweets_file_empty_timestamp_field(self):
        # Second tweet has no "created_at" field
        data = [
            {"created_at":"Thu Oct 29 17:51:01 +0000 2015","id":659789756637822976,"id_str":"659789756637822976","text":"Spark Summit East this week! #Spark #Apache"},
            {"id":659794531844509700,"id_str":"659794531844509700","text":"I'm at Terminal de Integra\u00e7\u00e3o do Varadouro in Jo\u00e3o Pessoa, PB https:\/\/t.co\/HOl34REL1a"},
            {"created_at":"Thu Oct 29 17:51:50 +0000 2015","id":659789756637822976,"id_str":"659789756637822976","text":"@KayKay121 dragged me to the library. Now I have to be productive \ud83d\udc94 https:\/\/t.co\/HjZR3d5QaQ"},
        ]

        self.create_test_file(data)
        self.cleaner.process_tweets_file()
        num_lines = self._count_lines(self.test_output_file)
        self.assertEqual(num_lines, 5)
        print("Check execution on tweet with no timestamp field [OK]")

Beispiel #3

Datei anzeigen

Datei: test_tweets_cleaned.py Projekt: marisavas/DataEngineering

	def setUp(self):
		# Build the fixture paths and a fresh TweetCleaner for each test.
		# Only drop a trailing "src" directory component: replacing every
		# "src" substring would corrupt paths such as "/usr/src/project".
		# Presumably the suite is started from the project root or from its
		# "src" folder -- TODO confirm.
		working_dir = os.getcwd()
		if os.path.basename(working_dir) == "src":
			working_dir = os.path.dirname(working_dir)
		self.current_path = working_dir
		self.test_input_file = self.current_path + "/tweet_input/test.txt"
		self.test_output_file = self.current_path + "/tweet_output/test_ouput.txt"
		self.cleaner = TweetCleaner(self.test_input_file,self.test_output_file)