def test_dynamodb_storage(self):
    """Check how DynamoDBStorage.dynamo_item encodes a mixed-type payload.

    The test builds an Item whose payload holds strings, a dict, a list and
    a float, converts it with ``dynamo_item`` and asserts that:

    * the partition key attribute is the ``{category}-{url}`` string,
    * string values are stored under ``'S'`` unchanged,
    * dict and list values are stored under ``'S'`` as JSON that decodes
      back to the original entries,
    * every other value is stored under ``'N'`` as ``str(value)``.
    """
    config = {
        "type": "DynamoDBStorage",
        "dynamodb_table_name": "test_table",
        "partition_key": "my_hash_key",
        "partition_key_format_string": "{category}-{url}",
    }
    item = {
        "category": "SomeCategory",
        "url": "http://google.com",
        "dict_test": {"corn": "husk"},
        "list_test": [1, 2, "corn"],
        "num": 4.02,
    }
    manager = AWSManager()
    dynamostorage = DynamoDBStorage(manager, config)
    dynamo_item = dynamostorage.dynamo_item(Item(item_type="", payload=item))
    print(dynamo_item)

    self.assertEqual(
        dynamo_item['my_hash_key']['S'],
        "%s-%s" % (item['category'], item['url']))

    for key, value in item.items():
        if isinstance(value, str):
            self.assertEqual(dynamo_item[key]['S'], value)
        elif isinstance(value, dict):
            decoded = json.loads(dynamo_item[key]['S'])
            # Distinct inner names: the outer loop variable must not be
            # rebound while checking the nested entries.
            for inner_key, inner_value in value.items():
                self.assertEqual(decoded[inner_key], inner_value)
        elif isinstance(value, list):
            decoded = json.loads(dynamo_item[key]['S'])
            for index, element in enumerate(value):
                self.assertEqual(decoded[index], element)
        else:
            self.assertEqual(dynamo_item[key]['N'], str(value))
def test_defaults(self):
    """A new StaticFileSource exposes the ``item_type`` from its ``_defaults``."""
    source = StaticFileSource(
        AWSManager(),
        {
            'source_url': 'https://www.gutenberg.org/files/54386/54386-0.txt',
            's3_bucket_name': 'antennatest42',
            'destination_key': 'gutenberg.txt',
        },
    )
    self.assertEqual(source._defaults['item_type'], source.item_type)
def test_local_file_source(self):
    """StaticFileSource reports new data once, yields one item, then is idle.

    The destination object is deleted from the bucket first so the source
    starts without a previously stored copy.
    """
    bucket_name = 'antennatest42'
    object_key = 'gutenberg.txt'
    config = {
        'source_url': 'https://www.gutenberg.org/files/54386/54386-0.txt',
        's3_bucket_name': bucket_name,
        'destination_key': object_key,
    }
    manager = AWSManager()

    # Remove any existing object before exercising the source.
    s3 = manager.get_client('s3')
    s3.delete_object(Bucket=bucket_name, Key=object_key)

    source = StaticFileSource(manager, config)
    self.assertTrue(source.has_new_data())

    produced = list(source.yield_items())
    self.assertEqual(1, len(produced))
    self.assertFalse(source.has_new_data())
def test_newspaper_lib(self):
    """Smoke test: iterate a NewspaperLibSource and print each item's URL.

    No assertions are made; the test only fails if construction or
    iteration raises.
    """
    # http://spectrum.ieee.org/blog/nanoclast
    # (URL kept from the original comment; presumably another site to try)
    settings = {
        'url': 'http://futurism.com',
        'output_item_type': 'ScrapedArticle',
    }
    source = NewspaperLibSource(AWSManager(), settings)
    for article in source.yield_items():
        print(article.payload['url'])
def test_invalid_config(self):
    """Constructing a StaticFileSource from an empty config must raise.

    ``assertRaises`` is used instead of a try/except block: an assertion
    placed inside ``try`` raises ``AssertionError``, which a following
    ``except Exception: pass`` would swallow, so the test could never fail.
    """
    config = {}
    manager = AWSManager()
    with self.assertRaises(
            Exception,
            msg="Source should have thrown exception given empty config"):
        StaticFileSource(manager, config)
def test_unique_dynamodb_filter(self):
    """``format_key`` renders the item as ``<category>-<url>``."""
    item = {"category": "SomeCategory", "url": "http://google.com"}
    config = {
        "dynamodb_table_name": "test_table",
        "primary_key": "my_hash_key",
        "primary_key_format_string": "{category}-{url}",
    }
    ufilter = UniqueDynamoDBFilter(AWSManager(), config)
    actual_key = ufilter.format_key(item)
    expected_key = "%s-%s" % (item['category'], item['url'])
    self.assertEqual(actual_key, expected_key)
def test_rss_feed_source(self):
    """RSSFeedSource yields more than three items with populated fields.

    Each yielded item's payload must carry a ``url``, ``content`` and
    ``title`` longer than 10 characters and a ``source_url`` longer than 4.
    ``assertGreater`` is used so that a failure reports the actual length
    instead of "False is not true".
    """
    config = {"rss_feed_url": "https://qz.com/feed/"}
    manager = AWSManager()
    source = RSSFeedSource(manager, config)
    self.assertTrue(source.has_new_data())

    items = list(source.yield_items())
    self.assertGreater(len(items), 3)

    for item in items:
        payload = item.payload
        self.assertGreater(len(payload['url']), 10)
        self.assertGreater(len(payload['content']), 10)
        self.assertGreater(len(payload['source_url']), 4)
        self.assertGreater(len(payload['title']), 10)
def test_dynamodb_storage(self):
    """Check how DynamoDBStorage.dynamo_item encodes the fixture item.

    Asserts that the ``my_hash_key`` attribute is ``<category>-<url>``, that
    string values are stored under ``'S'`` unchanged, and that every other
    value is stored under ``'N'`` as ``str(value)``.

    NOTE(review): ``self.config`` and ``self.item`` are not defined in this
    method; presumably they come from the test case's setUp. Confirm.
    """
    manager = AWSManager()
    dynamostorage = DynamoDBStorage(manager, self.config)
    dynamo_item = dynamostorage.dynamo_item(self.item)
    print(dynamo_item)

    # Read from self.item throughout: a bare ``item`` is not defined in
    # this method.
    self.assertEqual(
        dynamo_item['my_hash_key']['S'],
        "%s-%s" % (self.item['category'], self.item['url']))

    for key in self.item:
        value = self.item[key]
        if isinstance(value, str):
            self.assertEqual(dynamo_item[key]['S'], value)
        else:
            self.assertEqual(dynamo_item[key]['N'], str(value))
def test_external_resources(self):
    """DynamoDBStorage built from ``self.config`` reports one external resource."""
    storage = DynamoDBStorage(AWSManager(), self.config)
    self.assertEqual(len(storage.external_resources()), 1)