def scrape_images(url: str):
    """
    Scrape a webpage for images and create the corresponding PageImage and Image objects.

    Args:
        url (str): URL of the page to be scraped
    """
    images_tuple = get_all_images_data(url)
    page = PageImage(url=url)
    page.save_to_db()
    for image_url, img_data in images_tuple:
        img = Image(img_url=image_url, page_id=page.id, data=img_data)
        img.save_to_db()
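The get_all_images_data helper used above is not shown in this section. A minimal sketch of such a helper, assuming requests and beautifulsoup4 and a return value of (image_url, image_bytes) tuples as scrape_images expects, could look like this (names and details are assumptions, not the original code):

# Hypothetical sketch of the helper consumed by scrape_images; the real
# implementation may differ. Assumes requests and beautifulsoup4.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_all_images_data(url: str):
    """Return a list of (image_url, image_bytes) tuples found on the page."""
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    results = []
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src")
        if not src:
            continue
        image_url = urljoin(url, src)  # resolve relative image paths
        img_response = requests.get(image_url)
        if img_response.ok:
            results.append((image_url, img_response.content))
    return results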
def test_save_to_db(self):
    """
    Test auxiliary method for saving objects in DB
    """
    page = PageImage(
        url="http://example.com",
        created_at=datetime.now(),
    )
    self.assertIsNone(page.id)
    self.assertEqual(PageImage.query.all(), [])

    page.save_to_db()

    # Objects receive an ID after saving in DB
    self.assertIsNotNone(page.id)
    self.assertEqual(PageImage.query.all()[0], page)
def get(self):
    """
    Get the PageImage for an already scraped page.

    Returns:
        dict: JSON representation of the newest PageImage for the given URL
    """
    data = PageImageView.parser.parse_args()
    return PageImage.find_url_or_404(data["url"]).to_json()
def test_get_all_images_list_view(self):
    """
    Test get method for ImageListView
    Should return all created PageImage objects
    """
    page = PageImage(
        url="http://example.com",
        created_at=datetime.now(),
    )
    page.save_to_db()
    page2 = PageImage(
        url="http://other.com",
        created_at=datetime.now(),
    )
    page2.save_to_db()

    response = self.client.get("/images")

    self.assertEqual(len(response.json), 2)
    self.assertEqual(response.json, [page.to_json(), page2.to_json()])
def test_find_url_or_404(self):
    """
    Test that the classmethod returns only the newer object
    """
    with freeze_time("2021-03-24 14:15:16"):
        page = PageImage(
            url="http://example.com",
            created_at=datetime.now(),
        )
        self.db.session.add(page)
        self.db.session.commit()
    with freeze_time("2021-03-25 17:15:16"):
        page2 = PageImage(
            url="http://example.com",
            created_at=datetime.now(),
        )
        self.db.session.add(page2)
        self.db.session.commit()

    self.assertEqual(PageImage.find_url_or_404(url="http://example.com"), page2)
def test_get_specific_image_view(self):
    """
    Test get method for ImageView
    Should return response for given url only
    """
    page = PageImage(
        url="http://example.com",
        created_at=datetime.now(),
    )
    page.save_to_db()
    page2 = PageImage(
        url="http://other.com",
        created_at=datetime.now(),
    )
    page2.save_to_db()

    response = self.client.get("/image", data={"url": "http://other.com"})

    self.assertEqual(response.json, page2.to_json())
def test_to_json_method(self):
    """
    Test auxiliary method for returning JSON object
    """
    with freeze_time("2021-03-25 14:15:16"):
        page = PageImage(
            url="http://example.com",
            created_at=datetime.now(),
        )
        self.db.session.add(page)
        self.db.session.commit()
    img1 = Image(img_url="http://example.com/img_1.png", page_id=page.id, data=b"123")
    self.db.session.add(img1)
    self.db.session.commit()

    expected_response = {
        "id": page.id,
        "url": "http://example.com",
        "images": ["<Image url: http://example.com/img_1.png>"],
        "created_at": "25/03/21 14:15:16",  # expected date format is "%d/%m/%y %H:%M:%S"
    }
    self.assertEqual(page.to_json(), expected_response)
def get(self):
    """
    Get all images for an already scraped URL as a zip archive.

    Returns:
        Response: zip file attachment containing the page's image data
    """
    data = PageImageView.parser.parse_args()
    page = PageImage.find_url_or_404(data["url"])
    images_data = [img.data for img in page.images]
    if images_data:
        zf = archive_bytes_stream(images_data)
        return send_file(
            BytesIO(zf),
            attachment_filename=f"{page.url}_images.zip",
            as_attachment=True,
        )
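The archive_bytes_stream helper is only referenced above. A minimal sketch, assuming it takes a list of image byte strings and returns the raw bytes of an in-memory zip archive (as the BytesIO(zf) call suggests), might be:

# Hypothetical sketch of archive_bytes_stream; member names inside the archive
# are an assumption, since the original helper is not shown here.
import zipfile
from io import BytesIO


def archive_bytes_stream(images_data: list) -> bytes:
    """Pack a list of byte strings into an in-memory zip and return its bytes."""
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        for index, data in enumerate(images_data):
            archive.writestr(f"image_{index}", data)  # arbitrary member names
    return buffer.getvalue()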
def test_find_url_or_404_no_object(self):
    """
    Test classmethod should raise NotFound exception
    if object with given URL does not exist
    """
    with self.assertRaises(NotFound):
        PageImage.find_url_or_404(url="http://example.com")
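The tests above exercise save_to_db, find_url_or_404 and to_json on the PageImage model, which is not shown in this section. A minimal sketch consistent with those tests, assuming Flask-SQLAlchemy and an images relationship to Image whose __repr__ matches the expected strings, could look like this (columns and setup are assumptions, not the original code):

# Hypothetical sketch of the PageImage methods exercised by the tests above.
from datetime import datetime

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()  # the real project presumably has its own db instance


class PageImage(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    url = db.Column(db.String(2048), nullable=False)
    created_at = db.Column(db.DateTime, default=datetime.now)
    images = db.relationship("Image", backref="page", lazy=True)

    def save_to_db(self):
        """Persist the object; the id is populated by the commit."""
        db.session.add(self)
        db.session.commit()

    @classmethod
    def find_url_or_404(cls, url):
        """Return the newest PageImage for the URL, or raise NotFound (404)."""
        return (
            cls.query.filter_by(url=url)
            .order_by(cls.created_at.desc())
            .first_or_404()
        )

    def to_json(self):
        """JSON-serializable dict; dates use the "%d/%m/%y %H:%M:%S" format."""
        return {
            "id": self.id,
            "url": self.url,
            "images": [repr(img) for img in self.images],
            "created_at": self.created_at.strftime("%d/%m/%y %H:%M:%S"),
        }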