def import_feed_downloads(db: DatabaseHandler, csv_file: str) -> None:
    """Import feed downloads from a CSV file into the "downloads" table.

    Each CSV row becomes one "downloads" row; the optional
    '_raw_download_content' column carries the raw body, which is stored via
    the download's handler as if it had just been fetched.

    :param db: database handler; the whole import runs in one transaction.
    :param csv_file: path to the CSV file to import (dialect is sniffed).
    """
    log.info(f"Importing downloads from {csv_file}...")

    db.begin()

    with open(csv_file, mode='r', encoding='utf-8') as f:
        # Guess dialect
        sample = f.read(1024)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        f.seek(0)

        input_csv = csv.DictReader(f, dialect=dialect)

        n = 1
        for download in input_csv:
            log.info(f"Importing download {n}...")
            n += 1

            raw_download_content = download.get('_raw_download_content', None)
            if raw_download_content:
                # BUG FIX: remove the key from the row dict itself, not from
                # its value (the original deleted from raw_download_content).
                del download['_raw_download_content']

            # Cast some columns.
            # BUG FIX: the original subscripted the bound method
            # (download.get['feeds_id']), a TypeError at runtime.
            # Also treat empty CSV cells as absent so int('') can't raise.
            download['feeds_id'] = int(download['feeds_id']) if download.get('feeds_id') else None  # NULL
            download['stories_id'] = int(download['stories_id']) if download.get('stories_id') else None  # NULL
            download['parent'] = int(download['parent']) if download.get('parent') else None  # NULL
            download['priority'] = int(download['priority']) if download.get('priority') else 0  # NOT NULL
            download['sequence'] = int(download['sequence']) if download.get('sequence') else 0  # NOT NULL
            # BUG FIX: this boolean cast belongs to the 'extracted' column;
            # the original overwrote the 'sequence' value set just above.
            download['extracted'] = 't' if download.get('extracted', False) else 'f'

            # Will be rewritten by handle_download()
            download['path'] = ''

            download = db.create(table='downloads', insert_hash=download)

            # Create mock response to import it
            response = FakeResponse(content=raw_download_content)
            handler = handler_for_download(db=db, download=download)
            handler.store_response(db=db, download=download, response=response)

    log.info("Committing...")
    db.commit()

    log.info(f"Done importing downloads from {csv_file}")
def test_fetch_handle_download(self):
    """Fetch + store a Univision feed download and verify it succeeds."""
    creds = self.univision_credentials()

    # Test medium / feed pointing at the Univision credentials URL
    medium_row = {
        'name': f"Media for test feed {creds.url}",
        'url': 'http://www.univision.com/',
    }
    medium = self.db.create(table='media', insert_hash=medium_row)

    feed_row = {
        'name': 'feed',
        'type': 'univision',
        'url': creds.url,
        'media_id': medium['media_id'],
    }
    feed = self.db.create(table='feeds', insert_hash=feed_row)

    feed_download = create_download_for_feed(db=self.db, feed=feed)

    handler = handler_for_download(db=self.db, download=feed_download)
    assert isinstance(handler, DownloadFeedUnivisionHandler)

    # Recreate handler with mock configuration
    handler = DownloadFeedUnivisionHandler(crawler_config=self._mock_crawler_config())

    response = handler.fetch_download(db=self.db, download=feed_download)
    assert response

    handler.store_response(db=self.db, download=feed_download, response=response)

    # Re-read the download to see the state written by store_response()
    feed_download = self.db.find_by_id(table='downloads', object_id=feed_download['downloads_id'])
    assert feed_download
    assert feed_download['state'] == 'success', f"Download's state is not 'success': {feed_download['state']}"
    assert not feed_download['error_message'], f"Download's error_message should be empty: {feed_download['error_message']}"

    if self.expect_to_find_some_stories():
        story_downloads = self.db.query(
            """
            SELECT *
            FROM downloads
            WHERE feeds_id = %(feeds_id)s
              AND type = 'content'
              AND state = 'pending'
            """,
            {
                'feeds_id': feed_download['feeds_id'],
            },
        ).hashes()
        assert story_downloads, 'One or more story downloads were derived from feed'
def test_invalid_feed_url():
    """Try fetching a funky URL."""
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    download = db.create(table='downloads', insert_hash={
        'url': 'file:///etc/passwd',
        'host': 'localhost',
        'type': 'feed',
        'state': 'pending',
        'priority': 0,
        'sequence': 1,
        'feeds_id': test_feed['feeds_id'],
    })

    handler = handler_for_download(db=db, download=download)

    # Invalid URL should be a soft exception.
    # BUG FIX: pytest.raises() no longer accepts a "message" kwarg (removed in
    # pytest 4.0); passing it raised TypeError instead of exercising the test.
    with pytest.raises(McCrawlerFetcherSoftError):
        handler.fetch_download(db=db, download=download)
def _fetch_and_handle_response(self, path: str, downloads_id: Optional[int] = None) -> Dict[str, Any]:
    """Call the fetcher and handler on the given URL. Return the download passed to the fetcher and handler."""
    if downloads_id:
        # Reuse an existing download row
        row = self.db.find_by_id(table='downloads', object_id=downloads_id)
    else:
        # Create a fresh 'feed' download pointing at the local test server
        insert = {
            'url': f"http://localhost:{self.port}{path}",
            'host': 'localhost',
            'type': 'feed',
            'state': 'pending',
            'priority': 0,
            'sequence': 1,
            'feeds_id': self.feed['feeds_id'],
        }
        row = self.db.create(table='downloads', insert_hash=insert)
        downloads_id = row['downloads_id']

    handler = handler_for_download(db=self.db, download=row)

    response = handler.fetch_download(db=self.db, download=row)
    assert response

    handler.store_response(db=self.db, download=row, response=response)

    # Re-read so the caller sees the state written by store_response()
    return self.db.find_by_id(table='downloads', object_id=downloads_id)