def run(argv):
    """Core function that ties all other components together.

    This includes parsing the command line, reading "config.ini",
    parsing the input file and downloading all files.
    Called by ``run.py`` in the base directory.
    """
    # Parse arguments
    arg_parser = CLI()
    input_path, verbose = arg_parser.parse(argv[1:])  # argv[0] = script name

    # Read config file
    output_path = read_config()

    # Create the input parser
    parser = InputParser(input_path, output_path, verbose)

    # Create the downloader
    downloader = Downloader(output_path, verbose)

    # Download every (url, filename) pair produced by the parser
    for url, filename in parser.get_url_targetname_pairs():
        downloader.download(url, filename)
class TestDownloader(unittest.TestCase):

    def setUp(self):
        self.d = Downloader()

    def test_download_contents_file(self):
        download_mock_fn = Mock()
        self.d._download = download_mock_fn
        res = self.d.download_contents_file('amd-foo')
        expected = 'Contents-amd-foo'
        expected_ext = expected + '.gz'
        self.assertEqual(res, expected)
        self.d._download.assert_called_with(expected_ext, expected)

    @patch('urllib.request.urlopen')
    @patch('gzip.decompress')
    @patch("__main__.open", new_callable=mock_open, read_data="data")
    def test_download(self, mock_open_handler, mock_gzip, mock_urlopen):
        # patch decorators are applied bottom-up, so the innermost patch
        # ("__main__.open") is passed as the first mock argument.
        a = Mock()
        a.read.return_value = bytes('foo', 'utf-8')
        mock_urlopen.return_value = a
        mock_gzip.return_value = bytes('foo', 'utf-8')
        fpath = '/tmp/baz'
        fname = fpath.split('/')[-1]
        self.d._download(fname, fpath)
        mock_open_handler.assert_called_once()
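# A minimal sketch, not taken from the project, of the Downloader interface the
# tests above assume: download_contents_file() builds the "Contents-<arch>" name
# and delegates to _download(), which fetches a gzipped file, decompresses it and
# writes it to disk (hence the patched urlopen, gzip.decompress and open).
# BASE_URL and the exact signatures are assumptions, not the real code.
import gzip
import urllib.request

BASE_URL = 'http://example.org/dists/stable/main/'  # hypothetical placeholder


class Downloader:

    def download_contents_file(self, arch_type):
        # Matches test_download_contents_file: returns the uncompressed name.
        outfile = 'Contents-' + arch_type
        self._download(outfile + '.gz', outfile)
        return outfile

    def _download(self, fname, fpath):
        # Fetch the gzipped file, unpack it and write the result to fpath.
        response = urllib.request.urlopen(BASE_URL + fname)
        data = gzip.decompress(response.read())
        with open(fpath, 'wb') as f:
            f.write(data)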
async def main() -> None:
    new_e3 = NewE3()
    old_e3 = OldE3()

    with open("config.json", "r") as f:
        config = json.loads(f.read())

    username = config.get("studentId", "")
    old_e3_pwd = config.get("oldE3Password", "")
    new_e3_pwd = config.get("newE3Password", "")
    download_path = config.get("downloadPath", "e3")
    gdrive_enable = config.get("gdrive_enable", True)
    download_path = os.path.expanduser(download_path)

    if gdrive_enable:
        store = oauth_file.Storage("token.json")
        creds = store.get()
        if not creds or creds.invalid:
            flow = client.flow_from_clientsecrets("credentials.json", SCOPES)
            creds = tools.run_flow(flow, store)

    while True:
        if username == "":
            username = input("StudentID: ")
        if old_e3_pwd == "":
            old_e3_pwd = getpass("Old E3 Password: ")
        # The credential check was masked in the source; a login call is assumed.
        if await old_e3.login(username, old_e3_pwd):
            break
        username, old_e3_pwd = "", ""
        print("ID or Old E3 Password Error")

    while True:
        if new_e3_pwd == "":
            new_e3_pwd = getpass("New E3 Password: ")
        # The credential check was masked in the source; a login call is assumed.
        if await new_e3.login(username, new_e3_pwd):
            break
        new_e3_pwd = ""
        print("New E3 Password Error")

    downloader = Downloader(download_path)

    async with stream.merge(new_e3.all_files(), old_e3.all_files()).stream() as files:
        async for file in files:
            downloader.add_file(file)

    modified_files = await downloader.done()

    if gdrive_enable:
        gdrive_client = GDrive(download_path)
        await gdrive_client.upload()

    print("")
    if modified_files:
        print("The below files are added or modified")
        modified_files.sort(key=lambda x: x.course_name)
        for modified_file in modified_files:
            print(f"{modified_file.course_name} - {modified_file.name}")
    else:
        print("No files are added or modified")
def test_nonexisting_download_valid_outdir(self):
    """Invalid download link; valid output directory."""
    target_file = os.path.join(self.valid_out_dir, '1.file')
    downloader = Downloader(self.valid_out_dir)
    with self.assertRaises(DownloaderDownloadError):
        downloader.download(NON_EXISTING_DOWNLOAD_URL, target_file)
def test_existing_download_valid_outdir(self):
    """Valid download link; valid output directory."""
    target_file = os.path.join(self.valid_out_dir, '1.file')
    downloader = Downloader(self.valid_out_dir)
    downloader.download(EXISTING_DOWNLOADL_URL, target_file)
    self.assertEqual(get_size(target_file), 35832)
def test_nonexisting_download_invalid_outdir(self):
    """Invalid download link; invalid output directory.

    The directory is invalid / non-existing with high probability.
    """
    target_file = os.path.join(self.valid_out_dir + '1', '1.file')
    downloader = Downloader(self.valid_out_dir)
    with self.assertRaises(DownloaderDownloadError):
        downloader.download(NON_EXISTING_DOWNLOAD_URL, target_file)
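# A minimal sketch, not taken from the project, of the fixture the three tests
# above depend on: self.valid_out_dir, NON_EXISTING_DOWNLOAD_URL,
# EXISTING_DOWNLOADL_URL and get_size() are referenced but not shown.
# The URLs below are placeholders; the real project presumably points at a
# stable file of exactly 35832 bytes.
import os
import tempfile
import unittest

NON_EXISTING_DOWNLOAD_URL = 'http://example.com/no/such/file.bin'  # placeholder
EXISTING_DOWNLOADL_URL = 'http://example.com/existing/file.bin'    # placeholder


def get_size(path):
    # Assumed helper: size of the downloaded file in bytes.
    return os.path.getsize(path)


class DownloaderTestBase(unittest.TestCase):

    def setUp(self):
        # A fresh temporary directory serves as the valid output directory.
        self.valid_out_dir = tempfile.mkdtemp()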
def run(self, parser=2, downloader=2):
    self._logger.info('Starting image download job')
    start = time.time()

    # Manager for multiprocessing
    with Manager() as manager:
        # List of worker processes
        processes = []

        # Shared-memory variables
        content_list = manager.list()
        image_list = manager.list()
        count = manager.Value('i', 0)
        lock = manager.Lock()
        feeder_running = manager.Value('i', 1)
        parser_running = manager.Value('i', 1)

        parser_logger = Logger('cybot_parser.log')
        downloader_logger = Logger('cybot_downloader.log')

        main_cookies = self._driver.get_cookies()
        cookie = []
        for c in main_cookies:
            cookie.append({'name': c['name'], 'value': c['value']})

        # Create and start the parser processes
        for idx in range(parser):
            parser_instance = Parser(self._chromedriver, cookie, parser_logger, self._delay)
            parser_process = Process(
                target=parser_instance.parse,
                args=(content_list, image_list, feeder_running, parser_running))
            parser_process.name = 'Parser::' + str(idx)
            parser_process.start()
            processes.append(parser_process)
            self._logger.info('Parser {} process started'.format(idx))

        # Create and start the downloader processes
        for idx in range(downloader):
            downloader_instance = Downloader(downloader_logger)
            downloader_process = Process(
                target=downloader_instance.downloader,
                args=(image_list, count, lock, parser_running))
            downloader_process.name = 'Downloader::' + str(idx)
            downloader_process.start()
            processes.append(downloader_process)
            self._logger.info('Downloader {} process started'.format(idx))

        # Start the feeder in the main process
        self._logger.info('Starting feeder')
        self.feeder(content_list, feeder_running)

        # Wait for the parser and downloader processes to finish
        for p in processes:
            p.join()

        self._logger.info('Elapsed time: {} seconds'.format(round(time.time() - start, 2)))
        self._logger.info('Total number of images: {}'.format(count.value))
class App:

    __FILE_FORMAT = '.mp4'
    __TIMEOUT = config.BLOCKED_TIMEOUT

    def __init__(self, anime_url: str, download_path: str):
        self.__scraper = Scraper(anime_url)
        self.__downloader = Downloader(download_path)

    def download(self, episode: str) -> bool:
        while True:
            try:
                LOGGER.info(f'downloading episode {episode}')
                # acquire list of downloadable video urls
                videos = self.__scraper.get(episode)
                break
            except RequestBlocked:
                LOGGER.error(
                    f'request blocked by anime heaven for episode {episode}, '
                    f'going to try again in {self.__TIMEOUT} seconds')
                time.sleep(self.__TIMEOUT)

        if not videos:
            LOGGER.error(f'url not found for episode {episode}')
            return False

        filename = self.__get_filename(episode)
        # NOTE: use first download url only
        todownload = videos[0]
        self.__downloader.download(filename, todownload)
        LOGGER.info(f'downloaded episode {episode}')
        return True

    def get_downloads(self) -> dict:
        return self.__downloader.get_downloads()

    def __get_filename(self, episode: str) -> str:
        return f'Episode-{episode}{self.__FILE_FORMAT}'
def get_pkg_stats(arch_type):
    # download contents file
    outfile_path = Downloader().download_contents_file(arch_type)

    # parse contents file and print top 10 files
    Parser().parse_contents_file(outfile_path)
class Tagesschau(object):

    def __init__(self):
        self._log = logging.getLogger("src.tagesschau")
        self.latest_download_file_handler = LatestDownloadFileHandler(config.LATEST_DOWNLOAD_FILE_PATH)
        self.downloader = Downloader()

    def run(self):
        self.create_download_path_if_not_exists()
        self._log.debug("Fetch latest downloaded show and latest download date")
        self.fetch_latest_downloaded_show_and_date()
        self._log.debug("latest_show: %s - %s" % (self.latest_downloaded_show, str(self.latest_date)))

        while self.latest_date <= date.today():
            if self.latest_downloaded_show == "tt":
                self.latest_date += timedelta(days=1)
            self._log.info("Fetch html from website for " + str(self.latest_date))
            url = VIDEO_ARCHIV_URL % self.latest_date.strftime("%Y%m%d")
            html = self.downloader.fetch_html(url)
            self._log.debug("Fetch the main content from html")
            main_content = self.fetch_main_content_from_html(html)
            self.download_shows_for_one_day(main_content)

    def create_download_path_if_not_exists(self):
        if not os.path.exists(config.DOWNLOAD_DIR):
            os.mkdir(config.DOWNLOAD_DIR)

    def fetch_latest_downloaded_show_and_date(self):
        file_content = self.latest_download_file_handler.fetch_content()
        self.latest_downloaded_show, latest_date = file_content.split("_")
        self.latest_date = datetime.strptime(latest_date, "%Y%m%d").date()

    def fetch_main_content_from_html(self, html):
        content = re.search("<!-- START -->(.+?)<!-- section -->", html, re.S)
        if not content:
            # Raising a bare string is invalid in Python 3; raise a proper exception.
            raise ValueError("Pattern did not match.")
        return content.group(1)

    def download_shows_for_one_day(self, html):
        if self.latest_downloaded_show == "tt":
            self.download_tagesschau(html)
        if self.latest_downloaded_show == "ts":
            self.download_tagesthemen(html)

    def download_tagesschau(self, html):
        self._log.info("Download tagesschau published on " + str(self.latest_date))
        url_match = re.search(PATTERN_TAGESSCHAU_URL, html, re.S)
        if not url_match:
            raise ValueError("Pattern not found. Tagesschau may not be published yet.")
        self.download_show(url_match.group(1))
        self.latest_downloaded_show = "ts"
        self.update_latest_downloaded_show_file()

    def download_tagesthemen(self, html):
        self._log.info("Download tagesthemen published on " + str(self.latest_date))
        url_match = re.search(PATTERN_TAGESTHEMEN_URL, html)
        if not url_match:
            raise ValueError("Pattern not found. Tagesthemen may not be published yet.")
        self.download_show(url_match.group(1))
        self.latest_downloaded_show = "tt"
        self.update_latest_downloaded_show_file()

    def download_show(self, url):
        url = urllib.parse.urljoin(TAGESSCHAU_URL, url)
        filename = self.create_filename(url)
        self.downloader.download_show(url, filename)

    def create_filename(self, url):
        match = re.search(r"/((t[st])-\d+?)\.html", url)
        page_id, show = match.groups()
        self._log.debug("page_id: " + page_id)
        self._log.debug("show: " + show)
        filename = "%s_%s.mp4" % (self.latest_date.strftime("%Y_%m_%d"), show)
        return os.path.join(config.DOWNLOAD_DIR, filename)

    def update_latest_downloaded_show_file(self):
        latest_date = self.latest_date.strftime("%Y%m%d")
        new_content = "%s_%s" % (self.latest_downloaded_show, latest_date)
        self.latest_download_file_handler.update(new_content)
logging.basicConfig(level=os.getenv('LOG_LEVEL', logging.INFO))
logger = logging.getLogger(__name__)

config = Config(logger,
                os.getenv('BOT_TOKEN', Helper.list_get(sys.argv, 1, None)),
                os.getenv('BOT_ADMIN', Helper.list_get(sys.argv, 2, None)),
                os.getenv('BOT_DESTINATION', Helper.list_get(sys.argv, 3, None)),
                os.getenv('BOT_PERSISTENCE', Helper.list_get(sys.argv, 6, True)),
                os.getenv('BOT_QUALITY', Helper.list_get(sys.argv, 7, '320')),
                mpd_host=os.getenv('BOT_MPD_HOST', Helper.list_get(sys.argv, 4, False)),
                mpd_port=os.getenv('BOT_MPD_PORT', Helper.list_get(sys.argv, 5, False)))

downloader = Downloader(config)
updates = {}

button_list = [
    telegram.InlineKeyboardButton("▶ Play", callback_data='mpd_play'),
    telegram.InlineKeyboardButton("⏸ Pause", callback_data='mpd_pause'),
    telegram.InlineKeyboardButton("🗃 All Playlists", callback_data='mpd_lists'),
    telegram.InlineKeyboardButton("🔊 Up +5", callback_data='mpd_up'),
    telegram.InlineKeyboardButton("🔇 Mute", callback_data='mpd_mute'),
    telegram.InlineKeyboardButton("🔉 Down -5", callback_data='mpd_down')
]
reply_markup = telegram.InlineKeyboardMarkup(
    Helper.build_menu(button_list, n_cols=3))
def __init__(self, anime_url: str, download_path: str):
    self.__scraper = Scraper(anime_url)
    self.__downloader = Downloader(download_path)
def get_dates_in_range(start_date: date, end_date: date) -> list:
    date_list = []
    delta = end_date - start_date
    for i in range(delta.days + 1):
        date_list.append(start_date + timedelta(days=i))
    return date_list


if __name__ == "__main__":
    work_dir = args.work_dir
    if not os.path.exists(work_dir):
        os.mkdir(work_dir)

    processing_dates = get_dates_in_range(args.start_date, args.end_date)

    if args.operation == 'download':
        hail_dir = os.path.join(work_dir, 'hail_reports')
        if not os.path.exists(hail_dir):
            os.mkdir(hail_dir)
        downloader = Downloader(processing_dates, hail_dir)
        downloader.download_hail_reports()

    if args.operation == 'preprocess':
        hail_dir = os.path.join(work_dir, 'hail_reports')
        if not os.path.exists(hail_dir):
            raise FileNotFoundError('Missing hail data')
        hail_preprocessor = HailReportPreprocessor(work_dir, hail_dir)
        hail_preprocessor.wrangle_data()
def setUp(self):
    self.d = Downloader()
def main(argv):
    # Setup logging
    duallog.setup(Path(FLAGS.data_directory) / 'logs')
    logging.set_verbosity(FLAGS.logging_verbosity)  # Must be called after duallog.setup() to function properly

    # Configure GDAL
    gdal.SetCacheMax(8 * 1000000000)

    # Create absolute paths (either use full path provided as argument or use data dir in the project folder)
    data_dir = Path(FLAGS.data_directory) if os.path.isabs(FLAGS.data_directory) else Path.cwd() / FLAGS.data_directory

    # Ensure filename on geojson file
    geojson_path = FLAGS.geojson if FLAGS.geojson.endswith('.geojson') else FLAGS.geojson + '.geojson'

    # If no order_id from a previous order is provided, then download the data requested for this order
    order_id = FLAGS.order_id
    if order_id == 'Empty':
        order_id = 'order_' + datetime.datetime.today().strftime('%Y%m%d-%H%M%S')

    logging.info("####################################")
    logging.info("# Initializing Sentinel downloader #")
    logging.info("####################################")
    logging.info("Order id: " + order_id)

    downloader = Downloader(username=FLAGS.username,
                            password=FLAGS.password,
                            satellite=FLAGS.satellite,
                            order_id=order_id,
                            directory=data_dir)

    # Load the geojson file (check whether the filename was included in the provided name)
    if 'denmark_without_bornholm' in str(geojson_path):
        # Load the default geojson (denmark_without_bornholm), which is included in the project code
        footprint = geojson_to_wkt(read_geojson(Path('data') / 'geojson' / 'denmark_without_bornholm.geojson'))
    else:
        # Load the provided geojson file from the data directory
        footprint = geojson_to_wkt(read_geojson(data_dir / 'geojson' / geojson_path))

    # Query the data (multiple footprints can be used, but it is recommended to stick to a single footprint)
    downloader.query(footprint, FLAGS.startdate, FLAGS.enddate)

    # The following code can be used if several geojson files are to be queried
    # footprint = geojson_to_wkt(read_geojson('data/geojson/bornholm.geojson'))
    # downloader.query(footprint, FLAGS.startdate, FLAGS.enddate)

    # Print the number of products and size of all products to be downloaded
    downloader.print_num_and_size_of_products()
    downloader.save_queried_products()  # Save a geojson containing all products to be downloaded
    logging.info("")

    if FLAGS.download:
        logging.info("####################")
        logging.info("# Downloading data #")
        logging.info("####################")
        downloader.download_zipfiles()
        logging.info("")

    if FLAGS.process_tiles:
        # Load products to be processed (always load from file to ensure modularity for the downloader and processor)
        queried_products_path = (data_dir / 'orders' / order_id).with_suffix('.pkl')
        products_df = pd.read_pickle(queried_products_path)

        logging.info("###################")
        logging.info("# Processing data #")
        logging.info("###################")
        processpipeliner = ProcessPipeliner(products_df=products_df, directory=data_dir)
        processpipeliner.process_products()
from src.twitter_image_getter import TwitterImageGetter
from src.downloader import Downloader
from src.uploader import Uploader

if __name__ == '__main__':
    tmp_dir = "tmp/"

    twitter_image_getter = TwitterImageGetter()
    twitter_image_getter.login()
    image_urls = twitter_image_getter.get_image_url_list()

    downloader = Downloader(tmp_dir, image_urls)
    downloader.run()

    uploader = Uploader(tmp_dir, image_urls)
    uploader.login()
    uploader.run()
def run(self) -> None:
    d = Downloader(self.novel)
    if self.mode == "singleVolume":
        logging.info(f"Downloading {self.novel.title} {self.kwargs['name']}")
        d.singleVolume(self.kwargs["vid"], self.kwargs["name"])
        logging.info("Download finished!")
    elif self.mode == "cover":
        logging.info("Downloading cover")
        d.cover()
        logging.info("Download finished!")
    elif self.mode == "allBook":
        logging.info(f"Downloading all volumes of {self.novel.title}")
        d.allBooks()
        logging.info("Download finished!")
    elif self.mode == "volumes":
        for i in self.novel.volumeList:
            for j in i["chapters"]:
                if i["chapters"].index(j) == 0:
                    logging.info(f"Downloading {self.novel.title} {i['name']}")
                    d.singleVolume(j['cid'] - 1, i["name"])
                    logging.info("Download finished!")
                elif j["name"] == "插图":  # chapter named "illustrations"
                    logging.info(f"Downloading illustrations of {self.novel.title} - {i['name']}")
                    d.pictures(j['cid'], False, i["name"])
                    logging.info("Download finished!")
def begin_download(download_link):
    d = Downloader(download_link)
    d.download()