class DownloadHandler(object):
    """Run a request through the middleware chain and the downloader, with retries.

    The handler owns one ``Downloader`` and one ``MiddlewareManager``, both
    built from the same ``conf`` object. ``conf`` must provide a logger under
    the ``'log'`` key and, for requests without their own ``retry_times``, a
    ``'default_request_kwargs'`` mapping containing ``'retry_times'``.
    """

    # Seconds to pause between two consecutive attempts of the same request.
    RETRY_DELAY_SECONDS = 3

    def __init__(self, conf):
        """Store ``conf`` and build the downloader and the middleware manager."""
        self.conf = conf
        self.log = conf.get('log')
        self.log.info('DownloadHandler load start')
        self.downloader = Downloader(conf=conf)
        self.middleware_manager = MiddlewareManager(conf)

    def write_log(self, request, response):
        """Log one tab-separated summary line for a finished attempt.

        Successful responses are logged at INFO level, everything else at
        ERROR level. Proxy host and port are appended when the request
        carries a proxy.
        """
        # http_code is formatted with %s rather than %d: the output is the same
        # for integers, but a missing/non-numeric code no longer raises
        # TypeError (which would have aborted the retry loop in download()).
        info = "status:%s\turl:%s\tdownload_type:%s\telapsed:%s\tproxy_time:%s\thttpcode:%s" % (
            response.status,
            request.url,
            request.download_type,
            response.elapsed,
            request.proxytime,
            response.http_code,
        )
        if request.proxy is not None:
            info += "\tproxy:%s:%s" % (request.proxy.host, request.proxy.port)
        if response.status == CrawlStatus.CRAWL_SUCCESS:
            self.log.info(info)
        else:
            self.log.error(info)

    def download(self, request):
        """Download ``request``, retrying until it succeeds or attempts run out.

        The number of attempts is ``request.retry_times`` or, when that is
        ``None``, the configured default. Every attempt works on a fresh deep
        copy of the request. Any exception is logged with its traceback and
        swallowed; the last response obtained so far is returned (a response
        with status ``CRAWL_FAILT`` if no attempt produced one).

        :param request: request object to download.
        :return: the response of the last attempt.
        """
        self.log.info("start_crawl\turl::%s\tmethod:%s\tdownload_type:%s" %
                      (request.url, request.method, request.download_type))
        start = time.time()
        response = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT, )
        try:
            if request.retry_times is None:
                retry_times = self.conf.get('default_request_kwargs')['retry_times']
            else:
                retry_times = request.retry_times
            for attempt in range(retry_times):
                request = copy.deepcopy(request)
                self.middleware_manager.process_request(request)
                start_time = time.time()
                res = self.downloader.download(request)
                response = self.middleware_manager.process_response(request, res)
                # Wall-clock time of the attempt minus the time reported by the
                # response itself (presumably both in milliseconds -- confirm).
                request.proxytime = (time.time() - start_time) * 1000.0 - response.elapsed
                self.write_log(request, response)
                if response.status == CrawlStatus.CRAWL_SUCCESS:
                    break
                # Only wait when another attempt will follow.
                if attempt < retry_times - 1:
                    time.sleep(self.RETRY_DELAY_SECONDS)
        except Exception:
            self.log.error('url:' + request.url + '\terror_msg:' + str(traceback.format_exc()))
        finally:
            content_len = -1
            if response.content:
                content_len = len(response.content)
            self.log.info('finish_crawl\tuse_time:' + str(time.time() - start) +
                          '\tlens:' + str(content_len) +
                          '\tstatus:' + str(response.status) +
                          '\turl:' + str(request.url))
        return response

    def stop(self):
        """Stop the underlying downloader."""
        self.downloader.stop()
class BoxCoxRoxPipeline:
    """Runs the processing stages one after another against one database file.

    All stage objects are class attributes: they are created once, when the
    class body executes, and are shared by every instance.
    """

    # Stage objects, in the order they are used by run().
    dd = Downloader()
    dfti = DFTI()
    sentiment = Sentiment()
    lda = GridSearchLDA()
    dataset_builder = DatasetBuilder()
    ranking = Rankings()
    validator = LinkValidator()

    # Paths handed to the stages.
    database_location = "pets.db"
    categories_file = "category.csv"

    def run(self, download_data=True):
        """Execute every stage in order, printing a banner before each one.

        :param download_data: when false, the download stage is skipped (its
            banner is still printed).
        """
        print("Downloader...")
        if download_data:
            self.dd.run(self.database_location)

        # Each entry pairs the banner with a thunk, so attributes are looked up
        # only at the moment the stage actually runs.
        remaining_stages = (
            ("DFTI...", lambda: self.dfti.run(self.database_location)),
            ("Sentiment...", lambda: self.sentiment.run(self.database_location)),
            ("Ranking...", lambda: self.ranking.run(self.database_location)),
            ("Validator...", lambda: self.validator.run(top_reviews=50)),
            ("LDA...", lambda: self.lda.run(self.database_location, self.categories_file)),
            ("Build Dataset...", lambda: self.dataset_builder.run(self.database_location)),
        )
        for banner, stage in remaining_stages:
            print(banner)
            stage()
def start_downloader(self):
    """
    Starts the downloader in a thread.

    If a stop was requested in the meantime, nothing is started and the
    stopping/restarting flags are simply cleared.
    """
    # Don't know how to reproduce, but in some really rare cases the downloader might start without the user
    # requesting it. These logs try to collect information that might help pinpoint what causes that.
    # Actually, it's been so long since the last time this error was observed that I don't know if it still
    # happens or if whatever caused it was fixed...
    debug = self.log.debug
    debug("stack ([1][3]):")
    for index, frame in enumerate(inspect.stack()):
        debug("[" + str(index) + "]= " + str(frame))
    for flag in ("downloader_is_running", "downloader_is_restarting", "downloader_is_stopping"):
        debug(flag + ": " + str(getattr(self, flag)))

    if self.downloader_is_stopping:
        # A stop request wins over a pending (re)start.
        self.downloader_is_stopping = False
        self.downloader_is_restarting = False
        return

    if self.downloader_is_restarting:
        self.log.info("RESTARTING DOWNLOADER THREAD")
        self.downloader_is_restarting = False
    else:
        self.log.info("STARTING DOWNLOADER THREAD")
        self.window.downloader_starting()

    self.thread = worker_thread = QtCore.QThread(self)
    self.downloader = worker = Downloader()
    worker.moveToThread(worker_thread)

    # Worker signals -> handlers on this object / the thread.
    worker.running.connect(self.downloader_started)
    worker.finish.connect(worker_thread.quit)
    worker.restart.connect(self.restart_downloader)
    worker.showMessage.connect(self.show_tray_message)
    worker.update_ui.connect(self.update_ui)

    # noinspection PyUnresolvedReferences
    worker_thread.started.connect(worker.execute_once)  # PyCharm doesn't recognize started.connect()...
    # noinspection PyUnresolvedReferences
    worker_thread.finished.connect(self.downloader_stopped)  # PyCharm doesn't recognize finished.connect()...
    worker_thread.start()
def job():
    """Run one download (and optional conversion) cycle with health-check pings.

    Pings ``Status.START`` first, then runs the downloader built from
    ``args.username``, ``args.password`` and ``args.directory``. When
    ``args.convert`` is set, a ``PDFConverter`` is run on the same directory.
    ``Status.SUCCESS`` is pinged afterwards.

    If the ``DEV_RUN`` environment variable is present, the function then
    idles forever (sleeping 0.1 s per iteration) instead of returning.

    :raises Exception: whatever the cycle raised, re-raised unchanged after
        ``Status.FAIL`` has been pinged.
    """
    try:
        HealthCheck.ping_status(Status.START)
        downloader = Downloader(args.username, args.password, args.directory)
        downloader.run()
        if args.convert:
            converter = PDFConverter(args.directory)
            converter.run()
        HealthCheck.ping_status(Status.SUCCESS)
        if "DEV_RUN" in os.environ:
            while 1:
                time.sleep(0.1)
    except Exception:
        HealthCheck.ping_status(Status.FAIL)
        # Bare re-raise keeps the original exception type, message and
        # traceback; `raise Exception` replaced them with an empty Exception.
        raise
def main():
    """Download the next episode of every anime marked as being watched.

    For each database entry flagged ``watching`` the video links of the
    current episode are scraped (retrying after ``TIMEOUT`` seconds whenever
    the request is blocked), the first link is downloaded into
    ``DOWNLOAD_PATH`` and the stored episode number is advanced by one.
    Entries for which no link is found are skipped.
    """
    # Firebase domain and credential setup
    fb.setDomain()
    fb.setCredential()

    # connect database
    database = DB()

    for anime in database.watching():
        # entries that are not currently being watched are ignored
        if not anime['watching']:
            continue

        print(f'Downloading episode {anime["episode"]} of {anime["name"]}')

        # NOTE: a new scraper is created for every anime
        scraper = Scraper(anime['url'])

        # keep asking for the video links until the request is no longer blocked
        while True:
            try:
                videos = scraper.get(anime['episode'])
            except RequestBlocked:
                time.sleep(TIMEOUT)
            else:
                break

        # nothing to download for this episode
        if not videos:
            print(f'Cannot find download link for episode {anime["episode"]} of {anime["name"]}')
            continue

        filename = f'{anime["name"]} Episode-{anime["episode"]}{FILE_FORMAT}'

        # NOTE: only the first download url is used; a new downloader is created per episode
        Downloader(DOWNLOAD_PATH).download(filename, videos[0])
        print(f'Downloaded episode {anime["episode"]} of {anime["name"]}')

        # increment episode number in firebase
        database.update(url=anime['url'], episode=anime['episode'] + 1)
def get_zhidao_content(url, method, gap, header, batch_id):
    """Fetch ``url`` through a per-batch, logged-in downloader.

    One ``Downloader`` per ``batch_id`` is created on first use, logged in and
    kept in a dict stored on the function object itself
    (``get_zhidao_content._batches``), so later calls reuse it.

    :param url: address to fetch.
    :param method: accepted but not used; the request is always issued as 'get'.
    :param gap: passed to the ``Downloader`` constructor when a new one is built.
    :param header: when truthy, applied to the downloader via ``update_header``.
    :param batch_id: key selecting the cached downloader.
    :return: whatever ``requests_with_cache`` returns.
    """
    try:
        batches = get_zhidao_content._batches
    except AttributeError:
        # First call: create the cache on the function object.
        batches = get_zhidao_content._batches = {}

    session = batches.get(batch_id)
    if session is None:
        session = Downloader(request=True, gap=gap, batch_id=batch_id)
        session.login()
        # Cached only after a successful login.
        batches[batch_id] = session

    if header:
        session.update_header(header)

    return session.requests_with_cache(
        url,
        'get',
        encode='gb18030',
        redirect_check=True,
        error_check=True,
        refresh=False,
    )
def test_downloader():
    """Download one review page of a fixed shop and log whatever comes back."""
    client = Downloader(config.HEADERS)
    shop_url = 'https://www.dianping.com/shop/90556783'
    # Positional Task arguments as in the call: the review url, an empty
    # string, and the shop url.
    review_task = Task(shop_url + '/review_all', '', shop_url)
    logger.info(client.download_task(review_task))
def __init__(self):
    """Initialise the base class, then create the downloader and the task queue."""
    super().__init__()
    headers = config.HEADERS
    self.downloader = Downloader(headers)
    self.task_queue = TaskQueue(
        config.REDIS_DB_URL,
        config.REDIS_DB_DATABASE,
    )
    # Counter starts at zero; nothing visible here updates it.
    self.count = 0
class Scheduler(threading.Thread):
    """Worker thread that pulls tasks from a queue, downloads and parses them.

    Parsing a page may yield new ``Task`` objects (pushed back onto the queue)
    and ``Document`` objects (stored through their class's ``store_item``).
    """

    def __init__(self):
        super().__init__()
        self.downloader = Downloader(config.HEADERS)
        self.task_queue = TaskQueue(config.REDIS_DB_URL, config.REDIS_DB_DATABASE)
        self.count = 0

    def append_request_task(self, task: Task):
        """Push ``task`` onto the task queue."""
        self.task_queue.push_task(task)

    def run(self) -> None:
        """Process queued tasks until the queue reports no top task.

        A failing task is retried after a growing delay (20 s, 30 s, ...).
        The retry counter is reset after every successful task; once it
        exceeds ``MAX_RETRY`` the last error is re-raised and the thread ends.
        """
        retry = 0
        while True:
            task = self.task_queue.get_top_task()
            if task is None:
                break
            try:
                self._process_task(task)
            # Only ordinary errors are retried; SystemExit / KeyboardInterrupt
            # must not be swallowed by the retry loop.
            except Exception:
                if retry > MAX_RETRY:
                    raise
                retry += 1
                logger.warning(
                    f'Failed to process task, attempt the {retry} retry.')
                time.sleep(retry * 10 + 10)
            else:
                retry = 0

    @staticmethod
    def _dump_failed_content(content) -> None:
        """Best-effort save of the page that failed to parse to ``exception.html``."""
        try:
            with open('exception.html', 'w', encoding='utf-8') as ofile:
                ofile.write(content)
        except (OSError, TypeError, ValueError) as dump_error:
            # The dump is a debugging aid only; it must never hide the parse error.
            logger.warning(f'Could not save the failing page: {dump_error!r}')

    def _process_task(self, task: Task) -> None:
        """Download ``task``, dispatch everything its parser yields, then pause.

        On success the task is dropped from the queue and the thread sleeps
        for ``config.DOWNLOAD_DELAY`` seconds. On a parse failure the
        downloaded content is saved for inspection, the error is logged and
        re-raised, and the task stays on the queue.

        :raises Exception: for a parse result that is neither a ``Task`` nor a
            ``Document``, or any error raised while parsing/storing.
        """
        content = self.downloader.download_task(task)
        try:
            parser = get_parser(task.url)
            for item in parser.parse(task, content):
                if isinstance(item, Task):
                    logger.info(f'Append new task {item}')
                    self.task_queue.push_task(item)
                elif isinstance(item, Document):
                    logger.info(f'Save the parsed item {item}')
                    item.__class__.store_item(item)
                else:
                    raise Exception(
                        f'Unsupported parse result: class={item.__class__}')
        except Exception as e:
            self._dump_failed_content(content)
            logger.error('Parse failed with error:')
            logger.exception(e)
            raise
        self.task_queue.drop_top_task(task.type_)
        # Wait the configured number of seconds before the next task
        # (fixed delay, no random jitter).
        delay = config.DOWNLOAD_DELAY
        logger.info(f'Delay for {delay} seconds.')
        time.sleep(delay)
class Main(QtCore.QObject):
    """
    Main class, instantiated when the application starts.
    Creates main window/system tray icon, and stops the user from opening more than one instance of the application.
    It also starts/stops the downloader when the user requests or during auto-start on Windows startup.
    """

    def __init__(self):
        """
        Prepares the data folders/files, enforces the single-instance rule, builds the UI and runs the Qt event loop.
        Note that this call only returns after the event loop has finished.
        """
        QtCore.QObject.__init__(self)
        # Make sure the required folders/files exist
        if not os.path.isdir(constant.DATA_PATH):
            os.makedirs(constant.DATA_PATH)
        self.log = LoggerManager().get_logger("MAIN")
        try:
            if not os.path.isfile(constant.DB_PATH):
                shutil.copyfile("dbTemplate.db", constant.DB_PATH)
        except (shutil.Error, IOError) as error:
            self.log.print_traceback(error, self.log.critical)
            sys.exit(1)
        try:
            if not os.path.isdir(constant.DEFAULT_TORRENTS_PATH):
                os.makedirs(constant.DEFAULT_TORRENTS_PATH)
        except Exception as error:
            self.log.print_traceback(error, self.log.critical)
            sys.exit(1)

        self.app = QtSingleApplication(constant.GUID, sys.argv)
        self.log.info("---STARTING APPLICATION---")
        if self.app.isRunning():
            self.log.warning(
                "---The launch of another instance of this application will be cancelled---"
            )
            self.app.sendMessage()
            sys.exit(0)
        self.app.messageReceived.connect(self.another_instance_opened)
        self.app.setQuitOnLastWindowClosed(False)

        self.window = None
        self.tray_icon = None
        self.thread = None
        self.downloader = None
        self.timer = None
        self.downloader_is_running = False
        self.downloader_is_restarting = False
        self.downloader_is_stopping = False

        try:
            self.window = WindowMain(self)
            self.tray_icon = SystemTrayIcon(self, self.window)
            self.tray_icon.show()
            show_gui = "-nogui" not in sys.argv
            if show_gui:
                if self.downloader_is_running:
                    self.window.downloader_started()
                else:
                    self.window.downloader_stopped()
                self.window.show()
            elif not self.window.is_visible():
                # Started with -nogui: go straight to downloading.
                self.log.info("STARTING DOWNLOADER")
                self.start_downloader()
            self.app.exec_()
        except Exception as unforeseenError:
            self.log.critical("UNFORESEEN ERROR")
            self.log.print_traceback(unforeseenError, self.log.critical)
            if self.tray_icon is not None:
                self.show_tray_message("Unforeseen error occurred...")
            # sys.exit instead of the interactive exit() helper, which is injected by the site module and is
            # not available in every runtime (e.g. frozen builds). Non-zero status, like the error exits above.
            sys.exit(1)

    def quit(self):
        """
        Finishes the application gracefully - at least tries to, teehee (^_^;)
        """
        if self.tray_icon is not None:
            self.tray_icon.hide()
            self.tray_icon.deleteLater()
        if self.timer is not None:
            self.timer.stop()
        if self.thread is not None and self.thread.isRunning():
            self.stop_downloader()
        #self.app.closeAllWindows()
        self.app.quit()

    def another_instance_opened(self, _):
        """
        Called when the user tries to open another instance of the application.
        Instead of allowing it, will open the current one to avoid any errors.

        :type _: QtCore.QString
        :param _: message received, see class QtSingleApplication below.
        """
        self.window.show()

    def start_downloader(self):
        """
        Starts the downloader in a thread.
        If a stop was requested in the meantime, nothing is started and the stopping/restarting flags are cleared.
        """
        # Don't know how to reproduce, but in some really rare cases the downloader might start without the user
        # requesting it. These logs try to collect information that might help pinpoint what causes that.
        # Actually, it's been so long since the last time this error was observed that I don't know if it still
        # happens or if whatever caused it was fixed...
        self.log.debug("stack ([1][3]):")
        for i, item in enumerate(inspect.stack()):
            self.log.debug("[" + str(i) + "]= " + str(item))
        self.log.debug("downloader_is_running: " + str(self.downloader_is_running))
        self.log.debug("downloader_is_restarting: " + str(self.downloader_is_restarting))
        self.log.debug("downloader_is_stopping: " + str(self.downloader_is_stopping))

        if not self.downloader_is_stopping:
            if self.downloader_is_restarting:
                self.log.info("RESTARTING DOWNLOADER THREAD")
                self.downloader_is_restarting = False
            else:
                self.log.info("STARTING DOWNLOADER THREAD")
                self.window.downloader_starting()
            self.thread = QtCore.QThread(self)
            self.downloader = Downloader()
            self.downloader.moveToThread(self.thread)

            self.downloader.running.connect(self.downloader_started)
            self.downloader.finish.connect(self.thread.quit)
            self.downloader.restart.connect(self.restart_downloader)
            self.downloader.showMessage.connect(self.show_tray_message)
            self.downloader.update_ui.connect(self.update_ui)

            # noinspection PyUnresolvedReferences
            self.thread.started.connect(self.downloader.execute_once)  # PyCharm doesn't recognize started.connect()...
            # noinspection PyUnresolvedReferences
            self.thread.finished.connect(self.downloader_stopped)  # PyCharm doesn't recognize finished.connect()...
            self.thread.start()
        else:
            self.downloader_is_stopping = False
            self.downloader_is_restarting = False

    def stop_downloader(self):
        """
        Stops the downloader (¬_¬)

        Asks the worker to stop and waits briefly; if the thread is still running its event loop is told to quit
        and it is given another short wait. Only if that also fails is the thread terminated.
        """
        self.log.info("TERMINATING DOWNLOADER THREAD")
        self.window.downloader_stopping()
        self.downloader_is_stopping = True
        self.downloader_is_restarting = False
        if self.thread.isRunning():
            self.downloader.stop_thread()
            thread_stopped_gracefully = self.thread.wait(300)
            if self.thread.isRunning():
                # QThread.quit() has no return value, so its result cannot be used as the "stopped" flag
                # (it used to be, which made the code below always terminate the thread). Wait again instead.
                self.thread.quit()
                thread_stopped_gracefully = self.thread.wait(300)
            self.log.info("THREAD STOPPED CORRECTLY: %s" % thread_stopped_gracefully)
            if not thread_stopped_gracefully:
                self.thread.terminate()
        else:
            self.downloader_stopped()
        # The timer does not exist yet when the downloader is interrupted before being able to fully execute
        # at least once.
        if self.timer is not None:
            self.timer.stop()

    def restart_downloader(self):
        """
        Finishes the current downloader thread and starts a timer.
        When the timer times out a new downloader thread is created.
        """
        self.downloader_is_restarting = True
        self.thread.quit()
        self.log.info("THREAD FINISHED CORRECTLY: %s" % self.thread.wait(300))
        self.timer = QtCore.QTimer()
        # noinspection PyUnresolvedReferences
        self.timer.timeout.connect(self.start_downloader)  # PyCharm doesn't recognize timeout.connect()...
        self.timer.setSingleShot(True)
        # sleep_time comes from the stored configuration and is multiplied by 1000 for the timer interval.
        self.timer.start(db.DBManager().get_config().sleep_time * 1000)

    @QtCore.pyqtSlot()
    def downloader_started(self):
        """
        Downloader thread started correctly; notifies the user.
        """
        self.downloader_is_running = True
        self.window.downloader_started()

    @QtCore.pyqtSlot()
    def downloader_stopped(self):
        """
        Downloader thread stopped correctly; notifies the user.
        Does nothing while a restart is pending.
        """
        if not self.downloader_is_restarting:
            self.downloader_is_running = False
            self.downloader_is_stopping = False
            self.downloader_is_restarting = False
            self.window.downloader_stopped()

    @QtCore.pyqtSlot(str)
    def show_tray_message(self, message):
        """
        Uses the system tray icon to notify the user about something.

        :type message: str
        :param message: Message to be shown to the user.
        """
        # TODO: Would it be better if this were moved to manager.system_tray_icon?
        self.tray_icon.showMessage(constant.TRAY_MESSAGE_TITLE, message, QtGui.QSystemTrayIcon.Information, 5000)

    @QtCore.pyqtSlot(str)
    def update_ui(self, message):
        """
        Updates the anime table in the main window.
        Also, shows a message to the user using the system tray icon.

        :type message: str
        :param message: Message to be shown to the user.
        """
        if self.window is not None:
            self.window.update_anime_table()
        self.show_tray_message(message)
# Script: build one Downloader and call download_all(), then
# download_by_query() with the query string 'sarcosine'.
# NOTE(review): what each call fetches is defined in downloader.downloader,
# which is not visible here -- confirm there.
from downloader.downloader import Downloader

pmc_downloader = Downloader()
pmc_downloader.download_all()
pmc_downloader.download_by_query('sarcosine')
def __init__(self, conf):
    """Keep ``conf`` and its logger, then build the downloader and middleware manager.

    :param conf: configuration object; its ``'log'`` entry is used as logger
        and the object itself is passed on to both collaborators.
    """
    logger = conf.get('log')
    self.conf = conf
    self.log = logger
    logger.info('DownloadHandler load start')
    self.downloader = Downloader(conf=conf)
    self.middleware_manager = MiddlewareManager(conf)
def initLogger():
    """Attach a dated file handler and a console handler to the module logger.

    The log directory ``dir_logs`` is created when it does not exist. The file
    handler appends to ``download.<today>.log`` inside it, using a timestamped
    format; the console handler uses a short format. Logger and both handlers
    are set to INFO level.
    """
    if not os.path.exists(dir_logs):
        os.mkdir(dir_logs)

    log_path = dir_logs + "/" + "download." + str(datetime.date.today()) + ".log"
    file_handler = logging.FileHandler(log_path, mode="a", encoding="utf8")
    console_handler = logging.StreamHandler()

    file_handler.setFormatter(logging.Formatter(
        fmt="%(asctime)s [%(levelname)s] [%(lineno)d] >> %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"))
    console_handler.setFormatter(logging.Formatter(
        fmt="[%(levelname)s] >> %(message)s"))

    logger.setLevel(logging.INFO)
    for handler in (file_handler, console_handler):
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)


if __name__ == "__main__":
    initLogger()
    # Log message: "SF Express captcha download program started".
    logger.info("----顺丰快递验证码下载程序启动----")
    downloader = Downloader()
    # Prompt: "Please enter the number of captchas to download".
    num = int(input("请输入下载的验证码数量:\n"))
    # Log message: "Number of captchas to download: {}".
    logger.info("下载验证码数量:{}".format(num))
    downloader.getVerificationCode(dir_unmarked, num)
    # Log message: "SF Express captcha download program finished".
    logger.info("----顺丰快递验证码下载程序结束----")