Ejemplo n.º 1
0
class DownloadHandler(object):
    """Download a request with retries, passing it through the middlewares.

    Wraps a ``Downloader`` and a ``MiddlewareManager`` built from the same
    configuration. One log line is written per attempt, plus a start line and
    a finish line per request.
    """

    def __init__(self, conf):
        """Build the downloader and the middleware manager from ``conf``.

        :param conf: configuration object; ``conf.get('log')`` supplies the
            logger and ``conf.get('default_request_kwargs')['retry_times']``
            the default number of attempts used by :meth:`download`.
        """
        self.conf = conf
        self.log = conf.get('log')
        self.log.info('DownloadHandler load start')
        self.downloader = Downloader(conf=conf)
        self.middleware_manager = MiddlewareManager(conf)

    def write_log(self, request, response):
        """Log one tab-separated line describing a finished attempt.

        The line is logged at info level when the response status is
        ``CrawlStatus.CRAWL_SUCCESS`` and at error level otherwise.

        :param request: the request that was sent; ``url``, ``download_type``,
            ``proxytime`` and ``proxy`` are read from it.
        :param response: the processed response; ``status``, ``elapsed`` and
            ``http_code`` are read from it.
        """
        info = "status:%s\turl:%s\tdownload_type:%s\telapsed:%s\tproxy_time:%s\thttpcode:%d" \
              % (response.status, request.url, request.download_type, response.elapsed, request.proxytime,
                 response.http_code)
        # Identity test: `!= None` would go through the proxy object's own
        # comparison operators.
        if request.proxy is not None:
            info += "\tproxy:%s:%s" % (request.proxy.host, request.proxy.port)
        if response.status == CrawlStatus.CRAWL_SUCCESS:
            self.log.info(info)
        else:
            self.log.error(info)

    def download(self, request):
        """Download ``request``, retrying until it succeeds or attempts run out.

        Every attempt works on a fresh deep copy of the request, runs the
        request middlewares, downloads, runs the response middlewares and
        logs the outcome. Exceptions are logged and swallowed, so the caller
        always gets a response object back.

        :param request: the request to download; when ``request.retry_times``
            is ``None`` the configured default is used.
        :return: the response of the last completed attempt, or a response
            with status ``CrawlStatus.CRAWL_FAILT`` when no attempt completed.
        """
        self.log.info("start_crawl\turl::%s\tmethod:%s\tdownload_type:%s" %
                      (request.url, request.method, request.download_type))
        start = time.time()
        # Returned as-is when every attempt raises or retry_times is 0.
        response = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT, )
        try:
            if request.retry_times is None:
                retry_times = self.conf.get(
                    'default_request_kwargs')['retry_times']
            else:
                retry_times = request.retry_times
            # range() works on both Python 2 and 3 (xrange is Python 2 only).
            for attempt in range(retry_times):
                # Middlewares may mutate the request, so each attempt gets
                # its own copy.
                request = copy.deepcopy(request)
                self.middleware_manager.process_request(request)
                start_time = time.time()
                res = self.downloader.download(request)
                response = self.middleware_manager.process_response(
                    request, res)
                # Wall-clock time of the attempt minus the reported elapsed
                # time (assumes response.elapsed is in milliseconds - TODO
                # confirm).
                request.proxytime = (time.time() -
                                     start_time) * 1000.0 - response.elapsed
                self.write_log(request, response)
                if response.status == CrawlStatus.CRAWL_SUCCESS:
                    break
                # Back off before the next attempt only; sleeping after the
                # final failure would just delay the caller.
                if attempt < retry_times - 1:
                    time.sleep(3)
        except Exception:
            self.log.error('url:' + request.url + '\terror_msg:' +
                           str(traceback.format_exc()))
        finally:
            content_len = -1
            if response.content:
                content_len = len(response.content)
            self.log.info('finish_crawl\tuse_time:' +
                          str(time.time() - start) + '\tlens:' +
                          str(content_len) + '\tstatus:' +
                          str(response.status) + '\turl:' + str(request.url))
        return response

    def stop(self):
        """Stop the underlying downloader."""
        self.downloader.stop()
Ejemplo n.º 2
0
class BoxCoxRoxPipeline:
    """Run every pipeline stage, in a fixed order, against one database file.

    The stage objects are created once, when the class body is executed, and
    are therefore shared by all instances of the pipeline.
    """

    # Stage objects (class-level, shared between instances).
    dd = Downloader()
    dfti = DFTI()
    sentiment = Sentiment()
    lda = GridSearchLDA()
    dataset_builder = DatasetBuilder()
    ranking = Rankings()
    validator = LinkValidator()

    # Files handed to the stages.
    database_location = "pets.db"
    categories_file = "category.csv"

    def run(self, download_data=True):
        """Execute all stages one after another, printing a banner per stage.

        :param download_data: when false the download stage is skipped (its
            banner is still printed); every other stage always runs.
        """
        print("Downloader...")
        if download_data:
            self.dd.run(self.database_location)

        # Stages that take nothing but the database path, in execution order.
        for banner, stage_attr in (
            ("DFTI...", "dfti"),
            ("Sentiment...", "sentiment"),
            ("Ranking...", "ranking"),
        ):
            print(banner)
            getattr(self, stage_attr).run(self.database_location)

        print("Validator...")
        self.validator.run(top_reviews=50)

        print("LDA...")
        self.lda.run(self.database_location, self.categories_file)

        print("Build Dataset...")
        self.dataset_builder.run(self.database_location)
Ejemplo n.º 3
0
    def start_downloader(self):
        """
        Creates a new downloader, moves it to a fresh thread and starts that thread.

        If a stop was requested in the meantime nothing is started; the stopping and restarting flags are
        simply cleared.
        """
        # In some really rare, not reproducible cases the downloader might start without the user requesting it.
        # The call stack and the state flags are logged to help pinpoint the cause, should it ever happen again.
        self.log.debug("stack ([1][3]):")
        for index, frame in enumerate(inspect.stack()):
            self.log.debug("[" + str(index) + "]= " + str(frame))
        self.log.debug("downloader_is_running: " + str(self.downloader_is_running))
        self.log.debug("downloader_is_restarting: " + str(self.downloader_is_restarting))
        self.log.debug("downloader_is_stopping: " + str(self.downloader_is_stopping))

        # A stop is pending: do not start anything, just reset the flags.
        if self.downloader_is_stopping:
            self.downloader_is_stopping = False
            self.downloader_is_restarting = False
            return

        if self.downloader_is_restarting:
            self.log.info("RESTARTING DOWNLOADER THREAD")
            self.downloader_is_restarting = False
        else:
            self.log.info("STARTING DOWNLOADER THREAD")
            self.window.downloader_starting()

        self.thread = worker_thread = QtCore.QThread(self)
        self.downloader = worker = Downloader()
        worker.moveToThread(worker_thread)

        # Downloader signals -> handlers on this object / the thread.
        worker.running.connect(self.downloader_started)
        worker.finish.connect(worker_thread.quit)
        worker.restart.connect(self.restart_downloader)
        worker.showMessage.connect(self.show_tray_message)
        worker.update_ui.connect(self.update_ui)

        # Thread lifecycle: run the downloader once the thread starts, get notified when it finishes.
        # noinspection PyUnresolvedReferences
        worker_thread.started.connect(worker.execute_once)
        # noinspection PyUnresolvedReferences
        worker_thread.finished.connect(self.downloader_stopped)
        worker_thread.start()
Ejemplo n.º 4
0
def job():
    """Run one download cycle and report its outcome to the health check.

    Pings ``Status.START`` first, runs the downloader and, when
    ``args.convert`` is set, the PDF converter, then pings
    ``Status.SUCCESS``. If any of these steps raises, ``Status.FAIL`` is
    pinged and the error is propagated.

    Raises:
        Exception: whatever the failing step raised, re-raised unchanged
            after the failure ping.
    """
    try:
        HealthCheck.ping_status(Status.START)
        downloader = Downloader(args.username, args.password, args.directory)
        downloader.run()

        if args.convert:
            converter = PDFConverter(args.directory)
            converter.run()

        HealthCheck.ping_status(Status.SUCCESS)
        if "DEV_RUN" in os.environ:
            # Block forever when DEV_RUN is set (presumably to keep a
            # development process alive - confirm).
            while True:
                time.sleep(0.1)
    except Exception:
        HealthCheck.ping_status(Status.FAIL)
        # Bare raise keeps the original exception type, message and
        # traceback; `raise Exception` replaced them with an empty error.
        raise
Ejemplo n.º 5
0
def main():
    """Download the current episode of every anime marked as being watched.

    For each watched entry the episode's video links are scraped (retrying
    for as long as the request is blocked), the first link is downloaded into
    ``DOWNLOAD_PATH`` and the stored episode number is advanced by one.
    Entries without any link are reported and skipped.
    """
    # Firebase domain and credential setup
    fb.setDomain()
    fb.setCredential()

    # connect database
    db = DB()

    # walk over the tracked anime
    for anime in db.watching():
        # only entries flagged as currently watched are processed
        if not anime['watching']:
            continue

        print(f'Downloading episode {anime["episode"]} of {anime["name"]}')

        # NOTE: a new scraper is created for every anime
        episode_scraper = Scraper(anime['url'])

        # fetch the video links; a blocked request is retried after TIMEOUT
        links = None
        fetched = False
        while not fetched:
            try:
                links = episode_scraper.get(anime['episode'])
            except RequestBlocked:
                time.sleep(TIMEOUT)
            else:
                fetched = True

        # nothing to download for this episode
        if not links:
            print(f'Cannot find download link for episode {anime["episode"]} of {anime["name"]}')
            continue

        target_name = f'{anime["name"]} Episode-{anime["episode"]}{FILE_FORMAT}'
        # NOTE: only the first download url is used
        first_link = links[0]
        # NOTE: a new downloader is created for every episode
        Downloader(DOWNLOAD_PATH).download(target_name, first_link)

        print(f'Downloaded episode {anime["episode"]} of {anime["name"]}')

        # advance the stored episode number
        db.update(url=anime['url'], episode=anime['episode'] + 1)
Ejemplo n.º 6
0
def get_zhidao_content(url, method, gap, header, batch_id):
    """Fetch ``url`` through a logged-in downloader that is cached per batch.

    The first call for a given ``batch_id`` creates a ``Downloader``, logs it
    in and stores it on the function object (``_batches``); later calls with
    the same id reuse that instance.

    NOTE: ``method`` is accepted but never used - the request is always
    issued with ``'get'``.

    :param url: address to fetch.
    :param method: unused (see note above).
    :param gap: forwarded to ``Downloader`` when a new one is created.
    :param header: when truthy, applied to the downloader via
        ``update_header`` before the request.
    :param batch_id: key selecting the cached downloader.
    :return: whatever ``requests_with_cache`` returns.
    """
    # Lazily create the per-function cache of downloaders.
    try:
        batches = get_zhidao_content._batches
    except AttributeError:
        batches = get_zhidao_content._batches = {}

    if batches.get(batch_id) is None:
        fresh = Downloader(request=True, gap=gap, batch_id=batch_id)
        fresh.login()
        batches[batch_id] = fresh

    session = batches[batch_id]
    if header:
        session.update_header(header)

    return session.requests_with_cache(url,
                                       'get',
                                       encode='gb18030',
                                       redirect_check=True,
                                       error_check=True,
                                       refresh=False)
Ejemplo n.º 7
0
def test_downloader():
    """Download one hard-coded dianping review page and log the result."""
    page_downloader = Downloader(config.HEADERS)
    review_task = Task(
        'https://www.dianping.com/shop/90556783/review_all',
        '',
        'https://www.dianping.com/shop/90556783',
    )
    logger.info(page_downloader.download_task(review_task))
Ejemplo n.º 8
0
 def __init__(self):
     """Create the downloader, the task queue and the zeroed counter.

     The downloader is built from ``config.HEADERS`` and the task queue from
     ``config.REDIS_DB_URL`` and ``config.REDIS_DB_DATABASE``.
     """
     super().__init__()
     self.count = 0
     self.downloader = Downloader(config.HEADERS)
     queue_url, queue_db = config.REDIS_DB_URL, config.REDIS_DB_DATABASE
     self.task_queue = TaskQueue(queue_url, queue_db)
Ejemplo n.º 9
0
class Scheduler(threading.Thread):
    """Thread that pulls tasks from the task queue, downloads and parses them."""

    def __init__(self):
        """Create the downloader, the task queue and the zeroed counter.

        The downloader is built from ``config.HEADERS`` and the task queue
        from ``config.REDIS_DB_URL`` and ``config.REDIS_DB_DATABASE``.
        """
        super().__init__()
        self.downloader = Downloader(config.HEADERS)
        self.task_queue = TaskQueue(config.REDIS_DB_URL,
                                    config.REDIS_DB_DATABASE)
        self.count = 0

    def append_request_task(self, task: Task):
        """Push ``task`` onto the task queue."""
        self.task_queue.push_task(task)
        # Disabled cap on the number of queued tasks:
        # if self.count <= 50:
        #     self.task_queue.push_task(task)
        #     self.count += 1

    def run(self) -> None:
        """Process queued tasks until the queue has no task left.

        A failing task is retried after a growing delay (20 s, 30 s, ...).
        The retry counter is reset by every success. Once it exceeds
        ``MAX_RETRY`` the last error is re-raised, which ends the thread.
        """
        retry = 0
        while True:
            task = self.task_queue.get_top_task()
            if task is None:
                break

            try:
                self._process_task(task)
                retry = 0
            # Only ordinary errors are retried; a bare `except:` would also
            # trap KeyboardInterrupt and SystemExit and sleep through them.
            except Exception:
                if retry <= MAX_RETRY:
                    logger.warning(
                        f'Failed to process task, attempt the {retry} retry.')
                    retry += 1
                    delay = retry * 10 + 10
                    time.sleep(delay)
                else:
                    raise

    def _process_task(self, task: Task) -> None:
        """Download ``task``, parse the content and handle every parsed item.

        Parsed ``Task`` items are pushed onto the queue and ``Document`` items
        are stored through their class's ``store_item``. The task is dropped
        from the queue only after parsing succeeded, then the method sleeps
        for ``config.DOWNLOAD_DELAY`` seconds.

        Raises:
            Exception: any download or parse error. For parse errors the
                downloaded content is first dumped to ``exception.html``.
        """
        content = self.downloader.download_task(task)
        try:
            parser = get_parser(task.url)
            for item in parser.parse(task, content):
                if isinstance(item, Task):
                    logger.info(f'Append new task {item}')
                    self.task_queue.push_task(item)
                elif isinstance(item, Document):
                    logger.info(f'Save the parsed item {item}')
                    item.__class__.store_item(item)
                else:
                    raise Exception(
                        f'Unsupported parse result: class={item.__class__}')

        except Exception as e:
            # Keep the offending page for debugging. The explicit encoding
            # keeps the dump from failing on a non-UTF-8 default locale.
            with open('exception.html', 'w', encoding='utf-8') as ofile:
                ofile.write(content)
            logger.error('Parse failed with error:')
            logger.exception(e)
            raise

        self.task_queue.drop_top_task(task.type_)

        # Wait for the configured number of seconds (the randomised +-2s
        # variant below is disabled).
        # delay = config.DOWNLOAD_DELAY + random.randint(20, 50) / 10
        delay = config.DOWNLOAD_DELAY
        logger.info(f'Delay for {delay} seconds.')
        time.sleep(delay)
Ejemplo n.º 10
0
class Main(QtCore.QObject):
    """
    Main class, instantiated when the application starts.
    Creates main window/system tray icon, and stops the user from opening more than one instance of the application.
    It also starts/stops the downloader when the user requests or during auto-start on Windows startup.
    """
    def __init__(self):
        """
        Prepares the data folders and the database file, enforces a single running instance, builds the
        main window and the tray icon and then enters the Qt event loop.

        Note that the constructor only returns after the event loop (``self.app.exec_()``) has ended.
        """
        QtCore.QObject.__init__(self)

        # Make sure the required folders/files exist
        if not os.path.isdir(constant.DATA_PATH):
            os.makedirs(constant.DATA_PATH)
        self.log = LoggerManager().get_logger("MAIN")
        try:
            # First run: create the database from the bundled template.
            if not os.path.isfile(constant.DB_PATH):
                shutil.copyfile("dbTemplate.db", constant.DB_PATH)
        except (shutil.Error, IOError) as error:
            self.log.print_traceback(error, self.log.critical)
            sys.exit(1)
        try:
            if not os.path.isdir(constant.DEFAULT_TORRENTS_PATH):
                os.makedirs(constant.DEFAULT_TORRENTS_PATH)
        except Exception as error:
            self.log.print_traceback(error, self.log.critical)
            sys.exit(1)

        self.app = QtSingleApplication(constant.GUID, sys.argv)
        self.log.info("---STARTING APPLICATION---")
        # Another instance is already running: notify it and leave.
        if self.app.isRunning():
            self.log.warning(
                "---The launch of another instance of this application will be cancelled---"
            )
            self.app.sendMessage()
            sys.exit(0)
        self.app.messageReceived.connect(self.another_instance_opened)
        self.app.setQuitOnLastWindowClosed(False)

        self.window = None
        self.tray_icon = None

        # Downloader state: worker thread, worker object, restart timer and status flags.
        self.thread = None
        self.downloader = None
        self.timer = None
        self.downloader_is_running = False
        self.downloader_is_restarting = False
        self.downloader_is_stopping = False

        try:
            self.window = WindowMain(self)
            self.tray_icon = SystemTrayIcon(self, self.window)
            self.tray_icon.show()

            # Without "-nogui" the window is shown; with it, the downloader is started
            # right away as long as the window is not visible.
            show_gui = "-nogui" not in sys.argv
            if show_gui:
                if self.downloader_is_running:
                    self.window.downloader_started()
                else:
                    self.window.downloader_stopped()
                self.window.show()
            elif not self.window.is_visible():
                self.log.info("STARTING DOWNLOADER")
                self.start_downloader()
            # Blocks until the Qt event loop ends.
            self.app.exec_()
        except Exception as unforeseenError:
            self.log.critical("UNFORESEEN ERROR")
            self.log.print_traceback(unforeseenError, self.log.critical)
            if self.tray_icon is not None:
                self.show_tray_message("Unforeseen error occurred...")
            # NOTE(review): this is the builtin exit() called without a status, unlike the
            # sys.exit(1) used for the earlier failures - confirm this is intended.
            exit()

    def quit(self):
        """
        Finishes the application gracefully - at least tries to, teehee (^_^;)

        Hides and schedules deletion of the tray icon, stops the restart timer, stops the downloader if its
        thread is still running and finally quits the application.
        """
        if self.tray_icon is not None:
            self.tray_icon.hide()
            self.tray_icon.deleteLater()
        if self.timer is not None:
            self.timer.stop()
        if self.thread is not None and self.thread.isRunning():
            self.stop_downloader()
        #self.app.closeAllWindows()
        self.app.quit()

    def another_instance_opened(self, _):
        """
        Called when the user tries to open another instance of the application.
        Instead of allowing it, will open the current one to avoid any errors.

        :type _: QtCore.QString
        :param _: message received, see class QtSingleApplication below.
        """
        self.window.show()

    def start_downloader(self):
        """
        Starts the downloader in a thread.

        If a stop was requested in the meantime (``downloader_is_stopping``), nothing is started and the
        stopping/restarting flags are simply cleared.
        """
        # Don't know how to reproduce, but in some really rare cases the downloader might start without the user requesting it.
        # These logs try to collect information that might help pinpoint what causes that.
        # Actually, it's been so long since the last time this error was observed that I don't know if it still happens
        # or if whatever caused it was fixed...
        self.log.debug("stack ([1][3]):")
        i = 0
        for item in inspect.stack():
            self.log.debug("[" + str(i) + "]= " + str(item))
            i += 1
        self.log.debug("downloader_is_running: " +
                       str(self.downloader_is_running))
        self.log.debug("downloader_is_restarting: " +
                       str(self.downloader_is_restarting))
        self.log.debug("downloader_is_stopping: " +
                       str(self.downloader_is_stopping))

        if not self.downloader_is_stopping:
            if self.downloader_is_restarting:
                self.log.info("RESTARTING DOWNLOADER THREAD")
                self.downloader_is_restarting = False
            else:
                self.log.info("STARTING DOWNLOADER THREAD")
                self.window.downloader_starting()
            # A new thread and a new downloader object are created on every (re)start.
            self.thread = QtCore.QThread(self)
            self.downloader = Downloader()
            self.downloader.moveToThread(self.thread)
            # Downloader signals -> handlers on this object / the thread.
            self.downloader.running.connect(self.downloader_started)
            self.downloader.finish.connect(self.thread.quit)
            self.downloader.restart.connect(self.restart_downloader)
            self.downloader.showMessage.connect(self.show_tray_message)
            self.downloader.update_ui.connect(self.update_ui)
            # noinspection PyUnresolvedReferences
            self.thread.started.connect(
                self.downloader.execute_once
            )  # PyCharm doesn't recognize started.connect()...
            # noinspection PyUnresolvedReferences
            self.thread.finished.connect(
                self.downloader_stopped
            )  # PyCharm doesn't recognize finished.connect()...
            self.thread.start()
        else:
            # A stop is pending: do not start, just reset the flags.
            self.downloader_is_stopping = False
            self.downloader_is_restarting = False

    def stop_downloader(self):
        """
        Stops the downloader (¬_¬)

        Asks the downloader to stop, waits up to 300 ms for its thread and falls back to quit()/terminate()
        when the thread is still running. The restart timer, if any, is stopped as well.
        """
        self.log.info("TERMINATING DOWNLOADER THREAD")
        self.window.downloader_stopping()
        self.downloader_is_stopping = True
        self.downloader_is_restarting = False
        if self.thread.isRunning():
            self.downloader.stop_thread()
            thread_stopped_gracefully = self.thread.wait(300)
            if self.thread.isRunning():
                # NOTE(review): QThread.quit() is documented without a return value, so this presumably
                # stores None and the terminate() below always follows - confirm this is intended.
                thread_stopped_gracefully = self.thread.quit()
            self.log.info("THREAD STOPPED CORRECTLY: %s" %
                          thread_stopped_gracefully)
            if not thread_stopped_gracefully:
                self.thread.terminate()
        else:
            self.downloader_stopped()
        try:
            self.timer.stop()
        except AttributeError:
            pass  # Happens when the downloader is interrupted before being able to fully execute at least once.

    def restart_downloader(self):
        """
        Finishes the current downloader thread and starts a timer.
        When the timer times out a new downloader thread is created.
        """
        self.downloader_is_restarting = True
        self.thread.quit()
        self.log.info("THREAD FINISHED CORRECTLY: %s" % self.thread.wait(300))
        self.timer = QtCore.QTimer()
        # noinspection PyUnresolvedReferences
        self.timer.timeout.connect(
            self.start_downloader
        )  # PyCharm doesn't recognize timeout.connect()...
        self.timer.setSingleShot(True)
        # presumably sleep_time is stored in seconds and converted to milliseconds here - confirm
        self.timer.start(db.DBManager().get_config().sleep_time * 1000)

    @QtCore.pyqtSlot()
    def downloader_started(self):
        """
        Downloader thread started correctly; notifies the user.
        """
        self.downloader_is_running = True
        self.window.downloader_started()

    @QtCore.pyqtSlot()
    def downloader_stopped(self):
        """
        Downloader thread stopped correctly; notifies the user.

        Does nothing while a restart is in progress, so the state flags and the window are left untouched.
        """
        if not self.downloader_is_restarting:
            self.downloader_is_running = False
            self.downloader_is_stopping = False
            self.downloader_is_restarting = False
            self.window.downloader_stopped()

    @QtCore.pyqtSlot(str)
    def show_tray_message(self, message):
        """
        Uses the system tray icon to notify the user about something.

        :type message: str
        :param message: Message to be shown to the user.
        """
        # TODO: Would it be better if this were moved to manager.system_tray_icon?
        self.tray_icon.showMessage(constant.TRAY_MESSAGE_TITLE, message,
                                   QtGui.QSystemTrayIcon.Information, 5000)

    @QtCore.pyqtSlot(str)
    def update_ui(self, message):
        """
        Updates the anime table in the main window.
        Also, shows a message to the user using the system tray icon.

        :type message: str
        :param message: Message to be shown to the user.
        """
        if self.window is not None:
            self.window.update_anime_table()
        self.show_tray_message(message)
Ejemplo n.º 11
0
from downloader.downloader import Downloader

# Example usage: build a downloader with its default arguments, then call
# both download entry points in turn.
pmc_downloader = Downloader()
pmc_downloader.download_all()
# Query-based download, using 'sarcosine' as the query string.
pmc_downloader.download_by_query('sarcosine')
Ejemplo n.º 12
0
 def __init__(self, conf):
     """Build the downloader and the middleware manager from ``conf``.

     ``conf.get('log')`` supplies the logger, which is used right away to
     announce that the handler is loading.
     """
     self.conf = conf
     self.log = logger = conf.get('log')
     logger.info('DownloadHandler load start')
     downloader = Downloader(conf=conf)
     self.downloader = downloader
     self.middleware_manager = MiddlewareManager(conf)
Ejemplo n.º 13
0

def initLogger():
    """Attach a daily log-file handler and a console handler to ``logger``.

    The file handler appends to ``download.<today>.log`` inside ``dir_logs``
    (created when missing) with timestamped records. The console handler
    prints a shorter format. The logger and both handlers are set to INFO.
    """
    # makedirs(..., exist_ok=True) also creates missing parent directories
    # and avoids the race between an existence check and the creation.
    os.makedirs(dir_logs, exist_ok=True)
    log_path = os.path.join(
        dir_logs, "download." + str(datetime.date.today()) + ".log")

    file_handler = logging.FileHandler(log_path, mode="a", encoding="utf8")
    file_handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s [%(levelname)s] [%(lineno)d] >> %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S"))

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter(fmt="[%(levelname)s] >> %(message)s"))

    logger.setLevel(logging.INFO)
    for handler in (file_handler, console_handler):
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)


# Script entry point: set up logging, ask how many verification codes to
# download and fetch them into dir_unmarked.
if __name__ == "__main__":
    initLogger()
    # Log message: "SF Express verification code download program started".
    logger.info("----顺丰快递验证码下载程序启动----")
    downloader = Downloader()
    # Prompt: "Please enter the number of verification codes to download".
    # int() raises ValueError when the input is not an integer.
    num = int(input("请输入下载的验证码数量:\n"))
    # Log message: "Number of verification codes to download: {}".
    logger.info("下载验证码数量:{}".format(num))
    downloader.getVerificationCode(dir_unmarked, num)
    # Log message: "SF Express verification code download program finished".
    logger.info("----顺丰快递验证码下载程序结束----")