Example #1
0
class BoxCoxRoxPipeline:
    """Run a fixed sequence of processing stages against one database file.

    Every stage object below is created once, when the class body executes,
    and is therefore shared by all instances of the pipeline.
    """

    # Stage objects, instantiated in declaration order.
    dd = Downloader()
    dfti = DFTI()
    sentiment = Sentiment()
    lda = GridSearchLDA()
    dataset_builder = DatasetBuilder()
    ranking = Rankings()
    validator = LinkValidator()

    # File names handed to the stages.
    database_location = "pets.db"
    categories_file = "category.csv"

    def run(self, download_data=True):
        """Execute every stage in order, printing a banner before each one.

        The downloader banner is always printed; the download itself only
        runs when ``download_data`` is true.
        """
        print("Downloader...")
        if download_data:
            self.dd.run(self.database_location)

        # (banner, action) pairs for the stages that always run.
        remaining_stages = (
            ("DFTI...",
             lambda: self.dfti.run(self.database_location)),
            ("Sentiment...",
             lambda: self.sentiment.run(self.database_location)),
            ("Ranking...",
             lambda: self.ranking.run(self.database_location)),
            ("Validator...",
             lambda: self.validator.run(top_reviews=50)),
            ("LDA...",
             lambda: self.lda.run(self.database_location,
                                  self.categories_file)),
            ("Build Dataset...",
             lambda: self.dataset_builder.run(self.database_location)),
        )
        for banner, stage in remaining_stages:
            print(banner)
            stage()
Example #2
0
    def start_downloader(self):
        """
        Create a new Downloader, move it to a new QThread and start that thread.

        When a stop is pending (``downloader_is_stopping`` is set) nothing is
        started; the stopping and restarting flags are cleared instead.
        """
        # Diagnostics for an old, hard-to-reproduce report of the downloader
        # starting without the user requesting it: dump the call stack and the
        # state flags. It is unknown whether the problem still occurs.
        self.log.debug("stack ([1][3]):")
        for index, frame in enumerate(inspect.stack()):
            self.log.debug("[{}]= {}".format(index, str(frame)))
        for flag in ("downloader_is_running",
                     "downloader_is_restarting",
                     "downloader_is_stopping"):
            self.log.debug(flag + ": " + str(getattr(self, flag)))

        # A pending stop wins: reset the flags and do not start anything.
        if self.downloader_is_stopping:
            self.downloader_is_stopping = False
            self.downloader_is_restarting = False
            return

        if self.downloader_is_restarting:
            self.log.info("RESTARTING DOWNLOADER THREAD")
            self.downloader_is_restarting = False
        else:
            self.log.info("STARTING DOWNLOADER THREAD")
            self.window.downloader_starting()

        worker_thread = QtCore.QThread(self)
        self.thread = worker_thread
        worker = Downloader()
        self.downloader = worker
        worker.moveToThread(worker_thread)

        # Wire the worker's signals to their handlers.
        for signal, slot in (
            (worker.running, self.downloader_started),
            (worker.finish, worker_thread.quit),
            (worker.restart, self.restart_downloader),
            (worker.showMessage, self.show_tray_message),
            (worker.update_ui, self.update_ui),
        ):
            signal.connect(slot)

        # PyCharm doesn't recognize started.connect() / finished.connect().
        # noinspection PyUnresolvedReferences
        worker_thread.started.connect(worker.execute_once)
        # noinspection PyUnresolvedReferences
        worker_thread.finished.connect(self.downloader_stopped)
        worker_thread.start()
Example #3
0
def job():
    """Run one download (and optional conversion) cycle and report its status.

    Pings the health check with ``Status.START`` first, runs the downloader
    built from ``args.username`` / ``args.password`` / ``args.directory``,
    runs the PDF converter when ``args.convert`` is set, then pings
    ``Status.SUCCESS``. When the ``DEV_RUN`` environment variable is present
    the function never returns after a successful run.

    Raises:
        Exception: any error raised by the steps above. A ``Status.FAIL`` ping
            is sent first and the original exception is then re-raised
            unchanged.
    """
    try:
        HealthCheck.ping_status(Status.START)
        downloader = Downloader(args.username, args.password, args.directory)
        downloader.run()

        if args.convert:
            converter = PDFConverter(args.directory)
            converter.run()

        HealthCheck.ping_status(Status.SUCCESS)
        if "DEV_RUN" in os.environ:
            # Development runs park here forever instead of returning.
            while True:
                time.sleep(0.1)
    except Exception:
        HealthCheck.ping_status(Status.FAIL)
        # Bare ``raise`` keeps the original exception type, message and
        # traceback; ``raise Exception`` replaced them with an empty error.
        raise
def _fetch_videos(scraper, episode):
    """Ask the scraper for an episode, retrying for as long as requests are blocked."""
    while True:
        try:
            return scraper.get(episode)
        except RequestBlocked:
            # Wait before trying again; there is no retry limit.
            time.sleep(TIMEOUT)


def main():
    """Download the next episode of every anime flagged as being watched.

    For each such entry the first video returned by the scraper is downloaded
    into ``DOWNLOAD_PATH`` and the stored episode number is then incremented.
    Entries for which the scraper returns nothing are reported and skipped.
    """
    # Firebase domain and credential setup.
    fb.setDomain()
    fb.setCredential()

    database = DB()

    for anime in database.watching():
        # Skip entries that are not currently being watched.
        if not anime['watching']:
            continue

        print(f'Downloading episode {anime["episode"]} of {anime["name"]}')

        videos = _fetch_videos(Scraper(anime['url']), anime['episode'])
        if not videos:
            print(f'Cannot find download link for episode {anime["episode"]} of {anime["name"]}')
            continue

        filename = f'{anime["name"]} Episode-{anime["episode"]}{FILE_FORMAT}'
        # Only the first download URL is used.
        first_video = videos[0]
        Downloader(DOWNLOAD_PATH).download(filename, first_video)

        print(f'Downloaded episode {anime["episode"]} of {anime["name"]}')

        # Advance the stored episode number for this entry.
        database.update(url=anime['url'], episode=anime['episode'] + 1)
Example #5
0
def get_zhidao_content(url, method, gap, header, batch_id):
    """Fetch ``url`` through a Downloader that is cached per ``batch_id``.

    The first call for a given ``batch_id`` creates a Downloader, logs it in
    and stores it on this function object; later calls reuse it. A truthy
    ``header`` is applied to the downloader before the request is made.

    Note: ``method`` is accepted but not used; the request is always issued
    with ``'get'``.

    Returns whatever ``requests_with_cache`` returns.
    """
    # The per-batch cache lives on the function object itself.
    batches = get_zhidao_content.__dict__.setdefault('_batches', {})

    downloader = batches.get(batch_id)
    if downloader is None:
        downloader = Downloader(request=True, gap=gap, batch_id=batch_id)
        downloader.login()
        # Stored only after a successful login.
        batches[batch_id] = downloader

    if header:
        downloader.update_header(header)

    request_options = dict(
        encode='gb18030',
        redirect_check=True,
        error_check=True,
        refresh=False,
    )
    return downloader.requests_with_cache(url, 'get', **request_options)
Example #6
0
def test_downloader():
    """Run a single review-page Task through the Downloader and log the result."""
    shop_url = 'https://www.dianping.com/shop/90556783'
    downloader = Downloader(config.HEADERS)
    review_task = Task(shop_url + '/review_all', '', shop_url)
    logger.info(downloader.download_task(review_task))
Example #7
0
 def __init__(self):
     """Initialise the base class, then set up the counter, downloader and task queue."""
     super().__init__()
     self.count = 0
     self.downloader = Downloader(config.HEADERS)
     # The queue is configured from the two REDIS_DB_* settings in config.
     queue_url, queue_database = config.REDIS_DB_URL, config.REDIS_DB_DATABASE
     self.task_queue = TaskQueue(queue_url, queue_database)
Example #8
0
from downloader.downloader import Downloader

# Instantiate the Downloader with no arguments.
pmc_downloader = Downloader()
# Run the unfiltered download first ...
pmc_downloader.download_all()
# ... then a download restricted to the query string 'sarcosine'.
pmc_downloader.download_by_query('sarcosine')
Example #9
0
 def __init__(self, conf):
     """Keep the configuration and its logger, then build the downloader and middleware manager.

     ``conf`` must provide a logger under the ``'log'`` key; it is used
     immediately to announce that loading has started.
     """
     log = conf.get('log')
     self.conf, self.log = conf, log
     log.info('DownloadHandler load start')
     self.downloader = Downloader(conf=conf)
     self.middleware_manager = MiddlewareManager(conf)
Example #10
0

def initLogger():
    """Attach a dated file handler and a console handler to the module ``logger``.

    The log directory ``dir_logs`` is created when it does not exist. The file
    handler appends to ``download.<today's date>.log`` in that directory using
    UTF-8 and a timestamped format; the console handler uses a shorter format.
    The logger and both handlers are set to ``INFO``.

    Note: each call adds two more handlers to ``logger``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir() and
    # also creates any missing parent directories.
    os.makedirs(dir_logs, exist_ok=True)

    log_path = os.path.join(
        dir_logs, "download." + str(datetime.date.today()) + ".log")
    handler1 = logging.FileHandler(log_path, mode="a", encoding="utf8")
    handler2 = logging.StreamHandler()

    formatter1 = logging.Formatter(
        fmt="%(asctime)s [%(levelname)s] [%(lineno)d] >> %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S")
    formatter2 = logging.Formatter(fmt="[%(levelname)s] >> %(message)s")
    handler1.setFormatter(formatter1)
    handler2.setFormatter(formatter2)

    logger.setLevel(logging.INFO)
    handler1.setLevel(logging.INFO)
    handler2.setLevel(logging.INFO)
    logger.addHandler(handler1)
    logger.addHandler(handler2)


if __name__ == "__main__":
    # Script entry point: set up logging, ask how many verification codes to
    # download, then download them into dir_unmarked.
    initLogger()
    logger.info("----顺丰快递验证码下载程序启动----")
    downloader = Downloader()

    # Prompt for the number of verification codes; must parse as an integer.
    answer = input("请输入下载的验证码数量:\n")
    num = int(answer)
    logger.info("下载验证码数量:{}".format(num))

    downloader.getVerificationCode(dir_unmarked, num)
    logger.info("----顺丰快递验证码下载程序结束----")