Python get_categories Exemples, scraper.get_categories Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : __main__.py Projet : aokolovskis/blinkist-scraper

def main():
    """Command-line entry point for the Blinkist scraper.

    Parses the command-line arguments and then either:

    * with ``--no-scrape``: regenerates the output documents from the json
      files already present in the ``dump`` folder of the current working
      directory, or
    * otherwise: starts a browser driver, logs in and scrapes a single book
      (``--book`` / ``--daily-book``), a list of books (``--books``) or all
      categories followed by the books that were not processed through any
      category.

    A summary with the number of processed books and the elapsed time is
    logged at the end of the run, and the browser driver (if one was
    started) is closed even when the run is aborted by an exception.
    """
    parser = argparse.ArgumentParser(
        description="Scrape blinkist.com and generate pretty output")

    parser.add_argument(
        "--language",
        choices={"en", "de"},
        default="en",
        help=
        "The language to scrape books in - either 'en' for english or 'de' for german",
    )
    parser.add_argument(
        "--match-language",
        action="store_true",
        default=False,
        help=
        "Skip scraping books if not in the requested language (not all book are avaible in german)",
    )

    def check_cooldown(value):
        """argparse ``type`` callback: parse *value* as an int that is >= 1."""
        if int(value) < 1:
            raise argparse.ArgumentTypeError("Can't be smaller than 1")
        return int(value)

    parser.add_argument(
        "--cooldown",
        type=check_cooldown,
        default=1,
        help=
        "Seconds to wait between scraping books, and downloading audio files. Can't be smaller than 1",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=False,
        help=
        "Start the automated web browser in headless mode. Works only if you already logged in once",
    )
    parser.add_argument(
        "--audio",
        action="store_true",
        default=False,
        help="Download the audio blinks for each book",
    )
    parser.add_argument(
        "--concat-audio",
        action="store_true",
        default=False,
        help=
        "Concatenate the audio blinks into a single file and tag it. Requires ffmpeg",
    )
    parser.add_argument(
        "--keep-noncat",
        action="store_true",
        default=False,
        help=
        "Keep the individual blink audio files, instead of deleting them (works with '--concat-audio' only)",
    )
    parser.add_argument(
        "--no-scrape",
        action="store_true",
        default=False,
        help=
        "Don't scrape the website, only process existing json files in the dump folder. Do not provide email or password with this option.",
    )
    parser.add_argument(
        "--book",
        default=False,
        help="Scrapes this book only, takes the blinkist url for the book"
        "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)",
    )
    parser.add_argument(
        "--daily-book",
        action="store_true",
        default=False,
        help="Scrapes the free daily book only.",
    )
    parser.add_argument(
        "--books",
        default=False,
        help=
        "Scrapes the list of books, takes a txt file with the list of blinkist urls for the books"
        "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)",
    )
    parser.add_argument(
        "--book-category",
        default="Uncategorized",
        help=
        "When scraping a single book, categorize it under this category (works with '--book' only)",
    )
    parser.add_argument(
        "--categories",
        type=str,
        nargs="+",
        default="",
        help=
        ("Only the categories whose label contains at least one string here will be scraped."
         "Case-insensitive; use spaces to separate categories. "
         "(e.g. '--categories entrep market' will only scrape books under 'Entrepreneurship' and 'Marketing & Sales')"
         ),
    )
    parser.add_argument(
        "--ignore-categories",
        type=str,
        nargs="+",
        default="",
        help=
        ("If a category label contains anything in ignored_categories, books under that category will not be scraped. "
         "Case-insensitive; use spaces to separate categories. "
         "(e.g. '--ignore-categories entrep market' will skip scraping of 'Entrepreneurship' and 'Marketing & Sales')"
         ),
    )
    # NOTE(review): "store_true" combined with default=True means the two
    # flags below are always True, whether or not they are passed.
    parser.add_argument(
        "--create-html",
        action="store_true",
        default=True,
        help="Generate a formatted html document for the book",
    )
    parser.add_argument(
        "--create-epub",
        action="store_true",
        default=True,
        help="Generate a formatted epub document for the book",
    )
    parser.add_argument(
        "--create-pdf",
        action="store_true",
        default=False,
        help=
        "Generate a formatted pdf document for the book. Requires wkhtmltopdf",
    )
    parser.add_argument(
        "--save-cover",
        action="store_true",
        default=False,
        help="Save a copy of the Blink cover artwork in the folder",
    )
    parser.add_argument(
        "--embed-cover-art",
        action="store_true",
        default=False,
        help=
        "Embed the Blink cover artwork into the concatenated audio file (works with '--concat-audio' only)",
    )
    parser.add_argument(
        "--chromedriver",
        help=
        "Path to a specific chromedriver executable instead of the built-in one",
    )
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="Increases logging verbosity")

    # The credentials are positional (hence mandatory) arguments, so they
    # are only registered when a scrape is actually going to happen.
    if "--no-scrape" not in sys.argv:
        parser.add_argument(
            "email",
            help="The email to log into your premium Blinkist account")
        parser.add_argument(
            "password",
            help="The password to log into your premium Blinkist account")

    args = parser.parse_args()

    # set up logger verbosity
    logger.set_verbose(log, args.verbose)

    def generate_book_outputs(book_json, cover_img=False):
        """Generate every output format enabled on the command line."""
        if args.create_html:
            generator.generate_book_html(book_json, cover_img)
        if args.create_epub:
            generator.generate_book_epub(book_json)
        if args.create_pdf:
            generator.generate_book_pdf(book_json, cover_img)

    def scrape_book(driver, processed_books, book_url, category,
                    match_language):
        """Scrape one book, produce its outputs and record it as processed.

        ``book_url`` is appended to ``processed_books`` (in place) when the
        scraper returned data for it. Returns the ``dump_exists`` value
        reported by ``scraper.scrape_book_data``; callers skip the cooldown
        when it is truthy.
        """
        book_json, dump_exists = scraper.scrape_book_data(
            driver, book_url, category=category, match_language=match_language)
        if book_json:
            cover_img_file = False
            cover_tmp_file = False
            if args.audio:
                # reuse previously scraped audio when available
                audio_files = scraped_audio_exists(book_json)
                if not audio_files:
                    audio_files = scraper.scrape_book_audio(
                        driver, book_json, args.language)
                # only an actual list of files can be concatenated
                if (audio_files and args.concat_audio
                        and isinstance(audio_files, list)):
                    if args.embed_cover_art:
                        cover_tmp_file = scraper.download_book_cover_image(
                            book_json,
                            filename="_cover.jpg",
                            alt_file="cover.jpg")
                    generator.combine_audio(book_json, audio_files,
                                            args.keep_noncat,
                                            cover_tmp_file)
            if args.save_cover:
                cover_img_file = scraper.download_book_cover_image(
                    book_json, filename="cover.jpg", alt_file="_cover.jpg")
                generate_book_outputs(book_json, cover_img=cover_img_file)
            else:
                generate_book_outputs(book_json)
            # the temporary cover was only needed for embedding into audio
            if cover_tmp_file:
                if os.path.exists(cover_tmp_file):
                    log.debug(f"Deleting {cover_tmp_file}")
                    os.remove(cover_tmp_file)
                else:
                    log.debug(f'Could not find "{cover_tmp_file}"')
            processed_books.append(book_url)
        return dump_exists

    def finish(start_time, processed_books, driver=None):
        """Close the driver (if any) and log the run summary."""
        if driver:
            driver.close()
        elapsed_time = time.time() - start_time
        formatted_time = "{:02d}:{:02d}:{:02d}".format(
            int(elapsed_time // 3600),
            int(elapsed_time % 3600 // 60),
            int(elapsed_time % 60),
        )
        total_books = len(processed_books)
        log.info(
            f"Processed {total_books} book{'s' if total_books != 1 else ''} in {formatted_time}"
        )

    # start scraping
    log.info("Starting scrape run...")
    processed_books = []
    start_time = time.time()

    if args.no_scrape:
        # if the --no-scrape argument is passed, just process the
        # existing json dump files
        for file in glob.glob(os.path.join(os.getcwd(), "dump", "*.json")):
            generate_book_outputs(file)
            processed_books.append(file)
        finish(start_time, processed_books)
    else:
        match_language = args.language if args.match_language else ""
        # if no login cookies were found, don't start a headless browser
        # so that the user can solve recaptcha and log in
        start_headless = args.headless
        if not scraper.has_login_cookies():
            start_headless = False
        # add uBlock (if the conditions are right)
        use_ublock = not (args.book or args.headless)
        driver = scraper.initialize_driver(
            headless=start_headless,
            with_ublock=use_ublock,
            chromedriver_path=args.chromedriver,
        )

        # From here on a driver exists: make sure it is closed and the
        # summary is logged even if scraping is aborted by an exception.
        try:
            is_logged_in = scraper.login(driver, args.language, args.email,
                                         args.password)
            if is_logged_in:
                if args.book or args.daily_book:
                    # scrape single book
                    book_url = (args.book
                                if not args.daily_book else
                                scraper.get_daily_book_url(
                                    driver, args.language))
                    scrape_book(
                        driver,
                        processed_books,
                        book_url,
                        category={"label": args.book_category},
                        match_language=match_language,
                    )
                elif args.books:
                    # scrape list of books
                    with open(args.books, "r") as books_urls:
                        for line in books_urls:
                            book_url = line.strip()
                            if not book_url:
                                # blank line in the list file: nothing
                                # to scrape
                                continue
                            dump_exists = scrape_book(
                                driver,
                                processed_books,
                                book_url,
                                category={"label": args.book_category},
                                match_language=match_language,
                            )
                            if not dump_exists:
                                time.sleep(args.cooldown)
                else:
                    # scrape all books / categories
                    all_books = scraper.get_all_books(driver, args.language)
                    categories = scraper.get_categories(
                        driver,
                        args.language,
                        specified_categories=args.categories,
                        ignored_categories=args.ignore_categories,
                    )
                    for category in categories:
                        books_urls = scraper.get_all_books_for_categories(
                            driver, category)
                        for book_url in books_urls:
                            dump_exists = scrape_book(
                                driver,
                                processed_books,
                                book_url,
                                category=category,
                                match_language=match_language,
                            )
                            # if we processed the book from an existing dump
                            # no scraping was involved, no need to cooldown
                            if not dump_exists:
                                time.sleep(args.cooldown)
                    # whatever was not processed through a category above
                    uncategorized_books = [
                        x for x in all_books if x not in processed_books
                    ]
                    log.info(
                        f"Scraping {len(uncategorized_books)} remaining uncategorized books..."
                    )
                    for book_url in uncategorized_books:
                        dump_exists = scrape_book(
                            driver,
                            processed_books,
                            book_url,
                            category={"label": "Uncategorized"},
                            match_language=match_language,
                        )
                        if not dump_exists:
                            time.sleep(args.cooldown)
            else:
                log.error("Unable to login into Blinkist")
        finally:
            finish(start_time, processed_books, driver)

Exemple #2

0

Afficher le fichier

def main():
  """Command-line entry point for the Blinkist scraper.

  Parses the command-line arguments, configures the logger and then either:

  * with ``--no-scrape``: regenerates the output documents from the json
    files already present in the ``dump`` folder of the current working
    directory, or
  * otherwise: starts a browser driver, logs in and scrapes a single book
    (``--book``), a list of books (``--books``) or all categories.

  A summary with the number of processed books and the elapsed time is
  logged at the end of the run, and the browser driver (if one was started)
  is closed even when the run is aborted by an exception.
  """
  parser = argparse.ArgumentParser(description="Scrape blinkist.com and generate pretty output")

  parser.add_argument("--language", choices={"en", "de"}, default="en",
                      help="The language to scrape books in - either 'en' for english or 'de' for german")
  parser.add_argument("--match-language", action="store_true", default=False,
                      help="Skip scraping books if not in the requested language (not all book are avaible in german)")

  def check_cooldown(value):
    """argparse ``type`` callback: parse *value* as an int that is >= 1."""
    if int(value) < 1:
      raise argparse.ArgumentTypeError("Can't be smaller than 1")
    return int(value)

  parser.add_argument("--cooldown", type=check_cooldown, default=1,
                      help="Seconds to wait between scraping books, and downloading audio files. Can't be smaller than 1")
  parser.add_argument("--headless", action="store_true", default=False,
                      help="Start the automated web browser in headless mode. Works only if you already logged in once")
  # NOTE(review): "store_true" combined with default=True (here and on
  # --create-html / --create-epub below) means the value is always True,
  # whether or not the flag is passed.
  parser.add_argument("--audio", action="store_true", default=True,
                      help="Download the audio blinks for each book")
  parser.add_argument("--concat-audio", action="store_true", default=False,
                      help="Concatenate the audio blinks into a single file and tag it. Requires ffmpeg")
  parser.add_argument("--keep-noncat", action="store_true", default=False,
                      help="Keep the individual blink audio files, instead of deleting them (works with '--concat-audio' only")
  parser.add_argument("--no-scrape", action="store_true", default=False,
                      help="Don't scrape the website, only process existing json files in the dump folder. Do not provide email or password with this option.")
  parser.add_argument("--book", default=False,
                      help="Scrapes this book only, takes the blinkist url for the book"
                      "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)")
  parser.add_argument("--books", default=False,
                      help="Scrapes the list of books, takes a txt file with the list of blinkist urls for the books"
                      "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)")
  parser.add_argument("--book-category", default="Uncategorized",
                      help="When scraping a single book, categorize it under this category (works with '--book' only)")
  parser.add_argument("--categories", type=str, nargs="+", default="",
                      help=("Only the categories whose label contains at least one string here will be scraped."
                      "Case-insensitive; use spaces to separate categories. "
                      "(e.g. '--categories entrep market' will only scrape books under 'Entrepreneurship' and 'Marketing & Sales')"))
  parser.add_argument("--ignore-categories", type=str, nargs="+", default="",
                      help=("If a category label contains anything in ignored_categories, books under that category will not be scraped. "
                            "Case-insensitive; use spaces to separate categories. "
                            "(e.g. '--ignore-categories entrep market' will skip scraping of 'Entrepreneurship' and 'Marketing & Sales')"))
  parser.add_argument("--create-html", action="store_true", default=True,
                      help="Generate a formatted html document for the book")
  parser.add_argument("--create-epub", action="store_true", default=True,
                      help="Generate a formatted epub document for the book")
  parser.add_argument("--create-pdf", action="store_true", default=False,
                      help="Generate a formatted pdf document for the book. Requires wkhtmltopdf")
  parser.add_argument("--save-cover", action="store_true", default=False,
                      help="Save a copy of the Blink cover artwork in the folder")
  parser.add_argument("--embed-cover-art", action="store_true", default=False,
                      help="Embed the Blink cover artwork into the concatenated audio file (works with '--concat-audio' only)")
  parser.add_argument("--chromedriver", help='Path to a specific chromedriver executable instead of the built-in one')
  parser.add_argument("-v", "--verbose", action="store_true", help="Increases logging verbosity")

  # The credentials are positional (hence mandatory) arguments, so they are
  # only registered when a scrape is actually going to happen.
  if '--no-scrape' not in sys.argv:
    parser.add_argument("email", help="The email to log into your premium Blinkist account")
    parser.add_argument("password", help="The password to log into your premium Blinkist account")

  args = parser.parse_args()

  # set up logger
  log.setLevel(logging.INFO if not args.verbose else logging.DEBUG)
  log_screen_handler = logging.StreamHandler(stream=sys.stdout)
  log.addHandler(log_screen_handler)
  log.propagate = False
  try:
    import colorama, copy

    LOG_COLORS = {
      logging.DEBUG: colorama.Fore.GREEN,
      logging.INFO: colorama.Fore.BLUE,
      logging.WARNING: colorama.Fore.YELLOW,
      logging.ERROR: colorama.Fore.RED,
      logging.CRITICAL: colorama.Back.RED
    }

    class ColorFormatter(logging.Formatter):
      """Formatter that wraps the level name in a per-level color."""

      def format(self, record, *args, **kwargs):
        # if the corresponding logger has children, they may receive modified
        # record, so we want to keep it intact
        new_record = copy.copy(record)
        if new_record.levelno in LOG_COLORS:
          new_record.levelname = "{color_begin}{level}{color_end}".format(
              level=new_record.levelname,
              color_begin=LOG_COLORS[new_record.levelno],
              color_end=colorama.Style.RESET_ALL,
          )
        return super(ColorFormatter, self).format(new_record, *args, **kwargs)

    log_screen_handler.setFormatter(ColorFormatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
      datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
        color_begin=colorama.Style.DIM,
        color_end=colorama.Style.RESET_ALL
      )))
  except ModuleNotFoundError:
    # colored output is optional: without colorama the handler keeps its
    # default formatter
    pass

  def generate_book_outputs(book_json, cover_img=False):
    """Generate every output format enabled on the command line."""
    if args.create_html:
      generator.generate_book_html(book_json, cover_img)
    if args.create_epub:
      generator.generate_book_epub(book_json)
    if args.create_pdf:
      generator.generate_book_pdf(book_json, cover_img)

  def scrape_book(driver, processed_books, book_url, category, match_language):
    """Scrape one book, produce its outputs and record it as processed.

    ``processed_books`` is a list that is extended in place with
    ``book_url`` when the scraper returned data for it. (Incrementing an
    int parameter here would only rebind the local name and the caller's
    count would never change.) Returns the ``dump_exists`` value reported
    by ``scraper.scrape_book_data``; callers skip the cooldown when it is
    truthy.
    """
    book_json, dump_exists = scraper.scrape_book_data(driver, book_url, category=category, match_language=match_language)
    if book_json:
      cover_tmp_file = False
      if args.audio:
        # reuse previously scraped audio when available
        audio_files = scraped_audio_exists(book_json)
        if not audio_files:
          audio_files = scraper.scrape_book_audio(driver, book_json, args.language)
        # only an actual list of files can be concatenated
        if audio_files and args.concat_audio and isinstance(audio_files, list):
          if args.embed_cover_art:
            cover_tmp_file = scraper.download_book_cover_image(book_json, filename='_cover.jpg',  alt_file='cover.jpg')
          generator.combine_audio(book_json, audio_files, args.keep_noncat, cover_tmp_file)
      if args.save_cover:
        # the generators are handed the fixed file name the cover is saved
        # under, not the value returned by the download call
        scraper.download_book_cover_image(book_json, filename='cover.jpg',  alt_file='_cover.jpg')
        generate_book_outputs(book_json, cover_img='cover.jpg')
      else:
        generate_book_outputs(book_json)
      # the temporary cover was only needed for embedding into the audio
      if cover_tmp_file:
        if os.path.exists(cover_tmp_file):
          log.debug(f'Deleting {cover_tmp_file}')
          os.remove(cover_tmp_file)
        else:
          log.debug(f'Could not find "{cover_tmp_file}"')
      processed_books.append(book_url)
    return dump_exists

  def finish(start_time, processed_books, driver = None):
    """Close the driver (if any) and log the run summary."""
    if driver:
      driver.close()
    elapsed_time = time.time() - start_time
    formatted_time = '{:02d}:{:02d}:{:02d}'.format(int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60))
    log.info(f"Processed {len(processed_books)} books in {formatted_time}")

  # start scraping
  log.info('Starting scrape run...')
  # a list (mutated in place by scrape_book) rather than an int counter, so
  # that books processed inside the helper are reflected in the summary
  processed_books = []
  start_time = time.time()

  if args.no_scrape:
    # if the --no-scrape argument is passed, just process the existing json dump files
    for file in glob.glob(os.path.join(os.getcwd(), "dump", "*.json")):
      generate_book_outputs(file)
      processed_books.append(file)
    finish(start_time, processed_books)
  else:
    match_language = args.language if args.match_language else ""
    # if no login cookies were found, don't start a headless browser
    # so that the user can solve recaptcha and log in
    start_headless = args.headless
    if not scraper.has_login_cookies():
      start_headless = False
    # add uBlock (if the conditions are right)
    use_ublock = not (args.book or args.headless)
    driver = scraper.initialize_driver(
      headless=start_headless,
      with_ublock=use_ublock,
      chromedriver_path=args.chromedriver)

    # From here on a driver exists: make sure it is closed and the summary
    # is logged even if scraping is aborted by an exception.
    try:
      is_logged_in = scraper.login(driver, args.language, args.email, args.password)
      if is_logged_in:
        if args.book:
          # scrape single book
          scrape_book(
            driver, processed_books, args.book, category={ "label" : args.book_category}, match_language=match_language)
        elif args.books:
          # scrape list of books
          with open(args.books, 'r') as books_urls:
            for line in books_urls:
              book_url = line.strip()
              if not book_url:
                # blank line in the list file: nothing to scrape
                continue
              dump_exists = scrape_book(
                driver, processed_books, book_url, category={ "label" : args.book_category}, match_language=match_language)
              if not dump_exists:
                time.sleep(args.cooldown)
        else:
          # scrape all books
          categories = scraper.get_categories(
            driver, args.language,
            specified_categories=args.categories,
            ignored_categories=args.ignore_categories)
          for category in categories:
            books_urls = scraper.get_all_books_for_categories(driver, category)
            for book_url in books_urls:
              dump_exists = scrape_book(driver, processed_books, book_url, category=category, match_language=match_language)
              # if we processed the book from an existing dump
              # no scraping was involved, no need to cooldown
              if not dump_exists:
                time.sleep(args.cooldown)
      else:
        log.error("Unable to login into Blinkist")
    finally:
      finish(start_time, processed_books, driver)

Exemple #3

0

Afficher le fichier

Fichier : main.py Projet : nerrons/blinkist-scraper

 # so that the user can solve recaptcha and log in
 start_headless = args.headless
 if not scraper.has_login_cookies():
     start_headless = False
 driver = scraper.initialize_driver(headless=start_headless)
 is_logged_in = scraper.login(driver, args.language, args.email,
                              args.password)
 if (is_logged_in):
     if (args.book):
         scrape_book(driver,
                     processed_books,
                     args.book,
                     category={"label": args.category},
                     match_language=match_language)
     else:
         categories = scraper.get_categories(driver, args.language)
         for category in categories:
             books_urls = scraper.get_all_books_for_categories(
                 driver, category)
             for book_url in books_urls:
                 dump_exists = scrape_book(
                     driver,
                     processed_books,
                     book_url,
                     category=category,
                     match_language=match_language)
                 # if we processed the book from an existing dump
                 # no scraping was involved, no need to cooldown
                 if not dump_exists:
                     time.sleep(args.cooldown)
     finish(driver, start_time, processed_books)

Exemple #4

0

Afficher le fichier

Fichier : main.py Projet : govindbhardwaj/blinkist-scraper

       process_book_json(file, processed_books)
     finish(None, start_time, processed_books)
   else:
     match_language = args.language if args.match_language else ""
     # if no login cookies were found, don't start a headless browser
     # so that the user can solve recaptcha and log in
     start_headless = args.headless
     if not scraper.has_login_cookies():
       start_headless = False
     driver = scraper.initialize_driver(headless=start_headless)
     is_logged_in = scraper.login(driver, args.language, args.email, args.password)
     if (is_logged_in):
       if (args.book):
         scrape_book(driver, processed_books, args.book, category={ "label" : args.book_category}, match_language=match_language)
       else:
         categories = scraper.get_categories(driver, args.language, args.categories, args.ignore_categories)
         for category in categories:
           books_urls = scraper.get_all_books_for_categories(driver, category)
           for book_url in books_urls: 
             dump_exists = scrape_book(driver, processed_books, book_url, category=category, match_language=match_language)            
             # if we processed the book from an existing dump 
             # no scraping was involved, no need to cooldown
             if not dump_exists:
               time.sleep(args.cooldown)
       finish(driver, start_time, processed_books)
 except KeyboardInterrupt:
   print('[#] Interrupted by user')
   finish(driver, start_time, processed_books)
   try:
     sys.exit(0)
   except SystemExit: