import os
import shutil
from os.path import join, exists

# BemihoProcessor and BemihoLogger come from project-local modules
# (imports omitted in this excerpt).


class BemihoResetProcessor(BemihoProcessor):
    def __init__(self, user_input):
        super().__init__(user_input)
        self.group = user_input.group
        self.member = user_input.member
        self.output = user_input.output
        self.output_path = self.format_path()
        self.logger = BemihoLogger(self.__class__).get_logger()

    def format_path(self):
        group = self.group.kanji
        group_romaji = self.group.romaji
        member = self.member.kanji
        member_romaji = self.member.romaji
        return join(self.output, f"{group} ({group_romaji})", f"{member} ({member_romaji})")

    def start(self):
        self.logger.debug(
            f'Starting reset for member {self.member.kanji} ({self.member.romaji}) '
            f'from {self.group.kanji} ({self.group.romaji}) located at {self.output_path}')
        if exists(self.output_path):
            self.logger.debug('Output path located. Resetting.')
            try:
                # Remove every file and subdirectory under the member's output path.
                for file_path in os.listdir(self.output_path):
                    joined_file_path = join(self.output_path, file_path)
                    if os.path.isfile(joined_file_path):
                        os.unlink(joined_file_path)
                    elif os.path.isdir(joined_file_path):
                        shutil.rmtree(joined_file_path)
                self.logger.debug(f'Reset successful for {self.output_path}')
            except Exception:
                self.logger.error('Unable to reset due to an unexpected error.', exc_info=True)
        else:
            self.logger.debug("Output path doesn't exist. Terminating.")
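# Usage sketch (hypothetical): the processor only reads group, member, and
# output off user_input, so a SimpleNamespace can stand in for the real
# parsed-input object here; the names below are placeholders, not project data.
#
#   from types import SimpleNamespace
#
#   group = SimpleNamespace(kanji='グループ', romaji='Group')
#   member = SimpleNamespace(kanji='メンバー', romaji='Member')
#   user_input = SimpleNamespace(group=group, member=member, output='output')
#   BemihoResetProcessor(user_input).start()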
# ScrapperOutputProcessor, NoHTMLTextMetadataHandler, and BemihoLogger come
# from project-local modules (imports omitted in this excerpt).


class NoHTMLTextOutputProcessor(ScrapperOutputProcessor):
    content = 'no_html'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return NoHTMLTextMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Blog data count: {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            self.logger.debug(
                f'Saving text-only contents from {header.title} with content count {len(contents)}.')
            for download_content in contents:
                self.save_to_file_and_metadata(header, download_content)
        self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def save_to_file_and_metadata(self, header, download_content):
        download_url = download_content.get_text_file_path(self.member_path)
        try:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=download_url, successful=False)
            if not self.metadata_handler.check_duplicates(header, content_data):
                download_content.download_to_text_file(
                    self.member_path,
                    lambda file_path: self.on_save(header, content_data, file_path),
                    lambda file_path: self.on_except(header, content_data, file_path))
            else:
                self.logger.debug(f'Duplicate found for {header.title}. Cancelling download.')
        except Exception:
            self.logger.error(
                f'Download of no_html from {header.link} to {download_url} is unsuccessful due to an issue.',
                exc_info=True)
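# Callback-contract sketch (illustrative, not the project's actual code): the
# processor above assumes download_to_text_file writes the file and then calls
# on_save(file_path) on success or on_except(file_path) on failure, roughly:
#
#   def download_to_text_file(self, directory, on_save, on_except):
#       file_path = self.get_text_file_path(directory)
#       try:
#           with open(file_path, 'w', encoding='utf-8') as text_file:
#               text_file.write(self.content)
#           on_save(file_path)
#       except Exception:
#           on_except(file_path)
#           raise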
# ScrapperOutputProcessor, PhotosMetadataHandler, BemihoLogger,
# ImageBlogDownloadContent, SessionBasedImageBlogDownloadContent, and
# clean_file_name come from project-local modules (imports omitted in this excerpt).


class PhotosOutputProcessor(ScrapperOutputProcessor):
    content = 'photos'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return PhotosMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Starting saving photos content to {self.member_path}.')
        self.logger.debug(f'Blog data count: {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = [
                content for content in blog_data.contents
                if isinstance(content, (ImageBlogDownloadContent, SessionBasedImageBlogDownloadContent))
            ]
            self.logger.debug(f'Saving contents from {header.title} with content count {len(contents)}.')
            for (index, download_content) in enumerate(contents):
                self.download_file(header, index, download_content)
        self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def download_file(self, header, index, download_content):
        image_url = download_content.content
        download_url = download_content.format_download_url(self.member_path, header.title, index)
        metadata_content = self.metadata_handler.build_content_object_from_data(
            image_url=image_url, download_url=download_url, successful=True)
        try:
            # Check for duplicates against both the raw title and the cleaned
            # file name, since either form may already be recorded in metadata.
            if self.metadata_handler.check_duplicates(header, metadata_content):
                self.logger.debug(f'Duplicate found. Download from {image_url} to {download_url} is cancelled.')
            else:
                metadata_content.download_url = download_content.format_download_url(
                    self.member_path, clean_file_name(header.title), index)
                if self.metadata_handler.check_duplicates(header, metadata_content):
                    self.logger.debug(f'Duplicate found. Download from {image_url} to {download_url} is cancelled.')
                else:
                    # on_save/on_except already add the content to the metadata,
                    # so no extra add_to_metadata call is needed afterwards.
                    download_content.download_to_file(
                        self.member_path, index,
                        lambda file_path: self.on_save(header, metadata_content, file_path),
                        lambda file_path: self.on_except(header, metadata_content, file_path))
        except Exception:
            self.logger.error(
                f'Download from {image_url} to {download_url} is unsuccessful due to an issue.',
                exc_info=True)
from concurrent.futures import ThreadPoolExecutor, as_completed

# ScrapperOutputProcessor, BemihoLogger, BlogData, and
# get_output_processor_classes_for_content_except come from project-local
# modules (imports omitted in this excerpt).


class AllOutputProcessor(ScrapperOutputProcessor):
    content = 'all'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()
        other_processors = []
        self.logger.debug('Getting the other output processors used by the all context.')
        self.other_processors_blog_datas = {}
        for output_p in get_output_processor_classes_for_content_except(self.content):
            other_processors.append(output_p(user_input))
            self.other_processors_blog_datas[output_p.content] = []
        self.other_processors = other_processors
        self.logger.debug(f'Found the following other output processor classes: {other_processors}')

    def get_metadata_handler_class(self, user_input, member_path):
        pass

    def create_output_directory(self):
        for processor in self.other_processors:
            processor.create_output_directory()

    def do_blog_datas_remapping(self, blog_datas):
        self.logger.debug('Remapping blog data into per-processor lists for the all processor.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            for content in contents:
                for content_key in content.keys():
                    self.other_processors_blog_datas[content_key].append(
                        BlogData(header, content[content_key]))

    def process_blog_data(self, blog_datas):
        self.logger.debug('Starting blog data processing for the all processor. One thread is created per output processor.')
        self.do_blog_datas_remapping(blog_datas)
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = []
            for processor in self.other_processors:
                self.logger.debug(f'Starting thread execution for processing {processor.content} content.')
                futures.append(executor.submit(
                    processor.process_blog_data,
                    self.other_processors_blog_datas[processor.content]))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
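# Data-shape note (inferred from do_blog_datas_remapping above): in the 'all'
# context each entry in blog_data.contents appears to be a dict keyed by
# content type, e.g.
#
#   BlogData(header, [{'blog': [...], 'photos': [...], 'no_html': [...]}])
#
# and the remapping fans each keyed list out into a separate BlogData per
# output processor before the worker threads start.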
from concurrent.futures import ThreadPoolExecutor, as_completed

# BemihoProcessor, BemihoLogger, get_traversal_based_on_content_request, and
# get_scrapper_class_based_on_input come from project-local modules
# (imports omitted in this excerpt).


class BemihoScrapProcessor(BemihoProcessor):
    def __init__(self, user_input, output_processor_class):
        self.user_input = user_input
        self.traversal = get_traversal_based_on_content_request(user_input)
        self.scrapper_class = get_scrapper_class_based_on_input(user_input)
        self.output_processor = output_processor_class(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def execute_single_scraper(self, page_number):
        content = self.user_input.content
        self.logger.debug(f'Starting fetch of {content} for page {page_number}')
        scrapper = self.scrapper_class(self.user_input, page_number, self.traversal)
        blog_data = scrapper.start_web_scrape()
        self.output_processor.process_blog_data(blog_data)
        return page_number

    def start(self):
        group = self.user_input.group
        member = self.user_input.member
        firstpage = self.user_input.firstpage
        number_of_pages = self.user_input.number_of_pages
        content = self.user_input.content
        self.logger.debug(
            f'Starting scrape process for {member.kanji} ({member.romaji}) '
            f'from {group.kanji} ({group.romaji}) with content {content} '
            f'and a page count of {number_of_pages} starting from page {firstpage}')
        self.output_processor.create_output_directory()
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            page_index = self.scrapper_class.get_proper_page_index(firstpage)
            for page_number in range(page_index, page_index + number_of_pages):
                futures.append(executor.submit(self.execute_single_scraper, page_number))
            for future in as_completed(futures):
                try:
                    data = future.result()
                    self.logger.debug(f"Successfully fetched {content} data for page {data}")
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# LineBlogService, LineBlogApiCrawler, BemihoLogger, and BlogData come from
# project-local modules (imports omitted in this excerpt).


class LineBlogGroupService(LineBlogService):
    def __init__(self, url, page_number, author, traversal):
        self.url = url
        self.page_number = page_number
        self.author = author
        self.logger = BemihoLogger(self.__class__).get_logger()
        self.traversal = traversal

    def scrape_single_url(self, header):
        contents = []
        self.logger.debug(f'Extracting data from {header.link} from {header.author}')
        request = requests.get(header.link)
        soup = BeautifulSoup(request.text, 'lxml')
        for article in soup.find_all('article', class_='first-article'):
            article_body = article.find('div', class_='article-body')
            article_body_inner = article_body.find('div', class_='article-body-inner')
            contents = self.traversal.traverse(header, article_body_inner)
        self.logger.debug(f'Contents extracted from {header.link} with size {len(contents)}')
        return BlogData(header, contents)

    def serve_contents(self):
        contents = []
        futures = []
        headers = LineBlogApiCrawler(self.url, self.page_number, self.author).crawl_api_for_headers()
        self.logger.debug(
            f'Headers extracted from api url {self.url} with size {len(headers)}. Proceeding to fetch data.')
        with ThreadPoolExecutor(max_workers=5) as executor:
            for header in headers:
                futures.append(executor.submit(self.scrape_single_url, header))
            for future in as_completed(futures):
                try:
                    contents.append(future.result())
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
        return contents
import errno
from os.path import join
from concurrent.futures import ThreadPoolExecutor, as_completed

from docx import Document

# ScrapperOutputProcessor, BlogMetadataHandler, BemihoLogger,
# HeaderDocumentModifier, clean_file_separators, and clean_file_name come from
# project-local modules (imports omitted in this excerpt).


class BlogEntryOutputProcessor(ScrapperOutputProcessor):
    content = 'blog'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return BlogMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Starting saving blog content to {self.member_path}.')
        directory = self.member_path
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for blog_data in blog_datas:
                self.logger.debug('Starting thread execution for building document.')
                futures.append(executor.submit(self.build_document, directory, blog_data))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
        self.metadata_handler.save_metadata()

    def build_document(self, directory, blog_data):
        content_data = None
        header = blog_data.header
        contents = blog_data.contents
        date_string = header.date.strftime("%Y.%m.%d")
        document_path = join(directory, f"{date_string} ({clean_file_separators(header.title)}).docx")
        try:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=document_path, successful=False)
            self.save_to_document(header, contents, content_data, document_path)
        except OSError as os_error:
            if os_error.errno == errno.EILSEQ:
                # Retry with a fully cleaned file name when the title produces
                # an illegal byte sequence on the file system.
                document_path = join(directory, f"{date_string} ({clean_file_name(header.title)}).docx")
                content_data = self.metadata_handler.build_content_object_from_data(
                    download_url=document_path, successful=False)
                self.save_to_document(header, contents, content_data, document_path)
            else:
                raise
        except Exception:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=document_path, successful=False)
            self.metadata_handler.add_to_metadata(header, content_data)
            self.logger.error(
                f'Download from {header.link} to {document_path} is unsuccessful due to an issue.',
                exc_info=True)

    def save_to_document(self, header, contents, content_data, document_path):
        if not self.metadata_handler.check_duplicates(header, content_data):
            document = Document()
            paragraph_format = document.styles['Normal'].paragraph_format
            paragraph_format.line_spacing = 1
            HeaderDocumentModifier(header.title, level=1).change_document(document)
            HeaderDocumentModifier(header.date.strftime("%Y-%m-%d %H:%M:%S"), level=4).change_document(document)
            HeaderDocumentModifier(header.link, level=4).change_document(document)
            for content in contents:
                content.download_to_document(document)
            document.save(document_path)
            content_data.successful = True
            self.metadata_handler.add_to_metadata(header, content_data)
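# HeaderDocumentModifier is project-local; a plausible sketch of the idea,
# assuming it simply adds a python-docx heading at the requested level (the
# real implementation may differ):
#
#   class HeaderDocumentModifier:
#       def __init__(self, text, level=1):
#           self.text = text
#           self.level = level
#
#       def change_document(self, document):
#           document.add_heading(self.text, level=self.level)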
import time

from output_processor.exceptions import OutputProcessorNotFound
from scrapper.traversal.exceptions import TraversalClassNotFound
from processor import create_bemiho_processor
from utilities.text import seconds_to_minutes_format

# BemihoLogger, get_user_input, and the input exceptions (JSONDataNotFound,
# PageNumberNotDigits, NumberOfPageShouldBeAtLeastOne, InvalidContentInput)
# come from project-local modules (imports omitted in this excerpt).

if __name__ == '__main__':
    logger = BemihoLogger('bemiho').get_logger()
    start = time.time()
    try:
        logger.info('Starting Bemiho.')
        user_input = get_user_input()
        processor = create_bemiho_processor(user_input)
        processor.start()
    except (JSONDataNotFound, PageNumberNotDigits, NumberOfPageShouldBeAtLeastOne, InvalidContentInput):
        logger.error("There were exceptions in acquiring data", exc_info=True)
    except OutputProcessorNotFound as oe:
        logger.error(oe.message, exc_info=True)
    except TraversalClassNotFound as te:
        logger.error(te.message, exc_info=True)
    except KeyboardInterrupt:
        logger.debug("User stopped the application.")
    except Exception:
        logger.error('Uncaught exception occurred', exc_info=True)
    finally:
        end = time.time()
        total_seconds = end - start
        logger.debug('Stopped Bemiho.')
        logger.info(f'Duration: {seconds_to_minutes_format(total_seconds)}')
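# seconds_to_minutes_format is project-local; a minimal sketch under the
# assumption that it renders the elapsed time as minutes and seconds:
#
#   def seconds_to_minutes_format(total_seconds):
#       minutes, seconds = divmod(int(total_seconds), 60)
#       return f'{minutes}m {seconds}s'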
import errno
import io
from os.path import join

from docx.shared import Inches

# BlogDownloadContent, ImageBlogDownloadContent, SessionImageService,
# BemihoLogger, get_extension_for_bit_content, clean_file_separators, and
# clean_file_name come from project-local modules (imports omitted in this excerpt).


class SessionBasedImageBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content, element):
        super().__init__(header, content)
        self.element = element
        self.session_img_service = SessionImageService()
        self.session_img_service.start()
        self.bit_content = None
        self.logger = BemihoLogger(self.__class__).get_logger()

    def download_to_file(self, directory, index, on_save, on_except):
        (image_url, _image_selector) = self.content
        if image_url:
            self.logger.debug(f'Image url is not empty. Building download path from {image_url}.')
            bit_content = self.get_bit_content()
            if bit_content is not None:
                download_url = self.format_download_url(directory, self.header.title, index)
                self.save_to_file(directory, download_url, bit_content, index, on_save, on_except)
            else:
                # Fall back to the smaller inline <img> when the session-based
                # fetch yields nothing.
                smaller_image = self.element.find('img')
                if smaller_image is not None:
                    ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_file(
                        directory, index, on_save, on_except)

    def format_download_url(self, directory, title, index):
        header_date_string = self.header.date_to_string()
        bit_content = self.get_bit_content()
        if bit_content is not None:
            guessed_ext = get_extension_for_bit_content(bit_content)
            self.logger.debug(f'Extension for image URL ({self.content[0]}): {guessed_ext}')
            download_url = join(directory, '%s_%s (%s)%s' % (
                header_date_string, index, clean_file_separators(title), guessed_ext))
            self.logger.debug(f'Download path for image URL {self.content[0]} created: {download_url}')
            return download_url
        else:
            smaller_image = self.element.find('img')
            if smaller_image is not None:
                return ImageBlogDownloadContent(self.header, smaller_image.get('src')).format_download_url(
                    directory, title, index)

    def save_to_file(self, directory, download_url, bit_content, index, on_save, on_except):
        try:
            with open(download_url, 'wb') as download_file:
                download_file.write(bit_content)
            on_save(download_url)
        except OSError as os_err:
            if os_err.errno == errno.EILSEQ:
                rollback_save_url = self.format_download_url(directory, clean_file_name(self.header.title), index)
                self.logger.error(
                    f'Download from {self.content} to {download_url} is unsuccessful due to an OS issue. '
                    f'Will re-download with a cleaned name ({rollback_save_url}).',
                    exc_info=True)
                self.save_to_file(directory, rollback_save_url, bit_content, index, on_save, on_except)
            else:
                on_except(download_url)
                raise
        except Exception:
            on_except(download_url)
            raise

    def download_to_document(self, document):
        (image_url, _image_selector) = self.content
        if image_url:
            try:
                bit_content = self.get_bit_content()
                if bit_content is not None:
                    image = io.BytesIO(bit_content)
                    document.add_picture(image, width=Inches(4))
                else:
                    smaller_image = self.element.find('img')
                    if smaller_image is not None:
                        ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_document(document)
            except Exception:
                document.add_paragraph(image_url)
                self.logger.debug(f'Unable to fetch {image_url}. The URL was added instead.')

    def get_bit_content(self):
        # Cache the fetched bytes so repeated calls don't hit the session twice.
        if self.bit_content is None:
            (image_url, image_selector) = self.content
            self.bit_content = self.session_img_service.get_image_content(image_url, image_selector)
        return self.bit_content

    def clear(self):
        self.session_img_service.stop()
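# Content-shape note (inferred from get_bit_content above): for session-based
# images, self.content is a 2-tuple of (image_url, image_selector); the
# selector presumably tells SessionImageService which element holds the image
# once the authenticated session page has loaded.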
import errno
import io
from os.path import join

import requests
from docx.shared import Inches

# BlogDownloadContent, BemihoLogger, get_extension_for_image,
# clean_file_separators, and clean_file_name come from project-local modules
# (imports omitted in this excerpt).


class ImageBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content):
        super().__init__(header, content)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def download_to_file(self, directory, index, on_save, on_except):
        image_url = self.content
        if image_url:
            self.logger.debug(f'Image url is not empty. Building download path from {image_url}.')
            download_url = self.format_download_url(directory, self.header.title, index)
            self.save_to_file(directory, download_url, index, on_save, on_except)

    def format_download_url(self, directory, title, index):
        image_url = self.content
        header_date_string = self.header.date_to_string()
        guessed_ext = get_extension_for_image(image_url)
        self.logger.debug(f'Extension for image URL ({image_url}): {guessed_ext}')
        save_url = join(directory, '%s_%s (%s)%s' % (
            header_date_string, index, clean_file_separators(title), guessed_ext))
        self.logger.debug(f'Download path for image URL {image_url} created: {save_url}')
        return save_url

    def save_to_file(self, directory, download_url, index, on_save, on_except):
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36')
        }
        try:
            request = requests.get(self.content, allow_redirects=True, headers=headers)
            with open(download_url, 'wb') as download_file:
                download_file.write(request.content)
            on_save(download_url)
        except OSError as os_err:
            if os_err.errno == errno.EILSEQ:
                rollback_save_url = self.format_download_url(directory, clean_file_name(self.header.title), index)
                self.logger.error(
                    f'Download from {self.content} to {download_url} is unsuccessful due to an illegal '
                    f'byte sequence in the file name. Will re-download with a cleaned name ({rollback_save_url}).')
                self.save_to_file(directory, rollback_save_url, index, on_save, on_except)
            else:
                on_except(download_url)
                raise
        except Exception:
            on_except(download_url)
            raise

    def download_to_document(self, document):
        image_content = self.content
        if image_content:
            try:
                response = requests.get(image_content, stream=True)
                image = io.BytesIO(response.content)
                document.add_picture(image, width=Inches(4))
            except Exception:
                document.add_paragraph(image_content)
                self.logger.debug(f'Unable to fetch {image_content}. The URL was added instead.')
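# get_extension_for_image is project-local; a minimal sketch of the idea,
# assuming it guesses the extension from the URL (the real helper may instead
# inspect the downloaded bytes, as get_extension_for_bit_content does for the
# session-based variant):
#
#   from mimetypes import guess_type, guess_extension
#
#   def get_extension_for_image(image_url):
#       mime_type, _ = guess_type(image_url)
#       return guess_extension(mime_type) if mime_type else '.jpg'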