class BlogMetadataHandler(MetadataHandler):
    content = 'blog'

    def __init__(self, user_input, metadata_directory):
        super().__init__(user_input, metadata_directory)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def create_mapper(self):
        return BlogMetadataJSONMapper()

    def check_duplicates(self, header, content):
        if (isinstance(content, BlogMetadata) and header.id in self.metadata.keys()):
            md = self.metadata[header.id]
            if md.blog_data.download_url == content.download_url:
                self.logger.debug(f'Duplicate document found for url {header.link} and output url {content.download_url}. Output process will be cancelled.')
                return True
            return False
        return False

    def build_content_object_from_data(self, **kwargs):
        return BlogMetadata(kwargs['download_url'], kwargs['successful'])

    def add_to_metadata(self, header, content):
        if (header.id not in self.metadata.keys()):
            self.logger.debug(f'Added metadata for post {header.id}')
            self.metadata[header.id] = BlogContentMetadata(header.id, header.title, header.link, header.author, header.date, content)
class BemihoResetProcessor(BemihoProcessor):
    def __init__(self, user_input):
        super().__init__(user_input)
        self.group = user_input.group
        self.member = user_input.member
        self.output = user_input.output
        self.output_path = self.format_path()
        self.logger = BemihoLogger(self.__class__).get_logger()

    def format_path(self):
        group = self.group.kanji
        group_romaji = self.group.romaji
        member = self.member.kanji
        member_romaji = self.member.romaji
        return join(self.output, f"{group} ({group_romaji})", f"{member} ({member_romaji})")

    def start(self):
        self.logger.debug(f'Starting reset for member {self.member.kanji} ({self.member.romaji}) from {self.group.kanji} ({self.group.romaji}) located on {self.output_path}')
        if exists(self.output_path):
            self.logger.debug('Output path located. Resetting.')
            try:
                for file_path in os.listdir(self.output_path):
                    joined_file_path = join(self.output_path, file_path)
                    if os.path.isfile(joined_file_path):
                        os.unlink(joined_file_path)
                    elif os.path.isdir(joined_file_path):
                        shutil.rmtree(joined_file_path)
                self.logger.debug(f'Reset successful for {self.output_path}')
            except Exception:
                self.logger.error('Unable to reset due to an unexpected error.', exc_info=True)
        else:
            self.logger.debug("Output path doesn't exist. Terminating")
class PhotosMetadataHandler(MetadataHandler):
    content = 'photos'

    def __init__(self, user_input, metadata_directory):
        super().__init__(user_input, metadata_directory)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def create_mapper(self):
        return PhotosMetadataJSONMapper()

    def check_duplicates(self, header, content):
        if (isinstance(content, PhotosMetadata) and header.id in self.metadata.keys()):
            md = self.metadata[header.id]
            if (md.does_photo_exist(content) and exists(content.download_url)):
                self.logger.debug(f'Duplicate photo found for photo url {content.image_url} and output url {content.download_url}. Output process will be cancelled.')
                return True
            return False
        return False

    def add_to_metadata(self, header, content):
        if (header.id in self.metadata.keys()):
            self.metadata[header.id].add_photo(content)
        else:
            self.logger.debug(f'Added metadata for post {header.id}')
            self.metadata[header.id] = PhotosContentMetadata(header.id, header.title, header.link, header.author, header.date)
            self.metadata[header.id].add_photo(content)

    def build_content_object_from_data(self, **kwargs):
        return PhotosMetadata(kwargs['image_url'], kwargs['download_url'], kwargs['successful'])
def get_page_input(page_from_args, default_value, label):
    logger = BemihoLogger(get_page_input).get_logger()
    logger.debug(f'Checking page or count from arguments {page_from_args} with default value {default_value}')
    if page_from_args is None:
        page_input = input(f"Select {label}. Default is {default_value}: ")
        if page_input is None or page_input == '':
            page_input = default_value
        # str() guards the fallback case where the default is an int, which has no isdigit().
        if not str(page_input).isdigit():
            raise PageNumberNotDigits()
    else:
        page_input = page_from_args
    return page_input
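# Usage sketch (assumption, not from the repository): resolving pages from the
# parsed CLI arguments with an interactive fallback. The call site and label
# strings are illustrative; int() normalizes the mixed str/int return value.
parsed = parse_system_args()
firstpage = int(get_page_input(parsed.firstpage, 1, 'first page'))
number_of_pages = int(get_page_input(parsed.number, 1, 'number of pages'))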
def __init__(self, user_input, metadata_directory):
    self.user_input = user_input
    self.metadata_directory = metadata_directory
    self.metadata_file = join(metadata_directory, 'metadata.json')
    self.mapper = self.create_mapper()
    self.metadata = {}
    self.logger = BemihoLogger(self.__class__).get_logger()
class BemihoUserInputBuilder:
    def __init__(self):
        self.group = None
        self.member = None
        self.output = 'output'
        self.content = 'photos'
        self.firstpage = 1
        self.number_of_pages = 1
        self.logger = BemihoLogger(BemihoUserInputBuilder).get_logger()
        self.reset_mode = False
        self.list_mode = False

    def set_group(self, group):
        self.group = group

    def set_member(self, member):
        self.member = member

    def set_output(self, output):
        self.output = output

    def set_content(self, content):
        self.content = content

    def set_firstpage(self, firstpage):
        self.firstpage = firstpage

    def set_number_of_page(self, number_of_pages):
        self.number_of_pages = number_of_pages

    def set_reset_mode(self, reset_mode):
        self.reset_mode = reset_mode

    def set_list_mode(self, list_mode):
        self.list_mode = list_mode

    def build(self):
        user_input = BemihoUserInput(self.group, self.member, self.output, self.content,
                                     self.firstpage, self.number_of_pages, self.reset_mode, self.list_mode)
        self.logger.debug(f'User input object created for scrapping that contains the following data:\n{user_input}')
        return user_input
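# Usage sketch (assumption): constructing a BemihoUserInput by hand. The
# setters mutate the builder in place and do not return self, so calls are
# written as statements rather than a fluent chain; group and member values
# come from the repository's own lookup code in a real run.
builder = BemihoUserInputBuilder()
builder.set_content('blog')
builder.set_firstpage(1)
builder.set_number_of_page(3)
user_input = builder.build()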
class JSONExtractor:
    def __init__(self, filename, mapper):
        self.filename = filename
        self.mapper = mapper
        self.logger = BemihoLogger(self.__class__, logging.INFO).get_logger()

    def extract(self):
        items = []
        self.logger.debug(f'Extracting data from {self.filename} with the mapper {get_qualified_name(self.mapper.__class__)}.')
        with open(self.filename) as jsonfile:
            data = json.load(jsonfile)
            for d in data:
                items.append(self.mapper.map_to_object(d))
        self.logger.debug(f'Data successfully extracted from {self.filename} with {len(items)} items.')
        return items
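# Minimal sketch of the mapper contract JSONExtractor relies on: one
# map_to_object(dict) call per decoded JSON entry. IdentityMapper and the
# 'groups.json' file name are hypothetical stand-ins; the real mappers
# (e.g. the metadata JSON mappers above) build domain objects instead of
# returning the raw dict.
class IdentityMapper:
    def map_to_object(self, data):
        return data

items = JSONExtractor('groups.json', IdentityMapper()).extract()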
def parse_system_args():
    logger = BemihoLogger(parse_system_args).get_logger()
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--group", help="Select group to pull")
    parser.add_argument("-m", "--member", help="Select member to pull")
    parser.add_argument("-o", "--output", help="Output folder", default="output")
    parser.add_argument("-c", "--content", help="Content to pull for member",
                        choices=get_available_content_options(), type=str.lower)
    parser.add_argument("-f", "--firstpage", help="First page", type=int)
    parser.add_argument("-n", "--number", help="Number of pages", type=int)
    parser.add_argument("--reset", help="Resets saved data from idol's blog", action='store_true')
    parser.add_argument("--list", help="Lists all groups and supported members", action='store_true')
    logger.debug('Parsing command line arguments')
    parsed = parser.parse_args()
    logger.debug(f'Parsing command line arguments finished {parsed}')
    return parsed
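# Example invocations (illustrative; the bemiho.py entry-point name is an
# assumption, but the flags match the parser above):
#   python bemiho.py -g <group> -m <member> -c photos -f 1 -n 3
#   python bemiho.py -g <group> -m <member> --reset
#   python bemiho.py --list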
class NoHTMLTextOutputProcessor(ScrapperOutputProcessor):
    content = 'no_html'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return NoHTMLTextMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Blog data number {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            self.logger.debug(f'Saving text only contents from {header.title} with content count {len(contents)}.')
            for download_content in contents:
                self.save_to_file_and_metadata(header, download_content)
        self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def save_to_file_and_metadata(self, header, download_content):
        download_url = download_content.get_text_file_path(self.member_path)
        try:
            content_data = self.metadata_handler.build_content_object_from_data(download_url=download_url, successful=False)
            if not self.metadata_handler.check_duplicates(header, content_data):
                download_content.download_to_text_file(
                    self.member_path,
                    lambda file_path: self.on_save(header, content_data, file_path),
                    lambda file_path: self.on_except(header, content_data, file_path))
            else:
                self.logger.debug(f'Duplicate found for {header.title}. Cancelling download')
        except Exception:  # narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate
            self.logger.error(f'Download of no_html from {header.link} to {download_url} is unsuccessful due to issue.', exc_info=True)
class ScrapperOutputProcessor:
    content = ''

    def __init__(self, user_input):
        self.user_input = user_input
        file_path = Path(user_input.output).resolve()
        self.output_path = file_path
        self.member_path = self.format_path()
        self.metadata_handler = self.get_metadata_handler_class(user_input, self.member_path)
        self.logger = BemihoLogger(self.__class__).get_logger()
        group = self.user_input.group
        member = self.user_input.member
        self.logger.debug(f'Created output processor for {member.kanji} ({member.romaji}) from {group.kanji} ({group.romaji}) with path {self.member_path}')

    def get_metadata_handler_class(self, user_input, member_path):
        return EmptyMetadataHandler(user_input, member_path)

    def format_path(self):
        group = self.user_input.group.kanji
        group_romaji = self.user_input.group.romaji
        member = self.user_input.member.kanji
        member_romaji = self.user_input.member.romaji
        return join(self.output_path, f"{group} ({group_romaji})", f"{member} ({member_romaji})", self.content)

    def create_output_directory(self):
        if not exists(self.member_path):
            self.logger.debug(f"Folder for member path {self.member_path} doesn't exist. Creating folder")
            path = Path(self.member_path)
            path.mkdir(parents=True)
        self.metadata_handler.load_metadata()

    def process_blog_data(self, blog_data):
        raise NotImplementedError()
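# Minimal sketch (assumption) of how a concrete processor plugs in: the
# `content` class attribute is what get_output_processor_class_for_content
# below matches on, and process_blog_data is the hook every subclass fills in.
# ExampleOutputProcessor is hypothetical; the real subclasses
# (NoHTMLTextOutputProcessor above, PhotosOutputProcessor and
# BlogEntryOutputProcessor below) follow this same shape.
class ExampleOutputProcessor(ScrapperOutputProcessor):
    content = 'example'  # hypothetical content keyword

    def process_blog_data(self, blog_datas):
        # A real processor writes files and records results through
        # self.metadata_handler; this stub only logs what it received.
        self.logger.debug(f'Received {len(blog_datas)} blog data entries.')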
class NoHTMLTextBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content):
        super().__init__(header, content)
        self.logger = BemihoLogger(__class__).get_logger()

    def download_to_text_file(self, directory, on_save, on_except):
        self.logger.debug(f'Writing no HTML content from {self.header.title} with size {len(self.content)}.')
        text_file_path = self.get_text_file_path(directory)
        try:
            self.do_save(text_file_path, on_save)
        except OSError as os_err:
            if os_err.errno == errno.EILSEQ:
                # Retry with emojis stripped from the file name when the
                # filesystem rejects the byte sequence.
                text_file_path = clean_emojis(self.get_text_file_path(directory))
                self.do_save(text_file_path, on_save)
            else:
                on_except(text_file_path)
                raise os_err
        except Exception as other_err:
            on_except(text_file_path)
            raise other_err
        self.logger.debug(f'Writing no HTML content with size {len(self.content)} successful.')

    def do_save(self, file_path, on_save):
        with open(file_path, 'w') as new_text_file:
            date_string = self.header.date.strftime("%Y-%m-%d %H:%M:%S")
            new_text_file.write(f"Title: {self.header.title}\n")
            new_text_file.write(f"Date: {date_string}\n")
            new_text_file.write(f"Link: {self.header.link}\n")
            new_text_file.write("===============\n")
            new_text_file.write(self.content)
        on_save(file_path)

    def get_text_file_path(self, directory):
        header_date_string = self.header.date_to_string()
        download_url = join(directory, '%s (%s).txt' % (header_date_string, clean_file_separators(self.header.title)))
        return download_url
def get_output_processor_class_for_content(content):
    logger = BemihoLogger(get_output_processor_class_for_content).get_logger()
    qualified_name = get_qualified_name(ScrapperOutputProcessor)
    logger.debug(f'Getting output processor ({qualified_name}) class for content {content}.')
    writer = get_class_in_module(__file__, __name__, ScrapperOutputProcessor,
                                 lambda clazz: clazz.content == content)
    if writer is None:
        raise OutputProcessorNotFound(content)
    logger.debug(f'Output processor ({get_qualified_name(writer)}) found.')
    return writer
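# Usage sketch (assumption): the lookup pairs a content keyword with the
# matching processor class, which is then instantiated with the user input.
# Assumes a BemihoUserInput bound to user_input, as built earlier.
processor_class = get_output_processor_class_for_content(user_input.content)
output_processor = processor_class(user_input)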
class LineBlogGroupService(LineBlogService):
    def __init__(self, url, page_number, author, traversal):
        self.url = url
        self.page_number = page_number
        self.author = author
        self.logger = BemihoLogger(self.__class__).get_logger()
        self.traversal = traversal

    def scrape_single_url(self, header):
        contents = []
        self.logger.debug(f'Extracting data from {header.link} from {header.author}')
        request = requests.get(header.link)
        soup = BeautifulSoup(request.text, 'lxml')
        for article in soup.find_all('article', class_='first-article'):
            article_body = article.find('div', class_='article-body')
            article_body_inner = article_body.find('div', class_='article-body-inner')
            contents = self.traversal.traverse(header, article_body_inner)
        self.logger.debug(f'Contents extracted from {header.link} with size {len(contents)}')
        return BlogData(header, contents)

    def serve_contents(self):
        contents = []
        futures = []
        headers = LineBlogApiCrawler(self.url, self.page_number, self.author).crawl_api_for_headers()
        self.logger.debug(f'Headers extracted from api url {self.url} with size {len(headers)}. Proceeding to fetch data.')
        with ThreadPoolExecutor(max_workers=5) as executor:
            for header in headers:
                futures.append(executor.submit(self.scrape_single_url, header))
            for future in as_completed(futures):
                try:
                    contents.append(future.result())
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
        return contents
class BemihoScrapProcessor(BemihoProcessor):
    def __init__(self, user_input, output_processor_class):
        self.user_input = user_input
        self.traversal = get_traversal_based_on_content_request(user_input)
        self.scrapper_class = get_scrapper_class_based_on_input(user_input)
        self.output_processor = output_processor_class(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def execute_single_scraper(self, page_number):
        content = self.user_input.content
        self.logger.debug(f'Starting fetch {content} for page {page_number}')
        scrapper = self.scrapper_class(self.user_input, page_number, self.traversal)
        blog_data = scrapper.start_web_scrape()
        self.output_processor.process_blog_data(blog_data)
        return page_number

    def start(self):
        group = self.user_input.group
        member = self.user_input.member
        firstpage = self.user_input.firstpage
        number_of_pages = self.user_input.number_of_pages
        content = self.user_input.content
        self.logger.debug(f'Starting scrap process for {member.kanji} ({member.romaji}) from {group.kanji} ({group.romaji}) with content {content} and {number_of_pages} page count from page {firstpage}')
        self.output_processor.create_output_directory()
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            page_index = self.scrapper_class.get_proper_page_index(firstpage)
            for page_number in range(page_index, page_index + number_of_pages):
                futures.append(executor.submit(self.execute_single_scraper, page_number))
            for future in as_completed(futures):
                try:
                    data = future.result()
                    self.logger.debug(f"Successfully fetched {content} data for page {data}")
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
def get_traversal_based_on_content_request(user_input):
    logger = BemihoLogger(get_traversal_based_on_content_request).get_logger()
    qualified_name = get_qualified_name(ScrapperTraversal)
    logger.debug(f'Getting traversal method ({qualified_name}) class for content {user_input.content}.')
    traversal = get_class_in_module(__file__, __name__, ScrapperTraversal,
                                    lambda clazz: clazz.content == user_input.content)
    if traversal is None:
        raise TraversalClassNotFound(user_input.content)
    logger.debug(f'Traversal method ({get_qualified_name(traversal)}) found.')
    return traversal()
class PhotosOutputProcessor(ScrapperOutputProcessor):
    content = 'photos'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return PhotosMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Starting saving photos content to {self.member_path}.')
        self.logger.debug(f'Blog data number {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = list(filter(
                lambda content: type(content) is ImageBlogDownloadContent
                or type(content) is SessionBasedImageBlogDownloadContent,
                blog_data.contents))
            self.logger.debug(f'Saving contents from {header.title} with content count {len(contents)}.')
            for (index, download_content) in enumerate(contents):
                self.download_file(header, index, download_content)
        self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def download_file(self, header, index, download_content):
        image_url = download_content.content
        download_url = download_content.format_download_url(self.member_path, header.title, index)
        metadata_content = self.metadata_handler.build_content_object_from_data(image_url=image_url, download_url=download_url, successful=True)
        try:
            if self.metadata_handler.check_duplicates(header, metadata_content):
                self.logger.debug(f'Duplicate found. Download from {image_url} to {download_url} is cancelled.')
            else:
                # Re-check with the cleaned file name before downloading.
                metadata_content.download_url = download_content.format_download_url(self.member_path, clean_file_name(header.title), index)
                if self.metadata_handler.check_duplicates(header, metadata_content):
                    self.logger.debug(f'Duplicate found. Download from {image_url} to {download_url} is cancelled.')
                else:
                    download_content.download_to_file(
                        self.member_path, index,
                        lambda file_path: self.on_save(header, metadata_content, file_path),
                        lambda file_path: self.on_except(header, metadata_content, file_path))
                self.metadata_handler.add_to_metadata(header, metadata_content)
        except Exception:
            self.logger.error(f'Download from {image_url} to {download_url} is unsuccessful due to issue.', exc_info=True)
class BlogEntryOutputProcessor(ScrapperOutputProcessor):
    content = 'blog'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return BlogMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Starting saving blog content to {self.member_path}.')
        directory = self.member_path
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for blog_data in blog_datas:
                self.logger.debug('Starting thread execution for building document.')
                futures.append(executor.submit(self.build_document, directory, blog_data))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
        self.metadata_handler.save_metadata()

    def build_document(self, directory, blog_data):
        content_data = None
        header = blog_data.header
        contents = blog_data.contents
        date_string = header.date.strftime("%Y.%m.%d")
        document_path = join(directory, f"{date_string} ({clean_file_separators(header.title)}).docx")
        try:
            content_data = self.metadata_handler.build_content_object_from_data(download_url=document_path, successful=False)
            self.save_to_document(header, contents, content_data, document_path)
        except OSError as os_error:
            if os_error.errno == errno.EILSEQ:
                # Retry with a cleaned file name when the filesystem rejects the byte sequence.
                document_path = join(directory, f"{date_string} ({clean_file_name(header.title)}).docx")
                content_data = self.metadata_handler.build_content_object_from_data(download_url=document_path, successful=False)
                self.save_to_document(header, contents, content_data, document_path)
            else:
                raise os_error
        except Exception:  # narrowed from a bare except; failures are recorded in metadata and logged
            content_data = self.metadata_handler.build_content_object_from_data(download_url=document_path, successful=False)
            self.metadata_handler.add_to_metadata(header, content_data)
            self.logger.error(f'Download from {header.link} to {document_path} is unsuccessful due to issue.', exc_info=True)

    def save_to_document(self, header, contents, content_data, document_path):
        if not self.metadata_handler.check_duplicates(header, content_data):
            document = Document()
            paragraph_format = document.styles['Normal'].paragraph_format
            paragraph_format.line_spacing = 1
            HeaderDocumentModifier(header.title, level=1).change_document(document)
            HeaderDocumentModifier(header.date.strftime("%Y-%m-%d %H:%M:%S"), level=4).change_document(document)
            HeaderDocumentModifier(header.link, level=4).change_document(document)
            for content in contents:
                content.download_to_document(document)
            document.save(document_path)
            content_data.successful = True
            self.metadata_handler.add_to_metadata(header, content_data)
class SessionBasedImageBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content, element):
        super().__init__(header, content)
        self.element = element
        self.session_img_service = SessionImageService()
        self.session_img_service.start()
        self.bit_content = None
        self.logger = BemihoLogger(__class__).get_logger()

    def download_to_file(self, directory, index, on_save, on_except):
        # content is an (image_url, image_selector) pair, as get_bit_content()
        # shows; the original bound the whole pair to image_url.
        (image_url, image_selector) = self.content
        if (image_url and not image_url == ''):
            self.logger.debug(f'Image url is not empty. Building download path from {image_url}.')
            bit_content = self.get_bit_content()
            if bit_content is not None:
                download_url = self.format_download_url(directory, self.header.title, index)
                self.save_to_file(directory, download_url, bit_content, index, on_save, on_except)
            else:
                smaller_image = self.element.find('img')
                if smaller_image is not None:
                    ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_file(directory, index, on_save, on_except)

    def format_download_url(self, directory, title, index):
        header_date_string = self.header.date_to_string()
        bit_content = self.get_bit_content()
        if bit_content is not None:
            guessed_ext = get_extension_for_bit_content(bit_content)
            self.logger.debug(f'Extension for image URL ({self.content[0]}): {guessed_ext}')
            download_url = join(directory, '%s_%s (%s)%s' % (header_date_string, index, clean_file_separators(title), guessed_ext))
            self.logger.debug(f'Download path for image URL {self.content[0]} created: {download_url}')
            return download_url
        else:
            smaller_image = self.element.find('img')
            if smaller_image is not None:
                return ImageBlogDownloadContent(self.header, smaller_image.get('src')).format_download_url(directory, title, index)

    def save_to_file(self, directory, download_url, bit_content, index, on_save, on_except):
        try:
            with open(download_url, 'wb') as download_file:
                download_file.write(bit_content)
            on_save(download_url)
        except OSError as os_err:
            if os_err.errno == errno.EILSEQ:  # was the magic number 92; EILSEQ matches the handling elsewhere in the codebase
                rollback_save_url = self.format_download_url(directory, clean_file_name(self.header.title), index)
                self.logger.error(f'Download from {self.content} to {download_url} is unsuccessful due to OS issue. Will re-download with a cleaned name ({rollback_save_url}).', exc_info=True)
                self.save_to_file(directory, rollback_save_url, bit_content, index, on_save, on_except)
            else:
                on_except(download_url)
                raise os_err
        except Exception as other_error:
            on_except(download_url)
            raise other_error

    def download_to_document(self, document):
        (image_url, image_selector) = self.content
        if (image_url and not image_url == ''):
            try:
                bit_content = self.get_bit_content()
                if bit_content is not None:
                    image = io.BytesIO(bit_content)
                    document.add_picture(image, width=Inches(4))
                else:
                    smaller_image = self.element.find('img')
                    if smaller_image is not None:
                        ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_document(document)
            except Exception:
                document.add_paragraph(image_url)
                self.logger.debug(f'Unable to fetch {image_url}. The URL was added instead.')

    def get_bit_content(self):
        # Cache the fetched bytes: format_download_url() and download_to_file()
        # both call this, and the original never stored the result.
        if self.bit_content is None:
            (image_url, image_selector) = self.content
            self.bit_content = self.session_img_service.get_image_content(image_url, image_selector)
        return self.bit_content

    def clear(self):
        self.session_img_service.stop()
class AllOutputProcessor(ScrapperOutputProcessor):
    content = 'all'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()
        other_processors = []
        self.logger.debug('Getting other output processors for all context implementation.')
        self.other_processors_blog_datas = {}
        for output_p in get_output_processor_classes_for_content_except(self.content):
            other_processors.append(output_p(user_input))
            self.other_processors_blog_datas[output_p.content] = []
        self.other_processors = other_processors
        self.logger.debug(f'Found the following other output processor classes: {other_processors}')

    def get_metadata_handler_class(self, user_input, member_path):
        pass

    def create_output_directory(self):
        for processor in self.other_processors:
            processor.create_output_directory()

    def do_blog_datas_remapping(self, blog_datas):
        self.logger.debug('Performing remapping for blog data for performing output processor for all.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            for content in contents:
                for content_key in content.keys():
                    self.other_processors_blog_datas[content_key].append(BlogData(header, content[content_key]))

    def process_blog_data(self, blog_datas):
        self.logger.debug('Starting blog data processing for all processor. One thread is created per output processor.')
        self.do_blog_datas_remapping(blog_datas)
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = []
            for processor in self.other_processors:
                self.logger.debug(f'Starting thread execution for processing {processor.content} content.')
                futures.append(executor.submit(processor.process_blog_data, self.other_processors_blog_datas[processor.content]))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
import sys
import time

from input.accept_input import get_user_input
from input.exceptions import JSONDataNotFound, PageNumberNotDigits, InvalidContentInput, NumberOfPageShouldBeAtLeastOne
from logger import BemihoLogger
from output_processor.exceptions import OutputProcessorNotFound
from scrapper.traversal.exceptions import TraversalClassNotFound
from processor import create_bemiho_processor
from utilities.text import seconds_to_minutes_format

if __name__ == '__main__':
    logger = BemihoLogger('bemiho').get_logger()
    start = time.time()
    try:
        logger.info('Starting Bemiho.')
        user_input = get_user_input()
        processor = create_bemiho_processor(user_input)
        processor.start()
    except (JSONDataNotFound, PageNumberNotDigits, NumberOfPageShouldBeAtLeastOne, InvalidContentInput):
        logger.error("There were exceptions in acquiring data", exc_info=True)
    except OutputProcessorNotFound as oe:
        logger.error(oe.message, exc_info=True)
    except TraversalClassNotFound as te:
        logger.error(te.message, exc_info=True)
    except KeyboardInterrupt:
        logger.debug("User stopped the application.")
    except Exception:
        logger.error('Uncaught exception occurred', exc_info=True)
    finally:
        # Reconstructed step (assumption): start and seconds_to_minutes_format
        # are otherwise unused, so the run is presumably timed and reported here.
        logger.info(f'Bemiho finished in {seconds_to_minutes_format(time.time() - start)}.')
def __init__(self, header, content):
    self.header = header
    self.content = content
    self.logger = BemihoLogger(__class__).get_logger()