Example No. 1
    def __init__(self, user_input):
        super().__init__(user_input)
        self.group = user_input.group
        self.member = user_input.member
        self.output = user_input.output
        self.output_path = self.format_path()
        self.logger = BemihoLogger(self.__class__).get_logger()
Example No. 2
    def __init__(self, user_input, output_processor_class):
        self.user_input = user_input
        self.traversal = get_traversal_based_on_content_request(user_input)
        self.scrapper_class = get_scrapper_class_based_on_input(user_input)

        self.output_processor = output_processor_class(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()
Example No. 3
class BlogMetadataHandler(MetadataHandler):
    content = 'blog'

    def __init__(self, user_input, metadata_directory):
        super().__init__(user_input, metadata_directory)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def create_mapper(self):
        return BlogMetadataJSONMapper()

    def check_duplicates(self, header, content):
        if (isinstance(content, BlogMetadata)
                and header.id in self.metadata):
            md = self.metadata[header.id]
            if md.blog_data.download_url == content.download_url:
                self.logger.debug(
                    f'Duplicate document found for url {header.link} and output url {content.download_url}. Output process will be cancelled.'
                )
                return True
        return False

    def build_content_object_from_data(self, **kwargs):
        return BlogMetadata(kwargs['download_url'], kwargs['successful'])

    def add_to_metadata(self, header, content):
        if header.id not in self.metadata:
            self.logger.debug(f'Added metadata for post {header.id}')
            self.metadata[header.id] = BlogContentMetadata(
                header.id, header.title, header.link, header.author,
                header.date, content)
Example No. 4
    def __init__(self, header, content, element):
        super().__init__(header, content)
        self.element = element
        self.session_img_service = SessionImageService()
        self.session_img_service.start()
        self.bit_content = None
        self.logger = BemihoLogger(__class__).get_logger()
Example No. 5
class BemihoResetProcessor(BemihoProcessor):
    def __init__(self, user_input):
        super().__init__(user_input)
        self.group = user_input.group
        self.member = user_input.member
        self.output = user_input.output
        self.output_path = self.format_path()
        self.logger = BemihoLogger(self.__class__).get_logger()

    def format_path(self):
        group = self.group.kanji
        group_romaji = self.group.romaji
        member = self.member.kanji
        member_romaji = self.member.romaji
        return join(self.output, f"{group} ({group_romaji})", f"{member} ({member_romaji})")

    def start(self):
        self.logger.debug(f'Starting reset for member {self.member.kanji} ({self.member.romaji}) from {self.group.kanji} ({self.group.romaji}) located on {self.output_path}')
        if exists(self.output_path):
            self.logger.debug('Output path located. Resetting.')
            try:
                for file_path in os.listdir(self.output_path):
                    joined_file_path = join(self.output_path, file_path)
                    if os.path.isfile(joined_file_path):
                        os.unlink(joined_file_path)
                    elif os.path.isdir(joined_file_path):
                        shutil.rmtree(joined_file_path)
                self.logger.debug(f'Reset successful for {self.output_path}')
            except Exception:
                self.logger.error('Unable to reset due to an unexpected error.', exc_info=True)
        else:
            self.logger.debug('Output path doesn\'t exist. Terminating.')
Example No. 6
class PhotosMetadataHandler(MetadataHandler):
    content = 'photos'

    def create_mapper(self):
        return PhotosMetadataJSONMapper()

    def __init__(self, user_input, metadata_directory):
        super().__init__(user_input, metadata_directory)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def check_duplicates(self, header, content):
        if (isinstance(content, PhotosMetadata)
                and header.id in self.metadata):
            md = self.metadata[header.id]
            if md.does_photo_exist(content) and exists(content.download_url):
                self.logger.debug(f'Duplicate photo found for photo url {content.image_url} and output url {content.download_url}. Output process will be cancelled.')
                return True
        return False

    def add_to_metadata(self, header, content):
        if header.id in self.metadata:
            self.metadata[header.id].add_photo(content)
        else:
            self.logger.debug(f'Added metadata for post {header.id}')
            self.metadata[header.id] = PhotosContentMetadata(header.id, header.title, header.link, header.author, header.date)
            self.metadata[header.id].add_photo(content)
    
    def build_content_object_from_data(self, **kwargs):
        return PhotosMetadata(kwargs['image_url'], kwargs['download_url'], kwargs['successful'])
Example No. 7
    def __init__(self):
        self.group = None
        self.member = None
        self.output = 'output'
        self.content = 'photos'
        self.firstpage = 1
        self.number_of_pages = 1
        self.logger = BemihoLogger(BemihoUserInputBuilder).get_logger()
        self.reset_mode = False
        self.list_mode = False
Example No. 8
    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()
        other_processors = []
        self.logger.debug('Getting other output processors for all context implementation.')
        self.other_processors_blog_datas = {}
        for output_p in get_output_processor_classes_for_content_except(self.content):
            other_processors.append(output_p(user_input))
            self.other_processors_blog_datas[output_p.content] = []
        self.other_processors = other_processors
        self.logger.debug(f'Found the following other output processor classes: {other_processors}')
Example No. 9
    def __init__(self, user_input):
        self.user_input = user_input
        file_path = Path(user_input.output).resolve()
        self.output_path = file_path
        self.member_path = self.format_path()
        self.metadata_handler = self.get_metadata_handler_class(
            user_input, self.member_path)
        self.logger = BemihoLogger(self.__class__).get_logger()
        group = self.user_input.group
        member = self.user_input.member
        self.logger.debug(
            f'Created output processor for {member.kanji} ({member.romaji}) from {group.kanji} ({group.romaji}) with path {self.member_path}'
        )
Example No. 10
def get_page_input(page_from_args, default_value, label):
    logger = BemihoLogger(get_page_input).get_logger()
    logger.debug(
        f'Checking page or count from arguments {page_from_args} with default value {default_value}'
    )
    if page_from_args is None:
        page_input = input(f"Select {label}. Default is {default_value}: ")
        if not page_input:
            # Use the default as-is; only user-typed text needs digit validation.
            page_input = default_value
        elif not page_input.isdigit():
            raise PageNumberNotDigits()
    else:
        page_input = page_from_args
    return page_input
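A minimal usage sketch: with None as the first argument the helper prompts interactively, while a non-None command-line value is passed straight through (the labels here are illustrative):

firstpage = get_page_input(None, 1, 'first page')          # prompts the user
number_of_pages = get_page_input(5, 1, 'number of pages')  # passes 5 through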
Example No. 11
    def __init__(self, user_input, metadata_directory):
        self.user_input = user_input
        self.metadata_directory = metadata_directory
        self.metadata_file = join(metadata_directory, 'metadata.json')
        self.mapper = self.create_mapper()
        self.metadata = {}
        self.logger = BemihoLogger(self.__class__).get_logger()
Example No. 12
class BemihoUserInputBuilder:
    def __init__(self):
        self.group = None
        self.member = None
        self.output = 'output'
        self.content = 'photos'
        self.firstpage = 1
        self.number_of_pages = 1
        self.logger = BemihoLogger(BemihoUserInputBuilder).get_logger()
        self.reset_mode = False
        self.list_mode = False

    def set_group(self, group):
        self.group = group

    def set_member(self, member):
        self.member = member

    def set_output(self, output):
        self.output = output

    def set_content(self, content):
        self.content = content

    def set_firstpage(self, firstpage):
        self.firstpage = firstpage

    def set_number_of_page(self, number_of_pages):
        self.number_of_pages = number_of_pages

    def set_reset_mode(self, reset_mode):
        self.reset_mode = reset_mode

    def set_list_mode(self, list_mode):
        self.list_mode = list_mode

    def build(self):
        user_input = BemihoUserInput(self.group, self.member, self.output,
                                     self.content, self.firstpage,
                                     self.number_of_pages, self.reset_mode,
                                     self.list_mode)
        self.logger.debug(
            f'User input object created for scrapping that contains the following data:\n{user_input}'
        )
        return user_input
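A minimal usage sketch, assuming group and member objects loaded from the project's JSON data; the setters mutate the builder and return None, so the calls are not chained:

builder = BemihoUserInputBuilder()
builder.set_group(group)
builder.set_member(member)
builder.set_content('blog')
builder.set_firstpage(1)
builder.set_number_of_page(3)
user_input = builder.build()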
Example No. 13
class JSONExtractor:
    def __init__(self, filename, mapper):
        self.filename = filename
        self.mapper = mapper
        self.logger = BemihoLogger(self.__class__, logging.INFO).get_logger()

    def extract(self):
        items = []
        self.logger.debug(
            f'Extracting data from {self.filename} with the mapper {get_qualified_name(self.mapper.__class__)}.'
        )
        with open(self.filename) as jsonfile:
            data = json.load(jsonfile)
            for d in data:
                items.append(self.mapper.map_to_object(d))
        self.logger.debug(
            f'Data successfully extracted from {self.filename} with {len(items)} items.'
        )
        return items
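A hedged usage sketch, assuming a metadata.json file on disk and the BlogMetadataJSONMapper returned by create_mapper in Example No. 3:

extractor = JSONExtractor('metadata.json', BlogMetadataJSONMapper())
items = extractor.extract()  # list of objects produced by mapper.map_to_object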
Example No. 14
def parse_system_args():
    logger = BemihoLogger(parse_system_args).get_logger()
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--group", help="Select group to pull")
    parser.add_argument("-m", "--member", help="Select member to pull")
    parser.add_argument("-o",
                        "--output",
                        help="Output folder",
                        default="output")
    parser.add_argument("-c",
                        "--content",
                        help="Content to pull for member",
                        choices=get_available_content_options(),
                        type=str.lower)
    parser.add_argument("-f", "--firstpage", help="First page", type=int)
    parser.add_argument("-n", "--number", help="Number of pages", type=int)
    parser.add_argument("--reset",
                        help="Resets saved data from idol's blog",
                        action='store_true')
    parser.add_argument("--list",
                        help="Lists all groups and supported members",
                        action='store_true')
    logger.debug('Parsing command line arguments')
    parsed = parser.parse_args()
    logger.debug(f'Parsing command line arguments finished {parsed}')
    return parsed
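A sketch of feeding the parsed namespace into the page helper from Example No. 10; the attribute names follow the add_argument flags above:

args = parse_system_args()
firstpage = get_page_input(args.firstpage, 1, 'first page')
number_of_pages = get_page_input(args.number, 1, 'number of pages')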
Example No. 15
class NoHTMLTextOutputProcessor(ScrapperOutputProcessor):
    content = 'no_html'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return NoHTMLTextMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Blog data number {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            self.logger.debug(
                f'Saving text only contents from {header.title} with content count {len(contents)}.'
            )
            for download_content in contents:
                self.save_to_file_and_metadata(header, download_content)
            self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def save_to_file_and_metadata(self, header, download_content):
        download_url = download_content.get_text_file_path(self.member_path)
        try:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=download_url, successful=False)
            if not self.metadata_handler.check_duplicates(header, content_data):
                download_content.download_to_text_file(
                    self.member_path,
                    lambda file_path: self.on_save(header, content_data, file_path),
                    lambda file_path: self.on_except(header, content_data, file_path))
            else:
                self.logger.debug(
                    f'Duplicate found for {header.title}. Cancelling download')
        except Exception:
            self.logger.error(
                f'Download of no_html from {header.link} to {download_url} is unsuccessful due to issue.',
                exc_info=True)
Example No. 16
class ScrapperOutputProcessor:
    content = ''

    def __init__(self, user_input):
        self.user_input = user_input
        file_path = Path(user_input.output).resolve()
        self.output_path = file_path
        self.member_path = self.format_path()
        self.metadata_handler = self.get_metadata_handler_class(
            user_input, self.member_path)
        self.logger = BemihoLogger(self.__class__).get_logger()
        group = self.user_input.group
        member = self.user_input.member
        self.logger.debug(
            f'Created output processor for {member.kanji} ({member.romaji}) from {group.kanji} ({group.romaji}) with path {self.member_path}'
        )

    def get_metadata_handler_class(self, user_input, member_path):
        return EmptyMetadataHandler(user_input, member_path)

    def format_path(self):
        group = self.user_input.group.kanji
        group_romaji = self.user_input.group.romaji
        member = self.user_input.member.kanji
        member_romaji = self.user_input.member.romaji
        return join(self.output_path, f"{group} ({group_romaji})",
                    f"{member} ({member_romaji})", self.content)

    def create_output_directory(self):
        if (not exists(self.member_path)):
            self.logger.debug(
                f'Folder for member path {self.member_path} doesn\'t exist. Creating folder'
            )
            path = Path(self.member_path)
            path.mkdir(parents=True)
        self.metadata_handler.load_metadata()

    def process_blog_data(self, blog_data):
        raise NotImplementedError()
Example No. 17
class NoHTMLTextBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content):
        super().__init__(header, content)
        self.logger = BemihoLogger(__class__).get_logger()

    def download_to_text_file(self, directory, on_save, on_except):
        self.logger.debug(f'Writing no HTML content from {self.header.title} with size {len(self.content)}.')
        text_file_path = self.get_text_file_path(directory)
        try:
            self.do_save(text_file_path, on_save)
        except OSError as os_err:
            if os_err.errno == errno.EILSEQ:
                text_file_path = clean_emojis(self.get_text_file_path(directory))
                self.do_save(text_file_path, on_save)
            else:
                on_except(text_file_path)
                raise os_err
        except Exception as other_err:
            on_except(text_file_path)
            raise other_err
        self.logger.debug(f'Writing no HTML content with size {len(self.content)} successful.')

    def do_save(self, file_path, on_save):
        with open(file_path, 'w') as new_text_file:
            date_string = self.header.date.strftime("%Y-%m-%d %H:%M:%S")
            new_text_file.write(f"Title: {self.header.title}\n")
            new_text_file.write(f"Date: {date_string}\n")
            new_text_file.write(f"Link: {self.header.link}\n")
            new_text_file.write("===============\n")
            new_text_file.write(self.content)
            on_save(file_path)

    def get_text_file_path(self, directory):
        header_date_string = self.header.date_to_string()
        download_url = join(directory, '%s (%s).txt' % (header_date_string, clean_file_separators(self.header.title)))
        return download_url
Example No. 18
def get_output_processor_class_for_content(content):
    logger = BemihoLogger(get_output_processor_class_for_content).get_logger()
    qualified_name = get_qualified_name(ScrapperOutputProcessor)
    logger.debug(
        f'Getting output processor ({qualified_name}) class for content {content}.'
    )
    writer = get_class_in_module(__file__, __name__, ScrapperOutputProcessor,
                                 lambda clazz: clazz.content == content)
    if writer is None:
        raise OutputProcessorNotFound(content)
    logger.debug(f'Output processor ({get_qualified_name(writer)}) found.')
    return writer
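A hedged usage sketch; 'photos' matches the content attribute on PhotosOutputProcessor (Example No. 22), and user_input stands for an object built by BemihoUserInputBuilder:

processor_class = get_output_processor_class_for_content('photos')
processor = processor_class(user_input)
processor.create_output_directory()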
Example No. 19
class LineBlogGroupService(LineBlogService):
    def __init__(self, url, page_number, author, traversal):
        self.url = url
        self.page_number = page_number
        self.author = author
        self.logger = BemihoLogger(self.__class__).get_logger()
        self.traversal = traversal

    def scrape_single_url(self, header):
        contents = []
        self.logger.debug(
            f'Extracting data from {header.link} from {header.author}')
        response = requests.get(header.link)
        soup = BeautifulSoup(response.text, 'lxml')
        for article in soup.find_all('article', class_='first-article'):
            article_body = article.find('div', class_='article-body')
            article_body_inner = article_body.find('div',
                                                   class_='article-body-inner')
            contents = self.traversal.traverse(header, article_body_inner)
            self.logger.debug(
                f'Contents extracted from {header.link} with size {len(contents)}'
            )
        return BlogData(header, contents)

    def serve_contents(self):
        contents = []
        futures = []
        headers = LineBlogApiCrawler(self.url, self.page_number,
                                     self.author).crawl_api_for_headers()
        self.logger.debug(
            f'Headers extracted from api url {self.url} with size {len(headers)}. Proceeding to fetch data.'
        )
        with ThreadPoolExecutor(max_workers=5) as executor:
            for header in headers:
                futures.append(executor.submit(self.scrape_single_url, header))
            for future in as_completed(futures):
                try:
                    contents.append(future.result())
                except Exception:
                    self.logger.error("Exception occurred on thread",
                                      exc_info=True)
        return contents
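A hedged construction sketch; the API url and author id are hypothetical placeholders, and the traversal comes from get_traversal_based_on_content_request (Example No. 21):

traversal = get_traversal_based_on_content_request(user_input)
service = LineBlogGroupService('https://example.invalid/lineblog-api', 1, 'author-id', traversal)
blog_datas = service.serve_contents()  # list of BlogData, one per header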
Example No. 20
class BemihoScrapProcessor(BemihoProcessor):
    def __init__(self, user_input, output_processor_class):
        self.user_input = user_input
        self.traversal = get_traversal_based_on_content_request(user_input)
        self.scrapper_class = get_scrapper_class_based_on_input(user_input)

        self.output_processor = output_processor_class(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def execute_single_scraper(self, page_number):
        content = self.user_input.content
        self.logger.debug(f'Starting fetch {content} for page {page_number}')
        scrapper = self.scrapper_class(self.user_input, page_number,
                                       self.traversal)
        blog_data = scrapper.start_web_scrape()
        self.output_processor.process_blog_data(blog_data)
        return page_number

    def start(self):
        group = self.user_input.group
        member = self.user_input.member
        firstpage = self.user_input.firstpage
        number_of_pages = self.user_input.number_of_pages
        content = self.user_input.content
        self.logger.debug(
            f'Starting scrap process for {member.kanji} ({member.romaji}) from {group.kanji} ({group.romaji}) with content {content} and {number_of_pages} page count from page {firstpage}'
        )
        self.output_processor.create_output_directory()
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            page_index = self.scrapper_class.get_proper_page_index(firstpage)
            for page_number in range(page_index, page_index + number_of_pages):
                futures.append(
                    executor.submit(self.execute_single_scraper, page_number))
            for future in as_completed(futures):
                try:
                    data = future.result()
                    self.logger.debug(
                        f"Successfully fetched {content} data for page {data}")
                except Exception:
                    self.logger.error("Exception occurred on thread",
                                      exc_info=True)
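A wiring sketch, under the assumption that the lookup helpers from Examples No. 18 and No. 21 are importable alongside this class:

output_class = get_output_processor_class_for_content(user_input.content)
processor = BemihoScrapProcessor(user_input, output_class)
processor.start()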
Example No. 21
def get_traversal_based_on_content_request(user_input):
    logger = BemihoLogger(get_traversal_based_on_content_request).get_logger()
    qualified_name = get_qualified_name(ScrapperTraversal)
    logger.debug(
        f'Getting traversal method ({qualified_name}) class for content {user_input.content}.'
    )
    traversal = get_class_in_module(
        __file__, __name__, ScrapperTraversal,
        lambda clazz: clazz.content == user_input.content)
    if traversal is None:
        raise TraversalClassNotFound(user_input.content)
    logger.debug(f'Traversal method ({get_qualified_name(traversal)}) found.')
    return traversal()
Example No. 22
class PhotosOutputProcessor(ScrapperOutputProcessor):
    content = 'photos'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return PhotosMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Starting saving photos content to {self.member_path}.')
        self.logger.debug(f'Blog data number {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = [
                content for content in blog_data.contents
                if type(content) in (ImageBlogDownloadContent,
                                     SessionBasedImageBlogDownloadContent)
            ]
            self.logger.debug(f'Saving contents from {header.title} with content count {len(contents)}.')
            for (index, download_content) in enumerate(contents):
                self.download_file(header, index, download_content)
        self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def download_file(self, header, index, download_content):
        image_url = download_content.content
        download_url = download_content.format_download_url(self.member_path, header.title, index)
        metadata_content = self.metadata_handler.build_content_object_from_data(image_url=image_url, download_url=download_url, successful=True)
        try:
            if self.metadata_handler.check_duplicates(header, metadata_content):
                self.logger.debug(f'Duplicate found. Download from {image_url} to {download_url} is cancelled.')
            else:
                metadata_content.download_url = download_content.format_download_url(self.member_path, clean_file_name(header.title), index)
                if self.metadata_handler.check_duplicates(header, metadata_content):
                    self.logger.debug(f'Duplicate found. Download from {image_url} to {download_url} is cancelled.')
                else:
                    # on_save/on_except record the result in the metadata handler.
                    download_content.download_to_file(self.member_path, index,
                        lambda file_path: self.on_save(header, metadata_content, file_path),
                        lambda file_path: self.on_except(header, metadata_content, file_path))
        except Exception:
            self.logger.error(f'Download from {image_url} to {download_url} is unsuccessful due to issue.', exc_info=True)
Example No. 23
    def __init__(self, user_input, metadata_directory):
        super().__init__(user_input, metadata_directory)
        self.logger = BemihoLogger(self.__class__).get_logger()
Example No. 24
    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()
Example No. 25
class BlogEntryOutputProcessor(ScrapperOutputProcessor):
    content = 'blog'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return BlogMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(
            f'Starting saving blog content to {self.member_path}.')
        directory = self.member_path
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for blog_data in blog_datas:
                self.logger.debug(
                    'Starting thread execution for building document.')
                futures.append(
                    executor.submit(self.build_document, directory, blog_data))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread",
                                      exc_info=True)
        self.metadata_handler.save_metadata()

    def build_document(self, directory, blog_data):
        content_data = None
        header = blog_data.header
        contents = blog_data.contents
        date_string = header.date.strftime("%Y.%m.%d")
        document_path = join(
            directory,
            f"{date_string} ({clean_file_separators(header.title)}).docx")

        try:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=document_path, successful=False)
            self.save_to_document(header, contents, content_data,
                                  document_path)
        except OSError as os_error:
            if os_error.errno == errno.EILSEQ:
                document_path = join(
                    directory,
                    f"{date_string} ({clean_file_name(header.title)}).docx")
                content_data = self.metadata_handler.build_content_object_from_data(
                    download_url=document_path, successful=False)
                self.save_to_document(header, contents, content_data,
                                      document_path)
            else:
                raise os_error
        except Exception:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=document_path, successful=False)
            self.metadata_handler.add_to_metadata(header, content_data)
            self.logger.error(
                f'Download from {header.link} to {document_path} is unsuccessful due to issue.',
                exc_info=True)

    def save_to_document(self, header, contents, content_data, document_path):
        if not self.metadata_handler.check_duplicates(header, content_data):
            document = Document()
            paragraph_format = document.styles['Normal'].paragraph_format
            paragraph_format.line_spacing = 1

            HeaderDocumentModifier(header.title,
                                   level=1).change_document(document)
            HeaderDocumentModifier(header.date.strftime("%Y-%m-%d %H:%M:%S"),
                                   level=4).change_document(document)
            HeaderDocumentModifier(header.link,
                                   level=4).change_document(document)

            for content in contents:
                content.download_to_document(document)
            document.save(document_path)
            content_data.successful = True
            self.metadata_handler.add_to_metadata(header, content_data)
Example No. 26
class SessionBasedImageBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content, element):
        super().__init__(header, content)
        self.element = element
        self.session_img_service = SessionImageService()
        self.session_img_service.start()
        self.bit_content = None
        self.logger = BemihoLogger(__class__).get_logger()

    def download_to_file(self, directory, index, on_save, on_except):
        # self.content holds an (image_url, image_selector) tuple.
        image_url, _ = self.content
        if image_url:
            self.logger.debug(f'Image url is not empty. Building download path from {image_url}.')
            bit_content = self.get_bit_content()
            if bit_content is not None:
                download_url = self.format_download_url(directory, self.header.title, index)
                self.save_to_file(directory, download_url, bit_content, index, on_save, on_except)
            else:
                # Fall back to the smaller <img> nested in the element.
                smaller_image = self.element.find('img')
                if smaller_image is not None:
                    ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_file(directory, index, on_save, on_except)
    
    def format_download_url(self, directory, title, index):
        header_date_string = self.header.date_to_string()
        bit_content = self.get_bit_content()
        if bit_content is not None:
            guessed_ext = get_extension_for_bit_content(bit_content)
            self.logger.debug(f'Extension for image URL ({self.content[0]}): {guessed_ext}')
            download_url = join(directory, '%s_%s (%s)%s' % (header_date_string, index, clean_file_separators(title), guessed_ext))
            self.logger.debug(f'Download path for image URL {self.content[0]} created: {download_url}')
            return download_url
        else:
            smaller_image = self.element.find('img')
            if (smaller_image is not None):
                return ImageBlogDownloadContent(self.header, smaller_image.get('src')).format_download_url(directory, title, index)

    def save_to_file(self, directory, download_url, bit_content, index, on_save, on_except):
        try:
            with open(download_url, 'wb') as download_file:
                download_file.write(bit_content)
            on_save(download_url)
        except OSError as os_err:
            # EILSEQ: the target file name contains an illegal byte sequence.
            if os_err.errno == errno.EILSEQ:
                rollback_save_url = self.format_download_url(directory, clean_file_name(self.header.title), index)
                self.logger.error(f'Download from {self.content} to {download_url} is unsuccessful due to OS issue. Will re-download with a cleaned name ({rollback_save_url}).', exc_info=True)
                self.save_to_file(directory, rollback_save_url, bit_content, index, on_save, on_except)
            else:
                on_except(download_url)
                raise os_err
        except Exception as other_error:
            on_except(download_url)
            raise other_error
    
    def download_to_document(self, document):
        image_url, _ = self.content  # (image_url, image_selector) tuple
        if image_url:
            try:
                bit_content = self.get_bit_content()
                if bit_content is not None:
                    image = io.BytesIO(bit_content)
                    document.add_picture(image, width=Inches(4))
                else:
                    smaller_image = self.element.find('img')
                    if (smaller_image is not None):
                        ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_document(document)
            except Exception:
                document.add_paragraph(image_url)
                self.logger.debug(f'Unable to fetch {image_url}. The URL was added instead.')

    def get_bit_content(self):
        if self.bit_content is None:
            (image_url, image_selector) = self.content
            # Cache the fetched bytes so later callers reuse them.
            self.bit_content = self.session_img_service.get_image_content(image_url, image_selector)
        return self.bit_content

    def clear(self):
        self.session_img_service.stop()
Example No. 27
class AllOutputProcessor(ScrapperOutputProcessor):
    content = 'all'
    
    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()
        other_processors = []
        self.logger.debug('Getting other output processors for all context implementation.')
        self.other_processors_blog_datas = {}
        for output_p in get_output_processor_classes_for_content_except(self.content):
            other_processors.append(output_p(user_input))
            self.other_processors_blog_datas[output_p.content] = []
        self.other_processors = other_processors
        self.logger.debug(f'Found the following other output processor classes: {other_processors}')

    def get_metadata_handler_class(self, user_input, member_path):
        pass

    def create_output_directory(self):
        for processor in self.other_processors:
            processor.create_output_directory()

    def do_blog_datas_remapping(self, blog_datas):
        self.logger.debug('Remapping blog data before running each content-specific output processor.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            for content in contents:
                for content_key in content.keys():
                    self.other_processors_blog_datas[content_key].append(BlogData(header, content[content_key]))

    def process_blog_data(self, blog_datas):
        self.logger.debug('Starting blog data processing for all processor. One thread is created per output processor.')
        self.do_blog_datas_remapping(blog_datas)
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = []
            for processor in self.other_processors:
                self.logger.debug(f'Starting thread execution for processing {processor.content} content.')
                futures.append(executor.submit(processor.process_blog_data, self.other_processors_blog_datas[processor.content]))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
Example No. 28
import sys
import time

from input.accept_input import get_user_input
from input.exceptions import JSONDataNotFound, PageNumberNotDigits, InvalidContentInput, NumberOfPageShouldBeAtLeastOne
from logger import BemihoLogger 

from output_processor.exceptions import OutputProcessorNotFound
from scrapper.traversal.exceptions import TraversalClassNotFound

from processor import create_bemiho_processor
from utilities.text import seconds_to_minutes_format

if __name__ == '__main__':
    logger = BemihoLogger('bemiho').get_logger()
    start = time.time()
    try:
        logger.info('Starting Bemiho.')
        user_input = get_user_input()
        processor = create_bemiho_processor(user_input)
        processor.start()
    except (JSONDataNotFound, PageNumberNotDigits, NumberOfPageShouldBeAtLeastOne, InvalidContentInput):
        logger.error("There were exceptions in acquiring data", exc_info=True)
    except OutputProcessorNotFound as oe:
        logger.error(oe.message, exc_info=True)
    except TraversalClassNotFound as te:
        logger.error(te.message, exc_info=True)
    except KeyboardInterrupt:
        logger.debug("User stopped the application.")
    except Exception:
        logger.error('Uncaught exception occurred', exc_info=True)
Example No. 29
    def __init__(self, header, content):
        super().__init__(header, content)
        self.logger = BemihoLogger(__class__).get_logger()
Example No. 30
    def __init__(self, header, content):
        self.header = header
        self.content = content
        self.logger = BemihoLogger(__class__).get_logger()