예제 #1
0
    def _do_main_work(self):
        '''Announce the run settings, then apply each writer to each target.

        For every file in self._targets, a record is requested from
        self._zotero; if a failure is reported, it is passed to warn() and
        the file is skipped.  Otherwise, every writer in self._writers is
        asked to write the record's link into the file, unless the writer
        declares a file extension that differs from the file's extension.
        '''
        if self.overwrite:
            warn('Overwrite mode in effect.')
        if self.dry_run:
            warn('Running in dry run mode – will not modify files.')

        # Summarize what is about to happen.
        file_count = pluralized("file", self._targets, True)
        method_word = pluralized("method", self.methods)
        method_names = ", ".join(self.methods)
        inform(f'Will process {file_count} using {method_word}'
               f' [cyan2]{method_names}[/].')

        # Forewarn the user when the workload is large.
        total = len(self._targets)
        if total > 10000:
            inform(
                "(That's a huge number of files – this will take a long time.)"
            )
        elif total > 1000:
            inform("(That's a lot of files – this will take some time.)")

        for target in self._targets:
            record, failure = self._zotero.record_for_file(target)
            if failure:
                warn(failure)
                continue
            ext = filename_extension(target)
            for writer in self._writers:
                # A writer with no declared extension accepts any file.
                if not writer.file_extension() or ext == writer.file_extension():
                    writer.write_link(target, record.link)
                    continue
                shown = antiformat(f'[steel_blue3]{target}[/]')
                warn(
                    f"Method [cyan2]{writer.name()}[/] can't be used on {shown}"
                )
예제 #2
0
 def _resized_image(self, file):
     (max_width, max_height) = self._max_dimensions
     file_ext = filename_extension(file)
     name_tail = '.handprint' + file_ext
     new_file = file if name_tail in file else filename_basename(
         file) + name_tail
     if path.exists(new_file) and readable(new_file):
         from handprint.images import image_dimensions
         (image_width, image_height) = image_dimensions(new_file)
         if image_width < max_width and image_height < max_height:
             inform(f'Using reduced image found in {relative(new_file)}')
             return new_file
         else:
             # We found a "-reduced" file, perhaps from a previous run, but
             # for the current set of services, dimension are too large.
             if __debug__:
                 log('existing resized file larger than' +
                     f' {max_width}x{max_height}: {new_file}')
     inform(f'Dimensions too large; reducing dimensions: {relative(file)}')
     from handprint.images import reduced_image_dimensions
     (resized, error) = reduced_image_dimensions(file, new_file, max_width,
                                                 max_height)
     if error:
         alert(f'Failed to re-dimension {relative(file)}: {error}')
         return None
     return resized
예제 #3
0
 def _smaller_file(self, file):
     if not file:
         return None
     file_ext = filename_extension(file)
     name_tail = '.handprint' + file_ext
     new_file = file if name_tail in file else filename_basename(
         file) + name_tail
     if path.exists(new_file):
         from handprint.images import image_size
         if image_size(new_file) < self._max_size:
             inform(f'Reusing resized image found in {relative(new_file)}')
             return new_file
         else:
             # We found a ".handprint.ext" file, perhaps from a previous run,
             # but for the current set of services, it's larger than allowed.
             if __debug__:
                 log('existing resized file larger than' +
                     f' {self._max_size}b: {new_file}')
     inform(f'Size too large; reducing size: {relative(file)}')
     from handprint.images import reduced_image_size
     (resized, error) = reduced_image_size(file, new_file, self._max_size)
     if error:
         alert(f'Failed to resize {relative(file)}: {error}')
         return None
     return resized
예제 #4
0
    def targets_from_arguments(self):
        '''Return the list of targets (files and/or URLs) to be processed.

        If self.from_file is set, the targets are the non-empty lines of
        that file.  Otherwise, they are gathered from self.files: URLs and
        files with an extension in ACCEPTED_FORMATS are taken as-is,
        directories contribute the files found by files_in_directory(), and
        anything else is reported via warn() and skipped.

        Names containing ".handprint" (outputs of past runs) are dropped.
        When a target has a sibling with the same base name and the
        extension _OUTPUT_EXT, only the sibling is kept.
        '''
        # Validator_collection takes a long time to load.  Delay loading it
        # until needed, so that overall application startup time is faster.
        from validator_collection.checkers import is_url

        targets = []
        if self.from_file:
            if __debug__: log(f'reading {self.from_file}')
            # Use a context manager so the file handle is closed promptly
            # instead of being left for the garbage collector.
            with open(self.from_file) as input_file:
                lines = input_file.read().splitlines()
            targets = [line for line in lines if line]
        else:
            for item in self.files:
                if is_url(item):
                    targets.append(item)
                elif isfile(item) and filename_extension(
                        item) in ACCEPTED_FORMATS:
                    targets.append(item)
                elif isdir(item):
                    # It's a directory, so look for files within.
                    targets += files_in_directory(item,
                                                  extensions=ACCEPTED_FORMATS)
                else:
                    warn(f'"{item}" not a file or directory')

        # Filter files created in past runs.
        targets = [name for name in targets if '.handprint' not in name]

        # If there is both a file in the format we generate and another
        # format of that file, ignore the other formats and just use ours.
        # A set makes each membership test O(1); the list is still what gets
        # iterated, so the original order of the targets is preserved.
        known = set(targets)
        keep = []
        for item in targets:
            ext = filename_extension(item)
            base = filename_basename(item)
            if ext != _OUTPUT_EXT and (base + _OUTPUT_EXT) in known:
                # Our output format of this file is also present => skip
                # this other version.
                continue
            keep.append(item)
        return keep
예제 #5
0
파일: images.py 프로젝트: ccarvel/handprint
def converted_image(orig_file, to_format, dest_file=None):
    '''Convert the image in "orig_file" to format "to_format".

    Returns a tuple of (new_file, error).  The value of 'error' will be None
    if no error occurred; otherwise, the value will be a string summarizing the
    error that occurred and 'new_file' will be set to None.

    If "dest_file" is None, the output path is built from
    filename_basename(orig_file), a period, and the canonical name of
    "to_format".  For PDF input, only the first page is rendered.
    '''
    dest_format = canonical_format_name(to_format)
    if dest_file is None:
        # Derive the output name from the *input* file name.
        dest_file = filename_basename(orig_file) + '.' + dest_format
    # PIL is unable to read PDF files, so in that particular case, we have to
    # convert it using another tool.
    if filename_extension(orig_file) == '.pdf':
        import fitz
        doc = fitz.open(orig_file)
        try:
            if len(doc) < 1:
                if __debug__:
                    log(f'fitz says there is no image image in {relative(orig_file)}'
                        )
                return (None,
                        f'Cannot find an image inside {relative(orig_file)}')
            if len(doc) >= 2:
                if __debug__:
                    log(f'{orig_file} has > 1 images; using only 1st')
            # FIXME: if there's more than 1 image, we could extract the rest.
            # Doing so will require some architectural changes first.
            if __debug__:
                log(f'extracting 1st image from {relative(orig_file)}')
            page = doc[0]
            pix = page.getPixmap(alpha=False)
            if __debug__: log(f'writing {relative(dest_file)}')
            pix.writeImage(dest_file, dest_format)
            return (dest_file, None)
        finally:
            # Release the document whether or not the extraction succeeded.
            doc.close()
    else:
        # When converting images, PIL may issue a DecompressionBombWarning but
        # it's not a concern in our application.  Ignore it.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                with Image.open(orig_file) as im:
                    if __debug__:
                        log(f'converting {relative(orig_file)} to RGB')
                    # convert() returns a new image rather than modifying
                    # the receiver, so the result must be kept.  The copy is
                    # held in memory, which also makes it safe to write over
                    # the original file after the input has been closed.
                    converted = im.convert('RGB')
                if __debug__:
                    log(f'saving converted image to {relative(dest_file)}')
                converted.save(dest_file, dest_format)
                return (dest_file, None)
            except Exception as ex:
                return (None, str(ex))
예제 #6
0
    def _get(self, item, base_name, index):
        '''Resolve "item" to a local file, downloading it first if it's a URL.

        Returns a tuple (file, orig_fmt).  "file" is the real path of the
        local file.  "orig_fmt" is the content subtype reported by the
        server for URLs, or the file name extension minus its first
        character (presumably the period) for local files.  Returns
        (None, None) if the item cannot be used; the reason is reported via
        warn().

        For URLs, the download is named "<base_name>-<index>.<orig_fmt>" and
        the URL itself is recorded in a companion ".url" file next to it.
        '''
        # Shortcuts to make the code more readable.
        output_dir = self._output_dir

        # For URLs, we download the corresponding files and name them with
        # the base_name.
        from validator_collection.checkers import is_url
        if is_url(item):
            # First make sure the URL actually points to an image.
            if __debug__: log(f'testing if URL contains an image: {item}')
            headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
            try:
                request = urllib.request.Request(item, None, headers)
                response = urllib.request.urlopen(request)
            except Exception as ex:
                warn(f'Skipping URL due to error: {ex}')
                return (None, None)
            # Only the headers are needed from this probe (the content is
            # fetched separately by download_file), so close the connection
            # as soon as they have been read.
            with response:
                content_kind = response.headers.get_content_maintype()
                orig_fmt = response.headers.get_content_subtype()
            if content_kind != 'image':
                warn(f'Did not find an image at {item}')
                return (None, None)
            base = f'{base_name}-{index}'
            # If we weren't given an output dir, then for URLs, we have no
            # choice but to use the current dir to download the file.
            # Important: don't change self._output_dir because if other
            # inputs *are* files, then those files will need other output dirs.
            if not output_dir:
                output_dir = os.getcwd()
            file = path.realpath(path.join(output_dir, base + '.' + orig_fmt))
            if not download_file(item, file):
                warn(f'Unable to download {item}')
                return (None, None)
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
                inform(
                    f'Wrote URL to [white on grey42]{relative(url_file)}[/]')
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            orig_fmt = filename_extension(file)[1:]

        if not path.getsize(file) > 0:
            warn(f'File has zero length: {relative(file)}')
            return (None, None)

        if __debug__: log(f'{relative(file)} has original format {orig_fmt}')
        return (file, orig_fmt)
예제 #7
0
    def _do_preflight(self):
        '''Check the option values given by the user, and do other prep.

        On success, self._zotero holds the Zotero connection object and
        self._targets holds the list of files to process.  As side effects,
        self.after_date is replaced by its parsed form (with the formatted
        version stored in self.after_date_str), and self.file_ext is
        replaced by a list of lower-case extensions that each start with a
        period.

        Raises CannotProceed if there is no network, if the arguments are
        unusable, or if no files remain after filtering.
        '''

        if not network_available():
            alert_fatal('No network connection.')
            raise CannotProceed(ExitCode.no_network)

        # Sanity-check the arguments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        hint = '(Hint: use -h for help.)'

        if not self.files:
            alert_fatal(
                f'Need at least one folder path or file as argument. {hint}')
            raise CannotProceed(ExitCode.bad_arg)
        if any(item.startswith('-') for item in self.files):
            bad = next(item for item in self.files if item.startswith('-'))
            alert_fatal(f'Unrecognized option "{bad}" in arguments. {hint}')
            raise CannotProceed(ExitCode.bad_arg)
        if not self.use_keyring and not any([self.api_key, self.user_id]):
            alert_fatal(
                f"Need Zotero credentials if not using keyring. {hint}")
            raise CannotProceed(ExitCode.bad_arg)

        if self.after_date:
            try:
                # Convert user's input into a canonical format.
                self.after_date = parsed_datetime(self.after_date)
                self.after_date_str = self.after_date.strftime(DATE_FORMAT)
                if __debug__:
                    log(f'parsed after_date as {self.after_date_str}')
            except KeyboardInterrupt as ex:
                if __debug__: log(f'got exception {str(ex)}')
                raise
            except Exception as ex:
                alert_fatal(f'Unable to parse after_date: "{str(ex)}". {hint}')
                raise CannotProceed(ExitCode.bad_arg)

        if self.file_ext:
            # Accept a comma-separated list in which each extension may or
            # may not have a leading period, and normalize all of them to
            # the dotted form.  Extensions already written with a period
            # must be kept, not filtered out.
            wanted = []
            for e in self.file_ext.lower().split(','):
                e = e.strip()
                if e:
                    wanted.append(e if e.startswith('.') else '.' + e)
            self.file_ext = wanted

        # Set up Zotero connection and gather files for work ~~~~~~~~~~~~~~~~~~

        inform('Connecting to Zotero network servers ...')
        self._zotero = Zotero(self.api_key, self.user_id, self.use_keyring)

        if len(self.files) > 1 or path.isdir(self.files[0]):
            inform('Examining folders and looking for files ...')
        # 2 passes: traverse subdirectories recursively, then filter results.
        candidates = []
        for item in self.files:
            if path.isfile(item):
                candidates.append(item)
            elif path.isdir(item):
                if __debug__: log(f'adding files in subdir {antiformat(item)}')
                candidates += files_in_directory(item)
            else:
                warn(f'Not a file nor a folder of files: "{antiformat(item)}"')
        if __debug__: log('gathering list of files ...')
        self._targets = []
        for file in candidates:
            ext = filename_extension(file)
            if path.basename(file).startswith('.') or ext in _IGNORED_EXT:
                if __debug__:
                    log(f'ignoring ignorable file {antiformat(file)}')
                continue
            if self.file_ext and ext not in self.file_ext:
                warn(
                    f'Skipping file without desired extension: {antiformat(file)}'
                )
                continue
            if file_is_alias(file):
                if __debug__: log(f'ignoring macOS alias {antiformat(file)}')
                continue
            self._targets.append(file)
        if __debug__:
            log(f'gathered {pluralized("file", self._targets, True)}')

        if self.after_date:
            if __debug__: log(f'filtering files by date {self.after_date_str}')
            kept = []
            tzinfo = self.after_date.tzinfo
            # Filter the files gathered above, not the raw command-line
            # arguments: the latter may include folders and would bypass the
            # extension/alias filtering already applied.
            for file in self._targets:
                mtime = datetime.fromtimestamp(Path(file).stat().st_mtime)
                # The file time is given the time zone of after_date so that
                # the two values can be compared.
                if mtime.replace(tzinfo=tzinfo) >= self.after_date:
                    if __debug__: log(f'keeping {file}')
                    kept.append(file)
            self._targets = kept

        if not self._targets:
            alert_fatal('No files to process; quitting.')
            raise CannotProceed(ExitCode.bad_arg)
예제 #8
0
    def _save_article_pmc(self, dest_dir, article, xml, zip_articles):
        '''Download the files of "article" into "dest_dir", optionally zipped.

        The PDF and JATS XML files are downloaded, the JATS file is
        validated against self._dtd when self.do_validate is true, and the
        article's image (if any) is downloaded and re-saved as an
        uncompressed TIFF file.  If "zip_articles" is true and nothing
        failed, the files are placed in a ZIP archive and the individual
        files are deleted.

        Problems are reported via warn() and recorded by setting
        article.status to a value that starts with "failed".  The "xml"
        parameter is not used by this method.
        '''
        inform('Writing ' + article.doi)
        to_archive = []

        pdf_file = pmc_pdf_filename(article, dest_dir)
        if __debug__: log(f'downloading PDF to {pdf_file}')
        if not download_file(article.pdf, pdf_file):
            warn(f'Could not download PDF file for {article.doi}')
            article.status = 'failed-pdf-download'
        to_archive.append(pdf_file)

        jats_file = jats_filename(article, dest_dir)
        if __debug__: log(f'downloading JATS XML to {jats_file}')
        have_jats = download_file(article.jats, jats_file)
        if not have_jats:
            warn(f'Could not download JATS file for {article.doi}')
            article.status = 'failed-jats-download'
        if not self.do_validate:
            if __debug__: log(f'skipping DTD validation of {jats_file}')
        elif have_jats:
            # Validate only a file that was actually downloaded; otherwise,
            # the download failure recorded above would be overwritten by a
            # misleading validation failure.
            if not valid_xml(jats_file, self._dtd):
                warn(f'Failed to validate JATS for article {article.doi}')
                article.status = 'failed-jats-validation'
        to_archive.append(jats_file)

        if article.image:
            # We need to store the image with the name that appears in the
            # JATS file. That requires a little extra work to extract.
            # The name is computed only once we know there is an image URL,
            # so that the helpers are never called with an empty value.
            image_extension = filename_extension(article.image)
            image_file = image_filename(article, dest_dir, ext=image_extension)
            if __debug__: log(f'downloading image file to {image_file}')
            if download_file(article.image, image_file):
                with Image.open(image_file) as img:
                    converted_img = image_without_alpha(img)
                    converted_img = converted_img.convert('RGB')
                    if __debug__: log('converting image to TIFF format')
                    tiff_file = filename_basename(image_file) + '.tif'
                    # Using save() means that only the 1st frame of a
                    # multiframe image will be saved.
                    converted_img.save(tiff_file,
                                       dpi=_TIFF_DPI,
                                       compression=None,
                                       description=tiff_comments(article))
                    to_archive.append(tiff_file)
                # We keep only the uncompressed TIFF version.
                if __debug__: log(f'deleting original image file {image_file}')
                delete_existing(image_file)
            else:
                warn(f'Failed to download image for {article.doi}')
                article.status = 'failed-image-download'
        else:
            if __debug__:
                log(f'skipping empty image file URL for {article.doi}')

        # Finally, put the files into their own zip archive.
        if zip_articles:
            if not article.status.startswith('failed'):
                zip_file = pmc_zip_filename(article, dest_dir)
                inform(f'Creating ZIP archive file "{zip_file}"')
                archive_files(zip_file, to_archive)
                if __debug__: log(f'verifying ZIP file {zip_file}')
                verify_archive(zip_file, 'zip')
                for file in to_archive:
                    if __debug__: log(f'deleting file {file}')
                    delete_existing(file)
            else:
                warn(
                    f'ZIP archive for {article.doi} not created due to errors')
예제 #9
0
    def _save_article_portico(self, dest_dir, article, xmldict):
        '''Write the files of "article" into its own folder under "dest_dir".

        The folder is named after article.basename.  The content of
        "xmldict" is written there as XML and the article's PDF file is
        downloaded next to it.  If self.journal.uses_jats is true, a "jats"
        subfolder additionally receives the JATS XML file (validated
        against self._dtd when self.do_validate is true) and an
        uncompressed TIFF version of the article's image, if it has one.

        Problems are reported via warn() and recorded by setting
        article.status to a value that starts with "failed".
        '''
        article_dir = path.join(dest_dir, article.basename)
        jats_dir = path.join(article_dir, 'jats')
        # Create each folder independently and tolerate existing ones, so
        # that a pre-existing article folder does not prevent the creation
        # of a missing "jats" subfolder.
        os.makedirs(article_dir, exist_ok=True)
        if self.journal.uses_jats:
            os.makedirs(jats_dir, exist_ok=True)
        inform('Writing ' + article.doi)
        xml_file = xml_filename(article, article_dir)
        with open(xml_file, 'w', encoding='utf8') as f:
            if __debug__: log(f'writing XML to {xml_file}')
            f.write(xmltodict.unparse(xmldict, pretty=True))

        pdf_file = pdf_filename(article, article_dir)
        if __debug__: log(f'downloading PDF to {pdf_file}')
        if not download_file(article.pdf, pdf_file):
            warn(f'Could not download PDF file for {article.doi}')
            article.status = 'failed-pdf-download'

        if not self.journal.uses_jats:
            # Nothing more to do.
            return

        jats_file = jats_filename(article, jats_dir)
        if __debug__: log(f'downloading JATS XML to {jats_file}')
        have_jats = download_file(article.jats, jats_file)
        if not have_jats:
            warn(f'Could not download JATS file for {article.doi}')
            article.status = 'failed-jats-download'
        if not self.do_validate:
            if __debug__: log(f'skipping DTD validation of {jats_file}')
        elif have_jats:
            # Validate only a file that was actually downloaded; otherwise,
            # the download failure recorded above would be overwritten by a
            # misleading validation failure.
            if not valid_xml(jats_file, self._dtd):
                warn(f'Failed to validate JATS for article {article.doi}')
                article.status = 'failed-jats-validation'

        if article.image:
            # We need to store the image with the name that appears in the
            # JATS file. That requires a little extra work to extract.
            # The name is computed only once we know there is an image URL,
            # so that the helpers are never called with an empty value.
            image_extension = filename_extension(article.image)
            image_file = image_filename(article, jats_dir, ext=image_extension)
            if __debug__: log(f'downloading image file to {image_file}')
            if download_file(article.image, image_file):
                with Image.open(image_file) as img:
                    converted = image_without_alpha(img)
                    converted = converted.convert('RGB')
                    if __debug__: log('converting image to TIFF format')
                    tiff_name = filename_basename(image_file) + '.tif'
                    comments = tiff_comments(article, self.journal.name)
                    # Using save() means only the 1st frame of a multiframe
                    # image will be saved.
                    converted.save(tiff_name,
                                   compression=None,
                                   dpi=_TIFF_DPI,
                                   description=comments)
                # We keep only the uncompressed TIFF version.
                if __debug__: log(f'deleting original image file {image_file}')
                delete_existing(image_file)
            else:
                warn(f'Failed to download image for {article.doi}')
                article.status = 'failed-image-download'
        else:
            if __debug__: log(f'skipping empty image URL for {article.doi}')