Exemple #1
0
def segmenter(text_direction, script_detect, allowed_scripts, scale,
              maxcolseps, black_colseps, remove_hlines, pad, base_image, input,
              output) -> None:
    """
    Segment the image at ``input`` and dump the result to ``output`` as JSON.

    The image is opened with ``Image.open`` and handed to
    ``pageseg.segment`` together with ``text_direction``, ``scale``,
    ``maxcolseps`` and ``black_colseps`` (positionally) and
    ``no_hlines=remove_hlines`` / ``pad=pad`` (by keyword). When
    ``script_detect`` is truthy the result is additionally passed through
    ``pageseg.detect_scripts`` with ``valid_scripts=allowed_scripts``.

    Progress is reported through ``message``: a red cross if segmentation
    raises, a green check mark once the output has been written.

    ``base_image`` is accepted but not used in this function body.

    Raises:
        click.BadParameter: if ``input`` cannot be opened as an image.
    """
    import json

    from kraken import pageseg

    try:
        page_image = Image.open(input)
    except IOError as err:
        raise click.BadParameter(str(err))

    # Options that are forwarded to the segmenter by keyword.
    segment_options = {'no_hlines': remove_hlines, 'pad': pad}

    message('Segmenting\t', nl=False)
    try:
        segmentation = pageseg.segment(page_image, text_direction, scale,
                                       maxcolseps, black_colseps,
                                       **segment_options)
        if script_detect:
            segmentation = pageseg.detect_scripts(
                page_image, segmentation, valid_scripts=allowed_scripts)
    except Exception:
        # Finish the progress line with a failure mark, then propagate.
        message('\u2717', fg='red')
        raise

    with open_file(output, 'w') as handle:
        # The cast only narrows the static type of the handle.
        json.dump(segmentation, cast(IO[Any], handle))
    message('\u2713', fg='green')
Exemple #2
0
def segmenter(text_direction, script_detect, allowed_scripts, scale,
              maxcolseps, black_colseps, remove_hlines, pad, mask, base_image, input,
              output) -> None:
    """
    Segment the image at ``input``, optionally with a mask, and write JSON.

    ``input`` is opened with ``Image.open``; a truthy ``mask`` is opened
    the same way, while a falsy ``mask`` is forwarded to the segmenter
    unchanged. The page image is passed to ``pageseg.segment`` and, when
    ``script_detect`` is truthy, the result is run through
    ``pageseg.detect_scripts`` with ``valid_scripts=allowed_scripts``.
    The final result is serialised with ``json.dump`` into ``output``.

    Progress is reported through ``message``: a red cross if segmentation
    raises, a green check mark once the output has been written.

    ``base_image`` is accepted but not used in this function body.

    Raises:
        click.BadParameter: if ``input`` or a truthy ``mask`` cannot be
            opened as an image.
    """
    import json

    from kraken import pageseg

    def _open_or_reject(path):
        # Turn an unreadable image into a command line parameter error.
        try:
            return Image.open(path)
        except IOError as err:
            raise click.BadParameter(str(err))

    page_image = _open_or_reject(input)
    mask_image = _open_or_reject(mask) if mask else mask

    message('Segmenting\t', nl=False)
    try:
        segmentation = pageseg.segment(
            page_image,
            text_direction,
            scale,
            maxcolseps,
            black_colseps,
            no_hlines=remove_hlines,
            pad=pad,
            mask=mask_image,
        )
        if script_detect:
            segmentation = pageseg.detect_scripts(
                page_image,
                segmentation,
                valid_scripts=allowed_scripts,
            )
    except Exception:
        # Finish the progress line with a failure mark, then propagate.
        message('\u2717', fg='red')
        raise

    with open_file(output, 'w') as handle:
        # The cast only narrows the static type of the handle.
        json.dump(segmentation, cast(IO[Any], handle))
    message('\u2713', fg='green')
Exemple #3
0
def segmenter(text_direction, script_detect, scale, maxcolseps, black_colseps, base_image, input, output):
    """
    Segment the image at ``input`` and write the result to ``output`` as JSON.

    The image is opened with ``Image.open`` and passed to
    ``pageseg.segment`` together with ``text_direction``, ``scale``,
    ``maxcolseps`` and ``black_colseps``. When ``script_detect`` is truthy
    the result is additionally passed through ``pageseg.detect_scripts``.

    Progress is echoed with click: a red cross if segmentation raises an
    ``Exception``, a green check mark once the output has been written.

    ``base_image`` is accepted but not used in this function body.

    Raises:
        click.BadParameter: if ``input`` cannot be opened as an image.
    """
    try:
        im = Image.open(input)
    except IOError as e:
        raise click.BadParameter(str(e))
    click.echo('Segmenting\t', nl=False)
    try:
        res = pageseg.segment(im, text_direction, scale, maxcolseps, black_colseps)
        if script_detect:
            res = pageseg.detect_scripts(im, res)
    except Exception:
        # Only real errors are reported as a failed segmentation; a bare
        # ``except:`` would also intercept KeyboardInterrupt/SystemExit.
        click.secho(u'\u2717', fg='red')
        raise
    with open_file(output, 'w') as fp:
        json.dump(res, fp)
    click.secho(u'\u2713', fg='green')
Exemple #4
0
    def process(self):
        """
        Segment every input file with kraken and add the result as PAGE.

        For each entry of ``self.input_files`` the file is downloaded and
        parsed, its page image is resolved through the workspace and passed
        to ``segment`` (plus ``detect_scripts`` when the ``script_detect``
        parameter is truthy). Every entry of the result's ``'boxes'`` list
        becomes a text line inside a single newly added text region, and the
        serialised document is added to ``self.output_file_grp``.
        """
        log = getLogger('processor.KrakenSegment')
        # Parameters handed positionally to the segmenter, in call order.
        segment_keys = ('text_direction', 'scale', 'maxcolseps', 'black_colseps')

        for index, input_file in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", index, input_file)
            local_copy = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", local_copy)
            pcgts = page_from_file(local_copy)
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            page_image = self.workspace.resolve_image_as_pil(image_url)

            log.info('Segmenting')
            log.info('Params %s', self.parameter)
            segment_args = [self.parameter[key] for key in segment_keys]
            segmentation = segment(page_image, *segment_args)
            if self.parameter['script_detect']:
                segmentation = detect_scripts(page_image, segmentation)

            # All detected lines are collected in one placeholder region.
            region = TextRegionType()
            pcgts.get_Page().add_TextRegion(region)
            for lineno, box in enumerate(segmentation['boxes']):
                line_id = concat_padded("line", lineno)
                coords = CoordsType(points=points_from_x0y0x1y1(box))
                region.add_TextLine(TextLineType(id=line_id, Coords=coords))

            file_id = concat_padded(self.output_file_grp, index)
            local_path = "%s/%s.xml" % (self.output_file_grp, file_id)
            self.workspace.add_file(
                self.output_file_grp,
                pageId=input_file.pageId,
                ID=file_id,
                mimetype=MIMETYPE_PAGE,
                local_filename=local_path,
                content=to_xml(pcgts).encode('utf-8'),
            )