Ejemplo n.º 1
0
    def __init__(self, file_path='kodexa.yml'):
        """Load the metadata file at ``file_path`` into ``self.kodexa_metadata``.

        The format is chosen from the file extension: ``.yml``/``.yaml`` files
        are parsed with ``yaml.safe_load`` and ``.json`` files with
        ``json.load``. The parsed content is wrapped in an ``addict.Dict``.

        :param file_path: path to the metadata file (default ``'kodexa.yml'``)
        :raises ValueError: if the extension is not a supported one; without
            this check ``self.kodexa_metadata`` would silently stay unset
        """
        self.file_path = file_path

        if file_path.endswith(('.yml', '.yaml')):
            # Imported lazily so the YAML dependency is only needed for YAML files.
            import yaml

            with open(file_path, 'r') as stream:
                self.kodexa_metadata = addict.Dict(yaml.safe_load(stream))
        elif file_path.endswith('.json'):
            import json

            with open(file_path, 'r') as stream:
                self.kodexa_metadata = addict.Dict(json.load(stream))
        else:
            raise ValueError(
                "Unsupported metadata file %r: expected a .yml, .yaml or .json file"
                % (file_path,))
Ejemplo n.º 2
0
    def ocr_on_cells(self, path, grid):
        """Run OCR on every cell of the grid and collect the recognised text.

        The picture is opened again from ``path`` and handed to ``self.api``;
        per the original author's note this is faster than reusing an image
        that is already held in memory.

        :param path: path to pic
        :param grid: computed grid; ``grid[x][y]`` is indexed as two corner
            points, ``cell[0]`` and ``cell[1]``, each with an x and a y value
        :return: addict-dict table, ``table[str(x)][str(y)]`` holding the cell
            text with newlines replaced by spaces and surrounding whitespace
            stripped
        """
        recognised = addict.Dict()

        with timeit_context('ocr on table cells'):
            self.api.SetImage(Image.open(path))

            for row_idx, cells in enumerate(grid):
                for col_idx, _ in enumerate(cells):
                    corners = grid[row_idx][col_idx]
                    first_corner = corners[0]

                    # Rectangle as origin plus extent, derived from the two corners.
                    left = int(first_corner[0])
                    top = int(first_corner[1])
                    width = int(corners[1][0] - first_corner[0])
                    height = int(corners[1][1] - first_corner[1])

                    # Restrict recognition to the current cell only.
                    self.api.SetRectangle(left, top, width, height)

                    raw_text = self.api.GetUTF8Text()
                    flattened = raw_text.replace('\n', ' ')
                    recognised[str(row_idx)][str(col_idx)] = flattened.strip()

        return recognised
Ejemplo n.º 3
0
def descriptor_as_adict():
    """Return a fixed sample benchmark descriptor built from ``addict.Dict`` objects.

    The descriptor has the top-level keys ``spec_version``, ``info``,
    ``hardware``, ``env``, ``ml`` and ``data``; ``data.sources`` is a list of
    two ``src``/``path`` entries.
    """
    info = addict.Dict(description="some benchmark")
    hardware = addict.Dict(instance_type="p3.8xlarge", strategy="single_node")
    env = addict.Dict(docker_image="jlcont/benchmarking:270219")
    ml = addict.Dict(benchmark_code="python /home/benchmark/image_classification.py")

    sources = [
        addict.Dict(src="foo1", path="bar1"),
        addict.Dict(src="foo2", path="bar2"),
    ]
    data = addict.Dict(sources=sources)

    descriptor = addict.Dict(
        spec_version="0.1.0",
        info=info,
        hardware=hardware,
        env=env,
        ml=ml,
        data=data,
    )
    return descriptor
Ejemplo n.º 4
0
def create_tensorflow_estimator(session: Session,
                                descriptor: BenchmarkDescriptor,
                                source_dir: str,
                                config: SageMakerExecutorConfig) -> Framework:
    """Build a ``TensorFlow`` estimator for the given benchmark descriptor.

    Starts from the arguments produced by ``_create_common_estimator_args``,
    adds an MPI distribution section when the descriptor's hardware strategy
    is ``DistributedStrategy.HOROVOD``, switches on script mode and passes
    the hyper parameters from ``get_hyper_params``.

    :param session: forwarded to ``_create_common_estimator_args``
    :param descriptor: benchmark descriptor to read hardware settings and
        hyper parameters from
    :param source_dir: forwarded to ``_create_common_estimator_args``
    :param config: forwarded to ``_create_common_estimator_args``
    :return: the constructed ``TensorFlow`` estimator
    """
    kwargs = _create_common_estimator_args(
        session, descriptor, source_dir, config)

    hardware = descriptor.hardware
    if hardware.strategy == DistributedStrategy.HOROVOD:
        mpi_settings = addict.Dict(
            enabled=True,
            processes_per_host=int(hardware.processes_per_instance),
            custom_mpi_options=MPI_OPTIONS,
        )
        kwargs.distributions.mpi = mpi_settings

    hyperparameters = get_hyper_params(descriptor)
    kwargs.script_mode = True

    logger.info(f"Creating TF Estimator with parameters {kwargs}")
    return TensorFlow(hyperparameters=hyperparameters, **kwargs)
Ejemplo n.º 5
0
def parse_name(context: ExecutionToken, name: str) -> str:
    """
    Interpolate the name of a task or lane with the evaluation data
    obtained from the current execution token.

    The name is returned unchanged when it contains no ``{``/``}`` pair, and
    also when building the evaluation data or formatting fails for any reason.
    """
    # Cheap check first: without both braces there is nothing to interpolate.
    has_placeholder = "{" in name and "}" in name
    if not has_placeholder:
        return name

    try:
        values = addict.Dict(get_eval_data(context))
        rendered = name.format(**values)
    except Exception:
        # Best effort: any failure falls back to the raw, uninterpolated name.
        return name

    return rendered
Ejemplo n.º 6
0
def _create_common_estimator_args(
        session: Session, descriptor: BenchmarkDescriptor, source_dir: str,
        config: SageMakerExecutorConfig) -> addict.Dict:
    """Collect the estimator keyword arguments taken from descriptor and config.

    :param session: stored as ``sagemaker_session``
    :param descriptor: benchmark descriptor supplying the docker image,
        framework version, instance type/count and optional custom params
    :param source_dir: stored as ``source_dir``
    :param config: executor config supplying the role, output bucket,
        security groups and subnets
    :return: ``addict.Dict`` holding the keyword arguments
    """
    metric_definitions = get_metric_definitions(descriptor)

    custom_params = descriptor.custom_params
    requested_py_version = custom_params.python_version if custom_params else ""

    common_args = {
        "source_dir": source_dir,
        "entry_point": "tmp_entry.py",
        "sagemaker_session": session,
        "image_name": descriptor.env.docker_image,
        # Fall back to Python 3 when no version was requested.
        "py_version": requested_py_version or "py3",
        # None is not a valid value, so an empty string is used instead.
        "framework_version": descriptor.ml.framework_version or "",
        "train_instance_type": descriptor.hardware.instance_type,
        "train_instance_count": descriptor.hardware.distributed.num_instances,
        "role": config.sm_role,
        "output_path": f"s3://{config.s3_output_bucket}",
        "security_group_ids": config.security_group_ids,
        "subnets": config.subnets,
        # An empty metrics collection is passed on as None.
        "metric_definitions": metric_definitions or None,
    }
    return addict.Dict(**common_args)
Ejemplo n.º 7
0
    def post_process_table(self, table, *args, **kwargs):
        """Clean an unprocessed table by dropping empty rows and empty columns.

        A column that is totally empty is not a real column but a doubly
        placed column border, so it is removed. Every cell text is passed
        through ``self.correct_this`` (presumably where the leading '|' is
        erased -- confirm against that method). The cleaned rows are then
        handed to ``self.miss_out_logo_in_corner`` together with any extra
        arguments; its return value is ignored.

        :param table: an unclean table, a mapping of row key to a mapping of
            column key to cell text; the row under key ``'0'`` defines the
            number of columns
        :param args: extra positional arguments for ``miss_out_logo_in_corner``
        :param kwargs: extra keyword arguments for ``miss_out_logo_in_corner``
        :return: clean table as a list of rows, each a list of cell texts
        """
        rows = []
        for cells in table.values():
            corrected = [self.correct_this(text) for text in cells.values()]

            # Row emptiness is judged on the raw cell texts, not the corrected ones.
            if any(cells.values()):
                rows.append(corrected)

        # Walk the columns right to left so deletions do not shift the
        # indices of columns that still have to be inspected.
        for col in reversed(range(len(table['0']))):
            if not any(row[col] for row in rows):
                logging.info('empty column %d', col)

                for row in rows:
                    del row[col]

        self.miss_out_logo_in_corner(rows, *args, **kwargs)
        return rows