def __init__(self, file_path='kodexa.yml'):
    """Load Kodexa metadata from a YAML or JSON file.

    :param file_path: path to the metadata file; must end in
        '.yml'/'.yaml' or '.json'
    :raises ValueError: if the file extension is not one of the
        supported formats
    """
    self.file_path = file_path
    if file_path.endswith(('.yml', '.yaml')):
        import yaml
        with open(file_path, 'r') as stream:
            self.kodexa_metadata = addict.Dict(yaml.safe_load(stream))
    elif file_path.endswith('.json'):
        import json
        with open(file_path, 'r') as stream:
            self.kodexa_metadata = addict.Dict(json.load(stream))
    else:
        # Fail fast: previously an unrecognized extension silently left
        # self.kodexa_metadata unset, causing a confusing AttributeError
        # later on.
        raise ValueError(f"Unsupported metadata file extension: {file_path}")
def ocr_on_cells(self, path, grid):
    """Run OCR over every cell of a computed table grid.

    Reopens the picture, because that works faster than taking it from
    pdftab by the tesseract api, and reads every single cell.

    :param path: path to pic
    :param grid: computed grid
    :return: addict-dict table
    """
    table = addict.Dict()
    with timeit_context('ocr on table cells'):
        # Reloading from disk is faster than passing the in-memory array.
        self.api.SetImage(Image.open(path))
        for row_idx, row in enumerate(grid):
            for col_idx, cell in enumerate(row):
                # cell[0] is the top-left corner, cell[1] the bottom-right.
                left = int(cell[0][0])
                top = int(cell[0][1])
                width = int(cell[1][0] - cell[0][0])
                height = int(cell[1][1] - cell[0][1])
                self.api.SetRectangle(left, top, width, height)
                text = self.api.GetUTF8Text()
                table[str(row_idx)][str(col_idx)] = text.replace('\n', ' ').strip()
    return table
def descriptor_as_adict():
    """Return a sample benchmark descriptor as a nested addict.Dict."""
    # addict auto-creates intermediate Dicts on attribute assignment,
    # so the nested structure can be built field by field.
    descriptor = addict.Dict()
    descriptor.spec_version = "0.1.0"
    descriptor.info.description = "some benchmark"
    descriptor.hardware.instance_type = "p3.8xlarge"
    descriptor.hardware.strategy = "single_node"
    descriptor.env.docker_image = "jlcont/benchmarking:270219"
    descriptor.ml.benchmark_code = "python /home/benchmark/image_classification.py"
    descriptor.data.sources = [
        addict.Dict(src="foo1", path="bar1"),
        addict.Dict(src="foo2", path="bar2"),
    ]
    return descriptor
def create_tensorflow_estimator(session: Session, descriptor: BenchmarkDescriptor, source_dir: str, config: SageMakerExecutorConfig) -> Framework:
    """Build a SageMaker TensorFlow estimator for the given benchmark.

    :param session: SageMaker session used for the run
    :param descriptor: benchmark descriptor with hardware/env/ml settings
    :param source_dir: directory containing the benchmark code
    :param config: executor configuration (role, buckets, networking)
    :return: a configured TensorFlow estimator
    """
    estimator_args = _create_common_estimator_args(session, descriptor, source_dir, config)

    # Horovod runs need MPI distribution settings on the estimator.
    if descriptor.hardware.strategy == DistributedStrategy.HOROVOD:
        mpi_settings = addict.Dict(
            enabled=True,
            processes_per_host=int(descriptor.hardware.processes_per_instance),
            custom_mpi_options=MPI_OPTIONS,
        )
        estimator_args.distributions.mpi = mpi_settings

    hyper_params = get_hyper_params(descriptor)
    estimator_args.script_mode = True
    logger.info(f"Creating TF Estimator with parameters {estimator_args}")
    return TensorFlow(**estimator_args, hyperparameters=hyper_params)
def parse_name(context: ExecutionToken, name: str) -> str:
    """
    Parse the name of a task, or a lane depending on the
    current event token.
    """
    # Cheap fast path: no placeholder markers means no interpolation
    # is needed at all.
    has_placeholders = "{" in name and "}" in name
    if not has_placeholders:
        return name

    try:
        return name.format(**addict.Dict(get_eval_data(context)))
    except Exception:
        # Best effort: fall back to the raw name on any failure.
        return name
def _create_common_estimator_args(
        session: Session, descriptor: BenchmarkDescriptor, source_dir: str,
        config: SageMakerExecutorConfig) -> addict.Dict:
    """Assemble the keyword arguments shared by all SageMaker estimators.

    :param session: SageMaker session used for the run
    :param descriptor: benchmark descriptor with hardware/env/ml settings
    :param source_dir: directory containing the benchmark code
    :param config: executor configuration (role, buckets, networking)
    :return: addict.Dict of common estimator constructor arguments
    """
    metrics = get_metric_definitions(descriptor)
    py_version = descriptor.custom_params.python_version if descriptor.custom_params else ""

    common_args = addict.Dict(
        source_dir=source_dir,
        entry_point="tmp_entry.py",
        sagemaker_session=session,
        image_name=descriptor.env.docker_image,
        py_version=py_version or "py3",
        framework_version=descriptor.ml.framework_version or "",  # None is not a valid value
        train_instance_type=descriptor.hardware.instance_type,
        train_instance_count=descriptor.hardware.distributed.num_instances,
        role=config.sm_role,
        output_path=f"s3://{config.s3_output_bucket}",
        security_group_ids=config.security_group_ids,
        subnets=config.subnets,
        metric_definitions=metrics or None,
    )
    return common_args
def post_process_table(self, table, *args, **kwargs):
    """Collapse an OCR'd table into a clean list of row lists.

    If a column is totally empty, it won't be a column, but a double
    placed column border; such columns are removed.  Rows whose cells
    are all empty are dropped, and each remaining cell is passed
    through ``self.correct_this`` (erases the '|' in front of the
    strings).

    :param table: an unclean table (mapping of row-key -> mapping of
        col-key -> cell text)
    :return: clean table as a list of rows (each a list of strings)
    """
    if not table:
        # Guard: previously an empty table raised KeyError on
        # table['0'] below.
        return []

    clean_table = {}
    for i, (_, row_cells) in enumerate(table.items()):
        texts = [self.correct_this(cell) for cell in row_cells.values()]
        # Keep only rows that contain at least one non-empty cell
        # (checked against the uncorrected values, as before).
        if any(row_cells.values()):
            clean_table[i] = texts

    # Walk columns right-to-left so deletions don't shift the indices
    # of columns not yet visited.
    for col in reversed(range(len(table['0']))):
        if not any(row[col] for row in clean_table.values()):
            logging.info('empty column %d', col)
            for row in clean_table.values():
                del row[col]

    clean_table = list(clean_table.values())
    self.miss_out_logo_in_corner(clean_table, *args, **kwargs)
    return clean_table