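# Assumed module header (a sketch): these imports are inferred from the names
# used below and from typer CLI conventions; the exact import paths are
# assumptions, not confirmed by the original.
import sys
import petl
import typer
import json as pyjson
import yaml as pyyaml
from typing import List
from typer import Option as Opt
from typer import Argument as Arg

# `describe`, `extract`, `validate`, `Detector`, `Layout`, `helpers`,
# `settings`, and `system` are assumed to come from the surrounding package
# (import paths hypothetical):
from frictionless import Detector, Layout, describe, extract, validate, system
from frictionless import helpers, settings

# Assumed Typer app; each `program_*` function below would normally be
# registered with a decorator such as @program.command(name="describe").
program = typer.Typer()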
def program_api(
    port: int = Opt(settings.DEFAULT_SERVER_PORT, help="Specify server port"),
):
    """Start API server"""
    server = system.create_server("api")
    server.start(port=port)
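# The two helpers leaned on most heavily below are `helpers.remove_non_values`
# and `helpers.parse_csv_string`. As an illustration only, a minimal sketch of
# their assumed behavior (hypothetical names, not the real implementation):


def _sketch_remove_non_values(mapping: dict) -> dict:
    # Drop None-valued keys so that downstream defaults still apply
    return {key: value for key, value in mapping.items() if value is not None}


def _sketch_parse_csv_string(string, convert=str, fallback=False):
    # Split a comma-separated CLI value, e.g. "1,name1" -> [1, "name1"]
    # with convert=int and fallback=True (unconvertible items stay strings)
    if string is None:
        return None
    items = []
    for item in string.split(","):
        try:
            items.append(convert(item))
        except ValueError:
            if not fallback:
                raise
            items.append(item)
    return items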
def program_describe(
    source: List[str] = Arg(None, help="Data source to describe [default: stdin]"),
    type: str = Opt(None, help='Specify source type e.g. "package"'),
    # File
    scheme: str = Opt(None, help="Specify scheme [default: inferred]"),
    format: str = Opt(None, help="Specify format [default: inferred]"),
    hashing: str = Opt(None, help="Specify hashing algorithm [default: inferred]"),
    encoding: str = Opt(None, help="Specify encoding [default: inferred]"),
    innerpath: str = Opt(None, help="Specify in-archive path [default: first]"),
    compression: str = Opt(None, help="Specify compression [default: inferred]"),
    # Layout
    header_rows: str = Opt(None, help="Comma-separated row numbers [default: 1]"),
    header_join: str = Opt(None, help="A separator to join a multiline header"),
    pick_fields: str = Opt(None, help='Comma-separated fields to pick e.g. "1,name1"'),
    skip_fields: str = Opt(None, help='Comma-separated fields to skip e.g. "2,name2"'),
    limit_fields: int = Opt(None, help="Limit fields by this integer"),
    offset_fields: int = Opt(None, help="Offset fields by this integer"),
    pick_rows: str = Opt(None, help='Comma-separated rows to pick e.g. "1,<blank>"'),
    skip_rows: str = Opt(None, help='Comma-separated rows to skip e.g. "2,3,4,5"'),
    limit_rows: int = Opt(None, help="Limit rows by this integer"),
    offset_rows: int = Opt(None, help="Offset rows by this integer"),
    # Detector
    buffer_size: int = Opt(None, help="Limit byte buffer size by this integer"),
    sample_size: int = Opt(None, help="Limit data sample size by this integer"),
    field_type: str = Opt(None, help="Force all the fields to have this type"),
    field_names: str = Opt(None, help="Comma-separated list of field names"),
    field_confidence: float = Opt(None, help="A float from 0 to 1"),
    field_float_numbers: bool = Opt(None, help="Make number floats instead of decimals"),
    field_missing_values: str = Opt(None, help="Comma-separated list of missing values"),
    # Description
    basepath: str = Opt(None, help="Basepath of the resource/package"),
    expand: bool = Opt(None, help="Expand default values"),
    stats: bool = Opt(None, help="Infer stats"),
    yaml: bool = Opt(False, help="Return in pure YAML format"),
    json: bool = Opt(False, help="Return in JSON format"),
):
    """Describe a data source.

    Based on the inferred data source type it will return a resource or
    package descriptor. Default output format is YAML with front matter.
    """
""" # Support stdin is_stdin = False if not source: is_stdin = True source = [helpers.create_byte_stream(sys.stdin.buffer.read())] # Normalize parameters source = list(source) if len(source) > 1 else source[0] header_rows = helpers.parse_csv_string(header_rows, convert=int) pick_fields = helpers.parse_csv_string(pick_fields, convert=int, fallback=True) skip_fields = helpers.parse_csv_string(skip_fields, convert=int, fallback=True) pick_rows = helpers.parse_csv_string(pick_rows, convert=int, fallback=True) skip_rows = helpers.parse_csv_string(skip_rows, convert=int, fallback=True) field_names = helpers.parse_csv_string(field_names) field_missing_values = helpers.parse_csv_string(field_missing_values) # Prepare layout layout = ( Layout( header_rows=header_rows, header_join=header_join, pick_fields=pick_fields, skip_fields=skip_fields, limit_fields=limit_fields, offset_fields=offset_fields, pick_rows=pick_rows, skip_rows=skip_rows, limit_rows=limit_rows, offset_rows=offset_rows, ) or None ) # Prepare detector detector = Detector( **helpers.remove_non_values( dict( buffer_size=buffer_size, sample_size=sample_size, field_type=field_type, field_names=field_names, field_confidence=field_confidence, field_float_numbers=field_float_numbers, field_missing_values=field_missing_values, ) ) ) # Prepare options options = helpers.remove_non_values( dict( type=type, # Spec scheme=scheme, format=format, hashing=hashing, encoding=encoding, innerpath=innerpath, compression=compression, layout=layout, # Extra detector=detector, expand=expand, stats=stats, ) ) # Describe source try: metadata = describe(source, **options) except Exception as exception: typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) raise typer.Exit(1) # Return JSON if json: descriptor = metadata.to_json() typer.secho(descriptor) raise typer.Exit() # Return YAML if yaml: descriptor = metadata.to_yaml().strip() typer.secho(descriptor) raise typer.Exit() # Return default if is_stdin: source = "stdin" elif isinstance(source, list): source = " ".join(source) typer.secho("---") typer.secho(f"metadata: {source}", bold=True) typer.secho("---") typer.secho("") typer.secho(metadata.to_yaml().strip()) typer.secho("")
def program_validate(
    source: List[str] = Arg(None, help="Data source to validate [default: stdin]"),
    type: str = Opt(None, help='Specify source type e.g. "package"'),
    # File
    scheme: str = Opt(None, help="Specify scheme [default: inferred]"),
    format: str = Opt(None, help="Specify format [default: inferred]"),
    hashing: str = Opt(None, help="Specify hashing algorithm [default: inferred]"),
    encoding: str = Opt(None, help="Specify encoding [default: inferred]"),
    innerpath: str = Opt(None, help="Specify in-archive path [default: first]"),
    compression: str = Opt(None, help="Specify compression [default: inferred]"),
    # Layout
    header_rows: str = Opt(None, help="Comma-separated row numbers [default: 1]"),
    header_join: str = Opt(None, help="A separator to join a multiline header"),
    pick_fields: str = Opt(None, help='Comma-separated fields to pick e.g. "1,name1"'),
    skip_fields: str = Opt(None, help='Comma-separated fields to skip e.g. "2,name2"'),
    limit_fields: int = Opt(None, help="Limit fields by this integer"),
    offset_fields: int = Opt(None, help="Offset fields by this integer"),
    pick_rows: str = Opt(None, help='Comma-separated rows to pick e.g. "1,<blank>"'),
    skip_rows: str = Opt(None, help='Comma-separated rows to skip e.g. "2,3,4,5"'),
    limit_rows: int = Opt(None, help="Limit rows by this integer"),
    offset_rows: int = Opt(None, help="Offset rows by this integer"),
    # Schema
    schema: str = Opt(None, help="Specify a path to a schema"),
    # Stats
    stats_hash: str = Opt(None, help="Expected hash based on hashing option"),
    stats_bytes: int = Opt(None, help="Expected size in bytes"),
    stats_fields: int = Opt(None, help="Expected number of fields"),
    stats_rows: int = Opt(None, help="Expected number of rows"),
    # Detector
    buffer_size: int = Opt(None, help="Limit byte buffer size by this integer"),
    sample_size: int = Opt(None, help="Limit data sample size by this integer"),
    field_type: str = Opt(None, help="Force all the fields to have this type"),
    field_names: str = Opt(None, help="Comma-separated list of field names"),
    field_confidence: float = Opt(None, help="A float from 0 to 1"),
    field_float_numbers: bool = Opt(None, help="Make number floats instead of decimals"),
    field_missing_values: str = Opt(None, help="Comma-separated list of missing values"),
    schema_sync: bool = Opt(None, help="Sync the schema based on headers"),
    # Validation
    basepath: str = Opt(None, help="Basepath of the resource/package"),
    pick_errors: str = Opt(None, help='Comma-separated errors to pick e.g. "type-error"'),
    skip_errors: str = Opt(None, help='Comma-separated errors to skip e.g. "blank-row"'),
    limit_errors: int = Opt(None, help="Limit errors by this integer"),
    limit_memory: int = Opt(None, help="Limit memory by this integer in MB"),
    original: bool = Opt(None, help="Don't call infer on resources"),
    parallel: bool = Opt(None, help="Enable multiprocessing"),
    yaml: bool = Opt(False, help="Return in pure YAML format"),
    json: bool = Opt(False, help="Return in JSON format"),
):
    """Validate a data source.

    Based on the inferred data source type it will validate a resource or
    package. Default output format is YAML with front matter.
    """
""" # Support stdin is_stdin = False if not source: is_stdin = True source = [helpers.create_byte_stream(sys.stdin.buffer.read())] # Normalize parameters source = list(source) if len(source) > 1 else source[0] header_rows = helpers.parse_csv_string(header_rows, convert=int) pick_fields = helpers.parse_csv_string(pick_fields, convert=int, fallback=True) skip_fields = helpers.parse_csv_string(skip_fields, convert=int, fallback=True) pick_rows = helpers.parse_csv_string(pick_rows, convert=int, fallback=True) skip_rows = helpers.parse_csv_string(skip_rows, convert=int, fallback=True) field_names = helpers.parse_csv_string(field_names) field_missing_values = helpers.parse_csv_string(field_missing_values) pick_errors = helpers.parse_csv_string(pick_errors) skip_errors = helpers.parse_csv_string(skip_errors) # Prepare layout layout = (Layout( header_rows=header_rows, header_join=header_join, pick_fields=pick_fields, skip_fields=skip_fields, limit_fields=limit_fields, offset_fields=offset_fields, pick_rows=pick_rows, skip_rows=skip_rows, limit_rows=limit_rows, offset_rows=offset_rows, ) or None) # Prepare stats stats = (helpers.remove_non_values( dict( hash=stats_hash, bytes=stats_bytes, fields=stats_fields, rows=stats_rows, )) or None) # Prepare detector detector = Detector(**helpers.remove_non_values( dict( buffer_size=buffer_size, sample_size=sample_size, field_type=field_type, field_names=field_names, field_confidence=field_confidence, field_float_numbers=field_float_numbers, field_missing_values=field_missing_values, schema_sync=schema_sync, ))) # Prepare options options = helpers.remove_non_values( dict( type=type, # Spec scheme=scheme, format=format, hashing=hashing, encoding=encoding, innerpath=innerpath, compression=compression, layout=layout, schema=schema, stats=stats, # Extra basepath=basepath, detector=detector, pick_errors=pick_errors, skip_errors=skip_errors, limit_errors=limit_errors, limit_memory=limit_memory, original=original, parallel=parallel, )) # Validate source try: report = validate(source, **options) except Exception as exception: typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) raise typer.Exit(1) # Return JSON if json: content = report.to_json() typer.secho(content) raise typer.Exit() # Return YAML if yaml: content = report.to_yaml().strip() typer.secho(content) raise typer.Exit() # Return report if report.errors: content = [] if is_stdin: source = "stdin" typer.secho("---") typer.secho(f"invalid: {source}", bold=True) typer.secho("---") for error in report.errors: content.append([error.code, error.message]) typer.secho( str( petl.util.vis.lookall([["code", "message"]] + content, vrepr=str, style="simple"))) # Return tables prev_invalid = False for number, task in enumerate(report.tasks, start=1): if number != 1 and prev_invalid: typer.secho("") prefix = "valid" if task.valid else "invalid" source = task.resource.path if is_stdin: source = "stdin" typer.secho("---") typer.secho(f"{prefix}: {source}", bold=True) typer.secho("---") if task.errors: prev_invalid = True typer.secho("") content = [] for error in task.errors: content.append([ error.get("rowPosition"), error.get("fieldPosition"), error.code, error.message, ]) typer.secho( str( petl.util.vis.lookall( [["row", "field", "code", "message"]] + content, vrepr=str, style="simple", ))) # Return retcode raise typer.Exit(code=int(not report.valid))
def program_extract(
    source: List[str] = Arg(None, help="Data source to extract [default: stdin]"),
    type: str = Opt(None, help='Specify source type e.g. "package"'),
    # File
    scheme: str = Opt(None, help="Specify scheme [default: inferred]"),
    format: str = Opt(None, help="Specify format [default: inferred]"),
    hashing: str = Opt(None, help="Specify hashing algorithm [default: inferred]"),
    encoding: str = Opt(None, help="Specify encoding [default: inferred]"),
    innerpath: str = Opt(None, help="Specify in-archive path [default: first]"),
    compression: str = Opt(None, help="Specify compression [default: inferred]"),
    # Layout
    header_rows: str = Opt(None, help="Comma-separated row numbers [default: 1]"),
    header_join: str = Opt(None, help="A separator to join a multiline header"),
    pick_fields: str = Opt(None, help='Comma-separated fields to pick e.g. "1,name1"'),
    skip_fields: str = Opt(None, help='Comma-separated fields to skip e.g. "2,name2"'),
    limit_fields: int = Opt(None, help="Limit fields by this integer"),
    offset_fields: int = Opt(None, help="Offset fields by this integer"),
    pick_rows: str = Opt(None, help='Comma-separated rows to pick e.g. "1,<blank>"'),
    skip_rows: str = Opt(None, help='Comma-separated rows to skip e.g. "2,3,4,5"'),
    limit_rows: int = Opt(None, help="Limit rows by this integer"),
    offset_rows: int = Opt(None, help="Offset rows by this integer"),
    # Schema
    schema: str = Opt(None, help="Specify a path to a schema"),
    # Detector
    buffer_size: int = Opt(None, help="Limit byte buffer size by this integer"),
    sample_size: int = Opt(None, help="Limit data sample size by this integer"),
    field_type: str = Opt(None, help="Force all the fields to have this type"),
    field_names: str = Opt(None, help="Comma-separated list of field names"),
    field_confidence: float = Opt(None, help="A float from 0 to 1"),
    field_float_numbers: bool = Opt(None, help="Make number floats instead of decimals"),
    field_missing_values: str = Opt(None, help="Comma-separated list of missing values"),
    schema_sync: bool = Opt(None, help="Sync the schema based on headers"),
    # Extraction
    basepath: str = Opt(None, help="Basepath of the resource/package"),
    yaml: bool = Opt(False, help="Return in pure YAML format"),
    json: bool = Opt(False, help="Return in JSON format"),
    csv: bool = Opt(False, help="Return in CSV format"),
):
    """Extract a data source.

    Based on the inferred data source type it will extract the resource or
    package data. Default output format is tabulated with front matter.
    """
""" # Support stdin is_stdin = False if not source: is_stdin = True source = [helpers.create_byte_stream(sys.stdin.buffer.read())] # Normalize parameters source = list(source) if len(source) > 1 else source[0] header_rows = helpers.parse_csv_string(header_rows, convert=int) pick_fields = helpers.parse_csv_string(pick_fields, convert=int, fallback=True) skip_fields = helpers.parse_csv_string(skip_fields, convert=int, fallback=True) pick_rows = helpers.parse_csv_string(pick_rows, convert=int, fallback=True) skip_rows = helpers.parse_csv_string(skip_rows, convert=int, fallback=True) field_names = helpers.parse_csv_string(field_names) field_missing_values = helpers.parse_csv_string(field_missing_values) # Prepare layout layout = (Layout( header_rows=header_rows, header_join=header_join, pick_fields=pick_fields, skip_fields=skip_fields, limit_fields=limit_fields, offset_fields=offset_fields, pick_rows=pick_rows, skip_rows=skip_rows, limit_rows=limit_rows, offset_rows=offset_rows, ) or None) # Prepare detector detector = Detector(**helpers.remove_non_values( dict( buffer_size=buffer_size, sample_size=sample_size, field_type=field_type, field_names=field_names, field_confidence=field_confidence, field_float_numbers=field_float_numbers, field_missing_values=field_missing_values, schema_sync=schema_sync, ))) # Prepare options options = helpers.remove_non_values( dict( type=type, # Spec scheme=scheme, format=format, hashing=hashing, encoding=encoding, innerpath=innerpath, compression=compression, layout=layout, schema=schema, # Extra basepath=basepath, detector=detector, )) # Extract data try: process = ( lambda row: row.to_dict(json=True)) if json or yaml else None data = extract(source, process=process, **options) except Exception as exception: typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) raise typer.Exit(1) # Normalize data normdata = data if isinstance(data, list): normdata = {source: data} # Return JSON if json: content = pyjson.dumps(data, indent=2, ensure_ascii=False) typer.secho(content) raise typer.Exit() # Return YAML if yaml: content = pyyaml.safe_dump(data).strip() typer.secho(content) raise typer.Exit() # Return CSV if csv: for number, rows in enumerate(normdata.values(), start=1): for row in rows: if row.row_number == 1: typer.secho(helpers.stringify_csv_string(row.field_names)) typer.secho(row.to_str()) if number < len(normdata): typer.secho("") raise typer.Exit() # Return default for number, (name, rows) in enumerate(normdata.items(), start=1): if is_stdin: name = "stdin" typer.secho("---") typer.secho(f"data: {name}", bold=True) typer.secho("---") typer.secho("") subdata = helpers.rows_to_data(rows) typer.secho( str(petl.util.vis.lookall(subdata, vrepr=str, style="simple"))) if number < len(normdata): typer.secho("")