def match(p: pathlib.PurePosixPath):
    ext = os.path.splitext(p)[1]
    if ext in ('.pyc', '.pyo'):
        return False
    if ignore_common_stdlib_files and p in STDLIB_IGNORE_FILES:
        return False
    if ignore_stdlib_test_dirs:
        for ignore in STDLIB_TEST_DIRS:
            try:
                p.relative_to(ignore)
                return False
            except ValueError:
                pass
    if ignore_common_stdlib_dirs:
        for ignore in STDLIB_NONTEST_IGNORE_DIRS:
            try:
                p.relative_to(ignore)
                return False
            except ValueError:
                pass
    return True
def subindex(self, *path):
    """
    Returns an `IndexFile` object listing only those files in or below the
    directory given by ``*path``.  The path keys in the resulting object will
    be relative to ``*path``.

    ``*path`` may be any relative path specification accepted by
    `PurePath.relative_to`, such as a string (e.g., ``"main"`` or
    ``"main/binary-amd64"``), a sequence of strings (e.g., ``"main",
    "binary-amd64"``), or a `PurePosixPath` object.
    """
    ### TODO: Add an option for also updating the `filename` attributes of
    ### the IndexEntries?
    ### TODO: Add an option for controlling whether to keep `fields`?
    subfiles = {}
    for p, entry in self.files.items():
        pathobj = PurePosixPath(p)
        ### TODO: Handle absolute paths and paths beginning with .. (or .?)
        try:
            subpath = pathobj.relative_to(*path)
        except ValueError:
            pass
        else:
            subfiles[str(subpath)] = entry
    return type(self)(files=subfiles, fields=self.fields.copy())
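# Hedged usage sketch for subindex() above. `idx` stands for an existing
# `IndexFile` whose `files` dict is keyed by POSIX-style path strings; the
# concrete keys shown here are made up for illustration:
#
#   idx.files                              # {"main/binary-amd64/Packages": <entry>, ...}
#   sub = idx.subindex("main", "binary-amd64")
#   list(sub.files)                        # ["Packages", ...] -- keys now relative to main/binary-amd64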
def _decode_name(name):
    if name is None:
        return None
    path = PurePosixPath(name)
    if path.is_absolute():
        path = path.relative_to(path.root)
    parts = [i for i in path.parts if i not in ('.', '..')]
    return PurePosixPath(*parts)
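# _decode_name() strips the leading root and drops any '.'/'..' components,
# which makes untrusted member names safe to join onto a base path.
# Hand-checked examples:
#
#   _decode_name("/../etc/./passwd")  -> PurePosixPath("etc/passwd")
#   _decode_name(None)                -> None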
def convert_posix_to_win32(rootfs, cwd, path):
    # rootfs is a concrete path.
    rootfs = Path(rootfs)
    # cwd and path are pure paths
    cwd = PurePosixPath(cwd[1:])
    path = PurePosixPath(path)
    if path.is_absolute():
        return rootfs / path.relative_to(path.anchor)
    else:
        return rootfs / cwd / path
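# Hand-checked behaviour of convert_posix_to_win32(): an absolute POSIX guest
# path is re-rooted under `rootfs`, while a relative one is resolved against
# `cwd` first (the concrete values below are illustrative):
#
#   convert_posix_to_win32(rootfs, "/home/user", "docs/a.txt")  -> <rootfs>/home/user/docs/a.txt
#   convert_posix_to_win32(rootfs, "/home/user", "/etc/hosts")  -> <rootfs>/etc/hosts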
def _scp(self, local_path, remote_path, dry=True):
    # remote_path = '/mnt/ibl/zadorlab/Subjects/flowers/2018-07-13/001
    remote_path = PurePosixPath('/').joinpath(
        remote_path.relative_to(PurePosixPath(FLATIRON_MOUNT)))
    # local_path
    self.mktree(remote_path.parent)
    self.ftp.pwd()
    _logger.info(f"FTP upload {local_path}")
    with open(local_path, 'rb') as fid:
        self.ftp.storbinary(f'STOR {local_path.name}', fid)
    return 0, ''
def get_name_relative_to(base_dir, curr_file):
    # Watch out for root directory when using this
    base_path = PurePosixPath(base_dir.full_name)
    file_path = PurePosixPath(curr_file.full_name)
    try:
        return file_path.relative_to(base_path) if isinstance(
            base_dir, Directory) else file_path
    except ValueError:
        if base_dir == curr_file and isinstance(base_dir, Directory):
            return "."
        if base_dir.get_parent_directory() == base_dir:
            return ".."
def _scp(self, local_path, remote_path, dry=True):
    remote_path = PurePosixPath('/').joinpath(
        remote_path.relative_to(PurePosixPath(FLATIRON_MOUNT))
    )
    _logger.info(f"Globus copy {local_path} to {remote_path}")
    if not dry:
        if isinstance(self.globus_transfer, globus_sdk.transfer.data.TransferData):
            self.globus_transfer.add_item(local_path, remote_path)
        else:
            self.globus_transfer.path_src.append(local_path)
            self.globus_transfer.path_dest.append(remote_path)
    return 0, ''
def extract_bundle(bundle: ZipFile) -> BundleContents:  # noqa(C901)
    """ Extract a bundle and verify its contents and structure. """
    if not _has_files_at_root(bundle):
        raise RuntimeError(
            'No files found in ZIP file\'s root directory. When selecting '
            'files to zip, make sure to directly select the files '
            'themselves. Do not select their parent directory, which would '
            'result in nesting all files inside that directory in the ZIP.')

    try:
        with bundle.open(MAIN_PROTOCOL_FILENAME, 'r') as protocol_file:
            py_protocol = protocol_file.read().decode('utf-8')
    except KeyError:
        raise RuntimeError(
            f'Bundled protocol should have a {MAIN_PROTOCOL_FILENAME} ' +
            'file in the root directory')

    bundled_labware: Dict[str, 'LabwareDefinition'] = {}
    bundled_data = {}
    bundled_python = {}
    for zipInfo in bundle.infolist():
        filepath = PurePosixPath(zipInfo.filename)
        rootpath = filepath.parts[0]

        # skip directories and weird OS-added directories
        # (note: the __MACOSX dir would contain '__MACOSX/foo.py'
        # and other files. This would break our inferences, so we need
        # to exclude all contents of that directory)
        if rootpath == '__MACOSX' or zipInfo.is_dir():
            continue

        with bundle.open(zipInfo) as f:
            if rootpath == LABWARE_DIR and filepath.suffix == '.json':
                labware_def = json.loads(f.read().decode('utf-8'))
                labware_key = uri_from_definition(labware_def)
                if labware_key in bundled_labware:
                    raise RuntimeError(
                        f'Conflicting labware in bundle: {labware_key}')
                bundled_labware[labware_key] = labware_def
            elif rootpath == DATA_DIR:
                # note: data files are read as binary
                bundled_data[str(filepath.relative_to(DATA_DIR))] = f.read()
            elif (filepath.suffix == '.py'
                  and str(filepath) != MAIN_PROTOCOL_FILENAME):
                bundled_python[str(filepath)] = f.read().decode('utf-8')

    if not bundled_labware:
        raise RuntimeError('No labware definitions found in bundle.')

    return BundleContents(py_protocol, bundled_labware, bundled_data,
                          bundled_python)
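# Minimal calling sketch for extract_bundle(), assuming the surrounding module
# defines the referenced constants and helpers (MAIN_PROTOCOL_FILENAME,
# LABWARE_DIR, DATA_DIR, _has_files_at_root, uri_from_definition):
#
#   from zipfile import ZipFile
#   with ZipFile("bundle.zip") as zf:
#       contents = extract_bundle(zf)   # BundleContents; raises RuntimeError on a malformed bundle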
def __host_casefold_path(hostpath: str) -> Optional[str]:
    # assuming posix host
    p = PurePosixPath(hostpath)
    norm = Path(p.anchor)
    for elem in p.relative_to(norm).parts:
        folded = elem.casefold()
        try:
            norm = next(entry for entry in norm.iterdir()
                        if entry.name.casefold() == folded)
        except StopIteration:
            return None
    return str(norm)
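# __host_casefold_path() walks the real filesystem one component at a time,
# matching each element case-insensitively, so on a case-sensitive host a
# hypothetical call could behave like:
#
#   __host_casefold_path("/Home/User/Readme.TXT")  # -> "/home/user/README.txt" (on-disk spelling), or None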
def _scp(self, local_path, remote_path, dry=True):
    remote_path = PurePosixPath('/').joinpath(
        remote_path.relative_to(PurePosixPath(FLATIRON_MOUNT))
    )
    # local_path
    self.mktree(remote_path.parent)
    # if the file already exists on the buffer, do not overwrite
    if local_path.name in self.ftp.nlst():
        _logger.info(f"FTP already on server {local_path}")
        return 0, ''
    self.ftp.pwd()
    _logger.info(f"FTP upload {local_path}")
    with open(local_path, 'rb') as fid:
        self.ftp.storbinary(f'STOR {local_path.name}', fid)
    return 0, ''
def full_split(_path):
    """
    Return a list with all the intermediate paths.
    The input path must be a POSIX path string (i.e., Linux or OSX).
    """
    intermediate_paths = list()
    _path = PurePosixPath(_path)
    if _path.is_absolute():
        _path = _path.relative_to("/")
    parts = _path.parts
    for i in range(1, len(parts)):
        intermediate_paths.append(PurePosixPath(*parts[0:i]).as_posix())
    return intermediate_paths
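# Hand-checked examples for full_split(); note the final component itself is
# not included in the returned list:
#
#   full_split("/usr/local/share/doc")  -> ['usr', 'usr/local', 'usr/local/share']
#   full_split("a/b")                   -> ['a']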
def _commit_line_for_file(self, filename: str) -> Optional[str]:
    '''Works out a reasonable single-line commit comment for the given file path.'''
    path = PurePosixPath(self.config.settings.OutputPath / filename)

    # Generic line for removals
    if not Path(path).is_file():
        return f'{path} removed'

    # Don't report manifest file updates
    if path.name.lower() == MANIFEST_FILENAME.lower():
        # Do not report this file
        return False  # type: ignore # no Literal support in Python 3.7

    # Don't report files in dotted directories
    for dirname in PurePosixPath(filename).parent.parts:
        if dirname.startswith('.') and len(dirname) > 1:
            # Do not report this file
            return False  # type: ignore # no Literal support in Python 3.7

    # Look up the relevant root
    root = self._find_matching_root(str(path))
    if not root:
        return None  # we don't know this file - fall back to default report

    relative_path = path.relative_to(root.path)

    # See if there's a relevant manifest entry with a version number
    entry = self._find_matching_manifest_entry(root, str(path))
    version: Optional[str] = entry.get('version', None) if entry else None

    # Get the name of the path from the root
    name = root.get_name_for_path(relative_path)

    if name and version:
        return f'{name} updated to version {version}'
    if name:
        return f'{name} updated'
    if version:
        return f'{relative_path} updated to version {version}'

    # Don't know this file
    return None
def _calculate_relative_path(origin: PurePosixPath, target: PurePosixPath):
    '''
    >>> _calculate_relative_path(PurePosixPath('input.json'), PurePosixPath('.schema', 'output.json'))
    PurePosixPath('.schema/output.json')
    >>> _calculate_relative_path(PurePosixPath('sub','input.json'), PurePosixPath('.schema', 'output.json'))
    PurePosixPath('../.schema/output.json')
    '''
    while True:
        origin = origin.parent
        try:
            result = target.relative_to(origin)
            return result
        except ValueError:
            pass
        if origin == PurePosixPath('.'):
            raise ValueError("Unable to calculate relative path")
        target = PurePosixPath('..', target)
def _commit_line_for_file(self, filename: str) -> Optional[str]:
    '''Works out a reasonable single-line commit comment for the given file path.'''
    path = PurePosixPath(self.config.settings.OutputPath / filename)

    # Generic line for removals
    if not Path(path).is_file():
        return f'{path} removed'

    # Don't report manifest file updates
    if path.name.lower() == MANIFEST_FILENAME.lower():
        return None

    # Look up the relevant root
    root = self._find_matching_root(str(path))
    if not root:
        # We don't know this file
        return None

    relative_path = path.relative_to(root.path)

    # See if there's a relevant manifest entry with a version number
    entry = self._find_matching_manifest_entry(root, str(path))
    version: Optional[str] = entry.get('version', None) if entry else None

    # Get the name of the path from the root
    name = root.get_name_for_path(relative_path)

    if name and version:
        return f'{name} updated to version {version}'
    if name:
        return f'{name} updated'
    if version:
        return f'{relative_path} updated to version {version}'

    # Don't know this file
    return None
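# Rough shape of the commit lines produced above (root and file names are
# hypothetical):
#
#   known root + manifest version  -> "SomeMod updated to version 1.2.3"
#   known root, no version         -> "SomeMod updated"
#   version but no friendly name   -> "<relative path> updated to version 1.2.3"
#   unknown file                   -> None (caller falls back to a default message)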
def resolve(self, notebookPath: PurePosixPath):
    return notebookPath.relative_to(self.__baseDirPath).with_suffix('')
def __init__(self, base_path: PPPath, path: PPPath, content_device) -> None:
    super().__init__(base_path, path)
    self._content_device = content_device
    self._rel_path = path.relative_to(base_path)
def get_path_relative_to_this(self, partial: PPPath) -> PPPath:
    return partial.relative_to(self._path.relative_to(self._base_path))
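# Example of the nested relative_to() call above, assuming self._base_path is
# PurePosixPath("/base") and self._path is PurePosixPath("/base/sub/dir"):
#
#   self._path.relative_to(self._base_path)               -> PurePosixPath("sub/dir")
#   get_path_relative_to_this(PurePosixPath("sub/dir/file.txt"))
#                                                          -> PurePosixPath("file.txt")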
class CSVWMapping:
    def __init__(self):
        self._csv_filename: Optional[Path] = None
        self._csv_stream: Optional[TextIO] = None
        self._mapping: Dict[str, Any] = {}
        self._column_names: List[str] = []
        self._columns: Dict[str, Column] = {}
        self._external_tables: List[Table] = []
        self._dataset_uri: Optional[URI] = None
        self._dataset_root_uri: Optional[URI] = None
        self._dataset = DataSet()
        self._components: List[Component] = []
        self._registry: Optional[URI] = None
        self._keys: List[str] = []
        self._metadata_filename: Optional[Path] = None
        self._foreign_keys: Optional[List[ForeignKey]] = None
        self._measureTemplate: Optional[URITemplate] = None
        self._measureTypes: Optional[List[str]] = None
        self._accretive_upload: bool = False
        self._containing_graph_uri: Optional[URI] = None
        self._codelist_base: Optional[Path] = None
        self._suppress_catalog_and_dsd_output: bool = False
        self._index: int = 0

    @staticmethod
    def namify(column_header: str):
        return pathify(column_header).replace('-', '_')

    @staticmethod
    def classify(column_header: str):
        return ''.join(part.capitalize() for part in pathify(column_header).split('-'))

    def join_dataset_uri(self, relative: str, use_true_dataset_root: bool = False):
        # treat the dataset URI as an entity that when joined with a fragment, just adds
        # the fragment, but when joined with a relative path, turns the dataset URI into a
        # container by adding a / to the end before adding the relative path
        f"""
        Where datasets have multiple distinct dataframes, `self._dataset_uri` is of the form
        http://gss-data.org.uk/data/gss_data/<family_path>/<dataset_root_path>/<dataset_path>

        Codelists are defined at the `dataset_root_path` level, so we need to be able to create URIs relative to
        http://gss-data.org.uk/data/gss_data/<family_path>/<dataset_root_path>
        """
        root_uri = self._dataset_root_uri if use_true_dataset_root else self._dataset_uri
        if root_uri is None:
            return URI(relative)
        elif relative.startswith('#'):
            return URI(urljoin(root_uri, relative, allow_fragments=True))
        else:
            return URI(urljoin(root_uri + '/', relative, allow_fragments=True))

    def set_csv(self, csv_filename: URI):
        # csv and csv.gz need to be read in slightly different ways
        if str(csv_filename).endswith("csv"):
            with open(csv_filename, newline='', encoding='utf-8') as f:
                self.set_input(csv_filename, f)
        elif str(csv_filename).endswith("csv.gz"):
            # gzip must be opened in text mode ('rt') for the encoding argument to apply
            with gzip.open(csv_filename, 'rt', encoding='utf-8') as f:
                self.set_input(csv_filename, f)
        else:
            raise ValueError("Only csv types of .csv and .csv.gz are supported."
                             " Not {}".format(csv_filename))

    def set_input(self, filename: URI, stream: TextIO):
        self._csv_stream = stream
        self._csv_filename = Path(str(filename)[:-3]) if str(filename).endswith(".csv.gz") else filename
        reader = csv.DictReader(stream)
        self._column_names = reader.fieldnames
        for col in self._column_names:
            self._columns[col] = Column(name=CSVWMapping.namify(col), titles=col, datatype="string")

    def set_local_codelist_base(self, base: str):
        self._codelist_base = Path(base)

    def set_accretive_upload(self, info_json: Dict):
        if "load" in info_json and "accretiveUpload" in info_json["load"]:
            self._accretive_upload = info_json["load"]["accretiveUpload"]
        # Else default of false

    def set_mapping(self, mapping):
        if 'transform' in mapping and 'columns' in mapping['transform']:
            self._mapping = mapping['transform']['columns']
        else:
            logging.error('No column mapping found.')

    def set_suppress_catalog_and_dsd_output(self, should_suppress: bool):
        self._suppress_catalog_and_dsd_output = should_suppress

    def add_foreign_key(self, foreign_key: ForeignKey):
        if self._foreign_keys is None:
            self._foreign_keys = []
        self._foreign_keys.append(foreign_key)

    def set_containing_graph_uri(self, uri: URI):
        self._containing_graph_uri = uri

    def set_dataset_uri(self, uri: URI, dataset_root_uri: Optional[URI] = None):
        f"""
        Please make sure you set the dataset_root_uri.

        If this dataset has only one dataframe associated then both {uri} and {dataset_root_uri}
        should be the same, e.g.
        `http://gss-data.org.uk/data/gss_data/<family-name>/<dataset-name>`

        If the dataset has more than one dataframe associated and so has a {uri} of the form
        `http://gss-data.org.uk/data/gss_data/<family-name>/<dataset-name>/<dataframe-name>`
        then the {dataset_root_uri} must represent the URI fragment common to all dataframes,
        e.g. `http://gss-data.org.uk/data/gss_data/<family-name>/<dataset-name>`
        """
        self._dataset_uri = uri

        if dataset_root_uri is None:
            logging.warning("Dataset_root_uri is unset. " +
                            "In future this warning will be converted to an error and terminate your build.")

            # Legacy compatibility code:
            # This code will NOT survive any change in URI standards.
            if self._dataset_uri is not None:
                matches: re.Match = re.match("^(.+)/gss_data/([^/]+)/([^/]+).*$", self._dataset_uri,
                                             re.RegexFlag.IGNORECASE)
                base_uri = f"{matches.group(1)}/gss_data"
                family_path = matches.group(2)
                dataset_root_path = matches.group(3)
                dataset_root_uri = f"{base_uri}/{family_path}/{dataset_root_path}"

        self._dataset_root_uri = dataset_root_uri

    def set_registry(self, uri: URI):
        self._registry = uri

    def _validate(self):
        # check variable names are consistent
        declared_names = set([col.name for col in self._columns.values()])
        used_names: Set[str] = set()
        for name_set in (
            variables(t)
            for col in self._columns.values()
            for t in [col.propertyUrl, col.valueUrl]
            if t is not None
        ):
            used_names.update(name_set)
        if not declared_names.issuperset(used_names):
            logging.error(f"Unmatched variable names: {used_names.difference(declared_names)}")
        # check used prefixes
        used_prefixes = set(
            t.split(':')[0]
            for col in self._columns.values()
            for t in [col.propertyUrl, col.valueUrl]
            if t is not None and not t.startswith('http') and ':' in t
        )
        if not set(prefix_map.keys()).issuperset(used_prefixes):
            logging.error(f"Unknown prefixes used: {used_prefixes.difference(prefix_map.keys())}")

    def _next_index(self):
        self._index = self._index + 1
        return self._index

    def _as_csvw_object(self):
        def get_conventional_local_codelist_scheme_uri(column_name: str) -> Resource:
            codelist_uri = self.join_dataset_uri(f"#scheme/{pathify(column_name)}",
                                                 use_true_dataset_root=True)
            return Resource(at_id=codelist_uri)

        def get_maybe_codelist_for_col(column_config: object, column_name: str) -> Optional[Resource]:
            if "codelist" in column_config:
                codelist = column_config["codelist"]
                if isinstance(codelist, bool) and not codelist:
                    # Config explicitly forbids a codelist being linked here.
                    return None
                return Resource(at_id=codelist)

            # Codelist should exist. Convention dictates it should be a local codelist.
            return get_conventional_local_codelist_scheme_uri(column_name)

        def get_conventional_local_codelist_concept_uri_template(column_name: str) -> URI:
            return self.join_dataset_uri(
                f"#concept/{pathify(column_name)}/{{+{self._columns[column_name].name}}}",
                use_true_dataset_root=True)

        def get_value_uri_template_for_col(column_def: object, column_name: str) -> URI:
            if "value" in column_def:
                return URI(column_def["value"])
            return get_conventional_local_codelist_concept_uri_template(column_name)

        def add_local_codelist(name: str):
            if self._codelist_base is not None:
                codelist_csv = (self._codelist_base / pathify(name)).with_suffix('.csv')
                codelist_relative_uri = URI(codelist_csv)
                self._external_tables.append(Table(
                    url=codelist_relative_uri,
                    tableSchema=URI("https://gss-cogs.github.io/family-schemas/codelist-schema.json"),
                    suppressOutput=True
                ))
                self.add_foreign_key(ForeignKey(
                    columnReference=self._columns[name].name,
                    reference=ColumnReference(
                        resource=codelist_relative_uri,
                        columnReference="notation"
                    )
                ))

        # Look to see whether the measure type has its own column
        for map_name, map_obj in self._mapping.items():
            if isinstance(map_obj, dict) and "dimension" in map_obj and \
                    map_obj["dimension"] == "http://purl.org/linked-data/cube#measureType":
                self._measureTemplate = URITemplate(map_obj["value"])
                if "types" in map_obj:
                    self._measureTypes = map_obj["types"]
                    # add a component specification for each measure
                    for t in map_obj["types"]:
                        template_vars = {CSVWMapping.namify(map_name): t}
                        self._components.append(
                            MeasureComponent(
                                at_id=self.join_dataset_uri(f"#component/{pathify(t)}"),
                                qb_componentProperty=Resource(at_id=self._measureTemplate.expand(template_vars)),
                                qb_measure=MeasureProperty(at_id=self._measureTemplate.expand(template_vars))
                            )
                        )

        # Now iterate over column headers in the given CSV file
        for name in self._column_names:
            if (
                self._mapping is not None
                and name in self._mapping
                and isinstance(self._mapping[name], dict)
            ):
                obj = self._mapping[name]
                if "dimension" in obj and "value" in obj:
                    self._keys.append(self._columns[name].name)
                    datatype = "string"
                    # if this is a measure type column and has a "types" list, we can validate the
                    # expected values of the column using a regular expression, see
                    # https://www.w3.org/TR/tabular-data-primer/#h-enumeration-regexp
                    if obj["dimension"] == "http://purl.org/linked-data/cube#measureType" and "types" in obj:
                        datatype = Datatype(
                            base="string",
                            format=f"^({'|'.join(obj['types'])})$"
                        )
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=URI(obj["dimension"]),
                        valueUrl=URI(obj["value"]),
                        datatype=datatype
                    )
                    self._components.append(DimensionComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=URI(obj["dimension"])),
                        qb_dimension=DimensionProperty(
                            at_id=URI(obj["dimension"]),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            )
                        ),
                        qb_order=self._next_index()
                    ))
                elif "parent" in obj:
                    # a local dimension that has a super property
                    description: Optional[str] = obj.get("description", None)
                    label: str = obj.get("label", name)
                    source: Optional[Resource] = None
                    if "source" in obj:
                        source = Resource(at_id=URI(obj["source"]))
                    self._keys.append(self._columns[name].name)
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                        valueUrl=get_value_uri_template_for_col(obj, name)
                    )
                    self._components.append(DimensionComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}")),
                        qb_dimension=DimensionProperty(
                            at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            ),
                            qb_codeList=get_maybe_codelist_for_col(obj, name),
                            rdfs_label=label,
                            rdfs_comment=description,
                            rdfs_subPropertyOf=Resource(at_id=URI(obj["parent"])),
                            rdfs_isDefinedBy=source
                        ),
                        qb_order=self._next_index()
                    ))
                    if "codelist" not in obj:
                        if "parent" not in obj or obj["parent"] != "http://purl.org/linked-data/sdmx/2009/dimension#refPeriod":
                            add_local_codelist(name)
                elif "description" in obj or "label" in obj:
                    # local dimension with a definition/label and maybe source of the definition
                    description: Optional[str] = obj.get("description", None)
                    label: Optional[str] = obj.get("label", name)
                    source: Optional[Resource] = None
                    if "source" in obj:
                        source = Resource(at_id=URI(obj["source"]))
                    self._keys.append(self._columns[name].name)
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                        valueUrl=get_value_uri_template_for_col(obj, name)
                    )
                    self._components.append(DimensionComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}")),
                        qb_dimension=DimensionProperty(
                            at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            ),
                            qb_codeList=get_maybe_codelist_for_col(obj, name),
                            rdfs_label=label,
                            rdfs_comment=description,
                            rdfs_isDefinedBy=source
                        ),
                        qb_order=self._next_index()
                    ))
                    if "codelist" not in obj:
                        add_local_codelist(name)
                elif "attribute" in obj:
                    # Optionally, add the valueUrl if one has been specified
                    if "value" in obj:
                        self._columns[name] = self._columns[name]._replace(
                            propertyUrl=URI(obj["attribute"]),
                            valueUrl=URI(obj["value"])
                        )
                    else:
                        # no valueUrl has been specified
                        self._columns[name] = self._columns[name]._replace(
                            propertyUrl=URI(obj["attribute"])
                        )
                    # Datatype has been specified for attribute
                    if "datatype" in obj:
                        self._columns[name] = self._columns[name]._replace(datatype=obj["datatype"])
                    self._components.append(AttributeComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=URI(obj["attribute"])),
                        qb_attribute=AttributeProperty(
                            at_id=URI(obj["attribute"]),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            )
                        )
                    ))
                elif "unit" in obj and "measure" in obj:
                    self._columns[name] = self._columns[name]._replace(propertyUrl=obj["measure"])
                    if "datatype" in obj:
                        self._columns[name] = self._columns[name]._replace(datatype=obj["datatype"])
                    else:
                        self._columns[name] = self._columns[name]._replace(datatype="number")
                    self._components.extend([
                        DimensionComponent(
                            at_id=self.join_dataset_uri("#component/measure_type"),
                            qb_componentProperty=Resource(at_id=URI("http://purl.org/linked-data/cube#measureType")),
                            qb_dimension=DimensionProperty(
                                at_id=URI("http://purl.org/linked-data/cube#measureType"),
                                rdfs_range=Resource(at_id=URI("http://purl.org/linked-data/cube#MeasureProperty"))
                            )
                        ),
                        MeasureComponent(
                            at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                            qb_componentProperty=Resource(at_id=obj["measure"]),
                            qb_measure=MeasureProperty(at_id=obj["measure"])
                        ),
                        AttributeComponent(
                            at_id=self.join_dataset_uri(f"#component/unit"),
                            qb_componentProperty=Resource(
                                at_id=URI("http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure")
                            ),
                            qb_attribute=AttributeProperty(
                                at_id=URI("http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure")
                            )
                        )
                    ])
                    self._columns["virt_unit"] = Column(
                        name="virt_unit", virtual=True,
                        propertyUrl=URI("http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure"),
                        valueUrl=URI(obj["unit"])
                    )
                    self._columns["virt_measure"] = Column(
                        name="virt_measure", virtual=True,
                        propertyUrl=URI("http://purl.org/linked-data/cube#measureType"),
                        valueUrl=URI(obj["measure"])
                    )
                elif "datatype" in obj and not ("measure" in obj or "unit" in obj):
                    # Where a measure type column exists
                    assert self._measureTemplate is not None, "Must have a measure type column."
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=self._measureTemplate.uri,
                        datatype=obj["datatype"]
                    )
            elif self._mapping is not None and name in self._mapping and isinstance(self._mapping[name], bool):
                self._columns[name] = self._columns[name]._replace(
                    suppressOutput=not self._mapping[name]
                )
            else:
                # assume local dimension, with optional definition
                description: Optional[str] = None
                if self._mapping is not None and name in self._mapping and isinstance(self._mapping[name], str):
                    description = self._mapping[name]
                self._keys.append(self._columns[name].name)
                self._columns[name] = self._columns[name]._replace(
                    propertyUrl=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                    valueUrl=get_conventional_local_codelist_concept_uri_template(name)
                )
                self._components.append(DimensionComponent(
                    at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                    qb_componentProperty=Resource(at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}")),
                    qb_dimension=DimensionProperty(
                        at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                        rdfs_range=Resource(
                            at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                        ),
                        qb_codeList=get_conventional_local_codelist_scheme_uri(name),
                        rdfs_label=name,
                        rdfs_comment=description
                    ),
                    qb_order=self._next_index()
                ))
                add_local_codelist(name)

        self._columns["virt_dataset"] = Column(
            name="virt_dataset", virtual=True,
            propertyUrl=URI("qb:dataSet"),
            valueUrl=URI(self.join_dataset_uri("#dataset"))
        )
        self._columns["virt_type"] = Column(
            name="virt_type", virtual=True,
            propertyUrl=URI("rdf:type"),
            valueUrl=URI("qb:Observation")
        )
        self._validate()

        if self._containing_graph_uri is None:
            print("WARNING: _containing_graph_uri is unset. Imputing graph URI from context.")
            containing_graph_uri = self._dataset_uri.replace("gss-data.org.uk/data/gss_data",
                                                             "gss-data.org.uk/graph/gss_data")
        else:
            containing_graph_uri = self._containing_graph_uri

        csvw_structure = {
            "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
            "tables": self._as_tables(),
            "@id": containing_graph_uri,
            # sd:NamedGraph => https://www.w3.org/TR/sparql11-service-description/#sd-NamedGraph
            "rdf:type": {
                "@id": "sd:NamedGraph"
            },
            "sd:name": {
                "@id": containing_graph_uri
            }
        }

        if not self._accretive_upload and not self._suppress_catalog_and_dsd_output:
            # Don't want to upload DSD twice where we're just adding new data to existing data.
            # void:rootResource => https://www.w3.org/TR/void/#root-resource
            csvw_structure["void:rootResource"] = DataSet(
                at_id=self.join_dataset_uri('#dataset'),
                qb_structure=DSD(
                    at_id=self.join_dataset_uri('#structure'),
                    qb_component=self._components
                )
            )

        return csvw_structure

    def _as_tables(self):
        table_uri = URI(Path(self._csv_filename).name)
        # default is that metadata is filename + '-metadata.json'
        if self._metadata_filename is not None:
            table_uri = URI(self._csv_filename.relative_to(self._metadata_filename.parent))
        main_table = Table(
            url=table_uri,
            tableSchema=TableSchema(
                columns=list(self._columns.values()),
                primaryKey=self._keys,
                aboutUrl=self.join_dataset_uri('/'.join('{+' + s + '}' for s in self._keys)),
                foreignKeys=self._foreign_keys
            )
        )
        return self._external_tables + [main_table]

    @staticmethod
    def _as_plain_obj(o):
        def fix_prefix(key: str):
            for prefix, replace in {'at_': '@', 'qb_': 'qb:', 'rdfs_': 'rdfs:'}.items():
                if key.startswith(prefix):
                    return replace + key[len(prefix):]
            return key
        if isinstance(o, tuple):
            try:
                return {fix_prefix(k): CSVWMapping._as_plain_obj(v)
                        for (k, v) in dict(o._asdict()).items() if v is not None}
            except AttributeError:
                return o
        elif isinstance(o, dict):
            return {k: CSVWMapping._as_plain_obj(v) for (k, v) in o.items()}
        elif isinstance(o, Path):
            return str(o)
        elif isinstance(o, list):
            return [CSVWMapping._as_plain_obj(i) for i in o]
        else:
            return o

    def write(self, out: Union[URI, TextIO]):
        if not isinstance(out, TextIOBase):
            self._metadata_filename = Path(out)
            stream = open(out, "w", encoding="utf-8")
        else:
            stream = out
        plain_obj = CSVWMapping._as_plain_obj(self._as_csvw_object())
        logging.debug(json.dumps(plain_obj, indent=2))
        json.dump(plain_obj, stream, indent=2)
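# Rough usage sketch for CSVWMapping (a guess at the intended call order based
# on the methods above; the file names and URIs are hypothetical):
#
#   mapping = CSVWMapping()
#   mapping.set_dataset_uri(URI("http://gss-data.org.uk/data/gss_data/family/dataset"),
#                           URI("http://gss-data.org.uk/data/gss_data/family/dataset"))
#   mapping.set_csv(URI("observations.csv"))
#   mapping.set_mapping(info_json)   # expects {"transform": {"columns": {...}}}
#   mapping.write(URI("observations.csv-metadata.json"))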
def relative_to(self, path):
    try:
        return PurePosixPath.relative_to(self, path)
    except ValueError:
        return self
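# With this override, "not relative" becomes a no-op instead of an error
# (`p` stands for an instance of whatever PurePosixPath subclass defines it):
#
#   p = <subclass>("/a/b")
#   p.relative_to("/a")      -> "b"      (normal behaviour)
#   p.relative_to("/other")  -> "/a/b"   (returns self instead of raising ValueError)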
def relative_path(
    current: PurePosixPath,
    target: PurePosixPath,
    static_prefix: Optional[PurePosixPath] = None,
) -> PurePosixPath:
    """Calculate a dotted path from a source to destination.

    Relative paths are hard. Lots of edge cases, lots of configurable
    policies. This function is the innermost logic, which presumes lots of
    complexity is handled before stuff gets passed in.

    Themester's logic is based on Python's ``PurePosixPath``: a virtual
    hierarchy that is sort of like the filesystem, but not actually tied to a
    filesystem. References to documents in the site and static assets are
    done as these virtual pure paths.

    Static asset references are "normalized" at definition time to be
    relative to a configurable site root.

    Both ``current`` and ``target`` are expected to start with a slash. It
    doesn't matter if it does or doesn't end with a slash.

    This function doesn't care about whether folders should get ``/index``
    added to their path. In fact, it doesn't understand folders. It expects
    the path to include ``index`` when current or target are a collection of
    some kind.

    Policies handled before this is called:

    - Adding '/index' to current/target if it is a collection

    - Adding a configurable suffix such as ``index.html``

    - Converting a resource to a path

    - Detecting a resource is a collection and should get ``index`` added to path

    Args:
        current: Source from which target is relative, with leading slash
        target: Destination, with leading slash
        static_prefix: Path to insert between dots and target

    Returns:
        The path to the target.

    Raises:
        ValueError: Trying to get an invalid path.
    """
    if not current.is_absolute():
        m = f'Source path "{str(current)}" must start with a slash'
        raise ValueError(m)
    if static_prefix is None and not target.is_absolute():
        m = f'Target path "{str(target)}" must start with a slash'
        raise ValueError(m)

    # Do an optimization...bail out immediately if the same, but make
    # it relative
    if current == target:
        return PurePosixPath(current.name)

    # noinspection PyTypeChecker
    current_parents = iter(current.parents)
    target_parents = target.parents
    result: Optional[PurePosixPath] = None
    hops = -1
    while True:
        try:
            result = next(current_parents)
            hops += 1
            if result in target_parents:
                raise StopIteration()
        except StopIteration:
            break

    # What is the "leftover" part of target
    remainder_parts = target.relative_to(str(result))

    # How many hops up to go
    prefix = PurePosixPath("/".join(repeat("..", hops)))

    # Join it all together
    if static_prefix is None:
        v = prefix.joinpath(remainder_parts)
    else:
        v = prefix.joinpath(static_prefix, remainder_parts)

    return v
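# A couple of worked examples for relative_path(), hand-checked against the
# logic above (no static_prefix):
#
#   relative_path(PurePosixPath("/folder1/folder2/doc1"), PurePosixPath("/folder1/doc2"))
#   -> PurePosixPath("../doc2")
#
#   relative_path(PurePosixPath("/index"), PurePosixPath("/about"))
#   -> PurePosixPath("about")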
def absolute(self, path: PurePosixPath):
    path = '/' / path
    path = (self._basedir / path.as_posix()[1:])  # type: Path
    path.relative_to(self._basedir)
    return path
def _parse_bundle(bundle: ZipFile, filename: str = None) -> PythonProtocol:  # noqa: C901
    """ Parse a bundled Python protocol """
    if not ff.use_protocol_api_v2():
        raise RuntimeError(
            'Uploading a bundled protocol requires the robot to be set to '
            'Protocol API V2. Enable the \'Use Protocol API version 2\' '
            'toggle in the robot\'s Advanced Settings and restart the robot')

    if not _has_files_at_root(bundle):
        raise RuntimeError(
            'No files found in ZIP file\'s root directory. When selecting '
            'files to zip, make sure to directly select the files '
            'themselves. Do not select their parent directory, which would '
            'result in nesting all files inside that directory in the ZIP.')

    MAIN_PROTOCOL_FILENAME = 'protocol.ot2.py'
    LABWARE_DIR = 'labware'
    DATA_DIR = 'data'
    bundled_labware: Dict[str, Dict[str, Any]] = {}
    bundled_data = {}
    bundled_python = {}

    try:
        with bundle.open(MAIN_PROTOCOL_FILENAME, 'r') as protocol_file:
            py_protocol = protocol_file.read().decode('utf-8')
    except KeyError:
        raise RuntimeError(
            f'Bundled protocol should have a {MAIN_PROTOCOL_FILENAME} ' +
            'file in the root directory')

    for zipInfo in bundle.infolist():
        filepath = PurePosixPath(zipInfo.filename)
        rootpath = filepath.parts[0]

        # skip directories and weird OS-added directories
        # (note: the __MACOSX dir would contain '__MACOSX/foo.py'
        # and other files. This would break our inferences, so we need
        # to exclude all contents of that directory)
        if rootpath == '__MACOSX' or zipInfo.is_dir():
            continue

        with bundle.open(zipInfo) as f:
            if rootpath == LABWARE_DIR and filepath.suffix == '.json':
                labware_def = json.load(f)
                labware_key = _get_labware_uri(labware_def)
                if labware_key in bundled_labware:
                    raise RuntimeError(
                        f'Conflicting labware in bundle. {labware_key}')
                bundled_labware[labware_key] = labware_def
            elif rootpath == DATA_DIR:
                # note: data files are read as binary
                bundled_data[str(filepath.relative_to(DATA_DIR))] = f.read()
            elif (filepath.suffix == '.py'
                  and str(filepath) != MAIN_PROTOCOL_FILENAME):
                bundled_python[str(filepath)] = f.read().decode('utf-8')

    if not bundled_labware:
        raise RuntimeError('No labware definitions found in bundle.')

    result = _parse_python(
        py_protocol, filename, bundled_labware, bundled_data, bundled_python)

    if result.api_level != '2':
        raise RuntimeError('Bundled protocols must use Protocol API V2, ' +
                           f'got {result.api_level}')
    return result
if __name__ == "__main__":
    archive_dir = res.archive_dir
    nb_dir = res.nb_dir
    out_dir = res.out_dir
    do_zip = res.zip
    verbose = res.verbose

    if os.path.exists(archive_dir):
        shutil.rmtree(archive_dir)
    os.makedirs(archive_dir)

    for path in Path(nb_dir).rglob('*.html'):
        if verbose:
            print("Archiving {}".format(path))
        pure_path = PurePosixPath(path)
        rel_path = pure_path.relative_to(nb_dir)
        archive_path = os.path.join(archive_dir, rel_path)
        if verbose:
            print("\tCopying to {}".format(archive_path))
        if not os.path.exists(os.path.split(archive_path)[0]):
            os.makedirs(os.path.split(archive_path)[0])
        shutil.copy(path, archive_path)
        central_path = os.path.join(out_dir, rel_path)
        if verbose:
            print("\tCopying to {}".format(central_path))
        if not os.path.exists(os.path.split(central_path)[0]):
            os.makedirs(os.path.split(central_path)[0])
        shutil.copy(path, central_path)

    if os.path.isfile("index.html"):
        shutil.copy("index.html", os.path.join(archive_dir, "index.html"))
        shutil.copy("index.html", os.path.join(out_dir, "index.html"))