Example #1
    def match(p: pathlib.PurePosixPath):
        ext = os.path.splitext(p)[1]

        if ext in ('.pyc', '.pyo'):
            return False

        if ignore_common_stdlib_files and p in STDLIB_IGNORE_FILES:
            return False

        if ignore_stdlib_test_dirs:
            for ignore in STDLIB_TEST_DIRS:
                try:
                    p.relative_to(ignore)
                    return False
                except ValueError:
                    pass

        if ignore_common_stdlib_dirs:
            for ignore in STDLIB_NONTEST_IGNORE_DIRS:
                try:
                    p.relative_to(ignore)
                    return False
                except ValueError:
                    pass

        return True
Example #2
    def subindex(self, *path):
        """
        Returns an `IndexFile` object listing only those files in or below the
        directory given by ``*path``.  The path keys in the resulting object
        will be relative to ``*path``.

        ``*path`` may be any relative path specification accepted by
        `PurePath.relative_to`, such as a string (e.g., ``"main"`` or
        ``"main/binary-amd64"``), a sequence of strings (e.g., ``"main",
        "binary-amd64"``), or a `PurePosixPath` object.
        """
        ### TODO: Add an option for also updating the `filename` attributes of
        ### the IndexEntries?
        ### TODO: Add an option for controlling whether to keep `fields`?
        subfiles = {}
        for p, entry in self.files.items():
            pathobj = PurePosixPath(p)
            ### TODO: Handle absolute paths and paths beginning with .. (or .?)
            try:
                subpath = pathobj.relative_to(*path)
            except ValueError:
                pass
            else:
                subfiles[str(subpath)] = entry
        return type(self)(files=subfiles, fields=self.fields.copy())
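A quick illustration (paths are invented, not from the source) of the `PurePath.relative_to` call forms the docstring mentions; an entry that does not live under ``*path`` raises `ValueError` and is simply skipped by `subindex`:

    from pathlib import PurePosixPath

    PurePosixPath("main/binary-amd64/Packages").relative_to("main")
    # PurePosixPath('binary-amd64/Packages')
    PurePosixPath("main/binary-amd64/Packages").relative_to("main", "binary-amd64")
    # PurePosixPath('Packages')
    PurePosixPath("contrib/binary-amd64/Packages").relative_to("main")
    # raises ValueError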
Example #3
def _decode_name(name):
    # Normalize a path name into a safe relative path: strip any root and
    # drop '.' and '..' components.
    if name is None:
        return None
    path = PurePosixPath(name)
    if path.is_absolute():
        path = path.relative_to(path.root)
    parts = [i for i in path.parts if i not in ('.', '..')]
    return PurePosixPath(*parts)
Example #4
 def convert_posix_to_win32(rootfs, cwd, path):
     # rootfs is a concrete path.
     rootfs = Path(rootfs)
     # cwd and path are pure paths
     cwd = PurePosixPath(cwd[1:])
     path = PurePosixPath(path)
     if path.is_absolute():
         return rootfs / path.relative_to(path.anchor)
     else:
         return rootfs / cwd / path
Example #5
 def _scp(self, local_path, remote_path, dry=True):
     # remote_path = '/mnt/ibl/zadorlab/Subjects/flowers/2018-07-13/001'
     remote_path = PurePosixPath('/').joinpath(
         remote_path.relative_to(PurePosixPath(FLATIRON_MOUNT)))
     # local_path
     self.mktree(remote_path.parent)
     self.ftp.pwd()
     _logger.info(f"FTP upload {local_path}")
     with open(local_path, 'rb') as fid:
         self.ftp.storbinary(f'STOR {local_path.name}', fid)
     return 0, ''
Example #6
def get_name_relative_to(base_dir, curr_file):
    # Watch out for root directory when using this
    base_path = PurePosixPath(base_dir.full_name)
    file_path = PurePosixPath(curr_file.full_name)
    try:
        return file_path.relative_to(base_path) if isinstance(
            base_dir, Directory) else file_path
    except ValueError:
        if base_dir == curr_file and isinstance(base_dir, Directory):
            return "."
        if base_dir.get_parent_directory() == base_dir:
            return ".."
Example #7
 def _scp(self, local_path, remote_path, dry=True):
     remote_path = PurePosixPath('/').joinpath(
         remote_path.relative_to(PurePosixPath(FLATIRON_MOUNT))
     )
     _logger.info(f"Globus copy {local_path} to {remote_path}")
     if not dry:
         if isinstance(self.globus_transfer, globus_sdk.transfer.data.TransferData):
             self.globus_transfer.add_item(local_path, remote_path)
         else:
             self.globus_transfer.path_src.append(local_path)
             self.globus_transfer.path_dest.append(remote_path)
     return 0, ''
Example #8
def extract_bundle(bundle: ZipFile) -> BundleContents:  # noqa(C901)
    """ Extract a bundle and verify its contents and structure. """
    if not _has_files_at_root(bundle):
        raise RuntimeError(
            'No files found in ZIP file\'s root directory. When selecting '
            'files to zip, make sure to directly select the files '
            'themselves. Do not select their parent directory, which would '
            'result in nesting all files inside that directory in the ZIP.')
    try:
        with bundle.open(MAIN_PROTOCOL_FILENAME, 'r') as protocol_file:
            py_protocol = protocol_file.read().decode('utf-8')
    except KeyError:
        raise RuntimeError(
            f'Bundled protocol should have a {MAIN_PROTOCOL_FILENAME} ' +
            'file in the root directory')
    bundled_labware: Dict[str, 'LabwareDefinition'] = {}
    bundled_data = {}
    bundled_python = {}
    for zipInfo in bundle.infolist():
        filepath = PurePosixPath(zipInfo.filename)
        rootpath = filepath.parts[0]

        # skip directories and weird OS-added directories
        # (note: the __MACOSX dir would contain '__MACOSX/foo.py'
        # and other files. This would break our inferences, so we need
        # to exclude all contents of that directory)
        if rootpath == '__MACOSX' or zipInfo.is_dir():
            continue

        with bundle.open(zipInfo) as f:
            if rootpath == LABWARE_DIR and filepath.suffix == '.json':
                labware_def = json.loads(f.read().decode('utf-8'))
                labware_key = uri_from_definition(labware_def)
                if labware_key in bundled_labware:
                    raise RuntimeError(
                        f'Conflicting labware in bundle: {labware_key}')
                bundled_labware[labware_key] = labware_def
            elif rootpath == DATA_DIR:
                # note: data files are read as binary
                bundled_data[str(filepath.relative_to(DATA_DIR))] = f.read()
            elif (filepath.suffix == '.py'
                  and str(filepath) != MAIN_PROTOCOL_FILENAME):
                bundled_python[str(filepath)] = f.read().decode('utf-8')

    if not bundled_labware:
        raise RuntimeError('No labware definitions found in bundle.')

    return BundleContents(py_protocol, bundled_labware, bundled_data,
                          bundled_python)
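The data-file branch above relies on `relative_to` to strip the leading bundle directory from each member name before using it as a key. A minimal sketch, assuming `DATA_DIR` is the string `'data'` and using an invented member path:

    from pathlib import PurePosixPath

    filepath = PurePosixPath('data/measurements.csv')  # hypothetical zip member
    str(filepath.relative_to('data'))
    # 'measurements.csv' -- the key stored in bundled_data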
Example #9
    def __host_casefold_path(hostpath: str) -> Optional[str]:
        # assuming posix host
        p = PurePosixPath(hostpath)
        norm = Path(p.anchor)

        for elem in p.relative_to(norm).parts:
            folded = elem.casefold()

            try:
                norm = next(entry for entry in norm.iterdir()
                            if entry.name.casefold() == folded)
            except StopIteration:
                return None

        return str(norm)
Example #10
    def _scp(self, local_path, remote_path, dry=True):
        remote_path = PurePosixPath('/').joinpath(
            remote_path.relative_to(PurePosixPath(FLATIRON_MOUNT))
        )

        # local_path
        self.mktree(remote_path.parent)
        # if the file already exists on the buffer, do not overwrite
        if local_path.name in self.ftp.nlst():
            _logger.info(f"FTP already on server {local_path}")
            return 0, ''
        self.ftp.pwd()
        _logger.info(f"FTP upload {local_path}")
        with open(local_path, 'rb') as fid:
            self.ftp.storbinary(f'STOR {local_path.name}', fid)
        return 0, ''
Example #11
def full_split(_path):
    """
    Return a list with all the intermediate paths.
    The input path must be a POSIX path string (i.e., Linux or OSX).
    """
    intermediate_paths = list()

    _path = PurePosixPath(_path)

    if _path.is_absolute():
        _path = _path.relative_to("/")

    parts = _path.parts

    for i in range(1, len(parts)):
        intermediate_paths.append(PurePosixPath(*parts[0:i]).as_posix())

    return intermediate_paths
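A worked call (input is illustrative, not from the source) showing that only the strict prefixes are returned, not the full path itself:

    full_split('/usr/local/lib/python3.8')
    # ['usr', 'usr/local', 'usr/local/lib']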
Example #12
    def _commit_line_for_file(self, filename: str) -> Optional[str]:
        '''Works out a reasonable single-line commit comment for the given file path.'''
        path = PurePosixPath(self.config.settings.OutputPath / filename)

        # Generic line for removals
        if not Path(path).is_file():
            return f'{path} removed'

        # Don't report manifest file updates
        if path.name.lower() == MANIFEST_FILENAME.lower():
            # Do not report this file
            return False  # type: ignore  # no Literal support in Python 3.7

        # Don't report files in dotted directories
        for dirname in PurePosixPath(filename).parent.parts:
            if dirname.startswith('.') and len(dirname) > 1:
                # Do not report this file
                return False  # type: ignore  # no Literal support in Python 3.7

        # Look up the relevant root
        root = self._find_matching_root(str(path))
        if not root:
            return None  # we don't know this file - fall back to default report

        relative_path = path.relative_to(root.path)

        # See if there's a relevant manifest entry with a version number
        entry = self._find_matching_manifest_entry(root, str(path))
        version: Optional[str] = entry.get('version', None) if entry else None

        # Get the name of the path from the root
        name = root.get_name_for_path(relative_path)

        if name and version:
            return f'{name} updated to version {version}'

        if name:
            return f'{name} updated'

        if version:
            return f'{relative_path} updated to version {version}'

        # Don't know this file
        return None
Example #13
def _calculate_relative_path(origin: PurePosixPath, target: PurePosixPath):
    '''
    >>> _calculate_relative_path(PurePosixPath('input.json'), PurePosixPath('.schema', 'output.json'))
    PurePosixPath('.schema/output.json')
    >>> _calculate_relative_path(PurePosixPath('sub','input.json'), PurePosixPath('.schema', 'output.json'))
    PurePosixPath('../.schema/output.json')
    '''
    while True:
        origin = origin.parent

        try:
            result = target.relative_to(origin)
            return result
        except ValueError:
            pass

        if origin == PurePosixPath('.'):
            raise ValueError("Unable to calculate relative path")

        target = PurePosixPath('..', target)
Example #14
    def _commit_line_for_file(self, filename: str) -> Optional[str]:
        '''Works out a reasonable single-line commit comment for the given file path.'''
        path = PurePosixPath(self.config.settings.OutputPath / filename)

        # Generic line for removals
        if not Path(path).is_file():
            return f'{path} removed'

        # Don't report manifest file updates
        if path.name.lower() == MANIFEST_FILENAME.lower():
            return None

        # Look up the relevant root
        root = self._find_matching_root(str(path))
        if not root:
            # We don't know this file
            return None

        relative_path = path.relative_to(root.path)

        # See if there's a relevant manifest entry with a version number
        entry = self._find_matching_manifest_entry(root, str(path))
        version: Optional[str] = entry.get('version', None) if entry else None

        # Get the name of the path from the root
        name = root.get_name_for_path(relative_path)

        if name and version:
            return f'{name} updated to version {version}'

        if name:
            return f'{name} updated'

        if version:
            return f'{relative_path} updated to version {version}'

        # Don't know this file
        return None
Example #15
 def resolve(self, notebookPath: PurePosixPath):
     return notebookPath.relative_to(self.__baseDirPath).with_suffix('')
Example #16
 def __init__(self, base_path: PPPath, path: PPPath,
              content_device) -> None:
     super().__init__(base_path, path)
     self._content_device = content_device
     self._rel_path = path.relative_to(base_path)
Example #17
 def get_path_relative_to_this(self, partial: PPPath) -> PPPath:
     return partial.relative_to(self._path.relative_to(self._base_path))
Example #18
class CSVWMapping:
    def __init__(self):
        self._csv_filename: Optional[Path] = None
        self._csv_stream: Optional[TextIO] = None
        self._mapping: Dict[str, Any] = {}
        self._column_names: List[str] = []
        self._columns: Dict[str, Column] = {}
        self._external_tables: List[Table] = []
        self._dataset_uri: Optional[URI] = None
        self._dataset_root_uri: Optional[URI] = None
        self._dataset = DataSet()
        self._components: List[Component] = []
        self._registry: Optional[URI] = None
        self._keys: List[str] = []
        self._metadata_filename: Optional[Path] = None
        self._foreign_keys: Optional[List[ForeignKey]] = None
        self._measureTemplate: Optional[URITemplate] = None
        self._measureTypes: Optional[List[str]] = None
        self._accretive_upload: bool = False
        self._containing_graph_uri: Optional[URI] = None
        self._codelist_base: Optional[Path] = None
        self._suppress_catalog_and_dsd_output: bool = False
        self._index: int = 0

    @staticmethod
    def namify(column_header: str):
        return pathify(column_header).replace('-', '_')

    @staticmethod
    def classify(column_header: str):
        return ''.join(part.capitalize() for part in pathify(column_header).split('-'))

    def join_dataset_uri(self, relative: str, use_true_dataset_root: bool = False):
        # treat the dataset URI as an entity that when joined with a fragment, just adds
        # the fragment, but when joined with a relative path, turns the dataset URI into a container
        # by adding a / to the end before adding the relative path

        f"""
        Where datasets have multiple distinct dataframes, `self._dataset_uri` is of the form
            http://gss-data.org.uk/data/gss_data/<family_path>/<dataset_root_path>/<dataset_path>

        Codelists are defined at the `dataset_root_path` level, so we need to be able to create URIs relative to
            http://gss-data.org.uk/data/gss_data/<family_path>/<dataset_root_path>
        """
        root_uri = self._dataset_root_uri if use_true_dataset_root else self._dataset_uri

        if root_uri is None:
            return URI(relative)
        elif relative.startswith('#'):
            return URI(urljoin(root_uri, relative, allow_fragments=True))
        else:
            return URI(urljoin(root_uri + '/', relative, allow_fragments=True))
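        # Illustrative `urljoin` behaviour this method relies on (URIs are
        # made-up examples, not from the source):
        #   urljoin("http://gss-data.org.uk/data/x" + "/", "codelist.csv")
        #       -> "http://gss-data.org.uk/data/x/codelist.csv"
        #   urljoin("http://gss-data.org.uk/data/x", "#dataset")
        #       -> "http://gss-data.org.uk/data/x#dataset"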

    def set_csv(self, csv_filename: URI):

        # csv and csv.gz need to be read in slightly different ways
        if str(csv_filename).endswith("csv"):
            with open(csv_filename, newline='', encoding='utf-8') as f:
                self.set_input(csv_filename, f)
        elif str(csv_filename).endswith("csv.gz"):
            with gzip.open(csv_filename, 'rt', newline='', encoding='utf-8') as f:
                self.set_input(csv_filename, f)
        else:
            raise ValueError("Only csv types of .csv and /csv.gz are supported."
                    " Not {}".format(csv_filename))

    def set_input(self, filename: URI, stream: TextIO):
        self._csv_stream = stream
        self._csv_filename = Path(str(filename)[:-3]) if str(filename).endswith(".csv.gz") else filename
        reader = csv.DictReader(stream)
        self._column_names = reader.fieldnames
        for col in self._column_names:
            self._columns[col] = Column(name=CSVWMapping.namify(col), titles=col, datatype="string")

    def set_local_codelist_base(self, base: str):
        self._codelist_base = Path(base)

    def set_accretive_upload(self, info_json: Dict):
        if "load" in info_json and "accretiveUpload" in info_json["load"]:
            self._accretive_upload = info_json["load"]["accretiveUpload"]
        # Else default of false

    def set_mapping(self, mapping):
        if 'transform' in mapping and 'columns' in mapping['transform']:
            self._mapping = mapping['transform']['columns']
        else:
            logging.error(f'No column mapping found.')

    def set_suppress_catalog_and_dsd_output(self, should_suppress: bool):
        self._suppress_catalog_and_dsd_output = should_suppress

    def add_foreign_key(self, foreign_key: ForeignKey):
        if self._foreign_keys is None:
            self._foreign_keys = []
        self._foreign_keys.append(foreign_key)

    def set_containing_graph_uri(self, uri: URI):
        self._containing_graph_uri = uri

    def set_dataset_uri(self, uri: URI, dataset_root_uri: Optional[URI] = None):
        f"""
        Please make sure you set the dataset_root_uri.

        If this dataset has only one dataframe associated then both {uri} and {dataset_root_uri} should be the same, 
        e.g.
            `http://gss-data.org.uk/data/gss_data/<family-name>/<dataset-name>`

        If the dataset has more than one dataframe associated and so has a {uri} of the form
            `http://gss-data.org.uk/data/gss_data/<family-name>/<dataset-name>/<dataframe-name>`
        then the {dataset_root_uri} must represent the URI fragment common to all dataframes, e.g.
            `http://gss-data.org.uk/data/gss_data/<family-name>/<dataset-name>`
        """
        self._dataset_uri = uri

        if dataset_root_uri is None:
            logging.warning("Dataset_root_uri is unset. " +
                  "In future this warning will be converted to an error and terminate your build.")

            # Legacy compatibility code:
            # This code will NOT survive any change is URI standards.
            if self._dataset_uri is not None:
                matches: re.Match = re.match("^(.+)/gss_data/([^/]+)/([^/]+).*$", self._dataset_uri,
                                             re.RegexFlag.IGNORECASE)
                base_uri = f"{matches.group(1)}/gss_data"
                family_path = matches.group(2)
                dataset_root_path = matches.group(3)
                dataset_root_uri = f"{base_uri}/{family_path}/{dataset_root_path}"

        self._dataset_root_uri = dataset_root_uri

    def set_registry(self, uri: URI):
        self._registry = uri

    def _validate(self):
        # check variable names are consistent
        declared_names = set([col.name for col in self._columns.values()])
        used_names: Set[str] = set()
        for name_set in (
            variables(t)
            for col in self._columns.values()
            for t in [col.propertyUrl, col.valueUrl]
            if t is not None
        ):
            used_names.update(name_set)
        if not declared_names.issuperset(used_names):
            logging.error(f"Unmatched variable names: {used_names.difference(declared_names)}")
        # check used prefixes
        used_prefixes = set(
            t.split(':')[0]
            for col in self._columns.values()
            for t in [col.propertyUrl, col.valueUrl]
            if t is not None and not t.startswith('http') and ':' in t
        )
        if not set(prefix_map.keys()).issuperset(used_prefixes):
            logging.error(f"Unknown prefixes used: {used_prefixes.difference(prefix_map.keys())}")

    def _next_index(self):
        self._index = self._index + 1
        return self._index

    def _as_csvw_object(self):
        def get_conventional_local_codelist_scheme_uri(column_name: str) -> Resource:
            codelist_uri = self.join_dataset_uri(f"#scheme/{pathify(column_name)}", use_true_dataset_root=True)
            return Resource(at_id=codelist_uri)

        def get_maybe_codelist_for_col(column_config: object, column_name: str) -> Optional[Resource]:
            if "codelist" in column_config:
                codelist = column_config["codelist"]
                if isinstance(codelist, bool) and not codelist:
                    # Config explicitly forbids a codelist being linked here.
                    return None

                return Resource(at_id=codelist)

            # Codelist should exist. Convention dictates it should be a local codelist.
            return get_conventional_local_codelist_scheme_uri(column_name)

        def get_conventional_local_codelist_concept_uri_template(column_name: str) -> URI:
            return self.join_dataset_uri(f"#concept/{pathify(column_name)}/{{+{self._columns[column_name].name}}}",
                                         use_true_dataset_root=True)

        def get_value_uri_template_for_col(column_def: object, column_name: str) -> URI:
            if "value" in column_def:
                return URI(column_def["value"])

            return get_conventional_local_codelist_concept_uri_template(column_name)

        def add_local_codelist(name: str):
            if self._codelist_base is not None:
                codelist_csv = (self._codelist_base / pathify(name)).with_suffix('.csv')
                codelist_relative_uri = URI(codelist_csv)
                self._external_tables.append(Table(
                    url=codelist_relative_uri,
                    tableSchema=URI("https://gss-cogs.github.io/family-schemas/codelist-schema.json"),
                    suppressOutput=True
                ))
                self.add_foreign_key(ForeignKey(
                    columnReference=self._columns[name].name,
                    reference=ColumnReference(
                        resource=codelist_relative_uri,
                        columnReference="notation"
                    )
                ))

        # Look to see whether the measure type has its own column
        for map_name, map_obj in self._mapping.items():
            if isinstance(map_obj, dict) and "dimension" in map_obj and map_obj["dimension"] == "http://purl.org/linked-data/cube#measureType":
                self._measureTemplate = URITemplate(map_obj["value"])
                if "types" in map_obj:
                    self._measureTypes = map_obj["types"]
                    # add a component specification for each measure
                    for t in map_obj["types"]:
                        template_vars = {CSVWMapping.namify(map_name): t}
                        self._components.append(
                            MeasureComponent(
                                at_id=self.join_dataset_uri(f"#component/{pathify(t)}"),
                                qb_componentProperty=Resource(at_id=self._measureTemplate.expand(template_vars)),
                                qb_measure=MeasureProperty(at_id=self._measureTemplate.expand(template_vars))
                            )
                        )
        # Now iterate over column headers in the given CSV file
        for name in self._column_names:
            if (
                self._mapping is not None 
                and name in self._mapping 
                and isinstance(self._mapping[name], dict)
            ):
                obj = self._mapping[name]
                if "dimension" in obj and "value" in obj:
                    self._keys.append(self._columns[name].name)
                    datatype = "string"
                    # if this is a measure type column and has a "types" list, we can validate the
                    # expected values of the column using a regular expression, see
                    # https://www.w3.org/TR/tabular-data-primer/#h-enumeration-regexp
                    if obj["dimension"] == "http://purl.org/linked-data/cube#measureType" and "types" in obj:
                        datatype = Datatype(
                            base="string",
                            format=f"^({'|'.join(obj['types'])})$"
                        )
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=URI(obj["dimension"]),
                        valueUrl=URI(obj["value"]),
                        datatype=datatype
                    )
                    self._components.append(DimensionComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=URI(obj["dimension"])),
                        qb_dimension=DimensionProperty(
                            at_id=URI(obj["dimension"]),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            )
                        ),
                        qb_order=self._next_index()
                    ))
                elif "parent" in obj:
                    # a local dimension that has a super property
                    description: Optional[str] = obj.get("description", None)
                    label: str = obj.get("label", name)
                    source: Optional[Resource] = None
                    if "source" in obj:
                        source = Resource(at_id=URI(obj["source"]))
                    self._keys.append(self._columns[name].name)
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                        valueUrl=get_value_uri_template_for_col(obj, name)
                    )
                    self._components.append(DimensionComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}")),
                        qb_dimension=DimensionProperty(
                            at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            ),
                            qb_codeList=get_maybe_codelist_for_col(obj, name),
                            rdfs_label=label,
                            rdfs_comment=description,
                            rdfs_subPropertyOf=Resource(at_id=URI(obj["parent"])),
                            rdfs_isDefinedBy=source
                        ),
                        qb_order=self._next_index()
                    ))
                    if "codelist" not in obj:
                        if "parent" not in obj or obj["parent"] != "http://purl.org/linked-data/sdmx/2009/dimension#refPeriod":
                            add_local_codelist(name)
                elif "description" in obj or "label" in obj:
                    # local dimension with a definition/label and maybe source of the definition
                    description: Optional[str] = obj.get("description", None)
                    label: Optional[str] = obj.get("label", name)
                    source: Optional[Resource] = None
                    if "source" in obj:
                        source = Resource(at_id=URI(obj["source"]))
                    self._keys.append(self._columns[name].name)
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                        valueUrl=get_value_uri_template_for_col(obj, name)
                    )
                    self._components.append(DimensionComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}")),
                        qb_dimension=DimensionProperty(
                            at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            ),
                            qb_codeList=get_maybe_codelist_for_col(obj, name),
                            rdfs_label=label,
                            rdfs_comment=description,
                            rdfs_isDefinedBy=source
                        ),
                        qb_order=self._next_index()
                    ))
                    if "codelist" not in obj:
                        add_local_codelist(name)
                elif "attribute" in obj:
                    # Optionally, add the valueUrl if one has been specified
                    if "value" in obj :
                        self._columns[name] = self._columns[name]._replace(
                            propertyUrl=URI(obj["attribute"]),
                            valueUrl=URI(obj["value"])
                        )
                    else: # no valueUrl has been specified
                        self._columns[name] = self._columns[name]._replace(
                            propertyUrl=URI(obj["attribute"])
                        )
                    # Datatype has been specified for the attribute
                    if "datatype" in obj:
                        self._columns[name] = self._columns[name]._replace(datatype=obj["datatype"])
                    
                    self._components.append(AttributeComponent(
                        at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                        qb_componentProperty=Resource(at_id=URI(obj["attribute"])),
                        qb_attribute=AttributeProperty(
                            at_id=URI(obj["attribute"]),
                            rdfs_range=Resource(
                                at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                            )
                        )
                    ))
                elif "unit" in obj and "measure" in obj:
                    self._columns[name] = self._columns[name]._replace(propertyUrl=obj["measure"])
                    if "datatype" in obj:
                        self._columns[name] = self._columns[name]._replace(datatype=obj["datatype"])
                    else:
                        self._columns[name] = self._columns[name]._replace(datatype="number")
                    self._components.extend([
                        DimensionComponent(
                            at_id=self.join_dataset_uri("#component/measure_type"),
                            qb_componentProperty=Resource(at_id=URI("http://purl.org/linked-data/cube#measureType")),
                            qb_dimension=DimensionProperty(
                                at_id=URI("http://purl.org/linked-data/cube#measureType"),
                                rdfs_range=Resource(at_id=URI("http://purl.org/linked-data/cube#MeasureProperty"))
                            )
                        ),
                        MeasureComponent(
                            at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                            qb_componentProperty=Resource(at_id=obj["measure"]),
                            qb_measure=MeasureProperty(at_id=obj["measure"])
                        ),
                        AttributeComponent(
                            at_id=self.join_dataset_uri(f"#component/unit"),
                            qb_componentProperty=Resource(
                                at_id=URI("http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure")
                            ),
                            qb_attribute=AttributeProperty(
                                at_id=URI("http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure")
                            )
                        )
                    ])
                    self._columns["virt_unit"] = Column(
                        name="virt_unit",
                        virtual=True,
                        propertyUrl=URI("http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure"),
                        valueUrl=URI(obj["unit"])
                    )
                    self._columns["virt_measure"] = Column(
                        name="virt_measure",
                        virtual=True,
                        propertyUrl=URI("http://purl.org/linked-data/cube#measureType"),
                        valueUrl=URI(obj["measure"])
                    )
                elif "datatype" in obj and not ("measure" in obj or "unit" in obj):
                    # Where a measure type column exists
                    assert self._measureTemplate is not None, "Must have a measure type column."
                    self._columns[name] = self._columns[name]._replace(
                        propertyUrl=self._measureTemplate.uri,
                        datatype=obj["datatype"]
                    )
            elif self._mapping is not None and name in self._mapping and isinstance(self._mapping[name], bool):
                self._columns[name] = self._columns[name]._replace(
                    suppressOutput=not self._mapping[name]
                )
            else:
                # assume local dimension, with optional definition
                description: Optional[str] = None
                if self._mapping is not None and name in self._mapping and isinstance(self._mapping[name], str):
                    description = self._mapping[name]
                self._keys.append(self._columns[name].name)
                self._columns[name] = self._columns[name]._replace(
                    propertyUrl=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                    valueUrl=get_conventional_local_codelist_concept_uri_template(name)
                )
                self._components.append(DimensionComponent(
                    at_id=self.join_dataset_uri(f"#component/{pathify(name)}"),
                    qb_componentProperty=Resource(at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}")),
                    qb_dimension=DimensionProperty(
                        at_id=self.join_dataset_uri(f"#dimension/{pathify(name)}"),
                        rdfs_range=Resource(
                            at_id=self.join_dataset_uri(f"#class/{CSVWMapping.classify(name)}")
                        ),
                        qb_codeList=get_conventional_local_codelist_scheme_uri(name),
                        rdfs_label=name,
                        rdfs_comment=description
                    ),
                    qb_order=self._next_index()
                ))
                add_local_codelist(name)

        self._columns["virt_dataset"] = Column(
            name="virt_dataset",
            virtual=True,
            propertyUrl=URI("qb:dataSet"),
            valueUrl=URI(self.join_dataset_uri("#dataset"))
        )
        self._columns["virt_type"] = Column(
            name="virt_type",
            virtual=True,
            propertyUrl=URI("rdf:type"),
            valueUrl=URI("qb:Observation")
        )
        self._validate()

        if self._containing_graph_uri is None:
            print("WARNING: _containing_graph_uri is unset. Imputing graph URI from context.")
            containing_graph_uri = self._dataset_uri.replace("gss-data.org.uk/data/gss_data",
                                                             "gss-data.org.uk/graph/gss_data")
        else:
            containing_graph_uri = self._containing_graph_uri

        csvw_structure = {
            "@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
            "tables": self._as_tables(),
            "@id": containing_graph_uri,
            # sd:NamedGraph => https://www.w3.org/TR/sparql11-service-description/#sd-NamedGraph
            "rdf:type": {
                "@id": "sd:NamedGraph"
            },
            "sd:name": {
                "@id": containing_graph_uri
            }
        }

        if not self._accretive_upload and not self._suppress_catalog_and_dsd_output:
            # Don't want to upload DSD twice where we're just adding new data to existing data.
            # void:rootResource => https://www.w3.org/TR/void/#root-resource
            csvw_structure["void:rootResource"] = DataSet(
                at_id=self.join_dataset_uri('#dataset'),
                qb_structure=DSD(
                    at_id=self.join_dataset_uri('#structure'),
                    qb_component=self._components
                )
            )

        return csvw_structure

    def _as_tables(self):
        table_uri = URI(Path(self._csv_filename).name)  # default is that metadata is filename + '-metadata.json'
        if self._metadata_filename is not None:
            table_uri = URI(self._csv_filename.relative_to(self._metadata_filename.parent))
        main_table = Table(
            url=table_uri,
            tableSchema=TableSchema(
                columns=list(self._columns.values()),
                primaryKey=self._keys,
                aboutUrl=self.join_dataset_uri('/'.join('{+' + s + '}' for s in self._keys)),
                foreignKeys=self._foreign_keys
            )
        )
        return self._external_tables + [main_table]

    @staticmethod
    def _as_plain_obj(o):
        def fix_prefix(key: str):
            for prefix, replace in {'at_': '@', 'qb_': 'qb:', 'rdfs_': 'rdfs:'}.items():
                if key.startswith(prefix):
                    return replace + key[len(prefix):]
            return key
        if isinstance(o, tuple):
            try:
                return {fix_prefix(k): CSVWMapping._as_plain_obj(v) for (k, v) in dict(o._asdict()).items() if v is not None}
            except AttributeError:
                return o
        elif isinstance(o, dict):
            return {k: CSVWMapping._as_plain_obj(v) for (k, v) in o.items()}
        elif isinstance(o, Path):
            return str(o)
        elif isinstance(o, list):
            return [CSVWMapping._as_plain_obj(i) for i in o]
        else:
            return o

    def write(self, out: Union[URI, TextIO]):
        if not isinstance(out, TextIOBase):
            self._metadata_filename = Path(out)
            stream = open(out, "w", encoding="utf-8")
        else:
            stream = out
        plain_obj = CSVWMapping._as_plain_obj(self._as_csvw_object())
        logging.debug(json.dumps(plain_obj, indent=2))
        json.dump(plain_obj, stream, indent=2)
Example #19
 def relative_to(self, path):
     try:
         return PurePosixPath.relative_to(self, path)
     except ValueError:
         return self
Example #20
 def relative_to(self, path):
     try:
         return PurePosixPath.relative_to(self, path)
     except ValueError:
         return self
Example #21
def relative_path(
    current: PurePosixPath,
    target: PurePosixPath,
    static_prefix: Optional[PurePosixPath] = None,
) -> PurePosixPath:
    """Calculate a dotted path from a source to destination.

    Relative paths are hard.
    Lots of edge cases, lots of configurable policies.
    This function is the innermost logic, which presumes lots of complexity is
    handled before stuff gets passed in.

    Themester's logic is based on Python's ``PurePosixPath``: a virtual hierarchy that is sort of
    like the filesystem, but not actually tied to a filesystem.
    References to documents in the site and static assets are done as these virtual pure paths.
    Static asset references are "normalized" at definition time to be relative to a configurable site root.

    Both ``current`` and ``target`` are expected to start with a slash.
    It doesn't matter whether or not they end with a slash.

    This function doesn't care about whether folders should get ``/index`` added to their path.
    In fact, it doesn't understand folders.
    It expects the path to include ``index`` when current or target is a collection of some kind.

    Policies handled before this is called:

    - Adding '/index' to current/target if it is a collection

    - Adding a configurable suffix such as ``index.html``

    - Converting a resource to a path

    - Detecting a resource is a collection and should get ``index`` added to path

    Args:
        current: Source from which target is relative, with leading slash
        target: Destination, with leading slash
        static_prefix: Path to insert between dots and target

    Returns:
        The path to the target.

    Raises:
        ValueError: Trying to get an invalid path.
    """
    if not current.is_absolute():
        m = f'Source path "{str(current)}" must start with a slash'
        raise ValueError(m)

    if static_prefix is None and not target.is_absolute():
        m = f'Target path "{str(target)}" must start with a slash'
        raise ValueError(m)

    # Do an optimization...bail out immediately if the same, but make
    # it relative
    if current == target:
        return PurePosixPath(current.name)

    # noinspection PyTypeChecker
    current_parents = iter(current.parents)
    target_parents = target.parents

    result: Optional[PurePosixPath] = None
    hops = -1

    while True:
        try:
            result = next(current_parents)
            hops += 1
            if result in target_parents:
                raise StopIteration()
        except StopIteration:
            break

    # What is the "leftover" part of target
    remainder_parts = target.relative_to(str(result))

    # How many hops up to go
    prefix = PurePosixPath("/".join(repeat("..", hops)))

    # Join it all together
    if static_prefix is None:
        v = prefix.joinpath(remainder_parts)
    else:
        v = prefix.joinpath(static_prefix, remainder_parts)
    return v
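A worked call based on the docstring's contract (paths are invented for illustration): from a document at /folder1/subfolder/page to a target at /folder1/other/target, one hop up is needed before descending again:

    relative_path(
        PurePosixPath('/folder1/subfolder/page'),
        PurePosixPath('/folder1/other/target'),
    )
    # PurePosixPath('../other/target')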
Example #22
 def absolute(self, path: PurePosixPath):
     path = '/' / path
     path = (self._basedir / path.as_posix()[1:])  #type: Path
     path.relative_to(self._basedir)
     return path
Example #23
def _parse_bundle(bundle: ZipFile, filename: str = None) -> PythonProtocol:  # noqa: C901
    """ Parse a bundled Python protocol """
    if not ff.use_protocol_api_v2():
        raise RuntimeError(
            'Uploading a bundled protocol requires the robot to be set to '
            'Protocol API V2. Enable the \'Use Protocol API version 2\' '
            'toggle in the robot\'s Advanced Settings and restart the robot')
    if not _has_files_at_root(bundle):
        raise RuntimeError(
            'No files found in ZIP file\'s root directory. When selecting '
            'files to zip, make sure to directly select the files '
            'themselves. Do not select their parent directory, which would '
            'result in nesting all files inside that directory in the ZIP.')

    MAIN_PROTOCOL_FILENAME = 'protocol.ot2.py'
    LABWARE_DIR = 'labware'
    DATA_DIR = 'data'
    bundled_labware: Dict[str, Dict[str, Any]] = {}
    bundled_data = {}
    bundled_python = {}

    try:
        with bundle.open(MAIN_PROTOCOL_FILENAME, 'r') as protocol_file:
            py_protocol = protocol_file.read().decode('utf-8')
    except KeyError:
        raise RuntimeError(
            f'Bundled protocol should have a {MAIN_PROTOCOL_FILENAME} ' +
            'file in the root directory')

    for zipInfo in bundle.infolist():
        filepath = PurePosixPath(zipInfo.filename)
        rootpath = filepath.parts[0]

        # skip directories and weird OS-added directories
        # (note: the __MACOSX dir would contain '__MACOSX/foo.py'
        # and other files. This would break our inferences, so we need
        # to exclude all contents of that directory)
        if rootpath == '__MACOSX' or zipInfo.is_dir():
            continue

        with bundle.open(zipInfo) as f:
            if rootpath == LABWARE_DIR and filepath.suffix == '.json':
                labware_def = json.load(f)
                labware_key = _get_labware_uri(labware_def)
                if labware_key in bundled_labware:
                    raise RuntimeError(
                        f'Conflicting labware in bundle. {labware_key}')
                bundled_labware[labware_key] = labware_def
            elif rootpath == DATA_DIR:
                # note: data files are read as binary
                bundled_data[str(filepath.relative_to(DATA_DIR))] = f.read()
            elif (filepath.suffix == '.py' and
                  str(filepath) != MAIN_PROTOCOL_FILENAME):
                bundled_python[str(filepath)] = f.read().decode('utf-8')

    if not bundled_labware:
        raise RuntimeError('No labware definitions found in bundle.')

    result = _parse_python(
        py_protocol, filename, bundled_labware, bundled_data,
        bundled_python)

    if result.api_level != '2':
        raise RuntimeError('Bundled protocols must use Protocol API V2, ' +
                           f'got {result.api_level}')

    return result
Example #24
if __name__ == "__main__":
    archive_dir = res.archive_dir
    nb_dir = res.nb_dir
    out_dir = res.out_dir
    do_zip = res.zip
    verbose = res.verbose

    if os.path.exists(archive_dir):
        shutil.rmtree(archive_dir)
    os.makedirs(archive_dir)

    for path in Path(nb_dir).rglob('*.html'):
        if verbose:
            print("Archiving {}".format(path))
        pure_path = PurePosixPath(path)
        rel_path = pure_path.relative_to(nb_dir)
        archive_path = os.path.join(archive_dir, rel_path)
        if verbose:
            print("\tCopying to {}".format(archive_path))
        if not os.path.exists(os.path.split(archive_path)[0]):
            os.makedirs(os.path.split(archive_path)[0])
        shutil.copy(path, archive_path)
        central_path = os.path.join(out_dir, rel_path)
        if verbose:
            print("\tCopying to {}".format(central_path))
        if not os.path.exists(os.path.split(central_path)[0]):
            os.makedirs(os.path.split(central_path)[0])
        shutil.copy(path, central_path)
    if os.path.isfile("index.html"):
        shutil.copy("index.html", os.path.join(archive_dir, "index.html"))
        shutil.copy("index.html", os.path.join(out_dir, "index.html"))