Code Example #1
File: shuffle.py  Project: mathewlee11/dask
def __call__(self, *args, **kwargs):
    import partd
    # Back the store with files in the configured temp directory, if any
    if self.tempdir:
        file = partd.File(dir=self.tempdir)
    else:
        file = partd.File()
    # Optionally buffer writes in memory (partd.Dict) before spilling to disk
    if self.buffer:
        return partd.PandasBlocks(partd.Buffer(partd.Dict(), file))
    else:
        return partd.PandasBlocks(file)
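
The object built above is a partd key-value store that accepts and returns pandas DataFrames. Below is a minimal usage sketch of the same stack; the key name and DataFrame contents are made up for illustration and are not taken from dask itself.

import pandas as pd
import partd

file = partd.File()                       # temporary on-disk byte store
store = partd.PandasBlocks(file)          # (de)serialize DataFrames block-wise

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
store.append({"part-0": df})              # write one partition under a key
store.append({"part-0": df})              # appends to the same key accumulate

result = store.get("part-0")              # concatenation of everything appended
print(len(result))                        # 6 rows
file.drop()                               # delete the underlying files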
Code Example #2
File: core.py  Project: pwolfram/dask
    def groupby(self, grouper, npartitions=None, blocksize=2**20):
        """ Group collection by key function

        Note that this requires full dataset read, serialization and shuffle.
        This is expensive.  If possible you should use ``foldby``.

        >>> b = from_sequence(range(10))
        >>> dict(b.groupby(lambda x: x % 2 == 0))  # doctest: +SKIP
        {True: [0, 2, 4, 6, 8], False: [1, 3, 5, 7, 9]}

        See Also
        --------

        Bag.foldby
        """
        if npartitions is None:
            npartitions = self.npartitions
        token = tokenize(self, grouper, npartitions, blocksize)

        import partd
        p = ('partd-' + token,)
        # Use Snappy-compressed on-disk partitions when python-snappy is
        # available; otherwise fall back to plain uncompressed files
        try:
            dsk1 = {p: (partd.Python, (partd.Snappy, partd.File()))}
        except AttributeError:
            dsk1 = {p: (partd.Python, partd.File())}

        # Partition data on disk
        name = 'groupby-part-{0}-{1}'.format(funcname(grouper), token)
        dsk2 = dict(((name, i), (partition, grouper, (self.name, i),
                                 npartitions, p, blocksize))
                    for i in range(self.npartitions))

        # Barrier
        barrier_token = 'groupby-barrier-' + token

        def barrier(args):
            return 0

        dsk3 = {barrier_token: (barrier, list(dsk2))}

        # Collect groups
        name = 'groupby-collect-' + token
        dsk4 = dict(((name, i),
                     (collect, grouper, i, p, barrier_token))
                    for i in range(npartitions))

        return type(self)(merge(self.dask, dsk1, dsk2, dsk3, dsk4), name,
                          npartitions)
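
Since the doctest above is skipped, here is a hedged, runnable sketch of the same call. It assumes dask and partd are installed; the exact ordering of keys and group members may differ.

import dask.bag as db

b = db.from_sequence(range(10), npartitions=3)
# With the disk-based shuffle shown above, intermediate groups are spilled
# to disk through partd before being collected per output partition.
grouped = b.groupby(lambda x: x % 2 == 0)
print(dict(grouped.compute()))   # e.g. {True: [0, 2, 4, 6, 8], False: [1, 3, 5, 7, 9]}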
Code Example #3
    def __call__(self, *args, **kwargs):
        import tempfile
        import partd

        path = tempfile.mkdtemp(suffix=".partd", dir=self.tempdir)

        try:
            partd_compression = (
                getattr(partd.compressed, self.compression)
                if self.compression
                else None
            )
        except AttributeError as e:
            raise ImportError(
                "Not able to import and load {0} as compression algorithm."
                "Please check if the library is installed and supported by Partd.".format(
                    self.compression
                )
            ) from e
        file = partd.File(path)
        partd.file.cleanup_files.append(path)
        # Envelope partd file with compression, if set and available
        if partd_compression:
            file = partd_compression(file)
        if self.buffer:
            return partd.PandasBlocks(partd.Buffer(partd.Dict(), file))
        else:
            return partd.PandasBlocks(file)
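
Here the compression codec is looked up by name on partd.compressed. The following standalone sketch mirrors the same pattern; the "ZLib" codec name is an assumption about what the installed partd build exposes (Snappy, BZ2, etc. follow the same lookup), not something stated in the example above.

import tempfile
import partd

path = tempfile.mkdtemp(suffix=".partd")          # scratch directory for the store
codec = getattr(partd.compressed, "ZLib", None)   # assumed codec name; None if this build lacks it
file = partd.File(path)
store = codec(file) if codec else file            # compress bytes on disk only when available
store = partd.PandasBlocks(store)                 # accept/return pandas DataFrames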
Code Example #4
File: lib.py  Project: teepee-studios/avalon-sync
def set_asset_data(gazu_project_id, gazu_asset_id, avalon_asset_id):
    # Store the Zou Id and Avalon Id key-value pair of the asset

    # Set the directory where partd stores its data
    base_directory = os.environ["DATA_PATH"]
    data_directory = os.path.join(base_directory, "data")
    directory = os.path.join(data_directory, gazu_project_id)

    # Create the data directory for the project if it doesn't exist.
    if not os.path.exists(directory):
        if not os.path.exists(data_directory):
            os.mkdir(data_directory)
        os.mkdir(directory)

    # Init partd
    p = partd.File(directory)

    # Check if the asset is already stored and delete it if it is.
    # (We're making the assumption that IDs supplied to us are unique).
    if p.get(gazu_asset_id):
        p.delete(gazu_asset_id)
        logger.info("Deleting: {0}".format(gazu_asset_id))

    # Encode and store the data as utf-8 bytes
    value = bytes(str(avalon_asset_id), "utf-8")
    key_values = {gazu_asset_id: value}
    p.append(key_values)
Code Example #5
File: lib.py  Project: teepee-studios/avalon-sync
def get_asset_data(gazu_project_id, gazu_asset_id):
    # Look up the Zou Id and Avalon Id key-value pair of the asset

    # Set the directory where partd stores its data
    base_directory = os.environ["DATA_PATH"]
    directory = os.path.join(base_directory, "data", gazu_project_id)

    # Init partd
    p = partd.File(directory)

    if not p.get(gazu_asset_id):
        return False
    else:
        # Get the Avalon asset ID from partd
        project_data = bytes.decode(p.get(gazu_asset_id), "utf-8")
        return project_data
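
Together, set_asset_data and get_asset_data implement a small file-backed mapping from Zou asset ids to Avalon asset ids. Below is a hedged usage sketch; the DATA_PATH location and the ids are hypothetical.

import os

os.environ["DATA_PATH"] = "/tmp/avalon-sync"             # hypothetical data location
os.makedirs(os.environ["DATA_PATH"], exist_ok=True)

set_asset_data("proj-001", "asset-abc", "avalon-123")    # ids are made up
print(get_asset_data("proj-001", "asset-abc"))           # -> "avalon-123"
print(get_asset_data("proj-001", "missing-id"))          # -> False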
Code Example #6
File: lib.py  Project: teepee-studios/avalon-sync
def get_project_data(project_id):
    # Look up the Zou Id and Avalon Id key-value pair of the project

    # Set the directory where partd stores its data
    directory = os.path.join(os.environ["DATA_PATH"], "data", project_id)

    # Init partd
    p = partd.Pickle(partd.File(directory))

    if not p.get(project_id):
        return False
    else:
        # Get the Avalon project data from partd
        project_info = p.get(project_id)
        project_data = {
            "id": project_info[0],
            "collection": project_info[1]
        }
        return project_data
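
get_project_data indexes into the value it reads back, which implies the writer stored an ordered (Avalon project id, collection name) pair through partd.Pickle. A hedged sketch of what such a writer might look like follows; the name set_project_data and its fields are assumptions, not taken from the project.

import os
import partd


def set_project_data(project_id, avalon_project_id, collection):
    # Hypothetical counterpart to get_project_data: store the Avalon project
    # id and collection name under the project's Zou id.
    directory = os.path.join(os.environ["DATA_PATH"], "data", project_id)
    os.makedirs(directory, exist_ok=True)

    p = partd.Pickle(partd.File(directory))

    # Replace any previously stored value for this project id.
    if p.get(project_id):
        p.delete(project_id)

    p.append({project_id: [avalon_project_id, collection]})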
Code Example #7
def __call__(self, *args, **kwargs):
    import partd
    # Same idea as Code Example #1, but always uses a fresh temporary partd.File
    if self.buffer:
        return partd.PandasBlocks(partd.Buffer(partd.Dict(), partd.File()))
    else:
        return partd.PandasBlocks(partd.File())