Example #1
0
    def __init__(self, filename):
        """
        Construct a GLC unpickler.

        Parameters
        ----------
        filename  : Name of the file to read from. The file can be a GLC pickle
                    file, a cloud pickle file, or a python pickle file. It may
                    also be an S3 or HDFS path, which is downloaded locally
                    first.

        Returns
        ----------
        GLC unpickler.

        Raises
        ----------
        IOError : If the local file does not exist, or a zip archive does not
                  contain the expected 'pickle_file' entry.
        """
        self.gl_object_memo = {}
        self.pickle_filename = None
        self.tmp_file = None
        self.file = None
        self.gl_temp_storage_path = _get_tmp_file_location()

        # GLC 1.3 used Zipfiles for storing the objects.
        self.directory_mode = True

        # Remote inputs are staged into a local temp location first.
        if _file_util.is_s3_path(filename):
            self.tmp_file = _get_temp_filename()
            # GLC 1.3 uses zipfiles (a single S3 key).
            if _file_util._is_valid_s3_key(filename):
                _file_util.download_from_s3(filename, self.tmp_file, \
                        aws_credentials = _get_aws_credentials(), is_dir=False, silent=True)
            # GLC 1.4 uses directories (an S3 prefix).
            else:
                _file_util.download_from_s3(filename, self.tmp_file, \
                        aws_credentials = _get_aws_credentials(), is_dir=True, silent=True)

            filename = self.tmp_file
        elif _file_util.is_hdfs_path(filename):
            self.tmp_file = _get_temp_filename()
            _file_util.download_from_hdfs(filename, self.tmp_file)
            filename = self.tmp_file
        else:
            if not _os.path.exists(filename):
                raise IOError('%s is not a valid file name.' % filename)

        # GLC 1.3 Pickle file: a zip archive whose 'pickle_file' entry names
        # the actual pickle file inside the archive.
        if _zipfile.is_zipfile(filename):
            self.directory_mode = False
            pickle_filename = None

            # Get the pickle file name. Close the archive even when an error
            # is raised part-way through (the original leaked the handle).
            zf = _zipfile.ZipFile(filename, allowZip64=True)
            try:
                for info in zf.infolist():
                    if info.filename == 'pickle_file':
                        pickle_filename = zf.read(info.filename)
                if pickle_filename is None:
                    raise IOError(("Cannot pickle file of the given format. File"
                            " must be one of (a) GLPickler archive, "
                            "(b) Cloudpickle archive, or (c) python pickle archive."))

                # Extract the zip file into the temp storage path.
                try:
                    outpath = self.gl_temp_storage_path
                    zf.extractall(outpath)
                except IOError as err:
                    # Best-effort: report and continue, matching prior behavior.
                    print("Graphlab pickle extraction error: %s " % err)
            finally:
                zf.close()

            self.pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                                 pickle_filename)

        # GLC Pickle directory mode: pickle file plus a version marker.
        elif _os.path.isdir(filename):
            self.directory_mode = True
            pickle_filename = _os.path.join(filename, "pickle_archive")
            if not _os.path.exists(pickle_filename):
                raise IOError("Corrupted archive: Missing pickle file %s." % pickle_filename)
            if not _os.path.exists(_os.path.join(filename, "version")):
                raise IOError("Corrupted archive: Missing version file.")
            self.pickle_filename = pickle_filename
            self.gl_temp_storage_path = _os.path.abspath(filename)

        # Pure pickle file.
        else:
            self.directory_mode = False
            self.pickle_filename = filename

        self.file = open(self.pickle_filename, 'rb')
        _pickle.Unpickler.__init__(self, self.file)
Example #2
0
    def __init__(self, filename):
        """
        Construct a GLC unpickler.

        Parameters
        ----------
        filename  : Name of the file to read from. The file can be a GLC pickle
                    file, a cloud pickle file, or a python pickle file. S3 and
                    HDFS paths are downloaded to a local temp location first.

        Returns
        ----------
        GLC unpickler.

        Raises
        ----------
        IOError : If the local file does not exist, or a zip archive lacks the
                  expected 'pickle_file' entry.
        """
        self.gl_object_memo = {}
        self.pickle_filename = None
        self.tmp_file = None
        self.file = None
        self.gl_temp_storage_path = _get_tmp_file_location()

        # GLC 1.3 used Zipfiles for storing the objects.
        self.directory_mode = True

        if _file_util.is_s3_path(filename):
            self.tmp_file = _get_temp_filename()
            # GLC 1.3 uses zipfiles (single S3 key).
            if _file_util._is_valid_s3_key(filename):
                _file_util.download_from_s3(filename, self.tmp_file, \
                        aws_credentials = _get_aws_credentials(), is_dir=False, silent=True)
            # GLC 1.4 uses directories (S3 prefix).
            else:
                _file_util.download_from_s3(filename, self.tmp_file, \
                        aws_credentials = _get_aws_credentials(), is_dir=True, silent=True)

            filename = self.tmp_file
        elif _file_util.is_hdfs_path(filename):
            self.tmp_file = _get_temp_filename()
            _file_util.download_from_hdfs(filename, self.tmp_file)
            filename = self.tmp_file
        else:
            if not _os.path.exists(filename):
                raise IOError('%s is not a valid file name.' % filename)

        # GLC 1.3 Pickle file: a zip archive whose 'pickle_file' entry names
        # the real pickle file inside the archive.
        if _zipfile.is_zipfile(filename):
            self.directory_mode = False
            pickle_filename = None

            # Get the pickle file name; always close the archive handle
            # (the original code leaked the ZipFile on every path).
            zf = _zipfile.ZipFile(filename, allowZip64=True)
            try:
                for info in zf.infolist():
                    if info.filename == 'pickle_file':
                        pickle_filename = zf.read(info.filename)
                if pickle_filename is None:
                    raise IOError(
                        ("Cannot pickle file of the given format. File"
                         " must be one of (a) GLPickler archive, "
                         "(b) Cloudpickle archive, or (c) python pickle archive."))

                # Extract the zip file.
                try:
                    outpath = self.gl_temp_storage_path
                    zf.extractall(outpath)
                except IOError as err:
                    # Best-effort extraction: report and continue as before.
                    print("Graphlab pickle extraction error: %s " % err)
            finally:
                zf.close()

            self.pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                                 pickle_filename)

        # GLC Pickle directory mode: pickle archive plus version marker.
        elif _os.path.isdir(filename):
            self.directory_mode = True
            pickle_filename = _os.path.join(filename, "pickle_archive")
            if not _os.path.exists(pickle_filename):
                raise IOError("Corrupted archive: Missing pickle file %s." %
                              pickle_filename)
            if not _os.path.exists(_os.path.join(filename, "version")):
                raise IOError("Corrupted archive: Missing version file.")
            self.pickle_filename = pickle_filename
            self.gl_temp_storage_path = _os.path.abspath(filename)

        # Pure pickle file.
        else:
            self.directory_mode = False
            self.pickle_filename = filename

        self.file = open(self.pickle_filename, 'rb')
        _pickle.Unpickler.__init__(self, self.file)
Example #3
0
    def __init__(self, filename, protocol = -1, min_bytes_to_save = 0):
        """
        Construct a GLC pickler.

        Parameters
        ----------
        filename  : Name of the file to write to. This file is all you need to pickle
                    all objects (including GLC objects).

        protocol  : Pickle protocol (see pickle docs). Note that all pickle protocols
                    may not be compatible with GLC objects.

        min_bytes_to_save : Cloud pickle option (see cloud pickle docs).

        Returns
        ----------
        GLC pickler.

        """
        # Archive format history:
        #
        # Zipfile
        # --------
        # Version 1: GLC 1.2.1
        #
        # Directory:
        # ----------
        # Version 1: GLC 1.4: 1

        self.archive_filename = None
        self.gl_temp_storage_path = _get_tmp_file_location()
        self.gl_object_memo = set()
        self.mark_for_delete = set()
        # Ensure the attribute always exists, even if open() below fails
        # (the failure is printed, not raised, so later code may touch it).
        self.file = None

        if _file_util.is_s3_path(filename):
            self.s3_path = filename
            self.hdfs_path = None
        elif _file_util.is_hdfs_path(filename):
            self.s3_path = None
            self.hdfs_path = filename
            self.hadoop_conf_dir = None
        else:
            # Make sure the directory exists.
            filename = _os.path.abspath(filename)
            if not _os.path.exists(filename):
                _os.makedirs(filename)
            elif _os.path.isdir(filename):
                # Re-using an existing archive directory: schedule everything
                # except the archive files themselves for deletion.
                self.mark_for_delete = self._to_abs_path_set(
                             _glob.glob(_os.path.join(filename, "*")))
                self.mark_for_delete -= self._to_abs_path_set(
                        [_os.path.join(filename, 'pickle_archive'),
                         _os.path.join(filename, 'version')])

            elif _os.path.isfile(filename):
                # A plain file is replaced by a directory of the same name.
                _os.remove(filename)
                _os.makedirs(filename)

            # Create a new directory.
            self.gl_temp_storage_path = filename
            self.s3_path = None
            self.hdfs_path = None
            self.hadoop_conf_dir = None

        # The pickle file where all the Python objects are saved.
        relative_pickle_filename = "pickle_archive"
        pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                        relative_pickle_filename)

        try:
            # Initialize the pickle file with cloud _pickle. Note, cloud pickle
            # takes a file handle for initialization.
            self.file = open(pickle_filename, 'wb')
            _cloudpickle.CloudPickler.__init__(self, self.file, protocol)
        except IOError as err:
            # Best-effort: report and continue, matching prior behavior.
            print("GraphLab create pickling error: %s" % err)

        # Write the version number.
        with open(_os.path.join(self.gl_temp_storage_path, 'version'), 'w') as f:
            f.write("1.0")
Example #4
0
    def __init__(self, filename, protocol=-1, min_bytes_to_save=0):
        """
        Construct a GLC pickler.

        Parameters
        ----------
        filename  : Name of the file to write to. This file is all you need to pickle
                    all objects (including GLC objects).

        protocol  : Pickle protocol (see pickle docs). Note that all pickle protocols
                    may not be compatible with GLC objects.

        min_bytes_to_save : Cloud pickle option (see cloud pickle docs).

        Returns
        ----------
        GLC pickler.

        """
        # Archive format history:
        #
        # Zipfile
        # --------
        # Version 1: GLC 1.2.1
        #
        # Directory:
        # ----------
        # Version 1: GLC 1.4: 1

        self.archive_filename = None
        self.gl_temp_storage_path = _get_tmp_file_location()
        self.gl_object_memo = set()
        self.mark_for_delete = set()
        # Ensure the attribute always exists, even if open() below fails
        # (that failure is printed, not raised, so later code may touch it).
        self.file = None

        if _file_util.is_s3_path(filename):
            self.s3_path = filename
            self.hdfs_path = None
        elif _file_util.is_hdfs_path(filename):
            self.s3_path = None
            self.hdfs_path = filename
            self.hadoop_conf_dir = None
        else:
            # Make sure the directory exists.
            filename = _os.path.abspath(filename)
            if not _os.path.exists(filename):
                _os.makedirs(filename)
            elif _os.path.isdir(filename):
                # Re-using an existing archive directory: everything except
                # the archive files themselves is scheduled for deletion.
                self.mark_for_delete = self._to_abs_path_set(
                    _glob.glob(_os.path.join(filename, "*")))
                self.mark_for_delete -= self._to_abs_path_set([
                    _os.path.join(filename, 'pickle_archive'),
                    _os.path.join(filename, 'version')
                ])

            elif _os.path.isfile(filename):
                # A plain file is replaced by a directory of the same name.
                _os.remove(filename)
                _os.makedirs(filename)

            # Create a new directory.
            self.gl_temp_storage_path = filename
            self.s3_path = None
            self.hdfs_path = None
            self.hadoop_conf_dir = None

        # The pickle file where all the Python objects are saved.
        relative_pickle_filename = "pickle_archive"
        pickle_filename = _os.path.join(self.gl_temp_storage_path,
                                        relative_pickle_filename)

        try:
            # Initialize the pickle file with cloud _pickle. Note, cloud pickle
            # takes a file handle for initialization.
            self.file = open(pickle_filename, 'wb')
            _cloudpickle.CloudPickler.__init__(self, self.file, protocol)
        except IOError as err:
            # Best-effort: report and continue, matching prior behavior.
            print("GraphLab create pickling error: %s" % err)

        # Write the version number.
        with open(_os.path.join(self.gl_temp_storage_path, 'version'),
                  'w') as f:
            f.write("1.0")