Code Example #1
File: classify.py  Project: Atigeo/xpatterns-xframe
    def save(self, path):
        """
        Save a model.

        The model can be saved, then reloaded later to make predictions.

        Parameters
        ----------
        path : str
            The path where the model will be saved.
            This refers to a directory, which is created by this method.
            Two items are stored under it: the underlying model and the
            metadata (the model type and the feature column names).
        """
        sc = CommonSparkContext.Instance().sc
        delete_file_or_dir(path)
        os.makedirs(path)
        model_path, metadata_path = self._file_paths(path)
        # save model
        self.model.save(sc, model_path)
        # save metadata
        model_type = self.__class__.__name__
        metadata = [model_type, self.feature_cols]
        with fileio.open_file(metadata_path, 'w') as f:
            # TODO detect filesystem errors
            pickle.dump(metadata, f)
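
A minimal usage sketch of the layout this produces (the variable name and path are hypothetical; the classifier is assumed to be already trained):

# Hypothetical usage: 'classifier' is a trained instance of the class above.
classifier.save('hdfs://namenode/models/my_classifier')
# The directory now holds the saved Spark model plus a pickled
# [model_type, feature_cols] metadata list.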
Code Example #2
    def save_as_csv(self, path, **params):
        """
        Saves the RDD to file as text.
        """
        self._entry(path=path)

        # noinspection PyShadowingNames
        def to_csv(row, **params):
            # Format one element as a single CSV line; the formatting
            # options in params are interpreted by csv.writer.
            sio = StringIO.StringIO()
            writer = csv.writer(sio, **params)
            try:
                writer.writerow([row])
                return sio.getvalue()
            except IOError:
                return ''

        fileio.delete(path)
        with fileio.open_file(path, 'w') as f:
            self.begin_iterator()
            elems_at_a_time = 10000
            ret = self.iterator_get_next(elems_at_a_time)
            while True:
                for row in ret:
                    line = to_csv(row, **params)
                    f.write(line)
                # A full batch means there may be more rows; a short batch
                # means the iterator is exhausted.
                if len(ret) == elems_at_a_time:
                    ret = self.iterator_get_next(elems_at_a_time)
                else:
                    break
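
Since `**params` is forwarded to `csv.writer`, the standard `csv` module formatting options apply. A usage sketch (the array variable `xa` and the path are hypothetical):

import csv

# Hypothetical usage: one element per line, pipe-delimited, all fields quoted.
# delimiter and quoting are passed straight through to csv.writer.
xa.save_as_csv('/tmp/values.csv', delimiter='|', quoting=csv.QUOTE_ALL)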
Code Example #3
File: testhdfs.py  Project: Atigeo/xpatterns-xframe
    def test_save_format(self):
        t = XArray([1, 2, 3])
        path = '{}/tmp/array-csv'.format(hdfs_prefix)
        t.save(path, format='csv')
        with fileio.open_file(path) as f:
            self.assertEqual('1', f.readline().strip())
            self.assertEqual('2', f.readline().strip())
            self.assertEqual('3', f.readline().strip())
        fileio.delete(path)
Code Example #4
File: testhdfs.py  Project: Atigeo/xpatterns-xframe
    def test_save(self):
        t = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
        path = '{}/tmp/frame'.format(hdfs_prefix)
        t.save(path, format='binary')
        with fileio.open_file(os.path.join(path, '_metadata')) as f:
            metadata = pickle.load(f)
        self.assertListEqual([['id', 'val'], [int, str]], metadata)
        # TODO find some way to check the data
        fileio.delete(path)
Code Example #5
File: testhdfs.py  Project: Atigeo/xpatterns-xframe
    def test_save(self):
        t = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
        path = '{}/tmp/frame-csv'.format(hdfs_prefix)
        t.save(path, format='csv')

        with fileio.open_file(path + '.csv') as f:
            heading = f.readline().rstrip()
            self.assertEqual('id,val', heading)
            self.assertEqual('30,a', f.readline().rstrip())
            self.assertEqual('20,b', f.readline().rstrip())
            self.assertEqual('10,c', f.readline().rstrip())
        fileio.delete(path + '.csv')
Code Example #6
File: lineage.py  Project: Atigeo/xpatterns-xframe
    def save(self, path):
        """
        Save lineage to a file.

        Parameters
        ----------
        path : str
            The path to the lineage file.
        """
        assert isinstance(path, basestring)
        with fileio.open_file(path, 'w') as f:
            # TODO detect filesystem errors
            lineage_fields = [self.table_lineage, self.column_lineage]
            pickle.dump(lineage_fields, f)
Code Example #7
File: lineage.py  Project: Atigeo/xpatterns-xframe
    @staticmethod
    def load(path):
        """
        Load lineage from a file.

        Parameters
        ----------
        path : str
            The path to the lineage file.

        Returns
        -------
        out : Lineage
        """
        assert isinstance(path, basestring)
        with fileio.open_file(path) as f:
            table_lineage, column_lineage = pickle.load(f)
        return Lineage(table_lineage=table_lineage, column_lineage=column_lineage)
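
Code Examples #6 and #7 form a round trip. A minimal sketch, assuming an existing `Lineage` instance named `lineage` (the path is hypothetical):

# Save the lineage, then reload it; the restored object carries the
# same table and column lineage fields.
lineage.save('/tmp/frame_lineage')
restored = Lineage.load('/tmp/frame_lineage')
assert restored.table_lineage == lineage.table_lineage
assert restored.column_lineage == lineage.column_lineage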
Code Example #8
    def save_as_text(self, path):
        """
        Saves the RDD to file as text.
        """
        self._entry(path=path)
        fileio.delete(path)
        try:
            self._rdd.saveAsTextFile(path)
        except Exception:
            # TODO distinguish between filesystem errors and pickle errors
            raise TypeError('The XArray save failed.')
        metadata = self.elem_type
        metadata_path = os.path.join(path, '_metadata')
        with fileio.open_file(metadata_path, 'w') as f:
            # TODO detect filesystem errors
            pickle.dump(metadata, f)

        lineage_path = os.path.join(path, '_lineage')
        self.lineage.save(lineage_path)
Code Example #9
    def save(self, path):
        """
        Saves the RDD to file in pickled form.
        """
        self._entry(path=path)
        # this only works for local files
        fileio.delete(path)
        try:
            self._rdd.saveAsPickleFile(path)          # action ?
        except Exception:
            # TODO distinguish between filesystem errors and pickle errors
            raise TypeError('The XArray save failed.')
        metadata = self.elem_type
        metadata_path = os.path.join(path, '_metadata')
        with fileio.open_file(metadata_path, 'w') as f:
            # TODO detect filesystem errors
            pickle.dump(metadata, f)

        lineage_path = os.path.join(path, '_lineage')
        self.lineage.save(lineage_path)
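
Both save methods above leave the same sidecar layout behind: the RDD data under `path`, the element type pickled in `path/_metadata`, and the lineage in `path/_lineage`. A sketch of inspecting the metadata directly, assuming a local save path:

import os
import pickle

path = '/tmp/saved-array'   # hypothetical local save path
with open(os.path.join(path, '_metadata'), 'rb') as f:
    elem_type = pickle.load(f)   # e.g. int, float, or str
print(elem_type)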
Code Example #10
    def load_autodetect(cls, path, dtype):
        """
        Load from the given path.

        The path can be anything Spark can read: a local file or an HDFS file.
        It can also be a directory, in which case Spark reads and concatenates
        all the files in it.
        """
        cls._entry(path=path, dtype=dtype)
        # If the path is a directory, read it as a pickle file and take the
        # element type from the saved '_metadata'.
        # If the path is a file, read it as text and use type inference on
        # the first 100 lines to determine the element type (int, float,
        # datetime, or str).  The passed-in dtype is always str and is ignored.
        lineage = Lineage.init_array_lineage(path)
        sc = CommonSparkContext.spark_context()
        if os.path.isdir(path):
            res = XRdd(sc.pickleFile(path))
            metadata_path = os.path.join(path, '_metadata')
            with fileio.open_file(metadata_path) as f:
                dtype = pickle.load(f)
            lineage_path = os.path.join(path, '_lineage')
            if fileio.exists(lineage_path):
                lineage = Lineage.load(lineage_path)
        else:
            res = XRdd(sc.textFile(path, use_unicode=False))
            dtype = infer_type(res)

        # Cast the text elements to the inferred type where necessary.
        if dtype != str:
            if dtype in (list, dict):
                res = res.map(ast.literal_eval)
            elif dtype is datetime.datetime:
                res = res.map(date_parser.parse)
            else:
                res = res.map(lambda x: dtype(x))
        return cls(res, dtype, lineage)
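
`infer_type` itself is not shown here; per the comments above, it samples the leading lines and picks the narrowest of int, float, datetime, or str. A minimal sketch of that idea, under the assumption that it operates on a list of sampled text lines (not the library's actual implementation):

import datetime
import dateutil.parser as date_parser

def infer_type_from_sample(sample):
    # Try the candidate casts from most to least specific; a cast that
    # fails on any sampled line disqualifies that type.
    def castable(cast):
        try:
            for line in sample:
                cast(line)
            return True
        except (ValueError, TypeError, OverflowError):
            return False
    if castable(int):
        return int
    if castable(float):
        return float
    if castable(date_parser.parse):
        return datetime.datetime
    return str

# e.g. infer_type_from_sample(rdd.take(100)) -> int for ['1', '2', '3']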