def save(self, path):
    """
    Save a model.

    The model can be saved, then reloaded later to provide recommendations.

    Parameters
    ----------
    path : str
        The path where the model will be saved.
        This should refer to a file, not to a directory.
        Two items are stored under this path: the underlying model
        parameters and the metadata (the model class name and the
        feature column names).  The exact file names come from
        `self._file_paths`.
        NOTE(review): an earlier version of this docstring claimed three
        items ('.model', '.ratings', '.metadata') were stored; the code
        below writes only the model and the metadata — the ratings claim
        appears to be a copy-paste leftover.
    """
    sc = CommonSparkContext.Instance().sc
    # Start from a clean directory: remove anything already at `path`,
    # then recreate it for the two output items.
    delete_file_or_dir(path)
    os.makedirs(path)
    model_path, metadata_path = self._file_paths(path)
    # save model (delegates to the underlying Spark model's own saver)
    self.model.save(sc, model_path)
    # save metadata: the concrete class name (used on reload to rebuild
    # the right wrapper type) and the feature column names
    model_type = self.__class__.__name__
    metadata = [model_type, self.feature_cols]
    with fileio.open_file(metadata_path, 'w') as f:
        # TODO detect filesystem errors
        pickle.dump(metadata, f)
def save_as_csv(self, path, **params):
    """
    Saves the RDD to file in csv format.

    Parameters
    ----------
    path : str
        The path of the output file.
    **params
        Csv formatting options, passed through to `csv.writer`
        (e.g. `delimiter`, `quoting`).
    """
    self._entry(path=path)

    # noinspection PyShadowingNames
    def to_csv(row, **params):
        # Render one element as a single csv line.  The element is
        # wrapped in a list so it becomes a one-field csv record.
        sio = StringIO.StringIO()
        writer = csv.writer(sio, **params)
        try:
            # BUG FIX: `**params` was also being passed to writerow(),
            # but csv's writerow accepts only the row; the formatting
            # options belong to csv.writer (applied above) and would
            # raise TypeError here for any non-empty params.
            writer.writerow([row])
            return sio.getvalue()
        except IOError:
            # Deliberate best effort: a row that cannot be written is
            # silently dropped, preserving the original behavior.
            return ''

    fileio.delete(path)
    with fileio.open_file(path, 'w') as f:
        # Pull elements from the iterator in fixed-size batches; a
        # short batch signals the end of the data.
        self.begin_iterator()
        elems_at_a_time = 10000
        ret = self.iterator_get_next(elems_at_a_time)
        while True:
            for row in ret:
                f.write(to_csv(row, **params))
            if len(ret) == elems_at_a_time:
                ret = self.iterator_get_next(elems_at_a_time)
            else:
                break
def test_save_format(self):
    # Save a small XArray as csv and verify the file contents line by line.
    arr = XArray([1, 2, 3])
    path = '{}/tmp/array-csv'.format(hdfs_prefix)
    arr.save(path, format='csv')
    with fileio.open_file(path) as f:
        for expected in ('1', '2', '3'):
            self.assertEqual(expected, f.readline().strip())
    fileio.delete(path)
def test_save(self):
    # Save an XFrame in binary format and verify the '_metadata' file
    # records the column names and types.
    frame = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
    target = '{}/tmp/frame'.format(hdfs_prefix)
    frame.save(target, format='binary')
    metadata_file = os.path.join(target, '_metadata')
    with fileio.open_file(metadata_file) as f:
        saved_metadata = pickle.load(f)
    self.assertListEqual([['id', 'val'], [int, str]], saved_metadata)
    # TODO find some way to check the data
    fileio.delete(target)
def test_save(self):
    # Save an XFrame as csv and verify the header plus every data row.
    frame = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
    csv_path = '{}/tmp/frame-csv'.format(hdfs_prefix)
    frame.save(csv_path, format='csv')
    with fileio.open_file(csv_path + '.csv') as f:
        for expected in ('id,val', '30,a', '20,b', '10,c'):
            self.assertEqual(expected, f.readline().rstrip())
    fileio.delete(csv_path + '.csv')
def save(self, path):
    """
    Save lineage to a file.

    Parameters
    ----------
    path : str
        The path to the lineage file.

    Raises
    ------
    TypeError
        If `path` is not a string.
    """
    # Validate explicitly: `assert` is stripped when python runs with -O,
    # so it must not be used for input validation.
    if not isinstance(path, basestring):
        raise TypeError('Path must be a string: {!r}.'.format(path))
    with fileio.open_file(path, 'w') as f:
        # TODO detect filesystem errors
        lineage_fields = [self.table_lineage, self.column_lineage]
        pickle.dump(lineage_fields, f)
def load(path):
    """
    Load lineage from a file.

    Parameters
    ----------
    path : str
        The path to the lineage file.

    Returns
    -------
    out : Lineage

    Raises
    ------
    TypeError
        If `path` is not a string.
    """
    # Validate explicitly: `assert` is stripped when python runs with -O,
    # so it must not be used for input validation.
    if not isinstance(path, basestring):
        raise TypeError('Path must be a string: {!r}.'.format(path))
    with fileio.open_file(path) as f:
        table_lineage, column_lineage = pickle.load(f)
    return Lineage(table_lineage=table_lineage, column_lineage=column_lineage)
def save_as_text(self, path):
    """
    Saves the RDD to file as text.

    Writes the data with Spark's `saveAsTextFile`, then stores the
    element type in a '_metadata' file and the lineage in a '_lineage'
    file alongside it.

    Parameters
    ----------
    path : str
        The output path.

    Raises
    ------
    TypeError
        If the underlying save fails.
    """
    self._entry(path=path)
    fileio.delete(path)
    try:
        self._rdd.saveAsTextFile(path)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed; the raised type is kept
        # as TypeError for backward compatibility with callers.
        # TODO distinguish between filesystem errors and pickle errors
        raise TypeError('The XArray save failed.')
    metadata = self.elem_type
    metadata_path = os.path.join(path, '_metadata')
    with fileio.open_file(metadata_path, 'w') as f:
        # TODO detect filesystem errors
        pickle.dump(metadata, f)
    lineage_path = os.path.join(path, '_lineage')
    self.lineage.save(lineage_path)
def save(self, path):
    """
    Saves the RDD to file in pickled form.

    Writes the data with Spark's `saveAsPickleFile`, then stores the
    element type in a '_metadata' file and the lineage in a '_lineage'
    file alongside it.

    Parameters
    ----------
    path : str
        The output path.

    Raises
    ------
    TypeError
        If the underlying save fails.
    """
    self._entry(path=path)
    # this only works for local files
    fileio.delete(path)
    try:
        self._rdd.saveAsPickleFile(path)  # action ?
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed; the raised type is kept
        # as TypeError for backward compatibility with callers.
        # TODO distinguish between filesystem errors and pickle errors
        raise TypeError('The XArray save failed.')
    metadata = self.elem_type
    metadata_path = os.path.join(path, '_metadata')
    with fileio.open_file(metadata_path, 'w') as f:
        # TODO detect filesystem errors
        pickle.dump(metadata, f)
    lineage_path = os.path.join(path, '_lineage')
    self.lineage.save(lineage_path)
def load_autodetect(cls, path, dtype): """ Load from the given path. This can be anything that spark will read from: local file or HDFS file. It can also be a directory, and spark will read and concatenate them all. """ # Read the file as string # Examine the first 100 lines, and cast if necessary to int, float, or datetime cls._entry(path=path, dtype=dtype) # If the path is a directory, then look for sarray-data file in the directory. # If the path is a file, look for that file # Use type inference to determine the element type. # Passed-in dtype is always str and is ignored. lineage = Lineage.init_array_lineage(path) sc = CommonSparkContext.spark_context() if os.path.isdir(path): res = XRdd(sc.pickleFile(path)) metadata_path = os.path.join(path, '_metadata') with fileio.open_file(metadata_path) as f: dtype = pickle.load(f) lineage_path = os.path.join(path, '_lineage') if fileio.exists(lineage_path): lineage = Lineage.load(lineage_path) else: res = XRdd(sc.textFile(path, use_unicode=False)) dtype = infer_type(res) if dtype != str: if dtype in (list, dict): res = res.map(lambda x: ast.literal_eval(x)) elif dtype is datetime.datetime: res = res.map(lambda x: date_parser.parse(x)) else: res = res.map(lambda x: dtype(x)) return cls(res, dtype, lineage)