def create_dataframe(self, data=None, df_format="json", csv_dates=None, index_col=None):
    """
    Create a Pandas dataframe from an external source.

    Parameters
    ----------
    data : pandas.DataFrame, dict, str or list
        DataFrame : returned unchanged.
        dict : wrapped as a single-row dataframe.
        str : filename; copied to the work folder and converted via
        ``_convert_dataframe``.
        list : converted to a 1-row numpy array (NOT a dataframe —
        preserved as-is since callers appear to rely on it).
    df_format : str
        format of the file when `data` is a filename: "json" or "csv"
    csv_dates : list, optional
        passed through to ``_convert_dataframe`` (csv date columns)
    index_col : int or str, optional
        passed through to ``_convert_dataframe`` (csv index column)

    Returns
    -------
    pandas.DataFrame, numpy.ndarray, or None when `data` is None.

    Raises
    ------
    ValueError
        if `data` is of an unsupported type.
    """
    if data is not None:
        if isinstance(data, pd.DataFrame):
            return data
        elif isinstance(data, dict):
            return pd.DataFrame([data])
        elif isinstance(data, str):
            # bug fix: `basestring` does not exist in Python 3 and raised
            # NameError whenever a filename was passed; `str` is correct.
            local_file = self.work_folder + "/data"
            futil = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
            futil.copy(data, local_file)
            return self._convert_dataframe(local_file, df_format, csv_dates, index_col)
        elif isinstance(data, list):
            # NOTE(review): returns a (1, n) numpy row vector, not a
            # dataframe — kept unchanged for backward compatibility.
            return np.array(data).reshape(1, -1)
        else:
            raise ValueError("unknown argument type for data")
def __init__(self, input_folders=None, output_folder=None, models_folder=None,
             local_models_folder="./models", local_data_folder="./data",
             aws_key=None, aws_secret=None, data_type='json'):
    """
    Initialise pipeline state, local work folders and logging.

    Parameters
    ----------
    input_folders : list of str, optional
        external folders to load data from (defaults to an empty list)
    output_folder : str, optional
        external folder to write results to
    models_folder : str, optional
        external folder holding models
    local_models_folder : str
        local folder for models (created if missing)
    local_data_folder : str
        local folder for data (created if missing)
    aws_key, aws_secret : str, optional
        AWS credentials, stored for the file-transfer helpers
    data_type : str
        default dataset format ("json" or "csv")
    """
    self.pipeline = []
    self.models_folder = models_folder
    # bug fix: the original used a mutable default argument ([]), which
    # would be shared across all instances; build a fresh list instead.
    self.input_folders = [] if input_folders is None else input_folders
    self.output_folder = output_folder
    self.local_models_folder = local_models_folder
    self.local_data_folder = local_data_folder
    if not os.path.exists(self.local_models_folder):
        os.makedirs(self.local_models_folder)
    if not os.path.exists(self.local_data_folder):
        os.makedirs(self.local_data_folder)
    # bug fix: sibling methods (save_pipeline, load_extension, ...) read
    # self.aws_key / self.aws_secret, but the original never stored them,
    # causing AttributeError on first use.
    self.aws_key = aws_key
    self.aws_secret = aws_secret
    self.fu = fu.FileUtil(key=aws_key, secret=aws_secret)
    self.logger = logging.getLogger('seldon')
    self.logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    self.logger.addHandler(ch)
    self.current_dataset = self.local_data_folder + "/current"
    self.next_dataset = self.local_data_folder + "/next"
    self.data_type = data_type
def save_dataframe(self, df, location, df_format="json", csv_index=True):
    """Save dataframe to an external location.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to save
    location : str
        external filesystem location to save to
    df_format : str
        format to use : json or csv
    csv_index : bool
        whether to save index when outputting to csv
    """
    self.create_work_folder()
    tmp_file = self.work_folder + "/df_tmp"
    if df_format == 'csv':
        logger.info("saving dataframe as csv")
        df.to_csv(tmp_file, index=csv_index)
    else:
        logger.info("saving dataframe as json")
        # one JSON object per line; NaN-valued columns dropped per row
        with open(tmp_file, "w") as f:
            for i in range(df.shape[0]):
                # bug fix: DataFrame.irow was removed from pandas;
                # iloc is the supported positional row accessor.
                row = df.iloc[i].dropna()
                f.write(json.dumps(row.to_dict(), sort_keys=True) + "\n")
    futil = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    futil.copy(tmp_file, location)
def stream_and_upload(self, folder):
    """Stream rows from `folder`, upload them in batches, then swap tables.

    Truncates the staging table, streams every file in `folder` through
    ``self.upload``, flushes any inserts left in the final partial batch,
    renames the staging table into place and commits the transaction.
    """
    self.truncate_table()
    file_util = fu.FileUtil()
    file_util.stream(folder, self.upload)
    # flush whatever remains in the last (partial) batch before the swap
    remaining = self.inserts
    if len(remaining) > 0:
        logger.info("Running final batch with rows inserted %d", self.rows)
        self.reallyDoInserts(remaining)
    self.rename_table()
    self.db.commit()
def save_extension(self, extension, location):
    """Persist an extension object to an external location.

    The extension is pickled with joblib into a uniquely-named temporary
    folder, given a chance to save extra state via ``extension.save``,
    and the whole folder is then copied to `location`.
    """
    self.create_work_folder()
    suffix = str(random.randint(1, 999999))
    ext_folder = "%s/extension_tmp%s" % (self.work_folder, suffix)
    if not os.path.exists(ext_folder):
        logger.info("creating folder %s", ext_folder)
        os.makedirs(ext_folder)
    joblib.dump(extension, ext_folder + "/ext")
    extension.save(ext_folder)
    copier = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    copier.copy(ext_folder, location)
def load_extension(self, extension_folder):
    """Fetch an extension from an external folder and deserialise it.

    Copies `extension_folder` into a uniquely-named local temp folder,
    unpickles the ``ext`` file with joblib, then lets the extension load
    any extra state via ``extension.load``. Returns the extension.
    """
    self.create_work_folder()
    suffix = str(random.randint(1, 999999))
    local_folder = "%s/extension_tmp%s" % (self.work_folder, suffix)
    if not os.path.exists(local_folder):
        logger.info("creating folder %s", local_folder)
        os.makedirs(local_folder)
    copier = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    copier.copy(extension_folder, local_folder)
    ext = joblib.load(local_folder + "/ext")
    ext.load(local_folder)
    return ext
def _copy_features_locally(self, locations, local_file, df_format):
    """Stream feature files from `locations` into a single local file.

    Each remote chunk is appended by the ``_save_features_local``
    callback. For non-csv formats the output is wrapped in ``[`` ... ``]``
    so the result forms one JSON array.
    """
    self.df_format = df_format
    self.create_work_folder()
    logger.info("streaming features %s to %s", locations, local_file)
    logger.info("input type is %s", self.df_format)
    self.lines_read = 0
    self.active_file = open(local_file, "w")
    if self.df_format != 'csv':
        self.active_file.write("[")
    streamer = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    streamer.stream_multi(locations, self._save_features_local)
    if self.df_format != 'csv':
        self.active_file.write("]")
    self.active_file.close()
    logger.info("finished stream of features")
def load_pipeline(self, pipeline_folder):
    """
    Load scikit learn pipeline from external folder

    Parameters
    ----------
    pipeline_folder : str
        external folder holding pipeline

    Returns
    -------
    the unpickled pipeline object
    """
    self.create_work_folder()
    local_folder = self.work_folder + "/pipeline"
    if not os.path.exists(local_folder):
        logger.info("creating folder %s", local_folder)
        os.makedirs(local_folder)
    copier = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    copier.copy(pipeline_folder, local_folder)
    return joblib.load(local_folder + "/p")
def save_pipeline(self, pipeline, location):
    """
    Save scikit learn pipeline to external location

    Parameters
    ----------
    pipeline : sklearn pipeline
        pipeline to be saved
    location : str
        external folder to save pipeline
    """
    self.create_work_folder()
    local_folder = self.work_folder + "/pipeline"
    if not os.path.exists(local_folder):
        logger.info("creating folder %s", local_folder)
        os.makedirs(local_folder)
    # pickle into the temp folder, then push the whole folder out
    joblib.dump(pipeline, local_folder + "/p")
    copier = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    copier.copy(local_folder, location)
def load_recommender(self, recommender_folder):
    """
    Load scikit learn recommender from external folder

    Parameters
    ----------
    recommender_folder : str
        external folder holding recommender

    Returns
    -------
    the unpickled recommender, after its ``load`` hook has run
    """
    self.create_work_folder()
    suffix = str(random.randint(1, 999999))
    local_folder = "%s/recommender_tmp%s" % (self.work_folder, suffix)
    if not os.path.exists(local_folder):
        logger.info("creating folder %s", local_folder)
        os.makedirs(local_folder)
    copier = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    copier.copy(recommender_folder, local_folder)
    rec = joblib.load(local_folder + "/rec")
    rec.load(local_folder)
    return rec
def save_recommender(self, recommender, location):
    """
    Save recommender to external location

    Parameters
    ----------
    recommender : Recommender
        recommender to be saved
    location : str
        external folder to save recommender
    """
    self.create_work_folder()
    suffix = str(random.randint(1, 999999))
    rec_folder = "%s/recommender_tmp%s" % (self.work_folder, suffix)
    if not os.path.exists(rec_folder):
        logger.info("creating folder %s", rec_folder)
        os.makedirs(rec_folder)
    # pickle the object, let it save any extra state, then push the folder
    joblib.dump(recommender, rec_folder + "/rec")
    recommender.save(rec_folder)
    copier = fu.FileUtil(aws_key=self.aws_key, aws_secret=self.aws_secret)
    copier.copy(rec_folder, location)