def _create_flow(self, flow_name, *tasks, deps):
    flow = Flow(flow_name, tasks, deps)
    flow_repodir = os.path.join(self.context.workdir, "flows")
    flow_workdir = os.path.join(flow_repodir, flow_name)
    os.makedirs(flow_workdir, exist_ok=True)

    # generate one Azkaban .job file per task
    for task in tasks:
        job_file = os.path.join(flow_workdir, task + ".job")
        with open(job_file, 'w') as f:
            f.write("type=command\n")
            if task in deps and len(deps[task]) > 0:
                f.write("dependencies=" + ','.join(deps[task]) + "\n")
            f.write("command=" + self.cmd.format(task=task))

    # if the flow has multiple root tasks, add a terminal job to join them
    if len(flow.forest) > 1:
        job_file = os.path.join(flow_workdir, flow_name + ".job")
        with open(job_file, 'w') as f:
            f.write("type=command\n")
            f.write("dependencies=" + ','.join(flow.forest) + "\n")
            f.write("command=echo flow done\n")
            f.write("failure.emails=" + self.notify_mails)
    logger.debug("Job files generation succeeded")

    import zipfile

    def zipdir(path, ziph):
        # ziph is a zipfile handle; archive entries relative to `path`
        # so the .job files sit at the root of the zip rather than under
        # the full workdir path
        for root, dirs, files in os.walk(path):
            for file in files:
                abspath = os.path.join(root, file)
                ziph.write(abspath, arcname=os.path.relpath(abspath, path))

    job_zip = os.path.join(flow_repodir, flow_name + ".zip")
    with zipfile.ZipFile(job_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipdir(flow_workdir, zipf)
    logger.debug("Job files zipped into {}".format(job_zip))

    with open(job_zip, 'rb') as zip_fp:
        files = {
            'file': (flow_name + '.zip', zip_fp, 'application/zip', {'Expires': '0'})
        }
        self._call_api('manager', 'upload', require_login=True, method='POST',
                       attachment=files, project=self.project)

    import shutil
    shutil.rmtree(flow_workdir, ignore_errors=True)
    os.remove(job_zip)
    logger.info("Azkaban flow {} updated, you can check it at {}".format(
        flow_name, self.host + "/manager?project=" + self.project + "&flow=" + flow_name))
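# A minimal usage sketch (the `dashboard` instance, task names, and deps below
# are illustrative, not part of the module). Creating a flow whose `report`
# task depends on two loader tasks would look like:
#
#     dashboard._create_flow('daily_etl', 'load_users', 'load_orders', 'report',
#                            deps={'report': ['load_users', 'load_orders']})
#
# which would generate a report.job along the lines of:
#
#     type=command
#     dependencies=load_users,load_orders
#     command=<self.cmd rendered with task=report>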
def load_query(self, query, db, **kwargs):
    conn = self.open(db)
    df = pd.read_sql_query(query, con=conn)
    logger.info("before memory: " + str(df.memory_usage(deep=True).sum() / 1024 ** 2) + " MB")

    # Optimize memory usage:
    # 1. downcast numeric columns to smaller subtypes
    #    (columns containing negative values stay int64, since they cannot
    #    be represented as unsigned integers)
    df_int = df.select_dtypes(include=['int64'])
    convert_int = df_int.apply(pd.to_numeric, downcast='unsigned')
    for col in df_int.columns:
        df[col] = convert_int[col]

    # 2. store low-cardinality object columns as the category dtype
    df_obj = df.select_dtypes(include=['object'])
    if len(df_obj.columns) > 0:
        for col in df_obj.columns:
            num_unique_values = len(df_obj[col].unique())
            num_total_values = len(df_obj[col])
            if num_total_values > 0 and num_unique_values / num_total_values < 0.5:
                df[col] = df[col].astype('category')

    logger.info("after memory: " + str(df.memory_usage(deep=True).sum() / 1024 ** 2) + " MB")
    return df
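# A minimal usage sketch (assuming `conn` is an instance of this connection
# class and "stats" is a configured database alias; both names are
# illustrative). The returned frame comes back with numeric columns downcast
# and low-cardinality string columns stored as categories:
#
#     df = conn.load_query("SELECT user_id, country FROM events", db="stats")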
def store(self, df, table, db, **kwargs):
    assert isinstance(df, pd.DataFrame), "Invalid data type"
    if_exists = kwargs.get('if_exists', 'fail')
    chunksize = kwargs.get('chunksize', 10000)
    pkey = kwargs.get('pkey', None)
    indexes = kwargs.get('indexes', [])
    checkpoint_column = kwargs.get('checkpoint_column', None)
    checkpoint = kwargs.get('checkpoint')
    last_checkpoint = kwargs.get('last_checkpoint')

    _conn = self.open(db)
    try:
        if if_exists == 'append' or if_exists == 'update':
            target_table = Table(table, MetaData(), autoload=True, autoload_with=_conn)
            assert checkpoint_column is not None, "checkpoint_column is required in update mode!"
            assert (isinstance(checkpoint_column, tuple) and len(checkpoint_column) == 2) \
                or isinstance(checkpoint_column, str), "checkpoint_column can only be str or 2-tuple!"
            if isinstance(checkpoint_column, tuple):
                (create_time_column, update_time_column) = checkpoint_column
            else:
                create_time_column = checkpoint_column
                update_time_column = checkpoint_column

            # delete extra records over last checkpoint in append/update mode
            clear_ins = target_table.delete().where(
                Column(update_time_column) >= last_checkpoint)
            _conn.execute(clear_ins)

            if if_exists == 'update':
                assert pkey is not None, "primary key is required in update mode!"
                assert isinstance(pkey, str), "update mode only supports a single primary key"
                update_df = df[df[create_time_column] < last_checkpoint]
                if not update_df.empty:
                    logger.info(table + ": found {} records to update".format(len(update_df)))
                    update_keys = list(update_df[pkey])
                    delete_ins = target_table.delete().where(Column(pkey).in_(update_keys))
                    _conn.execute(delete_ins)
                # fall through to plain append after reconciling updates
                if_exists = 'append'
    except NoSuchTableError:
        if_exists = 'replace'

    schema = None
    if table.find('.') >= 0:
        toks = table.split('.', 1)
        schema = toks[0]
        table = toks[1]

    float_columns = list(df.select_dtypes(include=['float64', 'float']).keys())
    if len(float_columns) > 0:
        logger.warning(
            table + ": Detected columns with float types {}, better check whether this is "
            "caused by the NaN-integer column issue of pandas!".format(list(float_columns)))

    typehints = dict()
    obj_columns = list(df.select_dtypes(include=['object']).keys())
    if len(obj_columns) > 0:
        logger.warning(
            table + ": Detected columns with object types {}, which are automatically converted "
            "to *VARCHAR(256)*; you can override this by specifying type hints!".format(
                list(obj_columns)))
        import sqlalchemy.types as sqltypes
        typehints.update(dict((k, sqltypes.VARCHAR(256)) for k in obj_columns))

    # merge user-specified type hints over the defaults
    _typehints = kwargs.get('typehints', {})
    from parade.type import stdtype_to_sqltype
    for col, stdtype in _typehints.items():
        logger.info(table + ": Column [{}] is set to type [{}]".format(col, str(stdtype)))
        typehints[col] = stdtype_to_sqltype(stdtype)

    def _chunks(_df, _chunksize):
        """Yield successive chunksize-sized chunks of the dataframe."""
        for i in range(0, len(_df), _chunksize):
            yield _df[i:i + _chunksize]

    # still write to the database for an empty dataframe to create the table
    if df.empty:
        df.to_sql(name=table, con=_conn, index=False, schema=schema,
                  if_exists=if_exists, dtype=typehints)
        logger.warning(table + ": Write to {}: empty dataframe".format(table))
    else:
        for idx, chunk in enumerate(_chunks(df, chunksize)):
            if_exists_ = 'append' if idx > 0 else if_exists
            chunk.to_sql(name=table, con=_conn, index=False, schema=schema,
                         if_exists=if_exists_, dtype=typehints)
            logger.info(table + ": Write to {}: rows #{}-#{}".format(
                table, idx * chunksize, (idx + 1) * chunksize))

    if if_exists == 'replace':
        if pkey:
            pkeys = pkey if isinstance(pkey, str) else ','.join(pkey)
            _conn.execute('ALTER TABLE {} ADD PRIMARY KEY ({})'.format(table, pkeys))
        for index in indexes:
            index_str = index if isinstance(index, str) else ','.join(index)
            index_name = index if isinstance(index, str) else '_'.join(index)
            _conn.execute('ALTER TABLE {} ADD INDEX idx_{} ({})'.format(
                table, index_name, index_str))
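# A minimal usage sketch (hypothetical `conn`, table, and column names).
# Incrementally maintaining a table keyed by `id`, with `created_at` /
# `updated_at` driving the checkpoint logic:
#
#     conn.store(df, 'dw.orders', db='warehouse',
#                if_exists='update', pkey='id',
#                checkpoint_column=('created_at', 'updated_at'),
#                last_checkpoint=last_ckpt,
#                indexes=['status', ('user_id', 'created_at')])
#
# Rows updated since `last_checkpoint` are deleted by primary key and
# re-appended; on the first run (no such table yet) the mode falls back to
# 'replace' and the primary key and indexes are created afterwards.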