def _split_table_name(table_name, default_project):
    """Split an optionally project-qualified table name.

    Args:
        table_name: either ``table`` or ``project.table``.
        default_project: project to use when ``table_name`` carries no
            project prefix.

    Returns:
        A tuple (project, table).
    """
    if "." not in table_name:
        return default_project, table_name
    parts = table_name.split(".")
    return parts[0], parts[1]


def run(args):
    """Copy a random sample of rows from one ODPS table into another.

    The input table is loaded into a pandas DataFrame, sampled, and the
    sampled rows are written to a freshly (re)created output table that
    reuses the input table's schema.

    Args:
        args: an object with the attributes ``odps_access_id``,
            ``odps_access_key``, ``odps_project``, ``odps_endpoint``,
            ``input_table_name``, ``output_table_name`` and
            ``sample_row_count``. Table names may be given as
            ``project.table``; otherwise ``odps_project`` is used.
    """
    # build an odps instance
    o = ODPS(args.odps_access_id,
             args.odps_access_key,
             args.odps_project,
             endpoint=args.odps_endpoint)

    input_table_project, input_table_name = _split_table_name(
        args.input_table_name, args.odps_project)
    output_table_project, output_table_name = _split_table_name(
        args.output_table_name, args.odps_project)

    # download data from odps
    input_table = o.get_table(input_table_name, project=input_table_project)
    data = input_table.to_df().to_pandas()

    # sample data
    new_data = data.sample(args.sample_row_count)

    # create output table and upload data to odps
    # Drop with the resolved project and bare table name so that the table
    # removed here is exactly the one created below; passing the raw
    # "project.table" string relied on delete_table interpreting it.
    o.delete_table(output_table_name,
                   project=output_table_project,
                   if_exists=True)
    o.create_table(output_table_name,
                   input_table.schema,
                   project=output_table_project,
                   if_not_exists=False,
                   lifecycle=3)
    o.write_table(output_table_name,
                  new_data.values.tolist(),
                  project=output_table_project)
class MaxComputeConnection(Connection):
    """MaxCompute connection, this class uses ODPS object to establish
    connection with maxcompute

    Args:
        conn_uri: uri in format:
            maxcompute://access_id:access_key@service.com/api?curr_project=test_ci&scheme=http
    """
    def __init__(self, conn_uri):
        super(MaxComputeConnection, self).__init__(conn_uri)
        user, pwd, endpoint, proj = MaxComputeConnection.get_uri_parts(
            conn_uri)
        self.driver = "maxcompute"
        # NOTE(review): self.params is presumably created by the base
        # Connection.__init__ -- confirm against the base class.
        self.params["database"] = proj
        self.endpoint = endpoint
        self._conn = ODPS(user, pwd, project=proj, endpoint=endpoint)

    @staticmethod
    def get_uri_parts(uri):
        """Get username, password, endpoint, project from given uri

        Args:
            uri: a valid maxcompute connection uri

        Returns:
            A tuple (username, password, endpoint, project). The endpoint
            is rebuilt from the uri's host, optional port and path, using
            the ``scheme`` query parameter (default "http") as its scheme.
            The project is the ``curr_project`` query parameter, or "" if
            it is absent.
        """
        uripts = urlparse(uri)
        params = parse_qs(uripts.query)
        # Rebuild the netloc without the credentials. The port has to be
        # carried over explicitly: using the hostname alone would silently
        # drop it and point the client at the default port.
        netloc = uripts.hostname
        if netloc:
            if ":" in netloc:
                # hostname strips the brackets of an IPv6 literal
                netloc = "[%s]" % netloc
            if uripts.port is not None:
                netloc = "%s:%d" % (netloc, uripts.port)
        # compose an endpoint, drop the query and replace the scheme
        endpoint = uripts._replace(scheme=params.get("scheme", ["http"])[0],
                                   query="",
                                   netloc=netloc)
        endpoint = endpoint.geturl()
        return (uripts.username, uripts.password, endpoint,
                params.get("curr_project", [""])[0])

    def _get_result_set(self, statement):
        """Execute the SQL statement and wrap the resulting instance.

        Args:
            statement: the SQL statement to execute

        Returns:
            A MaxComputeResultSet built from the executed instance. Any
            exception raised by the execution propagates to the caller.
        """
        instance = self._conn.execute_sql(statement)
        return MaxComputeResultSet(instance)

    def close(self):
        """Drop the reference to the underlying ODPS object."""
        if self._conn:
            self._conn = None

    def get_table_schema(self, table_name):
        """Get the column names and types of the given table.

        Args:
            table_name: name of the table to inspect

        Returns:
            A list of (column_name, COLUMN_TYPE) tuples, where the type is
            the upper-cased string form of the column type.
        """
        schema = self._conn.get_table(table_name).schema
        return [(c.name, str(c.type).upper()) for c in schema.columns]

    def persist_table(self, table):
        """Disable the lifecycle of the given table.

        Args:
            table: name of the table to alter
        """
        sql = "ALTER TABLE %s DISABLE LIFECYCLE;" % table
        self.execute(sql)

    def write_table(self,
                    table_name,
                    rows,
                    compress_option=COMPRESS_ODPS_ZLIB):
        """Append rows to given table, this is a driver specific api

        Args:
            table_name: the table to write
            rows: list of rows, each row is a data tuple,
                like [(1,True,"ok"),(2,False,"bad")]
            compress_option: the compress options defined in
                tunnel.CompressOption.CompressAlgorithm
        """
        self._conn.write_table(table_name,
                               rows,
                               compress_option=compress_option)