Exemple #1
0
def _split_table_name(full_name, default_project):
    """Split an optionally project-qualified table name.

    Args:
        full_name: either ``table`` or ``project.table``.
        default_project: project returned when ``full_name`` has no
            ``project.`` prefix.

    Returns:
        A tuple (project, table_name).
    """
    if "." not in full_name:
        return default_project, full_name
    parts = full_name.split(".")
    return parts[0], parts[1]


def run(args):
    """Copy a random sample of rows from one ODPS table into a new table.

    The whole input table is downloaded into a pandas DataFrame, sampled
    locally, and written to a freshly (re)created output table that reuses
    the input table's schema and has a lifecycle of 3.

    Args:
        args: an object providing the attributes ``odps_access_id``,
            ``odps_access_key``, ``odps_project``, ``odps_endpoint``,
            ``input_table_name``, ``output_table_name`` and
            ``sample_row_count``. Table names may be written as
            ``project.table``; otherwise ``odps_project`` is used.
    """
    # build an odps instance
    o = ODPS(args.odps_access_id,
             args.odps_access_key,
             args.odps_project,
             endpoint=args.odps_endpoint)

    input_table_project, input_table_name = _split_table_name(
        args.input_table_name, args.odps_project)
    # Resolve the output location *before* dropping anything, so the
    # delete below targets exactly the table that is created afterwards.
    output_table_project, output_table_name = _split_table_name(
        args.output_table_name, args.odps_project)

    # download data from odps
    input_table = o.get_table(input_table_name, project=input_table_project)
    data = input_table.to_df().to_pandas()

    # sample data; DataFrame.sample raises ValueError when asked for more
    # rows than exist, so cap the request at the number of available rows.
    sample_count = args.sample_row_count
    if sample_count is not None:
        sample_count = min(sample_count, len(data))
    new_data = data.sample(sample_count)

    # create output table and upload data to odps
    o.delete_table(output_table_name,
                   project=output_table_project,
                   if_exists=True)
    o.create_table(output_table_name,
                   input_table.schema,
                   project=output_table_project,
                   if_not_exists=False,
                   lifecycle=3)
    o.write_table(output_table_name,
                  new_data.values.tolist(),
                  project=output_table_project)
Exemple #2
0
class MaxComputeConnection(Connection):
    """MaxCompute connection, this class uses ODPS object to establish
    connection with maxcompute

    Args:
        conn_uri: uri in format:
        maxcompute://access_id:access_key@host[:port]/api?curr_project=test_ci&scheme=http
    """
    def __init__(self, conn_uri):
        super(MaxComputeConnection, self).__init__(conn_uri)
        user, pwd, endpoint, proj = MaxComputeConnection.get_uri_parts(
            conn_uri)
        self.driver = "maxcompute"
        self.params["database"] = proj
        self.endpoint = endpoint
        self._conn = ODPS(user, pwd, project=proj, endpoint=endpoint)

    @staticmethod
    def get_uri_parts(uri):
        """Get username, password, endpoint, project from given uri

        The endpoint is rebuilt from the uri's host (and port, if any) and
        path; its scheme is taken from the ``scheme`` query parameter and
        defaults to "http". The project is taken from the ``curr_project``
        query parameter and defaults to "".

        Args:
            uri: a valid maxcompute connection uri

        Returns:
            A tuple (username, password, endpoint, project)
        """
        uripts = urlparse(uri)
        params = parse_qs(uripts.query)
        # Strip only the "user:password@" prefix from the network location.
        # Using uripts.hostname here would silently drop an explicit port
        # (and the brackets of an IPv6 literal), yielding a wrong endpoint.
        host_and_port = uripts.netloc.rpartition("@")[2]
        # compose an endpoint, only keep the host and path and replace scheme
        endpoint = uripts._replace(scheme=params.get("scheme", ["http"])[0],
                                   query="",
                                   netloc=host_and_port)
        endpoint = endpoint.geturl()
        return (uripts.username, uripts.password, endpoint,
                params.get("curr_project", [""])[0])

    def _get_result_set(self, statement):
        """Run a SQL statement and wrap the resulting instance.

        Args:
            statement: the SQL text passed to ``execute_sql``

        Returns:
            A MaxComputeResultSet built from the executed instance. Any
            exception raised during execution propagates to the caller.
        """
        instance = self._conn.execute_sql(statement)
        return MaxComputeResultSet(instance)

    def close(self):
        """Drop the reference to the underlying ODPS object."""
        if self._conn:
            self._conn = None

    def get_table_schema(self, table_name):
        """Describe the columns of a table.

        Args:
            table_name: name of the table to inspect

        Returns:
            A list of (column_name, COLUMN_TYPE) tuples, where the type is
            the upper-cased string form of the column's type.
        """
        schema = self._conn.get_table(table_name).schema
        return [(c.name, str(c.type).upper()) for c in schema.columns]

    def persist_table(self, table):
        """Execute ``ALTER TABLE <table> DISABLE LIFECYCLE`` for a table.

        Args:
            table: table name; it is interpolated into the SQL text as-is,
                so it must be a trusted identifier.
        """
        sql = "ALTER TABLE %s DISABLE LIFECYCLE;" % table
        self.execute(sql)

    def write_table(self,
                    table_name,
                    rows,
                    compress_option=COMPRESS_ODPS_ZLIB):
        """Append rows to given table, this is a driver specific api

        Args:
            table_name: the table to write
            rows: list of rows, each row is a data tuple,
                like [(1,True,"ok"),(2,False,"bad")]
            compress_option: the compress option defined in
                tunnel.CompressOption.CompressAlgorithm
        """
        self._conn.write_table(table_name,
                               rows,
                               compress_option=compress_option)