Example #1
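# Assumed but not shown in the original snippet: HadoopRuntime and MySqoop
# come from the surrounding project, and pp is a pretty-printer
# (e.g. "from pprint import pprint as pp").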
def main():
    hr = HadoopRuntime()
    settings = hr.settings
    print(settings)
    hr.clean_working_dir()
    output_dir = hr.get_hdfs_working_dir("dump_dir")

    sqoop = MySqoop(settings.Param.Sqoop2Server_Host,
                    int(settings.Param.Sqoop2Server_Port))

    # First, create a connection
    conn_name = "import_m_job%s_blk%s" % (settings.GlobalParam["jobId"],
                                          settings.GlobalParam["blockId"])
    conn_ret = sqoop.create_connection(
        conn_name=conn_name,
        conn_str=settings.Param.connection_string,
        username=settings.Param.connection_username,
        password=settings.Param.connection_password)

    # Then, run the Sqoop import job
    fw_ps = {
        "output.storageType": "HDFS",
        "output.outputFormat": "TEXT_FILE",
        "output.outputDirectory": output_dir
    }
    if settings.Param.where_clause and str(settings.Param.where_clause).strip():
        table_sql = "select %s from %s where ${CONDITIONS} and %s" % (
            settings.Param.input_columns, settings.Param.table_name,
            settings.Param.where_clause)
    else:
        table_sql = "select %s from %s where ${CONDITIONS}" % (
            settings.Param.input_columns, settings.Param.table_name)
    partition_column = settings.Param.partition_column

    print(settings.Param.where_clause)
    print(table_sql)

    job_ps = {
        "table.sql": table_sql,
        "table.partitionColumn": partition_column
    }
    job_name = "import job :: username(%s) job %s, block %s" % (
        settings.GlobalParam["userName"], settings.GlobalParam["jobId"],
        settings.GlobalParam["blockId"])
    r = sqoop.create_import_job(job_name=job_name,
                                connection_id=conn_ret["id"],
                                framework_params=fw_ps,
                                job_params=job_ps)
    pp(r)
    sqoop.run_job(r['id'])
    sqoop.wait_job(r['id'])
    sqoop.delete_job(r['id'])

    # Finally, delete the connection we created
    sqoop.delete_connection_by_id(conn_ret["id"])

    settings.Output.output_dir.val = output_dir

    print("Done")
Example #2
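# Assumed but not shown here: get_settings_from_file, MySqoop, parse_jdbc,
# psycopg2_delete_table, and pymssql_delete_table come from the surrounding
# project (a sketch of parse_jdbc follows the example).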
def main():
    settings = get_settings_from_file("spec.json")
    print(settings)

    sqoop = MySqoop(settings.Param.Sqoop2Server_Host, int(settings.Param.Sqoop2Server_Port))

    # 1. Create a connection
    conn_name = "exporter_job%s_blk%s" % (
            settings.GlobalParam["jobId"],
            settings.GlobalParam["blockId"])
    conn_ret = sqoop.create_connection(conn_name=conn_name,
            conn_str=settings.Param.connection_string,
            username=settings.Param.connection_username,
            password=settings.Param.connection_password)

    # 2. Empty the target table
    print("Deleting the table %s" % settings.Param.table_name)
    conn_str = settings.Param.connection_string
    cfg = parse_jdbc(conn_str)
    cfg["username"] = settings.Param.connection_username
    cfg["password"] = settings.Param.connection_password
    
    print(cfg)
    if "postgresql" in cfg["name"]:
        psycopg2_delete_table(cfg, settings.Param.table_name)
        print("Deleted table %s in PostgreSQL" % settings.Param.table_name)
    if "sqlserver" in cfg["name"]:
        pymssql_delete_table(cfg, settings.Param.table_name)
        print("Deleted table %s in MS SQL" % settings.Param.table_name)

    # 3. Run the Sqoop export job
    print("Running Sqoop2 job to export")
    fw_ps = {
        "input.inputDirectory": settings.Input.hdfs_path.val
    }
    job_ps = {
        "table.tableName": settings.Param.table_name,
        "table.columns": settings.Param.table_columns
    }
    job_name = "export job :: username(%s) job %s, block %s" % (
            settings.GlobalParam["userName"],
            settings.GlobalParam["jobId"],
            settings.GlobalParam["blockId"])

    r = sqoop.create_export_job(job_name=job_name,
                                connection_id=conn_ret["id"],
                                framework_params=fw_ps,
                                job_params=job_ps)
    pp(r)
    sqoop.run_job(r['id'])
    sqoop.wait_job(r['id'])
    sqoop.delete_job(r['id'])

    # Finally, delete the connection we created
    sqoop.delete_connection_by_id(conn_ret["id"])
    
    settings.Output.signal.val = "ready"
    print("Done")
Example #3
def main():
    hr = HadoopRuntime()
    settings = hr.settings
    print(settings)
    hr.clean_working_dir()
    output_dir = hr.get_hdfs_working_dir("message_dir")

    sqoop = MySqoop(settings.Param.Sqoop2Server_Host,
                    int(settings.Param.Sqoop2Server_Port))

    # First, create a connection
    conn_name = "import_m_job%s_blk%s" % (settings.GlobalParam["jobId"],
                                          settings.GlobalParam["blockId"])
    conn_ret = sqoop.create_connection(
        conn_name=conn_name,
        conn_str=settings.Param.connection_string,
        username=settings.Param.connection_username,
        password=settings.Param.connection_password)

    # Then, run the Sqoop import job
    fw_ps = {
        "output.storageType": "HDFS",
        "output.outputFormat": "TEXT_FILE",
        "output.outputDirectory": output_dir
    }
    job_ps = {
        "table.sql":
        "select UserId,Description,RefreshDate from Message where ${CONDITIONS}",
        "table.partitionColumn": "UserId"
    }
    job_name = "import job :: username(%s) job %s, block %s" % (
        settings.GlobalParam["userName"], settings.GlobalParam["jobId"],
        settings.GlobalParam["blockId"])
    r = sqoop.create_import_job(job_name=job_name,
                                connection_id=conn_ret["id"],
                                framework_params=fw_ps,
                                job_params=job_ps)
    pp(r)
    sqoop.run_job(r['id'])
    sqoop.wait_job(r['id'])
    sqoop.delete_job(r['id'])

    # Finally, delete the connection we created
    sqoop.delete_connection_by_id(conn_ret["id"])

    settings.Output.message_dir.val = output_dir

    print("Done")