import os

import sh


def drop_hive_table(tab_name, and_folder=True, in_ps=False):
    """Drop a Hive table and, optionally, its underlying HDFS folder.

    `spark`, `check_hive_table_existence` and `get_table_path` are assumed
    to be defined elsewhere in the module.
    """
    global spark
    if check_hive_table_existence(tab_name):
        if all((in_ps, and_folder)):
            # The table lives in "Persistent Storage": resolve its location
            # and remove the files directly, bypassing the trash.
            tab_location = get_table_path(tab_name)
            sh.hadoop('fs', '-rm', '-skipTrash', '-r', tab_location)
    spark.sql(f'drop table if exists {tab_name}')
    if and_folder and not in_ps:
        # Managed table: remove the warehouse folder as well.
        os.system(
            'hdfs dfs -rm -r -skipTrash '
            '/user/hive/warehouse/{}.db/{}'.format(*tab_name.split('.'))
        )
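
A minimal usage sketch, assuming an active SparkSession named `spark` and the helpers referenced above; the table names are hypothetical:

# Drop a managed table together with its warehouse folder (hypothetical name).
drop_hive_table('sandbox.tmp_scores')

# Drop an external table whose files live in "Persistent Storage".
drop_hive_table('sandbox.tmp_scores_ps', and_folder=True, in_ps=True)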
Example #2
import sys

import shapesim
from sh import awk, hadoop


def main():
    # `parser` (the script's option parser) is assumed to be defined at
    # module level in the original script.
    options, args = parser.parse_args(sys.argv[1:])

    if len(args) < 3:
        parser.print_help()
        sys.exit(45)

    run = args[0]
    is2 = int(args[1])
    ie_or_is2n = int(args[2])

    conf = shapesim.read_config(run)
    simconf = shapesim.read_config(conf['sim'])

    # List every output file already present on HDFS for this run.
    pattern = shapesim.get_output_url(run, is2, ie_or_is2n, itrial='*', fs='hdfs')
    flist = awk(hadoop('fs', '-ls', pattern), '{print $8}').split()

    # Report the expected outputs that are missing from HDFS.
    nring = simconf['nring']
    for i in range(nring):
        f = shapesim.get_output_url(run, is2, ie_or_is2n, itrial=i, fs='hdfs')
        f = f.replace('hdfs://', '')
        if f not in flist:
            print(f)
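
The missing-file check above leans on sh's command composition, where one command's output is passed as the first argument of the next. A minimal sketch of that pattern with a hypothetical HDFS path:

from sh import awk, hadoop

# Roughly equivalent to: hadoop fs -ls /user/data/run01 | awk '{print $8}'
listing = hadoop('fs', '-ls', '/user/data/run01')  # hypothetical path
paths = awk(listing, '{print $8}').split()
print(paths)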
Example #3
def ls(hdfsPath):
    # Return (0, listing) on success, or (-1, stderr) if the hadoop call fails.
    try:
        return (0, sh.hadoop("fs", "-ls", hdfsPath))
    except sh.ErrorReturnCode as e:
        return (-1, e.stderr)
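
A minimal usage sketch of the (status, output) convention these wrappers share; the path is hypothetical:

status, output = ls('/user/data/run01')  # hypothetical HDFS path
if status == 0:
    print(output)
else:
    print('listing failed:', output)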
Example #4
def size(hdfsPath):
    # Human-readable disk usage; returns (-1, stderr) on failure.
    try:
        return (0, sh.hadoop("fs", "-du", "-h", hdfsPath))
    except sh.ErrorReturnCode as e:
        print(e)
        return (-1, e.stderr)
Example #5
def setrep(repFactor, dirName):
    # Recursively set the HDFS replication factor for dirName.
    try:
        return (0, sh.hadoop("fs", "-setrep", "-R", repFactor, dirName))
    except sh.ErrorReturnCode as e:
        print(e)
        return (-1, e.stderr)
Example #6
def runTable(jarFile, scale, base, tableName):
    # Invoke the jar for one table at the given scale factor.
    try:
        return (0, sh.hadoop("jar", jarFile, "-d", base + "/" + str(scale) + "/",
                             "-s", scale, "-t", tableName))
    except sh.ErrorReturnCode as e:
        print(e)
        return (-1, e.stderr)
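
A usage sketch for runTable; the jar path, base directory, scale and table name below are all hypothetical and depend on the jar's own CLI:

status, output = runTable('/opt/jars/datagen.jar',   # hypothetical jar
                          100,                        # hypothetical scale factor
                          '/benchmarks/datagen',      # hypothetical base dir
                          'orders')                   # hypothetical table name
if status != 0:
    print('table job failed')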
Example #7
def mkdir(hdfsPath):
    # Create the directory (and any missing parents) on HDFS.
    try:
        return (0, sh.hadoop("fs", "-mkdir", "-p", hdfsPath))
    except sh.ErrorReturnCode as e:
        return (-1, e.stderr)
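
The wrappers compose naturally because they all return the same (status, output) tuple; a short sketch with a hypothetical path:

target = '/user/data/staging'  # hypothetical HDFS path
status, _ = mkdir(target)
if status == 0:
    setrep(2, target)          # assumed replication factor
    print(size(target)[1])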
from typing import Union

import pyspark
import sh
from pyspark.sql import functions as F, types as T


def save_sdf_to_ps(sdf: Union[pyspark.sql.dataframe.DataFrame, bool] = False,
                   table_name: str = 'new_tab',
                   cur_path: Union[str, bool] = False,
                   overwrite: bool = True,
                   hive_schema: str = 'default',
                   ps_folder: str = '',
                   parquet_write_mode: str = 'overwrite',
                   parquet_compression: str = 'none',
                   ps_path: str = 'hdfs://clsklsbx/user/team/team_ds_cltv/'):
    """Save a Spark DataFrame as an external parquet table in "Persistent Storage".

    sdf - Spark DataFrame to save
    table_name - name of the new table in Hive
    cur_path - if the parquet files already exist at this path, only the
        external table is created on top of them
    overwrite - overwrite the Hive table if it already exists
    hive_schema - name of the Hive database
    ps_folder - directory in "Persistent Storage" to save into
    parquet_write_mode - Spark save mode for the parquet write
    parquet_compression - parquet compression codec
    ps_path - HDFS link to our "Persistent Storage"

    `spark`, `check_hive_table_existence` and `send_beeline_query` are
    assumed to be defined elsewhere in the module.
    """
    tab_name = f'{hive_schema}.{table_name}'
    existence = check_hive_table_existence(tab_name)
    ps_folder = hive_schema if len(ps_folder) == 0 else ps_folder
    final_path = f'{ps_path}{ps_folder}'
    table_path = f'{final_path}/{table_name}'

    if not existence or overwrite:
        if existence:
            if not cur_path:
                # Remove the old table files before rewriting them.
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
            else:
                # Files already exist at cur_path: clear the old location and
                # copy them into "Persistent Storage" (destination assumed to
                # be the table's persistent-storage path).
                sh.hadoop('fs', '-rm', '-skipTrash', '-r', table_path)
                sh.hadoop('distcp', cur_path, table_path)
            # Drop only the Hive metadata; the files were handled above.
            drop_hive_table(tab_name, False)
    else:
        print(f'{tab_name} already exists')
        return None

    if cur_path:
        # Files already exist: read them only to capture the schema, and
        # point the external table at cur_path.
        sdf = spark.read.parquet(cur_path)
        table_path = cur_path

    # Cast date columns to timestamp before writing / registering the table.
    for col_name, col_type in sdf.dtypes:
        if 'date' in col_type:
            sdf = sdf.withColumn(
                col_name, F.col(col_name).cast(T.TimestampType()))
    if not cur_path:
        if len(ps_folder) > 0:
            # Create the target folder in "Persistent Storage" if it is missing.
            hadoop_folders = list(
                filter(lambda x: len(x) > 1,
                       sh.hadoop('fs', '-ls', '-C', ps_path).split('\n')))
            hadoop_folders = [x.split('/')[-1] for x in hadoop_folders]
            if ps_folder not in hadoop_folders:
                sh.hadoop('fs', '-mkdir', final_path)
                sh.hdfs('dfs', '-chmod', '-R', '777', final_path)

        # Write the DataFrame as parquet files under the table path.
        sdf.write.option('compression', parquet_compression) \
            .mode(parquet_write_mode).parquet(table_path)

    # Keep only two replicas of the parquet files.
    sh.hdfs('dfs', '-setrep', '-R', '2', table_path)

    # Register an external Hive table over the parquet files.
    send_beeline_query(
        query=f"create external table {tab_name} " \
              f"({','.join([f'{x[0]} {x[1]}' for x in sdf.dtypes])}) " \
              f"row format serde 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' " \
              f"stored as inputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' " \
              f"outputformat 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' " \
              f"location '{table_path}' ",
        print_output=False
    )

    sh.hdfs('dfs', '-chmod', '-R', '777', table_path)
    print(f'{tab_name} created, files located at {table_path}')
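
A minimal usage sketch, assuming an active SparkSession named `spark`; the DataFrame contents, Hive schema and table name are hypothetical:

scores_sdf = spark.createDataFrame(
    [(1, 0.42), (2, 0.87)], ['client_id', 'score'])

# Write the data to "Persistent Storage" and register an external Hive table.
save_sdf_to_ps(sdf=scores_sdf,
               table_name='client_scores',   # hypothetical table name
               hive_schema='team_ds_cltv',   # hypothetical Hive db
               ps_folder='scoring')          # hypothetical PS folder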