def run(path, amq, stomp, yarn=None, aggregation_schema=False, verbose=False):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)

    if stomp and os.path.isfile(stomp):
        ctx.addPyFile(stomp)
    else:
        raise Exception('No stomp module egg is provided')
    if amq and os.path.isfile(amq):
        if amq.split('/')[-1] == 'amq_broker.json':
            ctx.addFile(amq)
        else:
            raise Exception('Wrong AMQ broker file name, please name it as amq_broker.json')
    else:
        raise Exception('No AMQ credential file is provided')

    sqlContext = HiveContext(ctx)

    # list part files of the given HDFS directory
    hpath = "hadoop fs -ls %s | awk '{print $8}'" % path
    if verbose:
        print("### Read files: %s" % hpath)
    pipe = Popen(hpath, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, close_fds=True)
    pipe.wait()
    pfiles = [f for f in pipe.stdout.read().split('\n') if f.find('part-') != -1]

    df = []
    if aggregation_schema:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                       .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true')
                       .load(fname, schema=aggregated_data_schema()) for fname in pfiles])
    else:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                       .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true')
                       .load(fname) for fname in pfiles])

    # register temporary table to be able to use sqlContext.sql
    df.registerTempTable('df')
    print_rows(df, "DataFrame", verbose)
    print('Schema:')
    df.printSchema()

    # for testing uncomment line below
    # df.toJSON().foreachPartition(print_data)
    # send data to CERN MONIT via stomp AMQ, see send2monit function
    df.toJSON().foreachPartition(send2monit)

    ctx.stop()
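# NOTE: helpers such as spark_context, print_rows, dbs_tables, condor_tables,
# split_date, aggregated_data_schema, send2monit and unionAll are provided
# elsewhere in this package. For reference, a minimal sketch of a unionAll
# helper over a list of DataFrames (an illustration of the assumed behavior,
# not necessarily the project's own implementation) could look like this:
def unionAll(dfs):
    "Union a list of DataFrames with identical schemas into a single DataFrame"
    from functools import reduce            # assumption: plain left fold
    from pyspark.sql import DataFrame
    return reduce(DataFrame.unionAll, dfs)  # relies on DataFrame.unionAll (Spark 1.x/2.x API)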
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context, it's the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    dtables = ['daf', 'ddf', 'bdf', 'fdf', 'aef', 'pef', 'mcf', 'ocf', 'rvf']
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose, tables=dtables))
    # tables.update(phedex_tables(sqlContext, verbose=verbose))
    # phedex_df = tables['phedex_df']
    daf = tables['daf'] # dataset access table
    ddf = tables['ddf'] # dataset table
    bdf = tables['bdf'] # block table
    fdf = tables['fdf'] # file table
    aef = tables['aef'] # acquisition era table
    pef = tables['pef'] # processing era table
    mcf = tables['mcf'] # output mod config table
    ocf = tables['ocf'] # output module table
    rvf = tables['rvf'] # release version table

    # read Condor rdd
    # tables.update(condor_tables(sqlContext, hdir='hdfs:///cms/users/vk/condor', date=condor_date(date), verbose=verbose))
    tables.update(condor_tables(sqlContext, date=condor_date(date), verbose=verbose))
    condor_df = tables['condor_df'] # aaa table

    # aggregate dbs info into dataframe
    cols = ['d_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
            'f_event_count', 'f_file_size', 'dataset_access_type',
            'acquisition_era_name', 'processing_version']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step we have an fjoin table with rows like
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')
    newdf = fjoin\
            .groupBy(['d_dataset', 'd_dataset_id', 'dataset_access_type', 'acquisition_era_name', 'processing_version'])\
            .agg({'f_event_count': 'sum', 'f_file_size': 'sum', 'd_creation_date': 'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with our collected stats for every dataset,
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id', 'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'processing_version', 'r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # merge dbs+phedex and Condor data
    cols = ['d_dataset', 'evts', 'size', 'date', 'dataset_access_type', 'acquisition_era_name', 'r_release_version']
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs', 'data.Campaign',
                   'data.Workflow', 'data.CpuEff', 'data.CoreHr', 'data.QueueHrs',
                   'data.CRAB_UserHN', 'data.Type', 'data.ExitCode', 'data.TaskType', 'data.RecordTime']
    stmt = 'SELECT %s FROM condor_df JOIN agg_dbs_df ON agg_dbs_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
    # stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)

    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # user defined function
    def rate(evts, cores):
        "Calculate the rate of events vs cores, if they're not defined return -1"
        if evts and cores:
            return float(evts)/float(cores)
        return -1.
    func_rate = udf(rate, DoubleType())

    # our output
    store = {}

    # conditions
    # load pyspark functions to be used here to redefine any previous usage of those names
    from pyspark.sql.functions import lit, sum, count, col, split

    # here we split the dataframe based on exit code conditions to reduce the dimensionality
    # of the input, otherwise the job crashes with an Integer.MAX_VALUE exception which
    # basically tells us that the input dataframe exceeds the number of available partitions
    for ecode in [0, 1]:
        if ecode == 0:
            refdf = final_df.where(col('ExitCode') == 0)
            condf = condor_df.where(col('data.ExitCode') == 0)
        else:
            refdf = final_df.where(col('ExitCode') != 0)
            condf = condor_df.where(col('data.ExitCode') != 0)
        refdf.persist(StorageLevel.MEMORY_AND_DISK)
        condf.persist(StorageLevel.MEMORY_AND_DISK)

        # aggregate CMS datasets
        cols = ['data.DESIRED_CMSDataset', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'), sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumn("tier", split(col('DESIRED_CMSDataset'), "/").alias('tier').getItem(3))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('DESIRED_CMSDataset', 'dataset')
        store.setdefault('dataset', []).append(xdf)

        # aggregate across campaigns
        cols = ['data.Campaign', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'), sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('Campaign', 'campaign')
        store.setdefault('campaign', []).append(xdf)

        # aggregate across DBS releases
        cols = ['r_release_version', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'), sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('r_release_version', 'release')
        store.setdefault('release', []).append(xdf)

        # aggregate across DBS eras
        cols = ['acquisition_era_name', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'), sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('acquisition_era_name', 'era')
        store.setdefault('era', []).append(xdf)

    # write out results back to HDFS, the fout parameter defines the area on HDFS,
    # it is either an absolute path or an area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        for key in store.keys():
            out = '%s/%s/%s/%s/%s' % (fout, key, year, month, day)
            print("output: %s" % out)
            odf = unionAll(store[key])
            print("%s rows: %s" % (key, odf.count()))
            print_rows(odf, key, verbose=1)
            odf.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(out)

    ctx.stop()
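# A minimal sketch of how this run() could be wired into a command line entry
# point; the option names below are illustrative assumptions, not necessarily
# the actual CLI of this package.
def main():
    "Hypothetical entry point: parse date, HDFS output area and spark options, then call run()"
    import argparse
    parser = argparse.ArgumentParser(description='pyspark aggregation job')
    parser.add_argument('--date', dest='date', default='', help='date in YYYYMMDD format')
    parser.add_argument('--fout', dest='fout', default='', help='output area on HDFS')
    parser.add_argument('--inst', dest='inst', default='GLOBAL', help='DBS instance')
    parser.add_argument('--yarn', action='store_true', help='run spark job on YARN')
    parser.add_argument('--verbose', action='store_true', help='verbose output')
    opts = parser.parse_args()
    run(opts.date, opts.fout, yarn=opts.yarn, verbose=opts.verbose, inst=opts.inst)

if __name__ == '__main__':
    main()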