Example #1
import time

# `files` and `schema_empty_eos` are assumed to be helpers importable from
# the surrounding module
def eos_tables(sqlContext,
               hdir='hdfs:///project/monitoring/archive/eos/logs/reports/cms',
               date=None,
               verbose=False):
    """
    Parse EOS HDFS records. This data set comes from the EOS servers at CERN.
    The data is sent directly by the EOS team, which reads the EOS logs and
    feeds them into the MONIT infrastructure.

    Example of EOS JSON record on HDFS
    {"data":"\"log=9e7436fe-1d8e-11e7-ba07-a0369f1fbf0c&path=/store/mc/PhaseISpring17GS/MinBias_TuneCUETP8M1_13TeV-pythia8/GEN-SIM/90X_upgrade2017_realistic_v20-v1/50000/72C78841-2110-E711-867F-F832E4CC4D39.root&ruid=8959&rgid=1399&td=nobody.693038:472@fu-c2e05-24-03-daq2fus1v0--cms&host=p05798818q44165.cern.ch&lid=1048850&fid=553521212&fsid=18722&ots=1491788403&otms=918&cts=1491789688&ctms=225&rb=19186114&rb_min=104&rb_max=524288&rb_sigma=239596.05&wb=0&wb_min=0&wb_max=0&wb_sigma=0.00&sfwdb=7576183815&sbwdb=6313410471&sxlfwdb=7575971197&sxlbwdb=6313300667&nrc=72&nwc=0&nfwds=24&nbwds=10&nxlfwds=12&nxlbwds=4&rt=9130.44&wt=0.00&osize=3850577700&csize=3850577700&sec.prot=gsi&sec.name=cmsprd&sec.host=cms-ucsrv-c2f46-32-07.cern.ch&sec.vorg=&sec.grps=&sec.role=&sec.info=/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=amaltaro/CN=718748/CN=Alan Malta Rodrigues&sec.app=\"","metadata":{"host":"eoscms-srv-m1.cern.ch","kafka_timestamp":1491789692305,"partition":"14","path":"cms","producer":"eos","timestamp":1491789689562,"topic":"eos_logs","type":"reports","type_prefix":"logs"}}
    # in 2019 we have the following structure
    {"data":{"eos.csize":"49834","eos.ctms":"177","eos.cts":"1548009771","eos.fid":"1282182923","eos.fsid":"8953","eos.fstpath":"/data08/0001f4da/4c6c8f0b","eos.host":"lxfsrf65c02.cern.ch","eos.lid":"1048850","eos.log":"30a723b0-1ce3-11e9-b49d-a0369f1fba7c","eos.nbwds":"0","eos.nfwds":"0","eos.nrc":"0","eos.nwc":"3","eos.nxlbwds":"0","eos.nxlfwds":"0","eos.osize":"0","eos.otms":"109","eos.ots":"1548009771","eos.path":"/eos/cms/store/unified/www/joblogs/vlimant_ACDC0_task_SUS-RunIISummer16FSPremix-00090__v1_T_190118_174333_8486/139/SUS-RunIISummer16FSPremix-00090_0/f585f0eb-1e6b-4722-8266-5254e7975115-72-1-logArchive/job/WMTaskSpace/cmsRun1/lheevent/process/madevent/SubProcesses/P3_gq_urdlxgq/G168.08/run1_app.log","eos.rb":"0","eos.rb_max":"0","eos.rb_min":"0","eos.rb_sigma":"0.00","eos.rc_max":"0","eos.rc_min":"0","eos.rc_sigma":"0.00","eos.rc_sum":"0","eos.rgid":"1399","eos.rs_op":"0","eos.rsb_max":"0","eos.rsb_min":"0","eos.rsb_sigma":"0.00","eos.rsb_sum":"0","eos.rt":"0.00","eos.ruid":"103074","eos.rv_op":"0","eos.rvb_max":"0","eos.rvb_min":"0","eos.rvb_sigma":"0.00","eos.rvb_sum":"0","eos.rvt":"0.00","eos.sbwdb":"0","eos.sec.app":"fuse","eos.sec.host":"vocms0268.ipv6.cern.ch","eos.sec.name":"vlimant","eos.sec.prot":"krb5","eos.sfwdb":"0","eos.sxlbwdb":"0","eos.sxlfwdb":"0","eos.td":"daemon.23360:250@lxfsre50c03","eos.wb":"0","eos.wb_max":"0","eos.wb_min":"0","eos.wb_sigma":"0.00","eos.wt":"0.07","raw":"log=30a723b0-1ce3-11e9-b49d-a0369f1fba7c&path=/eos/cms/store/unified/www/joblogs/vlimant_ACDC0_task_SUS-RunIISummer16FSPremix-00090__v1_T_190118_174333_8486/139/SUS-RunIISummer16FSPremix-00090_0/f585f0eb-1e6b-4722-8266-5254e7975115-72-1-logArchive/job/WMTaskSpace/cmsRun1/lheevent/process/madevent/SubProcesses/P3_gq_urdlxgq/G168.08/run1_app.log&fstpath=/data08/0001f4da/4c6c8f0b&ruid=103074&rgid=1399&td=daemon.23360:250@lxfsre50c03&host=lxfsrf65c02.cern.ch&lid=1048850&fid=1282182923&fsid=8953&ots=1548009771&otms=109&cts=1548009771&ctms=177&nrc=0&nwc=3&rb=0&rb_min=0&rb_max=0&rb_sigma=0.00&rv_op=0&rvb_min=0&rvb_max=0&rvb_sum=0&rvb_sigma=0.00&rs_op=0&rsb_min=0&rsb_max=0&rsb_sum=0&rsb_sigma=0.00&rc_min=0&rc_max=0&rc_sum=0&rc_sigma=0.00&wb=0&wb_min=0&wb_max=0&wb_sigma=0.00&sfwdb=0&sbwdb=0&sxlfwdb=0&sxlbwdb=0&nfwds=0&nbwds=0&nxlfwds=0&nxlbwds=0&rt=0.00&rvt=0.00&wt=0.07&osize=0&csize=49834&sec.prot=krb5&sec.name=vlimant&sec.host=vocms0268.ipv6.cern.ch&sec.vorg=&sec.grps=&sec.role=&sec.info=&sec.app=fuse","real_timestamp":"1548009771000"},"metadata":{"_id":"e2a22c6d-77b3-dd33-d5d8-3357c1988e49","host":"eoscms-srv-m1.cern.ch","json":"true","kafka_timestamp":1548009773721,"partition":"19","path":"cms","producer":"eos","timestamp":1548009771000,"topic":"eos_logs","type":"reports","type_prefix":"logs"}}

    The EOS record consists of a data part and a metadata part, where the data
    part squashes all requested parameters into a single string.

    :returns: a dictionary with eos Spark DataFrame
    """
    if not date:
        # by default we read yesterday's data
        date = time.strftime("%Y/%m/%d",
                             time.gmtime(time.time() - 60 * 60 * 24))

    hpath = '%s/%s' % (hdir, date)
    cols = ['data', 'metadata.timestamp']

    files_in_hpath = files(hpath, verbose)

    if len(files_in_hpath) == 0:
        eos_df = sqlContext.createDataFrame([], schema=schema_empty_eos())
        eos_df.registerTempTable('eos_df')
        tables = {'eos_df': eos_df}
        return tables

    # in Spark 2.X and from 2019 onward the record structure differs (see docstring)
    edf = sqlContext.read.json(hpath)
    if verbose:
        edf.printSchema()
    data = edf.data
    eos_df = edf.select(
        data.getField("eos.path").alias("file_lfn"),
        data.getField("eos.sec.info").alias("user_dn"),
        data.getField("eos.sec.app").alias("application"),
        data.getField("eos.sec.host").alias("host"),
        edf.metadata.getField("timestamp").alias("timestamp"))
    if verbose:
        eos_df.printSchema()
        records = eos_df.take(1)  # take returns a list of records
        print("### eos_df records", records, type(records))

    # register the DataFrame as a SQL temp table and return it
    eos_df.registerTempTable('eos_df')
    tables = {'eos_df': eos_df}
    return tables
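
Since the function registers its result as a temp table, the returned DataFrame can also be queried with plain Spark SQL. Below is a minimal usage sketch for the example above, assuming a Spark 2.x SparkSession named spark standing in for the sqlContext argument; the session name and the sample query are illustrative, not part of the original module.

from pyspark.sql import SparkSession

# hypothetical driver code, not from the original module
spark = SparkSession.builder.appName('eos_tables_demo').getOrCreate()

# a SparkSession exposes the read/createDataFrame/sql surface the function
# uses, so it can be passed where older code expected an SQLContext
tables = eos_tables(spark, date='2019/01/20', verbose=True)

# the same DataFrame is also registered as the temp table 'eos_df'
top = spark.sql(
    "SELECT application, COUNT(*) AS n FROM eos_df "
    "GROUP BY application ORDER BY n DESC LIMIT 10")
top.show(truncate=False)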
Example #2
import time

# `files`, `unionAll` and `schema_empty_eos` are assumed to be helpers
# importable from the surrounding module
def eos_tables(sqlContext,
        hdir='hdfs:///project/monitoring/archive/eos/logs/reports/cms',
        date=None, verbose=False):
    """
    Parse EOS HDFS records. This data set comes from the EOS servers at CERN.
    The data is sent directly by the EOS team, which reads the EOS logs and
    feeds them into the MONIT infrastructure.

    Example of EOS JSON record on HDFS
    {"data":"\"log=9e7436fe-1d8e-11e7-ba07-a0369f1fbf0c&path=/store/mc/PhaseISpring17GS/MinBias_TuneCUETP8M1_13TeV-pythia8/GEN-SIM/90X_upgrade2017_realistic_v20-v1/50000/72C78841-2110-E711-867F-F832E4CC4D39.root&ruid=8959&rgid=1399&td=nobody.693038:472@fu-c2e05-24-03-daq2fus1v0--cms&host=p05798818q44165.cern.ch&lid=1048850&fid=553521212&fsid=18722&ots=1491788403&otms=918&cts=1491789688&ctms=225&rb=19186114&rb_min=104&rb_max=524288&rb_sigma=239596.05&wb=0&wb_min=0&wb_max=0&wb_sigma=0.00&sfwdb=7576183815&sbwdb=6313410471&sxlfwdb=7575971197&sxlbwdb=6313300667&nrc=72&nwc=0&nfwds=24&nbwds=10&nxlfwds=12&nxlbwds=4&rt=9130.44&wt=0.00&osize=3850577700&csize=3850577700&sec.prot=gsi&sec.name=cmsprd&sec.host=cms-ucsrv-c2f46-32-07.cern.ch&sec.vorg=&sec.grps=&sec.role=&sec.info=/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=amaltaro/CN=718748/CN=Alan Malta Rodrigues&sec.app=\"","metadata":{"host":"eoscms-srv-m1.cern.ch","kafka_timestamp":1491789692305,"partition":"14","path":"cms","producer":"eos","timestamp":1491789689562,"topic":"eos_logs","type":"reports","type_prefix":"logs"}}

    The EOS record consists of a data part and a metadata part, where the data
    part squashes all requested parameters into a single string.

    :returns: a dictionary with eos Spark DataFrame
    """
    if not date:
        # by default we read yesterday's data
        date = time.strftime("%Y/%m/%d", time.gmtime(time.time()-60*60*24))

    hpath = '%s/%s' % (hdir, date)
    cols = ['data', 'metadata.timestamp']

    files_in_hpath = files(hpath, verbose)

    if len(files_in_hpath) == 0:
        eos_df = sqlContext.createDataFrame([], schema=schema_empty_eos())
        eos_df.registerTempTable('eos_df')
        tables = {'eos_df': eos_df}
        return tables
    
    # sqlContext.jsonFile() was removed in Spark 2.0; read.json() works in 1.4+ and 2.x
    rdd = unionAll([sqlContext.read.json(path) for path in files_in_hpath], cols)

    def parse_log(r):
        "Local helper function to parse EOS record and extract intersting fields"
        rdict = {}
        for item in str(r['data']).split('&'):
            if item.startswith('path='):
                rdict['file_lfn'] = item.split('path=')[-1]
            if item.startswith('sec.info='):
                rdict['user_dn'] = item.split('sec.info=')[-1]
            if item.startswith('sec.app='):
                rdict['application'] = item.split('sec.app=')[-1]
            if item.startswith('sec.host='):
                rdict['host'] = item.split('sec.host=')[-1]

        rdict['timestamp'] = r['timestamp']

        return rdict

    # DataFrame.map was removed in Spark 2.x; going through .rdd works in both
    eos_rdd = rdd.rdd.map(parse_log)

    if verbose:
        records = eos_rdd.take(1)  # take returns a list of records
        print("### eos_rdd records", records, type(records))

    # create new spark DataFrame
    eos_df = sqlContext.createDataFrame(eos_rdd)
    eos_df.registerTempTable('eos_df')
    tables = {'eos_df': eos_df}
    return tables
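
The core of parse_log is just splitting the squashed data string on '&' and matching a handful of key prefixes. Here is a standalone, Spark-free sketch of that step; the function name and the abbreviated sample string are illustrative, not taken from the original module.

def parse_eos_data(data, timestamp):
    "Table-driven variant of the parse_log logic above (illustrative sketch)"
    prefixes = {'path=': 'file_lfn', 'sec.info=': 'user_dn',
                'sec.app=': 'application', 'sec.host=': 'host'}
    rdict = {'timestamp': timestamp}
    for item in str(data).split('&'):
        for prefix, key in prefixes.items():
            if item.startswith(prefix):
                rdict[key] = item[len(prefix):]
    return rdict

sample = 'log=9e7436fe&path=/store/mc/file.root&sec.host=cms-ucsrv.cern.ch&sec.app='
print(parse_eos_data(sample, 1491789689562))
# {'timestamp': 1491789689562, 'file_lfn': '/store/mc/file.root',
#  'host': 'cms-ucsrv.cern.ch', 'application': ''}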
Example #3
import time
from datetime import datetime as dt, timedelta

from pyspark.sql import SQLContext
from pyspark.sql.functions import date_format, from_unixtime, regexp_extract

# `glob_files` and `schema_empty_eos` are assumed to be helpers importable
# from the surrounding module
def eos_tables(sqlContext,
               hdir='hdfs:///project/monitoring/archive/eos/logs/reports/cms',
               date=None,
               start_date=None,
               end_date=None,
               verbose=False):
    """
    Parse EOS HDFS records. This data set comes from the EOS servers at CERN.
    The data is sent directly by the EOS team, which reads the EOS logs and
    feeds them into the MONIT infrastructure.
    
    Use https://twiki.cern.ch/twiki/bin/view/ITAnalyticsWorkingGroup/EosFileAccessLogs as the data dictionary.

    Example of EOS JSON record on HDFS
    {"data":"\"log=9e7436fe-1d8e-11e7-ba07-a0369f1fbf0c&path=/store/mc/PhaseISpring17GS/MinBias_TuneCUETP8M1_13TeV-pythia8/GEN-SIM/90X_upgrade2017_realistic_v20-v1/50000/72C78841-2110-E711-867F-F832E4CC4D39.root&ruid=8959&rgid=1399&td=nobody.693038:472@fu-c2e05-24-03-daq2fus1v0--cms&host=p05798818q44165.cern.ch&lid=1048850&fid=553521212&fsid=18722&ots=1491788403&otms=918&cts=1491789688&ctms=225&rb=19186114&rb_min=104&rb_max=524288&rb_sigma=239596.05&wb=0&wb_min=0&wb_max=0&wb_sigma=0.00&sfwdb=7576183815&sbwdb=6313410471&sxlfwdb=7575971197&sxlbwdb=6313300667&nrc=72&nwc=0&nfwds=24&nbwds=10&nxlfwds=12&nxlbwds=4&rt=9130.44&wt=0.00&osize=3850577700&csize=3850577700&sec.prot=gsi&sec.name=cmsprd&sec.host=cms-ucsrv-c2f46-32-07.cern.ch&sec.vorg=&sec.grps=&sec.role=&sec.info=/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=amaltaro/CN=718748/CN=Alan Malta Rodrigues&sec.app=\"","metadata":{"host":"eoscms-srv-m1.cern.ch","kafka_timestamp":1491789692305,"partition":"14","path":"cms","producer":"eos","timestamp":1491789689562,"topic":"eos_logs","type":"reports","type_prefix":"logs"}}
    # in 2019 we have the following structure
    {"data":{"eos.csize":"49834","eos.ctms":"177","eos.cts":"1548009771","eos.fid":"1282182923","eos.fsid":"8953","eos.fstpath":"/data08/0001f4da/4c6c8f0b","eos.host":"lxfsrf65c02.cern.ch","eos.lid":"1048850","eos.log":"30a723b0-1ce3-11e9-b49d-a0369f1fba7c","eos.nbwds":"0","eos.nfwds":"0","eos.nrc":"0","eos.nwc":"3","eos.nxlbwds":"0","eos.nxlfwds":"0","eos.osize":"0","eos.otms":"109","eos.ots":"1548009771","eos.path":"/eos/cms/store/unified/www/joblogs/vlimant_ACDC0_task_SUS-RunIISummer16FSPremix-00090__v1_T_190118_174333_8486/139/SUS-RunIISummer16FSPremix-00090_0/f585f0eb-1e6b-4722-8266-5254e7975115-72-1-logArchive/job/WMTaskSpace/cmsRun1/lheevent/process/madevent/SubProcesses/P3_gq_urdlxgq/G168.08/run1_app.log","eos.rb":"0","eos.rb_max":"0","eos.rb_min":"0","eos.rb_sigma":"0.00","eos.rc_max":"0","eos.rc_min":"0","eos.rc_sigma":"0.00","eos.rc_sum":"0","eos.rgid":"1399","eos.rs_op":"0","eos.rsb_max":"0","eos.rsb_min":"0","eos.rsb_sigma":"0.00","eos.rsb_sum":"0","eos.rt":"0.00","eos.ruid":"103074","eos.rv_op":"0","eos.rvb_max":"0","eos.rvb_min":"0","eos.rvb_sigma":"0.00","eos.rvb_sum":"0","eos.rvt":"0.00","eos.sbwdb":"0","eos.sec.app":"fuse","eos.sec.host":"vocms0268.ipv6.cern.ch","eos.sec.name":"vlimant","eos.sec.prot":"krb5","eos.sfwdb":"0","eos.sxlbwdb":"0","eos.sxlfwdb":"0","eos.td":"daemon.23360:250@lxfsre50c03","eos.wb":"0","eos.wb_max":"0","eos.wb_min":"0","eos.wb_sigma":"0.00","eos.wt":"0.07","raw":"log=30a723b0-1ce3-11e9-b49d-a0369f1fba7c&path=/eos/cms/store/unified/www/joblogs/vlimant_ACDC0_task_SUS-RunIISummer16FSPremix-00090__v1_T_190118_174333_8486/139/SUS-RunIISummer16FSPremix-00090_0/f585f0eb-1e6b-4722-8266-5254e7975115-72-1-logArchive/job/WMTaskSpace/cmsRun1/lheevent/process/madevent/SubProcesses/P3_gq_urdlxgq/G168.08/run1_app.log&fstpath=/data08/0001f4da/4c6c8f0b&ruid=103074&rgid=1399&td=daemon.23360:250@lxfsre50c03&host=lxfsrf65c02.cern.ch&lid=1048850&fid=1282182923&fsid=8953&ots=1548009771&otms=109&cts=1548009771&ctms=177&nrc=0&nwc=3&rb=0&rb_min=0&rb_max=0&rb_sigma=0.00&rv_op=0&rvb_min=0&rvb_max=0&rvb_sum=0&rvb_sigma=0.00&rs_op=0&rsb_min=0&rsb_max=0&rsb_sum=0&rsb_sigma=0.00&rc_min=0&rc_max=0&rc_sum=0&rc_sigma=0.00&wb=0&wb_min=0&wb_max=0&wb_sigma=0.00&sfwdb=0&sbwdb=0&sxlfwdb=0&sxlbwdb=0&nfwds=0&nbwds=0&nxlfwds=0&nxlbwds=0&rt=0.00&rvt=0.00&wt=0.07&osize=0&csize=49834&sec.prot=krb5&sec.name=vlimant&sec.host=vocms0268.ipv6.cern.ch&sec.vorg=&sec.grps=&sec.role=&sec.info=&sec.app=fuse","real_timestamp":"1548009771000"},"metadata":{"_id":"e2a22c6d-77b3-dd33-d5d8-3357c1988e49","host":"eoscms-srv-m1.cern.ch","json":"true","kafka_timestamp":1548009773721,"partition":"19","path":"cms","producer":"eos","timestamp":1548009771000,"topic":"eos_logs","type":"reports","type_prefix":"logs"}}
    # in 2019 we have the following structure, version II, after I reported the eos prefix issue
    {"data":{"csize":"853699786","ctms":"939","cts":"1549043141","eos_host":"p06253937y92607.cern.ch","eos_path":"/eos/cms/store/cmst3/group/wmass/w-helicity-13TeV/trees/TREES_electrons_1l_V6_TINY/friends/tree_Friend_WJetsToLNu_NLO_part1.root","fid":"1126583180","fsid":"20546","fstpath":"/data07/0001b812/43264b8c","lid":"1048850","log":"287544b4-2649-11e9-abe8-a0369f1fba7c","nbwds":"145","nfwds":"195","nrc":"479","nwc":"0","nxlbwds":"144","nxlfwds":"143","osize":"853699786","otms":"479","ots":"1549043126","prot":"krb5","raw":"log=287544b4-2649-11e9-abe8-a0369f1fba7c&path=/eos/cms/store/cmst3/group/wmass/w-helicity-13TeV/trees/TREES_electrons_1l_V6_TINY/friends/tree_Friend_WJetsToLNu_NLO_part1.root&fstpath=/data07/0001b812/43264b8c&ruid=24421&rgid=1399&td=emanuele.81:510@b6644a93b5&host=p06253937y92607.cern.ch&lid=1048850&fid=1126583180&fsid=20546&ots=1549043126&otms=479&cts=1549043141&ctms=939&nrc=479&nwc=0&rb=41276444&rb_min=294&rb_max=326196&rb_sigma=101487.35&rv_op=0&rvb_min=0&rvb_max=0&rvb_sum=0&rvb_sigma=0.00&rs_op=0&rsb_min=0&rsb_max=0&rsb_sum=0&rsb_sigma=0.00&rc_min=0&rc_max=0&rc_sum=0&rc_sigma=0.00&wb=0&wb_min=0&wb_max=0&wb_sigma=0.00&sfwdb=3019121271&sbwdb=2233253538&sxlfwdb=3017497429&sxlbwdb=2233253338&nfwds=195&nbwds=145&nxlfwds=143&nxlbwds=144&rt=48.11&rvt=0.00&wt=0.00&osize=853699786&csize=853699786&sec.prot=krb5&sec.name=emanuele&sec.host=b6644a93b5.cern.ch&sec.vorg=&sec.grps=&sec.role=&sec.info=&sec.app=","rb":"41276444","rb_max":"326196","rb_min":"294","rb_sigma":"101487.35","rc_max":"0","rc_min":"0","rc_sigma":"0.00","rc_sum":"0","real_timestamp":"1549043126000","rgid":"1399","rs_op":"0","rsb_max":"0","rsb_min":"0","rsb_sigma":"0.00","rsb_sum":"0","rt":"48.11","ruid":"24421","rv_op":"0","rvb_max":"0","rvb_min":"0","rvb_sigma":"0.00","rvb_sum":"0","rvt":"0.00","sbwdb":"2233253538","sec_host":"b6644a93b5.cern.ch","sec_name":"emanuele","sfwdb":"3019121271","sxlbwdb":"2233253338","sxlfwdb":"3017497429","td":"emanuele.81:510@b6644a93b5","wb":"0","wb_max":"0","wb_min":"0","wb_sigma":"0.00","wt":"0.00"},"metadata":{"_id":"a3b00488-552e-35a2-42f0-c52112655e15","host":"eoscms-srv-b2.cern.ch","json":"true","kafka_timestamp":1549043144145,"partition":"10","path":"cms","producer":"eos","timestamp":1549043126000,"topic":"eos_logs","type":"reports","type_prefix":"logs"}}

    The EOS record consists of a data part and a metadata part, where the data
    part squashes all requested parameters into a single string.

    :returns: a dictionary with eos Spark DataFrame
    """
    if not date:
        if start_date:
            if not end_date:
                end_date = time.strftime(
                    "%Y/%m/%d", time.gmtime(time.time() - 60 * 60 * 24))
            _sd = dt.strptime(start_date, "%Y/%m/%d")
            _ed = dt.strptime(end_date, "%Y/%m/%d")
            dates = ','.join([(_sd + timedelta(days=x)).strftime("%Y/%m/%d")
                              for x in range(0, (_ed - _sd).days + 1)])
            # e.g. '{2019/01/01,2019/01/02}', an HDFS glob alternation
            date = '{{{}}}'.format(dates)
        else:
            # by default we read yesterday's data
            date = time.strftime("%Y/%m/%d",
                                 time.gmtime(time.time() - 60 * 60 * 24))

    hpath = '%s/%s/part*' % (hdir, date)
    cols = ['data', 'metadata.timestamp']
    # sqlContext can be either a SQLContext instance (older Spark code) or a
    # SparkSession (post-Spark-2.2 code). Both behave similarly in most cases,
    # but obtaining the SparkContext requires treating them differently:
    files_in_hpath = glob_files(
        sqlContext.sparkSession.sparkContext if isinstance(
            sqlContext, SQLContext) else sqlContext.sparkContext,
        hpath,
        verbose,
    )

    if len(files_in_hpath) == 0:
        eos_df = sqlContext.createDataFrame([], schema=schema_empty_eos())
        eos_df.registerTempTable('eos_df')
        tables = {'eos_df': eos_df}
        return tables

    # in Spark 2.X and from 2019 onward the record structure differs (see docstring)
    # Sampling ratio: with more than one file we can sample 10% of the records,
    # but a single file may hold fewer than 10 records, which would make the
    # sampling size 0 and the read fail
    edf = sqlContext.read.option("basePath", hdir).option(
        "samplingRatio",
        0.1 if len(files_in_hpath) > 1 else 1).json(files_in_hpath)
    # the old schema squashes data into one string; the new one exposes data.raw
    f_data = 'data as raw' if str(
        edf.schema['data'].dataType) == 'StringType' else 'data.raw'
    edf = edf.selectExpr(f_data, 'metadata.timestamp')

    # At this moment, json files can have one of two known schemas. In order to read several days we need to be able to work with both of them.
    # eos_df = edf.select(data.getField("eos.path").alias("file_lfn"), data.getField("eos.sec.info").alias("user_dn"), data.getField("eos.sec.app").alias("application"), data.getField("eos.sec.host").alias("host"), edf.metadata.getField("timestamp").alias("timestamp"))
    # eos_df = edf.select(eos_path, data.getField("sec_info").alias("user_dn"), data.getField("sec_app").alias("application"), data.getField("eos_host").alias("host"), edf.metadata.getField("timestamp").alias("timestamp"))
    # We can use the raw field because it does not change over time
    # note: rt/wt are decimal strings (e.g. rt=48.11); casting those to 'long'
    # yields null in Spark, so they are cast to 'double' below
    eos_df = edf\
        .withColumn('rb_max', regexp_extract(edf.raw, r"&rb_max=([^&']*)", 1).cast('long'))\
        .withColumn('session', regexp_extract(edf.raw, r"&td=([^&']*)", 1))\
        .withColumn('file_lfn', regexp_extract(edf.raw, r'&path=([^&]*)', 1))\
        .withColumn('application', regexp_extract(edf.raw, r"&sec.app=([^&']*)", 1))\
        .withColumn('rt', regexp_extract(edf.raw, r"&rt=([^&']*)", 1).cast('double'))\
        .withColumn('wt', regexp_extract(edf.raw, r"&wt=([^&']*)", 1).cast('double'))\
        .withColumn('rb', regexp_extract(edf.raw, r"&rb=([^&']*)", 1).cast('long'))\
        .withColumn('wb', regexp_extract(edf.raw, r"&wb=([^&']*)", 1).cast('long'))\
        .withColumn('cts', regexp_extract(edf.raw, r"&cts=([^&']*)", 1).cast('long'))\
        .withColumn('csize', regexp_extract(edf.raw, r"&csize=([^&']*)", 1).cast('long'))\
        .withColumn('user', regexp_extract(edf.raw, r"&sec.name=([^&']*)", 1))\
        .withColumn('user_dn', regexp_extract(edf.raw, r"&sec.info=([^&']*)", 1))\
        .withColumn('day', date_format(from_unixtime(edf.timestamp / 1000), 'yyyyMMdd'))

    if verbose:
        eos_df.printSchema()
        records = eos_df.take(1)  # take returns a list of records
        print("### eos_df records", records, type(records))

    # register the DataFrame as a SQL temp table and return it
    eos_df.registerTempTable('eos_df')
    tables = {'eos_df': eos_df}
    return tables
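
The extraction patterns in this example can be sanity-checked outside Spark: regexp_extract uses Java regex semantics, which agree with Python's re module for these simple patterns. A quick check against an abbreviated raw string (shortened from the docstring record, illustrative only):

import re

raw = ('log=287544b4&path=/eos/cms/store/file.root'
       '&td=emanuele.81:510@b6644a93b5&sec.name=emanuele&sec.app=')
for name, pattern in [('file_lfn', r'&path=([^&]*)'),
                      ('session', r"&td=([^&']*)"),
                      ('user', r"&sec.name=([^&']*)"),
                      ('application', r"&sec.app=([^&']*)")]:
    match = re.search(pattern, raw)
    print(name, '->', match.group(1) if match else None)
# file_lfn -> /eos/cms/store/file.root
# session -> emanuele.81:510@b6644a93b5
# user -> emanuele
# application ->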