Ejemplos de process_dataset en Python, ejemplos de e4t.process.jobs.process_dataset en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: jobs.py Proyecto: exedre/e4t

def exec_job_COMPILE(options,*args,**kw):
    """Run the load, process and output steps of a job, timing each one.

    (The original docstring described this as "Make .tex file for JOB".)

    Steps, in order:

    1. copy the job input via ``copy_file`` into ``<process_path>/input``;
    2. resolve data requests, processors and outputs with
       ``e4t.cli.get_elem_configs``;
    3. load the dataset (only when there are data requests);
    4. process the dataset (only when there are processors);
    5. run ``output_dataset`` -- always, even on an empty dataset.

    The three elapsed times (load, process, output) are stored as a list in
    ``options.elapsed``.  When ``options.switch_debug`` is set, the accounting
    is saved to ``<job>.cli`` and the timings are written to ``<job>.time``.
    """
    kw = udict(kw)
    base = join(options.process_path,'input')
    copy_file(kw,base)
    datareqs, processors, outputs = e4t.cli.get_elem_configs(*args, **kw)

    _dataset = None
    # elapsed[0] = load, elapsed[1] = process, elapsed[2] = output
    elapsed = [.0,.0,.0]

    if datareqs:
        with Timer() as t:
            _dataset = tsload_datareqs(options,_dataset,*datareqs,**kw)
        logger.debug("TIME tsload %f",t.secs)
        elapsed[0]=t.secs

    if processors:
        with Timer() as t:
            _dataset = process_dataset(options,_dataset,processors,*args,**kw)
        logger.debug("TIME process %f",t.secs)
        elapsed[1]=t.secs

    if not _dataset:
        # Deliberately not fatal: the output step still runs below.
        logger.error('VOID dataset on output step')

    # The output step runs unconditionally.  The original code guarded it
    # with ``if True:`` (the ``outputs`` test was commented out), so its
    # ``else`` branch raising IOError('VOID output') could never execute.
    with Timer() as t:
        output_dataset(options,_dataset,outputs,*args,**kw)
    logger.debug("TIME output %f",t.secs)
    elapsed[2]=t.secs

    options.elapsed = elapsed

    if options.switch_debug:
        # Write the report files.
        e4t.cli.save_accounting("%s.cli" % options.job)
        # Use a context manager so the report is flushed and closed
        # deterministically instead of relying on garbage collection.
        with codecs.open("%s.time" % options.job, 'w', 'utf-8') as report:
            report.write(u"tsload: %f\nprocss: %f\noutput: %f\n" % (elapsed[0],
                                                                    elapsed[1],
                                                                    elapsed[2]))

Ejemplo n.º 2

0

Mostrar archivo

Archivo: jobs.py Proyecto: exedre/e4t

def exec_job_EXPORT(options,*args,**kw):
    """Export the dataset of a job in the requested formats and phases.

    The job name is ``args[0]``; it is stored in ``options.job``.  The
    ``FORMAT`` option (default ``('XLS',)``) selects the formats and the
    ``PHASE`` option (default ``('PROCESS',)``, upper-cased) selects what is
    saved:

    * ``LOAD``       -- the dataset as loaded, tagged ``'raw'``;
    * ``PROCESS``    -- the dataset after processing, tagged ``'proc'``;
    * ``DEFINITION`` -- the definition, tagged ``'def'``.

    Returns early (without saving anything) when the ``OUTPUT`` config file
    cannot be read, and right after the raw save when ``LOAD`` is the only
    requested phase.
    """
    job_name = args[0]
    setattr(options,'job',job_name)
    kw = udict(kw)

    formats = options.options.xget('FORMAT',('XLS',))
    requested = options.options.xget('PHASE',('PROCESS',))
    phase = [ name.upper() for name in requested ]

    logger.debug('EXPORT FILE OUTPUT FOR %s - f:%s - p:%s',
                 ','.join(args),
                 ','.join(formats),
                 ','.join(phase))

    datareqs, processors, outputs = e4t.cli.get_elem_configs(*args,**kw)

    output_cfgs = e4t.cli._read_cfgfile(kw,'OUTPUT')
    if output_cfgs is None:
        return

    job_cfg = output_cfgs.xget(job_name)

    # A "multiple" job pulls the config section of each of its elements
    # into its own config.
    if re.match('multiple',job_cfg.xget('KIND'),re.I):
        element_names = job_cfg.xget_list('ELEMENTS')
        job_cfg.update(dict((name,output_cfgs.xget(name))
                            for name in element_names))

    cfg_bundle = (kw, job_cfg, output_cfgs)

    dataset = tsload_datareqs(options,None,*datareqs)
    logger.debug('data loaded')

    if 'LOAD' in phase:
        save_ds_with_fmts(formats,dataset,cfg_bundle,'raw',*args,**kw)
        # Nothing else was requested: skip the processing step entirely.
        if len(phase)==1:
            return

    dataset = process_dataset(options,dataset,processors)
    logger.debug('data processed')

    if 'PROCESS' in phase:
        save_ds_with_fmts(formats,dataset,cfg_bundle,'proc',*args,**kw)

    if 'DEFINITION' in phase:
        save_definition(options,formats,dataset,outputs,cfg_bundle,'def',*args,**kw)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: cache.py Proyecto: exedre/e4t

def exec_cache_COUNT(options,jlist,base,jobspecfile,order,jjspec):
    """Print how many dataset elements and numbers the listed jobs involve.

    ``jlist`` maps a section name to an iterable of ``(job, kwargs)`` pairs.
    Each section name is printed as it is visited.  For every job that has
    data requests, the dataset is loaded and then processed, and three
    running totals are updated:

    * NELS -- dataset elements, loaded plus those added by processing;
    * NREW -- elements added by the processing step only;
    * NNUM -- values held by those elements (``len`` of the data of a
      ``Timeseries``, or of a list / tuple / ``numpy.ndarray``; any other
      element type contributes 0).

    The totals are printed at the end.  ``base``, ``jobspecfile``, ``order``
    and ``jjspec`` are not used by this function.
    """
    from e4t.load.jobs        import tsload_datareqs
    from e4t.timeseries       import Timeseries
    from e4t.process.jobs     import process_dataset
    from e4t.utils            import acct

    # NOTE(review): neither result is used below; the calls are kept in case
    # they have side effects (e.g. cache set-up or option validation).
    cache = PickleCache(options)
    offset = _get_offset(options,1)

    def _count_numbers(value):
        # Number of values held by one dataset element.
        if isinstance(value,Timeseries):
            return len(value._data)
        if isinstance(value,(list,tuple,np.ndarray)):
            return len(value)
        return 0

    num_els = 0
    num_nums = 0
    num_rew = 0

    for section,jobs in jlist.items():
        # Single-argument print(...) emits the same output under the
        # Python 2 print statement and the Python 3 print function.
        print(section)
        # Distinct names for the job kwargs and the dataset keys: the
        # original reused ``k`` for both.
        for job_name,job_kw in jobs:
            (datareqs,processors,outputs) = e4t.cli.get_elem_configs(job_name,**job_kw)
            if not datareqs:
                continue

            _dataset = tsload_datareqs(options,None,*datareqs)
            # Snapshot the loaded keys before processing so the elements
            # added by the processors can be told apart afterwards.
            _base_ds = set(_dataset.keys())
            for _name,value in _dataset.items():
                num_els += 1
                num_nums += _count_numbers(value)

            _dataset = process_dataset(options,_dataset,processors)
            for name in set(_dataset.keys())-_base_ds:
                num_els += 1
                num_rew += 1
                num_nums += _count_numbers(_dataset[name])

    print("NELS= %s" % num_els)
    print("NREW= %s" % num_rew)
    print("NNUM= %s" % num_nums)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: show.py Proyecto: exedre/e4t

def _solr_series_provenance(data, prcfg, proc_ds, base_ds, base_acct):
    """Describe where the series named *data* comes from.

    Returns a ``(processed, downloaded, formula, url, provider)`` tuple:

    * ``processed``  -- True when *data* is in *proc_ds* (keys added by the
      processing step);
    * ``downloaded`` -- True when *data* is in *base_ds* (keys of the loaded
      dataset);
    * ``formula``    -- the *prcfg* entry for *data* with newlines replaced
      by spaces when the series is processed and has such an entry,
      otherwise None;
    * ``url`` / ``provider`` -- taken from the *base_acct* record of a
      downloaded series, otherwise empty strings.

    A downloaded series with no accounting record is reported on stdout.
    """
    processed = data in proc_ds
    downloaded = data in base_ds
    formula = None
    url = ""
    provider = ""
    if processed and data in prcfg:
        formula = prcfg.xget(data).replace("\n", " ")
    if downloaded:
        if data not in base_acct:
            # Same output as the original Python 2 print statement.
            print("%s not in acct %s %s" % (data, base_acct.keys(), ",".join(base_ds)))
        else:
            record = base_acct.xget(data)
            provider = record["provider"]
            url = provider + "://" + record["url"]
    return processed, downloaded, formula, url, provider


def _solr_write_row(cwriter, options, kind, provider):
    """Increment the global ``PID`` and write one row to *cwriter*.

    The row is built from *kind*, *provider* and the module-level values
    (TITLE, SUBTITLE, SOURCE, PTITLE, LABEL, DATA, DOWNLOADED, URL,
    PROCESSED, FORMULA, DATAREQ, PROCESSOR, JOB, PUBLIC, PAGE, WEB, WSOURCE,
    OUTFILE, OUTPUT) as they stand at the time of the call.
    """
    global PID
    wprocessor = "" if not PROCESSOR else WSOURCE + PROCESSOR.replace(options.process_path, "")
    xlsraw = WEB + "ds/%s-raw.xls" % JOB.lower()
    xlsproc = WEB + "ds/%s-proc.xls" % JOB.lower() if PROCESSOR else ""
    xtrnal = "True" if re.match("^dstream|http|file", provider) is not None else None
    PID += 1
    cwriter.writerow(
        (
            str(PID),
            PUBLIC,
            str(PAGE),
            kind,
            JOB,
            WEB + OUTFILE,
            WSOURCE + DATAREQ,
            wprocessor,
            WSOURCE + OUTPUT,
            xlsraw,
            xlsproc,
            TITLE,
            SUBTITLE,
            SOURCE,
            PTITLE,
            LABEL,
            DATA,
            DOWNLOADED,
            provider,
            URL,
            PROCESSED,
            FORMULA,
            xtrnal,
        )
    )


def exec_show_job_SOLR(options, cwriter, *args, **kw):
    """Write one row per series of a figure/table job to *cwriter*.

    The job name is ``args[0]`` and is stored in ``options.job``.  The
    function returns without writing anything when the job name matches the
    skip pattern, when the job has no data requests, or when its KIND is
    missing or is neither a figure nor a table.

    For a figure, every entry of SERIES, LEFT SERIES and RIGHT SERIES of
    each panel produces a row.  For a table, every series of the data
    matrix returned by ``_extract_data_matrix_def`` produces a row, titled
    after the corresponding configured MODEL.

    Side effects: the module-level names listed in the ``global`` statement
    are overwritten as rows are produced, and ``PID`` is incremented once
    per row.

    Raises:
        ValueError: ``"E:MKEL:001"`` when the output specification file is
            found neither as given nor under ``options.process_path``.
    """
    global SOURCE, TITLE, SUBTITLE, PTITLE, LABEL, DATA, FORMULA, URL, PROCESSED, DOWNLOADED, DATAREQ, PROCESSOR, PID
    # NOTE(review): '|' binds loosest, so this matches "<prefix>struct" at the
    # start, OR a name starting with "cfa"/"cpt", OR one starting with
    # "uslms" and ending there.  Left unchanged -- confirm whether
    # "^((...)struct|cfa|cpt|uslms)$" was intended.
    if re.match("^(|us|jp|uk|in|ch|br|ru|po|tu)struct|cfa|cpt|uslms$", args[0], re.I):
        return
    setattr(options, "job", args[0])
    kw = udict(kw)
    (datareqs, processors, outputs) = e4t.cli.get_elem_configs(*args, **kw)
    if not datareqs:
        return

    DATAREQ = datareqs[0].replace(options.process_path, "") if datareqs[0] else ""
    _dataset = tsload_datareqs(options, None, *datareqs, **kw)

    # Reset PROCESSOR so a job without processors does not report the
    # processor left in the module global by a previously handled job.
    PROCESSOR = ""
    prcfg = {}
    if processors:
        pr = processors[0]
        PROCESSOR = pr
        prcfg = udict()
        # Resolve the processor file the same way as the output file below:
        # as given first, then relative to process_path.  The original
        # tested ``exists(pr)`` before joining, so a path that only exists
        # under process_path was never loaded.
        if not exists(pr):
            pr = join(options.process_path, pr)
        if exists(pr):
            prcfg = cfg2hash(pr).xget_dict("PROCESSOR")

    ou = outputs[0]
    if not exists(ou):
        ou = join(options.process_path, ou)
        if not exists(ou):
            # Report the missing *output* file.  The original logged ``pr``,
            # which names the processor file and is undefined when the job
            # has no processors.
            logger.error("E01#001: Il file di specifica dell'output %s non esiste", ou)
            raise ValueError("E:MKEL:001")
    wcfg = cfg2hash(ou)
    jcfg = wcfg.xget_dict(options.job)
    SOURCE = clean_latex(jcfg.xget("SOURCE").replace("\n", " ") if "SOURCE" in jcfg else None)
    TITLE = clean_latex(jcfg.xget("TITLE").replace("\n", " ") if "TITLE" in jcfg else None)
    SUBTITLE = clean_latex(jcfg.xget("SUBTITLE").replace("\n", " ") if "SUBTITLE" in jcfg else None)

    KIND = jcfg.xget("KIND")
    # Only figures and tables are exported.  A missing KIND is skipped too
    # rather than crashing inside re.match.
    if not KIND or not re.match("figure|table", KIND, re.I):
        return

    # Keys present before processing are "downloaded"; keys that appear
    # afterwards are "processed".
    _base_ds = set(_dataset.keys())
    _dataset = process_dataset(options, _dataset, processors, *args, **kw)

    _base_acct = udict()
    _base_acct.update(rework_acct())

    _proc_ds = set(_dataset.keys()) - _base_ds

    if re.match("figure", KIND, re.I):
        for panel in jcfg.xget_list("PANELS"):
            pcfg = wcfg.xget_dict(panel)
            PTITLE = pcfg.xget("TITLE").replace("\n", " ") if "TITLE" in pcfg else None
            PTITLE = clean_latex(PTITLE)

            series = []
            for key in ("SERIES", "LEFT SERIES", "RIGHT SERIES"):
                found = pcfg.xget_list(key)
                if found:
                    series.extend(found)

            for serie in series:
                LABEL = serie
                DATA = serie
                # Start every row from a clean state: the original left
                # PROCESSED / DOWNLOADED at the previous row's values when
                # the series had no config section of its own.
                PROCESSED = None
                DOWNLOADED = None
                FORMULA = None
                URL = ""
                provider = ""
                scfg = wcfg.xget_dict(serie)
                if scfg:
                    LABEL = clean_latex(scfg.xget("LABEL"))
                    DATA = scfg.xget("DATA")
                    PROCESSED, DOWNLOADED, FORMULA, URL, provider = _solr_series_provenance(
                        DATA, prcfg, _proc_ds, _base_ds, _base_acct
                    )
                PROCESSED = str(PROCESSED) if PROCESSED else None
                DOWNLOADED = str(DOWNLOADED) if DOWNLOADED else None
                _solr_write_row(cwriter, options, KIND, provider)
        return

    # KIND matched "figure|table" above and is not a figure, so it is a
    # table.  The original's trailing "multiple" and error branches could
    # never be reached after the early return and were dropped.
    blocks = jcfg.xget_list("BLOCK")
    if not blocks:
        return
    _orient, _m, _n, ELS = _extract_data_matrix_def(prcfg, _dataset)

    # One title per configured model, in block order; the i-th title is
    # used for the rows of the i-th entry of ELS.
    TITLES = []
    for b in blocks:
        bcfg = wcfg.xget_dict(b)
        models = bcfg.xget_list("MODEL")
        PTITLE = bcfg.xget("TITLE").replace("\n", " ") if "TITLE" in bcfg else None
        if PTITLE:
            PTITLE = clean_latex(PTITLE)
        else:
            PTITLE = TITLE
        for mod in models:
            mod = clean_model(mod)
            if not mod:
                continue
            # NOTE(review): LABEL and PTITLE are not updated again while the
            # rows are written below, so every table row carries the values
            # left by the last model / block -- confirm this is intended.
            LABEL = mod
            DATA = mod
            FORMULA = None
            URL = ""
            scfg = wcfg.xget_dict(mod)
            if scfg:
                TITLES.append(clean_latex(scfg.xget("TITLE")))

    for line, row_title in enumerate(TITLES):
        for _idx, serie in ELS[line]:
            DATA = serie
            # The helper recomputes formula / url / provider from scratch,
            # so values from the previous row no longer leak into this one.
            PROCESSED, DOWNLOADED, FORMULA, URL, provider = _solr_series_provenance(
                DATA, prcfg, _proc_ds, _base_ds, _base_acct
            )
            PROCESSED = str(PROCESSED) if PROCESSED else None
            DOWNLOADED = str(DOWNLOADED) if DOWNLOADED else None
            TITLE = row_title
            _solr_write_row(cwriter, options, KIND, provider)