def cli(ctx, ed_dir, log_level):
    """
    Command Line Interface for the Energy Dashboard. This tooling 
    collects information from a number of data feeds, imports that data, 
    transforms it, and inserts it into a database.
    """
    # pass this logger as a child logger to the edl methods
    log.configure_logging()
    logger = logging.getLogger(__name__)
    logger.setLevel(log_level)
    log.debug(
        logger, {
            "name": __name__,
            "method": "cli",
            "ed_dir": ed_dir,
            "log_level": "%s" % log_level
        })

    if ed_dir is None:
        ed_dir = os.path.curdir
    else:
        if not os.path.exists(ed_dir):
            log.critical(
                logger, {
                    "name": __name__,
                    "method": "cli",
                    "ed_dir": ed_dir,
                    "CRITICAL": "ed_dir does not exist"
                })
    eddir = os.path.abspath(os.path.expanduser(ed_dir))
    ctx.obj = {LOGGER: logger, EDDIR: eddir}
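For illustration, a minimal sketch of how a subcommand might consume the context object assembled by cli(); it assumes click is the CLI framework (implied by ctx) and that LOGGER and EDDIR are the module-level keys used above. The "status" command name is hypothetical and its registration with the command group is omitted.
import click

@click.command()
@click.pass_obj
def status(obj):
    # obj is the dict built in cli(): {LOGGER: logger, EDDIR: eddir}
    logger = obj[LOGGER]
    eddir = obj[EDDIR]
    logger.info("energy dashboard directory: %s", eddir)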
def prune(logger, feed, ed_path, stage):
    chlogger = logger.getChild(__name__)
    p = pre_prune(logger, feed, ed_path, stage)
    ext = STAGE_DIRS[stage]
    ending = ".%s" % ext
    try:
        files = filesystem.glob_dir(p, ending)
        count = 0
        for f in files:
            os.remove(os.path.join(p, f))
            count += 1
        log.debug(
            chlogger, {
                "name": __name__,
                "method": "prune",
                "path": ed_path,
                "feed": feed,
                "target_dir": p,
                "ending": ending,
                "removed": count,
                "message": "pruned target_dir",
            })
    except Exception as e:
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "prune",
                "path": ed_path,
                "feed": feed,
                "target_dir": p,
                "ending": ending,
                "ERROR": "failed to prune target_dir",
                "exception": str(e)
            })
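prune() depends on a STAGE_DIRS lookup that does not appear in these examples; the sketch below is purely hypothetical and only illustrates how a stage name becomes the file extension that gets globbed and removed.
# Hypothetical STAGE_DIRS; the real mapping in edl may use different keys and values.
STAGE_DIRS = {
    "download": "zip",   # prune(..., stage="download") would remove *.zip files
    "parse": "sql",
    "insert": "db",
}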
Example No. 3
def run(logger, manifest, config):
    start_date = datetime.date(*manifest['start_date'])
    resource_name = manifest['name']
    resource_url = manifest['url']
    delay = manifest['download_delay_secs']
    download_dir = config['working_dir']
    state_file = config['state_file']
    # sleep for N seconds between downloads to meet CAISO expected use requirements
    dates = xtime.range_pairs(xtime.day_range_to_today(start_date))
    urls = list(web.generate_urls(logger, dates, resource_url))
    log.debug(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "url": resource_url,
            "delay": delay,
            "download_dir": download_dir,
            "state_file": state_file,
            "start_date": str(start_date),
            "urls_count": len(urls),
        })
    state.update(
        web.download(logger, resource_name, delay, urls, state_file,
                     download_dir), state_file)
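run() reads its inputs from a manifest and a config dict (loaded elsewhere from manifest.json and a config() helper). The keys below mirror the lookups in run(); the values are placeholders, not real feed settings.
# Illustrative shapes only; every value here is a placeholder.
manifest = {
    "name": "example-feed",
    "url": "http://example.invalid/report?startdate=_START_&enddate=_END_",
    "start_date": [2019, 9, 1],        # unpacked into datetime.date(*...)
    "download_delay_secs": 5,          # polite delay between requests
}
config = {
    "working_dir": "zip",              # where downloads are written
    "state_file": "zip/state.txt",     # one already-processed url per line
}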
def insert(logger, resource_name, sql_dir, db_dir, new_files):
    chlogger = logger.getChild(__name__)
    with DbMgr(chlogger, resource_name) as dbmgr:
        new_files_count = len(new_files)
        if not os.path.exists(db_dir):
            os.makedirs(db_dir)
        if not os.path.exists(db_dir):
            raise Exception("Failed to create db_dir: %s" % db_dir)
        log.info(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert",
            "sql_dir"   : sql_dir,
            "db_dir"    : db_dir,
            "new_files" : new_files_count,
            })
        for (idx, sql_file_name) in enumerate(new_files):
            yield insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth=0, max_depth=5)
       
        save_dir        = os.path.join(os.path.dirname(db_dir), "save")
        save_state_file = os.path.join(save_dir, "state.txt")
        db_files        = filesystem.glob_dir(db_dir, ".db")
        with open(save_state_file, 'w') as f:
            for dbf in db_files:
                f.write("%s\n" % dbf)
                log.debug(logger, {
                    "name"      : __name__,
                    "method"    : "insert",
                    "resource"  : resource_name,
                    "db_file"   : dbf,
                    "state_file": save_state_file,
                    "message"   : "added db_file to state file",
                    })
def archive_locally(logger, feed, ed_path, archivedir):
    chlogger = logger.getChild(__name__)
    try:
        archivedir1 = os.path.expanduser(archivedir)
        if archivedir1.startswith("/"):
            archivedir2 = archivedir1
        else:
            archivedir2 = os.path.join(ed_path, archivedir1)

        archive_name = os.path.join(archivedir2, feed)
        root_dir = os.path.expanduser(os.path.join(ed_path, 'data', feed))
        log.debug(
            chlogger, {
                "name": __name__,
                "method": "archive_locally",
                "path": ed_path,
                "feed": feed,
                "target_dir": archivedir2,
                "archive_name": archive_name,
                "root_dir": root_dir
            })
        return make_archive(archive_name, 'gztar', root_dir)
    except Exception as e:
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "archive_locally",
                "path": ed_path,
                "feed": feed,
                "target_dir": archivedir2,
                "archive_name": archive_name,
                "root_dir": root_dir,
                "ERROR": "make archive failed",
                "exception": str(e)
            })
Example No. 6
def generate_urls(logger, date_pairs, url_template, date_format="%Y%m%d"):
    """
    Generate download urls for the provided date_pairs.

    date_pairs      : list of tuples with (start, end) dates
    url_template    : contains _START_ and _END_ placeholders, which will be
                      replaced by the (start, end) dates formatted with date_format
    date_format     : format string for the start and end dates

    TODO/BIKESHED   : replace _X_ with Jinja mustache templates {{}}
    """
    chlogger = logger.getChild(__name__)
    for (start, end) in date_pairs:
        s = start.strftime(date_format)
        e = end.strftime(date_format)
        url = url_template.replace("_START_", s).replace("_END_", e)
        log.debug(
            chlogger, {
                "name": __name__,
                "method": "generate_urls",
                "start": s,
                "end": e,
                "url": url
            })
        yield url
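A short usage sketch of generate_urls(): each (start, end) pair is formatted with date_format and substituted for the _START_ and _END_ placeholders. The template, dates, and logger here are placeholders.
import datetime

pairs = [(datetime.date(2019, 9, 1), datetime.date(2019, 9, 2))]
template = "http://example.invalid/data?start=_START_&end=_END_"
for url in generate_urls(logger, pairs, template):
    print(url)  # http://example.invalid/data?start=20190901&end=20190902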
def list(logger, energy_dashboard_path):
    chlogger = logger.getChild(__name__)
    log.debug(chlogger, {
        "name": __name__,
        "method": "list",
        "path": energy_dashboard_path
    })
    return os.listdir(os.path.join(energy_dashboard_path, "data"))
Example No. 8
def clone(logger, ed_path):
    cmd = "git clone https://github.com/energy-analytics-project/energy-dashboard.git"
    log.debug(logger, {
        "name": __name__,
        "method": "init",
        "path": ed_path,
        "cmd": cmd,
    })
    return runyield(cmd, ed_path)
Example No. 9
def update(logger, ed_path):
    cmd = "git submodule update --init --recursive"
    log.debug(logger, {
        "name": __name__,
        "method": "update",
        "path": ed_path,
        "cmd": cmd,
    })
    return runyield(cmd, ed_path)
def __exit__(self, type, value, traceback):
    for k, v in self.dbs.items():
        v.close()
        log.debug(self.logger, {
            "name"      : __name__,
            "src"       : self.resource_name,
            "method"    : "DbMgr.__exit__",
            "db_path"   : k,
            "message"   : "Closed in memory db",
            })
def reset(logger, feed, ed_path, stage):
    chlogger = logger.getChild(__name__)
    p = pre_reset(logger, feed, ed_path, stage)
    try:
        if os.path.exists(p):
            shutil.rmtree(p)
            log.debug(
                chlogger, {
                    "name": __name__,
                    "method": "reset",
                    "path": ed_path,
                    "feed": feed,
                    "target_dir": p,
                    "message": "removed target_dir",
                })
    except Exception as e:
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "reset",
                "path": ed_path,
                "feed": feed,
                "target_dir": p,
                "ERROR": "failed to remove target_dir",
                "exception": str(e)
            })
    try:
        if not os.path.exists(p):
            os.makedirs(p)
            log.debug(
                chlogger, {
                    "name": __name__,
                    "method": "reset",
                    "path": ed_path,
                    "feed": feed,
                    "target_dir": p,
                    "message": "makedirs target_dir",
                })
    except Exception as e:
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "reset",
                "path": ed_path,
                "feed": feed,
                "target_dir": p,
                "ERROR": "failed to makedirs target_dir",
                "exception": str(e)
            })
    return p
def src_files(logger, feed, ed_path):
    chlogger = logger.getChild(__name__)
    feed_dir = os.path.join(ed_path, 'data', feed)
    src_dir = os.path.join(feed_dir, 'src')
    src_files = sorted(os.listdir(src_dir))
    log.debug(
        chlogger, {
            "name": __name__,
            "method": "src_files",
            "path": ed_path,
            "feed": feed,
            "feed_dir": feed_dir,
            "src_dir": src_dir,
            "src_files": src_files
        })
    return src_files
def process_file(logger, feed, ed_path, src_file):
    chlogger = logger.getChild(__name__)
    feed_dir = os.path.join(ed_path, 'data', feed)
    rel_path = os.path.join("src", src_file)
    cmd = "%s %s" % (rel_path,
                     log.LOGGING_LEVEL_STRINGS[chlogger.getEffectiveLevel()])

    log.debug(
        chlogger, {
            "name": __name__,
            "method": "process_file",
            "path": ed_path,
            "feed": feed,
            "cmd": cmd
        })
    return runyield(cmd, feed_dir)
def process_stages(logger, feed, ed_path, stages):
    chlogger = logger.getChild(__name__)
    stage_files = [STAGE_PROCS[s] for s in stages]
    found_src_files = src_files(logger, feed, ed_path)
    for sf in stage_files:
        if sf in found_src_files:
            yield process_file(logger, feed, ed_path, sf)
        else:
            log.debug(
                chlogger, {
                    "name": __name__,
                    "method": "process_stages",
                    "path": ed_path,
                    "feed": feed,
                    "stage_file": sf,
                    "src_files": src_files,
                    "ERROR": "stage_file not in src_files"
                })
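process_stages() translates stage names into the numbered scripts under src/ via STAGE_PROCS, which is not shown here. A hypothetical sketch using the script names that appear in the feed template later in these examples; the actual stage keys may differ.
# Hypothetical STAGE_PROCS; script names come from the feed template, the keys are guesses.
STAGE_PROCS = {
    "download": "10_down.py",
    "unzip":    "20_unzp.py",
    "parse":    "30_pars.py",
    "insert":   "40_inse.py",
    "save":     "50_save.py",
    "dist":     "60_dist.sh",
    "archive":  "70_arch.py",
}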
def get(self, db_path):
    log.debug(self.logger, {
        "name"      : __name__,
        "src"       : self.resource_name,
        "method"    : "DbMgr.get",
        "db_path"   : db_path,
        })
    if db_path not in self.dbs:
        db = MemDb(db_path)
        cnx = db.open()
        self.dbs[db_path] = db
        log.debug(self.logger, {
            "name"      : __name__,
            "src"       : self.resource_name,
            "method"    : "DbMgr.get",
            "db_path"   : db_path,
            "message"   : "Created in memory db",
            })
    return self.dbs[db_path].db()
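DbMgr is used as a context manager (see insert() above): get() lazily opens and caches one in-memory database per db_path, and __exit__ closes them all. A minimal usage sketch with placeholder names:
with DbMgr(logger, "example-feed") as dbmgr:
    cnx = dbmgr.get("db/example-feed_0.db")   # opened on first request, cached afterwards
    cnx.executescript("CREATE TABLE IF NOT EXISTS example (x INTEGER);")
# every database opened through get() is closed when the with-block exits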
def parse_text_file(logger, resource_name, txt_dir, sql_dir, f):
    input_file = os.path.join(txt_dir, f)
    (dict_renewable, dict_total) = read_file_name(input_file)
    renewable_sql = gen_renewable_sql(dict_renewable)
    total_sql = gen_total_sql(dict_total)
    (f_name, f_ext) = os.path.splitext(f)
    output_file = os.path.join(sql_dir, "%s.sql" % f_name)
    with open(output_file, 'w') as sqlfile:
        for line in renewable_sql:
            sqlfile.write("%s\n" % line)
        for line in total_sql:
            sqlfile.write("%s\n" % line)
    log.debug(
        logger, {
            "name": __name__,
            "method": "parse_text_file",
            "src": "30_pars.py",
            "input": input_file,
            "output": output_file,
        })
    return f
def run(logger, manifest, config):
    resource_name   = manifest['name']
    resource_url    = manifest['url']
    xml_dir         = config['source_dir']
    sql_dir         = config['working_dir']
    state_file      = config['state_file']
    new_files = state.new_files(resource_name, state_file, xml_dir, '.xml')
    log.debug(logger, {
        "name"      : __name__,
        "method"    : "run",
        "resource"  : resource_name,
        "url"       : resource_url,
        "xml_dir"   : xml_dir,
        "sql_dir"   : sql_dir,
        "state_file": state_file,
        "new_files_count" : len(new_files),
        })
    state.update(
            xmlparser.parse(logger, resource_name, new_files, xml_dir, sql_dir), 
            state_file)
def run(logger, manifest, config):
    resource_name = manifest['name']
    resource_url = manifest['url']
    txt_dir = config['source_dir']
    sql_dir = config['working_dir']
    state_file = config['state_file']
    new_files = state.new_files(resource_name, state_file, txt_dir,
                                'DailyRenewablesWatch.txt')
    log.debug(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "url": resource_url,
            "txt_dir": txt_dir,
            "sql_dir": sql_dir,
            "state_file": state_file,
            "new_files_count": len(new_files),
        })
    state.update(
        parse_text_files(logger, resource_name, new_files, txt_dir, sql_dir),
        state_file)
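The run() functions above track progress through a plain-text state file: state.new_files() returns the files in a directory that are not yet recorded, and state.update() appends whatever was just processed. A rough sketch of that contract (not the actual edl.state implementation):
import os

def new_files(resource_name, state_file, src_dir, ending):
    # files in src_dir matching `ending` that are not yet listed in state_file
    # (resource_name is unused in this sketch)
    done = set()
    if os.path.exists(state_file):
        with open(state_file) as f:
            done = {line.rstrip() for line in f}
    return [f for f in sorted(os.listdir(src_dir))
            if f.endswith(ending) and f not in done]

def update(processed, state_file):
    # append newly processed items (skipping None results) to state_file
    with open(state_file, "a") as f:
        for item in processed:
            if item is not None:
                f.write("%s\n" % item)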
def invoke(logger, feed, ed_path, command):
    chlogger = logger.getChild(__name__)
    target_dir = os.path.join(ed_path, 'data', feed)
    log.debug(
        chlogger, {
            "name": __name__,
            "method": "invoke",
            "path": ed_path,
            "feed": feed,
            "command": command
        })
    if not os.path.exists(target_dir):
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "invoke",
                "path": ed_path,
                "feed": feed,
                "command": command,
                "target_dir": target_dir,
                "ERROR": "target_dir does not exist"
            })
    else:
        return runyield([command], target_dir)
def process_all_stages(logger, feed, ed_path):
    chlogger = logger.getChild(__name__)
    found_src_files = src_files(logger, feed, ed_path)
    if len(found_src_files) < 1:
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "process_all_stages",
                "path": ed_path,
                "feed": feed,
                "src_files": found_src_files,
                "ERROR": "No files found, nothing to process"
            })
        return
    log.debug(
        chlogger, {
            "name": __name__,
            "method": "process_all_stages",
            "path": ed_path,
            "feed": feed,
            "src_files": found_src_files
        })
    for src_file in found_src_files:
        yield process_file(logger, feed, ed_path, src_file)
    for output in clifeed.archive_to_s3(logger, resource_name, ed_path, "digitalocean", digitalocean_bandwidth_limit):
        log.info(logger, {
            "name"      : __name__,
            "method"    : "run",
            "resource"  : resource_name,
            "service"   : "digitalocean",
            "stdout"   : str(output),
            })
    shutil.rmtree(os.path.join(os.curdir, 'dist'))

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) > 1:
        loglevel = sys.argv[1]
    else:
        loglevel = "INFO"
    log.configure_logging()
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    log.debug(logger, {
        "name"      : __name__,
        "method"    : "main",
        "src"       : "70_arch.py"
        })
    with open('manifest.json', 'r') as json_file:
        m = json.load(json_file)
        run(logger, m, config())
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "sql_dir": sql_dir,
            "db_dir": db_dir,
            "state_file": state_file,
            "new_files_count": len(new_files),
            "message": "finished processing sql files",
        })


# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) > 1:
        loglevel = sys.argv[1]
    else:
        loglevel = "INFO"
    log.configure_logging()
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    log.debug(logger, {
        "name": __name__,
        "method": "main",
        "src": "40_inse.py"
    })
    with open('manifest.json', 'r') as json_file:
        m = json.load(json_file)
        run(logger, m, config())
            "url": resource_url,
            "txt_dir": txt_dir,
            "sql_dir": sql_dir,
            "state_file": state_file,
            "new_files_count": len(new_files),
        })
    state.update(
        parse_text_files(logger, resource_name, new_files, txt_dir, sql_dir),
        state_file)


# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) > 1:
        loglevel = sys.argv[1]
    else:
        loglevel = "INFO"
    log.configure_logging()
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    log.debug(logger, {
        "name": __name__,
        "method": "main",
        "src": "30_pars.py"
    })
    with open('manifest.json', 'r') as json_file:
        m = json.load(json_file)
        run(logger, m, config())
Example No. 24
    log.debug(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "db_dir": db_dir,
            "save_dir": db_dir,
            "state_file": state_file,
            "message": "finished saving state",
        })


# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) > 1:
        loglevel = sys.argv[1]
    else:
        loglevel = "INFO"
    log.configure_logging()
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    log.debug(logger, {
        "name": __name__,
        "method": "main",
        "src": "50_save.py"
    })
    with open('manifest.json', 'r') as json_file:
        m = json.load(json_file)
        run(logger, m, config())
Example No. 25
def run(logger, manifest, config):
    start_date = datetime.date(*manifest['start_date'])
    resource_name = manifest['name']
    resource_url = manifest['url']
    delay = manifest['download_delay_secs']
    download_dir = config['working_dir']
    txt_dir = config['source_dir']
    state_file = config['state_file']
    # sleep for N seconds between downloads to meet CAISO expected use requirements
    dates = xtime.range_pairs(xtime.day_range_to_today(start_date))
    urls = list(web.generate_urls(logger, dates, resource_url))
    log.debug(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "url": resource_url,
            "delay": delay,
            "download_dir": download_dir,
            "state_file": state_file,
            "start_date": str(start_date),
            "urls_count": len(urls),
        })

    # download .txt files
    downloaded_txt_urls = web.download(logger,
                                       resource_name,
                                       delay,
                                       urls,
                                       state_file,
                                       download_dir,
                                       ending='.txt')

    # copy .txt files to the ./txt dir and then
    # compress the original .txt files to .zip files
    if not os.path.exists(txt_dir):
        log.debug(
            logger, {
                "name": __name__,
                "method": "run",
                "src": "10_down.py",
                "message": "created target txt dir: %s" % txt_dir,
            })
        os.makedirs(txt_dir)

    # process downloaded .txt files
    data_files = glob.glob(
        os.path.join(download_dir, "*DailyRenewablesWatch.txt"))
    for tf in data_files:
        try:
            # remove write protections for .txt files
            os.chmod(
                os.path.join(download_dir, tf),
                S_IWRITE | S_IWGRP | S_IWOTH | S_IREAD | S_IRGRP | S_IROTH)

            # if the txt file is here, it needs to be copied to the ./txt dir
            fqtf = os.path.join(download_dir, tf)
            fqtf2 = os.path.join(txt_dir, tf)
            fqtfzip = os.path.join(download_dir, '%s.zip' % tf)
            if not os.path.exists(fqtf2):
                shutil.copyfile(fqtf, fqtf2)
            with zipfile.ZipFile(fqtfzip, 'w') as myzip:
                # archive the original .txt file (not the zip itself)
                myzip.write(fqtf)

            # set .zip file to be read only
            os.chmod(fqtfzip, S_IREAD | S_IRGRP | S_IROTH)

            # remove the zip/.txt file as it's been copied to txt/.txt
            if os.path.exists(fqtf2) and os.path.exists(fqtfzip):
                os.remove(fqtf)
            log.debug(
                logger, {
                    "name": __name__,
                    "method": "run",
                    "src": "10_down.py",
                    "message": "zipped file: %s" % tf,
                })
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "run",
                    "src": "10_down.py",
                    "file": tf,
                    "error": "failed to process file",
                    "exception": str(e),
                })

    # TODO: something is clobbering perms on the state file, so clobber it back
    os.chmod(os.path.join(download_dir, 'state.txt'),
             S_IWRITE | S_IWGRP | S_IWOTH | S_IREAD | S_IRGRP | S_IROTH)
    # final step
    state.update(downloaded_txt_urls, state_file)
def create(logger, ed_path, feed, maintainer, company, email, url, start_date,
           delay):
    """
    start_date : list of numbers : [2019, 9, 1]
    """
    chlogger = logger.getChild(__name__)
    new_feed_dir = os.path.join(ed_path, 'data', feed)
    try:
        os.mkdir(new_feed_dir)
        log.debug(
            chlogger, {
                "name": __name__,
                "method": "create",
                "path": ed_path,
                "feed": feed,
                "dir": new_feed_dir,
                "message": "created directory"
            })
        template_files = [
            "LICENSE", "Makefile", "README.md", "src/10_down.py",
            "src/20_unzp.py", "src/30_pars.py", "src/40_inse.py",
            "src/50_save.py", "src/60_dist.sh", "src/70_arch.py",
            "manifest.json"
        ]
        env = Environment(loader=PackageLoader('edl', 'templates'),
                          autoescape=select_autoescape(['py']))
        m = {
            'NAME': feed,
            'MAINTAINER': maintainer,
            'COMPANY': company,
            'EMAIL': email,
            'DATA_URL': url,
            'REPO_URL':
            "https://github.com/energy-analytics-project/%s" % feed,
            'START': start_date,
            'DELAY': delay
        }
        for tf in template_files:
            template = env.get_template(tf)
            target = os.path.join(new_feed_dir, tf)
            path = os.path.dirname(target)
            if not os.path.exists(path):
                os.makedirs(path)
            with open(target, 'w') as f:
                f.write(template.render(m))
                log.debug(
                    chlogger, {
                        "name": __name__,
                        "method": "create",
                        "path": ed_path,
                        "feed": feed,
                        "target": target,
                        "message": "rendered target"
                    })

        hidden_files = ['gitignore']
        for hf in hidden_files:
            template = env.get_template(hf)
            target = os.path.join(new_feed_dir, ".%s" % hf)
            with open(target, 'w') as f:
                f.write(template.render(m))
                log.debug(
                    chlogger, {
                        "name": __name__,
                        "method": "create",
                        "path": ed_path,
                        "feed": feed,
                        "target": target,
                        "message": "rendered target"
                    })
        for src_file in os.listdir(os.path.join(new_feed_dir, 'src')):
            fp = os.path.join(new_feed_dir, 'src', src_file)
            f = Path(fp)
            f.chmod(f.stat().st_mode | stat.S_IEXEC)
            log.debug(
                chlogger, {
                    "name": __name__,
                    "method": "create",
                    "path": ed_path,
                    "feed": feed,
                    "file": fp,
                    "message": "chmod +x"
                })

        for d in DIRS:
            os.makedirs(os.path.join(new_feed_dir, d))
        return feed
    except Exception as e:
        tb = traceback.format_exc()
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "create",
                "path": ed_path,
                "feed": feed,
                "ERROR": "FAILED to create feed",
                "exception": str(e),
                "trace": str(tb),
            })
Example No. 27
def download(logger,
             resource_name,
             delay,
             urls,
             state_file,
             path,
             ending=".zip"):
    """
    urls        : list of urls to download
    state_file  : list of urls that have already been downloaded
    path        : path to write downloaded files to
    """
    chlogger = logger.getChild(__name__)
    downloaded = []
    prev_downloaded = set()
    if os.path.exists(state_file):
        with open(state_file, "r") as f:
            prev_downloaded = set([line.rstrip() for line in f])

    status = {'manifest': 0, 'filesystem': 0, 'downloaded': 0, 'error': 0}

    for url in urls:
        try:
            filename = filesystem.url2filename(url, ending=ending)
            if url in prev_downloaded:
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'url exists in download manifest'
                    })
                status['manifest'] += 1
                continue
            target_file = os.path.join(path, filename)
            if os.path.exists(target_file):
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'file exists locally, updating manifest'
                    })
                # update the state_file with files that were found on disk
                downloaded.append(url)
                status['filesystem'] += 1
                continue
            r = requests.get(url)
            if r.status_code == 200:
                with open(target_file, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                downloaded.append(url)
                status['downloaded'] += 1
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename
                    })
            else:
                log.error(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename,
                        "status_code": r.status_code,
                        "ERROR": 'http_request_failed'
                    })
        except Exception as e:
            log.error(
                chlogger, {
                    "src": resource_name,
                    "action": 'download',
                    "url": url,
                    "ERROR": "http_request_failed",
                    "exception": str(e),
                    "traceback": str(tb=traceback.format_exc())
                })
            status['error'] += 1
        # TODO: this is such a hack
        time.sleep(delay)
        # ensure that all files in the download directory are read only
        for f in filesystem.glob_dir(path, ending):
            os.chmod(os.path.join(path, f), S_IREAD | S_IRGRP | S_IROTH)
        log.info(chlogger, {
                "src"                   : resource_name,
                "action"                : 'download',
                "url"                   : url,
                'skipped_in_manifest'   : status['manifest'],
                'skipped_in_filesystem' : status['filesystem'],
                'downloaded'            : status['downloaded'],
                'error'                 : status['error'],
                })
    return downloaded
def insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth, max_depth):
    chlogger    = logger.getChild(__name__)
    db_name     = gen_db_name(resource_name, depth)
    sql_file    = os.path.join(sql_dir, sql_file_name)
    db_file     = os.path.join(db_dir, db_name)
    if depth > max_depth:
        log.error(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert_file",
            "db_file"   : db_file,
            "file_idx"  : idx,
            "sql_file"  : sql_file,
            "depth"     : depth,
            "max_depth" : max_depth,
            "dbmgr"     : str(dbmgr),
            "ERROR"     :"insert sql_file failed, max_depth exceeded",
            })
        return

    log.info(chlogger, {
        "name"      : __name__,
        "src"       : resource_name,
        "method"    : "insert_file",
        "db_file"   : db_file,
        "file_idx"  : idx,
        "sql_file"  : sql_file,
        "depth"     : depth,
        "dbmgr"     : str(dbmgr),
        "message"   : "started",
        })
        
    cnx = dbmgr.get(db_file)
    try:
        with open(sql_file, 'r') as sf:
            log.debug(chlogger, {
                "name"      : __name__,
                "src"       : resource_name,
                "method"    : "insert_file",
                "db_file"   : db_file,
                "file_idx"  : idx,
                "sql_file"  : sql_file,
                "depth"     : depth,
                "dbmgr"     : str(dbmgr),
                "message"   : "started",
                })
            cnx.executescript(sf.read())
            log.debug(chlogger, {
                "name"      : __name__,
                "src"       : resource_name,
                "method"    : "insert_file",
                "db_file"   : db_file,
                "file_idx"  : idx,
                "sql_file"  : sql_file,
                "depth"     : depth,
                "dbmgr"     : str(dbmgr),
                "message"   : "completed",
                })
        return sql_file_name
    except Exception as e:
        log.error(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert_file",
            "file_idx"  : idx,
            "db_file"   : db_file,
            "sql_file"  : sql_file,
            "depth"     : depth,
            "dbmgr"     : str(dbmgr),
            "ERROR"     : "insert sql_file failed",
            "exception": str(e),
            })
        return insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth+1, max_depth)
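insert_file() retries a failing SQL script against a fresh database at the next depth, so gen_db_name() must produce a distinct .db file per (resource, depth). The helper is not shown in these examples; a plausible sketch:
# Plausible sketch only; the real gen_db_name in edl may format names differently.
def gen_db_name(resource_name, depth):
    if depth == 0:
        return "%s.db" % resource_name
    return "%s_%02d.db" % (resource_name, depth)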

# -----------------------------------------------------------------------------
# Entrypoint
# -----------------------------------------------------------------------------
def run(logger, manifest, config):
    # nothing to do, text files have already been copied to ./txt by
    # 10_down.py.
    pass


# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) > 1:
        loglevel = sys.argv[1]
    else:
        loglevel = "INFO"
    log.configure_logging()
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    log.debug(logger, {
        "name": __name__,
        "method": "main",
        "src": "20_unzp.py"
    })
    with open('manifest.json', 'r') as json_file:
        m = json.load(json_file)
        run(logger, m, config())
Example No. 30
            "delay": delay,
            "download_dir": download_dir,
            "state_file": state_file,
            "start_date": str(start_date),
            "urls_count": len(urls),
        })
    state.update(
        web.download(logger, resource_name, delay, urls, state_file,
                     download_dir), state_file)


# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) > 1:
        loglevel = sys.argv[1]
    else:
        loglevel = "INFO"
    log.configure_logging()
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    log.debug(logger, {
        "name": __name__,
        "method": "main",
        "src": "10_down.py"
    })
    with open('manifest.json', 'r') as json_file:
        m = json.load(json_file)
        run(logger, m, config())