def insert(logger, resource_name, sql_dir, db_dir, new_files):
    chlogger = logger.getChild(__name__)
    with DbMgr(chlogger, resource_name) as dbmgr:
        new_files_count = len(new_files)
        if not os.path.exists(db_dir):
            os.makedirs(db_dir)
        if not os.path.exists(db_dir):
            raise Exception("Failed to create db_dir: %s" % db_dir)
        log.info(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert",
            "sql_dir"   : sql_dir,
            "db_dir"    : db_dir,
            "new_files" : new_files_count,
        })
        for (idx, sql_file_name) in enumerate(new_files):
            yield insert_file(logger, resource_name, dbmgr, sql_dir, db_dir,
                              sql_file_name, idx, depth=0, max_depth=5)
        save_dir        = os.path.join(os.path.dirname(db_dir), "save")
        save_state_file = os.path.join(save_dir, "state.txt")
        db_files        = filesystem.glob_dir(db_dir, ".db")
        with open(save_state_file, 'w') as f:
            for dbf in db_files:
                f.write("%s\n" % dbf)
                log.debug(chlogger, {
                    "name"      : __name__,
                    "method"    : "insert",
                    "resource"  : resource_name,
                    "db_file"   : dbf,
                    "state_file": save_state_file,
                    "message"   : "added db_file to state file",
                })
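# `DbMgr` above is a project-internal context manager. `insert`/`insert_file`
# only assume that it hands out one sqlite3 connection per db_file path via
# `get()` and cleans everything up on exit. The class below is a minimal,
# hypothetical sketch of that contract, not the real implementation.
import sqlite3

class DbMgrSketch:
    def __init__(self, logger, resource_name):
        self.logger = logger
        self.resource_name = resource_name
        self._connections = {}

    def get(self, db_file):
        # open (and cache) a connection the first time a db_file is requested
        if db_file not in self._connections:
            self._connections[db_file] = sqlite3.connect(db_file)
        return self._connections[db_file]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        # commit and close every connection handed out during the block
        for cnx in self._connections.values():
            cnx.commit()
            cnx.close()
        return False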
def run(logger, manifest, config):
    resource_name = manifest['name']
    sql_dir       = config['source_dir']
    db_dir        = config['working_dir']
    state_file    = config['state_file']
    new_files     = state.new_files(resource_name, state_file, sql_dir, '.sql')
    log.info(logger, {
        "name": __name__,
        "method": "run",
        "resource": resource_name,
        "sql_dir": sql_dir,
        "db_dir": db_dir,
        "state_file": state_file,
        "new_files_count": len(new_files),
        "message": "started processing sql files",
    })
    state.update(
        db.insert(logger, resource_name, sql_dir, db_dir, new_files),
        state_file)
    log.info(logger, {
        "name": __name__,
        "method": "run",
        "resource": resource_name,
        "sql_dir": sql_dir,
        "db_dir": db_dir,
        "state_file": state_file,
        "new_files_count": len(new_files),
        "message": "finished processing sql files",
    })
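# Note that `db.insert` is a generator, so `state.update` is what actually
# drives the per-file work. `state.new_files`/`state.update` are project
# helpers; the sketches below show only the behavior run() appears to assume
# (hypothetical names and details, the real module may differ): new_files
# returns files in src_dir with the given ending that are not yet recorded in
# the state file, and update appends each processed name to it.
import os

def new_files_sketch(resource_name, state_file, src_dir, ending):
    seen = set()
    if os.path.exists(state_file):
        with open(state_file) as f:
            seen = {line.rstrip() for line in f}
    return [f for f in sorted(os.listdir(src_dir))
            if f.endswith(ending) and f not in seen]

def update_sketch(processed_names, state_file):
    # processed_names may be a generator (e.g. db.insert above); iterating it
    # here is what triggers the actual inserts
    with open(state_file, 'a') as f:
        for name in processed_names:
            if name is not None:
                f.write("%s\n" % name)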
def parse_file(logger, resource_name, xml_input_file_name, input_dir, output_dir):
    chlogger = logger.getChild(__name__)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    (base, ext) = os.path.splitext(xml_input_file_name)
    outfile = os.path.join(output_dir, "%s.sql" % base)
    infile  = os.path.join(input_dir, xml_input_file_name)
    total_ddl = 0
    total_sql = 0
    with open(outfile, 'w') as outfh:
        with open(infile, 'r') as infh:
            # all the work happens here
            xst = XML2SQLTransormer(chlogger, infh).parse().scan_all()
            # check that the ddl and sql are correct:
            # if this fails, the ddl/sql combination is incorrect
            sqllst = []
            for ddl in xst.ddl():
                sqllst.append(ddl)
            for sql in xst.insertion_sql():
                sqllst.append(sql)
            sqltext = "\n".join(sqllst)
            #db = sqlite3.connect("file::memory:?cache=shared")
            db = sqlite3.connect(":memory:")
            db.executescript(sqltext)
            # all good
            outfh.write(sqltext)
            log.info(chlogger, {
                "src": resource_name,
                "action": "parse_file",
                "infile": infile,
                "outfile": outfile,
            })
    return xml_input_file_name
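# A hypothetical driver for parse_file, shown only to illustrate how it is
# meant to be called: iterate the XML inputs in a directory and collect the
# names it returns. The real pipeline wires this through the same
# manifest/state machinery as the run() functions in this listing.
import os

def parse_all_sketch(logger, resource_name, input_dir, output_dir):
    parsed = []
    for xml_name in sorted(os.listdir(input_dir)):
        if xml_name.endswith(".xml"):
            parsed.append(parse_file(logger, resource_name, xml_name,
                                     input_dir, output_dir))
    return parsed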
def git_add_and_commit(logger, resource):
    """
    Add, commit, and push the current state to git.
    """
    for cmd in ["git add *", "git commit -am 'update state'", "git push"]:
        # capture output so it can be logged below
        proc = subprocess.run(cmd, cwd=os.curdir, shell=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT)
        log.info(logger, {
            "name": __name__,
            "method": "git_add_and_commit",
            "resource": resource,
            "cmd": cmd,
            "stdout": proc.stdout,
            "stderr": proc.stderr,
            "returncode": proc.returncode,
        })
def run(logger, manifest, config):
    resource_name                = manifest['name']
    wasabi_bandwidth_limit       = config['wasabi_bwlimit']
    digitalocean_bandwidth_limit = config['digitalocean_bwlimit']
    log.info(logger, {
        "name"     : __name__,
        "method"   : "run",
        "resource" : resource_name,
        "message"  : "archiving...",
    })
    ed_path = os.path.dirname(os.path.dirname(os.path.abspath(os.curdir)))
    for output in clifeed.archive_to_s3(logger, resource_name, ed_path,
                                        "wasabi", wasabi_bandwidth_limit):
        log.info(logger, {
            "name"     : __name__,
            "method"   : "run",
            "resource" : resource_name,
            "service"  : "wasabi",
            "stdout"   : str(output),
        })
    for output in clifeed.archive_to_s3(logger, resource_name, ed_path,
                                        "digitalocean", digitalocean_bandwidth_limit):
        log.info(logger, {
            "name"     : __name__,
            "method"   : "run",
            "resource" : resource_name,
            "service"  : "digitalocean",
            "stdout"   : str(output),
        })
    shutil.rmtree(os.path.join(os.curdir, 'dist'))
def archive_to_s3(logger, feed, ed_path, service, bwlimit="100M"):
    """
    Archive feed dist to an S3 bucket.
    """
    chlogger = logger.getChild(__name__)
    feed_dir = os.path.join(ed_path, 'data', feed)
    dist_dir = os.path.join(feed_dir, 'dist')
    s3_dir   = os.path.join('eap', 'energy-dashboard', 'data', feed)
    cmd = "rclone sync --bwlimit=%s --no-update-modtime --verbose %s/dist %s:%s" % (
        bwlimit, feed_dir, service, s3_dir)
    log.info(chlogger, {
        "name": __name__,
        "method": "archive_to_s3",
        "feed": feed,
        "path": ed_path,
        "service": service,
        "s3_dir": s3_dir,
        "cmd": cmd,
    })
    if not os.path.exists(dist_dir) \
            or not os.path.exists(os.path.join(dist_dir, 'zip')) \
            or not os.path.exists(os.path.join(dist_dir, 'db')):
        log.error(chlogger, {
            "name": __name__,
            "method": "archive_to_s3",
            "feed": feed,
            "path": ed_path,
            "dist_dir": dist_dir,
            "service": service,
            "s3_dir": s3_dir,
            "ERROR": "One of dist_dir|dist_dir/zip|dist_dir/db does not exist",
        })
        sys.exit(1)
    return runyield([cmd], feed_dir)
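# `runyield` is a project helper; archive_to_s3 assumes it runs each command
# in the given working directory and yields output lines as they appear, so
# the caller (the archive run() above) can log rclone's progress
# incrementally. A minimal sketch under that assumption -- hypothetical, not
# the real helper:
import subprocess

def runyield_sketch(cmds, cwd):
    for cmd in cmds:
        proc = subprocess.Popen(cmd, cwd=cwd, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                text=True)
        # stream output back to the caller line by line
        for line in proc.stdout:
            yield line.rstrip()
        proc.wait()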
def run(logger, manifest, config):
    resource_name = manifest['name']
    db_dir        = config['source_dir']
    save_dir      = config['working_dir']
    state_file    = config['state_file']
    log.info(logger, {
        "name": __name__,
        "method": "run",
        "resource": resource_name,
        "db_dir": db_dir,
        "save_dir": save_dir,
        "state_file": state_file,
        "message": "started saving state",
    })
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        log.info(logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "db_dir": db_dir,
            "save_dir": save_dir,
            "state_file": state_file,
            "message": "created save dir",
        })
    save.git_add_and_commit(logger, resource_name)
    log.info(logger, {
        "name": __name__,
        "method": "run",
        "resource": resource_name,
        "db_dir": db_dir,
        "save_dir": save_dir,
        "state_file": state_file,
        "message": "finished saving state",
    })
def insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name,
                idx, depth, max_depth):
    chlogger = logger.getChild(__name__)
    db_name  = gen_db_name(resource_name, depth)
    sql_file = os.path.join(sql_dir, sql_file_name)
    db_file  = os.path.join(db_dir, db_name)
    if depth > max_depth:
        log.error(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert_file",
            "db_file"   : db_file,
            "file_idx"  : idx,
            "sql_file"  : sql_file,
            "depth"     : depth,
            "max_depth" : max_depth,
            "dbmgr"     : str(dbmgr),
            "ERROR"     : "insert sql_file failed, max_depth exceeded",
        })
        return
    log.info(chlogger, {
        "name"     : __name__,
        "src"      : resource_name,
        "method"   : "insert_file",
        "db_file"  : db_file,
        "file_idx" : idx,
        "sql_file" : sql_file,
        "depth"    : depth,
        "dbmgr"    : str(dbmgr),
        "message"  : "started",
    })
    cnx = dbmgr.get(db_file)
    try:
        with open(sql_file, 'r') as sf:
            log.debug(chlogger, {
                "name"     : __name__,
                "src"      : resource_name,
                "method"   : "insert_file",
                "db_file"  : db_file,
                "file_idx" : idx,
                "sql_file" : sql_file,
                "depth"    : depth,
                "dbmgr"    : str(dbmgr),
                "message"  : "started",
            })
            cnx.executescript(sf.read())
            log.debug(chlogger, {
                "name"     : __name__,
                "src"      : resource_name,
                "method"   : "insert_file",
                "db_file"  : db_file,
                "file_idx" : idx,
                "sql_file" : sql_file,
                "depth"    : depth,
                "dbmgr"    : str(dbmgr),
                "message"  : "completed",
            })
            return sql_file_name
    except Exception as e:
        log.error(chlogger, {
            "name"     : __name__,
            "src"      : resource_name,
            "method"   : "insert_file",
            "file_idx" : idx,
            "db_file"  : db_file,
            "sql_file" : sql_file,
            "depth"    : depth,
            "dbmgr"    : str(dbmgr),
            "ERROR"    : "insert sql_file failed",
            "exception": str(e),
        })
        # retry the same file against the next database in the sequence;
        # pass the bare file name (not the joined path) and propagate the result
        return insert_file(logger, resource_name, dbmgr, sql_dir, db_dir,
                           sql_file_name, idx, depth + 1, max_depth)
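# The retry above bumps `depth` and re-runs the same sql file, which only
# helps if `gen_db_name` folds depth into the database filename so the retry
# lands in a fresh database instead of the one that just failed. That naming
# scheme is an assumption; a hypothetical sketch:
def gen_db_name_sketch(resource_name, depth):
    if depth == 0:
        return "%s.db" % resource_name
    return "%s_%02d.db" % (resource_name, depth)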
def download(logger, resource_name, delay, urls, state_file, path, ending=".zip"):
    """
    urls        : list of urls to download
    state_file  : list of urls that have already been downloaded
    path        : path to write downloaded files to
    """
    chlogger = logger.getChild(__name__)
    downloaded = []
    prev_downloaded = set()
    if os.path.exists(state_file):
        with open(state_file, "r") as f:
            prev_downloaded = set([line.rstrip() for line in f])
    status = {'manifest': 0, 'filesystem': 0, 'downloaded': 0, 'error': 0}
    for url in urls:
        try:
            filename = filesystem.url2filename(url, ending=ending)
            if url in prev_downloaded:
                log.debug(chlogger, {
                    "src": resource_name,
                    "action": 'skip_download',
                    "url": url,
                    "file": filename,
                    "msg": 'url exists in download manifest'
                })
                status['manifest'] += 1
                continue
            target_file = os.path.join(path, filename)
            if os.path.exists(target_file):
                log.debug(chlogger, {
                    "src": resource_name,
                    "action": 'skip_download',
                    "url": url,
                    "file": filename,
                    "msg": 'file exists locally, updating manifest'
                })
                # update the state_file with files that were found on disk
                downloaded.append(url)
                status['filesystem'] += 1
                continue
            r = requests.get(url)
            if r.status_code == 200:
                with open(target_file, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                downloaded.append(url)
                status['downloaded'] += 1
                log.debug(chlogger, {
                    "src": resource_name,
                    "action": 'download',
                    "url": url,
                    "file": filename
                })
            else:
                log.error(chlogger, {
                    "src": resource_name,
                    "action": 'download',
                    "url": url,
                    "file": filename,
                    "status_code": r.status_code,
                    "ERROR": 'http_request_failed'
                })
        except Exception as e:
            log.error(chlogger, {
                "src": resource_name,
                "action": 'download',
                "url": url,
                "ERROR": "http_request_failed",
                "exception": str(e),
                "traceback": traceback.format_exc()
            })
            status['error'] += 1
        # TODO: this is such a hack
        time.sleep(delay)
    # ensure that all files in the download directory are read only
    for f in filesystem.glob_dir(path, ending):
        os.chmod(os.path.join(path, f), S_IREAD | S_IRGRP | S_IROTH)
    log.info(chlogger, {
        "src": resource_name,
        "action": 'download',
        'skipped_in_manifest': status['manifest'],
        'skipped_in_filesystem': status['filesystem'],
        'downloaded': status['downloaded'],
        'error': status['error'],
    })
    return downloaded
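# `filesystem.url2filename` is a project helper; download() only assumes it
# maps a url to a stable local filename with the requested ending. A rough,
# hypothetical sketch of that assumption:
import os
from urllib.parse import urlparse

def url2filename_sketch(url, ending=".zip"):
    name = os.path.basename(urlparse(url).path)
    if not name.endswith(ending):
        name += ending
    return name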