def get_base_description(self):
    return {
        'precision': '{0:.{1}f}'.format(10**(-self._precision), self._precision),
        'hash256': hash_file(self._file_path)
    }
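# The snippets in this collection all rely on a `hash_file` helper whose implementation is
# not shown, and they expect somewhat different things from it (a hex digest here, a hashlib
# object in the sync code below, an xxhash hex digest taking a progress bar in the final test).
# Purely as a point of reference, a minimal chunked-read sketch, assuming a SHA-256 hex
# digest, could look like this:
import hashlib

def hash_file_sketch(path, chunk_size=1 << 20):
    """Hypothetical helper: stream the file in chunks and return its SHA-256 hex digest."""
    digest = hashlib.sha256()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()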
def file_metrics(self, metrics=None):
    """Extracts file information such as name, last update date, number of rows and its hash"""
    # Avoid a shared mutable default argument; build a fresh dict per call.
    if metrics is None:
        metrics = {}
    metrics["filename"] = self.file
    metrics["structured_at"] = time.time()
    metrics["rows"] = len(self.df)
    metrics["hash"] = hash_file(self.file)
    return metrics
def check(args: argparse.Namespace) -> None:
    """Verify that index matches files, print out any mismatches

    :param args: must have attr cold_dir: str
    """
    cold_dir = Path(args.cold_dir)
    assert cold_dir.is_dir(), "cold_dir not found!"
    index = Index(cold_dir)
    fail_count = 0
    # Set up progress bar
    total = sum([(cold_dir / p).stat().st_size if (cold_dir / p).exists() else 0
                 for p in index.keys()])
    with tqdm(total=total, unit="B", unit_scale=True) as pbar:
        # Check that index is correct
        for p, h in index.items():
            if h != hash_file(cold_dir / p, pbar):
                print(f"Verification failed: '{p}'.", file=sys.stderr)
                fail_count += 1
        # Additionally check that index is complete
        for file in walk(cold_dir,
                         [PathAwareGitWildMatchPattern('index.txt', cold_dir)]):
            rel_path: PurePath = file.relative_to(cold_dir)
            if rel_path not in index:
                print(f"File missing from index: '{rel_path}'.", file=sys.stderr)
                fail_count += 1
    if fail_count == 0:
        print("OK: Data is intact!")
    else:
        print(f"FAIL: There were {fail_count} failures!")
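# A hypothetical invocation of check(); the argparse wiring and the cold_dir path are
# assumptions for illustration, since the docstring only requires an object with a
# `cold_dir: str` attribute.
import argparse

parser = argparse.ArgumentParser(description="Verify files against index.txt")
parser.add_argument("cold_dir", help="directory containing the indexed files")
check(parser.parse_args(["/mnt/cold_storage"]))  # hypothetical path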
async def post(self, request):
    form = await request.form()
    filename = form['file'].filename
    file = form['file'].file
    async with get_conn() as conn:
        async with conn.transaction():
            master = await Replica(conn).master()
            if not master:
                return api_error('No master set')
            ds = parse_dcm(file)
            hsh = hash_file(file)
            file_data = {
                'name': os.path.basename(filename),
                'master': master['id'],
                'hash': hsh,
            }
            file_data.update(ds)
            filedata = await Files(conn).insert_or_select(file_data)
            storage = await Storage.get(master)
            ret = await storage.copy(file, filedata)
            await ReplicaFiles(conn).add(
                master['id'],
                [{'id': filedata['id'], **ret}],
            )
    return UJSONResponse({})
def execute(file_name: str):
    """
    Given a file name, look up the metadata from the config (where each file name
    is a key) and, depending on the file extension, call the appropriate handler.
    Each handler will ultimately attempt to create a record, either a raw accident
    (from Excel files) or one of the meta information collections (from csv files).
    """
    try:
        file_meta = files[file_name]
    except KeyError:
        print(f"Unknown file {file_name}")
        exit(1)
    file_path = path.normpath(f'{data_directory}/{file_name}')
    _, file_extension = path.splitext(file_path)
    file_meta['source_file'] = file_name
    file_meta['import_timestamp'] = import_timestamp
    file_meta['source_file_hash'] = hash_file(file_path)
    if file_extension == '.xlsx':
        import_xlsx(file_path, file_meta)
    elif file_extension == '.xlsb':
        import_xlsb(file_path, file_meta)
    elif file_extension == '.csv':
        import_csv(file_path, file_meta)
    else:
        print(f'Unknown file extension {file_extension}')
        exit(1)
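# The `files` mapping consulted above is defined in the project's config; the entries below
# are hypothetical and only illustrate the shape the function assumes (file name as key,
# a mutable dict of handler metadata as value).
files = {
    "accidents_2019.xlsx": {"collection": "accidents"},       # hypothetical entry
    "severity_codes.csv": {"collection": "severity_codes"},   # hypothetical entry
}

# e.g. execute("accidents_2019.xlsx") would enrich that metadata dict and dispatch to import_xlsx.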
def post_videos():
    try:
        video = request.files['video']
        video_hash = hash_file(video)
        video_data = mongo.db.videos.find_one({'hash': video_hash})
        if video_data is not None:
            res = build_video_object(video_data)
            return jsonify(res), 200
        _, ext = os.path.splitext(video.filename)
        ext = ext[1:]
        if ext in ALLOWED_EXTENSIONS:
            results = startProcessing(file=video,
                                      uploadFolderPath=app.config['UPLOAD_FOLDER'])
        else:
            results = {'message': 'File format not allowed'}
            return jsonify(results), 400
    except KeyError as err:
        return jsonify({
            'title': 'BAD_REQUEST',
            'message': f'Missing key: {str(err)}'
        }), 400
    except Exception as e:
        return jsonify({'title': 'ERROR', 'message': str(e)}), 400
    results['hash'] = video_hash
    mongo.db.videos.insert_one(build_video_object(results))
    return jsonify(results), 200
def do_pull(self):
    logging.info('Starting pull process')
    since = (utils.read_settings(self._conn, 'last_update')
             .get('last_update', 0))
    cursor = self._conn.cursor()
    response = self.communicate({'ACTION': 'PULL', 'SINCE': since})
    to_recv = parse.listify(parse.loads(response['CHANGES']))
    logging.info('Adding %d new files from server.' % len(to_recv))
    for x in to_recv:
        from_serv = parse.loads(x)
        sid = int(from_serv['ID'])
        logging.debug('Processing file update. SID: %d, type: %s'
                      % (sid, from_serv['type']))
        if from_serv['type'] == 'NEW':
            cursor.execute('SELECT 1 FROM files WHERE server_id=?', [sid])
            if cursor.fetchone():
                logging.warning('Server returned a file I already have, '
                                'ignoring and continuing pull process.')
                continue
            file_path, file_hash = self.pull_remote(sid)
            fd = open(file_path, 'rb')
            our_hash = utils.hash_file(fd)
            if our_hash.digest() != file_hash:
                raise Exception('MD5 digests did not match! Transmission '
                                'error suspected.')
            it_path = self.add_to_itunes(file_path)
            os.remove(file_path)
            record = utils.generate_file_info(it_path)
            record['server_id'] = sid
            utils.insert_file_record(record, self._conn)
            logging.debug('Successfully added file: %s'
                          % (os.path.split(it_path)[-1],))
        elif from_serv['type'] == 'DELETE':
            cursor.execute('SELECT * FROM files WHERE server_id=?', [sid])
            record = cursor.fetchone()
            if not record:
                logging.warning('Server sent delete directive on file I '
                                'don\'t have. Ignoring.')
                continue
            self.remove_from_itunes(sid)
            cursor.execute('DELETE FROM files WHERE server_id=?', [sid])
    self._conn.commit()
    logging.info('...finished pull process')
async def index(replica):
    global work
    replica_id = replica['id']
    async with get_conn() as conn:
        await Replica(conn).update_status(replica_id, 'indexing')
        storage = await Storage.get(replica)
        indexing_interrupted = False
        async for d in storage.index():
            if not work:
                indexing_interrupted = True
                break
            loc = None
            if not d.get('hash'):
                loc = await storage.fetch(d)
            if not d.get('hash'):
                d['hash'] = hash_file(loc)
            f = await Files(conn).get(d)
            if not f:
                if not replica['master']:
                    continue
                if not loc:
                    loc = await storage.fetch(d)
                try:
                    dcm_data = parse_dcm(loc)
                except Exception:
                    continue
                d.update(dcm_data)
                d['master'] = replica['id']
                f = await Files(conn).add(d)
            d['id'] = f['id']
            try:
                del d['meta']
            except KeyError:
                pass
            if replica['master']:
                await ReplicaFiles(conn).add(replica_id, [d])
            else:
                await ReplicaFiles(conn).index(replica_id, d)
        files = await ReplicaFiles(conn).get_for_sync(replica)
        if len(files) == 0 and not indexing_interrupted:
            await Replica(conn).update_status(replica_id, 'ok')
def push_command(self, command, session):
    cursor = self._conn.cursor()
    if command['TYPE'] == 'NEW':
        cursor.execute('INSERT INTO files (received) VALUES (?)',
                       [time.time()])
        sid = cursor.lastrowid
        resp = parse.dumps({'ACTION': 'HSUP', 'ID': sid, 'DONE': 0})
        logging.debug('CONT -> %s' % resp)
        self._send(resp + '\n')
        file_path = os.path.join(
            utils.read_settings(self._conn, 'storage_dir')['storage_dir'],
            '%d.mp3' % sid)
        digest = utils.pull_file(file_path, self._socket)
        our_digest = utils.hash_file(open(file_path, 'rb')).digest()
        if our_digest != digest:
            cursor.execute('DELETE FROM files WHERE id=?', [sid])
            resp = {'ACTION': 'ERROR',
                    'REASON': 'Hash mismatch, record revoked, retransmit'}
            self._conn.commit()
            return resp, session
        cursor.execute('UPDATE files SET path=?, hash=? WHERE id=?',
                       [file_path, digest.encode('hex'), sid])
        self._conn.commit()
        resp = {'ACTION': 'HSUP', 'DONE': 1}
        return resp, session
    elif command['TYPE'] == 'DELETE':
        sid = int(command['ID'])
        cursor.execute(
            'INSERT INTO deleted (file_id, del_time) VALUES (?, ?)',
            [sid, time.time()])
        cursor.execute('DELETE FROM files WHERE id=?', [sid])
        resp = {'ACTION': 'HSUP', 'DONE': 1}
        return resp, session
    else:
        resp = {'ACTION': 'ERROR',
                'REASON': 'Unknown PUSH type: %s' % command['TYPE']}
        return resp, session
def scan(path, conn):
    """
    Walks a directory and compares it to the database pointed at by `conn`,
    returning a tuple of (added, removed): files added to or removed from the
    directory versus its representation in the db, respectively. Modified files
    will be included in both the added and removed lists. Will update the
    `last_scan` fields within the db.

    Note that no effort is made to identify files which have moved.
    """
    cursor = conn.cursor()
    scan_start = time.time()
    added = []
    removed = []
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            cursor.execute('SELECT * FROM files WHERE path = ? LIMIT 1',
                           [file_path])
            record = cursor.fetchone()
            if not record:
                added.append(file_path)
            else:
                fs_mtime = os.path.getmtime(file_path)
                if fs_mtime > record['mtime']:
                    # This file is marked as having been modified since we
                    # last saw it, time to hash to check for changes.
                    fd = open(file_path, 'rb')
                    hash_value = utils.hash_file(fd).hexdigest()
                    if hash_value != record['hash']:
                        added.append(file_path)
                        removed.append(record['id'])
                cursor.execute('UPDATE files SET last_scan=? WHERE id=?',
                               [scan_start, record['id']])
    conn.commit()
    cursor.execute('SELECT * FROM files WHERE last_scan < ?', [scan_start])
    removed.extend(cursor.fetchall())
    cursor.close()
    return added, removed
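# A hypothetical call to scan(); the db path, music directory and schema (a `files` table
# with id, path, hash, mtime and last_scan columns) are assumptions for illustration.
import sqlite3

conn = sqlite3.connect("library.db")
conn.row_factory = sqlite3.Row  # scan() indexes records by column name, e.g. record['mtime']
added, removed = scan("/home/user/Music", conn)
print("%d added/modified, %d removed" % (len(added), len(removed)))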
def open_file(self):
    if self.hashFile != "":
        self.save_transits_to_file()
        self.clear_graph(rebuild=False)
        self.detrended_all_flux = []
        self.folded_all_time = []
    fileName, _ = QFileDialog.getOpenFileName(
        self, "Open light curve file", "c:\\",
        "Fits kepler files (*.fits)")
    if fileName:
        with fits.open(fileName) as hdu_list:
            light_curve = hdu_list["LIGHTCURVE"].data
            match = re.search("kplr[0-9]{9}", fileName)
            self.kepID = ""
            if match:
                self.kepID = match[0][4:]
            self.all_time = light_curve.TIME
            self.all_flux = light_curve.PDCSAP_FLUX
            flux_and_time_finite = np.logical_and(np.isfinite(self.all_flux),
                                                  np.isfinite(self.all_time))
            self.all_time = self.all_time[flux_and_time_finite]
            self.all_flux = self.all_flux[flux_and_time_finite]
            self.all_flux /= np.median(self.all_flux)
            self.hashFile = utils.hash_file(fileName)
            try:
                with open(self.hashDir + self.hashFile, "r") as transitFile:
                    for line in transitFile.readlines():
                        item = QListWidgetItem()
                        item.setText(line)
                        self.listOfTransits.addItem(item)
            except OSError:
                pass
            self.rebuild_plot()
            self.fileLoaded = True
            self.saveButton.setVisible(True)
            self.clearButton.setVisible(True)
            self.detrendCheck.setVisible(True)
            self.actionDetrend.setEnabled(True)
async def store(ds, data):
    global initialized
    if not initialized:
        await setup()
        initialized = True
    async with get_conn() as conn:
        try:
            ds = get_meta(ds)
            async with conn.transaction():
                master = await Replica(conn).master()
                hsh = hash_file(data)
                file_data = {
                    'name': str(uuid.uuid4()) + '.dcm',
                    'master': master['id'],
                    'hash': hsh,
                }
                file_data.update(ds)
                f = await Files(conn).insert_or_select(file_data)
                storage = await Storage.get(master)
                ret = await storage.copy(data, f)
                await ReplicaFiles(conn).add(
                    master['id'],
                    [{'id': f['id'], **ret}],
                )
        except Exception as e:
            print(traceback.format_exc())
            await Log(conn).add(str(e))
            return False
    return True
def hash(self, location):
    with open(location, 'rb') as dcmf:
        return hash_file(dcmf)
def __init__(self, name, f, user):
    self.name = name
    self.user = user
    self.digest = hash_file(f)
    self.output = Image.analyze(f)
def test_train(params):
    # Get the token for authorizing with the serve endpoint
    config = ConfigObj(CONFIG_PATH)
    try:
        token = config["MINIKUBE"]["backend"]["token"]
    except KeyError:
        token = config["DOCKER"]["backend"]["token"]
    print(token)

    subprocess.Popen(["kaos workspace list"], shell=True,
                     stdout=subprocess.PIPE).stdout.read()

    workspace_name = get_rand_str()
    code, stdout, stderr = run_cmd(f"kaos workspace create -n {workspace_name}")
    print(stdout.read())

    code, stdout, stderr = run_cmd(f"kaos template get -n property-val")
    print(stdout.read())

    print("###############################################################")
    print("# train model and assert results")
    print("###############################################################")
    old_job_id, old_model_id, old_model_checksum = train_and_assert(workspace_name, 0)

    print("###############################################################")
    print("# deploy inference with the trained model")
    print("###############################################################")
    code, stdout, stderr = run_cmd(f"kaos train info -i 0")
    data = stdout.read().decode('utf-8')
    model_id = parse_train_info(data)[0][3]

    code, stdout, stderr = run_cmd(
        f"kaos serve deploy -m {model_id} -s templates/property-val/model-serve")
    print(stdout.read())

    serve_and_assert(
        deploy_command=f"kaos serve deploy -m {model_id} -s templates/property-val/model-serve",
        list_command="kaos serve list")

    code, stdout, stderr = run_cmd("kaos serve list")
    data = stdout.read().decode('utf-8')
    building_table, serving_table = parse_serve_list(data)

    print("###############################################################")
    print("# curl the running model")
    print("###############################################################")
    data = open("templates/property-val/test_payload.json").read()
    endpoint_name = serving_table[0][2]
    print(f"endpoint name: {endpoint_name}")
    r = requests.post(
        f"http://localhost:{params['k8s_port']}/{endpoint_name}/invocations",
        headers={"Content-Type": "application/json", "X-Token": token},
        data=data)
    assert r.status_code == 200
    assert "result" in r.json()

    print("###############################################################")
    print("# check all the serving artifacts")
    print("###############################################################")
    serve_artifacts_dir = f"serve_artifacts-{workspace_name}"
    os.mkdir(serve_artifacts_dir)
    code, stdout, stderr = run_cmd(
        f"kaos serve get -e {endpoint_name} -o {serve_artifacts_dir}")
    print(stdout.read())

    serve_code_path_matches = glob.glob(
        f"{serve_artifacts_dir}/*/*/code/property-val:*", recursive=True)
    assert len(serve_code_path_matches) == 1
    serve_code_path = serve_code_path_matches[0]
    assert checksumdir.dirhash(serve_code_path, excluded_files=["__init__.py", "model.pkl"]) == \
        checksumdir.dirhash("templates/property-val/model-serve/property-val",
                            excluded_files=["__init__.py", "model.pkl"])

    model_path_matches = glob.glob(
        f"{serve_artifacts_dir}/*/*/code/property-val:*/model/model.pkl",
        recursive=True)
    assert len(model_path_matches) == 1
    model_path = model_path_matches[0]
    assert os.path.getsize(model_path) // 100000 == 4

    _, stdout, _ = run_cmd(
        f"kaos serve provenance -e {endpoint_name} -o {serve_artifacts_dir}")
    print(stdout.read())
    serve_provenance_matches = glob.glob(
        f"{serve_artifacts_dir}/{workspace_name.lower()}/provenance/serve-*.pdf",
        recursive=True)
    assert len(serve_provenance_matches) == 1
    serve_provenance_path = serve_provenance_matches[0]
    assert os.path.exists(serve_provenance_path)
    assert os.path.isfile(serve_provenance_path)
    with open(serve_provenance_path, "rb") as prov_file:
        prov = PdfFileReader(prov_file, strict=False)
        print(prov.documentInfo)

    print("###############################################################")
    print("# modify code dir")
    print("###############################################################")
    with open(f"templates/property-val/model-train/property-val/model/{uuid.uuid4().hex}",
              'w') as f:
        f.write(uuid.uuid4().hex)

    print("###############################################################")
    print("# RE-train model and assert results")
    print("###############################################################")
    train_and_assert(workspace_name, 1)

    # ###############################################################
    # # modify data dir
    # ###############################################################
    #
    # with open(f"templates/property-val/data/features{uuid.uuid4().hex}", 'w') as f:
    #     f.write(uuid.uuid4().hex)
    #
    # ###############################################################
    # # RE-train model and assert results
    # ###############################################################
    #
    # train_and_assert(workspace_name, 2)

    print("# ##############################################################")
    print("# Check that we can still get the actual old model")
    print("# ##############################################################")
    old_artifacts_dir = f"old-artifacts-{workspace_name}"
    os.mkdir(old_artifacts_dir)
    code, stdout, stderr = run_cmd(
        f"kaos train get -cdm --job_id {old_job_id} -o {old_artifacts_dir}")
    print(stdout.read())
    old_model_path_matches = glob.glob(
        f"{old_artifacts_dir}/*/*/models/*/model/model.pkl", recursive=True)
    assert len(old_model_path_matches) == 1
    old_model_path = old_model_path_matches[0]
    old_model_checksum_now = hash_file(old_model_path)
    assert old_model_checksum == old_model_checksum_now
def train_and_assert(workspace_name, expected_pretrained_jobs):
    code, stdout, stderr = run_cmd(
        f"kaos train deploy -s templates/property-val/model-train/ "
        f"-d templates/property-val/data/")
    print(stdout.read())

    print("###############################################################")
    print("# wait until the submitted job appears in BUILDING list")
    print("###############################################################")
    building_table = []
    training_table = []
    i = 0
    while len(building_table) == 0 and i < TIMEOUT:
        code, stdout, stderr = run_cmd(f"kaos train list")
        data = stdout.read().decode('utf-8')
        building_table, training_table = parse_train_list(data)
        time.sleep(10)
        print(f"building -> {building_table}")
        print(f"training -> {training_table}")
        i += 1
    if i == TIMEOUT:
        raise Exception("timeout")

    print("###############################################################")
    print("# check that the status is JOB_RUNNING")
    print("###############################################################")
    print(building_table)
    print(training_table)
    assert len(building_table) == 1
    assert len(training_table) == expected_pretrained_jobs
    assert building_table[0][3] == 'JOB_RUNNING'

    print("###############################################################")
    print("# wait until the submitted job appears in TRAINING list")
    print("###############################################################")
    building_table = []
    training_table = []
    i = 0
    while len(training_table) <= expected_pretrained_jobs and i < TIMEOUT:
        code, stdout, stderr = run_cmd(f"kaos train list")
        data = stdout.read().decode('utf-8')
        building_table, training_table = parse_train_list(data)
        print(f"building -> {building_table}")
        print(f"training -> {training_table}")
        time.sleep(10)
        i += 1
    if i == TIMEOUT:
        raise Exception("timeout")

    print("###############################################################")
    print("# check that the job is either running or has succeeded")
    print("###############################################################")
    print(building_table)
    print(training_table)
    assert len(building_table) == 0
    assert len(training_table) == 1 + expected_pretrained_jobs
    assert training_table[0][5] in ('JOB_RUNNING', 'JOB_SUCCESS', 'JOB_MERGING')

    print("###############################################################")
    print("# wait if any training job is still running or merging")
    print("###############################################################")
    i = 0
    while any(map(lambda row: row[5] in ('JOB_RUNNING', 'JOB_MERGING'),
                  training_table)) and i < TIMEOUT:
        code, stdout, stderr = run_cmd(f"kaos train list")
        data = stdout.read().decode('utf-8')
        building_table, training_table = parse_train_list(data)
        print(f"building -> {building_table}")
        print(f"training -> {training_table}")
        time.sleep(10)
        i += 1
    if i == TIMEOUT:
        raise Exception("timeout")

    print("###############################################################")
    print("# check that job finished with JOB_SUCCESS status")
    print("###############################################################")
    print(building_table)
    print(training_table)
    assert len(building_table) == 0
    assert len(training_table) == 1 + expected_pretrained_jobs
    assert training_table[0][5] == 'JOB_SUCCESS'

    print("###############################################################")
    print("# check all the training artifacts")
    print("###############################################################")
    artifacts_dir = f"artifacts-{workspace_name}-{expected_pretrained_jobs}"
    os.mkdir(artifacts_dir)
    job_id = training_table[0][3]
    train_get_cmd = f"kaos train get -cdm --job_id {job_id} -o {artifacts_dir}"
    print(train_get_cmd)
    code, stdout, stderr = run_cmd(train_get_cmd)
    print(stdout.read())

    model_path_matches = glob.glob(
        f"{artifacts_dir}/*/*/models/*/model/model.pkl", recursive=True)
    assert len(model_path_matches) == 1
    model_path = model_path_matches[0]
    model_checksum = hash_file(model_path)
    assert os.path.getsize(model_path) // 100000 == 4

    data_path_matches = glob.glob(f"{artifacts_dir}/*/*/data", recursive=True)
    assert len(data_path_matches) == 1
    data_path = data_path_matches[0]
    assert checksumdir.dirhash(data_path) == checksumdir.dirhash(
        "templates/property-val/data/")

    code_path_matches = glob.glob(f"{artifacts_dir}/*/*/code/property-val:*",
                                  recursive=True)
    assert len(code_path_matches) == 1
    code_path = code_path_matches[0]
    print(f"code_path -> {code_path}")
    print(f"job id -> {job_id}")
    print(f"{training_table}")
    assert checksumdir.dirhash(code_path, excluded_files=["__init__.py"]) == \
        checksumdir.dirhash("templates/property-val/model-train/property-val",
                            excluded_files=["__init__.py"])

    code, stdout, stderr = run_cmd(f"kaos train info -i 0")
    data = stdout.read().decode('utf-8')
    train_info = parse_train_info(data)
    assert len(train_info) > 1
    model_id = train_info[0][3]

    print("###############################################################")
    print("# check provenance")
    print("###############################################################")
    _, stdout, _ = run_cmd(
        f"kaos train provenance -m {model_id} -o {artifacts_dir}")
    print(stdout.read())
    prov_path = f"{artifacts_dir}/{workspace_name.lower()}/provenance/model-{model_id}.pdf"
    assert os.path.exists(prov_path)
    assert os.path.isfile(prov_path)
    with open(prov_path, "rb") as prov_file:
        prov = PdfFileReader(prov_file, strict=False)
        print(prov.documentInfo)

    return job_id, model_id, model_checksum
remote_checksum = None
local_checksum = None  # initialise so the comparison below is safe if hashing fails
try:
    print("retrieving remote planet file checksum from:\n%s"
          % constants.PLANET_MD5_URL)
    remote = urllib.urlopen(constants.PLANET_MD5_URL)
    remote_checksum = remote.read().split(" ")[0]
    print("remote checksum retrieved:")
    print(remote_checksum)
except Exception as e:
    print("ERROR: retrieving remote md5 checksum failed")
    print(e)

# it only makes sense to compute the local checksum if we have the remote checksum
if remote_checksum:
    try:
        print("computing local md5 checksum")
        local_checksum = utils.hash_file(constants.PLANET_PATH)
        print("local md5 checksum done:")
        print(local_checksum)
    except Exception as e:
        print("local md5 checksum computation failed")
        print(e)

if remote_checksum is None or local_checksum is None:
    print("skipping the md5 check - can't get remote or local checksum")
else:
    if remote_checksum == local_checksum:
        print("* md5 checksum: OK")
    else:
        print("* md5 checksum: NOK")
        sane = False
def test_hash_file(self):
    for example in TestHashFile.xxhash_examples:
        with mock.patch('builtins.open',
                        mock.mock_open(read_data=example['data'])) as mock_open:
            pbar = mock.MagicMock()
            self.assertEqual(example['hexdigest'],
                             hash_file(Path('foo_filename'), pbar))
            mock_open.assert_called_once_with(Path('foo_filename'), 'rb')