def check_md5(url: str, dest_paths: List[Path], headers: Optional[dict] = None) -> bool:
    """Verify files in every directory of ``dest_paths`` against the md5 manifest for ``url``.

    The manifest URL is derived via ``path_set_md5`` and fetched either from S3
    (``s3://`` scheme) or over HTTP. If at least one destination directory
    already holds files matching all expected hashes, those files are copied
    into the remaining directories.

    Args:
        url: URL of the downloaded resource the manifest describes.
        dest_paths: candidate directories that may already contain the files.
        headers: optional HTTP headers passed to the manifest request.

    Returns:
        True iff every path in ``dest_paths`` ends up holding files with the
        expected hashes; False on download failure, bad manifest format, or
        when no directory verifies successfully.
    """
    url_md5 = path_set_md5(url)
    data = _fetch_md5_manifest(url_md5, headers)
    if data is None:
        return False
    expected = _parse_md5_manifest(data, url)
    if expected is None:
        return False
    # Find one directory whose files all match; the rest need (re)copying.
    done = None
    not_done = []
    for base_path in dest_paths:
        if all(file_md5(base_path / p) == _md5 for p, _md5 in expected.items()):
            done = base_path
        else:
            not_done.append(base_path)
    if done is None:
        return False
    for base_path in not_done:
        log.info(f'Copying data from {done} to {base_path}')
        for p in expected.keys():
            shutil.copy(done / p, base_path / p)
    return True


def _fetch_md5_manifest(url_md5: str, headers: Optional[dict]) -> Optional[str]:
    """Return the manifest text at ``url_md5`` (S3 or HTTP), or None on any failure."""
    try:
        if url_md5.startswith('s3://'):
            # boto3 is imported lazily so HTTP-only deployments need not install it.
            import boto3
            s3 = boto3.resource('s3')
            bucket, key = url_md5[5:].split('/', maxsplit=1)
            obj = s3.Object(bucket, key)
            return obj.get()['Body'].read().decode('utf8')
        r = requests.get(url_md5, headers=headers)
        if r.status_code != 200:
            return None
        return r.text
    except Exception as e:
        log.debug(
            f'Could not download {url_md5} because of an exception {type(e)}: {e}'
        )
        return None


def _parse_md5_manifest(data: str, url: str) -> Optional[dict]:
    """Parse ``<md5> *<filename>`` manifest lines into {filename: md5}.

    Text-mode entries (space marker) are accepted with a warning; any other
    marker is treated as a format error and None is returned. Blank lines are
    skipped (fix: they previously raised an unhandled ValueError on unpack).
    """
    expected = {}
    for line in data.splitlines():
        if not line.strip():
            continue
        _md5, fname = line.split(' ', maxsplit=1)
        if fname[0] != '*':
            if fname[0] == ' ':
                log.warning(
                    f'Hash generated in text mode for {fname}, comparison could be incorrect'
                )
            else:
                log.error(f'Unknown hash content format in {url + ".md5"}')
                return None
        expected[fname[1:]] = _md5
    return expected
def check_md5(url: str, dest_paths: List[Path]) -> bool:
    """Check the md5 manifest for ``url`` against every directory in ``dest_paths``.

    Downloads the manifest (URL derived via ``path_set_md5``), parses its
    ``<md5> *<filename>`` entries, and looks for one directory whose files all
    match. Files are then copied from that verified directory into each
    directory that did not match.

    Returns:
        True iff all destination directories end up holding matching files.
    """
    manifest_url = path_set_md5(url)
    response = requests.get(manifest_url)
    if response.status_code != 200:
        return False

    # relative file name -> expected md5 digest
    expected = {}
    for manifest_line in response.text.splitlines():
        digest, marked_name = manifest_line.split(' ', maxsplit=1)
        marker = marked_name[0]
        if marker != '*':
            if marker == ' ':
                # Text-mode hashes are tolerated, but flagged.
                log.warning(
                    f'Hash generated in text mode for {marked_name}, comparison could be incorrect'
                )
            else:
                log.error(f'Unknown hash content format in {url + ".md5"}')
                return False
        expected[marked_name[1:]] = digest

    verified = None
    pending = []
    for candidate in dest_paths:
        if all(file_md5(candidate / name) == digest for name, digest in expected.items()):
            verified = candidate
        else:
            pending.append(candidate)

    if verified is None:
        return False

    # Propagate verified files into every directory that failed the check.
    for target in pending:
        log.info(f'Copying data from {verified} to {target}')
        for name in expected:
            shutil.copy(verified / name, target / name)
    return True
# Walk every compose service; for services built from a source checkout,
# pin the repo to the service's commit, patch hard-coded paths in two known
# configs, then resolve each config's download URLs (via their .md5
# manifests) into the concrete files each service expects, accumulating
# them in `downloads` keyed by destination path.
for service_name, service_args in data["services"].items():
    # Only services that declare a build-arg SRC_DIR are inspected.
    if service_args.get("build", {}).get("args", {}).get("SRC_DIR") is not None:
        # Check out the commit the service is built from (defaults to master).
        commit = service_args["build"]["args"].get("COMMIT", "master")
        repo.git.checkout(commit)
        config_path = Path(service_args["build"]["args"]["SRC_DIR"]) / service_args["build"]["args"]["CONFIG"]
        try:
            if service_name in {"entity-detection", "kbqa"}:
                # These two configs contain paths that must be rewritten
                # in place (mapping taken from `replace_paths`) before use.
                with open(config_path) as fin:
                    lines = fin.readlines()
                with open(config_path, "w") as fout:
                    old_path, new_path = replace_paths[service_name]
                    fout.writelines([line.replace(old_path, new_path) for line in lines])
            config_downloads = dict(get_configs_downloads(config_path))
            for url, paths in config_downloads.items():
                # Fetch the .md5 manifest to learn the file names behind the URL.
                md5_url = path_set_md5(url)
                assert resp.status_code == 200, md5_url
                resp = requests.get(md5_url)
                for line in resp.text.splitlines():
                    _md5, f_name = line.split(" ", maxsplit=1)
                    # Only binary-mode ("*"-prefixed) manifest entries are accepted.
                    if f_name.startswith("*"):
                        f_name = f_name[1:]
                    else:
                        raise ValueError
                    # NOTE(review): nesting reconstructed from collapsed source —
                    # each manifest file is recorded for every save dir; confirm
                    # this loop belongs inside the per-line loop.
                    for save_dir in paths:
                        downloads[str(save_dir / f_name)].append((service_name, url))
        except Exception as e:
            # Surface which service failed before re-raising.
            print(service_name)
            raise e
duplicates = {}