Example #1
0
def check_md5(url: str,
              dest_paths: List[Path],
              headers: Optional[dict] = None) -> bool:
    """Verify files against the ``.md5`` manifest of *url* and sync copies.

    Downloads the md5 manifest (from S3 or over HTTP), checks every listed
    file under each candidate path in *dest_paths*, and copies the files
    from the first fully verified path into the paths that failed.

    Args:
        url: URL of the data file; its manifest URL is derived via
            ``path_set_md5``.
        dest_paths: Candidate local directories that may hold the files.
        headers: Optional HTTP headers for the manifest request.

    Returns:
        True if at least one path verified and the others were synced from
        it; False on download failure, malformed manifest, or when no path
        verified.
    """
    url_md5 = path_set_md5(url)

    try:
        if url_md5.startswith('s3://'):
            # Lazy import: boto3 is only required for S3-hosted manifests.
            import boto3

            s3 = boto3.resource('s3')
            bucket, key = url_md5[5:].split('/', maxsplit=1)
            obj = s3.Object(bucket, key)
            data = obj.get()['Body'].read().decode('utf8')
        else:
            # Timeout prevents an unresponsive server from hanging forever.
            r = requests.get(url_md5, headers=headers, timeout=30)
            if r.status_code != 200:
                return False
            data = r.text
    except Exception as e:
        log.debug(
            f'Could not download {url_md5} because of an exception {type(e)}: {e}'
        )
        return False

    expected = {}
    for line in data.splitlines():
        if not line.strip():
            continue  # tolerate blank/trailing lines in the manifest
        _md5, fname = line.split(' ', maxsplit=1)
        # md5sum marks binary-mode hashes with '*' and text-mode with ' '.
        if fname[0] != '*':
            if fname[0] == ' ':
                log.warning(
                    f'Hash generated in text mode for {fname}, comparison could be incorrect'
                )
            else:
                log.error(f'Unknown hash content format in {url + ".md5"}')
                return False
        expected[fname[1:]] = _md5

    # Find one destination where every file already matches its hash.
    done = None
    not_done = []
    for base_path in dest_paths:
        if all(
                file_md5(base_path / p) == _md5
                for p, _md5 in expected.items()):
            done = base_path
        else:
            not_done.append(base_path)

    if done is None:
        return False

    # Propagate verified files to the destinations that failed verification.
    for base_path in not_done:
        log.info(f'Copying data from {done} to {base_path}')
        for p in expected.keys():
            shutil.copy(done / p, base_path / p)
    return True
def check_md5(url: str, dest_paths: List[Path]) -> bool:
    """Verify files against the ``.md5`` manifest of *url* and sync copies.

    NOTE(review): this redefinition shadows the earlier ``check_md5`` that
    also supports S3 manifests and custom headers — confirm which of the
    two definitions is intended to survive.

    Args:
        url: URL of the data file; the manifest URL is derived via
            ``path_set_md5``.
        dest_paths: Candidate local directories that may hold the files.

    Returns:
        True if at least one path verified and the others were synced from
        it; False on download failure, malformed manifest, or when no path
        verified.
    """
    url_md5 = path_set_md5(url)
    # Timeout prevents an unresponsive server from hanging forever.
    r = requests.get(url_md5, timeout=30)
    if r.status_code != 200:
        return False
    expected = {}
    for line in r.text.splitlines():
        if not line.strip():
            continue  # tolerate blank/trailing lines in the manifest
        _md5, fname = line.split(' ', maxsplit=1)
        # md5sum marks binary-mode hashes with '*' and text-mode with ' '.
        if fname[0] != '*':
            if fname[0] == ' ':
                log.warning(
                    f'Hash generated in text mode for {fname}, comparison could be incorrect'
                )
            else:
                log.error(f'Unknown hash content format in {url + ".md5"}')
                return False
        expected[fname[1:]] = _md5

    # Find one destination where every file already matches its hash.
    done = None
    not_done = []
    for base_path in dest_paths:
        if all(
                file_md5(base_path / p) == _md5
                for p, _md5 in expected.items()):
            done = base_path
        else:
            not_done.append(base_path)

    if done is None:
        return False

    # Propagate verified files to the destinations that failed verification.
    for base_path in not_done:
        log.info(f'Copying data from {done} to {base_path}')
        for p in expected.keys():
            shutil.copy(done / p, base_path / p)
    return True
Example #3
0
# For every compose service built from source: check out its pinned commit,
# patch known hard-coded paths in its config, then record which files each
# download URL provides and which services expect them at which save paths.
for service_name, service_args in data["services"].items():
    # Hoist the repeated ["build"]["args"] lookups into one local.
    build_args = service_args.get("build", {}).get("args", {})
    if build_args.get("SRC_DIR") is not None:
        # Default to 'master' when no commit is pinned for the service.
        commit = build_args.get("COMMIT", "master")
        repo.git.checkout(commit)
        config_path = Path(build_args["SRC_DIR"]) / build_args["CONFIG"]
        try:
            if service_name in {"entity-detection", "kbqa"}:
                # These services need environment-specific paths rewritten
                # in their config before it can be parsed.
                with open(config_path) as fin:
                    lines = fin.readlines()
                with open(config_path, "w") as fout:
                    old_path, new_path = replace_paths[service_name]
                    fout.writelines([line.replace(old_path, new_path) for line in lines])
            config_downloads = dict(get_configs_downloads(config_path))
            for url, paths in config_downloads.items():
                md5_url = path_set_md5(url)
                # Timeout guards against a hung manifest server; explicit
                # raise (instead of assert) survives `python -O`.
                resp = requests.get(md5_url, timeout=30)
                if resp.status_code != 200:
                    raise ValueError(f"Got status {resp.status_code} for {md5_url}")
                for line in resp.text.splitlines():
                    _md5, f_name = line.split(" ", maxsplit=1)
                    # md5sum marks binary-mode hashes with '*'; anything
                    # else is an unexpected manifest format.
                    if f_name.startswith("*"):
                        f_name = f_name[1:]
                    else:
                        raise ValueError(f"Unexpected md5 line format: {line!r}")
                    for save_dir in paths:
                        downloads[str(save_dir / f_name)].append((service_name, url))
        except Exception as e:
            # Surface which service failed before re-raising.
            print(service_name)
            raise e

duplicates = {}