Exemple #1
0
def _download_extract_downstream_data(input_file, proxies=None):
    """
    Download the data of a downstream task and place it in the data directory.

    The task name is the stem of the directory containing ``input_file``
    (``Path.stem`` drops anything after the last dot of the directory name).
    For task names containing ``"conll03-"`` the three splits are fetched one
    by one through ``_conll03get``. For every other task the archive URL is
    looked up in ``DOWNSTREAM_TASK_MAP``, downloaded to a temporary file and
    extracted into the parent of the task directory.

    Unknown tasks and md5 mismatches are reported through ``logger.error``
    only; no exception is raised, and a mismatching archive is still extracted.

    :param input_file: path of a file inside the task directory; only its
        location is used to derive the task name and the target directory
    :param proxies: passed through unchanged to ``http_get``
    :return: None
    """
    # download archive to temp dir and extract to correct position
    full_path = Path(os.path.realpath(input_file))
    directory = full_path.parent
    taskname = directory.stem
    datadir = directory.parent
    logger.info("downloading and extracting file {} to dir {}".format(
        taskname, datadir))

    if "conll03-" in taskname:
        # conll03 is copyrighted, but luckily somebody put it on github. Kudos!
        os.makedirs(directory, exist_ok=True)
        # The language does not depend on the split, so resolve it once and
        # report an unknown language a single time.
        if "de" in taskname:
            language = "de"
        elif "en" in taskname:
            language = "en"
        else:
            logger.error("Cannot download {}. Unknown data source.".format(
                taskname))
            return
        for dataset in ["train", "dev", "test"]:
            _conll03get(dataset, directory, language)
        return

    if taskname not in DOWNSTREAM_TASK_MAP:
        logger.error(
            "Cannot download {}. Unknown data source.".format(taskname))
        return

    # Expected md5 sums, checked in this order; the first marker found in the
    # task name decides which checksum applies.
    known_md5sums = (
        ("germeval14", "2c9d5337d7a25b9a4bf6f5672dd091bc"),
        ("germeval18", "23244fa042dcc39e844635285c455205"),
        ("gnad", "ef62fe3f59c1ad54cf0271d8532b8f22"),
        ("germeval17", "f1bf67247dcfe7c3c919b7b20b3f736e"),
    )

    # On Windows a NamedTemporaryFile created with delete=True cannot be
    # reopened by name while it is still open, so auto-deletion is disabled
    # there and the file is removed manually below.
    delete_tmp_file = os.name != "nt"
    temp_file = tempfile.NamedTemporaryFile(delete=delete_tmp_file)
    try:
        with temp_file:
            http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
            temp_file.flush()
            temp_file.seek(0)  # making tempfile accessible

            # checking files for correctness with md5sum.
            for marker, expected_md5 in known_md5sums:
                if marker in taskname:
                    if expected_md5 != _get_md5checksum(temp_file.name):
                        logger.error(
                            f"Someone has changed the file for {taskname}. Please make sure the correct file is used and update the md5sum in farm/data_handler/utils.py"
                        )
                    break

            # NOTE(review): extractall trusts the member paths stored in the
            # archive; consider an extraction filter on Python versions that
            # provide one.
            with tarfile.open(temp_file.name) as tfile:
                tfile.extractall(datadir)
    finally:
        # With delete=False nothing removes the temp file for us.
        if not delete_tmp_file and os.path.exists(temp_file.name):
            os.remove(temp_file.name)
Exemple #2
0
def _download_extract_downstream_data(input_file, proxies=None):
    """
    Download the data of a downstream task and place it in the data directory.

    The task name is the name of the directory containing ``input_file``.
    For task names containing ``"conll03"`` the three splits are fetched one
    by one through ``_conll03get``. For every other task the archive URL is
    looked up in ``DOWNSTREAM_TASK_MAP``, downloaded to a temporary file and
    extracted into the parent of the task directory.

    Unknown tasks are reported through ``logger.error`` only; no exception is
    raised.

    :param input_file: path of a file inside the task directory; only its
        location is used to derive the task name and the target directory
    :param proxies: passed through unchanged to ``http_get``
    :return: None
    """
    # download archive to temp dir and extract to correct position
    full_path = os.path.realpath(input_file)
    directory = os.path.dirname(full_path)
    # os.path helpers honour the platform separator, unlike splitting on "/".
    taskname = os.path.basename(directory)
    datadir = os.path.dirname(directory)
    logger.info("downloading and extracting file {} to dir {}".format(
        taskname, datadir))

    if "conll03" in taskname:
        # conll03 is copyrighted, but luckily somebody put it on github. Kudos!
        os.makedirs(directory, exist_ok=True)
        # The language does not depend on the split, so resolve it once and
        # report an unknown language a single time.
        if "de" in taskname:
            language = "de"
        elif "en" in taskname:
            language = "en"
        else:
            logger.error("Cannot download {}. Unknown data source.".format(
                taskname))
            return
        for dataset in ["train", "dev", "test"]:
            _conll03get(dataset, directory, language)
        return

    if taskname not in DOWNSTREAM_TASK_MAP:
        logger.error(
            "Cannot download {}. Unknown data source.".format(taskname))
        return

    # On Windows a NamedTemporaryFile created with delete=True cannot be
    # reopened by name while it is still open, so auto-deletion is disabled
    # there and the file is removed manually below.
    delete_tmp_file = os.name != "nt"
    temp_file = tempfile.NamedTemporaryFile(delete=delete_tmp_file)
    try:
        with temp_file:
            http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
            temp_file.flush()
            temp_file.seek(0)  # making tempfile accessible
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; consider an extraction filter on Python versions that
            # provide one.
            with tarfile.open(temp_file.name) as tfile:
                tfile.extractall(datadir)
    finally:
        # With delete=False nothing removes the temp file for us.
        if not delete_tmp_file and os.path.exists(temp_file.name):
            os.remove(temp_file.name)
Exemple #3
0
def _download_extract_downstream_data(input_file, proxies=None):
    """
    Download the data of a downstream task and place it in the data directory.

    The task name is the stem of the directory containing ``input_file``
    (``Path.stem`` drops anything after the last dot of the directory name).
    For task names containing ``"conll03"`` the three splits are fetched one
    by one through ``_conll03get``. For every other task the archive URL is
    looked up in ``DOWNSTREAM_TASK_MAP``, downloaded to a temporary file and
    extracted into the parent of the task directory.

    Unknown tasks are reported through ``logger.error`` only; no exception is
    raised.

    :param input_file: path of a file inside the task directory; only its
        location is used to derive the task name and the target directory
    :param proxies: passed through unchanged to ``http_get``
    :return: None
    """
    # download archive to temp dir and extract to correct position
    full_path = Path(os.path.realpath(input_file))
    directory = full_path.parent
    taskname = directory.stem
    datadir = directory.parent
    logger.info(
        "downloading and extracting file {} to dir {}".format(taskname, datadir)
    )

    if "conll03" in taskname:
        # conll03 is copyrighted, but luckily somebody put it on github. Kudos!
        os.makedirs(directory, exist_ok=True)
        # The language does not depend on the split, so resolve it once and
        # report an unknown language a single time.
        if "de" in taskname:
            language = "de"
        elif "en" in taskname:
            language = "en"
        else:
            logger.error("Cannot download {}. Unknown data source.".format(taskname))
            return
        for dataset in ["train", "dev", "test"]:
            _conll03get(dataset, directory, language)
        return

    if taskname not in DOWNSTREAM_TASK_MAP:
        logger.error("Cannot download {}. Unknown data source.".format(taskname))
        return

    # On Windows a NamedTemporaryFile created with delete=True cannot be
    # reopened by name while it is still open, so auto-deletion is disabled
    # there and the file is removed manually below.
    delete_tmp_file = os.name != "nt"
    temp_file = tempfile.NamedTemporaryFile(delete=delete_tmp_file)
    try:
        with temp_file:
            http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
            temp_file.flush()
            temp_file.seek(0)  # making tempfile accessible
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; consider an extraction filter on Python versions that
            # provide one.
            with tarfile.open(temp_file.name) as tfile:
                tfile.extractall(datadir)
    finally:
        # With delete=False nothing removes the temp file for us.
        if not delete_tmp_file and os.path.exists(temp_file.name):
            os.remove(temp_file.name)
Exemple #4
0
def download_file(s3_url: str, out_dir: str, file_name: str):
    """
    Download ``s3_url`` to ``out_dir/file_name`` unless that file already exists.

    :param s3_url: URL to download from
    :param out_dir: existing directory the file is stored in
    :param file_name: name of the file inside ``out_dir``
    :return: None
    """
    print('Loading from ', s3_url)
    local_file = os.path.join(out_dir, file_name)

    if os.path.exists(local_file):
        print('File already exist ', local_file)
        return

    # The payload is fetched exactly once, into a binary handle (the sibling
    # download_from_url also hands http_get a "wb" file; presumably http_get
    # writes raw bytes - confirm against its definition).
    completed = False
    try:
        with open(local_file, "wb") as file:
            http_get(s3_url, temp_file=file)
        completed = True
    finally:
        # A partial file would make the next call skip the download, so drop it.
        if not completed and os.path.exists(local_file):
            os.remove(local_file)
    print('Saved to ', local_file)
Exemple #5
0
def download_from_url(url: str, filepath: Union[str, Path]):
    """
    Download from a url to a local file. Skip already existing files.

    The parent folder of ``filepath`` is created when it is missing. A
    ``filepath`` without a directory component is stored relative to the
    current working directory.

    :param url: Url
    :param filepath: local path where the url content shall be stored
    :return: local path of the downloaded file
    """

    logger.info(f"Downloading {url}")
    # Create local folder; an empty folder name means "current directory",
    # which os.makedirs cannot handle.
    folder = os.path.dirname(filepath)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # Download file if not present locally
    if os.path.exists(filepath):
        logger.info(f"Skipping {url} (exists locally)")
    else:
        logger.info(f"Downloading {url} to {filepath} ")
        completed = False
        try:
            with open(filepath, "wb") as file:
                http_get(url=url, temp_file=file)
            completed = True
        finally:
            # A partial file would be skipped as "exists locally" next time.
            if not completed and os.path.exists(filepath):
                os.remove(filepath)
    return filepath