Example #1
0
def aip_file_download(request, uuid):
    """Stream a single file out of a stored AIP via the storage service.

    Looks up the file's AIP in Elasticsearch, reconstructs the file's
    path inside the extracted AIP package, and streams the file back to
    the client through the storage service.
    """
    # Database record for the requested file; its location is needed to
    # work out the path inside the AIP.
    file_record = models.File.objects.get(uuid=uuid)
    basename = os.path.basename(file_record.currentlocation)

    # Fetch the Elasticsearch document for the AIP containing the file.
    sipuuid = helpers.get_file_sip_uuid(uuid)
    es_client = elasticSearchFunctions.get_client()
    aip = elasticSearchFunctions.get_aip_data(es_client, sipuuid, fields='uuid,name,filePath,size,origin,created')
    aip_filepath = aip['fields']['filePath'][0]

    # The package extracts into a directory named after the archive
    # minus its extension.  os.path.splitext() strips only one suffix,
    # so the double extension .tar.bz2 is special-cased.
    archive_name = os.path.basename(aip_filepath)
    suffix = '.tar.bz2'
    if archive_name.endswith(suffix):
        subdir = archive_name[:-len(suffix)]
    else:
        subdir, _ext = os.path.splitext(archive_name)

    # Drop the %Directory% placeholders to get the file's directory
    # relative to the AIP's data directory.
    stripped_location = file_record.currentlocation.replace('%transferDirectory%', '').replace('%SIPDirectory%', '')
    within_data_dir = os.path.dirname(stripped_location)

    file_relative_path = os.path.join(subdir, 'data', within_data_dir, basename)

    redirect_url = storage_service.extract_file_url(aip['fields']['uuid'][0], file_relative_path)
    return helpers.stream_file_from_storage_service(redirect_url, 'Storage service returned {}; check logs?')
Example #2
0
def aip_file_download(request, uuid):
    """Stream a file stored inside an AIP via the storage service."""
    client = es.get_client()

    # The file document provides its path within the AIP and the UUID
    # of the AIP that contains it.
    aipfile = es.get_aipfile_data(client, uuid, fields="filePath,FILEUUID,AIPUUID")

    # The AIP document provides the package's path on disk.
    sipuuid = aipfile["_source"]["AIPUUID"]
    aip = es.get_aip_data(
        client, sipuuid, fields="uuid,name,filePath,size,origin,created"
    )

    # The package extracts into a directory named after the archive
    # minus its extension.  os.path.splitext() strips only one suffix,
    # so the double extension .tar.bz2 is special-cased.
    archive_name = os.path.basename(aip["_source"]["filePath"])
    suffix = ".tar.bz2"
    if archive_name.endswith(suffix):
        subdir = archive_name[: -len(suffix)]
    else:
        subdir, _ext = os.path.splitext(archive_name)

    file_relative_path = os.path.join(subdir, "data", aipfile["_source"]["filePath"])

    redirect_url = storage_service.extract_file_url(
        aip["_source"]["uuid"], file_relative_path
    )
    return helpers.stream_file_from_storage_service(
        redirect_url, "Storage service returned {}; check logs?"
    )
Example #3
0
def download_by_uuid(request, uuid, preview_file=False):
    """Stream a file from the Storage Service given its file UUID.

    The response is streamed through this server, so (unlike
    download_ss) it works even when the requestor cannot reach the
    Storage Service directly.

    Returns 404 when no file with the requested UUID exists; otherwise
    the status code comes from ``stream_file_from_storage_service``.

    When ``preview_file`` is truthy, stream_file_from_storage_service
    sets the content-disposition header to 'inline' so capable browsers
    render the file in place instead of downloading it.
    """
    try:
        file_record = models.File.objects.get(uuid=uuid)
    except models.File.DoesNotExist:
        message = _('File with UUID %(uuid)s '
                    'could not be found') % {'uuid': uuid}
        return helpers.json_response(
            {'success': False, 'message': message}, status_code=404)

    relative_path = file_record.currentlocation.replace('%transferDirectory%', '')
    redirect_url = storage_service.extract_file_url(
        file_record.transfer_id, relative_path)
    return helpers.stream_file_from_storage_service(
        redirect_url, 'Storage service returned {}; check logs?', preview_file)
Example #4
0
def download_by_uuid(request, uuid, preview_file=False):
    """Stream a file from the Storage Service given its file UUID.

    The response is streamed through this server, so (unlike
    download_ss) it works even when the requestor cannot reach the
    Storage Service directly.

    The file's full relative path is looked up in the ``transferfiles``
    search index; for transfers packaged as BagIt that path includes
    the ``data`` directory.

    Returns 404 when the file cannot be found. Otherwise the status
    code comes from ``stream_file_from_storage_service``.

    When ``preview_file`` is truthy, stream_file_from_storage_service
    sets the content-disposition header to 'inline' so capable browsers
    render the file in place instead of downloading it.
    """
    not_found_err = helpers.json_response(
        {
            "success": False,
            "message": _("File with UUID %(uuid)s " "could not be found")
            % {"uuid": uuid},
        },
        status_code=404,
    )

    try:
        record = elasticSearchFunctions.get_transfer_file_info(
            elasticSearchFunctions.get_client(), "fileuuid", uuid
        )
    except elasticSearchFunctions.ElasticsearchError:
        return not_found_err

    # Both fields must be present in the search document.
    if "sipuuid" not in record or "relative_path" not in record:
        logger.debug("Search document is missing required parameters")
        return not_found_err
    transfer_id = record["sipuuid"]
    relpath = record["relative_path"]

    # Drop the leading "<name>-<uuid>/" component; only the path inside
    # the transfer is wanted, e.g. "data/objects/bird.mp3".
    pieces = relpath.split("/", 1)
    if len(pieces) < 2:
        logger.debug(
            "Relative path in search document has an unexpected form: %s", relpath
        )
        return not_found_err
    relpath = pieces[1]

    redirect_url = storage_service.extract_file_url(transfer_id, relpath)
    return helpers.stream_file_from_storage_service(
        redirect_url, "Storage service returned {}; check logs?", preview_file
    )
Example #5
0
def download_ss(request):
    """Redirect the client to a backlog file in the Storage Service.

    The request carries a base64-encoded ``filepath`` query parameter
    pointing into the backlog; the transfer UUID and the path relative
    to the transfer are extracted from it and turned into a storage
    service extract-file URL.

    Returns 400 when the path is not under the backlog or contains no
    transfer UUID.
    """
    filepath = base64.b64decode(request.GET.get('filepath', '')).lstrip('/')
    logging.info('download filepath: %s', filepath)
    if not filepath.startswith(DEFAULT_BACKLOG_PATH):
        return django.http.HttpResponseBadRequest()
    filepath = filepath.replace(DEFAULT_BACKLOG_PATH, '', 1)

    # Get UUID. BUG FIX: re.search() returns None when the path holds
    # no UUID; the original called .group() on it unguarded, turning a
    # malformed request into an AttributeError (HTTP 500) instead of 400.
    uuid_regex = r'[\w]{8}(-[\w]{4}){3}-[\w]{12}'
    match = re.search(uuid_regex, filepath)
    if match is None:
        return django.http.HttpResponseBadRequest()
    transfer_uuid = match.group()

    # Get relative path: the first / ends the "<transfer name>-<uuid>"
    # component, the rest is the path relative to the transfer.
    relative_path = filepath[filepath.find('/')+1:]

    redirect_url = storage_service.extract_file_url(transfer_uuid, relative_path)
    return django.http.HttpResponseRedirect(redirect_url)
Example #6
0
def download_ss(request):
    """Stream a backlog file from the Storage Service to the client.

    The request carries a base64-encoded ``filepath`` query parameter
    pointing into the backlog; the transfer UUID and the path relative
    to the transfer are extracted from it and the file is streamed via
    ``stream_file_from_storage_service``.

    Returns 400 when the path is not under the backlog or contains no
    transfer UUID.
    """
    # NOTE(review): base64.b64decode returns bytes on Python 3, which
    # would make .lstrip("/") raise TypeError — confirm whether a
    # .decode() is needed here.
    filepath = base64.b64decode(request.GET.get("filepath", "")).lstrip("/")
    logger.info("download filepath: %s", filepath)
    if not filepath.startswith(DEFAULT_BACKLOG_PATH):
        return django.http.HttpResponseBadRequest()
    filepath = filepath.replace(DEFAULT_BACKLOG_PATH, "", 1)

    # Get UUID. BUG FIX: re.search() returns None when the path holds
    # no UUID; the original called .group() on it unguarded, turning a
    # malformed request into an AttributeError (HTTP 500) instead of 400.
    uuid_regex = r"[\w]{8}(-[\w]{4}){3}-[\w]{12}"
    match = re.search(uuid_regex, filepath)
    if match is None:
        return django.http.HttpResponseBadRequest()
    transfer_uuid = match.group()

    # Get relative path: the first / ends the "<transfer name>-<uuid>"
    # component, the rest is the path relative to the transfer.
    relative_path = filepath[filepath.find("/") + 1 :]

    redirect_url = storage_service.extract_file_url(transfer_uuid, relative_path)
    return helpers.stream_file_from_storage_service(
        redirect_url, "Storage service returned {}; check logs?"
    )
Example #7
0
def download_by_uuid(request, uuid):
    """Stream a file from the Storage Service given its file UUID.

    The response is streamed directly through this server, so (unlike
    download_ss) it works even when the requestor cannot reach the
    Storage Service directly.

    Returns 404 if no file with the requested UUID exists, and 400 if
    the storage service fails to retrieve the record.
    """
    try:
        file_record = models.File.objects.get(uuid=uuid)
    except models.File.DoesNotExist:
        return helpers.json_response({
            'success': False,
            'message': 'File with UUID ' + uuid + ' could not be found',
        }, status_code=404)

    relative_path = file_record.currentlocation.replace('%transferDirectory%', '')
    redirect_url = storage_service.extract_file_url(
        file_record.transfer_id, relative_path)
    return helpers.stream_file_from_storage_service(
        redirect_url, 'Storage service returned {}; check logs?')
Example #8
0
def bulk_extractor(request, fileuuid):
    """
    Fetch bulk_extractor reports for a given file, and return a parsed copy of them as JSON.

    Supports the 'reports' query parameter; this is a comma-separated list of reports to return.
    If not specified, then the 'ccn' and 'pii' reports are returned.

    If no reports are requested, or if the requested file is missing at least one of the requested reports, returns 400.
    If no file can be found for the given UUID, returns 404.

    Data structure looks like:
        {
            "report": [
                {
                    "content": "",
                    "context": "",
                    "offset": 0
                }
            ]
        }
    It will have one key for each parsed report, with each report's list of features containing zero or more objects.
    """
    # Filter out empty names so "?reports=" (or stray commas) triggers
    # the no-reports error below; a bare str.split(',') never yields an
    # empty list, which made the original check unreachable.
    reports = [r for r in request.GET.get('reports', 'ccn,pii').split(',') if r]

    if not reports:
        response = {'success': False, 'message': 'No reports were requested.'}
        return helpers.json_response(response, status_code=400)

    try:
        es_client = elasticSearchFunctions.get_client()
        record = elasticSearchFunctions.get_transfer_file_info(
            es_client, 'fileuuid', fileuuid)
    except elasticSearchFunctions.ElasticsearchError as e:
        message = str(e)
        # "no exact results" means the file is unknown (404); anything
        # else is an unexpected search failure (500).
        if 'no exact results' in message:
            status_code = 404
        else:
            status_code = 500
        return helpers.json_response(
            {'success': False, 'message': message}, status_code=status_code)

    bulk_extractor_reports = record.get('bulk_extractor_reports', [])
    missing_reports = [r for r in reports if r not in bulk_extractor_reports]

    if missing_reports:
        response = {
            'success':
            False,
            'message':
            'Requested file is missing the following requested reports: ' +
            ', '.join(missing_reports),
        }
        return helpers.json_response(response, status_code=400)

    f = models.File.objects.get(uuid=fileuuid)
    features = {}

    for report in reports:
        relative_path = os.path.join('logs', 'bulk-' + fileuuid,
                                     report + '.txt')
        url = storage_service.extract_file_url(f.transfer_id, relative_path)
        response = requests.get(url)

        if response.status_code != 200:
            message = 'Unable to retrieve ' + report + ' report for file with UUID ' + fileuuid
            logger.error(message + '; response: %s', (response.text, ))
            # BUG FIX: the original built this error payload but never
            # returned it, then fell through and crashed with
            # AttributeError calling .text on the dict below.
            return helpers.json_response(
                {'success': False, 'message': message}, status_code=500)

        features[report] = _parse_bulk_extractor_report(response.text)

    return helpers.json_response(features)
Example #9
0
def bulk_extractor(request, fileuuid):
    """
    Fetch bulk_extractor reports for a given file, and return a parsed copy of them as JSON.

    Supports the 'reports' query parameter; this is a comma-separated list of reports to return.
    If not specified, then the 'ccn' and 'pii' reports are returned.

    If no reports are requested, or if the requested file is missing at least one of the requested reports, returns 400.
    If no file can be found for the given UUID, returns 404.

    Data structure looks like:
        {
            "report": [
                {
                    "content": "",
                    "context": "",
                    "offset": 0
                }
            ]
        }
    It will have one key for each parsed report, with each report's list of features containing zero or more objects.
    """
    # Filter out empty names so "?reports=" (or stray commas) triggers
    # the no-reports error below; a bare str.split(",") never yields an
    # empty list, which made the original check unreachable.
    reports = [r for r in request.GET.get("reports", "ccn,pii").split(",") if r]

    if not reports:
        response = {"success": False, "message": "No reports were requested."}
        return helpers.json_response(response, status_code=400)

    try:
        es_client = elasticSearchFunctions.get_client()
        record = elasticSearchFunctions.get_transfer_file_info(
            es_client, "fileuuid", fileuuid
        )
    except elasticSearchFunctions.ElasticsearchError as e:
        message = str(e)
        # "no exact results" means the file is unknown (404); anything
        # else is an unexpected search failure (500).
        if "no exact results" in message:
            status_code = 404
        else:
            status_code = 500
        return helpers.json_response(
            {"success": False, "message": message}, status_code=status_code
        )

    bulk_extractor_reports = record.get("bulk_extractor_reports", [])
    missing_reports = [r for r in reports if r not in bulk_extractor_reports]

    if missing_reports:
        response = {
            "success": False,
            "message": "Requested file is missing the following requested reports: "
            + ", ".join(missing_reports),
        }
        return helpers.json_response(response, status_code=400)

    f = models.File.objects.get(uuid=fileuuid)
    features = {}

    for report in reports:
        relative_path = os.path.join(
            "data", "logs", "bulk-" + fileuuid, report + ".txt"
        )
        url = storage_service.extract_file_url(f.transfer_id, relative_path)
        response = requests.get(
            url, timeout=django_settings.STORAGE_SERVICE_CLIENT_TIMEOUT
        )

        if response.status_code != 200:
            message = (
                "Unable to retrieve "
                + report
                + " report for file with UUID "
                + fileuuid
            )
            logger.error(message + "; response: %s", (response.text,))
            # BUG FIX: the original built this error payload but never
            # returned it, then fell through and crashed with
            # AttributeError calling .text on the dict below.
            return helpers.json_response(
                {"success": False, "message": message}, status_code=500
            )

        features[report] = _parse_bulk_extractor_report(response.text)

    return helpers.json_response(features)