Example #1
def ingest_upload(request, uuid):
    """
        The DIP upload is not actually executed here, but some data is stored
        in the database (permalink, ...) and used later by upload-qubit.py.
        - GET: may be used to obtain the DIP size
        - POST: creates an Access record with the permalink
    """
    if not models.SIP.objects.filter(uuid__exact=uuid).exists():
        raise Http404

    if request.method == 'POST':
        if 'target' in request.POST:
            try:
                access = models.Access.objects.get(sipuuid=uuid)
            except models.Access.DoesNotExist:
                access = models.Access(sipuuid=uuid)
            access.target = cPickle.dumps({"target": request.POST['target']})
            access.save()
            response = {'ready': True}
            return helpers.json_response(response)
    elif request.method == 'GET':
        try:
            access = models.Access.objects.get(sipuuid=uuid)
            data = cPickle.loads(str(access.target))
        except Exception:
            raise Http404
        return helpers.json_response(data)

    return HttpResponseNotAllowed(['GET', 'POST'])
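
The view round-trips the permalink through a pickled dictionary stored in Access.target. A minimal sketch of that round-trip, assuming Access.target is a plain text/blob column (cPickle is Python 2; use pickle on Python 3):

import cPickle

# What the POST branch stores in the database...
stored = cPickle.dumps({"target": "permalink-slug"})
# ...and what the GET branch loads back and returns as JSON
restored = cPickle.loads(str(stored))
assert restored["target"] == "permalink-slug"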
Example #2
def ingest_upload(request, uuid):
    """
        The DIP upload is not actually executed here, but some data is stored
        in the database (permalink, ...) and used later by upload-qubit.py.
        - GET: may be used to obtain the DIP size
        - POST: creates an Access record with the permalink
    """
    try:
        models.SIP.objects.get(uuid__exact=uuid)
    except models.SIP.DoesNotExist:
        raise Http404

    if request.method == 'POST':
        if 'target' in request.POST:
            try:
                access = models.Access.objects.get(sipuuid=uuid)
            except models.Access.DoesNotExist:
                access = models.Access(sipuuid=uuid)
            access.target = cPickle.dumps({"target": request.POST['target']})
            access.save()
            response = {'ready': True}
            return helpers.json_response(response)
    elif request.method == 'GET':
        try:
            access = models.Access.objects.get(sipuuid=uuid)
            data = cPickle.loads(str(access.target))
        except Exception:
            raise Http404
        # Disabled, it could be very slow
        # job = models.Job.objects.get(jobtype='Upload DIP', sipuuid=uuid)
        # data['size'] = utils.get_directory_size(job.directory)
        return helpers.json_response(data)

    return HttpResponseBadRequest()
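
A hypothetical way to exercise the view with Django's test client; the URL paths below are illustrative placeholders, since the snippets do not show the URLconf:

from django.test import Client

client = Client()
# POST stores the permalink; the view answers {"ready": true}
client.post("/ingest/<sip-uuid>/upload/", {"target": "permalink-slug"})  # hypothetical path
# GET returns the stored data as JSON (404 if no Access record exists)
client.get("/ingest/<sip-uuid>/upload/")  # hypothetical path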
Example #3
def start(job, data):
    # Make sure we are working with an existing SIP record
    if not models.SIP.objects.filter(pk=data.uuid).exists():
        return error(job, "UUID not recognized")

    # Get directory
    jobs = models.Job.objects.filter(sipuuid=data.uuid, jobtype="Upload DIP")
    if jobs.count():
        directory = (jobs[0].directory.rstrip("/").replace(
            "%sharedPath%", "/var/archivematica/sharedDirectory/"))
    else:
        return error(job, 'No "Upload DIP" job found for SIP %s' % data.uuid)

    # Check if exists
    if os.path.exists(directory) is False:
        log("Directory not found: %s" % directory)

        # Trying with uploadedDIPs
        log("Looking up uploadedDIPs/")
        directory = directory.replace("uploadDIP", "uploadedDIPs")

        if os.path.exists(directory) is False:
            return error(job, "Directory not found: %s" % directory)

    try:
        # This upload was called before, restore Access record
        access = models.Access.objects.get(sipuuid=data.uuid)
    except models.Access.DoesNotExist:  # First time this job is called, create a new Access record
        access = models.Access(sipuuid=data.uuid)
        # Look for access system ID
        transfers = models.Transfer.objects.filter(
            file__sip_id=data.uuid).distinct()
        if transfers.count() == 1:
            access.target = cPickle.dumps(
                {"target": transfers[0].access_system_id})
        access.save()

    # The target column contains a serialized Python dictionary
    # - target is the permalink string
    try:
        target = cPickle.loads(str(access.target))
        log("Target: %s" % (target["target"]))
    except Exception:
        return error(job, "No target was selected")

    # Rsync if data.rsync_target option was passed to this script
    if data.rsync_target:
        """ Build command (rsync)
          -a = archive mode; expands to:
            -r = recursive
            -l = recreate symlinks on destination
            -p = set same permissions
            -t = transfer modification times
            -g = set same group owner on destination
            -o = set same user owner on destination (if possible, super-user)
            --devices = transfer character and block device files (only super-user)
            --specials = transfer special files like sockets and fifos
          -z = compress
          -P = --partial + --progress
        """
        # Using rsync -rltzP
        command = [
            "rsync",
            "--protect-args",
            "-rltz",
            "-P",
            "--chmod=ugo=rwX",
            directory,
            data.rsync_target,
        ]

        # Add -e if data.rsync_command was passed to this script
        if data.rsync_command:
            # Insert as two argv elements right after "rsync" so subprocess
            # passes them through correctly. Example: rsync -e "ssh -i key" ...
            command[1:1] = ["-e", data.rsync_command]

        log(" ".join(command))

        # Get around rsync output buffering by redirecting output to a temporary file
        pipe_output, file_name = tempfile.mkstemp()
        log("Rsync output is being saved in %s" % file_name)

        # Call Rsync
        process = subprocess.Popen(command,
                                   stdout=pipe_output,
                                   stderr=pipe_output)

        # poll() returns None while the process is still running
        while process.poll() is None:
            time.sleep(1)
            with open(file_name) as progress_file:
                lines = progress_file.readlines()

            # It's possible that rsync hasn't produced any output yet
            if not lines:
                continue
            last_line = lines[-1]

            # Match progress lines like "[bytes transferred]  NN%  [speed]  H:MM:SS"
            match = re.match(".* ([0-9]*)%.* ([0-9]*:[0-9]*:[0-9]*).*",
                             last_line)

            if not match:
                continue

            # Update upload status
            # - percentage in match.group(1)
            # - ETA in match.group(2)
            access.status = "Sending... %s (ETA: %s)" % (match.group(1),
                                                         match.group(2))
            access.statuscode = 10
            access.save()
            log(access.status)

        # The temporary file is no longer needed, but its removal is left disabled:
        # log("Removing temporary rsync output file: %s" % file_name)
        # os.unlink(file_name)

        # At this point, we should have a return code
        # If greater than zero, see man rsync (EXIT VALUES)
        access.exitcode = process.returncode
        if process.returncode > 0:
            access.statuscode = 12
        else:
            access.statuscode = 11
        access.save()

        if process.returncode > 0:
            return error(
                job,
                "Rsync quit unexpectedly (exit %s); the upload script will stop here"
                % process.returncode,
            )

    # Build the headers dictionary for the deposit request
    # (X-On-Behalf-Of and Content-Disposition are not set)
    headers = {
        "User-Agent": "Archivematica",
        "X-Packaging": "http://purl.org/net/sword-types/METSArchivematicaDIP",
        "Content-Type": "application/zip",
        "X-No-Op": "false",
        "X-Verbose": "false",
        "Content-Location": "file:///%s" % os.path.basename(directory),
    }

    # Build the URL (data.url is expected to be something like http://localhost/ica-atom/index.php)
    atom_url_prefix = ";" if data.version == 1 else ""
    deposit_url = "%s/%ssword/deposit/%s" % (
        data.url,
        atom_url_prefix,
        target["target"],
    )

    # Auth and request!
    log("About to deposit to: %s" % data.url)
    access.statuscode = 13
    access.resource = "%s/%s" % (data.url, target["target"])
    access.save()
    auth = requests.auth.HTTPBasicAuth(data.email, data.password)

    # Disable redirects: AtoM returns 302 instead of 202, but Location header field is valid
    response = requests.request(
        "POST",
        deposit_url,
        auth=auth,
        headers=headers,
        allow_redirects=False,
        timeout=mcpclient_settings.AGENTARCHIVES_CLIENT_TIMEOUT,
    )

    # response.{content,headers,status_code}
    log("> Response code: %s" % response.status_code)
    log("> Location: %s" % response.headers.get("Location"))

    if data.debug:
        # log("> Headers sent: %s" % headers)
        # log("> Headers received: %s" % response.headers)
        log("> Content received: %s" % response.content)

    # Check AtoM response status code
    if response.status_code not in [200, 201, 302]:
        return error(job, "Response code not expected")

    # A Location header is a must; if it is missing from the AtoM response, something went wrong
    if response.headers.get("Location") is None:
        return error(
            job,
            "A Location header was expected; its absence likely means something is wrong with AtoM"
        )

    # (A)synchronously?
    if response.status_code == 302:
        access.status = (
            "Deposited asynchronously, AtoM is processing the DIP in the job queue"
        )
        log(access.status)
    else:
        access.statuscode = 14
        access.status = "Deposited synchronously"
        log(access.status)
    access.save()

    # We also have to parse the XML document

    return 0
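
A quick sanity check of the progress regex used in the polling loop above, run against a typical rsync -P status line (the sample line is illustrative; exact rsync output varies between versions):

import re

line = "  1,234,567  42%  1.23MB/s    0:01:05"
match = re.match(".* ([0-9]*)%.* ([0-9]*:[0-9]*:[0-9]*).*", line)
if match:
    # group(1) is the percentage, group(2) the ETA
    print("percent=%s eta=%s" % (match.group(1), match.group(2)))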
Example #4
def start(data):
    # Make sure we are working with an existing SIP record
    try:
        models.SIP.objects.get(pk=data.uuid)
    except models.SIP.DoesNotExist:
        error("UUID not recognized")

    # Get directory
    jobs = models.Job.objects.filter(sipuuid=data.uuid, jobtype="Upload DIP")
    if jobs.count():
        directory = jobs[0].directory.rstrip('/').replace(
            '%sharedPath%', '/var/archivematica/sharedDirectory/')
    else:
        error('No "Upload DIP" job found for SIP %s' % data.uuid)

    # Check if exists
    if os.path.exists(directory) is False:
        log("Directory not found: %s" % directory)

        # Trying with uploadedDIPs
        log("Looking up uploadedDIPs/")
        directory = directory.replace('uploadDIP', 'uploadedDIPs')

        if os.path.exists(directory) is False:
            error("Directory not found: %s" % directory)

    try:
        # This upload was called before, restore Access record
        access = models.Access.objects.get(sipuuid=data.uuid)
    except models.Access.DoesNotExist:
        # First time this job is called, create new Access record
        access = models.Access(sipuuid=data.uuid)
        access.save()

    # The target column contains a serialized Python dictionary
    # - target is the permalink string
    try:
        target = cPickle.loads(str(access.target))
        log("Target: %s" % (target['target']))
    except Exception:
        error("No target was selected")

    # Rsync if data.rsync_target option was passed to this script
    if data.rsync_target:
        """ Build command (rsync)
          -a = archive mode; expands to:
            -r = recursive
            -l = recreate symlinks on destination
            -p = set same permissions
            -t = transfer modification times
            -g = set same group owner on destination
            -o = set same user owner on destination (if possible, super-user)
            --devices = transfer character and block device files (only super-user)
            --specials = transfer special files like sockets and fifos
          -z = compress
          -P = --partial + --progress
        """
        # Using rsync -rltzP
        command = [
            "rsync", "-rltz", "-P", "--chmod=ugo=rwX", directory,
            data.rsync_target
        ]

        # Add -e if data.rsync_command was passed to this script
        if data.rsync_command:
            # Insert as two argv elements right after "rsync" so subprocess
            # passes them through correctly. Example: rsync -e "ssh -i key" ...
            command[1:1] = ["-e", data.rsync_command]

        log(' '.join(command))

        # Get around rsync output buffering by redirecting output to a temporary file
        pipe_output, file_name = tempfile.mkstemp()
        log("Rsync output is being saved in %s" % file_name)

        # Call Rsync
        process = subprocess.Popen(command,
                                   stdout=pipe_output,
                                   stderr=pipe_output)

        # poll() returns None while the process is still running
        while process.poll() is None:
            time.sleep(1)
            with open(file_name) as progress_file:
                lines = progress_file.readlines()

            # It's possible that rsync hasn't produced any output yet
            if not lines:
                continue
            last_line = lines[-1]

            # Match progress lines like "[bytes transferred]  NN%  [speed]  H:MM:SS"
            match = re.match(".* ([0-9]*)%.* ([0-9]*:[0-9]*:[0-9]*).*",
                             last_line)

            if not match:
                continue

            # Update upload status
            # - percentage in match.group(1)
            # - ETA in match.group(2)
            access.status = "Sending... %s%% (ETA: %s)" % (match.group(1),
                                                           match.group(2))
            access.statuscode = 10
            access.save()
            log(access.status)

        # The temporary file is no longer needed, but its removal is left disabled:
        # log("Removing temporary rsync output file: %s" % file_name)
        # os.unlink(file_name)

        # At this point, we should have a return code
        # If greater than zero, see man rsync (EXIT VALUES)
        access.exitcode = process.returncode
        if process.returncode > 0:
            access.statuscode = 12
        else:
            access.statuscode = 11
        access.save()

        if process.returncode > 0:
            error(
                "Rsync quit unexpectedly (exit %s); the upload script will stop here"
                % process.returncode)

    # Build the headers dictionary for the deposit request
    # (X-On-Behalf-Of and Content-Disposition are not set)
    headers = {
        'User-Agent': 'Archivematica',
        'X-Packaging': 'http://purl.org/net/sword-types/METSArchivematicaDIP',
        'Content-Type': 'application/zip',
        'X-No-Op': 'false',
        'X-Verbose': 'false',
        'Content-Location': "file:///%s" % os.path.basename(directory),
    }

    # Build the URL (data.url is expected to be something like http://localhost/ica-atom/index.php)
    data.url = "%s/;sword/deposit/%s" % (data.url, target['target'])

    # Auth and request!
    log("About to deposit to: %s" % data.url)
    access.statuscode = 13
    access.save()
    auth = requests.auth.HTTPBasicAuth(data.email, data.password)
    response = requests.request('POST', data.url, auth=auth, headers=headers)

    # response.{content,headers,status_code}
    log("> Response code: %s" % response.status_code)
    log("> Location: %s" % response.headers.get('Location'))

    if data.debug:
        # log("> Headers sent: %s" % headers)
        # log("> Headers received: %s" % response.headers)
        log("> Content received: %s" % response.content)

    # Check the Qubit response status code
    if response.status_code not in [200, 201, 302]:
        error("Unexpected response code: %s" % response.status_code)

    # A Location header is a must; if it is missing from the Qubit response, something went wrong
    if response.headers.get('Location') is None:
        error(
            "A Location header was expected; its absence likely means something is wrong with Qubit"
        )
    else:
        access.resource = data.url

    # (A)synchronously?
    if response.status_code == 200:
        access.statuscode = 14
        access.status = "Deposited synchronously"
        log(access.status)
    else:
        access.statuscode = 15
        access.status = "Deposited asynchronously, Qubit is processing the DIP in the job queue"
        log(access.status)
    access.save()
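
A minimal sketch of the deposit URL construction shared by Examples #3 and #4; build_deposit_url is an illustrative helper, not part of the original code, and the leading ";" follows the Qubit/AtoM version 1 scheme seen above:

def build_deposit_url(base_url, target, version):
    # Version 1 expects "/;sword/deposit/<target>"; later versions drop the ";"
    prefix = ";" if version == 1 else ""
    return "%s/%ssword/deposit/%s" % (base_url, prefix, target)

print(build_deposit_url("http://localhost/ica-atom/index.php", "permalink-slug", 1))
# -> http://localhost/ica-atom/index.php/;sword/deposit/permalink-slug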