Example #1
0
def generate_datafile(path, dataset, content=None, size=-1, 
                      verify=True, verified=True):
    '''Generates a datafile AND a replica to hold its contents'''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location

    saved = settings.REQUIRE_DATAFILE_CHECKSUMS 
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path == None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id, datafile.id)

        filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
        if content:
            try:
                os.makedirs(os.path.dirname(filepath))
                os.remove(filepath)
            except:
                pass
            file = open(filepath, 'wb+')
            file.write(content)
            file.close()
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()

        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify(allowEmptyChecksums=True):
                raise RuntimeError('verify failed!?!')
        else:
            replica.verified = verified
        replica.save()
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
Example #2
0
def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True, verify_checksums_req=False):
    '''Generates a datafile AND a replica to hold its contents'''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location

    saved = settings.REQUIRE_DATAFILE_CHECKSUMS
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path == None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id, datafile.id)

        filepath = os.path.normpath(settings.FILE_STORE_PATH + '/' + path)
        if content:
            try:
                os.makedirs(os.path.dirname(filepath))
                os.remove(filepath)
            except:
                pass
            gen_file = open(filepath, 'wb+')
            gen_file.write(content)
            gen_file.close()
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()
        settings.REQUIRE_DATAFILE_CHECKSUMS = verify_checksums_req
        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify():
                raise RuntimeError('verify failed!?!')
        replica.save()
        replica.verified = verified
        replica.save(update_fields=['verified'])  # force no verification
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
Example #3
0
def add_staged_file_to_dataset(rel_filepath, dataset_id, username,
                               mimetype="application/octet-stream"):
    """
    add file in user's staging path to a dataset
    may be replaced by main code functions.
    quick and dirty hack to get it working
    """
    originfilepath = os.path.join(get_full_staging_path(username), rel_filepath)
    dataset = Dataset.objects.get(pk=dataset_id)
    newDatafile = Dataset_File()
    newDatafile.dataset = dataset
    newDatafile.size = os.path.getsize(originfilepath)
    newDatafile.protocol = "tardis"
    newDatafile.mimetype = mimetype
    file_dir = "/" + str(dataset.experiment.id) + "/" + str(dataset.id) + "/"
    file_path = file_dir + rel_filepath
    prelim_full_file_path = settings.FILE_STORE_PATH + file_path
    full_file_path = duplicate_file_check_rename(prelim_full_file_path)
    newDatafile.filename = os.path.basename(full_file_path)
    newDatafile.url = "%s://%s" % (newDatafile.protocol,
                                   full_file_path[
            len(settings.FILE_STORE_PATH) + len(file_dir):])
    if not os.path.exists(os.path.dirname(full_file_path)):
        os.makedirs(os.path.dirname(full_file_path))
    shutil.move(originfilepath, full_file_path)
    newDatafile.save()
Example #4
0
    def testRemoteFile(self):
        content = urandom(1024)
        with NamedTemporaryFile() as f:
            # Create new Datafile
            datafile = Dataset_File(dataset=self.dataset)
            datafile.filename = 'background_task_testfile'
            datafile.size = len(content)
            datafile.sha512sum = hashlib.sha512(content).hexdigest()
            datafile.url = 'file://' + path.abspath(f.name)
            datafile.save()

            def get_datafile(datafile):
                return Dataset_File.objects.get(id=datafile.id)

            # Check that it won't verify as it stands
            expect(get_datafile(datafile).verified).to_be(False)
            verify_files()
            expect(get_datafile(datafile).verified).to_be(False)
            expect(get_datafile(datafile).is_local()).to_be(False)

            # Fill in the content
            f.write(content)
            f.flush()

            # Check it now verifies
            verify_files()
            expect(get_datafile(datafile).verified).to_be(True)
            expect(get_datafile(datafile).is_local()).to_be(True)
Example #5
0
    def testLocalFile(self):
        content = urandom(1024)
        cf = ContentFile(content, 'background_task_testfile')

        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = cf.name
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.save()
        replica = Replica(datafile=datafile,
                          url=write_uploaded_file_to_dataset(self.dataset, cf),
                          location=Location.get_default_location())
        replica.save()

        def get_replica(datafile):
            return Replica.objects.get(datafile=datafile)

        # undo auto-verify:
        replica.verified = False
        replica.save(update_fields=['verified'])

        # Check that it's not currently verified
        expect(get_replica(datafile).verified).to_be(False)

        # Check it verifies
        verify_files()
        expect(get_replica(datafile).verified).to_be(True)
Example #6
0
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(filename=filename, dataset=dataset)
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()
        url = enclosure.href
        # This means we will allow the atom feed to feed us any enclosure
        # URL that matches a registered location.  Maybe we should restrict
        # this to a specific location.
        location = Location.get_location_for_url(url)
        if not location:
            logger.error('Rejected ingestion for unknown location %s' % url)
            return

        replica = Replica(datafile=datafile, url=url, location=location)
        replica.protocol = enclosure.href.partition('://')[0]
        replica.save()
        self.make_local_copy(replica)
Example #7
0
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(filename=filename, dataset=dataset)
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()
        url = enclosure.href
        # This means we will allow the atom feed to feed us any enclosure
        # URL that matches a registered location.  Maybe we should restrict
        # this to a specific location.
        location = Location.get_location_for_url(url)
        if not location:
            logger.error('Rejected ingestion for unknown location %s' % url)
            return

        replica = Replica(datafile=datafile, url=url,
                          location=location)
        replica.protocol = enclosure.href.partition('://')[0]
        replica.save()
        self.make_local_copy(replica)
Example #8
0
    def testRemoteFile(self):
            content = urandom(1024)
            with NamedTemporaryFile() as f:
                # Create new Datafile
                datafile = Dataset_File(dataset=self.dataset)
                datafile.filename = 'background_task_testfile'
                datafile.size = len(content)
                datafile.sha512sum = hashlib.sha512(content).hexdigest()
                datafile.url = 'file://' + path.abspath(f.name)
                datafile.save()

                def get_datafile(datafile):
                    return Dataset_File.objects.get(id=datafile.id)

                # Check that it won't verify as it stands
                expect(get_datafile(datafile).verified).to_be(False)
                verify_files()
                expect(get_datafile(datafile).verified).to_be(False)
                expect(get_datafile(datafile).is_local()).to_be(False)


                # Fill in the content
                f.write(content)
                f.flush()

                # Check it now verifies
                verify_files()
                expect(get_datafile(datafile).verified).to_be(True)
                expect(get_datafile(datafile).is_local()).to_be(True)
Example #9
0
def generate_datafile(path, dataset, content=None, size=-1, 
                      verify=True, verified=True):
    from tardis.tardis_portal.models import Dataset_File
    datafile = Dataset_File()
    # Normally we use any old string for the datafile path, but some
    # tests require the path to be the same as what 'staging' would use
    if path == None:
        datafile.dataset_id = dataset.id
        datafile.save()
        path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                             dataset.id, datafile.id)

    filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
    if content:
        try:
            os.makedirs(os.path.dirname(filepath))
            os.remove(filepath)
        except:
            pass
        file = open(filepath, 'wb+')
        file.write(content)
        file.close()
    datafile.url = path
    datafile.mimetype = "application/unspecified"
    datafile.filename = os.path.basename(filepath)
    datafile.dataset_id = dataset.id
    if content:
        datafile.size = str(len(content))
    else:
        datafile.size = str(size)
    if verify and content:
        if not datafile.verify(allowEmptyChecksums=True):
            raise RuntimeError('verify failed!?!')
    else:
        datafile.verified = verified
    datafile.save()
    return datafile
Example #10
0
def _make_data_file(dataset, filename, content):
    # TODO:
    # create datasetfile

    f = mktemp()
    print "Inside make data file ", f
    open(f, "w+b").write(content)
    df = Dataset_File()
    df.dataset = dataset
    df.filename = filename
    df.url = 'file://'+f
    df.protocol = "staging"
    df.size = len(content)
    df.verify(allowEmptyChecksums=True)
    df.save()
    print "Df ---", df
Example #11
0
    def testRemoteFile(self):
            content = urandom(1024)
            with NamedTemporaryFile() as f:
                # Create new Datafile
                datafile = Dataset_File(dataset=self.dataset)
                datafile.filename = 'background_task_testfile'
                datafile.size = len(content)
                datafile.sha512sum = hashlib.sha512(content).hexdigest()
                datafile.save()
                url = 'file://' + path.abspath(f.name)
                base_url = 'file://' + path.dirname(path.abspath(f.name))
                location = self._get_or_create_local_location(
                    'test-staging-xxx', base_url, 'external', 10)
                replica = Replica(datafile=datafile, location=location, url=url)
                replica.save()

                def get_replica(replica):
                    try:
                        return Replica.objects.get(id=replica.id)
                    except Replica.DoesNotExist:
                        return None

                def get_new_replica(datafile):
                    location = Location.get_default_location()
                    return Replica.objects.get(datafile=datafile.id,
                                               location=location)

                # Check that it won't verify as it stands
                expect(get_replica(replica).verified).to_be(False)
                verify_files()
                expect(get_replica(replica).verified).to_be(False)
                expect(get_replica(replica).is_local()).to_be(False)

                # Fill in the content
                f.write(content)
                f.flush()

                # Check it now verifies
                verify_files()
                expect(get_replica(replica).id).to_be(
                    get_new_replica(datafile).id)
                expect(get_new_replica(datafile).verified).to_be(True)
                expect(get_new_replica(datafile).is_local()).to_be(True)
Example #12
0
    def testLocalFile(self):
        content = urandom(1024)
        cf = ContentFile(content, 'background_task_testfile')

        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = cf.name
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.url = write_uploaded_file_to_dataset(self.dataset, cf)
        datafile.save()

        def get_datafile(datafile):
            return Dataset_File.objects.get(id=datafile.id)

        # Check that it's not currently verified
        expect(get_datafile(datafile).verified).to_be(False)

        # Check it verifies
        verify_files()
        expect(get_datafile(datafile).verified).to_be(True)
Example #13
0
    def testLocalFile(self):
        content = urandom(1024)
        cf = ContentFile(content, 'background_task_testfile')

        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = cf.name
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.url = write_uploaded_file_to_dataset(self.dataset, cf)
        datafile.save()

        def get_datafile(datafile):
            return Dataset_File.objects.get(id=datafile.id)

        # Check that it's not currently verified
        expect(get_datafile(datafile).verified).to_be(False)

        # Check it verifies
        verify_files()
        expect(get_datafile(datafile).verified).to_be(True)
Example #14
0
def _create_datafile():
    user = User.objects.create_user("testuser", "*****@*****.**", "pwd")
    user.save()
    UserProfile(user=user).save()
    full_access = Experiment.PUBLIC_ACCESS_FULL
    experiment = Experiment.objects.create(title="IIIF Test", created_by=user, public_access=full_access)
    experiment.save()
    ExperimentACL(
        experiment=experiment,
        pluginId="django_user",
        entityId=str(user.id),
        isOwner=True,
        canRead=True,
        canWrite=True,
        canDelete=True,
        aclOwnershipType=ExperimentACL.OWNER_OWNED,
    ).save()
    dataset = Dataset()
    dataset.save()
    dataset.experiments.add(experiment)
    dataset.save()

    # Create new Datafile
    tempfile = TemporaryUploadedFile("iiif_stored_file", None, None, None)
    with Image(filename="magick:rose") as img:
        img.format = "tiff"
        img.save(file=tempfile.file)
        tempfile.file.flush()
    datafile = Dataset_File(dataset=dataset)
    datafile.size = os.path.getsize(tempfile.file.name)
    # os.remove(tempfilename)
    datafile.filename = "iiif_named_file"
    datafile.url = write_uploaded_file_to_dataset(dataset, tempfile)
    datafile.verify(allowEmptyChecksums=True)
    datafile.save()
    return datafile
Example #15
0
def _create_datafile():
    user = User.objects.create_user('testuser', '*****@*****.**', 'pwd')
    user.save()
    UserProfile(user=user).save()
    full_access = Experiment.PUBLIC_ACCESS_FULL
    experiment = Experiment.objects.create(title="IIIF Test",
                                           created_by=user,
                                           public_access=full_access)
    experiment.save()
    ExperimentACL(experiment=experiment,
                  pluginId='django_user',
                  entityId=str(user.id),
                  isOwner=True,
                  canRead=True,
                  canWrite=True,
                  canDelete=True,
                  aclOwnershipType=ExperimentACL.OWNER_OWNED).save()
    dataset = Dataset()
    dataset.save()
    dataset.experiments.add(experiment)
    dataset.save()

    # Create new Datafile
    tempfile = TemporaryUploadedFile('iiif_stored_file', None, None, None)
    with Image(filename='magick:rose') as img:
        img.format = 'tiff'
        img.save(file=tempfile.file)
        tempfile.file.flush()
    datafile = Dataset_File(dataset=dataset)
    datafile.size = os.path.getsize(tempfile.file.name)
    #os.remove(tempfilename)
    datafile.filename = 'iiif_named_file'
    datafile.url = write_uploaded_file_to_dataset(dataset, tempfile)
    datafile.verify(allowEmptyChecksums=True)
    datafile.save()
    return datafile
Example #16
0
 def process_enclosure(self, dataset, enclosure):
     filename = getattr(enclosure, 'title', basename(enclosure.href))
     datafile = Dataset_File(url=enclosure.href, \
                             filename=filename, \
                             dataset=dataset)
     datafile.protocol = enclosure.href.partition('://')[0]
     try:
         datafile.mimetype = enclosure.mime
     except AttributeError:
         pass
     try:
         datafile.size = enclosure.length
     except AttributeError:
         pass
     try:
         hash = enclosure.hash
         # Split on white space, then ':' to get tuples to feed into dict
         hashdict = dict([s.partition(':')[::2] for s in hash.split()])
         # Set SHA-512 sum
         datafile.sha512sum = hashdict['sha-512']
     except AttributeError:
         pass
     datafile.save()
     self.make_local_copy(datafile)
Example #17
0
    def process_enclosure(self, dataset, enclosure):
        '''
        Examines one "enclosure" from an entry, representing a datafile.
        Determines whether to process it, and if so, starts the transfer.
        '''
        # TODO tjdett: This method needs a clean-up, as it's doing many more things than was originally intended. It now contains more more code about 
        # deciding whether to process the enclosure than it does about actually processing it. That decision, or the influencing factors, should be refactored into separate methods.
        # Python has built-in time deltas and Django has time formatting functions, both of which would clean this code up considerably.
        
        def _get_enclosure_url(enclosure):
            ''' Optionally manipulate datafile URL, eg: http://foo.edu/bar.txt -> file:////fooserver/bar.txt'''
            if IngestOptions.USE_LOCAL_TRANSFERS:
                return enclosure.href.replace(IngestOptions.URL_BASE_TO_REPLACE, IngestOptions.LOCAL_SOURCE_PATH)
            else:
                return enclosure.href

        filename = getattr(enclosure, 'title', basename(enclosure.href))
        # check if we were provided a full path, and hence a subdirectory for the file 
        if (IngestOptions.DATAFILE_DIRECTORY_DEPTH >= 1 and
                    getattr(enclosure, "path", "") != "" and
                    enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:] != ""):
            filename = "/".join(enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:])
                
        datafiles = dataset.dataset_file_set.filter(filename=filename)
        def fromunix1000 (tstr):
            return datetime.datetime.utcfromtimestamp(float(tstr)/1000)
        if datafiles.count() > 0:
            datafile = datafiles[0]
            from django.db.models import Max                     
            newest=datafiles.aggregate(Max('modification_time'))['modification_time__max']
            if not newest:# datafile.modification_time:
                ### rethink this!
                return # We have this file, it has no time/date, let's skip it.
            
            def total_seconds(td): # exists on datetime.timedelta in Python 2.7
                return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
            timediff = total_seconds(fromunix1000(enclosure.modified) - newest)

            if timediff == 0:
                return # We have this file already, same time/date.
            elif timediff < 0:
                logging.getLogger(__name__).warn("Skipping datafile. File to ingest '{0}' is {1} *older* than stored file. Are the system clocks correct?".
                                                format(enclosure.href, self.human_time(-timediff)))
                return
            else:
                if not IngestOptions.ALLOW_UPDATING_DATAFILES:
                    logging.getLogger(__name__).warn("Skipping datafile. ALLOW_UPDATING_DATAFILES is disabled, and '{0}' is {1}newer than stored file.".
                                                format(enclosure.href, self.human_time(timediff)))
                    return
                logging.getLogger(__name__).info("Ingesting updated datafile. File to ingest '{0}' is {1} newer than stored file. This will create an additional copy.".
                                                 format(enclosure.href, self.human_time(timediff)))
                if IngestOptions.HIDE_REPLACED_DATAFILES:
                    # Mark all older versions of file as hidden. (!)
                    try:
                        from tardis.microtardis.models import Dataset_Hidden
                        Dataset_Hidden.objects.filter(datafile__dataset=dataset).update(hidden=True)
                    except ImportError:
                        logger.warn("The MicroTardis app must be installed in order to use the HIDE_REPLACED_DATAFILES option. Existing version of datafile {0} " +
                                    "will not be hidden.".format(datafile.filename))
                  
        else: # no local copy already.
            logging.getLogger(__name__).info("Ingesting datafile: '{0}'".format(enclosure.href))


        # Create a record and start transferring.
        datafile = Dataset_File(dataset=dataset,
                                url=_get_enclosure_url(enclosure), 
                                filename=filename,
                                created_time=fromunix1000(enclosure.created),
                                modification_time=fromunix1000(enclosure.modified))
        datafile.protocol = enclosure.href.partition('://')[0]
        
        datafile.mimetype = getattr(enclosure, "mime", datafile.mimetype)
        datafile.size = getattr(enclosure, "length", datafile.size)

        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()