Example #1
    def testRemoteFile(self):
        content = urandom(1024)
        with NamedTemporaryFile() as f:
            # Create new Datafile
            datafile = Dataset_File(dataset=self.dataset)
            datafile.filename = 'background_task_testfile'
            datafile.size = len(content)
            datafile.sha512sum = hashlib.sha512(content).hexdigest()
            datafile.url = 'file://' + path.abspath(f.name)
            datafile.save()

            def get_datafile(datafile):
                return Dataset_File.objects.get(id=datafile.id)

            # Check that it won't verify as it stands
            expect(get_datafile(datafile).verified).to_be(False)
            verify_files()
            expect(get_datafile(datafile).verified).to_be(False)
            expect(get_datafile(datafile).is_local()).to_be(False)

            # Fill in the content
            f.write(content)
            f.flush()

            # Check it now verifies
            verify_files()
            expect(get_datafile(datafile).verified).to_be(True)
            expect(get_datafile(datafile).is_local()).to_be(True)
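
The test above hinges on NamedTemporaryFile starting out empty and on flush() making the written bytes visible to other readers of the path. A minimal standalone sketch of that behaviour (POSIX only, since reopening a NamedTemporaryFile by name fails on Windows):

    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile() as f:
        print(open(f.name, 'rb').read())  # b'' -- the file exists but is empty
        f.write(b'payload')
        f.flush()                         # push buffered bytes to disk
        print(open(f.name, 'rb').read())  # b'payload' -- visible to other readers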
Example #2
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(filename=filename, dataset=dataset)
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except (AttributeError, KeyError):
            # No hash attribute, or no sha-512 entry in it
            pass
        datafile.save()
        url = enclosure.href
        # This means we will allow the atom feed to feed us any enclosure
        # URL that matches a registered location.  Maybe we should restrict
        # this to a specific location.
        location = Location.get_location_for_url(url)
        if not location:
            logger.error('Rejected ingestion for unknown location %s', url)
            return

        replica = Replica(datafile=datafile, url=url, location=location)
        replica.protocol = enclosure.href.partition('://')[0]
        replica.save()
        self.make_local_copy(replica)
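
The partition(':')[::2] idiom above is terse; a standalone sketch with a hypothetical hash string shows what it produces:

    # Hypothetical enclosure.hash value: whitespace-separated algo:digest
    # pairs (digests shortened for readability).
    hash_field = 'md5:9e107d9d sha-512:cf83e135'
    # partition(':') yields (algo, ':', digest); [::2] keeps algo and digest.
    hashdict = dict(s.partition(':')[::2] for s in hash_field.split())
    print(hashdict['sha-512'])  # -> 'cf83e135'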
Example #3
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(filename=filename, dataset=dataset)
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except (AttributeError, KeyError):
            # No hash attribute, or no sha-512 entry in it
            pass
        datafile.save()
        url = enclosure.href
        # This means we will allow the atom feed to feed us any enclosure
        # URL that matches a registered location.  Maybe we should restrict
        # this to a specific location.
        location = Location.get_location_for_url(url)
        if not location:
            logger.error('Rejected ingestion for unknown location %s', url)
            return

        replica = Replica(datafile=datafile, url=url,
                          location=location)
        replica.protocol = enclosure.href.partition('://')[0]
        replica.save()
        self.make_local_copy(replica)
Example #4
    def testRemoteFile(self):
        content = urandom(1024)
        with NamedTemporaryFile() as f:
            # Create new Datafile
            datafile = Dataset_File(dataset=self.dataset)
            datafile.filename = 'background_task_testfile'
            datafile.size = len(content)
            datafile.sha512sum = hashlib.sha512(content).hexdigest()
            datafile.url = 'file://' + path.abspath(f.name)
            datafile.save()

            def get_datafile(datafile):
                return Dataset_File.objects.get(id=datafile.id)

            # Check that it won't verify as it stands
            expect(get_datafile(datafile).verified).to_be(False)
            verify_files()
            expect(get_datafile(datafile).verified).to_be(False)
            expect(get_datafile(datafile).is_local()).to_be(False)

            # Fill in the content
            f.write(content)
            f.flush()

            # Check it now verifies
            verify_files()
            expect(get_datafile(datafile).verified).to_be(True)
            expect(get_datafile(datafile).is_local()).to_be(True)
Example #5
    def testLocalFile(self):
        content = urandom(1024)
        cf = ContentFile(content, 'background_task_testfile')

        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = cf.name
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.save()
        replica = Replica(datafile=datafile,
                          url=write_uploaded_file_to_dataset(self.dataset, cf),
                          location=Location.get_default_location())
        replica.save()

        def get_replica(datafile):
            return Replica.objects.get(datafile=datafile)

        # undo auto-verify:
        replica.verified = False
        replica.save(update_fields=['verified'])

        # Check that it's not currently verified
        expect(get_replica(datafile).verified).to_be(False)

        # Check it verifies
        verify_files()
        expect(get_replica(datafile).verified).to_be(True)
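
For context, ContentFile (used above) wraps an in-memory byte string in Django's file interface, so nothing touches the filesystem until write_uploaded_file_to_dataset stores it. A minimal sketch:

    from django.core.files.base import ContentFile

    cf = ContentFile(b'hello world', 'example.txt')
    print(cf.name)    # -> example.txt
    print(cf.size)    # -> 11
    print(cf.read())  # -> b'hello world'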
Example #6
    def testRemoteFile(self):
        content = urandom(1024)
        with NamedTemporaryFile() as f:
            # Create new Datafile
            datafile = Dataset_File(dataset=self.dataset)
            datafile.filename = 'background_task_testfile'
            datafile.size = len(content)
            datafile.sha512sum = hashlib.sha512(content).hexdigest()
            datafile.save()
            url = 'file://' + path.abspath(f.name)
            base_url = 'file://' + path.dirname(path.abspath(f.name))
            location = self._get_or_create_local_location(
                'test-staging-xxx', base_url, 'external', 10)
            replica = Replica(datafile=datafile, location=location, url=url)
            replica.save()

            def get_replica(replica):
                try:
                    return Replica.objects.get(id=replica.id)
                except Replica.DoesNotExist:
                    return None

            def get_new_replica(datafile):
                location = Location.get_default_location()
                return Replica.objects.get(datafile=datafile.id,
                                           location=location)

            # Check that it won't verify as it stands
            expect(get_replica(replica).verified).to_be(False)
            verify_files()
            expect(get_replica(replica).verified).to_be(False)
            expect(get_replica(replica).is_local()).to_be(False)

            # Fill in the content
            f.write(content)
            f.flush()

            # Check it now verifies
            verify_files()
            expect(get_replica(replica).id).to_be(
                get_new_replica(datafile).id)
            expect(get_new_replica(datafile).verified).to_be(True)
            expect(get_new_replica(datafile).is_local()).to_be(True)
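
The verified flag in these tests ultimately depends on the stored sha512sum matching the file's actual content. A standalone sketch of the checksum computation the tests seed:

    import hashlib
    from os import urandom

    content = urandom(1024)
    checksum = hashlib.sha512(content).hexdigest()
    print(len(checksum))  # -> 128 hex characters
    # Recomputing over the same bytes always matches...
    assert hashlib.sha512(content).hexdigest() == checksum
    # ...but the empty file that exists before f.write()/f.flush() does not.
    assert hashlib.sha512(b'').hexdigest() != checksum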
Example #7
    def testLocalFile(self):
        content = urandom(1024)
        cf = ContentFile(content, 'background_task_testfile')

        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = cf.name
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.url = write_uploaded_file_to_dataset(self.dataset, cf)
        datafile.save()

        def get_datafile(datafile):
            return Dataset_File.objects.get(id=datafile.id)

        # Check that it's not currently verified
        expect(get_datafile(datafile).verified).to_be(False)

        # Check it verifies
        verify_files()
        expect(get_datafile(datafile).verified).to_be(True)
Example #8
    def testLocalFile(self):
        content = urandom(1024)
        cf = ContentFile(content, 'background_task_testfile')

        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = cf.name
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.url = write_uploaded_file_to_dataset(self.dataset, cf)
        datafile.save()

        def get_datafile(datafile):
            return Dataset_File.objects.get(id=datafile.id)

        # Check that it's not currently verified
        expect(get_datafile(datafile).verified).to_be(False)

        # Check it verifies
        verify_files()
        expect(get_datafile(datafile).verified).to_be(True)
Example #9
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(url=enclosure.href,
                                filename=filename,
                                dataset=dataset)
        datafile.protocol = enclosure.href.partition('://')[0]
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except (AttributeError, KeyError):
            # No hash attribute, or no sha-512 entry in it
            pass
        datafile.save()
        self.make_local_copy(datafile)
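
The protocol field above is derived by splitting the URL on '://'; a quick sketch with a hypothetical URL:

    url = 'http://example.edu/files/bar.txt'  # hypothetical enclosure.href
    print(url.partition('://')[0])  # -> 'http'
    # partition() never raises: with no '://' the whole string comes back.
    print('plainstring'.partition('://')[0])  # -> 'plainstring'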
Example #10
    def process_enclosure(self, dataset, enclosure):
        '''
        Examines one "enclosure" from an entry, representing a datafile.
        Determines whether to process it, and if so, starts the transfer.
        '''
        # TODO tjdett: This method needs a clean-up, as it's doing many more
        # things than was originally intended. It now contains more code about
        # deciding whether to process the enclosure than about actually
        # processing it. That decision, or the influencing factors, should be
        # refactored into separate methods. Python has built-in time deltas
        # and Django has time-formatting functions, both of which would clean
        # this code up considerably.
        
        def _get_enclosure_url(enclosure):
            '''Optionally manipulate the datafile URL, e.g.
            http://foo.edu/bar.txt -> file:////fooserver/bar.txt'''
            if IngestOptions.USE_LOCAL_TRANSFERS:
                return enclosure.href.replace(IngestOptions.URL_BASE_TO_REPLACE,
                                              IngestOptions.LOCAL_SOURCE_PATH)
            else:
                return enclosure.href

        filename = getattr(enclosure, 'title', basename(enclosure.href))
        # Check if we were provided a full path, and hence a subdirectory
        # for the file. The remainder of split() is a list, so compare it
        # against the empty list, not the empty string.
        if (IngestOptions.DATAFILE_DIRECTORY_DEPTH >= 1 and
                getattr(enclosure, "path", "") != "" and
                enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:] != []):
            filename = "/".join(
                enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:])
                
        datafiles = dataset.dataset_file_set.filter(filename=filename)

        def fromunix1000(tstr):
            return datetime.datetime.utcfromtimestamp(float(tstr) / 1000)
        if datafiles.count() > 0:
            datafile = datafiles[0]
            from django.db.models import Max
            newest = datafiles.aggregate(
                Max('modification_time'))['modification_time__max']
            if not newest:
                # We already have this file and it carries no time/date to
                # compare against, so skip it.
                return
            
            def total_seconds(td):
                # Equivalent to datetime.timedelta.total_seconds() in Python 2.7+
                return (td.microseconds +
                        (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6

            timediff = total_seconds(fromunix1000(enclosure.modified) - newest)

            if timediff == 0:
                return  # We have this file already, same time/date.
            elif timediff < 0:
                logging.getLogger(__name__).warn(
                    "Skipping datafile. File to ingest '{0}' is {1} *older* "
                    "than stored file. Are the system clocks correct?".format(
                        enclosure.href, self.human_time(-timediff)))
                return
            else:
                if not IngestOptions.ALLOW_UPDATING_DATAFILES:
                    logging.getLogger(__name__).warn(
                        "Skipping datafile. ALLOW_UPDATING_DATAFILES is "
                        "disabled, and '{0}' is {1} newer than stored "
                        "file.".format(enclosure.href,
                                       self.human_time(timediff)))
                    return
                logging.getLogger(__name__).info(
                    "Ingesting updated datafile. File to ingest '{0}' is {1} "
                    "newer than stored file. This will create an additional "
                    "copy.".format(enclosure.href, self.human_time(timediff)))
                if IngestOptions.HIDE_REPLACED_DATAFILES:
                    # Mark all older versions of file as hidden. (!)
                    try:
                        from tardis.microtardis.models import Dataset_Hidden
                        Dataset_Hidden.objects.filter(
                            datafile__dataset=dataset).update(hidden=True)
                    except ImportError:
                        logger.warn(
                            "The MicroTardis app must be installed in order "
                            "to use the HIDE_REPLACED_DATAFILES option. "
                            "Existing version of datafile {0} will not be "
                            "hidden.".format(datafile.filename))

        else:  # no local copy already
            logging.getLogger(__name__).info(
                "Ingesting datafile: '{0}'".format(enclosure.href))

        # Create a record and start transferring.
        datafile = Dataset_File(dataset=dataset,
                                url=_get_enclosure_url(enclosure), 
                                filename=filename,
                                created_time=fromunix1000(enclosure.created),
                                modification_time=fromunix1000(enclosure.modified))
        datafile.protocol = enclosure.href.partition('://')[0]
        
        datafile.mimetype = getattr(enclosure, "mime", datafile.mimetype)
        datafile.size = getattr(enclosure, "length", datafile.size)

        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except (AttributeError, KeyError):
            # No hash attribute, or no sha-512 entry in it
            pass
        datafile.save()
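
The feed timestamps handled above are milliseconds since the Unix epoch, and total_seconds is a backport for pre-2.7 Pythons. A standalone sketch of both helpers (the example timestamp is hypothetical):

    import datetime

    def fromunix1000(tstr):
        # Feed timestamps are milliseconds since the Unix epoch
        return datetime.datetime.utcfromtimestamp(float(tstr) / 1000)

    def total_seconds(td):
        # Equivalent to datetime.timedelta.total_seconds() on Python 2.7+
        return (td.microseconds +
                (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6

    newer = fromunix1000('1367900664000')  # -> 2013-05-07 04:24:24
    older = fromunix1000('1367900663000')  # one second earlier
    print(total_seconds(newer - older))    # -> 1 (seconds)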