Example #1
0
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(filename=filename, dataset=dataset)
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()
        url = enclosure.href
        # This means we will allow the atom feed to feed us any enclosure
        # URL that matches a registered location.  Maybe we should restrict
        # this to a specific location.
        location = Location.get_location_for_url(url)
        if not location:
            logger.error('Rejected ingestion for unknown location %s' % url)
            return

        replica = Replica(datafile=datafile, url=url, location=location)
        replica.protocol = enclosure.href.partition('://')[0]
        replica.save()
        self.make_local_copy(replica)
Example #2
0
def add_staged_file_to_dataset(rel_filepath, dataset_id, username,
                               mimetype="application/octet-stream"):
    """
    add file in user's staging path to a dataset
    may be replaced by main code functions.
    quick and dirty hack to get it working
    """
    originfilepath = os.path.join(get_full_staging_path(username), rel_filepath)
    dataset = Dataset.objects.get(pk=dataset_id)
    newDatafile = Dataset_File()
    newDatafile.dataset = dataset
    newDatafile.size = os.path.getsize(originfilepath)
    newDatafile.protocol = "tardis"
    newDatafile.mimetype = mimetype
    file_dir = "/" + str(dataset.experiment.id) + "/" + str(dataset.id) + "/"
    file_path = file_dir + rel_filepath
    prelim_full_file_path = settings.FILE_STORE_PATH + file_path
    full_file_path = duplicate_file_check_rename(prelim_full_file_path)
    newDatafile.filename = os.path.basename(full_file_path)
    newDatafile.url = "%s://%s" % (newDatafile.protocol,
                                   full_file_path[
            len(settings.FILE_STORE_PATH) + len(file_dir):])
    if not os.path.exists(os.path.dirname(full_file_path)):
        os.makedirs(os.path.dirname(full_file_path))
    shutil.move(originfilepath, full_file_path)
    newDatafile.save()
Example #3
0
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(filename=filename, dataset=dataset)
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()
        url = enclosure.href
        # This means we will allow the atom feed to feed us any enclosure
        # URL that matches a registered location.  Maybe we should restrict
        # this to a specific location.
        location = Location.get_location_for_url(url)
        if not location:
            logger.error('Rejected ingestion for unknown location %s' % url)
            return

        replica = Replica(datafile=datafile, url=url,
                          location=location)
        replica.protocol = enclosure.href.partition('://')[0]
        replica.save()
        self.make_local_copy(replica)
Example #4
0
def generate_datafile(path, dataset, content=None, size=-1, 
                      verify=True, verified=True):
    '''Generates a datafile AND a replica to hold its contents'''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location

    saved = settings.REQUIRE_DATAFILE_CHECKSUMS 
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path == None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id, datafile.id)

        filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
        if content:
            try:
                os.makedirs(os.path.dirname(filepath))
                os.remove(filepath)
            except:
                pass
            file = open(filepath, 'wb+')
            file.write(content)
            file.close()
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()

        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify(allowEmptyChecksums=True):
                raise RuntimeError('verify failed!?!')
        else:
            replica.verified = verified
        replica.save()
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
Example #5
0
def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True, verify_checksums_req=False):
    '''Generates a datafile AND a replica to hold its contents'''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location

    saved = settings.REQUIRE_DATAFILE_CHECKSUMS
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path == None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id, datafile.id)

        filepath = os.path.normpath(settings.FILE_STORE_PATH + '/' + path)
        if content:
            try:
                os.makedirs(os.path.dirname(filepath))
                os.remove(filepath)
            except:
                pass
            gen_file = open(filepath, 'wb+')
            gen_file.write(content)
            gen_file.close()
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()
        settings.REQUIRE_DATAFILE_CHECKSUMS = verify_checksums_req
        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify():
                raise RuntimeError('verify failed!?!')
        replica.save()
        replica.verified = verified
        replica.save(update_fields=['verified'])  # force no verification
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
Example #6
0
def generate_datafile(path, dataset, content=None, size=-1, 
                      verify=True, verified=True):
    from tardis.tardis_portal.models import Dataset_File
    datafile = Dataset_File()
    # Normally we use any old string for the datafile path, but some
    # tests require the path to be the same as what 'staging' would use
    if path == None:
        datafile.dataset_id = dataset.id
        datafile.save()
        path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                             dataset.id, datafile.id)

    filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
    if content:
        try:
            os.makedirs(os.path.dirname(filepath))
            os.remove(filepath)
        except:
            pass
        file = open(filepath, 'wb+')
        file.write(content)
        file.close()
    datafile.url = path
    datafile.mimetype = "application/unspecified"
    datafile.filename = os.path.basename(filepath)
    datafile.dataset_id = dataset.id
    if content:
        datafile.size = str(len(content))
    else:
        datafile.size = str(size)
    if verify and content:
        if not datafile.verify(allowEmptyChecksums=True):
            raise RuntimeError('verify failed!?!')
    else:
        datafile.verified = verified
    datafile.save()
    return datafile
Example #7
0
 def process_enclosure(self, dataset, enclosure):
     filename = getattr(enclosure, 'title', basename(enclosure.href))
     datafile = Dataset_File(url=enclosure.href, \
                             filename=filename, \
                             dataset=dataset)
     datafile.protocol = enclosure.href.partition('://')[0]
     try:
         datafile.mimetype = enclosure.mime
     except AttributeError:
         pass
     try:
         datafile.size = enclosure.length
     except AttributeError:
         pass
     try:
         hash = enclosure.hash
         # Split on white space, then ':' to get tuples to feed into dict
         hashdict = dict([s.partition(':')[::2] for s in hash.split()])
         # Set SHA-512 sum
         datafile.sha512sum = hashdict['sha-512']
     except AttributeError:
         pass
     datafile.save()
     self.make_local_copy(datafile)
Example #8
0
    def process_enclosure(self, dataset, enclosure):
        '''
        Examines one "enclosure" from an entry, representing a datafile.
        Determines whether to process it, and if so, starts the transfer.
        '''
        # TODO tjdett: This method needs a clean-up, as it's doing many more things than was originally intended. It now contains more more code about 
        # deciding whether to process the enclosure than it does about actually processing it. That decision, or the influencing factors, should be refactored into separate methods.
        # Python has built-in time deltas and Django has time formatting functions, both of which would clean this code up considerably.
        
        def _get_enclosure_url(enclosure):
            ''' Optionally manipulate datafile URL, eg: http://foo.edu/bar.txt -> file:////fooserver/bar.txt'''
            if IngestOptions.USE_LOCAL_TRANSFERS:
                return enclosure.href.replace(IngestOptions.URL_BASE_TO_REPLACE, IngestOptions.LOCAL_SOURCE_PATH)
            else:
                return enclosure.href

        filename = getattr(enclosure, 'title', basename(enclosure.href))
        # check if we were provided a full path, and hence a subdirectory for the file 
        if (IngestOptions.DATAFILE_DIRECTORY_DEPTH >= 1 and
                    getattr(enclosure, "path", "") != "" and
                    enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:] != ""):
            filename = "/".join(enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:])
                
        datafiles = dataset.dataset_file_set.filter(filename=filename)
        def fromunix1000 (tstr):
            return datetime.datetime.utcfromtimestamp(float(tstr)/1000)
        if datafiles.count() > 0:
            datafile = datafiles[0]
            from django.db.models import Max                     
            newest=datafiles.aggregate(Max('modification_time'))['modification_time__max']
            if not newest:# datafile.modification_time:
                ### rethink this!
                return # We have this file, it has no time/date, let's skip it.
            
            def total_seconds(td): # exists on datetime.timedelta in Python 2.7
                return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
            timediff = total_seconds(fromunix1000(enclosure.modified) - newest)

            if timediff == 0:
                return # We have this file already, same time/date.
            elif timediff < 0:
                logging.getLogger(__name__).warn("Skipping datafile. File to ingest '{0}' is {1} *older* than stored file. Are the system clocks correct?".
                                                format(enclosure.href, self.human_time(-timediff)))
                return
            else:
                if not IngestOptions.ALLOW_UPDATING_DATAFILES:
                    logging.getLogger(__name__).warn("Skipping datafile. ALLOW_UPDATING_DATAFILES is disabled, and '{0}' is {1}newer than stored file.".
                                                format(enclosure.href, self.human_time(timediff)))
                    return
                logging.getLogger(__name__).info("Ingesting updated datafile. File to ingest '{0}' is {1} newer than stored file. This will create an additional copy.".
                                                 format(enclosure.href, self.human_time(timediff)))
                if IngestOptions.HIDE_REPLACED_DATAFILES:
                    # Mark all older versions of file as hidden. (!)
                    try:
                        from tardis.microtardis.models import Dataset_Hidden
                        Dataset_Hidden.objects.filter(datafile__dataset=dataset).update(hidden=True)
                    except ImportError:
                        logger.warn("The MicroTardis app must be installed in order to use the HIDE_REPLACED_DATAFILES option. Existing version of datafile {0} " +
                                    "will not be hidden.".format(datafile.filename))
                  
        else: # no local copy already.
            logging.getLogger(__name__).info("Ingesting datafile: '{0}'".format(enclosure.href))


        # Create a record and start transferring.
        datafile = Dataset_File(dataset=dataset,
                                url=_get_enclosure_url(enclosure), 
                                filename=filename,
                                created_time=fromunix1000(enclosure.created),
                                modification_time=fromunix1000(enclosure.modified))
        datafile.protocol = enclosure.href.partition('://')[0]
        
        datafile.mimetype = getattr(enclosure, "mime", datafile.mimetype)
        datafile.size = getattr(enclosure, "length", datafile.size)

        try:
            hash = enclosure.hash
            # Split on white space, then ':' to get tuples to feed into dict
            hashdict = dict([s.partition(':')[::2] for s in hash.split()])
            # Set SHA-512 sum
            datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()
Example #9
0
    def testDatasetFile(self):

        # check registered text file for physical file meta information
        df = Dataset_File.objects.get(pk=self.dataset_file1.id)

        try:
            from magic import Magic
            self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(df.size, str(13))
        self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

        # now check a JPG file
        filename = join(abspath(dirname(__file__)),
                        '../static/images/ands-logo-hi-res.jpg')

        dataset = Dataset.objects.get(pk=self.dataset1.id)

        pdf1 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            url='file://%s' % filename,
                            protocol='file')
        pdf1.save()
        try:
            from magic import Magic
            self.assertEqual(pdf1.mimetype, 'image/jpeg')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf1.size, str(14232))
        self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

        # now check that we can override the physical file meta information
        pdf2 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            url='file://%s' % filename,
                            protocol='file',
                            mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            size=str(0),
                            md5sum='md5sum')
        pdf2.save()
        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/vnd.openxmlformats-officedocument.presentationml.presentation')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf2.size, str(0))
        self.assertEqual(pdf2.md5sum, 'md5sum')

        pdf2.mimetype = ''
        pdf2.save()

        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/pdf')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
Example #10
0
    def testDatasetFile(self):

        # check registered text file for physical file meta information
        df = Dataset_File.objects.get(pk=self.dataset_file1.id)

        try:
            from magic import Magic
            self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(df.size, str(13))
        self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

        # now check a pdf file
        filename = join(abspath(dirname(__file__)),
                        '../static/downloads/DatasetDepositionGuide.pdf')

        dataset = Dataset.objects.get(pk=self.dataset1.id)

        pdf1 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            url='file://%s' % filename,
                            protocol='file')
        pdf1.save()
        try:
            from magic import Magic
            self.assertEqual(pdf1.mimetype, 'application/pdf')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf1.size, str(1008475))
        self.assertEqual(pdf1.md5sum, '9192b3d3e0056412b1d21d3e33562eba')

        # now check that we can override the physical file meta information
        pdf2 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            url='file://%s' % filename,
                            protocol='file',
                            mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            size=str(0),
                            md5sum='md5sum')
        pdf2.save()
        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/vnd.openxmlformats-officedocument.presentationml.presentation')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf2.size, str(0))
        self.assertEqual(pdf2.md5sum, 'md5sum')

        pdf2.mimetype = ''
        pdf2.save()

        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/pdf')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
Example #11
0
    def testDatasetFile(self):

        # check registered text file for physical file meta information
        df = Dataset_File.objects.get(pk=self.dataset_file1.id)

        try:
            from magic import Magic
            self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(df.size, str(13))
        self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

        # now check a JPG file
        filename = join(abspath(dirname(__file__)),
                        '../static/images/ands-logo-hi-res.jpg')

        dataset = Dataset.objects.get(pk=self.dataset1.id)

        pdf1 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            url='file://%s' % filename,
                            protocol='file')
        pdf1.save()
        try:
            from magic import Magic
            self.assertEqual(pdf1.mimetype, 'image/jpeg')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf1.size, str(14232))
        self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

        # now check that we can override the physical file meta information
        pdf2 = Dataset_File(
            dataset=dataset,
            filename=basename(filename),
            url='file://%s' % filename,
            protocol='file',
            mimetype=
            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            size=str(0),
            md5sum='md5sum')
        pdf2.save()
        try:
            from magic import Magic
            self.assertEqual(
                pdf2.mimetype,
                'application/vnd.openxmlformats-officedocument.presentationml.presentation'
            )
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf2.size, str(0))
        self.assertEqual(pdf2.md5sum, 'md5sum')

        pdf2.mimetype = ''
        pdf2.save()

        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/pdf')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
Example #12
0
    def testDatasetFile(self):

        # check registered text file for physical file meta information
        df = Dataset_File.objects.get(pk=self.dataset_file1.id)

        try:
            from magic import Magic
            self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
        except:
            # XXX Test disabled because lib magic can't be loaded
            pass
        self.assertEqual(df.size, str(13))
        self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

        # now check a JPG file
        filename = abspath(join(dirname(__file__),
                                '../static/images/ands-logo-hi-res.jpg'))

        dataset = Dataset.objects.get(pk=self.dataset1.id)

        size, sha512sum = get_size_and_sha512sum(filename)
        pdf1 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            size=str(size),
                            sha512sum=sha512sum,
                            url='file://%s' % filename,
                            protocol='file')
        pdf1.verify()
        pdf1.save()
        try:
            from magic import Magic
            self.assertEqual(pdf1.mimetype, 'image/jpeg')
        except:
            # XXX Test disabled because lib magic can't be loaded
            pass
        self.assertEqual(pdf1.size, str(14232))
        self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

        # now check that we can override the physical file meta information
        pdf2 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            url='file://%s' % filename,
                            protocol='file',
                            mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            size=str(0),
                            # Empty string always has the same hash
                            sha512sum='cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
        pdf2.save()
        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/vnd.openxmlformats-officedocument.presentationml.presentation')
        except:
            # XXX Test disabled because lib magic can't be loaded
            pass
        self.assertEqual(pdf2.size, str(0))
        self.assertEqual(pdf2.md5sum, '')

        pdf2.mimetype = ''
        pdf2.save()

        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/pdf')
        except:
            # XXX Test disabled because lib magic can't be loaded
            pass
Example #13
0
    def testDatasetFile(self):

        # check registered text file for physical file meta information
        df = Dataset_File.objects.get(pk=self.dataset_file1.id)

        try:
            from magic import Magic
            self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(df.size, str(13))
        self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

        # now check a JPG file
        filename = abspath(join(dirname(__file__),
                                '../static/images/ands-logo-hi-res.jpg'))

        dataset = Dataset.objects.get(pk=self.dataset1.id)

        size, sha512sum = get_size_and_sha512sum(filename)
        pdf1 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            size=str(size),
                            sha512sum=sha512sum,
                            url='file://%s' % filename,
                            protocol='file')
        pdf1.verify()
        pdf1.save()
        try:
            from magic import Magic
            self.assertEqual(pdf1.mimetype, 'image/jpeg')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf1.size, str(14232))
        self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

        # now check that we can override the physical file meta information
        pdf2 = Dataset_File(dataset=dataset,
                            filename=basename(filename),
                            url='file://%s' % filename,
                            protocol='file',
                            mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                            size=str(0),
                            # Empty string always has the same hash
                            sha512sum='cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
        pdf2.save()
        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/vnd.openxmlformats-officedocument.presentationml.presentation')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass
        self.assertEqual(pdf2.size, str(0))
        self.assertEqual(pdf2.md5sum, '')

        pdf2.mimetype = ''
        pdf2.save()

        try:
            from magic import Magic
            self.assertEqual(pdf2.mimetype, 'application/pdf')
        except:
            # XXX Test disabled becuse lib magic can't be loaded
            pass