def process_enclosure(self, dataset, enclosure):
    '''
    Create a Dataset_File and a Replica for one feed enclosure, then
    trigger a local copy of the data.

    :param dataset: Dataset the new datafile belongs to.
    :param enclosure: Atom feed enclosure describing the datafile
        (href, and optionally title/mime/length/hash).
    '''
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    datafile = Dataset_File(filename=filename, dataset=dataset)
    try:
        datafile.mimetype = enclosure.mime
    except AttributeError:
        pass
    try:
        datafile.size = enclosure.length
    except AttributeError:
        pass
    try:
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2]
                         for s in enclosure.hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    # KeyError too: the enclosure may carry a hash with no sha-512 entry
    except (AttributeError, KeyError):
        pass
    datafile.save()
    url = enclosure.href
    # This means we will allow the atom feed to feed us any enclosure
    # URL that matches a registered location. Maybe we should restrict
    # this to a specific location.
    location = Location.get_location_for_url(url)
    if not location:
        logger.error('Rejected ingestion for unknown location %s' % url)
        return
    replica = Replica(datafile=datafile, url=url, location=location)
    replica.protocol = enclosure.href.partition('://')[0]
    replica.save()
    self.make_local_copy(replica)
def add_staged_file_to_dataset(rel_filepath, dataset_id, username,
                               mimetype="application/octet-stream"):
    """
    add file in user's staging path to a dataset
    may be replaced by main code functions.
    quick and dirty hack to get it working
    """
    staged_path = os.path.join(get_full_staging_path(username), rel_filepath)
    dataset = Dataset.objects.get(pk=dataset_id)

    datafile = Dataset_File()
    datafile.dataset = dataset
    datafile.size = os.path.getsize(staged_path)
    datafile.protocol = "tardis"
    datafile.mimetype = mimetype

    # Store-relative target path: /<experiment id>/<dataset id>/<file>
    file_dir = "/" + str(dataset.experiment.id) + "/" + str(dataset.id) + "/"
    file_path = file_dir + rel_filepath
    # Rename on collision so an existing file is never clobbered
    full_file_path = duplicate_file_check_rename(
        settings.FILE_STORE_PATH + file_path)

    datafile.filename = os.path.basename(full_file_path)
    # URL keeps only the portion after the store path and directory prefix
    datafile.url = "%s://%s" % (
        datafile.protocol,
        full_file_path[len(settings.FILE_STORE_PATH) + len(file_dir):])

    target_dir = os.path.dirname(full_file_path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    shutil.move(staged_path, full_file_path)
    datafile.save()
def process_enclosure(self, dataset, enclosure):
    '''
    Create a Dataset_File and a Replica for one feed enclosure, then
    trigger a local copy of the data.

    :param dataset: Dataset the new datafile belongs to.
    :param enclosure: Atom feed enclosure describing the datafile
        (href, and optionally title/mime/length/hash).
    '''
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    datafile = Dataset_File(filename=filename, dataset=dataset)
    try:
        datafile.mimetype = enclosure.mime
    except AttributeError:
        pass
    try:
        datafile.size = enclosure.length
    except AttributeError:
        pass
    try:
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2]
                         for s in enclosure.hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    # KeyError too: the enclosure may carry a hash with no sha-512 entry
    except (AttributeError, KeyError):
        pass
    datafile.save()
    url = enclosure.href
    # This means we will allow the atom feed to feed us any enclosure
    # URL that matches a registered location. Maybe we should restrict
    # this to a specific location.
    location = Location.get_location_for_url(url)
    if not location:
        logger.error('Rejected ingestion for unknown location %s' % url)
        return
    replica = Replica(datafile=datafile, url=url, location=location)
    replica.protocol = enclosure.href.partition('://')[0]
    replica.save()
    self.make_local_copy(replica)
def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True):
    '''Generates a datafile AND a replica to hold its contents.

    :param path: store-relative path for the file, or None to derive one
        from the experiment/dataset/datafile ids (as 'staging' would).
    :param dataset: Dataset the datafile belongs to.
    :param content: optional bytes to write to disk.
    :param size: recorded size when no content is given.
    :param verify: when True (and content given), verify the replica and
        raise RuntimeError on failure; otherwise mark it with `verified`.
    :returns: (datafile, replica) tuple.
    '''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location
    # Checksums are deliberately not required while building the fixture;
    # the original setting is restored on exit.
    saved = settings.REQUIRE_DATAFILE_CHECKSUMS
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path is None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id,
                                 datafile.id)
        filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
        if content:
            # Best-effort: each step is independent, so a failed makedirs
            # (dir already exists) no longer skips the stale-file removal.
            try:
                os.makedirs(os.path.dirname(filepath))
            except OSError:
                pass  # directory already exists
            try:
                os.remove(filepath)
            except OSError:
                pass  # no stale file to remove
            with open(filepath, 'wb+') as content_file:
                content_file.write(content)
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()
        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify(allowEmptyChecksums=True):
                raise RuntimeError('verify failed!?!')
        else:
            replica.verified = verified
        replica.save()
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True,
                      verify_checksums_req=False):
    '''Generates a datafile AND a replica to hold its contents.

    :param path: store-relative path for the file, or None to derive one
        from the experiment/dataset/datafile ids (as 'staging' would).
    :param dataset: Dataset the datafile belongs to.
    :param content: optional bytes to write to disk.
    :param size: recorded size when no content is given.
    :param verify: when True (and content given), verify the replica and
        raise RuntimeError on failure.
    :param verified: value forced onto the replica's `verified` flag.
    :param verify_checksums_req: REQUIRE_DATAFILE_CHECKSUMS value in force
        during verification.
    :returns: (datafile, replica) tuple.
    '''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location
    # Checksums are disabled while building the fixture; the original
    # setting is restored on exit.
    saved = settings.REQUIRE_DATAFILE_CHECKSUMS
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path is None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id,
                                 datafile.id)
        filepath = os.path.normpath(settings.FILE_STORE_PATH + '/' + path)
        if content:
            # Best-effort: each step is independent, so a failed makedirs
            # (dir already exists) no longer skips the stale-file removal.
            try:
                os.makedirs(os.path.dirname(filepath))
            except OSError:
                pass  # directory already exists
            try:
                os.remove(filepath)
            except OSError:
                pass  # no stale file to remove
            with open(filepath, 'wb+') as gen_file:
                gen_file.write(content)
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()
        settings.REQUIRE_DATAFILE_CHECKSUMS = verify_checksums_req
        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify():
                raise RuntimeError('verify failed!?!')
        replica.save()
        replica.verified = verified
        replica.save(update_fields=['verified'])  # force no verification
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True):
    '''Generates a datafile, optionally writing its contents to disk.

    :param path: store-relative path for the file, or None to derive one
        from the experiment/dataset/datafile ids (as 'staging' would).
    :param dataset: Dataset the datafile belongs to.
    :param content: optional bytes to write to disk.
    :param size: recorded size when no content is given.
    :param verify: when True (and content given), verify the datafile and
        raise RuntimeError on failure; otherwise mark it with `verified`.
    :returns: the saved datafile.
    '''
    from tardis.tardis_portal.models import Dataset_File
    datafile = Dataset_File()
    # Normally we use any old string for the datafile path, but some
    # tests require the path to be the same as what 'staging' would use
    if path is None:
        datafile.dataset_id = dataset.id
        datafile.save()
        path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                             dataset.id,
                             datafile.id)
    filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
    if content:
        # Best-effort: each step is independent, so a failed makedirs
        # (dir already exists) no longer skips the stale-file removal.
        try:
            os.makedirs(os.path.dirname(filepath))
        except OSError:
            pass  # directory already exists
        try:
            os.remove(filepath)
        except OSError:
            pass  # no stale file to remove
        with open(filepath, 'wb+') as content_file:
            content_file.write(content)
    datafile.url = path
    datafile.mimetype = "application/unspecified"
    datafile.filename = os.path.basename(filepath)
    datafile.dataset_id = dataset.id
    if content:
        datafile.size = str(len(content))
    else:
        datafile.size = str(size)
    if verify and content:
        if not datafile.verify(allowEmptyChecksums=True):
            raise RuntimeError('verify failed!?!')
    else:
        datafile.verified = verified
    datafile.save()
    return datafile
def process_enclosure(self, dataset, enclosure):
    '''
    Create a Dataset_File for one feed enclosure and start a local copy.

    :param dataset: Dataset the new datafile belongs to.
    :param enclosure: Atom feed enclosure describing the datafile
        (href, and optionally title/mime/length/hash).
    '''
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    datafile = Dataset_File(url=enclosure.href,
                            filename=filename,
                            dataset=dataset)
    datafile.protocol = enclosure.href.partition('://')[0]
    try:
        datafile.mimetype = enclosure.mime
    except AttributeError:
        pass
    try:
        datafile.size = enclosure.length
    except AttributeError:
        pass
    try:
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2]
                         for s in enclosure.hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    # KeyError too: the enclosure may carry a hash with no sha-512 entry
    except (AttributeError, KeyError):
        pass
    datafile.save()
    self.make_local_copy(datafile)
def process_enclosure(self, dataset, enclosure):
    '''
    Examines one "enclosure" from an entry, representing a datafile.
    Determines whether to process it, and if so, starts the transfer.
    '''
    # TODO tjdett: This method needs a clean-up, as it's doing many more
    # things than was originally intended. It now contains more code about
    # deciding whether to process the enclosure than it does about actually
    # processing it. That decision, or the influencing factors, should be
    # refactored into separate methods. Python has built-in time deltas and
    # Django has time formatting functions, both of which would clean this
    # code up considerably.

    def _get_enclosure_url(enclosure):
        ''' Optionally manipulate datafile URL, eg:
        http://foo.edu/bar.txt -> file:////fooserver/bar.txt '''
        if IngestOptions.USE_LOCAL_TRANSFERS:
            return enclosure.href.replace(IngestOptions.URL_BASE_TO_REPLACE,
                                          IngestOptions.LOCAL_SOURCE_PATH)
        else:
            return enclosure.href

    filename = getattr(enclosure, 'title', basename(enclosure.href))
    # check if we were provided a full path, and hence a subdirectory
    # for the file
    # NOTE(review): the third condition compares a list slice against ""
    # and is therefore always True when reached — probably meant != [].
    if (IngestOptions.DATAFILE_DIRECTORY_DEPTH >= 1 and
            getattr(enclosure, "path", "") != "" and
            enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:] != ""):
        filename = "/".join(
            enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:])

    datafiles = dataset.dataset_file_set.filter(filename=filename)

    def fromunix1000(tstr):
        # Feed timestamps are milliseconds since the Unix epoch.
        return datetime.datetime.utcfromtimestamp(float(tstr) / 1000)

    if datafiles.count() > 0:
        datafile = datafiles[0]
        from django.db.models import Max
        newest = datafiles.aggregate(
            Max('modification_time'))['modification_time__max']
        if not newest:  # datafile.modification_time: ### rethink this!
            return  # We have this file, it has no time/date, let's skip it.

        def total_seconds(td):  # exists on datetime.timedelta in Python 2.7
            return (td.microseconds +
                    (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6

        # Positive timediff: the incoming file is newer than the stored one.
        timediff = total_seconds(fromunix1000(enclosure.modified) - newest)
        if timediff == 0:
            return  # We have this file already, same time/date.
        elif timediff < 0:
            logging.getLogger(__name__).warn(
                "Skipping datafile. File to ingest '{0}' is {1} *older* than stored file. Are the system clocks correct?".
                format(enclosure.href, self.human_time(-timediff)))
            return
        else:
            if not IngestOptions.ALLOW_UPDATING_DATAFILES:
                logging.getLogger(__name__).warn(
                    "Skipping datafile. ALLOW_UPDATING_DATAFILES is disabled, and '{0}' is {1}newer than stored file.".
                    format(enclosure.href, self.human_time(timediff)))
                return
            # Newer and updates allowed: fall through to ingest a new copy.
            logging.getLogger(__name__).info(
                "Ingesting updated datafile. File to ingest '{0}' is {1} newer than stored file. This will create an additional copy.".
                format(enclosure.href, self.human_time(timediff)))
            if IngestOptions.HIDE_REPLACED_DATAFILES:
                # Mark all older versions of file as hidden. (!)
                try:
                    from tardis.microtardis.models import Dataset_Hidden
                    Dataset_Hidden.objects.filter(
                        datafile__dataset=dataset).update(hidden=True)
                except ImportError:
                    # NOTE(review): the "+" binds after .format(), so "{0}"
                    # is never filled in and is logged literally. Moving
                    # .format() outside the concatenation would fix this.
                    logger.warn("The MicroTardis app must be installed in order to use the HIDE_REPLACED_DATAFILES option. Existing version of datafile {0} " +
                                "will not be hidden.".format(datafile.filename))
    else:  # no local copy already.
        logging.getLogger(__name__).info(
            "Ingesting datafile: '{0}'".format(enclosure.href))

    # Create a record and start transferring.
    datafile = Dataset_File(dataset=dataset,
                            url=_get_enclosure_url(enclosure),
                            filename=filename,
                            created_time=fromunix1000(enclosure.created),
                            modification_time=fromunix1000(enclosure.modified))
    datafile.protocol = enclosure.href.partition('://')[0]
    datafile.mimetype = getattr(enclosure, "mime", datafile.mimetype)
    datafile.size = getattr(enclosure, "length", datafile.size)
    try:
        hash = enclosure.hash
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2] for s in hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    except AttributeError:
        pass
    datafile.save()
def testDatasetFile(self):
    """
    Physical-file metadata (mimetype, size, md5sum) is detected on save;
    explicitly supplied values override detection until cleared.
    """
    # check registered text file for physical file meta information
    df = Dataset_File.objects.get(pk=self.dataset_file1.id)
    try:
        from magic import Magic
        self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(df.size, str(13))
    self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

    # now check a JPG file
    filename = join(abspath(dirname(__file__)),
                    '../static/images/ands-logo-hi-res.jpg')
    dataset = Dataset.objects.get(pk=self.dataset1.id)
    pdf1 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        url='file://%s' % filename,
                        protocol='file')
    pdf1.save()
    try:
        from magic import Magic
        self.assertEqual(pdf1.mimetype, 'image/jpeg')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf1.size, str(14232))
    self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

    # now check that we can override the physical file meta information
    pdf2 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        url='file://%s' % filename,
                        protocol='file',
                        mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                        size=str(0),
                        md5sum='md5sum')
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype,
                         'application/vnd.openxmlformats-officedocument.presentationml.presentation')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf2.size, str(0))
    self.assertEqual(pdf2.md5sum, 'md5sum')

    # clearing the mimetype triggers re-detection on save
    pdf2.mimetype = ''
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype, 'application/pdf')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
def testDatasetFile(self):
    """
    Physical-file metadata (mimetype, size, md5sum) is detected on save;
    explicitly supplied values override detection until cleared.
    """
    # check registered text file for physical file meta information
    df = Dataset_File.objects.get(pk=self.dataset_file1.id)
    try:
        from magic import Magic
        self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(df.size, str(13))
    self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

    # now check a pdf file
    filename = join(abspath(dirname(__file__)),
                    '../static/downloads/DatasetDepositionGuide.pdf')
    dataset = Dataset.objects.get(pk=self.dataset1.id)
    pdf1 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        url='file://%s' % filename,
                        protocol='file')
    pdf1.save()
    try:
        from magic import Magic
        self.assertEqual(pdf1.mimetype, 'application/pdf')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf1.size, str(1008475))
    self.assertEqual(pdf1.md5sum, '9192b3d3e0056412b1d21d3e33562eba')

    # now check that we can override the physical file meta information
    pdf2 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        url='file://%s' % filename,
                        protocol='file',
                        mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                        size=str(0),
                        md5sum='md5sum')
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype,
                         'application/vnd.openxmlformats-officedocument.presentationml.presentation')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf2.size, str(0))
    self.assertEqual(pdf2.md5sum, 'md5sum')

    # clearing the mimetype triggers re-detection on save
    pdf2.mimetype = ''
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype, 'application/pdf')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
def testDatasetFile(self):
    """
    Physical-file metadata (mimetype, size, md5sum) is detected on save;
    explicitly supplied values override detection until cleared.
    """
    # check registered text file for physical file meta information
    df = Dataset_File.objects.get(pk=self.dataset_file1.id)
    try:
        from magic import Magic
        self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(df.size, str(13))
    self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

    # now check a JPG file
    filename = join(abspath(dirname(__file__)),
                    '../static/images/ands-logo-hi-res.jpg')
    dataset = Dataset.objects.get(pk=self.dataset1.id)
    pdf1 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        url='file://%s' % filename,
                        protocol='file')
    pdf1.save()
    try:
        from magic import Magic
        self.assertEqual(pdf1.mimetype, 'image/jpeg')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf1.size, str(14232))
    self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

    # now check that we can override the physical file meta information
    pdf2 = Dataset_File(
        dataset=dataset,
        filename=basename(filename),
        url='file://%s' % filename,
        protocol='file',
        mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
        size=str(0),
        md5sum='md5sum')
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(
            pdf2.mimetype,
            'application/vnd.openxmlformats-officedocument.presentationml.presentation'
        )
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf2.size, str(0))
    self.assertEqual(pdf2.md5sum, 'md5sum')

    # clearing the mimetype triggers re-detection on save
    pdf2.mimetype = ''
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype, 'application/pdf')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
def testDatasetFile(self):
    """
    Physical-file metadata (mimetype, size, md5sum) is detected on save;
    explicitly supplied values override detection until cleared.
    """
    # check registered text file for physical file meta information
    df = Dataset_File.objects.get(pk=self.dataset_file1.id)
    try:
        from magic import Magic
        self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(df.size, str(13))
    self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

    # now check a JPG file
    filename = abspath(join(dirname(__file__),
                            '../static/images/ands-logo-hi-res.jpg'))
    dataset = Dataset.objects.get(pk=self.dataset1.id)
    size, sha512sum = get_size_and_sha512sum(filename)
    pdf1 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        size=str(size),
                        sha512sum=sha512sum,
                        url='file://%s' % filename,
                        protocol='file')
    pdf1.verify()
    pdf1.save()
    try:
        from magic import Magic
        self.assertEqual(pdf1.mimetype, 'image/jpeg')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf1.size, str(14232))
    self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

    # now check that we can override the physical file meta information
    pdf2 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        url='file://%s' % filename,
                        protocol='file',
                        mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                        size=str(0),
                        # Empty string always has the same hash
                        sha512sum='cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype,
                         'application/vnd.openxmlformats-officedocument.presentationml.presentation')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf2.size, str(0))
    self.assertEqual(pdf2.md5sum, '')

    # clearing the mimetype triggers re-detection on save
    pdf2.mimetype = ''
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype, 'application/pdf')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
def testDatasetFile(self):
    """
    Physical-file metadata (mimetype, size, md5sum) is detected on save;
    explicitly supplied values override detection until cleared.
    """
    # check registered text file for physical file meta information
    df = Dataset_File.objects.get(pk=self.dataset_file1.id)
    try:
        from magic import Magic
        self.assertEqual(df.mimetype, 'text/plain; charset=us-ascii')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(df.size, str(13))
    self.assertEqual(df.md5sum, '8ddd8be4b179a529afa5f2ffae4b9858')

    # now check a JPG file
    filename = abspath(join(dirname(__file__),
                            '../static/images/ands-logo-hi-res.jpg'))
    dataset = Dataset.objects.get(pk=self.dataset1.id)
    size, sha512sum = get_size_and_sha512sum(filename)
    pdf1 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        size=str(size),
                        sha512sum=sha512sum,
                        url='file://%s' % filename,
                        protocol='file')
    pdf1.verify()
    pdf1.save()
    try:
        from magic import Magic
        self.assertEqual(pdf1.mimetype, 'image/jpeg')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf1.size, str(14232))
    self.assertEqual(pdf1.md5sum, 'c450d5126ffe3d14643815204daf1bfb')

    # now check that we can override the physical file meta information
    pdf2 = Dataset_File(dataset=dataset,
                        filename=basename(filename),
                        url='file://%s' % filename,
                        protocol='file',
                        mimetype='application/vnd.openxmlformats-officedocument.presentationml.presentation',
                        size=str(0),
                        # Empty string always has the same hash
                        sha512sum='cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype,
                         'application/vnd.openxmlformats-officedocument.presentationml.presentation')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass
    self.assertEqual(pdf2.size, str(0))
    self.assertEqual(pdf2.md5sum, '')

    # clearing the mimetype triggers re-detection on save
    pdf2.mimetype = ''
    pdf2.save()
    try:
        from magic import Magic
        self.assertEqual(pdf2.mimetype, 'application/pdf')
    except Exception:
        # XXX Test disabled because lib magic can't be loaded
        pass