Example #1
import os
import shutil

from django.conf import settings

# Project-specific imports (MyTardis); the exact module paths are assumed.
from tardis.tardis_portal.models import Dataset, Dataset_File
from tardis.tardis_portal.staging import (duplicate_file_check_rename,
                                          get_full_staging_path)

def add_staged_file_to_dataset(rel_filepath, dataset_id, username,
                               mimetype="application/octet-stream"):
    """
    add file in user's staging path to a dataset
    may be replaced by main code functions.
    quick and dirty hack to get it working
    """
    originfilepath = os.path.join(get_full_staging_path(username), rel_filepath)
    dataset = Dataset.objects.get(pk=dataset_id)
    newDatafile = Dataset_File()
    newDatafile.dataset = dataset
    newDatafile.size = os.path.getsize(originfilepath)
    newDatafile.protocol = "tardis"
    newDatafile.mimetype = mimetype
    file_dir = "/" + str(dataset.experiment.id) + "/" + str(dataset.id) + "/"
    file_path = file_dir + rel_filepath
    prelim_full_file_path = settings.FILE_STORE_PATH + file_path
    full_file_path = duplicate_file_check_rename(prelim_full_file_path)
    newDatafile.filename = os.path.basename(full_file_path)
    newDatafile.url = "%s://%s" % (newDatafile.protocol,
                                   full_file_path[
            len(settings.FILE_STORE_PATH) + len(file_dir):])
    if not os.path.exists(os.path.dirname(full_file_path)):
        os.makedirs(os.path.dirname(full_file_path))
    shutil.move(originfilepath, full_file_path)
    newDatafile.save()
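A minimal usage sketch, assuming a dataset with primary key 42 exists and that user "steve" has "frames/scan_001.tif" sitting in their staging area (all of these example values are hypothetical):

add_staged_file_to_dataset("frames/scan_001.tif", 42, "steve",
                           mimetype="image/tiff")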
Example #2
import os
from tempfile import mkstemp

# Project-specific import (MyTardis); the exact module path is assumed.
from tardis.tardis_portal.models import Dataset_File

def _make_data_file(dataset, filename, content):
    """Write content to a temporary file and register it as a Dataset_File."""

    # mkstemp avoids the race condition in the deprecated mktemp(), and the
    # context manager ensures the file handle is closed after writing.
    fd, f = mkstemp()
    print "Inside make data file ", f
    with os.fdopen(fd, "w+b") as tmp:
        tmp.write(content)
    df = Dataset_File()
    df.dataset = dataset
    df.filename = filename
    df.url = 'file://'+f
    df.protocol = "staging"
    df.size = len(content)
    df.verify(allowEmptyChecksums=True)
    df.save()
    print "Df ---", df
Example #3
    def process_enclosure(self, dataset, enclosure):
        filename = getattr(enclosure, 'title', basename(enclosure.href))
        datafile = Dataset_File(url=enclosure.href,
                                filename=filename,
                                dataset=dataset)
        datafile.protocol = enclosure.href.partition('://')[0]
        try:
            datafile.mimetype = enclosure.mime
        except AttributeError:
            pass
        try:
            datafile.size = enclosure.length
        except AttributeError:
            pass
        try:
            # Split on whitespace, then on ':' to get (algorithm, digest)
            # pairs to feed into a dict
            hashdict = dict(s.partition(':')[::2] for s in enclosure.hash.split())
            # Record the SHA-512 sum if one was supplied
            if 'sha-512' in hashdict:
                datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()
        self.make_local_copy(datafile)
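The hash-parsing line is compact but non-obvious: str.partition(':') returns a (head, sep, tail) tuple, and the [::2] slice keeps just head and tail. A standalone illustration with a made-up hash attribute value:

hash_attr = "md5:d41d8cd9 sha-512:cf83e135"
hashdict = dict(s.partition(':')[::2] for s in hash_attr.split())
# hashdict == {'md5': 'd41d8cd9', 'sha-512': 'cf83e135'}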
Example #4
    def process_enclosure(self, dataset, enclosure):
        '''
        Examines one "enclosure" from an entry, representing a datafile.
        Determines whether to process it, and if so, starts the transfer.
        '''
        # TODO tjdett: This method needs a clean-up, as it's doing many more
        # things than originally intended. It now contains more code about
        # deciding whether to process the enclosure than about actually
        # processing it. That decision, or its influencing factors, should be
        # refactored into separate methods. Python has built-in time deltas
        # and Django has time formatting functions, both of which would clean
        # this code up considerably.

        def _get_enclosure_url(enclosure):
            '''Optionally rewrite the datafile URL,
            e.g. http://foo.edu/bar.txt -> file:////fooserver/bar.txt'''
            if IngestOptions.USE_LOCAL_TRANSFERS:
                return enclosure.href.replace(IngestOptions.URL_BASE_TO_REPLACE,
                                              IngestOptions.LOCAL_SOURCE_PATH)
            else:
                return enclosure.href

        filename = getattr(enclosure, 'title', basename(enclosure.href))
        # Check whether we were provided a full path, and hence a
        # subdirectory for the file.
        path_parts = getattr(enclosure, "path", "").split("/")
        if (IngestOptions.DATAFILE_DIRECTORY_DEPTH >= 1 and
                path_parts[IngestOptions.DATAFILE_DIRECTORY_DEPTH:]):
            filename = "/".join(path_parts[IngestOptions.DATAFILE_DIRECTORY_DEPTH:])

        datafiles = dataset.dataset_file_set.filter(filename=filename)
        def fromunix1000(tstr):
            '''Convert a Unix timestamp in milliseconds to a UTC datetime.'''
            return datetime.datetime.utcfromtimestamp(float(tstr) / 1000)
        if datafiles.count() > 0:
            datafile = datafiles[0]
            from django.db.models import Max
            newest = datafiles.aggregate(Max('modification_time'))['modification_time__max']
            if not newest:
                # TODO: rethink this. We have this file but it carries no
                # time/date, so skip it.
                return

            def total_seconds(td): # exists on datetime.timedelta in Python 2.7
                return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
            timediff = total_seconds(fromunix1000(enclosure.modified) - newest)

            if timediff == 0:
                return # We have this file already, same time/date.
            elif timediff < 0:
                logging.getLogger(__name__).warning(
                    "Skipping datafile. File to ingest '{0}' is {1} *older* than "
                    "stored file. Are the system clocks correct?".format(
                        enclosure.href, self.human_time(-timediff)))
                return
            else:
                if not IngestOptions.ALLOW_UPDATING_DATAFILES:
                    logging.getLogger(__name__).warning(
                        "Skipping datafile. ALLOW_UPDATING_DATAFILES is disabled, "
                        "and '{0}' is {1} newer than stored file.".format(
                            enclosure.href, self.human_time(timediff)))
                    return
                logging.getLogger(__name__).info(
                    "Ingesting updated datafile. File to ingest '{0}' is {1} newer "
                    "than stored file. This will create an additional copy.".format(
                        enclosure.href, self.human_time(timediff)))
                if IngestOptions.HIDE_REPLACED_DATAFILES:
                    # Mark all older versions of the file as hidden. (!)
                    try:
                        from tardis.microtardis.models import Dataset_Hidden
                        Dataset_Hidden.objects.filter(datafile__dataset=dataset).update(hidden=True)
                    except ImportError:
                        logging.getLogger(__name__).warning(
                            "The MicroTardis app must be installed in order to "
                            "use the HIDE_REPLACED_DATAFILES option. Existing "
                            "version of datafile {0} will not be hidden.".format(
                                datafile.filename))

        else: # no local copy already.
            logging.getLogger(__name__).info("Ingesting datafile: '{0}'".format(enclosure.href))


        # Create a record and start transferring.
        datafile = Dataset_File(dataset=dataset,
                                url=_get_enclosure_url(enclosure), 
                                filename=filename,
                                created_time=fromunix1000(enclosure.created),
                                modification_time=fromunix1000(enclosure.modified))
        datafile.protocol = enclosure.href.partition('://')[0]
        
        datafile.mimetype = getattr(enclosure, "mime", datafile.mimetype)
        datafile.size = getattr(enclosure, "length", datafile.size)

        try:
            # Split on whitespace, then on ':' to get (algorithm, digest)
            # pairs to feed into a dict
            hashdict = dict(s.partition(':')[::2] for s in enclosure.hash.split())
            # Record the SHA-512 sum if one was supplied
            if 'sha-512' in hashdict:
                datafile.sha512sum = hashdict['sha-512']
        except AttributeError:
            pass
        datafile.save()
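The freshness check above boils down to timestamp arithmetic on the feed's millisecond Unix timestamps. A standalone sketch with made-up values:

import datetime

def fromunix1000(tstr):
    '''Convert a Unix timestamp in milliseconds to a UTC datetime.'''
    return datetime.datetime.utcfromtimestamp(float(tstr) / 1000)

newest = datetime.datetime(2012, 1, 1, 12, 0, 0)  # stored file's time
incoming = fromunix1000("1325421000000")          # 2012-01-01 12:30:00 UTC
timediff = (incoming - newest).total_seconds()    # 1800.0 -> incoming is newer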