def main(args):
    """
    Main entry point.

    Scan every file below ``args.directory`` and compare its on-disk size
    with the size recorded in the DMT database.  Files whose size does not
    match are marked offline, deleted from disk, and any symlink pointing
    at them from the base output directory is removed.

    :param args: parsed command-line arguments with a ``directory`` attribute
    """
    logger.debug('Starting file structure scan.')

    for nc_file in ilist_files(args.directory):
        nc_file_name = os.path.basename(nc_file)
        db_files = DataFile.objects.filter(name=nc_file_name)

        # Evaluate the count once rather than re-querying per branch.
        num_entries = db_files.count()
        if num_entries == 0:
            # BUG FIX: previously execution fell through to the size check
            # with db_file undefined (or stale from the previous loop
            # iteration); skip this file instead.
            logger.error('File not found in database: {}'.format(nc_file))
            continue
        if num_entries > 1:
            logger.error('{} entries found in database for file: {}'.
                         format(num_entries, nc_file))
            continue
        db_file = db_files.first()

        act_size = os.path.getsize(nc_file)
        if act_size != db_file.size:
            # Size mismatch: the on-disk copy is corrupt or truncated, so
            # mark the record offline and remove the bad copy.
            logger.info('File %s has size %d', db_file.name, act_size)
            db_file.online = False
            db_file.directory = None
            db_file.save()

            os.remove(nc_file)
            # Tidy up the symlink from the base output directory when the
            # file lived on a different group workspace.
            if not is_same_gws(nc_file, BASE_OUTPUT_DIR):
                sym_link_path = os.path.join(BASE_OUTPUT_DIR,
                                             construct_drs_path(db_file),
                                             db_file.name)
                try:
                    if os.path.exists(sym_link_path):
                        os.remove(sym_link_path)
                except OSError:
                    logger.error('Unable to delete sym link %s', sym_link_path)
# Example #2
 def test_ilist_files_default_suffix(self):
     """With no suffix argument, ilist_files() yields every .nc file."""
     relative_names = ['file1.nc', 'dir1/file3.nc', 'dir1/file4.nc']
     expected = sorted(
         self.temp_dir.joinpath(name).as_posix() for name in relative_names
     )
     actual = sorted(ilist_files(self.temp_dir))
     self.assertEqual(actual, expected)
# Example #3
 def test_ilist_files_ignore_symlinks(self):
     """With ignore_symlinks=True, only real files are yielded."""
     relative_names = [
         'file1.nc',
         'dir1/file3.nc',
     ]
     expected = sorted(
         self.temp_dir.joinpath(name).as_posix() for name in relative_names
     )
     actual = sorted(ilist_files(self.temp_dir, ignore_symlinks=True))
     self.assertEqual(actual, expected)
# Example #4
def main(args):
    """
    Main entry point.

    Move each file found below ``args.top_dir`` back into the DRS
    directory structure, first verifying its checksum against the DMT
    database, then update the corresponding DataFile record and create a
    symlink from the base output directory when needed.

    :param args: parsed command-line arguments with a ``top_dir`` attribute
    """
    base_dir = Settings.get_solo().base_output_dir

    for extracted_file in ilist_files(args.top_dir):
        found_name = os.path.basename(extracted_file)

        try:
            data_file = DataFile.objects.get(name=found_name)
        except django.core.exceptions.ObjectDoesNotExist:
            logger.warning('Cannot find DMT entry. Skipping {}'.
                           format(extracted_file))
            continue

        # Verify integrity before moving the file anywhere.
        # ROBUSTNESS FIX: a DataFile with no checksum rows previously
        # raised AttributeError on None.checksum_value; skip it instead.
        db_checksum = data_file.checksum_set.first()
        found_checksum = adler32(extracted_file)
        if db_checksum is None or found_checksum != db_checksum.checksum_value:
            logger.warning("Checksum doesn't match. Skipping {}".
                           format(found_name))
            continue

        dest_dir = os.path.join(get_gws_any_dir(extracted_file), 'stream1',
                                construct_drs_path(data_file))
        dest_path = os.path.join(dest_dir, found_name)
        if os.path.exists(dest_path):
            logger.warning('Skipping {} as it already exists at {}'.
                           format(found_name, dest_path))
            continue
        # create the directory if it doesn't exist (exist_ok avoids a
        # check-then-create race with concurrent processes)
        os.makedirs(dest_dir, exist_ok=True)

        os.rename(extracted_file, dest_path)

        # create a link from the base dir
        if not is_same_gws(dest_path, base_dir):
            link_dir = os.path.join(base_dir, construct_drs_path(data_file))
            link_path = os.path.join(link_dir, data_file.name)
            os.makedirs(link_dir, exist_ok=True)
            os.symlink(dest_path, link_path)

        data_file.online = True
        data_file.directory = dest_dir
        data_file.save()
# Example #5
def main(args):
    """
    Main entry point
    """
    for found_path in ilist_files(args.top_path, ignore_symlinks=True):
        local_file = Path(found_path)
        try:
            dmt_entry = DataFile.objects.get(name=local_file.name)
        except django.core.exceptions.ObjectDoesNotExist:
            logger.debug(f'Not in DMT: {found_path}')
            continue

        # Only files recorded as living in the CEDA archive are candidates
        if not dmt_entry.directory.startswith('/badc'):
            continue

        if args.dryrun:
            action = 'Deletable'
        else:
            action = 'Deleting'
            local_file.unlink()
            delete_drs_dir(str(local_file.parent))
        logger.debug(f'{action}: {found_path}')
def scan_file_structure(directory):
    """
    Start the scan of the file structure.

    Walk every file below `directory` and reconcile each one against its
    DataFile record in the database: repair broken symbolic links, mark
    files found on disk as online and keep the recorded directory in sync
    with where the file actually lives.

    :param str directory: the top level directory to scan
    """
    logger.debug('Starting file structure scan.')

    for nc_file in ilist_files(directory):
        nc_file_name = os.path.basename(nc_file)
        # Look the file up by basename; exactly one match is expected
        db_files = DataFile.objects.filter(name=nc_file_name)

        if db_files.count() == 0:
            logger.error('File not found in database: {}'.format(nc_file))
        elif db_files.count() > 1:
            logger.error('{} entries found in database for file: {}'.
                         format(db_files.count(), nc_file))
        else:
            db_file = db_files.first()

            # Check for broken symbolic links
            # os.path.exists() returns False for broken links
            if not os.path.exists(nc_file):
                # Remove the dangling link itself
                os.remove(nc_file)
                if db_file.directory:
                    db_path = os.path.join(db_file.directory, db_file.name)
                    if os.path.exists(db_path):
                        # The database knows where a good copy lives, so
                        # point a fresh link at it
                        logger.warning('Replacing broken link for file {}'.
                                       format(db_file.name))
                        os.symlink(os.path.join(db_file.directory,
                                                db_file.name),
                                   nc_file)
                    # NOTE(review): if the database copy is also missing,
                    # the link has been removed but execution still falls
                    # through to the checks below — confirm this is intended
                else:
                    # No directory recorded: nothing to relink to, so mark
                    # the file offline and move on to the next file
                    logger.warning('Removing broken link for file {}'.
                                   format(db_file.name))
                    if db_file.online:
                        db_file.online = False
                        db_file.save()
                    continue

            # Resolve any remaining symlink to the real on-disk location
            actual_path = os.path.realpath(nc_file)
            actual_dir = os.path.dirname(actual_path)

            # Re-read the record in case it changed since the query above
            db_file.refresh_from_db()
            if not db_file.online:
                # The file exists on disk, so the offline flag is stale
                logger.warning('File status changed to online: {}'.
                               format(nc_file))
                db_file.online = True
                db_file.directory = actual_dir
                db_file.save()

            # A file on disk with no recorded directory gets one
            if db_file.directory is None:
                db_file.directory = actual_dir
                db_file.save()

            if db_file.directory != actual_dir:
                if db_file.directory.startswith(CEDA_BASE):
                    # This file is believed to be in the archive
                    logger.warning('File {} is in the CEDA archive according '
                                   'to the database.'.format(nc_file))
                else:
                    # The file has moved; update the record to match disk
                    logger.warning('Directory for file {} changed from {} '
                                   'to {}'.format(nc_file_name,
                                                  db_file.directory,
                                                  actual_dir))
                    db_file.directory = actual_dir
                    db_file.save()

    logger.debug('Completed file structure scan.')
def _model_ref(class_name, **kwargs):
    """
    Build a serialised reference to a pdata_app Django model instance.

    :param str class_name: the model's class name
    :param kwargs: the keyword arguments used to look the instance up
    :returns: a dict in the format the JSON metadata files use to refer
        to Django model objects
    :rtype: dict
    """
    return {
        "__module__": "pdata_app.models",
        "__kwargs__": kwargs,
        "__class__": class_name
    }


def main(args):
    """
    Main entry point.

    For every JSON metadata file in INPUT_JSON_DIR, add serialised
    references to the Django model objects (activity, experiment, climate
    model, institute, project, variable request and data request) for each
    netCDF file described, then write the result to OUTPUT_JSON_DIR.

    :param args: parsed command-line arguments (not currently used)
    """
    for json_file in ilist_files(INPUT_JSON_DIR, '.json'):
        with open(json_file, 'r') as fh:
            metadata = json.load(fh)

        for nc_file in metadata:
            # Values fixed for this data set
            nc_file['activity_id'] = _model_ref(
                'ActivityId', short_name='HighResMIP')
            nc_file['experiment'] = _model_ref(
                'Experiment', short_name='highresSST-present')
            nc_file['climate_model'] = _model_ref(
                'ClimateModel', short_name='HadGEM3-GC31-LM')
            nc_file['institute'] = _model_ref('Institute', short_name='MOHC')

            filename = nc_file['basename']
            var_name, table_name = filename.split('_')[0:2]

            # PRIMAVERA-only MIP tables are prefixed with 'Prim'
            if table_name.startswith('Prim'):
                project_name = 'PRIMAVERA'
            else:
                project_name = 'CMIP6'
            nc_file['project'] = _model_ref('Project', short_name=project_name)

            cmor_name = _get_cmor_name(var_name, table_name)

            nc_file['variable'] = _model_ref('VariableRequest',
                                             cmor_name=cmor_name,
                                             table_name=table_name)

            nc_file['data_request'] = _model_ref(
                'DataRequest',
                variable_request__cmor_name=cmor_name,
                variable_request__table_name=table_name,
                climate_model__short_name='HadGEM3-GC31-LM',
                experiment__short_name='highresSST-present',
                institute__short_name='MOHC'
            )

        with open(os.path.join(OUTPUT_JSON_DIR, os.path.basename(json_file)),
                  'w') as fh:
            json.dump(metadata, fh, indent=4)