def test_no_tags(self):
    """Tests calling get_data_type_tags() with no tags"""

    ingest = Ingest()
    tags = ingest.get_data_type_tags()
    self.assertSetEqual(tags, set())
def test_deduplicate_ingest_list_no_existing(self, ingests_by_scan):
    """Tests calling S3Scanner._deduplicate_ingest_list() without existing ingests"""

    ingests_by_scan.return_value = []

    ingests = [Ingest(file_name='test1'), Ingest(file_name='test2')]
    final_ingests = S3Scanner._deduplicate_ingest_list(None, ingests)
    self.assertItemsEqual(ingests, final_ingests)
def test_deduplicate_ingest_list_with_duplicate_file_names(self, ingests_by_scan):
    """Tests calling S3Scanner._deduplicate_ingest_list() with duplicates"""

    ingests_by_scan.return_value = []

    ingests = [Ingest(file_name='test1'), Ingest(file_name='test1')]
    final_ingests = S3Scanner._deduplicate_ingest_list(None, ingests)
    self.assertEqual(len(final_ingests), 1)
    self.assertEqual(final_ingests[0].file_name, 'test1')
def test_deduplicate_ingest_list_with_existing_no_other_dups(self, ingests_by_scan):
    """Tests calling S3Scanner._deduplicate_ingest_list() with existing ingests and no other dups"""

    ingests_by_scan.return_value = [Ingest(file_name='test1')]

    ingests = [Ingest(file_name='test1'), Ingest(file_name='test2')]
    final_ingests = S3Scanner._deduplicate_ingest_list(None, ingests)
    self.assertEqual(len(final_ingests), 1)
    self.assertEqual(final_ingests[0].file_name, 'test2')
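# A minimal sketch of the method the three tests above exercise; in the tests
# it is called unbound as S3Scanner._deduplicate_ingest_list(None, ingests),
# so the first argument is treated here as a scan id. The manager helper name
# get_ingests_by_scan is an assumption taken from the mocked fixture name,
# not confirmed API.
def _deduplicate_ingest_list(scan_id, new_ingests):
    """Drops any ingest whose file_name already exists for this scan or
    appears earlier in new_ingests"""
    seen = {ingest.file_name for ingest in Ingest.objects.get_ingests_by_scan(scan_id)}
    final_ingests = []
    for ingest in new_ingests:
        if ingest.file_name not in seen:
            seen.add(ingest.file_name)
            final_ingests.append(ingest)
    return final_ingests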
def test_tags(self):
    """Tests calling get_data_type_tags() with tags"""

    ingest = Ingest(data_type='A,B,c')
    tags = ingest.get_data_type_tags()
    self.assertSetEqual(tags, {'A', 'B', 'c'})
def test_tags(self):
    """Tests calling get_data_type_tags() with tags"""

    ingest = Ingest(data_type_tags=['A', 'B', 'c'])
    tags = ingest.get_data_type_tags()
    self.assertSetEqual(tags, {'A', 'B', 'c'})
def test_same_tag(self):
    """Tests calling add_data_type_tag() with the same tag twice"""

    ingest = Ingest()
    ingest.add_data_type_tag('Hello1')
    ingest.add_data_type_tag('Hello1')
    tags = ingest.get_data_type_tags()
    self.assertSetEqual(tags, {'Hello1'})
def test_invalid(self):
    """Tests calling add_data_type_tag() with invalid tags"""

    ingest = Ingest()
    self.assertRaises(InvalidDataTypeTag, ingest.add_data_type_tag, 'my.invalid.tag')
    # Backslashes are escaped so the tag literally contains '\' characters
    self.assertRaises(InvalidDataTypeTag, ingest.add_data_type_tag, 'my\\invalid\\tag!')
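# A minimal sketch of the Ingest tag helpers the tests above assume, written as
# methods on the Ingest model. The comma-separated storage in a 'data_type'
# field is inferred from the first test_tags() variant, and VALID_TAG is an
# assumed pattern: the tests only show that letters, digits, and underscores
# pass while '.' and '\' characters fail.
import re

VALID_TAG = re.compile(r'^[a-zA-Z0-9_]+$')  # assumed validation pattern

def get_data_type_tags(self):
    """Returns the set of data type tags on this ingest"""
    tags = set()
    if self.data_type:
        tags.update(self.data_type.split(','))
    return tags

def add_data_type_tag(self, tag):
    """Adds a tag after validating it; adding a duplicate is a no-op"""
    if not VALID_TAG.match(tag):
        raise InvalidDataTypeTag('%s is an invalid data type tag' % tag)
    tags = self.get_data_type_tags()
    tags.add(tag)
    self.data_type = ','.join(tags)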
def _create_ingest(self, file_name):
    """Creates a new ingest for the given file name. The database save is the
    caller's responsibility.

    :param file_name: The name of the file being ingested
    :type file_name: string
    :returns: The new ingest model
    :rtype: :class:`ingest.models.Ingest`
    """

    ingest = Ingest()
    ingest.file_name = file_name
    ingest.strike_id = self.strike_id
    ingest.media_type = get_media_type(file_name)
    ingest.workspace = self._monitored_workspace

    logger.info('New file on %s: %s', ingest.workspace.name, file_name)
    return ingest
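# A hypothetical caller of _create_ingest(), illustrating the docstring's note
# that the database save is the caller's responsibility; the file name, size,
# and the transaction wrapper are assumptions.
with transaction.atomic():
    ingest = self._create_ingest('example_file.h5')
    ingest.file_size = 1024  # illustrative size
    ingest.save()  # the caller, not _create_ingest(), persists the model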
def test_valid(self):
    """Tests calling add_data_type_tag() with valid tags"""

    ingest = Ingest()
    ingest.add_data_type_tag('Hello1')
    ingest.add_data_type_tag('foo_BAR')
    tags = ingest.get_data_type_tags()
    self.assertSetEqual(tags, {'Hello1', 'foo_BAR'})
def handle(self, *args, **options):
    """See :meth:`django.core.management.base.BaseCommand.handle`.

    This method migrates existing data files into scale.
    """

    logger.info(u'Command starting: migratedata')

    workspace, workspace_path, local_path, data_types = None, None, None, []
    if options['workspace'] is not None and options['workspace_path'] is not None:
        workspace, workspace_path = options['workspace'], options['workspace_path']
        tmp = Workspace.objects.filter(name=workspace)
        if tmp.count() > 0:
            workspace = tmp.first()
        else:
            workspace = Workspace.objects.get(id=int(workspace))
    else:
        logger.error('Must specify workspace and workspace-path.')
        return False
    if options['data_type'] is not None:
        data_types.extend(options['data_type'])

    mnt_dirs = None
    if options['local_path'] is not None:
        local_path = options['local_path']
    else:  # mount the workspace
        mnt_dirs = ('/tmp', tempfile.mkdtemp())
        workspace.setup_download_dir(*mnt_dirs)
        local_path = os.path.join(mnt_dirs[1], workspace_path)

    logger.info('Ingesting files from %s/%s', workspace.name, workspace_path)
    filenames = self.generate_file_list(local_path, options['include'], options['exclude'])
    logger.info('Found %d files', len(filenames))

    # Prepare for ingest a la Strike
    ingest_records = {}
    for filename in filenames:
        logger.info('Generating ingest record for %s', filename)
        ingest = Ingest()
        ingest.file_name = os.path.basename(filename)
        ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
        ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
        ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
        ingest.transfer_ended = timezone.now()
        ingest.media_type = get_media_type(filename)
        ingest.workspace = workspace
        for data_type in data_types:
            ingest.add_data_type_tag(data_type)
        ingest.status = 'TRANSFERRED'
        if options['no_commit']:
            s = IngestDetailsSerializer()
            logger.info(s.to_representation(ingest))
        else:
            ingest.save()
            ingest_records[filename] = ingest.id
    logger.info('Ingest records created')

    # Start ingest tasks for all the files
    if not options['no_commit']:
        logger.info('Starting ingest tasks')
        for filename in filenames:
            ingest = Ingest.objects.get(id=ingest_records[filename])
            logger.info('Processing ingest %s', ingest.file_name)
            with transaction.atomic():
                ingest.ingest_started = timezone.now()
                sf = ingest.source_file = SourceFile.create()
                sf.update_uuid(ingest.file_name)
                for tag in ingest.get_data_type_tags():
                    sf.add_data_type_tag(tag)
                sf.media_type = ingest.media_type
                sf.file_name = ingest.file_name
                sf.file_size = ingest.file_size
                sf.file_path = ingest.file_path
                sf.workspace = workspace
                sf.is_deleted = False
                sf.deleted = None
                sf.save()
                sf.set_countries()
                sf.save()
                ingest.status = 'INGESTED'
                ingest.ingest_ended = timezone.now()
                ingest.source_file = sf
                ingest.save()
                IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)
        logger.info('Ingests processed, monitor the queue for triggered jobs.')

    if mnt_dirs is not None:
        workspace.cleanup_download_dir(*mnt_dirs)

    logger.info(u'Command completed: migratedata')
def handle(self, *args, **options):
    """See :meth:`django.core.management.base.BaseCommand.handle`.

    This method migrates existing data files into scale.
    """

    logger.info(u'Command starting: migratedata')

    workspace, workspace_path, local_path, data_types = None, None, None, []
    if options['workspace'] is not None and options['workspace_path'] is not None:
        workspace, workspace_path = options['workspace'], options['workspace_path']
        tmp = Workspace.objects.filter(name=workspace)
        if tmp.count() > 0:
            workspace = tmp.first()
        else:
            workspace = Workspace.objects.get(id=int(workspace))
    else:
        logger.error('Must specify workspace and workspace-path.')
        return False
    if options['data_type'] is not None:
        data_types.extend(options['data_type'])

    mnt_dirs = None
    if options['local_path'] is not None:
        local_path = options['local_path']
    else:  # mount the workspace
        mnt_dirs = ('/tmp', tempfile.mkdtemp())
        workspace.setup_download_dir(*mnt_dirs)
        local_path = os.path.join(mnt_dirs[1], workspace_path)

    logger.info('Ingesting files from %s/%s', workspace.name, workspace_path)
    filenames = self.generate_file_list(local_path, options['include'], options['exclude'])
    logger.info('Found %d files', len(filenames))

    # Prepare for ingest a la Strike
    ingest_records = {}
    for filename in filenames:
        logger.info('Generating ingest record for %s', filename)
        ingest = Ingest()
        ingest.file_name = os.path.basename(filename)
        ingest.transfer_path = filename
        ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
        ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
        ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
        ingest.transfer_ended = datetime.utcnow()
        ingest.media_type = get_media_type(filename)
        ingest.workspace = workspace
        for data_type in data_types:
            ingest.add_data_type_tag(data_type)
        ingest.status = 'TRANSFERRED'
        if options['no_commit']:
            s = IngestDetailsSerializer()
            logger.info(s.to_representation(ingest))
        else:
            ingest.save()
            ingest_records[filename] = ingest.id
    logger.info('Ingest records created')

    # Start ingest tasks for all the files
    if not options['no_commit']:
        logger.info('Starting ingest tasks')
        for filename in filenames:
            ingest = Ingest.objects.get(id=ingest_records[filename])
            logger.info('Processing ingest %s', ingest.file_name)
            with transaction.atomic():
                ingest.ingest_started = datetime.utcnow()
                sf = ingest.source_file = SourceFile()
                sf.update_uuid(ingest.file_name)
                for tag in ingest.get_data_type_tags():
                    sf.add_data_type_tag(tag)
                sf.media_type = ingest.media_type
                sf.file_name = ingest.file_name
                sf.file_size = ingest.file_size
                sf.file_path = ingest.file_path
                sf.workspace = workspace
                sf.is_deleted = False
                sf.deleted = None
                sf.save()
                sf.set_countries()
                sf.save()
                ingest.status = 'INGESTED'
                ingest.ingest_ended = datetime.utcnow()
                ingest.source_file = sf
                ingest.save()
                IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)
        logger.info('Ingests processed, monitor the queue for triggered jobs.')

    if mnt_dirs is not None:
        workspace.cleanup_download_dir(*mnt_dirs)

    logger.info(u'Command completed: migratedata')
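# A hypothetical invocation of the migratedata command above. The flag
# spellings are assumptions inferred from the options dict ('workspace',
# 'workspace_path', 'data_type', 'local_path', 'include', 'exclude',
# 'no_commit') and the error message; they are not confirmed against the
# command's add_arguments().
#
#   python manage.py migratedata --workspace my-workspace \
#       --workspace-path path/in/workspace --data-type EXAMPLE_TAG --no-commit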
def _process_file(self, file_name, ingest):
    """Processes the given file in the Strike directory. The file_name argument
    represents a file in the Strike directory to process. If file_name is None,
    then the ingest argument represents an ongoing transfer where the file is
    unexpectedly not in the Strike directory. If file_name is not None and
    ingest is None, then this is a new transfer without an ingest record yet.
    If both arguments are None, an exception is raised.

    :param file_name: The name of the file to process (possibly None)
    :type file_name: str
    :param ingest: The ingest model for the file (possibly None)
    :type ingest: :class:`ingest.models.Ingest`
    """

    if file_name is None and ingest is None:
        raise Exception('Nothing for Strike to process')
    if file_name is None:
        file_name = ingest.file_name
    file_path = os.path.join(self.strike_dir, file_name)
    final_name = self._final_filename(file_name)

    # Create ingest model for new transfer
    if ingest is None:
        msg = 'New file %s has arrived, creating ingest for %s'
        logger.info(msg, file_path, final_name)
        ingest = Ingest()
        # Ingest model should record the actual name of the file (no temporary suffix)
        ingest.file_name = final_name
        ingest.strike_id = self.strike_id
        # TODO: investigate better way to get start time of transfer
        last_access = os.path.getatime(file_path)
        ingest.transfer_path = os.path.join(self.strike_dir, final_name)
        ingest.transfer_started = datetime.utcfromtimestamp(last_access)

    if ingest.status == 'TRANSFERRING':
        # Update bytes transferred
        size = os.path.getsize(file_path)
        ingest.bytes_transferred = size

        # Ensure that file is still in Strike dir as expected
        if not os.path.exists(file_path):
            msg = '%s was being transferred, but the file is now lost'
            logger.error(msg, file_path)
            ingest.status = 'ERRORED'
            ingest.save()
            logger.info('Ingest for %s marked as ERRORED', final_name)
            return

        if self._is_still_transferring(file_name):
            # Update with current progress of the transfer
            ingest.save()
            logger.info('%s is still transferring, progress updated', file_path)
        else:
            # Transfer is complete, will move on to next section
            self._complete_transfer(ingest, size)

    if ingest.status == 'TRANSFERRED':
        if ingest.ingest_path:
            self._prepare_file_for_ingest(ingest)
        else:
            self._defer_file(ingest)
    elif ingest.status != 'TRANSFERRING':
        msg = 'Strike not expecting to process file with status %s'
        raise Exception(msg % ingest.status)
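# A minimal sketch of the helpers _process_file() relies on, assuming files
# are transferred with a temporary suffix (here '_tmp', an assumed convention)
# that is dropped once the transfer completes. Only the method names come from
# the code above; the bodies are illustrative.
TRANSFER_SUFFIX = '_tmp'  # assumed temporary-transfer suffix

def _final_filename(self, file_name):
    """Returns the final file name with any temporary transfer suffix removed"""
    if file_name.endswith(TRANSFER_SUFFIX):
        return file_name[:-len(TRANSFER_SUFFIX)]
    return file_name

def _is_still_transferring(self, file_name):
    """Returns True if the file name still carries the temporary suffix"""
    return file_name.endswith(TRANSFER_SUFFIX)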