def test_read_one_file():
    loader = CardLiveDataLoader(data_dir / 'data1')
    loader.add_data_modifiers([
        AddGeographicNamesModifier(region_codes),
    ])
    data = loader.read_data()

    assert 1 == len(data.main_df)
    assert ['file1'] == data.main_df.index.tolist()
    assert [15] == data.main_df['geo_area_code'].tolist()
    assert 'timestamp' in set(data.main_df.columns.tolist())
    assert 'geo_area_code' in set(data.main_df.columns.tolist())
    assert 'geo_area_name_standard' in set(data.main_df.columns.tolist())
    assert ['Northern Africa'] == data.main_df['geo_area_name_standard'].tolist()
    assert 'matches' not in set(data.main_df.columns.tolist())

    assert 2 == len(data.rgi_df)
    assert ['Perfect', 'Strict'] == data.rgi_df['rgi_main.Cut_Off'].tolist()
    assert ['macrolide antibiotic; cephalosporin',
            'macrolide antibiotic; cephalosporin'] == data.rgi_df['rgi_main.Drug Class'].tolist()
    assert 'timestamp' not in set(data.rgi_df.columns.tolist())
    assert 'geo_area_code' not in set(data.rgi_df.columns.tolist())

    assert ['Enterobacteriaceae (chromosome)'] == data.rgi_kmer_df['rgi_kmer.CARD*kmer Prediction'].tolist()
    assert 'timestamp' not in set(data.rgi_kmer_df.columns.tolist())
    assert 'geo_area_code' not in set(data.rgi_kmer_df.columns.tolist())

    assert ['senterica'] == data.mlst_df['mlst.scheme'].tolist()
    assert 'timestamp' not in set(data.mlst_df.columns.tolist())
    assert 'geo_area_code' not in set(data.mlst_df.columns.tolist())

    assert 'timestamp' not in set(data.lmat_df.columns.tolist())
    assert 'geo_area_code' not in set(data.lmat_df.columns.tolist())
def test_read_or_update_data_noupdate():
    loader = CardLiveDataLoader(data_dir / 'data1')
    data = loader.read_or_update_data()
    assert 1 == len(data.main_df)

    # Passing in up-to-date data should return the identical object.
    new_data = loader.read_or_update_data(data)
    assert data is new_data
def test_data_archive_generator_skip_invalid_file():
    loader = CardLiveDataLoader(data_dir / 'data3')
    memory_archive = write_zip_to_memory_file(loader, ['file1', 'file-invalid'])

    with zipfile.ZipFile(memory_archive, 'r') as zf:
        assert {'card_live/file1'} == set(zf.namelist())

    memory_archive.close()
def test_read_antarctica_switch():
    loader = CardLiveDataLoader(data_dir / 'data2')
    loader.add_data_modifiers([
        AntarcticaNAModifier(np.datetime64('2020-07-20')),
        AddGeographicNamesModifier(region_codes),
    ])
    data = loader.read_data()

    assert 2 == len(data.main_df)
    assert ['file1', 'file2'] == data.main_df.index.tolist()
    assert [-10, 10] == data.main_df['geo_area_code'].tolist()
    assert ['N/A', 'Antarctica'] == data.main_df['geo_area_name_standard'].tolist()

    assert ['Perfect', 'Strict'] == data.rgi_df['rgi_main.Cut_Off'].tolist()
    assert ['macrolide antibiotic; cephalosporin',
            'macrolide antibiotic'] == data.rgi_df['rgi_main.Drug Class'].tolist()

    assert ['Enterobacteriaceae (chromosome)',
            'Salmonella enterica (chromosome)'] == data.rgi_kmer_df['rgi_kmer.CARD*kmer Prediction'].tolist()

    assert ['senterica', 'senterica'] == data.mlst_df['mlst.scheme'].tolist()
    assert ['Salmonella enterica', 'Salmonella enterica'] == data.lmat_df['lmat.taxonomy_label'].tolist()
def write_zip_to_memory_file(loader: CardLiveDataLoader, files: List[str]) -> io.BytesIO:
    """
    Helper method to generate an in-memory zip archive for testing zipping of files.
    :param loader: The CardLiveDataLoader.
    :param files: The files to zip.
    :return: An in-memory file containing the zipped data.
    """
    memory_archive = io.BytesIO()
    for chunk in loader.data_archive_generator(files):
        memory_archive.write(chunk)
    memory_archive.seek(0)

    return memory_archive
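# Because data_archive_generator() yields raw bytes, the same archive can be
# streamed straight to disk rather than into memory. A minimal sketch, assuming
# the same 'data_dir' fixture as the tests above; the output path and file list
# are hypothetical, chosen only for illustration.
#
#     loader = CardLiveDataLoader(data_dir / 'data1')
#     with open('/tmp/card_live_data.zip', 'wb') as fh:  # hypothetical path
#         for chunk in loader.data_archive_generator(['file1']):
#             fh.write(chunk)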
def test_read_or_update_data_withupdate():
    loader = CardLiveDataLoader(data_dir / 'data1')
    data = loader.read_or_update_data()
    assert 1 == len(data.main_df)

    loader = CardLiveDataLoader(data_dir / 'data2')
    new_data = loader.read_or_update_data(data)
    assert data is not new_data
    assert 2 == len(new_data.main_df)
class CardLiveDataManager:
    INSTANCE = None

    def __init__(self, cardlive_home: Path):
        ncbi_db_path = cardlive_home / 'db' / 'taxa.sqlite'
        card_live_data_dir = cardlive_home / 'data' / 'card_live'

        self._data_loader = CardLiveDataLoader(card_live_data_dir)
        self._data_loader.add_data_modifiers([
            AntarcticaNAModifier(np.datetime64('2020-07-20')),
            AddGeographicNamesModifier(region_codes),
            AddTaxonomyModifier(ncbi_db_path),
        ])
        self._card_live_data = self._data_loader.read_or_update_data()

        # Poll for new data every 10 minutes on a single background thread,
        # never running more than one update at a time.
        self._scheduler = BackgroundScheduler(
            jobstores={'default': MemoryJobStore()},
            executors={'default': ThreadPoolExecutor(1)},
            job_defaults={'max_instances': 1})
        self._scheduler.add_job(self.update_job, 'interval', minutes=10)
        self._scheduler.start()

    def update_job(self):
        logger.debug('Updating CARD:Live data.')
        try:
            new_data = self._data_loader.read_or_update_data(self._card_live_data)
            if new_data is not self._card_live_data:
                logger.debug(f'Old data has {len(self._card_live_data)} samples, '
                             f'new data has {len(new_data)} samples')
                self._card_live_data = new_data
        except Exception as e:
            # Keep serving the previously loaded data if an update fails.
            logger.info('An exception occurred when attempting to load new data. Skipping new data.')
            logger.exception(e)
        logger.debug('Finished updating CARD:Live data.')

    def data_archive_generator(self, file_names: Union[List[str], Set[str]] = None
                               ) -> Generator[bytes, None, None]:
        """
        Get the CARD:Live JSON files as a zipstream generator.
        :param file_names: The file names to load into the archive.
        :return: A generator which allows streaming of the zip file contents.
        """
        if file_names is None:
            file_names = self.card_data.files()

        return self._data_loader.data_archive_generator(file_names)

    @property
    def card_data(self) -> CardLiveData:
        return self._card_live_data

    @classmethod
    def create_instance(cls, cardlive_home: Path) -> None:
        cls.INSTANCE = CardLiveDataManager(cardlive_home)

    @classmethod
    def get_instance(cls) -> CardLiveDataManager:
        if cls.INSTANCE is not None:
            return cls.INSTANCE
        else:
            raise Exception(f'{cls} does not yet have an instance.')
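# A minimal usage sketch of the singleton API above. The home directory path is
# hypothetical; its layout must match __init__ ('db/taxa.sqlite' and
# 'data/card_live/' underneath it).
#
#     CardLiveDataManager.create_instance(Path('/srv/cardlive'))  # hypothetical path
#     manager = CardLiveDataManager.get_instance()
#     data = manager.card_data  # the most recently loaded CardLiveData snapshot
#     for chunk in manager.data_archive_generator():  # all files when none given
#         ...  # stream zip archive bytes, e.g. into an HTTP response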