def test_read_one_file():
    loader = CardLiveDataLoader(data_dir / 'data1')
    loader.add_data_modifiers([
        AddGeographicNamesModifier(region_codes),
    ])
    data = loader.read_data()

    assert 1 == len(data.main_df)
    assert ['file1'] == data.main_df.index.tolist()
    assert [15] == data.main_df['geo_area_code'].tolist()
    assert 'timestamp' in set(data.main_df.columns.tolist())
    assert 'geo_area_code' in set(data.main_df.columns.tolist())
    assert 'geo_area_name_standard' in set(data.main_df.columns.tolist())
    assert ['Northern Africa'] == data.main_df['geo_area_name_standard'].tolist()
    assert 'matches' not in set(data.main_df.columns.tolist())

    assert 2 == len(data.rgi_df)
    assert ['Perfect', 'Strict'] == data.rgi_df['rgi_main.Cut_Off'].tolist()
    assert ['macrolide antibiotic; cephalosporin', 'macrolide antibiotic; cephalosporin'] == data.rgi_df[
        'rgi_main.Drug Class'].tolist()
    assert 'timestamp' not in set(data.rgi_df.columns.tolist())
    assert 'geo_area_code' not in set(data.rgi_df.columns.tolist())

    assert ['Enterobacteriaceae (chromosome)'] == data.rgi_kmer_df['rgi_kmer.CARD*kmer Prediction'].tolist()
    assert 'timestamp' not in set(data.rgi_kmer_df.columns.tolist())
    assert 'geo_area_code' not in set(data.rgi_kmer_df.columns.tolist())

    assert ['senterica'] == data.mlst_df['mlst.scheme'].tolist()
    assert 'timestamp' not in set(data.mlst_df.columns.tolist())
    assert 'geo_area_code' not in set(data.mlst_df.columns.tolist())

    assert 'timestamp' not in set(data.lmat_df.columns.tolist())
    assert 'geo_area_code' not in set(data.lmat_df.columns.tolist())

def test_read_or_update_data_noupdate():
    loader = CardLiveDataLoader(data_dir / 'data1')
    data = loader.read_or_update_data()

    assert 1 == len(data.main_df)

    new_data = loader.read_or_update_data(data)
    assert data is new_data
Example 3
class CardLiveDataManager:
    INSTANCE = None

    def __init__(self, cardlive_home: Path):
        ncbi_db_path = cardlive_home / 'db' / 'taxa.sqlite'
        card_live_data_dir = cardlive_home / 'data' / 'card_live'

        self._data_loader = CardLiveDataLoader(card_live_data_dir)
        self._data_loader.add_data_modifiers([
            AntarcticaNAModifier(np.datetime64('2020-07-20')),
            AddGeographicNamesModifier(region_codes),
            AddTaxonomyModifier(ncbi_db_path),
        ])

        self._card_live_data = self._data_loader.read_or_update_data()

        self._scheduler = BackgroundScheduler(
            jobstores={'default': MemoryJobStore()},
            executors={'default': ThreadPoolExecutor(1)},
            job_defaults={'max_instances': 1})
        self._scheduler.add_job(self.update_job, 'interval', minutes=10)
        self._scheduler.start()

    def update_job(self):
        logger.debug('Updating CARD:Live data.')
        try:
            new_data = self._data_loader.read_or_update_data(
                self._card_live_data)
            if new_data is not self._card_live_data:
                logger.debug(
                    f'Old data has {len(self._card_live_data)} samples, new data has {len(new_data)} samples'
                )
                self._card_live_data = new_data
        except Exception as e:
            logger.info(
'An exception occurred when attempting to load new data. Skipping new data.'
            )
            logger.exception(e)
        logger.debug('Finished updating CARD:Live data.')

    @property
    def card_data(self) -> CardLiveData:
        return self._card_live_data

    @classmethod
    def create_instance(cls, cardlive_home: Path) -> None:
        cls.INSTANCE = CardLiveDataManager(cardlive_home)

    @classmethod
    def get_instance(cls) -> CardLiveDataManager:
        if cls.INSTANCE is not None:
            return cls.INSTANCE
        else:
            raise Exception(f'{cls} does not yet have an instance.')
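
`create_instance` and `get_instance` implement a module-level singleton: the manager is built once at application startup and shared by every caller afterwards. A minimal usage sketch (the home directory path here is hypothetical):

from pathlib import Path

# Build the shared instance once at startup (path is a hypothetical example).
CardLiveDataManager.create_instance(Path('/opt/card-live'))

# Anywhere else in the application, fetch the same instance and read its data.
data = CardLiveDataManager.get_instance().card_data
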
def test_data_archive_generator_skip_invalid_file():
    loader = CardLiveDataLoader(data_dir / 'data3')
    memory_archive = write_zip_to_memory_file(loader, ['file1', 'file-invalid'])

    with zipfile.ZipFile(memory_archive, 'r') as zf:
        assert {'card_live/file1'} == set(zf.namelist())

    memory_archive.close()

def test_read_antarctica_switch():
    loader = CardLiveDataLoader(data_dir / 'data2')
    loader.add_data_modifiers([
        AntarcticaNAModifier(np.datetime64('2020-07-20')),
        AddGeographicNamesModifier(region_codes),
    ])
    data = loader.read_data()

    assert 2 == len(data.main_df)
    assert ['file1', 'file2'] == data.main_df.index.tolist()
    assert [-10, 10] == data.main_df['geo_area_code'].tolist()
    assert ['N/A', 'Antarctica'] == data.main_df['geo_area_name_standard'].tolist()
    assert ['Perfect', 'Strict'] == data.rgi_df['rgi_main.Cut_Off'].tolist()
    assert ['macrolide antibiotic; cephalosporin', 'macrolide antibiotic'] == data.rgi_df[
        'rgi_main.Drug Class'].tolist()
    assert ['Enterobacteriaceae (chromosome)', 'Salmonella enterica (chromosome)'] == data.rgi_kmer_df[
        'rgi_kmer.CARD*kmer Prediction'].tolist()
    assert ['senterica', 'senterica'] == data.mlst_df['mlst.scheme'].tolist()
    assert ['Salmonella enterica', 'Salmonella enterica'] == data.lmat_df['lmat.taxonomy_label'].tolist()

def write_zip_to_memory_file(loader: CardLiveDataLoader, files: List[str]) -> io.BytesIO:
    """
    Helper function to generate an in-memory zip archive for testing zipping of files.
    :param loader: The CardLiveDataLoader.
    :param files: The files to zip.
    :return: An in-memory file containing the zipped data.
    """
    memory_archive = io.BytesIO()
    for chunk in loader.data_archive_generator(files):
        memory_archive.write(chunk)
    memory_archive.seek(0)

    return memory_archive
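
The same generator can be drained to disk instead of memory when an actual archive file is wanted; a minimal sketch (this helper and its output path are hypothetical, not part of the original code):

def write_zip_to_disk(loader: CardLiveDataLoader, files: List[str], out_path: Path) -> None:
    """Hypothetical helper: stream the archive chunks straight to a file on disk."""
    with open(out_path, 'wb') as fh:
        for chunk in loader.data_archive_generator(files):
            fh.write(chunk)
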

def test_read_or_update_data_withupdate():
    loader = CardLiveDataLoader(data_dir / 'data1')
    data = loader.read_or_update_data()

    assert 1 == len(data.main_df)

    loader = CardLiveDataLoader(data_dir / 'data2')
    new_data = loader.read_or_update_data(data)
    assert data is not new_data
    assert 2 == len(new_data.main_df)
Example 9
class CardLiveDataManager:
    INSTANCE = None

    def __init__(self, cardlive_home: Path):
        ncbi_db_path = cardlive_home / 'db' / 'taxa.sqlite'
        card_live_data_dir = cardlive_home / 'data' / 'card_live'

        self._data_loader = CardLiveDataLoader(card_live_data_dir)
        self._data_loader.add_data_modifiers([
            AntarcticaNAModifier(np.datetime64('2020-07-20')),
            AddGeographicNamesModifier(region_codes),
            AddTaxonomyModifier(ncbi_db_path),
        ])

        self._card_live_data = self._data_loader.read_or_update_data()

        self._scheduler = BackgroundScheduler(
            jobstores={'default': MemoryJobStore()},
            executors={'default': ThreadPoolExecutor(1)},
            job_defaults={'max_instances': 1})
        self._scheduler.add_job(self.update_job, 'interval', minutes=10)
        self._scheduler.start()

    def update_job(self):
        logger.debug('Updating CARD:Live data.')
        try:
            new_data = self._data_loader.read_or_update_data(
                self._card_live_data)
            if new_data is not self._card_live_data:
                logger.debug(
                    f'Old data has {len(self._card_live_data)} samples, new data has {len(new_data)} samples'
                )
                self._card_live_data = new_data
        except Exception as e:
            logger.info(
'An exception occurred when attempting to load new data. Skipping new data.'
            )
            logger.exception(e)
        logger.debug('Finished updating CARD:Live data.')

    def data_archive_generator(
        self,
        file_names: Optional[Union[List[str], Set[str]]] = None
    ) -> Generator[bytes, None, None]:
        """
        Get the CARD:Live JSON files as a zipstream generator.
        :param file_names: The file names to load into the archive.
        :return: A generator which allows streaming of the zip file contents.
        """
        if file_names is None:
            file_names = self.card_data.files()

        return self._data_loader.data_archive_generator(file_names)

    @property
    def card_data(self) -> CardLiveData:
        return self._card_live_data

    @classmethod
    def create_instance(cls, cardlive_home: Path) -> None:
        cls.INSTANCE = CardLiveDataManager(cardlive_home)

    @classmethod
    def get_instance(cls) -> CardLiveDataManager:
        if cls.INSTANCE is not None:
            return cls.INSTANCE
        else:
            raise Exception(f'{cls} does not yet have an instance.')
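
Since `data_archive_generator` falls back to `self.card_data.files()` when no names are given, the full data set can be zipped through the singleton without touching the loader directly. A minimal sketch (assumes `create_instance` has already been called):

import io
import zipfile

manager = CardLiveDataManager.get_instance()

# Collect the streamed chunks for every known file into an in-memory buffer,
# mirroring the write_zip_to_memory_file() test helper above.
buffer = io.BytesIO()
for chunk in manager.data_archive_generator():
    buffer.write(chunk)
buffer.seek(0)

with zipfile.ZipFile(buffer, 'r') as zf:
    print(zf.namelist())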