예제 #1
0
    def modify(self, data: CardLiveData) -> CardLiveData:
        """
        Builds a new CardLiveData object whose main table carries the LMAT and RGI k-mer taxonomy labels.
        :param data: The CardLiveData object to read the tables from.
        :return: A new CardLiveData object. Its main table is the input main table left-joined (on the index)
                 with the renamed file matches; the other tables are copies of the input tables.
        """
        # Pass the DataFrame as a lazy %-style argument so it is only rendered
        # to text when DEBUG is actually enabled (an f-string would always
        # build the full string). Assumes `logger` is a standard `logging`
        # logger -- TODO confirm.
        logger.debug('Main df before %s', data.main_df)
        taxonomy_parser = TaxonomicParser(ncbi_taxa_file=self._ncbi_taxa_file,
                                          df_rgi_kmer=data.rgi_kmer_df,
                                          df_lmat=data.lmat_df)
        matches_df = taxonomy_parser.create_file_matches().rename(
            columns={
                'lmat.taxonomy_label': 'lmat_taxonomy',
                'rgi_kmer.taxonomy_label': 'rgi_kmer_taxonomy'
            })
        # The 'matches' column is not carried over into the main table.
        matches_df = matches_df.drop(columns=['matches'])
        # Left join: rows of the main table are kept even when no match exists.
        main_df = data.main_df.merge(matches_df,
                                     left_index=True,
                                     right_index=True,
                                     how='left')
        logger.debug('Main df after %s', main_df)

        # Copy the remaining tables so the returned object does not share them
        # with the input object.
        rgi_df = data.rgi_df.copy()
        rgi_kmer_df = data.rgi_kmer_df.copy()
        lmat_df = data.lmat_df.copy()
        mlst_df = data.mlst_df.copy()

        return CardLiveData(main_df=main_df,
                            rgi_parser=RGIParser(rgi_df),
                            rgi_kmer_df=rgi_kmer_df,
                            mlst_df=mlst_df,
                            lmat_df=lmat_df)
    def read_data(self, input_files: list = None) -> CardLiveData:
        """
        Reads in the data and constructs a CardLiveData object.
        :param input_files: The (optional) list of input files. Leave as None to read from the configured directory.
                            The optional list is used so I don't have to re-read the directory after running read_or_update_data().
        :return: The CardLiveData object, after each configured data modifier has been applied in order.
        :raises Exception: If input_files is None and the configured data directory does not exist.
        """
        if input_files is None:
            if not self._directory.exists():
                raise Exception(f'Data directory [card_live_dir={self._directory}] does not exist')
            input_files = list(Path(self._directory).glob('*'))

        json_data = []
        for input_file in input_files:
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # default encoding, which is not UTF-8 everywhere.
            with open(input_file, encoding='utf-8') as f:
                json_obj = json.load(f)
            # Record the source file name; it becomes the index of every table.
            json_obj['filename'] = path.basename(input_file)
            json_data.append(json_obj)

        full_df = pd.json_normalize(json_data).set_index('filename')
        full_df = self._replace_empty_list_na(full_df, self.JSON_DATA_FIELDS)
        full_df = self._create_analysis_valid_column(full_df, self.JSON_DATA_FIELDS)
        full_df['timestamp'] = pd.to_datetime(full_df['timestamp'])

        # The main table is everything except the nested per-tool JSON fields;
        # each of those fields is expanded into its own table below.
        main_df = full_df.drop(columns=self.JSON_DATA_FIELDS)
        rgi_df = self._expand_column(full_df, 'rgi_main', na_char='n/a').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)
        rgi_kmer_df = self._expand_column(full_df, 'rgi_kmer', na_char='n/a').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)
        mlst_df = self._expand_column(full_df, 'mlst', na_char='-').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)
        lmat_df = self._expand_column(full_df, 'lmat', na_char='n/a').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)

        data = CardLiveData(main_df=main_df,
                            rgi_parser=RGIParser(rgi_df),
                            rgi_kmer_df=rgi_kmer_df,
                            mlst_df=mlst_df,
                            lmat_df=lmat_df)

        # apply data modifiers
        for modifier in self._data_modifiers:
            data = modifier.modify(data)

        return data
    def modify(self, data: CardLiveData) -> CardLiveData:
        """
        Builds a new CardLiveData object whose main table has been passed through the region codes service.
        :param data: The CardLiveData object to read the tables from.
        :return: A new CardLiveData object. Its main table is the result of add_region_standard_names()
                 applied to a copy of the input main table using the 'geo_area_code' column; the other
                 tables are copies of the input tables.
        """
        main_df = data.main_df.copy()
        # Pass the DataFrame as a lazy %-style argument so it is only rendered
        # to text when DEBUG is actually enabled (an f-string would always
        # build the full string). Assumes `logger` is a standard `logging`
        # logger -- TODO confirm.
        logger.debug('Main df before %s', main_df)
        main_df = self._region_codes_service.add_region_standard_names(
            main_df, 'geo_area_code')
        logger.debug('Main df after %s', main_df)

        # Copy the remaining tables so the returned object does not share them
        # with the input object.
        rgi_df = data.rgi_df.copy()
        rgi_kmer_df = data.rgi_kmer_df.copy()
        lmat_df = data.lmat_df.copy()
        mlst_df = data.mlst_df.copy()

        return CardLiveData(main_df=main_df,
                            rgi_parser=RGIParser(rgi_df),
                            rgi_kmer_df=rgi_kmer_df,
                            mlst_df=mlst_df,
                            lmat_df=lmat_df)
예제 #4
0
    def modify(self, data: CardLiveData) -> CardLiveData:
        """
        Builds a new CardLiveData object in which geographic area code 10 is replaced by -10
        for every main-table row whose timestamp is earlier than the configured date threshold.
        :param data: The CardLiveData object to read the tables from.
        :return: A new CardLiveData object built from copies of the input tables.
        """
        na_code = -10

        updated_main = data.main_df.copy()
        # Rows to rewrite: area code is 10 AND the entry predates the threshold.
        has_code_10 = updated_main['geo_area_code'] == 10
        predates_threshold = updated_main['timestamp'] < self._date_threshold
        updated_main.loc[has_code_10 & predates_threshold, 'geo_area_code'] = na_code

        # The remaining tables are passed along as copies of the input tables.
        return CardLiveData(main_df=updated_main,
                            rgi_parser=RGIParser(data.rgi_df.copy()),
                            rgi_kmer_df=data.rgi_kmer_df.copy(),
                            lmat_df=data.lmat_df.copy(),
                            mlst_df=data.mlst_df.copy())
        [
            'file1', 'Strict', 'class1; class2; class3', 'gene2',
            'antibiotic inactivation', 'family2'
        ],
        [
            'file2', 'Perfect', 'class1; class2; class4', 'gene1',
            'antibiotic efflux; antibiotic target alteration', 'family1'
        ],
        ['file3', None, None, None, None, None],
    ]).set_index('filename')

# Module-level parser built from the RGI_DF fixture table.
RGI_PARSER = RGIParser(RGI_DF)

# Module-level CardLiveData fixture. The same OTHER_DF frame is supplied for
# the rgi_kmer, lmat and mlst tables.
DATA = CardLiveData(main_df=MAIN_DF,
                    rgi_parser=RGI_PARSER,
                    rgi_kmer_df=OTHER_DF,
                    lmat_df=OTHER_DF,
                    mlst_df=OTHER_DF)


def test_select_by_time_keepall():
    # Expectation: selecting the window 2020-08-05 .. 2020-08-08 keeps all
    # three entries of the shared DATA fixture.
    data = DATA
    start = datetime.strptime('2020-08-05 00:00:00', TIME_FMT)
    end = datetime.strptime('2020-08-08 00:00:00', TIME_FMT)

    # Sanity-check the fixture before selecting.
    assert 3 == len(data), 'Data not initialized to correct number of entries'
    data = data.select_by_time(start, end)
    # After selection: the entry count, main table and file set are unchanged.
    assert 3 == len(data), 'Invalid number after selection'
    assert 3 == len(data.main_df), 'Invalid number after selection'
    assert {'file1', 'file2', 'file3'} == data.files(), 'Invalid files'
    # The RGI table is expected to hold 4 rows for these files.
    assert 4 == len(data.rgi_parser.df_rgi), 'Invalid number after selection'