def modify(self, data: CardLiveData) -> CardLiveData:
    """
    Adds taxonomy label columns to the main data frame.

    The per-file taxonomy matches produced by the TaxonomicParser are renamed to
    'lmat_taxonomy' and 'rgi_kmer_taxonomy' and joined onto the main data frame.

    :param data: The CardLiveData object to take the tables from.
    :return: A new CardLiveData object with the extended main data frame and copies
             of the remaining tables.
    """
    # Pass the data frame as a lazy logging argument so that it is only rendered
    # to text when DEBUG logging is actually enabled.
    logger.debug('Main df before %s', data.main_df)

    taxonomy_parser = TaxonomicParser(ncbi_taxa_file=self._ncbi_taxa_file,
                                      df_rgi_kmer=data.rgi_kmer_df,
                                      df_lmat=data.lmat_df)
    matches_df = taxonomy_parser.create_file_matches().rename(
        columns={
            'lmat.taxonomy_label': 'lmat_taxonomy',
            'rgi_kmer.taxonomy_label': 'rgi_kmer_taxonomy',
        })
    # The 'matches' column is not carried over into the main table.
    matches_df = matches_df.drop(columns=['matches'])

    # Left join on the index: every row of the main table is kept, rows without a
    # taxonomy match get missing values in the new columns.
    main_df = data.main_df.merge(matches_df, left_index=True, right_index=True, how='left')
    logger.debug('Main df after %s', main_df)

    rgi_df = data.rgi_df.copy()
    rgi_kmer_df = data.rgi_kmer_df.copy()
    lmat_df = data.lmat_df.copy()
    mlst_df = data.mlst_df.copy()

    return CardLiveData(main_df=main_df,
                        rgi_parser=RGIParser(rgi_df),
                        rgi_kmer_df=rgi_kmer_df,
                        mlst_df=mlst_df,
                        lmat_df=lmat_df)
def read_data(self, input_files: list = None) -> CardLiveData:
    """
    Reads in the data and constructs a CardLiveData object.

    :param input_files: The (optional) list of input files. Leave as None to read every
                        entry found in the configured directory. The optional list exists
                        so the directory does not have to be re-read after running
                        read_or_update_data().
    :return: The CardLiveData object, after all configured data modifiers were applied.
    :raises Exception: If no input files were passed and the configured directory
                       does not exist.
    """
    if input_files is None:
        if not self._directory.exists():
            raise Exception(f'Data directory [card_live_dir={self._directory}] does not exist')
        input_files = list(Path(self._directory).glob('*'))

    json_data = []
    for input_file in input_files:
        filename = path.basename(input_file)
        # JSON is UTF-8 encoded text; be explicit so decoding does not depend on the
        # locale/platform default encoding.
        with open(input_file, encoding='utf-8') as f:
            json_obj = json.load(f)
        # Remember which file each record came from; it becomes the table index below.
        json_obj['filename'] = filename
        json_data.append(json_obj)

    full_df = pd.json_normalize(json_data).set_index('filename')
    full_df = self._replace_empty_list_na(full_df, self.JSON_DATA_FIELDS)
    full_df = self._create_analysis_valid_column(full_df, self.JSON_DATA_FIELDS)
    full_df['timestamp'] = pd.to_datetime(full_df['timestamp'])

    # The main table holds everything except the nested per-tool result fields.
    main_df = full_df.drop(columns=self.JSON_DATA_FIELDS)

    # Each nested result field is expanded into its own table.
    rgi_df = self._expand_column(full_df, 'rgi_main', na_char='n/a').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)
    rgi_kmer_df = self._expand_column(full_df, 'rgi_kmer', na_char='n/a').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)
    mlst_df = self._expand_column(full_df, 'mlst', na_char='-').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)
    lmat_df = self._expand_column(full_df, 'lmat', na_char='n/a').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)

    data = CardLiveData(main_df=main_df,
                        rgi_parser=RGIParser(rgi_df),
                        rgi_kmer_df=rgi_kmer_df,
                        mlst_df=mlst_df,
                        lmat_df=lmat_df)

    # apply data modifiers
    for modifier in self._data_modifiers:
        data = modifier.modify(data)

    return data
def modify(self, data: CardLiveData) -> CardLiveData:
    """
    Runs the main data frame through the region codes service.

    A copy of the main data frame is handed to
    ``add_region_standard_names`` together with the 'geo_area_code' column name,
    and the data frame returned by that call becomes the new main table.

    :param data: The CardLiveData object to take the tables from.
    :return: A new CardLiveData object with the updated main data frame and copies
             of the remaining tables.
    """
    main_df = data.main_df.copy()

    # Pass the data frame as a lazy logging argument so that it is only rendered
    # to text when DEBUG logging is actually enabled.
    logger.debug('Main df before %s', main_df)
    main_df = self._region_codes_service.add_region_standard_names(
        main_df, 'geo_area_code')
    logger.debug('Main df after %s', main_df)

    rgi_df = data.rgi_df.copy()
    rgi_kmer_df = data.rgi_kmer_df.copy()
    lmat_df = data.lmat_df.copy()
    mlst_df = data.mlst_df.copy()

    return CardLiveData(main_df=main_df,
                        rgi_parser=RGIParser(rgi_df),
                        rgi_kmer_df=rgi_kmer_df,
                        mlst_df=mlst_df,
                        lmat_df=lmat_df)
def modify(self, data: CardLiveData) -> CardLiveData:
    """
    Rewrites geo area code 10 to the placeholder code -10 for older entries.

    Only rows of the main data frame whose 'geo_area_code' equals 10 and whose
    'timestamp' is earlier than the configured date threshold are changed.

    :param data: The CardLiveData object to take the tables from.
    :return: A new CardLiveData object built from the adjusted copy of the main data
             frame and copies of the remaining tables.
    """
    na_code = -10

    updated_main = data.main_df.copy()

    # Build the row selection from two named masks before assigning.
    has_code_ten = updated_main['geo_area_code'] == 10
    is_before_threshold = updated_main['timestamp'] < self._date_threshold
    updated_main.loc[has_code_ten & is_before_threshold, 'geo_area_code'] = na_code

    # The other tables are passed through unchanged, but as independent copies.
    rgi_copy = data.rgi_df.copy()
    rgi_kmer_copy = data.rgi_kmer_df.copy()
    lmat_copy = data.lmat_df.copy()
    mlst_copy = data.mlst_df.copy()

    return CardLiveData(
        main_df=updated_main,
        rgi_parser=RGIParser(rgi_copy),
        rgi_kmer_df=rgi_kmer_copy,
        mlst_df=mlst_copy,
        lmat_df=lmat_copy,
    )
[
    'file1', 'Strict', 'class1; class2; class3', 'gene2',
    'antibiotic inactivation', 'family2'
],
[
    'file2', 'Perfect', 'class1; class2; class4', 'gene1',
    'antibiotic efflux; antibiotic target alteration', 'family1'
],
['file3', None, None, None, None, None],
]).set_index('filename')

# Parser wrapped around the RGI fixture table; shared by the tests in this module.
RGI_PARSER = RGIParser(RGI_DF)

# Shared fixture object. The same OTHER_DF table is reused for the rgi_kmer, lmat
# and mlst tables.
DATA = CardLiveData(main_df=MAIN_DF,
                    rgi_parser=RGI_PARSER,
                    rgi_kmer_df=OTHER_DF,
                    lmat_df=OTHER_DF,
                    mlst_df=OTHER_DF)


def test_select_by_time_keepall():
    """
    Selecting the window 2020-08-05 to 2020-08-08 is expected to keep all three files
    and all four RGI rows of the fixture.
    """
    data = DATA

    start = datetime.strptime('2020-08-05 00:00:00', TIME_FMT)
    end = datetime.strptime('2020-08-08 00:00:00', TIME_FMT)

    # Sanity check on the fixture before any selection is applied.
    assert 3 == len(data), 'Data not initialized to correct number of entries'
    data = data.select_by_time(start, end)
    assert 3 == len(data), 'Invalid number after selection'
    assert 3 == len(data.main_df), 'Invalid number after selection'
    assert {'file1', 'file2', 'file3'} == data.files(), 'Invalid files'
    assert 4 == len(data.rgi_parser.df_rgi), 'Invalid number after selection'