def apply_filters(data: CardLiveData, rgi_cutoff_select: str,
                  drug_classes: List[str], amr_gene_families: List[str],
                  resistance_mechanisms: List[str], amr_genes: List[str],
                  custom_date: Dict[str, datetime]) -> Dict[str, CardLiveData]:
    """
    Applies the RGI selection criteria to the data and builds per-time-period
    subsets of the filtered result.

    :param data: The data to filter.
    :param rgi_cutoff_select: The RGI cutoff level to keep.
    :param drug_classes: Drug classes to filter files by.
    :param amr_gene_families: AMR gene families to filter files by.
    :param resistance_mechanisms: Resistance mechanisms to filter files by.
    :param amr_genes: AMR genes to filter files by.
    :param custom_date: Optional dict with 'start'/'end' datetimes for a
        custom time window (None if no custom window is selected).
    :return: A dict mapping a time-period label to the filtered data subset.
    """
    time_now = datetime.now()

    # Apply each RGI selection in sequence; each select() narrows the data.
    rgi_selections = [
        dict(by='cutoff', type='row', level=rgi_cutoff_select),
        dict(by='drug', type='file', elements=drug_classes),
        dict(by='amr_gene_family', type='file', elements=amr_gene_families),
        dict(by='resistance_mechanism', type='file', elements=resistance_mechanisms),
        dict(by='amr_gene', type='file', elements=amr_genes),
    ]
    for selection in rgi_selections:
        data = data.select(table='rgi', **selection)

    # Standard look-back windows, each ending at the current time.
    periods = {
        'day': DAY,
        'week': WEEK,
        'month': MONTH,
        '3 months': THREE_MONTHS,
        '6 months': SIX_MONTHS,
        'year': YEAR,
    }
    time_subsets = {'all': data}
    for label, delta in periods.items():
        time_subsets[label] = data.select(table='main', by='time',
                                          start=time_now - delta, end=time_now)

    # The 'custom' entry falls back to the unrestricted data when no
    # custom window was provided.
    if custom_date is None:
        time_subsets['custom'] = data
    else:
        time_subsets['custom'] = data.select(table='main', by='time',
                                             start=custom_date['start'],
                                             end=custom_date['end'])

    return time_subsets
def read_or_update_data(self, existing_data: 'CardLiveData' = None) -> 'CardLiveData':
    """
    Given an existing data object, updates the data object with any new files.

    :param existing_data: The existing data object (None if all data should be read).
    :return: The original (unmodified) data object if no updates, otherwise a new
             data object with additional data.
    """
    # Verify the directory before listing it. Previously this check only ran
    # in the elif branch AFTER listdir() had already been called, so a missing
    # directory surfaced as a raw FileNotFoundError and the friendly message
    # below was unreachable.
    data_dir = Path(self._directory)
    if not data_dir.exists():
        raise Exception(
            f'Data directory [card_live_dir={self._directory}] does not exist'
        )

    # Only regular files, in sorted order (Path objects sort by path string).
    input_files = sorted(f for f in data_dir.iterdir() if f.is_file())

    if existing_data is None:
        return self.read_data(input_files)

    existing_files = existing_data.files()
    input_files_set = {p.name for p in input_files}
    files_new = input_files_set - existing_files

    # If no new files have been found, keep the existing object unchanged.
    if len(files_new) == 0:
        logger.debug(
            f'Data has not changed from {len(input_files_set)} samples, not updating'
        )
        return existing_data

    logger.info(f'{len(files_new)} additional samples found.')
    return self.read_data(input_files)
def modify(self, data: CardLiveData) -> CardLiveData:
    """
    Attaches lmat and rgi-kmer taxonomy columns to the main table by merging
    in per-file taxonomy matches, returning a new CardLiveData object.

    :param data: The data object to modify.
    :return: A new CardLiveData object with taxonomy columns on main_df.
    """
    logger.debug(f'Main df before {data.main_df}')

    parser = TaxonomicParser(ncbi_taxa_file=self._ncbi_taxa_file,
                             df_rgi_kmer=data.rgi_kmer_df,
                             df_lmat=data.lmat_df)
    column_renames = {
        'lmat.taxonomy_label': 'lmat_taxonomy',
        'rgi_kmer.taxonomy_label': 'rgi_kmer_taxonomy',
    }
    # Per-file taxonomy labels; the intermediate 'matches' column is not needed.
    matches_df = (parser.create_file_matches()
                  .rename(columns=column_renames)
                  .drop(columns=['matches']))

    # Left-merge on the index so files without matches keep NaN taxonomy.
    merged_main = data.main_df.merge(matches_df, left_index=True,
                                     right_index=True, how='left')
    logger.debug(f'Main df after {merged_main}')

    return CardLiveData(main_df=merged_main,
                        rgi_parser=RGIParser(data.rgi_df.copy()),
                        rgi_kmer_df=data.rgi_kmer_df.copy(),
                        mlst_df=data.mlst_df.copy(),
                        lmat_df=data.lmat_df.copy())
def choropleth_drug(data: CardLiveData, world: geopandas.GeoDataFrame):
    """
    Builds a choropleth map of sample counts per geographic region.

    :param data: The data to count samples from.
    :param world: GeoDataFrame of world regions (keyed by un_m49_numeric).
    :return: A plotly figure (EMPTY_MAP when there is nothing to show).
    """
    counts = data.sample_counts(['geo_area_code', 'geo_area_name_standard']).reset_index()
    # Remove N/A from counts so it doesn't mess with colors of map
    counts = counts[~counts['geo_area_name_standard'].str.contains('N/A')]

    if counts.empty or counts['count'].sum() == 0:
        return EMPTY_MAP

    fig = px.choropleth(
        counts,
        geojson=world,
        locations='geo_area_code',
        featureidkey='properties.un_m49_numeric',
        color='count',
        color_continuous_scale='YlGnBu',
        hover_data=['geo_area_name_standard'],
        # Off-center to avoid a color fill issue with Antarctica
        # where the oceans get filled instead of the continent
        center=dict(lat=0, lon=0.01),
        title='Samples by geographic region',
    )

    hover_template = ('<b style="font-size: 125%;">%{customdata[0]}</b><br>'
                      '<b>Count:</b> %{z}<br>')
    fig.update_traces(hovertemplate=hover_template)

    colorbar_settings = dict(
        title='Count',
        yanchor='middle',
        y=0.5,
        len=1,
        lenmode='fraction',
        outlinecolor='black',
        outlinewidth=1,
        bgcolor='white',
        thickness=25,
        thicknessmode='pixels',
    )
    fig.update_layout(margin=dict(r=0, t=35, l=0, b=0),
                      coloraxis_colorbar=colorbar_settings)
    return fig
def totals_figure(data: CardLiveData, type_value: str, color_by_value: str) -> go.Figure:
    """
    Builds a horizontal bar chart of sample counts by the selected category,
    optionally colored by a second category.

    :param data: The data to count samples from.
    :param type_value: Key selecting the main category to count by.
    :param color_by_value: Key selecting the coloring category ('default' for none).
    :return: A plotly figure (EMPTY_FIGURE when data is empty).
    """
    type_col = TOTALS_COLUMN_SELECT_NAMES[type_value]
    color_col = TOTALS_COLUMN_SELECT_NAMES[color_by_value]

    # Only count by a second column when it is distinct and actually selected.
    count_by_columns = [type_col]
    if type_col != color_col and color_by_value != 'default':
        count_by_columns.append(color_col)

    if data.empty:
        return EMPTY_FIGURE

    totals_df = data.sample_counts(count_by_columns).reset_index()
    type_col_name = TOTALS_COLUMN_DATAFRAME_NAMES[type_value]
    color_col_name = TOTALS_COLUMN_DATAFRAME_NAMES[color_by_value]

    # Order bars (and color categories, if any) by their total sample counts.
    category_orders = order_categories(totals_df, type_col_name,
                                       by_sum=True, sum_col='count')
    if color_by_value != 'default':
        category_orders.update(
            order_categories(totals_df, color_col_name,
                             by_sum=True, sum_col='count'))

    fig = px.bar(
        totals_df,
        y=type_col_name,
        x='count',
        color=color_col_name,
        height=get_figure_height(len(totals_df[type_col_name].unique())),
        category_orders=category_orders,
        labels={
            'count': 'Samples count',
            'geo_area_name_standard': 'Geographic region',
            'lmat_taxonomy': 'Organism',
            'rgi_kmer_taxonomy': 'Organism'
        },
        title=TOTALS_FIGURE_TITLES[type_value],
    )
    fig.update_layout(font={'size': 14},
                      yaxis={'title': '', 'ticksuffix': TICKSPACE})
    return fig
def read_data(self, input_files: list = None) -> CardLiveData:
    """
    Reads in the data and constructs a CardLiveData object.

    :param input_files: The (optional) list of input files. Leave as None to read from
                        the configured directory. The optional list is used so I don't have
                        to re-read the directory after running read_or_update_data().
    :return: The CardLiveData object.
    """
    if input_files is None:
        if not self._directory.exists():
            raise Exception(f'Data directory [card_live_dir={self._directory}] does not exist')
        # Restrict to regular files, sorted. A bare glob('*') also returns
        # subdirectories (which would crash open() below) in arbitrary
        # order, and would disagree with the file listing built by
        # read_or_update_data().
        input_files = sorted(p for p in Path(self._directory).glob('*') if p.is_file())

    # Load every JSON sample file, tagging each record with its filename
    # so it can serve as the index of the combined table.
    json_data = []
    for input_file in input_files:
        filename = path.basename(input_file)
        with open(input_file) as f:
            json_obj = json.load(f)
            json_obj['filename'] = filename
        json_data.append(json_obj)

    full_df = pd.json_normalize(json_data).set_index('filename')
    full_df = self._replace_empty_list_na(full_df, self.JSON_DATA_FIELDS)
    full_df = self._create_analysis_valid_column(full_df, self.JSON_DATA_FIELDS)
    full_df['timestamp'] = pd.to_datetime(full_df['timestamp'])

    # Split the combined table into the main table plus one expanded
    # table per analysis tool, each using its own NA placeholder.
    main_df = full_df.drop(columns=self.JSON_DATA_FIELDS)
    rgi_df = self._expand_column(full_df, 'rgi_main', na_char='n/a').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)
    rgi_kmer_df = self._expand_column(full_df, 'rgi_kmer', na_char='n/a').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)
    mlst_df = self._expand_column(full_df, 'mlst', na_char='-').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)
    lmat_df = self._expand_column(full_df, 'lmat', na_char='n/a').drop(
        columns=self.OTHER_TABLE_DROP_FIELDS)

    data = CardLiveData(main_df=main_df,
                        rgi_parser=RGIParser(rgi_df),
                        rgi_kmer_df=rgi_kmer_df,
                        mlst_df=mlst_df,
                        lmat_df=lmat_df)

    # apply data modifiers
    for modifier in self._data_modifiers:
        data = modifier.modify(data)

    return data
def modify(self, data: CardLiveData) -> CardLiveData:
    """
    Adds standardized geographic region names (from the region codes service)
    to the main table, returning a new CardLiveData object.

    :param data: The data object to modify.
    :return: A new CardLiveData object with region name columns on main_df.
    """
    logger.debug(f'Main df before {data.main_df.copy()}')

    # Look up the standard region name for each 'geo_area_code' value.
    annotated_main = self._region_codes_service.add_region_standard_names(
        data.main_df.copy(), 'geo_area_code')
    logger.debug(f'Main df after {annotated_main}')

    return CardLiveData(main_df=annotated_main,
                        rgi_parser=RGIParser(data.rgi_df.copy()),
                        rgi_kmer_df=data.rgi_kmer_df.copy(),
                        mlst_df=data.mlst_df.copy(),
                        lmat_df=data.lmat_df.copy())
def modify(self, data: CardLiveData) -> CardLiveData:
    """
    Re-codes geographic area code 10 to an N/A code (-10) for samples whose
    timestamp predates the configured threshold, returning a new CardLiveData
    object. (Presumably code 10 was an invalid/placeholder value before the
    threshold date — TODO confirm against project history.)

    :param data: The data object to modify.
    :return: A new CardLiveData object with the affected codes replaced.
    """
    NA_CODE = -10

    main_df = data.main_df.copy()
    affected = ((main_df['geo_area_code'] == 10)
                & (main_df['timestamp'] < self._date_threshold))
    main_df.loc[affected, 'geo_area_code'] = NA_CODE

    return CardLiveData(main_df=main_df,
                        rgi_parser=RGIParser(data.rgi_df.copy()),
                        rgi_kmer_df=data.rgi_kmer_df.copy(),
                        mlst_df=data.mlst_df.copy(),
                        lmat_df=data.lmat_df.copy())
[ 'file1', 'Strict', 'class1; class2; class3', 'gene2', 'antibiotic inactivation', 'family2' ], [ 'file2', 'Perfect', 'class1; class2; class4', 'gene1', 'antibiotic efflux; antibiotic target alteration', 'family1' ], ['file3', None, None, None, None, None], ]).set_index('filename') RGI_PARSER = RGIParser(RGI_DF) DATA = CardLiveData(main_df=MAIN_DF, rgi_parser=RGI_PARSER, rgi_kmer_df=OTHER_DF, lmat_df=OTHER_DF, mlst_df=OTHER_DF) def test_select_by_time_keepall(): data = DATA start = datetime.strptime('2020-08-05 00:00:00', TIME_FMT) end = datetime.strptime('2020-08-08 00:00:00', TIME_FMT) assert 3 == len(data), 'Data not initialized to correct number of entries' data = data.select_by_time(start, end) assert 3 == len(data), 'Invalid number after selection' assert 3 == len(data.main_df), 'Invalid number after selection' assert {'file1', 'file2', 'file3'} == data.files(), 'Invalid files' assert 4 == len(data.rgi_parser.df_rgi), 'Invalid number after selection'