Example #1
def apply_filters(data: CardLiveData, rgi_cutoff_select: str,
                  drug_classes: List[str], amr_gene_families: List[str],
                  resistance_mechanisms: List[str], amr_genes: List[str],
                  custom_date: Dict[str, datetime]) -> Dict[str, CardLiveData]:
    time_now = datetime.now()

    data = data.select(table='rgi', by='cutoff', type='row', level=rgi_cutoff_select) \
        .select(table='rgi', by='drug', type='file', elements=drug_classes) \
        .select(table='rgi', by='amr_gene_family', type='file', elements=amr_gene_families) \
        .select(table='rgi', by='resistance_mechanism', type='file', elements=resistance_mechanisms) \
        .select(table='rgi', by='amr_gene', type='file', elements=amr_genes)

    time_subsets = {
        'all': data,
        'day': data.select(table='main', by='time', start=time_now - DAY, end=time_now),
        'week': data.select(table='main', by='time', start=time_now - WEEK, end=time_now),
        'month': data.select(table='main', by='time', start=time_now - MONTH, end=time_now),
        '3 months': data.select(table='main', by='time', start=time_now - THREE_MONTHS, end=time_now),
        '6 months': data.select(table='main', by='time', start=time_now - SIX_MONTHS, end=time_now),
        'year': data.select(table='main', by='time', start=time_now - YEAR, end=time_now),
    }

    if custom_date is not None:
        time_subsets['custom'] = data.select(table='main', by='time', start=custom_date['start'],
                                             end=custom_date['end'])
    else:
        time_subsets['custom'] = data

    return time_subsets
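The time-window constants (DAY, WEEK, MONTH, THREE_MONTHS, SIX_MONTHS, YEAR) are not shown in this excerpt; since they are subtracted from datetime.now(), they are presumably timedelta values. A minimal sketch of plausible definitions (the real constants may use different month/year lengths):

from datetime import timedelta

# Hypothetical definitions; the actual constants may differ.
DAY = timedelta(days=1)
WEEK = timedelta(weeks=1)
MONTH = timedelta(days=30)
THREE_MONTHS = timedelta(days=90)
SIX_MONTHS = timedelta(days=180)
YEAR = timedelta(days=365)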
Example #2
    def read_or_update_data(self,
                            existing_data: CardLiveData = None
                            ) -> CardLiveData:
        """
        Given an existing data object, updates the data object with any new files.
        :param existing_data: The existing data object (None if all data should be read).
        :return: The original (unmodified) data object if no updates, otherwise a new data object with additional data.
        """
        # Check the data directory up front so a missing directory raises a clear
        # error instead of an OSError from listdir() below.
        if not self._directory.exists():
            raise Exception(
                f'Data directory [card_live_dir={self._directory}] does not exist'
            )

        input_files = [
            Path(self._directory) / f for f in listdir(self._directory)
            if path.isfile(Path(self._directory) / f)
        ]
        input_files.sort()

        if existing_data is None:
            return self.read_data(input_files)
        else:
            existing_files = existing_data.files()
            input_files_set = {p.name for p in input_files}

            files_new = input_files_set - existing_files

            # If no new files have been found
            if len(files_new) == 0:
                logger.debug(
                    f'Data has not changed from {len(input_files_set)} samples, not updating'
                )
                return existing_data
            else:
                logger.info(f'{len(files_new)} additional samples found.')
                return self.read_data(input_files)
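A caller would typically invoke read_or_update_data() on a schedule and keep whatever object comes back, since the same object is returned when nothing has changed. A hypothetical polling loop, assuming `loader` is the object exposing this method and the interval is illustrative only:

import time

def poll_for_updates(loader, interval_seconds: int = 60) -> None:
    """Hypothetical polling loop; `loader` is any object exposing read_or_update_data()."""
    data = loader.read_or_update_data()              # initial full read
    while True:
        time.sleep(interval_seconds)
        data = loader.read_or_update_data(data)      # same object returned if nothing changed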
Example #3
    def modify(self, data: CardLiveData) -> CardLiveData:
        logger.debug(f'Main df before {data.main_df}')
        taxonomy_parser = TaxonomicParser(ncbi_taxa_file=self._ncbi_taxa_file,
                                          df_rgi_kmer=data.rgi_kmer_df,
                                          df_lmat=data.lmat_df)
        matches_df = taxonomy_parser.create_file_matches().rename(
            columns={
                'lmat.taxonomy_label': 'lmat_taxonomy',
                'rgi_kmer.taxonomy_label': 'rgi_kmer_taxonomy'
            })
        matches_df = matches_df.drop(columns=['matches'])
        main_df = data.main_df.merge(matches_df,
                                     left_index=True,
                                     right_index=True,
                                     how='left')
        logger.debug(f'Main df after {main_df}')

        rgi_df = data.rgi_df.copy()
        rgi_kmer_df = data.rgi_kmer_df.copy()
        lmat_df = data.lmat_df.copy()
        mlst_df = data.mlst_df.copy()

        return CardLiveData(main_df=main_df,
                            rgi_parser=RGIParser(rgi_df),
                            rgi_kmer_df=rgi_kmer_df,
                            mlst_df=mlst_df,
                            lmat_df=lmat_df)
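The merge above is index-aligned (filename index on both frames) and uses how='left', so files without a taxonomy match are kept rather than dropped. A toy pandas illustration of that behaviour, using made-up data:

import pandas as pd

# Every row of the left frame is kept; files with no taxonomy match get NaN.
main = pd.DataFrame({'geo_area_code': [124, 840]}, index=['file1', 'file2'])
matches = pd.DataFrame({'lmat_taxonomy': ['Salmonella enterica']}, index=['file1'])

merged = main.merge(matches, left_index=True, right_index=True, how='left')
print(merged)  # 'file2' remains, with lmat_taxonomy == NaN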
def choropleth_drug(data: CardLiveData, world: geopandas.GeoDataFrame):
    df_geo = data.sample_counts(['geo_area_code',
                                 'geo_area_name_standard']).reset_index()

    # Remove 'N/A' region names so they don't skew the color scale of the map
    df_geo = df_geo[~df_geo['geo_area_name_standard'].str.contains('N/A')]

    if df_geo.empty or df_geo['count'].sum() == 0:
        fig = EMPTY_MAP
    else:
        fig = px.choropleth(
            df_geo,
            geojson=world,
            locations='geo_area_code',
            featureidkey='properties.un_m49_numeric',
            color='count',
            color_continuous_scale='YlGnBu',
            hover_data=['geo_area_name_standard'],

            # Off-center to avoid a color fill issue with Antarctica
            # where the oceans get filled instead of the continent
            center={
                'lat': 0,
                'lon': 0.01
            },
            title='Samples by geographic region',
        )

        fig.update_traces(hovertemplate=(
            '<b style="font-size: 125%;">%{customdata[0]}</b><br>'
            '<b>Count:</b>  %{z}<br>'))

    fig.update_layout(
        margin={
            "r": 0,
            "t": 35,
            "l": 0,
            "b": 0
        },
        coloraxis_colorbar=dict(
            title='Count',
            yanchor='middle',
            y=0.5,
            len=1,
            lenmode='fraction',
            outlinecolor='black',
            outlinewidth=1,
            bgcolor='white',
            thickness=25,
            thicknessmode='pixels',
        ),
    )

    return fig
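The str.contains('N/A') filter above drops any region whose standardized name contains 'N/A' before the counts are plotted. A toy pandas illustration with made-up rows:

import pandas as pd

# Rows whose standard region name contains 'N/A' are removed before plotting.
df_geo = pd.DataFrame({'geo_area_name_standard': ['Canada', 'N/A'],
                       'count': [5, 2]})
df_geo = df_geo[~df_geo['geo_area_name_standard'].str.contains('N/A')]
print(df_geo)  # only the 'Canada' row remains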
def totals_figure(data: CardLiveData, type_value: str,
                  color_by_value: str) -> go.Figure:
    type_col = TOTALS_COLUMN_SELECT_NAMES[type_value]
    color_col = TOTALS_COLUMN_SELECT_NAMES[color_by_value]
    if type_col == color_col or color_by_value == 'default':
        count_by_columns = [type_col]
    else:
        count_by_columns = [type_col, color_col]

    if data.empty:
        fig = EMPTY_FIGURE
    else:
        totals_df = data.sample_counts(count_by_columns).reset_index()

        type_col_name = TOTALS_COLUMN_DATAFRAME_NAMES[type_value]
        color_col_name = TOTALS_COLUMN_DATAFRAME_NAMES[color_by_value]

        category_orders = order_categories(totals_df,
                                           type_col_name,
                                           by_sum=True,
                                           sum_col='count')
        if color_by_value != 'default':
            category_orders.update(
                order_categories(totals_df,
                                 color_col_name,
                                 by_sum=True,
                                 sum_col='count'))

        fig = px.bar(
            totals_df,
            y=type_col_name,
            x='count',
            color=color_col_name,
            height=get_figure_height(len(totals_df[type_col_name].unique())),
            category_orders=category_orders,
            labels={
                'count': 'Samples count',
                'geo_area_name_standard': 'Geographic region',
                'lmat_taxonomy': 'Organism',
                'rgi_kmer_taxonomy': 'Organism'
            },
            title=TOTALS_FIGURE_TITLES[type_value],
        )
        fig.update_layout(font={'size': 14},
                          yaxis={
                              'title': '',
                              'ticksuffix': TICKSPACE
                          })

    return fig
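order_categories() is not shown in this excerpt; from its use with category_orders it presumably returns a dict mapping a column name to its category values ordered by summed counts, which is the format plotly express expects. A sketch of that idea (the real helper may differ):

import pandas as pd

# Order a column's values by their summed counts, descending, and return the
# {column: [ordered values]} dict accepted by plotly's category_orders.
def order_categories_by_sum(df: pd.DataFrame, col: str, sum_col: str = 'count') -> dict:
    ordered = (df.groupby(col)[sum_col].sum()
                 .sort_values(ascending=False)
                 .index
                 .tolist())
    return {col: ordered}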
    def read_data(self, input_files: list = None) -> CardLiveData:
        """
        Reads in the data and constructs a CardLiveData object.
        :param input_files: The (optional) list of input files. Leave as None to read from the configured directory.
                            Passing an explicit list avoids re-reading the directory after read_or_update_data() has already listed the files.
        :return: The CardLiveData object.
        """
        if input_files is None:
            if not self._directory.exists():
                raise Exception(f'Data directory [card_live_dir={self._directory}] does not exist')
            else:
                input_files = list(Path(self._directory).glob('*'))

        json_data = []
        for input_file in input_files:
            filename = path.basename(input_file)
            with open(input_file) as f:
                json_obj = json.load(f)
                json_obj['filename'] = filename
                json_data.append(json_obj)

        full_df = pd.json_normalize(json_data).set_index('filename')
        full_df = self._replace_empty_list_na(full_df, self.JSON_DATA_FIELDS)
        full_df = self._create_analysis_valid_column(full_df, self.JSON_DATA_FIELDS)
        full_df['timestamp'] = pd.to_datetime(full_df['timestamp'])

        main_df = full_df.drop(columns=self.JSON_DATA_FIELDS)
        rgi_df = self._expand_column(full_df, 'rgi_main', na_char='n/a').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)
        rgi_kmer_df = self._expand_column(full_df, 'rgi_kmer', na_char='n/a').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)
        mlst_df = self._expand_column(full_df, 'mlst', na_char='-').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)
        lmat_df = self._expand_column(full_df, 'lmat', na_char='n/a').drop(
            columns=self.OTHER_TABLE_DROP_FIELDS)

        data = CardLiveData(main_df=main_df,
                            rgi_parser=RGIParser(rgi_df),
                            rgi_kmer_df=rgi_kmer_df,
                            mlst_df=mlst_df,
                            lmat_df=lmat_df)

        # apply data modifiers
        for modifier in self._data_modifiers:
            data = modifier.modify(data)

        return data
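The final loop applies each configured data modifier in sequence, so every modifier must accept a CardLiveData and return one. A sketch of that implied contract (the base class name here is an assumption, not necessarily the project's API):

import abc

# Each modifier receives a CardLiveData and returns a (possibly new) CardLiveData.
class DataModifier(abc.ABC):

    @abc.abstractmethod
    def modify(self, data: 'CardLiveData') -> 'CardLiveData':
        """Return a CardLiveData with this modifier's changes applied."""
        ...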
    def modify(self, data: CardLiveData) -> CardLiveData:
        main_df = data.main_df.copy()
        logger.debug(f'Main df before {main_df}')
        main_df = self._region_codes_service.add_region_standard_names(
            main_df, 'geo_area_code')
        logger.debug(f'Main df after {main_df}')

        rgi_df = data.rgi_df.copy()
        rgi_kmer_df = data.rgi_kmer_df.copy()
        lmat_df = data.lmat_df.copy()
        mlst_df = data.mlst_df.copy()

        return CardLiveData(main_df=main_df,
                            rgi_parser=RGIParser(rgi_df),
                            rgi_kmer_df=rgi_kmer_df,
                            mlst_df=mlst_df,
                            lmat_df=lmat_df)
Example #8
    def modify(self, data: CardLiveData) -> CardLiveData:
        na_code = -10

        main_df = data.main_df.copy()
        main_df.loc[(main_df['geo_area_code'] == 10) &
                    (main_df['timestamp'] < self._date_threshold),
                    'geo_area_code'] = na_code

        rgi_df = data.rgi_df.copy()
        rgi_kmer_df = data.rgi_kmer_df.copy()
        lmat_df = data.lmat_df.copy()
        mlst_df = data.mlst_df.copy()

        return CardLiveData(main_df=main_df,
                            rgi_parser=RGIParser(rgi_df),
                            rgi_kmer_df=rgi_kmer_df,
                            mlst_df=mlst_df,
                            lmat_df=lmat_df)
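The masked .loc assignment rewrites geo_area_code only for rows that satisfy both conditions (code 10 and a timestamp older than the threshold). A toy pandas illustration with made-up data:

import pandas as pd
from datetime import datetime

# Only rows matching both conditions have geo_area_code replaced with the sentinel.
na_code = -10
date_threshold = datetime(2020, 1, 1)
main_df = pd.DataFrame({'geo_area_code': [10, 10, 124],
                        'timestamp': pd.to_datetime(['2019-06-01', '2020-06-01', '2019-06-01'])})

main_df.loc[(main_df['geo_area_code'] == 10) &
            (main_df['timestamp'] < date_threshold),
            'geo_area_code'] = na_code
print(main_df)  # only the first row is remapped to -10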
        [
            'file1', 'Strict', 'class1; class2; class3', 'gene2',
            'antibiotic inactivation', 'family2'
        ],
        [
            'file2', 'Perfect', 'class1; class2; class4', 'gene1',
            'antibiotic efflux; antibiotic target alteration', 'family1'
        ],
        ['file3', None, None, None, None, None],
    ]).set_index('filename')

RGI_PARSER = RGIParser(RGI_DF)

DATA = CardLiveData(main_df=MAIN_DF,
                    rgi_parser=RGI_PARSER,
                    rgi_kmer_df=OTHER_DF,
                    lmat_df=OTHER_DF,
                    mlst_df=OTHER_DF)


def test_select_by_time_keepall():
    data = DATA
    start = datetime.strptime('2020-08-05 00:00:00', TIME_FMT)
    end = datetime.strptime('2020-08-08 00:00:00', TIME_FMT)

    assert 3 == len(data), 'Data not initialized to correct number of entries'
    data = data.select_by_time(start, end)
    assert 3 == len(data), 'Invalid number after selection'
    assert 3 == len(data.main_df), 'Invalid number after selection'
    assert {'file1', 'file2', 'file3'} == data.files(), 'Invalid files'
    assert 4 == len(data.rgi_parser.df_rgi), 'Invalid number after selection'