Python load_covid_data Examples, functions.shared_functions.load_covid_data Python Examples

Example #1

0

Show file

def scatter_g19_score(fc):
    """Scatter-plot the Covid score of the samples on one flow cell.

    Loads ``c19_read_counts.hdr.tsv`` for ``fc`` and plots ``Covid Score``
    against ``run_sample_id`` on a log y-axis, coloured by
    ``replicate_flags``.  When at least one sample id starts with ``G`` or
    ``H`` (but not ``Ht``) only those samples are plotted, sorted by score;
    otherwise every sample is plotted with a categorical x-axis.

    Args:
        fc: Flow cell / run identifier, or ``None`` when nothing is selected.

    Returns:
        The figure produced by ``sf.fix_plot``, or an empty dict when ``fc``
        is ``None``.
    """
    fig = {}
    if fc is None:
        # Nothing selected yet: hand back an empty figure.
        return fig

    counts = sf.load_covid_data([fc], 'c19_read_counts.hdr.tsv')
    # The small offset keeps zero ratios drawable on the log axis.
    counts['Covid Score'] = counts['covid_ratio'] + 0.000001
    # NOTE(review): this sums covid_ratio rather than a read count, and the
    # column is not used by the plot below -- confirm the intended column.
    counts['total reads'] = (counts['covid_ratio'] + counts['rnase_count'] +
                             counts['spikein_count'] + counts['unknown'])
    counts['replicate_flags'] = counts['replicate_flags'].replace(np.nan, "not available")

    sample_ids = counts.run_sample_id.str
    is_g_sample = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))
    g_samples = counts[is_g_sample].sort_values('Covid Score', ascending=False)

    if len(g_samples) > 0:
        plot_df = g_samples
        title = "Individual clinical samples for this FC and the flags for each replicate"
        force_category = False
    else:
        plot_df = counts
        title = "All Samples for this FC and the flags for each replicate"
        # some sample ids are long integers; show them as categories.
        force_category = True

    fig = sf.fix_plot(px.scatter(plot_df, color='replicate_flags',
                                 x='run_sample_id', y='Covid Score'))
    fig.update_layout(yaxis_type="log", title_text=title)
    if force_category:
        fig.update_xaxes(type='category')
    return fig

Example #2

0

Show file

def pos_g19ratio(fc_click, my_type, start_date, end_date):
    """Scatter-plot the median Covid ratio of one sample type over time.

    Only runs whose date lies strictly between ``start_date`` and
    ``end_date`` are loaded.  The run date is parsed from the first six
    characters of ``runid``.

    Args:
        fc_click: Trigger value; a falsy value returns an empty figure.
        my_type: Value of ``sample_type`` to keep.
        start_date: Exclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        The figure produced by ``sf.fix_plot``, or an empty dict when
        ``fc_click`` is falsy.
    """
    fig = {}
    if not fc_click:
        return fig

    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)

    # Restrict the download to runs inside the requested window.
    # Assumes runid starts with a YYMMDD date (the earlier code rebuilt it as
    # "MM-DD-YY" and relied on implicit parsing) -- confirm against sf.fc_list.
    fclist_df = pd.DataFrame(sf.fc_list(), columns=['runid'])
    fclist_df['date'] = pd.to_datetime(fclist_df.runid.str[:6], format='%y%m%d')
    in_window = ((fclist_df['date'] > window_start)
                 & (fclist_df['date'] < window_end))
    fclist = list(fclist_df.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')
    sample_df = calls[calls['sample_type'] == my_type]

    fig = sf.fix_plot(
        px.scatter(sample_df,
                   x='date',
                   y='median_covid_ratio',
                   color='flags'))
    fig.update_layout(yaxis_type="log",
                      title_text=my_type + " control samples per day")
    return fig

Example #3

0

Show file

def control_counts(fc_click, metric, cont_type, start_date, end_date):
    """Bar-chart per-day counts of one control type, split by flag.

    Only controls that share a tube rack with a sample whose id starts with
    ``G`` or ``H`` (but not ``Ht``) are counted.  For each date and flag the
    frame carries ``count``, the per-date ``total`` and ``percent``
    (``count / total`` rounded to 3 places); ``metric`` selects which column
    is drawn on the y-axis.

    Args:
        fc_click: Trigger value; a falsy value returns an empty figure.
        metric: Column of the summary frame to plot on the y-axis.
        cont_type: Value of ``sample_type`` to keep.
        start_date: Exclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        A plotly bar figure, or an empty dict when ``fc_click`` is falsy.
    """
    fig = {}
    if not fc_click:
        return fig

    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)

    # to make it faster only download the runs needed!
    # Assumes runid starts with a YYMMDD date -- confirm against sf.fc_list.
    fclist_df = pd.DataFrame(sf.fc_list(), columns=['runid'])
    fclist_df['date'] = pd.to_datetime(fclist_df.runid.str[:6], format='%y%m%d')
    in_window = ((fclist_df['date'] > window_start)
                 & (fclist_df['date'] < window_end))
    fclist = list(fclist_df.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')
    # .copy() so the tube_rack column is added to an independent frame
    # rather than to a slice of the loaded data.
    calls = calls[(calls['date'] > window_start)
                  & (calls['date'] < window_end)].copy()
    # tube rack is the part of pos_tube_rack before the first ":"
    calls['tube_rack'] = calls.pos_tube_rack.str.split(":").str[0]

    # starting October 2020, G or H and not Ht
    sample_ids = calls.run_sample_id.str
    is_g_sample = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))

    # use tube rack to get the controls that sit in plates with samples
    g_tubes = calls.loc[is_g_sample, 'tube_rack'].unique()
    g_calls = calls[calls['tube_rack'].isin(g_tubes)]
    controls = g_calls[g_calls['sample_type'] == cont_type]

    per_flag = (controls[['date', 'flags', 'run_sample_id']]
                .groupby(by=['date', 'flags']).count().reset_index()
                .rename(columns={'run_sample_id': 'count'}))
    per_day = (controls[['date', 'run_sample_id']]
               .groupby(by=['date']).count().reset_index()
               .rename(columns={'run_sample_id': 'total'}))

    summary = per_flag.merge(per_day, on='date')
    summary['percent'] = (summary['count'] / summary['total']).round(3)

    fig_title = cont_type + " control samples " + metric + " per day and the pass/fail flags"
    fig = px.bar(summary,
                 x="date",
                 y=metric,
                 color='flags',
                 hover_name='percent')
    fig = fig.update_traces(textposition='outside')
    fig = fig.update_layout(title_text=fig_title,
                            yaxis_title="total sample " + metric)
    return fig

Example #4

0

Show file

def percent_detect(fc_click, start_date, end_date):
    """Line-plot the daily percentage of detected and no-call replicates.

    For samples whose id starts with ``G`` or ``H`` (but not ``Ht``), the
    replicate counters are summed per date and expressed as a percentage of
    ``replicates_count``.

    Args:
        fc_click: Trigger value; a falsy value returns an empty figure.
        start_date: Exclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        A ``go.Figure`` with two traces, or an empty dict when ``fc_click``
        is falsy.
    """
    fig = {}
    if not fc_click:
        return fig

    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)

    # to make it faster only download the runs needed!
    # Assumes runid starts with a YYMMDD date -- confirm against sf.fc_list.
    fclist_df = pd.DataFrame(sf.fc_list(), columns=['runid'])
    fclist_df['date'] = pd.to_datetime(fclist_df.runid.str[:6], format='%y%m%d')
    in_window = ((fclist_df['date'] > window_start)
                 & (fclist_df['date'] < window_end))
    fclist = list(fclist_df.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')

    # starting October 2020, G or H and not Ht
    sample_ids = calls.run_sample_id.str
    is_g_sample = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))
    g_samples = calls[is_g_sample]

    replicate_cols = [
        'date', 'replicates_count', 'replicates_detected',
        'replicates_no_call', 'replicates_not_detected'
    ]
    daily = g_samples[replicate_cols].groupby(by=['date']).sum().reset_index()
    daily['detect_percent'] = (
        daily['replicates_detected'] / daily['replicates_count'] * 100)
    daily['no_call_percent'] = (
        daily['replicates_no_call'] / daily['replicates_count'] * 100)

    fig = go.Figure()
    for column, trace_name in (('detect_percent', 'percent detect'),
                               ('no_call_percent', 'percent no call')):
        fig.add_trace(
            go.Scatter(x=daily['date'],
                       y=daily[column],
                       mode='lines+markers',
                       name=trace_name))
    fig = fig.update_layout(
        yaxis_title='percent',
        title_text=
        "Percent of individual clinical samples with Covid detction or no call"
    )
    return fig

Example #5

0

Show file

File: make_mean.py Project: gh-bshih/g19_analysis

def get_discord_history(start_dt):
    """Summarise discordance percentages for all runs since ``start_dt``.

    Loads ``c19_call.hdr.tsv`` for every run dated strictly between
    ``start_dt`` and today, passes it through ``find_discord`` and, for the
    groups ``concordant``, ``discordant`` and ``one rep``, computes per
    (call, type, group) the row count, mean and standard deviation of
    ``Percent of Call`` and ``Percent of All (includes no call)``.

    Args:
        start_dt: Exclusive lower date bound, anything ``pd.to_datetime``
            accepts.

    Returns:
        DataFrame with columns ``call``, ``type``, ``group``, ``count``,
        ``call_mean``, ``all_mean``, ``call_stdv``, ``all_stdv``, rounded to
        4 decimals.

    Raises:
        KeyError: When no call data is loaded, because the empty table has
            no ``group`` column.
    """
    keys = ['call', 'type', 'group']
    window_start = pd.to_datetime(start_dt)
    window_end = pd.to_datetime(dt.today())

    # Assumes runid starts with a YYMMDD date -- confirm against fc_list.
    fclist_df = pd.DataFrame(fc_list(), columns=['runid'])
    fclist_df['date'] = pd.to_datetime(fclist_df.runid.str[:6], format='%y%m%d')
    in_window = ((fclist_df['date'] > window_start)
                 & (fclist_df['date'] < window_end))
    fclist = list(fclist_df.loc[in_window, 'runid'].unique())

    # now get just those runs
    discord = load_covid_data(fclist, 'c19_call.hdr.tsv')
    # DataFrame.append no longer exists in pandas 2.x; the table is used as is.
    historical = find_discord(discord) if len(discord) > 0 else pd.DataFrame()
    print(historical.shape, "biggest table")

    historical_only3 = historical[historical['group'].isin(
        ['concordant', 'discordant', 'one rep'])]

    historical_count = historical_only3.groupby(by=keys).count().reset_index()
    historical_count = historical_count[keys + ['Percent of Call']].rename(
        columns={'Percent of Call': 'count'})
    print(historical_count.shape, "count")

    historical_short = historical_only3[keys + [
        'Percent of Call', 'Percent of All (includes no call)'
    ]]
    grouped = historical_short.groupby(by=keys)
    historical_stdv = grouped.std().reset_index().rename(
        columns={
            'Percent of Call': 'call_stdv',
            'Percent of All (includes no call)': 'all_stdv'
        })
    historical_mean = grouped.mean().reset_index().rename(
        columns={
            'Percent of Call': 'call_mean',
            'Percent of All (includes no call)': 'all_mean'
        })

    # The three frames share only the key columns, so join on them explicitly.
    history = historical_count.merge(historical_mean, on=keys)
    history = history.merge(historical_stdv, on=keys).round(4)
    return history

Example #6

0

Show file

def total_read_bar(fc, metric, label):
    """Bar-chart one pool statistic per pooling position for a flow cell.

    Args:
        fc: Flow cell / run identifier, or ``None`` when nothing is selected.
        metric: Column of ``pool_stats.hdr.tsv`` to plot on the y-axis.
        label: Not used by this function.

    Returns:
        The figure produced by ``sf.fix_plot``, or an empty dict when ``fc``
        is ``None``.
    """
    if fc is None:
        # return empty figure if no FC
        return {}
    pool_stats = sf.load_covid_data([fc], 'pool_stats.hdr.tsv')
    fig = px.bar(pool_stats,
                 x='pos_pooling', y=metric, color='pos_pooling',
                 color_discrete_sequence=px.colors.sequential.Agsunset)
    fig = fig.update_layout(yaxis_title=metric)
    return sf.fix_plot(fig)

Example #7

0

Show file

def sample_plot(fc_click, metric, start_date, end_date):
    """Bar-chart per-day counts of clinical samples, split by flag.

    Samples are rows whose ``run_sample_id`` starts with ``G`` or ``H`` but
    not ``Ht``.  For each date and flag the summary frame carries ``count``,
    the per-date ``total`` and ``percent`` (``count / total`` rounded to 3
    places); ``metric`` selects which column is drawn on the y-axis.

    Args:
        fc_click: Trigger value; a falsy value returns an empty figure.
        metric: Column of the summary frame to plot on the y-axis.
        start_date: Exclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        A plotly bar figure, or an empty dict when ``fc_click`` is falsy.
    """
    fig = {}
    if not fc_click:
        return fig

    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)

    # to make it faster only download the runs needed!
    # Assumes runid starts with a YYMMDD date -- confirm against sf.fc_list.
    fclist_df = pd.DataFrame(sf.fc_list(), columns=['runid'])
    fclist_df['date'] = pd.to_datetime(fclist_df.runid.str[:6], format='%y%m%d')
    in_window = ((fclist_df['date'] > window_start)
                 & (fclist_df['date'] < window_end))
    fclist = list(fclist_df.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')

    # starting October 2020, G or H and not Ht
    sample_ids = calls.run_sample_id.str
    is_g_sample = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))
    g_samples = calls[is_g_sample]

    per_flag = (g_samples[['date', 'flags', 'run_sample_id']]
                .groupby(by=['date', 'flags']).count().reset_index()
                .rename(columns={'run_sample_id': 'count'}))
    per_day = (g_samples[['date', 'run_sample_id']]
               .groupby(by=['date']).count().reset_index()
               .rename(columns={'run_sample_id': 'total'}))

    summary = per_flag.merge(per_day, on='date')
    summary['percent'] = (summary['count'] / summary['total']).round(3)

    fig_title = "Clinical sample " + metric + " per day and the pass/fail flags"
    fig = px.bar(summary,
                 x="date",
                 y=metric,
                 color='flags',
                 hover_name='percent')
    fig.update_layout(title_text=fig_title)
    return fig

Example #8

0

Show file

def pool_map(fcs, count_format, label):
    """Draw an 8 x 12 annotated heatmap of a pool statistic per plate well.

    The well row (``A``-``H``) and column (``01``-``12``) are parsed from
    ``pos_pooling``; each cell shows the floored mean of ``count_format``
    over the rows that map to that well, and NaN where no row does.

    Args:
        fcs: Flow cell / run identifier, or ``None`` when nothing is selected.
        count_format: Column of ``pool_stats.hdr.tsv`` to average per well.
        label: Not used by this function.

    Returns:
        The annotated heatmap figure, or an empty dict when ``fcs`` is
        ``None`` or the loaded table has no rows.
    """
    fig = {}
    if fcs is None:
        # return empty figure if no FC
        return fig

    new_title = str(count_format) + " for each iPCR Pool"
    plate_rows_desc = sorted('ABCDEFGH', reverse=True)
    plate_cols = [str(n).zfill(2) for n in range(1, 13)]

    df = sf.load_covid_data([fcs], 'pool_stats.hdr.tsv')
    if df.empty:
        return fig

    # Positional lookup: works for single-row frames and non-default indexes.
    # Presumably every row of a run uses the same pos_pooling layout -- verify.
    first_pool = df.pos_pooling.iloc[0]
    if first_pool.startswith('PRP'):
        # split first on ":" and then on the letter / digit boundary;
        # these wells carry no leading zero, so pad the column to two digits.
        well = df['pos_pooling'].str.split(":").str[1]
        df['col'] = well.str.split('[A-Z]').str[1].str.rjust(2, "0")
        df['row'] = well.str.split('[0-9]').str[0]
    else:
        print(first_pool, 'dev')
        df['col'] = df.pos_pooling.str[-2:]
        df['row'] = df.pos_pooling.str[-3]

    # Mean per well, laid out as a full plate; wells without data become NaN.
    well_means = df.groupby(['row', 'col'])[count_format].mean().unstack('col')
    pm = np.floor(well_means.reindex(index=plate_rows_desc, columns=plate_cols))

    fig = ff.create_annotated_heatmap(x=plate_cols,
                                      y=plate_rows_desc,
                                      z=pm.values,
                                      colorscale='Viridis'
                                      )
    for annotation in fig.layout.annotations:
        annotation.font.size = 10
    fig.update_layout(title=new_title,
                      xaxis_title="iPCR Column",
                      yaxis_title="iPCR Row")
    return fig

Example #9

0

Show file

def read_type_bar(fc):
    """Stacked bar chart of the percent-of-reads columns per pooling position.

    Every column of ``pool_stats.hdr.tsv`` whose name contains ``percent``,
    except ``percent_usable_reads``, is melted against ``pos_pooling`` and
    drawn as one colour in the stack.

    Args:
        fc: Flow cell / run identifier, or ``None`` when nothing is selected.

    Returns:
        The figure produced by ``sf.fix_plot``, or an empty dict when ``fc``
        is ``None``.
    """
    if fc is None:
        # return empty figure if no FC
        return {}

    stack_colors = ['#273c75', '#00a8ff',
                    '#c23616', '#e84118',
                    '#7f8fa6', '#e1b12c',
                    '#fbc531']
    pool_stats = sf.load_covid_data([fc], 'pool_stats.hdr.tsv')
    # Filter instead of list.remove so a table without percent_usable_reads
    # does not raise ValueError.
    percent_cols = [name for name in pool_stats.columns
                    if 'percent' in name and name != 'percent_usable_reads']
    cols = percent_cols + ['pos_pooling']
    fig = px.bar(pool_stats[cols].melt(id_vars='pos_pooling'),
                 x='pos_pooling',
                 y='value',
                 color='variable',
                 color_discrete_sequence=stack_colors,
                 width=1200)
    fig = fig.update_layout(yaxis_title='Percent of Reads')
    return sf.fix_plot(fig)

Example #10

0

Show file

def find_discord(fc):
    """Build the discordance table for one flow cell.

    Loads ``c19_call.hdr.tsv`` for ``fc``, lets ``sf.find_discord`` compute
    the discordant-call table, and joins it to the historical means read from
    ``Discord_Means.csv`` on ``call``, ``type`` and ``group``.  The join is
    pandas' default (inner), so rows without history are not shown.

    Args:
        fc: Flow cell / run identifier, or ``None`` when nothing is selected.

    Returns:
        A ``dash_table.DataTable``; an empty one with id ``table`` when
        ``fc`` is ``None``.
    """
    if fc is None:
        # Nothing selected: an empty table with no columns and no rows.
        return dash_table.DataTable(id='table', columns=[], data=[])

    def text_col(col_id, name):
        return {'id': col_id, 'name': name, 'type': 'text'}

    def percent_col(col_id, name):
        return {
            'id': col_id,
            'name': name,
            'type': 'numeric',
            'format': FormatTemplate.percentage(1)
        }

    discord = sf.load_covid_data([fc], 'c19_call.hdr.tsv')
    dis = sf.find_discord(discord)
    history = pd.read_csv('Discord_Means.csv', index_col=0)
    # Avoid clashing with any per-run count column after the merge.
    history = history.rename(columns={'count': 'total_hist'})
    dis = dis.merge(history, on=['call', 'type', 'group'])

    # reformat to make pretty
    columns = [
        text_col('call', 'Final Call'),
        text_col('type', 'Singlicate Call'),
        text_col('group', 'Singlicates'),
        {'id': 'total_hist', 'name': 'Total History', 'type': 'numeric'},
        percent_col('all_mean', 'History Mean Percent of All'),
        percent_col('all_stdv', 'History StDv Percent of All'),
        percent_col('Percent of All (includes no call)',
                    'Percent of All (includes no call)'),
        percent_col('call_mean', 'History Mean Percent of Call'),
        percent_col('call_stdv', 'History StDv Percent of Call'),
        percent_col('Percent of Call', 'Percent of Call'),
        text_col('runid', 'runid'),
    ]
    return dash_table.DataTable(data=dis.to_dict('records'), columns=columns)

Example #11

0

Show file

def discordance_rate(fc_click, metric, group_list, start_date, end_date):
    """Bar-chart, per day, how many calls had N replicates in each state.

    For each date the calls are counted by their value of
    ``replicates_not_detected``, ``replicates_no_call`` and
    ``replicates_detected``; that value becomes ``group``.  The three counts
    are inner-joined on (date, group), divided by the number of calls that
    day, and limited to groups below 4 that appear in ``group_list``.

    Args:
        fc_click: Trigger value; a falsy value returns an empty figure.
        metric: Column of the merged frame to plot on the y-axis.
        group_list: Group labels (as strings) to keep.
        start_date: Exclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        A plotly bar figure, or an empty dict when ``fc_click`` is falsy.
    """
    fig = {}
    if not fc_click:
        return fig

    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)

    # Assumes runid starts with a YYMMDD date -- confirm against sf.fc_list.
    fclist_df = pd.DataFrame(sf.fc_list(), columns=['runid'])
    fclist_df['date'] = pd.to_datetime(fclist_df.runid.str[:6], format='%y%m%d')
    in_window = ((fclist_df['date'] > window_start)
                 & (fclist_df['date'] < window_end))
    fclist = list(fclist_df.loc[in_window, 'runid'].unique())

    # now get just those runs
    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')

    calls_all = (calls[['date', 'run_sample_id']]
                 .groupby(by=['date']).count().reset_index()
                 .rename(columns={'run_sample_id': 'total'}))

    def count_by(replicate_col, count_name):
        # Calls per (date, replicate count), with the count exposed as 'group'.
        return (calls[['date', replicate_col, 'run_sample_id']]
                .groupby(by=['date', replicate_col]).count().reset_index()
                .rename(columns={'run_sample_id': count_name,
                                 replicate_col: 'group'}))

    merge_calls = count_by('replicates_not_detected', 'count_not_detect')
    merge_calls = merge_calls.merge(
        count_by('replicates_no_call', 'count_no_call'), on=['date', 'group'])
    merge_calls = merge_calls.merge(
        count_by('replicates_detected', 'count_detects'), on=['date', 'group'])
    merge_calls = merge_calls.merge(calls_all, on=['date'])

    merge_calls['percent_not_detect'] = (
        merge_calls['count_not_detect'] / merge_calls['total'])
    merge_calls['percent_detect'] = (
        merge_calls['count_detects'] / merge_calls['total'])
    merge_calls['percent_no_call'] = (
        merge_calls['count_no_call'] / merge_calls['total'])

    # .copy() so the dtype change below is made on an independent frame.
    merge_calls = merge_calls[merge_calls['group'] < 4].copy()
    merge_calls['group'] = merge_calls['group'].astype(str)
    merge_calls = merge_calls[merge_calls.group.isin(group_list)]

    fig = px.bar(merge_calls, x='date', y=metric, color='group')
    return fig