def scatter_g19_score(fc):
    """Scatter-plot the Covid score of every sample in one run.

    Loads ``c19_read_counts.hdr.tsv`` for ``fc`` through
    ``sf.load_covid_data`` and plots ``Covid Score`` (``covid_ratio`` plus a
    1e-6 offset) against ``run_sample_id`` on a log y-axis, coloured by
    ``replicate_flags``.

    Samples whose id starts with ``G`` or ``H`` but not ``Ht`` are plotted,
    sorted by descending score.  When the run contains none of those, every
    sample is plotted instead (unsorted).

    Args:
        fc: Run id handed to ``sf.load_covid_data`` (presumably a flowcell
            id), or ``None`` when nothing is selected.

    Returns:
        The figure produced by ``sf.fix_plot``, or an empty dict when ``fc``
        is ``None``.
    """
    if fc is None:
        return {}

    counts = sf.load_covid_data([fc], 'c19_read_counts.hdr.tsv')
    # Small offset, presumably so zero ratios stay plottable on the log axis.
    counts['Covid Score'] = counts['covid_ratio'] + 0.000001
    # NOTE(review): this adds covid_ratio (a ratio) to read counts; a count
    # column may have been intended -- confirm before relying on the value.
    counts['total reads'] = (counts['covid_ratio'] + counts['rnase_count'] +
                             counts['spikein_count'] + counts['unknown'])
    counts['replicate_flags'] = counts['replicate_flags'].fillna(
        "not available")

    sample_ids = counts.run_sample_id.str
    is_clinical = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))
    g_samples = counts[is_clinical].sort_values('Covid Score',
                                                ascending=False)

    if len(g_samples) > 0:
        plot_df = g_samples
        title = ("Individual clinical samples for this FC and the flags "
                 "for each replicate")
    else:
        plot_df = counts
        title = "All Samples for this FC and the flags for each replicate"

    fig = sf.fix_plot(
        px.scatter(plot_df,
                   color='replicate_flags',
                   x='run_sample_id',
                   y='Covid Score'))
    fig.update_layout(yaxis_type="log", title_text=title)
    # Some sample ids are long integers; force a categorical axis so they are
    # not laid out numerically.
    fig.update_xaxes(type='category')
    return fig
def pos_g19ratio(fc_click, my_type, start_date, end_date):
    """Plot the median Covid ratio of one sample type over time.

    Only runs dated strictly between ``start_date`` and ``end_date`` are
    loaded.  The run date is taken from the first six characters of the run
    id, which are assumed to be ``YYMMDD``.

    Args:
        fc_click: Gate value; the plot is only built when this is truthy
            (presumably a Dash click counter -- confirm against caller).
        my_type: Value of ``sample_type`` to keep.
        start_date: Exclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        The figure produced by ``sf.fix_plot`` (log y-axis, coloured by
        ``flags``), or an empty dict when ``fc_click`` is falsy.
    """
    if not fc_click:
        return {}

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    # Restrict the download to the runs inside the requested window.
    runs = pd.DataFrame(sf.fc_list(), columns=['runid'])
    # Explicit format instead of casting a rebuilt "MM-DD-YY" string with
    # astype(np.datetime64), which newer pandas rejects.
    runs['date'] = pd.to_datetime(runs.runid.str[:6], format='%y%m%d')
    in_window = (runs['date'] > start) & (runs['date'] < end)
    fclist = list(runs.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')
    sample_df = calls[calls['sample_type'] == my_type]

    fig = sf.fix_plot(
        px.scatter(sample_df, x='date', y='median_covid_ratio',
                   color='flags'))
    fig_title = my_type + " control samples per day"
    fig.update_layout(yaxis_type="log", title_text=fig_title)
    return fig
def control_counts(fc_click, metric, cont_type, start_date, end_date):
    """Bar-plot per-day pass/fail flag tallies for one control sample type.

    Only controls that share a tube rack with a clinical sample are counted.
    Clinical samples are those whose ``run_sample_id`` starts with ``G`` or
    ``H`` but not ``Ht``; the tube rack is the part of ``pos_tube_rack``
    before the first ``:``.

    The per-day summary has the columns ``count`` (rows per date and flag),
    ``total`` (rows per date) and ``percent`` (``count / total`` rounded to
    three decimals).

    Args:
        fc_click: Gate value; the plot is only built when this is truthy
            (presumably a Dash click counter -- confirm against caller).
        metric: Summary column to put on the y-axis.
        cont_type: Value of ``sample_type`` selecting the control kind.
        start_date: Exclusive lower bound on the run date.
        end_date: Exclusive upper bound on the run date.

    Returns:
        A plotly express bar figure, or an empty dict when ``fc_click`` is
        falsy.
    """
    if not fc_click:
        return {}

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    # To make it faster, only download the runs inside the window.  Run ids
    # are assumed to begin with YYMMDD.
    runs = pd.DataFrame(sf.fc_list(), columns=['runid'])
    runs['date'] = pd.to_datetime(runs.runid.str[:6], format='%y%m%d')
    in_window = (runs['date'] > start) & (runs['date'] < end)
    fclist = list(runs.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')
    # Copy so that adding the tube_rack column does not write into a slice.
    calls = calls[(calls['date'] > start) & (calls['date'] < end)].copy()
    calls['tube_rack'] = calls.pos_tube_rack.str.split(":").str[0]

    # Clinical samples: starting October 2020, G or H and not Ht.
    sample_ids = calls.run_sample_id.str
    is_clinical = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))
    g_samples = calls[is_clinical]

    # Use the tube rack to keep only controls plated alongside samples.
    g_tubes = g_samples.tube_rack.unique()
    g_calls = calls[calls['tube_rack'].isin(g_tubes)]
    controls = g_calls[g_calls['sample_type'] == cont_type]

    per_flag = (controls.groupby(['date', 'flags'])['run_sample_id']
                .count().reset_index(name='count'))
    per_day = (controls.groupby('date')['run_sample_id']
               .count().reset_index(name='total'))
    summary = per_flag.merge(per_day, on='date')
    summary['percent'] = (summary['count'] / summary['total']).round(3)

    fig_title = (cont_type + " control samples " + metric +
                 " per day and the pass/fail flags")
    fig = px.bar(summary,
                 x="date",
                 y=metric,
                 color='flags',
                 hover_name='percent')
    fig.update_traces(textposition='outside')
    fig.update_layout(title_text=fig_title,
                      yaxis_title="total sample " + metric)
    return fig
def percent_detect(fc_click, start_date, end_date):
    """Plot the daily percentage of detected and no-call replicates.

    Clinical samples (``run_sample_id`` starting with ``G`` or ``H`` but not
    ``Ht``) from runs dated strictly between ``start_date`` and ``end_date``
    are summed per day.  Two lines are drawn:
    ``replicates_detected / replicates_count * 100`` and
    ``replicates_no_call / replicates_count * 100``.

    Args:
        fc_click: Gate value; the plot is only built when this is truthy
            (presumably a Dash click counter -- confirm against caller).
        start_date: Exclusive lower bound on the run date.
        end_date: Exclusive upper bound on the run date.

    Returns:
        A ``go.Figure`` with the two traces, or an empty dict when
        ``fc_click`` is falsy.
    """
    if not fc_click:
        return {}

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    # To make it faster, only download the runs inside the window.  Run ids
    # are assumed to begin with YYMMDD.
    runs = pd.DataFrame(sf.fc_list(), columns=['runid'])
    runs['date'] = pd.to_datetime(runs.runid.str[:6], format='%y%m%d')
    in_window = (runs['date'] > start) & (runs['date'] < end)
    fclist = list(runs.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')

    # Clinical samples: starting October 2020, G or H and not Ht.
    sample_ids = calls.run_sample_id.str
    is_clinical = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))

    replicate_cols = [
        'replicates_count', 'replicates_detected', 'replicates_no_call',
        'replicates_not_detected'
    ]
    daily = (calls[is_clinical].groupby('date')[replicate_cols]
             .sum().reset_index())
    daily['detect_percent'] = (daily['replicates_detected'] /
                               daily['replicates_count'] * 100)
    daily['no_call_percent'] = (daily['replicates_no_call'] /
                                daily['replicates_count'] * 100)

    fig = go.Figure()
    traces = (('detect_percent', 'percent detect'),
              ('no_call_percent', 'percent no call'))
    for column, trace_name in traces:
        fig.add_trace(
            go.Scatter(x=daily['date'],
                       y=daily[column],
                       mode='lines+markers',
                       name=trace_name))
    fig.update_layout(
        yaxis_title='percent',
        title_text=("Percent of individual clinical samples with Covid "
                    "detection or no call"))
    return fig
def get_discord_history(start_dt):
    """Summarise historical discordance percentages since ``start_dt``.

    Loads ``c19_call.hdr.tsv`` for every run dated strictly between
    ``start_dt`` and ``dt.today()``, passes the calls through
    ``find_discord`` and aggregates the rows whose ``group`` is
    ``concordant``, ``discordant`` or ``one rep``.

    Args:
        start_dt: Exclusive lower bound on the run date, anything
            ``pd.to_datetime`` accepts.

    Returns:
        A DataFrame with one row per ``(call, type, group)`` and the columns
        ``count``, ``call_mean``, ``all_mean``, ``call_stdv`` and
        ``all_stdv``, rounded to four decimals.
    """
    keys = ['call', 'type', 'group']
    percent_cols = ['Percent of Call', 'Percent of All (includes no call)']

    start = pd.to_datetime(start_dt)
    end = pd.to_datetime(dt.today())

    # Only load the runs inside the window.  Run ids are assumed to begin
    # with YYMMDD.
    runs = pd.DataFrame(fc_list(), columns=['runid'])
    runs['date'] = pd.to_datetime(runs.runid.str[:6], format='%y%m%d')
    in_window = (runs['date'] > start) & (runs['date'] < end)
    fclist = list(runs.loc[in_window, 'runid'].unique())

    discord = load_covid_data(fclist, 'c19_call.hdr.tsv')
    # DataFrame.append was removed in pandas 2.0; appending to an empty frame
    # only ever reproduced the discordance table, so use it directly.
    # NOTE(review): with no call rows this stays an empty frame and the
    # 'group' lookup below raises KeyError.
    historical = find_discord(discord) if len(discord) > 0 else pd.DataFrame()
    print(historical.shape, "biggest table")

    wanted_groups = ['concordant', 'discordant', 'one rep']
    selected = historical[historical['group'].isin(wanted_groups)]
    grouped = selected.groupby(by=keys)

    historical_count = (grouped['Percent of Call'].count()
                        .reset_index(name='count'))
    print(historical_count.shape, "count")

    historical_mean = grouped[percent_cols].mean().reset_index().rename(
        columns={
            'Percent of Call': 'call_mean',
            'Percent of All (includes no call)': 'all_mean'
        })
    historical_stdv = grouped[percent_cols].std().reset_index().rename(
        columns={
            'Percent of Call': 'call_stdv',
            'Percent of All (includes no call)': 'all_stdv'
        })

    history = historical_count.merge(historical_mean, on=keys)
    history = history.merge(historical_stdv, on=keys).round(4)
    return history
def total_read_bar(fc, metric, label):
    """Bar-plot one pool statistic per pooling position for a run.

    Args:
        fc: Run id handed to ``sf.load_covid_data``, or ``None`` when nothing
            is selected.
        metric: Column of ``pool_stats.hdr.tsv`` to plot; also used as the
            y-axis title.
        label: Unused; kept so existing callers keep working.

    Returns:
        The figure produced by ``sf.fix_plot``, or an empty dict when ``fc``
        is ``None``.
    """
    if fc is None:
        return {}  # empty figure if no FC

    pool_stats = sf.load_covid_data([fc], 'pool_stats.hdr.tsv')
    fig = px.bar(pool_stats,
                 x='pos_pooling',
                 y=metric,
                 color='pos_pooling',
                 color_discrete_sequence=px.colors.sequential.Agsunset)
    fig.update_layout(yaxis_title=metric)
    return sf.fix_plot(fig)
def sample_plot(fc_click, metric, start_date, end_date):
    """Bar-plot per-day pass/fail flag tallies for clinical samples.

    Clinical samples are those whose ``run_sample_id`` starts with ``G`` or
    ``H`` but not ``Ht``.  The per-day summary has the columns ``count``
    (rows per date and flag), ``total`` (rows per date) and ``percent``
    (``count / total`` rounded to three decimals).

    Args:
        fc_click: Gate value; the plot is only built when this is truthy
            (presumably a Dash click counter -- confirm against caller).
        metric: Summary column to put on the y-axis.
        start_date: Exclusive lower bound on the run date.
        end_date: Exclusive upper bound on the run date.

    Returns:
        A plotly express bar figure, or an empty dict when ``fc_click`` is
        falsy.
    """
    if not fc_click:
        return {}

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    # To make it faster, only download the runs inside the window.  Run ids
    # are assumed to begin with YYMMDD.
    runs = pd.DataFrame(sf.fc_list(), columns=['runid'])
    runs['date'] = pd.to_datetime(runs.runid.str[:6], format='%y%m%d')
    in_window = (runs['date'] > start) & (runs['date'] < end)
    fclist = list(runs.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')

    # Clinical samples: starting October 2020, G or H and not Ht.
    sample_ids = calls.run_sample_id.str
    is_clinical = (sample_ids.startswith(('G', 'H'), na=False)
                   & ~sample_ids.startswith('Ht', na=False))
    g_samples = calls[is_clinical]

    per_flag = (g_samples.groupby(['date', 'flags'])['run_sample_id']
                .count().reset_index(name='count'))
    per_day = (g_samples.groupby('date')['run_sample_id']
               .count().reset_index(name='total'))
    summary = per_flag.merge(per_day, on='date')
    summary['percent'] = (summary['count'] / summary['total']).round(3)

    fig_title = ("Clinical sample " + metric +
                 " per day and the pass/fail flags")
    fig = px.bar(summary,
                 x="date",
                 y=metric,
                 color='flags',
                 hover_name='percent')
    fig.update_layout(title_text=fig_title)
    return fig
def pool_map(fcs, count_format, label):
    """Draw an 8x12 plate heatmap of a pool statistic for one run.

    Each ``pos_pooling`` value is split into a plate row (``A``-``H``) and a
    zero-padded column (``01``-``12``).  The mean of ``count_format`` per
    well is floored and shown as an annotated heatmap with row ``A`` at the
    top; wells without data are NaN.

    Args:
        fcs: Run id handed to ``sf.load_covid_data``, or ``None`` when
            nothing is selected.
        count_format: Column of ``pool_stats.hdr.tsv`` to average per well;
            also used in the title.
        label: Unused; kept so existing callers keep working.

    Returns:
        The annotated heatmap figure, or an empty dict when ``fcs`` is
        ``None``.
    """
    if fcs is None:
        return {}  # empty figure if no FC

    new_title = str(count_format) + " for each iPCR Pool"
    rows = list('ABCDEFGH')
    rows_desc = sorted(rows, reverse=True)
    cols = [str(n).zfill(2) for n in range(1, 13)]

    df = sf.load_covid_data([fcs], 'pool_stats.hdr.tsv')
    # NOTE(review): the naming scheme is detected from index label 1, not the
    # first row; kept as-is in case row 0 is special -- confirm.
    if df.pos_pooling[1].startswith('PRP'):
        # Well is the part after ':' (e.g. "B7"); these ids carry no leading
        # zero, so pad the column to match `cols`.
        well = df['pos_pooling'].str.split(":").str[1]
        df['col'] = well.str.split('[A-Z]').str[1].str.rjust(2, "0")
        df['row'] = well.str.split('[0-9]').str[0]
    else:
        print(df.pos_pooling[1], 'dev')
        # Well is the last three characters: row letter + two-digit column.
        df['col'] = df.pos_pooling.str[-2:]
        df['row'] = df.pos_pooling.str[-3]

    # One mean per well, laid out on the full plate grid.
    well_means = df.groupby(['row', 'col'])[count_format].mean()
    pm = (well_means.unstack('col')
          .reindex(index=rows_desc, columns=cols)
          .astype(float))
    pm = np.floor(pm)

    fig = ff.create_annotated_heatmap(x=cols,
                                      y=rows_desc,
                                      z=pm.values,
                                      colorscale='Viridis')
    for annotation in fig.layout.annotations:
        annotation.font.size = 10
    fig.update_layout(title=new_title,
                      xaxis_title="iPCR Column",
                      yaxis_title="iPCR Row")
    return fig
def read_type_bar(fc):
    """Stacked bar plot of the read-type percentages per pooling position.

    Every column of ``pool_stats.hdr.tsv`` whose name contains ``percent`` is
    plotted, except ``percent_usable_reads``.

    Args:
        fc: Run id handed to ``sf.load_covid_data``, or ``None`` when nothing
            is selected.

    Returns:
        The figure produced by ``sf.fix_plot``, or an empty dict when ``fc``
        is ``None``.
    """
    if fc is None:
        return {}  # empty figure if no FC

    stack_colors = [
        '#273c75', '#00a8ff', '#c23616', '#e84118', '#7f8fa6', '#e1b12c',
        '#fbc531'
    ]
    sub_df = sf.load_covid_data([fc], 'pool_stats.hdr.tsv')
    # Filter instead of list.remove() so a file without the
    # percent_usable_reads column does not raise ValueError.
    percent_cols = [
        name for name in sub_df.columns
        if 'percent' in name and name != 'percent_usable_reads'
    ]
    long_df = sub_df[percent_cols + ['pos_pooling']].melt(
        id_vars='pos_pooling')

    fig = px.bar(long_df,
                 x='pos_pooling',
                 y='value',
                 color='variable',
                 color_discrete_sequence=stack_colors,
                 width=1200)
    fig.update_layout(yaxis_title='Percent of Reads')
    return sf.fix_plot(fig)
def find_discord(fc):
    '''
    Build the discordance table for one run, next to historical values.

    The run's calls (``c19_call.hdr.tsv``) are summarised by
    ``sf.find_discord`` and inner-joined on ``call``/``type``/``group`` with
    the historical statistics read from ``Discord_Means.csv`` (resolved
    against the current working directory).

    Inputs: fc -- run id, or None when nothing is selected
    Result: a ``dash_table.DataTable``; an empty table with id ``table``
            when ``fc`` is None
    '''
    if fc is None:
        # Same as building the table from an empty DataFrame.
        return dash_table.DataTable(id='table', columns=[], data=[])

    discord = sf.load_covid_data([fc], 'c19_call.hdr.tsv')
    dis = sf.find_discord(discord)
    history = pd.read_csv('Discord_Means.csv', index_col=0)
    history = history.rename(columns={'count': 'total_hist'})
    dis = dis.merge(history, on=['call', 'type', 'group'])

    # (column id, display name) pairs, in display order.
    text_cols = [
        ('call', 'Final Call'),
        ('type', 'Singlicate Call'),
        ('group', 'Singlicates'),
    ]
    percent_cols = [
        ('all_mean', 'History Mean Percent of All'),
        ('all_stdv', 'History StDv Percent of All'),
        ('Percent of All (includes no call)',
         'Percent of All (includes no call)'),
        ('call_mean', 'History Mean Percent of Call'),
        ('call_stdv', 'History StDv Percent of Call'),
        ('Percent of Call', 'Percent of Call'),
    ]
    columns = [{
        'id': col_id,
        'name': col_name,
        'type': 'text'
    } for col_id, col_name in text_cols]
    columns.append({
        'id': 'total_hist',
        'name': 'Total History',
        'type': 'numeric',
    })
    columns.extend({
        'id': col_id,
        'name': col_name,
        'type': 'numeric',
        'format': FormatTemplate.percentage(1)
    } for col_id, col_name in percent_cols)
    columns.append({'id': 'runid', 'name': 'runid', 'type': 'text'})

    # NOTE(review): unlike the empty table, this one is created without
    # id='table' -- confirm whether callers rely on the id.
    dis_table = dash_table.DataTable(data=dis.to_dict('records'),
                                     columns=columns)
    return dis_table
def discordance_rate(fc_click, metric, group_list, start_date, end_date):
    """Bar-plot, per day, how many samples had N replicates in each state.

    For each day and each replicate count N (the ``group``), samples are
    counted by ``replicates_not_detected``, ``replicates_no_call`` and
    ``replicates_detected`` equal to N.  The three tallies are inner-joined
    on ``(date, group)``, so only groups present in all three survive, and
    each is divided by the day's total sample count.

    Args:
        fc_click: Gate value; the plot is only built when this is truthy
            (presumably a Dash click counter -- confirm against caller).
        metric: Column to put on the y-axis: one of the ``count_*`` or
            ``percent_*`` columns, or ``total``.
        group_list: Group labels to keep, compared against ``str(group)``.
        start_date: Exclusive lower bound on the run date.
        end_date: Exclusive upper bound on the run date.

    Returns:
        A plotly express bar figure coloured by ``group``, or an empty dict
        when ``fc_click`` is falsy.
    """
    if not fc_click:
        return {}

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    # Only load the runs inside the window.  Run ids are assumed to begin
    # with YYMMDD.
    runs = pd.DataFrame(sf.fc_list(), columns=['runid'])
    runs['date'] = pd.to_datetime(runs.runid.str[:6], format='%y%m%d')
    in_window = (runs['date'] > start) & (runs['date'] < end)
    fclist = list(runs.loc[in_window, 'runid'].unique())

    calls = sf.load_covid_data(fclist, 'c19_call.hdr.tsv')
    calls['date'] = pd.to_datetime(calls.runid.str[:6], format='%y%m%d')

    def _tally(replicate_col, count_name):
        # Samples per (date, replicate count), with the count as 'group'.
        return (calls.groupby(['date', replicate_col])['run_sample_id']
                .count().reset_index(name=count_name)
                .rename(columns={replicate_col: 'group'}))

    calls_all = (calls.groupby('date')['run_sample_id']
                 .count().reset_index(name='total'))

    merge_calls = _tally('replicates_not_detected', 'count_not_detect')
    merge_calls = merge_calls.merge(
        _tally('replicates_no_call', 'count_no_call'), on=['date', 'group'])
    merge_calls = merge_calls.merge(
        _tally('replicates_detected', 'count_detects'), on=['date', 'group'])
    merge_calls = merge_calls.merge(calls_all, on=['date'])

    merge_calls['percent_not_detect'] = (merge_calls['count_not_detect'] /
                                         merge_calls['total'])
    merge_calls['percent_detect'] = (merge_calls['count_detects'] /
                                     merge_calls['total'])
    merge_calls['percent_no_call'] = (merge_calls['count_no_call'] /
                                      merge_calls['total'])

    # Copy so that re-typing the group column does not write into a slice.
    merge_calls = merge_calls[merge_calls['group'] < 4].copy()
    merge_calls['group'] = merge_calls['group'].astype(str)
    merge_calls = merge_calls[merge_calls.group.isin(group_list)]

    fig = px.bar(merge_calls, x='date', y=metric, color='group')
    return fig