def main():
    '''
    Usage: compute_gr_metrics.py input.tsv > output.tsv

    Compute Growth Response metrics for precomputed GR dose-response data in
    TSV format.

    The input tsv file must meet the following requirements:

    - The first row contains column names.
    - The 'concentration' column contains the numeric dose of a perturbagen.
    - The 'GRvalue' column contains the GR value for that condition.

    The input file may also have other "key" columns to distinguish multiple
    separate dose-response experiments within the same file.

    The GR value may be computed from raw cell counts using the
    `add_gr_column.py` script. The columns required as input for that script
    will be ignored by this one, and as such the output of `add_gr_column.py`
    may be piped directly to the input of this script.

    The output tsv will have all key columns in addition to a column for each
    GR metric. (See the documentation of gr50.gr_metrics for details)
    '''
    # Minimal hand-rolled help handling: show the (dedented) docstring and
    # bail out before touching stdin.
    if '-h' in sys.argv or '--help' in sys.argv:
        # print() calls (not py2 print statements) keep this file valid under
        # Python 3 and consistent with the other functions in this file.
        print(textwrap.dedent(main.__doc__))
        return
    # fileinput reads from the filename args (or stdin when none are given);
    # LineReader adapts that stream for pandas. mode='rb' hands pandas raw
    # bytes — presumably required by LineReader; verify under Python 3.
    reader = LineReader(fileinput.input(mode='rb'))
    data = pd.read_csv(reader, delimiter='\t')
    metrics = gr50.gr_metrics(data)
    # Emit the metrics table as TSV on stdout (no index column).
    print(metrics.to_csv(sep='\t', index=False))
def output_test():
    """Compare python gr50 metrics against a reference matlab output.

    Reads the toy input TSV and the matlab-produced metrics TSV (located
    relative to this file), recomputes the metrics with the python
    implementation, and asserts that no metric differs by more than 10%
    between the two. Prints diagnostic tables of records exceeding 0.1%
    and 10% relative error.

    Raises AssertionError if the key columns differ, if the two
    implementations reject different records, or if any metric differs by
    more than 10%.
    """
    curfilePath = os.path.abspath(__file__)
    # Walk five directory levels up from this file to reach the repo root
    # that contains the INPUT/ and OUTPUT/ fixture directories.
    Dir = os.path.dirname(
        os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.dirname(curfilePath)))))
    Dirfile = os.path.abspath(os.path.join(Dir, 'INPUT/toy_example_input1.tsv'))
    df = pd.read_csv(Dirfile, delimiter='\t')
    # Read the matlab output.
    Dirfile = os.path.abspath(
        os.path.join(Dir, 'OUTPUT/matlab_input1_GRmetrics.tsv'))
    metrics_matlab = pd.read_csv(Dirfile, delimiter='\t')
    # Compute the GR metrics from the data.
    gr_values = gr50.compute_gr(df)
    metrics_python = gr50.gr_metrics(gr_values)
    print(metrics_python.columns)
    print(metrics_matlab.columns)
    # Determine key columns and sort both dataframes by those columns. Since
    # this script is expected to work on arbitrary inputs, we need to "sniff"
    # out which columns are the keys (otherwise we'd need that as an input too).
    # Both the matlab and python implementations put the keys first and the
    # metrics second (starting with GR50) so we will use that to our advantage.
    # Of course they don't both sort the rows the same way which is why we need
    # to do that here.
    first_data_col_index = list(metrics_python).index('GR50')
    keys_python = metrics_python.columns[:first_data_col_index]
    keys_matlab = metrics_matlab.columns[:first_data_col_index]
    assert sorted(keys_python) == sorted(keys_matlab), "Key column mismatch"
    keys = list(keys_python)
    # NOTE: loop variable renamed from `df` so it no longer shadows the input
    # dataframe read above.
    for frame in metrics_python, metrics_matlab:
        # Perform the sort.
        frame.sort_values(keys, inplace=True)
        frame.reset_index(drop=True, inplace=True)
        # Compute log10 of GR50/GEC50 and drop the original columns, so the
        # relative-error comparison below happens in log space.
        frame.insert(first_data_col_index, 'log10_GR50',
                     np.log10(frame['GR50']))
        del frame['GR50']
        frame.insert(first_data_col_index, 'log10_GEC50',
                     np.log10(frame['GEC50']))
        del frame['GEC50']
    metrics = [c for c in metrics_python if c not in keys]
    errs = ['err_' + m for m in metrics]
    # ignore pval and r^2 for now (zip below stops at the shorter list, so the
    # last two metrics simply get no error column)
    errs = errs[:-2]
    error = metrics_python[keys].copy()
    for m, e in zip(metrics, errs):
        # Relative error of python vs matlab; NaN (e.g. 0/0) counts as exact.
        err = (1 - metrics_python[m] / metrics_matlab[m])
        error[e] = err.fillna(0)
    rejects = np.isinf(metrics_python.log10_GR50)
    rejects2 = np.isinf(metrics_python.log10_GEC50)
    # Sanity check to make sure both versions pass/reject the same things.
    assert (rejects == np.isinf(metrics_matlab.log10_GR50)).all()
    assert (rejects2 == np.isinf(metrics_matlab.log10_GEC50)).all()
    # Keep pandas from wrapping.
    pd.set_option('display.width', None)
    print()
    print("Records with > 0.1% error between python and matlab values of any "
          "metric:")
    test1 = error[(abs(error[errs]) > 0.001).any(axis=1)]
    print(test1)
    print()
    print("Records with > 10% error between python and matlab values of any "
          "metric:")
    test2 = error[(abs(error[errs]) > 0.1).any(axis=1)]
    print(test2)
    # Insert rows at the top with median and IQR of each error column.
    median = error[errs].median()
    iqr = np.subtract(*np.percentile(error[errs], [75, 25], axis=0))
    # NOTE(review): the four -1/'N/A' fillers assume exactly five key columns
    # in the toy fixture — confirm against the fixture files.
    error.loc[-2] = ['<median>', -1, -1, 'N/A', -1] + list(median)
    error.loc[-1] = ['<IQR>', -1, -1, 'N/A', -1] + list(iqr)
    error.sort_index(inplace=True)
    # Passes if none of the metrics differ by 10%
    # Just an example of a passing test.
    assert test2.shape[0] == 0
import matplotlib.pyplot as plt

import gr50

# Read the data file.
# `path` is presumably `os.path`, imported above this chunk — confirm.
base_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', '..')
input_path = path.join(base_path, 'OUTPUT', 'toy_example_output.tsv')
df = pd.read_csv(input_path, delimiter='\t')

# Filter down to only a manageable subset of the experiments.
# Each filter column is dropped after filtering so it does not appear in the
# FacetGrid faceting below.
filters = (('time', 72), ('perturbation', 0), ('replicate', 1))
for column, value in filters:
    df = df[df[column] == value]
    del df[column]

# Compute the GR metrics from the data.
gr_metrics = gr50.gr_metrics(df)

# Produce a trellis plot showing the fitted curves and some of the metrics
# across the different cell lines and drugs.
sns.set(style="ticks")
grid = sns.FacetGrid(df, row="cell_line", col="agent", margin_titles=True)
grid.set(xscale="log")
# lw=0 + markers: plot the raw GR observations as points only (no line).
grid.map(plt.plot, "concentration", "GRvalue", lw=0, marker='o', ms=4)
# Evaluate the fitted curves over a log-spaced range one decade beyond the
# observed concentrations on each side.
x_min = df.concentration.min() / 10
x_max = df.concentration.max() * 10
fit_x = np.logspace(np.log10(x_min), np.log10(x_max))
# Walk the facet axes in the same (row, col) order as the metrics, drawing the
# fitted logistic curve for each cell_line/agent combination.
for cell_line, row_axes in zip(grid.row_names, grid.axes):
    for agent, ax in zip(grid.col_names, row_axes):
        for m in gr_metrics[(gr_metrics.agent == agent) &
                            (gr_metrics.cell_line == cell_line)].itertuples():
            fit_y = gr50.logistic(fit_x,
                                  [m.GRinf, np.log10(m.GEC50), m.h_GR])
            # NOTE(review): fit_y is presumably plotted on `ax` immediately
            # after this point — the continuation lies outside this chunk.