# third-party imports assumed by the functions below; project-internal helpers
# (LoadTable, LOGGER, motif_count, reduced_one_position,
# reduced_multiple_positions, Species, get_db_name, exec_command, load_json,
# skip_path, model_name_from_features, sample_size_from_path) are provided by
# the original modules' own imports.
import os
from itertools import product

import click
import numpy
from tqdm import tqdm


def summary_stat_table(table, factors):
    """returns summary statistics for each classifier / feature set combination"""
    fscore_cols = [c for c in table.header if c.startswith('fscore')]
    distinct = table.distinct_values(factors)
    rows = []
    for comb in tqdm(distinct, ncols=80):
        # restrict to the rows matching this factor combination
        subtable = table.filtered(lambda x: tuple(x) == tuple(comb),
                                  columns=factors)
        aurocs = numpy.array(subtable.tolist('auc'))
        mean_prec = numpy.array(subtable.tolist('mean_precision'))
        accuracy = numpy.array(subtable.tolist('balanced_accuracy'))
        row = list(comb) + [
            aurocs.mean(),
            aurocs.std(ddof=1),
            mean_prec.mean(),
            mean_prec.std(ddof=1),
            accuracy.mean(),
            accuracy.std(ddof=1),
        ]
        for col in fscore_cols:
            data = numpy.array(subtable.tolist(col))
            row.append(data.mean())
            row.append(data.std(ddof=1))
        rows.append(row)

    header = list(factors) + [
        'mean_auc', 'std_auc',
        'mean_ap', 'std_ap',
        'mean_balanced_accuracy', 'std_balanced_accuracy',
    ]
    for col in fscore_cols:
        header.extend([f'mean_{col}', f'std_{col}'])

    table = LoadTable(header=header, rows=rows)
    table = table.sorted(reverse='mean_auc')
    return table
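
# Illustrative usage sketch (not part of the original module). It assumes the
# collated results table written by ``collate`` below, and that LoadTable can
# read a tab-delimited file directly from its path; adjust to the project's
# actual table-loading call if that differs.
def _example_summary_stats(collated_path="collated.tsv.gz"):
    collated = LoadTable(collated_path, sep='\t')
    factors = ["algorithm", "name", "flank_size", "feature_dim",
               "proximal", "usegc", "size"]
    return summary_stat_table(collated, factors=factors)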

def get_combined_counts(table, positions):
    """returns a table of counts for mutated ('M') and unmutated ('R') states
    at the nominated position(s)"""
    bases = 'ACGT'
    if isinstance(positions, str):
        # a single position: the states are the individual bases
        counts = reduced_one_position(table, positions)
        mut_counts = counts['M']
        unmut_counts = counts['R']
        positions = [positions]
        states = bases
        header = ['mut', 'base', 'count']
    else:
        # multiple positions: the states are all base combinations
        counts = reduced_multiple_positions(table, *positions)
        mut_counts = counts['M']
        unmut_counts = counts['R']
        states = product(bases, repeat=len(positions))
        header = (['mut'] +
                  ['base%d' % (i + 1) for i in range(len(positions))] +
                  ['count'])

    combined = []
    for state in states:
        combined.append(['R'] + list(state) + [unmut_counts[state]])
        combined.append(['M'] + list(state) + [mut_counts[state]])

    counts_table = LoadTable(header=header, rows=combined)
    counts_table = counts_table.sorted(columns=header[:-1])
    return counts_table
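
# Illustrative usage sketch (not part of the original module). The position
# labels used here ("pos1", "pos2") are placeholders; they must match column
# names present in the counts table passed in.
def _example_combined_counts(table):
    # single position -> header: mut, base, count
    single = get_combined_counts(table, "pos1")
    # multiple positions -> header: mut, base1, base2, count
    paired = get_combined_counts(table, ("pos1", "pos2"))
    return single, paired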

def get_grouped_combined_counts(table, position, group_label):
    """wraps motif_count.get_combined_counts for groups"""
    group_cats = table.distinct_values(group_label)
    all_data = []
    header = None
    for category in group_cats:
        subtable = table.filtered(lambda x: x == category, columns=group_label)
        counts = motif_count.get_combined_counts(subtable, position)
        if header is None:
            header = [group_label] + list(counts.header)

        counts = counts.with_new_column(group_label, lambda x: category,
                                        columns=counts.header[0])
        all_data.extend(counts.tolist(header))

    counts = LoadTable(header=header, rows=all_data)
    counts = counts.sorted(columns=[group_label, 'mut'])
    return counts
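
# Illustrative usage sketch (not part of the original module). The grouping
# column name "group" and position label "pos1" are placeholders for columns
# present in the input table.
def _example_grouped_counts(table):
    return get_grouped_combined_counts(table, "pos1", group_label="group")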

def display_available_dbs(account, release=None):
    """displays the available Ensembl databases at the nominated host"""
    db_list = get_db_name(account=account, db_type="core", release=release)
    db_list += get_db_name(account=account, db_type="compara", release=release)
    rows = []
    for db_name in db_list:
        species_name = db_name.species
        common_name = None
        if species_name:
            common_name = Species.get_common_name(db_name.species,
                                                  level="ignore")

        if "compara" in db_name.name:
            species_name = common_name = "-"

        rows.append([db_name.release, db_name.name, species_name, common_name])

    table = LoadTable(header=["Release", "Db Name", "Species", "Common Name"],
                      rows=rows, space=2)
    table = table.sorted(["Release", "Db Name"])
    table.legend = ("Values of 'None' indicate cogent does not have a value"
                    " for that database name.")
    return table
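
# Illustrative usage sketch (not part of the original module). The HostAccount
# import path and the anonymous public Ensembl credentials are assumptions;
# substitute your own host details as required.
def _example_show_dbs():
    from ensembldb3 import HostAccount  # assumed import location
    account = HostAccount("ensembldb.ensembl.org", "anonymous", "", port=3306)
    print(display_available_dbs(account, release=91))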

def collate(base_path, output_path, exclude_paths, overwrite):
    """collates all classifier performance stats and writes to a single tsv file"""
    LOGGER.log_args()
    outpath = os.path.join(output_path, "collated.tsv.gz")
    logfile_path = os.path.join(output_path, "collated.log")
    if os.path.exists(outpath) and not overwrite:
        click.secho(f"Skipping. {outpath} exists. "
                    "Use overwrite to force.", fg='green')
        exit(0)

    stat_fns = exec_command(f'find {base_path} -name'
                            ' "*performance.json*"')
    stat_fns = stat_fns.splitlines()
    if not stat_fns:
        msg = f'No files matching "*performance.json*" in {base_path}'
        click.secho(msg, fg='red')
        return

    LOGGER.log_file_path = logfile_path
    records = []
    keys = set()
    exclude_paths = [] if exclude_paths is None else exclude_paths.split(',')
    num_skipped = 0
    for fn in tqdm(stat_fns, ncols=80):
        if skip_path(exclude_paths, fn):
            num_skipped += 1
            LOGGER.log_message(fn, label="SKIPPED FILE")
            continue

        LOGGER.input_file(fn)
        data = load_json(fn)
        labels = data['classification_report']['labels']
        fscores = data['classification_report']['f-score']
        row = {
            "stat_path": fn,
            "classifier_path": data["classifier_path"],
            "auc": data["auc"],
            "algorithm": data["classifier_label"],
            "mean_precision": data["mean_precision"],
            f"fscore({labels[0]})": fscores[0],
            f"fscore({labels[1]})": fscores[1],
            'balanced_accuracy': data['balanced_accuracy'],
        }
        row.update(data["feature_params"])
        keys.update(row.keys())
        records.append(row)

    columns = sorted(keys)
    rows = list(map(lambda r: [r.get(c, None) for c in columns], records))
    table = LoadTable(header=columns, rows=rows)
    table = table.sorted(reverse="auc")
    table = table.with_new_column(
        "name",
        lambda x: model_name_from_features(*x),
        columns=["flank_size", "feature_dim", "usegc", "proximal"])
    table = table.with_new_column("size", sample_size_from_path,
                                  columns="classifier_path")
    table.write(outpath)
    LOGGER.output_file(outpath)

    # make summary statistics via grouping by factors
    factors = [
        "algorithm", "name", "flank_size", "feature_dim", "proximal",
        "usegc", "size"
    ]
    summary = summary_stat_table(table, factors=factors)
    outpath = os.path.join(output_path, "summary_statistics.tsv.gz")
    summary.write(outpath)
    LOGGER.output_file(outpath)

    if num_skipped:
        click.secho("Skipped %d files that matched exclude_paths" % num_skipped,
                    fg='red')
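
# Illustrative usage sketch (not part of the original module). In the original
# project collate is exposed as a click sub-command; as written here (without
# its decorators) it can also be called directly. The paths below are
# placeholders.
def _example_collate():
    collate(base_path="results", output_path="collated_stats",
            exclude_paths=None, overwrite=False)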