def test_load_table_returns_static_columns(self):
    """load_table yields the same column dtypes with static_column_types off or on"""
    dtype_sets = []
    # load with the flag off first, then on, collecting the dtype names seen
    for static in (False, True):
        loaded = load_table("data/sample.tsv", sep="\t", static_column_types=static)
        dtype_sets.append({loaded.columns[name].dtype.name for name in loaded.columns})

    without_static, with_static = dtype_sets
    self.assertEqual(with_static, without_static)
def test_sorted(self):
    """Table.sorted orders rows by default, by named columns and in reverse"""
    # one named column, ascending
    lengths = Table(header=self.t1_header, rows=self.t1_rows)
    expected_lengths = [999, 1353, 1383, 1554, 1599, 1698, 1827, 1977, 2307, 4185]
    self.assertEqual(lengths.sorted("length").tolist("length"), expected_lengths)

    # named column, no arguments, and reverse on a small table
    small = Table(header=self.t5_header, rows=self.t5_rows)
    self.assertEqual(small.sorted("b").tolist("b"), [0, 1, 3])
    self.assertEqual(small.sorted().tolist("a"), [1, 1, 2])
    self.assertEqual(small.sorted(reverse="a").tolist("a"), [2, 1, 1])

    # sample file lives in the data directory one level above this file
    base_dir = os.path.dirname(os.path.dirname(__file__))
    sample = load_table(os.path.join(base_dir, "data/sample.tsv"))

    def check_ends(tbl, final, first_id, last_id):
        # compare the stableid of the first and last rows
        self.assertEqual(tbl[0, "stableid"], first_id)
        self.assertEqual(tbl[final, "stableid"], last_id)

    sample = sample.sorted(columns=["chrom", "stableid"])
    final_row = len(sample) - 1
    check_ends(sample, final_row, "ENSG00000018408", "ENSG00000012174")

    sample = sample.sorted(reverse="stableid")
    check_ends(sample, final_row, "ENSG00000019485", "ENSG00000005893")

    sample = sample.sorted(reverse="chrom", columns="length")
    check_ends(sample, final_row, "ENSG00000019102", "ENSG00000019144")
def test_load_mixed_row_lengths(self):
    """rows whose length differs from the header are dropped by skip_inconsistent"""
    header = list("ABCDE")
    # the middle row has 3 fields instead of 5
    rows = [list("12345"), list("000"), list("12345")]
    lines = ["\t".join(fields) for fields in [header] + rows]
    text = "\n".join(lines)
    # NOTE(review): if this is tempfile.TemporaryDirectory, "." is taken as
    # the suffix argument rather than dir -- confirm that is intended
    with TemporaryDirectory(".") as dirname:
        path = pathlib.Path(dirname) / "table.tsv"
        with open(path, "w") as outfile:
            outfile.write(text)

        loaded = load_table(path, skip_inconsistent=True)
        self.assertEqual(loaded.shape, (2, 5))
        self.assertEqual(loaded.header, tuple(header))
        self.assertEqual(loaded.array.tolist(), [list(range(1, 6))] * 2)

        # without skip_inconsistent the short row raises ValueError
        with self.assertRaises(ValueError):
            load_table(path, skip_inconsistent=False)
def test_load_mixed_static(self):
    """a column mixing ints and strings is loaded as a string column"""
    mixed = make_table(header=["A", "B"], data=[[1, 1], ["a", 2]])
    # NOTE(review): if this is tempfile.TemporaryDirectory, "." is taken as
    # the suffix argument rather than dir -- confirm that is intended
    with TemporaryDirectory(".") as dirname:
        outfile = pathlib.Path(dirname) / "table.txt"
        mixed.write(str(outfile), sep="\t")
        # if static types, then mixed columns become strings
        reloaded = load_table(outfile, sep="\t", static_column_types=True)
        dtype_name = reloaded.columns["A"].dtype.name
        self.assertIn("str", dtype_name)
def test_all_counts(self):
    """exercising all_counts"""
    runner = CliRunner()
    out_opt = "-o%s" % self.dirname
    # try/finally so the output directory is removed even when an
    # assertion fails part way through
    try:
        # should fail, as data files not in this directory
        r = runner.invoke(all_count_main, ["-cdata/*.txt", out_opt])
        self.assertNotEqual(r.exit_code, 0)

        # pointing at the directions directory is expected to succeed
        r = runner.invoke(all_count_main, ["-cdata/directions/*.txt", out_opt])
        self.assertEqual(r.exit_code, 0)

        # should produce directory containing two files
        dirlist = os.listdir(self.dirname)
        self.assertEqual(
            set(dirlist), {"combined_counts.txt", "combined_counts.log"}
        )

        # check the contents of combined_counts
        # 4**4 nbrs x 12 mutations x 2 (M/R groups) = 6144
        counts = load_table(
            os.path.join(self.dirname, "combined_counts.txt"), sep="\t"
        )
        self.assertEqual(counts.shape[0], 6144)
    finally:
        # the directory may not exist if the command failed before creating it
        shutil.rmtree(self.dirname, ignore_errors=True)
def test_load_mixed(self):
    """load data with mixed data type columns"""
    t = Table(
        header=["abcd", "data", "float"],
        data=[[str([1, 2, 3, 4, 5]), "0", 1.1], ["x", 5.0, 2.1], ["y", "", 3.1]],
    )
    # NOTE(review): if this is tempfile.TemporaryDirectory, "." is taken as
    # the suffix argument rather than dir -- confirm that is intended
    with TemporaryDirectory(".") as dirname:
        path = pathlib.Path(dirname) / "table.tsv"
        t.write(str(path))
        r = load_table(path)
        # the round trip must preserve the displayed table
        self.assertEqual(str(t), str(r))
        # assertTrue("float", name) always passed because the second argument
        # is only the failure message; assertIn actually checks the dtype name
        self.assertIn("float", r.columns["float"].dtype.name)
def test_pickle_unpickle(self):
    """a Table survives a pickle round trip, in memory and via a file"""
    edge_names = ("NineBande", "edge.1")
    # column name -> {edge name: value}; key order matches the original fixture
    data = {
        "edge.parent": dict.fromkeys(edge_names, "root"),
        "x": dict.fromkeys(edge_names, 1.0),
        "length": dict.fromkeys(edge_names, 4.0),
        "y": dict.fromkeys(edge_names, 3.0),
        "z": dict.fromkeys(edge_names, 6.0),
        "edge.name": {name: name for name in edge_names},
    }
    original = Table(
        data=data,
        max_width=50,
        row_ids="edge.name",
        title="My title",
        legend="blah",
    )
    expected = str(original)

    # via string
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(expected, str(restored))

    # via file
    # NOTE(review): if this is tempfile.TemporaryDirectory, "." is taken as
    # the suffix argument rather than dir -- confirm that is intended
    with TemporaryDirectory(".") as dirname:
        pickle_path = pathlib.Path(dirname) / "table.pickle"
        original.write(str(pickle_path))
        restored = load_table(pickle_path)
        self.assertEqual(str(original), str(restored))
def test_aln_to_counts(self):
    """exercising aln_to_counts"""
    makedirs(self.dirname)
    runner = CliRunner()
    # try/finally so the output directory is removed even when an
    # assertion fails
    try:
        # the sample alignment is expected to be processed successfully
        r = runner.invoke(
            aln_to_counts_main,
            [
                "-adata/sample_AtoC.fasta",
                "-o%s" % self.dirname,
                "-f1",
                "--direction=AtoC",
                "-S111",
                "-F",
            ],
        )
        dirlist = os.listdir(self.dirname)
        self.assertEqual(r.exit_code, 0)
        self.assertEqual(set(dirlist), {"sample_AtoC.txt", "sample_AtoC.log"})
        counts = load_table(os.path.join(self.dirname, "sample_AtoC.txt"), sep="\t")
        # two columns with pos, two groups giving shape=2*16
        self.assertEqual(counts.shape[0], 32)
    finally:
        shutil.rmtree(self.dirname, ignore_errors=True)
def test_load_table(self):
    """load_table reads data/sample.tsv as a 10 row, 3 column table"""
    # the data directory sits one level above the directory of this file
    base_dir = os.path.dirname(os.path.dirname(__file__))
    sample = load_table(os.path.join(base_dir, "data/sample.tsv"))
    self.assertEqual(sample.shape, (10, 3))
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    # Loads a tab-delimited counts table, converts it to a spectra table and,
    # for each distinct 'start' value, calls log_lin.spectra_difference on the
    # rows for that value. Per-start statistics are printed and, unless
    # dry_run is set, written to spectra_analysis.json and spectra_summary.txt
    # under outpath.
    #
    # countsfile      -- path of the first counts table
    # outpath         -- directory for the log, json and summary outputs
    # countsfile2     -- path of the second counts table; required when
    #                    strand_symmetry is false
    # strand_symmetry -- if true, group by 'strand' within countsfile only
    # force_overwrite -- accepted but not used in this function
    # dry_run         -- if true, nothing is written to outpath
    # verbose         -- if true, print the spectra table being analysed
    #
    # (Kept as comments rather than a docstring so that any CLI help text
    # derived from the docstring is unchanged.)
    args = locals()
    table = load_table(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')

    LOGGER.input_file(countsfile)

    if strand_symmetry:
        # if there's a strand symmetry argument then we don't need a second
        # file
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)
    else:
        group_label = 'group'
        # be sure there's two files; raised explicitly (same exception type
        # as the former assert) so the check survives python -O
        if not countsfile2:
            raise AssertionError("must provide second counts file")

        counts_table2 = load_table(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        # tag every row with the file it came from
        counts_table2 = counts_table2.with_new_column(
            'group', lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column(
            'group', lambda x: '1', columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine, moving 'group' to the front. header may be a tuple, so
        # convert before concatenating with a list
        header = ['group'] + list(counts_table2.header[:-1])
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

    if verbose:
        print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    # loop invariants: the columns kept for each subtable and the mapping
    # from group tag back to the originating file
    columns = [c for c in counts_table.header if c != 'start']
    grp_labels = {'1': countsfile, '2': countsfile2}
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        r = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            # report the file name instead of the '1'/'2' group tag
            grp_index = list(collated.columns).index('group')
            for row in r:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        # scientific notation for very small probabilities
        prob = "%.2e" % p if p < 1e-6 else "%.6f" % p

        for row in r:
            row.insert(0, start_base)
            row.append(prob)

        results += r

        significance = [
            "RE=%.6f" % total_re,
            "Dev=%.2f" % dev,
            "df=%d" % df,
            "p=%s" % p,
        ]
        stats = " : ".join(significance)
        print("Start base=%s  %s" % (start_base, stats))
        saveable[start_base] = dict(rel_entropy=total_re, deviance=dev,
                                    df=df, prob=p, formula=formula,
                                    stats=collated.to_json())

    # collated here is the value from the last loop iteration; only its
    # column names are used to build the summary header
    table = make_table(
        header=['start_base'] + list(collated.columns) + ['prob'],
        rows=results, digits=5).sorted(columns='ret')
    json_path = None

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        # significance holds the statistics of the last start base processed
        LOGGER.log_message(str(significance), label="significance")
def one2one(
    ensembl_account,
    species,
    release,
    outdir,
    ref,
    ref_genes_file,
    coord_names,
    not_strict,
    introns,
    method_clade_id,
    mask_features,
    logfile_name,
    limit,
    force_overwrite,
    test,
):
    """Command line tool for sampling homologous sequences from Ensembl."""
    # Flow: validate the options, build a Compara instance for the species,
    # obtain reference gene stable IDs (from the reference genome or from
    # ref_genes_file), then hand them to get_one2one_orthologs or, when
    # introns is set, to get_syntenic_alignments_introns.
    #
    # Every early termination raises SystemExit(-1) directly. The former
    # exit(-1) calls relied on the site-module helper meant for interactive
    # use; the exception type and code are unchanged.
    outdir = abspath(outdir)
    if not (ref or ref_genes_file):
        # just the command name, indicate they need to display help
        click.secho("Missing 'ref' and 'ref_genes_file'")
        ctx = click.get_current_context()
        msg = "%s\n\n--help to see all options\n" % ctx.get_usage()
        click.echo(msg)
        raise SystemExit(-1)

    ensembl_account = _get_account(ensembl_account)
    # record the parameters as given, with the account in string form
    args = locals()
    args["ensembl_account"] = str(ensembl_account)
    LOGGER.log_message(str(args), label="params")

    if test and limit == 0:
        # test runs with no explicit limit default to two genes
        limit = 2
    else:
        limit = limit or None

    if (introns and not method_clade_id) or (mask_features and not introns):
        msg = [
            "Must specify the introns and method_clade_id in order to",
            "export introns. Use show_align_methods to see the options",
        ]
        click.secho("\n".join(msg), fg="red")
        raise SystemExit(-1)

    species_missing = missing_species_names(species)
    if species_missing:
        msg = [
            "The following species names don't match an Ensembl record."
            " Check spelling!",
            str(species_missing),
            "\nAvailable species are at this server are:",
            str(display_available_dbs(ensembl_account)),
        ]
        click.secho("\n".join(msg), fg="red")
        raise SystemExit(-1)

    if ref:
        ref = ref.lower()

    if ref and ref not in species:
        print("The reference species not in species names")
        raise SystemExit(-1)

    compara = Compara(species, release=release, account=ensembl_account)
    runlog_path = os.path.join(outdir, logfile_name)

    if os.path.exists(runlog_path) and not force_overwrite:
        msg = [
            "Log file (%s) already exists!" % runlog_path,
            "Use force_overwrite or provide logfile_name",
        ]
        click.secho("\n".join(msg), fg="red")
        raise SystemExit(-1)

    if not test:
        LOGGER.log_file_path = runlog_path

    chroms = None
    if coord_names:
        chroms = load_coord_names(coord_names)
        LOGGER.input_file(coord_names)
    elif coord_names and ref:
        # NOTE(review): unreachable -- coord_names is falsy whenever this
        # condition is evaluated. Possibly meant `elif ref:`; left unchanged
        # because enabling it would alter which genes are sampled.
        chroms = get_chrom_names(ref, compara)

    if not os.path.exists(outdir) and not test:
        os.makedirs(outdir)
        print("Created", outdir)

    if ref and not ref_genes_file:
        # sample genes directly from the reference genome
        ref_genome = Genome(ref, release=release, account=ensembl_account)
        ref_genes = [g.stableid for g in _get_ref_genes(ref_genome, chroms, limit)]
    else:
        # ref_genes_file is set here: the first check in this function exits
        # when both ref and ref_genes_file are missing
        if not ref_genes_file.endswith((".csv", ".tsv")):
            msg = (
                "ref_genes_file must be either a comma/tab "
                "delimted with the corresponding suffix (.csv/.tsv)"
            )
            click.secho(msg, fg="red")
            raise SystemExit(-1)

        ref_genes = load_table(ref_genes_file)
        if "stableid" not in ref_genes.header:
            msg = "ref_genes_file does not have a 'stableid' column header"
            click.secho(msg, fg="red")
            raise SystemExit(-1)

        ref_genes = ref_genes.tolist("stableid")

    if limit:
        ref_genes = ref_genes[:limit]

    if not introns:
        print("Getting orthologs %d genes" % len(ref_genes))
        get_one2one_orthologs(
            compara, ref_genes, outdir, not_strict, force_overwrite, test
        )
    else:
        print("Getting orthologous introns for %d genes" % len(ref_genes))
        # NOTE(review): outdir is passed twice -- confirm against the
        # signature of get_syntenic_alignments_introns
        get_syntenic_alignments_introns(
            compara,
            ref_genes,
            outdir,
            method_clade_id,
            mask_features,
            outdir,
            force_overwrite,
            test,
        )