Beispiel #1
0
 def test_load_table_returns_static_columns(self):
     """for static data, load_table infers the same dtypes whether
     static_column_types is True or False"""
     dtype_names = {}
     # load with the flag off first, then on, and record the dtype names seen
     for static in (False, True):
         loaded = load_table(
             "data/sample.tsv", sep="\t", static_column_types=static
         )
         dtype_names[static] = {
             loaded.columns[name].dtype.name for name in loaded.columns
         }
     self.assertEqual(dtype_names[True], dtype_names[False])
Beispiel #2
0
    def test_sorted(self):
        """sorted() orders rows by the requested columns, optionally reversed"""
        lengths = Table(header=self.t1_header, rows=self.t1_rows)
        expected_lengths = [
            999, 1353, 1383, 1554, 1599, 1698, 1827, 1977, 2307, 4185,
        ]
        self.assertEqual(
            lengths.sorted("length").tolist("length"), expected_lengths
        )

        small = Table(header=self.t5_header, rows=self.t5_rows)
        self.assertEqual(small.sorted("b").tolist("b"), [0, 1, 3])
        self.assertEqual(small.sorted().tolist("a"), [1, 1, 2])
        self.assertEqual(small.sorted(reverse="a").tolist("a"), [2, 1, 1])

        # the sample file sits in "data", one directory above this file's
        # own directory
        base_dir = os.path.dirname(os.path.dirname(__file__))
        table = load_table(os.path.join(base_dir, "data/sample.tsv"))

        # (sort arguments, expected first stableid, expected last stableid);
        # each sort is applied to the result of the previous one
        checks = [
            (
                dict(columns=["chrom", "stableid"]),
                "ENSG00000018408",
                "ENSG00000012174",
            ),
            (dict(reverse="stableid"), "ENSG00000019485", "ENSG00000005893"),
            (
                dict(reverse="chrom", columns="length"),
                "ENSG00000019102",
                "ENSG00000019144",
            ),
        ]
        for sort_kwargs, first_id, final_id in checks:
            table = table.sorted(**sort_kwargs)
            last_index = len(table) - 1
            self.assertEqual(table[0, "stableid"], first_id)
            self.assertEqual(table[last_index, "stableid"], final_id)
Beispiel #3
0
 def test_load_mixed_row_lengths(self):
     """skip_inconsistent skips rows whose length differs from the header"""
     header = list("ABCDE")
     # the middle row has only three fields, so it is inconsistent
     rows = [list("12345"), list("000"), list("12345")]
     text = "\n".join("\t".join(line) for line in [header] + rows)
     # dir must be given by keyword: the first positional parameter of
     # TemporaryDirectory is the name suffix, not the parent directory
     with TemporaryDirectory(dir=".") as dirname:
         path = pathlib.Path(dirname) / "table.tsv"
         path.write_text(text)
         loaded = load_table(path, skip_inconsistent=True)
         self.assertEqual(loaded.shape, (2, 5))
         self.assertEqual(loaded.header, tuple(header))
         self.assertEqual(loaded.array.tolist(), [list(range(1, 6))] * 2)
         # loading without skip_inconsistent raises ValueError
         with self.assertRaises(ValueError):
             load_table(path, skip_inconsistent=False)
Beispiel #4
0
 def test_load_mixed_static(self):
     """with static column types, a mixed data type column loads as strings"""
     # column "A" mixes an int and a str
     t = make_table(header=["A", "B"], data=[[1, 1], ["a", 2]])
     # dir must be given by keyword: the first positional parameter of
     # TemporaryDirectory is the name suffix, not the parent directory
     with TemporaryDirectory(dir=".") as dirname:
         path = pathlib.Path(dirname) / "table.txt"
         t.write(str(path), sep="\t")
         # if static types, then mixed columns become strings
         r = load_table(path, sep="\t", static_column_types=True)
         # assertIn reports both operands on failure, unlike assertTrue(a in b)
         self.assertIn("str", r.columns["A"].dtype.name)
Beispiel #5
0
 def test_all_counts(self):
     """exercising all_counts"""
     runner = CliRunner()
     # should fail, as data files not in this directory
     r = runner.invoke(all_count_main, ["-cdata/*.txt", "-o%s" % self.dirname])
     self.assertNotEqual(r.exit_code, 0)
     r = runner.invoke(
         all_count_main, ["-cdata/directions/*.txt", "-o%s" % self.dirname]
     )
     # surface the command's own output if it failed, instead of a less
     # informative error from the directory listing below
     self.assertEqual(r.exit_code, 0, r.output)
     # should produce directory containing two files
     dirlist = os.listdir(self.dirname)
     self.assertEqual(set(dirlist), {"combined_counts.txt", "combined_counts.log"})
     # check the contents of combined_counts
     # 4**4 nbrs x 12 mutations x 2 (M/R groups) = 6144
     counts = load_table(os.path.join(self.dirname, "combined_counts.txt"), sep="\t")
     self.assertEqual(counts.shape[0], 6144)
     shutil.rmtree(self.dirname)
Beispiel #6
0
 def test_load_mixed(self):
     """load data with mixed data type columns"""
     t = Table(
         header=["abcd", "data", "float"],
         data=[
             [str([1, 2, 3, 4, 5]), "0", 1.1],
             ["x", 5.0, 2.1],
             ["y", "", 3.1],
         ],
     )
     # dir must be given by keyword: the first positional parameter of
     # TemporaryDirectory is the name suffix, not the parent directory
     with TemporaryDirectory(dir=".") as dirname:
         path = pathlib.Path(dirname) / "table.tsv"
         t.write(str(path))
         r = load_table(path)
         self.assertEqual(str(t), str(r))
         # assertTrue("float", name) could never fail: its second argument
         # is the failure message, so check membership explicitly
         self.assertIn("float", r.columns["float"].dtype.name)
Beispiel #7
0
 def test_pickle_unpickle(self):
     """roundtrip via pickling"""
     # column name -> {row id -> value}
     data = {
         "edge.parent": {
             "NineBande": "root",
             "edge.1": "root",
         },
         "x": {
             "NineBande": 1.0,
             "edge.1": 1.0,
         },
         "length": {
             "NineBande": 4.0,
             "edge.1": 4.0,
         },
         "y": {
             "NineBande": 3.0,
             "edge.1": 3.0,
         },
         "z": {
             "NineBande": 6.0,
             "edge.1": 6.0,
         },
         "edge.name": {
             "NineBande": "NineBande",
             "edge.1": "edge.1",
         },
     }
     t = Table(
         data=data,
         max_width=50,
         row_ids="edge.name",
         title="My title",
         legend="blah",
     )
     # via string
     s = pickle.dumps(t)
     r = pickle.loads(s)
     self.assertEqual(str(t), str(r))
     # via file, written with a ".pickle" suffix and read back by load_table
     # dir must be given by keyword: the first positional parameter of
     # TemporaryDirectory is the name suffix, not the parent directory
     with TemporaryDirectory(dir=".") as dirname:
         path = pathlib.Path(dirname) / "table.pickle"
         t.write(str(path))
         r = load_table(path)
         self.assertEqual(str(t), str(r))
Beispiel #8
0
 def test_aln_to_counts(self):
     """exercising aln_to_counts"""
     makedirs(self.dirname)
     cli_args = [
         "-adata/sample_AtoC.fasta",
         "-o%s" % self.dirname,
         "-f1",
         "--direction=AtoC",
         "-S111",
         "-F",
     ]
     # the command is expected to succeed and write its outputs to dirname
     result = CliRunner().invoke(aln_to_counts_main, cli_args)
     produced = set(os.listdir(self.dirname))
     self.assertEqual(result.exit_code, 0)
     self.assertEqual(produced, {"sample_AtoC.txt", "sample_AtoC.log"})
     counts_path = os.path.join(self.dirname, "sample_AtoC.txt")
     counts = load_table(counts_path, sep="\t")
     # two columns with pos, two groups giving shape=2*16
     self.assertEqual(counts.shape[0], 32)
     shutil.rmtree(self.dirname)
Beispiel #9
0
 def test_load_table(self):
     """load_table reads the sample tsv file with the expected shape"""
     # the sample file sits in "data", one directory above this file's
     # own directory
     base_dir = os.path.dirname(os.path.dirname(__file__))
     loaded = load_table(os.path.join(base_dir, "data/sample.tsv"))
     self.assertEqual(loaded.shape, (10, 3))
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    # Runs log_lin.spectra_difference separately for each distinct 'start'
    # value in the counts data, prints a one-line summary per start base and,
    # unless dry_run is set, writes 'spectra_analysis.json',
    # 'spectra_summary.txt' and a log file under outpath.
    #
    #   countsfile       tab-delimited counts table
    #   outpath          directory that receives the output files
    #   countsfile2      second counts table; required unless strand_symmetry
    #   strand_symmetry  compare by 'strand' within countsfile instead of
    #                    between the two files
    #   force_overwrite  not used in this function body
    #   dry_run          skip logging to file and writing any output
    #   verbose          print the combined counts table (two-file mode only)
    #
    # NOTE(review): documented with comments rather than a docstring because,
    # if this function is registered as a click command (its decorators are
    # not visible here), a docstring would become its --help text.
    args = locals()

    table = load_table(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')

    LOGGER.input_file(countsfile)
    if strand_symmetry:
        # if there's a strand symmetry argument then we don't need a second
        # file
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)
    else:
        group_label = 'group'

        # be sure there's two files
        assert countsfile2, "must provide second counts file"
        counts_table2 = load_table(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        # tag every row with the file it came from ('1' or '2')
        counts_table2 = counts_table2.with_new_column(
            'group', lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column('group',
                                              lambda x: '1',
                                              columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine; header may be a tuple, so coerce it before
        # concatenating with a list
        header = ['group'] + list(counts_table2.header[:-1])
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

        if verbose:
            print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    # every subtable drops the 'start' column, which is constant within it
    columns = [c for c in counts_table.header if c != 'start']
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        r = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            # replace the '1'/'2' tags with the originating file names
            grp_labels = {'1': countsfile, '2': countsfile2}
            grp_index = list(collated.columns).index('group')
            for row in r:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        # scientific notation for very small probabilities
        if p < 1e-6:
            prob = "%.2e" % p
        else:
            prob = "%.6f" % p

        for row in r:
            row.insert(0, start_base)
            row.append(prob)

        results += r

        significance = [
            "RE=%.6f" % total_re,
            "Dev=%.2f" % dev,
            "df=%d" % df,
            "p=%s" % p
        ]

        stats = "  :  ".join(significance)
        print("Start base=%s  %s" % (start_base, stats))
        saveable[start_base] = dict(rel_entropy=total_re,
                                    deviance=dev,
                                    df=df,
                                    prob=p,
                                    formula=formula,
                                    stats=collated.to_json())

    # NOTE(review): `collated` and `significance` are only bound inside the
    # loop, so an input with no 'start' values raises NameError below
    table = make_table(header=['start_base'] + list(collated.columns) +
                       ['prob'],
                       rows=results,
                       digits=5).sorted(columns='ret')
    json_path = None

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        # NOTE(review): this logs the statistics of the last start base only
        LOGGER.log_message(str(significance), label="significance")
Beispiel #11
0
def one2one(
    ensembl_account,
    species,
    release,
    outdir,
    ref,
    ref_genes_file,
    coord_names,
    not_strict,
    introns,
    method_clade_id,
    mask_features,
    logfile_name,
    limit,
    force_overwrite,
    test,
):
    """Command line tool for sampling homologous sequences from Ensembl."""
    # NOTE(review): the docstring above is left untouched because click uses
    # a command's docstring as its help text.
    #
    # Flow: validate the options, work out the list of reference gene stable
    # ids (queried from the reference genome, or read from the "stableid"
    # column of ref_genes_file), then fetch either one-to-one orthologs or
    # syntenic intron alignments into outdir.
    #
    # Every validation failure prints a message and raises SystemExit(-1).
    # SystemExit is raised directly because the bare exit() helper is
    # injected by the site module and is not guaranteed to exist.
    outdir = abspath(outdir)
    if not any([ref, ref_genes_file]):
        # just the command name, indicate they need to display help
        click.secho("Missing 'ref' and 'ref_genes_file'")
        ctx = click.get_current_context()
        msg = "%s\n\n--help to see all options\n" % ctx.get_usage()
        click.echo(msg)
        raise SystemExit(-1)

    ensembl_account = _get_account(ensembl_account)
    # snapshot the parameters for the log; the account is stored as a string
    args = locals()
    args["ensembl_account"] = str(ensembl_account)
    LOGGER.log_message(str(args), label="params")

    # a test run with no explicit limit samples just two genes;
    # otherwise a falsy limit means "no limit"
    if test and limit == 0:
        limit = 2
    else:
        limit = limit or None

    if (introns and not method_clade_id) or (mask_features and not introns):
        msg = [
            "Must specify the introns and method_clade_id in order to",
            "export introns. Use show_align_methods to see the options",
        ]
        click.secho("\n".join(msg), fg="red")
        raise SystemExit(-1)

    species_missing = missing_species_names(species)
    if species_missing:
        msg = [
            "The following species names don't match an Ensembl record."
            " Check spelling!",
            str(species_missing),
            "\nAvailable species are at this server are:",
            str(display_available_dbs(ensembl_account)),
        ]

        click.secho("\n".join(msg), fg="red")
        raise SystemExit(-1)

    if ref:
        ref = ref.lower()

    if ref and ref not in species:
        print("The reference species not in species names")
        raise SystemExit(-1)

    compara = Compara(species, release=release, account=ensembl_account)
    runlog_path = os.path.join(outdir, logfile_name)

    if os.path.exists(runlog_path) and not force_overwrite:
        msg = [
            "Log file (%s) already exists!" % runlog_path,
            "Use force_overwrite or provide logfile_name",
        ]
        click.secho("\n".join(msg), fg="red")
        raise SystemExit(-1)

    if not test:
        LOGGER.log_file_path = runlog_path

    chroms = None
    if coord_names:
        chroms = load_coord_names(coord_names)
        LOGGER.input_file(coord_names)
    # NOTE(review): this branch can never run, since coord_names is falsy
    # whenever the `if` above is skipped. If the intent was to fall back to
    # the reference genome's coordinate names the condition should probably
    # be `elif ref:`; left unchanged until that is confirmed.
    elif coord_names and ref:
        chroms = get_chrom_names(ref, compara)

    if not os.path.exists(outdir) and not test:
        os.makedirs(outdir)
        print("Created", outdir)

    if ref and not ref_genes_file:
        ref_genome = Genome(ref, release=release, account=ensembl_account)
        ref_genes = [g.stableid for g in _get_ref_genes(ref_genome, chroms, limit)]
    else:
        if not ref_genes_file.endswith((".csv", ".tsv")):
            msg = (
                "ref_genes_file must be either a comma/tab "
                "delimted with the corresponding suffix (.csv/.tsv)"
            )
            click.secho(msg, fg="red")
            raise SystemExit(-1)

        ref_genes = load_table(ref_genes_file)
        if "stableid" not in ref_genes.header:
            msg = "ref_genes_file does not have a 'stableid' column header"
            click.secho(msg, fg="red")
            raise SystemExit(-1)

        ref_genes = ref_genes.tolist("stableid")

    if limit:
        ref_genes = ref_genes[:limit]

    if not introns:
        print("Getting orthologs %d genes" % len(ref_genes))
        get_one2one_orthologs(
            compara, ref_genes, outdir, not_strict, force_overwrite, test
        )
    else:
        print("Getting orthologous introns for %d genes" % len(ref_genes))
        # NOTE(review): outdir is passed twice (3rd and 6th positional
        # arguments); verify against the callee's signature
        get_syntenic_alignments_introns(
            compara,
            ref_genes,
            outdir,
            method_clade_id,
            mask_features,
            outdir,
            force_overwrite,
            test,
        )