Example #1
    def test_make_table_from_dataframe(self):
        """makes a table from a pandas data frame"""
        df = DataFrame(data=[[0, 1], [3, 7]], columns=["a", "b"])
        t = make_table(data_frame=df)
        assert_equal(t.columns["a"], [0, 3])
        assert_equal(t.columns["b"], [1, 7])
        with self.assertRaises(TypeError):
            make_table(data_frame="abcde")
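A minimal standalone sketch of the same conversion, runnable outside the test class (assumes pandas and cogent3 are installed):

from pandas import DataFrame
from cogent3 import make_table

# build a cogent3 Table from a pandas DataFrame via the data_frame keyword
df = DataFrame(data=[[0, 1], [3, 7]], columns=["a", "b"])
t = make_table(data_frame=df)
print(t)  # two columns, "a" and "b"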
Example #2
    def test_str_tex_format(self):
        """str() produces latex tabular table"""
        tex_table = make_table(header=["a", "b"],
                               data=[["val1", "val2"], ["val3", "val4"]])
        tex = tex_table.to_string(format="tex")
        self.assertFalse("caption" in tex)
        # with a title
        tex_table = make_table(
            header=["a", "b"],
            data=[["val1", "val2"], ["val3", "val4"]],
            title="a title",
        )
        tex = tex_table.to_string(format="tex")
        tex = tex.splitlines()
        self.assertEqual(tex[-2], r"\caption{a title}")

        tex = tex_table.to_string(format="tex", label="tab:first")
        tex = tex.splitlines()
        self.assertEqual(tex[-3], r"\caption{a title}")
        self.assertEqual(tex[-2], r"\label{tab:first}")

        # with a legend, no title
        tex_table = make_table(
            header=["a", "b"],
            data=[["val1", "val2"], ["val3", "val4"]],
            legend="a legend",
        )
        tex = tex_table.to_string(format="tex")
        tex = tex.splitlines()
        # because it's treated as a title by default
        self.assertEqual(tex[-2], r"\caption{a legend}")
        # unless you say not to
        tex = tex_table.to_string(format="tex", concat_title_legend=False)
        tex = tex.splitlines()
        self.assertEqual(tex[-2], r"\caption*{a legend}")
        tex_table = make_table(
            header=["a", "b"],
            data=[["val1", "val2"], ["val3", "val4"]],
            title="a title.",
            legend="a legend",
        )
        tex = tex_table.to_string(format="tex")
        tex = tex.splitlines()
        self.assertEqual(tex[-2], r"\caption{a title. a legend}")
        tex = tex_table.to_string(format="tex", concat_title_legend=False)
        tex = tex.splitlines()
        self.assertEqual(tex[2], r"\caption{a title.}")
        self.assertEqual(tex[-2], r"\caption*{a legend}")
        tex = tex_table.to_string(format="tex",
                                  concat_title_legend=False,
                                  label="table")
        tex = tex.splitlines()
        self.assertEqual(tex[2], r"\caption{a title.}")
        self.assertEqual(tex[3], r"\label{table}")
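A minimal sketch of the LaTeX rendering exercised above, assuming the same cogent3 API:

from cogent3 import make_table

table = make_table(
    header=["a", "b"],
    data=[["val1", "val2"], ["val3", "val4"]],
    title="a title",
)
# to_string(format="tex") wraps the rows in a LaTeX tabular environment and
# emits the title as a \caption near the end
print(table.to_string(format="tex"))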
Example #3
    def test_make_table(self):
        """makes a table"""
        data = {
            "edge.parent": {
                "NineBande": "root",
                "edge.1": "root",
                "DogFaced": "root",
                "Human": "edge.0",
            },
            "x": {
                "NineBande": 1.0,
                "edge.1": 1.0,
                "DogFaced": 1.0,
                "Human": 1.0,
            },
            "length": {
                "NineBande": 4.0,
                "edge.1": 4.0,
                "DogFaced": 4.0,
                "Human": 4.0,
            },
            "y": {
                "NineBande": 3.0,
                "edge.1": 3.0,
                "DogFaced": 3.0,
                "Human": 3.0,
            },
            "z": {
                "NineBande": 6.0,
                "edge.1": 6.0,
                "DogFaced": 6.0,
                "Human": 6.0,
            },
            "edge.names": {
                "NineBande": "NineBande",
                "edge.1": "edge.1",
                "DogFaced": "DogFaced",
                "Human": "Human",
            },
        }
        t = make_table(data=data)
        self.assertEqual(t.shape, (4, 6))
        # if index column not specified
        with self.assertRaises(IndexError):
            _ = t["Human", "edge.parent"]

        # applies row_ids as an index
        t = make_table(data=data, row_ids="edge.names")
        # index col is the first one, and the data can be indexed
        self.assertEqual(t.columns.order[0], "edge.names")
        self.assertEqual(t["Human", "edge.parent"], "edge.0")
Example #4
def spectra_table(table, group_label):
    """returns a table with columns without position information"""
    assert 'direction' in table.header
    if 'mut' in table.header:
        # remove redundant category (counts of M == R)
        table = table.filtered("mut=='M'")

    columns = ['count', 'direction', group_label]
    table = table.get_columns(columns)
    # so we have a table with counts per direction
    results = []
    group_categories = table.distinct_values(group_label)
    filter_template = "direction=='%(direction)s' and "\
                      "%(label)s=='%(category)s'"
    for direction in table.distinct_values('direction'):
        start = direction[0]
        for group_category in group_categories:
            condition = dict(direction=direction,
                             label=group_label,
                             category=group_category)
            sub_table = table.filtered(filter_template % condition)
            total = sub_table.summed('count')
            results.append([total, start, direction, group_category])
    result = make_table(header=['count', 'start', 'direction', group_label],
                        rows=results)
    return result
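A hedged invocation sketch for spectra_table; the counts below are invented, but the table has the 'count', 'direction', and group columns the function expects (newer cogent3 releases take data=, older ones rows= as in the snippet above):

from cogent3 import make_table

counts = make_table(
    header=["count", "direction", "strand"],
    data=[[10, "AtoC", "+"], [12, "AtoC", "-"], [3, "TtoG", "+"]],
)
spectra = spectra_table(counts, "strand")
# one summed row per (direction, strand) pair:
# columns are [count, start, direction, strand]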
Example #5
def get_combined_counts(table, positions):
    bases = 'ACGT'
    if isinstance(positions, str):
        counts = reduced_one_position(table, positions)
        mut_counts = counts['M']
        unmut_counts = counts['R']
        positions = [positions]
        states = bases
        header = ['mut', 'base', 'count']
    else:
        counts = reduced_multiple_positions(table, *positions)
        mut_counts = counts['M']
        unmut_counts = counts['R']
        states = product(*list([bases] * len(positions)))
        header = ['mut'] + ['base%d' % (i + 1)
                            for i in range(len(positions))] + ['count']

    combined = []
    for state in states:
        combined.append(['R'] + list(state) + [unmut_counts[state]])
        combined.append(['M'] + list(state) + [mut_counts[state]])

    counts_table = make_table(header=header, rows=combined)
    counts_table = counts_table.sorted(columns=header[:-1])
    return counts_table
Example #6
    def test_strandsym_table(self):
        """makes strand symmetric table"""
        data = [
            [1, "T", "T", "T", "T", "M", "TtoG"],
            [1, "G", "A", "A", "C", "M", "TtoG"],
            [1, "A", "G", "A", "A", "M", "TtoG"],
            [1, "G", "A", "A", "G", "M", "TtoG"],
            [1, "A", "C", "A", "A", "M", "TtoG"],
            [1, "G", "A", "C", "A", "M", "TtoG"],
        ]
        exp = []
        for row in self.data:
            n = row[:]
            n.append("+")
            exp.append(n)
        for row in data:
            seq = list(map(DNA.complement, row[1:-2]))
            seq.reverse()
            n = [row[0]] + seq + ["M", "AtoC"]
            n.append("-")
            exp.append(n)

        table = make_table(header=self.header, rows=self.data + data)
        r = make_strand_symmetric_table(table)
        self.assertEqual(r.tolist(), exp)
Example #7
def make_strand_symmetric_table(table):
    '''takes a combined counts table and returns a table with reverse
    complemented seqs

    Uses MUTATION_COMPLEMENTS'''

    new_data = []
    direction_index = table.header.index('direction')
    for plus, minus in list(MUTATION_COMPLEMENTS.items()):
        plus_table = table.filtered('direction=="%s"' % plus)
        plus_data = add_strand_column(plus_table.tolist(), '+')
        new_data.extend(plus_data)

        minus_table = table.filtered('direction=="%s"' % minus)
        if minus_table.shape[0] == 0:
            continue
        minus_table = _reverse_complement(minus_table)
        minus_data = minus_table.tolist()
        for row in minus_data:
            row[direction_index] = plus
        minus_data = add_strand_column(minus_data, '-')
        new_data.extend(minus_data)

    return make_table(header=table.header[:] + ['strand'], rows=new_data)
Example #8
    def test_str_md_format(self):
        """str() produces markdown table"""
        md_table = make_table(
            header=["a", "b"],
            data=[["val1", "val2"], ["has | symbol", "val4"]],
        )
        md = md_table.to_string(format="md")
        self.assertTrue(r"has \| symbol" in md)
Example #9
    def test_valid_setitem(self):
        """tabular_result accepts items of the correct type"""
        tr = tabular_result("null")
        tr["result"] = make_table(data={"A": [0, 1]})
        darr = DictArray({"A": [0, 1]})
        tr["result2"] = darr
        js = tr.to_json()
        self.assertIsInstance(js, str)
Example #10
    def test_load_mixed_static(self):
        """loading data with mixed-type columns leaves them as strings"""
        t = make_table(header=["A", "B"], data=[[1, 1], ["a", 2]])
        with TemporaryDirectory(".") as dirname:
            path = pathlib.Path(dirname) / "table.txt"
            t.write(str(path), sep="\t")
            # if static types, then mixed columns become strings
            r = load_table(path, sep="\t", static_column_types=True)
            self.assertTrue("str" in r.columns["A"].dtype.name)
Example #11
def load_table_from_delimited_file(path, sep='\t'):
    '''returns a Table object; faster than the generic loader for simple delimited files'''
    with open_(path, 'rt') as infile:
        header = infile.readline().strip().split(sep)
        count_index = header.index('count')
        records = []
        for line in infile:
            line = line.strip().split(sep)
            line[count_index] = int(line[count_index])
            records.append(line)
        table = make_table(header=header, rows=records)
    return table
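A quick usage sketch; the path is hypothetical and the file only needs a tab-delimited header that includes a 'count' column:

# hypothetical file written elsewhere with a 'count' column
table = load_table_from_delimited_file("counts.tsv", sep="\t")
assert "count" in table.header
print(table)  # values in the 'count' column were cast to int during loading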
Example #12
def missing_species_names(names):
    """returns a Table of missing species names, or None"""
    missing = []
    for name in names:
        n = Species.get_species_name(name)
        if n == "None":
            missing.append([name])

    if missing:
        result = make_table(header=["MISSING SPECIES"], data=missing)
    else:
        result = None
    return result
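A hedged usage sketch; the second name is deliberately bogus and Species is the registry imported by the surrounding module:

# "human" should resolve; the garbage name should not
result = missing_species_names(["human", "not a real species"])
if result is not None:
    print(result)  # one-column table headed "MISSING SPECIES"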
Example #13
    def test_reverse_complement(self):
        """reverse complements the position columns of a table"""
        table = make_table(header=self.header, rows=self.data)
        ex = [
            [1670, "A", "A", "A", "A", "M", "AtoC"],
            [557, "G", "T", "T", "C", "M", "AtoC"],
            [1479, "T", "T", "C", "T", "M", "AtoC"],
            [925, "C", "T", "T", "C", "M", "AtoC"],
            [1919, "T", "T", "G", "T", "M", "AtoC"],
            [442, "T", "G", "T", "C", "M", "AtoC"],
        ]
        got = _reverse_complement(table)
        raw_got = got.tolist()

        self.assertEqual(raw_got, ex)
Example #14
def _reverse_complement(table):
    '''returns a table with sequences reverse complemented'''
    pos_indices = [i for i, c in enumerate(
        table.header) if c.startswith('pos')]

    rows = table.tolist()
    for row in rows:
        # we use the cogent3 DnaSeq object to do reverse complementing
        seq = DNA.make_seq(''.join(row[i] for i in pos_indices))
        seq = list(seq.rc())
        for i, index in enumerate(pos_indices):
            row[index] = seq[i]
    if rows:
        new = make_table(header=table.header, rows=rows)
    else:
        new = None
    return new
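A hedged check of _reverse_complement, assuming the cogent3 DNA moltype used above and a table whose 'pos' columns each hold a single base:

from cogent3 import DNA, make_table

table = make_table(
    header=["count", "pos0", "pos1", "mut"],
    data=[[1, "A", "G", "M"]],
)
got = _reverse_complement(table)
print(got.tolist())  # [[1, 'C', 'T', 'M']], the positions reverse complemented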
Example #15
def get_grouped_combined_counts(table, position, group_label):
    """wraps motif_count.get_combined_counts for groups"""
    group_cats = table.distinct_values(group_label)
    all_data = []
    header = None
    for category in group_cats:
        subtable = table.filtered(lambda x: x == category, columns=group_label)
        counts = motif_count.get_combined_counts(subtable, position)
        if header is None:
            header = [group_label] + list(counts.header)

        counts = counts.with_new_column(group_label, lambda x: category,
                                        columns=counts.header[0])
        all_data.extend(counts.tolist(header))
    counts = make_table(header=header, rows=all_data)
    counts = counts.sorted(columns=[group_label, 'mut'])
    return counts
Example #16
    def test_deserialise_tabular_table(self):
        """correctly deserialises Table"""
        from cogent3 import make_table

        table = make_table(
            header=["id", "foo", "bar"],
            rows=[
                [1, "abc", 11],
                [2, "bca", 22],
                [3, "cab", 33],
                [4, "abc", 44],
                [5, "bca", 55],
            ],
        )
        json = table.to_json()
        got = deserialise_object(json)
        self.assertEqual(got.to_dict(), table.to_dict())
Example #17
    def test_summed(self):
        """test the table summed method"""
        t5 = Table(header=self.t5_header, rows=self.t5_rows)
        self.assertEqual(t5.summed(), [4, 4, 4, 4])
        self.assertEqual(t5.summed(col_sum=False), [4, 4, 8])
        t2 = Table(header=self.t2_header, rows=self.t2_rows)
        self.assertEqual(t2.summed(indices=2), 165)

        mix = make_table(header=["A", "B"], rows=[[0, ""], [1, 2], [3, 4]])
        self.assertEqual(mix.summed("B", strict=False), 6)
        self.assertEqual(mix.summed(0, col_sum=False, strict=False), 0)
        self.assertEqual(mix.summed(1, col_sum=False), 3)
        self.assertEqual(mix.summed(strict=False), [4, 6])
        self.assertEqual(mix.summed(col_sum=False, strict=False), [0, 3, 7])
        with self.assertRaises(RuntimeError):
            _ = mix.summed([0, 2], col_sum=False, strict=False)
        with self.assertRaises(TypeError):
            _ = mix.summed(strict=True)
Example #18
    def test_count_unique(self):
        """correctly computes unique values"""
        data = {
            "Project_Code": [
                "Ovary-AdenoCA",
                "Liver-HCC",
                "Panc-AdenoCA",
                "Panc-AdenoCA",
            ],
            "Donor_ID": ["DO46416", "DO45049", "DO51493", "DO32860"],
            "Variant_Classification": ["IGR", "Intron", "Intron", "Intron"],
        }
        table = make_table(data=data)
        co = table.count_unique(["Project_Code", "Variant_Classification"])
        self.assertEqual(co[("Panc-AdenoCA", "Intron")], 2)
        self.assertEqual(co[("Liver-HCC", "IGR")], 0)
        co = table.count_unique("Variant_Classification")
        self.assertEqual(co["Intron"], 3)
        self.assertEqual(co["IGR"], 1)
Example #19
def dump_genes(ensembl_account, species, outpath, coord_names, release, limit):
    """Dump meta data table for genes from one species in release ENSEMBL_ACCOUNT
    and exits."""
    ensembl_account = _get_account(ensembl_account)
    if len(species) > 1:
        msg = "dump_genes handles single species only"
        click.secho(msg, fg="red")
        sys.exit(-1)

    missing_species = missing_species_names(species)
    if missing_species:
        msg = [
            "The following species names don't match an Ensembl record. "
            "Check spelling!",
            str(missing_species),
            "\nAvailable species are at this server are:",
            str(display_available_dbs(ensembl_account)),
        ]

        click.secho("\n".join(msg), fg="red")
        sys.exit(-1)

    if coord_names:
        chroms = load_coord_names(coord_names)
    else:
        chroms = None

    genome = Genome(species[0], release=release, account=ensembl_account)
    genes = _get_ref_genes(genome, chroms, limit)
    records = []
    for g in genes:
        records.append([g.stableid, g.biotype, g.location, g.description])

    if records:
        table = make_table(
            header=["stableid", "biotype", "location", "description"], rows=records
        )
        table.write(outpath)
        click.secho("Wrote %d genes to %s" % (table.shape[0], outpath), fg="green")
    else:
        click.secho("No genes matching criteria", fg="blue")
Example #20
def status(configpath):
    """checks download/install status using checkpoint files and config"""
    release, remote_path, local_path, species_dbs = read_config(configpath)
    content = os.listdir(local_path)
    dbnames = reduce_dirnames(content, species_dbs)
    rows = []
    for db in dbnames:
        row = [
            db.name,
            is_downloaded(local_path, db.name),
            is_installed(local_path, db.name),
        ]
        rows.append(row)

    table = make_table(
        header=["dbname", "Downloaded", "Installed"],
        rows=rows,
        title="Status of download and install",
        legend=f"config={configpath.name}; local_path={local_path}",
    )
    print(table)
Example #21
def get_count_table(observed, control, k=None):
    """return table of motif counts

    Each motif position is a separate column. All possible DNA motifs of length
    k are included.

    Arguments:
        - observed: the observed counts as {seq: count}
        - control: the control counts as {seq: count}
        - k: size of the motif"""
    rows = []
    lengths = {len(motif) for motif in list(observed) + list(control)}
    if len(lengths) != 1:
        raise ValueError("Motifs not all same length: %s" % str(lengths))

    length = list(lengths)[0]
    if k and length != k:
        raise ValueError("k[%d] doesn't match motif length [%d]" % (k, length))
    elif k is None:
        k = length

    states = list(set(observed.keys()) | set(control.keys()))
    states.sort()
    for state in states:
        state = ''.join(state)
        control_counts = control[state]
        observed_counts = observed[state]
        if control_counts == observed_counts == 0:
            # we skip unobserved states
            continue

        rows.append([control_counts] + list(state) + ['R'])
        rows.append([observed_counts] + list(state) + ['M'])

    header = ['count'] + ["pos%d" % i for i in range(k)] + ['mut']
    table = make_table(header=header, rows=rows)
    return table
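A self-contained sketch of get_count_table's expected inputs; the single-position counts are invented:

observed = {"A": 5, "C": 0, "G": 2, "T": 1}  # mutated ('M') class
control = {"A": 4, "C": 1, "G": 2, "T": 3}   # reference ('R') class
tbl = get_count_table(observed, control, k=1)
print(tbl.header)  # ['count', 'pos0', 'mut']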
Example #22
def display_available_dbs(account, release=None):
    """displays the available Ensembl databases at the nominated host"""
    db_list = get_db_name(account=account, db_type="core", release=release)
    db_list += get_db_name(account=account, db_type="compara", release=release)
    rows = []
    for db_name in db_list:
        species_name = db_name.species
        # default common_name so rows without a species still render (as 'None')
        common_name = (
            Species.get_common_name(db_name.species, level="ignore")
            if species_name
            else None
        )
        if "compara" in db_name.name:
            species_name = common_name = "-"
        rows.append([db_name.release, db_name.name, species_name, common_name])

    table = make_table(
        header=["Release", "Db Name", "Species", "Common Name"], data=rows, space=2
    )
    table = table.sorted(["Release", "Db Name"])
    table.legend = (
        "Values of 'None' indicate cogent does not have a value for that database name."
    )
    return table
Example #23
def _parse_db_display(output, columns):
    """finds the table display and accumulates the content"""
    result = output.splitlines()
    header = []
    for index, line in enumerate(result):
        if not header and columns[0] in line:
            header = columns
            break

    if header:
        rows = []
        for i in range(index + 2, len(result)):
            line = result[i].strip()
            if line.startswith("----------"):
                break

            line = line.split()
            rows.append(line[:len(columns)])
        table = make_table(header=header, data=rows)
    else:
        table = None

    return table
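A small check for _parse_db_display using fabricated output in the layout the parser expects: a line containing the first column name, rows starting two lines later, and a dashed terminator:

output = "\n".join([
    "Release  Db Name",
    "=================",
    "97  homo_sapiens_core_97_38",
    "97  mus_musculus_core_97_38",
    "----------",
])
tbl = _parse_db_display(output, ["Release", "Db Name"])
print(tbl)  # two rows with columns 'Release' and 'Db Name'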
Example #24
def get_one2one_orthologs(
    compara, ref_genes, outpath, not_strict, force_overwrite, test
):
    """writes one-to-one orthologs of protein coding genes to outpath"""

    species = Counter(compara.species)
    written = 0
    records = []
    with click.progressbar(ref_genes, label="Finding 1to1 orthologs") as ids:
        for gene in ids:
            outfile_name = os.path.join(outpath, "%s.fa.gz" % gene)
            if os.path.exists(outfile_name) and not force_overwrite:
                written += 1
                continue

            syntenic = list(
                compara.get_related_genes(
                    stableid=gene, relationship="ortholog_one2one"
                )
            )

            if len(syntenic) != 1:
                continue

            syntenic = syntenic[0]

            if not not_strict and (
                syntenic is None or Counter(syntenic.get_species_set()) != species
            ):
                # skipping, not all species had a 1to1 ortholog for this gene
                continue

            seqs = []
            for m in syntenic.members:
                records.append([gene, m.stableid, m.location, m.description])
                name = Species.get_common_name(m.genome.species)
                cds = m.canonical_transcript.cds.trim_stop_codon(allow_partial=True)
                cds.name = name
                seqs.append([name, cds])

            seqs = make_unaligned_seqs(data=seqs)
            if test:
                print()
                print(gene)
                print(seqs.to_fasta())
            else:
                with gzip.open(outfile_name, "wt") as outfile:
                    outfile.write(seqs.to_fasta() + "\n")
                LOGGER.output_file(outfile_name)

            written += 1
    if test:
        msg = "Would have written %d files to %s" % (written, outpath)
    else:
        msg = "Wrote %d files to %s" % (written, outpath)

    click.echo(msg)

    if written > 0:
        metadata = make_table(
            header=["refid", "stableid", "location", "description"], rows=records
        )
        metadata.write(os.path.join(outpath, "metadata.tsv"))

    return
Example #25
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    args = locals()

    table = load_table(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')

    LOGGER.input_file(countsfile)
    # if there's a strand symmetry argument then we don't need a second file
    if strand_symmetry:
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)

    if not strand_symmetry:
        group_label = 'group'

        # be sure there's two files
        assert countsfile2, "must provide second counts file"
        counts_table2 = load_table(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        counts_table2 = counts_table2.with_new_column(
            'group', lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column('group',
                                              lambda x: '1',
                                              columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine
        header = ['group'] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

        if verbose:
            print(counts_table)

    # spectra table columns are [count, start, direction, group]
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        columns = [c for c in counts_table.header if c != 'start']
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        r = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            grp_labels = {'1': countsfile, '2': countsfile2}
            grp_index = list(collated.columns).index('group')
            for row in r:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        if p < 1e-6:
            prob = "%.2e" % p
        else:
            prob = "%.6f" % p

        for row in r:
            row.insert(0, start_base)
            row.append(prob)

        results += r

        significance = [
            "RE=%.6f" % total_re,
            "Dev=%.2f" % dev,
            "df=%d" % df,
            "p=%s" % p
        ]

        stats = "  :  ".join(significance)
        print("Start base=%s  %s" % (start_base, stats))
        saveable[start_base] = dict(rel_entropy=total_re,
                                    deviance=dev,
                                    df=df,
                                    prob=p,
                                    formula=formula,
                                    stats=collated.to_json())

    table = make_table(header=['start_base'] + list(collated.columns) +
                       ['prob'],
                       rows=results,
                       digits=5).sorted(columns='ret')
    json_path = None

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        LOGGER.log_message(str(significance), label="significance")
Example #26
    def test_to_plotly(self):
        """exercise producing a plotly table"""
        table = make_table(header=["a", "b"], data=[[0, 1]], index="a")
        drawable = table.to_plotly()
        self.assertIsInstance(drawable, Drawable)
        self._check_drawable_attrs(drawable.figure, "table")
Example #27
def single_group(counts_table, outpath, group_label, group_ref, positions,
                 plot_config, first_order, dry_run):
    # Collect statistical analysis results
    summary = []

    max_results = {}
    # Single position analysis
    print("Doing single position analysis")
    single_results = single_position_effects(counts_table, positions,
                                             group_label=group_label)
    summary += make_summary(single_results)

    max_results[1] = max(single_results[p]['rel_entropy']
                         for p in single_results)
    if not dry_run:
        outfilename = os.path.join(outpath, "1.json")
        util.dump_loglin_stats(single_results, outfilename)
        LOGGER.output_file(outfilename, label="analysis1")

    fig = get_single_position_fig(
        single_results, positions,
        plot_config.get('1-way plot', 'figsize'),
        group_label=group_label,
        group_ref=group_ref,
        figwidth=plot_config.get('1-way plot', 'figwidth'),
        xlabel_fontsize=plot_config.get('1-way plot',
                                        'xlabel_fontsize'),
        ylabel_fontsize=plot_config.get('1-way plot',
                                        'ylabel_fontsize'),
        xtick_fontsize=plot_config.get('1-way plot',
                                       'xtick_fontsize'),
        ytick_fontsize=plot_config.get('1-way plot',
                                       'ytick_fontsize'))

    format_offset(fig, int(plot_config.get('1-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "1.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
        fig.clf()  # refresh for next section

    if first_order:
        msg = "Done! Check %s for your results" % outpath
        summary = make_table(header=['Position', 'RE', 'Deviance', 'df',
                                     'prob', 'formula'],
                             rows=summary, digits=2, space=2)
        if not dry_run:
            outfilename = os.path.join(outpath, "summary.txt")
            summary.write(outfilename, sep='\t')
            LOGGER.output_file(outfilename, label="summary")

        return msg

    print("Doing two positions analysis")
    results = get_two_position_effects(counts_table, positions,
                                       group_label=group_label)
    summary += make_summary(results)

    max_results[2] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "2.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis2")

    fig = get_two_position_fig(results, positions,
                               plot_config.get('2-way plot', 'figsize'),
                               group_label=group_label, group_ref=group_ref,
                               xtick_fontsize=plot_config.get(
                                   '2-way plot', 'xtick_fontsize'),
                               ytick_fontsize=plot_config.get('2-way plot', 'ytick_fontsize'))
    fig.set_figwidth(plot_config.get('2-way plot', 'figwidth'))
    x_fsz = plot_config.get('2-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('2-way plot', 'ylabel_fontsize')
    fig.text(0.5, plot_config.get('2-way plot', 'xlabel_pad'), 'Position',
             ha='center', va='center', fontsize=x_fsz)
    fig.text(plot_config.get('2-way plot', 'ylabel_pad'), 0.5, 'RE',
             ha='center', va='center', rotation='vertical', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('2-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "2.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
        fig.clf()  # refresh for next section

    print("Doing three positions analysis")
    results = get_three_position_effects(counts_table, positions,
                                         group_label=group_label)
    summary += make_summary(results)

    max_results[3] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "3.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis3")

    fig = get_three_position_fig(results, positions,
                                 plot_config.get('3-way plot', 'figsize'),
                                 group_label=group_label, group_ref=group_ref,
                                 xtick_fontsize=plot_config.get(
                                     '3-way plot', 'xtick_fontsize'),
                                 ytick_fontsize=plot_config.get('3-way plot', 'ytick_fontsize'))
    fig.set_figwidth(plot_config.get('3-way plot', 'figwidth'))
    x_fsz = plot_config.get('3-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('3-way plot', 'ylabel_fontsize')
    fig.text(0.5, plot_config.get('3-way plot', 'xlabel_pad'),
             'Position', ha='center', va='center', fontsize=x_fsz)
    fig.text(plot_config.get('3-way plot', 'ylabel_pad'), 0.5, 'RE',
             ha='center', va='center', rotation='vertical', fontsize=y_fsz)
    format_offset(fig,
                  int(plot_config.get('3-way plot', 'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "3.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
        fig.clf()  # refresh for next section

    print("Doing four positions analysis")
    results = get_four_position_effects(counts_table, positions,
                                        group_label=group_label)
    summary += make_summary(results)

    max_results[4] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "4.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis4")

    fig = get_four_position_fig(results, positions,
                                plot_config.get('4-way plot', 'figsize'),
                                group_label=group_label, group_ref=group_ref)
    fig.set_figwidth(plot_config.get('4-way plot', 'figwidth'))
    ax = fig.gca()
    x_fsz = plot_config.get('4-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('4-way plot', 'ylabel_fontsize')
    ax.set_xlabel('Position', fontsize=x_fsz)
    ax.set_ylabel('RE', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('4-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "4.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
        fig.clf()  # refresh for next section

    # now generate summary plot
    bar_width = 0.5
    index = numpy.arange(4)
    y_lim = max(max_results.values())
    y_fmt = util.FixedOrderFormatter(numpy.floor(numpy.log10(y_lim)))

    fig = pyplot.figure(figsize=plot_config.get('summary plot', 'figsize'))
    ax = fig.gca()
    ax.yaxis.set_major_formatter(y_fmt)

    bar = pyplot.bar(index, [max_results[i] for i in range(1, 5)], bar_width)
    pyplot.xticks(index + (bar_width / 2.), list(range(1, 5)),
                  fontsize=plot_config.get('summary plot', 'xtick_fontsize'))
    x_sz = plot_config.get('summary plot', 'xlabel_fontsize')
    y_sz = plot_config.get('summary plot', 'ylabel_fontsize')
    ax.set_xlabel("Effect Order", fontsize=x_sz)
    ax.set_ylabel("RE$_{max}$", fontsize=y_sz)

    x_sz = plot_config.get('summary plot', 'xtick_fontsize')
    y_sz = plot_config.get('summary plot', 'ytick_fontsize')
    ax.tick_params(axis='x', labelsize=x_sz, pad=x_sz // 2, length=0)
    ax.tick_params(axis='y', labelsize=y_sz, pad=y_sz // 2)
    format_offset(fig, int(plot_config.get('summary plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.pdf")
        pyplot.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    summary = make_table(header=['Position', 'RE', 'Deviance', 'df',
                                 'prob', 'formula'],
                         rows=summary, digits=2, space=2)
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.txt")
        summary.write(outfilename, sep='\t')
        LOGGER.output_file(outfilename, label="summary")

    print(summary)
    pyplot.close('all')
    msg = "Done! Check %s for your results" % outpath
    return msg
Example #28
def nbr(countsfile, outpath, countsfile2, first_order, strand_symmetry,
        group_label, group_ref, plot_cfg, no_type3, format, verbose, dry_run):
    '''log-linear analysis of neighbouring base influence on point mutation

    Writes estimated statistics, figures and a run log to the specified
    directory outpath.

    See documentation for count table format requirements.
    '''
    if no_type3:
        util.exclude_type3_fonts()

    args = locals()

    outpath = util.abspath(outpath)

    if not dry_run:
        util.makedirs(outpath)
        runlog_path = os.path.join(outpath, "analysis.log")
        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')

    counts_filename = util.abspath(countsfile)
    counts_table = util.load_table_from_delimited_file(counts_filename,
                                                       sep='\t')

    LOGGER.input_file(counts_filename, label="countsfile1_path")

    positions = [c for c in counts_table.header if c.startswith('pos')]
    if not first_order and len(positions) != 4:
        raise ValueError("Requires four positions for analysis")

    group_label = group_label or None
    group_ref = group_ref or None
    if strand_symmetry:
        group_label = 'strand'
        group_ref = group_ref or '+'
        if group_label not in counts_table.header:
            print("ERROR: no column named 'strand', exiting.")
            exit(-1)

    if countsfile2:
        print("Performing 2 group analysis")
        group_label = group_label or 'group'
        group_ref = group_ref or '1'
        counts_table1 = counts_table.with_new_column(group_label,
                                                     lambda x: '1',
                                                     columns=counts_table.header[0])

        fn2 = util.abspath(countsfile2)
        counts_table2 = util.load_table_from_delimited_file(fn2, sep='\t')

        LOGGER.input_file(fn2, label="countsfile2_path")

        counts_table2 = counts_table2.with_new_column(group_label,
                                                      lambda x: '2',
                                                      columns=counts_table2.header[0])
        # now combine
        header = [group_label] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

        if not dry_run:
            outfile = os.path.join(outpath, 'group_counts_table.txt')
            counts_table.write(outfile, sep='\t')
            LOGGER.output_file(outfile, label="group_counts")

    if dry_run or verbose:
        print()
        print(counts_table)
        print()

    plot_config = util.get_plot_configs(cfg_path=plot_cfg)

    msg = single_group(counts_table, outpath, group_label, group_ref,
                       positions, plot_config, first_order,
                       dry_run)
    print(msg)
Example #29
def get_syntenic_alignments_introns(
    compara,
    ref_genes,
    outpath,
    method_clade_id,
    mask_features,
    outdir,
    force_overwrite,
    test,
):
    """writes Ensembl `method` syntenic alignments to ref_genes"""
    species = Counter(compara.species)
    common_names = list(map(Species.get_common_name, compara.species))
    filler = make_aligned_seqs(
        data=[(n, "N") for n in common_names], moltype=DNA, array_align=False
    )

    written = 0
    records = []
    with click.progressbar(ref_genes, label="Finding 1to1 intron orthologs") as ids:
        for gene_id in ids:
            valid_locations = True
            locations = {}
            gene = _get_gene_from_compara(compara, gene_id)
            if not gene:
                LOGGER.log_message("stableid '%s' not found" % gene_id)
                continue

            if gene.canonical_transcript.introns is None:
                LOGGER.log_message("stableid '%s' has no introns" % gene_id)
                continue

            outfile_name = os.path.join(outpath, "%s.fa.gz" % gene.stableid)
            if os.path.exists(outfile_name) and not force_overwrite:
                written += 1
                continue

            regions = list(
                compara.get_syntenic_regions(
                    region=gene.canonical_transcript,
                    method_clade_id=str(method_clade_id),
                )
            )
            alignments = []
            for index, region in enumerate(regions):
                if region is None:
                    msg = "stableid '%s' has no syntenic regions" % gene_id
                    LOGGER.log_message(msg)
                    continue

                try:
                    got = Counter(region.get_species_set())
                except (AttributeError, AssertionError):
                    got = None
                    # this is a PyCogent bug
                    error = sys.exc_info()
                    err_type = str(error[0]).split(".")[-1][:-2]
                    err_msg = str(error[1])
                    msg = "gene_stable_id=%s; err_type=%s; msg=%s" % (
                        gene.stableid,
                        err_type,
                        err_msg,
                    )
                    click.secho("ERROR:" + msg, fg="red")
                    LOGGER.log_message(msg, label="ERROR")
                    continue

                if got != species:
                    msg = [
                        "stableid '%s'" % gene_id,
                        "species set %s" % got,
                        "does not match expected %s" % species,
                    ]
                    LOGGER.log_message(" ".join(msg))
                    continue

                if mask_features:
                    aln = region.get_alignment(feature_types=["gene", "repeat", "cpg"])
                    aln = with_masked_features(aln, reverse=gene.location.strand == -1)
                else:
                    aln = region.get_alignment()

                if aln is None:
                    msg = "stableid '%s' has no syntenic alignment" % gene_id
                    LOGGER.log_message(msg)
                    continue

                aln = renamed_seqs(aln)
                if aln is not None:
                    alignments.append(aln)

                for m in region.members:
                    if m.location is None:
                        valid_locations = False
                        break

                    if m.genome.species not in locations:
                        union = m.location
                    else:
                        try:
                            union = locations[m.genome.species].union(m.location)
                        except AttributeError:
                            raise AttributeError("%s" % str([gene_id, m.genome]))

                    if union is None:
                        valid_locations = False
                        break

                    locations[m.genome.species] = union

            if not alignments:
                msg = "stableid '%s' has no alignments" % gene_id
                LOGGER.log_message(msg)
                continue

            if not valid_locations:
                msg = [
                    "stableid '%s' has" % gene_id,
                    "inconsistent location data for gene",
                    "based syntenic block %s" % locations,
                ]
                LOGGER.log_message(" ".join(msg), label="WARN")
                continue

            assert len(locations) == len(species), locations
            for sp, loc in locations.items():
                records.append([gene_id, loc])

            # we put a column of Ns between syntenic regions so that subsequent
            # sampling for tuple aligned columns does not construct artificial
            # motifs
            align = None
            for aln in alignments:
                if align is None:
                    align = aln
                    continue

                align += filler + aln

            if test:
                print(repr(align))
            else:
                with gzip.open(outfile_name, "wt") as outfile:
                    outfile.write(align.to_fasta())
                LOGGER.output_file(outfile_name)

            written += 1

    click.secho("Wrote %d files to %s" % (written, outpath), fg="green")
    if written > 0:
        metadata = make_table(header=["refid", "location"], rows=records)
        metadata.write(os.path.join(outpath, "metadata.tsv"))

    return