def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Correlate every slice with the nearest-position slice of each other embryo.

    Column names of *data* are split on ``'_sl'``: the text before it names
    the embryo, and the first two characters after it are parsed as the
    integer slice number.  Each column gets a relative position equal to its
    slice number divided by the largest slice number seen for its embryo.
    For every ordered pair of distinct embryos, each column of the first
    embryo is correlated with the column of the second embryo whose relative
    position is closest (ties are broken by column name).

    Parameters
    ----------
    data : pandas.DataFrame
        One column per slice, named ``<embryo>_sl<NN>...``.
    adjust : callable
        Applied element-wise (via ``applymap``) to each embryo's columns
        before correlating.  It is now applied once per embryo rather than
        once per embryo pair, so it should be a pure function.
    corr_func : str or callable
        Forwarded to ``Series.corr`` as the correlation method.

    Returns
    -------
    list
        ``[corrs_diff, corrs_same]``: two ``defaultdict(list)`` objects keyed
        by the genotype of the first embryo (the text before the first
        ``'_'`` of its name).  Index 1 collects correlations against embryos
        with the same genotype, index 0 against all others.

    Raises
    ------
    IndexError
        If a column name does not contain ``'_sl'``.
    ValueError
        If the two characters after ``'_sl'`` are not an integer.
    """
    def _parse(colname):
        # (embryo name, slice number) for one column label.
        parts = colname.split('_sl')
        return parts[0], int(parts[1][:2])

    parsed = [_parse(colname) for colname in data.columns]

    max_slice = defaultdict(int)
    for emb, slice_num in parsed:
        max_slice[emb] = max(max_slice[emb], slice_num)

    # Relative position of every slice within its own embryo.
    xs = pd.Series(index=data.columns,
                   data=[slice_num / max_slice[emb]
                         for emb, slice_num in parsed])

    # Per-embryo work does not depend on the partner embryo, so do it once
    # here instead of once per pair inside the nested loop.
    adjusted = {
        emb_name: data.select(**sel_startswith(emb_name)).applymap(adjust)
        for emb_name in max_slice
    }
    positions = {
        emb_name: xs.select(startswith(emb_name))
        for emb_name in max_slice
    }

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # Indexed with a bool below: False -> corrs_diff, True -> corrs_same.
    all_corrs = [corrs_diff, corrs_same]
    for emb1_name in pb()(max_slice):
        emb1 = adjusted[emb1_name]
        genotype = emb1_name.split('_')[0]
        xs1 = positions[emb1_name]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2 = adjusted[emb2_name]
            xs2 = positions[emb2_name]
            same = genotype == emb2_name.split('_')[0]
            # Column of emb2 whose relative position is nearest to each
            # column of emb1.
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            for col in emb1.columns:
                # .loc replaces the removed .ix indexer; selection here is
                # purely label-based.
                all_corrs[same][genotype].append(emb1.loc[:, col].corr(
                    emb2.loc[:, closest[col]],
                    corr_func,
                ))
    return all_corrs
def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Correlate every slice with the nearest-position slice of each other embryo.

    Column names of *data* are split on ``'_sl'``: the text before it names
    the embryo, and the first two characters after it are parsed as the
    integer slice number.  Each column gets a relative position equal to its
    slice number divided by the largest slice number seen for its embryo.
    For every ordered pair of distinct embryos, each column of the first
    embryo is correlated with the column of the second embryo whose relative
    position is closest (ties are broken by column name).

    Parameters
    ----------
    data : pandas.DataFrame
        One column per slice, named ``<embryo>_sl<NN>...``.
    adjust : callable
        Applied element-wise (via ``applymap``) to each embryo's columns
        before correlating.  It is now applied once per embryo rather than
        once per embryo pair, so it should be a pure function.
    corr_func : str or callable
        Forwarded to ``Series.corr`` as the correlation method.

    Returns
    -------
    list
        ``[corrs_diff, corrs_same]``: two ``defaultdict(list)`` objects keyed
        by the genotype of the first embryo (the text before the first
        ``'_'`` of its name).  Index 1 collects correlations against embryos
        with the same genotype, index 0 against all others.

    Raises
    ------
    IndexError
        If a column name does not contain ``'_sl'``.
    ValueError
        If the two characters after ``'_sl'`` are not an integer.
    """
    def _parse(colname):
        # (embryo name, slice number) for one column label.
        parts = colname.split('_sl')
        return parts[0], int(parts[1][:2])

    parsed = [_parse(colname) for colname in data.columns]

    max_slice = defaultdict(int)
    for emb, slice_num in parsed:
        max_slice[emb] = max(max_slice[emb], slice_num)

    # Relative position of every slice within its own embryo.
    xs = pd.Series(index=data.columns,
                   data=[slice_num / max_slice[emb]
                         for emb, slice_num in parsed])

    # Per-embryo work does not depend on the partner embryo, so do it once
    # here instead of once per pair inside the nested loop.
    adjusted = {
        emb_name: data.select(**sel_startswith(emb_name)).applymap(adjust)
        for emb_name in max_slice
    }
    positions = {
        emb_name: xs.select(startswith(emb_name))
        for emb_name in max_slice
    }

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # Indexed with a bool below: False -> corrs_diff, True -> corrs_same.
    all_corrs = [corrs_diff, corrs_same]
    for emb1_name in pb()(max_slice):
        emb1 = adjusted[emb1_name]
        genotype = emb1_name.split('_')[0]
        xs1 = positions[emb1_name]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2 = adjusted[emb2_name]
            xs2 = positions[emb2_name]
            same = genotype == emb2_name.split('_')[0]
            # Column of emb2 whose relative position is nearest to each
            # column of emb1.
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            for col in emb1.columns:
                # .loc replaces the removed .ix indexer; selection here is
                # purely label-based.
                all_corrs[same][genotype].append(emb1.loc[:, col].corr(
                    emb2.loc[:, closest[col]],
                    corr_func,
                ))
    return all_corrs
Ejemplo n.º 3
0
def get_randomized_scores(ase):
    """Compute per-row quantile cut-offs after randomly flipping signs in *ase*.

    Every entry of *ase* is multiplied by an independently drawn +1 or -1.
    The flipped table is split into the columns selected by the
    ``'melXsim'`` and ``'simXmel'`` prefixes, and per-row quantiles are
    taken within each group at ``FRAC_FOR_MATERNAL`` and
    ``1 - FRAC_FOR_MATERNAL``.

    Parameters
    ----------
    ase : pandas.DataFrame
        Table whose column names start with ``'melXsim'`` or ``'simXmel'``.

    Returns
    -------
    tuple
        ``(weaker_mel_bias, weaker_sim_bias)``.  ``weaker_mel_bias`` is the
        element-wise maximum of the two groups' ``FRAC_FOR_MATERNAL``
        quantiles; ``weaker_sim_bias`` is the element-wise minimum of their
        ``1 - FRAC_FOR_MATERNAL`` quantiles.  Each has one value per row of
        *ase*.
    """
    # Size the sign mask from the argument itself.  The previous code read
    # the shape of a global ``ase_expr``, which only worked when that global
    # existed and happened to match *ase*.
    rand = np.random.randint(2, size=ase.shape) * 2 - 1
    flipped = ase * rand
    melXsim = flipped.select(**sel_startswith('melXsim'))
    simXmel = flipped.select(**sel_startswith('simXmel'))
    # Transposing makes ``quantile`` run across samples, one value per row.
    weaker_sim_bias = np.min([melXsim.T.quantile(1 - FRAC_FOR_MATERNAL),
                              simXmel.T.quantile(1 - FRAC_FOR_MATERNAL)],
                             axis=0)
    weaker_mel_bias = np.max([melXsim.T.quantile(FRAC_FOR_MATERNAL),
                              simXmel.T.quantile(FRAC_FOR_MATERNAL)],
                             axis=0)
    return (weaker_mel_bias, weaker_sim_bias)
Ejemplo n.º 4
0
def plot_expr_comparison(expr, gene, prefix=None, smoothed=0):
    """Write an SVG heatmap comparing one gene's expression across crosses.

    The row for *gene* is pulled from three column groups of *expr*: columns
    starting with ``'melXmel_'``, columns starting with ``'simXsim_'``, and
    columns starting with ``'melXsim'`` or ``'simXmel'``.  The three rows are
    passed to ``pu.svg_heatmap`` after a leading ``None`` entry, and the
    figure is written to
    ``analysis_godot/results/spatial_diffs/<gene>.svg``.

    Parameters
    ----------
    expr : pandas.DataFrame
        Expression table indexed by gene.
    gene : str
        Row label to plot; also used in the output file name.
    prefix : optional
        Accepted for interface compatibility; currently unused.
    smoothed : int
        If non-zero, window size of a centred rolling mean applied to each
        of the three rows before plotting.

    Returns
    -------
    None
    """
    mel = expr.select(**sel_startswith('melXmel_')).loc[gene]
    sim = expr.select(**sel_startswith('simXsim_')).loc[gene]
    hyb = expr.select(**sel_startswith(('melXsim', 'simXmel'))).loc[gene]

    if smoothed:
        def _smooth(values):
            # ``pd.rolling_mean`` has been removed from pandas; the
            # ``.rolling(...).mean()`` form is its direct replacement.
            return values.rolling(smoothed, min_periods=1,
                                  center=True).mean()

        mel = _smooth(mel)
        sim = _smooth(sim)
        hyb = _smooth(hyb)
    pu.svg_heatmap((None, mel, sim, hyb),
                   'analysis_godot/results/spatial_diffs/{}.svg'.format(gene),
                   cmap=(gene, cm.Reds, cm.Blues, pu.ISH),
                   norm_rows_by=tuple([gene] + ['maxall'] * 7),
                   **pu_kwargs)
Ejemplo n.º 5
0
def get_randomized_scores(ase):
    """Compute per-row quantile cut-offs after randomly flipping signs in *ase*.

    Every entry of *ase* is multiplied by an independently drawn +1 or -1.
    The flipped table is split into the columns selected by the
    ``'melXsim'`` and ``'simXmel'`` prefixes, and per-row quantiles are
    taken within each group at ``FRAC_FOR_MATERNAL`` and
    ``1 - FRAC_FOR_MATERNAL``.

    Parameters
    ----------
    ase : pandas.DataFrame
        Table whose column names start with ``'melXsim'`` or ``'simXmel'``.

    Returns
    -------
    tuple
        ``(weaker_mel_bias, weaker_sim_bias)``.  ``weaker_mel_bias`` is the
        element-wise maximum of the two groups' ``FRAC_FOR_MATERNAL``
        quantiles; ``weaker_sim_bias`` is the element-wise minimum of their
        ``1 - FRAC_FOR_MATERNAL`` quantiles.  Each has one value per row of
        *ase*.
    """
    # Size the sign mask from the argument itself.  The previous code read
    # the shape of a global ``ase_expr``, which only worked when that global
    # existed and happened to match *ase*.
    rand = np.random.randint(2, size=ase.shape) * 2 - 1
    flipped = ase * rand
    melXsim = flipped.select(**sel_startswith('melXsim'))
    simXmel = flipped.select(**sel_startswith('simXmel'))
    # Transposing makes ``quantile`` run across samples, one value per row.
    weaker_sim_bias = np.min([
        melXsim.T.quantile(1 - FRAC_FOR_MATERNAL),
        simXmel.T.quantile(1 - FRAC_FOR_MATERNAL),
    ], axis=0)
    weaker_mel_bias = np.max([
        melXsim.T.quantile(FRAC_FOR_MATERNAL),
        simXmel.T.quantile(FRAC_FOR_MATERNAL),
    ], axis=0)
    return (weaker_mel_bias, weaker_sim_bias)
Ejemplo n.º 6
0
def plot_expr_comparison(expr, gene, prefix=None, smoothed=0):
    """Write an SVG heatmap comparing one gene's expression across crosses.

    The row for *gene* is pulled from three column groups of *expr*: columns
    starting with ``'melXmel_'``, columns starting with ``'simXsim_'``, and
    columns starting with ``'melXsim'`` or ``'simXmel'``.  The three rows are
    passed to ``pu.svg_heatmap`` after a leading ``None`` entry, and the
    figure is written to
    ``analysis_godot/results/spatial_diffs/<gene>.svg``.

    Parameters
    ----------
    expr : pandas.DataFrame
        Expression table indexed by gene.
    gene : str
        Row label to plot; also used in the output file name.
    prefix : optional
        Accepted for interface compatibility; currently unused.
    smoothed : int
        If non-zero, window size of a centred rolling mean applied to each
        of the three rows before plotting.

    Returns
    -------
    None
    """
    mel = expr.select(**sel_startswith('melXmel_')).loc[gene]
    sim = expr.select(**sel_startswith('simXsim_')).loc[gene]
    hyb = expr.select(**sel_startswith(('melXsim', 'simXmel'))).loc[gene]

    if smoothed:
        def _smooth(values):
            # ``pd.rolling_mean`` has been removed from pandas; the
            # ``.rolling(...).mean()`` form is its direct replacement.
            return values.rolling(smoothed, min_periods=1,
                                  center=True).mean()

        mel = _smooth(mel)
        sim = _smooth(sim)
        hyb = _smooth(hyb)
    pu.svg_heatmap(
        (None, mel, sim, hyb),
        'analysis_godot/results/spatial_diffs/{}.svg'.format(gene),
        cmap=(gene, cm.Reds, cm.Blues, pu.ISH),
        norm_rows_by=tuple([gene] + ['maxall'] * 7),
        **pu_kwargs
    )
Ejemplo n.º 7
0
    parser.add_argument('--fit-ymax', default=1, type=float)
    args = parser.parse_args()
    args.male_samples = tuple(args.male_samples)
    return args


# Script entry point: load the expression and ASE tables named on the command
# line, build a column-shuffled copy of the ASE table, then filter rows.
if __name__ == "__main__":
    from Utils import fbgns

    args = parse_args()
    # Expression table; a column literally named '---' is dropped if present.
    expr = pd.read_table(args.expression_file,
                         **pd_kwargs).drop('---', axis=1, errors='ignore')
    # ASE table: drop all-NaN columns and rows, drop the rows listed in
    # args.overlapping_genes (when present), then keep the columns selected
    # by the 'melXsim' / 'simXmel' prefixes.
    ase = (pd.read_table(args.ase_file, **pd_kwargs).dropna(
        how='all', axis=1).dropna(how='all', axis=0).drop(
            args.overlapping_genes,
            errors='ignore').select(**sel_startswith(('melXsim', 'simXmel'))))
    # Same values with the column order shuffled (np.random.permutation
    # shuffles along the first axis of ase.T); the original row and column
    # labels are re-attached unchanged.
    ase_perm = pd.DataFrame(
        data=np.random.permutation(ase.T).T,
        index=ase.index,
        columns=ase.columns,
    )
    chrom_of = get_chroms()

    # Unless 'keep' is among the male sample names, blank out rows on
    # chromosome X in the columns whose names start with a male sample prefix.
    if args.male_samples and 'keep' not in args.male_samples:
        on_x = chrom_of[ase.index] == 'X'
        is_male = [col.startswith(args.male_samples) for col in ase.columns]
        ase.ix[on_x, is_male] = np.nan
    # Keep rows with at least args.min_samples non-NaN values.
    ase = ase.ix[ase.T.count() >= args.min_samples]
    # Optionally keep only rows whose variance reaches args.min_var.
    if args.min_var:
        ase = ase.ix[ase.T.var() >= args.min_var]
    # Restrict the shuffled table to the rows that survived filtering.
    ase_perm = ase_perm.ix[ase.index]
Ejemplo n.º 8
0
    ase_cdt = ase.ix[cdt.index]
    exp_cdt = expr.ix[cdt.index]

    columns = (
        'melXsim',
        'simXmel',
    )

    ranges = {
        'meldominant': ('FBgn0034816', 'FBgn0250755'),
        'simdominant': ('FBgn0004087', 'FBgn0038934'),
    }

    if 'sparse' in argv[1]:
        pu.svg_heatmap(
            data=exp_cdt.select(**sel_startswith(columns)),
            filename='analysis/results/all_sparse.svg',
            norm_rows_by='max',
            progress_bar=True,
            col_sep='_sl',
            total_width=120,
            box_height=1,
            split_columns=True,
            draw_box=True,
            draw_row_labels=False,
            draw_name=True,
            cmap_by_prefix=cmap_by_prefix,
            make_hyperlinks=True,
            convert=True,
        )
        from sys import exit
Ejemplo n.º 9
0
    make_hyperlinks=True,
    convert=True,
    vspacer=0,
    max_width=200,
    cmap=cm.RdBu,
)

if __name__ == "__main__":
    ase = (pd.read_table(
        'analysis_godot/ase_summary_by_read.tsv',
        index_col=0,
        keep_default_na=False,
        na_values=['---'],
    ).dropna(how='all',
             axis=1).dropna(how='all',
                            axis=0).select(**sel_startswith(('melXsim',
                                                             'simXmel'))))
    paris = pd.read_table(
        'prereqs/GSE68062_Gene_CLASS_after_FPKM_normalization.txt',
        index_col=1)['mel.CLASS']
    pzyg = paris[paris == 'zyg']

    melXsim = ase.select(**sel_startswith('melXsim')).select(pzyg.__contains__)
    simXmel = ase.select(**sel_startswith('simXmel')).select(pzyg.__contains__)

    fbgns = pd.read_table('prereqs/gene_map_table_fb_2016_01.tsv',
                          index_col=1,
                          skiprows=5).ix[:, 0]

    max_slice = defaultdict(int)
    for sl in ase.columns:
        sl = sl.split('_sl')
Ejemplo n.º 10
0
                   startswith, get_chroms)
import pandas as pd
import numpy as np
import PlotUtils as pu
from FitASEFuncs import (logistic, peak, fit_all_ase,
                         calculate_variance_explained)

if __name__ == "__main__":
    expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).dropna(how='all', axis=1)
    ase = (pd
           .read_table('analysis_godot/ase_summary_by_read.tsv',
                       **pd_kwargs
                       )
           .dropna(how='all', axis=1)
           .dropna(how='all', axis=0)
           .select(**sel_startswith(('melXsim', 'simXmel')))
          )
    ase_limited = ase.select(**sel_startswith('melXsim'))
    chrom_of = get_chroms()

    males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
    on_x = chrom_of[ase.index] == 'X'
    is_male = [col.startswith(males) for col in ase.columns]
    ase.ix[on_x, is_male] = np.nan


    xs = get_xs(ase)
    xs_ltd = get_xs(ase_limited)
    colnames = ['Amp', 'width', 'center', 'y_offset']
    recalc_ase = locals().get('recalc_ase', True)
    if recalc_ase:
Ejemplo n.º 11
0
    return (weaker_mel_bias, weaker_sim_bias)


# Script entry point: (re)load the expression and ASE tables if needed, then
# split them by cross and compute boolean "above EXPR_MIN" masks.
if __name__ == "__main__":
    # At module level locals() is the module namespace, so these pick up
    # `ase` / `expr` if they are already defined there (presumably from an
    # earlier interactive run -- confirm).
    ase = locals().get('ase', None)
    expr = locals().get('expr', None)
    # Reload when either table is missing or their row indices differ.
    if ase is None or expr is None or not np.all(ase.index == expr.index):
        print("reloading files")
        expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).dropna(how='all', axis=1)
        # Drop all-NaN columns and rows, then keep the columns selected by
        # the 'melXsim' / 'simXmel' prefixes.
        ase = (pd
               .read_table('analysis_godot/ase_summary_by_read.tsv',
                           **pd_kwargs
                           )
               .dropna(how='all', axis=1)
               .dropna(how='all', axis=0)
               .select(**sel_startswith(('melXsim', 'simXmel')))
              )
        chrom_of = get_chroms()

        # Blank out rows on chromosome X in the columns whose names start
        # with one of the `males` prefixes.
        males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
        # Genes missing from chrom_of are treated as not on X.
        on_x = [chrom_of[gene] == 'X' if gene in chrom_of else False for gene in ase.index]
        is_male = [col.startswith(males) for col in ase.columns]
        ase.ix[on_x, is_male] = np.nan

    # Per-cross views of both tables.
    melXsim_expr = expr.select(**sel_startswith('melXsim'))
    simXmel_expr = expr.select(**sel_startswith('simXmel'))
    melXsim_ase = ase.select(**sel_startswith('melXsim'))
    simXmel_ase = ase.select(**sel_startswith('simXmel'))
    # Element-wise masks of entries strictly greater than EXPR_MIN.
    melXsim_is_expr = (melXsim_expr > EXPR_MIN)
    simXmel_is_expr = (simXmel_expr > EXPR_MIN)
    all_is_expr = expr > EXPR_MIN
Ejemplo n.º 12
0
    return r2


# NOTE(review): presumably the minimum expression level for a gene to count
# as expressed -- it is not used in the visible part of this script; confirm.
EXPR_MIN = 10
# Script entry point: load the WASP-filtered expression and ASE tables plus
# mapping statistics, and prepare the chromosome-X / male-sample masks.
if __name__ == "__main__":
    print("Reading data")
    # Expression table with all-NaN rows and columns removed.
    expr = (pd.read_table('analysis_godot/summary_wasp.tsv',
                          **pd_kwargs).dropna(how='all',
                                              axis=0).dropna(how='all',
                                                             axis=1))
    # If the first row label starts with 'FBgn', relabel the rows by looking
    # the index up in `fbgns`.
    if expr.index[0].startswith('FBgn'):
        expr.index = fbgns[expr.index]
    # ASE table: keep the columns selected by the 'melXsim' / 'simXmel'
    # prefixes, drop all-NaN rows, and strip everything from '_ase' onward
    # in each column name.
    ase = (
        pd.read_table('analysis_godot/ase_summary_by_read_with_wasp.tsv',
                      **pd_kwargs).select(
                          **sel_startswith(('melXsim', 'simXmel'))).dropna(
                              how='all',
                              axis=0).rename_axis(lambda x: x.split('_ase')[0],
                                                  axis=1)
        #.replace(pd.np.nan, 0)
    )
    # Reindex the ASE rows to match the expression table's rows.
    ase = ase.ix[expr.index]

    read_counts = pd.read_table('analysis_godot/map_stats.tsv',
                                index_col='LongName')

    chrom_of = get_chroms()

    # Boolean masks: rows on chromosome X, and columns whose names start with
    # one of the `males` prefixes.
    males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
    on_x = chrom_of[ase.index] == 'X'
    is_male = [col.startswith(males) for col in ase.columns]
Ejemplo n.º 13
0
def svg_heatmap(data, filename, row_labels=None, box_size=4,
                index=None,
                all_indices=None, all_colnames=None, internal_datanames=None,
                cmap=ISH, norm_rows_by=None, draw_row_labels=False,
                color_row_labels=False,
                col_sep='', box_height=None, total_width=None,
                draw_box=False, draw_name=False, data_names=None,
                make_hyperlinks = False,
                progress_bar = False,
                max_width=np.inf,
                x_min=10, y_min=10,
                spacers=None,
                convert=False,
                squeeze_rows=None,
                cmap_by_prefix=None,
                draw_average=False,
                draw_average_only=False,
                average_scale=1,
                split_columns=False,
                vspacer=30,
                hatch_nan=True, hatch_size=20,
                figure_title=None,
                nan_replace=None,
                first_col='', last_col=''):
    """
    Draw heatmap as an SVG file stored in filename

    *data* can be either a 2D array-like type (list of lists, numpy array,
    pandas DataFrame, etc), or a tuple of 2D array-likes, in which case a
    separator will be added between each one in the output

    *cmap* is a matplotlib-like colormap (i.e. a callable that expects floats
    in the range 0.0-1.0.), or an iterable of the same length as the tuple
    *data* containing colormaps

    *row_labels* can be supplied, otherwise they will detected from the first
    item in *data*, if available, and if not they will be blank.

    If *total_width* is supplied, width of each dataset in *data* will be
    scaled to that constant. If *box_height* is supplied, the height of each
    row will be *box_height*, otherwise it will be equal to the width of each
    element. If neither are supplied, elements will be squares equal to
    *box_size*. IT IS STRONGLY RECOMMENDED that if if supplying *total_width*,
    *box_height* also be specified, but this is not enforced.

    *draw_row_labels*, if True, will label the rows on the right hand side. As
    of 2013/09/03, this won't scale the SVG properly, so including the
    resulting file in an html element won't display properly.

    *spacers* is the distance between adjacent datasets.  Can either be a
    number, in which case it will apply to all datasets, or an interable for
    different distances. If the iterable is shorter than the number of
    datasets, the last value will be repeated.

    """
    import svgwrite as svg
    try:
        import pandas as pd
        has_pandas = True
    except:
        has_pandas = False
        assert all_indices
        assert all_colnames

    if not isinstance(data, tuple):
        data = (data,)

    if not isinstance(norm_rows_by, tuple):
        norm_rows_by = repeat(norm_rows_by)


    old_data = data
    colname_tuple = repeat(None)
    if split_columns and has_pandas:
        from Utils import sel_startswith
        data = []
        new_normers = []
        new_cmaps = []
        if isinstance(cmap, tuple):
            cmaps = cmap
        else:
            cmaps = repeat(cmap)
        for dataset, normer, c_cmap in zip(old_data, norm_rows_by, cmaps):
            if dataset is None:
                data.append(dataset)
                new_normers.append(normer)
                new_cmaps.append(c_cmap)
                continue

            if not isinstance(dataset, pd.DataFrame):
                dataset = pd.DataFrame(dataset).T
            colnames = list(sorted(
                {col.split(col_sep)[0] for col in dataset.columns}))
            data.extend(
                dataset.select(**sel_startswith(colname)) for colname in colnames
            )
            new_normers.extend(normer for colname in colnames)
            new_cmaps.extend(c_cmap for colname in colnames)
        data = tuple(data)
        norm_rows_by = tuple(new_normers)
        cmap = tuple(new_cmaps)
    elif split_columns and all_colnames:
        colnames = list(sorted(
            {col.split(col_sep)[0] for col in all_colnames}))
        colname = colnames[0]
        data = tuple([
            data[:, array([c.startswith(colname) for c in internal_datanames])]
            for colname in colnames
        ])
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(colname)]
            for colname in colnames
        )
    elif not split_columns and all_colnames:
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(dataname)]
            for dataname in internal_datanames
        )

    rows, cols = np.shape([ds for ds in data if ds is not None][0])
    if index is not None:
        rows = len(index)
    if box_height is None:
        box_height = box_size

    if row_labels is None:
        if index is not None:
            row_labels = list(index)
        elif hasattr(data[0], 'index'):
            row_labels = list(data[0].index)
        else:
            row_labels = ['' for row in range(rows)]

    if total_width is not None and max_width is not np.inf:
        boxes_per_row = max_width // (1.1 * total_width)
        if ((boxes_per_row + 1) * 1.1 * total_width - .1 * total_width
            < max_width):
            boxes_per_row += 1

        num_plotted_rows = np.ceil(len(data) / boxes_per_row
                                   + (draw_average or draw_average_only))
        if figure_title is None:
            fig_title_height = 0
        elif isinstance(figure_title, tuple):
            fig_title_height = len(figure_title)
        else:
            fig_title_height = 1
        dwg = svg.Drawing(filename,
                          size=(max_width + 2 * x_min + 200 * draw_row_labels,
                                2 * y_min
                                + (num_plotted_rows
                                   * (rows)
                                   * box_height)
                                + 80 * (fig_title_height)
                                + 80 * draw_name
                                + (num_plotted_rows - 1) * vspacer),
                         )
    elif total_width is not None:
        width = len(data) * total_width * 1.1 - .1 * total_width
        height = rows * box_height
        max_row_label_len = max(len(str(i)) for i in row_labels)
        dwg = svg.Drawing(filename,
                          size=(width + 2 * x_min + 20 * draw_row_labels *
                                max_row_label_len,
                                height + 2 * y_min + 80 * draw_name
                                + (80 * (figure_title is not None)))
                         )
    else:
        dwg = svg.Drawing(filename)
    dwg.add(svg.base.Title(path.basename(filename)))

    pat = dwg.pattern(id='hatch', insert=(0, 0), size=(hatch_size, hatch_size),
                      patternUnits='userSpaceOnUse')
    g = pat.add(dwg.g(style="fill:none; stroke:#B0B0B0; stroke-width:1"))
    g.add(dwg.path(('M0,0', 'l{hatch},{hatch}'.format(hatch=hatch_size))))
    g.add(dwg.path(('M{hatch2},0 l{hatch2},{hatch2}'.format(hatch2=hatch_size/2).split())))
    g.add(dwg.path(('M0,{hatch2} l{hatch2},{hatch2}'.format(hatch2=hatch_size/2).split())))

    dwg.add(pat)


    if box_height is None:
        box_height = box_size

    if not hasattr(cmap, "__len__"):
        cmap = [cmap for frame in data]

    if data_names is None:
        data_names = ["" for frame in data]

    if len(cmap) != len(data):
        raise ValueError("cmap and data should be the same length ({} vs {})"
                        .format(len(cmap), len(data)))

    if not hasattr(spacers, "__len__"):
        spacers = [spacers]
    else:
        spacers = list(spacers)
    while len(spacers) < len(data):
        spacers.append(spacers[-1])

    if ((isinstance(norm_rows_by, repeat)
         and isinstance(next(norm_rows_by), str)
         and next(norm_rows_by).startswith('center0all'))
        or (not isinstance(norm_rows_by, repeat)
            and isinstance(norm_rows_by[0], str)
            and np.any([i.startswith('center0all') for i in norm_rows_by]))):
        all_data = pd.concat(data, axis=1)

    if squeeze_rows is not None:
        data = [
            pd.DataFrame(d.apply(squeeze_rows, axis=1),
                         columns=[path.commonprefix(list(d.columns))])
            for d in data
        ]

    x_start = x_min
    y_start = y_min
    y_diff = 0
    iterator = zip(data, cmap, data_names, norm_rows_by, spacers,
                   colname_tuple)
    if figure_title:
        if isinstance(figure_title, tuple):
            font_size = '3em'
            for title_line in figure_title:
                dwg.add(dwg.text(title_line, (x_start, y_start+75,),
                                 style="font-size:{};font-family:sans-serif".format(font_size)))
                y_start += 80
                font_size = '1.5em'

        else:
            dwg.add(dwg.text(figure_title, (x_start, y_start+75,),
                             style="font-size:3em;font-family:sans-serif"))
            y_start += 80
    if progress_bar:
        from progressbar import ProgressBar
        pbar = ProgressBar(maxval=len(data)*rows).start()
        pbar_val = 0

    for frame, c_cmap, name, normer, spacer, colnames in iterator:
        if frame is None:
            dwg.add(dwg.text(normer, (x_start, y_start + box_height/2)))
            if total_width is not None:
                if spacer is None:
                    x_start += total_width * 1.1
                else:
                    x_start += total_width + spacer
            else:
                if spacer is None:
                    x_start += box_size
                else:
                    x_start += spacer
            if x_start > max_width:
                x_start = x_min
                y_start += box_height + vspacer
            continue
        if has_pandas:
            frame = pd.DataFrame(frame)
        if index is not None:
            if has_pandas:
                frame = frame.ix[index]
            else:
                setix = set(index)
                #selector = [i for i, name in enumerate(all_indices) if name in setix]
                #frame = frame[selector, :]
        if normer is None:
            norm_data = array(frame.copy())
        elif normer is 'mean':
            if has_pandas:
                norm_data = array(frame.divide(frame.dropna(axis=1, how='all').mean(axis=1)+10, axis=0))
            else:
                norm_data = frame / (frame[:,isfinite(frame[0,:])].mean(axis=1) + 10).reshape((rows, 1))
        elif normer == 'max':
            if has_pandas:
                norm_data = array(frame.divide(frame.dropna(axis=1, how='all').max(axis=1)+10, axis=0))
            else:
                norm_data = frame / (frame[:,isfinite(frame[0,:])].max(axis=1) + 10).reshape((rows, 1))
        elif normer == 'maxall':
            if has_pandas:
                maxall = frame.max(axis=1)
                assert len(data) == len(new_normers)
                for old_frame, norm_type in zip(data, new_normers):
                    if norm_type != 'maxall': continue
                    if old_frame is not None:
                        old_frame = old_frame.max(axis=1).ix[index
                                                             if index is not None
                                                             else old_frame.index]
                        maxall = maxall.where(maxall > old_frame, old_frame)
                norm_data = array(frame.divide(maxall + 10, axis=0))
            else:
                norm_data = frame / (old_data[:, isfinite(old_data[0, :])]
                                     .max(axis=1) + 10).reshape((rows, 1))
        elif normer == 'fullvar':
            norm_data = frame.subtract(frame
                                       .dropna(axis=1, how='all')
                                       .min(axis=1)-1e-6,
                                       axis=0)
            norm_data = array(norm_data.divide(norm_data
                                               .dropna(axis=1, how='all')
                                               .max(axis=1),
                                               axis=0))
        elif normer == 'center0':
            norm_data = array(0.5 +
                         0.5 * frame.divide(frame.dropna(axis=1).abs().max(axis=1),
                                      axis=0)
                        )
        elif isinstance(normer, str) and normer.startswith('center0min'):
            min_norm = (
                frame.dropna(axis=1).abs() .max(axis=1).clip(float(normer[10:]), 1e6)
            )
            norm_data = array(0.5+
                              0.5 * frame.divide(min_norm, axis=0))

        elif isinstance(normer, str) and normer.startswith('center0allmin'):
            min_norm = (
                all_data.dropna(axis=1).abs() .max(axis=1).clip(float(normer[13:]), 1e6)
            )
            norm_data = array(0.5+
                              0.5 * frame.divide(min_norm, axis=0))

        elif normer == 'center0all':
            norm_data = array(0.5 +
                         0.5 *
                         frame.divide(all_data.dropna(how='all', axis=1).abs().max(axis=1),
                                      axis=0)
                        )
        elif normer == 'center0pre':
            norm_data = array(0.5 + 0.5 * frame)
        elif isinstance(normer, (int, float)):
            norm_data = array(frame / normer)
            normer = 'constant'
        elif index is not None and hasattr(normer, "ix"):
            norm_data = array(frame.divide(normer.ix[index], axis=0))
        elif hasattr(normer, "__len__") and len(normer) == rows:
            if has_pandas:
                norm_data = array(frame.divide(normer, axis=0))
            else:
                norm_data = array(frame / np.reshape(normer, (rows, -1)))


        elif hasattr(normer, "__len__"):
            print('\n'*5)
            print(len(normer), normer, normer=='max')
            print(frame.shape)
            raise TypeError("norm_rows_by should be the same shape "
                            "as the number of rows")
        else:
            norm_data = array(frame / normer)

        if not c_cmap or str(c_cmap).lower() == 'default':
            c_cmap = ISH

        new_rows, new_cols = np.shape(frame)
        if hasattr(frame, 'index'):
            col_labels = frame.columns
        elif colnames:
            col_labels = colnames
        else:
            col_labels = ['' for col in range(new_cols)]
        if new_rows != rows:
            raise ValueError("All input elements must have the same number of"
                             " rows (and same row meanings --unchecked)")

        if total_width is not None:
            box_size = total_width / float(new_cols)

        i = 0
        if not draw_average_only:
            for i in range(rows):
                if progress_bar:
                    pbar.update(pbar_val)
                    pbar_val += 1
                prefix = col_labels[0][:col_labels[0].find(col_sep)]
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                for j in range(new_cols):
                    g = dwg.g()
                    val = frame.ix[i,j] if has_pandas else frame[i,j]
                    g.add(svg.base.Title("{}, {}: {:.2f}".format(row_labels[i],
                                                                 col_labels[j],
                                                                 val)))
                    hatch = not isfinite(norm_data[i,j])
                    if hatch and nan_replace is not None:
                        if isinstance(nan_replace, float):
                            norm_data[i, j] = nan_replace
                        else:
                            if normer.startswith('center0'):
                                norm_data[i, j] = 0.5
                            else:
                                norm_data[i, j] = 0.0
                    elif hatch:
                        n = 0
                        norm_data[i, j] = 0
                        left = j - 1
                        while left >= 0:
                            if isfinite(norm_data[i, left]):
                                norm_data[i, j] += norm_data[i, left]
                                n += 1
                                break
                            left -= 1
                        right = j + 1
                        while right  < norm_data.shape[1]:
                            if isfinite(norm_data[i, right]):
                                norm_data[i, j] += norm_data[i, right]
                                n+= 1
                                break
                            right += 1
                        if n == 0:
                            norm_data[i, j] = .5 if 'center' in normer else 0
                        else:
                            norm_data[i, j] /= n
                    g.add(dwg.rect((x_start + box_size*j, y_start + i*box_height),
                                   (box_size, box_height),
                                   style="fill:#{:02x}{:02x}{:02x}"
                                   .format(*[int(255*x) for x in
                                             c_cmap(norm_data[i, j])])))
                    dwg.add(g)
                    if hatch_nan and hatch:
                        g.add(dwg.rect((x_start + box_size*j,
                                        y_start + i*box_height),
                                       (box_size, box_height),
                                       style="fill:url(#hatch)"
                                      )
                             )
                    col_base = col_labels[j][:col_labels[j].find(col_sep)]
                    if col_base != prefix:
                        prefix = col_base
                        if cmap_by_prefix:
                            c_cmap = cmap_by_prefix(prefix)
                        g.add(dwg.line((x_start + box_size * j,
                                        y_start + i * box_height),
                                       (x_start + box_size * j,
                                        y_start + (i + 1) * box_height),
                                       style="stroke-width:{}; stroke:#000000"
                                       .format(.1 * box_size)))
        else:
            for j in range(new_cols):
                hatch = not isfinite(norm_data[0, j])
                if hatch:
                    n = 0
                    norm_data[:, j] = 0
                    if j > 0 and isfinite(norm_data[0,j-1]):
                        norm_data[:, j] += norm_data[:, j-1]
                        n += 1
                    if (j + 1 < norm_data.shape[1]
                        and isfinite(norm_data[0, j+1])):
                        norm_data[:, j] += norm_data[:, j+1]
                        n += 1
                    norm_data[:, j] /= n
        dwg.add(dwg.text(first_col, (x_start,
                                     y_start + (i + 1) * box_height)))
        dwg.add(dwg.text(last_col, (x_start + (new_cols - 1) * box_size,
                                    y_start + (i + 1) * box_height)))
        if draw_box and not draw_average_only:
            dwg.add(dwg.rect((x_start, y_start + 0),
                             (new_cols*box_size, rows*box_height),
                             style="stroke-width:1; "
                             "stroke:#000000; fill:none"))
        if draw_average or draw_average_only:
            avg_frame = norm_data.mean(axis=0)
            for j in range(new_cols):
                col_base = col_labels[j][:col_labels[j].find(col_sep)]
                prefix = col_base
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                g = dwg.g()
                g.add(svg.base.Title("Average, {}: {:.2f}".format(col_labels[j],
                                                                  avg_frame[j])))
                g.add(dwg.rect((x_start + box_size*j,
                                y_start + (i+(not draw_average_only))*box_height),
                               (box_size, box_height),
                               style="fill:#{:02x}{:02x}{:02x}"
                               .format(*[int(255*x) for x in
                                         c_cmap(average_scale*avg_frame[j])])))
                if not isfinite(norm_data[0, j]) and hatch_nan:
                    g.add(dwg.rect((x_start + box_size*j,
                                    y_start + (i+(not draw_average_only))*box_height),
                                   (box_size, box_height),
                                   style="fill:url(#hatch)"
                                  )
                         )

                dwg.add(g)
            dwg.add(dwg.rect((x_start,
                              y_start + (i+(not draw_average_only))*box_height),
                             (new_cols*box_size, 1*box_height),
                             style="stroke-width:1; stroke:#000000; fill:none"
                            ))


        if draw_name:
            if name == "" and split_columns:
                name = col_base
            xpos = x_start + box_size * new_cols / 2.0
            text = dwg.text('',
                             (xpos,
                              y_start
                              + box_height * (rows) * (1-draw_average_only)
                              + box_height * (draw_average or draw_average_only)
                              + 13),
                            style="text-anchor: middle;font-family:sans-serif;")
            text.add(dwg.tspan("", dy=["-1.5em"]))
            for line in name.split('_'):
                text.add(dwg.tspan(line,
                                   dy=["1.5em"],
                                   x=[xpos],
                                   style="text-anchor: middle;",
                                   ))
            dwg.add(text)

        if total_width is not None:
            if spacer is None:
                x_start += total_width * 1.1
            else:
                x_start += total_width + spacer
        else:
            if spacer is None:
                x_start += new_cols * box_size + box_size
            else:
                x_start += new_cols * box_size + spacer

        #y_diff = new_rows * box_height + vspacer
        if x_start + total_width >= max_width:
            x_start = x_min
            y_start += new_rows*box_height*(not draw_average_only) + vspacer
            y_start += box_height*(draw_average_only or draw_average)

    if draw_row_labels and isinstance(row_labels[0], tuple):
        lwidths = Counter()
        for r in row_labels:
            for i, l in enumerate(r):
                lwidths[i] = max(lwidths[i], len(str(l)))
        cum_len = 0
        for i in range(len(lwidths)):
            old_width = lwidths[i]
            lwidths[i] += cum_len
            cum_len += old_width

    if draw_row_labels and not draw_average_only:
        for i in range(rows):
            if color_row_labels:
                style = "font-family:sans-serif; font-size: {size}; fill: {color};".format(
                    size=box_height,
                    color='red' if row_labels[i] in color_row_labels else 'black',
                )
            else:
                style = "font-family:sans-serif; font-size: {}".format(box_height)
            if isinstance(row_labels[i], tuple):
                labeltext = dwg.g()
                for lnum, ltext in enumerate(row_labels[i]):
                    labeltext.add(dwg.text(ltext,
                                           (x_start + lwidths[lnum-1] * 10 + lnum * 50,
                                            y_start + i * box_height + box_height),
                                           style=style,
                                          ))
            else:
                labeltext = (dwg.text(row_labels[i],
                                      (x_start, y_start + i*box_height+box_height),
                                      style=style,
                                     ))
            if make_hyperlinks:
                if make_hyperlinks is True:
                    link = dwg.a('http://insitu.fruitfly.org/cgi-bin/ex/report.pl?ftype={}&ftext={}'
                                 .format(2 if (isinstance(row_labels[i], str)
                                               and
                                               (row_labels[i].startswith('FBgn'))
                                              )
                                         else 1,
                                         row_labels[i]),
                                 target='_replace',
                                )
                else:
                    link = dwg.a(make_hyperlinks.format(frame.index[i]))
                link.add(labeltext)
                dwg.add(link)
            else:
                dwg.add(labeltext)
    if progress_bar:
        pbar.finish()
    dwg.saveas(filename)
    if convert:
        cmd = [
            'convert',
            filename,
            '-units', 'PixelsPerInch',
            '+antialias',
            '-density', '600',
            '-background', 'none',
            '-transparent', 'white',
            filename.replace('svg', 'png'),
        ]
        subprocess.Popen(cmd)
Ejemplo n.º 14
0
    parser.add_argument('--no-multi', dest='multi', action='store_false')
    return parser.parse_args()



# Script entry point: load the expression and ASE summary tables, keep only
# the genes whose expression passes EXPR_MIN in mel, sim and hybrid samples,
# then classify the remaining genes by ASE.
if __name__ == "__main__":
    # get_synonyms / parse_args are helpers defined elsewhere in this file.
    synonyms = get_synonyms()

    args = parse_args()
    # Both tables are read with the shared pd_kwargs and have their '---'
    # column dropped.
    expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).drop('---',
                                                                         axis='columns')
    ase = pd.read_table('analysis_godot/ase_summary_by_read.tsv',
                        **pd_kwargs).drop('---', axis='columns')


    # Column subsets of the expression table, chosen by column-name prefix
    # (sel_startswith presumably builds the keyword arguments for .select --
    # confirm against its definition).
    # NOTE: these subsets are taken before `expr` is row-filtered below and
    # are not re-derived in the visible code, so they still hold every gene.
    mel = expr.select(**sel_startswith('melXmel_'))
    sim = expr.select(**sel_startswith('simXsim_'))
    hybrids = expr.select(**sel_startswith(('melXsim', 'simXmel')))
    melXsim = expr.select(**sel_startswith('melXsim'))
    simXmel = expr.select(**sel_startswith('simXmel'))

    # Per-row boolean masks: True where the row's maximum across the group's
    # columns exceeds EXPR_MIN.
    expr_in_mel = (mel.max(axis=1) > EXPR_MIN)
    expr_in_sim = sim.max(axis=1) > EXPR_MIN
    expr_in_hybrids = (hybrids.max(axis=1) > EXPR_MIN)
    expr_in_all = (expr_in_mel & expr_in_sim & expr_in_hybrids)

    # Keep only rows passing all three masks, then look up the same row
    # labels in the ASE table.
    expr = expr.ix[expr_in_all]
    ase = ase.ix[expr.index]
    ase_classes = hu.get_classes(ase, pbar=pbar, style='cutoff')
    # Row labels for which the melXsim and simXmel classes are not both 0
    # (judging by the variable name, class 0 in both crosses presumably marks
    # a maternal gene -- confirm against hu.get_classes).
    not_maternal = ase_classes.index[~((ase_classes.melXsim == 0) &
                                       (ase_classes.simXmel == 0))]
Ejemplo n.º 15
0
    ],
                             axis=0)
    return (weaker_mel_bias, weaker_sim_bias)


# Script entry point: obtain the expression and ASE tables (reusing copies
# already present in the namespace when possible), blank out X-linked ASE
# values in male embryos, and split both tables by cross direction.
if __name__ == "__main__":
    # Pick up `ase` / `expr` if they already exist in this namespace
    # (presumably left over from a previous interactive run -- confirm how the
    # script is normally invoked); otherwise these are None.
    ase = locals().get('ase', None)
    expr = locals().get('expr', None)
    # Index.equals() is used rather than an element-wise `==` wrapped in
    # np.all(): comparing two indexes of different length element-wise can
    # raise instead of yielding False, and the reload below can itself
    # leave `ase` with fewer rows than `expr` (all-NaN rows are dropped from
    # `ase` only).  equals() simply returns False on any mismatch, which
    # triggers the reload as intended.
    if ase is None or expr is None or not ase.index.equals(expr.index):
        print("reloading files")
        expr = pd.read_table('analysis_godot/summary.tsv',
                             **pd_kwargs).dropna(how='all', axis=1)
        # ASE table: drop all-NaN columns and rows, then keep only the hybrid
        # (melXsim / simXmel) columns.
        ase = (pd.read_table('analysis_godot/ase_summary_by_read.tsv',
                             **pd_kwargs).dropna(how='all', axis=1).dropna(
                                 how='all',
                                 axis=0).select(**sel_startswith(('melXsim',
                                                                  'simXmel'))))
        chrom_of = get_chroms()

        # Column-name prefixes of the embryos treated as male.
        males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
        # Row mask: genes mapped to chromosome X; genes missing from chrom_of
        # count as not on X.
        on_x = [
            chrom_of[gene] == 'X' if gene in chrom_of else False
            for gene in ase.index
        ]
        # Column mask: columns belonging to one of the male embryos.
        is_male = [col.startswith(males) for col in ase.columns]
        # Discard ASE values for X-linked genes in male embryos.
        ase.ix[on_x, is_male] = np.nan

    # Per-cross views of both tables, selected by column-name prefix.
    melXsim_expr = expr.select(**sel_startswith('melXsim'))
    simXmel_expr = expr.select(**sel_startswith('simXmel'))
    melXsim_ase = ase.select(**sel_startswith('melXsim'))
    simXmel_ase = ase.select(**sel_startswith('simXmel'))
    # Boolean frame: True where melXsim expression exceeds EXPR_MIN.
    melXsim_is_expr = (melXsim_expr > EXPR_MIN)
Ejemplo n.º 16
0
    make_hyperlinks=True,
    convert=True,
    vspacer=0,
    max_width=200,
    cmap=cm.RdBu,
)


# Script entry point: load the hybrid ASE table and restrict each cross to
# the genes labelled 'zyg' in the external classification table.
if __name__ == "__main__":
    # ASE table: '---' is the only NA marker recognised; all-NaN columns and
    # rows are dropped, then only melXsim / simXmel columns are kept.
    ase = (pd
           .read_table('analysis_godot/ase_summary_by_read.tsv',
                       index_col=0,
                       keep_default_na=False, na_values=['---'],)
           .dropna(how='all', axis=1)
           .dropna(how='all', axis=0)
           .select(**sel_startswith(('melXsim', 'simXmel')))
          )
    # 'mel.CLASS' column of the classification table, indexed by the table's
    # second column.
    paris = pd.read_table('prereqs/GSE68062_Gene_CLASS_after_FPKM_normalization.txt',
                  index_col=1)['mel.CLASS']
    # Entries whose class is 'zyg' (presumably "zygotic" -- confirm against
    # the source table's documentation).
    pzyg = paris[paris == 'zyg']

    # Per-cross column subsets, further filtered with pzyg.__contains__,
    # i.e. keeping labels that are present in pzyg's index.
    melXsim = ase.select(**sel_startswith('melXsim')).select(pzyg.__contains__)
    simXmel = ase.select(**sel_startswith('simXmel')).select(pzyg.__contains__)

    # First data column of the gene map table, indexed by the table's second
    # column; the first 5 lines of the file are skipped.
    fbgns = pd.read_table('prereqs/gene_map_table_fb_2016_01.tsv',
                          index_col=1,skiprows=5).ix[:, 0]

    # Counter keyed by embryo name, defaulting to 0.  Each column name is
    # split at '_sl' and the part before it is taken as the embryo name; the
    # remainder of the loop body lies outside this excerpt.
    max_slice = defaultdict(int)
    for sl in ase.columns:
        sl = sl.split('_sl')
        emb = sl[0]
Ejemplo n.º 17
0
    ase_cdt = ase.ix[cdt.index]
    exp_cdt = expr.ix[cdt.index]

    columns = (
        'melXsim',
        'simXmel',
    )

    ranges = {
        'meldominant': ('FBgn0034816', 'FBgn0250755'),
        'simdominant': ('FBgn0004087', 'FBgn0038934'),
    }

    if 'sparse' in argv[1]:
        pu.svg_heatmap(
            data=exp_cdt.select(**sel_startswith(columns)),
            filename='analysis/results/all_sparse.svg',
            norm_rows_by='max',
            progress_bar=True,
            col_sep='_sl',
            total_width=120,
            box_height=1,
            split_columns=True,
            draw_box=True,
            draw_row_labels=False,
            draw_name=True,
            cmap_by_prefix=cmap_by_prefix,
            make_hyperlinks=True,
            convert=True,
        )
        from sys import exit
# Column-name prefixes of the hybrid embryos, grouped by sex.  The __main__
# section passes these tuples to sel_startswith to pick each group's columns
# from the expression and ASE tables.
male_hybrid_embryos = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
female_hybrid_embryos = ('melXsim_cyc14C_rep1', 'melXsim_cyc14C_rep2',
                         'simXmel_cyc14C_rep1')

if __name__ == "__main__":
    expr = pd.read_table('godot/summary_fb.tsv', **pd_kwargs)
    ase = (pd.read_table('godot/ase_summary.tsv', **pd_kwargs)
           .dropna(how='all', axis=0)
          )

    chrom_of = get_chroms()

    ase = ase.select(lambda x: chrom_of[x] != 'X')

    expr_males = expr.select(**sel_startswith(male_hybrid_embryos))
    expr_females = expr.select(**sel_startswith(female_hybrid_embryos))

    ase_males = ase.select(**sel_startswith(male_hybrid_embryos))
    ase_females = ase.select(**sel_startswith(female_hybrid_embryos))

    ase_xs = get_xs(ase)
    ase_maternals = pd.Series(
        index=ase_xs.index,
        data=[1 if col.startswith('simXmel') else -1 for col in ase_xs.index]
    )

    if 'logistic_females' in locals() and locals().get('recalculate', True):
        with Pool() as p:
            logistic_females = fit_all_ase(ase_females, logistic,
                                           ase_xs.ix[ase_females.columns],
Ejemplo n.º 19
0
    parser = ArgumentParser()
    parser.add_argument('--multi', default=True, action='store_true')
    parser.add_argument('--no-multi', dest='multi', action='store_false')
    return parser.parse_args()


# Script entry point: load the expression and ASE summary tables, keep only
# the genes whose expression passes EXPR_MIN in mel, sim and hybrid samples,
# then classify the remaining genes by ASE.
if __name__ == "__main__":
    # get_synonyms / parse_args are helpers defined elsewhere in this file.
    synonyms = get_synonyms()

    args = parse_args()
    # Both tables are read with the shared pd_kwargs and have their '---'
    # column dropped.
    expr = pd.read_table('analysis_godot/summary.tsv',
                         **pd_kwargs).drop('---', axis='columns')
    ase = pd.read_table('analysis_godot/ase_summary_by_read.tsv',
                        **pd_kwargs).drop('---', axis='columns')

    # Column subsets of the expression table, chosen by column-name prefix
    # (sel_startswith presumably builds the keyword arguments for .select --
    # confirm against its definition).
    # NOTE: these subsets are taken before `expr` is row-filtered below and
    # are not re-derived in the visible code, so they still hold every gene.
    mel = expr.select(**sel_startswith('melXmel_'))
    sim = expr.select(**sel_startswith('simXsim_'))
    hybrids = expr.select(**sel_startswith(('melXsim', 'simXmel')))
    melXsim = expr.select(**sel_startswith('melXsim'))
    simXmel = expr.select(**sel_startswith('simXmel'))

    # Per-row boolean masks: True where the row's maximum across the group's
    # columns exceeds EXPR_MIN.
    expr_in_mel = (mel.max(axis=1) > EXPR_MIN)
    expr_in_sim = sim.max(axis=1) > EXPR_MIN
    expr_in_hybrids = (hybrids.max(axis=1) > EXPR_MIN)
    expr_in_all = (expr_in_mel & expr_in_sim & expr_in_hybrids)

    # Keep only rows passing all three masks, then look up the same row
    # labels in the ASE table.
    expr = expr.ix[expr_in_all]
    ase = ase.ix[expr.index]
    ase_classes = hu.get_classes(ase, pbar=pbar, style='cutoff')
    # Row labels for which the melXsim and simXmel classes are not both 0
    # (judging by the variable name, class 0 in both crosses presumably marks
    # a maternal gene -- confirm against hu.get_classes).
    not_maternal = ase_classes.index[~((ase_classes.melXsim == 0) &
                                       (ase_classes.simXmel == 0))]
Ejemplo n.º 20
0
def svg_heatmap(data,
                filename,
                row_labels=None,
                box_size=4,
                index=None,
                all_indices=None,
                all_colnames=None,
                internal_datanames=None,
                cmap=ISH,
                norm_rows_by=None,
                draw_row_labels=False,
                color_row_labels=False,
                col_sep='',
                box_height=None,
                total_width=None,
                draw_box=False,
                draw_name=False,
                data_names=None,
                make_hyperlinks=False,
                progress_bar=False,
                max_width=np.inf,
                x_min=10,
                y_min=10,
                spacers=None,
                convert=False,
                squeeze_rows=None,
                cmap_by_prefix=None,
                draw_average=False,
                draw_average_only=False,
                average_scale=1,
                split_columns=False,
                vspacer=30,
                hatch_nan=True,
                hatch_size=20,
                figure_title=None,
                nan_replace=None,
                first_col='',
                last_col=''):
    """
    Draw heatmap as an SVG file stored in filename

    *data* can be either a 2D array-like type (list of lists, numpy array,
    pandas DataFrame, etc), or a tuple of 2D array-likes, in which case a
    separator will be added between each one in the output

    *cmap* is a matplotlib-like colormap (i.e. a callable that expects floats
    in the range 0.0-1.0.), or an iterable of the same length as the tuple
    *data* containing colormaps

    *row_labels* can be supplied, otherwise they will be detected from the
    first item in *data*, if available, and if not they will be blank.

    If *total_width* is supplied, width of each dataset in *data* will be
    scaled to that constant. If *box_height* is supplied, the height of each
    row will be *box_height*, otherwise it will be equal to the width of each
    element. If neither is supplied, elements will be squares equal to
    *box_size*. IT IS STRONGLY RECOMMENDED that if supplying *total_width*,
    *box_height* also be specified, but this is not enforced.

    *draw_row_labels*, if True, will label the rows on the right hand side. As
    of 2013/09/03, this won't scale the SVG properly, so including the
    resulting file in an html element won't display properly.

    *spacers* is the distance between adjacent datasets.  Can either be a
    number, in which case it will apply to all datasets, or an iterable for
    different distances. If the iterable is shorter than the number of
    datasets, the last value will be repeated.

    """
    import svgwrite as svg
    try:
        import pandas as pd
        has_pandas = True
    except:
        has_pandas = False
        assert all_indices
        assert all_colnames

    if not isinstance(data, tuple):
        data = (data, )

    if not isinstance(norm_rows_by, tuple):
        norm_rows_by = repeat(norm_rows_by)

    old_data = data
    colname_tuple = repeat(None)
    if split_columns and has_pandas:
        from Utils import sel_startswith
        data = []
        new_normers = []
        new_cmaps = []
        if isinstance(cmap, tuple):
            cmaps = cmap
        else:
            cmaps = repeat(cmap)
        for dataset, normer, c_cmap in zip(old_data, norm_rows_by, cmaps):
            if dataset is None:
                data.append(dataset)
                new_normers.append(normer)
                new_cmaps.append(c_cmap)
                continue

            if not isinstance(dataset, pd.DataFrame):
                dataset = pd.DataFrame(dataset).T
            colnames = list(
                sorted({col.split(col_sep)[0]
                        for col in dataset.columns}))
            data.extend(
                dataset.select(**sel_startswith(colname))
                for colname in colnames)
            new_normers.extend(normer for colname in colnames)
            new_cmaps.extend(c_cmap for colname in colnames)
        data = tuple(data)
        norm_rows_by = tuple(new_normers)
        cmap = tuple(new_cmaps)
    elif split_columns and all_colnames:
        colnames = list(sorted({col.split(col_sep)[0]
                                for col in all_colnames}))
        colname = colnames[0]
        data = tuple([
            data[:,
                 array([c.startswith(colname) for c in internal_datanames])]
            for colname in colnames
        ])
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(colname)]
            for colname in colnames)
    elif not split_columns and all_colnames:
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(dataname)]
            for dataname in internal_datanames)

    rows, cols = np.shape([ds for ds in data if ds is not None][0])
    if index is not None:
        rows = len(index)
    if box_height is None:
        box_height = box_size

    if row_labels is None:
        if index is not None:
            row_labels = list(index)
        elif hasattr(data[0], 'index'):
            row_labels = list(data[0].index)
        else:
            row_labels = ['' for row in range(rows)]

    if total_width is not None and max_width is not np.inf:
        boxes_per_row = max_width // (1.1 * total_width)
        if ((boxes_per_row + 1) * 1.1 * total_width - .1 * total_width <
                max_width):
            boxes_per_row += 1

        num_plotted_rows = np.ceil(
            len(data) / boxes_per_row + (draw_average or draw_average_only))
        if figure_title is None:
            fig_title_height = 0
        elif isinstance(figure_title, tuple):
            fig_title_height = len(figure_title)
        else:
            fig_title_height = 1
        dwg = svg.Drawing(
            filename,
            size=(max_width + 2 * x_min + 200 * draw_row_labels,
                  2 * y_min + (num_plotted_rows *
                               (rows) * box_height) + 80 * (fig_title_height) +
                  80 * draw_name + (num_plotted_rows - 1) * vspacer),
        )
    elif total_width is not None:
        width = len(data) * total_width * 1.1 - .1 * total_width
        height = rows * box_height
        max_row_label_len = max(len(str(i)) for i in row_labels)
        dwg = svg.Drawing(
            filename,
            size=(width + 2 * x_min + 20 * draw_row_labels * max_row_label_len,
                  height + 2 * y_min + 80 * draw_name +
                  (80 * (figure_title is not None))))
    else:
        dwg = svg.Drawing(filename)
    dwg.add(svg.base.Title(path.basename(filename)))

    pat = dwg.pattern(id='hatch',
                      insert=(0, 0),
                      size=(hatch_size, hatch_size),
                      patternUnits='userSpaceOnUse')
    g = pat.add(dwg.g(style="fill:none; stroke:#B0B0B0; stroke-width:1"))
    g.add(dwg.path(('M0,0', 'l{hatch},{hatch}'.format(hatch=hatch_size))))
    g.add(
        dwg.path(('M{hatch2},0 l{hatch2},{hatch2}'.format(hatch2=hatch_size /
                                                          2).split())))
    g.add(
        dwg.path(('M0,{hatch2} l{hatch2},{hatch2}'.format(hatch2=hatch_size /
                                                          2).split())))

    dwg.add(pat)

    if box_height is None:
        box_height = box_size

    if not hasattr(cmap, "__len__"):
        cmap = [cmap for frame in data]

    if data_names is None:
        data_names = ["" for frame in data]

    if len(cmap) != len(data):
        raise ValueError(
            "cmap and data should be the same length ({} vs {})".format(
                len(cmap), len(data)))

    if not hasattr(spacers, "__len__"):
        spacers = [spacers]
    else:
        spacers = list(spacers)
    while len(spacers) < len(data):
        spacers.append(spacers[-1])

    if ((isinstance(norm_rows_by, repeat)
         and isinstance(next(norm_rows_by), str)
         and next(norm_rows_by).startswith('center0all'))
            or (not isinstance(norm_rows_by, repeat)
                and isinstance(norm_rows_by[0], str)
                and np.any([i.startswith('center0all')
                            for i in norm_rows_by]))):
        all_data = pd.concat(data, axis=1)

    if squeeze_rows is not None:
        data = [
            pd.DataFrame(d.apply(squeeze_rows, axis=1),
                         columns=[path.commonprefix(list(d.columns))])
            for d in data
        ]

    x_start = x_min
    y_start = y_min
    y_diff = 0
    iterator = zip(data, cmap, data_names, norm_rows_by, spacers,
                   colname_tuple)
    if figure_title:
        if isinstance(figure_title, tuple):
            font_size = '3em'
            for title_line in figure_title:
                dwg.add(
                    dwg.text(
                        title_line, (
                            x_start,
                            y_start + 75,
                        ),
                        style="font-size:{};font-family:sans-serif".format(
                            font_size)))
                y_start += 80
                font_size = '1.5em'

        else:
            dwg.add(
                dwg.text(figure_title, (
                    x_start,
                    y_start + 75,
                ),
                         style="font-size:3em;font-family:sans-serif"))
            y_start += 80
    if progress_bar:
        from progressbar import ProgressBar
        pbar = ProgressBar(maxval=len(data) * rows).start()
        pbar_val = 0

    for frame, c_cmap, name, normer, spacer, colnames in iterator:
        if frame is None:
            dwg.add(dwg.text(normer, (x_start, y_start + box_height / 2)))
            if total_width is not None:
                if spacer is None:
                    x_start += total_width * 1.1
                else:
                    x_start += total_width + spacer
            else:
                if spacer is None:
                    x_start += box_size
                else:
                    x_start += spacer
            if x_start > max_width:
                x_start = x_min
                y_start += box_height + vspacer
            continue
        if has_pandas:
            frame = pd.DataFrame(frame)
        if index is not None:
            if has_pandas:
                frame = frame.ix[index]
            else:
                setix = set(index)
                #selector = [i for i, name in enumerate(all_indices) if name in setix]
                #frame = frame[selector, :]
        if normer is None:
            norm_data = array(frame.copy())
        elif normer is 'mean':
            if has_pandas:
                norm_data = array(
                    frame.divide(frame.dropna(axis=1, how='all').mean(axis=1) +
                                 10,
                                 axis=0))
            else:
                norm_data = frame / (
                    frame[:, isfinite(frame[0, :])].mean(axis=1) + 10).reshape(
                        (rows, 1))
        elif normer == 'max':
            if has_pandas:
                norm_data = array(
                    frame.divide(frame.dropna(axis=1, how='all').max(axis=1) +
                                 10,
                                 axis=0))
            else:
                norm_data = frame / (
                    frame[:, isfinite(frame[0, :])].max(axis=1) + 10).reshape(
                        (rows, 1))
        elif normer == 'maxall':
            if has_pandas:
                maxall = frame.max(axis=1)
                assert len(data) == len(new_normers)
                for old_frame, norm_type in zip(data, new_normers):
                    if norm_type != 'maxall': continue
                    if old_frame is not None:
                        old_frame = old_frame.max(
                            axis=1
                        ).ix[index if index is not None else old_frame.index]
                        maxall = maxall.where(maxall > old_frame, old_frame)
                norm_data = array(frame.divide(maxall + 10, axis=0))
            else:
                norm_data = frame / (old_data[:, isfinite(old_data[0, :])].max(
                    axis=1) + 10).reshape((rows, 1))
        elif normer == 'fullvar':
            norm_data = frame.subtract(
                frame.dropna(axis=1, how='all').min(axis=1) - 1e-6, axis=0)
            norm_data = array(
                norm_data.divide(norm_data.dropna(axis=1,
                                                  how='all').max(axis=1),
                                 axis=0))
        elif normer == 'center0':
            norm_data = array(
                0.5 + 0.5 *
                frame.divide(frame.dropna(axis=1).abs().max(axis=1), axis=0))
        elif isinstance(normer, str) and normer.startswith('center0min'):
            min_norm = (frame.dropna(axis=1).abs().max(axis=1).clip(
                float(normer[10:]), 1e6))
            norm_data = array(0.5 + 0.5 * frame.divide(min_norm, axis=0))

        elif isinstance(normer, str) and normer.startswith('center0allmin'):
            min_norm = (all_data.dropna(axis=1).abs().max(axis=1).clip(
                float(normer[13:]), 1e6))
            norm_data = array(0.5 + 0.5 * frame.divide(min_norm, axis=0))

        elif normer == 'center0all':
            norm_data = array(0.5 + 0.5 * frame.divide(
                all_data.dropna(how='all', axis=1).abs().max(axis=1), axis=0))
        elif normer == 'center0pre':
            norm_data = array(0.5 + 0.5 * frame)
        elif isinstance(normer, (int, float)):
            norm_data = array(frame / normer)
            normer = 'constant'
        elif index is not None and hasattr(normer, "ix"):
            norm_data = array(frame.divide(normer.ix[index], axis=0))
        elif hasattr(normer, "__len__") and len(normer) == rows:
            if has_pandas:
                norm_data = array(frame.divide(normer, axis=0))
            else:
                norm_data = array(frame / np.reshape(normer, (rows, -1)))

        elif hasattr(normer, "__len__"):
            print('\n' * 5)
            print(len(normer), normer, normer == 'max')
            print(frame.shape)
            raise TypeError("norm_rows_by should be the same shape "
                            "as the number of rows")
        else:
            norm_data = array(frame / normer)

        if not c_cmap or str(c_cmap).lower() == 'default':
            c_cmap = ISH

        new_rows, new_cols = np.shape(frame)
        if hasattr(frame, 'index'):
            col_labels = frame.columns
        elif colnames:
            col_labels = colnames
        else:
            col_labels = ['' for col in range(new_cols)]
        if new_rows != rows:
            raise ValueError("All input elements must have the same number of"
                             " rows (and same row meanings --unchecked)")

        if total_width is not None:
            box_size = total_width / float(new_cols)

        i = 0
        if not draw_average_only:
            for i in range(rows):
                if progress_bar:
                    pbar.update(pbar_val)
                    pbar_val += 1
                prefix = col_labels[0][:col_labels[0].find(col_sep)]
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                for j in range(new_cols):
                    g = dwg.g()
                    val = frame.ix[i, j] if has_pandas else frame[i, j]
                    g.add(
                        svg.base.Title("{}, {}: {:.2f}".format(
                            row_labels[i], col_labels[j], val)))
                    hatch = not isfinite(norm_data[i, j])
                    if hatch and nan_replace is not None:
                        if isinstance(nan_replace, float):
                            norm_data[i, j] = nan_replace
                        else:
                            if normer.startswith('center0'):
                                norm_data[i, j] = 0.5
                            else:
                                norm_data[i, j] = 0.0
                    elif hatch:
                        n = 0
                        norm_data[i, j] = 0
                        left = j - 1
                        while left >= 0:
                            if isfinite(norm_data[i, left]):
                                norm_data[i, j] += norm_data[i, left]
                                n += 1
                                break
                            left -= 1
                        right = j + 1
                        while right < norm_data.shape[1]:
                            if isfinite(norm_data[i, right]):
                                norm_data[i, j] += norm_data[i, right]
                                n += 1
                                break
                            right += 1
                        if n == 0:
                            norm_data[i, j] = .5 if 'center' in normer else 0
                        else:
                            norm_data[i, j] /= n
                    g.add(
                        dwg.rect(
                            (x_start + box_size * j, y_start + i * box_height),
                            (box_size, box_height),
                            style="fill:#{:02x}{:02x}{:02x}".format(*[
                                int(255 * x) for x in c_cmap(norm_data[i, j])
                            ])))
                    dwg.add(g)
                    if hatch_nan and hatch:
                        g.add(
                            dwg.rect((x_start + box_size * j,
                                      y_start + i * box_height),
                                     (box_size, box_height),
                                     style="fill:url(#hatch)"))
                    col_base = col_labels[j][:col_labels[j].find(col_sep)]
                    if col_base != prefix:
                        prefix = col_base
                        if cmap_by_prefix:
                            c_cmap = cmap_by_prefix(prefix)
                        g.add(
                            dwg.line(
                                (x_start + box_size * j,
                                 y_start + i * box_height),
                                (x_start + box_size * j, y_start +
                                 (i + 1) * box_height),
                                style="stroke-width:{}; stroke:#000000".format(
                                    .1 * box_size)))
        else:
            for j in range(new_cols):
                hatch = not isfinite(norm_data[0, j])
                if hatch:
                    n = 0
                    norm_data[:, j] = 0
                    if j > 0 and isfinite(norm_data[0, j - 1]):
                        norm_data[:, j] += norm_data[:, j - 1]
                        n += 1
                    if (j + 1 < norm_data.shape[1]
                            and isfinite(norm_data[0, j + 1])):
                        norm_data[:, j] += norm_data[:, j + 1]
                        n += 1
                    norm_data[:, j] /= n
        dwg.add(dwg.text(first_col, (x_start, y_start + (i + 1) * box_height)))
        dwg.add(
            dwg.text(last_col, (x_start + (new_cols - 1) * box_size, y_start +
                                (i + 1) * box_height)))
        if draw_box and not draw_average_only:
            dwg.add(
                dwg.rect((x_start, y_start + 0),
                         (new_cols * box_size, rows * box_height),
                         style="stroke-width:1; "
                         "stroke:#000000; fill:none"))
        if draw_average or draw_average_only:
            avg_frame = norm_data.mean(axis=0)
            for j in range(new_cols):
                col_base = col_labels[j][:col_labels[j].find(col_sep)]
                prefix = col_base
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                g = dwg.g()
                g.add(
                    svg.base.Title("Average, {}: {:.2f}".format(
                        col_labels[j], avg_frame[j])))
                g.add(
                    dwg.rect((x_start + box_size * j, y_start +
                              (i + (not draw_average_only)) * box_height),
                             (box_size, box_height),
                             style="fill:#{:02x}{:02x}{:02x}".format(*[
                                 int(255 * x)
                                 for x in c_cmap(average_scale * avg_frame[j])
                             ])))
                if not isfinite(norm_data[0, j]) and hatch_nan:
                    g.add(
                        dwg.rect((x_start + box_size * j, y_start +
                                  (i + (not draw_average_only)) * box_height),
                                 (box_size, box_height),
                                 style="fill:url(#hatch)"))

                dwg.add(g)
            dwg.add(
                dwg.rect((x_start, y_start +
                          (i + (not draw_average_only)) * box_height),
                         (new_cols * box_size, 1 * box_height),
                         style="stroke-width:1; stroke:#000000; fill:none"))

        if draw_name:
            if name == "" and split_columns:
                name = col_base
            xpos = x_start + box_size * new_cols / 2.0
            text = dwg.text(
                '', (xpos, y_start + box_height * (rows) *
                     (1 - draw_average_only) + box_height *
                     (draw_average or draw_average_only) + 13),
                style="text-anchor: middle;font-family:sans-serif;")
            text.add(dwg.tspan("", dy=["-1.5em"]))
            for line in name.split('_'):
                text.add(
                    dwg.tspan(
                        line,
                        dy=["1.5em"],
                        x=[xpos],
                        style="text-anchor: middle;",
                    ))
            dwg.add(text)

        if total_width is not None:
            if spacer is None:
                x_start += total_width * 1.1
            else:
                x_start += total_width + spacer
        else:
            if spacer is None:
                x_start += new_cols * box_size + box_size
            else:
                x_start += new_cols * box_size + spacer

        #y_diff = new_rows * box_height + vspacer
        if x_start + total_width >= max_width:
            x_start = x_min
            y_start += new_rows * box_height * (
                not draw_average_only) + vspacer
            y_start += box_height * (draw_average_only or draw_average)

    if draw_row_labels and isinstance(row_labels[0], tuple):
        lwidths = Counter()
        for r in row_labels:
            for i, l in enumerate(r):
                lwidths[i] = max(lwidths[i], len(str(l)))
        cum_len = 0
        for i in range(len(lwidths)):
            old_width = lwidths[i]
            lwidths[i] += cum_len
            cum_len += old_width

    if draw_row_labels and not draw_average_only:
        for i in range(rows):
            if color_row_labels:
                style = "font-family:sans-serif; font-size: {size}; fill: {color};".format(
                    size=box_height,
                    color='red'
                    if row_labels[i] in color_row_labels else 'black',
                )
            else:
                style = "font-family:sans-serif; font-size: {}".format(
                    box_height)
            if isinstance(row_labels[i], tuple):
                labeltext = dwg.g()
                for lnum, ltext in enumerate(row_labels[i]):
                    labeltext.add(
                        dwg.text(
                            ltext,
                            (x_start + lwidths[lnum - 1] * 10 + lnum * 50,
                             y_start + i * box_height + box_height),
                            style=style,
                        ))
            else:
                labeltext = (dwg.text(
                    row_labels[i],
                    (x_start, y_start + i * box_height + box_height),
                    style=style,
                ))
            if make_hyperlinks:
                if make_hyperlinks is True:
                    link = dwg.a(
                        'http://insitu.fruitfly.org/cgi-bin/ex/report.pl?ftype={}&ftext={}'
                        .format(
                            2 if (isinstance(row_labels[i], str) and
                                  (row_labels[i].startswith('FBgn'))) else 1,
                            row_labels[i]),
                        target='_replace',
                    )
                else:
                    link = dwg.a(make_hyperlinks.format(frame.index[i]))
                link.add(labeltext)
                dwg.add(link)
            else:
                dwg.add(labeltext)
    if progress_bar:
        pbar.finish()
    dwg.saveas(filename)
    if convert:
        cmd = [
            'convert',
            filename,
            '-units',
            'PixelsPerInch',
            '+antialias',
            '-density',
            '600',
            '-background',
            'none',
            '-transparent',
            'white',
            filename.replace('svg', 'png'),
        ]
        subprocess.Popen(cmd)
Ejemplo n.º 21
0
    else:
        step = 1

    expr_min = 5
    eps = 1
    read_table_args = dict(index_col=0,
                           keep_default_na=False,
                           na_values=['---', ''])

    if 'all_expr' not in locals():
        all_expr = (pd.read_table('analysis/summary.tsv',
                                  **read_table_args).sort_index())
        top_expr = all_expr.max(axis=1)
        all_expr = all_expr.ix[top_expr > expr_min]
        all_expr = all_expr.ix[::step]
        wt = all_expr.select(**sel_startswith('WT'))
        bcd = all_expr.select(**sel_startswith('bcd'))
        zld = all_expr.select(**sel_startswith('zld'))
        g20 = all_expr.select(**sel_startswith('G20'))
        hb = all_expr.select(**sel_startswith('hb'))

        wts = bcds = zlds = g20s = hbs = 0
        for sub_df_name in 'wt bcd zld g20 hb'.split():
            sub_df = locals()[sub_df_name]
            cycs = {
                col.split('_sl')[0].split('_', 1)[1]
                for col in sub_df.columns
            }
            cyc_embs = {}
            for cyc in cycs:
                cyc_embs[cyc] = sub_df.select(**sel_contains(cyc))
Ejemplo n.º 22
0
def _heatmap_label_prefix(label, col_sep):
    """Return the part of *label* that precedes the first *col_sep*.

    Labels are converted to ``str`` first so that integer column labels (as
    produced when a plain array is wrapped in a DataFrame) do not fail on
    slicing.  The slicing itself mirrors ``label[:label.find(col_sep)]``: an
    empty separator yields an empty prefix, and a separator that is absent
    drops the final character of the label.
    """
    label = str(label)
    return label[:label.find(col_sep)]


def _heatmap_normalize(frame, normer, index, rows):
    """Return a row-normalised copy of *frame* according to *normer*.

    *normer* may be ``None`` (plain copy), one of the strings ``'mean'``,
    ``'max'`` or ``'center0'``, an object with ``.loc`` (used together with
    *index*), a sequence with one entry per row, or a scalar divisor.

    Raises ``TypeError`` when *normer* has a length other than *rows*.
    """
    if normer is None:
        return frame.copy()
    # Compare strings by value; the isinstance guard keeps array-like
    # normalisers away from an element-wise ``==``.
    if isinstance(normer, str):
        if normer == 'mean':
            return frame.divide(frame.dropna(axis=1).mean(axis=1) + 10,
                                axis=0)
        if normer == 'max':
            return frame.divide(frame.dropna(axis=1).max(axis=1) + 10,
                                axis=0)
        if normer == 'center0':
            return 0.5 + 0.5 * frame.divide(
                frame.dropna(axis=1).abs().max(axis=1), axis=0)
    if index is not None and hasattr(normer, "loc"):
        return frame.divide(normer.loc[index], axis=0)
    if hasattr(normer, "__len__") and len(normer) != rows:
        raise TypeError("norm_rows_by should be the same shape "
                        "as the number of rows")
    return frame.divide(normer, axis=0)


def svg_heatmap(data, filename, row_labels=None, box_size=4,
                index=None,
                cmap=ISH, norm_rows_by=None, draw_row_labels=False,
                col_sep='', box_height=None, total_width=None,
                draw_box=False, draw_name=False, data_names=None,
                progress_bar=False,
                max_width=np.inf,
                spacers=None,
                cmap_by_prefix=None,
                split_columns=False,
                vspacer=30,
                hatch_nan=True, hatch_size=20,
                first_col='', last_col=''):
    """
    Draw heatmap as an SVG file stored in filename

    *data* can be either a 2D array-like type (list of lists, numpy array,
    pandas DataFrame, etc), or a tuple of 2D array-likes, in which case a
    separator will be added between each one in the output.  A ``None`` entry
    in the tuple only advances the drawing position (an extra gap).

    *cmap* is a matplotlib-like colormap (i.e. a callable that expects floats
    in the range 0.0-1.0.), or an iterable of the same length as the tuple
    *data* containing colormaps.  A falsy entry or the string 'default'
    selects the module-level ``ISH`` colormap.

    *row_labels* can be supplied, otherwise they will be taken from *index*
    or detected from the first item in *data*, if available, and if not they
    will be blank.

    *norm_rows_by* selects the row normalisation: ``None``, ``'mean'``,
    ``'max'``, ``'center0'``, a per-row divisor, or a tuple with one such
    value per dataset.

    If *total_width* is supplied, width of each dataset in *data* will be
    scaled to that constant. If *box_height* is supplied, the height of each
    row will be *box_height*, otherwise it will be equal to the width of each
    element. If neither are supplied, elements will be squares equal to
    *box_size*. IT IS STRONGLY RECOMMENDED that if supplying *total_width*,
    *box_height* also be specified, but this is not enforced.

    *draw_row_labels*, if True, will label the rows on the right hand side. As
    of 2013/09/03, this won't scale the SVG properly, so including the
    resulting file in an html element won't display properly.

    *spacers* is the distance between adjacent datasets.  Can either be a
    number, in which case it will apply to all datasets, or an iterable for
    different distances. If the iterable is shorter than the number of
    datasets, the last value will be repeated.

    *col_sep* separates the group prefix from the rest of a column label.  A
    vertical line is drawn where the prefix changes and, when
    *cmap_by_prefix* is given, it is called with the prefix to obtain the
    colormap for that group.  With *split_columns*, a DataFrame is split into
    one dataset per prefix (this requires a non-empty *col_sep*).

    *max_width* and *vspacer* control wrapping: once the next dataset would
    reach *max_width*, drawing restarts at the left edge, one block height
    plus *vspacer* further down.

    *hatch_nan* overlays a hatch pattern (tile size *hatch_size*) on cells
    whose normalised value is not finite; such cells are coloured with the
    mean of their finite horizontal neighbours.

    *first_col* and *last_col* are text labels placed under the first and
    last column of each dataset; *draw_box* outlines each dataset and
    *draw_name* writes the matching entry of *data_names* below it.

    Raises ValueError if *cmap* and *data* differ in length or if a dataset
    does not have the expected number of rows.
    """
    import svgwrite as svg
    import pandas as pd

    if split_columns and isinstance(data, pd.DataFrame):
        # Column filtering is done directly instead of via the removed
        # ``DataFrame.select`` API.
        colnames = sorted({col.split(col_sep)[0] for col in data.columns})
        data = tuple(
            data.loc[:, [col.startswith(colname) for col in data.columns]]
            for colname in colnames
        )
    elif not isinstance(data, tuple):
        data = (data,)

    rows = np.shape(data[0])[0]
    if index is not None:
        rows = len(index)
    if box_height is None:
        box_height = box_size

    # isfinite() rather than an identity test so that float('inf') is
    # treated the same as np.inf.
    if total_width is not None and np.isfinite(max_width):
        # NOTE(review): the height uses box_height + vspacer per wrapped row,
        # not rows * box_height + vspacer -- confirm this is intended.
        dwg = svg.Drawing(filename,
                          size=(max_width,
                                np.ceil((len(data) * total_width)/max_width)
                                * (box_height+vspacer)))
    else:
        dwg = svg.Drawing(filename)
    dwg.add(svg.base.Title(path.basename(filename)))

    # Diagonal hatch pattern referenced by "fill:url(#hatch)" below.
    pat = dwg.pattern(id='hatch', insert=(0, 0), size=(hatch_size, hatch_size),
                      patternUnits='userSpaceOnUse')
    g = pat.add(dwg.g(style="fill:none; stroke:#B0B0B0; stroke-width:1"))
    g.add(dwg.path(('M0,0', 'l{hatch},{hatch}'.format(hatch=hatch_size))))
    g.add(dwg.path(('M{hatch2},0 l{hatch2},{hatch2}'.format(hatch2=hatch_size/2).split())))
    g.add(dwg.path(('M0,{hatch2} l{hatch2},{hatch2}'.format(hatch2=hatch_size/2).split())))

    dwg.add(pat)

    if row_labels is None:
        if index is not None:
            row_labels = index
        elif hasattr(data[0], 'index'):
            row_labels = data[0].index
        else:
            row_labels = ['' for row in range(rows)]

    if not hasattr(cmap, "__len__"):
        cmap = [cmap for frame in data]

    if data_names is None:
        data_names = ["" for frame in data]

    if len(cmap) != len(data):
        raise ValueError("cmap and data should be the same length")

    if not hasattr(spacers, "__len__"):
        spacers = [spacers]
    else:
        # An empty iterable behaves like "no spacer given".
        spacers = list(spacers) or [None]
    while len(spacers) < len(data):
        spacers.append(spacers[-1])

    if not isinstance(norm_rows_by, tuple):
        norm_rows_by = repeat(norm_rows_by)

    x_start = 0
    y_start = 0
    y_diff = 0
    pbar = None
    pbar_val = 0
    if progress_bar:
        from progressbar import ProgressBar
        # zip objects have no len(); size the bar from the dataset count.
        pbar = ProgressBar(maxval=len(data) * rows).start()

    for frame, c_cmap, name, normer, spacer in zip(data, cmap, data_names,
                                                   norm_rows_by, spacers):
        if frame is None:
            # A None dataset is only a gap: advance and go to the next one.
            if total_width is not None:
                if spacer is None:
                    x_start += total_width * 1.1
                else:
                    x_start += total_width + spacer
            else:
                if spacer is None:
                    x_start += box_size
                else:
                    x_start += spacer
            if x_start > max_width:
                x_start = 0
                y_start += y_diff
            continue

        frame = pd.DataFrame(frame)
        norm_data = _heatmap_normalize(frame, normer, index, rows)

        if not c_cmap or str(c_cmap).lower() == 'default':
            c_cmap = ISH

        new_rows, new_cols = np.shape(frame)
        col_labels = frame.columns
        if new_rows != rows:
            raise ValueError("All input elements must have the same number of"
                             " rows (and same row meanings --unchecked)")

        if total_width is not None:
            box_size = total_width / float(new_cols)

        # Fill value for a non-finite cell that has no finite neighbour.
        centered = isinstance(normer, str) and 'center' in normer
        isolated_fill = .5 if centered else 0

        for i in range(rows):
            if pbar is not None:
                pbar.update(pbar_val)
                pbar_val += 1
            prefix = _heatmap_label_prefix(col_labels[0], col_sep)
            if cmap_by_prefix:
                c_cmap = cmap_by_prefix(prefix)
            for j in range(new_cols):
                # Switch colormap *before* painting so that the first cell of
                # a new prefix group already uses that group's colormap.
                col_base = _heatmap_label_prefix(col_labels[j], col_sep)
                new_group = col_base != prefix
                if new_group:
                    prefix = col_base
                    if cmap_by_prefix:
                        c_cmap = cmap_by_prefix(prefix)

                x_pos = x_start + box_size * j
                y_pos = y_start + i * box_height

                g = dwg.g()
                g.add(svg.base.Title("{}, {}: {:.2f}".format(
                    row_labels[i], col_labels[j], frame.iloc[i, j])))

                value = norm_data.iloc[i, j]
                hatch = not isfinite(value)
                if hatch:
                    # Colour the cell with the mean of its finite horizontal
                    # neighbours; the result is written back so the next
                    # column can use it as its left neighbour.
                    neighbours = []
                    if j > 0:
                        neighbours.append(norm_data.iloc[i, j - 1])
                    if j + 1 < new_cols:
                        neighbours.append(norm_data.iloc[i, j + 1])
                    neighbours = [v for v in neighbours if isfinite(v)]
                    if neighbours:
                        value = sum(neighbours) / len(neighbours)
                    else:
                        value = isolated_fill
                    norm_data.iloc[i, j] = value

                g.add(dwg.rect((x_pos, y_pos),
                               (box_size, box_height),
                               style="fill:#{:02x}{:02x}{:02x}"
                               .format(*[int(255*x) for x in c_cmap(value)])))
                dwg.add(g)
                if hatch_nan and hatch:
                    g.add(dwg.rect((x_pos, y_pos),
                                   (box_size, box_height),
                                   style="fill:url(#hatch)"))
                if new_group:
                    # Separator line at the left edge of a new prefix group.
                    g.add(dwg.line((x_pos, y_pos),
                                   (x_pos, y_start + (i + 1) * box_height),
                                   style="stroke-width:{}; stroke:#000000"
                                   .format(.1 * box_size)))

        # Labels sit on the bottom edge of the block; ``rows`` replaces the
        # leaked loop variable so an empty dataset does not raise NameError.
        block_bottom = y_start + rows * box_height
        dwg.add(dwg.text(first_col, (x_start, block_bottom)))
        dwg.add(dwg.text(last_col, (x_start + (new_cols - 1) * box_size,
                                    block_bottom)))
        if draw_box:
            dwg.add(dwg.rect((x_start, y_start + 0),
                             (new_cols*box_size, rows*box_height),
                             style="stroke-width:1; "
                             "stroke:#000000; fill:none"))
        if draw_name:
            dwg.add(dwg.text(name,
                             (x_start + box_size * new_cols / 2.0,
                              block_bottom + 13),
                             style="text-anchor: middle;"))

        frame_width = new_cols * box_size
        if total_width is not None:
            if spacer is None:
                x_start += total_width * 1.1
            else:
                x_start += total_width + spacer
        else:
            if spacer is None:
                x_start += frame_width + box_size
            else:
                x_start += frame_width + spacer

        y_diff = new_rows * box_height + vspacer
        # Without total_width the width of the next dataset is unknown; use
        # the one just drawn as the estimate instead of adding None.
        next_width = total_width if total_width is not None else frame_width
        if x_start + next_width >= max_width:
            x_start = 0
            y_start += y_diff

    if draw_row_labels:
        for i in range(rows):
            dwg.add(dwg.text(row_labels[i],
                             (x_start, y_start + i*box_height+box_height),
                             style='font-size:{}'.format(box_height),
                            ))
    if pbar is not None:
        pbar.finish()
    dwg.saveas(filename)
Ejemplo n.º 23
0


# Script entry point: load expression and allele-specific expression (ASE)
# tables, build a permuted copy of the ASE table, and filter the ASE rows.
if __name__ == "__main__":
    from Utils import fbgns

    args = parse_args()
    # Expression table with any column literally named '---' removed.
    expr = pd.read_table(args.expression_file, **pd_kwargs).drop('---', axis=1, errors='ignore')
    # ASE table: drop all-NA columns and rows, drop the rows listed in
    # args.overlapping_genes, and keep only the hybrid-cross columns.
    ase = (pd
           .read_table(args.ase_file,
                       **pd_kwargs
                       )
           .dropna(how='all', axis=1)
           .dropna(how='all', axis=0)
           .drop(args.overlapping_genes, errors='ignore')
           .select(**sel_startswith(('melXsim', 'simXmel')))
          )
    # Same index/column labels as `ase`, but with whole columns shuffled:
    # np.random.permutation reorders along the first axis of ase.T.
    ase_perm = pd.DataFrame(
            data=np.random.permutation(ase.T).T,
            index=ase.index,
            columns=ase.columns,
            )
    chrom_of = get_chroms()

    # Unless 'keep' is among args.male_samples, blank out the values of rows
    # whose chromosome is 'X' in the columns that start with those names.
    if args.male_samples and 'keep' not in args.male_samples:
        on_x = chrom_of[ase.index] == 'X'
        is_male = [col.startswith(args.male_samples) for col in ase.columns]
        ase.ix[on_x, is_male] = np.nan
    # Keep rows with at least args.min_samples non-NA values.
    ase = ase.ix[ase.T.count() >= args.min_samples]
    # Optionally keep only rows whose variance reaches args.min_var.
    if args.min_var:
        ase = ase.ix[ase.T.var() >= args.min_var]
Ejemplo n.º 24
0
                        default='analysis_godot/ase_summary_by_read.tsv',
                        type=str)
    return parser.parse_args()

# Script entry point: load the ASE table to fit, mask X-linked values in the
# male embryos, filter sparse rows, and derive a run-time estimate that is
# stored in cluster_args['time'].
if __name__ == "__main__":
    # Silence two expected warning messages (matched by regex on the text).
    filterwarnings("ignore", ".*Covariance of the parameters.*",)
    filterwarnings("ignore", ".*overflow encountered in exp.*",)
    #expr = pd.read_table('analysis_godot/summary_fb.tsv', **pd_kwargs).dropna(how='all', axis=1)
    args = parse_args()
    # Drop all-NA columns and rows, then keep only the hybrid-cross columns.
    ase = (pd
           .read_table(args.data_to_fit,
                       **pd_kwargs
                       )
           .dropna(how='all', axis=1)
           .dropna(how='all', axis=0)
           .select(**sel_startswith(('melXsim', 'simXmel')))
          )
    chrom_of = get_chroms()

    # Column-name prefixes of the embryos treated as male.
    males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
    # Rows absent from chrom_of are treated as not being on the X.
    on_x = [chrom_of[gene] == 'X' if gene in chrom_of else False for gene in ase.index]
    is_male = [col.startswith(males) for col in ase.columns]
    ase.ix[on_x, is_male] = np.nan
    # Keep rows with non-NA values in more than half of the columns.
    ase = ase.loc[ase.T.count() > len(ase.columns) / 2.0]

    # Estimate: 1.5 hours per 10,000 rows plus a fixed 2 hours.
    hours = len(ase) / 1e4 * 1.5 + 2
    # Formatted as H:M:00; the minutes field is not zero-padded.
    cluster_args['time'] = '{}:{}:00'.format(int(hours), int((hours % 1)*60))
    print("Estimate {} per iteration".format(cluster_args['time']))
    #cluster_args['queue'] = fyrd.Queue(user='******',
                                       #qtype=fyrd.queue.get_cluster_environment())
    print(cluster_args)
Ejemplo n.º 25
0
    ax.set_xlim(mel_atlas_pos.ix[:, 'X', mel_stage].min() - 15,
                mel_atlas_pos.ix[:, 'X', mel_stage].max() + 15)
    pu.minimize_ink(ax)
    savefig(path.join(
        cwd, 'analysis/results/{}_atlas_sim_M{}S{}'.format(
            target,
            mel_atlas_expr.minor_axis.get_loc(both_stage),
            sim_atlas_expr.minor_axis.get_loc(both_stage),
        )),
            transparent=True)

    from GetASEStats import slices_per_embryo
    virtual_slices = {}
    ase = (pd.read_table(
        path.join(cwd, 'analysis_godot/ase_summary_by_read.tsv'),
        **pd_kwargs).select(**sel_startswith(('melXsim', 'simXmel'))))
    n_slices = slices_per_embryo(ase)
    actual = []
    computed = []

    for embryo, n in n_slices.items():
        if n not in virtual_slices:
            virtual_slices[n] = make_virtual_slices(
                mel_expr_at_stage, sim_expr_at_matching,
                mel_atlas_pos.ix[:, :, mel_stage].T, n)
        actual.extend(ase.ix[target].select(startswith(embryo)))
        computed.extend(virtual_slices[n][1][0])

    vslice_25 = virtual_slices[25][1][0].copy()
    vslice_25[13:19] = np.nan
    vslice_25 = pd.Series(
from progressbar import ProgressBar as pb

male_hybrid_embryos = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
female_hybrid_embryos = ('melXsim_cyc14C_rep1', 'melXsim_cyc14C_rep2',
                         'simXmel_cyc14C_rep1')

if __name__ == "__main__":
    expr = pd.read_table('godot/summary_fb.tsv', **pd_kwargs)
    ase = (pd.read_table('godot/ase_summary.tsv',
                         **pd_kwargs).dropna(how='all', axis=0))

    chrom_of = get_chroms()

    ase = ase.select(lambda x: chrom_of[x] != 'X')

    expr_males = expr.select(**sel_startswith(male_hybrid_embryos))
    expr_females = expr.select(**sel_startswith(female_hybrid_embryos))

    ase_males = ase.select(**sel_startswith(male_hybrid_embryos))
    ase_females = ase.select(**sel_startswith(female_hybrid_embryos))

    ase_xs = get_xs(ase)
    ase_maternals = pd.Series(
        index=ase_xs.index,
        data=[1 if col.startswith('simXmel') else -1 for col in ase_xs.index])

    if 'logistic_females' in locals() and locals().get('recalculate', True):
        with Pool() as p:
            logistic_females = fit_all_ase(ase_females,
                                           logistic,
                                           ase_xs.ix[ase_females.columns],
Ejemplo n.º 27
0
    if "-sparse" in sys.argv:
        is_sparse = "sparse_"
        step = 10
    else:
        step = 1

    expr_min = 5
    eps = 1
    read_table_args = dict(index_col=0, keep_default_na=False, na_values=["---", ""])

    if "all_expr" not in locals():
        all_expr = pd.read_table("analysis/summary.tsv", **read_table_args).sort_index()
        top_expr = all_expr.max(axis=1)
        all_expr = all_expr.ix[top_expr > expr_min]
        all_expr = all_expr.ix[::step]
        wt = all_expr.select(**sel_startswith("WT"))
        bcd = all_expr.select(**sel_startswith("bcd"))
        zld = all_expr.select(**sel_startswith("zld"))
        g20 = all_expr.select(**sel_startswith("G20"))
        hb = all_expr.select(**sel_startswith("hb"))

        wts = bcds = zlds = g20s = hbs = 0
        for sub_df_name in "wt bcd zld g20 hb".split():
            sub_df = locals()[sub_df_name]
            cycs = {col.split("_sl")[0].split("_", 1)[1] for col in sub_df.columns}
            cyc_embs = {}
            for cyc in cycs:
                cyc_embs[cyc] = sub_df.select(**sel_contains(cyc))
            locals()[sub_df_name + "s"] = cyc_embs
    print("Read expression in")