import numpy as np
import plotnine as gg  # assumption: `gg` is the plotnine alias used below


def limits(x, y=None, xbreaks=None, ybreaks=None):
    if y is None:
        y = x

    x0, x1 = x
    y0, y1 = y

    if xbreaks is None:
        xbreaks = np.linspace(x0, x1, x1 - x0 + 1)
    if ybreaks is None:
        ybreaks = np.linspace(y0, y1, y1 - y0 + 1)

    # We want these plots to continue to the top and right (only the lower
    # limits are pinned).
    return [gg.coord_cartesian(xlim=x, ylim=y),
            gg.scale_x_continuous(limits=(x0, None), breaks=xbreaks),
            gg.scale_y_continuous(limits=(y0, None), breaks=ybreaks)]
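
# Minimal usage sketch (assumptions: plotnine imported as `gg` as above, plus
# a small hypothetical dataframe). Adding the returned list applies the coord
# and both scales in one step.
import pandas as pd

df_demo = pd.DataFrame({"x": range(6), "y": range(6)})  # hypothetical data
p = (gg.ggplot(df_demo, gg.aes("x", "y"))
     + gg.geom_point()
     + limits((0, 5)))  # y defaults to the same (0, 5) range
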
def test_text_aesthetics():
    # `df`, `n` and the unqualified geoms/scales below come from the
    # originating plotnine test module.
    p = (ggplot(df, aes(y='y', label='label')) +
         geom_text(aes('x', label='label'), size=15, ha='left') +
         geom_text(aes('x+1', angle='angle'),
                   size=15, va='top', show_legend=False) +
         geom_text(aes('x+2', label='label', alpha='z'),
                   size=15, show_legend=False) +
         geom_text(aes('x+3', color='factor(z)'),
                   size=15, show_legend=False) +
         geom_text(aes('x+5', size='z'),
                   ha='right', show_legend=False) +
         scale_size_continuous(range=(12, 30)) +
         scale_y_continuous(limits=(-0.5, n-0.5)))

    assert p == 'text_aesthetics'
Example #3
    "R": "R",
    "java": "Java",
    "scala": "Scala",
    "C": "C",
    "sas": "SAS"
}

skills_summary_lang = skills_summary_df[skills_summary_df.attribute.isin(
    languages)]
skills_summary_lang = skills_summary_lang.replace(to_replace=lang_clean)
skills_summary_lang = sort_df(skills_summary_lang, var_col="attribute")

lang_plot = (
    p9.ggplot(skills_summary_lang,
              p9.aes('attribute', 'value', fill='type')) +
    p9.geom_col(show_legend=False) +  # show_legend belongs on the geom, not in aes()
    p9.coord_flip() +
    p9.scale_y_continuous(expand=[0, 0]) +
    p9.labs(y="Frequency", x="Language", fill="") +
    p9.scale_fill_brewer(palette="Blues") +
    p9.facet_wrap('~type'))
lang_plot.save(filename='figs/lang_plot.png',
               height=5,
               width=5,
               units='in',
               dpi=1000)
lang_plot

# Software
programs = ["tableau", "docker", "bigquery", "jira", "spark", "hadoop"]

prog_clean = {
    "tableau": "Tableau",
    "docker": "Docker",
    "bigquery": "BigQuery",  # entries from here on reconstructed from
    "jira": "Jira",          # `programs`; the snippet was truncated here
    "spark": "Spark",
    "hadoop": "Hadoop"
}
Example #4
def plot_bar(data, nuclstr, column='value', factor=None, ymin=None, ymax=None,
             stat='identity', dpi=300, features=None, feature_types=['all'],
             add_features=[], funcgroups=None,
             shading_modes=['charge_functional'], usd=False,
             right_overhang_fix=None, debug=False, startnumber=1,
             cropseq=(0, None), aspect_ratio=None, reverse_seq=False,
             double_seq=False, transparent=True, fill_params=None,
             bar_position='stack', title=None):
    """
    A wrapper function to make a plot of data with bars along the sequnce
    input should be a dataframe with resid, segid column and 'value' 
    This one is inspired by seqplot/seqplot/pdb_plot.py
    """
    
    segid = data['segid'].values[0]

    if title is None:
        title = "Segid: %s, Type: %s" % (segid, nuclstr.components[segid]['type'])

    # NOTE: the original test `entity is 'DNA' or 'histone' or 'protein'`
    # was always true due to operator precedence; a membership test over the
    # protein-like entities is the assumed intent here.
    alphabet = generic_protein \
        if nuclstr.components[segid]['entity'] in ('histone', 'protein') \
        else generic_dna
    seq = Seq(str(nuclstr.seqs[segid]['fullseq']), alphabet)
    msar = MultipleSeqAlignment([SeqRecord(seq=seq,
                                           id=nuclstr.components[segid]['type'] + ':' + segid,
                                           name=nuclstr.components[segid]['type'] + ':' + segid)])
    if reverse_seq:
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq = msar[0].seq[::-1]

    if double_seq:
        msar.add_sequence('reverse', str(msar[0].seq[::-1]))

    msar = msar[:, cropseq[0]:cropseq[1]]
        
    
#     print("Seq to plot:",msar)
             
    #We need to get starting residue, currently for DNA chains only cifseq gets it correctly
    resid_start=nuclstr.seqs[segid]['resid_start']
    
    logger.debug("Starting resid",resid_start)
    

    overhang=nuclstr.seqs[segid]['overhangL']
    
    datafixed=data.copy()
    datafixed.loc[:,'resid']=datafixed.loc[:,'resid']-resid_start+overhang+1-cropseq[0]

    
    sl=len(msar[0].seq)

    # fn = shade.seqfeat2shadefeat(msar, feature_types=feature_types, force_feature_pos='bottom', debug=debug)
    if features is None:
        fn = nuclstr.shading_features[segid]
    else:
        fn = features
    fn2 = []
    for i in fn:
        if (i['style'] in feature_types) or ('all' in feature_types):
            fn2.append(i)

    fn2.extend(add_features)
    ruler = 'top' if usd else None
    shaded = ipyshade.shadedmsa4plot(msar, features=fn2, shading_modes=shading_modes,
                                     debug=debug, startnumber=startnumber,
                                     setends=[startnumber - 2, sl + startnumber + 2],
                                     funcgroups=funcgroups, ruler=ruler, density=200)
        
    # If sl % 10 == 0 we will have a ruler number hanging beyond the sequence
    # image, and we need to correct for that.
    if right_overhang_fix is None:
        if sl % 10 == 0:
            rof = 0.1 if sl < 100 else 0.5
        else:
            rof = 0
    else:
        rof = right_overhang_fix

    if aspect_ratio is not None:
        ar = aspect_ratio
    else:
        ar = 0.2 * 100. / sl
    # print(datafixed)
    plot = (ggplot(data=datafixed, mapping=aes(x='resid', y=column))
            # + geom_point(size=0.1)
            # + geom_bar(stat='identity', width=0.5, mapping=aes(fill=factor))
            + scale_x_continuous(limits=(0.5, sl + 0.5 + rof), expand=(0, 0.2), name='', breaks=[])
            # + scale_y_continuous(breaks=[0, 0.5, 1.0])
            + theme_light()
            + theme(aspect_ratio=ar, dpi=dpi, plot_margin=0, text=element_text(size=6),
                    legend_key_size=5, legend_position='bottom', legend_direction='horizontal'))
    # + facet_wrap('~ segid', dir='v') + guides(color=guide_legend(ncol=10))
    if factor is None:
        plot = plot + geom_bar(stat=stat, width=0.5)
    else:
        plot = plot + geom_bar(stat=stat, width=0.5, mapping=aes(fill=factor),
                               position=bar_position)

    if fill_params is not None:
        plot = plot + scale_fill_manual(**fill_params)

    if not usd:
        if ymax is not None:
            plot = plot + scale_y_continuous(limits=(None, ymax))
    else:
        if ymin is not None:
            plot = plot + scale_y_continuous(limits=(ymin, None))

    if ymax is None:
        ymax = data[column].max()
    if ymin is None:
        ymin = data[column].min()
    # print(ymax)
    plot = plot + geom_seq_x(seqimg=shaded.img, xlim=(1, sl + rof),
                             ylim=(ymin, ymax), usd=usd, aspect_ratio=ar,
                             transparent=transparent) + ggtitle(title)

    return plot
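
# Usage sketch (hedged): `nuclstr` must be the structure object provided by
# the surrounding package (it supplies `seqs`, `components` and
# `shading_features` for the segid); the firm contract from the docstring is
# a dataframe with 'resid', 'segid' and 'value' columns.
import pandas as pd

scores = pd.DataFrame({'segid': ['A'] * 5,
                       'resid': [1, 2, 3, 4, 5],
                       'value': [0.1, 0.4, 0.3, 0.9, 0.2]})  # hypothetical data
p = plot_bar(scores, nuclstr, column='value', title='Example scores')  # nuclstr assumed in scope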
def quick_color_check(target_matrix, source_matrix, num_chips):
    """ Quickly plot target matrix values against source matrix values to determine
    oversaturated color chips or other issues.

    Inputs:
    source_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the source image
    target_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the target image
    num_chips          = number of color card chips included in the matrices (integer)

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports (numpy as np, os, and PlantCV's `params` object are
    # module-level imports in the originating module)
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = ['red'] * num_chips
    blue = ['blue'] * num_chips
    green = ['green'] * num_chips

    # Make a column of chip numbers
    chip = np.arange(0, num_chips).reshape((num_chips, 1))
    chips = np.row_stack((chip, chip, chip))

    # Combine info
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.row_stack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0], 'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe
    dataset['chip'] = chips
    dataset = dataset.astype({'color': str, 'chip': str, 'target': float, 'source': float})

    # Make the plot
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Reset debug
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
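
# Usage sketch for quick_color_check (hedged): the docstring's layout is
# assumed, i.e. column 0 is the chip index and columns 1-3 hold average
# R, G, B per chip; values below are random stand-ins, and the module-level
# `np`, `os` and PlantCV `params` imports are assumed present.
rng = np.random.default_rng(0)
chip_idx = np.arange(22).reshape(-1, 1)
target = np.column_stack((chip_idx, rng.uniform(0, 255, (22, 3))))
source = np.column_stack((chip_idx, rng.uniform(0, 255, (22, 3))))
quick_color_check(target, source, num_chips=22)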
Example #6
    def plot_char_percent_vs_accuracy_smooth(self,
                                             expo=False,
                                             no_models=False,
                                             columns=False):
        if expo:
            if (os.path.exists('data/external/all_human_gameplay.json')
                    and not self.no_humans):
                with open('data/external/all_human_gameplay.json') as f:
                    all_gameplay = json.load(f)
                    frames = []
                    for event, name in [('parents', 'Dilettante'),
                                        ('maryland', 'Expert'),
                                        ('live', 'National')]:
                        if self.merge_humans:
                            name = 'Human'
                        gameplay = all_gameplay[event]
                        if event != 'live':
                            control_correct_positions = gameplay[
                                'control_correct_positions']
                            control_wrong_positions = gameplay[
                                'control_wrong_positions']
                            control_positions = control_correct_positions + control_wrong_positions
                            control_positions = np.array(control_positions)
                            control_result = np.array(
                                len(control_correct_positions) * [1] +
                                len(control_wrong_positions) * [0])
                            argsort_control = np.argsort(control_positions)
                            control_x = control_positions[argsort_control]
                            control_sorted_result = control_result[
                                argsort_control]
                            control_y = control_sorted_result.cumsum(
                            ) / control_sorted_result.shape[0]
                            control_df = pd.DataFrame({
                                'correct': control_y,
                                'char_percent': control_x
                            })
                            control_df['Dataset'] = 'Regular Test'
                            control_df['Guessing_Model'] = f' {name}'
                            frames.append(control_df)

                        adv_correct_positions = gameplay[
                            'adv_correct_positions']
                        adv_wrong_positions = gameplay['adv_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(
                            len(adv_correct_positions) * [1] +
                            len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum(
                        ) / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({
                            'correct': adv_y,
                            'char_percent': adv_x
                        })
                        adv_df['Dataset'] = 'Round 1 - IR Interface'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                        if len(gameplay['advneural_correct_positions']) > 0:
                            adv_correct_positions = gameplay[
                                'advneural_correct_positions']
                            adv_wrong_positions = gameplay[
                                'advneural_wrong_positions']
                            adv_positions = adv_correct_positions + adv_wrong_positions
                            adv_positions = np.array(adv_positions)
                            adv_result = np.array(
                                len(adv_correct_positions) * [1] +
                                len(adv_wrong_positions) * [0])
                            argsort_adv = np.argsort(adv_positions)
                            adv_x = adv_positions[argsort_adv]
                            adv_sorted_result = adv_result[argsort_adv]
                            adv_y = adv_sorted_result.cumsum(
                            ) / adv_sorted_result.shape[0]
                            adv_df = pd.DataFrame({
                                'correct': adv_y,
                                'char_percent': adv_x
                            })
                            adv_df['Dataset'] = 'Round 2 - NN Interface'
                            adv_df['Guessing_Model'] = f' {name}'
                            frames.append(adv_df)

                    human_df = pd.concat(frames)
            if no_models:
                p = ggplot(human_df) + geom_line()
            else:
                df = self.char_plot_df
                if 1 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 1 - IR Interface']
                if 2 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 2 - IR Interface']
                    df = df[df['Dataset'] != 'Round 2 - NN Interface']
                p = ggplot(df)

                if (os.path.exists('data/external/all_human_gameplay.json')
                        and not self.no_humans):
                    eprint('Loading human data')
                    p = p + geom_line(data=human_df)

            if columns:
                facet_conf = facet_wrap('Guessing_Model', ncol=1)
            else:
                facet_conf = facet_wrap('Guessing_Model', nrow=1)

            if not no_models:
                if self.mvg_avg_char:
                    chart = stat_smooth(method='mavg',
                                        se=False,
                                        method_args={'window': 400})
                else:
                    chart = stat_summary_bin(fun_data=mean_no_se,
                                             bins=20,
                                             shape='.')
            else:
                chart = None

            p = (p + facet_conf +
                 aes(x='char_percent', y='correct', color='Dataset'))
            if chart is not None:
                p += chart
            p = (
                p + scale_y_continuous(breaks=np.linspace(0, 1, 11)) +
                scale_x_continuous(breaks=[0, .5, 1]) +
                xlab('Percent of Question Revealed') + ylab('Accuracy') +
                theme(
                    #legend_position='top', legend_box_margin=0, legend_title=element_blank(),
                    strip_text_x=element_text(margin={
                        't': 6,
                        'b': 6,
                        'l': 1,
                        'r': 5
                    })) +
                scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF'],
                                   name='Questions'))
            if self.title != '':
                p += ggtitle(self.title)

            return p
        else:
            return (
                ggplot(self.char_plot_df) +
                aes(x='char_percent', y='correct', color='Guessing_Model') +
                stat_smooth(
                    method='mavg', se=False, method_args={'window': 500}) +
                scale_y_continuous(breaks=np.linspace(0, 1, 21)))
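
# The human curves above use a compact pattern: pool the answer positions of
# correct and wrong responses, sort them, and take the cumulative share of
# all questions answered correctly up to each position. A standalone sketch
# with hypothetical numbers:
import numpy as np

correct_positions = [0.2, 0.5, 0.9]  # fraction of question revealed at answer
wrong_positions = [0.1, 0.4]
positions = np.array(correct_positions + wrong_positions)
result = np.array(len(correct_positions) * [1] + len(wrong_positions) * [0])
order = np.argsort(positions)
x = positions[order]
y = result[order].cumsum() / result.shape[0]  # cumulative accuracy curve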
Example #7
def line_plot(df,
              x,
              y,
              group=None,
              facet_x=None,
              facet_y=None,
              aggfun='sum',
              err=None,
              show_points=False,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Aggregates data in df and plots multiple columns as a line chart.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str or list of str
      quoted expression(s) to be plotted on the y axis
    group : str
      quoted expression to be used as group (i.e. color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (e.g. sum, mean, median ...)
    err : str
      quoted expression to be used as error shaded area
    show_points : bool
      show/hide markers
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object
    '''

    if group is not None and isinstance(y, list) and len(y) > 1:
        log.error(
            "groups can be specified only when a single y column is present")
        raise ValueError(
            "groups can be specified only when a single y column is present")

    if err is not None and isinstance(y, list) and len(y) > 1:
        log.error(
            "err can be specified only when a single y column is present")
        raise ValueError(
            "err can be specified only when a single y column is present")

    if isinstance(y, list) and len(y) == 1:
        y = y[0]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    if isinstance(y, list):

        ys = []
        for i, var in enumerate(y):
            ys.append('y_{}'.format(i))
            names['y_{}'.format(i)], variables['y_{}'.format(i)] = unname(var)

        # aggregate data
        tmp_gdata = agg_data(dataframe,
                             variables,
                             groups,
                             aggfun,
                             fill_groups=True)
        groups_present = [
            c for c in ['x', 'facet_x', 'facet_y'] if c in tmp_gdata.columns
        ]
        gdata = pd.melt(tmp_gdata,
                        groups_present,
                        var_name='group',
                        value_name='y')
        gdata['group'] = gdata['group'].replace({var: names[var] for var in ys})

        # update values for plotting
        names['y'] = 'Value'
        names['group'] = 'Variable'
        group = 'Variable'

    else:

        names['y'], variables['y'] = unname(y)
        if err is not None:
            names['err'], variables['err'] = unname(err)

        # aggregate data
        gdata = agg_data(dataframe,
                         variables,
                         groups,
                         aggfun,
                         fill_groups=True)

    # reorder columns
    gdata = gdata[[
        c for c in ['x', 'y', 'err', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]
    if err is not None:
        gdata['ymax'] = gdata['y'] + gdata['err']
        gdata['ymin'] = gdata['y'] - gdata['err']

    # init plot obj
    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"),
                          group=1,
                          colour=ez_colors(1)[0])
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y"),
                               group=1,
                               colour=ez_colors(1)[0])
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x", ymax="ymax", ymin="ymin"),
                                group=1,
                                fill=ez_colors(1)[0],
                                alpha=0.2)
    else:
        g += p9.geom_line(
            p9.aes(x="x", y="y", group="factor(group)",
                   colour="factor(group)"))
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x",
                                       ymax="ymax",
                                       ymin="ymin",
                                       fill="factor(group)"),
                                alpha=0.2)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))
        g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
      p9.xlab(names['x']) + \
      p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
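
# Usage sketch (hedged): EZPlot, theme_ez, ez_colors etc. come from the
# surrounding ezplot module; the dataframe and column names are hypothetical.
import pandas as pd

sales = pd.DataFrame({'month': list(range(1, 7)) * 2,
                      'region': ['north'] * 6 + ['south'] * 6,
                      'revenue': [10, 12, 14, 13, 15, 18,
                                  8, 9, 11, 10, 12, 13]})
g = line_plot(sales, x='month', y='revenue', group='region',
              aggfun='sum', show_points=True)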
Example #8
def marginal_plot(df,
                  x,
                  y,
                  group=None,
                  facet_x=None,
                  facet_y=None,
                  aggfun='sum',
                  bins=21,
                  use_quantiles=False,
                  label_pos='auto',
                  label_function=ez_labels,
                  sort_groups=True,
                  base_size=10,
                  figure_size=(6, 3)):

    '''
    Bin the data in a df and plot it using lines.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (i.e. color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (e.g. sum, mean, median ...)
    bins : int or tuple
      number of bins to be used
    use_quantiles : bool
      bin data using quantiles
    label_pos : str
      Use count label on each point. Choose between None, 'auto' or 'force'
    label_function : callable
      labelling function
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object
    '''

    if label_pos not in [None, 'auto', 'force']:
        log.error("label_pos not recognized")
        raise NotImplementedError("label_pos not recognized")
    elif label_pos == 'auto':
        show_labels = bins <= 21 and group is None
    else:
        show_labels = label_pos == 'force'

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'], [x,  group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c:c for c in tmp_df.columns if c in ['x', 'group', 'facet_x', 'facet_y']}
    new_variables = {'y': 'y'}

    # bin data
    if use_quantiles:
        quantile_groups = [c for c in tmp_df.columns if c in ['group', 'facet_x', 'facet_y']]
        if len(quantile_groups)>0:
            tmp_df['x'] = tmp_df.groupby(quantile_groups)['x'].apply(lambda x: qbin_data(x, bins))
        else:
            tmp_df['x'] = qbin_data(tmp_df['x'], bins)
    else:
        tmp_df['x'], _, _ = bin_data(tmp_df['x'], bins, None)

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, aggfun, fill_groups=False)

    # reorder columns
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y'] if c in gdata.columns]]

    # init plot obj
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"), group=1, colour=colors[0])
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y"), group=1, colour=colors[0])
    else:
        g += p9.geom_line(p9.aes(x="x", y="y", group="factor(group)", colour="factor(group)"))
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        g += p9.scale_color_manual(values=colors)

    # set labels
    if show_labels:
        groups_to_count = [c for c in tmp_df.columns if c in ['x', 'group', 'facet_x', 'facet_y']]
        tmp_df['counts'] = 1
        top_labels = tmp_df \
            .groupby(groups_to_count)['counts'] \
            .sum() \
            .reset_index()
        top_labels['label'] = label_function(top_labels['counts'])

        # make sure labels and data can be joined
        for c in ['group', 'facet_x', 'facet_y']:
            if c in tmp_df.columns:
                try:
                    top_labels[c] = pd.Categorical(top_labels[c].astype(str),
                                                   categories=g.data[c].cat.categories,
                                                   ordered=g.data[c].cat.ordered)
                except Exception:
                    pass
        g.data = pd.merge(g.data, top_labels, on=groups_to_count, how='left')
        g.data['label_pos'] = g.data['y'] + \
            np.sign(g.data['y']) * g.data['y'].abs().max() * 0.02

        g += p9.geom_text(p9.aes(x='x', y='label_pos', label='label'),
                          color="#000000",
                          size=base_size * 0.7,
                          ha='center',
                          va='bottom')
    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')
        
    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))
    return g
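
# Usage sketch under the same assumptions: bin a hypothetical numeric
# `score` column into 15 bins and plot the mean `outcome` per bin.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
data = pd.DataFrame({'score': rng.normal(size=500),
                     'outcome': rng.uniform(size=500)})
g = marginal_plot(data, x='score', y='outcome', aggfun='mean', bins=15)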
Example #9
def variable_histogram(df,
                       x,
                       group=None,
                       facet_y=None,
                       w='1',
                       bins=21,
                       bin_width=None,
                       position='stack',
                       normalize=False,
                       base_size=10,
                       figure_size=(6, 3)):
    '''
    Plot a 1-d histogram

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str or list
      quoted expressions to be plotted on the x axis
    group : str
      quoted expression to be used as group (i.e. color)
    facet_y : str
      quoted expression to be used as facet
    w : str
      quoted expression representing histogram weights (default is 1)
    bins : int or tuple
      number of bins to be used
    bin_width : float or tuple
      bin width to be used
    position : str
      if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
      normalize histogram counts
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # TODO: performance improvement
    # TODO: add support for categorical variables in x

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_width should be defined")
        raise ValueError("Either bins or bin_width should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one between bins or bin_width should be defined")
        raise ValueError("Only one between bins or bin_width should be defined")

    if isinstance(x, str):
        x = [x]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['group', 'facet_y'], [group, facet_y]):
        names[label], groups[label] = unname(var)
    xs = []
    for i, var in enumerate(x):
        xs.append('x_{}'.format(i))
        names['x_{}'.format(i)], groups['x_{}'.format(i)] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {
        c: c
        for c in tmp_df.columns if c in ['group', 'facet_y'] + xs
    }
    non_x_groups = [g for g in new_groups.keys() if g not in xs]

    # bin data (if necessary)
    bins_x = {}
    bin_width_x = {}
    for x in xs:
        if tmp_df[x].dtypes != np.dtype('O'):
            tmp_df[x], bins_x[x], bin_width_x[x] = bin_data(
                tmp_df[x], bins, bin_width)
        else:
            bin_width_x[x] = 1

    # aggregate data and reorder columns
    df_ls = []
    for x in xs:
        # aggregate data
        groups = {g: g for g in non_x_groups}
        groups[x] = x
        single_df = agg_data(tmp_df,
                             variables,
                             groups,
                             'sum',
                             fill_groups=True)
        single_df.fillna(0, inplace=True)
        single_df['facet_x'] = names[x]
        single_df.rename(columns={x: 'x'}, inplace=True)

        # normalize
        if normalize:
            if len(non_x_groups) == 0:
                single_df['w'] = single_df['w'] / (single_df['w'].sum() *
                                                   bin_width_x[x])
            else:
                single_df['w'] = single_df.groupby(non_x_groups)['w'].apply(
                    lambda z: z / (z.sum() * bin_width_x[x]))

        df_ls.append(single_df)
    gdata = pd.concat(df_ls)
    gdata = gdata[[
        c for c in ['x', 'w', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # set groups
    for single_df in df_ls:
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             data=single_df,
                             stat='identity',
                             colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x",
                                    y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             data=single_df,
                             colour=None,
                             stat='identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_y is None:
        g += p9.facet_wrap('~facet_x', scales='free')
    else:
        g += p9.facet_grid('facet_y~facet_x', scales='free')

    # set x scale
    g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab('Value') + \
        p9.ylab('Counts')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
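
# Usage sketch under the same assumptions: two hypothetical numeric columns
# drawn as free-scale histogram facets.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
data = pd.DataFrame({'a': rng.normal(0, 1, 1000),
                     'b': rng.normal(3, 2, 1000)})
g = variable_histogram(data, x=['a', 'b'], bins=30, normalize=True)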


# In[14]:


from sklearn.calibration import calibration_curve
cnn_y, cnn_x = calibration_curve(confidence_score_df.curated_dsh, confidence_score_df.uncal, n_bins=10)
all_cnn_y, all_cnn_x = calibration_curve(confidence_score_df.curated_dsh, confidence_score_df.cal, n_bins=10)

calibration_df = pd.DataFrame.from_records(
    list(map(lambda x: {"predicted":x[0], "actual": x[1], "model_calibration":'before'}, zip(cnn_x, cnn_y)))
    + list(map(lambda x: {"predicted":x[0], "actual": x[1], "model_calibration":'after'}, zip(all_cnn_x, all_cnn_y)))
)
calibration_df.to_csv("output/dag_calibration.tsv", sep="\t", index=False)


# In[15]:


(
    p9.ggplot(calibration_df, p9.aes(x="predicted", y="actual", color="model_calibration"))
    + p9.geom_point()
    + p9.geom_line(p9.aes(group="factor(model_calibration)"))
    + p9.geom_abline(intercept=0, slope=1, linetype='dashed')
    + p9.scale_y_continuous(limits=[0,1])
    + p9.scale_x_continuous(limits=[0,1])
    + p9.theme_bw()
)

Example #12
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Fits logistic regression to predict labels.'
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-h5',
        '--h5_anndata',
        action='store',
        dest='h5',
        required=True,
        help='H5 AnnData file where clusters have been saved to cluster slot.')

    # parser.add_argument(
    #     '-ncpu', '--number_cpu',
    #     action='store',
    #     dest='number_cpu',
    #     default=50,
    #     type=int,
    #     help='Number of CPUs to use. Since we are testing the dask backend,\
    #         this corresponds to the number of CPUs available across all of\
    #         the worker jobs we spin out.\
    #         (default: %(default)s)'
    # )

    parser.add_argument('-s',
                        '--sparsity_l1',
                        action='store',
                        dest='sparsity_l1',
                        default=0.0001,
                        type=float,
                        help='Smaller values specify stronger regularization.\
            (default: %(default)s)')

    parser.add_argument('-nepoch',
                        '--number_epoch',
                        action='store',
                        dest='number_epoch',
                        default=25,
                        type=int,
                        help='Number of epochs.\
            (default: %(default)s)')

    parser.add_argument(
        '-bs',
        '--batch_size',
        action='store',
        dest='batch_size',
        default=32,
        type=int,
        help='Batch size. Divides the dataset into n batches and updates the\
            weights at the end of each one.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsc',
        '--train_size_cells',
        action='store',
        dest='train_size_cells',
        default=0,
        type=int,
        help='Number of cells to use for training set. If > 0 all\
            remaining cells not randomly selected for training will be used\
            for the test set. Overrides <train_size_fraction>.\
            (default: %(default)s)')

    parser.add_argument('-tsf',
                        '--train_size_fraction',
                        action='store',
                        dest='train_size_fraction',
                        default=0.67,
                        type=float,
                        help='Fraction of the data to use for training set.\
            (default: %(default)s)')

    parser.add_argument(
        '--dict_add',
        action='store',
        dest='dict_add',
        default='',
        type=str,
        help='Additional information to add to output model_report.\
            Format: key::value:::key2::value2.\
            Example: method::leiden:::resolution::3.0\
            (default: %(default)s)')

    parser.add_argument('--grid_search',
                        action='store_true',
                        dest='grid_search',
                        default=False,
                        help='Run a grid search of hyperparameters.\
            (default: %(default)s)')

    parser.add_argument('--memory_limit',
                        action='store',
                        dest='memory_limit',
                        default=50,
                        type=int,
                        help='Memory limit in Gb.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: keras_model-<params>)')
    options = parser.parse_args()

    verbose = True

    # Set GPU memory limits
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        # For TF v1
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # session = tf.Session(config=config)

        # For TF v2
        try:
            # Method 1:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            # Method 2:
            # Restrict TensorFlow to only allocate 1GB of memory on the first
            # GPU
            # tf.config.experimental.set_virtual_device_configuration(
            #     gpus[0],
            #     [tf.config.experimental.VirtualDeviceConfiguration(
            #         memory_limit=options.memory_limit*1024
            #     )])
            # logical_gpus = tf.config.list_logical_devices('GPU')
            # print(
            #     len(gpus),
            #     "Physical GPUs,",
            #     len(logical_gpus),
            #     "Logical GPUs"
            # )
        except RuntimeError as e:
            # Virtual devices must be set before GPUs have been initialized
            print(e)
    else:
        raise Exception('ERROR: no GPUs detected.')

    # Get additional data we are going to append to the output model info
    dict_add = {}
    if options.dict_add != '':
        for item in options.dict_add.split(':::'):
            _tmp = item.split('::')
            if len(_tmp) != 2:
                raise Exception('ERROR: check dict_add.')
            else:
                dict_add[_tmp[0]] = _tmp[1]
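    # e.g. --dict_add 'method::leiden:::resolution::3.0' yields
    # {'method': 'leiden', 'resolution': '3.0'}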
    print(dict_add)

    # Load the AnnData file.
    # This file should already have clusters identified and saved to the
    # clusters slot.
    adata = sc.read_h5ad(filename=options.h5)

    # Set X to cp10k
    # adata.X = np.expm1(adata.layers['log1p_cp10k'])
    # Set X to ln(cp10k+1)
    # NOTE: Testing with 100k TI dataset, we were able to achieve higher
    # accuracy with log1p_cp10k - likely because of the better spread in the
    # distribution.
    adata.X = adata.layers['log1p_cp10k']
    # Set X to raw counts
    # adata.X = adata.layers['counts']

    # Add some info from adata to dict_add
    for key, value in adata.uns['neighbors']['params'].items():
        dict_add['neighbors__{}'.format(key)] = value
    for key, value in adata.uns['cluster']['params'].items():
        dict_add['cluster__{}'.format(key)] = value

    # If train_size_cells, override the fraction so that the total number of
    # cells in the training set will be equal to train_size_cells.
    train_size_fraction = options.train_size_fraction
    if options.train_size_cells > 0:
        if options.train_size_cells >= adata.n_obs:
            raise Exception('Invalid train_size_cells.')
        train_size_fraction = (
            1 - ((adata.n_obs - options.train_size_cells) / adata.n_obs))
        if verbose:
            print(
                'Set train_size_fraction to: {}.'.format(train_size_fraction))
    if verbose:
        print('Number cells training ({}) and testing ({}).'.format(
            int(train_size_fraction * adata.n_obs),
            int((1 - train_size_fraction) * adata.n_obs)))

    # Set X and y
    X = adata.X
    y = adata.obs['cluster'].values

    # Set other variables
    sparsity_l1 = options.sparsity_l1
    n_epochs = options.number_epoch
    batch_size = options.batch_size

    # Center and scale the data
    if sp.sparse.issparse(X):
        X = X.todense()
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X)
    if verbose:
        print('center={} scale={}'.format(True, True))

    # One hot encode y (the cell type classes)
    # encode class values as integers
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    print('Found {} clusters'.format(len(encoder.classes_)))

    # Define the model
    # NOTE: Defaults determined via grid search of 160k TI single cells
    def classification_model(optimizer='sgd',
                             activation='softmax',
                             loss='categorical_crossentropy',
                             sparsity_l1__activity=0.0001,
                             sparsity_l2__activity=0.0,
                             sparsity_l1__kernel=0.0,
                             sparsity_l2__kernel=0.0,
                             sparsity_l1__bias=0.0,
                             sparsity_l2__bias=0.0):
        # create model
        model = Sequential()
        # Use a “softmax” activation function in the output layer. This is to
        # ensure the output values are in the range of 0 and 1 and may be used
        # as predicted probabilities.
        #
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax
        # Softmax assigns decimal probabilities to each class in a multi-class
        # problem. Those decimal probabilities must add up to 1.0. This
        # additional constraint helps training converge more quickly than it
        # otherwise would. Softmax is implemented through a neural network
        # layer just before the output layer. The Softmax layer must have the
        # same number of nodes as the output layer.
        # Softmax assumes that each example is a member of exactly one class.
        #
        # Softmax should be used for multi-class prediction with single label
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/video-lecture
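        # For intuition: softmax([2.0, 1.0, 0.1]) ~= [0.659, 0.242, 0.099],
        # which sums to 1.0.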
        # NOTE: input dimension = number of features your data has
        model.add(
            Dense(
                len(encoder.classes_),  # output dim is number of classes
                use_bias=True,  # intercept
                activation=activation,  # softmax, sigmoid
                activity_regularizer=L1L2(l1=sparsity_l1__activity,
                                          l2=sparsity_l2__activity),
                kernel_regularizer=L1L2(l1=sparsity_l1__kernel,
                                        l2=sparsity_l2__kernel),
                bias_regularizer=L1L2(l1=sparsity_l1__bias,
                                      l2=sparsity_l2__bias),
                input_dim=X.shape[1]))
        # Example of adding additional layers
        # model.add(Dense(8, input_dim=4, activation='relu'))
        # model.add(Dense(3, activation='softmax'))

        # Metrics to check out over training epochs
        mets = [
            # loss,
            keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
            # keras.metrics.TruePositives(name='tp'),
            # keras.metrics.FalsePositives(name='fp'),
            # keras.metrics.TrueNegatives(name='tn'),
            # keras.metrics.FalseNegatives(name='fn'),
            # keras.metrics.Precision(name='precision'),
            # keras.metrics.Recall(name='recall'),
            # keras.metrics.AUC(name='auc'),
            keras.metrics.BinaryAccuracy(name='accuracy')
        ]
        # Use the Adam gradient descent optimization algorithm with a
        # logarithmic loss function, which is called
        # “categorical_crossentropy” in Keras.
        # UPDATE: sgd works better empirically.
        model.compile(
            optimizer=optimizer,  # adam, sgd
            loss=loss,
            metrics=mets)

        return model

    # Now, either call a grid search or specific model fit
    if options.grid_search:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
        out_file_base = '{}-grid_search'.format(out_file_base)

        # Call grid search of various parameters
        grid_result, df_grid_result = keras_grid(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            n_epochs=n_epochs,
            batch_size=batch_size)

        # NOTE: This will fail because we can't pickle KerasClassifier. This
        # is fine though because the results are saved in tsv.gz format below.
        # Save the results
        # out_f = '{}-grid_result.gz'.format(out_file_base)
        # joblib.dump(
        #     grid_result,
        #     out_f,
        #     compress=('gzip', 3)
        # )
        # Load the model
        # lr = joblib.load(
        #     'test-lr_model.joblib.gz'
        # )
        # print(lr)

        # Save the results of our search to tsv
        out_f = '{}-grid_result.tsv.gz'.format(out_file_base)
        df_grid_result.to_csv(out_f,
                              sep='\t',
                              index=False,
                              quoting=csv.QUOTE_NONNUMERIC,
                              na_rep='',
                              compression=compression_opts)

        # Add a single columns that summarizes params
        param_columns = [
            col for col in df_grid_result.columns if 'param__' in col
        ]
        df_grid_result['params'] = df_grid_result[param_columns].astype(
            str).apply(lambda x: '-'.join(x), axis=1)

        # Plot the distribution of accuracy across folds
        split_columns = [
            col for col in df_grid_result.columns if 'split' in col
        ]
        split_columns = [col for col in split_columns if '_test_score' in col]
        df_plt = pd.melt(df_grid_result,
                         id_vars=['params'],
                         value_vars=split_columns)
        gplt = plt9.ggplot(df_plt, plt9.aes(x='params', y='value'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_boxplot(alpha=0.8)
        gplt = gplt + plt9.geom_jitter(alpha=0.75)
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0
            # limits=[0, 1]
        )
        gplt = gplt + plt9.labs(x='Parameters', y='Score', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-score.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

        # Plot the mean time and std err for fitting results
        gplt = plt9.ggplot(df_grid_result,
                           plt9.aes(x='params', y='mean_fit_time'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_point()
        gplt = gplt + plt9.geom_errorbar(plt9.aes(
            ymin='mean_fit_time-std_fit_time',
            ymax='mean_fit_time+std_fit_time'),
                                         width=0.2,
                                         position=plt9.position_dodge(0.05))
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
        gplt = gplt + plt9.labs(x='Parameters', y='Mean fit time', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-fit_time.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

    else:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
            # out_file_base = '{}-center={}-scale={}'.format(
            #     out_file_base,
            #     center,
            #     scale
            # )
            out_file_base = '{}-batch_size={}-epochs={}'.format(
                out_file_base, batch_size, n_epochs)
            out_file_base = '{}-sparsity_l1={}-train_size_fraction={}'.format(
                out_file_base,
                str(sparsity_l1).replace('.', 'pt'),
                str(train_size_fraction).replace('.', 'pt'))

        # Fit the specific model and save the results
        model, model_report, y_prob_df, history = fit_model_keras(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            sparsity_l1=sparsity_l1,
            sparsity_l2=0.0,
            n_epochs=n_epochs,
            batch_size=batch_size,
            train_size_fraction=train_size_fraction)

        # Save the model, weights (coefficients), and bias (intercept)
        model.save('{}.h5'.format(out_file_base),
                   overwrite=True,
                   include_optimizer=True)

        # Save the model and weights (coefficients) separately
        # open('{}.json'.format(out_file_base), 'w').write(model.to_json())
        open('{}.yml'.format(out_file_base), 'w').write(model.to_yaml())
        model.save_weights('{}-weights.h5'.format(out_file_base))
        # Example read functions
        # model = model_from_yaml(open('my_model_architecture.yaml').read())
        # model.load_weights('my_model_weights.h5')

        # Save the model report
        # Add column telling us if this is cluster or summary value
        is_cluster = []
        for i in model_report.index:
            if i in encoder.classes_:
                is_cluster.append(True)
            else:
                is_cluster.append(False)
        model_report['is_cluster'] = is_cluster
        # Add in extra data
        model_report['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                model_report[key] = value
        print(model_report)
        out_f = '{}-model_report.tsv.gz'.format(out_file_base)
        model_report.to_csv(out_f,
                            sep='\t',
                            index=True,
                            index_label='cell_label',
                            quoting=csv.QUOTE_NONNUMERIC,
                            na_rep='',
                            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Save the test results - each row is a cell and the columns are the
        # prob of that cell belonging to a particular class.
        # Add in extra data
        y_prob_df['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                y_prob_df[key] = value
        out_f = '{}-test_result.tsv.gz'.format(out_file_base)
        y_prob_df.to_csv(
            out_f,
            sep='\t',
            index=False,  # NOTE: Not adding the label to test_result index.
            # index_label='cell_label',
            quoting=csv.QUOTE_NONNUMERIC,
            na_rep='',
            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Make a matrix of weights per gene
        # Columns = genes tested and rows = cell type label
        weight, bias = model.layers[-1].get_weights()
        # weight, bias = model.get_layer("output").get_weights()
        df_weights = pd.DataFrame.from_records(
            weight,
            index=adata.var.index,  # index is gene
            columns=encoder.classes_)
        # Save the weights dataframe.
        out_f = '{}-weights.tsv.gz'.format(out_file_base)
        df_weights.to_csv(out_f,
                          sep='\t',
                          index=True,
                          index_label='ensembl_gene_id',
                          quoting=csv.QUOTE_NONNUMERIC,
                          na_rep='',
                          compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot the number of features with non-zero coefficients in each
        # cluster.
        out_f = '{}-n_features.png'.format(out_file_base)
        df_plt = pd.DataFrame({
            'classes': df_weights.columns,
            'features': (df_weights != 0).sum(axis=0)
        })
        df_plt = df_plt.set_index('classes')
        # print(df_plt)
        # Add in categories with no predictive model (e.g., because they were
        # too few in training).
        for i in adata.obs['cluster'].cat.categories:
            if i not in df_plt.index:
                # DataFrame.append was removed in pandas 2.0; assign via loc.
                df_plt.loc[i] = 0
        fig = plt.figure(figsize=(max(0.5 * len(df_plt.index), 5), 4))
        # plt.bar(lr.classes_, n_features)
        plt.bar(df_plt.index, df_plt['features'])
        plt.xlabel('Cluster')
        plt.ylabel('Features with coefficient != 0')
        plt.xticks(rotation=90)
        for i in df_plt.index:
            plt.annotate(str(df_plt.loc[i, 'features']),
                         xy=(i, df_plt.loc[i, 'features']))
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)

        # Plot ROC of the test and truth.
        out_f = '{}-roc.png'.format(out_file_base)
        fig = plt.figure()
        cell_label_true = y_prob_df.pop('cell_label_true')
        # Drop columns that are not cell type labels
        for i in y_prob_df.columns:
            if 'class__' not in i:
                del y_prob_df[i]
        plot_roc(y_prob_df.values, cell_label_true.values, y_prob_df.columns)
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot metrics vs cluster size to see if smaller clusters have poorer
        # metric measures.
        df_plt = model_report.fillna(0)
        for i in df_plt.index:
            if i not in encoder.classes_:
                df_plt = df_plt.drop(i)
        for i in ['AUC', 'f1-score', 'average_precision_score', 'MCC']:
            out_f = '{}-cluster_size_{}.png'.format(out_file_base, i)
            fig = plt.figure()
            plt.scatter(df_plt['n_cells_full_dataset'], df_plt[i], alpha=0.5)
            plt.xlabel('Number of cells in cluster (full dataset)')
            plt.ylabel(i)
            if i in ['AUC', 'f1-score', 'average_precision_score']:
                plt.ylim(0, 1)
            elif i == 'MCC':
                plt.ylim(-1, 1)
            # Add annotation of the cluster
            for index, row in df_plt.iterrows():
                if row['n_cells_full_dataset'] == 0:
                    print('ERROR: n_cells_full_dataset = 0 for {}.'.format(
                        index))
                plt.annotate(
                    index,  # this is the text
                    (row['n_cells_full_dataset'], row[i]),  # point to label
                    textcoords='offset points',  # how to position the text
                    xytext=(0, 10),  # distance from text to points (x,y)
                    ha='center'  # horiz alignment can be left, right, center
                )
            fig.savefig(out_f, dpi=300, bbox_inches='tight')
            plt.xscale('log', base=10)  # 'basex' was renamed 'base' in matplotlib 3.3
            fig.savefig('{}-cluster_size_{}_log10.png'.format(
                out_file_base, i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
            if verbose:
                print('Completed: save {}.'.format(out_f))

        # Plot history of metrics over epochs
        for dat_i in history.history.keys():
            fig = plt.figure()
            plt.plot(history.history[dat_i])
            plt.ylabel(dat_i)
            plt.xlabel('Epoch')
            fig.savefig('{}-model_iter_{}.png'.format(out_file_base, dat_i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
Exemple #13
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Filter and merge 10x data. Save to AnnData object.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument('--tsv_file',
                        action='store',
                        dest='tsv',
                        required=True,
                        help='cell_filtered_per_experiment tsv file.')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output png file. Will have .png appended.\
            (default: %(default)s)')

    options = parser.parse_args()

    # Get basename of the output file
    out_file_base = options.of
    if out_file_base == '':
        # NOTE: str.rstrip strips a *set* of characters, not a suffix, so
        # remove the '.tsv.gz' extension explicitly instead.
        out_file_base = os.path.basename(options.tsv)
        for ext in ('.gz', '.tsv'):
            if out_file_base.endswith(ext):
                out_file_base = out_file_base[:-len(ext)]

    # Load the data
    df = pd.read_csv(options.tsv, sep='\t')

    # Get the total number of input cells per sample
    df_before_filters = df[df.filter_type.isin(['before_filters'])]
    df_before_filters = df_before_filters.set_index('experiment_id')

    # Check if there is any difference between before and after filters.
    # If not, return early.
    df_after_filters = df[df.filter_type.isin(['after_filters'])]
    filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[
        df_after_filters.experiment_id, 'n_cells_left_in_adata'].values
    if all(filt):
        print("No difference detected before and after filters. No plots.")
        return

    # Set some plotting parameters
    plt_height = 16  # 1.5 * df.experiment_id.nunique()

    # Plot the number of cells before and after all filters across experiments
    df_plt = df[df.filter_type.isin(['before_filters', 'after_filters'])]
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(
            x='experiment_id',
            y='n_cells_left_in_adata',
            # label='n_cells',
            fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    # gplt = gplt + plt9.geom_text(vjust=1.6, color='white', size=3.5)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(title='', y='Number of cells', x='', fill='')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='horizontal',
        legend_title=plt9.element_blank())
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-n_cells_before_after.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the final fraction of cells filtered per experiment
    df_plt = df_after_filters.copy()
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    df_plt.n_cells_left_in_adata = df_before_filters.loc[
        df_plt.experiment_id,
        'n_cells_left_in_adata'].values - df_plt.n_cells_left_in_adata
    # Now calculate the fraction removed
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / \
        df_before_filters.loc[
            df_plt.experiment_id,
            'n_cells_left_in_adata'
        ].values
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(x='experiment_id', y='fraction_cells', fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='', y='Fraction of total cells excluded', x='', fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-fraction_before_after.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the number of cells falling into each filter across experiments.
    # NOTE: cells can fall into multiple filters.
    # Remove the rows that we do not want
    df_plt = df[~df.filter_type.isin(['before_filters', 'after_filters'])]
    # Copy to avoid pandas SettingWithCopyWarning on the assignment below.
    df_plt = df_plt[~df_plt.filter_type.str.contains('after_filter')].copy()
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    df_plt.n_cells_left_in_adata = df_before_filters.loc[
        df_plt.experiment_id,
        'n_cells_left_in_adata'].values - df_plt.n_cells_left_in_adata
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(x='experiment_id',
                 y='n_cells_left_in_adata',
                 fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='', y='Number of cells excluded', x='', fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-n_cells_excluded.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the ratio of the total number of cells removed in each filter across
    # experiments.
    # NOTE: cells can fall into multiple filters.
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / \
        df_before_filters.loc[
            df_plt.experiment_id,
            'n_cells_left_in_adata'
        ].values
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(x='experiment_id', y='fraction_cells', fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='', y='Fraction of total cells excluded', x='', fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-fraction_cells_excluded.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)
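
# NOTE: comma_labels, used in the scale calls above, is imported elsewhere in
# the original script. A minimal sketch of a compatible labeller (hypothetical,
# for illustration only) could be:
# def comma_labels(breaks):
#     """Format numeric axis breaks with thousands separators, e.g. 10,000."""
#     return ['{:,.0f}'.format(b) for b in breaks]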
gg_rep_act.save(os.path.join(dir_output, 'gg_rep_act.png'), width=8, height=4)

di_notes = {
    'chi2': 'χ2-correction',
    'insig': 'Erroneous',
    'specification': 'Specification',
    'non-replicable': 'Inconsistent'
}
# (ii) Breakdown of counts
tmp = acc_tt.merge(
    res_fisher.tt.value_counts().reset_index().rename(columns={
        'index': 'tt',
        'tt': 'n_lit'
    }))
tmp = tmp.assign(tt=lambda x: x.tt.map(di_tt),
                 notes=lambda x: x.notes.map(di_notes),
                 share=lambda x: x.n / x.n_lit)

gg_acc_notes = (
    pn.ggplot(tmp, pn.aes(x='notes', y='share', fill='tt')) + pn.theme_bw() +
    pn.scale_y_continuous(labels=percent_format(), limits=[0, 0.1]) +
    pn.scale_fill_discrete(name='Literature') +
    pn.geom_col(color='black', position=pn.position_dodge(0.5), width=0.5) +
    pn.labs(y='Percent', x='Investigation') +
    pn.theme(axis_text_x=pn.element_text(angle=45),
             axis_title_x=pn.element_blank()))
gg_acc_notes.save(os.path.join(dir_output, 'gg_acc_notes.png'),
                  width=7,
                  height=3)

print('~~~ End of 4_results_insig.py ~~~')
Exemple #15
df_3.to_csv('/home/treelab/Documents/CUDAGP/script_GP1/graphs/mean_%s_%s.csv' %
            (df_new_3['popsize'][0], df_new_3['indsize'][0]))
try:
    df_4 = df_new_4.groupby(['nrow', 'nvar'])['timewr'].mean()
    df_4.to_csv(
        '/home/treelab/Documents/CUDAGP/script_GP1/graphs/mean_%s_%s.csv' %
        (df_new_4['popsize'][0], df_new_4['indsize'][0]))
except Exception:
    print('error')

for ielem in (df_new_1, df_new_2, df_new_3, df_new_4):
    surveys_plot = (
        p9.ggplot(data=ielem,
                  mapping=p9.aes(x='run', y='timewr', color='factor(nvar)')) +
        p9.geom_point() + p9.facet_grid("~nrow") +
        p9.scale_y_continuous(limits=(0, 500)) +
        p9.scale_x_discrete(breaks=range(0, 35, 5)) +
        p9.theme(text=p9.element_text(size=10, family="serif"),
                 plot_title=p9.element_text(weight='bold', size=14),
                 legend_title=p9.element_text(weight='bold', size=14),
                 legend_text=p9.element_text(weight='bold', size=10),
                 axis_title_y=p9.element_text(weight='bold', size=14),
                 axis_title_x=p9.element_text(weight='bold', size=14)) +
        p9.labs(y='Time (s)',
                x='Number of run',
                title='Population Size [%s]' % ielem['popsize'][0],
                color='Features'))
    #  Change this to the directory where you want to save the files
    surveys_plot.save("./data_%s_%s.pdf" %
                      (ielem['popsize'][0], ielem['indsize'][0]),
                      width=11)
Exemple #16
def htcalc(air_velocity_inside, air_velocity_outside, t_inside, t_outside,
           surface, layers, wall_thickness, thermal_conductivity):
    # We need the convective heat resistance on both sides of the wall
    res_conv_inside = heattransfer.convective_resistance(
        heattransfer.heat_transfer_coef(air_velocity_inside), surface)
    res_conv_outside = heattransfer.convective_resistance(
        heattransfer.heat_transfer_coef(air_velocity_outside), surface)

    # We need the total resistance over all wall layers
    total_layer_resistance = []
    total_layer_resistance.append(res_conv_inside)
    for i in range(layers):
        total_layer_resistance.append(
            heattransfer.conductive_resistance(wall_thickness[i],
                                               thermal_conductivity[i],
                                               surface))

    total_layer_resistance.append(res_conv_outside)

    total_resistance = sum(total_layer_resistance)

    heat_transfer = heattransfer.conduction(t_inside, t_outside,
                                            total_resistance)

    # Calculating the temperatures between each layer
    temperatures = []
    temperatures.append(t_inside)
    layer_resistance = 0
    for resistance in total_layer_resistance:
        layer_resistance += resistance
        temperatures.append(
            heattransfer.layer_temperature(heat_transfer, layer_resistance,
                                           t_inside))

    # Preparing the x axis, position of the temperature and transition labels for the graph
    position = [0, 0.02]
    labels = ['fluid inside', 'inner surface']

    i = 0
    for entry in wall_thickness:
        position.append(position[-1] + entry)
        i += 1
        labels.append("layer" + str(i))

    labels[-1] = "outer surface"
    position.append(position[-1] + 0.02)
    labels.append("fluid outside")

    # print(f"\nThe total resistance is {round(total_resistance, 2)} K/W")
    # print(f"Total heat transfer from inside to outside is {round(heat_transfer, 2)} W\n")

    df = pd.DataFrame({'pos': position, 'temp': temperatures})

    gg = p9.ggplot(df, p9.aes(x='pos', y='temp'))
    gg += p9.geom_line(p9.aes(color='temp'), size=2)

    for ws in df.pos.values.tolist():
        gg += p9.geom_vline(xintercept=ws, color='grey')

    # gg += p9.geom_hline(yintercept=110, color='red', size=2, alpha=0.8)
    gg += p9.ggtitle('heat transfer through wall')
    gg += p9.scale_x_continuous(name='Position',
                                breaks=df.pos.values.tolist(),
                                labels=labels)
    gg += p9.scale_y_continuous(name='Temperature')
    gg += p9.theme(axis_text_x=p9.element_text(angle=45))
    gg += p9.scale_colour_gradient(low="yellow", high="orange")

    i = 0
    for temp in temperatures:
        gg += p9.geom_text(
            p9.aes(x=position[i], y=temp + 30, label=round(temp, 2)))
        i += 1

    for i in range(layers):
        labtext = 'Thermal cond.: ' + str(
            thermal_conductivity[i]) + ' [W/(m·K)]\nLayer thickness: ' + str(
                round(wall_thickness[i], 3)) + ' [m]'
        gg += p9.annotate(geom='text',
                          x=((position[i + 2] - position[i + 1]) / 2) +
                          position[i + 1],
                          y=temperatures[i] + 30,
                          label=labtext,
                          color='blue')

    return gg
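
# A minimal usage sketch for htcalc, assuming the heattransfer module used
# above is importable; every number below is illustrative, not from the source.
if __name__ == '__main__':
    gg_demo = htcalc(air_velocity_inside=0.5,   # m/s, near-still indoor air
                     air_velocity_outside=5.0,  # m/s, wind at the outer wall
                     t_inside=120.0,
                     t_outside=20.0,
                     surface=10.0,              # wall area in m^2
                     layers=2,
                     wall_thickness=[0.02, 0.10],        # m per layer
                     thermal_conductivity=[50.0, 0.04])  # W/(m*K) per layer
    gg_demo.save('wall_heat_transfer.png', width=8, height=5)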
Exemple #17
# Get the descriptive statistics for prices.
print('\nTHE PRICE ATTRIBUTE HAS THE FOLLOWING STATISTICS:')
print(prices.describe())

# Graph price distribution as a boxplot.
prices_df = pd.DataFrame({ 'x' : ['']*len(prices), 'price' : prices })
gg.options.figure_size=(4, 6)
g = (
	gg.ggplot(data=prices_df)
	+ gg.geom_boxplot(mapping=gg.aes(x='x', y='price'))
	+ gg.theme_bw()
	+ gg.ggtitle('Ranges of Prices Paid Across All Figures')
	+ gg.xlab('')
	+ gg.ylab('Price Paid')
	+ gg.scale_y_continuous(labels=dollar_format(digits=0)))
g.draw()
plt.show()

# Group figures by year and get counts per year.
year_gb = fig_data.groupby('year')
volume_per_year = year_gb.aggregate('count')
volume_per_year.drop(0, inplace=True)

# Plot a histogram of count of figures per year.
mpl.rcParams['figure.figsize'] = [8.0, 6.0]
mpl.rcParams['figure.dpi'] = 100 
fig, ax = plt.subplots()									# Create a graph
plt.bar(x=volume_per_year.index, height=volume_per_year["figure_id"])
ax.set_title('Year of Production of Figures in Collection')	# Title the chart
ax.set_xlabel('Year of Release')							# Title the x-axis
Exemple #18
def hist_plot(df,
              x,
              y=None,
              group=None,
              facet_x=None,
              facet_y=None,
              w='1',
              bins=21,
              bin_width=None,
              position='stack',
              normalize=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):

    '''
    Plot a 1-d or 2-d histogram

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis. If this is specified the histogram will be 2-d.
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    w : str
      quoted expression representing histogram weights (default is 1)
    bins : int or tuple
      number of bins to be used
    bin_width : float or tuple
      bin width to be used
    position : str
      if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
      normalize histogram counts
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_width should be defined")
        raise ValueError("Either bins or bin_width should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one of bins or bin_width should be defined")
        raise ValueError("Only one of bins or bin_width should be defined")

    if (y is not None) and (group is not None):
        log.error("y and group cannot be requested at the same time")
        raise ValueError("y and group cannot be requested at the same time")

    if y is None:
        bins = (bins, bins)
        bin_width = (bin_width, bin_width)
    else:
        if type(bins) not in [tuple, list]:
            bins = (bins, bins)
        if type(bin_width) not in [tuple, list]:
            bin_width = (bin_width, bin_width)

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'y', 'group', 'facet_x', 'facet_y'], [x, y, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c:c for c in tmp_df.columns if c in ['x', 'y', 'group', 'facet_x', 'facet_y']}
    non_xy_groups = [g for g  in new_groups.keys() if g not in ['x', 'y']]
    new_variables = {'w':'w'}

    # bin data (if necessary)
    if tmp_df['x'].dtypes != np.dtype('O'):
        tmp_df['x'], bins_x, bin_width_x = bin_data(tmp_df['x'], bins[0], bin_width[0])
    else:
        bin_width_x = 1
    if y is not None:
        if tmp_df['y'].dtypes != np.dtype('O'):
            tmp_df['y'], bins_y, bin_width_y = bin_data(tmp_df['y'], bins[1], bin_width[1])
        else:
            bin_width_y = 1
    else:
        bin_width_y = 1

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, 'sum', fill_groups=True)
    gdata.fillna(0, inplace=True)
    gdata = gdata[[c for c in ['x', 'y', 'w', 'group', 'facet_x', 'facet_y'] if c in gdata.columns]]

    # normalize
    if normalize:
        if len(non_xy_groups)==0:
            gdata['w'] = gdata['w']/(gdata['w'].sum()*bin_width_x*bin_width_y)
        else:
            gdata['w'] = gdata.groupby(non_xy_groups)['w'].apply(lambda x: x/(x.sum()*bin_width_x*bin_width_y))

    # start plotting
    g = EZPlot(gdata)
    # determine order and create a categorical type
    if (group is not None) and sort_groups:
        if g.column_is_categorical('x'):
            g.sort_group('x', 'w', ascending=False)
        g.sort_group('group', 'w')
        g.sort_group('facet_x', 'w', ascending=False)
        g.sort_group('facet_y', 'w', ascending=False)
        if groups:
            colors = np.flip(ez_colors(g.n_groups('group')))
    elif (group is not None):
        colors = ez_colors(g.n_groups('group'))

    if y is None:
        # set groups
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             stat = 'identity',
                             colour = None,
                             fill = ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             colour=None,
                             stat = 'identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=colors)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab('Counts')

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text=names['group'], size=base_size))

        if sort_groups:
            g += p9.guides(fill=p9.guide_legend(reverse=True))

    else:
        g += p9.geom_tile(p9.aes(x="x", y="y", fill='w'),
                          stat = 'identity',
                          colour = None)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        if g.column_is_categorical('y'):
            g += p9.scale_y_discrete()
        else:
            g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab(names['y'])

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text='Counts', size=base_size))

    return g
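
# A minimal usage sketch for hist_plot, assuming this module's own helpers
# (EZPlot, agg_data, bin_data, theme_ez, ez_colors) are importable; the toy
# dataframe below is purely illustrative.
# df_demo = pd.DataFrame({
#     'value': np.random.randn(1000),
#     'cohort': np.random.choice(['a', 'b'], size=1000)})
# g = hist_plot(df_demo, x='value', group='cohort', bins=30)   # stacked 1-d
# g2 = hist_plot(df_demo, x='value', y='value + 1', bins=(20, 20))  # 2-d tiles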
Exemple #19
def estimate_cutoffs_plot(output_file,
                          df_plt,
                          df_estimate_ncells,
                          df_fit=None,
                          scale_x_log10=False,
                          save_plot=True,
                          add_text=False):
    """Plot UMI counts by sorted cell barcodes."""
    if min(df_plt['umi_counts']) <= 0:
        fix_log_scale = min(df_plt['umi_counts']) + 1
        df_plt['umi_counts'] = df_plt['umi_counts'] + fix_log_scale
    if add_text:
        df_estimate_ncells['add_text_y'] = np.random.randint(
            low=df_plt['umi_counts'].min() - 25,
            high=df_plt['umi_counts'].max() - 25,
            size=df_estimate_ncells.shape[0])
    gplt = plt9.ggplot()
    gplt = gplt + plt9.theme_bw()
    if len(df_plt) <= 50000:
        gplt = gplt + plt9.geom_point(mapping=plt9.aes(x='barcode',
                                                       y='umi_counts'),
                                      data=df_plt,
                                      alpha=0.05,
                                      size=0.1)
    else:
        gplt = gplt + plt9.geom_line(mapping=plt9.aes(x='barcode',
                                                      y='umi_counts'),
                                     data=df_plt,
                                     alpha=0.25,
                                     size=0.75,
                                     color='grey')
    gplt = gplt + plt9.geom_vline(mapping=plt9.aes(xintercept='n_cells',
                                                   color='method'),
                                  data=df_estimate_ncells,
                                  alpha=0.75,
                                  linetype='dashdot')
    if add_text:
        gplt = gplt + plt9.geom_text(mapping=plt9.aes(
            x='n_cells', y='add_text_y', label='n_cells', color='method'),
                                     data=df_estimate_ncells,
                                     alpha=0.75)
    gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
    if scale_x_log10:
        gplt = gplt + plt9.scale_x_continuous(
            trans='log10', labels=comma_labels, minor_breaks=0)
    else:
        gplt = gplt + plt9.scale_x_continuous(labels=comma_labels,
                                              minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.labs(title='',
                            y='UMI counts',
                            x='Barcode index, sorted by UMI count',
                            color='Cutoff')
    # Add the fit of the droplet utils model
    if df_fit is not None:  # truth-testing a DataFrame raises ValueError
        gplt = gplt + plt9.geom_line(mapping=plt9.aes(x='x', y='y'),
                                     data=df_fit,
                                     alpha=1,
                                     color='yellow')
    if save_plot:
        gplt.save('{}.png'.format(output_file), dpi=300, width=5, height=4)
    return gplt
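
# Hypothetical usage of estimate_cutoffs_plot (names below are illustrative):
# df_plt holds one row per barcode, sorted by decreasing UMI count, and
# df_estimate_ncells holds one row per cutoff method.
# umi_sorted = np.sort(umi_counts_per_barcode)[::-1]
# df_plt = pd.DataFrame({
#     'barcode': np.arange(1, umi_sorted.size + 1),
#     'umi_counts': umi_sorted})
# df_estimate_ncells = pd.DataFrame({
#     'method': ['knee', 'expected'],
#     'n_cells': [2500, 5000]})
# estimate_cutoffs_plot('umi_cutoffs', df_plt, df_estimate_ncells,
#                       scale_x_log10=True)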
def plot_it(molecules, t):
    counts = [0] * (x_hi - x_lo + 1)
    for mol_loc in molecules:
        counts[mol_loc - x_lo] += 1  # offset by x_lo so index 0 maps to x_lo
    data = pd.DataFrame({"x": range(x_lo, x_hi + 1), "t": t, "count": counts})
    return p9.geom_line(data=data, size=1)


if __name__ == "__main__":
    molecule_locations = [100] * num_molecules

    my_plot = p9.ggplot(p9.aes(x="x", y="count", color="t"))
    start = time.perf_counter()
    intervals = partition(num_molecules, NUM_PROCESSES)
    with multiprocessing.Pool(NUM_PROCESSES) as pool:
        for t in range(0, 2001, PLOT_EVERY):
            my_plot += plot_it(molecule_locations, f"{t} ms")
            mol_parts = [molecule_locations[i[0]:i[1]] for i in intervals]
            molecule_locations = list(
                itertools.chain.from_iterable(
                    pool.map(advance,
                             [(mols, PLOT_EVERY) for mols in mol_parts])))
    if t % PLOT_EVERY == 0:
        my_plot += plot_it(molecule_locations, f"{t} ms")
    stop = time.perf_counter()

    print(f"simulation time: {stop - start}")
    my_plot += p9.scale_y_continuous(limits=(0, 3_000))
    my_plot.draw()
    plt.show()
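
# partition and advance are defined elsewhere in the original script; a
# plausible sketch of partition, which splits n items into k near-equal
# contiguous (start, stop) index intervals (one per worker process), is:
def partition(n, k):
    """Split range(n) into k near-equal (start, stop) intervals."""
    step, rem = divmod(n, k)
    intervals, start = [], 0
    for i in range(k):
        stop = start + step + (1 if i < rem else 0)
        intervals.append((start, stop))
        start = stop
    return intervals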
Exemple #21
def syntactic_diversity_plots():
    with open('data/external/syntactic_diversity_table.json') as f:
        rows = json.load(f)
    parse_df = pd.DataFrame(rows)
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y'
    )

    def label_facet(name):
        if name == 'parse_ratio':
            return 'Average Unique Parses per Instance'
        elif name == 'unique_parses':
            return 'Count of Unique Parses'

    def label_y(ys):
        formatted_ys = []
        for y in ys:
            y = str(y)
            if y.endswith('000.0'):
                formatted_ys.append(y[:-5] + 'K')
            else:
                formatted_ys.append(y)
        return formatted_ys
    p = (
    ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth') + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'syn_div_plot.pdf'))
    p = (
    ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'n_unique_parses.pdf'))
    p = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(breaks=list(range(1, 11)), minor_breaks=list(range(1, 11)), limits=[1, 10])
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    p.save(path.join(output_path, 'parse_ratio.pdf'))
Exemple #22
    def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
        if self.y_max is not None:
            limits = [0, float(self.y_max)]
            eprint(f'Setting limits to: {limits}')
        else:
            limits = [0, 1]
        if expo:
            if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
                with open('data/external/all_human_gameplay.json') as f:
                    all_gameplay = json.load(f)
                    frames = []
                    for event, name in [('parents', 'Intermediate'), ('maryland', 'Expert'), ('live', 'National')]:
                        if self.merge_humans:
                            name = 'Human'
                        gameplay = all_gameplay[event]
                        if event != 'live':
                            control_correct_positions = gameplay['control_correct_positions']
                            control_wrong_positions = gameplay['control_wrong_positions']
                            control_positions = control_correct_positions + control_wrong_positions
                            control_positions = np.array(control_positions)
                            control_result = np.array(len(control_correct_positions) * [1] + len(control_wrong_positions) * [0])
                            argsort_control = np.argsort(control_positions)
                            control_x = control_positions[argsort_control]
                            control_sorted_result = control_result[argsort_control]
                            control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                            control_df = pd.DataFrame({'correct': control_y, 'char_percent': control_x})
                            control_df['Dataset'] = 'Regular Test'
                            control_df['Guessing_Model'] = f' {name}'
                            frames.append(control_df)

                        adv_correct_positions = gameplay['adv_correct_positions']
                        adv_wrong_positions = gameplay['adv_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                        adv_df['Dataset'] = 'IR Adversarial'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                        if len(gameplay['advneural_correct_positions']) > 0:
                            adv_correct_positions = gameplay['advneural_correct_positions']
                            adv_wrong_positions = gameplay['advneural_wrong_positions']
                            adv_positions = adv_correct_positions + adv_wrong_positions
                            adv_positions = np.array(adv_positions)
                            adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                            argsort_adv = np.argsort(adv_positions)
                            adv_x = adv_positions[argsort_adv]
                            adv_sorted_result = adv_result[argsort_adv]
                            adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                            adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                            adv_df['Dataset'] = 'RNN Adversarial'
                            adv_df['Guessing_Model'] = f' {name}'
                            frames.append(adv_df)

                    human_df = pd.concat(frames)
                    human_vals = sort_humans(list(human_df['Guessing_Model'].unique()))
                    human_dtype = CategoricalDtype(human_vals, ordered=True)
                    human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype)
                    dataset_dtype = CategoricalDtype(['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True)
                    human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype)

            if no_models:
                p = ggplot(human_df) + geom_point(shape='.')
            else:
                df = self.char_plot_df
                if 1 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
                if 2 not in self.rounds:
                    df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
                    df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
                p = ggplot(df)
                if self.save_df is not None:
                    eprint(f'Saving df to: {self.save_df}')
                    df.to_json(self.save_df)

                if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
                    eprint('Loading human data')
                    p = p + geom_line(data=human_df)

            if columns:
                facet_conf = facet_wrap('Guessing_Model', ncol=1)
            else:
                facet_conf = facet_wrap('Guessing_Model', nrow=1)

            if not no_models:
                if self.mvg_avg_char:
                    chart = stat_smooth(method='mavg', se=False, method_args={'window': 400})
                else:
                    chart = stat_summary_bin(fun_data=mean_no_se, bins=20, shape='.', linetype='None', size=0.5)
            else:
                chart = None

            p = (
                p + facet_conf
                + aes(x='char_percent', y='correct', color='Dataset')
            )
            if chart is not None:
                p += chart
            p = (
                p
                + scale_y_continuous(breaks=np.linspace(0, 1, 6))
                + scale_x_continuous(breaks=[0, .5, 1])
                + coord_cartesian(ylim=limits)
                + xlab('Percent of Question Revealed')
                + ylab('Accuracy')
                + theme(
                    #legend_position='top', legend_box_margin=0, legend_title=element_blank(),
                    strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5})
                )
                + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'], name='Questions')
            )
            if self.title != '':
                p += ggtitle(self.title)

            return p
        else:
            if self.save_df is not None:
                eprint(f'Saving df to: {self.save_df}')
                # NOTE: `df` is undefined in this branch; use char_plot_df.
                self.char_plot_df.to_json(self.save_df)
            return (
                ggplot(self.char_plot_df)
                + aes(x='char_percent', y='correct', color='Guessing_Model')
                + stat_smooth(method='mavg', se=False, method_args={'window': 500})
                + scale_y_continuous(breaks=np.linspace(0, 1, 6))
                + coord_cartesian(ylim=limits)
            )
to_plot = 150
margin = 0.5
x_loc = 3
rcn = 0.1
optlin = 0.4
boundary_factor = b_from_error(optlin)

train, _, _ = prep_dataset(to_plot, margin = margin, x_loc = x_loc, rcn = rcn, boundary_factor = boundary_factor)
(ggplot(train,
       aes(x = 'x', y = 'y', color = 'group', shape = 'group')) 
        + geom_point(size = 4, fill = 'none') 
        + scale_shape_manual(values = ('o', 'P')) 
        + geom_vline(xintercept = [-1*boundary_factor, 0, boundary_factor], linetype = 'dotted')
        + scale_x_continuous(breaks = np.arange(-6, 7, 1), name = '') 
        + scale_y_continuous(name = '') 
        + theme(legend_position='none')
)


# Create a function which runs the experiment for a given learner and hyperparameter configuration. 

# In[9]:


def experiment(learner, n_train, optlin, rcn, 
               lr = 1e-2, train_batch_size = 1, epochs = 1, data_seed = 123,
               filestring = 'na', sgd_shuffle_seed = 123):
    boundary_factor = b_from_error(optlin)
    
    # Note that training batch size is important as it influences learned model.
Exemple #24
def scatter_plot(df,
                 x,
                 y,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 base_size=10,
                 figure_size=(6, 3),
                 **kwargs):
    '''
    Aggregates data in df and plots as a scatter plot chart.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size
    **kwargs:
      additional kwargs passed to geom_point

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = (dataframe.index.name
                      if dataframe.index.name is not None else '')

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=True)
    gdata = gdata[[
        c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    # add group_x column
    if group is not None:
        gdata['group_x'] = gdata['group'].astype(
            'str') + '_' + gdata['x'].astype(str)

    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_point(p9.aes(x="x", y="y"),
                           colour=ez_colors(1)[0],
                           **kwargs)
    else:
        g += p9.geom_point(
            p9.aes(x="x", y="y", group="factor(group)", color="factor(group)"),
            **kwargs)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    if g.column_is_timestamp('y'):
        g += p9.scale_y_datetime()
    elif g.column_is_categorical('y'):
        g += p9.scale_y_discrete()
    else:
        g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
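
# A minimal usage sketch for scatter_plot (toy data, illustrative only;
# extra kwargs such as alpha are forwarded to geom_point):
# df_xy = pd.DataFrame({
#     'height': np.random.normal(170, 10, 200),
#     'weight': np.random.normal(70, 12, 200),
#     'sex': np.random.choice(['f', 'm'], size=200)})
# g = scatter_plot(df_xy, x='height', y='weight', group='sex', alpha=0.5)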
        lambda x: x.aupr_mean -
        (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len)
    }))
dev_set_stats_df

# In[9]:

(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="auroc_mean", color="model")) +
 p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar(
     p9.aes(ymin="auroc_lower", ymax="auroc_upper", group="model")) +
 p9.theme_seaborn() + p9.labs(title="DaG Tune Set AUROC", color="Model") +
 p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
 }) + p9.scale_y_continuous(limits=[0.4, 0.75]))

# In[10]:

(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="aupr_mean", color="model")) +
 p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar(
     p9.aes(ymin="aupr_lower", ymax="aupr_upper", group="model")) +
 p9.theme_seaborn() + p9.labs(title="DaG Tune Set AUPR", color="Model") +
 p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
 }) + p9.scale_y_continuous(limits=[0.4, 0.75]))

# In[11]:
Exemple #26
def area_plot(df,
              x,
              y,
              group=None,
              facet_x=None,
              facet_y=None,
              aggfun='sum',
              fill=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Aggregates data in df and plots as a stacked area chart.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (eg sum, mean, median ...)
    fill : bool
      plot shares for each group instead of absolute values
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = (dataframe.index.name
                      if dataframe.index.name is not None else '')

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True)
    gdata['y'].fillna(0, inplace=True)
    gdata = gdata[[
        c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    if fill:
        groups_to_normalize = [
            c for c in ['x', 'facet_x', 'facet_y'] if c in gdata.columns
        ]
        total_values = gdata \
            .groupby(groups_to_normalize)['y'] \
            .sum() \
            .reset_index() \
            .rename(columns = {'y':'tot_y'})
        gdata = pd.merge(gdata, total_values, on=groups_to_normalize)
        gdata['y'] = gdata['y'] / (gdata['tot_y'] + EPSILON)
        gdata.drop('tot_y', axis=1, inplace=True)
        ylabeller = percent_labels
    else:
        ylabeller = ez_labels

    # get plot object
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_area(p9.aes(x="x", y="y"),
                          colour=None,
                          fill=ez_colors(1)[0],
                          na_rm=True)
    else:
        g += p9.geom_area(p9.aes(x="x",
                                 y="y",
                                 group="factor(group)",
                                 fill="factor(group)"),
                          colour=None,
                          na_rm=True)
        g += p9.scale_fill_manual(values=colors)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ylabeller,
                               expand=[0, 0, 0.1 * (not fill) + 0.03, 0])

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True),
                       color=p9.guide_legend(reverse=True))

    return g
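
# A minimal usage sketch for area_plot (toy time series, illustrative only):
# df_ts = pd.DataFrame({
#     'day': np.tile(pd.date_range('2020-01-01', periods=30), 2),
#     'sales': np.random.poisson(100, 60),
#     'region': np.repeat(['north', 'south'], 30)})
# g = area_plot(df_ts, x='day', y='sales', group='region', aggfun='sum')
# g_share = area_plot(df_ts, x='day', y='sales', group='region', fill=True)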
def plot_difference(adata, plot_name='cellbender_results'):
    # Get the differences in counts per cell
    X_raw_minus_cb = adata.layers['counts_raw'] - adata.layers[
        'counts_cellbender']
    X_dif = abs(X_raw_minus_cb)

    # Get the top most different genes
    df_diff_genes = pd.DataFrame(data=adata.var.gene_symbols.values)
    df_diff_genes['ensembl_id'] = adata.var.index
    df_diff_genes['gene_symbols'] = adata.var.gene_symbols.values
    df_diff_genes['dif_across_cells'] = np.asarray(
        X_dif.sum(axis=0)).reshape(-1)
    df_diff_genes = df_diff_genes.sort_values('dif_across_cells',
                                              ascending=False)

    # Select the top 100 genes and plot the difference in counts across
    # cells where x axis = gene, y axis = difference, and point = cell.
    top_n_genes = 100
    df_plt = _make_data_plot_difference(
        adata, X_raw_minus_cb,
        df_diff_genes['gene_symbols'].head(n=top_n_genes))
    # print(df_plt.head())
    gplt = plt9.ggplot(df_plt)
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_boxplot(plt9.aes(x='gene_symbols', y='value'),
                                    alpha=0.25
                                    #outlier_shape=''
                                    )
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90))
    gplt = gplt + plt9.labs(
        x='',
        y='Raw counts - cellbender adjusted',
        title='Top {} most different genes'.format(top_n_genes))
    gplt.save(
        '{}-count_difference-boxplot.png'.format(plot_name),
        #dpi=300,
        width=14,
        height=4)
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=0))
    gplt = gplt + plt9.coord_flip()
    gplt.save(
        '{}-count_difference-boxplot_vertical.png'.format(plot_name),
        #dpi=300,
        width=4,
        height=14)

    # Same plot but the abs difference on log scale
    df_plt = _make_data_plot_difference(
        adata, X_dif, df_diff_genes['gene_symbols'].head(n=top_n_genes))
    df_plt['value'] += 1
    # print(df_plt.head())
    gplt = plt9.ggplot(df_plt)
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_boxplot(plt9.aes(x='gene_symbols', y='value'),
                                    alpha=0.25
                                    # outlier_shape=''
                                    )
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90))
    gplt = gplt + plt9.labs(
        x='',
        y='Abs(raw counts - cellbender adjusted)',
        title='Top {} most different genes'.format(top_n_genes))
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt.save(
        '{}-abs_count_difference-boxplot.png'.format(plot_name),
        #dpi=300,
        width=14,
        height=4)
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=0))
    gplt = gplt + plt9.coord_flip()
    gplt.save(
        '{}-abs_count_difference-boxplot_vertical.png'.format(plot_name),
        #dpi=300,
        width=4,
        height=14)
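
# Hypothetical usage of plot_difference, assuming an AnnData object that
# carries raw and cellbender-adjusted counts as layers (layer names as above):
# import anndata as ad
# adata = ad.AnnData(X=counts_cellbender,
#                    layers={'counts_raw': counts_raw,
#                            'counts_cellbender': counts_cellbender})
# adata.var['gene_symbols'] = gene_symbols
# plot_difference(adata, plot_name='cellbender_results')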
Exemple #28
def plot_line(data,nuclstr,columns=['value'],ymin=None,ymax=None,dpi=300,features=None,feature_types=['all'],add_features=[],funcgroups=None,shading_modes=['charge_functional'],right_overhang_fix=None,debug=False,startnumber=1,cropseq=(0,None),aspect_ratio=None,reverse_seq=False,transparent=True,xshift=0):
    """
    A wrapper function to make a plot of data with lines along the sequence.
    Input should be a dataframe with 'resid', 'segid' and 'value' columns.
    This one is inspired by seqplot/seqplot/pdb_plot.py
    funcgroup example fg="\\funcgroup{xxx}{CT}{White}{Green}{upper}{up} \\funcgroup{xxx}{GA}{White}{Blue}{upper}{up}"
    """
    if isinstance(columns,str):
        columns=[columns]
    segid=data['segid'].values[0]
    
    title="Segid: %s, Type: %s"%(segid,nuclstr.components[segid]['type'])

    # NOTE: the original test `is 'DNA' or 'histone' or 'protein'` was always
    # truthy; a membership test preserves the written intent.
    seq=Seq(str(nuclstr.seqs[segid]['fullseq']),generic_protein \
                if nuclstr.components[segid]['entity'] in ('DNA','histone','protein') else generic_dna)
    msar=MultipleSeqAlignment([SeqRecord(seq=seq,id=nuclstr.components[segid]['type']+':'+segid,\
                                         name=nuclstr.components[segid]['type']+':'+segid)])
    if(reverse_seq):
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq=msar[0].seq[::-1]
        
    msar=msar[:,cropseq[0]:cropseq[1]]

    
#     print("Seq to plot:",msar)
             
    #We need to get starting residue, currently for DNA chains only cifseq gets it correctly
    resid_start=nuclstr.seqs[segid]['resid_start']
    
    logger.debug("Starting resid %d"%int(resid_start))
    

    overhang=nuclstr.seqs[segid]['overhangL']
    
    datafixed=data.copy()
    datafixed.loc[:,'resid']=datafixed.loc[:,'resid']-resid_start+overhang+1-cropseq[0]+xshift

#     print(datafixed)
    sl=len(msar[0].seq)

#     fn=shade.seqfeat2shadefeat(msar,feature_types=feature_types,force_feature_pos='bottom',debug=debug)
    if features is None:
        fn=nuclstr.shading_features[segid]
    else:
        fn=features
    fn2=[]
    for i in fn:
        if (i['style'] in feature_types) or ('all' in feature_types) :
            fn2.append(i)
            
    fn2.extend(add_features)
    shaded=ipyshade.shadedmsa4plot(msar,features=fn2,shading_modes=shading_modes,debug=debug,startnumber=startnumber,setends=[startnumber-2,sl+startnumber+2],funcgroups=funcgroups,density=200)
        
    #If sl%10==0 we will have a ruler number hanging beyond the sequence image, and we need to correct for that.
    if right_overhang_fix is None:
        if sl%10==0:
            if sl<100:
                rof= 0.1
            else:
                rof=0.5
        else:
            rof=0
    else:
        rof=right_overhang_fix
    if aspect_ratio is not None:
        ar=aspect_ratio
    else:
        ar=0.15*100./sl
        
    md=pd.melt(datafixed,id_vars=['segid','resid'],value_vars=columns)
#     print(md)
#     print(md['variable'])
    plot=(ggplot(data=md,mapping=aes(x='resid', y='value'))
        + geom_point(aes(color='variable'),size=0.1)+geom_line(aes(color='variable'),stat='identity')
        + scale_x_continuous(limits=(0.5,sl+0.5+rof),expand=(0,0.2),name='',breaks=[])
#         + scale_y_continuous()
        + theme_light()+theme(aspect_ratio=ar,dpi=dpi,plot_margin=0)) #+ facet_wrap('~ segid',dir='v')

    if ymax is not None:
        plot=plot+scale_y_continuous(limits=(None,ymax))
    
    if ymin is None:
        ymin=md['value'].min()
    if ymax is None:
        ymax=md['value'].max()
    plot = plot + geom_seq_x(seqimg=shaded.img,\
                   xlim=(1,sl+rof),ylim=(ymin,ymax),aspect_ratio=ar,transparent=transparent)+ggtitle(title)
    

    
    return plot
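# A hedged usage sketch for plot_line; `nucl` stands in for a pynucl-like
# structure object and the dataframe layout follows what the function expects.
# Both names are hypothetical:
# df = pd.DataFrame({'segid': 'CHA', 'resid': list(range(1, 101)),
#                    'value': np.random.rand(100)})
# p = plot_line(df, nucl, columns=['value'], dpi=150)
# p.save('line_profile.png')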
Exemple #29
    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
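# `ez_labels` (used above and in the snippets below) is assumed to be a number
# formatter from the surrounding ezplot module; a rough sketch of one plausible
# implementation, abbreviating large values:
def ez_labels(breaks):
    """Format breaks like 1500 -> '1.5k', 2000000 -> '2m' (hypothetical)."""
    out = []
    for b in breaks:
        for div, suffix in ((1e9, 'b'), (1e6, 'm'), (1e3, 'k')):
            if abs(b) >= div:
                out.append('{:g}{}'.format(b / div, suffix))
                break
        else:
            out.append('{:g}'.format(b))
    return out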
    def batch_plots(self):

        # First, put together active leak data and output for live plotting functionality
        # (no AL plot here currently)
        dfs = self.active_leak_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean',
                                                    'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output active leaks df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_active_leaks.csv', index=True)

        # Now repeat for emissions (which will actually be used for batch plotting)
        dfs = self.emission_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean',
                                                    'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True)

        # Make plots from list of dataframes - one entry per dataframe
        pn.theme_set(pn.theme_linedraw())
        plot1 = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program') +
                 pn.geom_ribbon(df_p1, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2) +
                 pn.geom_line(df_p1, pn.aes('datetime', 'mean', colour='program'), size=1) +
                 pn.ylab('Daily emissions (kg/site)') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 pn.scale_y_continuous(trans='log10') +
                 pn.ggtitle('To reduce uncertainty, use more simulations.') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot1.save(self.output_directory + 'program_comparison.png', width=7, height=3, dpi=900)

        # Build relative mitigation plots
        dfs_p2 = dfs.copy()

        for i in dfs_p2[1:]:
            i['mean_dif'] = 0
            i['std_dif'] = 0
            i['mean_ratio'] = 0
            i['std_ratio'] = 0
            for j in range(len(i)):
                ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean']
                ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std']
                alt_mean = i.loc[i.index[j], 'mean']
                alt_std = i.loc[i.index[j], 'std']

                # Difference: stds of independent estimates add in quadrature.
                i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean
                i.loc[i.index[j], 'std_dif'] = math.sqrt(
                    math.pow(alt_std, 2) + math.pow(ref_std, 2))
                # Ratio: the relative stds add in quadrature, so 'std_ratio'
                # stores a relative (fractional) error.
                i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean
                i.loc[i.index[j], 'std_ratio'] = math.sqrt(
                    math.pow((alt_std / alt_mean), 2) + math.pow((ref_std / ref_mean), 2))

        # Build plotting dataframe
        df_p2 = self.dates_trunc.copy().to_frame()
        df_p2['program'] = dfs_p2[1]['program']
        df_p2['mean_dif'] = dfs_p2[1]['mean_dif']
        df_p2['std_dif'] = dfs_p2[1]['std_dif']
        df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio']
        df_p2['std_ratio'] = dfs_p2[1]['std_ratio']

        df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif']
        df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif']
        df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (dfs_p2[1]
                                                        ['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'])
        df_p2['high_ratio'] = dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio']

        pd.options.mode.chained_assignment = None
        for i in dfs_p2[2:]:
            i['low_dif'] = i['mean_dif'] - 2 * i['std_dif']
            i['high_dif'] = i['mean_dif'] + 2 * i['std_dif']
            i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] + 2 * i['std_ratio'])
            i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio']
            short_df = i[['program', 'mean_dif', 'std_dif', 'low_dif',
                          'high_dif', 'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio']]
            short_df['datetime'] = np.array(self.dates_trunc)
            df_p2 = df_p2.append(short_df, ignore_index=True)

        # Make plot 2
        plot2 = (pn.ggplot(None) + pn.aes('datetime', 'mean_dif', group='program') +
                 pn.geom_ribbon(
                     df_p2, pn.aes(ymin='low_dif', ymax='high_dif', fill='program'), alpha=0.2) +
                 pn.geom_line(df_p2, pn.aes('datetime', 'mean_dif', colour='program'), size=1) +
                 pn.ylab('Daily emissions difference (kg/site)') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 pn.ggtitle('Daily differences may be uncertain for small sample sizes') +
                 #        pn.scale_y_continuous(trans='log10') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot2.save(self.output_directory + 'relative_mitigation.png', width=7, height=3, dpi=900)

        # Make plot 3
        plot3 = (pn.ggplot(None) + pn.aes('datetime', 'mean_ratio', group='program') +
                 pn.geom_ribbon(df_p2, pn.aes(
                     ymin='low_ratio', ymax='high_ratio', fill='program'), alpha=0.2) +
                 pn.geom_hline(yintercept=1, size=0.5, colour='blue') +
                 pn.geom_line(df_p2, pn.aes('datetime', 'mean_ratio', colour='program'), size=1) +
                 pn.ylab('Emissions ratio') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 pn.ggtitle(
                     'Blue line represents equivalence. \nIf uncertainty is high, use more '
                     'simulations and/or sites. \nLook also at the ratio of mean daily emissions '
                     'over entire timeseries.') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot3.save(self.output_directory + 'relative_mitigation2.png', width=7, height=3, dpi=900)

        # ---------------------------------------
        # ------ Figure to compare costs  ------
        dfs = self.cost_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean',
                                                    'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output cost df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'rolling_cost_estimates.csv', index=True)

        # Make plots from list of dataframes - one entry per dataframe
        pn.theme_set(pn.theme_linedraw())
        plot1 = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program') +
                 pn.geom_ribbon(df_p1, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2) +
                 pn.geom_line(df_p1, pn.aes('datetime', 'mean', colour='program'), size=1) +
                 pn.ylab('Estimated cost per facility') + pn.xlab('') +
                 pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) +
                 pn.scale_x_datetime(labels=date_format('%Y')) +
                 # pn.scale_y_continuous(trans='log10') +
                 pn.labs(color='Program', fill='Program') +
                 pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2),
                          panel_grid_minor_x=pn.element_blank(),
                          panel_grid_major_x=pn.element_blank(),
                          panel_grid_minor_y=pn.element_line(
                              colour='black', linewidth=0.5, alpha=0.3),
                          panel_grid_major_y=pn.element_line(
                              colour='black', linewidth=1, alpha=0.5))
                 )
        plot1.save(self.output_directory + 'cost_estimate_temporal.png', width=7, height=3, dpi=900)

        ########################################
        # Cost breakdown by program and method
        method_lists = []
        for i in range(len(self.directories)):
            df = pd.read_csv(
                self.output_directory + self.directories[i] + "/timeseries_output_0.csv")
            df = df.filter(regex='cost$', axis=1)
            df = df.drop(columns=["total_daily_cost"])
            method_lists.append(list(df))

        costs = [[] for i in range(len(self.all_data))]
        for i in range(len(self.all_data)):
            for j in range(len(self.all_data[i])):
                simcosts = []
                for k in range(len(method_lists[i])):
                    timesteps = len(self.all_data[i][j][method_lists[i][k]])
                    simcosts.append(
                        (sum(self.all_data[i][j][method_lists[i][k]])/timesteps/self.n_sites)*365)
                costs[i].append(simcosts)

        rows_list = []
        for i in range(len(costs)):
            df_temp = pd.DataFrame(costs[i])
            for j in range(len(df_temp.columns)):
                row = {
                    'Program': self.directories[i],
                    'Mean Cost': round(df_temp.iloc[:, j].mean()),
                    'St. Dev.': df_temp.iloc[:, j].std(),
                    'Method': method_lists[i][j].replace('_cost', ''),
                }
                rows_list.append(row)
        df = pd.DataFrame(rows_list)

        # Output cost comparison df for other uses
        df.to_csv(self.output_directory + 'cost_comparison.csv', index=True)

        plot = (
            pn.ggplot(
                df, pn.aes(
                    x='Program', y='Mean Cost', fill='Method', label='Mean Cost')) +
            pn.geom_bar(stat="identity") + pn.ylab('Cost per Site per Year') + pn.xlab('Program') +
            pn.scale_fill_hue(h=0.15, l=0.25, s=0.9) +
            pn.geom_text(size=15, position=pn.position_stack(vjust=0.5)) +
            pn.theme(
                panel_border=pn.element_rect(colour="black", fill=None, size=2),
                panel_grid_minor_x=pn.element_blank(),
                panel_grid_major_x=pn.element_blank(),
                panel_grid_minor_y=pn.element_line(
                    colour='black', linewidth=0.5, alpha=0.3),
                panel_grid_major_y=pn.element_line(
                    colour='black', linewidth=1, alpha=0.5)))
        plot.save(self.output_directory + 'cost_comparison.png', width=7, height=3, dpi=900)

        return
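        # A worked check of the quadrature rules used above, with made-up numbers
        # (ref: mean 10, std 3; alt: mean 8, std 4):
        #   std_dif   = sqrt(4**2 + 3**2)             = 5.0    (absolute error)
        #   std_ratio = sqrt((4/8)**2 + (3/10)**2)   ~= 0.583  (relative error)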
Exemple #31
def density_plot(df,
                 x,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 position='overlay',
                 sort_groups=True,
                 base_size=10,
                 figure_size=(6, 3),
                 **stat_kwargs):
    '''
    Plot a 1-d density plot

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    group : str
      quoted expression to be used as group (i.e. color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    position : str
      if groups are present, choose between `stack` and `overlay`
    sort_groups : bool
      if True, reverse the legend order of the grouped fills
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size
    stat_kwargs : kwargs
      kwargs for the density stat

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    if position not in ['overlay', 'stack']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=False)
    gdata = gdata[[
        c for c in ['x', 'group', 'facet_x', 'facet_y'] if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # assign one color per group
    colors = ez_colors(g.n_groups('group'))

    # set groups
    if group is None:
        g += p9.geom_density(p9.aes(x="x"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             colour=ez_colors(1)[0],
                             fill=ez_colors(1)[0],
                             **POSITION_KWARGS[position])
    else:
        g += p9.geom_density(p9.aes(x="x",
                                    group="factor(group)",
                                    colour="factor(group)",
                                    fill="factor(group)"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             **POSITION_KWARGS[position])
        g += p9.scale_fill_manual(values=colors, reverse=False)
        g += p9.scale_color_manual(values=colors, reverse=False)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab('Density')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
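# A hedged usage sketch for density_plot; the dataframe and column names are
# hypothetical, and extra kwargs are forwarded to the density stat:
# g = density_plot(df, x='price', group='category', position='stack', bw=0.2)
# g.save('density.png')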
Exemple #32
def plot_ecdf(df_plot,
              variable_column,
              color_column='none',
              output_file='plot_distribution',
              facet_column='none',
              x_log10=False):
    """Plot plot_distribution to png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with <variable_column> as a column.
    variable_column : string
        String of variable_column column to plot.
    color_column : string
        String of color column to plot.
    output_file : string
        Basename of output file.
    facet_column : string
        Column to facet the plot by.
    x_log10 : boolean
        If True, plot the x axis on a log10 scale.

    Returns
    -------
    NULL
    """
    n_colors = 0
    if color_column != 'none':
        gplt = plt9.ggplot(df_plot,
                           plt9.aes(x=variable_column, color=color_column))
        n_colors = df_plot[color_column].nunique()
    else:
        gplt = plt9.ggplot(df_plot, plt9.aes(x=variable_column))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.stat_ecdf(alpha=0.8)
    if x_log10:
        gplt = gplt + plt9.scale_x_continuous(
            trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
    else:
        gplt = gplt + plt9.scale_x_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        # trans='log10',
        # labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.labs(y='Cumulative density', title='')
    if n_colors != 0 and n_colors > 20:
        gplt = gplt + plt9.theme(legend_position='none')
    elif n_colors != 0 and n_colors < 9:
        gplt = gplt + plt9.scale_colour_brewer(palette='Dark2', type='qual')
    if facet_column != 'none':
        gplt = gplt + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
        n_facets = df_plot[facet_column].nunique()
        gplt.save('{}.png'.format(output_file),
                  dpi=300,
                  width=6 * (n_facets / 4),
                  height=4 * (n_facets / 4),
                  limitsize=False)
    else:
        gplt.save('{}.png'.format(output_file), dpi=300, width=4, height=4)
    return 0
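# A hedged usage sketch for plot_ecdf; the column names are hypothetical:
# plot_ecdf(df, variable_column='total_counts', color_column='sample_id',
#           output_file='ecdf-total_counts', x_log10=True)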
def generate_map(data,
                 region,
                 value_field,
                 iso_field='iso',
                 scale_params=None,
                 plot_na_dots=False,
                 tolerance=None,
                 plot_size=8,
                 out_region_color='#f0f0f0',
                 na_color='#aaaaaa',
                 line_color='#666666',
                 projection=None):
    """
    This function returns a map plot with the specified options.

    :param pandas.DataFrame data: Data to be plotted.
    :param str region: Region to center the map around. Countries outside
        the chosen region will be obscured.
    :param str value_field: Column of *data* with the values to be plotted.
    :param str iso_field: Column of *data* with the ISO3 codes for each
        country.
    :param dict scale_params: Dictionary of parameters to be passed to the
        ggplot corresponding color scale (continuous or discrete).
    :param bool plot_na_dots: Whether to plot the dots for small countries
        if said country doesn't have data available.
    :param int tolerance: Coordinate tolerance for polygon simplification,
        a higher number will result in simpler polygons and faster
        rendering (see DEFAULT_TOLERANCES).
    :param int plot_size: Size of the plot, which determines the relative sizes
        of the elements within.
    :param str out_region_color: Hex color of the countries that are out of the
        specified region.
    :param str na_color: Hex color of the countries with no data available.
    :param str line_color: Color of the country borders.
    :param str projection: Kind of map projection to be used in the map.
        Currently, Oceania (XOX) is only available in EPSG:4326 to enable
        wrapping.
    :returns: a dict with the ggplot-like map under the 'plot' key and its
        width-to-height ratio under the 'ratio' key
    :rtype: dict
    """
    if projection is None:
        if region == 'XOX':
            projection = 'epsg4326'
        else:
            projection = 'robinson'

    if projection not in PROJECTION_DICT.keys():
        raise ValueError('Projection "{}" not valid'.format(projection))

    if scale_params is None:
        scale_params = {}

    if region not in REGION_BOUNDS[projection]:
        raise ValueError(
            '"region" not available. Valid regions are: {}'.format(', '.join(
                REGION_BOUNDS[projection].keys())))

    if tolerance is None:
        tolerance = DEFAULT_TOLERANCES[projection][region]

    countries = GeoDataFrame.from_file(
        os.path.join(os.path.dirname(__file__), 'data/world-countries.shp'))

    # To plot Oceania we need the original EPSG:4326 to wrap around the 180°
    # longitude. In other cases transform to the desired projection.
    if region == 'XOX':
        countries.crs['lon_wrap'] = '180'  # Wrap around longitude 180°

        XOX_countries = countries['continent'] == 'XOX'
        countries[XOX_countries] = countries[XOX_countries].to_crs(
            countries.crs)
        centroids = countries[XOX_countries].apply(
            lambda row: row['geometry'].centroid, axis=1)
        countries.loc[XOX_countries, 'lon'] = [c.x for c in centroids]
        countries.loc[XOX_countries, 'lat'] = [c.y for c in centroids]
    else:
        if projection != 'epsg4326':
            countries = countries.to_crs(PROJECTION_DICT[projection])
            centroids = countries.apply(lambda row: row['geometry'].centroid,
                                        axis=1)
            countries['lon'] = [c.x for c in centroids]
            countries['lat'] = [c.y for c in centroids]

    countries['geometry'] = countries['geometry'].simplify(tolerance)

    upper_left, lower_right = REGION_BOUNDS[projection][region]
    limits_x = [upper_left[0], lower_right[0]]
    limits_y = [lower_right[1], upper_left[1]]
    ratio = (limits_x[1] - limits_x[0]) / (limits_y[1] - limits_y[0])

    plot_data = pd.merge(countries,
                         data,
                         how='left',
                         left_on='iso',
                         right_on=iso_field)
    map_bounds = REGION_BOUNDS['epsg4326'][region]
    map_area = ((map_bounds[1][0] - map_bounds[0][0]) *
                (map_bounds[0][1] - map_bounds[1][1]))
    plot_data['plot_dot'] = (plot_data['pol_area'] < DOT_THRESHOLD * map_area)

    if not plot_na_dots:
        plot_data['plot_dot'] &= ~pd.isnull(plot_data[value_field])

    if region != 'XWX':
        in_region = ((~pd.isnull(plot_data[value_field])) &
                     (plot_data['continent'] == region))
        in_region_missing = ((pd.isnull(plot_data[value_field])) &
                             (plot_data['continent'] == region))
        out_region = plot_data['continent'] != region
    else:
        in_region = ~pd.isnull(plot_data[value_field])
        in_region_missing = pd.isnull(plot_data[value_field])
        out_region = np.repeat(False, len(plot_data))

    if plot_data[value_field].dtype == 'object':
        # Assume discrete values
        fill_scale = scale_fill_brewer(**scale_params, drop=False)
    else:
        # Assume continuous values
        fill_scale = scale_fill_gradient(**scale_params)

    plot_data_values = plot_data[in_region]
    plot_data_missing = plot_data[in_region_missing]
    plot_data_out_region = plot_data[out_region]

    dots_region = plot_data_values[plot_data_values['plot_dot']]
    dots_region_missing = plot_data_missing[plot_data_missing['plot_dot']]
    dots_out_region = plot_data_out_region[plot_data_out_region['plot_dot']]

    plt = (
        ggplot() + geom_map(plot_data_values,
                            aes(fill=value_field),
                            color=line_color,
                            size=0.3) +
        geom_map(
            plot_data_missing, aes(color='plot_dot'), fill=na_color,
            size=0.3) + geom_map(plot_data_out_region,
                                 fill=out_region_color,
                                 color=line_color,
                                 size=0.3) +
        geom_point(dots_region,
                   aes(x='lon', y='lat', fill=value_field),
                   size=3,
                   stroke=.1,
                   color=line_color) + geom_point(dots_region_missing,
                                                  aes(x='lon', y='lat'),
                                                  fill=na_color,
                                                  size=3,
                                                  stroke=.1,
                                                  color=line_color) +
        geom_point(dots_out_region,
                   aes(x='lon', y='lat'),
                   fill=out_region_color,
                   size=3,
                   stroke=.1,
                   color=line_color) +
        scale_x_continuous(breaks=[], limits=limits_x) +
        scale_y_continuous(breaks=[], limits=limits_y) + theme(
            figure_size=(plot_size * ratio, plot_size),
            panel_background=element_rect(fill='white', color='black'),
            #  panel_border=element_rect(fill='white',
            #                            color='black',
            #                            size=.1),
            legend_background=element_rect(
                fill="white", color='black', size=.5),
            legend_box_just='left') + xlab('') + ylab(''))

    if len(plot_data_values.index) > 0:
        plt += fill_scale

    plt += scale_color_manual(name=' ',
                              values=[line_color],
                              breaks=[False],
                              labels=['No data available'])

    if plot_data[value_field].dtype == 'object':
        plt += guides(fill=guide_legend(override_aes={'shape': None}))

    return {
        'plot': plt,
        'ratio': ratio,
    }
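# A hedged usage sketch for generate_map; the input dataframe and value column
# are hypothetical, and 'XOX' (Oceania) is one of the regions named above:
# result = generate_map(df, region='XOX', value_field='value', iso_field='iso')
# result['plot'].save('map.png', width=8 * result['ratio'], height=8)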
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Read AnnData object and list of phenotypes. Plot boxplots of \
            phenotypes across clusters.
            """)

    parser.add_argument('-h5',
                        '--h5_anndata',
                        action='store',
                        dest='h5',
                        required=True,
                        help='H5 AnnData file.')

    parser.add_argument('--pheno_columns',
                        action='store',
                        dest='pheno_columns',
                        default='',
                        help='Comma-separated list of pheno columns to be '
                             'boxplotted by cluster.')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='plot_boxplot_cluster',
        help='Basename of output png file. Will have .png appended.\
            (default: %(default)s)')

    options = parser.parse_args()

    adata = sc.read_h5ad(filename=options.h5)

    pheno_to_plot = options.pheno_columns.split(',')

    plt_height = 4
    plt_width = 16

    # Plot the data.
    for pheno in pheno_to_plot:
        # plt_width = adata.obs['cluster'].nunique() * 0.25

        gplt = plt9.ggplot(adata.obs)
        gplt = gplt + plt9.geom_boxplot(plt9.aes(x='cluster', y=pheno))
        gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90))
        gplt.save('boxplot-{}.png'.format(pheno),
                  dpi=300,
                  width=plt_width,
                  height=plt_height)

        # Add log10 transformation plot
        lab = 'log10'
        if adata.obs[pheno].min() < 0:
            adata.obs[pheno] = adata.obs[pheno] + abs(
                adata.obs[pheno].min()) + 1
            lab = 'plusmin1log10'
        elif adata.obs[pheno].min() == 0:
            adata.obs[pheno] = adata.obs[pheno] + 1
            lab = 'plus1log10'
        gplt = plt9.ggplot(adata.obs)
        gplt = gplt + plt9.geom_boxplot(plt9.aes(x='cluster', y=pheno))
        gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90))
        gplt = gplt + plt9.scale_y_continuous(
            trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
        gplt.save('boxplot_{}-{}.png'.format(lab, pheno),
                  dpi=300,
                  width=plt_width,
                  height=plt_height)
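    # The shift applied before the log10 boxplots keeps the transform defined
    # for all values; a quick worked example with made-up minima:
    #   min == -5  ->  plot pheno + abs(-5) + 1, so the minimum maps to 1 (log10 = 0)
    #   min ==  0  ->  plot pheno + 1,           so the minimum maps to 1 (log10 = 0)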