def plot_frequency(n = 200, filename = "test.png", path = "static/"):
    """
    Draws the histogram of the distribution of n tweets by date.

    The tweets returned by ``gather_tweets(n)`` are loaded into a DataFrame
    and binned on the 'Date' column, with bars filled by 'Author' and x-axis
    breaks placed every week.

    Parameters
    ----------
    n: int
        An integer specifying how many tweets should be analysed.
    filename: str
        Name of the image file to write. Defaults to "test.png".
    path: str
        Directory the image is written into. It is created when missing.
        Defaults to "static/".

    Returns
    -------
    None
        The histogram is saved as an image file; nothing is returned.

    """
    import os

    from plotnine import ggplot, aes, geom_histogram, scale_x_datetime, labs, theme_minimal, ggsave
    from Mod_1_API import gather_tweets
    from mizani.breaks import date_breaks
    import pandas

    df = pandas.DataFrame(gather_tweets(n))

    plot1 = (ggplot(df, aes(x = 'Date', fill = 'Author')) +
           geom_histogram() +
           scale_x_datetime(breaks=date_breaks('1 week')) +
           labs(x = "Time in weeks", y = "Number of tweets by source") +
           theme_minimal()
           )

    # Saving into a directory that does not exist yet would fail, so create it first.
    if path:
        os.makedirs(path, exist_ok=True)
    ggsave(plot = plot1, filename = filename, path = path)
def create_scatterplots(dataframe, unique_id='unique_id'):
    '''
    Save one scatterplot image per column of a dataframe.

    Every column of ``dataframe`` is drawn on the x axis against
    ``unique_id`` on the y axis, using a copy of the frame whose index has
    been reset. Each plot is written to ``<column>scatterplot.png``.

    Inputs:
        dataframe: a pandas dataframe
        unique_id: value mapped to the y aesthetic; presumably the name of
            the identifier column available after the index reset
            (TODO confirm with callers)

    Outputs: None
    '''
    flat_df = dataframe.reset_index()
    for col in dataframe.columns:
        out_name = f'{col!s}scatterplot.png'
        mapping = p9.aes(x=col, y=unique_id)
        scatter = p9.ggplot(flat_df, mapping) + p9.geom_point()
        print('Saving scatterplot: ' + out_name)
        p9.ggsave(filename=out_name, plot=scatter, device='png')
Example #3
0
def plot_bar_predictions(data, filenamePlot, x, y, facet, plot_size):
    """
    Build a faceted bar chart with a log10 y axis, save it and return it.

    Bars use the values of ``y`` as-is (stat='identity') and each bar is
    labelled with its ``y`` value. The image is written to
    ``images/plot_Bar_<filenamePlot>.png`` as a square of ``plot_size``
    inches at 300 dpi.

    Inputs:
        data: data passed to ggplot
        filenamePlot: string inserted into the output file name
        x, y: names mapped to the x and y aesthetics
        facet: facet specification passed to facet_wrap
        plot_size: width and height of the saved image, in inches

    Outputs: the plot object that was saved
    """
    bar_chart = (
        p9.ggplot(data, p9.aes(x=x, y=y))
        + p9.geom_bar(stat='identity')
        + p9.geom_text(p9.aes(label=y), size=7, va='bottom')
        + p9.facet_wrap(facet)
        + p9.scales.scale_y_log10()
        + p9.theme(axis_text_x=p9.element_text(angle=90, size=7.5))
        + p9.theme(subplots_adjust={'wspace': 0.25})
    )

    output_path = 'images/plot_Bar_' + filenamePlot + '.png'
    p9.ggsave(bar_chart,
              output_path,
              height=plot_size,
              width=plot_size,
              units='in',
              dpi=300)
    return bar_chart
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
            axis_line=element_line(color="grey"),
            legend_key=element_rect(fill='white', colour='white'),
            legend_title=element_text(family='sans-serif', size=15),
            legend_text=element_text(family='sans-serif', size=12),
            plot_title=element_text(family='sans-serif', size=15),
            axis_text=element_text(family='sans-serif', size=12),
            axis_title=element_text(family='sans-serif', size=15)
           ) \
    + scale_color_manual(['#1976d2', '#b3e5fc']) \

# Display the panel, then write it out once per output file
print(panel_A)
ggsave(plot=panel_A, filename=svcca_file, device="svg", dpi=300)
# The second target is the PNG copy (going by the variable name), so request
# PNG output instead of repeating "svg".
ggsave(plot=panel_A, filename=svcca_png_file, device="png", dpi=300)

# ### Uncorrected PCA

# In[14]:

# Keep only the entries of lst_num_partitions selected by pca_ind
lst_num_partitions = [lst_num_partitions[idx] for idx in pca_ind]

# Start from an empty frame
all_data_df = pd.DataFrame()

# Get batch 1 data: tab-separated, first row as header, first column as index
partition_1_file = os.path.join(compendia_dir, "Partition_1_0.txt.xz")
partition_1 = pd.read_table(
    partition_1_file,
    sep='\t',
    index_col=0,
    header=0,
)
Example #5
0
         name='Filtration Step',
         values=['#1b9e77', '#d95f02', '#7570b3', '#e7298a'],
         labels=[
             'All Variants', 'Common Variants',
             'Depth (< {} reads)'.format(replicate_filter_min_depth_count),
             'Depth (> {} reads)'.format(replicate_filter_max_depth_count)
         ]) + gg.xlab('Sample') + gg.ylab('Final Number of Variants') +
     gg.theme_bw() + gg.theme(axis_text_x=gg.element_text(angle='90'),
                              axis_text=gg.element_text(size=8),
                              axis_title=gg.element_text(size=14)))
p

# In[13]:

# Write the current plot `p` to figures/replicates_filtration_results.pdf
figure_file = os.path.join('figures', 'replicates_filtration_results.pdf')
gg.ggsave(p, figure_file, dpi=500, width=6.5, height=5.5)

# In[14]:

# Dodged bars of COSMIC_count per lane, filled by filter_min_depth_count and
# faceted by final_id; each bar carries its log_mut_count as a label at y=10.
p = (
    gg.ggplot(
        filter_counts_df,
        gg.aes(x='lane', y='COSMIC_count', fill='filter_min_depth_count'),
    )
    + gg.geom_bar(stat='identity', position='dodge')
    + gg.geom_text(gg.aes(y=10, label='log_mut_count'), size=5, colour='white')
    + gg.scale_fill_gradient(low='blue', high='red', name='All Variants')
    + gg.facet_wrap('~ final_id')
    + gg.xlab('Lane')
    + gg.ylab('Number of COSMIC Variants')
    + gg.theme_bw()
    + gg.theme(
        axis_text_x=gg.element_text(angle='90'),
        axis_text=gg.element_text(size=8),
        axis_title=gg.element_text(size=14),
    )
)
p
Example #6
0
      str(min_acceptable_range), " to ", str(max_acceptable_range), "\n\n")

# Write the outlier table next to the input file (<name>_outliers.csv)
outlierfile = filename.replace('.csv', '_outliers.csv')
data_output.to_csv(outlierfile, index=False)

# Overlay of IQR outliers (boxplot) and mod-Z score outliers (jittered points)
# NOTE(review): `outlier_alpha` is mapped inside aes() on geom_jitter; it may
# have been meant as a plain alpha setting -- confirm before changing.
p = (
    p9.ggplot(
        data=data_output,
        mapping=p9.aes(x='age_rounded', y='value', group='age_rounded'),
    )
    + p9.geom_jitter(mapping=p9.aes(color='z_outlier', outlier_alpha=0.1))
    + p9.geom_boxplot(outlier_size=0, outlier_stroke=0)
    + p9.ggtitle(
        "Outliers detected via the IQR method (boxplot)\nand modified z-score method (dotplot)"
    )
    + p9.ylim(-10, 175)
)
print(p)

# The plot file name carries no extension
plotfile = filename.replace('.csv', '_outlierplot')
p9.ggsave(plot=p, filename=plotfile)

# Regression plot: the medians as points, then the three fitted curves
x = data_stats_regression['age_rounded']
y = data_stats_regression['median']
plt.plot(x, y, 'o')
plt.plot(x, r.func_linear(x, *linear_coeff))
plt.plot(x, r.func_log(x, *log10_coeff))
plt.plot(x, r.func_ln(x, *ln_coeff))
plt.title(
    "Regression performed on medians of age 1, 3 and 5\ndata with outliers removed"
)
plt.show()
                show_legend=False) \
    + labs(x = "Number of Partitions",
           y = "Similarity score (SVCCA)",
           title = "Similarity across varying numbers of partitions") \
    + theme(plot_title=element_text(weight='bold'),
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
            axis_line=element_line(color="grey"),
            legend_key=element_rect(fill='white', colour='white')
           ) \
    + scale_color_manual(['#b3e5fc']) \

print(g)
ggsave(plot=g, filename=svcca_uncorrected_file, dpi=300)

# In[9]:

# Plot - black
import numpy as np  # pd.np is gone from pandas 2.0, so use numpy directly

# First half of the index of all_svcca
lst_num_experiments = list(all_svcca.index[0:int(len(all_svcca.index) / 2)])

# permuted_score repeated once per entry of lst_num_experiments, as a
# single 'score' column
threshold = pd.DataFrame(np.tile(permuted_score,
                                 (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])

g = ggplot(all_svcca[all_svcca['Group'] == 'uncorrected'])     + geom_line(all_svcca[all_svcca['Group'] == 'uncorrected'],
                aes(x=lst_num_experiments, y='score', color='Group'),
                size=1.5) \
    + geom_point(aes(x=lst_num_experiments, y='score'),
# Stack the input and simulated frames into one
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Scatter of columns '1' vs '2', coloured by 'dataset'
g_input_sim = (
    ggplot(combined_data_df, aes(x='1', y='2'))
    + geom_point(aes(color='dataset'), alpha=0.3)
    + labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP of original and simulated data")
    + theme_bw()
    + theme(
        legend_title_align = "center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        plot_title=element_text(weight='bold'),
    )
    # Draw the legend keys fully opaque even though the points are translucent
    + guides(colour=guide_legend(override_aes={'alpha': 1}))
    + scale_colour_manual(["grey", '#87CEFA'])
)

print(g_input_sim)
ggsave(plot=g_input_sim, filename=umap_overlay_file, dpi=300)

# ## 2. Visualize effects of multiple experiments in PCA space

# In[13]:

get_ipython().run_cell_magic(
    'time', '',
    '\nall_data_df = pd.DataFrame()\n\n# Get batch 1 data\npartition_1_file = os.path.join(\n    partition_dir,\n    "Partition_1.txt.xz")\n\npartition_1 = pd.read_table(\n    partition_1_file,\n    header=0,\n    index_col=0,\n    sep=\'\\t\')\n\n\nfor i in lst_num_partitions:\n    print(\'Plotting PCA of 1 partition vs {} partition...\'.format(i))\n    \n    # Simulated data with all samples in a single partition\n    original_data_df =  partition_1.copy()\n    \n    # Add grouping column for plotting\n    original_data_df[\'num_partitions\'] = \'1\'\n    \n    # Get data with additional partitions added\n    partition_other_file = os.path.join(\n        partition_dir,\n        "Partition_"+str(i)+".txt.xz")\n\n    partition_other = pd.read_table(\n        partition_other_file,\n        header=0,\n        index_col=0,\n        sep=\'\\t\')\n    \n    # Simulated data with i partitions\n    partition_data_df =  partition_other\n    \n    # Add grouping column for plotting\n    partition_data_df[\'num_partitions\'] = \'multiple\'\n    \n    # Concatenate datasets together\n    combined_data_df = pd.concat([original_data_df, partition_data_df])\n\n    # PCA projection\n    pca = PCA(n_components=2)\n\n    # Encode expression data into 2D PCA space\n    combined_data_numeric_df = combined_data_df.drop([\'num_partitions\'], axis=1)\n    combined_data_PCAencoded = pca.fit_transform(combined_data_numeric_df)\n\n\n    combined_data_PCAencoded_df = pd.DataFrame(combined_data_PCAencoded,\n                                               index=combined_data_df.index,\n                                               columns=[\'PC1\', \'PC2\']\n                                              )\n                                              \n    # Variance explained\n    print(pca.explained_variance_ratio_)  \n    \n    # Add back in batch labels (i.e. 
labels = "batch_"<how many batch effects were added>)\n    combined_data_PCAencoded_df[\'num_partitions\'] = combined_data_df[\'num_partitions\']\n    \n    # Add column that designates which batch effect comparision (i.e. comparison of 1 batch vs 5 batches\n    # is represented by label = 5)\n    combined_data_PCAencoded_df[\'comparison\'] = str(i)\n    \n    # Concatenate ALL comparisons\n    all_data_df = pd.concat([all_data_df, combined_data_PCAencoded_df])\n    \n    # Plot individual comparisons\n    print(ggplot(combined_data_PCAencoded_df, aes(x=\'PC1\', y=\'PC2\')) \\\n          + geom_point(aes(color=\'num_partitions\'), alpha=0.2) \\\n          + labs(x = "PC 1", y = "PC 2", title = "Partition 1 and Partition {}".format(i))\\\n          + theme_bw() \\\n          + theme(\n                legend_title_align = "center",\n                plot_background=element_rect(fill=\'white\'),\n                legend_key=element_rect(fill=\'white\', colour=\'white\'), \n                plot_title=element_text(weight=\'bold\')\n            ) \\\n          + guides(colour=guide_legend(override_aes={\'alpha\': 1})) \\\n          + scale_colour_manual(["grey", \'#b3e5fc\'])\n         )             '
)

# In[14]:

# Convert 'num_experiments' into categories to preserve the ordering
lst_num_partitions_str = [str(i) for i in lst_num_partitions]
num_partitions_cat = pd.Categorical(all_data_df['num_partitions'],
Example #9
0
# In[16]:

# Plot
import numpy as np  # pd.np is gone from pandas 2.0, so use numpy directly

# permuted_score repeated once per entry of lst_num_experiments, as a
# single 'score' column; drawn below as the dashed reference line
threshold = pd.DataFrame(np.tile(permuted_score,
                                 (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])

g = (
    ggplot(similarity_score_df, aes(x=lst_num_experiments, y='score'))
    + geom_line()
    + geom_line(aes(x=lst_num_experiments, y='score'), threshold, linetype='dashed')
    + labs(x = "Number of Experiments",
           y = "Similarity score (SVCCA)",
           title = "Similarity after correcting for experiment variation")
    + theme_bw()
    + theme(plot_title=element_text(weight='bold'))
)

print(g)
ggsave(plot=g, filename=svcca_file, dpi=300)

# In[17]:

# Plot - black
threshold = pd.DataFrame(pd.np.tile(permuted_score,
                                    (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])

g = ggplot(similarity_score_df, aes(x=lst_num_experiments, y='score'))     + geom_line(colour="white")     + geom_line(aes(x=lst_num_experiments, y='score'), threshold, colour="white", linetype='dashed')     + labs(x = "Number of Experiments",
           y = "Similarity score (SVCCA)",
           title = "Similarity after correcting for experiment variation") \
    + theme(plot_title=element_text(weight='bold', colour="white"),
            plot_background=element_rect(fill="black"),
            panel_background=element_rect(fill="black"),
# Tag each frame with its origin before stacking them
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Concatenate input and simulated dataframes together
combined_data_df = pd.concat([input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot: the 'original' rows form the base layer and the 'simulated' rows are
# added as the last layer.
g_input_sim = (
    ggplot(combined_data_df[combined_data_df['dataset'] == 'original'], aes(x='1', y='2'))
    + geom_point(color='#d5a6bd', alpha=0.15)
    + labs(x = "UMAP 1",
           y = "UMAP 2",
           title = "Original and simulated data")
    + theme_bw()
    + theme(
        legend_title_align = "center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        plot_title=element_text(family='sans-serif', size=15),
        axis_text=element_text(family='sans-serif', size=12),
        axis_title=element_text(family='sans-serif', size=15),
    )
    + geom_point(combined_data_df[combined_data_df['dataset'] == 'simulated'],
                 alpha=0.09,
                 color='#cccccc')
)

print(g_input_sim)
ggsave(plot = g_input_sim, filename = umap_overlay_file, dpi=300)

# Convert the two size columns to numeric, downcast to the smallest integer type
comparison[["no_column", "data_length"]] = (
    comparison[["no_column", "data_length"]]
    .apply(pd.to_numeric, downcast="integer")
)

### Visual Exploration
saveformat = "png"
## Select

# pandas: q50 tiles by column count and row count, faceted pos_col ~ sel_col
plot = (
    gg.ggplot(pandas_sel, gg.aes("factor(no_column)", "factor(data_length)"))
    + gg.geom_tile(gg.aes(fill="q50"))
    + gg.geom_text(gg.aes(label="q50"), color="white", size=9)
    + gg.labs(y="# Rows", x="# Columns", title="Pandas median selection time")
    + gg.facet_grid("pos_col ~ sel_col")
    + gg.theme_bw()
    + gg.theme(legend_position=None)
)

gg.ggsave(
    plot,
    filename=os.path.join(path_n, "output", f"select_results_pandas.{saveformat}"),
    width=15,
    height=10,
)

# data.table: same layout, drawn from datatable_sel
plot = (
    gg.ggplot(datatable_sel, gg.aes("factor(no_column)", "factor(data_length)"))
    + gg.geom_tile(gg.aes(fill="q50"))
    + gg.geom_text(gg.aes(label="q50"), color="white", size=9)
    + gg.labs(y="# Rows", x="# Columns", title="data.table median selection time")
    + gg.facet_grid("pos_col ~ sel_col")
    + gg.theme_bw()
    + gg.theme(legend_position=None)
)

gg.ggsave(
    plot,
    filename=os.path.join(path_n, "output", f"select_results_datatable.{saveformat}"),
    width=15,
    height=10,
)

# comparison
plot = (gg.ggplot(comparison[comparison.operation == "select"], gg.aes("factor(no_column)", "factor(data_length)")) +
        gg.geom_tile(gg.aes(fill="q50")) +
        gg.geom_rug(gg.aes(color="Metadata_cell_line"),
                    show_legend={'color': False}) + \
        gg.theme_bw() + \
    gg.theme(
            subplots_adjust={"wspace": 0.2},
            axis_text=gg.element_text(size=7),
            axis_title=gg.element_text(size=9),
            strip_text=gg.element_text(size=6, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        ) + \
        gg.xlim([-0.5, 1]) + \
        gg.xlab("Median Correlation of All Guides Across Genes") + \
        gg.ylab("Density") + \
        gg.facet_wrap("~replicate_type", nrow=2, scales="free") + \
        gg.scale_fill_manual(name="Cell Line",
                             values=["#1b9e77", "#d95f02", "#7570b3"]) + \
        gg.scale_color_manual(name="Cell Line",
                              values=["#1b9e77", "#d95f02", "#7570b3"])
)

# Save the density figure once per extension, 3 x 2 inches at 500 dpi
file = os.path.join("figures", "median-guide-correlation-density")
for extension in ['.png', '.pdf']:
    gg.ggsave(
        cor_density_gg,
        filename=f'{file}{extension}',
        width=3,
        height=2,
        units='in',
        dpi=500,
    )

cor_density_gg
def main():
    """
    Draw violin plots of one column taken from each input file.

    Every file in ``args.infile`` is read with ``args.sep`` as separator.
    Column number ``args.col`` (1-based) is kept from each file, optionally
    renamed to the matching entry of ``args.col_names``, and melted to long
    format. The stacked result gets the column names ``args.x_name`` and
    ``args.y_name`` and is plotted with plotnine when ``args.use_p9`` is set,
    otherwise with seaborn. The image is written to
    ``<outpref>.violin.<img>``.

    ``args.y_lim`` and ``args.size`` are optional. Without a size the
    plotting library's default figure size is used.

    Returns: None
    """
    args = UserInput()

    # Optional settings; None means "not supplied by the user"
    y_lim = np.array(args.y_lim, dtype=np.float32) if args.y_lim else None
    size = np.array(args.size, dtype=np.float32) if args.size else None

###################################

    df_list = [
        pd.read_csv(f, sep=args.sep, skipinitialspace=True)
        for f in args.infile
    ]

    ## keep only column number args.col (1-based) from every input file
    lg_list = []
    for idx, df in enumerate(df_list):
        xdf = pd.DataFrame(df.iloc[:, int(args.col) - 1])

        if args.col_names:
            xdf.columns = [args.col_names[idx]]

        # melt gives two columns: the column label and the value
        lg_list.append(pd.melt(xdf))

    lg_df = pd.concat(lg_list)
    lg_df.columns = [args.x_name, args.y_name]
    print(lg_df)

    out_file = '{0}.violin.{1}'.format(args.outpref, args.img)

    ## plotnine method
    if args.use_p9:
        import plotnine as p9
        Quant = [.25, .5, .75]

        if y_lim is not None:
            set_ylim = p9.ylim(y_lim)
        else:
            # No limits given: span the observed data range
            set_ylim = p9.ylim(
                [lg_df[args.y_name].min(), lg_df[args.y_name].max()])

        df_plot = (p9.ggplot(
            lg_df, p9.aes(x=args.x_name, y=args.y_name, fill=args.x_name)) +
                   p9.geom_violin(
                       width=.75, draw_quantiles=Quant, show_legend=False) +
                   p9.ggtitle(args.title) + p9.theme_classic() + set_ylim +
                   p9.scale_x_discrete(limits=args.col_names) +
                   p9.theme(text=p9.element_text(size=12, color='black'),
                            axis_text_x=p9.element_text(angle=33),
                            panel_grid_major_y=p9.element_line(color='gray',
                                                               alpha=.5)))

        # Indexing `size` when it was not supplied used to raise TypeError;
        # pass None so plotnine uses its default dimensions.
        if size is not None:
            width, height = size[0], size[1]
        else:
            width, height = None, None

        p9.ggsave(filename=out_file,
                  plot=df_plot,
                  dpi=int(args.dpi),
                  format=args.img,
                  width=width,
                  height=height,
                  units='in',
                  verbose=False)

    else:
        ## Seaborn method
        import seaborn as sns
        sns.set(style='whitegrid')

        # The figure size belongs to the figure, not to savefig, so set it
        # before drawing and only when a size was supplied.
        if size is not None:
            plt.figure(figsize=tuple(size))

        ax = sns.violinplot(x=args.x_name,
                            y=args.y_name,
                            data=lg_df,
                            linewidth=1,
                            inner='box')
        if args.title:
            ax.set_title(args.title)
        if y_lim is not None:
            ax.set(ylim=y_lim)

        plt.savefig(out_file,
                    format=args.img,
                    dpi=int(args.dpi))
        plt.clf()
Example #14
0
# Summary statistics of the conditional attributes and the decision attribute
print('\nConditional attributes description: ')
print(conditional_attributes.describe())

print('\nDecision attribute description: ')
print(decision_attribute.describe())

# Make sure the output directory for the plots exists
if not os.path.isdir('./analyze'):
    os.mkdir('analyze')

# One count histogram per attribute, filled by the decision attribute
for i in conditional_attributes:
    filename = f'{i}-vs-class.png'
    plot = (
        ggplot(dataset, aes(x=i, fill=decision_attribute.name))
        + geom_histogram(stat="count")
    )
    ggsave(plot=plot, filename=filename, path='analyze', dpi=300, scale=1, verbose=False)

# The reverse view: the decision attribute on x, filled by each attribute
for i in conditional_attributes:
    filename = f'class-vs-{i!s}.png'
    plot = (
        ggplot(dataset, aes(x=decision_attribute.name, fill=i))
        + geom_histogram(stat="count")
    )
    ggsave(plot=plot, filename=filename, path='analyze', dpi=300, scale=1, verbose=False)

# Label-encode every column in place; the encoder is refitted per column
encoder = LabelEncoder()
for i in dataset.columns:
    dataset[i] = encoder.fit_transform(dataset[i])

# First six columns are taken as the features, 'class' as the target
normalize_conditional_attributes = dataset.iloc[:, :6]
normalize_decision_attribute = dataset['class']

# Splitting dataset
Example #15
0
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from plotnine import ggplot, geom_point, aes, ggsave
# Load the input table
df = pd.read_csv(
    r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\charpowersnumerical.csv"
)

# Embed with t-SNE (default settings) and dump the coordinates as text
df_tsne = TSNE().fit_transform(df)
np.savetxt(
    r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\tsne.txt",
    df_tsne,
    fmt="%s",
)

# NOTE(review): tsnealign.csv is not written by this script; presumably it is
# prepared from tsne.txt in a separate step -- confirm.
tsneplot = pd.read_csv(
    r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\tsnealign.csv"
)

# Scatter of V1 vs V2, coloured by V3 treated as a factor
tsneplotted = (
    ggplot(tsneplot, aes(x='V1', y='V2', color='factor(V3)'))
    + geom_point()
)
ggsave(
    tsneplotted,
    r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\pythontsne.png",
)
Example #16
0
 def test_ggsave(self):
     """Save `p` with no arguments and check the file at its default 'pdf' name."""
     ggsave(p)
     default_path = p._save_filename('pdf')
     assert_exist_and_clean(default_path, "default filename")
Example #17
0
 def test_ggsave(self):
     """Save `p` with no arguments and check the file at its default 'pdf' name."""
     ggsave(p)
     default_path = p._save_filename('pdf')
     assert_exist_and_clean(default_path, "default filename")
# Stack the input and simulated frames into one
combined_data_df = pd.concat([input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot: one facet per 'dataset', points coloured by 'experiment_id'; the rows
# whose experiment_id equals example_id are drawn again in red as the last layer.
fig = (
    ggplot(combined_data_df, aes(x='1', y='2'))
    + geom_point(aes(color='experiment_id'), alpha=0.1)
    + facet_wrap('~dataset')
    + labs(x ='UMAP 1',
           y = 'UMAP 2',
           title = 'UMAP of original and simulated data (gene space)')
    + theme_bw()
    + theme(
        legend_title_align = "center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        legend_title=element_text(family='sans-serif', size=15),
        legend_text=element_text(family='sans-serif', size=12),
        plot_title=element_text(family='sans-serif', size=15),
        axis_text=element_text(family='sans-serif', size=12),
        axis_title=element_text(family='sans-serif', size=15),
    )
    # Legend keys are drawn fully opaque although the points are translucent
    + guides(colour=guide_legend(override_aes={'alpha': 1}))
    + scale_color_manual(['red', '#bdbdbd'])
    + geom_point(data=combined_data_df[combined_data_df['experiment_id'] == example_id],
                 alpha=0.1,
                 color='red')
)

print(fig)
ggsave(plot=fig, filename=experiment_simulated_file)