Esempi in Python per FileHandling, esempi in Python per GEN_Utils.FileHandling

Esempio n. 1

0

Mostra file

File: utils.py Progetto: dezeraecox-manuscripts/SUI_Proteome-solubility

def image_processor(input_folder, image_name, output_path, save_file=False):
    """Split the whole-cell pixels of one image into cytoplasm and nucleus.

    Two CSV files are read through ``pixel_cleaner``:
    ``<input_folder><image_name>_cells_pixels.csv`` (whole-cell ROI pixels)
    and ``<input_folder><image_name>_nuclei_pixels.csv`` (nuclear pixels).
    Whole-cell pixels whose ``'X,Y'`` value also occurs in the nuclear table
    are reported as nucleus; the remainder are reported as cytoplasm.

    Args:
        input_folder: Path prefix of the input CSV files. It is concatenated
            directly with ``image_name``, so it must already end with a
            path separator.
        image_name: Image identifier used to build input and output names.
        output_path: Path prefix for the optional Excel workbook; also
            concatenated directly with the file name.
        save_file: When true, write the cytoplasm, nucleus and whole-cell
            tables to ``<output_path><image_name>_pixel_filtered.xlsx``.

    Returns:
        dict with the keys ``'cyto'``, ``'nuclei_pixels'``, ``'nucleus'`` and
        ``'whole_cell'``. ``'nuclei_pixels'`` is the table read from the
        nuclei CSV, whereas ``'nucleus'`` is the subset of whole-cell pixels
        that overlap it.
    """
    # NOTE(review): pixel_cleaner presumably returns a DataFrame holding an
    # 'X,Y' column plus an intensity column named after the ROI type
    # ('cells' / 'nuclei') - confirm against its definition.
    logger.info(f'Processing ROI pixels from {image_name}')
    results_path = input_folder + image_name + '_cells_pixels.csv'
    ROI_pixels = pixel_cleaner(results_path, image_name)
    ROI_pixels.rename(columns={'cells': 'Intensity'}, inplace=True)
    logger.debug(ROI_pixels.columns.tolist())

    # Repeat for nuclear pixel info
    nuclear_path = input_folder + image_name + '_nuclei_pixels.csv'
    logger.info(f'Processing nuclear pixels from {image_name}')
    nuclei_pixels = pixel_cleaner(nuclear_path, image_name)
    nuclei_pixels.rename(columns={'nuclei': 'Intensity'}, inplace=True)
    logger.debug(nuclei_pixels.columns.tolist())

    # One membership mask partitions the whole-cell pixels: rows whose
    # coordinate is present in the nuclear table are nucleus, all others
    # are cytoplasm.
    logger.info(f'Filtering nuclear pixels from ROI pixels for {image_name}')
    in_nucleus = ROI_pixels['X,Y'].isin(nuclei_pixels['X,Y'])
    cyto_pixels = ROI_pixels[~in_nucleus]
    logger.debug(cyto_pixels.columns.tolist())
    nuclear_pixels = ROI_pixels[in_nucleus]

    if save_file:
        # Save to excel file:
        logger.info(f'Saving individual excel files for {image_name}')
        FileHandling.df_to_excel(
            output_path + f'{image_name}_pixel_filtered.xlsx',
            ['cytoplasm', 'nucleus', 'whole_cell'],
            data_frames=[cyto_pixels, nuclear_pixels, ROI_pixels])
        logger.info(
            f'Pixels processed successfully for {image_name}! Results saved to {output_path}'
        )
    else:
        # Nothing was written, so do not claim that results were saved.
        logger.info(f'Pixels processed successfully for {image_name}!')

    return {
        'cyto': cyto_pixels,
        'nuclei_pixels': nuclei_pixels,
        'nucleus': nuclear_pixels,
        'whole_cell': ROI_pixels
    }

Esempio n. 2

0

Mostra file

# Express both time points relative to the intensity before activation (t0)
t0_intensity = before_after['Intensity_before']
before_after['norm_Intensity_before'] = (
    before_after['Intensity_before'] / t0_intensity)
before_after['norm_Intensity_after'] = (
    before_after['Intensity_after'] / t0_intensity)

# One sheet per mutant, restricted to the columns worth reporting
summary_columns = [
    'track_id', 'ROI_name_before', 'Intensity_before', 'Intensity_after',
    'norm_Intensity_before', 'norm_Intensity_after'
]
mutant_summary = {}
for group, df in before_after.groupby('mutant'):
    mutant_summary[group] = df[summary_columns]

FileHandling.df_to_excel(f'{output_path}first_last_summary.xlsx',
                         sheetnames=list(mutant_summary),
                         data_frames=list(mutant_summary.values()))

# Long-format table of the normalised intensities for seaborn plotting
id_columns = ['mutant', 'track_id']
normalised_columns = ['norm_Intensity_before', 'norm_Intensity_after']
plottable = before_after[id_columns + normalised_columns].melt(
    id_vars=id_columns,
    var_name='Activation',
    value_name='Intensity')

# Keep only the trailing 'before' / 'after' token of the melted column name
plottable['Activation'] = plottable['Activation'].str.rsplit(
    '_', n=1).str[-1]

#fig = scatbar_plot.scatbar_plot(x_col='mutant', y_col='Intensity', plotting_dfs=[plottable], hue_col='Activation', group_col='mutant')
#plt.savefig(f'{output_path}norm_first_last.png')

# melt to produce data for seaborn plotting raw intensity

Esempio n. 3

0

Mostra file

    fluo_data = raw_data.copy()[['phase', 'fluorescence']]
    fluo_data = fluo_data.groupby('phase').median().T
    fluo_data['sample'] = sample_name

    sample_data.append(fluo_data)

# Stack the per-sample tables collected in sample_data into one frame with a
# fresh 0..n-1 index.
summary_df = pd.concat(sample_data).reset_index(drop=True)

# The raw 'sample' label is split positionally: first character -> plate,
# remainder -> well. 'sample' is then overwritten with the name that
# sample_map assigns to the well (wells missing from sample_map become NaN).
summary_df['plate'] = summary_df['sample'].str[0]
summary_df['well'] = summary_df['sample'].str[1:]
summary_df['sample'] = summary_df['well'].map(sample_map)

summary_df.sort_values(['sample'], inplace=True)

FileHandling.df_to_excel(
    data_frames=[summary_df],
    sheetnames=['fluorescence_per_phase'],
    output_path=f'{output_folder}per_phase_median_TPE.xlsx')

# Generate equivalent dataset, ignoring phase
# NOTE: sample_data is rebound here to a dict keyed by sample name; each value
# is the median of that file's fluorescence column.
sample_data = {}
for filename in file_list:
    # Sample name is the file name without its extension
    sample_name = os.path.splitext(filename)[0]

    raw_data = pd.read_csv(f'{input_folder}{filename}')
    # Standardise the fluorescence column name across input files
    raw_data.rename(columns={fluorescence_col: "fluorescence"}, inplace=True)

    fluo_data = raw_data.copy()['fluorescence']
    sample_data[sample_name] = fluo_data.median()

# One row per sample: the former dict keys end up in an 'index' column after
# reset_index().
summary_df = pd.DataFrame.from_dict(sample_data, orient='index').reset_index()
summary_df.rename(columns={

Esempio n. 4

0

Mostra file

File: for_codes.py Progetto: dezeraecox/Investigator-Grants-2019

    })
# Length of each code as text, measured BEFORE the zero-padding below, so a
# five-character code is still five characters long here.
cleaned_codes['length'] = cleaned_codes['code'].astype(str).apply(len)
# Code length -> hierarchy level. Lengths 5 and 6 share level 3.
# NOTE(review): length 5 presumably covers six-digit codes whose leading zero
# was lost on import - confirm against the source data. Lengths not listed
# here map to NaN.
level_map = {2: 1, 4: 2, 5: 3, 6: 3}
cleaned_codes['level'] = cleaned_codes['length'].map(level_map)
cleaned_codes['code'] = cleaned_codes['code'].astype(str)
# Left-pad five-character codes with '0'; every other length is left as is.
cleaned_codes['code'] = [
    '0' + code if len(code) == 5 else code for code in cleaned_codes['code']
]
# Parent field = first two characters of the (padded) code.
cleaned_codes['parent_field_code'] = cleaned_codes['code'].str[0:2]
# Parent group = first four characters, or NaN when the code has fewer than
# four characters (the slice is then shorter than four).
cleaned_codes['parent_group_code'] = [
    code if len(code) == 4 else np.nan
    for code in cleaned_codes['code'].str[0:4]
]

# The helper 'length' column is left out of the Excel export ...
FileHandling.df_to_excel(data_frames=[cleaned_codes.drop('length', axis=1)],
                         sheetnames=['FOR_summary'],
                         output_path=f'{output_folder}for_summary.xlsx')

# Bare expression: only displays output in an interactive session and has no
# effect when run as a script.
cleaned_codes.tail(50)

# ... but is still present in the CSV export, because drop() above returned a
# new frame rather than modifying cleaned_codes.
cleaned_codes.to_csv(f'{output_folder}for_summary.csv')

# split on ' ' - any 6-digit codes will not be affected, whereas 4-digit codes will have two elements
# for all codes, keep element[0] and for four digit codes return map into 'description' column

# Generate mapped column for 2-digit and 4-digit codes (by splitting number, then collecting first two or first four digits?)

# Generate reverse dictionary mapping description to each set of codes (two, four or six-digit)

# Import summary data for field of research

Esempio n. 5

0

Mostra file

File: pixel_functions.py Progetto: dezeraecox/LEXY_Toolbox

def scattbar_plotter(summary_df,
                     group_xcol,
                     y_col,
                     hue_col,
                     output_path,
                     order=None,
                     hue_order=None):
    """Draw a bar plot overlaid with a swarm plot and custom error bars.

    Bars (seaborn ``barplot`` without its own error bars) and individual
    points (seaborn ``swarmplot``) are drawn on the same axes, grouped by
    ``group_xcol`` and split by ``hue_col``. Error bars showing the standard
    deviation of ``y_col`` per group/hue combination are then added at the
    bar centres. The figure is exported through ``FileHandling.fig_to_pdf``
    and ``FileHandling.fig_to_svg`` and returned.

    Args:
        summary_df: DataFrame holding ``group_xcol``, ``y_col`` and
            ``hue_col``. It is not modified.
        group_xcol: Column providing the x-axis categories.
        y_col: Numeric column plotted on the y axis.
        hue_col: Column used to split each x category; also the legend title.
        output_path: Path prefix handed to the ``FileHandling`` exporters.
        order: Optional ordering of the x categories. Defaults to their order
            of first appearance in ``summary_df``.
        hue_order: Optional ordering of the hue levels.

    Returns:
        The matplotlib figure.
    """
    fig, ax = plt.subplots()

    br = sns.barplot(x=group_xcol,
                     y=y_col,
                     data=summary_df,
                     hue=hue_col,
                     dodge=True,
                     errwidth=1.25,
                     alpha=0.25,
                     ci=None,
                     order=order,
                     hue_order=hue_order,
                     ax=ax)
    sns.swarmplot(x=group_xcol,
                  y=y_col,
                  data=summary_df,
                  hue=hue_col,
                  dodge=True,
                  order=order,
                  hue_order=hue_order,
                  ax=ax)

    leg_label = hue_col
    # To generate custom error bars
    # Explicit emptiness test so array-like `order` values do not raise on
    # truth-value evaluation.
    if order is None or len(order) == 0:
        order = list(summary_df[group_xcol].unique())
        logger.debug(f'Samples: {order}')

    if hue_order is not None and len(hue_order) > 0:
        # Rank each hue level by its position in hue_order so that the
        # (sorted) groupby output below follows the plotted hue order.
        # assign() returns a new frame, leaving the caller's data untouched.
        hue_map = {level: position for position, level in enumerate(hue_order)}
        summary_df = summary_df.assign(hue_pos=summary_df[hue_col].map(hue_map))
        hue_col = 'hue_pos'
    # nunique() ignores NaN (e.g. hue levels absent from hue_order), which are
    # not groupby keys either.
    number_groups = summary_df[hue_col].nunique()
    logger.debug(f'Number of groups: {number_groups}')

    bars = br.patches
    xvals = sorted(bar.get_x() + bar.get_width() / 2 for bar in bars)

    # Select y_col before aggregating so that unrelated non-numeric columns
    # cannot break mean()/std(). Transposing, indexing by `order` and
    # transposing back reorders the first index level to match the x axis.
    grouped = summary_df.groupby([group_xcol, hue_col])[[y_col]]
    yvals = grouped.mean().T[order].T[y_col]
    yerr = grouped.std().T[order].T[y_col]

    # NOTE(review): pairing sorted bar centres with the grouped values assumes
    # every group/hue combination is present and that hue levels are drawn in
    # the same order groupby sorts them - confirm for data with gaps.
    (_, caps, _) = ax.errorbar(x=xvals,
                               y=yvals,
                               yerr=yerr,
                               capsize=4,
                               elinewidth=1.25,
                               ecolor="black",
                               linewidth=0)
    for cap in caps:
        cap.set_markeredgewidth(2)
    ax.set_ylabel(y_col)

    # To only label once in legend (bar and swarm layers both add entries)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[0:number_groups],
              labels[0:number_groups],
              bbox_to_anchor=(1.26, 1.05),
              title=leg_label)

    # rotate tick labels
    for label in ax.get_xticklabels():
        label.set_rotation(45)

    ax.set_xlabel(group_xcol)

    plt.tight_layout()
    plt.autoscale()

    FileHandling.fig_to_pdf([fig], output_path + '_scattbar')
    FileHandling.fig_to_svg(['scattbar'], [fig], output_path)

    return fig

Esempio n. 6

0

Mostra file

File: stats_metrics.py Progetto: dezeraecox/Investigator-Grants-2019

# Clean both metric columns with the shared value_checker helper
metrics['fwci_awarded'] = metrics['fwci_awarded'].apply(value_checker)
metrics['pubs_awarded'] = metrics['pubs_awarded'].apply(value_checker)

# Collect datapoints per year and award level, keyed as '<year>_<level>'
pubs_list = {}
fwci_list = {}

# Group by the bare column name: grouping by the one-element list ['Year']
# yields 1-tuple keys such as (2019,) on pandas >= 2.0, which would turn the
# generated column names into '(2019,)_<level>'.
for year, df in metrics.groupby('Year'):
    for level, data in df.groupby('type_cat'):
        pubs_list[f'{year}_{level}'] = list(data['pubs_awarded'])
        fwci_list[f'{year}_{level}'] = list(data['fwci_awarded'])
# Generate separate dataframes; wrapping each list in a Series lets columns
# of unequal length be padded with NaN.
pubs = pd.DataFrame({k: pd.Series(v) for k, v in pubs_list.items()})
fwci = pd.DataFrame({k: pd.Series(v) for k, v in fwci_list.items()})

FileHandling.df_to_excel(data_frames=[pubs, fwci], sheetnames=['pubs', 'fwci'], output_path=f'{output_folder}metrics_per_year.xlsx')

# Collect cols for each level
levels = ['_1', '_2', '_3']

for level in levels:

    # Substring match on the level suffix of the '<year>_<level>' column names
    cols = [col for col in pubs.columns.tolist() if level in col]
    test_df = pubs[cols].melt(value_name='publications', var_name='Group')

    # One-way ANOVA: 'Group' is the only between-subject factor
    aov = pg.anova(data=test_df, dv='publications', between='Group',
                   export_filename=f'{output_folder}anova_pubs{level}.csv')
    pg.print_table(aov)

    # FDR-corrected post hocs with Hedges'g effect size

Esempio n. 7

0

Mostra file

    'object_number', 'mutant', 'image_number'
]
# Keep only the columns of interest and give them their short names
compiled_spots = compiled_raw_data[cols_of_interest]
compiled_spots.columns = new_col_names

# Save to csv
compiled_spots.to_csv(f'{output_folder}compiled_spots.csv')

# Calculate interesting bits
# Every statistic is computed per (mutant, image_number). Columns are
# selected BEFORE aggregating so that unrelated (possibly non-numeric)
# columns are never aggregated; on pandas >= 2.0 calling mean() on such
# columns raises instead of silently dropping them.
spots_per_image = compiled_spots.groupby(['mutant', 'image_number'])

## mean of area and mean_intensity
spot_calculations = spots_per_image[['area',
                                     'mean_intensity']].mean().reset_index()

## count per cell
spot_count = spots_per_image['object_number'].count().reset_index()
## lowest min_intensity per cell
spot_min = spots_per_image['min_intensity'].min().reset_index()

# concat into final summary df by outer-merging on the grouping keys
spot_summary = reduce(
    lambda df1, df2: pd.merge(
        df1, df2, on=['mutant', 'image_number'], how='outer'),
    [spot_calculations, spot_count, spot_min])
spot_summary.rename(columns={'object_number': 'count'}, inplace=True)

FileHandling.df_to_excel(f'{output_folder}spot_summary.xlsx',
                         sheetnames=['spot_summary'],
                         data_frames=[spot_summary])

Esempio n. 8

0

Mostra file

# Build one combined summary over all images, labelling every remaining row
# with a '<mutant>_<position>_<row number>' cell_id.
summary_list = []
for key, summary_df in cyto_summary.items():
    summary_df = summary_df.dropna()

    # The mCherry image name carries the mutant (token 0) and position (token 2)
    name_tokens = summary_df['image_name_mCherry'].str.split('_')
    summary_df['mutant'] = name_tokens.str[0]
    summary_df['position'] = name_tokens.str[2]

    # Running row number within this image, stored as text
    summary_df['number'] = list(map(str, range(summary_df.shape[0])))

    summary_df['cell_id'] = [
        '_'.join(id_parts)
        for id_parts in zip(summary_df['mutant'], summary_df['position'],
                            summary_df['number'])
    ]

    summary_dict[key]['cyto'] = summary_df
    summary_list.append(summary_df)
mean_cell_summary = pd.concat(summary_list)

FileHandling.df_to_excel(output_path + 'mean_cell_summary.xlsx',
                         sheetnames=['summary'],
                         data_frames=[mean_cell_summary.reset_index()])


# # Checking out the summary results
# pixel_dict.keys() # shows the image names that were processed i.e. key 1
# pixel_dict['barnaseWT+Q25_pos_001_t_0']['cyto'].keys() # Shows the available dataframes for each image i.e. key 2
# pixel_dict['Ex2_pos_1_t_0']['nuclei_pixels'] # shows the cytoplasm pixel summary for the first image
# summary_dict.keys()
# summary_dict['Ex2_pos_1_t_0'].keys()
#
# summary_dict['barnaseWT+Q25_pos_001_t_0']['cyto']


# plot individual image for t0 to test segmentation with ROI name overlayed
for image_name in [name for name in image_names if 't_0' in name]:
    pixel_cyto = pixel_dict[image_name]['cyto']

Esempio n. 9

0

Mostra file

    filename for filename in os.listdir(input_folder)
    if 'Scholarly_Output' in filename
]

# Read in raw scival info, clean
# Read every matching workbook, skipping the first 12 rows of each sheet, and
# stack them into a single frame.
pubs_raw = []
for filename in file_list:
    pubs_raw.append(pd.read_excel(f'{input_folder}{filename}', skiprows=12))
pubs_raw = pd.concat(pubs_raw)
# The first (unnamed) column becomes 'name'; all remaining 'Unnamed' columns
# are dropped, and 'name' becomes the index.
pubs_clean = pubs_raw.rename(columns={'Unnamed: 0': 'name'})
pubs_clean = pubs_clean.drop(
    [col for col in pubs_clean if 'Unnamed' in str(col)], axis=1)
pubs_clean.set_index('name', inplace=True)
# Bare expression: only displays output in an interactive session.
pubs_clean.head(20)

# Bare expression used as a spot check; .loc raises KeyError when this exact
# label is not in the index.
pubs_clean.loc['Jenkins, Misty R.']

# count number of pubs 10 years before award
# NOTE: np.arange(year - 10, year + 1) includes both end points, i.e. the
# award year and the ten preceding years (11 column labels in total).
# NOTE(review): assumes the remaining column labels are integer years and
# that every name in search_dict is present in the index - confirm.
pubs = {}
# test_dict = {k: v for k, v in random.sample(search_dict.items(), 10)}
for name, year in search_dict.items():
    past_range = np.arange(year - 10, year + 1, 1)
    pubs[name] = pubs_clean.loc[name, past_range].sum()

# Names without an entry in pubs are mapped to NaN
awardees['pubs_awarded'] = awardees['match_name'].map(pubs)

# Save completed mapping to excel
FileHandling.df_to_excel(
    data_frames=[awardees],
    sheetnames=['fwci_pubs'],
    output_path=f'{output_folder}ten_year_metrics_summary.xlsx')

Esempio n. 10

0

Mostra file

# Summarise the GFP intensity of the nuclear and cytoplasmic pixels of every
# image. Each image/pixel-type pair is stored as one COLUMN of cell_summary
# (rows = statistic names); the frame is transposed afterwards.
types = ['nucleus', 'cyto']
cell_summary = pd.DataFrame(index=[
    'image_name', 'pixel_type', 'mean', 'median', 'std', 'mode', 'geomean'
])
for image, dictionary_1 in filtered_pixels.items():
    for pixel_type, df in dictionary_1.items():
        if pixel_type in types:
            logger.info(f'Processing {pixel_type} for {image}.')
            logger.info(f'Original shape: {df.shape}')
            # Keep only pixels below the mCherry threshold
            df = df[df['mCherry_intensity'] < mCherry_threshold]
            logger.info(f'New shape: {df.shape}')
            # calculate relevant info (order must match the index above)
            stats = [
                image, pixel_type, df['GFP_intensity'].mean(),
                df['GFP_intensity'].median(), df['GFP_intensity'].std(),
                df['GFP_intensity'].mode()[0],
                geomean(list(df['GFP_intensity']))
            ]
            cell_summary[f'{image}_{pixel_type}'] = stats

# Clean up stats df: one row per image/pixel-type pair
cell_summary = cell_summary.T.reset_index(drop=True)
# Add column for the mutant, replicate, taken from the first and second
# '_'-separated token of the image name. The split is expanded into columns
# explicitly: unpacking the `.str` accessor relied on iterating it, which
# pandas 2.0 no longer supports.
name_parts = cell_summary['image_name'].str.split('_', expand=True)
cell_summary['mutant'] = name_parts[0]
cell_summary['replicate'] = name_parts[1]

# Save summary to excel
FileHandling.df_to_excel(output_path + 'GFP_stats_summary.xlsx',
                         sheetnames=['GFP_summary'],
                         data_frames=[cell_summary])

Esempio n. 11

0

Mostra file

        proportion = round(sample_dict['proportions'][phase], 2)
        plt.annotate(f'{phase}:{proportion}', (0.2, phase_pos[x]),
                     xycoords='figure fraction')
    plt.legend()
    plt.title(sample_name)
    plt.ylabel('Count')
    plt.savefig(f'{output_folder}plots/{sample_name}_predict.png')

    data_dict[sample_name] = sample_dict

# Write each sample's cluster-labelled ('normalised') table to its own csv
for sample, sample_results in data_dict.items():
    data = sample_results['normalised']
    data.to_csv(f"{output_folder}normalised/{sample}.csv")

# Gather the G / S / M proportions of every sample into one table, one row
# per sample
phase_columns = ['G', 'S', 'M']
summary = pd.DataFrame(index=data_dict.keys(), columns=phase_columns)

for sample in data_dict:
    proportions = data_dict[sample]['proportions']
    for phase_label in phase_columns:
        summary.loc[sample, phase_label] = proportions[phase_label]

# Turn the sample names from the index into a regular 'sample' column
summary = summary.rename_axis('sample').reset_index()

FileHandling.df_to_excel(output_path=f'{output_folder}summary.xlsx',
                         sheetnames=['phase_proportion'],
                         data_frames=[summary])

Esempio n. 12

0

Mostra file

File: summary_stats.py Progetto: dezeraecox/Investigator-Grants-2019

    test_data['Year'] = year
    test_data.drop(grant_cols, axis=1, inplace=True)
    test_data.rename(columns={name_col: 'CIA_name'}, inplace=True)

    name_summary.append(test_data)

# Stack the per-year tables collected in name_summary (rebinding the name
# from the collection to the combined frame) and renumber the rows.
name_summary = pd.concat(name_summary).reset_index(drop=True)

# Drop the first space-separated token of the CIA name and re-join the rest.
# NOTE(review): the first token is presumably a title (Dr/Prof/...) - confirm.
name_summary['name'] = name_summary['CIA_name'].str.split(
    ' ').str[1:].str.join(' ')

# Collapse grant types into three numeric categories (1-3); types missing
# from this mapping become NaN.
type_mapper = {'ECF': 1, 'CDF': 2, 'RF': 3, 'L': 3, 'EL2': 2, 'EL1': 1}
name_summary['type_cat'] = name_summary['Type'].map(type_mapper)

# Save all to excel
# data_frames and sheetnames are parallel lists: the frame at position i is
# written to the sheet named at position i, so keep both in the same order.
data_frames = [
    awarded_summary, gender_summary, title_summary, title_proportions,
    research_summary, research_proportions, state_summary, institute_summary,
    kw_summary, name_summary
]
sheetnames = [
    'total_rates', 'per_gender', 'CIA_title', 'title_proportion_per_year',
    'field_of_research', 'broad_research_proportions', 'state_summary',
    'institute_summary', 'key_word_summary', 'name_summary'
]

FileHandling.df_to_excel(data_frames=data_frames,
                         sheetnames=sheetnames,
                         output_path=f'{output_folder}summary_stats.xlsx')

Esempio n. 13

0

Mostra file

File: confluency.py Progetto: dezeraecox/UTILS_cell-culture

    day_cleaning['Plate #'] = day_cleaning['Well #'].str.split('_',
                                                               expand=True)[0]
    day_cleaning['Treatment #'] = day_cleaning['Well #'].str.split(
        '_', expand=True)[2]
    day_cleaning['Plate_coords'] = day_cleaning['Plate #'].map(
        str) + '_' + day_cleaning['Treatment #'].map(str)
    day_cleaning['day'] = x + 1
    day_cleaning['sample'] = day_cleaning['Plate_coords'].map(sample_map)

    day_dict[day_name] = day_cleaning

# Combine the cleaned per-day tables into one frame
summary_df = pd.concat(day_dict.values())

FileHandling.df_to_excel(data_frames=[summary_df],
                         sheetnames=['summary'],
                         output_path=f'{output_path}summary.xlsx')

# Generate data for plotting: one table per plate

plotting_plates = []
for group, data in summary_df.groupby('Plate #'):
    plot_df = data[['Well #', 'sample', 'day', '%Area']].reset_index()
    # Exclude unused wells to allow easier plotting. Take an explicit copy so
    # the column added below is written to an independent frame rather than
    # to a filtered view (avoids pandas' SettingWithCopyWarning).
    plot_df = plot_df[plot_df['sample'] != 'Unused'].copy()
    # Integer code of each day (0, 1, ... in sorted day order) within the plate
    plot_df['Hue position'] = plot_df['day'].astype("category").cat.codes
    plotting_plates.append(plot_df)

# Generate single plot with each plate: one row of axes per distinct plate
dimensions = len(set(summary_df['Plate #']))
fig, axes = plt.subplots(dimensions, 1, figsize=(8, dimensions * 3))