def write_summary(out_path, genes_by_merged_signature, meta_genes,
                  global_variables, mde_dict):
    """Write the per-signature summary table to ``Signature_summary.csv``.

    One row is produced for every key of ``meta_genes``. A row holds the
    label ``signature_<rank>``, the number of genes stored under that key in
    ``genes_by_merged_signature`` and the string form of each meta gene
    value, all separated by tabs (the file is tab-delimited even though it
    carries a ``.csv`` extension). Rows are ranked by descending gene count.

    Args:
        out_path: directory in which the summary file is created.
        genes_by_merged_signature: mapping of signature key to a sized
            collection; only ``len()`` of each entry is used here.
        meta_genes: mapping of signature key to an iterable of values.
        global_variables: mapping that provides "samples_by_sample_groups".
        mde_dict: mapping that provides "order_list".
    """
    # one (gene count, row body) pair per signature
    rows = []
    for key in meta_genes:
        total_genes = len(genes_by_merged_signature[key])
        values = "\t".join(str(x) for x in meta_genes[key])
        rows.append((total_genes, str(total_genes) + "\t" + values + "\n"))

    # largest signatures first; list.sort is stable (also with reverse=True),
    # so signatures of equal size keep the iteration order of meta_genes
    rows.sort(key=lambda row: row[0], reverse=True)

    # create header
    sample_names = get_samples_ordered_by_order_list(
        mde_dict["order_list"], global_variables["samples_by_sample_groups"])
    header = "signature\tsignature_size\t" + "\t".join(sample_names) + "\n"

    # writes the summary: the header is emitted explicitly first instead of
    # relying on the iteration order of a dict keyed 0..n
    with open(os.path.join(out_path, "Signature_summary.csv"), "w+") as f:
        f.write(header)
        for rank, (_, row_body) in enumerate(rows, 1):
            f.write("signature_" + str(rank) + "\t" + row_body)
def add_ne_specific_parameters(global_variables, pr_dictionary):
    """Add the "Normalised Expression" workflow entries to ``pr_dictionary``.

    The ne workflow section header is registered through ``add_subsection_r``
    and the dictionary it returns is then filled with the sample / sample
    group lists read from ``global_variables`` together with the R code
    strings derived from them.

    Args:
        global_variables: mapping that provides
            "sample_groups_default_order", "sample_groups_by_column",
            "samples_by_sample_groups" and "sample_sheet_column_names".
        pr_dictionary: mapping that provides "r_bin_path"; it is passed to
            ``add_subsection_r`` and the result receives the new entries.

    Returns:
        The dictionary returned by ``add_subsection_r`` with the ne entries
        set.
    """
    def r_character_vector(values):
        # formats the given strings as c("a","b",...)
        return "c(\"" + "\",\"".join(values) + "\")"

    # adds the r subsection workflow type
    header_file = os.path.join(
        pr_dictionary["r_bin_path"], "section_header", "ne_workflow.txt")
    pr_dictionary = add_subsection_r(
        header_file, "subsection_r_workflow_type",
        "section_header/ne_workflow.txt", pr_dictionary)

    # sample and sample group information for ne
    groups = global_variables["sample_groups_default_order"]
    groups_by_column = global_variables["sample_groups_by_column"]
    samples_by_group = global_variables["samples_by_sample_groups"]
    column_names = global_variables["sample_sheet_column_names"]

    # only as many leading groups as the first sample sheet column holds
    leading_groups = groups[0:len(groups_by_column[0])]
    ordered_samples = get_samples_ordered_by_order_list(
        leading_groups, samples_by_group)

    # R code strings, built in the same order as they are stored below
    samples_r = r_character_vector(ordered_samples)
    groups_r = r_character_vector(groups)
    groupings_r = get_r_string_sample_groupings(groups, samples_by_group)
    samples_by_group_r = get_r_string_samples_by_sample_group(
        groups, samples_by_group)
    groups_by_column_r = get_r_string_sample_groups_by_SS_column(
        groups, groups_by_column)
    groupings_by_column_r = get_r_string_sample_groupings_by_SS_column(
        groups, groups_by_column, samples_by_group)
    sample_colours_r = get_r_string_default_samples_colours_by_SS_column(
        samples_by_group, groups, groups_by_column)
    group_colours_r = get_r_string_default_sample_group_colours_by_SS_column(
        samples_by_group, groups, groups_by_column)
    column_names_r = r_character_vector(column_names)

    # updates the pr dictionary, one entry at a time and in a fixed order
    entries = (
        ("workflow_ID", "Normalised Expression"),
        ("sample_sheet_column_names", column_names),
        ("order_list", groups),
        ("samples_ordered", ordered_samples),
        ("samples_r_string", samples_r),
        ("sample_groups_r_string", groups_r),
        ("sample_groupings_r_string", groupings_r),
        ("samples_by_sample_group_r_string", samples_by_group_r),
        ("sample_groups_by_SS_column_r_string", groups_by_column_r),
        ("sample_groupings_by_SS_column_r_string", groupings_by_column_r),
        ("default_sample_colours_by_SS_column_r_string", sample_colours_r),
        ("default_sample_group_colours_by_SS_column_r_string",
         group_colours_r),
        ("sample_sheet_column_names_r_string", column_names_r),
    )
    for entry_name, entry_value in entries:
        pr_dictionary[entry_name] = entry_value

    return pr_dictionary
def differential_expression_signature(global_variables, infile, out_path,
                                      pde_IDs, mpde_dict):
    """Build, merge and write out the differential expression signatures.

    The lines of ``infile`` are handed to the signature helpers, the
    resulting signatures are merged, the signature numbers are recorded in
    ``mpde_dict["de_signatures"]`` and the results are written below
    ``out_path`` through ``write_data`` and ``write_summary``.

    Args:
        global_variables: mapping that provides "samples_by_sample_groups".
        infile: path of the data file to read.
        out_path: directory handed to ``write_data`` and ``write_summary``.
        pde_IDs: passed unchanged to ``get_genes_by_signature``.
        mpde_dict: mapping that provides "order_list"; it is also passed to
            ``merge_signatures`` and is updated in place.

    Returns:
        ``mpde_dict`` with "de_signatures" set to the list 1..N, where N is
        the number of merged signatures.
    """
    # open data file; the context manager closes the handle once it is read
    with open(infile) as data_file:
        data = data_file.readlines()

    # gets a dictionary of genes by signature
    genes_by_signature, signatures_by_gene = get_genes_by_signature(
        data, pde_IDs)

    # adds the zscores to the genes by signatures
    sample_list = get_samples_ordered_by_order_list(
        mpde_dict["order_list"], global_variables["samples_by_sample_groups"])
    genes_by_signature = get_expression_data(
        data, sample_list, genes_by_signature, signatures_by_gene)

    # iteratively merges signatures
    genes_by_merged_signature, meta_genes = merge_signatures(
        genes_by_signature, mpde_dict, sample_list)

    # gets the number of signatures (for the report); materialised so the
    # stored value is a real list regardless of the interpreter version
    mpde_dict["de_signatures"] = list(
        range(1, len(genes_by_merged_signature) + 1))

    # write data out
    write_data(out_path, genes_by_merged_signature)
    write_summary(out_path, genes_by_merged_signature, meta_genes,
                  global_variables, mpde_dict)

    # returns the updated mpde dict
    return mpde_dict
def add_Mde_specific_parameters(global_variables, pr_dictionary,
                                workflow_parameter_dict):
    """Add the Mde workflow entries to ``pr_dictionary``.

    The mde workflow section header is registered through
    ``add_subsection_r``; the returned dictionary then receives the sample
    lists, the comparison IDs, the signature numbers and the R code strings
    derived from them. When ``global_variables["ora_flag"]`` is truthy, the
    per gene set ORA settings are stored as parallel lists as well.

    Args:
        global_variables: mapping that provides "samples_by_sample_groups"
            and "ora_flag" (plus "ora_parameters" when the flag is set).
        pr_dictionary: mapping that provides "r_bin_path".
        workflow_parameter_dict: mapping that provides "order_list",
            "de_IDs", "de_signatures", "mde_ID" and "signatures_scc".

    Returns:
        The dictionary returned by ``add_subsection_r`` with the Mde entries
        set.
    """
    # adds the r subsection workflow type
    header_file = os.path.join(
        pr_dictionary["r_bin_path"], "section_header", "mde_workflow.txt")
    pr_dictionary = add_subsection_r(
        header_file, "subsection_r_workflow_type",
        "section_header/mde_workflow.txt", pr_dictionary)

    # sample and sample group information for Mde
    groups = workflow_parameter_dict["order_list"]
    samples_by_group = global_variables["samples_by_sample_groups"]
    ordered_samples = get_samples_ordered_by_order_list(
        groups, samples_by_group)
    de_ids = workflow_parameter_dict["de_IDs"]
    signature_numbers = workflow_parameter_dict["de_signatures"]

    # R code strings
    samples_r = get_r_string_samples(ordered_samples)
    groups_r = get_r_string_sample_groups(groups)
    groupings_r = get_r_string_sample_groupings(groups, samples_by_group)
    samples_by_group_r = get_r_string_samples_by_sample_group(
        groups, samples_by_group)
    sample_colours_r = get_r_string_default_sample_colours(
        groups, samples_by_group)
    de_ids_r = "c(\"" + "\",\"".join(de_ids) + "\")"

    # updates the pr_dictionary, one entry at a time and in a fixed order
    entries = (
        ("workflow_ID", workflow_parameter_dict["mde_ID"]),
        ("signatures_scc", workflow_parameter_dict["signatures_scc"]),
        ("order_list", groups),
        ("samples_ordered", ordered_samples),
        ("comparisons", de_ids),
        ("comparisons_r_string", de_ids_r),
        ("samples_by_sample_groups", samples_by_group),
        ("sample_groups_r_string", groups_r),
        ("samples_r_string", samples_r),
        ("sample_groupings_r_string", groupings_r),
        ("samples_by_sample_group_r_string", samples_by_group_r),
        ("default_samples_colours_r_string", sample_colours_r),
        ("de_signatures", signature_numbers),
    )
    for entry_name, entry_value in entries:
        pr_dictionary[entry_name] = entry_value

    # gets the hypergeometric gene set types
    if global_variables["ora_flag"]:
        # (key read from each parameter dict, key written to pr_dictionary)
        ora_fields = (
            ("type", "hypergeom_gene_set_types"),
            ("min_set_size", "hypergeom_gene_set_min_set_sizes"),
            ("max_set_size", "hypergeom_gene_set_max_set_sizes"),
            ("p_threshold", "hypergeom_gene_set_p_thresholds"),
            ("fold_threshold", "hypergeom_gene_set_fold_thresholds"),
            ("network_overlap_ratio",
             "hypergeom_gene_set_network_overlap_ratios"),
        )
        gathered = dict((target, []) for _, target in ora_fields)

        # single pass over the parameter dicts, one value per output list
        for gene_set_parameters in global_variables["ora_parameters"]:
            for source, target in ora_fields:
                gathered[target].append(gene_set_parameters[source])

        for _, target in ora_fields:
            pr_dictionary[target] = gathered[target]

    return pr_dictionary
def add_de_specific_parameters(global_variables, pr_dictionary,
                               workflow_parameter_dict):
    """Add the de workflow entries to ``pr_dictionary``.

    The de workflow section header is registered through
    ``add_subsection_r``; the returned dictionary then receives the
    comparison settings, the sample lists, the R code strings derived from
    them and the chromosome list obtained from ``de_annotated.csv``. When
    ``global_variables["ora_flag"]`` / ``["ura_flag"]`` are truthy, the
    matching per-entry settings are stored as parallel lists as well.

    Args:
        global_variables: mapping that provides "samples_by_sample_groups",
            "ora_flag" and "ura_flag" (plus "ora_parameters" /
            "ura_parameters" when the respective flag is set).
        pr_dictionary: mapping that provides "r_bin_path" and
            "workflow_outpath".
        workflow_parameter_dict: mapping that provides "order_list",
            "de_ID", "p_threshold", "fold_threshold", "numerator_group",
            "denominator_group", "de_file_path" and
            "differential_expression_set_size".

    Returns:
        The dictionary returned by ``add_subsection_r`` with the de entries
        set.
    """
    def store_parameter_lists(parameter_dicts, fields):
        # Turns a sequence of parameter dicts into one list per field and
        # stores each list in pr_dictionary. `fields` holds
        # (key read from each dict, key written to pr_dictionary) pairs.
        gathered = dict((target, []) for _, target in fields)
        for parameters in parameter_dicts:
            for source, target in fields:
                gathered[target].append(parameters[source])
        for _, target in fields:
            pr_dictionary[target] = gathered[target]

    # adds the r subsection workflow type
    header_file = os.path.join(
        pr_dictionary["r_bin_path"], "section_header", "de_workflow.txt")
    pr_dictionary = add_subsection_r(
        header_file, "subsection_r_workflow_type",
        "section_header/de_workflow.txt", pr_dictionary)

    # sample and sample group information for de
    groups = workflow_parameter_dict["order_list"]
    samples_by_group = global_variables["samples_by_sample_groups"]
    ordered_samples = get_samples_ordered_by_order_list(
        groups, samples_by_group)

    # R code strings
    samples_r = get_r_string_samples(ordered_samples)
    groups_r = get_r_string_sample_groups(groups)
    groupings_r = get_r_string_sample_groupings(groups, samples_by_group)
    samples_by_group_r = get_r_string_samples_by_sample_group(
        groups, samples_by_group)
    sample_colours_r = get_r_string_default_sample_colours(
        groups, samples_by_group)
    comparison_r = "c(\"" + workflow_parameter_dict["de_ID"] + "\")"

    # gets the list of chromosomes
    annotated_file = os.path.join(
        pr_dictionary["workflow_outpath"], "data", "de_annotated.csv")
    chromosomes = get_chromosome_list(annotated_file)

    # updates the pr_dictionary, one entry at a time and in a fixed order
    entries = (
        ("workflow_ID", workflow_parameter_dict["de_ID"]),
        ("de_p_threshold", workflow_parameter_dict["p_threshold"]),
        ("de_fold_threshold", workflow_parameter_dict["fold_threshold"]),
        ("de_numerator_group", workflow_parameter_dict["numerator_group"]),
        ("de_denominator_group",
         workflow_parameter_dict["denominator_group"]),
        ("de_file_path", workflow_parameter_dict["de_file_path"]),
        ("differential_expression_set_size",
         workflow_parameter_dict["differential_expression_set_size"]),
        ("order_list", groups),
        ("samples_ordered", ordered_samples),
        ("comparisons_r_string", comparison_r),
        ("samples_by_sample_groups", samples_by_group),
        ("sample_groups_r_string", groups_r),
        ("samples_r_string", samples_r),
        ("sample_groupings_r_string", groupings_r),
        ("samples_by_sample_group_r_string", samples_by_group_r),
        ("default_samples_colours_r_string", sample_colours_r),
        ("chromosome_list", chromosomes),
    )
    for entry_name, entry_value in entries:
        pr_dictionary[entry_name] = entry_value

    # gets the hypergeometric gene set types
    if global_variables["ora_flag"]:
        store_parameter_lists(
            global_variables["ora_parameters"],
            (
                ("type", "hypergeom_gene_set_types"),
                ("min_set_size", "hypergeom_gene_set_min_set_sizes"),
                ("max_set_size", "hypergeom_gene_set_max_set_sizes"),
                ("p_threshold", "hypergeom_gene_set_p_thresholds"),
                ("fold_threshold", "hypergeom_gene_set_fold_thresholds"),
                ("network_overlap_ratio",
                 "hypergeom_gene_set_network_overlap_ratios"),
            ))

    # gets the ipa ureg types
    if global_variables["ura_flag"]:
        store_parameter_lists(
            global_variables["ura_parameters"],
            (
                ("type", "ura_types"),
                ("min_set_size", "ura_min_set_sizes"),
                ("max_set_size", "ura_max_set_sizes"),
                ("zscore_threshold", "ura_zscore_thresholds"),
                ("p_threshold", "ura_p_thresholds"),
                ("fold_threshold", "ura_fold_thresholds"),
                ("network_overlap_ratio", "ura_overlap_ratios"),
            ))

    return pr_dictionary