def get_outfile_name(destination_dir, spreadsheet_file_name, transform_name, file_ext='tsv', timestamp=True):
    """ Construct a full-path output file name from the destination path,
    spreadsheet file name, transformation name and file extension.

    Args:
        destination_dir:       directory where the file will be written (must already exist)
        spreadsheet_file_name: usually the input file name (before transformation)
        transform_name:        the operation performed on the input file
        file_ext:              file extension without the period (default 'tsv')
        timestamp:             when True, embed a timestamp in the name via
                               kn.create_timestamped_filename

    Returns:
        spreadsheet_transformed_file_name: full path output file name
    """
    # keep only the base name: the input's directory and extension are discarded
    _, name_base = os.path.split(spreadsheet_file_name)
    name_base, _ = os.path.splitext(name_base)
    name_base = name_base + '_' + transform_name
    if timestamp:  # idiomatic truthiness test instead of "== True"
        name_base = kn.create_timestamped_filename(name_base, file_ext)
    else:
        name_base = name_base + '.' + file_ext
    return os.path.join(destination_dir, name_base)
def get_output_file_name(run_parameters, dir_name_key, prefix_string, suffix_string='', type_suffix='tsv'):
    """ Build the full directory / timestamped file name for writing.

    Args:
        run_parameters: dictionary with keys: dir_name_key, "method" and "correlation_measure"
        dir_name_key:   run_parameters dictionary key for the output directory
        prefix_string:  the first letters of the output file name
        suffix_string:  the last letters of the output file name before type_suffix
        type_suffix:    the file type extension (default 'tsv') without period character

    Returns:
        output_file_name: full file and directory name suitable for file writing
    """
    base_name = '_'.join([prefix_string, run_parameters['method'], run_parameters["correlation_measure"]])
    # timestamp goes between the descriptive prefix and the trailing suffix/extension
    stamped_path = kn.create_timestamped_filename(os.path.join(run_parameters[dir_name_key], base_name))
    return stamped_path + '_' + suffix_string + '.' + type_suffix
def phenotype_expander(run_parameters):
    """ Run phenotype expander on the whole dataframe of phenotype data
    (one-hot expansion of categorical columns) and save the results to a
    timestamped tsv file in run_parameters["results_directory"].

    Args:
        run_parameters: dictionary with keys 'phenotype_name_full_path',
                        'threshold' and 'results_directory'.
    """
    phenotype_df = kn.get_spreadsheet_df(run_parameters['phenotype_name_full_path'])
    # classify columns; presumably maps ColumnType -> list of single-column frames — TODO confirm
    output_dict = run_pre_processing_phenotype_expander(phenotype_df, run_parameters['threshold'])
    # accumulator aligned to the full phenotype index
    result_df = pd.DataFrame(index=phenotype_df.index)
    for key, df_list in output_dict.items():
        if key == ColumnType.CATEGORICAL:
            for item in df_list:
                # drop NA before enumerating categories so NA never becomes its own column
                col_df = phenotype_df.loc[:, item.columns[0]].dropna()
                uniq_array = np.unique(col_df.values)
                # one indicator column per unique value: "<col>_<value>"
                col_names = [item.columns[0] + '_' + str(i) for i in uniq_array]
                # cur_df: indicators over non-NA rows; cur_append_df: re-expanded to all rows (NA elsewhere)
                cur_df = pd.DataFrame(columns=col_names, index=col_df.index)
                cur_append_df = pd.DataFrame(columns=col_names, index=phenotype_df.index)
                for i, val in enumerate(uniq_array):
                    cur_df.loc[col_df == val, col_names[i]] = 1
                    cur_df.loc[col_df != val, col_names[i]] = 0
                cur_append_df.loc[cur_df.index, :] = cur_df
                result_df = pd.concat([result_df, cur_append_df], axis=1)
    file_name = kn.create_timestamped_filename("phenotype_expander_result", "tsv")
    file_path = os.path.join(run_parameters["results_directory"], file_name)
    result_df.index.name = "sample_id"
    # rows that were NA in the source column are written as 'NA'
    result_df.to_csv(file_path, header=True, index=True, sep='\t', na_rep='NA')
def clustering_evaluation(run_parameters):
    """ Run clustering evaluation on the whole dataframe of phenotype data and
    save the results (one row per phenotype, transposed at write time) to a
    timestamped tsv file in run_parameters["results_directory"].

    Args:
        run_parameters: dictionary; must allow combine_phenotype_data_and_clustering
                        and contain keys 'threshold' and 'results_directory'.
    """
    cluster_phenotype_df = combine_phenotype_data_and_clustering(run_parameters)
    # output_dict groups per-phenotype frames by column type; fail_df collects failures
    output_dict, fail_df = run_post_processing_phenotype_clustering_data(cluster_phenotype_df, run_parameters['threshold'])
    # fixed report schema: each evaluated phenotype becomes one column over these rows
    result_df = pd.DataFrame(index=['Measure', 'Trait_length_after_dropna', \
        'Sample_number_after_dropna', 'chi/fval', 'pval', 'SUCCESS/FAIL', 'Comments'])
    for key, df_list in output_dict.items():
        if key == ColumnType.CATEGORICAL:
            # categorical traits: chi-square test (project wrapper, not scipy directly)
            for item in df_list:
                phenotype_name = item.columns.values[1]
                result_df[phenotype_name] = chisquare(item)
        else:
            # continuous traits: one-way ANOVA (project wrapper)
            for item in df_list:
                phenotype_name = item.columns.values[1]
                result_df[phenotype_name] = f_oneway(item)
    file_name = kn.create_timestamped_filename("clustering_evaluation_result", "tsv")
    file_path = os.path.join(run_parameters["results_directory"], file_name)
    # append the failed phenotypes so the report covers every input trait
    result_df = pd.concat([result_df, fail_df], axis=1)
    # transpose: phenotypes as rows, the fixed measures as columns
    result_df.T.to_csv(file_path, header=True, index=True, sep='\t', na_rep='NA')
def write_predict_data(predict_df, run_parameters):
    ''' Save predict data into a two-column tsv file with a timestamped name.

    Args:
        predict_df:     dataframe of prediction results; the first column contains
                        response names and the second the corresponding predicted values
        run_parameters: dictionary with keys 'test_spreadsheet_name_full_path',
                        'results_directory' and 'method'
    '''
    base_name = os.path.basename(run_parameters['test_spreadsheet_name_full_path'])
    base_name = os.path.splitext(base_name)[0] + '_' + run_parameters['method']
    target_path = os.path.join(run_parameters['results_directory'], base_name)
    target_path = kn.create_timestamped_filename(target_path) + '.tsv'
    predict_df.to_csv(target_path, sep='\t', header=True, index=True, float_format='%g')
def save_timestamped_df(input_df, results_dir, output_file_name):
    """ Save dataframe to a file with a timestamped name.

    Args:
        input_df: dataframe to write.
        results_dir: directory to save outputs.
        output_file_name: base file name (timestamp and extension are appended).
    """
    # NOTE(review): extension "df" looks deliberate for kn.save_df outputs — confirm
    file_name = kn.create_timestamped_filename(output_file_name, "df")
    kn.save_df(input_df, results_dir, file_name)
def write_predict_data(predict_df, run_parameters):
    """ Write the prediction dataframe to a timestamped tsv file named after the
    test spreadsheet, inside the results directory. """
    source_tail = os.path.split(run_parameters['test_spreadsheet_name_full_path'])[1]
    file_stem = os.path.splitext(source_tail)[0]
    out_path = os.path.join(run_parameters['results_directory'], file_stem)
    out_path = kn.create_timestamped_filename(out_path) + '.tsv'
    predict_df.to_csv(out_path, sep='\t', header=True, index=True)
def write_predict_data(predict_df, run_parameters):
    """ Save the prediction dataframe as
    <results_directory>/<test spreadsheet stem>_<method>_<timestamp>.tsv. """
    source_path = run_parameters['test_spreadsheet_name_full_path']
    file_stem = os.path.splitext(os.path.split(source_path)[1])[0]
    file_stem = '_'.join([file_stem, run_parameters['method']])
    out_name = os.path.join(run_parameters['results_directory'], file_stem)
    out_name = kn.create_timestamped_filename(out_name) + '.tsv'
    predict_df.to_csv(out_name, sep='\t', header=True, index=True, float_format='%g')
def save_cosine_matrix_df(cosine_matrix_df, run_parameters):
    """ Save the cosine matrix dataframe to a timestamped output file.

    Args:
        cosine_matrix_df: dataframe with cosine values.
        run_parameters:   parameters dictionary with key 'results_directory'.
    """
    out_path = os.path.join(run_parameters['results_directory'],
                            kn.create_timestamped_filename("cosine_matrix", "df"))
    cosine_matrix_df.to_csv(out_path, header=True, index=True, sep='\t')
def test_create_timestamped_filename(self):
    """ Assert that the leading char string remains unchanged and that the
    extension lands at the end, both with the default precision (None) and
    with an explicit precision. """
    n_digits = 29
    name_base = 'test_string'
    name_extension = 'wie'
    for precision in (None, 1e-15):
        tsfn = kn.create_timestamped_filename(name_base, name_extension, precision, n_digits)
        self.assertEqual(name_base, tsfn[0:11], msg='prefix name exception')
        n_chars = len(tsfn)
        self.assertEqual(name_extension, tsfn[n_chars - 3:n_chars], msg='extension name exception')
def get_output_file_name(run_parameters, prefix_string, suffix_string='', type_suffix='tsv'):
    """ Get the full directory / timestamped file name for writing.

    Args:
        run_parameters: dictionary with keys: "results_directory", "method"
                        and "similarity_measure"
        prefix_string:  the first letters of the output file name
        suffix_string:  the last letters of the output file name before '.' + type_suffix
        type_suffix:    file type extension (default 'tsv') without the period

    Returns:
        output_file_name: full file and directory name suitable for file writing
    """
    # docstring previously claimed "correlation_measure"; the code reads 'similarity_measure'
    results_directory = run_parameters["results_directory"]
    method = run_parameters['method']
    similarity_measure = run_parameters['similarity_measure']
    output_file_name = os.path.join(results_directory,
                                    prefix_string + '_' + method + '_' + similarity_measure)
    output_file_name = kn.create_timestamped_filename(output_file_name) + '_' + suffix_string + '.' + type_suffix
    return output_file_name
def map_and_save_droplist(spreadsheet_df, gene_names, droplist_name, run_parameters):
    """ Map dropped gene names through the gene-names map file and save them to a
    timestamped tsv file in the results directory.

    Args:
        spreadsheet_df: user supplied spreadsheet dataframe.
        gene_names:     list of genes.
        droplist_name:  name of droplist file (also used as the output column name).
        run_parameters: dictionary with keys "gene_names_map" and "results_directory".
    """
    gene_names_map = run_parameters["gene_names_map"]
    results_directory = run_parameters['results_directory']
    # names present in the spreadsheet but not in gene_names
    droplist = kn.find_dropped_node_names(spreadsheet_df, gene_names)
    # mapping file: first column is the index (original name), second the mapped name
    map_df = pd.read_csv(gene_names_map, index_col=0, header=None, sep='\t')
    new_droplist_df = pd.DataFrame(map_df.loc[droplist].values, columns=[droplist_name])
    file_name = kn.create_timestamped_filename(droplist_name, "tsv")
    kn.save_df(new_droplist_df, results_directory, file_name)