# These main() entry points are assumed to be excerpted from separate CLI
# modules of the same package. Each module is expected to provide the helpers
# referenced below (add_global_args, read_config_file, init_logger,
# process_files, align, update, figures, load_params_from_file, write_output,
# run_internal) and a module-level `logger`.
import argparse
import os
import time

import numpy as np
import pandas as pd


def main():
  # load command-line args
  parser = argparse.ArgumentParser()
  add_global_args(parser)
  args = parser.parse_args()

  # load config file
  # this function also creates the output folder
  config = read_config_file(args)

  # initialize logger
  init_logger(config['verbose'], os.path.join(config['output'], 'figures.log'))

  # load the first input file and expand home (~) and environment variables
  f = config['input'][0]
  f = os.path.expanduser(f)
  f = os.path.expandvars(f)

  # read in the input file
  df = pd.read_csv(f, sep='\t', low_memory=False)
  params = load_params_from_file(config['params_folder'])

  # TODO: check that the columns we need are present
  # (see the sketch below this function)

  # generate figures
  figures(df, config=config, params=params)
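# A minimal sketch of the column check flagged in the TODO above. The helper
# name (_check_required_columns) and the exact set of required columns are
# assumptions for illustration; the real set depends on the package's config
# schema.
def _check_required_columns(df, config):
  # assume the figure code needs at least the retention-time and PEP columns
  # named in the config's col_names mapping
  required = [
    config['col_names']['retention_time'],
    config['col_names']['pep'],
  ]
  missing = [col for col in required if col not in df.columns]
  if len(missing) > 0:
    raise ValueError('Input file is missing required column(s): {}'.format(
      ', '.join(missing)))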
def main():
  # load command-line args
  parser = argparse.ArgumentParser()
  add_global_args(parser)
  args = parser.parse_args()

  # load config file
  # this function also creates the output folder
  config = read_config_file(args)

  # initialize logger
  init_logger(config['verbose'], os.path.join(config['output'], 'converter.log'),
              config['log_file'])

  # process all input files (converts and filters)
  df, df_original = process_files(config)

  #logger.info('{} / {} ({:.2%}) observations pass filters and will be used for alignment'.format(
  #  np.sum(~df['exclude']), df_original.shape[0],
  #  np.sum(~df['exclude']) / df_original.shape[0]))

  # write to file
  if config['save_combined_output']:
    # if combining input files, then write to one combined file
    out_path = os.path.join(config['output'], config['combined_output_name'])
    logger.info('Combining input file(s) and writing adjusted data file to {} ...'
                .format(out_path))
    df.to_csv(out_path, sep='\t', index=False)

  if config['save_separate_output']:
    # if keeping input files separate, then use 'input_id' to retain the
    # order in which the input files were passed in
    logger.info('Saving output to separate files...')
    for i, f in enumerate(config['input']):
      out_path = os.path.join(
        config['output'],
        os.path.splitext(os.path.basename(f))[0] +
        config['output_suffix'] + '_' + str(i) + '.txt')
      logger.info('Saving input file {} to {}'.format(i, out_path))
      df_a = df.loc[df['input_id'] == i]
      df_a.to_csv(out_path, sep='\t', index=False)
def main():
  # load command-line args
  parser = argparse.ArgumentParser()
  add_global_args(parser)
  args = parser.parse_args()

  # load config file
  # this function also creates the output folder
  config = read_config_file(args)

  # initialize logger
  init_logger(config['verbose'], os.path.join(config['output'], 'align.log'),
              config['log_file'])

  logger.info('Converting files and filtering PSMs')
  df, df_original = process_files(config)
  logger.info('Finished converting files and filtering PSMs.')

  logger.info('Beginning alignment procedure')
  align(df, config)
  logger.info('Alignment procedure finished')
def main():
  start = time.time()

  # load command-line args
  parser = argparse.ArgumentParser()
  add_global_args(parser)
  args = parser.parse_args()

  # load config file
  # this function also creates the output folder
  config = read_config_file(args)

  # initialize logger
  init_logger(config['verbose'], os.path.join(config['output'], 'dart.log'),
              config['log_file'])

  logger.info('Converting files and filtering PSMs')
  df, df_original = process_files(config)
  logger.info('Finished converting files and filtering PSMs.')

  # load params, either from a defined folder or from running the alignment
  params = {}
  if 'params_folder' in config and isinstance(config['params_folder'], str):
    params = load_params_from_file(config['params_folder'])
  else:
    logger.info('Beginning alignment procedure')
    params = align(df, config)
    logger.info('Alignment procedure finished')

  # now that we have the params, run the update
  logger.info('Updating PEPs with alignment data...')
  df_new = update(df, params, config)

  # save the sparse combined input file?
  #df_new.to_csv(os.path.join(args.output, 'df_converted.txt'), sep='\t', index=False)

  # add new columns to the original DF, and remove the duplicate ID column
  logger.info('Concatenating results to original data...')
  df_adjusted = pd.concat([
    df_original.loc[~df_original['remove']].reset_index(drop=True),
    df_new.drop(['id', 'input_id'], axis=1).reset_index(drop=True)], axis=1)

  # add back rows of PSMs originally removed from the analysis
  if np.sum(df_original['remove']) > 0:
    logger.info('Reattaching {} PSMs excluded from initial filters'.format(
      df_original['remove'].sum()))
    # store a copy of the columns and their order for later
    df_cols = df_adjusted.columns
    # concatenate data frames
    df_adjusted = pd.concat([
      df_adjusted,
      df_original.loc[df_original['remove']]],
      axis=0, ignore_index=True, sort=True)
    # pd.concat reorders the columns,
    # so just order them back to what they used to be
    df_adjusted = df_adjusted.reindex(df_cols, axis=1)

  # sort by ID, and reset the index
  df_adjusted = df_adjusted.sort_values(['id'])
  df_adjusted = df_adjusted.reset_index(drop=True)

  # add residual RT (alignment error) column
  df_adjusted['residual'] = np.abs(
    df_adjusted[config['col_names']['retention_time']] - df_adjusted['muij'])

  # add dart_PEP column - which is pep_new, with the NaNs filled in
  # with the old PEPs
  df_adjusted['dart_PEP'] = df_adjusted['pep_new']
  null_new = pd.isnull(df_adjusted['pep_new'])
  df_adjusted.loc[null_new, 'dart_PEP'] = \
    df_adjusted.loc[null_new, config['col_names']['pep']]
  # make sure that the updated PEP does not exceed 1
  df_adjusted.loc[df_adjusted['dart_PEP'] > 1, 'dart_PEP'] = 1

  # add q-value (FDR) column
  # the rank-sorted, cumulative sum of PEPs is the expected number of false
  # positives; the q-value is that vector divided by the number of
  # observations at each threshold, i.e., the FDR
  logger.info('Calculating FDR (q-values)')

  # q-value, without fixing the # of false positives to a discrete number
  #df_adjusted['q-value'] = \
  #  (np.cumsum(df_adjusted['dart_PEP'][np.argsort(df_adjusted['dart_PEP'])]) /
  #   np.arange(1, df_adjusted.shape[0] + 1)
  #  )[np.argsort(np.argsort(df_adjusted['dart_PEP']))]

  # q-value, by fixing the # of false positives to a discrete number.
  # for now, set all null PEPs to 1;
  # we'll remember the index and set them back to NaN later
  null_peps = pd.isnull(df_adjusted['dart_PEP'])
  if null_peps.sum() > 0:
    df_adjusted.loc[null_peps, 'dart_PEP'] = 1

  # get the index order of sorted PEPs
  pep_order = np.argsort(df_adjusted['dart_PEP'])

  # take the ceiling of the cumulative sum of the sorted PEPs to get the
  # pessimistic estimate of the number of false positives when selecting at
  # that level. because of the ceiling, PSMs with different PEPs but within
  # the same unit interval will get the same "num_fp" value.
  num_fp = np.ceil(np.cumsum(df_adjusted['dart_PEP'][pep_order])).astype(int)

  # count the occurrences of each num_fp value and take the cumulative sum to
  # get the sample size at each discrete false-positive threshold
  fp_counts = np.cumsum(num_fp.value_counts().sort_index()).values

  # divide the # of false positives by the sample size to get the q-value.
  # sorting the index order brings the values back to their original order
  df_adjusted['dart_qval'] = \
    (num_fp / fp_counts[num_fp - 1]).values[np.argsort(pep_order.values)]

  # set null PEPs and q-values back to NaN
  if null_peps.sum() > 0:
    df_adjusted.loc[null_peps, 'dart_PEP'] = np.nan
    df_adjusted.loc[null_peps, 'dart_qval'] = np.nan

  # invert the 'remove' column into a 'participated' column, which indicates
  # whether or not the PSM participated in the DART-ID alignment and update
  df_adjusted['participated'] = ~df_adjusted['remove']

  ## Run protein inference (fido)?
  if 'run_pi' in config and config['run_pi'] is True:
    logger.info('Running protein inference with Fido...')

    # build fido options into a dict (parameter_map)
    parameter_map = {
      'gamma': config['pi_gamma'] if 'pi_gamma' in config else None,
      'alpha': config['pi_alpha'] if 'pi_alpha' in config else None,
      'beta': config['pi_beta'] if 'pi_beta' in config else None,
      'connected_protein_threshold': config['pi_connected_protein_thresh'],
      'omit_clean_peptide_name': not config['pi_clean_peptide_name'],
      'all_psms': config['pi_use_all_psms'],
      'group_proteins': config['pi_group_proteins'],
      'prune_low_scores': config['pi_prune_low_scores'],
      'parameter_accuracy': config['pi_parameter_accuracy'],
      'proteins_column': config['col_names']['proteins'],
      'protein_delimiter': config['pi_protein_delimiter'],
      'leading_protein_column': config['col_names']['leading_protein'],
      'decoy_tag': config['pi_decoy_tag'],
      'sequence_column': config['col_names']['sequence'],
      #'error_prob_column': config['col_names']['pep'],
      'error_prob_column': 'dart_PEP',
      # pass in the output folder so fido can save some intermediate
      # and output files
      'output': config['output']
    }

    logger.debug('parameter_map for fido:')
    logger.debug(str(parameter_map))

    # run fido subroutine
    df_adjusted = run_internal(df_adjusted, parameter_map)

    logger.info('Fido finished')
    logger.info('FDR for PSM\'s razor protein, from protein inference, '
                'placed in "razor_protein_fdr" column')

  # print figures?
  if config['print_figures']:
    figures(df_adjusted, config, params)

  # overwrite PEP?
  # if true, then store the old PEP in the "Spectra PEP" column,
  # and put the DART PEP in the "PEP" column.
  # then drop the pep_new and dart_PEP columns
  if config['overwrite_pep']:
    logger.info('Overwriting PEP column with new PEP. '
                'Saving old PEP in "Spectra PEP" column.')
    df_adjusted['Spectra PEP'] = df_adjusted[config['col_names']['pep']]
    df_adjusted[config['col_names']['pep']] = df_adjusted['dart_PEP']
    df_adjusted = df_adjusted.drop(['pep_new', 'dart_PEP'], axis=1)

  # tell the user whether or not to expect diagnostic columns
  if config['add_diagnostic_cols']:
    logger.info('Adding diagnostic columns to output')

  # write to file
  if config['save_combined_output']:
    # if combining input files, then write to one combined file
    out_path = os.path.join(config['output'], config['combined_output_name'])
    logger.info('Combining input file(s) and writing adjusted data file to {} ...'
                .format(out_path))
    write_output(df_adjusted, out_path, config)

  if config['save_separate_output']:
    # if keeping input files separate, then use 'input_id' to retain the
    # order in which the input files were passed in
    logger.info('Saving output to separate files...')
    for i, f in enumerate(config['input']):
      # get the output extension.
      # default to the same extension as the input;
      # if one exists in the config file, use that instead
      out_ext = os.path.splitext(os.path.basename(f))[1]
      if config['output_ext'] is not None:
        out_ext = config['output_ext']

      # construct the output path based on which input file it was
      out_path = os.path.join(
        config['output'],
        os.path.splitext(os.path.basename(f))[0] +
        config['output_suffix'] + '_' + str(i) + out_ext)

      # if saving back to the original input folder,
      # then base the output file name on the input file name instead.
      # no need to number these.
      if config['save_in_input_folder']:
        out_path = os.path.join(
          os.path.dirname(f),
          os.path.splitext(os.path.basename(f))[0] +
          config['output_suffix'] + out_ext)

      logger.info('Saving input file {} to {}'.format(i, out_path))
      df_a = df_adjusted.loc[df_adjusted['input_id'] == i]
      # save to file.
      # other data formats might have a different separator, or an index column
      write_output(df_a, out_path, config)

  print('Done! Process took {:.3f} seconds'.format(time.time() - start))
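# A self-contained toy run of the discrete-false-positive q-value computation
# from main() above, so the argsort/cumsum indexing can be checked in
# isolation. The PEP values are made up for illustration, and the helper name
# (_demo_dart_qval) is hypothetical.
def _demo_dart_qval():
  peps = pd.Series([0.01, 0.5, 0.05, 0.9, 0.02])
  # index order of sorted PEPs
  pep_order = np.argsort(peps)
  # pessimistic (ceiling) estimate of false positives accumulated at each rank
  num_fp = np.ceil(np.cumsum(peps[pep_order])).astype(int)
  # sample size at each discrete false-positive threshold
  fp_counts = np.cumsum(num_fp.value_counts().sort_index()).values
  # q-value = # false positives / sample size, mapped back to the input order
  qvals = (num_fp / fp_counts[num_fp - 1]).values[np.argsort(pep_order.values)]
  # expected: [0.25, 0.25, 0.25, 0.4, 0.25] -- the PSM with PEP 0.9 is the
  # only one that pushes the expected false-positive count up to 2
  print(qvals)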