Code example #1
File: figures.py  Project: mofhu/DART-ID
def main():
    # load command-line args
    parser = argparse.ArgumentParser()
    add_global_args(parser)
    args = parser.parse_args()

    # load config file
    # this function also creates the output folder
    config = read_config_file(args)

    # initialize logger
    init_logger(config['verbose'], os.path.join(config['output'],
                                                'figures.log'))

    # load first input file and replace home and user vars
    f = config['input'][0]
    f = os.path.expanduser(f)
    f = os.path.expandvars(f)

    # read in input files
    df = pd.read_csv(f, sep='\t', low_memory=False)
    params = load_params_from_file(config['params_folder'])

    # TODO: check that the columns we need are present

    # generate figures
    figures(df, config=config, params=params)
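
Example #1 expands the home directory ('~') and any environment variables in the first input path before reading it with pandas. A minimal standalone sketch of just that expansion step, with a made-up path for illustration:

import os

# '~' becomes the user's home directory; '$DATA_DIR' is resolved from the environment
f = '~/$DATA_DIR/evidence.txt'   # hypothetical input path, for illustration only
f = os.path.expanduser(f)
f = os.path.expandvars(f)
print(f)  # e.g. /home/user/data/evidence.txt when DATA_DIR=data

Unset environment variables are left untouched by os.path.expandvars, so a bad path surfaces later when pandas tries to open the file.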
Code example #2
def main():
    # load command-line args
    parser = argparse.ArgumentParser()
    add_global_args(parser)
    args = parser.parse_args()

    # load config file
    # this function also creates the output folder
    config = read_config_file(args)

    # initialize logger
    init_logger(config['verbose'],
                os.path.join(config['output'], 'converter.log'),
                config['log_file'])

    # process all input files (converts and filters)
    df, df_original = process_files(config)

    #logger.info('{} / {} ({:.2%}) observations pass filters and will be used for alignment'.format(np.sum(~df['exclude']),
    #    df_original.shape[0], np.sum(~df['exclude']) / df_original.shape[0]))

    # write to file
    if config['save_combined_output']:
        # if combining input files, then write to one combined file
        out_path = os.path.join(config['output'],
                                config['combined_output_name'])
        logger.info(
            'Combining input file(s) and writing adjusted data file to {} ...'.
            format(out_path))
        df.to_csv(out_path, sep='\t', index=False)

    if config['save_separate_output']:
        # if keeping input files separate, then use 'input_id' to retain the
        # order in which the input files were passed in
        logger.info('Saving output to separate files...')
        for i, f in enumerate(config['input']):
            out_path = os.path.join(
                config['output'],
                (os.path.splitext(os.path.basename(f))[0] +
                 config['output_suffix'] + '_' + str(i) + '.txt'))
            logger.info('Saving input file {} to {}'.format(i, out_path))
            df_a = df.loc[df['input_id'] == i]
            df_a.to_csv(out_path, sep='\t', index=False)
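
The separate-output loop above selects rows for each input file with a boolean mask on the 'input_id' column. An equivalent sketch using pandas groupby, assuming the same 'input_id' column and a hypothetical output folder:

import os
import pandas as pd

# hypothetical combined table with an 'input_id' column, standing in for process_files() output
df = pd.DataFrame({'input_id': [0, 0, 1], 'pep': [0.01, 0.20, 0.05]})
out_dir = '/tmp/dart_out'  # hypothetical output folder
os.makedirs(out_dir, exist_ok=True)

# one output file per original input file
for i, df_a in df.groupby('input_id'):
    out_path = os.path.join(out_dir, 'input_{}.txt'.format(i))
    df_a.to_csv(out_path, sep='\t', index=False)

Because the integer ids are assigned in input order, groupby's sorted iteration yields the same file order as the enumerate loop in the example.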
Code example #3
File: align.py  Project: mofhu/DART-ID
def main():
  # load command-line args
  parser = argparse.ArgumentParser()  
  add_global_args(parser)
  args = parser.parse_args()

  # load config file
  # this function also creates the output folder
  config = read_config_file(args)

  # initialize logger
  init_logger(config['verbose'], os.path.join(config['output'], 'align.log'), config['log_file'])

  logger.info('Converting files and filtering PSMs')
  df, df_original = process_files(config)
  logger.info('Finished converting files and filtering PSMs.')

  logger.info('Beginning alignment procedure')
  align(df, config)
  logger.info('Alignment procedure finished')
Code example #4
File: update.py  Project: mofhu/DART-ID
def main():
    start = time.time()

    # load command-line args
    parser = argparse.ArgumentParser()
    add_global_args(parser)
    args = parser.parse_args()

    # load config file
    # this function also creates the output folder
    config = read_config_file(args)

    # initialize logger
    init_logger(config['verbose'], os.path.join(config['output'], 'dart.log'),
                config['log_file'])

    logger.info('Converting files and filtering PSMs')
    df, df_original = process_files(config)
    logger.info('Finished converting files and filtering PSMs.')

    # load params, either from a defined folder or from running the alignment
    params = {}
    if 'params_folder' in config and type(config['params_folder']) is str:
        params = load_params_from_file(config['params_folder'])
    else:
        logger.info('Beginning alignment procedure')
        params = align(df, config)
        logger.info('Alignment procedure finished')

    # now we have the params, run the update
    logger.info('Updating PEPs with alignment data...')
    df_new = update(df, params, config)

    # save the sparse combined input file?
    #df_new.to_csv(os.path.join(args.output, 'df_converted.txt'), sep='\t', index=False)

    # add new columns to original DF, and remove the duplicate ID column
    logger.info('Concatenating results to original data...')
    df_adjusted = pd.concat([ \
      df_original.loc[~df_original['remove']].reset_index(drop=True), \
      df_new.drop(['id', 'input_id'], axis=1).reset_index(drop=True)], axis=1)

    # add rows of PSMs originally removed from analysis
    if np.sum(df_original['remove']) > 0:
        logger.info('Reattaching {} PSMs excluded from initial filters'.format(
            df_original['remove'].sum()))
        # store a copy of the columns and their order for later
        df_cols = df_adjusted.columns
        # concatenate data frames
        df_adjusted = pd.concat([ \
          df_adjusted, \
          df_original.loc[df_original['remove']]],
          axis=0, ignore_index=True, sort=True)
        # pd.concat reindexes the order of the columns,
        # so just order it back to what it used to be
        df_adjusted = df_adjusted.reindex(df_cols, axis=1)

    # sort by ID, and reset index
    df_adjusted = df_adjusted.sort_values(['id'])
    df_adjusted = df_adjusted.reset_index(drop=True)

    # add residual RT (alignment error) column
    df_adjusted['residual'] = np.abs(\
      df_adjusted[config['col_names']['retention_time']] - df_adjusted['muij'])

    # add dart_PEP column - which is pep_new, with the NaNs filled in
    # with the old PEPs.
    df_adjusted['dart_PEP'] = df_adjusted['pep_new']
    df_adjusted['dart_PEP'][pd.isnull(df_adjusted['pep_new'])] = \
      df_adjusted[config['col_names']['pep']][pd.isnull(df_adjusted['pep_new'])]
    # make sure that updated PEP does not exceed 1
    df_adjusted['dart_PEP'][df_adjusted['dart_PEP'] > 1] = 1

    # add q-value (FDR) column
    # rank-sorted, cumulative sum of PEPs is expected number of false positives
    # q-value is just that vector divided by # of observations, to get FDR
    logger.info('Calculating FDR (q-values)')

    # q-value, without fixing # of false positives to a discrete number
    #df_adjusted['q-value'] = \
    #  ( \
    #    np.cumsum(df_adjusted['dart_PEP'][np.argsort(df_adjusted['dart_PEP'])]) / \
    #    np.arange(1, df_adjusted.shape[0]+1) \
    #  )[np.argsort(np.argsort(df_adjusted['dart_PEP']))]

    # q-value, by fixing # of false positives to a discrete number
    # for now, set all null PEPs to 1. we'll remember the index and set them back to nan later
    null_peps = pd.isnull(df_adjusted['dart_PEP'])
    if null_peps.sum() > 0:
        df_adjusted['dart_PEP'][null_peps] = 1

    # get the index order of sorted PEPs
    pep_order = np.argsort(df_adjusted['dart_PEP'])
    # Take the ceiling of the cumulative sum of the sorted PEPs to get the pessimistic
    # estimate of the number of false positives when selecting at that level.
    # because using ceiling, PSMs with different PEPs but within the same relative interval
    # will get the same "num_fp" value.
    num_fp = np.ceil(np.cumsum(df_adjusted['dart_PEP'][pep_order])).astype(int)
    # count the number of occurrences of num_fp and sum them up to get the sample size for each
    # discrete false positive # threshold
    fp_counts = np.cumsum(num_fp.value_counts().sort_index()).values
    # divide # of false positives by sample size to get q-value. sorting the index order brings
    # the order of values back to their original form
    df_adjusted['dart_qval'] = (num_fp /
                                fp_counts[num_fp - 1]).values[np.argsort(
                                    pep_order.values)]

    # set null PEPs and q-values back to nan
    if null_peps.sum() > 0:
        df_adjusted['dart_PEP'][null_peps] = np.nan
        df_adjusted['dart_qval'][null_peps] = np.nan

    # add a 'participated' column - the inverse of 'remove' - indicating whether or not
    # the PSM participated in the DART-ID alignment and update
    df_adjusted['participated'] = ~df_adjusted['remove']

    ## Run protein inference (fido)?
    if 'run_pi' in config and config['run_pi'] is True:
        logger.info('Running protein inference with Fido...')

        # build fido options into a dict (parameter_map)
        parameter_map = {
            'gamma': config['pi_gamma'] if 'pi_gamma' in config else None,
            'alpha': config['pi_alpha'] if 'pi_alpha' in config else None,
            'beta': config['pi_beta'] if 'pi_beta' in config else None,
            'connected_protein_threshold':
            config['pi_connected_protein_thresh'],
            'omit_clean_peptide_name': ~config['pi_clean_peptide_name'],
            'all_psms': config['pi_use_all_psms'],
            'group_proteins': config['pi_group_proteins'],
            'prune_low_scores': config['pi_prune_low_scores'],
            'parameter_accuracy': config['pi_parameter_accuracy'],
            'proteins_column': config['col_names']['proteins'],
            'protein_delimiter': config['pi_protein_delimiter'],
            'leading_protein_column': config['col_names']['leading_protein'],
            'decoy_tag': config['pi_decoy_tag'],
            'sequence_column': config['col_names']['sequence'],
            #'error_prob_column':       config['col_names']['pep']
            'error_prob_column': 'dart_PEP',

            # pass in output folder so fido can save some intermediate and output files
            'output': config['output']
        }
        logger.debug('parameter_map for fido:')
        logger.debug(str(parameter_map))

        # run fido subroutine
        df_adjusted = run_internal(df_adjusted, parameter_map)

        logger.info('Fido finished')
        logger.info(
            'FDR for PSM\'s razor protein, from protein inference, placed in \"razor_protein_fdr\" column'
        )

    # print figures?
    if config['print_figures']:
        figures(df_adjusted, config, params)

    # overwrite PEP?
    # if true, then store old PEP in "Spectra PEP" column,
    # and put the dart PEP in "PEP" column.
    # then drop the pep_new and dart_PEP columns
    if config['overwrite_pep']:
        logger.info(
            'Overwriting PEP column with new PEP. Saving old PEP in \"Spectra PEP\" column.'
        )
        df_adjusted['Spectra PEP'] = df_adjusted[config['col_names']['pep']]
        df_adjusted[config['col_names']['pep']] = df_adjusted['dart_PEP']
        df_adjusted = df_adjusted.drop(['pep_new', 'dart_PEP'], axis=1)

    # tell the user whether or not to expect diagnostic columns
    if config['add_diagnostic_cols']:
        logger.info('Adding diagnostic columns to output')

    # write to file
    if config['save_combined_output']:
        # if combining input files, then write to one combined file
        out_path = os.path.join(config['output'],
                                config['combined_output_name'])
        logger.info(
            'Combining input file(s) and writing adjusted data file to {} ...'.
            format(out_path))
        write_output(df_adjusted, out_path, config)

    if config['save_separate_output']:
        # if keeping input files separate, then use 'input_id' to retain the
        # order in which the input files were passed in
        logger.info('Saving output to separate files...')
        for i, f in enumerate(config['input']):

            # get output extension
            # default to the same extension as the input
            # if one in the config file exists, use that instead
            out_ext = os.path.splitext(os.path.basename(f))[1]
            if config['output_ext'] is not None:
                out_ext = config['output_ext']

            # construct output path based on which input file it was
            out_path = os.path.join(config['output'],
              os.path.splitext(os.path.basename(f))[0] + \
              config['output_suffix'] + '_' + str(i) + out_ext)

            # if saving back to the original input folder,
            # then base the output file name on the input file name instead.
            # no need to number these.
            if config['save_in_input_folder']:
                out_path = os.path.join(os.path.dirname(f),
                  os.path.splitext(os.path.basename(f))[0] + \
                  config['output_suffix'] + out_ext)

            logger.info('Saving input file {} to {}'.format(i, out_path))
            df_a = df_adjusted.loc[df_adjusted['input_id'] == i]
            # save to file
            # other data formats might have a different separator, or have an index column
            write_output(df_a, out_path, config)

    print('Done! Process took {:.3f} seconds'.format(time.time() - start))
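
The q-value (FDR) block in example #4 is easier to follow on a small input. A standalone sketch of the same discrete-false-positive calculation on a toy PEP vector (the values here are made up for illustration):

import numpy as np
import pandas as pd

# toy PEP vector; NaN stands for a PSM without an updated PEP
pep = pd.Series([0.01, 0.30, 0.05, np.nan, 0.02])
null_peps = pep.isnull()
pep_filled = pep.fillna(1.0).values  # treat missing PEPs as 1 during the calculation

# sort PEPs ascending; the cumulative sum is the expected number of false positives,
# and the ceiling fixes it to a discrete (pessimistic) count
pep_order = np.argsort(pep_filled)
num_fp = np.ceil(np.cumsum(pep_filled[pep_order])).astype(int)

# cumulative count of PSMs at each discrete false-positive level = sample size
fp_counts = np.cumsum(pd.Series(num_fp).value_counts().sort_index()).values

# q-value = expected false positives / sample size, mapped back to the original row order
qval = num_fp / fp_counts[num_fp - 1]
qval = qval[np.argsort(pep_order)]
qval[null_peps.values] = np.nan
print(qval)  # [0.25 0.25 0.25  nan 0.25]

Because PEPs never exceed 1, the ceiling of the running sum increases by at most 1 per PSM, so the discrete false-positive counts form a contiguous range starting at 1 and can index fp_counts directly.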