Example #1
def filter_uniprot_exclusion_list(df, config, _filter):
    """
  Filter proteins from exclusion list using UniProt IDs
  """

    exclusion_list = []

    # parse exclusion list
    # if exclusion_list param is a path, then load the IDs from that path
    if 'file' in _filter and _filter['file'] is not None:
        # load UniProt IDs from file line-by-line
        # first expand user or any vars
        _filter['file'] = os.path.expanduser(_filter['file'])
        _filter['file'] = os.path.expandvars(_filter['file'])

        # open the exclusion list file and read in the UniProt IDs, line by line
        try:
            with open(_filter['file'], 'r') as f:
                logger.info(
                    'Loading UniProt IDs from exclusion list file {} ...'.
                    format(_filter['file']))
                exclusion_list = [line.rstrip('\n') for line in f]
                logger.info('Loaded {} proteins from exclusion list.'.format(
                    len(exclusion_list)))
        except OSError:
            raise ConfigFileError(
                'Exclusion list file {} not found. Please provide a path to a file with one UniProt ID per line.'
                .format(_filter['file']))

    elif 'list' in _filter and len(_filter['list']) > 0:
        # load UniProt IDs from the configuration file
        exclusion_list = _filter['list']
        logger.info(
            'Loading {} UniProt IDs from exclusion list as defined in config file'
            .format(len(exclusion_list)))
    else:
        raise ConfigFileError(
            'No exclusion list file or list of UniProt IDs provided. Please provide a path to a file with one UniProt ID per line with the \"file\" key, or provide a python list of UniProt IDs with the \"list\" key. If not using a UniProt ID exclusion list, comment out the \"uniprot_exclusion\" key from the filter list.'
        )

    # filter exclusion list
    if len(exclusion_list) > 0:
        logger.info(
            'UniProt IDs from exclusion list: {}'.format(exclusion_list))

        # we could only match the excluded IDs to the razor protein,
        # but we can be more strict and match the blacklisted IDs to the entire protein
        # string, containing all possible proteins
        pat = '|'.join(exclusion_list)
        blacklist_filter = df['proteins'].str.contains(pat)
        blacklist_filter = blacklist_filter.fillna(False)

        logger.info('Filtering out {} PSMs from the exclusion list'.format(
            np.sum(blacklist_filter)))
        return blacklist_filter
    else:
        raise ConfigFileError(
            'Exclusion list found and loaded, but no UniProt IDs found. Check the format of the file, or the list in the config file.'
        )
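
For context, a minimal sketch of how this filter might be invoked. The `_filter` dict shape (`file`/`list` keys) and the `proteins` column come from the function above; the sample data and the empty `config` are illustrative only:

import pandas as pd

# hypothetical input: 'proteins' holds all candidate protein IDs per PSM
df = pd.DataFrame({'proteins': ['P12345;Q67890', 'A0A000', None]})

# inline exclusion list, as read by the 'list' branch above
_filter = {'list': ['P12345']}

mask = filter_uniprot_exclusion_list(df, config={}, _filter=_filter)
# mask is a boolean Series; True marks PSMs to filter out (here: the first row)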
Example #2
def convert(df, config):

    cols = []
    col_names = []

    # loop thru all columns listed in the config file
    for col in list(config['col_names'].keys()):
        if config['col_names'][col] is None:
            logger.debug(
                'Column \"{}\" is left empty in the config file. Skipping...'.
                format(col))
            continue

        # check if the column specified in the config file exists in the df or not
        if config['col_names'][col] not in df.columns:
            # this is probably grounds to kill the program
            raise ConfigFileError(
                'Config field \"{}\" specifies column \"{}\", which was not found in the input file. Please check that this column exists, or comment out the field or leave the field for {} empty in the config file.'
                .format(col, config['col_names'][col], col))

        # keep track of the column and the column name
        cols.append(config['col_names'][col])
        col_names.append(col)

    # take the subset of the input file, and also rename the columns
    # take a copy so renaming doesn't touch (or warn about) the original df
    dfa = df[cols].copy()
    dfa.columns = col_names

    return dfa
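
A sketch of the `config['col_names']` mapping that `convert` consumes: internal name on the left, input-file column on the right. The MaxQuant-style column names here are hypothetical examples, not required by the code:

config = {
    'col_names': {
        'sequence': 'Modified sequence',
        'raw_file': 'Raw file',
        'retention_time': 'Retention time',
        'pep': 'PEP',
        'charge': None,  # left empty -> skipped with a debug message
    }
}
# dfa = convert(df, config)  # subsets df to the mapped columns, renamed to the keys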
Example #3
def filter_psms(df, config):
    logger.info('Filtering PSMs...')

    # load the filtering functions specified by the input config
    # types of filter functions depends on what stage of filtering this is:
    # removing observations or merely excluding from alignment
    filters = config['filters']

    # each filter has a specified required column from the dataframe
    # make sure these columns exist before proceeding
    for f in filters:
        # for each required column in the filter, check if it exists
        for col in required_cols[f['name']]:
            if col not in df.columns:
                raise ConfigFileError(
                    'Filter {} requires a data column {}, but this was not found in the input dataframe.'
                    .format(f['name'], col))

    # by default, filter out nothing. we'll use binary ORs (|) to
    # gradually add more and more observations to this filter out blacklist
    df['remove'] = np.repeat(False, df.shape[0])

    # run all the filters specified by the list in the input config file
    # all filter functions are passed df, and the run configuration
    # after each filter, append it onto the exclusion master list with a bitwise OR
    # if the filter function returns None, then just ignore it.
    for f in filters:
        e = filter_funcs[f['name']](df, config, f)
        if e is not None:
            df['remove'] = (df['remove'] | e)

    return df
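
`filter_funcs` and `required_cols` are module-level registries that `filter_psms` looks up by each filter's `name`. A sketch of their assumed shape, built from the filter functions shown in this section (the exact registry contents in DART-ID may differ):

filter_funcs = {
    'uniprot_exclusion': filter_uniprot_exclusion_list,
    'retention_length': filter_retention_length,
    'smears': filter_smears,
}
required_cols = {
    'uniprot_exclusion': ['proteins'],
    'retention_length': ['retention_time', 'retention_length'],
    'smears': ['raw_file', 'sequence', 'retention_time'],
}
# config['filters'] would then look something like:
# [{'name': 'retention_length', 'dynamic': True, 'value': 0.01}, ...]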
Example #4
def load_params_from_file(params_folder):
    # first expand user or any vars
    params_folder = os.path.expanduser(params_folder)
    params_folder = os.path.expandvars(params_folder)

    # load parameters if they are specified in the command line
    params = {}
    logger.info(
        'Using provided alignment parameters. Loading params from {}...'.
        format(params_folder))
    param_files = ['exp_params.txt', 'pair_params.txt', 'peptide_params.txt']
    for pf in param_files:
        pfp = os.path.join(params_folder, pf)
        if os.path.exists(pfp):
            try:
                params[pf.split('_')[0]] = pd.read_csv(pfp, sep='\t')
                logger.info('Loaded \"{}\" params file.'.format(
                    pf.split('_')[0]))
            except Exception as e:
                logger.error('Error loading param file {}: {}'.format(pfp, e))
        else:
            error_msg = 'Params file {} does not exist'.format(pfp)
            raise ConfigFileError(error_msg)

    return params
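
The folder layout this loader expects, inferred from the file list above (the folder path is hypothetical):

# params_folder/
#   exp_params.txt      -> params['exp']      (tab-separated)
#   pair_params.txt     -> params['pair']
#   peptide_params.txt  -> params['peptide']
params = load_params_from_file('~/dart_output/params')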
Example #5
def filter_retention_length(df, config, _filter):
    """
  Filter by retention length, which is a measure of the peak width
  during chromatography.
  """

    if _filter['dynamic']:
        # use the dynamic filter, where the value is a proportion
        # of the max RT (the run-time) of that raw file

        logger.info(
            'Using dynamic retention length of {} * run-time (max RT) for each experiment'
            .format(_filter['value']))

        # get the max RT for each raw file, reindex to the same dimension as the
        # retention_length column, and then multiply by the filter value
        max_rts = df.groupby('raw_file')['retention_time'].max().values

        filter_rtl = max_rts[df['raw_file'].map({
            ind: val
            for val, ind in enumerate(np.sort(df['raw_file'].unique()))
        })] * _filter['value']

        filter_rtl = (df['retention_length'] > filter_rtl)

    else:
        # use a constant filter for the retention length
        logger.info(
            'Using constant retention length (in RT) of {} for all raw files.'.
            format(_filter['value']))

        # only allow values between 0 and max(RT)
        if _filter['value'] <= 0 or _filter['value'] > np.max(
                df['retention_time']):
            raise ConfigFileError(
                'Retention length filter value {} is invalid. Please provide a decimal number between 0.0 and max(RT).'
                .format(_filter['value']))

        filter_rtl = (df['retention_length'] > _filter['value'])

    if _filter['dynamic']:
        logger.info(
            'Filtering out {} PSMs with retention length greater than {:.4f} * max(exp_RT) of each raw file.'
            .format(np.sum(filter_rtl), _filter['value']))
    else:
        logger.info(
            'Filtering out {} PSMs with retention length greater than {:.4f}'.
            format(np.sum(filter_rtl), _filter['value']))

    return filter_rtl
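
The two `_filter` shapes the branches above accept; the `dynamic` and `value` keys come straight from the code, the numbers are illustrative:

# dynamic: threshold = value * max(retention_time) of each raw file
_filter = {'name': 'retention_length', 'dynamic': True, 'value': 0.01}

# constant: threshold = value, in the same RT units as the input
_filter = {'name': 'retention_length', 'dynamic': False, 'value': 2.0}

# mask = filter_retention_length(df, config, _filter)  # True = filter out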
Example #6
def get_model_from_config(config):
    model = 'two_piece_linear'
    if config['model'] is not None:
        if config['model'] in models:
            model = config['model']
        else:
            error_msg = 'Model \"{}\" not found. Available choices are: {}'.format(
                config['model'], list(models.keys()))
            raise ConfigFileError(error_msg)
    else:
        logger.info(
            'Alignment model not defined. Defaulting to \"two_piece_linear\" model'
        )

    return models[model]
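
`models` is a module-level registry mapping a model name to its alignment callables and parameter keys (`update` below reads `exp_keys`, `rt_to_ref`, `ref_to_rt`, `sigmaij_func`, `rt_minus_func`, and `rt_plus_func` from it). A hypothetical sketch with placeholder entries, not DART-ID's real models:

models = {
    'linear': {
        'exp_keys': ['beta_0', 'beta_1'],        # placeholder parameter names
        'rt_to_ref': lambda df, mu, params: mu,  # placeholder transforms
        'ref_to_rt': lambda df, mu, params: mu,
    },
    'two_piece_linear': {
        'exp_keys': ['beta_0', 'beta_1', 'beta_2', 'split_point'],
        'rt_to_ref': lambda df, mu, params: mu,
        'ref_to_rt': lambda df, mu, params: mu,
    },
}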
Example #7
def process_files(config):

    # create our output data frames
    df_original = pd.DataFrame()
    df = pd.DataFrame()

    # iterate through each input file provided.
    for i, f in enumerate(config['input']):
        # first expand user or any vars
        f = os.path.expanduser(f)
        f = os.path.expandvars(f)

        logger.info('Reading in input file #{} | {} ...'.format(i + 1, f))

        # load the input file with pandas
        #
        # have a variable low memory option depending on the input type.
        # MaxQuant, for example, has a structure that forces pandas out of its
        # optimal low memory mode, and we have to specify it here.
        dfa = pd.read_csv(f, sep='\t', low_memory=config['low_memory'])

        # keep track of where observations came from. this is _not_ the raw file ID,
        # but the index of the input file it came from, so that we can split
        # these observations back out by input file later if needed
        dfa['input_id'] = i

        # append a copy of dfa into df_original, because the conversion process will heavily
        # modify dfa. we need to keep a copy of the original dataframe in order to append
        # the new columns back onto it later.
        # re-index columns with '[dfa.columns.tolist()]' to preserve the general column order
        df_original = pd.concat([df_original, dfa], sort=True)[dfa.columns.tolist()]

        # if this input data already has DART-ID columns in it, then drop them,
        # since they cause problems later
        dart_cols = [
            'rt_minus', 'rt_plus', 'mu', 'muij', 'sigmaij', 'pep_new',
            'exp_id', 'peptide_id', 'stan_peptide_id', 'exclude', 'residual',
            'pep_updated', 'q-value'
        ]
        # print a warning if we see any
        if np.any(df_original.columns.isin(dart_cols)):
            logger.warning(
                'Columns {} are recognized as DART-ID output columns. Removing these columns before proceeding. In the future, please input original input data files, not output files from DART-ID.'
                .format(
                    np.array_str(df_original.columns[df_original.columns.isin(
                        dart_cols)])))

            # drop existing dart cols
            for col in dart_cols:
                if col in df_original.columns:
                    logger.debug('Removing column {}'.format(col))
                    df_original = df_original.drop(col, axis=1)

        logger.info('Converting {} ({} PSMs)...'.format(f, dfa.shape[0]))

        # convert - takes subset of columns and renames them
        dfa = convert(dfa, config)

        # need to reset the input_id after the conversion process
        dfa['input_id'] = i
        # append to master dataframe
        df = pd.concat([df, dfa])

    # modify columns?
    # append the ion charge to the sequence
    # also make sure the charge column is specified and exists
    if config['add_charge_to_sequence'] and 'charge' in df.columns:
        logger.info(
            'Appending charge to peptide sequence, to align different charge states separately.'
        )
        df['sequence'] = df['sequence'] + '_' + df['charge'].apply(str)

    # create a unique ID for each PSM to help with stitching the final result together
    # after all of our operations
    df['id'] = range(0, df.shape[0])
    df_original['id'] = range(0, df.shape[0])

    # by default, exclude nothing from the original experiment
    df_original['remove'] = np.repeat(False, df_original.shape[0])

    # if the input already has a 'remove' column, then skip this step
    if 'remove' in config['col_names'] and config['col_names'][
            'remove'] is not None:
        df['remove'] = df['remove'].astype(bool)
    else:  # otherwise, run the filters
        df = filter_psms(df, config)

    # apply non-optional filters, PEP threshold and requirement that
    # sequence is observed in at least n experiments (num_experiments)

    # remove any observations with null pep
    null_pep = pd.isnull(df['pep'])
    if np.sum(null_pep) > 0:
        df['remove'] = ((df['remove']) | (null_pep))
        logger.info('Removing {} PSMs with no PEP entry.'.format(
            np.sum(null_pep)))

    num_exps = len(df['raw_file'].unique())
    if config['num_experiments'] > num_exps:
        raise ConfigFileError(
            'Number of experiments filter threshold {} is greater than the number of experiments in the input list. Please provide an integer between 1 and the number of experiments, inclusive, with the \"num_experiments\" key.'
            .format(config['num_experiments']))

    # count the number of experiments a peptide is observed in, but filter out
    # 1) PSMs removed from previous filters
    # 2) PSMs with PEP > pep_threshold
    exps_per_pep = df[~((df['remove']) |
                        (df['pep'] >= config['pep_threshold']))].groupby(
                            'sequence')['raw_file'].unique().apply(len)
    # map values to DataFrame. peptides without any value will get NaN,
    # which will then be assigned to 0.
    exps_per_pep = df['sequence'].map(exps_per_pep)
    exps_per_pep[pd.isnull(exps_per_pep)] = 0

    # flag these sequences for removal as well
    logger.info(
        'Removing {} PSMs from peptide sequences not observed confidently in more than {} experiments'
        .format(np.sum(exps_per_pep < config['num_experiments']),
                config['num_experiments']))
    df['remove'] = (df['remove'] | (exps_per_pep < config['num_experiments']))

    # check that every experiment has at least n PSMs available for alignment.
    # if not, then exclude them from alignment
    psms_per_exp = df.groupby('raw_file')['remove'].apply(
        lambda x: np.sum(~x))  # count PSMs not flagged for removal
    exclude_exps = psms_per_exp.index.values[
        psms_per_exp < config['min_psms_per_experiment']]

    if len(exclude_exps) > 0:
        logger.warning(
            'Experiments {} have < {} confident PSMs (PEP < {}) remaining after filtering. All PSMs belonging to these experiments will be excluded from the retention time alignment'
            .format(np.array_str(exclude_exps),
                    config['min_psms_per_experiment'],
                    config['pep_threshold']))

    # exclude experiments without enough PSMs
    df['remove'] = (df['remove'] | df['raw_file'].isin(exclude_exps))

    # recalculate exps_per_pep, since we removed some experiments and this
    # number will change based on the set of experiments we consider
    logger.info(
        'Recalculating number of confident peptides across experiments...')

    exps_per_pep = df[~((df['remove']) |
                        (df['pep'] >= config['pep_threshold']))].groupby(
                            'sequence')['raw_file'].unique().apply(len)
    exps_per_pep = df['sequence'].map(exps_per_pep)
    exps_per_pep[pd.isnull(exps_per_pep)] = 0
    logger.info(
        'Additional {} PSMs from peptide sequences not observed confidently in more than {} experiments flagged for removal.'
        .format(
            np.sum((exps_per_pep < config['num_experiments']) & (~df['remove'])),
            config['num_experiments']))
    df['remove'] = (df['remove'] | (exps_per_pep < config['num_experiments']))

    ## --------------
    ## DONE FILTERING
    ## --------------

    # flag the observations in df_original that were removed
    df_original['remove'] = df['remove']
    # remove the flagged observations from the dataframe, and reset index
    df = df[~df['remove']].reset_index(drop=True)

    # map peptide and experiment IDs
    # sort experiment IDs alphabetically - or else the order is by
    # first occurrence of an observation of that raw file

    # if experiment or peptide IDs are already provided, then skip this step
    if 'exp_id' not in config[
            'col_names'] or config['col_names']['exp_id'] is None:
        df['exp_id'] = df['raw_file'].map({
            ind: val
            for val, ind in enumerate(np.sort(df['raw_file'].unique()))
        })
    logger.info(
        '{} experiments (raw files) loaded'.format(np.max(df['exp_id']) + 1))

    if 'peptide_id' not in config[
            'col_names'] or config['col_names']['peptide_id'] is None:
        df['peptide_id'] = df['sequence'].map(
            {ind: val
             for val, ind in enumerate(df['sequence'].unique())})
    logger.info(
        '{} peptide sequences loaded'.format(np.max(df['peptide_id']) + 1))

    # EXCLUSION = PSM does not participate in alignment, but will participate in
    #             confidence update since the PSM's associated peptide will get
    #             parameters from the alignment.
    #             This is NOT the same as "remove", which means that the PSM's
    #             associated peptide does not have enough PSMs to participate
    #             in alignment and therefore receive parameters.

    # flag non-confident PSMs for exclusion from alignment process
    df['exclude'] = (df['pep'] >= config['pep_threshold'])
    logger.info(
        'Excluding {} / {} ({:.2%}) PSMs from alignment process after filtering at PEP threshold of {}'
        .format(np.sum(df['pep'] >= config['pep_threshold']), df.shape[0],
                np.sum(df['pep'] >= config['pep_threshold']) / df.shape[0],
                config['pep_threshold']))

    # only take the four required columns (+ the IDs) with us
    # the rest were only needed for filtering and can be removed
    df = df[[
        'sequence', 'raw_file', 'retention_time', 'pep', 'exp_id',
        'peptide_id', 'input_id', 'id', 'exclude'
    ]]

    # sort by peptide_id, exp_id
    df = df.sort_values(['peptide_id', 'exp_id'])

    return df, df_original
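
For reference, the configuration keys this function reads directly, gathered into one sketch (values are illustrative, not DART-ID defaults):

config = {
    'input': ['evidence.txt'],        # hypothetical path(s)
    'low_memory': False,
    'add_charge_to_sequence': False,
    'col_names': {},                  # see convert() above for the mapping shape
    'pep_threshold': 0.01,
    'num_experiments': 3,
    'min_psms_per_experiment': 50,
}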
Example #8
def filter_smears(df, config, _filter):
    """
  Filter out "smears". even confidently identified PSMs can have bad chromatography,
  and in that case it is unproductive to include them into the alignment.
  In theory, this should be made redundant by the retention length filter, but
  some PSMs still slip through the cracks of that, possibly because the search engine
  cannot adequately track the elution peak?
  """

    logger.info('Determining RT spread of peptides within each experiment...')
    # for each experiment-peptide pair, get the range of retention times
    # this is the step that could take a long time
    # TODO: optimize this?
    smears = df.groupby(['raw_file',
                         'sequence'])['retention_time'].apply(np.ptp)

    if _filter['dynamic']:
        # use the dynamic filter, where the value is a proportion
        # of the max RT (the run-time) of that raw file

        logger.info(
            'Using dynamic smear length (in RT) of {:.4f} * run-time (max RT) for each experiment'
            .format(_filter['value']))

        max_rts = df.groupby('raw_file')['retention_time'].max().values

        smear_pair_inds = smears.index.to_frame()['raw_file'].values
        smear_pair_inds = pd.Series(smear_pair_inds).map({
            ind: val
            for val, ind in enumerate(np.sort(df['raw_file'].unique()))
        })

        # get the (raw_file, sequence) tuples for PSMs with a range above the threshold
        smears = smears[smears > (max_rts[smear_pair_inds] *
                                  _filter['value'])].index.values

    else:
        # use a constant filter for the retention length
        logger.info(
            'Using constant smear length (in RT) of {:.4f} for all raw files.'.
            format(_filter['value']))

        if _filter['value'] <= 0:
            raise ConfigFileError(
                'Smear filter value {:.4f} is invalid. Please provide a decimal number between 0.0 and max(RT).'
                .format(_filter['value']))

        # get the (raw_file, sequence) tuples for PSMs with a range above the threshold
        smears = smears[smears > _filter['value']].index.values

    # map the tuples back to the original data frame, and set smears to be excluded
    smears = pd.Series(list(zip(df['raw_file'], df['sequence']))).isin(smears)

    if _filter['dynamic']:
        logger.info(
            'Filtering out {} PSMs with an intra-experiment RT spread greater than {:.4f} * max(exp_RT) for each raw file.'
            .format(smears.sum(), _filter['value']))
    else:
        logger.info(
            'Filtering out {} PSMs with an intra-experiment RT spread greater than {:.4f}'
            .format(smears.sum(), _filter['value']))

    return smears.values
Example #9
def read_config_file(args, create_output_folder=True):
    # load defaults
    config = read_default_config_file()

    # override defaults with user config file
    with open(args.config_file.name, 'r') as f:
        config.update(yaml_load(f, Loader=Loader))

    # override config file's input, output, and verbose options
    # if they were specified on the command-line
    if args.input is not None:
        if config['input'] is not None:
            logger.warning(
                'Input files specified in both the config file and the command line. Using command-line input files instead.'
            )
        config['input'] = [f.name for f in args.input]

    if args.output is not None:
        if 'output' in config and config['output'] is not None:
            logger.warning(
                'Output folder specified in both the config file and the command line. Using command-line output folder instead.'
            )
        config['output'] = args.output

    if args.verbose:
        if 'verbose' in config:
            logger.info(
                'Overwriting verbosity level in configuration file with the one provided on the command-line.'
            )
            config['verbose'] = args.verbose

    # make sure that we have inputs and outputs before continuing
    # the jsonschema validator will catch this as well but here we can print
    # a more descriptive error message
    if 'input' not in config or config['input'] is None:
        error_msg = 'No input files specified, in either the config file or the command line. Please provide input files.'
        raise ConfigFileError(error_msg)

    if 'output' not in config or config['output'] is None:
        error_msg = 'No output folder specified, in either the config file or the command line. Please provide output folder.'
        raise ConfigFileError(error_msg)

    ### --------------------
    ### VALIDATE CONFIG FILE
    ### --------------------

    schema = pkg_resources.resource_stream('dart_id', '/'.join(
        ('config', 'schema.yaml')))
    schema = yaml_load(schema, Loader=Loader)

    v = Draft7Validator(schema)
    errors = sorted(v.iter_errors(config), key=str)

    for error in errors:
        logger.error(
            'Configuration file error:\n'
            + 'In field: {}\n'.format(
                ' --> '.join('\'' + str(x) + '\'' for x in error.path))
            + 'Error: {}\n'.format(error.message)
            + 'Field description: {}\n'.format(
                error.schema.get('description',
                                 'No field description provided')))

        #for suberror in sorted(error.context, key=lambda e: e.schema_path):
        #  print('suberror')
        #  print(list(suberror.schema_path), suberror.message, sep=", ")

    if len(errors) > 0:
        error_msg = '{} error(s) in configuration file. Please read the validation error messages carefully and fix the configuration file.'.format(
            len(errors))
        raise ConfigFileError(error_msg)

    ### ====================================================
    ### ADVANCED CONFIGURATION FILE VALIDATION
    ### --------------------------------------
    ### apply rules too complex for the jsonschema validator
    ### ====================================================

    # ...

    # expand user or any vars
    config['output'] = os.path.expanduser(config['output'])
    config['output'] = os.path.expandvars(config['output'])

    # create output folder
    if not os.path.exists(config['output']) and create_output_folder:
        logger.info('Output folder does not yet exist. Creating...')
        os.makedirs(config['output'])

    # copy config file to output folder
    if create_output_folder:
        logger.info('Copying config file to output folder')
        copyfile(
            args.config_file.name,
            os.path.join(config['output'],
                         os.path.basename(args.config_file.name)))

    ### ------------------
    ### Modify config file
    ### ------------------
    # Apply modifications/transformations

    # Decode ASCII escape characters in the sep string
    # Assumes config file is in utf-8
    config['sep'] = config['sep'].encode('utf-8').decode('unicode-escape')

    return config
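
A sketch of the minimal user config this loader accepts, expressed as the dict that `yaml_load` would produce (paths and values are illustrative):

user_config = {
    'input': ['/path/to/evidence.txt'],
    'output': '/path/to/output_folder',
    'verbose': False,
    'sep': '\\t',  # stored ASCII-escaped; the unicode-escape decode above turns it into a real tab
}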
Example #10
def process_files(config):

    # create our output data frames
    df_original = pd.DataFrame()
    df = pd.DataFrame()

    # iterate through each input file provided.
    for i, f in enumerate(config['input']):
        # first expand user or any vars
        f = os.path.expanduser(f)
        f = os.path.expandvars(f)

        logger.info('Reading in input file #{} | {} ...'.format(i + 1, f))

        # load the input file with pandas
        #
        # have a variable low memory option depending on the input type.
        # MaxQuant, for example, has a structure that forces pandas out of its
        # optimal low memory mode, and we have to specify it here.
        dfa = pd.read_csv(f,
                          sep=config['sep'],
                          low_memory=config['low_memory'])

        # keep track of where observations came from. this is _not_ the raw file ID,
        # but the index of the input file it came from, so that we can split
        # these observations back out by input file later if needed
        dfa['input_id'] = i

        # append a copy of dfa into df_original, because the conversion process will heavily
        # modify dfa. we need to keep a copy of the original dataframe in order to append
        # the new columns back onto it later.
        # re-index columns with '[dfa.columns.tolist()]' to preserve the general column order
        df_original = pd.concat([df_original, dfa], sort=True)[dfa.columns.tolist()]

        # if this input data already has DART-ID columns in it, then drop them,
        # since they cause problems later
        dart_cols = [
            'rt_minus', 'rt_plus', 'mu', 'muij', 'sigmaij', 'pep_new',
            'exp_id', 'peptide_id', 'stan_peptide_id', 'exclude', 'residual',
            'pep_updated', 'q-value'
        ]
        # print a warning if we see any
        if np.any(df_original.columns.isin(dart_cols)):
            logger.warning(
                'Columns {} are recognized as DART-ID output columns. Removing these columns before proceeding. In the future, please input original input data files, not output files from DART-ID.'
                .format(
                    np.array_str(df_original.columns[df_original.columns.isin(
                        dart_cols)])))

            # drop existing dart cols
            for col in dart_cols:
                if col in df_original.columns:
                    logger.debug('Removing column {}'.format(col))
                    df_original = df_original.drop(col, axis=1)

        logger.info('Converting {} ({} PSMs)...'.format(f, dfa.shape[0]))

        # convert - takes subset of columns and renames them
        dfa = convert(dfa, config)

        # need to reset the input_id after the conversion process
        dfa['input_id'] = i
        # append to master dataframe
        df = pd.concat([df, dfa])

    # modify columns?
    # append the ion charge to the sequence
    # also make sure the charge column is specified and exists
    if config['add_charge_to_sequence'] and 'charge' in df.columns:
        logger.info(
            'Appending charge to peptide sequence, to align different charge states separately.'
        )
        df['sequence'] = df['sequence'] + '_' + df['charge'].apply(str)

    # create a unique ID for each PSM to help with stitching the final result together
    # after all of our operations
    df['id'] = range(0, df.shape[0])
    df_original['id'] = range(0, df.shape[0])

    # by default, exclude nothing from the original experiment
    df_original['remove'] = np.repeat(False, df_original.shape[0])

    # if the input already has a 'remove' column, then skip this step
    if 'remove' in config['col_names'] and config['col_names'][
            'remove'] is not None:
        df['remove'] = df['remove'].astype(bool)
    else:  # otherwise, run the filters
        df = filter_psms(df, config)

    # apply non-optional filters, PEP threshold and requirement that
    # sequence is observed in at least n experiments (num_experiments)

    # remove any observations with null pep
    null_pep = pd.isnull(df['pep'])
    if np.sum(null_pep) > 0:
        df['remove'] = ((df['remove']) | (null_pep))
        logger.info('Removing {} PSMs with no PEP entry.'.format(
            np.sum(null_pep)))

    num_exps = len(df['raw_file'].unique())

    # Special error when only one experiment is loaded
    if num_exps == 1:
        error_msg = 'Only 1 raw file/experiment loaded. DART-ID derives statistical power from peptides observed over multiple experiments. Please provide an input file with more raw files, or provide a list of input files, to get the most out of your data.'
        raise ConfigFileError(error_msg)

    if config['num_experiments'] > num_exps:
        error_msg = 'Number of experiments filter threshold {} is greater than the number of experiments in the input list. Please provide an integer between 1 and the number of experiments, inclusive, with the \"num_experiments\" key.'.format(
            config['num_experiments'])
        raise ConfigFileError(error_msg)

    # Calculate FDR
    df['qval'] = pep_to_fdr(df['pep'])

    # Count the number of experiments a peptide is observed in, but filter out
    # 1) PSMs removed from previous filters
    # 2) PSMs with PEP > pep_threshold
    exps_per_pep = (
        df.loc[
            # Get peptides that are:
            (
                # Not previously removed, for any reason
                (~df['remove']) &
                # Are below the set confidence threshold
                (df['pep'] < config['pep_threshold'])
                # (df['qval'] < config['pep_threshold']) # peptide FDR
            ), ['sequence', 'raw_file']]
        # Group by sequence, get all unique raw files the peptide sequence
        # appears in, then count the number of raw files
        .groupby('sequence')['raw_file'].unique().apply(len))
    # map values to DataFrame. peptides without any value will get NaN,
    # which will then be assigned to 0.
    exps_per_pep = df['sequence'].map(exps_per_pep)
    exps_per_pep[pd.isnull(exps_per_pep)] = 0

    # flag these sequences for removal as well
    logger.info(
        'Removing {} PSMs from peptide sequences not observed confidently in more than {} experiments'
        .format(np.sum(exps_per_pep < config['num_experiments']),
                config['num_experiments']))
    df['remove'] = (df['remove'] | (exps_per_pep < config['num_experiments']))

    # check that every experiment has at least n PSMs available for alignment.
    # if not, then exclude them from alignment
    psms_per_exp = df.groupby('raw_file')['remove'].apply(
        lambda x: np.sum(~x))  # count PSMs not flagged for removal
    exclude_exps = psms_per_exp.index.values[
        psms_per_exp < config['min_psms_per_experiment']]

    if len(exclude_exps) > 0:
        logger.warning(
            'Experiments {} have < {} confident PSMs (PEP < {}) remaining after filtering. All PSMs belonging to these experiments will be excluded from the retention time alignment'
            .format(np.array_str(exclude_exps),
                    config['min_psms_per_experiment'],
                    config['pep_threshold']))

    # exclude experiments without enough PSMs
    df['remove'] = (df['remove'] | df['raw_file'].isin(exclude_exps))

    # recalculate exps_per_pep, since we removed some experiments and this
    # number will change based on the set of experiments we consider
    logger.info(
        'Recalculating number of confident peptides across experiments...')

    exps_per_pep = (
        df.loc[
            # Get peptides that are:
            (
                # Not previously removed, for any reason
                (~df['remove']) &
                # Are below the set confidence threshold
                (df['pep'] < config['pep_threshold'])),
            ['sequence', 'raw_file']]
        # Group by sequence, get all unique raw files the peptide sequence
        # appears in, then count the number of raw files
        .groupby('sequence')['raw_file'].unique().apply(len))
    exps_per_pep = df['sequence'].map(exps_per_pep)

    exps_per_pep[pd.isnull(exps_per_pep)] = 0

    logger.info(
        'Additional {} PSMs from peptide sequences not observed confidently in more than {} experiments flagged for removal.'
        .format(
            np.sum((exps_per_pep < config['num_experiments']) & (~df['remove'])),
            config['num_experiments']))
    df['remove'] = (df['remove'] | (exps_per_pep < config['num_experiments']))

    # Exclude peptides from alignment when even their best PSM is low-confidence
    # (min PEP > 0.01) and the coefficient of variation (CV) of their PEPs is
    # low (CV < 0.1). We found that this is a good predictor of whether the
    # PSM is a decoy hit versus a target hit.

    def cv(x):
        if len(x) < 3:
            return np.nan

        return np.nanstd(x) / np.nanmean(x)

    peptide_aggs = {
        'pep_mean': ('pep', np.nanmean),
        'pep_cv': ('pep', cv),
        'pep_min': ('pep', np.min),
        'num_obs': ('pep', 'count')
    }

    # If we have the protein_decoy_tag and the leading_protein column,
    # look for the protein_decoy_tag to determine whether the peptide is a decoy peptide
    if 'leading_protein' in config[
            'col_names'] and 'protein_decoy_tag' in config:

        def is_decoy(x):
            return x.str.contains(config['protein_decoy_tag']).any()

        peptide_aggs['is_decoy'] = ('leading_protein', is_decoy)

    peptides_df = (
        df.groupby('sequence').aggregate(**peptide_aggs)
        # Only take peptides with more than N observations
        .query('num_obs > 3')
        # Remove any extremely low CVs
        .query('pep_cv > 1e-5')
        # Remove extremely low PEP means
        .query('pep_mean > 1e-10'))

    # If we have decoy data, then fit a linear classifier (a linear-kernel SVM)
    # with pep_mean and pep_cv as features
    if 'is_decoy' in peptides_df.columns:
        logger.info(
            'Decoy peptide information present. Running a linear classifier to avoid aligning decoy peptides'
        )

        # X = feature matrix
        X = np.log10(peptides_df.loc[:, ['pep_mean', 'pep_cv']].values)
        X = StandardScaler().fit_transform(X)

        # True = Decoy, False = Target
        y = peptides_df['is_decoy'].values

        random_state = np.random.RandomState(0)

        # shuffle and split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.5,
                                                            random_state=0)

        # Learn to predict each class against the other
        classifier = svm.SVC(kernel='linear',
                             probability=True,
                             random_state=random_state)
        classifier.fit(X_train, y_train)
        y_score = classifier.decision_function(X_test)

        # Compute ROC curve and ROC area
        fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=True)
        roc_auc = auc(fpr, tpr)

        # If the curve is inverted, run with the opposite pos_label
        inverted = False
        if roc_auc < 0.5:
            logger.info('Correcting inverted ROC curve')
            inverted = True
            fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=False)
            roc_auc = auc(fpr, tpr)

        logger.info('AUC: {:.3f}'.format(roc_auc))

        # Maximize the Youden index, J = sensitivity + specificity - 1 = TPR - FPR.
        # But set a MINIMUM TPR of 0.8. We don't want to cut out too many of our targets
        cutoff_start_ind = np.argmax(
            tpr >= 0.8)  # argmax gets the first ind of the max value
        cutoff_ind = cutoff_start_ind + np.argmax(tpr[cutoff_start_ind:] -
                                                  fpr[cutoff_start_ind:])
        cutoff_thresh = thresholds[cutoff_ind]

        logger.info('ROC Cutoff: FPR = {:.2f}, TPR = {:.2f}'.format(
            fpr[cutoff_ind], tpr[cutoff_ind]))

        # Generate scores for all points
        all_y_score = classifier.decision_function(X)
        # Points above the cutoff threshold are decoys
        remove_inds = all_y_score >= cutoff_thresh
        if inverted:
            remove_inds = ~remove_inds

        logger.info('Classifier is removing {} peptides'.format(
            np.sum(remove_inds)))

    # If we don't have decoy information, then run with some preset cutoffs
    if 'is_decoy' not in peptides_df.columns:
        min_pep_thresh = 0.01
        max_pep_cv_thresh = 0.1

        remove_inds = ((peptides_df['pep_min'] > min_pep_thresh) &
                       (peptides_df['pep_cv'] < max_pep_cv_thresh))

        logger.info(
            'Removing {} peptides for min(PEP) > {:.3f} and CV(PEP) < {:.3f}'.
            format(np.sum(remove_inds), min_pep_thresh, max_pep_cv_thresh))

    remove_seqs = peptides_df.index[remove_inds].values
    df['remove'] = (df['remove'] | df['sequence'].isin(remove_seqs))

    # filtered_out = remove_seqs['is_decoy'] & (remove_seqs['pep_cv'] < 0.3) & (remove_seqs['pep_min'] > 0.01)
    # print('Removed', np.sum(remove_seqs.loc[filtered_out, 'num_obs']), 'out of', np.sum(remove_seqs.loc[remove_seqs['is_decoy'], 'num_obs']), 'decoy PSMs')

    ## --------------
    ## DONE FILTERING
    ## --------------

    # flag the observations in df_original that were removed
    df_original['remove'] = df['remove']
    # remove the flagged observations from the dataframe, and reset index
    df = df[~df['remove']].reset_index(drop=True)

    # map peptide and experiment IDs
    # sort experiment IDs alphabetically - or else the order is by
    # first occurrence of an observation of that raw file

    # if experiment or peptide IDs are already provided, then skip this step
    if 'exp_id' not in config[
            'col_names'] or config['col_names']['exp_id'] is None:
        df['exp_id'] = df['raw_file'].map({
            ind: val
            for val, ind in enumerate(np.sort(df['raw_file'].unique()))
        })
    logger.info(
        '{} experiments (raw files) loaded'.format(np.max(df['exp_id']) + 1))

    if 'peptide_id' not in config[
            'col_names'] or config['col_names']['peptide_id'] is None:
        df['peptide_id'] = df['sequence'].map(
            {ind: val
             for val, ind in enumerate(df['sequence'].unique())})
    logger.info(
        '{} peptide sequences loaded'.format(np.max(df['peptide_id']) + 1))

    # EXCLUSION = PSM does not participate in alignment, but will participate in
    #             confidence update since the PSM's associated peptide will get
    #             parameters from the alignment.
    #             This is NOT the same as "remove", which means that the PSM's
    #             associated peptide does not have enough PSMs to participate
    #             in alignment and therefore receive parameters.

    # flag non-confident PSMs for exclusion from alignment process
    df['exclude'] = (df['pep'] >= config['pep_threshold'])
    logger.info(
        'Excluding {} / {} ({:.2%}) PSMs from alignment process after filtering at PEP threshold of {}'
        .format(np.sum(df['pep'] >= config['pep_threshold']), df.shape[0],
                np.sum(df['pep'] >= config['pep_threshold']) / df.shape[0],
                config['pep_threshold']))

    # only take the four required columns (+ the IDs) with us
    # the rest were only needed for filtering and can be removed
    df = df[[
        'sequence', 'raw_file', 'retention_time', 'pep', 'exp_id',
        'peptide_id', 'input_id', 'id', 'exclude'
    ]]

    # sort by peptide_id, exp_id
    df = df.sort_values(['peptide_id', 'exp_id'])

    return df, df_original
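
To make the decoy-classifier cutoff concrete, a self-contained toy run of the same ROC / Youden-index scheme used above (first index with TPR >= 0.8, then maximize TPR - FPR from there). The data here is synthetic:

import numpy as np
from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
# synthetic decision scores: targets centered at 0, decoys at 1
y_true = np.concatenate([np.zeros(500, dtype=bool), np.ones(100, dtype=bool)])
y_score = np.concatenate([rng.normal(0, 1, 500), rng.normal(1, 1, 100)])

fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=True)

# enforce a minimum TPR of 0.8, then maximize the Youden index (TPR - FPR)
cutoff_start_ind = np.argmax(tpr >= 0.8)
cutoff_ind = cutoff_start_ind + np.argmax(tpr[cutoff_start_ind:] -
                                          fpr[cutoff_start_ind:])
print('cutoff threshold: {:.3f}'.format(thresholds[cutoff_ind]))
print('FPR = {:.2f}, TPR = {:.2f}'.format(fpr[cutoff_ind], tpr[cutoff_ind]))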
Example #11
def update(dfa, params, config):
    dfa = dfa.reset_index(drop=True)

    #logger.info('{} / {} ({:.2%}) confident, alignable observations (PSMs) after filtering.'.format(dff.shape[0], dfa.shape[0], dff.shape[0] / dfa.shape[0]))

    # refactorize peptide id into stan_peptide_id,
    # to preserve continuity when feeding data into STAN
    dfa['stan_peptide_id'] = dfa['sequence'].map(
        {ind: val
         for val, ind in enumerate(dfa['sequence'].unique())})

    num_experiments = dfa['exp_id'].max() + 1
    num_peptides = dfa['peptide_id'].max() + 1
    exp_names = np.sort(dfa['raw_file'].unique())
    pep_id_list = dfa['peptide_id'].unique()

    # validate parameters file. make sure it is from the same filters
    # or else the program will crash in the code below
    # check num_experiments, num_peptides
    if params['exp'].shape[0] != num_experiments or \
       params['peptide'].shape[0] != (dfa['stan_peptide_id'].max() + 1):
        raise ConfigFileError(
            'Parameters files have different data than the input data provided. Ensure that both the input list and filters used to generate the alignment parameters and those provided to the current update are the __exact__ same.'
        )

    model = get_model_from_config(config)

    # mu from the STAN alignment
    dfa['mu'] = params['peptide']['mu'].values[dfa['stan_peptide_id']]

    # concatenate transformation parameters
    exp_params = pd.DataFrame({ key: params['exp'][key][dfa['exp_id']] \
      for key in model['exp_keys']}).reset_index(drop=True)
    dfa = pd.concat([dfa, exp_params], axis=1)

    # predict mus with RTs, and RTs with aligned mus
    dfa['mu_pred'] = model['rt_to_ref'](dfa, dfa['mu'], params)
    dfa['muij'] = model['ref_to_rt'](dfa, dfa['mu'], params)
    dfa['sigmaij'] = model['sigmaij_func'](dfa, params)
    # scaled sigma is the same ratio of muij / mu applied to sigmaij
    dfa['sigma_pred'] = dfa['sigmaij'] * dfa['mu_pred'] / dfa['muij']

    # get parameters for the null distributions for each experiment
    null_dists = dfa.groupby('exp_id')['retention_time'].agg([np.mean, np.std])
    #null_dists = np.array([norm(loc=null_dists.loc[i, 'mean'], scale=null_dists.loc[i, 'std']) for i in range(0, num_experiments)])
    # first column is mean, second is std
    null_dists = np.array(
        [null_dists['mean'].values, null_dists['std'].values]).T

    # PEP ceiling at 1, otherwise will result in
    # incorrect negative densities when plugging into Bayes' theorem
    dfa.loc[dfa['pep'] > 1.0, 'pep'] = 1.0

    # output table
    df_new = pd.DataFrame()

    bootstrap_method = 'none'
    if 'bootstrap_method' in config:
        bootstrap_method = config['bootstrap_method']
        logger.info('Using \"{}\" bootstrap method'.format(bootstrap_method))
    else:
        logger.info(
            'Bootstrap method not defined, using point estimates to update confidence instead.'
        )

    k = 20  # default
    if 'bootstrap_iters' in config:
        k = config['bootstrap_iters']
        if bootstrap_method != 'none':
            logger.info('Using {} bootstrap iterations'.format(k))

    logger.info('Updating PEPs...')
    for i, e in enumerate(np.sort(dfa['exp_id'].unique())):

        exp_name = exp_names[i]

        exp = dfa[dfa['exp_id'] == e]
        exp = exp.reset_index(drop=True)

        exp_peptides = exp['stan_peptide_id'].unique()

        logger.info('Exp ({} / {}) - {} - ({} Peptides, {} PSMs)'.format(
            i + 1, num_experiments, exp_name, len(exp_peptides), exp.shape[0]))

        # vector of P(RT|delta=1) for this experiment.
        rt_plus = pd.Series(np.zeros(exp.shape[0]))

        if bootstrap_method != 'none':

            # to avoid using this experiment's own data to update the confidence
            # of its own observations, recalculate the reference RTs (mu) without the
            # data from this experiment, by:
            # 1) non-parametric bootstrapping over the median of the predicted mus.
            # OR
            # 2) parametric bootstrapping, using the RT distribution parameters

            # get predicted mus of peptides in this experiment, excluding predicted mus
            # transformed from RTs observed in this experiment
            dfe = dfa.loc[((dfa['stan_peptide_id'].isin(exp_peptides)) & (dfa['exp_id'] != e)), \
              ['stan_peptide_id', 'pep', 'mu_pred', 'mu', 'sigma_pred', 'exp_id']]

            # extract relevant values for each peptide
            mu_preds = dfe.groupby('stan_peptide_id')['mu_pred'].apply(
                lambda x: x.values).values.tolist()
            mus = dfe.groupby('stan_peptide_id')['mu'].apply(
                lambda x: x.values).values.tolist()
            sigma_preds = dfe.groupby('stan_peptide_id')['sigma_pred'].apply(
                lambda x: x.values).values.tolist()
            peps = dfe.groupby('stan_peptide_id')['pep'].apply(
                lambda x: x.values).values.tolist()
            exp_ids = dfe.groupby('stan_peptide_id')['exp_id'].apply(
                lambda x: x.values).values.tolist()

            # number of observations per peptide sequence
            obs_per_seq = [len(peptide) for peptide in mu_preds]
            num_peptides = len(mu_preds)

            # the number of observations per peptide -- used in loop
            num_obs = 0
            # matrix of n by k estimated mus from the bootstrapping
            # will iterate over in the loop after the immediate one
            mu_k = np.zeros((num_peptides, k))

            if bootstrap_method in ('parametric', 'parametric_mixture', 'parametric-mixture'):

                t_laplace_samples = 0
                t_coin_flips = 0
                t_null_samples = 0
                t_loop_indexing = 0
                t_medians = 0

                # create pool of coin flips, instead of sampling for every peptide
                # the pool is uniformly distributed from 0 to 1, and "successful" coin flip
                # is determined by whether or not the sample from the pool is less than the
                # measured PEP
                _time = time.time()
                coin_flip_pool = 0
                if bootstrap_method in ('parametric_mixture', 'parametric-mixture'):
                    coin_flip_pool = uniform.rvs(size=(np.sum(obs_per_seq) *
                                                       k))

                t_coin_flips += (time.time() - _time)
                coin_counter = 0

                # create a pool of laplace samples, to pull from for each peptide
                _time = time.time()
                sample_pool = laplace.rvs(size=(np.sum(obs_per_seq) * k))
                t_laplace_samples += (time.time() - _time)
                # keep track of where we are in the pool with a counter
                sample_counter = 0

                # parametric bootstrap
                # (use 'p' for the peptide index, to avoid shadowing the outer
                # experiment loop's 'i')
                for p in range(0, num_peptides):
                    num_obs = obs_per_seq[p]

                    _time = time.time()
                    # sample num_obs synthetic RTs for k bootstrap iterations
                    # do the sampling in a big pool, then shape to matrix where
                    # rows correspond to bootstrap iters and columns correspond to sample observations
                    #samples = laplace.rvs(size=(k * num_obs)).reshape(k, num_obs)

                    # draw samples from sample pool, reshape into matrix
                    _time = time.time()
                    samples = sample_pool[sample_counter:(sample_counter +
                                                          (k * num_obs))]
                    samples = samples.reshape(k, num_obs)

                    # increment sample counter
                    sample_counter += (k * num_obs)

                    #mu_med = np.median()
                    # shift and scale sampled RTs by mu and sigma_pred, respectively
                    samples = (samples * sigma_preds[p]) + mu_preds[p]
                    #samples = (samples * sigma_preds[p]) + mu_med
                    t_laplace_samples += (time.time() - _time)

                    if bootstrap_method in ('parametric_mixture', 'parametric-mixture'):
                        # sample from mixture distribution
                        _time = time.time()
                        # actually faster to just replicate the sample matrix and then
                        # take subindices from that instead of sampling from null every
                        # iteration of the loop below. this seems inefficient, especially
                        # if given very small PEPs, but still better than sampling every iteration.
                        # could probably optimize the size of the null sample matrix by
                        # looking at predicted false positive rates, but for now we're
                        # just going with worst case scenario and assuming for all false positives.
                        null_samples = norm.rvs(size=(k * num_obs)).reshape(
                            k, num_obs)
                        # shift and scale sampled RTs by mean and std of null dists
                        null_samples = (null_samples *
                                        null_dists[exp_ids[p], 1]
                                        ) + null_dists[exp_ids[p], 0]
                        t_null_samples += (time.time() - _time)

                        _time = time.time()
                        for j in range(
                                0,
                                num_obs):  # for each observation in the matrix
                            # take a chunk of the coin flip pool
                            fp = (coin_flip_pool[coin_counter:(
                                coin_counter + k)] < peps[p][j]).astype(bool)
                            coin_counter += k
                            # overwrite original samples with samples from null distribution
                            samples[fp, j] = null_samples[fp, j]
                        t_loop_indexing += (time.time() - _time)

                    _time = time.time()
                    # now take the median of each row and store it in mu_k
                    mu_k[p] = np.median(samples, axis=1)
                    # or take the weighted mean
                    #weights = ((1 - peps[p]) - (1 - config['pep_threshold'])) / config['pep_threshold']
                    #mu_k[p] = (np.sum(samples * weights, axis=1) / np.sum(weights))
                    t_medians += (time.time() - _time)

                logger.debug('laplace sampling: {:.1f} ms'.format(
                    t_laplace_samples * 1000))
                logger.debug('coin flips: {:.1f} ms'.format(t_coin_flips *
                                                            1000))
                logger.debug('null sampling: {:.1f} ms'.format(t_null_samples *
                                                               1000))
                logger.debug('loop indexing: {:.1f} ms'.format(
                    t_loop_indexing * 1000))
                logger.debug('taking medians: {:.1f} ms'.format(t_medians *
                                                                1000))

            elif bootstrap_method == 'non-parametric':
                # non-parametric bootstrap
                # instead of generating random indices for the sampling for each
                # iteration, and for each peptide, we'll generate a batch of random numbers
                # now and pull from them later.
                # the counter will keep track of which portion of the pool we're using
                counter = 0
                rand_pool = np.random.rand(np.sum(obs_per_seq) * k)

                for p in range(0, num_peptides):  # for each peptide sequence
                    num_obs = obs_per_seq[p]
                    for j in range(0, k):  # for each iteration:
                        # re-estimate mu from the resampled mu_preds
                        # TODO: choice also of mean, weighted mean
                        mu_k[p][j] = np.median(mu_preds[p][\
                          (rand_pool[counter:(counter+num_obs)] * num_obs).astype(int, copy=False)])

                        counter = counter + num_obs

            _t_dist_building = time.time()
            # map of stan_peptide_id onto 1:num_peptides
            pep_inds = {ind: var for var, ind in enumerate(exp_peptides)}
            pep_inds = exp['stan_peptide_id'].map(pep_inds)

            # for each bootstrap iteration:
            for j in range(0, k):
                # evaluate the transformed RTs (predicted mus) on distributions
                # with the bootstrapped, estimated mus as the means.
                #rt_plus = rt_plus + laplace.pdf(exp['retention_time'], \
                #  loc=model['ref_to_rt'](exp, mu_k[:,j][pep_inds], params), \
                #  scale=exp['sigmaij'])

                rt_plus = rt_plus + laplace.pdf(exp['mu_pred'], \
                  loc=mu_k[:,j][pep_inds], \
                  scale=exp['sigma_pred'])

            # divide total likelihood by # of iterations to normalize to area of 1
            rt_plus = rt_plus / k

            logger.debug('distribution building: {:.1f} ms'.format(
                (time.time() - _t_dist_building) * 1000))

        else:
            # not using bootstrap, but using adjusted mu as a point estimate
            # for updating the confidence
            rt_plus = model['rt_plus_func'](exp)

        #                                         P(RT|delta=0)*P(delta=0)
        # PEP.new = P(delta=0|RT) =   ---------------------------------------------------
        #                             P(RT|delta=0)*P(delta=0) + P(RT|delta=1)*P(delta=1)
        #
        # delta=1 = Correct ID (true positive)
        # delta=0 = Incorrect (false positive)

        # P(RT|delta=0) = probability of peptides RT, given that PSM is incorrect
        #           estimate empirical density of RTs over the experiment

        rt_minus = model['rt_minus_func'](exp)

        # P(delta=0) = probability that PSM is incorrect (PEP)
        # P(delta=1) = probability that PSM is correct (1-PEP)

        # P(RT|delta=1) = probability that given the correct ID, the RT falls in the
        #           normal distribution of RTs for that peptide, for that experiment

        # delta=1 = Correct ID (true positive)
        # delta=0 = Incorrect (false positive)
        #
        pep_new = (rt_minus * exp['pep']) / \
          ((rt_minus * exp['pep']) + (rt_plus * (1.0 - exp['pep'])))

        # for PSMs for which we have alignment/update data
        exp_new = pd.DataFrame({
            'rt_minus': rt_minus.tolist(),
            'rt_plus': rt_plus.tolist(),
            'mu': exp['mu'].values.tolist(),
            'muij': exp['muij'].values.tolist(),
            'sigmaij': exp['sigmaij'].values.tolist(),
            'pep_new': pep_new.tolist(),
            'id': exp['id'].values,
            'exp_id': exp['exp_id'].values,
            'peptide_id': exp['peptide_id'].values,
            'stan_peptide_id': exp['stan_peptide_id'].values,
            'input_id': exp['input_id'].values,
            'exclude': exp['exclude'].values
        })
        # append to master DataFrame and continue
        df_new = pd.concat([df_new, exp_new])

    # reorder by ID and reset the index
    df_new = df_new.sort_values('id')
    df_new = df_new.reset_index(drop=True)

    return df_new
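
A worked toy instance of the Bayes update in the comment block above, with made-up numbers: a PSM with search-engine PEP = 0.1 whose observed RT is far more likely under the peptide's aligned RT distribution (rt_plus) than under the experiment-wide null (rt_minus):

pep = 0.1         # P(delta=0): prior probability the PSM is incorrect
rt_minus = 0.005  # P(RT|delta=0): density under the experiment-wide null
rt_plus = 0.300   # P(RT|delta=1): density under the aligned peptide distribution

pep_new = (rt_minus * pep) / ((rt_minus * pep) + (rt_plus * (1.0 - pep)))
print(round(pep_new, 4))  # 0.0018 -- the RT evidence sharply boosts confidence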