Exemple #1
0
def prepare_dataframe(infile_path,
                      decoy_prefix=None,
                      decoy_infix=False,
                      cleavage_rule=False,
                      fdr=0.01,
                      decoy2set=None):
    """Load a pepXML or mzIdentML file into a DataFrame and add derived columns.

    The parser is picked from the lower-cased file name: ``.pep.xml`` and
    ``.pepxml`` are read with ``pepxml.DataFrame``, ``.mzid`` with
    ``mzid.DataFrame``.  Engine-specific columns (Morpheus, MS-GF+,
    MSFragger-style protein fields) are then mapped onto common column names,
    short peptides are dropped, and retention-time and mass-difference
    columns are added.

    Args:
        infile_path: Path of the input file; only its extension is inspected.
        decoy_prefix: Not used anywhere in this function body.
        decoy_infix: Not used anywhere in this function body.
        cleavage_rule: Passed as ``rule`` to ``parser.num_sites``.  Any falsy
            value is replaced by ``parser.expasy_rules['trypsin']``.
        fdr: Not used anywhere in this function body.
        decoy2set: Not used by this function; returned unchanged.

    Returns:
        The tuple ``(df1, decoy2set)``.

    Raises:
        WrongInputError: The file name has none of the recognised extensions.
        EmptyFileError: The parsed DataFrame has zero rows.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']
    if infile_path.lower().endswith(
            '.pep.xml') or infile_path.lower().endswith('.pepxml'):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif infile_path.lower().endswith('.mzid'):
        # NOTE(review): ``ftype`` is not assigned on this path; it only gets a
        # value below if the frame has an 'MS-GF:EValue' column.
        df1 = mzid.DataFrame(infile_path)
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # Rows with a zero score are removed first so that the reciprocal
        # below never divides by zero.
        df1 = df1[df1['Morpheus Score'] != 0]
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine: copy its columns onto the names used by the
        # rest of this function.
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        # (experimental m/z - calculated m/z) scaled by the charge.
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] -
            df1['calculatedMassToCharge']) * df1['assumed_charge']
        # calculated m/z * z - z * 1.00727649
        # (the constant is presumably the proton mass in Da - TODO confirm).
        df1['calc_neutral_pep_mass'] = df1['calculatedMassToCharge'] * df1[
            'chargeState'] - df1['chargeState'] * 1.00727649
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    # True when element 0 of every 'protein_descr' entry is None.
    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger: each 'protein' string is split once on whitespace into
        # [first token, remainder]; the first token becomes the protein entry
        # and the remainder the description.
        logger.debug('Adapting MSFragger DataFrame.')
        # NOTE(review): ``.loc[1, ...]`` is a label lookup; it raises KeyError
        # when no row is labelled 1 (e.g. a one-row frame) - confirm intended.
        logger.debug('Proteins before: %s', df1.loc[1, 'protein'])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        try:
            df1['protein_descr'] = protein.apply(
                lambda row: [x[1] for x in row])
        except IndexError:
            # A single entry without a remainder aborts the whole ``apply``,
            # so in that case every description becomes an empty string.
            df1['protein_descr'] = protein.apply(lambda row: ['' for x in row])
        logger.debug('Proteins after: %s', df1.loc[1, 'protein'])

    # Where the description is missing, fall back to the protein entry itself.
    df1.loc[pd.isna(df1['protein_descr']),
            'protein_descr'] = df1.loc[pd.isna(df1['protein_descr']),
                                       'protein']

    # Drop rows without a peptide, then keep only peptides of length >= 6.
    df1 = df1[~pd.isna(df1['peptide'])]
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6]
    # Keep only the part of the spectrum title before the first ' RTINS'.
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])
    # 'RT exp' comes from 'retention_time_sec' (divided by 60) when present,
    # otherwise from 'scan start time' (copied as is), otherwise it is 0.
    # The source column is dropped once it has been copied.
    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop([
                'scan start time',
            ], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop([
            'retention_time_sec',
        ], axis=1)

    # Nearest-integer part of the mass difference, and the remainder after
    # subtracting massdiff_int * 1.003354, in ppm of the calculated mass
    # (1.003354 is presumably the isotope mass spacing - TODO confirm).
    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    df1['massdiff_ppm'] = 1e6 * (df1['massdiff'] - df1['massdiff_int'] *
                                 1.003354) / df1['calc_neutral_pep_mass']

    # Helper defined outside this block; its result replaces df1.
    df1 = remove_column_hit_rank(df1)

    # NOTE(review): for a '.mzid' file without 'MS-GF:EValue', ``ftype`` was
    # never bound and this comparison would raise UnboundLocalError.
    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    # Return value is ignored; presumably updates df1 in place - verify.
    prepare_mods(df1)

    try:
        logger.info('Calibrating retention model...')
        # NOTE(review): nothing but ``simplefilter`` runs inside this ``with``
        # block, so the "ignore" filter is undone again before any real work.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

        # NOTE(review): ``retention_coefficients``, ``r_value`` and
        # ``std_value`` are not assigned anywhere in this function.  Unless
        # module-level globals with these names exist, a NameError is raised
        # here and the ``except`` branch below always runs.
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))

        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        # Fallback: predict with the stock ``achrom.RCs_krokhin_100A_tfa``
        # coefficients.  'RT diff' is set to the experimental RT itself here,
        # not to the difference between prediction and experiment.
        logger.warning('Retention times are probably missing in input file.')
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']
    return df1, decoy2set
        '^(\s+[^\s]+){2}(\s+(?P<peptide>[A-Z]+))(\s+[^\s]+){10}(\s+(?P<affinity>[0-9]{1,2}\.[0-9]+))'
    )
    results = {}
    with open(output_location, 'r') as f:
        for line in f:
            match = regex.match(line)
            if match:
                peptide = match.group('peptide')
                affinity = float(match.group('affinity'))
                results[peptide] = affinity
            else:
                print('Could not match line: %s' % line)
    return results


mzid_parser = mzid.DataFrame(args.mzid_file)

netmhc_alleles = ['H-2-Kb']
"""
First, extract the targets and decoys from the PIN file. 
"""
target_peptides = set()
decoy_peptides = set()

with open(args.pin_file, 'r') as f:
    reader = csv.DictReader(f, delimiter='\t', restkey='Proteins')
    next(reader)
    for row in reader:
        label = row['Label'].strip()
        peptide = clean_peptide(row['Peptide'])
        if len(peptide) >= min_peptide_length and len(
Exemple #3
0
def prepare_dataframe(infile_path,
                      decoy_prefix=None,
                      decoy_infix=False,
                      cleavage_rule=False,
                      fdr=0.01,
                      decoy2set=None):
    """Load search results into a DataFrame and add the derived PSM columns.

    The parser is picked from the lower-cased file name: ``.pep.xml`` and
    ``.pepxml`` are read with ``pepxml.DataFrame``, ``.mzid`` with
    ``mzid.DataFrame``.  Engine-specific columns (Morpheus, MS-GF+,
    MSFragger-style protein fields) are mapped onto common column names,
    short peptides are dropped, decoys are flagged, and retention-time and
    mass-difference columns are added.

    Args:
        infile_path: Path of the input file; only its extension is inspected.
        decoy_prefix: Forwarded to ``is_decoy`` for every 'protein' entry.
        decoy_infix: Forwarded to ``is_decoy`` for every 'protein' entry.
        cleavage_rule: Passed as ``rule`` to ``parser.num_sites``.  Any falsy
            value is replaced by ``parser.expasy_rules['trypsin']``.
        fdr: FDR threshold (as a fraction) passed to ``filter_custom`` when
            selecting the PSMs used to calibrate the retention model.
        decoy2set: Collection of protein names forming the second decoy set.
            When ``None`` it is obtained from ``split_decoys(df1)``.

    Returns:
        The tuple ``(df1, decoy2set)``: the prepared DataFrame and the decoy
        set that was used (the given one, or the newly computed one).

    Raises:
        WrongInputError: The file name has none of the recognised extensions.
        EmptyFileError: The parsed DataFrame has zero rows.
        NoDecoyError: No row was flagged as decoy by ``is_decoy``.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']

    # ``ftype`` selects the modification parser further down.  It is bound up
    # front so that an mzIdentML file without MS-GF+ columns can never hit an
    # UnboundLocalError at the dispatch.
    ftype = None
    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # Rows with a zero score are removed first so that the reciprocal
        # below never divides by zero.
        df1 = df1[df1['Morpheus Score'] != 0]
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine: copy its columns onto the names used by the
        # rest of this function.
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        # (experimental m/z - calculated m/z) scaled by the charge.
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] -
            df1['calculatedMassToCharge']) * df1['assumed_charge']
        # calculated m/z * z - z * 1.00727649
        # (the constant is presumably the proton mass in Da - TODO confirm).
        df1['calc_neutral_pep_mass'] = df1['calculatedMassToCharge'] * df1[
            'chargeState'] - df1['chargeState'] * 1.00727649
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    # True when element 0 of every 'protein_descr' entry is None.
    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger: each 'protein' string is split once on whitespace into
        # [first token, remainder]; the first token becomes the protein entry
        # and the remainder the description.
        logger.debug('Adapting MSFragger DataFrame.')
        # Positional access: the frame is known to be non-empty, whereas a
        # row labelled 1 is not guaranteed to exist (e.g. one-row input).
        logger.debug('Proteins before: %s', df1['protein'].iloc[0])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        # Fall back to '' per protein, so one entry without a description
        # does not discard the descriptions of all the others.
        df1['protein_descr'] = protein.apply(
            lambda row: [x[1] if len(x) > 1 else '' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[0])

    # Where the description is missing, fall back to the protein entry itself.
    missing_descr = pd.isna(df1['protein_descr'])
    df1.loc[missing_descr, 'protein_descr'] = df1.loc[missing_descr,
                                                      'protein']

    # Drop rows without a peptide, then keep only peptides of length >= 6.
    df1 = df1[~pd.isna(df1['peptide'])]
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6]
    # Keep only the part of the spectrum title before the first ' RTINS'.
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])
    # 'RT exp' comes from 'retention_time_sec' (divided by 60) when present,
    # otherwise from 'scan start time' (copied as is), otherwise it is 0.
    # The source column is dropped once it has been copied.
    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop([
                'scan start time',
            ], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop([
            'retention_time_sec',
        ], axis=1)

    # Nearest-integer part of the mass difference, and the remainder after
    # subtracting massdiff_int * 1.003354, in ppm of the calculated mass
    # (1.003354 is presumably the isotope mass spacing - TODO confirm).
    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    df1['massdiff_ppm'] = 1e6 * (df1['massdiff'] - df1['massdiff_int'] *
                                 1.003354) / df1['calc_neutral_pep_mass']

    df1['decoy'] = df1['protein'].apply(is_decoy,
                                        decoy_prefix=decoy_prefix,
                                        decoy_infix=decoy_infix)
    if not df1['decoy'].sum():
        raise NoDecoyError()
    if decoy2set is None:
        # 'decoy1'/'decoy2' are read below, so split_decoys presumably adds
        # them to df1 in place - verify against its definition.
        decoy2set = split_decoys(df1)
    else:
        # A PSM belongs to the second decoy set only if all of its proteins
        # do; the remaining decoys form the first set.
        df1['decoy2'] = df1['protein'].apply(
            lambda p: all(x in decoy2set for x in p))
        df1['decoy1'] = df1['decoy'] & (~df1['decoy2'])
    # Helper defined outside this block; its result replaces df1.
    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    # Return value is ignored; presumably updates df1 in place - verify.
    prepare_mods(df1)

    # Filter with the first decoy set excluded; the share of second-set decoys
    # among all decoys is handed to the filter as ``ratio``.
    pep_ratio = df1['decoy2'].sum() / df1['decoy'].sum()
    df1_f = filter_custom(df1[~df1['decoy1']],
                          fdr=fdr,
                          key='expect',
                          is_decoy='decoy2',
                          reverse=False,
                          remove_decoy=False,
                          ratio=pep_ratio,
                          formula=1,
                          correction=None,
                          loglabel='PSMs default')
    num_psms_def = df1_f[~df1_f['decoy2']].shape[0]
    # Report the threshold that was actually applied rather than a fixed 1%
    # (for the default fdr=0.01 the message text is unchanged).
    logger.info(
        'Default target-decoy filtering, %g%% PSM FDR: Number of target PSMs = %d',
        fdr * 100, num_psms_def)
    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            retention_coefficients = achrom.get_RCs_vary_lcp(
                df1_f['peptide'].values, df1_f['RT exp'].values)
        df1_f['RT pred'] = df1_f['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        _, _, r_value, std_value = aux.linear_regression(
            df1_f['RT pred'], df1_f['RT exp'])
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        # Deliberate best-effort: any calibration failure switches to the
        # stock coefficients.  The traceback is kept at debug level so that
        # failures with another cause than missing RTs remain diagnosable.
        logger.warning('Retention times are probably missing in input file.')
        logger.debug('Retention model calibration failed.', exc_info=True)
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        # In this fallback 'RT diff' is the experimental RT itself, not the
        # difference between prediction and experiment.
        df1['RT diff'] = df1['RT exp']
    return df1, decoy2set
Exemple #4
0
def read_mzid(path):
    """Return the result of ``mzid.DataFrame`` for the file at *path*."""
    frame = mzid.DataFrame(path)
    return frame