Python pI Examples, pyteomics.electrochem.pI Python Examples

Example #1

0

Show file

def compute_sequence_features(peptides_df):
    """
    Add sequence-derived feature columns to a peptide dataframe.

    The columns are written onto ``peptides_df`` itself, in this order:
    ``length``, the residue-group counts ``KR``, ``aromatic`` (F, W, Y),
    ``acids`` (D, E), ``aliphatic`` (A, I, L, M, V) and ``HGP`` (G, P, H),
    then ``isoelectric_point`` and ``gravy``.

    Parameters:
        peptides_df : df
            dataframe with a "sequence" column holding the peptides.

    Returns:
        df
            the same dataframe object, with the feature columns added.
    """
    seq = "sequence"

    def count_residues(letters):
        # Add up the per-letter occurrence counts, one Series per letter.
        total = peptides_df[seq].str.count(letters[0])
        for letter in letters[1:]:
            total = total + peptides_df[seq].str.count(letter)
        return total

    # features
    peptides_df["length"] = peptides_df[seq].apply(len)

    # amino acid counts: (output column, residues summed into it)
    residue_groups = (
        ("KR", "KR"),
        ("aromatic", "FWY"),
        ("acids", "DE"),
        ("aliphatic", "AILMV"),
        ("HGP", "GPH"),
    )
    for column, letters in residue_groups:
        peptides_df[column] = count_residues(letters)

    # sequence properties, computed one peptide at a time
    isoelectric_points = [
        electrochem.pI(peptide) for peptide in peptides_df[seq].values
    ]
    peptides_df["isoelectric_point"] = isoelectric_points
    gravy_scores = [
        electrochem.gravy(peptide) for peptide in peptides_df[seq].values
    ]
    peptides_df["gravy"] = gravy_scores
    return peptides_df

Example #2

0

Show file

File: Compare_pI_AAfreqs_proteins.py Project: ugaldelab/lab_tools

    # Fragment of a larger routine (its header is not shown here). For each
    # record in the FASTA file: count it, compute its isoelectric point and
    # tally its residues; then write a summary to the log.
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        aa_sequence = seq_record.seq

        # Strip trailing whitespace and convert to a plain string.
        aa_sequence = str(aa_sequence.rstrip())

        count += 1

        # Progress indicator: print the running total every 1000 records.
        if count % 1000 == 0:
            print count

        # Stop at 2,000 for testing purposes:
        #if count == 2000:
        #    break

        try:
            # pI is computed first, so if it raises PyteomicsError nothing is
            # appended and no residue of this record is counted.
            pI_count.append(electrochem.pI(aa_sequence))

            for aa in aa_sequence:
                aa_freqs[aa] += 1
                residue_count += 1

        except auxiliary.PyteomicsError:
            # Count the record as failed and move on to the next one.
            failed_count += 1
            continue

    # Per-dataset summary: records seen, records that failed, records used.
    logfile.write("For dataset " + name + "\n")
    logfile.write("A total of %d entries were found\n" % count)
    logfile.write("A total of %d entries had errors\n" % failed_count)
    logfile.write("A total of %d entries were used in the analysis\n\n" % (count-failed_count))

    # Keep this dataset's residue tallies under its name.
    freq_results[name] = aa_freqs

Example #3

0

Show file

File: peptide_fractionation_broberg.py Project: bertrand-lab/cobia

# Composition table for mass calculation: a copy of the standard table plus
# the 'Ac-', 'cam' and 'ox' labels.
aa_comp = dict(mass.std_aa_comp)
aa_comp.update([
    ('Ac-', mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})),
    ('cam', mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})),
    ('ox', mass.Composition({'O': 1})),
])

# Isoelectric point, charge at pH 7 and mass are computed for every peptide.
# The isoelectric point and charge are not used from this point on; they were
# used for examining other predictive components of apparent
# cofragmentation bias.

print('Calculating peptide physicochemical properties...')
iso_electric_points, pep_charges, pep_mass = [], [], []
i = 0

for peptide in mod_pep:
    # Evaluate the three properties in a fixed order before storing any.
    peptide_isoelectric_point, peptide_charge, peptide_mass = (
        electrochem.pI(peptide),
        electrochem.charge(peptide, 7),
        mass.calculate_mass(sequence=peptide, aa_comp=aa_comp),
    )
    pep_charges.append(peptide_charge)
    iso_electric_points.append(peptide_isoelectric_point)
    pep_mass.append(peptide_mass)
    i += 1

print('LC-retention time prediction with the following parameters:')

print(lc_params)

# Column length: first entry of the 'column_length' parameter, must be numeric.
column_length = lc_params['column_length'][0]
if not isinstance(column_length, numbers.Number):
    raise NameError('Error in parameter input file, column_length takes only Numeric.')

Example #4

0

Show file

File: feature_extraction.py Project: PigeonMark/Predicting-the-Kinetic-Class-of-Human-Herpesvirus-Genes-based-on-their-Protein-Sequences

 def add_p_i(self):
     """Store the isoelectric point of every 'sequence' entry in column 'pI'."""
     def isoelectric_point(peptide):
         return electrochem.pI(peptide)

     sequences = self.data_frame['sequence']
     self.data_frame['pI'] = sequences.apply(isoelectric_point)

Example #5

0

Show file

File: test_electrochem.py Project: wxlsummer/pyteomics

 def test_pI_precision(self):
     """Check that pI honours the requested precision.

     For every precision 10**-i with i in 0..15, the value computed with that
     ``precision_pI`` must lie within that precision of the value computed
     with ``precision_pI=1e-15``.
     """
     reference = pI('PEPTIDE', precision_pI=1e-15)
     for exponent in range(16):
         tolerance = 10 ** (-exponent)
         deviation = abs(pI('PEPTIDE', precision_pI=tolerance) - reference)
         self.assertTrue(deviation < tolerance)

Example #6

0

Show file

File: test_electrochem.py Project: wxlsummer/pyteomics

 def test_pI_calculations(self):
     """Check pI('H-AAA-OH') is within 0.01 of the midpoint of 2.34 and 9.69."""
     expected = (2.34 + 9.69) / 2.0
     error = abs(pI('H-AAA-OH') - expected)
     self.assertTrue(error < 0.01)

Example #7

0

Show file

File: _peptide_mod_biolccc_rt_prediction.py Project: bertrand-lab/cobia

def _lc_numeric_param(lc_params, name, message_end='.'):
    """Return the first-row value of LC parameter ``name``; it must be numeric.

    Raises:
        NameError: if the value is not a ``numbers.Number``. NameError is the
            exception type this module uses for parameter-file errors.
    """
    value = lc_params[name][0]
    if not isinstance(value, numbers.Number):
        raise NameError('Error in parameter input file, ' + name +
                        ' takes only Numeric' + message_end)
    return value


def _has_unusable_residue(peptide):
    """Return True if ``peptide`` contains '*', 'X' (unknown) or 'U' (selenocysteine)."""
    return any(symbol in peptide for symbol in ('*', 'X', 'U'))


def peptide_mod_biolccc_rt_prediction(lc_params_file, fasta_file_name,
                                      custom_gradient, output_name):
    """Digest a FASTA file with trypsin and predict peptide retention times.

    The sequences are cleaved with the trypsin rule, peptides shorter than
    5 residues or containing '*', 'X' or 'U' are dropped, B/Z are replaced by
    N/Q, and the fixed modifications 'ox' (on M) and 'cam' (on C) are applied.
    For every peptide the isoelectric point, the charge at pH 7, the mass and
    the BioLCCC retention time are computed, and the table is written to
    ``<output_name>_lc-retention-times.csv``.

    Parameters:
        lc_params_file: path of a CSV file whose columns are exactly
            column_length, column_diameter, column_pore_size,
            second_solvent_concentration_a, second_solvent_concentration_b,
            gradient_0, gradient_1, gradient_2, flow_rate, code_format,
            linear and model. Only the first row is used.
        fasta_file_name: path of the FASTA file to digest.
        custom_gradient: path of a CSV file describing a custom gradient; it
            is read only when the ``linear`` parameter is false. Each column
            supplies one gradient point: its row 0 and row 1 values are
            passed, in that order, to ``Gradient.addPoint``.
        output_name: prefix of the output CSV file name.

    Returns:
        None. The result is written to disk.

    Raises:
        NameError: if the parameter file has missing or unexpected columns,
            if a numeric parameter is not numeric, or if ``code_format`` or
            ``model`` holds an unsupported value.
    """
    lc_params = pd.read_csv(lc_params_file)

    all_required_params = [
        'column_length', 'column_diameter', 'column_pore_size',
        'second_solvent_concentration_a', 'second_solvent_concentration_b',
        'gradient_0', 'gradient_1', 'gradient_2', 'flow_rate', 'code_format',
        'linear', 'model'
    ]

    # Check if all parameters are in parameter input file.
    if sorted(all_required_params) != sorted(lc_params.keys()):
        raise NameError(
            'Error in parameter LC input file, check for typos or missing parameter.'
        )

    # Whether the fasta file holds amino acids ('aas'), genes or mRNA.
    # Note that code_format == 'genes' and code_format == 'rna' are not
    # functional yet.
    code_format = lc_params['code_format'][0]
    linear_gradient = lc_params['linear'][0]

    # if not a linear gradient, a gradient file must be supplied.
    if not linear_gradient:
        gradient_file = pd.read_csv(custom_gradient)

    # which type of model to use for prediction (from TFA or FA)
    model_type = lc_params['model'][0]

    if model_type == 'FA':
        print('formic acid')
    elif model_type == 'TFA':
        print('tri')

    # Tryptic peptides and, in lockstep, the header of the record each one
    # came from.
    seq_vec = []
    contig_vec = []

    # Header of the record currently being read:
    last_seq = None

    # The context manager closes the file even if parsing raises.
    with open(fasta_file_name, 'r') as fasta_in:
        for line in fasta_in:
            line = line.strip()
            # If the line is blank, move on.
            if len(line) == 0:
                continue
            # If the line is a header, record the header as last_seq
            if line[0] == ">":
                last_seq = line[1:]
                continue

            # Sequence line: obtain the amino acid sequence for the format.
            if code_format == 'genes':
                aa_line = Codon_to_Aminoacid(line)
            elif code_format == 'rna':
                aa_line = Codon_to_Aminoacid(line.replace('U', 'T'))
            elif code_format == 'aas':
                aa_line = line
            else:
                # Previously this surfaced as an UnboundLocalError (itself a
                # NameError) on the first sequence line.
                raise NameError(
                    "Error in parameter LC input file, code_format must be "
                    "'genes', 'rna' or 'aas'.")

            # Digest with trypsin:
            cleaved_line = pyteomics.parser.cleave(
                str(aa_line), pyteomics.parser.expasy_rules['trypsin'])

            # Peptides shorter than 5 amino acids are removed from the dataset:
            for tryp_pep in cleaved_line:
                if len(tryp_pep) < 5:
                    continue
                seq_vec.append(tryp_pep)
                contig_vec.append(last_seq)

    print('Removing xs and *s from seqs...')

    # Adding in the modification terms for the termini:
    seq_vec_terms = [central_pep + '-OH' for central_pep in seq_vec]

    # Drop peptides with '*', an unknown amino acid (X) or selenocysteine (U).
    # Peptides and contig names are filtered in one pass so they stay aligned.
    stars_removed_peps = []
    contig_vec_no_x = []
    for contig_name, peptide in zip(contig_vec, seq_vec_terms):
        if _has_unusable_residue(peptide):
            continue
        stars_removed_peps.append(peptide)
        contig_vec_no_x.append(contig_name)

    # Changing B to asparagine and Z to glutamine
    z_removed_peps = [
        peptide.replace('B', 'N').replace('Z', 'Q')
        for peptide in stars_removed_peps
    ]

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine, acetylation of N terminal (this one was done upstream)
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation:
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # Calculate peptide isoelectric points, masses, and charge at pH = 7.
    # The isoelectric point and charge are not used from this point on, but
    # were used for examining other predictive components of apparent
    # cofragmentation bias.
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = []
    pep_charges = []
    pep_mass = []

    for peptide in mod_pep:
        iso_electric_points.append(electrochem.pI(peptide))
        pep_charges.append(electrochem.charge(peptide, 7))
        pep_mass.append(
            mass.calculate_mass(sequence=peptide, aa_comp=aa_comp))

    print('LC-retention time prediction with the following parameters:')

    print(lc_params)

    # Numeric LC parameters; each one raises NameError if it is not a number.
    column_length = _lc_numeric_param(lc_params, 'column_length')
    column_diameter = _lc_numeric_param(lc_params, 'column_diameter')
    column_pore_size = _lc_numeric_param(lc_params, 'column_pore_size')
    second_solvent_concentration_a = _lc_numeric_param(
        lc_params, 'second_solvent_concentration_a')
    second_solvent_concentration_b = _lc_numeric_param(
        lc_params, 'second_solvent_concentration_b')
    gradient_0 = _lc_numeric_param(lc_params, 'gradient_0')
    gradient_1 = _lc_numeric_param(lc_params, 'gradient_1')
    gradient_2 = _lc_numeric_param(lc_params, 'gradient_2')
    # The flow_rate message has historically had no trailing period; it is
    # kept as-is in case callers match on the text.
    flow_rate = _lc_numeric_param(lc_params, 'flow_rate', message_end='')

    # biolccc predicting RT times
    myChromoConditions = biolccc.ChromoConditions()

    # The column length in mm.
    myChromoConditions.setColumnLength(column_length)

    # The internal column diameter in mm.
    myChromoConditions.setColumnDiameter(column_diameter)

    # The average pore size in A.
    myChromoConditions.setColumnPoreSize(column_pore_size)

    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component A in %.
    myChromoConditions.setSecondSolventConcentrationA(
        second_solvent_concentration_a)

    # The concentration of the eluting solvent (ACN for the reversed
    # phase) in component B in %.
    myChromoConditions.setSecondSolventConcentrationB(
        second_solvent_concentration_b)

    # The shape of the gradient. The linear case goes from gradient_0% to
    # gradient_1% of component B over gradient_2 minutes.
    if linear_gradient:
        myChromoConditions.setGradient(
            biolccc.Gradient(gradient_0, gradient_1, gradient_2))
    else:
        # Custom gradient: one point per column of the gradient file.
        # An earlier, static version added fixed points by hand, e.g.
        # (0.0, gradient_0), (15.0, 2.0), (30.0, 9.0), (45.0, 20.0),
        # (60.0, 36.0), (75.0, 56.0), (90.0, 81.0), (gradient_2, gradient_1),
        # for the Aylward testing dataset.
        myGradient = biolccc.Gradient()
        for set_point in range(len(gradient_file.columns)):
            myGradient.addPoint(gradient_file.iloc[0, set_point],
                                gradient_file.iloc[1, set_point])
        myChromoConditions.setGradient(myGradient)

    # The flow rate in ml/min.
    myChromoConditions.setFlowRate(flow_rate)

    # Designating BioLCCC model to use:
    if model_type == 'TFA':
        model_to_use = biolccc.rpAcnTfaChain
    elif model_type == 'FA':
        model_to_use = biolccc.rpAcnFaRod
    else:
        # Previously an unsupported value surfaced as an UnboundLocalError
        # (itself a NameError) when the first retention time was computed.
        raise NameError(
            "Error in parameter input file, model must be 'FA' or 'TFA'.")

    print('Calculating retention times...')
    retention_times = [
        biolccc.calculateRT(tryp_pep, model_to_use, myChromoConditions)
        for tryp_pep in mod_pep
    ]

    # Combining the sequences, times, and physicochemical characteristics.
    peptides_pd = pd.Series(z_removed_peps, name='peptide_sequence')
    peptide_rts = pd.Series(retention_times, name='rts')
    iso_electric_points_pd = pd.Series(iso_electric_points, name='iso_point')
    pep_charges_pd = pd.Series(pep_charges, name='charge')
    pep_mass_pd = pd.Series(pep_mass, name='mass')
    contig_pd = pd.Series(contig_vec_no_x, name='contig')

    peptide_dataframe = pd.concat([
        peptides_pd, peptide_rts, iso_electric_points_pd, pep_charges_pd,
        pep_mass_pd, contig_pd
    ],
                                  axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    peptide_dataframe.to_csv(file_name)

Example #8

0

Show file

def handcrafted_features(data, tags):
    """
    Build the handcrafted feature table for the 'tra' and 'trb' chains.

    For each chain the following frames are produced, in this order: one-hot
    encoded ``<chain>_vgene`` / ``<chain>_jgene``, CDR3 length, per-residue
    counts, average basicity, hydrophobicity and helicity, pI, average
    mutation stability, mass, and six groups of positional features keyed by
    the residue's offset from the centre of the CDR3. The ``weights``
    column, one ``labels_<tag>`` column per entry of ``tags`` and the
    ``split`` column are appended at the end, and everything is concatenated
    column-wise.

    NOTE: the per-chain scalar features are renamed to chain-independent
    names ('length', 'avg_basicity', 'avg_hydrophobicity', 'avg_helicity',
    'pI', 'avg_mutation_stability', 'mass'), so these names occur once per
    chain in the result.

    Parameters:
        data: dataframe with the columns ``<chain>_vgene``,
            ``<chain>_jgene`` and ``<chain>_cdr3`` for chain in
            ('tra', 'trb'), plus ``weights``, ``split`` and
            ``labels_<tag>`` for every tag.
        tags: iterable of label suffixes.

    Returns:
        dataframe with all features and the passthrough columns.
    """
    #
    # DOI 10.1007/s00251-017-1023-5
    # Code from https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    # Modified to apply handcrafted features twice, once to the alpha chain and again to the beta chain
    # Modified to handle split for training, validation, and test cohorts
    # Modified for multinomial classification
    #

    # physicochemical amino acid properties
    basicity = {
        'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6,
        'F': 212.1, 'G': 202.7, 'H': 223.7, 'I': 210.8, 'K': 221.8,
        'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4, 'Q': 214.2,
        'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1,
        'X': 210.2, 'Y': 213.1, 'Z': 214.9,
    }

    hydrophobicity = {
        'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50,
        'F': 5.00, 'G': -3.31, 'H': -4.63, 'I': 4.41, 'K': -5.00,
        'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92, 'Q': -2.76,
        'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88,
        'X': 4.59, 'Y': 2.00, 'Z': -2.13,
    }

    helicity = {
        'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85,
        'F': 1.26, 'G': 1.15, 'H': 0.97, 'I': 1.29, 'K': 0.88,
        'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96,
        'R': 0.95, 'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07,
        'X': 1.29, 'Y': 1.11, 'Z': 0.91,
    }

    # No entries for 'B', 'X' and 'Z' in this table.
    mutation_stability = {
        'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32,
        'G': 27, 'H': 15, 'I': 10, 'K': 24, 'L': 34,
        'M': 6, 'N': 6, 'P': 20, 'Q': 10, 'R': 17,
        'S': 10, 'T': 11, 'V': 17, 'W': 55, 'Y': 31,
    }

    def mean_of(table):
        # Sum of the per-residue table values divided by parser.length(seq).
        return lambda seq: sum([table[aa] for aa in seq]) / parser.length(seq)

    def positional(chain, placed, label, value_of):
        # One record mapping '<chain>_pos_<offset>_<label>' to the value of
        # the residue found at that offset.
        return {
            chain + '_pos_{}_{}'.format(pos, label): value_of(aa)
            for pos, aa in placed
        }

    # (key suffix, residue -> value) for the numeric positional features,
    # in the order their frames are appended.
    positional_sources = (
        ('basicity', basicity.__getitem__),
        ('hydrophobicity', hydrophobicity.__getitem__),
        ('helicity', helicity.__getitem__),
        ('pI', electrochem.pI),
        ('mutation_stability', mutation_stability.__getitem__),
    )

    # feature conversion and generation
    features_list = []

    for chain in ['tra', 'trb']:
        cdr3_column = chain + '_cdr3'
        cdr3 = data[cdr3_column]

        def named(series, label):
            # Turn a per-sequence Series into a one-column frame called `label`.
            return series.to_frame().rename(columns={cdr3_column: label})

        # one-hot encoding of the V and J genes
        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        gene_records = data[[chain + '_vgene',
                             chain + '_jgene']].to_dict(orient='records')
        encoded_genes = onehot_encoder.fit_transform(gene_records)
        features_list.append(
            pd.DataFrame(encoded_genes,
                         columns=onehot_encoder.feature_names_))

        # sequence length
        features_list.append(named(cdr3.apply(parser.length), 'length'))

        # number of occurences of each amino acid
        compositions = [
            parser.amino_acid_composition(sequence) for sequence in cdr3
        ]
        aa_counts = pd.DataFrame.from_records(compositions).fillna(0)
        aa_counts.columns = [
            chain + '_count_{}'.format(column) for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average) hydrophobicity,
        #                             (average) helicity, pI, (average) mutation stability
        features_list.append(
            named(cdr3.apply(mean_of(basicity)), 'avg_basicity'))
        features_list.append(
            named(cdr3.apply(mean_of(hydrophobicity)), 'avg_hydrophobicity'))
        features_list.append(
            named(cdr3.apply(mean_of(helicity)), 'avg_helicity'))
        features_list.append(named(cdr3.apply(electrochem.pI), 'pI'))
        features_list.append(
            named(cdr3.apply(mean_of(mutation_stability)),
                  'avg_mutation_stability'))

        # peptide mass
        features_list.append(named(cdr3.apply(mass.fast_mass), 'mass'))

        # positional features
        # amino acid occurence and physicochemical properties at a given position from the center
        pos_aa = []
        pos_numeric = [[] for _ in positional_sources]
        for sequence in cdr3:
            length = parser.length(sequence)
            start_pos = -1 * (length // 2)
            if length % 2 == 1:
                # Odd length: the middle residue sits at offset 0.
                pos_range = list(range(start_pos, start_pos + length))
            else:
                # Even length: offsets skip 0 and run -k..-1, 1..k.
                pos_range = list(range(start_pos, 0))
                pos_range += list(range(1, start_pos + length + 1))
            placed = list(zip(pos_range, sequence))

            pos_aa.append({
                chain + '_pos_{}_{}'.format(pos, aa): 1
                for pos, aa in placed
            })
            for bucket, (label, value_of) in zip(pos_numeric,
                                                 positional_sources):
                bucket.append(positional(chain, placed, label, value_of))

        for records in [pos_aa] + pos_numeric:
            features_list.append(pd.DataFrame.from_records(records).fillna(0))

    # passthrough columns: weights, one label column per tag, split
    features_list.append(data['weights'])
    features_list.extend(data['labels_' + tag] for tag in tags)
    features_list.append(data['split'])

    # combine all features
    return pd.concat(features_list, axis=1)

Example #9

0

Show file

def openms_modelled_rt(rtfilename, output_name):
    """Attach physicochemical properties to modelled retention times.

    Each line of the input file is split at its first space into a peptide
    sequence and a retention time. Peptides containing '*', 'X' or 'U' are
    dropped together with their retention time, B/Z are replaced by N/Q, and
    the fixed modifications 'ox' (on M) and 'cam' (on C) are applied. The
    isoelectric point, the charge at pH 7 and the mass are computed for each
    peptide, and the table is written to
    ``<output_name>_lc-retention-times.csv``.

    Parameters:
        rtfilename: path of the file with one "<peptide> <retention time>"
            entry per line.
        output_name: prefix of the output CSV file name.

    Returns:
        None. The output file name is printed and the table written to disk.
    """
    seq_rt_df = pd.read_csv(rtfilename, names=['seq_rt'])
    # `n` must be passed by keyword: pandas 2.x no longer accepts it
    # positionally in Series.str.split.
    df = pd.DataFrame(seq_rt_df.seq_rt.str.split(' ', n=1).tolist(),
                      columns=['pep_seq', 'rts'])

    print('Removing xs and *s from seqs...')

    # Drop peptides with '*', an unknown amino acid (X) or selenocysteine (U).
    # The retention time of a dropped peptide is dropped with it; otherwise
    # every later row would pair a peptide with the wrong retention time.
    stars_removed_peps = []
    kept_rts = []
    for central_pep, rt in zip(df['pep_seq'].tolist(), df['rts'].tolist()):
        # Adding in the modification term for the C terminus:
        peptide = central_pep + '-OH'
        if any(symbol in peptide for symbol in ('*', 'X', 'U')):
            continue
        stars_removed_peps.append(peptide)
        kept_rts.append(rt)

    # Changing B to asparagine and Z to glutamine
    z_removed_peps = [
        peptide.replace('B', 'N').replace('Z', 'Q')
        for peptide in stars_removed_peps
    ]

    # Modifying peptides: oxidation of methionine, carbamidomethylation of
    # cysteine, acetylation of N terminal (this one was done upstream)
    print('Modifying peptides...')
    mod_pep = []
    for tryp_pep in z_removed_peps:
        mod_pep.extend(
            pyteomics.parser.isoforms(tryp_pep,
                                      fixed_mods={
                                          'ox': ['M'],
                                          'cam': ['C']
                                      },
                                      show_unmodified_termini=True))

    # Modified amino acid dictionary for mass calculation
    aa_comp = dict(mass.std_aa_comp)
    aa_comp['Ac-'] = mass.Composition({'C': 2, 'H': 3, 'N': 0, 'O': 1, 'P': 0})
    aa_comp['cam'] = mass.Composition({'C': 2, 'H': 3, 'N': 1, 'O': 1, 'P': 0})
    aa_comp['ox'] = mass.Composition({'O': 1})

    # calculate peptide isoelectric points, masses, and charge at pH = 7
    print('Calculating peptide physicochemical properties...')
    iso_electric_points = []
    pep_charges = []
    pep_mass = []

    for peptide in mod_pep:
        iso_electric_points.append(electrochem.pI(peptide))
        pep_charges.append(electrochem.charge(peptide, 7))
        pep_mass.append(
            mass.calculate_mass(sequence=peptide, aa_comp=aa_comp))

    # Combining the sequences, times, and physicochemical characteristics.
    peptides_pd = pd.Series(z_removed_peps, name='peptide_sequence')
    peptide_rts = pd.Series(kept_rts, name='rts')
    iso_electric_points_pd = pd.Series(iso_electric_points, name='iso_point')
    pep_charges_pd = pd.Series(pep_charges, name='charge')
    pep_mass_pd = pd.Series(pep_mass, name='mass')

    peptide_dataframe = pd.concat([
        peptides_pd, peptide_rts, iso_electric_points_pd, pep_charges_pd,
        pep_mass_pd
    ],
                                  axis=1)

    file_name = output_name + '_lc-retention-times.csv'
    print(file_name)
    peptide_dataframe.to_csv(file_name)