Ejemplo n.º 1
0
def fill_protein_parameters(protein_records):
    """Attach freshly computed protein parameters to every protein record.

    For each record, ``protein_parameters`` is set to
    ``ProteinParameters(record.protein.sequence)``. Progress is reported through
    ``show_progress`` after every record and a newline is printed at the end.

    :param protein_records: sized collection of records exposing ``protein.sequence``;
        each record is modified in place.
    """
    progress_label = 'Filling protein parameters: '
    show_progress(progress_label, 35, 0.0)

    # enumerate from 1 so the reported fraction reaches 1.0 on the last record
    for position, record in enumerate(protein_records, start=1):
        record.protein_parameters = ProteinParameters(record.protein.sequence)
        show_progress(progress_label, 35, position / len(protein_records))

    print()
Ejemplo n.º 2
0
def save_proteins_to_csv(proteins, file_name):
    """Write proteins to a semicolon-separated text file.

    The file starts with the header line ``id;name;sequence`` followed by one
    line per protein built from its ``id``, ``name`` and ``sequence`` attributes.
    Values are written verbatim: no quoting or escaping is applied.

    :param proteins: sized collection of objects with string ``id``, ``name`` and
        ``sequence`` attributes.
    :param file_name: path of the file to create or overwrite.
    """
    progress_label = 'Saving proteins to \'{0}\': '.format(file_name)
    show_progress(progress_label, 40, 0.0)

    with open(file_name, 'w') as output:
        output.write('id;name;sequence\n')
        for position, protein in enumerate(proteins, start=1):
            row = ';'.join((protein.id, protein.name, protein.sequence))
            output.write(row + '\n')
            show_progress(progress_label, 40, position / len(proteins))

    print()
Ejemplo n.º 3
0
def load_proteins_from_csv(file_name):
    """Load proteins from a semicolon-separated file with a header row.

    The file is parsed with ``genfromtxt(..., names=True)``, so the columns are
    addressed by the header names ``id``, ``name`` and ``sequence``. Every field
    is passed through ``b2str`` before a ``Protein`` is constructed from it.

    :param file_name: path of the file to read.
    :return: list of ``Protein`` objects, one per data row, in file order.
    """
    # Local import keeps this fix self-contained; numpy is already a dependency
    # of this function through ``genfromtxt``.
    from numpy import atleast_1d

    label = 'Loading proteins from \'{0}\': '.format(file_name)
    show_progress(label, 40, 0.0)

    # genfromtxt returns a 0-d array when the file holds exactly one data row;
    # such an array can neither be iterated nor measured with len(), so it is
    # normalised to a 1-d array of rows first.
    data = atleast_1d(genfromtxt(file_name, dtype=None, delimiter=';', names=True))

    proteins = []
    for position, line in enumerate(data, start=1):
        proteins.append(Protein(id=b2str(line['id']), name=b2str(line['name']), sequence=b2str(line['sequence'])))
        show_progress(label, 40, position / len(data))
    print()

    return proteins
Ejemplo n.º 4
0
def save_protein_records_to_folder(protein_records, folder='results/'):
    """Write every protein record to its own text file inside ``folder``.

    Each record is serialised with ``str(record)`` into
    ``<folder><record.protein.id>.txt``. The folder is created when it does not
    exist yet. An empty ``folder`` string means the current working directory.

    :param protein_records: sized collection of records exposing ``protein.id``.
    :param folder: target directory; a trailing ``/`` is appended when missing.
    """
    # Local import: only needed for creating the target directory.
    import os

    # endswith() is safe for an empty string, unlike folder[-1]
    if folder and not folder.endswith('/'):
        folder += '/'

    label = 'Saving protein records to \'{0}\': '.format(folder)
    show_progress(label, 40, 0.0)

    # Without this, the very first run fails because 'results/' does not exist.
    if folder:
        os.makedirs(folder, exist_ok=True)

    for position, protein_record in enumerate(protein_records, start=1):
        with open(folder + protein_record.protein.id + '.txt', 'w') as output:
            output.write(str(protein_record))
        show_progress(label, 40, position / len(protein_records))
    print()
Ejemplo n.º 5
0
def fill_peptide_parameters(protein_records):
    """Compute peptide parameters for all received and missed peptide records.

    For every protein record, each peptide record in ``received_peptide_records``
    and then in ``missed_peptide_records`` gets its ``peptide_parameters`` set to
    ``PeptideParameters(peptide_record.peptide.sequence)``. Progress is printed
    per protein record.

    :param protein_records: sized collection of protein records; the peptide
        records they hold are modified in place.
    """
    print('Filling peptide parameters:')

    for protein_number, protein_record in enumerate(protein_records, start=1):
        print('Processing protein record #{0} of {1}:'.format(protein_number, len(protein_records)))
        stdout.flush()

        # Received peptides are always reported, even when there are none.
        received_records = protein_record.received_peptide_records
        label = '{0:>25}: '.format('Received peptides ({0})'.format(len(received_records)))
        show_progress(label, 40, 0.0)
        for peptide_number, peptide_record in enumerate(received_records, start=1):
            peptide_record.peptide_parameters = PeptideParameters(peptide_record.peptide.sequence)
            show_progress(label, 40, peptide_number / len(received_records))
        print()

        # Missed peptides (and the trailing blank separator line) are only
        # reported when the record actually has missed peptides.
        missed_records = protein_record.missed_peptide_records
        if len(missed_records) != 0:
            label = '{0:>25}: '.format('Missed peptides ({0})'.format(len(missed_records)))
            show_progress(label, 40, 0.0)
            for peptide_number, peptide_record in enumerate(missed_records, start=1):
                peptide_record.peptide_parameters = PeptideParameters(peptide_record.peptide.sequence)
                show_progress(label, 40, peptide_number / len(missed_records))
            print()

            print()

    print('Filling peptide parameters: done.')
Ejemplo n.º 6
0
def _build_peptide_parameter_frame(peptide_records, label):
    """Build a float64 DataFrame holding one row of scalar parameters per peptide record.

    Columns are ``peptide_parameter_names``. Values are collected in a plain
    array and wrapped into a DataFrame at the end, which avoids chained
    ``frame[column][row] = value`` assignment.

    :param peptide_records: list of peptide records with ``peptide_parameters`` set.
    :param label: progress label passed to ``show_progress``.
    :return: DataFrame with ``len(peptide_records)`` rows.
    :raises KeyError: if a filled column is not present in ``peptide_parameter_names``.
    """
    # (column name, attribute of peptide_parameters holding its value)
    scalar_columns = (
        ('Sequence length', 'sequence_length'),
        ('Aromaticity', 'aromaticity'),
        ('Instability', 'instability'),
        ('Isoelectric point', 'isoelectric_point'),
        ('Molecular weight', 'molecular_weight'),
        ('Kyte plot', 'kyte_plot'),
        ('Aliphatic index', 'aliphatic_index'),
        ('Boman index', 'boman_index'),
        ('Hydrophobicity', 'hydrophobicity'),
    )
    column_positions = {name: position for position, name in enumerate(peptide_parameter_names)}

    total = len(peptide_records)
    values = zeros((total, len(peptide_parameter_names)), dtype=float64)

    show_progress(label, 32, 0.0)
    # Rows are 0-based: the first peptide fills row 0 and the last one row total - 1.
    for row, peptide_record in enumerate(peptide_records):
        parameters = peptide_record.peptide_parameters

        for column_name, attribute_name in scalar_columns:
            values[row, column_positions[column_name]] = getattr(parameters, attribute_name)

        for kidera_factor in parameters.kidera_factors:
            column_name = 'Kidera factor: {0}'.format(kidera_factor['name'])
            values[row, column_positions[column_name]] = kidera_factor['value']

        show_progress(label, 32, (row + 1) / total)
    print()

    return DataFrame(values, columns=peptide_parameter_names)


def fill_parameter_lists(protein_records):
    """Collect scalar peptide parameters of all peptides into two DataFrames.

    Peptide records are taken in the order of ``protein_records`` and, within a
    protein record, in the order of its ``received_peptide_records`` /
    ``missed_peptide_records`` lists.

    :param protein_records: iterable of protein records whose peptide records
        already have ``peptide_parameters`` filled.
    :return: tuple ``(received_parameters, missed_parameters)`` of float64
        DataFrames with columns ``peptide_parameter_names`` and one row per peptide.
    """
    received_peptide_records = [peptide_record
                                for protein_record in protein_records
                                for peptide_record in protein_record.received_peptide_records]
    missed_peptide_records = [peptide_record
                              for protein_record in protein_records
                              for peptide_record in protein_record.missed_peptide_records]

    received_parameters = _build_peptide_parameter_frame(received_peptide_records,
                                                         'Filling received peptides parameter lists: ')
    missed_parameters = _build_peptide_parameter_frame(missed_peptide_records,
                                                       'Filling missed peptides parameter lists: ')

    return received_parameters, missed_parameters
Ejemplo n.º 7
0
def construct_protein_records(proteins, main_data):
    """Group the peptide matches found in ``main_data`` into protein records.

    Every line of ``main_data`` yields one ``PeptideMatch``. Matches are grouped
    first by protein (looked up in ``proteins`` by accession number) and then by
    peptide. Afterwards the received peptide records of every protein record
    are ordered by sequence length, longest first.

    :param proteins: collection searched with ``find_protein_with_id``.
    :param main_data: sized collection of rows addressable by column name.
    :return: list of ``ProteinRecord`` objects in order of first appearance.
    """
    label = 'Constructing protein records: '
    show_progress(label, 40, 0.0)
    protein_records = []

    # Stage 1: walk over all rows and distribute the matches
    for row_number, line in enumerate(main_data, start=1):
        # build the peptide and its match described by this row
        peptide = Peptide(sequence=b2str(line['sequence']))
        peptide_match = PeptideMatch(
            analysis_name=b2str(line['filename']),
            score=line['score'],
            reverse_score=line['reverseScore'],
            percent_of_scored_peak_intensity=line['percent_scored_peak_intensity'],
            total_intensity=line['totalIntensity'],
            precursor_averagine_chi_squared=line['precursorAveragineChiSquared'],
            retention_time_min=line['retentionTimeMin'],
            chromatographic_peak_width_in_seconds=line['chromatographicPeakWidthSec'])

        # resolve the protein this row belongs to
        protein_id = b2str(line['accession_number'])
        # TODO: if such protein not exists, extract Protein object and add to proteins
        protein = find_protein_with_id(proteins, protein_id)

        protein_record = find_protein_record_with_protein(protein_records, protein)
        if protein_record is None:
            # first match for this protein: open a new protein record
            new_peptide_record = PeptideRecord(peptide, [peptide_match])
            protein_records.append(ProteinRecord(protein, received_peptide_records=[new_peptide_record]))
        else:
            known_peptide_record = find_peptide_record_with_peptide(protein_record.received_peptide_records,
                                                                    peptide)
            if known_peptide_record is None:
                # peptide seen for the first time within this protein
                protein_record.received_peptide_records.append(PeptideRecord(peptide, [peptide_match]))
            else:
                # peptide already known: just register one more match
                known_peptide_record.matches.append(peptide_match)

        show_progress(label, 40, row_number / len(main_data))
    print()

    # Stage 2: order peptide records by sequence length, longest first
    def sequence_length(peptide_record):
        return len(peptide_record.peptide.sequence)

    label = 'Filling received peptide records: '
    show_progress(label, 35, 0.0)
    for record_number, protein_record in enumerate(protein_records, start=1):
        protein_record.received_peptide_records = sorted(protein_record.received_peptide_records,
                                                         key=sequence_length, reverse=True)
        show_progress(label, 35, record_number / len(protein_records))
    print()

    return protein_records
Ejemplo n.º 8
0
def construct_proteins(main_data):
    """Build the list of distinct proteins mentioned in ``main_data``.

    A ``Protein`` is created from the ``accession_number`` and ``entry_name``
    fields of every row and kept only if an equal protein (as decided by the
    ``in`` operator) is not in the result yet.

    :param main_data: sized collection of rows addressable by column name.
    :return: list of ``Protein`` objects in order of first appearance.
    """
    progress_label = 'Constructing proteins from main data: '
    unique_proteins = []
    show_progress(progress_label, 35, 0.0)

    for row_number, line in enumerate(main_data, start=1):
        candidate = Protein(id=b2str(line['accession_number']), name=b2str(line['entry_name']))

        # keep the first occurrence only
        already_known = candidate in unique_proteins
        if not already_known:
            unique_proteins.append(candidate)

        show_progress(progress_label, 35, row_number / len(main_data))
    print()

    return unique_proteins
Ejemplo n.º 9
0
def fill_missed_peptide_records(protein_records):
    """Populate ``missed_peptide_records`` of every protein record.

    The sequences of the received peptides are passed together with the protein
    sequence to ``cut_received_peptide_sequences``; each returned sequence is
    then split by ``trypsinolize_sequence`` and every resulting fragment is
    stored as a ``PeptideRecord``.

    :param protein_records: sized collection of protein records; modified in place.
    """
    progress_label = 'Filling missed peptide records: '
    show_progress(progress_label, 40, 0.0)

    for record_number, protein_record in enumerate(protein_records, start=1):
        # sequences that were actually received for this protein
        received_sequences = []
        for peptide_record in protein_record.received_peptide_records:
            received_sequences.append(peptide_record.peptide.sequence)

        # leftover parts of the protein sequence, each split into fragments
        leftovers = cut_received_peptide_sequences(protein_record.protein.sequence, received_sequences)
        fragment_lists = [trypsinolize_sequence(leftover) for leftover in leftovers]

        # one peptide record per fragment, in order
        protein_record.missed_peptide_records = []
        protein_record.missed_peptide_records.extend(
            PeptideRecord(peptide=Peptide(sequence=fragment))
            for fragment_list in fragment_lists
            for fragment in fragment_list)

        show_progress(progress_label, 40, record_number / len(protein_records))
    print()
Ejemplo n.º 10
0
def calculate_simple_statistics(parameters, per_peptide_correlations=None):
    """Compute simple statistics for every column of the given tables.

    ``calculate_simple_statistics_for_serie`` is applied to each column of
    ``parameters`` and, when given, of ``per_peptide_correlations``. If both
    tables share a column name, the value from ``per_peptide_correlations`` wins
    because it is processed last.

    :param parameters: table exposing ``columns`` and column access by name.
    :param per_peptide_correlations: optional second table of the same kind.
    :return: dict mapping column name to its computed statistics.
    """
    label = 'Calculating simple statistics: '
    show_progress(label, 40, 0.0)
    stats = {}

    # tables to process, in order
    tables = [parameters]
    if per_peptide_correlations is not None:
        tables.append(per_peptide_correlations)

    total_columns = 0
    for table in tables:
        total_columns += len(table.columns)

    processed = 0
    for table in tables:
        for column_name in table.columns:
            stats[column_name] = calculate_simple_statistics_for_serie(table[column_name])
            processed += 1
            show_progress(label, 40, processed / total_columns)
    print()

    return stats
Ejemplo n.º 11
0
def _collect_array_like_parameters(peptide_records, label):
    """Gather the array-like parameters of the given peptide records into matrices.

    Every returned DataFrame has one column per peptide record and one row per
    component of the parameter group, so ``DataFrame.corr`` correlates peptides
    with each other.

    :param peptide_records: list of peptide records with ``peptide_parameters`` set.
    :param label: progress label passed to ``show_progress``.
    :return: dict mapping the group title to its DataFrame.
    """
    amino_acids = 'AGVMDYNSWLFIKPQCERTH'
    secondary_structure_fraction_names = ['Helix', 'Turn', 'Sheet']

    total = len(peptide_records)
    kidera_factors = zeros((len(kidera_factor_names), total), dtype=float64)
    acid_percents = zeros((len(amino_acids), total), dtype=float64)
    acid_compounds = zeros((len(amino_acid_group_names), total), dtype=float64)
    # NOTE(review): presumably hydrophobic_moments_names does not list
    # 'Polygly-polypro helix', which is skipped below - confirm.
    hydrophobic_moments = zeros((len(hydrophobic_moments_names), total), dtype=float64)
    secondary_structure_fractions = zeros((len(secondary_structure_fraction_names), total), dtype=float64)

    show_progress(label, 35, 0.0)
    for column, peptide_record in enumerate(peptide_records):
        parameters = peptide_record.peptide_parameters

        for row, kidera_factor in enumerate(parameters.kidera_factors):
            kidera_factors[row, column] = kidera_factor['value']

        for row, acid in enumerate(amino_acids):
            acid_percents[row, column] = parameters.amino_acid_percents[acid]

        for row, group in enumerate(parameters.amino_acids_composition):
            acid_compounds[row, column] = group['percent']

        # Each kept moment gets its own row; the row counter only advances for
        # moments that are actually stored.
        kept_moments = [moment['moment'] for moment in parameters.hydrophobic_moments
                        if moment['name'] != 'Polygly-polypro helix']
        for row, moment in enumerate(kept_moments):
            hydrophobic_moments[row, column] = moment

        for row, fraction in enumerate(parameters.secondary_structure_fraction):
            secondary_structure_fractions[row, column] = fraction['value']

        show_progress(label, 35, (column + 1) / total)
    print()

    return {
        'Kidera factors': DataFrame(kidera_factors),
        'Amino acid percents': DataFrame(acid_percents),
        'Amino acid compositions': DataFrame(acid_compounds),
        'Hydrophobic moments': DataFrame(hydrophobic_moments),
        'Secondary structure fractions': DataFrame(secondary_structure_fractions),
    }


def fill_per_peptide_correlations(protein_records):
    """Compute pairwise per-peptide Pearson correlations of array-like parameters.

    For the received and the missed peptides separately, the Kidera factors,
    amino acid percents, amino acid compositions, hydrophobic moments and
    secondary structure fractions of all peptides are collected, correlated
    between peptides with ``DataFrame.corr(method='pearson')`` and converted
    with ``convert_correlation_matrix_to_serie``.

    :param protein_records: iterable of protein records whose peptide records
        already have ``peptide_parameters`` filled.
    :return: tuple ``(received_per_peptide_correlations,
        missed_per_peptide_correlations)`` of DataFrames pre-allocated with one
        row per unordered peptide pair and one column per entry of
        ``per_peptide_correlation_parameter_names``.
    """
    per_peptide_correlation_parameter_labels = ['{0} per peptide correlation (Pearson)'.format(name) for name in
                                                per_peptide_correlation_parameter_names]

    received_peptide_records = [peptide_record
                                for protein_record in protein_records
                                for peptide_record in protein_record.received_peptide_records]
    missed_peptide_records = [peptide_record
                              for protein_record in protein_records
                              for peptide_record in protein_record.missed_peptide_records]

    def empty_correlations(peptides_number):
        # one row per unordered pair of peptides: n * (n - 1) / 2
        pairs_number = peptides_number * (peptides_number - 1) // 2
        return DataFrame(zeros((pairs_number, len(per_peptide_correlation_parameter_labels)), dtype=float64),
                         columns=per_peptide_correlation_parameter_labels)

    received_per_peptide_correlations = empty_correlations(len(received_peptide_records))
    missed_per_peptide_correlations = empty_correlations(len(missed_peptide_records))

    received_matrices = _collect_array_like_parameters(
        received_peptide_records, 'Filling received peptides array-like parameter lists: ')
    missed_matrices = _collect_array_like_parameters(
        missed_peptide_records, 'Filling missed peptides array-like parameter lists: ')

    # (name used in the console message, title used for the column and the converter)
    parameter_groups = (
        ('Kidera factors', 'Kidera factors'),
        ('amino acid percents', 'Amino acid percents'),
        ('amino acid compositions', 'Amino acid compositions'),
        ('hydrophobic moments', 'Hydrophobic moments'),
        ('secondary structure fractions', 'Secondary structure fractions'),
    )
    peptide_sets = (
        ('received', received_per_peptide_correlations, received_matrices),
        ('missed', missed_per_peptide_correlations, missed_matrices),
    )

    for message_name, title in parameter_groups:
        for set_name, correlations, matrices in peptide_sets:
            print('Calculating {0} per peptide Pearson correlation ({1} peptides): '.format(message_name, set_name),
                  end='')
            correlations['{0} per peptide correlation (Pearson)'.format(title)] = \
                convert_correlation_matrix_to_serie(matrices[title].corr(method='pearson'), title)
            print('done')

    return received_per_peptide_correlations, missed_per_peptide_correlations