Example #1
    def __init__(self, termination_event: Event,
                 protein_data_dir: pathlib.Path, log_dir_path: pathlib.Path,
                 statistics_write_period: int, number_of_threads: int,
                 enzyme_name: str, maximum_number_of_missed_cleavages: int,
                 minimum_peptide_length: int, maximum_peptide_length: int,
                 run_count: int):
        self.__log_file_path = log_dir_path.joinpath(f"digest_{run_count}.log")
        self.__unprocessible_proteins_embl_file_path = log_dir_path.joinpath(
            f"unprocessible_proteins_{run_count}.txt")
        self.__number_of_threads = number_of_threads
        # Bound the protein queue relative to the worker count
        self.__max_protein_queue_size = 3 * self.__number_of_threads
        EnzymeClass = get_digestion_enzyme_by_name(enzyme_name)
        self.__enzyme = EnzymeClass(maximum_number_of_missed_cleavages,
                                    minimum_peptide_length,
                                    maximum_peptide_length)
        # Collect both .txt and .dat protein data files as input
        # (glob already yields Path objects)
        self.__input_file_paths = list(protein_data_dir.glob('*.txt'))
        self.__input_file_paths += list(protein_data_dir.glob('*.dat'))
        self.__statistics_write_period = statistics_write_period
        self.__statistics_csv_file_path = log_dir_path.joinpath(
            f"statistics_{run_count}.csv")
        self.__termination_event = termination_event
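
A minimal usage sketch for this constructor. The enclosing class name is not shown in the excerpt, so `ProteinDigestion` below is a stand-in, and every argument value is illustrative only:

    # Hypothetical usage; `ProteinDigestion` is an assumed name for the
    # enclosing class, and all values below are illustrative.
    import pathlib
    from threading import Event

    digestion = ProteinDigestion(
        termination_event=Event(),
        protein_data_dir=pathlib.Path("./protein_data"),  # *.txt / *.dat inputs
        log_dir_path=pathlib.Path("./logs"),
        statistics_write_period=900,  # assumed: seconds between statistics rows
        number_of_threads=4,
        enzyme_name="Trypsin",
        maximum_number_of_missed_cleavages=2,
        minimum_peptide_length=5,
        maximum_peptide_length=60,
        run_count=0,
    )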
Example #2
    def __load_or_set_digestion_informations(self, database_url: str):
        """
        Loads the digestion information from previous digestions from the databsae and overrides the given ones.
        If no digestion information were found, it will save the current one.

        Parameters
        ----------
        database_url: str
            Database URL, e.g. postgres://username:password@host:port/database
        """
        database_connection = psycopg2.connect(database_url)
        with database_connection.cursor() as database_cursor:
            database_cursor.execute(
                "SELECT values FROM maintenance_information WHERE key = %s;",
                (MaintenanceInformation.DIGESTION_PARAMTERS_KEY, ))
            digestion_information_row = database_cursor.fetchone()
            if digestion_information_row:
                digestion_information_values = digestion_information_row[0]
                DigestEnzymeClass = get_digestion_enzyme_by_name(
                    digestion_information_values['enzyme_name'])
                self.__enzyme = DigestEnzymeClass(
                    digestion_information_values[
                        'maximum_number_of_missed_cleavages'],
                    digestion_information_values['minimum_peptide_length'],
                    digestion_information_values['maximum_peptide_length'])
            else:
                digestion_information_values = {
                    'enzyme_name': self.__enzyme.NAME,
                    'maximum_number_of_missed_cleavages':
                    self.__enzyme.max_number_of_missed_cleavages,
                    'minimum_peptide_length':
                    self.__enzyme.minimum_peptide_length,
                    'maximum_peptide_length':
                    self.__enzyme.maximum_peptide_length
                }
                database_cursor.execute(
                    "INSERT INTO maintenance_information (key, values) VALUES (%s, %s);",
                    (MaintenanceInformation.DIGESTION_PARAMTERS_KEY,
                     json.dumps(digestion_information_values)))
                database_connection.commit()
        database_connection.close()
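
This method presumes a `maintenance_information` table with a text `key` and a JSON `values` column. A sketch of that assumed schema, created through psycopg2 (the actual DDL is not part of this excerpt):

    # Sketch of the assumed schema only; the real DDL is not shown above.
    import psycopg2

    def ensure_maintenance_table(database_url: str):
        connection = psycopg2.connect(database_url)
        with connection.cursor() as cursor:
            # `values` is a SQL keyword, so it is quoted in the DDL.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS maintenance_information (
                    key TEXT PRIMARY KEY,
                    "values" JSONB NOT NULL
                );
            """)
        connection.commit()
        connection.close()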
Example #3
    def initial_digestion(self) -> List[Protein]:
        """
        Digests `text_files/proteins.txt` and verify the database

        Returns
        -------
        The peptides from the file.
        """
        work_dir = pathlib.Path(f"./tmp/{self.id()}_digest")
        test_files_path = pathlib.Path('./test_files')
        protein_data_test_file_path = test_files_path.joinpath('proteins.txt')
        self.prepare_workdir(work_dir, test_files_path,
                             protein_data_test_file_path)

        maintenance = DatabaseMaintenance(os.getenv("TEST_MACPEPDB_URL"),
                                          work_dir, 4, 5, 'Trypsin',
                                          TRYPSIN_MAX_MISSED_CLEAVAGES,
                                          TRYPSIN_MIN_PEPTIDE_LENGTH,
                                          TRYPSIN_MAX_PEPTIDE_LENGTH)

        maintenance.start()

        EnzymeClass = get_digestion_enzyme_by_name("Trypsin")
        trypsin = EnzymeClass(TRYPSIN_MAX_MISSED_CLEAVAGES,
                              TRYPSIN_MIN_PEPTIDE_LENGTH,
                              TRYPSIN_MAX_PEPTIDE_LENGTH)

        # Read proteins from file
        with protein_data_test_file_path.open("r") as protein_data_test_file:
            protein_file_reader = UniprotTextReader(protein_data_test_file)
            file_proteins = list(protein_file_reader)

        self.verify_database_integrity(file_proteins, trypsin)

        return file_proteins
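
`verify_database_integrity` is not shown in this excerpt; the kind of check it implies can be sketched by digesting the file proteins in memory, the same way the web API below calls `enzyme.digest(...)`. A sketch under that assumption, not the test's actual implementation:

    # Sketch: derive the expected peptide sequences from the file proteins.
    expected_sequences = set()
    for file_protein in file_proteins:
        for peptide in trypsin.digest(file_protein):
            expected_sequences.add(peptide.sequence)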
Example #4
    def digest():
        """
        Digest a given peptide/sequence, search the resulting peptides in the database and return matching and non-matching peptides in separate arrays.
        """
        data = request.get_json()
        errors = ApiDigestionController.check_digestion_parameters(data)

        if not "sequence" in data:
            errors["sequence"].append("cannot be empty")

        digestion_peptides = []
        database_peptides = []
        if len(errors) == 0:
            EnzymeClass = get_digestion_enzyme_by_name("trypsin")
            enzyme = EnzymeClass(data["maximum_number_of_missed_cleavages"],
                                 data["minimum_peptide_length"],
                                 data["maximum_peptide_length"])
            digestion_peptides = enzyme.digest(
                Protein("TMP", [], "TMP", "TMP", data["sequence"], [], [],
                        False, 0))

            if "do_database_search" in data and isinstance(
                    data["do_database_search"],
                    bool) and data["do_database_search"]:
                database_connection = get_database_connection()
                with database_connection.cursor() as database_cursor:
                    database_peptides = Peptide.select(
                        database_cursor,
                        WhereCondition(
                            ["(partition, mass, sequence) IN %s"], (tuple(
                                (peptide.partition, peptide.mass,
                                 peptide.sequence)
                                for peptide in digestion_peptides), )),
                        fetchall=True)
                database_peptides.sort(key=lambda peptide: peptide.mass)
                digestion_peptides = [
                    peptide for peptide in digestion_peptides
                    if peptide not in database_peptides
                ]

            digestion_peptides.sort(key=lambda peptide: peptide.mass)

        if len(errors) == 0:

            def json_stream() -> Iterator[bytes]:
                yield b"{\"database\": ["
                for peptide_idx, peptide in enumerate(database_peptides):
                    if peptide_idx > 0:
                        yield b","
                    yield from peptide.to_json()
                yield b"],\"digestion\": ["
                for peptide_idx, peptide in enumerate(digestion_peptides):
                    if peptide_idx > 0:
                        yield b","
                    yield from peptide.to_json()
                yield f"],\"count\": {len(database_peptides) +  len(digestion_peptides)}}}".encode(
                    "utf-8")

            return Response(json_stream(), content_type="application/json")

        else:
            return jsonify({"errors": errors}), 422
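
A hypothetical client call against this endpoint. The host and route below are assumptions (the excerpt only shows the view function); the payload keys and the error/response shapes follow the code above:

    import requests  # hypothetical client; host and route are assumed

    response = requests.post(
        "http://localhost:5000/api/digestion",  # assumed URL
        json={
            "sequence": "MKWVTFISLLFLFSSAYSR",
            "maximum_number_of_missed_cleavages": 2,
            "minimum_peptide_length": 5,
            "maximum_peptide_length": 60,
            "do_database_search": False,
        },
    )
    if response.status_code == 422:
        print(response.json()["errors"])  # validation errors, as returned above
    else:
        body = response.json()
        print(body["count"], "peptides,", len(body["digestion"]), "from digestion")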
Example #5
    def merge_digestion(
            self,
            initial_digest_file_proteins: List[Protein]) -> List[Protein]:
        """
        Digests `test_files/B0FIH3_merge.txt`
        which will merge B0FIH3 with a slightly updates version of the protein
        creating new peptides.
        Than the database will be verified.

        Returns
        -------
        List of proteins containing the proteins from the inital digest with the applied merges.
        """
        # Run digest with the updated B0FIH3.
        # The old protein is merged with the new one,
        # which has the accession 'NEWACC'.
        work_dir = pathlib.Path(f"./tmp/{self.id()}_merge")
        test_files_path = pathlib.Path('./test_files')
        protein_data_test_file_path = test_files_path.joinpath(
            'B0FIH3_merge.txt')
        self.prepare_workdir(work_dir, test_files_path,
                             protein_data_test_file_path)

        maintenance = DatabaseMaintenance(os.getenv("TEST_MACPEPDB_URL"),
                                          work_dir, 1, 5, 'Trypsin',
                                          TRYPSIN_MAX_MISSED_CLEAVAGES,
                                          TRYPSIN_MIN_PEPTIDE_LENGTH,
                                          TRYPSIN_MAX_PEPTIDE_LENGTH)

        maintenance.start()

        old_file_proteins_len = len(initial_digest_file_proteins)

        with protein_data_test_file_path.open("r") as protein_data_test_file:
            protein_file_reader = UniprotTextReader(protein_data_test_file)
            merged_file_proteins = list(protein_file_reader)

        # Remove all file proteins which were merged into the proteins from the merge file.
        # In this case only B0FIH3 should be removed.
        for merged_file_protein in merged_file_proteins:
            for secondary_accession in merged_file_protein.secondary_accessions:
                file_proteins_to_remove = []
                for file_protein in initial_digest_file_proteins:
                    if secondary_accession == file_protein.accession or secondary_accession in file_protein.secondary_accessions:
                        file_proteins_to_remove.append(file_protein)
                for file_protein in file_proteins_to_remove:
                    initial_digest_file_proteins.remove(file_protein)

        # One protein should be removed
        self.assertEqual(len(initial_digest_file_proteins),
                         old_file_proteins_len - 1)

        # Then add the merged proteins to the file proteins
        # and verify the database again.
        initial_digest_file_proteins = initial_digest_file_proteins + merged_file_proteins
        self.assertEqual(len(initial_digest_file_proteins),
                         old_file_proteins_len)

        EnzymeClass = get_digestion_enzyme_by_name("Trypsin")
        trypsin = EnzymeClass(TRYPSIN_MAX_MISSED_CLEAVAGES,
                              TRYPSIN_MIN_PEPTIDE_LENGTH,
                              TRYPSIN_MAX_PEPTIDE_LENGTH)

        self.verify_database_integrity(initial_digest_file_proteins, trypsin)

        return initial_digest_file_proteins
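
The nested removal loop above can be expressed as a set-based helper with the same behavior. A sketch; the helper name is new, not from the source:

    def remove_merged_proteins(proteins, merged_proteins):
        """Drop every protein whose primary or secondary accession appears
        among the secondary accessions of any merged protein."""
        merged_secondary = {
            accession
            for merged in merged_proteins
            for accession in merged.secondary_accessions
        }
        return [
            protein for protein in proteins
            if protein.accession not in merged_secondary
            and merged_secondary.isdisjoint(protein.secondary_accessions)
        ]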
Example #6
    def update_digestion(
        self, initial_and_merged_digest_file_proteins: List[Protein]
    ) -> List[Protein]:
        """
        Digests `test_files/NEWACC_updated.txt`
        which will add  B0FIJ1 to NEWACCs secondary accession (which ultimatly deletes and merges B0FIJ1 into NEWACC) 
        and also updating NEWACCs sequence without creating new peptides.
        Than the database will be verified.

        Returns
        -------
        List of proteins containing the proteins from the merge with the applied updates.
        """
        # Run digest with the updated NEWACC.
        # B0FIJ1 is merged into NEWACC via NEWACC's secondary accessions.
        work_dir = pathlib.Path(f"./tmp/{self.id()}_update")
        test_files_path = pathlib.Path('./test_files')
        protein_data_test_file_path = test_files_path.joinpath(
            'NEWACC_updated.txt')
        self.prepare_workdir(work_dir, test_files_path,
                             protein_data_test_file_path)

        maintenance = DatabaseMaintenance(os.getenv("TEST_MACPEPDB_URL"),
                                          work_dir, 1, 5, 'Trypsin',
                                          TRYPSIN_MAX_MISSED_CLEAVAGES,
                                          TRYPSIN_MIN_PEPTIDE_LENGTH,
                                          TRYPSIN_MAX_PEPTIDE_LENGTH)

        maintenance.start()

        old_file_proteins_len = len(initial_and_merged_digest_file_proteins)

        with protein_data_test_file_path.open("r") as protein_data_test_file:
            protein_file_reader = UniprotTextReader(protein_data_test_file)
            merged_file_proteins = list(protein_file_reader)

        self.assertEqual(len(merged_file_proteins), 1)
        newacc_upgraded_protein = merged_file_proteins.pop()

        # NEWACC will be updated, so replace it.
        newacc_idx = -1
        for file_protein_index, file_protein in enumerate(
                initial_and_merged_digest_file_proteins):
            if file_protein.accession == newacc_upgraded_protein.accession:
                newacc_idx = file_protein_index
                break

        # Guard against the -1 sentinel, which would otherwise silently
        # replace the last element.
        self.assertNotEqual(newacc_idx, -1)
        initial_and_merged_digest_file_proteins[
            newacc_idx] = newacc_upgraded_protein

        # Remove B0FIJ1 from file proteins
        file_proteins_to_remove = []
        for file_protein in initial_and_merged_digest_file_proteins:
            if file_protein.accession in newacc_upgraded_protein.secondary_accessions:
                file_proteins_to_remove.append(file_protein)

        for file_protein in file_proteins_to_remove:
            initial_and_merged_digest_file_proteins.remove(file_protein)

        # Length should be decreased by one
        self.assertEqual(len(initial_and_merged_digest_file_proteins),
                         old_file_proteins_len - 1)

        EnzymeClass = get_digestion_enzyme_by_name("Trypsin")
        trypsin = EnzymeClass(TRYPSIN_MAX_MISSED_CLEAVAGES,
                              TRYPSIN_MIN_PEPTIDE_LENGTH,
                              TRYPSIN_MAX_PEPTIDE_LENGTH)

        self.verify_database_integrity(initial_and_merged_digest_file_proteins,
                                       trypsin)

        return initial_and_merged_digest_file_proteins
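
Since each phase consumes the previous phase's return value, the three methods are presumably chained from a single test method. A sketch; the method name and enclosing TestCase are assumptions:

    # Hypothetical driver; only the call order is implied by the signatures.
    def test_database_maintenance(self):
        file_proteins = self.initial_digestion()
        file_proteins = self.merge_digestion(file_proteins)
        self.update_digestion(file_proteins)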