def run(self, database: DigestionDatabase) -> Optional[Set[DigestionSettings]]:
    """Fill the dialog from ``database``, show it modally and report the outcome.

    Returns whatever ``_generateDigestionSettings()`` produces when the dialog
    is accepted, and ``None`` otherwise.
    """
    # Rebuild the enzyme choices from the enzymes collection
    combo = self.enzymeComboBox
    combo.clear()
    for enzyme_name in enzymescollection.available_enzymes():
        combo.addItem(enzyme_name)

    # Rebuild the digestion settings table; sorting is switched off while the
    # rows are being filled and switched back on afterwards
    table = self.digestionSettingsTableWidget
    table.setRowCount(0)
    table.setSortingEnabled(False)
    for row, digestion in enumerate(database.available_digestions):
        table.insertRow(row)
        # Column order: enzyme name, missed cleavages, enzyme description
        cells = (
            digestion.enzyme,
            str(digestion.missed_cleavages),
            enzymescollection.enzyme(digestion.enzyme).description,
        )
        for column, text in enumerate(cells):
            table.setItem(row, column, QTableWidgetItem(text))
    table.setSortingEnabled(True)

    if self.exec() != QDialog.Accepted:
        return None
    return self._generateDigestionSettings()
def enzymeComboBoxCurrentTextChanged(self, text: str) -> None:
    """Show the name and description of the enzyme called ``text``.

    The description label is emptied when the enzymes collection reports
    ``text`` as an invalid enzyme.
    """
    try:
        selected = enzymescollection.enzyme(text)
    except enzymescollection.InvalidEnzymeError:
        label_text = ''
    else:
        label_text = f'<i>{selected.name}: {selected.description}</i>'
    self.enzymeDescriptionLabel.setText(label_text)
def is_coherent_with_enzymes_collection(self) -> bool:
    """Tell whether the stored digestions still match the enzymes collection.

    Every row of the ``digestions`` table must name an enzyme that the
    collection still offers, and the rule stored with it must equal the rule
    the collection currently has for that enzyme.
    """
    known_enzymes = enzymescollection.available_enzymes()
    stored = self._connection.execute('SELECT enzyme, rule FROM digestions')
    # The membership test comes first so the collection is only asked for
    # enzymes it actually knows about
    return all(
        entry['enzyme'] in known_enzymes
        and entry['rule'] == enzymescollection.enzyme(entry['enzyme']).rule
        for entry in stored
    )
def addPushButtonClicked(self):
    """Append the digestion chosen in the form to the digestion settings table.

    The enzyme comes from the enzyme combo box and the number of missed
    cleavages from the spin box. If an equal ``DigestionSettings`` is already
    among those returned by ``_generateDigestionSettings()``, an error message
    is shown and the table is left untouched. Otherwise a row is appended,
    selected and scrolled into view.
    """
    enzyme_name = self.enzymeComboBox.currentText()
    missed_cleavages = self.missedCleavagesSpinBox.value()
    digestion_settings = DigestionSettings(enzyme_name, missed_cleavages)

    if digestion_settings in self._generateDigestionSettings():
        commondialog.errorMessage(
            self, 'This digestion settings is already listed.')
        return

    # Build the cells before touching the table so that a failing enzyme
    # lookup cannot leave an empty row behind
    enzyme_item = QTableWidgetItem(enzyme_name)
    missed_cleavages_item = QTableWidgetItem(str(missed_cleavages))
    rule_item = QTableWidgetItem(
        enzymescollection.enzyme(enzyme_name).description)

    table = self.digestionSettingsTableWidget
    # With sorting enabled, the table may move the new row as soon as its
    # first cell is set, and the remaining cells would then be written to a
    # different row. Sorting is therefore suspended while the row is filled.
    sorting_was_enabled = table.isSortingEnabled()
    table.setSortingEnabled(False)
    try:
        row = table.rowCount()
        table.insertRow(row)
        table.setItem(row, 0, enzyme_item)
        table.setItem(row, 1, missed_cleavages_item)
        table.setItem(row, 2, rule_item)
    finally:
        table.setSortingEnabled(sorting_was_enabled)

    # Restoring the sort order may have relocated the row, so ask the item
    # where it ended up instead of reusing the insertion index
    table.selectRow(enzyme_item.row())
    table.scrollToItem(enzyme_item)
def _digest(self, digestion, callback=None, proteins_per_batch=10000) -> None:
    """Digest every sequence that has no peptide association yet for ``digestion``.

    Sequences are read in batches of ``proteins_per_batch``. Each one is
    cleaved with the enzyme named by ``digestion.enzyme``, its peptides are
    stored in the digestion's peptides table, and they are linked to the
    sequence through the association table in digestion order. The two
    indexes of the digestion are dropped before the work starts and created
    again at the end. The method returns early, leaving the indexes alone,
    when there is nothing left to digest.

    ``callback`` is stored as ``_progress_handler_function``; the task
    description and iteration counters on ``self`` are updated as the work
    advances.
    """
    tables = self._digestion_tables(digestion)
    protease = enzymescollection.enzyme(digestion.enzyme)

    self._progress_handler_function = callback
    self._maximum_task_iteration = 0
    self._current_task_iteration = 0
    self._current_task = 'Determining number of sequences to digest...'

    # Sequences already present in the association table are skipped
    not_yet_digested = (
        'FROM sequences WHERE sequences.id NOT IN '
        f'(SELECT DISTINCT {tables.peptides_association}.sequence_id '
        f'FROM {tables.peptides_association})'
    )

    # The number of sequences to process is the upper bound of the progress
    counted = self._connection.execute('SELECT COUNT(*) ' + not_yet_digested)
    self._maximum_task_iteration = counted.fetchone()[0]
    if not self._maximum_task_iteration:
        # Nothing to digest
        return

    plural = 's' if digestion.missed_cleavages > 1 else ''
    self._current_task_iteration = 0
    self._current_task = (
        f'Digesting database with {digestion.enzyme}, {digestion.missed_cleavages} '
        f'missed cleavage{plural}...'
    )

    # Indexes are dropped while inserting to speed up the digestion
    for index_name in (tables.peptides_table_index,
                       tables.peptides_association_index):
        self._connection.execute(f'DROP INDEX IF EXISTS {index_name}')

    store_peptide = (
        f'INSERT INTO {tables.peptides_table} (sequence, missed_cleavages) VALUES(?, ?) \n'
        'ON CONFLICT DO NOTHING'
    )
    link_peptide = (
        f'INSERT INTO {tables.peptides_association} (peptide_id, sequence_id) VALUES(?, ?)'
    )

    pending = self._connection.execute('SELECT id, sequence ' + not_yet_digested)
    # fetchmany() hands back an empty list once the cursor is exhausted
    for batch in iter(lambda: pending.fetchmany(proteins_per_batch), []):
        for record in batch:
            protein = AminoAcidSequence(record['sequence'], record['id'])
            peptides = tuple(
                protease.cleave(protein, digestion.missed_cleavages))

            self._connection.executemany(
                store_peptide,
                ((peptide.sequence, peptide.missed_cleavages)
                 for peptide in peptides))

            # Look the ids up again so the association rows can follow the
            # digestion order. Lookups go in slices of 900 sequences,
            # presumably to stay below SQLite's bound-parameter limit -
            # TODO confirm.
            id_by_sequence = {}
            for start in range(0, len(peptides), 900):
                wanted = tuple(
                    peptide.sequence for peptide in peptides[start:start + 900])
                placeholders = ','.join('?' * len(wanted))
                found = self._connection.execute(
                    f'SELECT id, sequence FROM {tables.peptides_table} '
                    f'WHERE sequence IN ({placeholders})',
                    wanted)
                for match in found:
                    id_by_sequence[match['sequence']] = match['id']

            self._connection.executemany(
                link_peptide,
                ((id_by_sequence[peptide.sequence], protein.id)
                 for peptide in peptides))

            self._current_task_iteration += 1

    # Indexes are put back to speed up searches
    self._maximum_task_iteration = 0
    self._current_task_iteration = 0
    self._current_task = (
        f'Creating index for digestion {digestion.enzyme}, {digestion.missed_cleavages} '
        f'missed cleavage{plural}...'
    )
    self._connection.execute(
        f'CREATE INDEX {tables.peptides_table_index} '
        f'ON {tables.peptides_table}(sequence)')
    self._connection.execute(
        f'CREATE INDEX {tables.peptides_association_index} '
        f'ON {tables.peptides_association}(peptide_id, sequence_id)')
def update_digestion(self, digestion_settings: Iterable[DigestionSettings], remove=False, callback=None, proteins_per_batch=10000) -> None:
    """Bring the digestions stored in the database in line with ``digestion_settings``.

    Digestions that are requested but not yet available are registered in the
    ``digestions`` table, get their peptides and association tables created,
    and are digested. They are processed ordered by enzyme, then by number of
    missed cleavages. When ``remove`` is true, available digestions that are
    not requested are deleted first and the database is vacuumed at the end.

    Args:
        digestion_settings: The digestions the database should contain.
        remove: Whether available digestions missing from
            ``digestion_settings`` are dropped.
        callback: Stored as ``_progress_handler_function`` and forwarded to
            ``_digest``.
        proteins_per_batch: Forwarded to ``_digest``.

    Raises:
        IncoherencyWithEnzymesCollectionError: If the database is not coherent
            with the enzymes collection.

    A ``sqlite3.OperationalError`` raised while dropping, creating or
    digesting is not propagated: the connection is rolled back,
    ``_end_of_task()`` is called and the method returns.
    """
    available_digestions = set(self.available_digestions)
    updated_digestions = set(digestion_settings)
    self._progress_handler_function = callback
    cleanup_needed = False

    # Testing the bare attribute is always true when it is a bound method, so
    # the check has to be called. A plain boolean (e.g. from a property) is
    # accepted as well.
    coherent = self.is_coherent_with_enzymes_collection
    if callable(coherent):
        coherent = coherent()
    if not coherent:
        raise IncoherencyWithEnzymesCollectionError

    # Removing unneeded digestions
    if remove:
        self._current_task_iteration = 0
        self._maximum_task_iteration = 0
        with self._connection:
            for digestion in available_digestions - updated_digestions:
                cleanup_needed = True
                self._current_task = (
                    f'Removing digestion {digestion.enzyme}, {digestion.missed_cleavages} '
                    f'missed cleavage{"s" if digestion.missed_cleavages > 1 else ""}...'
                )
                self._progress_handler()
                digestion_tables = self._digestion_tables(digestion)
                try:
                    # self._digestion_tables returns table names surrounded
                    # by ", which have to be stripped for the lookup by value
                    self._connection.execute(
                        'DELETE FROM digestions WHERE peptides_table = ?',
                        (digestion_tables.peptides_table[1:-1], ))
                    self._connection.execute(
                        f'DROP TABLE {digestion_tables.peptides_table}')
                    self._connection.execute(
                        f'DROP TABLE {digestion_tables.peptides_association}')
                except sqlite3.OperationalError:
                    self._connection.rollback()
                    self._end_of_task()
                    return

    # Adding digestions
    with self._connection:
        added_digestions = sorted(
            updated_digestions - available_digestions,
            key=lambda digestion: (digestion.enzyme,
                                   digestion.missed_cleavages))
        for digestion in added_digestions:
            # A uuid is used as the table name
            digestion_table_name = uuid.uuid4().hex

            # Register this digestion in the list of digestions
            enzyme = enzymescollection.enzyme(digestion.enzyme)
            self._connection.execute(
                'INSERT INTO digestions(enzyme, rule, missed_cleavages, peptides_table) '
                'VALUES(?, ?, ?, ?)',
                (enzyme.name, enzyme.rule, digestion.missed_cleavages,
                 digestion_table_name))

            # Get the table names (including many-to-many table name)
            digestion_tables = self._digestion_tables(digestion)

            # Create all the tables needed to store the digestion result
            try:
                self._connection.execute(
                    f'CREATE TABLE {digestion_tables.peptides_table}( '
                    'id INTEGER, sequence TEXT NOT NULL UNIQUE, '
                    'missed_cleavages INTEGER NOT NULL, PRIMARY KEY(id))')
                self._connection.execute(
                    f'CREATE TABLE {digestion_tables.peptides_association}( '
                    'peptide_id INTEGER, sequence_id INTEGER, '
                    'FOREIGN KEY(peptide_id) REFERENCES '
                    f'{digestion_tables.peptides_table}(id), '
                    'FOREIGN KEY(sequence_id) REFERENCES sequences(id))')
                self._digest(digestion,
                             callback=callback,
                             proteins_per_batch=proteins_per_batch)
            except sqlite3.OperationalError:
                self._connection.rollback()
                self._end_of_task()
                return

    if cleanup_needed:
        self._current_task = 'Cleaning up database...'
        self._connection.execute('VACUUM')

    self._end_of_task()