Example #1
    def test_get_aa_sequence_mass(self):
        aa_sequence = 'ARNDCEQGHILKMFPSTWYV'
        # Monoisotopic residue masses in Daltons, keyed by one-letter code.
        aa_masses = {
            'G': 57.02146,
            'A': 71.03711,
            'S': 87.03203,
            'P': 97.05276,
            'V': 99.06841,
            'T': 101.04768,
            'C': 103.00919,
            'L': 113.08406,
            'I': 113.08406,
            'N': 114.04293,
            'D': 115.02694,
            'Q': 128.05858,
            'K': 128.09496,
            'E': 129.04259,
            'M': 131.04049,
            'H': 137.05891,
            'F': 147.06841,
            'R': 156.10111,
            'Y': 163.06333,
            'W': 186.07931,
        }
        actual = mass.get_aa_sequence_mass(aa_sequence, aa_masses=aa_masses)
        # The expected mass is the sum of the twenty residue masses above.
        expected = 2376.11432
        # assertEquals is deprecated; compare floats with a tolerance.
        self.assertAlmostEqual(actual, expected, places=5)
Example #2
    def process_protein_batch(self, batch, taxon, logger=None):
        """ Process a batch of proteins with the given digest. """
        if not batch:
            return
        if not logger:
            logger = self.logger
        # Get existing proteins by searching for sequences.
        existing_proteins = {}
        for protein in (self.session.query(Protein).filter(
                Protein.sequence.in_(
                    [sequence for metadata, sequence in batch]))):
            existing_proteins[protein.sequence] = protein

        # Initialize collection of undigested proteins.
        undigested_proteins = {}
        digested_proteins = {}
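        # digested_proteins will hold proteins that already have a
        # ProteinDigest record for this digest; any existing protein not in
        # it is queued for digestion below.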
        if existing_proteins:
            for protein in (self.session.query(Protein).filter(
                    Protein.id.in_([
                        protein.id for protein in existing_proteins.values()
                    ])).join(ProteinDigest).filter(
                        ProteinDigest.digest == self.digest)):
                digested_proteins[protein.sequence] = protein
        for protein in existing_proteins.values():
            if protein.sequence not in digested_proteins:
                undigested_proteins[protein.sequence] = protein

        # Create proteins which do not exist in the db and add to undigested
        # collection.
        num_new_proteins = 0
        for metadata, sequence in batch:
            if sequence not in existing_proteins:
                try:
                    mass = get_aa_sequence_mass(sequence)
                    protein = Protein(sequence=sequence, mass=mass)
                except Exception as e:
                    logger.exception("Error processing protein, skipping")
                    continue
                self.session.add(protein)
                num_new_proteins += 1
                undigested_proteins[sequence] = protein
                existing_proteins[sequence] = protein
        logger.info("creating %s new proteins..." % (num_new_proteins))
        self.session.commit()
        self.stats['Protein'] += num_new_proteins

        # Digest undigested proteins.
        if undigested_proteins:
            num_undigested = len(undigested_proteins)
            logger.info("digesting %s proteins" % num_new_proteins)
            undigested_batch = {}
            peptide_counter = 0
            protein_digests = []
            for protein in undigested_proteins.values():
                protein_digest = ProteinDigest(protein=protein,
                                               digest=self.digest)
                protein_digests.append(protein_digest)
                peptide_sequences = cleave(
                    protein.sequence,
                    self.digest.protease.cleavage_rule,
                    self.digest.max_missed_cleavages,
                    min_acids=self.digest.min_acids,
                    max_acids=self.digest.max_acids,
                )
                peptide_counter += len(peptide_sequences)
                undigested_batch[protein] = {
                    'peptide_sequences': peptide_sequences,
                    'protein_digest': protein_digest,
                }
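                # Flush to the peptide pipeline once roughly 10k peptides
                # have accumulated, to bound memory use.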
                if peptide_counter > 1e4:
                    self.process_peptide_batch(undigested_batch, logger)
                    # Reset the sub-batch so flushed proteins are not
                    # re-processed by the final call below.
                    undigested_batch = {}
                    peptide_counter = 0
            if undigested_batch:
                self.process_peptide_batch(undigested_batch, logger)

        # Create taxon protein instances in bulk.
        taxon_protein_dicts = []
        for metadata, sequence in batch:
            try:
                protein = existing_proteins[sequence]
            except Exception as e:
                logger.exception("Error processing protein, sequence does not"
                                 " exist in db, skipping")
                continue
            taxon_protein_dicts.append({
                'protein_id': protein.id,
                'taxon_id': taxon.id,
                'metadata': metadata,
            })
        logger.info("Creating %s new taxon proteins..." %
                    (len(taxon_protein_dicts)))
        self.session.execute(db.tables['TaxonProtein'].insert(),
                             taxon_protein_dicts)
        self.session.commit()
        self.stats['TaxonProtein'] += len(taxon_protein_dicts)
Example #3
    def process_peptide_batch(self, batch, logger=None):
        if not logger:
            logger = self.logger

        # Assemble combined peptide sequences and protein digests.
        combined_peptide_sequences = set()
        combined_protein_digests = []
        for protein, data in batch.items():
            for sequence in data['peptide_sequences']:
                combined_peptide_sequences.add(sequence)
            combined_protein_digests.append(data['protein_digest'])

        # Add protein digests to db.
        logger.info("Creating %s new protein digests..." %
                    (len(combined_protein_digests)))
        self.session.add_all(combined_protein_digests)
        self.session.commit()
        self.stats['ProteinDigest'] += len(combined_protein_digests)

        # Get existing peptides.
        existing_peptides = {}
        existing_peptides_batch = []
        existing_peptides_counter = 0
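        # Look up existing peptides in chunks of 500 so each call to
        # update_existing_peptides_ (presumably an IN-clause query) stays
        # small.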
        for sequence in combined_peptide_sequences:
            existing_peptides_counter += 1
            existing_peptides_batch.append(sequence)
            if (existing_peptides_counter % 500) == 0:
                self.update_existing_peptides_(existing_peptides_batch,
                                               existing_peptides)
                existing_peptides_batch = []
        self.update_existing_peptides_(existing_peptides_batch,
                                       existing_peptides)

        # Create non-existent peptides in bulk.
        num_new_peptides = 0
        peptide_dicts = []
        for sequence in combined_peptide_sequences:
            if sequence not in existing_peptides:
                num_new_peptides += 1
                mass = get_aa_sequence_mass(sequence)
                peptide_dicts.append({
                    'sequence': sequence,
                    'mass': mass,
                })
        logger.info("Creating %s new peptides..." % num_new_peptides)
        self.session.execute(db.tables['Peptide'].insert(), peptide_dicts)
        self.session.commit()
        self.stats['Peptide'] += num_new_peptides

        # Get newly created peptide objects and add to existing peptides.
        created_peptides_batch = []
        created_peptides_counter = 0
        for peptide_dict in peptide_dicts:
            created_peptides_counter += 1
            created_peptides_batch.append(peptide_dict['sequence'])
            if (created_peptides_counter % 500) == 0:
                self.update_existing_peptides_(created_peptides_batch,
                                               existing_peptides)
                created_peptides_batch = []
        self.update_existing_peptides_(created_peptides_batch,
                                       existing_peptides)

        # Create histogram of peptide sequence occurrences for each protein.
        num_peptide_instances = 0
        for protein, data in batch.items():
            peptides_histogram = defaultdict(int)
            for sequence in data['peptide_sequences']:
                peptides_histogram[sequence] += 1
            data['peptide_histogram'] = peptides_histogram
            # Update number of peptide instances.
            num_peptide_instances += len(peptides_histogram)

        # Create protein digest peptide instances in bulk.
        logger.info("Creating %s new protein digest peptides..." %
                    (num_peptide_instances))
        pdp_batch = []
        pdp_counter = 0
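        # Accumulate association rows and flush them to the db in sub-batches
        # of 10k to keep transactions small.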
        for protein, data in batch.items():
            for sequence, count in data['peptide_histogram'].items():
                pdp_counter += 1
                peptide = existing_peptides[sequence]
                pdp_batch.append({
                    'peptide_id': peptide.id,
                    'protein_digest_id': data['protein_digest'].id,
                    'count': count,
                })
                if (pdp_counter % 10000) == 0:
                    self.session.execute(
                        db.tables['ProteinDigestPeptide'].insert(), pdp_batch)
                    self.session.commit()
                    # Clear the batch so flushed rows are not re-inserted by
                    # the final execute below.
                    pdp_batch = []
        if pdp_batch:
            self.session.execute(db.tables['ProteinDigestPeptide'].insert(),
                                 pdp_batch)
            self.session.commit()
        self.stats['ProteinDigestPeptide'] += num_peptide_instances
Example #4
    def process_peptide_batch(self,
                              metagenome_sequence_digests_dict,
                              logger=None):
        if not logger:
            logger = self.logger
        # Assemble combined peptide sequences and metagenome digests. Each
        # metagenome sequence can have many peptides.
        combined_peptide_sequences = set()
        for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
            for sequence in data['peptide_sequences']:
                combined_peptide_sequences.add(sequence)
        # Peptides returned by the bulk insert below are collected here.
        existing_peptides = {}
        # Create non-existent peptides in bulk.
        start_time = time.time()

        num_new_peptides = 0
        peptide_sequences = []
        peptide_masses = []
        for sequence in combined_peptide_sequences:
            num_new_peptides += 1
            # Calculate the mass of the peptide.
            mass = get_aa_sequence_mass(sequence)
            peptide_sequences.append(sequence)
            peptide_masses.append(mass)

        logger.info("Creating %s new peptides..." % num_new_peptides)
        cur = db.get_psycopg2_cursor()
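        # peptide_insert appears to be a stored procedure that bulk-inserts
        # the parallel sequence/mass arrays and returns a row for every input
        # sequence, since existing_peptides is populated only from its
        # results.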
        cur.execute("select * from peptide_insert(%s, %s);",
                    (peptide_sequences, peptide_masses))

        for record in cur:
            try:
                peptide = Peptide(
                    id=record[0],
                    sequence=record[1],
                )
                existing_peptides[peptide.sequence] = peptide
            except Exception as e:
                logger.exception("Error processing peptide, skipping")
                continue
        total_time = time.time() - start_time
        self.total_peptide_time = self.total_peptide_time + total_time
        logger.info("peptide time elapsed: %s" % (total_time))

        self.stats['Peptide'] += num_new_peptides
        # Create histogram of peptide sequence occurrences for each protein.
        num_peptide_instances = 0
        for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
            peptides_histogram = defaultdict(int)
            for sequence in data['peptide_sequences']:
                peptides_histogram[sequence] += 1
            data['peptide_histogram'] = peptides_histogram
            # Update number of peptide instances.
            num_peptide_instances += len(peptides_histogram)
        # Create protein digest peptide instances in bulk.

        start_time = time.time()
        pdp_peptide_ids = []
        pdp_metagenome_sequence_ids = []
        pdp_digest_ids = []
        pdp_peptide_count = []
        pdp_counter = 0
        for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
            for sequence, count in list(data['peptide_histogram'].items()):
                pdp_counter += 1
                peptide = existing_peptides[sequence]
                pdp_peptide_ids.append(peptide.id)
                pdp_metagenome_sequence_ids.append(
                    data['metagenome_sequence'].id)
                pdp_digest_ids.append(data['digest'].id)
                pdp_peptide_count.append(count)
        cur.execute(
            "select metagenome_sequence_digest_peptide_insert(%s, %s, %s, %s);",
            (pdp_peptide_ids, pdp_metagenome_sequence_ids, pdp_digest_ids,
             pdp_peptide_count))
        db.psycopg2_connection.commit()
        total_time = time.time() - start_time
        logger.info("protein digest time elapsed: %s" % (total_time))
        self.stats['ProteinDigestPeptide'] += num_peptide_instances
Example #5
    def process_peptide_batch(self, batch, logger=None):
        if not logger:
            logger = self.logger

        # Assemble combined peptide sequences and protein digests.
        combined_peptide_sequences = set()

        protein_ids = []
        digest_ids = []
        protein_digests = []
        protein_digests_dict = {}
        for proteinId, data in list(batch.items()):
            for sequence in data['peptide_sequences']:
                combined_peptide_sequences.add(sequence)

            pd = data['protein_digest']
            protein_ids.append(pd.protein.id)
            digest_ids.append(pd.digest.id)

        cur = db.get_psycopg2_cursor()
        cur.execute("select * from protein_digest_insert(%s, %s);",
                    (protein_ids, digest_ids))
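        # Each record returned by protein_digest_insert appears to be
        # (protein_digest_id, protein_id, digest_id); the protein id is used
        # to look the batch entry back up.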

        # Iterate through the protein_digest records returned from the insert
        # and build ProteinDigest objects.
        for record in cur:

            try:
                protein_digest = ProteinDigest(id=record[0],
                                               protein=record[1],
                                               digest=record[2])

                protein_digests.append(protein_digest)
                batch_record = batch.get(record[1])
                protein_digests_dict[record[1]] = {
                    'peptide_sequences': batch_record['peptide_sequences'],
                    'protein_digest': protein_digest,
                }
            except Exception as e:
                logger.exception("Error processing protein digest, skipping")
                continue

        db.psycopg2_connection.commit()

        self.stats['ProteinDigest'] += len(protein_digests)

        # Get existing peptides.
        existing_peptides = {}

        # Create non-existent peptides in bulk.
        start_time = time.time()
        num_new_peptides = 0
        peptide_sequences = []
        peptide_masses = []
        for sequence in combined_peptide_sequences:
            num_new_peptides += 1
            mass = get_aa_sequence_mass(sequence)
            peptide_sequences.append(sequence)
            peptide_masses.append(mass)
        logger.info("Creating %s new peptides..." % num_new_peptides)
        cur = db.get_psycopg2_cursor()
        cur.execute("select * from peptide_insert(%s, %s);",
                    (peptide_sequences, peptide_masses))
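        # As in the previous example, each returned record carries the
        # peptide id and sequence for every input sequence.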
        for record in cur:
            try:
                peptide = Peptide(
                    id=record[0],
                    sequence=record[1],
                )
                existing_peptides[peptide.sequence] = peptide
            except Exception as e:
                logger.exception("Error processing peptide, skipping")
                continue

        self.stats['Peptide'] += num_new_peptides
        # Create histogram of peptide sequence occurrences for each protein.
        num_peptide_instances = 0

        for proteinId, data in list(protein_digests_dict.items()):
            peptides_histogram = defaultdict(int)
            for sequence in data['peptide_sequences']:
                peptides_histogram[sequence] += 1
            data['peptide_histogram'] = peptides_histogram
            # Update number of peptide instances.
            num_peptide_instances += len(peptides_histogram)
        total_time = time.time() - start_time
        logger.info("peptide time elapsed: %s" % (total_time))
        # Create protein digest peptide instances in bulk.
        logger.info("Creating %s new protein digest peptides..." %
                    (num_peptide_instances))

        start_time = time.time()
        pdp_peptide_ids = []
        pdp_protein_digest_ids = []
        pdp_peptide_count = []
        pdp_counter = 0
        for proteinId, data in list(protein_digests_dict.items()):
            for sequence, count in list(data['peptide_histogram'].items()):
                pdp_counter += 1
                peptide = existing_peptides[sequence]
                pdp_peptide_ids.append(peptide.id)
                pdp_protein_digest_ids.append(data['protein_digest'].id)
                pdp_peptide_count.append(count)
        total_time = time.time() - start_time
        logger.info("protein digest loop time elapsed: %s" % (total_time))
        cur = db.get_psycopg2_cursor()
        cur.execute(
            "select protein_digest_peptide_insert(%s, %s, %s);",
            (pdp_peptide_ids, pdp_protein_digest_ids, pdp_peptide_count))
        db.psycopg2_connection.commit()
        total_time = time.time() - start_time
        logger.info("protein digest time elapsed: %s" % (total_time))
        self.stats['ProteinDigestPeptide'] += num_peptide_instances
Example #6
    def process_protein_batch(self, batch, taxon, logger=None):
        """ Process a batch of proteins with the given digest. """
        if not batch:
            return
        if not logger:
            logger = self.logger
        # Get existing proteins by searching for sequences.
        existing_proteins = {}
        existing_protein_ids = []
        cur = db.get_psycopg2_cursor()
        sequences = []

        for metadata, sequence in batch:
            sequences.append(sequence)

        cur.execute("select * from protein where protein.sequence in %s",
                    (tuple(sequences), ))
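        # psycopg2 adapts the Python tuple to a parenthesized list of values
        # for the SQL IN clause.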

        for record in cur.fetchall():
            protein = Protein(id=record[0], sequence=record[1], mass=record[2])
            existing_proteins[protein.sequence] = protein
            existing_protein_ids.append(record[0])
        db.psycopg2_connection.commit()
        # Initialize collection of undigested proteins.
        undigested_proteins = {}
        digested_proteins = {}
        protein_sequences = []
        protein_masses = []
        # TODO: convert this query to a stored procedure.
        if existing_proteins:
            cur = db.get_psycopg2_cursor()
            cur.execute(
                "select * from protein"
                " join protein_digest on protein.id = protein_digest.protein_id"
                " where protein.id in %s and protein_digest.digest_id = %s",
                (
                    tuple(existing_protein_ids),
                    self.digest.id,
                ))

            for record in cur.fetchall():
                protein = Protein(id=record[0],
                                  sequence=record[1],
                                  mass=record[2])
                digested_proteins[protein.sequence] = protein
            db.psycopg2_connection.commit()
        for protein in list(existing_proteins.values()):
            if protein.sequence not in digested_proteins:
                undigested_proteins[protein.sequence] = protein

        # Create proteins which do not exist in the db and add to undigested
        # collection.

        start_time = time.time()
        num_new_proteins = 0
        for metadata, sequence in batch:
            # Skip proteins that already exist in the db.
            if sequence in existing_proteins:
                continue
            try:
                mass = get_aa_sequence_mass(sequence)
            except Exception as e:
                logger.exception("Error processing protein, skipping")
                continue
            # Add the sequence and mass to their respective lists to be
            # passed to the postgres stored procedure, deduplicating within
            # the batch.
            if sequence not in protein_sequences:
                num_new_proteins += 1
                protein_sequences.append(sequence)
                protein_masses.append(mass)

        logger.info("creating %s new proteins..." % (num_new_proteins))
        cur = db.get_psycopg2_cursor()
        cur.execute("select * from protein_insert(%s, %s);",
                    (protein_sequences, protein_masses))
        # Iterate through the protein records returned from the insert and
        # build Protein objects.
        for record in cur:
            try:
                protein = Protein(id=record[0],
                                  sequence=record[1],
                                  mass=record[2])
            except Exception as e:
                logger.exception("Error processing protein, skipping")
                continue
            undigested_proteins[record[1]] = protein
            existing_proteins[record[1]] = protein

        db.psycopg2_connection.commit()
        total_time = time.time() - start_time
        logger.info("time elapsed: %s" % (total_time))
        self.stats['Protein'] += num_new_proteins

        # Digest undigested proteins.
        if undigested_proteins:
            num_undigested = len(undigested_proteins)
            logger.info("digesting %s proteins" % num_new_proteins)
            undigested_batch = {}
            peptide_counter = 0
            protein_digests = []

            for protein in list(undigested_proteins.values()):
                protein_digest = ProteinDigest(protein=protein,
                                               digest=self.digest)
                protein_digests.append(protein_digest)
                # Digest a single protein sequence into peptides.
                peptide_sequences = cleave(
                    protein.sequence,
                    self.digest.protease.cleavage_rule,
                    self.logger,
                    self.digest.max_missed_cleavages,
                    min_acids=self.digest.min_acids,
                    max_acids=self.digest.max_acids,
                )
                peptide_counter += len(peptide_sequences)
                undigested_batch[protein.id] = {
                    'peptide_sequences': peptide_sequences,
                    'protein_digest': protein_digest,
                }

            self.process_peptide_batch(undigested_batch, logger)

        # Create taxon protein instances in bulk.
        taxon_protein_dicts = []

        taxon_protein_ids = []
        taxon_ids = []
        metadatas = []

        for metadata, sequence in batch:
            if sequence != "No sequence found":
                try:
                    protein = existing_proteins[sequence]
                except Exception as e:
                    logger.exception(
                        "Error processing protein, sequence does not"
                        " exist in db, skipping")
                    continue
                taxon_protein_dicts.append({
                    'protein_id': protein.id,
                    'taxon_id': taxon.id,
                    'metadata': metadata,
                })
                taxon_protein_ids.append(protein.id)
                taxon_ids.append(taxon.id)
                metadatas.append(metadata)
        logger.info("Creating %s new taxon proteins..." %
                    (len(taxon_protein_dicts)))
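        # taxon_protein_insert takes parallel arrays of protein ids, taxon
        # ids, and metadata values.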

        cur = db.get_psycopg2_cursor()
        cur.execute("select * from taxon_protein_insert(%s, %s, %s);",
                    (taxon_protein_ids, taxon_ids, metadatas))
        db.psycopg2_connection.commit()

        self.stats['TaxonProtein'] += len(taxon_protein_dicts)
Example #7
    def process_peptide_batch(self, batch, logger=None):
        if not logger:
            logger = self.logger

        # Assemble combined peptide sequences and protein digests.
        combined_peptide_sequences = set()
        combined_protein_digests = []
        for protein, data in batch.items():
            for sequence in data['peptide_sequences']:
                combined_peptide_sequences.add(sequence)
            combined_protein_digests.append(data['protein_digest'])

        # Add protein digests to db.
        logger.info("Creating %s new protein digests..." % (
            len(combined_protein_digests)))
        self.session.add_all(combined_protein_digests)
        self.session.commit()
        self.stats['ProteinDigest'] += len(combined_protein_digests)

        # Get existing peptides.
        existing_peptides = {}
        existing_peptides_batch = []
        existing_peptides_counter = 0
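        # As above, look up existing peptides in chunks of 500 to keep each
        # IN clause small.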
        for sequence in combined_peptide_sequences:
            existing_peptides_counter += 1
            existing_peptides_batch.append(sequence)
            if (existing_peptides_counter % 500) == 0:
                self.update_existing_peptides_(
                    existing_peptides_batch, existing_peptides)
                existing_peptides_batch = []
        self.update_existing_peptides_(
            existing_peptides_batch, existing_peptides)

        # Create non-existent peptides in bulk.
        num_new_peptides = 0
        peptide_dicts = []
        for sequence in combined_peptide_sequences:
            if sequence not in existing_peptides:
                num_new_peptides += 1
                mass = get_aa_sequence_mass(sequence)
                peptide_dicts.append({
                    'sequence': sequence,
                    'mass': mass,
                })
        logger.info("Creating %s new peptides..." % num_new_peptides)
        self.session.execute(db.tables['Peptide'].insert(), peptide_dicts)
        self.session.commit()
        self.stats['Peptide'] += num_new_peptides

        # Get newly created peptide objects and add to existing peptides.
        created_peptides_batch = []
        created_peptides_counter = 0
        for peptide_dict in peptide_dicts:
            created_peptides_counter += 1
            created_peptides_batch.append(peptide_dict['sequence'])
            if (created_peptides_counter % 500) == 0:
                self.update_existing_peptides_(created_peptides_batch, 
                                               existing_peptides)
                created_peptides_batch = []
        self.update_existing_peptides_(
            created_peptides_batch, existing_peptides)

        # Create histogram of peptide sequence occurrences for each protein.
        num_peptide_instances = 0
        for protein, data in batch.items():
            peptides_histogram = defaultdict(int)
            for sequence in data['peptide_sequences']: 
                peptides_histogram[sequence] += 1
            data['peptide_histogram'] = peptides_histogram
            # Update number of peptide instances.
            num_peptide_instances += len(peptides_histogram)

        # Create protein digest peptide instances in bulk.
        logger.info("Creating %s new protein digest peptides..." % (
            num_peptide_instances))
        pdp_batch = []
        pdp_counter = 0
        for protein, data in batch.items():
            for sequence, count in data['peptide_histogram'].items():
                pdp_counter += 1
                peptide = existing_peptides[sequence]
                pdp_batch.append({
                    'peptide_id': peptide.id,
                    'protein_digest_id': data['protein_digest'].id,
                    'count': count,
                })
                if (pdp_counter % 10000) == 0:
                    self.session.execute(
                        db.tables['ProteinDigestPeptide'].insert(),
                        pdp_batch)
                    self.session.commit()
                    # Clear the batch so flushed rows are not re-inserted by
                    # the final execute below.
                    pdp_batch = []
        if pdp_batch:
            self.session.execute(
                db.tables['ProteinDigestPeptide'].insert(), pdp_batch)
            self.session.commit()
        self.stats['ProteinDigestPeptide'] += num_peptide_instances
Example #8
    def process_protein_batch(self, batch, taxon, logger=None):
        """ Process a batch of proteins with the given digest. """
        if not batch:
            return
        if not logger:
            logger = self.logger
        # Get existing proteins by searching for sequences.
        existing_proteins = {}
        for protein in (
            self.session.query(Protein)
            .filter(Protein.sequence.in_(
                [sequence for metadata, sequence in batch])
            )
        ):
            existing_proteins[protein.sequence] = protein

        # Initialize collection of undigested proteins.
        undigested_proteins = {}
        digested_proteins = {}
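        # As in Example #2, digested_proteins holds proteins that already
        # have a ProteinDigest record for this digest.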
        if existing_proteins:
            for protein in (
                self.session.query(Protein)
                .filter(Protein.id.in_(
                    [protein.id for protein in existing_proteins.values()]))
                .join(ProteinDigest)
                .filter(ProteinDigest.digest == self.digest)
            ):
                digested_proteins[protein.sequence] = protein
        for protein in existing_proteins.values():
            if protein.sequence not in digested_proteins:
                undigested_proteins[protein.sequence] = protein
        
        # Create proteins which do not exist in the db and add to undigested
        # collection.
        num_new_proteins = 0
        for metadata, sequence in batch:
            if sequence not in existing_proteins:
                try:
                    mass = get_aa_sequence_mass(sequence)
                    protein = Protein(sequence=sequence, mass=mass)
                except Exception as e:
                    logger.exception("Error processing protein, skipping")
                    continue
                self.session.add(protein)
                num_new_proteins += 1
                undigested_proteins[sequence] = protein
                existing_proteins[sequence] = protein
        logger.info("creating %s new proteins..." % (
            num_new_proteins))
        self.session.commit()
        self.stats['Protein'] += num_new_proteins

        # Digest undigested proteins.
        if undigested_proteins:
            num_undigested = len(undigested_proteins)
            logger.info("digesting %s proteins" % num_new_proteins)
            undigested_batch = {}
            peptide_counter = 0
            protein_digests = []
            for protein in undigested_proteins.values():
                protein_digest = ProteinDigest(protein=protein, 
                                               digest=self.digest)
                protein_digests.append(protein_digest)
                peptide_sequences = cleave(
                    protein.sequence, 
                    self.digest.protease.cleavage_rule, 
                    self.digest.max_missed_cleavages,
                    min_acids=self.digest.min_acids,
                    max_acids=self.digest.max_acids,
                )
                peptide_counter += len(peptide_sequences)
                undigested_batch[protein] = {
                    'peptide_sequences': peptide_sequences,
                    'protein_digest': protein_digest,
                }
                if peptide_counter > 1e4:
                    self.process_peptide_batch(undigested_batch, logger)
                    # Reset the sub-batch so flushed proteins are not
                    # re-processed by the final call below.
                    undigested_batch = {}
                    peptide_counter = 0
            if undigested_batch:
                self.process_peptide_batch(undigested_batch, logger)

        # Create taxon protein instances in bulk.
        taxon_protein_dicts = []
        for metadata, sequence in batch:
            try:
                protein = existing_proteins[sequence]
            except Exception as e:
                logger.exception("Error processing protein, sequence does not"
                                 " exist in db, skipping")
                continue
            taxon_protein_dicts.append({
                'protein_id': protein.id,
                'taxon_id': taxon.id,
                'metadata': metadata,
            })
        logger.info("Creating %s new taxon proteins..." % (
            len(taxon_protein_dicts)))
        self.session.execute(
            db.tables['TaxonProtein'].insert(), taxon_protein_dicts)
        self.session.commit()
        self.stats['TaxonProtein'] += len(taxon_protein_dicts)