    def clear_data_for_metaomic_assembly(self, metaomic_assembly):
        self.logger.info("Clearing data for meta-omic assembly '%s'" % metaomic_assembly)
        cur = db.get_psycopg2_cursor()
        try:
            # Get MetagenomeSequences.
            cur.execute("select ms.id from metagenome_sequence ms where ms.metagenome_id = %s", (metaomic_assembly,))
            metagenome_sequences = cur.fetchall()
            if metagenome_sequences:
                # Delete digest peptides and annotations for each sequence.
                self.logger.info("Deleting MetagenomeSequenceDigestPeptides and MetagenomeAnnotations")
                for ms in metagenome_sequences:
                    cur.execute("delete from metagenome_sequence_digest_peptide where metagenome_sequence_id = %s", (ms[0],))
                    cur.execute("delete from metagenome_annotations where metagenome_sequence_id = %s", (ms[0],))
                db.psycopg2_connection.commit()
                cur = db.get_psycopg2_cursor()
                # Delete the sequences themselves by their primary key.
                for ms in metagenome_sequences:
                    cur.execute("delete from metagenome_sequence where id = %s", (ms[0],))
                db.psycopg2_connection.commit()
                cur = db.get_psycopg2_cursor()
            # Delete Metagenome.
            cur.execute("delete from metagenome where id = %s;", (metaomic_assembly,))
        except Exception as e:
            self.logger.error("Problem removing '%s': %s" % (metaomic_assembly, e))
            traceback.print_exc()
        else:
            # Commit the deletes.
            db.psycopg2_connection.commit()
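# Every snippet in this listing leans on a small project-local `db` helper
# that exposes a shared psycopg2 connection plus a cursor factory. The helper
# itself is never shown, so this is only a minimal sketch of the assumed
# interface; the connection string is a placeholder, not the project's real
# settings.
# --- db.py (sketch) ---
import psycopg2

psycopg2_connection = psycopg2.connect("dbname=metapep user=postgres")

def get_psycopg2_cursor():
    # A fresh cursor on the shared module-level connection; callers commit
    # through db.psycopg2_connection.commit() once a batch of statements
    # has run.
    return psycopg2_connection.cursor()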
def get_digest(logger, digest_def):
    """ Fetch or create a digest from a digest definition."""
    # Get or create protease.
    protease_id = str(digest_def['protease']['id'])
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from protease where protease.id=%s;", (protease_id,))
    results = cur.fetchone()

    if results is None:
        logger.info(
            "No protease exists for the given definition, creating...")
        protease = Protease(**digest_def['protease'])
        cur.execute(
            "insert into protease (id, cleavage_rule) values(%s, %s);",
            (protease_id, str(digest_def['protease']['cleavage_rule'])))
    else:
        protease = Protease(id=results[0], cleavage_rule=results[1])
    db.psycopg2_connection.commit()

    # Get or create digest object.
    cur = db.get_psycopg2_cursor()

    # Not all possible digestion parameters will have a value, so build the
    # query to account for this.
    query_params = [protease.id]
    digest_query = "select * from digest where digest.protease_id = %s"
    if digest_def.get('max_missed_cleavages') is not None:
        digest_query += " and digest.max_missed_cleavages = %s"
        query_params.append(digest_def.get('max_missed_cleavages'))
    if digest_def.get('min_acids') is not None:
        digest_query += " and digest.min_acids = %s"
        query_params.append(digest_def.get('min_acids'))
    if digest_def.get('max_acids') is not None:
        digest_query += " and digest.max_acids = %s"
        query_params.append(digest_def.get('max_acids'))

    cur.execute(digest_query, query_params)
    results = cur.fetchone()
    db.psycopg2_connection.commit()
    if results is None:
        logger.info(
            "No digest exists for the given definition, creating...")
        digest_kwargs = {}
        digest_kwargs.update(digest_def)
        digest_kwargs['protease'] = protease
        digest = Digest(**digest_kwargs)
        cur = db.get_psycopg2_cursor()
        cur.execute(
            "select * from digest_insert(%s, %s, %s, %s);",
            (protease.id, digest.max_missed_cleavages, digest.min_acids,
             digest.max_acids))
        digest_result = cur.fetchone()
        if digest_result:
            digest = Digest(id=digest_result[0], protease=protease,
                            max_missed_cleavages=digest_result[2],
                            min_acids=digest_result[3],
                            max_acids=digest_result[4])
    else:
        digest = Digest(id=results[0], protease=protease,
                        max_missed_cleavages=results[2],
                        min_acids=results[3], max_acids=results[4])
    db.psycopg2_connection.commit()
    return digest
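# get_digest() consumes a plain-dict digest definition shaped like the
# protease/digest tables. The example below is hypothetical, inferred only
# from how get_digest() reads the dict; the trypsin-style cleavage rule is a
# common convention, not necessarily this project's actual rule:
example_digest_def = {
    'protease': {
        'id': 'trypsin',
        'cleavage_rule': r'([KR](?!P))',  # assumed regex-style rule
    },
    'max_missed_cleavages': 1,
    'min_acids': 6,
    'max_acids': 40,
}
# digest = get_digest(logger, example_digest_def)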
def count_peptide_union(taxon_digests=None, logger=None):
    # Avoid a mutable default argument; treat None as an empty list.
    taxon_digest_ids = [taxon_digest.id for taxon_digest in (taxon_digests or [])]
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from taxon_count_peptide_union(%s)", (taxon_digest_ids,))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
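# Hypothetical usage (td1 and td2 stand in for TaxonDigest objects loaded
# elsewhere, e.g. by the redundancy-table scripts below):
#   union_size = count_peptide_union(taxon_digests=[td1, td2])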
def count_peptide_union_sa(specialized_assemblies=None, logger=None):
    # Avoid a mutable default argument; treat None as an empty list.
    sa_ids = [sa.id for sa in (specialized_assemblies or [])]
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from sadp_count_peptide_union(%s)", (sa_ids,))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
    def process_taxon_digest_peptide_batch(self,
                                           taxon_digest,
                                           batch,
                                           logger=None):
        if not logger:
            logger = self.logger
        start_time = time.time()
        taxon_digest_ids = []
        peptide_ids = []
        peptide_counts = []
        logger.info("Creating %s new taxon digest peptides..." % (len(batch)))
        for row in batch:
            # Each row is (peptide_id, count, taxon_digest_id).
            peptide_ids.append(row[0])
            peptide_counts.append(row[1])
            taxon_digest_ids.append(row[2])

        cur = db.get_psycopg2_cursor()
        cur.execute("select taxon_digest_peptide_insert(%s, %s, %s);",
                    (peptide_ids, taxon_digest_ids, peptide_counts))
        db.psycopg2_connection.commit()
        total_time = time.time() - start_time
        logger.info("taxon digest time elapsed: %s" % (total_time))
    def run(self):

        try:
            # Get cursor.
            cur = db.get_psycopg2_cursor()
            self.logger.info("Clearing data for specialized assemblies '%s'" %
                             self.specialized_assembly_ids)

            cur.execute(
                "select sa.id from specialized_assembly sa where sa.genome_name in %s",
                (tuple(self.specialized_assembly_ids), ))
            specialized_assembly_results = cur.fetchall()

            if not specialized_assembly_results:
                self.logger.info(
                    "No matching specialized assemblies found.  Nothing was changed")
                return

            for specialized_assembly in specialized_assembly_results:
                self.logger.info(
                    "Clearing data for specialized assembly '%s'" %
                    specialized_assembly)
                self.clear_data_for_specialized_assembly(specialized_assembly)
        except Exception as e:
            self.logger.error("Problem removing specialized_assemblies: '%s'" %
                              e)
            traceback.print_exc()
            db.psycopg2_connection.commit()
        else:
            db.psycopg2_connection.commit()
    def clear_data_for_taxon(self, taxon):
        self.logger.info("Clearing data for taxon '%s'" % taxon)
        cur = db.get_psycopg2_cursor()
        try:
            # Get TaxonDigests.
            cur.execute(
                "select td.id from taxon_digest td where td.taxon_id = %s",
                (taxon, ))
            taxon_digests = cur.fetchall()
            if taxon_digests:
                # Delete TaxonDigestPeptides and TaxonDigests
                self.logger.info(
                    "Deleting TaxonDigestPeptides and TaxonDigests")
                for td in taxon_digests:
                    cur.execute(
                        "delete from taxon_digest_peptide where taxon_digest_id = %s",
                        (td[0], ))
                    cur.execute("delete from taxon_digest where id = %s",
                                (td[0], ))

            # Delete TaxonProteins.
            self.logger.info("Deleting TaxonProteins")
            cur.execute("delete from taxon_protein where taxon_id = %s",
                        (taxon, ))

            # Delete Taxon
            cur.execute("delete from taxon where id = %s;", (taxon, ))
        except Exception as e:
            self.logger.error("Problem removing '%s': %s" % (taxon, e))
            traceback.print_exc()
        else:
            # Commit the deletes.
            db.psycopg2_connection.commit()
    def run(self):

        try:
            # Get cursor.
            cur = db.get_psycopg2_cursor()
            self.logger.info("Clearing data for taxon '%s'" % self.taxon_ids)

            cur.execute("select t.id from taxon t where t.id in %s",
                        (tuple(self.taxon_ids), ))
            taxon_results = cur.fetchall()

            if not taxon_results:
                self.logger.info(
                    "No matching taxons found.  Nothing was changed")
                return

            for taxon in taxon_results:
                self.logger.info("Clearing data for taxon '%s'" % taxon)
                self.clear_data_for_taxon(taxon)
        except Exception as e:
            self.logger.error("Problem removing taxons: '%s'" % e)
            traceback.print_exc()
            db.psycopg2_connection.commit()
        else:
            db.psycopg2_connection.commit()
    def run(self):
        # Read in sequences to query.
        sequences = []
        max_dist = self.args.max_distance

        if self.args.sequence_file:
            with open(self.args.sequence_file, 'r') as f:
                sequences = [line.strip() for line in f.readlines()]

        elif self.args.sequence:
            sequences = [self.args.sequence]

        # Read in whether to query just genomes ('g'), specialized assemblies
        # ('sa'), metagenomes ('m'), or all three ('all').
        # Genomes is the default search.

        query_type = 'g'
        if self.args.type:
            query_type = self.args.type

        if not sequences:
            argparser.error(
                "Provide a query sequence via the '--sequence' option, "
                "or a set of sequences via the --sequence-file option")

        # Print headers.
        headers = ['query', 'taxon', 'lev_distance', 'match']
        print(','.join(headers))

        # Execute query for each sequence and print results.
        cur = db.get_psycopg2_cursor()

        for seq in sequences:
            if query_type in ('g', 'all'):
                print('GENOMIC RESULTS')
                print('search sequence,id,name')
                cur.execute(
                    "select id, genome_name from genomic_query_taxon_by_peptide_sequence_new(%s)",
                    (seq, ))
                for row in cur.fetchall():
                    print(','.join([str(s) for s in [seq] + list(row)]))
            if query_type in ('sa', 'all'):
                print('\n')
                print('SPECIALIZED ASSEMBLY RESULTS')
                print('search sequence,genome name, sequence id')
                cur.execute(
                    "select specialized_assembly_name, specialized_assembly_sequence from specialized_assembly_taxon_query_by_peptide_sequence(%s)",
                    (seq, ))
                for row in cur.fetchall():
                    print(','.join([str(s) for s in [seq] + list(row)]))
            if query_type in ('m', 'all'):
                print('\n')
                print('METAGENOMIC RESULTS')
                print('search sequence, metagenome name')
                cur.execute(
                    "select metagenome_name from metagenomic_query_by_peptide_sequence(%s)",
                    (seq, ))
                for row in cur.fetchall():
                    print(','.join([str(s) for s in [seq] + list(row)]))
    def update_existing_peptides_(self, sequences, existing_peptides):
        if not sequences:
            return

        cur = db.get_psycopg2_cursor()
        cur.execute("select * from peptide where peptide.sequence in %s",
                    (tuple(sequences), ))
        for record in cur.fetchall():
            peptide = Peptide(id=record[0], sequence=record[1], mass=record[2])
            existing_peptides[peptide.sequence] = peptide
        db.psycopg2_connection.commit()
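# Peptide, Protease, Digest, and the other model classes used throughout are
# project-local and never defined in this listing. From the way they are
# constructed above they appear to be plain attribute holders; a minimal
# sketch consistent with that usage (an assumption, not the real models):
class Peptide:
    def __init__(self, id=None, sequence=None, mass=None):
        self.id = id
        self.sequence = sequence
        self.mass = mass


class Protease:
    def __init__(self, id=None, cleavage_rule=None):
        self.id = id
        self.cleavage_rule = cleavage_rule


class Digest:
    def __init__(self, id=None, protease=None, max_missed_cleavages=None,
                 min_acids=None, max_acids=None):
        self.id = id
        self.protease = protease
        self.max_missed_cleavages = max_missed_cleavages
        self.min_acids = min_acids
        self.max_acids = max_acids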
def main():
    logger = logging.getLogger('metaomic_assemblies')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    cur = db.get_psycopg2_cursor()
    cur.execute("select m.name from metagenome m;")

    logger.info("Meta-omic Assemblies")
    for record in cur:
        logger.info("%s" % (record[0]))
    db.psycopg2_connection.commit()
def main():
    logger = logging.getLogger('taxons')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    cur = db.get_psycopg2_cursor()
    cur.execute("select t.id from taxon t;")

    logger.info("Taxons")
    for record in cur:
        logger.info("%s" % (record[0]))
    db.psycopg2_connection.commit()
def main():
    args = argparser.parse_args()

    logger = logging.getLogger('redundancy_tables')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Check that specialized assembly genome names or file were provided.
    if not (args.sa_ids or args.sa_id_file):
        raise Exception("Must provide --sa-ids or --sa-id-file option")

    # Get specialized assemblies.
    if args.sa_ids:
        sa_ids = args.sa_ids
    else:
        with open(args.sa_id_file, 'r') as f:
            sa_ids = [row[0] for row in csv.reader(f)]
    logger.info("Specialized Assembly Ids: %s" % (sa_ids))

    # Create output dir if it does not exist.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    cur = db.get_psycopg2_cursor()

    cur.execute(
        "select sa.id, sa.genome_name from specialized_assembly sa where sa.genome_name = any(%s);",
        (sa_ids, ))
    sa_digests = []

    for record in cur:
        sa = Specialized_Assembly(id=record[0], genome_name=record[1])

        logger.info("Specialized Assembly: %s  %s" % (sa.genome_name, sa.id))

        sa_digests.append(sa)
    db.psycopg2_connection.commit()

    # Generate the redundancy tables.
    tables = redundancy.generate_redundancy_tables_sa(sa_digests,
                                                      logger=logger)

    #Output tables.
    for table_id, table in list(tables.items()):
        table_file = os.path.join(args.output_dir, table_id + '.csv')
        logger.info("Writing '%s'..." % table_file)
        with open(table_file, 'w', newline='') as f:
            w = csv.writer(f)
            for row in table:
                w.writerow(row)

    logger.info("Done.")
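# Hypothetical invocation (the flag names come from the argument checks above;
# the script name is a placeholder):
#   python make_sa_redundancy_tables.py --sa-ids MAG_001 MAG_002 --output-dir ./tables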
    def get_venter_annotations(self, seq_ids, logger=None):
        if not logger:
            logger = self.logger
        metagenome_sequence_ids = []
        the_annotations = extract_venter_annotations(self,
                                                     list(seq_ids.keys()),
                                                     logger)
        accession_numbers = []
        scaffold_ids = []
        orf_ids = []
        orf_nums = []
        annotations = []
        gene_names = []
        orf_tax_levels = []
        orf_taxonomies = []
        orf_tax_ids = []
        contig_tax_ids = []
        contig_taxonomies = []
        contig_tax_levels = []
        for annot in the_annotations:
            accession_numbers.append(annot[0])
            metagenome_sequence_ids.append(seq_ids[annot[0]])
            scaffold_ids.append(annot[1])
            orf_ids.append(annot[2])
            # Numeric fields may be missing; cast to int only when present.
            if annot[3] is not None:
                orf_nums.append(int(annot[3]))
            else:
                orf_nums.append(annot[3])
            annotations.append(annot[4])
            gene_names.append(annot[5])
            orf_tax_levels.append(annot[6])
            orf_taxonomies.append(annot[7])
            if annot[8] is not None:
                orf_tax_ids.append(int(annot[8]))
            else:
                orf_tax_ids.append(annot[8])
            contig_tax_levels.append(annot[9])
            contig_taxonomies.append(annot[10])
            if annot[11] is not None:
                contig_tax_ids.append(int(annot[11]))
            else:
                contig_tax_ids.append(annot[11])
        cur = db.get_psycopg2_cursor()
        cur.execute(
            "select * from metagenome_annotation_insert(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
            (accession_numbers, metagenome_sequence_ids, scaffold_ids, orf_ids,
             orf_nums, annotations, gene_names, orf_tax_levels, orf_taxonomies,
             orf_tax_ids, contig_tax_ids, contig_taxonomies,
             contig_tax_levels))
        db.psycopg2_connection.commit()
def main():
    args = argparser.parse_args()

    logger = logging.getLogger('redundancy_tables')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Check that taxon ids or taxon id file were provided.
    if not (args.taxon_ids or args.taxon_id_file):
        raise Exception("Must provide --taxon-ids or --taxon-id-file option")

    # Get taxons.
    if args.taxon_ids:
        taxon_ids = args.taxon_ids
    else:
        with open(args.taxon_id_file, 'r') as f:
            taxon_ids = [row[0] for row in csv.reader(f)]
    logger.info("Taxon Ids: %s" % (taxon_ids))

    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select * from taxon_digest td where td.taxon_id in (select t.id from taxon t where t.id = any(%s));",
        (taxon_ids, ))

    taxon_digests = []

    for record in cur:
        td = TaxonDigest(id=record[0], taxon=record[1], digest=record[2])
        logger.info("Taxon Digest: %s" % (td.id))

        taxon_digests.append(td)
    db.psycopg2_connection.commit()

    # Generate the redundancy tables.
    tables = redundancy.generate_redundancy_tables(taxon_digests,
                                                   logger=logger)

    # Create output dir if it does not exist.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Output tables.
    for table_id, table in list(tables.items()):
        table_file = os.path.join(args.output_dir, table_id + '.csv')
        logger.info("Writing '%s'..." % table_file)
        with open(table_file, 'w', newline='') as f:
            w = csv.writer(f)
            for row in table:
                w.writerow(row)

    logger.info("Done.")
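# Hypothetical invocation (the flag names come from the argument checks above;
# the script name is a placeholder):
#   python make_redundancy_tables.py --taxon-ids taxon1 taxon2 --output-dir ./tables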
    def clear_data_for_specialized_assembly(self, specialized_assembly):
        self.logger.info("Clearing data for specialized assembly '%s'" %
                         specialized_assembly)
        cur = db.get_psycopg2_cursor()
        try:
            # Get Specialized Assembly Sequences.
            cur.execute(
                "select sas.id from specialized_assembly_sequence sas where sas.specialized_assembly_id = %s",
                (specialized_assembly, ))
            specialized_assembly_sequences = cur.fetchall()
            if specialized_assembly_sequences:
                # Delete Sequences, Annotations and Digests
                self.logger.info(
                    "Deleting Specialized Assembly Sequences and Digests")
                for sas in specialized_assembly_sequences:
                    cur.execute(
                        "delete from specialized_assembly_digest_peptide where specialized_assembly_sequence_id = %s",
                        (sas[0], ))

                db.psycopg2_connection.commit()
                cur = db.get_psycopg2_cursor()
                for sas in specialized_assembly_sequences:
                    cur.execute(
                        "delete from specialized_assembly_sequence where id = %s",
                        (sas[0], ))

                db.psycopg2_connection.commit()
                cur = db.get_psycopg2_cursor()
            # Delete Specialized Assembly
            cur.execute("delete from specialized_assembly where id = %s;",
                        (specialized_assembly, ))
        except Exception as e:
            self.logger.error("Problem removing '%s': %s" % (specialized_assembly, e))
            traceback.print_exc()
        else:
            # Commit the deletes.
            db.psycopg2_connection.commit()
def main():
    logger = logging.getLogger('specialized_assemblies')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select sa.genome_name, sa.type_flag from specialized_assembly sa;")

    logger.info("Specialized Assemblies")
    logger.info("Genome Name            Type")
    for record in cur:
        logger.info("%s     %s" % (record[0], record[1]))
    db.psycopg2_connection.commit()
def main():
    logger = logging.getLogger('metaomic_taxons')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    cur = db.get_psycopg2_cursor()
    cur.execute("select mt.tax_species from metagenome_taxon mt where mt.ncbi_id in \
        (select distinct ma.contig_tax_id from metagenome_annotations ma) or \
        mt.ncbi_id in (select distinct ma.orf_tax_id from metagenome_annotations ma)")

    logger.info("Meta-omic Assembly Taxons")
    for record in cur:
        logger.info("%s" % (record[0]))
    db.psycopg2_connection.commit()
def count_peptide_union_combined(sa_ids=None, td_ids=None, logger=None):
    # sa_taxon_count_peptide_union unions the distinct peptide ids across the
    # given specialized assemblies and taxon digests (see the stored procedure).
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from sa_taxon_count_peptide_union(%s, %s)",
                (sa_ids or [], td_ids or []))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
    def run(self):

        try:
            # Get cursor.
            cur = db.get_psycopg2_cursor()
            self.logger.info("Clearing data for meta-omic assemblies '%s'" % self.metaomic_ids)

            cur.execute("select m.id from metagenome m where m.id in %s", (tuple(self.metaomic_ids),))
            metaomic_results = cur.fetchall()

            if not metaomic_results:
                self.logger.info("No matching meta-omic assemblies found.  Nothing was changed")
                return

            for metaomic_assembly in metaomic_results:
                self.logger.info("Clearing data for meta-omic assembly '%s'" % metaomic_assembly)
                self.clear_data_for_metaomic_assembly(metaomic_assembly)
        except Exception as e:
            self.logger.error("Problem removing metaomic_assemblies: '%s'" % e)
            traceback.print_exc()
            db.psycopg2_connection.commit()
        else:
            db.psycopg2_connection.commit()
    def process_peptide_batch(self,
                              metagenome_sequence_digests_dict,
                              logger=None):
        if not logger:
            logger = self.logger
        # Assemble combined peptide sequences and metagenome digests.  Each metagenome sequence can have many peptides.
        combined_peptide_sequences = set()
        for proteinId, data in list(metagenome_sequence_digests_dict.items()):
            for sequence in data['peptide_sequences']:
                combined_peptide_sequences.add(sequence)
        # Get existing peptides.
        existing_peptides = {}
        # Create non-existent peptides in bulk.
        start_time = time.time()

        num_new_peptides = 0
        peptide_sequences = []
        peptide_masses = []
        for sequence in combined_peptide_sequences:
            num_new_peptides += 1
            #calculate mass of peptide
            mass = get_aa_sequence_mass(sequence)
            peptide_sequences.append(sequence)
            peptide_masses.append(mass)

        logger.info("Creating %s new peptides..." % num_new_peptides)
        cur = db.get_psycopg2_cursor()
        cur.execute("select * from peptide_insert(%s, %s);",
                    (peptide_sequences, peptide_masses))

        for record in cur:
            try:
                peptide = Peptide(
                    id=record[0],
                    sequence=record[1],
                )
                existing_peptides[peptide.sequence] = peptide
            except Exception as e:
                logger.exception("Error processing peptide, skipping")
                continue
        total_time = time.time() - start_time
        self.total_peptide_time = self.total_peptide_time + total_time
        logger.info("peptide time elapsed: %s" % (total_time))

        self.stats['Peptide'] += num_new_peptides
        # Create histogram of peptide sequence occurrences for each metagenome sequence.
        num_peptide_instances = 0
        for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
            peptides_histogram = defaultdict(int)
            for sequence in data['peptide_sequences']:
                peptides_histogram[sequence] += 1
            data['peptide_histogram'] = peptides_histogram
            # Update number of peptide instances.
            num_peptide_instances += len(peptides_histogram)
        # Create protein digest peptide instances in bulk.

        start_time = time.time()
        pdp_peptide_ids = []
        pdp_metagenome_sequence_ids = []
        pdp_digest_ids = []
        pdp_peptide_count = []
        pdp_counter = 0
        for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
            for sequence, count in list(data['peptide_histogram'].items()):
                pdp_counter += 1
                peptide = existing_peptides[sequence]
                pdp_peptide_ids.append(peptide.id)
                pdp_metagenome_sequence_ids.append(
                    data['metagenome_sequence'].id)
                pdp_digest_ids.append(data['digest'].id)
                pdp_peptide_count.append(count)
        cur.execute(
            "select metagenome_sequence_digest_peptide_insert(%s, %s, %s, %s);",
            (pdp_peptide_ids, pdp_metagenome_sequence_ids, pdp_digest_ids,
             pdp_peptide_count))
        db.psycopg2_connection.commit()
        total_time = time.time() - start_time
        logger.info("protein digest time elapsed: %s" % (total_time))
        self.stats['ProteinDigestPeptide'] += num_peptide_instances
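# get_aa_sequence_mass() is a project helper that is not shown in this
# listing. A minimal sketch, assuming it returns the monoisotopic mass of a
# peptide (the sum of its residue masses plus one water); the residue masses
# below are standard values, but the real helper may differ:
MONOISOTOPIC_RESIDUE_MASSES = {
    'G': 57.02146, 'A': 71.03711, 'S': 87.03203, 'P': 97.05276,
    'V': 99.06841, 'T': 101.04768, 'C': 103.00919, 'L': 113.08406,
    'I': 113.08406, 'N': 114.04293, 'D': 115.02694, 'Q': 128.05858,
    'K': 128.09496, 'E': 129.04259, 'M': 131.04049, 'H': 137.05891,
    'F': 147.06841, 'R': 156.10111, 'Y': 163.06333, 'W': 186.07931,
}
WATER_MASS = 18.010565  # one H2O added per peptide for the free termini

def get_aa_sequence_mass(sequence):
    # Sum the residue masses and add one water.
    return sum(MONOISOTOPIC_RESIDUE_MASSES[aa] for aa in sequence) + WATER_MASS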
    def process_peptide_batch(self, batch, logger=None):
        if not logger:
            logger = self.logger

        # Assemble combined peptide sequences and protein digests.
        combined_peptide_sequences = set()

        protein_ids = []
        digest_ids = []
        protein_digests = []
        protein_digests_dict = {}
        for proteinId, data in list(batch.items()):
            for sequence in data['peptide_sequences']:
                combined_peptide_sequences.add(sequence)

            pd = data['protein_digest']
            protein_ids.append(pd.protein.id)
            digest_ids.append(pd.digest.id)

        cur = db.get_psycopg2_cursor()
        cur.execute("select * from protein_digest_insert(%s, %s);",
                    (protein_ids, digest_ids))

        # iterate through the protein_digest records returned from the insert and build a protein_digest object
        for record in cur:

            try:
                protein_digest = ProteinDigest(id=record[0],
                                               protein=record[1],
                                               digest=record[2])

                protein_digests.append(protein_digest)
                batch_record = batch.get(record[1])
                protein_digests_dict[record[1]] = {
                    'peptide_sequences': batch_record['peptide_sequences'],
                    'protein_digest': protein_digest,
                }
            except Exception as e:
                logger.exception("Error processing protein digest, skipping")
                continue

        db.psycopg2_connection.commit()

        self.stats['ProteinDigest'] += len(protein_digests)

        # Get existing peptides.
        existing_peptides = {}

        # Create non-existent peptides in bulk.
        start_time = time.time()
        num_new_peptides = 0
        peptide_sequences = []
        peptide_masses = []
        for sequence in combined_peptide_sequences:
            num_new_peptides += 1
            # Calculate the mass of the peptide.
            mass = get_aa_sequence_mass(sequence)
            peptide_sequences.append(sequence)
            peptide_masses.append(mass)
        logger.info("Creating %s new peptides..." % num_new_peptides)
        cur = db.get_psycopg2_cursor()
        cur.execute("select * from peptide_insert(%s, %s);",
                    (peptide_sequences, peptide_masses))
        for record in cur:
            try:
                peptide = Peptide(
                    id=record[0],
                    sequence=record[1],
                )
                existing_peptides[peptide.sequence] = peptide
            except Exception as e:
                logger.exception("Error processing peptide, skipping")
                continue

        self.stats['Peptide'] += num_new_peptides
        # Create histogram of peptide sequence occurrences for each protein.
        num_peptide_instances = 0

        for proteinId, data in list(protein_digests_dict.items()):
            peptides_histogram = defaultdict(int)
            for sequence in data['peptide_sequences']:
                peptides_histogram[sequence] += 1
            data['peptide_histogram'] = peptides_histogram
            # Update number of peptide instances.
            num_peptide_instances += len(peptides_histogram)
        total_time = time.time() - start_time
        logger.info("peptide time elapsed: %s" % (total_time))
        # Create protein digest peptide instances in bulk.
        logger.info("Creating %s new protein digest peptides..." %
                    (num_peptide_instances))

        start_time = time.time()
        pdp_peptide_ids = []
        pdp_protein_digest_ids = []
        pdp_peptide_count = []
        pdp_counter = 0
        for proteinId, data in list(protein_digests_dict.items()):
            for sequence, count in list(data['peptide_histogram'].items()):
                pdp_counter += 1
                peptide = existing_peptides[sequence]
                pdp_peptide_ids.append(peptide.id)
                pdp_protein_digest_ids.append(data['protein_digest'].id)
                pdp_peptide_count.append(count)
        total_time = time.time() - start_time
        logger.info("protein digest loop time elapsed: %s" % (total_time))
        cur = db.get_psycopg2_cursor()
        cur.execute(
            "select protein_digest_peptide_insert(%s, %s, %s);",
            (pdp_peptide_ids, pdp_protein_digest_ids, pdp_peptide_count))
        db.psycopg2_connection.commit()
        total_time = time.time() - start_time
        logger.info("protein digest time elapsed: %s" % (total_time))
        self.stats['ProteinDigestPeptide'] += num_peptide_instances
def main():
    start_time = time.time()
    args = argparser.parse_args()

    logger = logging.getLogger('proteomz_annotations')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    filename = args.annotation_file

    # Parse digest definition if given.
    if filename:
        logger.info("Starting annotation ingest")
        metagenome_annotations = {}
        metagenome_sequence_ids = []
        orf_ids = []  #i.e. node_123_orf format
        contig_ncbi_tax_ids = []
        contig_tax_names = []
        orf_ncbi_tax_ids = []
        orf_tax_names = []

        #open the file and iterate through it line by line
        total_annotations = 0
        line_count = 0
        with open(filename, 'rt') as f:
            reader = csv.reader(f)
            line_count = len(list(reader))

        with open(filename, 'rt') as f:
            reader = csv.reader(f)

            try:
                count = 1

                batch_count = 0
                #handle ingestion in batches to reduce number of hits on the db
                for annot in reader:
                    if count > 1:
                        orf_id = annot[0]
                        contig_tax_name = annot[1]
                        if not contig_tax_name:
                            contig_tax_name = None

                        contig_ncbi_tax_id = annot[2]
                        if not contig_ncbi_tax_id or contig_ncbi_tax_id == '#N/A':
                            contig_ncbi_tax_id = None
                        else:
                            contig_ncbi_tax_id = int(contig_ncbi_tax_id)
                        orf_tax_name = annot[3]
                        if not orf_tax_name:
                            orf_tax_name = None

                        # The ORF NCBI tax id is in the next column (annot[3]
                        # above holds the ORF tax name).
                        orf_ncbi_tax_id = annot[4]
                        if not orf_ncbi_tax_id or orf_ncbi_tax_id == '#N/A':
                            orf_ncbi_tax_id = None
                        else:
                            orf_ncbi_tax_id = int(orf_ncbi_tax_id)

                        #get all the annotations we need from the csv and save them into a dict for later use
                        ma = {
                            "orf_id": orf_id,
                            "contig_tax_name": contig_tax_name,
                            "contig_tax_id": contig_ncbi_tax_id,
                            "orf_tax_name": orf_tax_name,
                            "orf_tax_id": orf_ncbi_tax_id
                        }
                        orf_ids.append(orf_id)
                        metagenome_annotations[orf_id] = ma

                        batch_count = batch_count + 1

                        if batch_count == 1000 or count == line_count:
                            a_numbers = []
                            metagenome_sequence_ids = []
                            contig_ncbi_tax_ids = []
                            contig_tax_names = []
                            orf_ncbi_tax_ids = []
                            orf_tax_names = []

                            #look up the already populated sequence id to match it to the accession number
                            cur = db.get_psycopg2_cursor()
                            cur.execute(
                                "select id, sequence_id from metagenome_sequence ms where ms.sequence_id = any(%s);",
                                (orf_ids, ))

                            db.psycopg2_connection.commit()

                            for seq_id, a_number in cur:

                                if seq_id is not None:
                                    # Populate the arrays that will be passed
                                    # to postgres for a bulk insert.
                                    metagenome_sequence_ids.append(seq_id)
                                    meta_annon = metagenome_annotations[
                                        a_number]

                                    a_numbers.append(meta_annon["orf_id"])
                                    contig_ncbi_tax_ids.append(
                                        meta_annon["contig_tax_id"])
                                    contig_tax_names.append(
                                        meta_annon["contig_tax_name"])
                                    orf_ncbi_tax_ids.append(
                                        meta_annon["orf_tax_id"])
                                    orf_tax_names.append(
                                        meta_annon["orf_tax_name"])

                            cur = db.get_psycopg2_cursor()
                            cur.execute(
                                "select * from metagenome_annotation_insert(%s::text[], %s::numeric[], %s::numeric[], %s::text[], %s::numeric[], %s::text[]);",
                                (a_numbers, metagenome_sequence_ids,
                                 contig_ncbi_tax_ids, contig_tax_names,
                                 orf_ncbi_tax_ids, orf_tax_names))

                            db.psycopg2_connection.commit()
                            total_annotations = total_annotations + batch_count
                            logger.info("ingested: %s annotations" %
                                        (total_annotations))
                            metagenome_annotations = {}
                            orf_ids = []
                            batch_count = 0
                    count = count + 1

            except csv.Error as e:
                logger.error('file %s, line %d: %s' %
                             (filename, reader.line_num, e))
                print(e)
    def process_fasta_file(self, path):
        base_msg = "Processing file '%s'..." % path
        file_logger = self.get_child_logger(id(path), base_msg, self.logger)

        #check to make sure the specified file exists before we try to do anything with it
        if os.path.exists(path):
            file_logger.info("Found file to ingest.")
        else:
            file_logger.info("Could not find file to ingest at %s" % path)
            exit()
        # Get taxon from filename.
        taxon_id = os.path.splitext(os.path.basename(path))[0]

        # Get taxon object from db or create a new one.
        taxon = Taxon(id=taxon_id)
        cur = db.get_psycopg2_cursor()
        cur.execute("select t.id from taxon t where t.id = %s;", (taxon_id, ))
        taxon_result = cur.fetchone()
        db.psycopg2_connection.commit()
        if taxon_result is None:
            #add a taxon to the DB
            cur.execute("insert into taxon (id) values(%s);", (taxon_id, ))
            db.psycopg2_connection.commit()
            self.stats['Taxon'] += 1
            file_logger.info("Created taxon '%s'" % taxon_id)

        # Check if TaxonDigest record exists in db.
        cur.execute(
            "select t.id from taxon_digest t where t.taxon_id = %s and t.digest_id = %s;",
            (
                taxon_id,
                self.digest.id,
            ))
        db.psycopg2_connection.commit()
        taxon_digest_result = cur.fetchone()
        taxon_digest = TaxonDigest(taxon=taxon, digest=self.digest)
        if taxon_digest_result:
            # If digest has been run on this taxon, don't do anything.
            file_logger.info(
                ("Taxon '%s' has already been digested with"
                 " digest '%s', skipping.") % (taxon_id, self.digest))
            return
        else:
            # Otherwise create a new TaxonDigest.
            cur.execute(
                "insert into taxon_digest (taxon_id, digest_id) values(%s,%s);",
                (
                    taxon_digest.taxon.id,
                    taxon_digest.digest.id,
                ))
            db.psycopg2_connection.commit()
            self.stats['TaxonDigest'] += 1

        # Process protein sequences in batches.
        file_logger.info("Counting # of protein sequences...")
        num_proteins = 0
        for metadata, sequence in fasta.read(path):
            num_proteins += 1
        file_logger.info("%s total protein sequences." % num_proteins)
        batch_size = 1000
        batch_counter = 0
        batch = []
        protein_logger = self.get_child_logger("%s_proteins" % id(file_logger),
                                               "Processing proteins...",
                                               file_logger)
        protein_logger.info("")

        # Check each sequence against the expected amino acids; if this regex
        # matches, the sequence is not valid (it contains a non-amino-acid
        # character).
        for metadata, sequence in fasta.read(path):
            if VALID_AAS.search(sequence):
                file_logger.info(
                    "Tried to ingest invalid protein sequence %s" % sequence)
            else:
                batch.append((
                    metadata,
                    sequence,
                ))
                batch_counter += 1
            if (batch_counter % batch_size) == 0:
                self.process_protein_batch(batch, taxon, logger=protein_logger)
                protein_logger.info(("%s of %s (%.1f%%)") %
                                    (batch_counter, num_proteins,
                                     100.0 * batch_counter / num_proteins))
                batch = []
        self.process_protein_batch(batch, taxon, logger=protein_logger)

        batch_size = 10000
        cur = db.get_psycopg2_cursor()
        cur.execute("select * from get_peptide_count(%s, %s);", (
            self.digest.id,
            taxon.id,
        ))

        tdp_batch = []
        tdp_counter = 0
        for row in cur.fetchall():
            tdp_counter += 1
            tdp_batch.append(row)
            if (tdp_counter % batch_size) == 0:
                self.process_taxon_digest_peptide_batch(taxon_digest,
                                                        tdp_batch,
                                                        logger=file_logger)
                tdp_batch = []
        self.process_taxon_digest_peptide_batch(taxon_digest,
                                                tdp_batch,
                                                logger=file_logger)
        self.stats['TaxonDigestPeptide'] += tdp_counter

        self.logger.info("Done processing file '%s'" % path)
        db.psycopg2_connection.commit()
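# fasta.read() and VALID_AAS are project-level helpers that this listing never
# defines. A minimal sketch of both, assuming fasta.read() yields
# (metadata, sequence) pairs and VALID_AAS matches any character that is NOT
# one of the twenty standard amino acids:
import re

VALID_AAS = re.compile(r'[^ACDEFGHIKLMNPQRSTVWY]')

# --- fasta.py (sketch) ---
def read(path):
    # Stream a FASTA file as (metadata, sequence) tuples.
    metadata, chunks = None, []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if metadata is not None:
                    yield metadata, ''.join(chunks)
                metadata, chunks = line[1:], []
            elif line:
                chunks.append(line)
    if metadata is not None:
        yield metadata, ''.join(chunks)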
def main():
    """
    Process arguments.
    """
    argparser = argparse.ArgumentParser(
        description=('Update specialized assembly lineage from CSV file.'))
    argparser.add_argument('--filepath',
                           help=('CSV file containing genome lineage'))

    start_time = time.time()
    logger = logging.getLogger('update_specialized_Assembly_taxons')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    args = argparser.parse_args()

    if args.filepath:
        filename = args.filepath
        print(filename)

        with open(filename, 'rt') as f:
            reader = csv.reader(f)
            try:
                count = 1
                for row in reader:
                    #get the ncbi_id and genome (MAG or SAG) name and update the specialized_assembly table with this info
                    if count > 1:
                        fasta_file_name = row[0]
                        genome_id = os.path.splitext(
                            os.path.basename(fasta_file_name))[0]
                        taxon_name = row[1]
                        print("Genome ID", genome_id)
                        ncbi_id = row[2]
                        tax_group = row[3]
                        tax_kingdom = row[4]
                        tax_phylum = row[5]
                        tax_class = row[6]
                        tax_order = row[7]
                        tax_family = row[8]
                        tax_genus = row[9]
                        tax_species = row[10].replace(".", "")
                        ncbi_taxon_name = row[11]

                        cur = db.get_psycopg2_cursor()
                        cur.execute(
                            "update specialized_assembly set ncbi_id = %s where genome_name = %s",
                            (ncbi_id, taxon_name))
                        db.psycopg2_connection.commit()

                        s = "."
                        seq = (tax_group, tax_kingdom, tax_phylum, tax_class,
                               tax_order, tax_family, tax_genus, tax_species)
                        hierachy = s.join(seq)
                        hierachy = hierachy.replace(" ", "_")
                        hierachy = hierachy.replace("-", "_")
                        hierachy = hierachy.replace("(", "_")
                        hierachy = hierachy.replace(")", "_")
                        hierachy = hierachy.replace("[", "")
                        hierachy = hierachy.replace("]", "")
                        hierachy = hierachy.replace("'", "")
                        hierachy = hierachy.replace("..", ".")
                        hierachy = hierachy.replace(":", ".")
                        if hierachy.endswith("."):
                            hierachy = hierachy[:-1]

                        ncbi_url = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=" + ncbi_id
                        print("URL: ", ncbi_url)

                        cur.execute(
                            "select nt.ncbi_id from ncbi_taxonomy nt where nt.ncbi_id = %s;",
                            (ncbi_id, ))
                        taxon_result = cur.fetchone()
                        if taxon_result is None:
                            cur.execute(
                                "insert into ncbi_taxonomy(ncbi_id, tax_group, tax_kingdom, tax_phylum, tax_class, "
                                "tax_order, tax_family, tax_genus, tax_species, hierachy, ncbi_url)"
                                " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
                                (ncbi_id, tax_group, tax_kingdom, tax_phylum,
                                 tax_class, tax_order, tax_family, tax_genus,
                                 tax_species, hierachy, ncbi_url))
                            db.psycopg2_connection.commit()
                        else:
                            cur.execute(
                                "update ncbi_taxonomy set ncbi_id = %s, tax_group = %s, tax_kingdom = %s, tax_phylum = %s, tax_class = %s, "
                                "tax_order = %s, tax_family = %s, tax_genus = %s, tax_species = %s, hierachy = %s, ncbi_url = %s where ncbi_id=%s",
                                (ncbi_id, tax_group, tax_kingdom, tax_phylum,
                                 tax_class, tax_order, tax_family, tax_genus,
                                 tax_species, hierachy, ncbi_url, ncbi_id))
                            db.psycopg2_connection.commit()

                    count = count + 1
            except csv.Error as e:
                logger.error('file %s, line %d: %s' %
                             (filename, reader.line_num, e))
def count_peptide_union_sa_ids(sa_ids=None, logger=None):
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from sadp_count_peptide_union(%s)", (sa_ids or [],))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
    def run(self):
        
        # Read in sequences to query.
        sequences = []
        max_dist = self.args.max_distance

        if self.args.sequence_file:
            with open(self.args.sequence_file, 'r') as f:
                sequences = [line.strip() for line in f.readlines()]

        elif self.args.sequence:
            sequences = [self.args.sequence]

        # Read in whether to query just genomes ('g'), specialized assemblies
        # ('sa'), metagenomes ('m'), or all three ('all').
        # Genomes is the default search.

        query_type = 'g'
        if self.args.type:
            query_type = self.args.type

        if not sequences:
            argparser.error("Provide a query sequence via the '--sequence' option, "
                            "or a set of sequences via the --sequence-file option")

        # Print headers.
        headers = ['query', 'taxon', 'lev_distance', 'match']
        print(','.join(headers))

        # Execute query for each sequence and print results.
        cur = db.get_psycopg2_cursor()

        for seq in sequences:
            if query_type in ('g', 'all'):
                hierachy = []
                results = []
                taxon_lca = ''
                print('GENOMIC RESULTS')
                print('LCA,search sequence,id,name')

                if max_dist == 0:
                    cur.execute("select id, genome_name, hierachy from genomic_query_taxon_by_peptide_sequence_new(%s)", (seq,))
                else:
                    #change this query when we add fuzzy matching
                    cur.execute("select id, genome_name, hierachy from genomic_query_taxon_by_peptide_sequence_new(%s)", (seq,))

                for row in cur.fetchall():
                    hierachy.append(row[0])
                    results.append(','.join([seq, str(row[0]), row[1]]))

                if hierachy:
                    cur.execute("select * from genomic_lca(%s);", [hierachy])
                    lca = cur.fetchone()
                    if lca is not None:
                        taxonHierachy = lca[0].split(".")
                        # Iterate through the taxon hierarchy backwards until we find the first record that is not "unclassified".
                        for l in taxonHierachy[::-1]:

                            if l.lower() != "unclassified":
                                taxon_lca = l
                                break
                    else:
                        taxon_lca = "Unknown"
                for r in results:
                    print(','.join([taxon_lca, r]))
            if query_type in ('sa', 'all'):
                hierachy = []
                results = []
                assembly_lca = ''
                print('\n')
                print('SPECIALIZED ASSEMBLY RESULTS')
                print('LCA,search sequence,genome name, sequence id, NCBI ID')
                cur.execute(
                    "select specialized_assembly_name, specialized_assembly_sequence, ncbi_id from specialized_assembly_taxon_query_by_peptide_sequence(%s)",
                    (seq,))
                for row in cur.fetchall():
                    ncbi_id = row[2]
                    hierachy.append(ncbi_id)
                    results.append(','.join([str(s) for s in [seq] + list(row)]))
                if hierachy:
                    assembly_lca = ""
                    # Get the lowest common ancestor for the hierachy list.
                    cur.execute("select * from specialized_assembly_lca(%s);", [hierachy])
                    lca = cur.fetchone()

                    if lca is not None:
                        taxonHierachy = lca[0].split(".")
                        # Iterate through the taxon hierarchy backwards until we find the first record that is not "unclassified".
                        for l in taxonHierachy[::-1]:

                            if l.lower() != "unclassified":
                                assembly_lca = l
                                break
                    else:
                        assembly_lca = "Unknown"

                for r in results:
                    print(','.join([assembly_lca, r]))
            if query_type in ('m', 'all'):
                hierachy = []
                results = []
                metagenome_lca = ''
                print('\n')
                print('METAGENOMIC RESULTS')
                print('LCA, search sequence, metagenome name, NCBI ID')
                cur.execute("select metagenome_name, contig_tax_id, orf_tax_id from metagenomic_query_by_peptide_sequence(%s)", (seq,))
                for row in cur.fetchall():
                    name = row[0]
                    contig_tax_id = row[1]
                    orf_tax_id = row[2]
                    # Prefer the contig-level tax id; fall back to the ORF-level one.
                    if contig_tax_id:
                        preferred_tax_id = contig_tax_id
                    elif orf_tax_id:
                        preferred_tax_id = orf_tax_id
                    else:
                        preferred_tax_id = None
                    if preferred_tax_id is not None:
                        hierachy.append(preferred_tax_id)
                    results.append(','.join([seq, name, str(preferred_tax_id)]))
                # Get the lowest common ancestor for the hierachy list.
                if hierachy:
                    cur.execute("select * from metagenomic_lca2(%s);", [hierachy])
                    lca = cur.fetchone()
                    metagenome_lca = ""
                    if lca is not None:
                        taxonHierachy = lca[0].split(".")
                        # Iterate through the taxon hierarchy backwards until we find the first record that is not "unclassified".
                        for l in taxonHierachy[::-1]:
                            if l.lower() != "unclassified":
                                metagenome_lca = l
                                break
                    else:
                        metagenome_lca = "Unknown"
                for r in results:
                    print(','.join([metagenome_lca, r]))

        db.psycopg2_connection.commit()
    def process_metagenome_sequence_batch(self,
                                          batch,
                                          metagenome,
                                          logger=None):
        """ Process a batch of metagenome sequences with the given digest. """
        if not batch:
            return
        if not logger:
            logger = self.logger
        # Get existing metagenome sequences (proteins) by searching for sequences.
        sequences = []
        metadataList = []
        for metadata, sequence in batch:
            sequences.append(sequence)
        # Initialize collection of undigested proteins.
        undigested_sequences = {}
        metagenome_sequences = []
        metagenome_ids = []
        # Create proteins which do not exist in the db and add to undigested
        # collection.
        start_time = time.time()
        num_new_sequences = 0
        for metadata, sequence in batch:
            num_new_sequences += 1
            # add sequence and mass to their respective lists to be passed to postgres stored procedure
            metagenome_sequences.append(sequence)
            metadataList.append(metadata)
            metagenome_ids.append(metagenome.id)

        cur = db.get_psycopg2_cursor()
        cur.execute("select * from metagenome_sequence_insert(%s, %s, %s);",
                    (metagenome_sequences, metagenome_ids, metadataList))
        # iterate through the protein records returned from the insert and build a protein object
        for record in cur:
            try:
                meta_seq = Metagenome_Sequence(id=record[0],
                                               sequence=record[1],
                                               metagenome_id=record[2],
                                               sequence_id=record[3])
            except Exception as e:
                logger.exception(
                    "Error processing metagenome sequence, skipping")
                continue
            undigested_sequences[record[0]] = meta_seq
        db.psycopg2_connection.commit()
        total_time = time.time() - start_time
        logger.info("time elapsed: %s" % (total_time))
        self.stats['Protein'] += num_new_sequences
        # Digest undigested proteins.
        if undigested_sequences:
            #logger.info("digesting %s proteins" % num_new_sequences)
            undigested_batch = {}
            peptide_counter = 0
            for metagenome_sequence in list(undigested_sequences.values()):
                peptide_sequences = cleave(
                    metagenome_sequence.sequence,
                    self.digest.protease.cleavage_rule,
                    self.logger,
                    self.digest.max_missed_cleavages,
                    min_acids=self.digest.min_acids,
                    max_acids=self.digest.max_acids,
                )
                peptide_counter += len(peptide_sequences)
                undigested_batch[metagenome_sequence.id] = {
                    'peptide_sequences': peptide_sequences,
                    'metagenome_sequence': metagenome_sequence,
                    'digest': self.digest,
                }
            self.process_peptide_batch(undigested_batch, logger)
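# cleave() is another project helper that is never defined in this listing. A
# minimal sketch, assuming the cleavage rule is a regular expression over
# cleavage sites (as in the hypothetical trypsin rule earlier) and that a
# peptide may span up to max_missed_cleavages internal cleavage sites:
import re

def cleave(sequence, cleavage_rule, logger=None, max_missed_cleavages=0,
           min_acids=None, max_acids=None):
    missed = max_missed_cleavages or 0
    # Cut points: the sequence start, the end of every rule match, and the
    # sequence end.
    sites = [0] + [m.end() for m in re.finditer(cleavage_rule, sequence)]
    if sites[-1] != len(sequence):
        sites.append(len(sequence))
    peptides = []
    for i in range(len(sites) - 1):
        # Join up to missed + 1 consecutive fragments into one peptide.
        for j in range(i + 1, min(i + 2 + missed, len(sites))):
            peptide = sequence[sites[i]:sites[j]]
            if ((min_acids is None or len(peptide) >= min_acids)
                    and (max_acids is None or len(peptide) <= max_acids)):
                peptides.append(peptide)
    return peptides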
def count_common_peptides_ids(taxon_digest_ids=None, logger=None):
    taxon_digest_ids = taxon_digest_ids or []
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from taxon_count_common_peptides(%s, %s)",
                (taxon_digest_ids, len(taxon_digest_ids)))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
    def process_fasta_file(self, path):
        base_msg = "Processing file '%s'..." % path
        file_logger = self.get_child_logger(id(path), base_msg, self.logger)
        # Get metagenome name from filename.
        # This may not be the best way to do this; it might be better to
        # allow the user to input a name.
        metagenome_name = os.path.splitext(os.path.basename(path))[0]
        cur = db.get_psycopg2_cursor()
        cur.execute("select m.id from metagenome m where m.name = %s;",
                    (metagenome_name, ))
        metagenome_result = cur.fetchone()
        db.psycopg2_connection.commit()
        if metagenome_result is None:
            # Add a metagenome to the DB.
            cur.execute("select * from metagenome_insert(%s);",
                        (metagenome_name, ))
            metagenome_result = cur.fetchone()
            self.stats['Metagenome'] += 1
            file_logger.info("Created metagenome '%s'" % metagenome_name)
        metagenome = Metagenome(id=metagenome_result[0], name=metagenome_name)
        # Check if metagenome has already been digested with given digestion agent.
        cur.execute(
            "select md.digest_id from metagenome_sequence_digest_peptide md where md.metagenome_sequence_id in (select ms.id from metagenome_sequence ms where ms.metagenome_id = %s) and md.digest_id = %s;",
            (
                metagenome.id,
                self.digest.id,
            ))
        db.psycopg2_connection.commit()
        metagenome_digest_result = cur.fetchone()
        if metagenome_digest_result:
            # If digest has been run on this metagenome, don't do anything.
            file_logger.info(
                ("Metagenome '%s' has already been digested with"
                 " digest '%s', skipping.") % (metagenome_name, self.digest))
            return
        # Process metagenome sequences in batches.
        file_logger.info("Counting # of metagenome sequences...")
        num_proteins = 0
        for metadata, sequence in fasta.read(path):
            num_proteins += 1
        file_logger.info("%s total metagenome sequences." % num_proteins)
        batch_size = 999
        batch_counter = 0
        batch = []
        protein_logger = self.get_child_logger(
            "%s_proteins" % id(file_logger),
            "Processing metagenome sequences...", file_logger)
        protein_logger.info("")
        for metadata, sequence in fasta.read(path):
            # Check the sequence against the expected amino acids; if this
            # regex matches, the sequence is not valid (it contains a
            # non-amino-acid character).
            if VALID_AAS.search(sequence):
                file_logger.info(
                    "Tried to ingest invalid protein sequence %s" % sequence)
            else:
                batch.append((
                    metadata,
                    sequence,
                ))
                batch_counter += 1
            if (batch_counter % batch_size) == 0:
                self.process_metagenome_sequence_batch(batch,
                                                       metagenome,
                                                       logger=protein_logger)
                protein_logger.info(("%s of %s (%.1f%%)") %
                                    (batch_counter, num_proteins,
                                     100.0 * batch_counter / num_proteins))
                batch = []
        self.process_metagenome_sequence_batch(batch,
                                               metagenome,
                                               logger=protein_logger)
        protein_logger.info("Total Peptide Time: %s" % self.total_peptide_time)