def get_digest(logger, digest_def, session=None):
    """Get a digest matching the given digest definition."""
    if not session:
        session = db.get_session()
    # Get protease object.
    protease = session.query(Protease).get(digest_def['protease']['id'])
    if not protease:
        raise Exception("No protease exists for the given definition.")
    # Get digest object.
    digest = (
        session.query(Digest)
        .filter(Digest.protease == protease)
        .filter(Digest.max_missed_cleavages ==
                digest_def.get('max_missed_cleavages'))
        .filter(Digest.min_acids == digest_def.get('min_acids'))
        .filter(Digest.max_acids == digest_def.get('max_acids'))
    ).first()
    if not digest:
        raise Exception("No digest exists for the given definition.")
    return digest
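# A minimal usage sketch for get_digest. The shape of the definition dict is
# inferred from the lookups above (a nested 'protease' dict plus optional
# digest parameters); the specific values here are hypothetical.
def example_get_digest(logger):
    digest_def = {
        'protease': {'id': 'trypsin'},
        'max_missed_cleavages': 0,
        'min_acids': 6,
        'max_acids': 30,
    }
    return get_digest(logger, digest_def)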
def setUp(self):
    # Set up an in-memory SQLite DB.
    self.engine = create_engine('sqlite://')

    def get_connection():
        return self.engine.connect()

    self.get_connection = get_connection
    db.metadata.create_all(bind=self.engine)
    self.session = db.get_session(bind=self.get_connection())

    # Create trypsin protease.
    trypsin = Protease(id='trypsin', cleavage_rule=expasy_rules['trypsin'])
    self.session.add(trypsin)

    # Create digest.
    self.digest = Digest(protease=trypsin)
    self.session.add(self.digest)
    self.session.commit()

    # Create mock FASTA file.
    hndl, self.fasta_file = tempfile.mkstemp(suffix=".fasta")
    # Close the low-level handle from mkstemp; the file is reopened by
    # path below.
    os.close(hndl)
    self.taxon_id = os.path.splitext(os.path.basename(self.fasta_file))[0]
    with open(self.fasta_file, 'wb') as fh:
        fh.write(self.get_mock_fasta())
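# A hedged sketch of the get_mock_fasta helper referenced above; the record
# itself is hypothetical, just a short protein with tryptic cleavage sites
# (K/R) so the trypsin digest tests have something to cut.
def get_mock_fasta(self):
    return (
        ">sp|P00000|MOCK_PROT Mock protein for digest tests\n"
        "MKWVTFISLLFLFSSAYSRGVFRRDTHK\n"
    )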
def get_digest(logger, digest_def):
    """Fetch or create a digest from a digest definition."""
    session = db.get_session()
    # Get or create protease.
    protease = session.query(Protease).get(digest_def['protease']['id'])
    if not protease:
        logger.info("No protease exists for the given definition, creating...")
        protease = Protease(**digest_def['protease'])
        session.add(protease)
    # Get or create digest object.
    digest = (
        session.query(Digest)
        .filter(Digest.protease == protease)
        .filter(Digest.max_missed_cleavages ==
                digest_def.get('max_missed_cleavages'))
        .filter(Digest.min_acids == digest_def.get('min_acids'))
        .filter(Digest.max_acids == digest_def.get('max_acids'))
    ).first()
    if not digest:
        logger.info("No digest exists for the given definition, creating...")
        digest_kwargs = {}
        digest_kwargs.update(digest_def)
        digest_kwargs['protease'] = protease
        digest = Digest(**digest_kwargs)
        session.add(digest)
        session.commit()
    return digest
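# Note on the two get_digest variants: the version at the top of this section
# is a strict lookup that raises when the protease or digest is missing, while
# the variant above creates and commits the missing rows, which suits
# ingest-style scripts run against an empty database.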
def main():
    args = argparser.parse_args()
    logger = logging.getLogger('query_by_sequence')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Define a Levenshtein distance function in SQLite.
    try:
        def levenshtein(s1, s2):
            l1 = len(s1)
            l2 = len(s2)
            # Build the DP matrix row by row; each row must be an
            # independent list (avoid list-multiplication aliasing).
            matrix = [range(zz, zz + l1 + 1) for zz in range(l2 + 1)]
            for zz in range(l2):
                for sz in range(l1):
                    if s1[sz] == s2[zz]:
                        matrix[zz + 1][sz + 1] = min(
                            matrix[zz + 1][sz] + 1,
                            matrix[zz][sz + 1] + 1,
                            matrix[zz][sz])
                    else:
                        matrix[zz + 1][sz + 1] = min(
                            matrix[zz + 1][sz] + 1,
                            matrix[zz][sz + 1] + 1,
                            matrix[zz][sz] + 1)
            return matrix[l2][l1]

        connection = db.get_connection()
        connection.connection.create_function("LEVENSHTEIN", 2, levenshtein)
    except Exception as e:
        logger.exception("Could not define Levenshtein distance function: %s" % e)
        # Re-raise with a bare 'raise' to preserve the original traceback.
        raise
    session = db.get_session(bind=connection)

    # Read in sequences to query.
    sequences = []
    if args.sequence_file:
        with open(args.sequence_file, 'rb') as f:
            sequences = [line.strip() for line in f.readlines()]
    elif args.sequence:
        sequences = [args.sequence]
    if not sequences:
        argparser.error(
            "Provide a query sequence via the '--sequence' option, "
            "or a set of sequences via the '--sequence-file' option.")

    # Print headers.
    headers = ['query', 'taxon', 'lev_distance', 'match']
    print ','.join(headers)

    # Execute the query for each sequence and print the results.
    for seq in sequences:
        lev_dist = func.LEVENSHTEIN(Peptide.sequence, seq)
        q = (
            session.query(TaxonDigest.taxon_id, lev_dist, Peptide.sequence)
            .select_from(Peptide)
            .join(TaxonDigestPeptide)
            .join(TaxonDigest)
            .filter(lev_dist <= args.max_distance)
        )
        for row in q:
            print ','.join([str(s) for s in [seq] + list(row)])
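# Sanity-check sketch for the SQL-side LEVENSHTEIN function, assuming a
# SQLAlchemy connection like the one created in main() above (this helper
# is not part of the original script).
def check_levenshtein(connection):
    # 'kitten' -> 'sitting' takes 3 edits; a quick way to confirm the
    # function was registered with SQLite correctly.
    row = connection.execute("SELECT LEVENSHTEIN('kitten', 'sitting')").fetchone()
    assert row[0] == 3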
def run(self):
    # Get session.
    self.session = db.get_session(bind=self.get_connection())
    taxons = (
        self.session.query(Taxon)
        .filter(Taxon.id.in_(self.taxon_ids))
    )
    for taxon in taxons:
        self.logger.info("Clearing data for taxon '%s'" % taxon.id)
        self.clear_data_for_taxon(taxon)
def run(self):
    # Get session.
    self.session = db.get_session(bind=self.get_connection())
    self.digest = self.session.merge(self.digest)
    # Initialize stats dict.
    self.stats = defaultdict(int)
    # Process FASTA files.
    for path in self.fasta_paths:
        self.process_fasta_file(path)
    self.logger.info("Digest and ingest task complete.")
    return self.stats
def main():
    args = argparser.parse_args()
    logger = logging.getLogger('redundancy_tables')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Check that taxon ids or a taxon id file were provided.
    if not (args.taxon_ids or args.taxon_id_file):
        raise Exception("Must provide --taxon-ids or --taxon-id-file option")

    session = db.get_session()

    # Get taxon ids.
    if args.taxon_ids:
        taxon_ids = args.taxon_ids
    else:
        with open(args.taxon_id_file, 'rb') as f:
            taxon_ids = [row[0] for row in csv.reader(f)]

    # Get the digest.
    digest = get_digest(logger, config.DEFAULT_DIGEST_DEFINITION, session)

    # Get the TaxonDigests.
    taxon_digests = (
        session.query(TaxonDigest)
        .filter(TaxonDigest.digest == digest)
        .join(Taxon)
        .filter(Taxon.id.in_(taxon_ids))
    ).all()

    # Generate the redundancy tables.
    tables = redundancy.generate_redundancy_tables(
        session, taxon_digests, logger=logger)

    # Create the output dir if it does not exist.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Output tables.
    for table_id, table in tables.items():
        table_file = os.path.join(args.output_dir, table_id + '.csv')
        logger.info("Writing '%s'..." % table_file)
        with open(table_file, 'wb') as f:
            w = csv.writer(f)
            for row in table:
                w.writerow(row)

    logger.info("Done.")
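# Example invocation of the redundancy tables script (the script name is an
# assumption; the option names come from the checks above):
#   python redundancy_tables.py --taxon-ids taxon1 taxon2 --output-dir ./out
#   python redundancy_tables.py --taxon-id-file taxon_ids.csv --output-dir ./out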