def get_digest(logger, digest_def, session=None):
    """ Get digest from a digest definition."""
    if not session:
        session = db.get_session()

    # Get protease object.
    protease = session.query(Protease).get(
        digest_def['protease']['id'])
    if not protease:
        raise Exception("No protease exists for the given definition.")

    # Get digest object.
    digest = (
        session.query(Digest)
        .filter(Digest.protease == protease)
        .filter(Digest.max_missed_cleavages == digest_def.get(
            'max_missed_cleavages'))
        .filter(Digest.min_acids == digest_def.get(
            'min_acids'))
        .filter(Digest.max_acids == digest_def.get(
            'max_acids'))
    ).first()
    if not digest:
        raise Exception("No digest exists for the given definition.")

    return digest
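
For reference, a minimal sketch of the digest_def mapping this function expects; the key names mirror the lookups above, but the values here are hypothetical:

import logging

logger = logging.getLogger(__name__)

digest_def = {
    'protease': {'id': 'trypsin'},   # looked up by primary key
    'max_missed_cleavages': 1,       # each of these is matched exactly;
    'min_acids': 6,                  # a missing key compares against None
    'max_acids': 30,
}
digest = get_digest(logger, digest_def)
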
    def setUp(self):

        # Setup DB.
        #d = tempfile.mkdtemp(prefix="tdb.")
        #db_file = os.path.join(d, "foo")
        #self.engine = create_engine('sqlite:///%s' % db_file)
        self.engine = create_engine('sqlite://')

        def get_connection():
            return self.engine.connect()

        self.get_connection = get_connection
        db.metadata.create_all(bind=self.engine)

        self.session = db.get_session(bind=self.get_connection())

        # Create trypsin protease.
        trypsin = Protease(id='trypsin', cleavage_rule=expasy_rules['trypsin'])
        self.session.add(trypsin)

        # Create digest.
        self.digest = Digest(protease=trypsin)
        self.session.add(self.digest)
        self.session.commit()

        # Create mock FASTA file.
        hndl, self.fasta_file = tempfile.mkstemp(suffix=".fasta")
        os.close(hndl)  # mkstemp returns an open descriptor; the file is reopened by name below.
        self.taxon_id = os.path.splitext(os.path.basename(self.fasta_file))[0]
        with open(self.fasta_file, 'wb') as fh:
            fh.write(self.get_mock_fasta())
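
get_mock_fasta is not shown in this snippet; a minimal sketch of what it might return (hypothetical sequences, returned as a byte string since the file is opened in 'wb' mode):

    def get_mock_fasta(self):
        # Hypothetical records; sequences containing K/R residues give
        # trypsin something to cleave.
        return (
            ">protein_1\n"
            "MKWVTFISLLFLFSSAYSR\n"
            ">protein_2\n"
            "MAFSAEDVLKEYDRRR\n"
        )
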
Example #4
def get_digest(logger, digest_def):
    """ Fetch or create a digest from a digest definition."""
    session = db.get_session()

    # Get or create protease.
    protease = session.query(Protease).get(
        digest_def['protease']['id'])
    if not protease:
        logger.info(
            "No protease exists for the given definition, creating...")
        protease = Protease(**digest_def['protease'])
        session.add(protease)
    # Get or create digest object.
    digest = (
        session.query(Digest)
        .filter(Digest.protease == protease)
        .filter(Digest.max_missed_cleavages == digest_def.get(
            'max_missed_cleavages'))
        .filter(Digest.min_acids == digest_def.get(
            'min_acids'))
        .filter(Digest.max_acids == digest_def.get(
            'max_acids'))
    ).first()
    if not digest:
        logger.info(
            "No digest exists for the given definition, creating...")
        digest_kwargs = {}
        digest_kwargs.update(digest_def)
        digest_kwargs['protease'] = protease
        digest = Digest(**digest_kwargs)
        session.add(digest)
    session.commit()
    return digest
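
A brief usage sketch of this get-or-create variant (hypothetical definition; the nested 'protease' dict is unpacked into Protease(**...), so its keys must match that model's columns):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('digest_setup')

digest_def = {
    'protease': {'id': 'trypsin', 'cleavage_rule': '[KR]'},  # placeholder rule
    'max_missed_cleavages': 1,
}

digest = get_digest(logger, digest_def)  # first call creates the rows
digest = get_digest(logger, digest_def)  # second call finds the same rows
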
Example #5
def main():
    args = argparser.parse_args()

    logger = logging.getLogger('query_by_sequence')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Define levenshtein function in SQLite.
    try:
        def levenshtein(s1, s2):
            l1 = len(s1)
            l2 = len(s2)
            matrix = [range(l1 + 1)] * (l2 + 1)
            for zz in range(l2 + 1):
                matrix[zz] = range(zz, zz + l1 + 1)
            for zz in range(0, l2):
                for sz in range(0, l1):
                    if s1[sz] == s2[zz]:
                        matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
                                                     matrix[zz][sz + 1] + 1,
                                                     matrix[zz][sz])
                    else:
                        matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
                                                     matrix[zz][sz + 1] + 1,
                                                     matrix[zz][sz] + 1)
            return matrix[l2][l1]

        connection = db.get_connection()
        connection.connection.create_function("LEVENSHTEIN", 2, levenshtein)
    except Exception as e:
        logger.exception('Could not define Levenshtein distance function: %s' % e)
        raise  # bare raise preserves the original traceback

    session = db.get_session(bind=connection)

    # Read in sequences to query.
    sequences = []
    if args.sequence_file:
        with open(args.sequence_file, 'rb') as f:
            sequences = [line.strip() for line in f.readlines()]
    elif args.sequence:
        sequences = [args.sequence]

    if not sequences:
        argparser.error("Provide a query sequence via the '--sequence' option, "
                        "or a set of sequences via the --sequence-file option")

    # Print headers. 
    headers = ['query', 'taxon', 'lev_distance', 'match']
    print ','.join(headers)

    # Execute query for each sequence and print results.
    for seq in sequences:
        lev_dist = func.LEVENSHTEIN(Peptide.sequence, seq)
        q = (session.query(TaxonDigest.taxon_id, lev_dist,
                           Peptide.sequence)
             .select_from(Peptide)
             .join(TaxonDigestPeptide)
             .join(TaxonDigest)
             .filter(lev_dist <= args.max_distance)
            )
        for row in q:
            print ','.join([str(s) for s in [seq] + list(row)])
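
As a sanity check, the same dynamic-programming distance and SQLite function registration can be exercised with the standard-library sqlite3 module alone; a self-contained sketch, independent of the project's db helpers:

import sqlite3

def levenshtein(s1, s2):
    # Row-by-row DP: prev[j] holds the distance between the consumed
    # prefix of s1 and s2[:j].
    prev = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, 1):
        curr = [i]
        for j, c2 in enumerate(s2, 1):
            cost = 0 if c1 == c2 else 1
            curr.append(min(curr[j - 1] + 1,      # insertion
                            prev[j] + 1,          # deletion
                            prev[j - 1] + cost))  # substitution
        prev = curr
    return prev[-1]

conn = sqlite3.connect(':memory:')
conn.create_function("LEVENSHTEIN", 2, levenshtein)
print(conn.execute("SELECT LEVENSHTEIN('kitten', 'sitting')").fetchone()[0])  # 3
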
Example #6
    def run(self):
        # Get session.
        self.session = db.get_session(bind=self.get_connection())

        taxons = (self.session.query(Taxon).filter(Taxon.id.in_(
            self.taxon_ids)))

        for taxon in taxons:
            self.logger.info("Clearing data for taxon '%s'" % taxon.id)
            self.clear_data_for_taxon(taxon)
Example #7
    def run(self):
        # Get session.
        self.session = db.get_session(bind=self.get_connection())

        taxons = (
            self.session.query(Taxon)
            .filter(Taxon.id.in_(self.taxon_ids))
        )

        for taxon in taxons:
            self.logger.info("Clearing data for taxon '%s'" % taxon.id)
            self.clear_data_for_taxon(taxon)
Example #8
    def run(self):
        # Get session.
        self.session = db.get_session(bind=self.get_connection())
        # Reattach the digest instance, which may have been created in
        # another session, to this task's session.
        self.digest = self.session.merge(self.digest)

        # Initialize stats dict.
        self.stats = defaultdict(int)

        # Process FASTA files.
        for path in self.fasta_paths:
            self.process_fasta_file(path)

        self.logger.info("Digest and ingest task complete.")
        return self.stats
Example #10
def main():
    args = argparser.parse_args()

    logger = logging.getLogger('redundancy_tables')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Check that taxon ids or taxon id file were provided.
    if not (args.taxon_ids or args.taxon_id_file):
        raise Exception("Must provide --taxon-ids or --taxon-id-file option")

    session = db.get_session()

    # Get taxons.
    if args.taxon_ids:
        taxon_ids = args.taxon_ids
    else:
        with open(args.taxon_id_file, 'rb') as f:
            taxon_ids = [row[0] for row in csv.reader(f)]

    # Get the digest.
    digest = get_digest(logger, config.DEFAULT_DIGEST_DEFINITION, session)

    # Get the TaxonDigests.
    taxon_digests = (
        session.query(TaxonDigest)
        .filter(TaxonDigest.digest == digest)
        .join(Taxon)
        .filter(Taxon.id.in_(taxon_ids))
    ).all()

    # Generate the redundancy tables.
    tables = redundancy.generate_redundancy_tables(
        session, taxon_digests, logger=logger)

    # Create output dir if it does not exist.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Output tables.
    for table_id, table in tables.items():
        table_file = os.path.join(args.output_dir, table_id + '.csv')
        logger.info("Writing '%s'..." % table_file)
        with open(table_file, 'wb') as f:
            w = csv.writer(f)
            for row in table:
                w.writerow(row)

    logger.info("Done.")
Example #11
def main():
    args = argparser.parse_args()

    logger = logging.getLogger('redundancy_tables')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Check that taxon ids or taxon id file were provided.
    if not (args.taxon_ids or args.taxon_id_file):
        raise Exception("Must provide --taxon-ids or --taxon-id-file option")

    session = db.get_session()

    # Get taxons.
    if args.taxon_ids:
        taxon_ids = args.taxon_ids
    else:
        with open(args.taxon_id_file, 'rb') as f:
            taxon_ids = [row[0] for row in csv.reader(f)]

    # Get the digest.
    digest = get_digest(logger, config.DEFAULT_DIGEST_DEFINITION, session)

    # Get the TaxonDigests.
    taxon_digests = (
        session.query(TaxonDigest)
        .filter(TaxonDigest.digest == digest)
        .join(Taxon)
        .filter(Taxon.id.in_(taxon_ids))
    ).all()

    # Generate the redundancy tables.
    tables = redundancy.generate_redundancy_tables(session,
                                                   taxon_digests,
                                                   logger=logger)

    # Create output dir if it does not exist.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Output tables.
    for table_id, table in tables.items():
        table_file = os.path.join(args.output_dir, table_id + '.csv')
        logger.info("Writing '%s'..." % table_file)
        with open(table_file, 'wb') as f:
            w = csv.writer(f)
            for row in table:
                w.writerow(row)

    logger.info("Done.")
Example #12
def get_digest(logger, digest_def, session=None):
    """ Get digest from a digest definition."""
    if not session:
        session = db.get_session()

    # Get protease object.
    protease = session.query(Protease).get(digest_def['protease']['id'])
    if not protease:
        raise Exception("No protease exists for the given definition.")

    # Get digest object.
    digest = (
        session.query(Digest)
        .filter(Digest.protease == protease)
        .filter(Digest.max_missed_cleavages == digest_def.get(
            'max_missed_cleavages'))
        .filter(Digest.min_acids == digest_def.get('min_acids'))
        .filter(Digest.max_acids == digest_def.get('max_acids'))
    ).first()
    if not digest:
        raise Exception("No digest exists for the given definition.")

    return digest
Example #13
def main():
    args = argparser.parse_args()

    logger = logging.getLogger('query_by_sequence')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Define levenshtein function in SQLite.
    try:

        def levenshtein(s1, s2):
            l1 = len(s1)
            l2 = len(s2)
            matrix = [range(l1 + 1)] * (l2 + 1)
            for zz in range(l2 + 1):
                matrix[zz] = range(zz, zz + l1 + 1)
            for zz in range(0, l2):
                for sz in range(0, l1):
                    if s1[sz] == s2[zz]:
                        matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
                                                     matrix[zz][sz + 1] + 1,
                                                     matrix[zz][sz])
                    else:
                        matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1,
                                                     matrix[zz][sz + 1] + 1,
                                                     matrix[zz][sz] + 1)
            return matrix[l2][l1]

        connection = db.get_connection()
        connection.connection.create_function("LEVENSHTEIN", 2, levenshtein)
    except Exception as e:
        logger.exception('Could not define Levenshtein distance function: %s' %
                         e)
        raise  # bare raise preserves the original traceback

    session = db.get_session(bind=connection)

    # Read in sequences to query.
    sequences = []
    if args.sequence_file:
        with open(args.sequence_file, 'rb') as f:
            sequences = [line.strip() for line in f.readlines()]
    elif args.sequence:
        sequences = [args.sequence]

    if not sequences:
        argparser.error(
            "Provide a query sequence via the '--sequence' option, "
            "or a set of sequences via the --sequence-file option")

    # Print headers.
    headers = ['query', 'taxon', 'lev_distance', 'match']
    print ','.join(headers)

    # Execute query for each sequence and print results.
    for seq in sequences:
        lev_dist = func.LEVENSHTEIN(Peptide.sequence, seq)
        q = (
            session.query(TaxonDigest.taxon_id, lev_dist, Peptide.sequence)
            .select_from(Peptide)
            .join(TaxonDigestPeptide)
            .join(TaxonDigest)
            .filter(lev_dist <= args.max_distance)
        )
        for row in q:
            print ','.join([str(s) for s in [seq] + list(row)])
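
Assuming the script is named query_by_sequence.py (matching its logger name), and since results are printed as CSV on stdout, usage might look like:

python query_by_sequence.py --sequence PEPTIDEK --max-distance 2 > matches.csv
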