Beispiel #1
0
def get_unannotated_entries(databank_name):
    """List the parent's pdbids that have no file and no comment here.

    Compares the parent databank's entries that have a 'filepath' against
    this databank's entries that have a 'filepath' or a 'comment'.

    Returns a list of fresh {'pdbid', 'databank_name'} dicts, one for each
    such parent entry whose pdbid is not found in the result of
    entries_by_pdbid for this databank.  Returns an empty list when the
    databank has no 'parent_name'.

    Raises Exception when no databank called `databank_name` is stored.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    # Without a parent there is nothing to compare against.
    if 'parent_name' not in databank:
        return []

    # Entries of this databank that already carry a file or an annotation.
    file_or_comment = [{'filepath': {'$exists': True}},
                       {'comment': {'$exists': True}}]
    known = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name,
                                 '$or': file_or_comment}))

    # Only the pdbid of each parent entry is needed for the comparison.
    parents_with_file = storage.find(
        'entries',
        {'databank_name': databank['parent_name'],
         'filepath': {'$exists': True}},
        {'pdbid': 1})

    return [{'pdbid': parent['pdbid'], 'databank_name': databank_name}
            for parent in parents_with_file
            if parent['pdbid'] not in known]
Beispiel #2
0
def search_results_for(pdbid):
    """Summarise, per databank, what is known about `pdbid`.

    Returns a dict keyed by databank name.  The value is:
      * the result of get_file_link(databank, pdbid) when the databank's
        entry for this pdbid has a 'filepath';
      * the entry's 'comment' when it has a 'comment' but no 'filepath';
      * 'Not available' when the databank has no entry for this pdbid,
        with ', depends on <parent_name>' appended when the parent
        databank has no entry either, or its entry carries a 'comment'.

    A databank whose entry has neither 'filepath' nor 'comment' gets no
    key in the result.
    """
    entries = entries_by_databank(storage.find('entries', {'pdbid': pdbid}))
    databanks = databanks_by_name(storage.find('databanks', {}))

    results = {}
    for databank_name, databank in databanks.items():

        if databank_name in entries:
            entry = entries[databank_name]
            if 'filepath' in entry:
                results[databank_name] = get_file_link(databank, pdbid)
            elif 'comment' in entry:
                results[databank_name] = entry['comment']
            continue

        status = 'Not available'
        if 'parent_name' in databank:
            parent_name = databank['parent_name']
            # The parent is reported as the blocker when it is absent
            # itself or only has an annotation for this pdbid.
            if (parent_name not in entries
                    or 'comment' in entries[parent_name]):
                status += ', depends on %s' % parent_name
        results[databank_name] = status

    return results
Beispiel #3
0
def get_databank_hierarchy(name=None):
    """Build the nested tree of databanks below `name`.

    With name=None the tree starts at the databanks that have no
    'parent_name'; otherwise at the databanks whose 'parent_name' matches
    `name`.  Returns a dict mapping each such databank's name to the tree
    (same shape) of its own children, obtained by recursion; a databank
    without children maps to an empty dict.
    """
    if name is None:
        parent_filter = {'$exists': False}
    else:
        parent_filter = name

    # Only the names are needed to recurse.
    children = storage.find('databanks',
                            {'parent_name': parent_filter},
                            {'name': 1, '_id': 0})

    tree = {}
    for child in children:
        child_name = child['name']
        tree[child_name] = get_databank_hierarchy(child_name)

    return tree
Beispiel #4
0
def search_results_for(pdbid):
    """Report the availability of `pdbid` in every databank.

    Returns a dict keyed by databank name.  The value is:
      * the result of get_file_link(databank, pdbid) when the databank's
        entry for this pdbid has a 'filepath';
      * the entry's 'comment' when it has a 'comment' but no 'filepath';
      * 'Not available' when the databank has no entry for this pdbid,
        with ', depends on <parent_name>' appended when the parent
        databank has no entry either, or its entry carries a 'comment'.

    A databank whose entry has neither 'filepath' nor 'comment' gets no
    key in the result.
    """
    entries = entries_by_databank(storage.find('entries', {'pdbid': pdbid}))
    databanks = databanks_by_name(storage.find('databanks', {}))

    results = {}
    for databank_name, databank in databanks.items():

        if databank_name not in entries:
            message = 'Not available'
            if 'parent_name' in databank:
                parent_name = databank['parent_name']
                # Point at the parent when it is missing too, or when it
                # is only annotated for this pdbid.
                parent_blocks = (parent_name not in entries
                                 or 'comment' in entries[parent_name])
                if parent_blocks:
                    message += ', depends on %s' % parent_name
            results[databank_name] = message
            continue

        entry = entries[databank_name]
        if 'filepath' in entry:
            results[databank_name] = get_file_link(databank, pdbid)
        elif 'comment' in entry:
            results[databank_name] = entry['comment']

    return results
Beispiel #5
0
def count_summary(databank_name):
    """Count the entries of a databank by status.

    Returns a dict with the keys:
      'present'     - distinct pdbids of this databank's entries that have
                      a 'filepath';
      'missing'     - pdbids with a 'filepath' in the parent databank that
                      are not present here;
      'annotated'   - entries of this databank with a 'comment' whose
                      pdbid is missing;
      'unannotated' - missing minus annotated;
      'obsolete'    - present pdbids that the parent does not have with a
                      'filepath';
      'valid'       - present minus obsolete.
    Without a 'parent_name' everything except 'present' and 'valid' is 0.

    Raises Exception when no databank called `databank_name` is stored.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    projection = {'pdbid': 1, '_id': 0}

    present = Set(entry['pdbid'] for entry in storage.find(
        'entries',
        {'databank_name': databank_name, 'filepath': {'$exists': True}},
        projection))
    n_present = len(present)

    if 'parent_name' not in databank:
        # no parent, so nothing is missing or obsolete
        return {'present': n_present,
                'missing': 0,
                'valid': n_present,
                'obsolete': 0,
                'annotated': 0,
                'unannotated': 0}

    parent_entries = storage.find(
        'entries',
        {'databank_name': databank['parent_name'],
         'filepath': {'$exists': True}},
        projection)
    comment_entries = storage.find(
        'entries',
        {'databank_name': databank_name, 'comment': {'$exists': True}},
        projection)

    parent_pdbids = Set(entry['pdbid'] for entry in parent_entries)
    missing = parent_pdbids - present
    obsolete = present - parent_pdbids

    n_annotated = sum(1 for entry in comment_entries
                      if entry['pdbid'] in missing)

    # missing = annotated + unannotated
    return {'present': n_present,
            'missing': len(missing),
            'annotated': n_annotated,
            'unannotated': len(missing) - n_annotated,
            'obsolete': len(obsolete),
            'valid': n_present - len(obsolete)}
Beispiel #6
0
def comment_summary():
    """Aggregate the annotated entries by comment text.

    Scans every entry that has both a 'comment' and an 'mtime' and returns
    (via dict.values()) one {'text', 'n_entries', 'mtime'} record per
    distinct comment text: 'n_entries' counts the entries carrying that
    text and 'mtime' is the largest mtime seen among them.
    """
    selector = {'comment': {'$exists': True}, 'mtime': {'$exists': True}}
    fields = {'mtime': 1, 'comment': 1, '_id': 0}

    by_text = {}
    for entry in storage.find('entries', selector, fields):
        text = entry['comment']

        record = by_text.get(text)
        if record is None:
            record = {'text': text, 'n_entries': 0, 'mtime': entry['mtime']}
            by_text[text] = record

        record['n_entries'] += 1
        # Keep the most recent modification time for this text.
        if record['mtime'] < entry['mtime']:
            record['mtime'] = entry['mtime']

    return by_text.values()
Beispiel #7
0
def get_databank_hierarchy(name=None):
    """Return the databank tree rooted below `name` as nested dicts.

    name=None selects the databanks without a 'parent_name'; any other
    value selects the databanks whose 'parent_name' matches it.  Each
    selected databank's name becomes a key whose value is the result of
    the same call for that name, so leaves end up as empty dicts.
    """
    only_name = {'name': 1, '_id': 0}
    if name is None:
        selector = {'parent_name': {'$exists': False}}
    else:
        selector = {'parent_name': name}

    subtree = {}
    for record in storage.find('databanks', selector, only_name):
        child = record['name']
        subtree[child] = get_databank_hierarchy(child)

    return subtree
Beispiel #8
0
def get_missing_entries(databank_name):
    """List the entries the parent has on file but this databank lacks.

    Walks get_present_entries() of the parent databank.  For each pdbid:
      * when this databank has a record for it that lacks 'filepath' or
        'mtime', that record is reported;
      * when this databank has no record at all, a fresh
        {'pdbid', 'databank_name'} dict is reported.

    Returns the list of reported records; an empty list when the databank
    has no 'parent_name'.

    Raises Exception when no databank called `databank_name` is stored.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    # Needs a parent to determine what's missing
    if 'parent_name' not in databank:
        return []

    own_records = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name}))

    missing = []
    for parent_entry in get_present_entries(databank['parent_name']):
        pdbid = parent_entry['pdbid']

        if pdbid not in own_records:
            missing.append({'pdbid': pdbid, 'databank_name': databank_name})
            continue

        record = own_records[pdbid]
        if not ('filepath' in record and 'mtime' in record):
            missing.append(record)

    return missing
Beispiel #9
0
def crawl_files(databank, pathnames):
    """Register the paths in `pathnames` as file entries of `databank`.

    Each path is searched with the pattern obtained from
    parse_regex(databank['regex']); group 1 of the match, lower-cased, is
    the pdbid.  Paths that do not match are ignored, and so are pdbids
    found in entries_by_pdbid(get_present_entries(...)), i.e. among the
    entries that already have a 'filepath' and an 'mtime'.

    For the remaining paths an entry with 'databank_name', 'pdbid',
    'filepath' and 'mtime' is written: storage.update when a record for
    the pdbid was found at the start of the crawl, storage.insert
    otherwise.  The lookups are made once, before the loop, and are not
    refreshed while crawling.
    """
    databank_name = databank['name']
    present_entries_bypdbid = entries_by_pdbid(
        get_present_entries(databank_name))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name},
                     {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    for f in pathnames:

        # Only use files that match the databank's pattern.
        m = pattern.search(f)
        if not m:
            continue

        pdbid = m.group(1).lower()

        # Skip present entries before touching the filesystem: their
        # mtime would be thrown away anyway.
        if pdbid in present_entries_bypdbid:
            continue

        # For disk files take their mtimes, for urls take current time.
        if os.path.isfile(f):
            mtime = os.path.getmtime(f)
        else:
            mtime = time()

        entry = {
            'databank_name': databank_name,
            'pdbid': pdbid,
            'filepath': f,
            'mtime': mtime
        }

        if pdbid in record_pdbids:
            storage.update('entries',
                           {'databank_name': databank_name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
Beispiel #10
0
def crawl_lines(databank, filepath, lines):
    """Register the pdbids listed in `lines` as file entries of `databank`.

    Every line is searched with the pattern obtained from
    parse_regex(databank['regex']); group 1 of the match, lower-cased, is
    the pdbid.  All resulting entries share `filepath` and one mtime: the
    file's modification time when `filepath` is a file on disk, the
    current time otherwise.

    Lines that do not match are ignored, and so are pdbids found in
    entries_by_pdbid(get_present_entries(...)).  For the rest the entry is
    written with storage.update when a record for the pdbid was found at
    the start of the crawl, and with storage.insert otherwise.
    """
    databank_name = databank['name']
    already_present = entries_by_pdbid(get_present_entries(databank_name))
    known_records = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name},
                     {'pdbid': 1}))
    line_pattern = parse_regex(databank['regex'])

    # If it's a disk file take its mtime, for urls take current time.
    if os.path.isfile(filepath):
        mtime = os.path.getmtime(filepath)
    else:
        mtime = time()

    for line in lines:

        # Only use lines that match the databank's pattern
        match = line_pattern.search(line)
        if not match:
            continue

        pdbid = match.group(1).lower()
        if pdbid in already_present:
            continue

        entry = {
            'databank_name': databank_name,
            'pdbid': pdbid,
            'filepath': filepath,
            'mtime': mtime
        }

        if pdbid in known_records:
            storage.update('entries',
                           {'databank_name': databank_name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
Beispiel #11
0
def get_entries_with_comment(databank_name, comment):
    """Find the entries of `databank_name` that match `comment`.

    Returns whatever storage.find yields for the selector
    {'databank_name': databank_name, 'comment': comment}, requested in
    ascending 'pdbid' order.
    """
    # NOTE(review): ordering was found to make this take longer, yet the
    # sort is still requested below -- confirm whether callers rely on it.
    selector = {'databank_name': databank_name, 'comment': comment}
    by_pdbid = [("pdbid", pymongo.ASCENDING)]

    return storage.find('entries', selector, order=by_pdbid)
Beispiel #12
0
def get_entries_with_pdbid(databank_name, pdbid):
    """Find the entries of `databank_name` that match `pdbid`.

    Returns whatever storage.find yields for the selector
    {'databank_name': databank_name, 'pdbid': pdbid}, requested in
    ascending 'pdbid' order.
    """
    # NOTE(review): ordering was found to make this take longer, yet the
    # sort is still requested below -- confirm whether callers rely on it.
    selector = {'databank_name': databank_name, 'pdbid': pdbid}
    by_pdbid = [("pdbid", pymongo.ASCENDING)]

    return storage.find('entries', selector, order=by_pdbid)
Beispiel #13
0
def databanks(name=None):
    """Render the databank page for all databanks or for a single one.

    With name=None every document found in 'databanks' is handed to the
    template; otherwise a one-element list holding the result of find_one
    for `name`.

    NOTE(review): when no databank matches `name`, the list holds whatever
    find_one returns for "not found" -- confirm the template copes.
    """
    if name is None:
        selected = storage.find('databanks', {})
    else:
        selected = [storage.find_one('databanks', {'name': name})]

    return render_template('databank/DatabankPage.html',
                           db_tree=db_tree,
                           nav_disabled='databanks',
                           databanks=selected)
Beispiel #14
0
def load_statistics():
    """Return overall storage statistics as a jsonify() response.

    The payload has the keys 'total_databanks', 'total_entries',
    'total_files', 'total_annotations', 'total_comments', 'files' and
    'annotations'.  Entries without an 'mtime' only count towards
    'total_entries'; entries with an 'mtime' count as a file when they
    have a 'filepath', else as an annotation when they have a 'comment'.
    'files' and 'annotations' are built from what the two top_highest(10)
    collectors return (presumably the ten highest mtimes -- TODO confirm).
    """
    _log.info("request for statistics")

    #TODO: speed up this method

    n_databanks = storage.count('databanks', {})

    n_entries = 0
    n_annotations = 0
    n_files = 0

    comment_texts = Set()
    newest_files = top_highest(10)
    newest_annotations = top_highest(10)
    for entry in storage.find('entries', {}):

        n_entries += 1
        if 'mtime' not in entry:
            continue

        if 'filepath' in entry:
            n_files += 1
            newest_files.add(entry['mtime'], entry)
        elif 'comment' in entry:
            n_annotations += 1
            comment_texts.add(entry['comment'])
            newest_annotations.add(entry['mtime'], entry)

    # Perform time-consuming operations only on the last 10 files and annotations
    files = [{'path': f['filepath'],
              'date': strftime(date_format, gmtime(f['mtime']))}
             for f in newest_files.get()]

    annotations = [{'comment': a['comment'],
                    'pdbid': a['pdbid'],
                    'databank_name': a['databank_name'],
                    'date': strftime(date_format, gmtime(a['mtime']))}
                   for a in newest_annotations.get()]

    return jsonify({
        'total_databanks': n_databanks,
        'total_entries': n_entries,
        'total_files': n_files,
        'total_annotations': n_annotations,
        'total_comments': len(comment_texts),
        'annotations': annotations,
        'files': files,
    })
Beispiel #15
0
def databanks(name=None):
    """Render 'databank/DatabankPage.html' for one or all databanks.

    name=None passes every document found in 'databanks' to the template;
    any other value passes a one-element list with the find_one result
    for that name.

    NOTE(review): a name that matches nothing still yields a one-element
    list holding find_one's "not found" value -- confirm the template
    handles it.
    """
    if name is None:
        found = storage.find('databanks', {})
    else:
        found = [storage.find_one('databanks', {'name': name})]

    return render_template('databank/DatabankPage.html',
                           db_tree=db_tree,
                           nav_disabled='databanks',
                           databanks=found)
Beispiel #16
0
def get_annotated_entries(databank_name):
    """Find the entries of `databank_name` that are annotated only.

    Selects entries that have a 'comment' and no 'filepath', and returns
    the storage.find result requested in ascending 'pdbid' order.
    """
    selector = {
        'databank_name': databank_name,
        'comment': {'$exists': True},
        'filepath': {'$exists': False},
    }
    by_pdbid = [("pdbid", pymongo.ASCENDING)]

    return storage.find('entries', selector, order=by_pdbid)
Beispiel #17
0
def get_present_entries(databank_name, ordered=False):
    """Find the entries of `databank_name` that have a file on record.

    Selects entries that have both a 'filepath' and an 'mtime', and
    returns the storage.find result requested in ascending 'pdbid' order.
    """
    # NOTE(review): `ordered` is never read -- the sort is always
    # requested, even though ordering was found to make this take longer.
    selector = {
        'databank_name': databank_name,
        'filepath': {'$exists': True},
        'mtime': {'$exists': True},
    }
    by_pdbid = [("pdbid", pymongo.ASCENDING)]

    return storage.find('entries', selector, order=by_pdbid)
Beispiel #18
0
def comment_summary():
    """Summarise the annotated entries per distinct comment text.

    Reads every entry that has both a 'comment' and an 'mtime'.  Returns
    (via dict.values()) one {'text', 'n_entries', 'mtime'} dict per
    distinct text, where 'n_entries' is the number of entries with that
    text and 'mtime' the largest mtime among them.
    """
    has_comment_and_mtime = {'comment': {'$exists': True},
                             'mtime': {'$exists': True}}
    wanted_fields = {'mtime': 1, 'comment': 1, '_id': 0}

    comments = {}
    for entry in storage.find('entries', has_comment_and_mtime,
                              wanted_fields):
        text = entry['comment']

        # First sighting of a text starts its record at this entry's mtime.
        summary = comments.setdefault(
            text, {'text': text, 'n_entries': 0, 'mtime': entry['mtime']})

        summary['n_entries'] += 1
        if summary['mtime'] < entry['mtime']:
            summary['mtime'] = entry['mtime']

    return comments.values()
Beispiel #19
0
def load_statistics():
    """Collect storage-wide statistics and return them through jsonify().

    Counts the databanks, all entries, the entries with 'mtime' and
    'filepath' (files), the entries with 'mtime' and 'comment' but no
    'filepath' (annotations) and the distinct comment texts among those
    annotations.  Also lists path/date for the files and
    comment/pdbid/databank_name/date for the annotations kept by the two
    top_highest(10) collectors (presumably the ten most recent by mtime
    -- TODO confirm).
    """
    _log.info("request for statistics")

    #TODO: speed up this method

    total_databanks = storage.count('databanks', {})

    total_entries = 0
    total_annotations = 0
    total_files = 0

    distinct_comments = Set()
    latest_files = top_highest(10)
    latest_annotations = top_highest(10)
    for entry in storage.find('entries', {}):

        total_entries += 1
        if 'mtime' not in entry:
            continue

        if 'filepath' in entry:
            total_files += 1
            latest_files.add(entry['mtime'], entry)
        elif 'comment' in entry:
            total_annotations += 1
            distinct_comments.add(entry['comment'])
            latest_annotations.add(entry['mtime'], entry)

    # Perform time-consuming operations only on the last 10 files and annotations
    files = []
    for file_entry in latest_files.get():
        path = file_entry['filepath']
        date = strftime(date_format, gmtime(file_entry['mtime']))
        files.append({'path': path, 'date': date})

    annotations = []
    for annotation in latest_annotations.get():
        record = {'comment': annotation['comment'],
                  'pdbid': annotation['pdbid'],
                  'databank_name': annotation['databank_name']}
        record['date'] = strftime(date_format, gmtime(annotation['mtime']))
        annotations.append(record)

    statistics = {
        'total_databanks': total_databanks,
        'total_entries': total_entries,
        'total_files': total_files,
        'total_annotations': total_annotations,
        'total_comments': len(distinct_comments),
        'annotations': annotations,
        'files': files,
    }

    return jsonify(statistics)
Beispiel #20
0
def crawl_lines(databank, filepath, lines):
    """Store one file entry per new pdbid found in `lines`.

    The pdbid is group 1 (lower-cased) of the match of
    parse_regex(databank['regex']) on a line; non-matching lines are
    skipped.  Every entry written points at `filepath` and carries the
    same mtime: the file's modification time when `filepath` is a file on
    disk, the current time otherwise.

    Pdbids found in entries_by_pdbid(get_present_entries(...)) are left
    alone.  Other pdbids are written with storage.update when a record for
    them was found before the loop, and with storage.insert otherwise.
    """
    name = databank['name']
    present = entries_by_pdbid(get_present_entries(name))
    recorded = entries_by_pdbid(
        storage.find('entries', {'databank_name': name}, {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    # If it's a disk file take its mtime, for urls take current time.
    on_disk = os.path.isfile(filepath)
    mtime = os.path.getmtime(filepath) if on_disk else time()

    for line in lines:

        # Only use lines that match the databank's pattern
        found = pattern.search(line)
        if not found:
            continue

        pdbid = found.group(1).lower()
        if pdbid in present:
            continue

        entry = {
            'databank_name': name,
            'pdbid': pdbid,
            'filepath': filepath,
            'mtime': mtime
        }

        if pdbid not in recorded:
            storage.insert('entries', entry)
        else:
            storage.update('entries', {
                'databank_name': name,
                'pdbid': pdbid
            }, entry)
Beispiel #21
0
def crawl_files(databank, pathnames):
    """Store one file entry per new pdbid found among `pathnames`.

    The pdbid is group 1 (lower-cased) of the match of
    parse_regex(databank['regex']) on a path; non-matching paths are
    skipped.  Pdbids found in entries_by_pdbid(get_present_entries(...)),
    i.e. entries that already have a 'filepath' and an 'mtime', are left
    alone.

    For the remaining paths an entry with 'databank_name', 'pdbid',
    'filepath' and 'mtime' is written: storage.update when a record for
    the pdbid was found before the loop, storage.insert otherwise.  The
    mtime is the file's modification time for files on disk and the
    current time for anything else.
    """
    name = databank['name']
    present_entries_bypdbid = entries_by_pdbid(get_present_entries(name))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': name}, {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    for f in pathnames:

        # Only use files that match the databank's pattern.
        m = pattern.search(f)
        if not m:
            continue

        pdbid = m.group(1).lower()

        # Decide whether the path is needed before asking the filesystem
        # about it; present entries are never rewritten.
        if pdbid in present_entries_bypdbid:
            continue

        # For disk files take their mtimes, for urls take current time.
        mtime = os.path.getmtime(f) if os.path.isfile(f) else time()

        entry = {
            'databank_name': name,
            'pdbid': pdbid,
            'filepath': f,
            'mtime': mtime
        }

        if pdbid in record_pdbids:
            storage.update('entries', {
                'databank_name': name,
                'pdbid': pdbid
            }, entry)
        else:
            storage.insert('entries', entry)
Beispiel #22
0
#!/usr/bin/python

import sys
import os
import commands

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from storage import storage
from sets import Set

from httplib import HTTPConnection

from utils import has_annotated_parent, has_present_parent, databanks_by_name, valid_path, get_entry

# Every databank document in storage, as returned by databanks_by_name.
databanks = databanks_by_name(storage.find('databanks', {}))

# For each databank name, the result of try_compile() on its stored 'regex'.
regexes = {}
for name in databanks:
    regexes[name] = databanks[name]['regex'].try_compile()

# Shell fragments for talking to the PostgreSQL database.
# NOTE(review): the database password is hard-coded in source here; it
# should come from the environment or a config file kept out of version
# control.
psql_defs = 'export PGPASSWORD=oon6oo4J'
psql_call = 'psql whynot2 -h cmbi11 -U whynotuser'

# Compare entries

# Containers for the data read from each backend (mongo vs. psql); they
# start out empty here.
entries_mongo = {}
files_mongo = {}
comments_mongo = {}
entries_psql = {}
comments_psql = {}
files_psql = {}
Beispiel #23
0
def get_entries_with_pdbid(databank_name, pdbid):
    """Look up the entries of one databank by pdbid.

    Passes the selector {'databank_name': databank_name, 'pdbid': pdbid}
    to storage.find, asks for ascending 'pdbid' order and returns the
    result unchanged.
    """
    # NOTE(review): ordering was found to make this take longer, but the
    # sort is still requested -- confirm whether it is needed.
    ascending_pdbid = [("pdbid", pymongo.ASCENDING)]
    query = {'databank_name': databank_name, 'pdbid': pdbid}

    return storage.find('entries', query, order=ascending_pdbid)
Beispiel #24
0
from urllib2 import urlopen
from storage import storage
import pymongo
from sets import Set
from httplib import HTTPConnection

def databanks_by_name(databanks):
    """Index an iterable of databank records by their 'name'.

    Returns a dict mapping each record's 'name' value to the record
    itself.  When several records share a name, the last one wins.
    """
    return dict((databank['name'], databank) for databank in databanks)

# Every databank document in storage, indexed by its 'name'.
databanks = databanks_by_name (storage.find ('databanks', {}))

# For each databank name, the result of try_compile() on its stored 'regex';
# valid_path calls .search() on these.
databank_regexes = {}
for name in databanks:
    databank_regexes [name] = databanks [name]['regex'].try_compile ()


# Verifies that the path contains the regex that the databank prescribes.
# For a file path, checks that the file exists
# For a URL, checks that the URL points to a valid location. (hssp response 200)
def valid_path (databank_name, path):

    if not databank_regexes [databank_name].search (path):
        return False

    if path.startswith ('http://') or path.startswith ('ftp://'):
Beispiel #25
0
#!/usr/bin/python

import sys
import os
import commands

sys.path.append (os.path.join(os.path.dirname (__file__), '..'))
from storage import storage
from sets import Set

from httplib import HTTPConnection

from utils import has_annotated_parent, has_present_parent, databanks_by_name, valid_path, get_entry

# Every databank document in storage, as returned by databanks_by_name.
databanks = databanks_by_name (storage.find ('databanks', {}))

# For each databank name, the result of try_compile() on its stored 'regex'.
regexes = {}
for name in databanks:
    regexes [name] = databanks [name]['regex'].try_compile ()

# Shell fragments for talking to the PostgreSQL database.
# NOTE(review): the database password is hard-coded in source here; it
# should come from the environment or a config file kept out of version
# control.
psql_defs = 'export PGPASSWORD=oon6oo4J'
psql_call = 'psql whynot2 -h cmbi11 -U whynotuser'

# Compare entries

# Containers for the data read from each backend (mongo vs. psql); they
# start out empty here.
entries_mongo = {}
files_mongo = {}
comments_mongo = {}
entries_psql = {}
comments_psql = {}
files_psql = {}
Beispiel #26
0
def count_summary(databank_name):
    """Return per-status entry counts for one databank.

    The result dict holds:
      'present'     - number of distinct pdbids among this databank's
                      entries that have a 'filepath';
      'missing'     - number of pdbids the parent databank has with a
                      'filepath' that are not present here;
      'annotated'   - number of this databank's entries with a 'comment'
                      whose pdbid is missing;
      'unannotated' - 'missing' minus 'annotated';
      'obsolete'    - number of present pdbids the parent does not have
                      with a 'filepath';
      'valid'       - 'present' minus 'obsolete'.
    A databank without 'parent_name' reports 0 for everything except
    'present' and 'valid'.

    Raises Exception when no databank called `databank_name` is stored.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    projection = {'pdbid': 1, '_id': 0}

    own_files = storage.find('entries', {
        'databank_name': databank_name,
        'filepath': {'$exists': True}
    }, projection)
    pdbids = Set(entry['pdbid'] for entry in own_files)

    count = {'present': len(pdbids)}

    if 'parent_name' not in databank:
        # no parent, so nothing is missing or obsolete
        count.update({'missing': 0,
                      'valid': count['present'],
                      'obsolete': 0,
                      'annotated': 0,
                      'unannotated': 0})
        return count

    parent_entries = storage.find('entries', {
        'databank_name': databank['parent_name'],
        'filepath': {'$exists': True}
    }, projection)
    comment_entries = storage.find('entries', {
        'databank_name': databank_name,
        'comment': {'$exists': True}
    }, projection)

    parent_pdbids = Set(entry['pdbid'] for entry in parent_entries)
    missing_pdbids = parent_pdbids - pdbids

    count['missing'] = len(missing_pdbids)
    count['annotated'] = len([entry for entry in comment_entries
                              if entry['pdbid'] in missing_pdbids])

    # missing = annotated + unannotated
    count['unannotated'] = count['missing'] - count['annotated']

    count['obsolete'] = len(pdbids - parent_pdbids)
    count['valid'] = count['present'] - count['obsolete']

    return count
Beispiel #27
0
from storage import storage
import pymongo
from sets import Set
from httplib import HTTPConnection


def databanks_by_name(databanks):
    """Build a name -> record lookup from an iterable of databank records.

    Each record is stored under its 'name' value; a later record with the
    same name replaces an earlier one.
    """
    named_records = ((record['name'], record) for record in databanks)
    return dict(named_records)


# Every databank document in storage, indexed by its 'name'.
databanks = databanks_by_name(storage.find('databanks', {}))

# For each databank name, the result of try_compile() on its stored 'regex';
# valid_path calls .search() on these.
databank_regexes = {}
for name in databanks:
    databank_regexes[name] = databanks[name]['regex'].try_compile()


# Verifies that the path contains the regex that the databank prescribes.
# For a file path, checks that the file exists
# For a URL, checks that the URL points to a valid location. (hssp response 200)
def valid_path(databank_name, path):

    if not databank_regexes[databank_name].search(path):
        return False

    if path.startswith('http://') or path.startswith('ftp://'):
Beispiel #28
0
def get_all_entries_with_comment(comment):
    """Find the entries of any databank that match `comment`.

    Returns whatever storage.find yields for the selector
    {'comment': comment}, requested in ascending 'pdbid' order.
    """
    # NOTE(review): ordering was found to make this take longer, yet the
    # sort is still requested below -- confirm whether callers rely on it.
    selector = {'comment': comment}
    by_pdbid = [("pdbid", pymongo.ASCENDING)]

    return storage.find('entries', selector, order=by_pdbid)
Beispiel #29
0
def get_entries_with_comment(databank_name, comment):
    """Look up the entries of one databank by comment.

    Passes the selector {'databank_name': databank_name,
    'comment': comment} to storage.find, asks for ascending 'pdbid' order
    and returns the result unchanged.
    """
    # NOTE(review): ordering was found to make this take longer, but the
    # sort is still requested -- confirm whether it is needed.
    ascending_pdbid = [("pdbid", pymongo.ASCENDING)]
    query = {'databank_name': databank_name, 'comment': comment}

    return storage.find('entries', query, order=ascending_pdbid)
Beispiel #30
0
def get_annotated_entries(databank_name):
    """Look up the annotation-only entries of one databank.

    An entry qualifies when it has a 'comment' and no 'filepath'.  The
    storage.find result, requested in ascending 'pdbid' order, is returned
    unchanged.
    """
    ascending_pdbid = [("pdbid", pymongo.ASCENDING)]
    query = {'databank_name': databank_name,
             'comment': {'$exists': True},
             'filepath': {'$exists': False}}

    return storage.find('entries', query, order=ascending_pdbid)
Beispiel #31
0
def get_all_entries_with_comment(comment):
    """Look up entries by comment across all databanks.

    Passes the selector {'comment': comment} to storage.find, asks for
    ascending 'pdbid' order and returns the result unchanged.
    """
    # NOTE(review): ordering was found to make this take longer, but the
    # sort is still requested -- confirm whether it is needed.
    ascending_pdbid = [("pdbid", pymongo.ASCENDING)]
    query = {'comment': comment}

    return storage.find('entries', query, order=ascending_pdbid)
Beispiel #32
0
def get_present_entries(databank_name, ordered=False):
    """Look up the entries of one databank that have a file on record.

    An entry qualifies when it has both a 'filepath' and an 'mtime'.  The
    storage.find result, requested in ascending 'pdbid' order, is returned
    unchanged.
    """
    # NOTE(review): `ordered` is never read -- the sort is always
    # requested, even though ordering was found to make this take longer.
    ascending_pdbid = [("pdbid", pymongo.ASCENDING)]
    query = {'databank_name': databank_name,
             'filepath': {'$exists': True},
             'mtime': {'$exists': True}}

    return storage.find('entries', query, order=ascending_pdbid)