def get_unannotated_entries(databank_name):
    """Return stub entries for parent files this databank knows nothing about.

    A parent entry counts as unannotated when the parent databank has a
    'filepath' for its pdbid while this databank has neither a 'filepath'
    nor a 'comment' recorded for it.

    Returns a list of {'pdbid': ..., 'databank_name': ...} dicts; the list
    is empty when the databank has no 'parent_name'.
    Raises Exception when no databank with the given name exists.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    # Needs a parent to determine what's missing
    if 'parent_name' not in databank:
        return []

    # Everything of this databank that already has a file or a comment.
    covered_query = {
        'databank_name': databank_name,
        '$or': [{'filepath': {'$exists': True}},
                {'comment': {'$exists': True}}],
    }
    covered = entries_by_pdbid(storage.find('entries', covered_query))

    # Only parent entries that really have a file are candidates.
    parent_query = {
        'databank_name': databank['parent_name'],
        'filepath': {'$exists': True},
    }
    candidates = storage.find('entries', parent_query, {'pdbid': 1})

    return [{'pdbid': candidate['pdbid'], 'databank_name': databank_name}
            for candidate in candidates
            if candidate['pdbid'] not in covered]
def search_results_for(pdbid):
    """Summarize, per databank, what is stored for a single pdbid.

    Returns a dict keyed by databank name. The value is:
      * the result of get_file_link(databank, pdbid) when the databank's
        entry has a 'filepath',
      * the entry's comment text when it only has a 'comment',
      * 'Not available' when the databank has no entry for this pdbid,
        with ', depends on <parent>' appended when the databank has a
        parent whose entry is absent or is itself only a comment.

    A databank whose entry has neither 'filepath' nor 'comment' gets no
    key in the result.
    """
    entries = entries_by_databank(storage.find('entries', {'pdbid': pdbid}))
    databanks = databanks_by_name(storage.find('databanks', {}))

    results = {}
    for databank_name, databank in databanks.items():
        if databank_name in entries:
            entry = entries[databank_name]
            if 'filepath' in entry:
                results[databank_name] = get_file_link(databank, pdbid)
            elif 'comment' in entry:
                results[databank_name] = entry['comment']
            continue

        results[databank_name] = 'Not available'

        # Explain the absence when the parent cannot supply a file either.
        if 'parent_name' in databank:
            parent_name = databank['parent_name']
            if parent_name not in entries or 'comment' in entries[parent_name]:
                results[databank_name] += ', depends on %s' % parent_name

    return results
def get_databank_hierarchy(name=None):
    """Build the databank tree below `name` as nested dicts.

    With name=None the databanks that have no 'parent_name' are used as
    roots; otherwise the databanks whose 'parent_name' equals `name`.
    Each key is a databank name and each value is the tree of its
    children, built recursively ({} for a databank without children).
    """
    if name is None:
        selector = {'parent_name': {'$exists': False}}
    else:
        selector = {'parent_name': name}

    tree = {}
    for child in storage.find('databanks', selector, {'name': 1, '_id': 0}):
        child_name = child['name']
        tree[child_name] = get_databank_hierarchy(child_name)
    return tree
def search_results_for(pdbid):
    """Report, for every databank, what is available for the given pdbid.

    The returned dict maps databank name to one of:
      * get_file_link(databank, pdbid) if the entry carries a 'filepath',
      * the entry's 'comment' text if it carries only a comment,
      * 'Not available' if the databank has no entry for the pdbid; when
        the databank has a parent and that parent has no entry or only a
        comment, ', depends on <parent>' is appended.

    Databanks whose entry has neither a 'filepath' nor a 'comment' are
    left out of the result.
    """
    entries = entries_by_databank(storage.find('entries', {'pdbid': pdbid}))
    databanks = databanks_by_name(storage.find('databanks', {}))

    results = {}
    for databank_name, databank in databanks.items():
        if databank_name not in entries:
            message = 'Not available'
            if 'parent_name' in databank:
                parent_name = databank['parent_name']
                # A missing or comment-only parent is why nothing is here.
                if (parent_name not in entries
                        or 'comment' in entries[parent_name]):
                    message += ', depends on %s' % parent_name
            results[databank_name] = message
            continue

        entry = entries[databank_name]
        if 'filepath' in entry:
            results[databank_name] = get_file_link(databank, pdbid)
        elif 'comment' in entry:
            results[databank_name] = entry['comment']

    return results
def count_summary(databank_name):
    """Count present, missing, annotated, obsolete and valid entries.

    Returns a dict with the keys 'present', 'missing', 'annotated',
    'unannotated', 'obsolete' and 'valid'. Without a parent databank
    nothing can be missing or obsolete, so those counts are 0 and
    'valid' equals 'present'.
    Raises Exception when no databank with the given name exists.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    projection = {'pdbid': 1, '_id': 0}

    pdbids = Set()
    own_files = storage.find(
        'entries',
        {'databank_name': databank_name, 'filepath': {'$exists': True}},
        projection)
    for entry in own_files:
        pdbids.add(entry['pdbid'])
    n_present = len(pdbids)

    if 'parent_name' not in databank:
        # no parent, so nothing is missing or obsolete
        return {'present': n_present, 'missing': 0, 'valid': n_present,
                'obsolete': 0, 'annotated': 0, 'unannotated': 0}

    parent_entries = storage.find(
        'entries',
        {'databank_name': databank['parent_name'],
         'filepath': {'$exists': True}},
        projection)
    comment_entries = storage.find(
        'entries',
        {'databank_name': databank_name, 'comment': {'$exists': True}},
        projection)

    parent_pdbids = Set()
    for entry in parent_entries:
        parent_pdbids.add(entry['pdbid'])

    # In the parent but not here: missing. Here but not in the parent: obsolete.
    missing_pdbids = parent_pdbids - pdbids
    n_missing = len(missing_pdbids)
    n_obsolete = len(pdbids - parent_pdbids)

    # One count per comment entry whose pdbid is missing.
    n_annotated = sum(1 for entry in comment_entries
                      if entry['pdbid'] in missing_pdbids)

    return {
        'present': n_present,
        'missing': n_missing,
        'annotated': n_annotated,
        # missing = annotated + unannotated
        'unannotated': n_missing - n_annotated,
        'obsolete': n_obsolete,
        'valid': n_present - n_obsolete,
    }
def comment_summary():
    """Aggregate all commented entries by comment text.

    Only entries that have both a 'comment' and an 'mtime' are considered.
    Returns the values of a dict keyed by comment text; each value is
    {'text': ..., 'n_entries': <count>, 'mtime': <latest mtime seen>}.
    """
    selector = {'comment': {'$exists': True}, 'mtime': {'$exists': True}}
    fields = {'mtime': 1, 'comment': 1, '_id': 0}

    summary = {}
    for entry in storage.find('entries', selector, fields):
        text = entry['comment']
        mtime = entry['mtime']

        if text not in summary:
            summary[text] = {'text': text, 'n_entries': 0, 'mtime': mtime}
        record = summary[text]

        record['n_entries'] += 1
        # Keep the most recent modification time per comment.
        if record['mtime'] < mtime:
            record['mtime'] = mtime

    return summary.values()
def get_databank_hierarchy(name=None):
    """Return the databank hierarchy rooted at `name` as nested dicts.

    name=None selects databanks without a 'parent_name' (the roots);
    any other value selects the databanks that name it as their parent.
    The result maps each selected databank's name to the hierarchy of
    its own children, obtained by recursion.
    """
    name_only = {'name': 1, '_id': 0}
    if name is None:
        found = storage.find('databanks',
                             {'parent_name': {'$exists': False}}, name_only)
    else:
        found = storage.find('databanks', {'parent_name': name}, name_only)

    hierarchy = {}
    for record in found:
        hierarchy[record['name']] = get_databank_hierarchy(record['name'])
    return hierarchy
def get_missing_entries(databank_name):
    """List the entries that the parent has on file but this databank lacks.

    For every present entry of the parent databank:
      * if this databank has a record for the pdbid that lacks 'filepath'
        or 'mtime', that record is reported;
      * if this databank has no record at all, a stub
        {'pdbid': ..., 'databank_name': ...} is reported.

    Returns [] when the databank has no 'parent_name'.
    Raises Exception when no databank with the given name exists.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    # Needs a parent to determine what's missing
    if 'parent_name' not in databank:
        return []

    own_records = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name}))

    missing = []
    for parent_entry in get_present_entries(databank['parent_name']):
        pdbid = parent_entry['pdbid']

        if pdbid not in own_records:
            missing.append({'pdbid': pdbid, 'databank_name': databank_name})
            continue

        record = own_records[pdbid]
        if not ('filepath' in record and 'mtime' in record):
            missing.append(record)

    return missing
def crawl_files(databank, pathnames):
    """Register every path that matches the databank's regex as an entry.

    databank:  databank record; its 'name' and 'regex' fields are read.
    pathnames: iterable of file paths or urls.

    Group 1 of the regex match, lower-cased, is used as the pdbid. Pdbids
    that already have a present entry are skipped. A pdbid that already
    has a record of any kind is updated, otherwise a new entry is
    inserted.
    """
    databank_name = databank['name']

    present_entries_bypdbid = entries_by_pdbid(
        get_present_entries(databank_name))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name},
                     {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    # record_pdbids is a snapshot taken before the crawl, so remember what
    # is inserted during this run: a second path yielding the same pdbid
    # must update that record rather than insert a duplicate.
    inserted_pdbids = set()

    for f in pathnames:
        # Only use files that match the databank's pattern.
        m = pattern.search(f)
        if not m:
            continue

        pdbid = m.group(1).lower()
        # Checked before touching the filesystem: present entries are kept.
        if pdbid in present_entries_bypdbid:
            continue

        # For disk files take their mtimes, for urls take current time.
        mtime = time()
        if os.path.isfile(f):
            mtime = os.path.getmtime(f)

        entry = {
            'databank_name': databank_name,
            'pdbid': pdbid,
            'filepath': f,
            'mtime': mtime,
        }

        if pdbid in record_pdbids or pdbid in inserted_pdbids:
            storage.update('entries',
                           {'databank_name': databank_name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
            inserted_pdbids.add(pdbid)
def crawl_lines(databank, filepath, lines):
    """Register an entry for every line that matches the databank's regex.

    databank: databank record; its 'name' and 'regex' fields are read.
    filepath: file path or url the lines were read from; stored as the
              'filepath' of every entry.
    lines:    iterable of text lines.

    Group 1 of the regex match, lower-cased, is used as the pdbid. Pdbids
    that already have a present entry are skipped. A pdbid that already
    has a record of any kind is updated, otherwise a new entry is
    inserted.
    """
    databank_name = databank['name']

    present_entries_bypdbid = entries_by_pdbid(
        get_present_entries(databank_name))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name},
                     {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    # If it's a disk file take its mtime, for urls take current time.
    mtime = time()
    if os.path.isfile(filepath):
        mtime = os.path.getmtime(filepath)

    # record_pdbids is a snapshot taken before the crawl, so remember what
    # is inserted during this run: a second line yielding the same pdbid
    # must update that record rather than insert a duplicate.
    inserted_pdbids = set()

    for line in lines:
        # Only use lines that match the databank's pattern
        m = pattern.search(line)
        if not m:
            continue

        pdbid = m.group(1).lower()
        if pdbid in present_entries_bypdbid:
            continue

        entry = {
            'databank_name': databank_name,
            'pdbid': pdbid,
            'filepath': filepath,
            'mtime': mtime,
        }

        if pdbid in record_pdbids or pdbid in inserted_pdbids:
            storage.update('entries',
                           {'databank_name': databank_name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
            inserted_pdbids.add(pdbid)
def get_entries_with_comment(databank_name, comment):
    """Find the entries of one databank that carry exactly this comment.

    The result of storage.find is returned as-is, requested in ascending
    'pdbid' order.
    """
    # ordering was found to make it take longer!
    # NOTE(review): the sort is requested regardless of the remark above.
    selector = {'databank_name': databank_name, 'comment': comment}
    by_pdbid = [("pdbid", pymongo.ASCENDING)]
    return storage.find('entries', selector, order=by_pdbid)
def get_entries_with_pdbid(databank_name, pdbid):
    """Find the entries of one databank that belong to the given pdbid.

    The result of storage.find is returned as-is, requested in ascending
    'pdbid' order.
    """
    # ordering was found to make it take longer!
    # NOTE(review): the sort is requested regardless of the remark above.
    selector = {'databank_name': databank_name, 'pdbid': pdbid}
    by_pdbid = [("pdbid", pymongo.ASCENDING)]
    return storage.find('entries', selector, order=by_pdbid)
def databanks(name=None):
    """Render the databank page for all databanks or for a single one.

    name: databank name; None selects every databank.

    Returns the rendered 'databank/DatabankPage.html' template, which
    receives the module-level db_tree and the selected databanks.
    """
    if name is None:
        selected = storage.find('databanks', {})
    else:
        # NOTE(review): an unknown name presumably makes find_one return
        # None, so the template would receive [None] - confirm.
        selected = [storage.find_one('databanks', {'name': name})]

    return render_template('databank/DatabankPage.html',
                           db_tree=db_tree,
                           nav_disabled='databanks',
                           databanks=selected)
def load_statistics():
    """Collect overall counts plus the most recent files and annotations.

    Every entry is visited once. Entries with an 'mtime' are counted as a
    file when they have a 'filepath', otherwise as an annotation when they
    have a 'comment'. Two top_highest(10) collectors, keyed on mtime, gather
    the entries listed under 'files' and 'annotations'.

    Returns the statistics passed through jsonify.
    """
    _log.info("request for statistics")

    #TODO: speed up this method
    total_databanks = storage.count('databanks', {})

    total_entries = 0
    total_files = 0
    total_annotations = 0
    unique_comments = Set()
    recent_files = top_highest(10)
    recent_annotations = top_highest(10)

    for entry in storage.find('entries', {}):
        total_entries += 1
        if 'mtime' not in entry:
            continue

        if 'filepath' in entry:
            total_files += 1
            recent_files.add(entry['mtime'], entry)
        elif 'comment' in entry:
            total_annotations += 1
            unique_comments.add(entry['comment'])
            recent_annotations.add(entry['mtime'], entry)

    # Perform time-consuming operations only on the last 10 files and annotations
    files = [{'path': f['filepath'],
              'date': strftime(date_format, gmtime(f['mtime']))}
             for f in recent_files.get()]

    annotations = [{'comment': a['comment'],
                    'pdbid': a['pdbid'],
                    'databank_name': a['databank_name'],
                    'date': strftime(date_format, gmtime(a['mtime']))}
                   for a in recent_annotations.get()]

    statistics = {
        'total_databanks': total_databanks,
        'total_entries': total_entries,
        'total_files': total_files,
        'total_annotations': total_annotations,
        'total_comments': len(unique_comments),
        'annotations': annotations,
        'files': files,
    }
    return jsonify(statistics)
def databanks(name=None):
    """Render the databank overview page.

    name: name of the databank to show; when None, all databanks are shown.

    Returns the rendered 'databank/DatabankPage.html' template. The
    template is given the module-level db_tree, the 'databanks' navigation
    marker and the databanks to display.
    """
    if name is not None:
        # NOTE(review): find_one presumably returns None for an unknown
        # name, which would hand [None] to the template - confirm.
        to_show = [storage.find_one('databanks', {'name': name})]
    else:
        to_show = storage.find('databanks', {})

    return render_template('databank/DatabankPage.html',
                           db_tree=db_tree,
                           nav_disabled='databanks',
                           databanks=to_show)
def get_annotated_entries(databank_name):
    """Find the entries of a databank that have a comment but no file.

    The result of storage.find is returned as-is, requested in ascending
    'pdbid' order.
    """
    selector = {
        'databank_name': databank_name,
        'comment': {'$exists': True},
        'filepath': {'$exists': False},
    }
    by_pdbid = [("pdbid", pymongo.ASCENDING)]
    return storage.find('entries', selector, order=by_pdbid)
def get_present_entries(databank_name, ordered=False):
    """Find the entries of a databank that have both a file and an mtime.

    The result of storage.find is returned as-is, requested in ascending
    'pdbid' order.

    NOTE(review): `ordered` is accepted but never read; the sort is
    requested on every call.
    """
    # ordering was found to make it take longer!
    selector = {
        'databank_name': databank_name,
        'filepath': {'$exists': True},
        'mtime': {'$exists': True},
    }
    by_pdbid = [("pdbid", pymongo.ASCENDING)]
    return storage.find('entries', selector, order=by_pdbid)
def comment_summary():
    """Group commented entries by their comment text.

    Looks at entries that have both 'comment' and 'mtime'. For each
    distinct comment text the result holds one dict with 'text', the
    number of entries using it ('n_entries') and the highest 'mtime'
    among them. Returns the values of the per-text dict.
    """
    with_comment_and_mtime = {
        'comment': {'$exists': True},
        'mtime': {'$exists': True},
    }
    wanted_fields = {'mtime': 1, 'comment': 1, '_id': 0}

    by_text = {}
    for entry in storage.find('entries', with_comment_and_mtime,
                              wanted_fields):
        text, mtime = entry['comment'], entry['mtime']

        if text in by_text:
            info = by_text[text]
        else:
            info = {'text': text, 'n_entries': 0, 'mtime': mtime}
            by_text[text] = info

        info['n_entries'] += 1
        if info['mtime'] < mtime:
            info['mtime'] = mtime

    return by_text.values()
def load_statistics():
    """Build the site-wide statistics and return them via jsonify.

    Walks over all entries once, counting entries, files (entries with
    'mtime' and 'filepath') and annotations (entries with 'mtime' and
    'comment' but no 'filepath'), and collecting the distinct comment
    texts. The entries reported under 'files' and 'annotations' come from
    two top_highest(10) collectors that are fed with each entry's mtime.
    """
    _log.info("request for statistics")

    #TODO: speed up this method
    ndb = storage.count('databanks', {})

    counters = {'entries': 0, 'files': 0, 'annotations': 0}
    unique_comments = Set()
    recent_files = top_highest(10)
    recent_annotations = top_highest(10)

    for entry in storage.find('entries', {}):
        counters['entries'] += 1
        if 'mtime' in entry:
            if 'filepath' in entry:
                counters['files'] += 1
                recent_files.add(entry['mtime'], entry)
            elif 'comment' in entry:
                counters['annotations'] += 1
                unique_comments.add(entry['comment'])
                recent_annotations.add(entry['mtime'], entry)

    # Perform time-consuming operations only on the last 10 files and annotations
    files = []
    for recent in recent_files.get():
        stamp = strftime(date_format, gmtime(recent['mtime']))
        files.append({'path': recent['filepath'], 'date': stamp})

    annotations = []
    for recent in recent_annotations.get():
        stamp = strftime(date_format, gmtime(recent['mtime']))
        annotations.append({'comment': recent['comment'],
                            'pdbid': recent['pdbid'],
                            'databank_name': recent['databank_name'],
                            'date': stamp})

    return jsonify({
        'total_databanks': ndb,
        'total_entries': counters['entries'],
        'total_files': counters['files'],
        'total_annotations': counters['annotations'],
        'total_comments': len(unique_comments),
        'annotations': annotations,
        'files': files,
    })
def crawl_lines(databank, filepath, lines):
    """Store an entry for each line matching the databank's regex.

    databank: databank record; 'name' and 'regex' are read from it.
    filepath: path or url the lines came from; recorded as 'filepath' on
              every entry.
    lines:    iterable of text lines to scan.

    The pdbid is group 1 of the match, lower-cased. A pdbid with a present
    entry is left untouched; a pdbid with any other record is updated; an
    unknown pdbid is inserted.
    """
    name = databank['name']

    present_entries_bypdbid = entries_by_pdbid(get_present_entries(name))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': name}, {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    # If it's a disk file take its mtime, for urls take current time.
    if os.path.isfile(filepath):
        mtime = os.path.getmtime(filepath)
    else:
        mtime = time()

    # The lookups above are snapshots taken before the crawl. Track the
    # pdbids inserted here so that a repeated pdbid within `lines` updates
    # the fresh record instead of being inserted a second time.
    newly_inserted = set()

    for line in lines:
        # Only use lines that match the databank's pattern
        match = pattern.search(line)
        if not match:
            continue

        pdbid = match.group(1).lower()
        if pdbid in present_entries_bypdbid:
            continue

        entry = {
            'databank_name': name,
            'pdbid': pdbid,
            'filepath': filepath,
            'mtime': mtime,
        }

        if pdbid in record_pdbids or pdbid in newly_inserted:
            storage.update('entries',
                           {'databank_name': name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
            newly_inserted.add(pdbid)
def crawl_files(databank, pathnames):
    """Store an entry for each path matching the databank's regex.

    databank:  databank record; 'name' and 'regex' are read from it.
    pathnames: iterable of file paths or urls to scan.

    The pdbid is group 1 of the match, lower-cased. A pdbid with a present
    entry is left untouched; a pdbid with any other record is updated; an
    unknown pdbid is inserted.
    """
    name = databank['name']

    present_entries_bypdbid = entries_by_pdbid(get_present_entries(name))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': name}, {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    # The lookups above are snapshots taken before the crawl. Track the
    # pdbids inserted here so that a repeated pdbid within `pathnames`
    # updates the fresh record instead of being inserted a second time.
    newly_inserted = set()

    for path in pathnames:
        # Only use files that match the databank's pattern.
        match = pattern.search(path)
        if not match:
            continue

        pdbid = match.group(1).lower()
        # Decide this before stat-ing the file: present entries are kept.
        if pdbid in present_entries_bypdbid:
            continue

        # For disk files take their mtimes, for urls take current time.
        if os.path.isfile(path):
            mtime = os.path.getmtime(path)
        else:
            mtime = time()

        entry = {
            'databank_name': name,
            'pdbid': pdbid,
            'filepath': path,
            'mtime': mtime,
        }

        if pdbid in record_pdbids or pdbid in newly_inserted:
            storage.update('entries',
                           {'databank_name': name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
            newly_inserted.add(pdbid)
#!/usr/bin/python import sys import os import commands sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from storage import storage from sets import Set from httplib import HTTPConnection from utils import has_annotated_parent, has_present_parent, databanks_by_name, valid_path, get_entry databanks = databanks_by_name(storage.find('databanks', {})) regexes = {} for name in databanks: regexes[name] = databanks[name]['regex'].try_compile() psql_defs = 'export PGPASSWORD=oon6oo4J' psql_call = 'psql whynot2 -h cmbi11 -U whynotuser' # Compare entries entries_mongo = {} files_mongo = {} comments_mongo = {} entries_psql = {} comments_psql = {} files_psql = {}
def get_entries_with_pdbid(databank_name, pdbid):
    """Look up the entries stored for `pdbid` within one databank.

    Hands back whatever storage.find returns, with an ascending sort on
    'pdbid' requested.
    """
    # ordering was found to make it take longer!
    # NOTE(review): despite the remark above, a sort is still requested.
    return storage.find(
        'entries',
        {'databank_name': databank_name, 'pdbid': pdbid},
        order=[("pdbid", pymongo.ASCENDING)],
    )
from urllib2 import urlopen from storage import storage import pymongo from sets import Set from httplib import HTTPConnection def databanks_by_name (databanks): d = {} for databank in databanks: d [databank ['name']] = databank return d databanks = databanks_by_name (storage.find ('databanks', {})) databank_regexes = {} for name in databanks: databank_regexes [name] = databanks [name]['regex'].try_compile () # Verifies that the path contains the regex that the databank prescribes. # For a file path, checks that the file exists # For an url, checks that the url points to a valid location. (hssp response 200) def valid_path (databank_name, path): if not databank_regexes [databank_name].search (path): return False if path.startswith ('http://') or path.startswith ('ftp://'):
#!/usr/bin/python import sys import os import commands sys.path.append (os.path.join(os.path.dirname (__file__), '..')) from storage import storage from sets import Set from httplib import HTTPConnection from utils import has_annotated_parent, has_present_parent, databanks_by_name, valid_path, get_entry databanks = databanks_by_name (storage.find ('databanks', {})) regexes = {} for name in databanks: regexes [name] = databanks [name]['regex'].try_compile () psql_defs = 'export PGPASSWORD=oon6oo4J' psql_call = 'psql whynot2 -h cmbi11 -U whynotuser' # Compare entries entries_mongo = {} files_mongo = {} comments_mongo = {} entries_psql = {} comments_psql = {} files_psql = {}
def count_summary(databank_name):
    """Summarize how a databank's entries compare to its parent's.

    The returned dict contains:
      'present'     distinct pdbids of this databank that have a file
      'missing'     parent pdbids with a file that are not present here
      'annotated'   comment entries of this databank on a missing pdbid
      'unannotated' missing minus annotated
      'obsolete'    present pdbids the parent has no file for
      'valid'       present minus obsolete

    A databank without 'parent_name' reports 0 for everything except
    'present' and 'valid'.
    Raises Exception when no databank with the given name exists.
    """
    databank = storage.find_one('databanks', {'name': databank_name})
    if not databank:
        raise Exception("no such databank: " + databank_name)

    projection = {'pdbid': 1, '_id': 0}
    has_file = {'$exists': True}

    pdbids = Set()
    for entry in storage.find(
            'entries',
            {'databank_name': databank_name, 'filepath': has_file},
            projection):
        pdbids.add(entry['pdbid'])

    count = {'present': len(pdbids)}

    if 'parent_name' not in databank:
        # no parent, so nothing is missing or obsolete
        count.update({'missing': 0, 'valid': count['present'],
                      'obsolete': 0, 'annotated': 0, 'unannotated': 0})
        return count

    parent_name = databank['parent_name']
    parent_entries = storage.find(
        'entries',
        {'databank_name': parent_name, 'filepath': has_file},
        projection)
    comment_entries = storage.find(
        'entries',
        {'databank_name': databank_name, 'comment': {'$exists': True}},
        projection)

    parent_pdbids = Set()
    missing_pdbids = Set()
    for entry in parent_entries:
        pdbid = entry['pdbid']
        parent_pdbids.add(pdbid)
        if pdbid not in pdbids:
            missing_pdbids.add(pdbid)
    count['missing'] = len(missing_pdbids)

    count['annotated'] = len(
        [entry for entry in comment_entries
         if entry['pdbid'] in missing_pdbids])

    # missing = annotated + unannotated
    count['unannotated'] = count['missing'] - count['annotated']

    count['obsolete'] = len(
        [pdbid for pdbid in pdbids if pdbid not in parent_pdbids])
    count['valid'] = count['present'] - count['obsolete']

    return count
from storage import storage import pymongo from sets import Set from httplib import HTTPConnection def databanks_by_name(databanks): d = {} for databank in databanks: d[databank['name']] = databank return d databanks = databanks_by_name(storage.find('databanks', {})) databank_regexes = {} for name in databanks: databank_regexes[name] = databanks[name]['regex'].try_compile() # Verifies that the path contains the regex that the databank prescribes. # For a file path, checks that the file exists # For an url, checks that the url points to a valid location. (hssp response 200) def valid_path(databank_name, path): if not databank_regexes[databank_name].search(path): return False if path.startswith('http://') or path.startswith('ftp://'):
def get_all_entries_with_comment(comment):
    """Look up the entries, across all databanks, that carry this comment.

    Hands back whatever storage.find returns, with an ascending sort on
    'pdbid' requested.
    """
    # ordering was found to make it take longer!
    # NOTE(review): despite the remark above, a sort is still requested.
    return storage.find(
        'entries',
        {'comment': comment},
        order=[("pdbid", pymongo.ASCENDING)],
    )
def get_entries_with_comment(databank_name, comment):
    """Look up the entries of one databank that carry this comment.

    Hands back whatever storage.find returns, with an ascending sort on
    'pdbid' requested.
    """
    # ordering was found to make it take longer!
    # NOTE(review): despite the remark above, a sort is still requested.
    return storage.find(
        'entries',
        {'databank_name': databank_name, 'comment': comment},
        order=[("pdbid", pymongo.ASCENDING)],
    )
def get_annotated_entries(databank_name):
    """Look up a databank's entries that are commented but have no file.

    Hands back whatever storage.find returns, with an ascending sort on
    'pdbid' requested.
    """
    return storage.find(
        'entries',
        {
            'databank_name': databank_name,
            'comment': {'$exists': True},
            'filepath': {'$exists': False},
        },
        order=[("pdbid", pymongo.ASCENDING)],
    )
def get_all_entries_with_comment(comment):
    """Find every entry, in any databank, whose comment equals `comment`.

    The result of storage.find is returned as-is, requested in ascending
    'pdbid' order.
    """
    # ordering was found to make it take longer!
    # NOTE(review): the sort is requested regardless of the remark above.
    selector = {'comment': comment}
    by_pdbid = [("pdbid", pymongo.ASCENDING)]
    return storage.find('entries', selector, order=by_pdbid)
def get_present_entries(databank_name, ordered=False):
    """Look up a databank's entries that have both 'filepath' and 'mtime'.

    Hands back whatever storage.find returns, with an ascending sort on
    'pdbid' requested.

    NOTE(review): the `ordered` argument is never read; sorting is
    requested whatever its value.
    """
    # ordering was found to make it take longer!
    return storage.find(
        'entries',
        {
            'databank_name': databank_name,
            'filepath': {'$exists': True},
            'mtime': {'$exists': True},
        },
        order=[("pdbid", pymongo.ASCENDING)],
    )