def remove_changed(databank, lines=[]):
    """Remove entries whose source file or line has changed or disappeared."""
    pattern = parse_regex(databank['regex'])

    line_matches = {}
    if databank['crawltype'] == LINE:
        for line in lines:
            m = pattern.search(line)
            if m:
                line_matches[m.group(1)] = line

    # Remove entries where the file's mtime has changed or where the
    # actual file/line was removed or doesn't match the pattern anymore:
    for entry in get_present_entries(databank['name']):
        path = entry['filepath']

        if databank['crawltype'] == FILE and \
                (not valid_path(databank['name'], path) or
                 os.path.getmtime(path) != entry['mtime']):
            storage.remove('entries', {'databank_name': databank['name'],
                                       'pdbid': entry['pdbid']})
        elif databank['crawltype'] == LINE and \
                (not os.path.isfile(path) or
                 os.path.getmtime(path) != entry['mtime'] or
                 entry['pdbid'] not in line_matches):
            storage.remove('entries', {'databank_name': databank['name'],
                                       'pdbid': entry['pdbid']})
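# A minimal sketch of how the databank regex drives the matching above,
# assuming parse_regex simply wraps re.compile (the real helper may add
# flags or anchoring). The pattern and path below are hypothetical.
def _example_extract_pdbid():
    import re
    pattern = re.compile(r'pdb([0-9a-z]{4})\.ent')  # hypothetical databank regex
    m = pattern.search('/data/pdb/pdb1crn.ent')     # hypothetical mirror path
    return m.group(1) if m else None                # -> '1crn'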
def crawl_files(databank, pathnames):
    """Insert or update an entry for every pathname matching the databank's pattern."""
    present_entries_bypdbid = entries_by_pdbid(
        get_present_entries(databank['name']))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank['name']},
                     {'pdbid': 1}))

    pattern = parse_regex(databank['regex'])

    for f in pathnames:
        # Only use files that match the databank's pattern.
        m = pattern.search(f)
        if not m:
            continue

        # For disk files take their mtimes, for urls take the current time.
        mtime = time()
        if os.path.isfile(f):
            mtime = os.path.getmtime(f)

        entry = {
            'databank_name': databank['name'],
            'pdbid': m.group(1).lower(),
            'filepath': f,
            'mtime': mtime
        }

        if entry['pdbid'] in present_entries_bypdbid:
            continue

        if entry['pdbid'] in record_pdbids:
            storage.update('entries',
                           {'databank_name': databank['name'],
                            'pdbid': entry['pdbid']},
                           entry)
        else:
            storage.insert('entries', entry)
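# A hypothetical invocation of crawl_files, assuming a DSSP-style databank
# record and a local mirror directory; the name, regex, and path are made up
# for illustration only.
def _example_crawl_files():
    import glob
    databank = {'name': 'DSSP',                    # hypothetical databank
                'crawltype': FILE,
                'regex': r'([0-9a-z]{4})\.dssp$'}  # hypothetical pattern
    crawl_files(databank, glob.glob('/data/dssp/*.dssp'))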
def crawl_lines(databank, filepath, lines):
    """Insert or update an entry for every line of filepath matching the databank's pattern."""
    present_entries_bypdbid = entries_by_pdbid(
        get_present_entries(databank['name']))
    record_pdbids = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank['name']},
                     {'pdbid': 1}))

    pattern = parse_regex(databank['regex'])

    # If it's a disk file take its mtime, for urls take the current time.
    mtime = time()
    if os.path.isfile(filepath):
        mtime = os.path.getmtime(filepath)

    for line in lines:
        # Only use lines that match the databank's pattern.
        m = pattern.search(line)
        if not m:
            continue

        entry = {
            'databank_name': databank['name'],
            'pdbid': m.group(1).lower(),
            'filepath': filepath,
            'mtime': mtime
        }

        if entry['pdbid'] in present_entries_bypdbid:
            continue

        if entry['pdbid'] in record_pdbids:
            storage.update('entries',
                           {'databank_name': databank['name'],
                            'pdbid': entry['pdbid']},
                           entry)
        else:
            storage.insert('entries', entry)
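# A plausible sketch of the entries_by_pdbid helper used above: it indexes a
# list of entry records by their 'pdbid' field so the membership tests in the
# crawl functions are O(1). The real helper may differ in detail.
def _example_entries_by_pdbid(entries):
    return {entry['pdbid']: entry for entry in entries}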
# This is simply a command-line tool for quick listing of the database contents.
usage = ('Usage: %s [DB] [present|missing|valid|obsolete|annotated|'
         'unannotated|comment:*|pdbid:????]' % sys.argv[0])

if len(sys.argv) < 2:
    print(usage)
    sys.exit(0)

dbname = sys.argv[1]

if len(sys.argv) == 3:
    category = sys.argv[2]

    entries = []
    if category.lower() == 'present':
        entries = get_present_entries(dbname)
    elif category.lower() == 'missing':
        entries = get_missing_entries(dbname)
    elif category.lower() == 'valid':
        entries = get_valid_entries(dbname)
    elif category.lower() == 'obsolete':
        entries = get_obsolete_entries(dbname)
    elif category.lower() == 'annotated':
        entries = get_annotated_entries(dbname)
    elif category.lower() == 'unannotated':
        entries = get_unannotated_entries(dbname)
    elif category.lower().startswith('comment:'):
        entries = get_entries_with_comment(dbname, category[8:].strip())
    elif category.lower().startswith('pdbid:'):
        print(get_entries_with_pdbid(dbname, category[6:].strip()))
    else: