Ejemplo n.º 1
0
Archivo: crawl.py Proyecto: cmbi/whynot
def remove_changed(databank, lines=None):
    """Remove stored entries whose source file or line has changed.

    An entry is removed when its recorded mtime no longer matches the
    file on disk, when the file is gone / no longer a valid path, or
    (for LINE-crawled databanks) when its pdbid no longer matches any
    of the given lines.

    :param databank: databank record dict; 'name', 'regex' and
        'crawltype' keys are read here.
    :param lines: lines of the crawled file; only consulted when the
        databank's crawltype is LINE.
    """
    # `lines=None` avoids the mutable-default-argument pitfall of the
    # original `lines=[]` signature (one shared list across calls).
    if lines is None:
        lines = []

    pattern = parse_regex(databank['regex'])

    # For LINE databanks, map each matched pdbid to its line so we can
    # check below whether an entry's pdbid still occurs in the file.
    line_matches = {}
    if databank['crawltype'] == LINE:
        for line in lines:
            m = pattern.search(line)
            if m:
                line_matches[m.group(1)] = line

    # Remove entries where the file's mtime has changed or where the
    # actual file/line was removed or doesn't match the pattern anymore:
    for entry in get_present_entries(databank['name']):

        path = entry['filepath']
        if databank['crawltype'] == FILE and \
                (not valid_path(databank['name'], path) or
                 os.path.getmtime(path) != entry['mtime']):

            storage.remove('entries', {'databank_name': databank['name'], 'pdbid': entry['pdbid']})

        elif databank['crawltype'] == LINE and \
                (not os.path.isfile(path) or
                 os.path.getmtime(path) != entry['mtime'] or
                 entry['pdbid'] not in line_matches):

            storage.remove('entries', {'databank_name': databank['name'], 'pdbid': entry['pdbid']})
Ejemplo n.º 2
0
Archivo: crawl.py Proyecto: cmbi/whynot
def crawl_files(databank, pathnames):
    """Record an entry for every pathname matching the databank's regex.

    Pathnames whose pdbid is already present are skipped; the rest are
    either updated (if a record exists) or inserted.
    """
    databank_name = databank['name']
    present = entries_by_pdbid(get_present_entries(databank_name))
    recorded = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name},
                     {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    for pathname in pathnames:

        match = pattern.search(pathname)
        if match is None:
            continue  # file doesn't belong to this databank

        # Disk files get their real mtime; anything else (urls) gets "now".
        mtime = os.path.getmtime(pathname) if os.path.isfile(pathname) \
            else time()

        pdbid = match.group(1).lower()
        if pdbid in present:
            continue  # already present, nothing to update

        entry = {
            'databank_name': databank_name,
            'pdbid': pdbid,
            'filepath': pathname,
            'mtime': mtime
        }
        if pdbid in recorded:
            storage.update('entries',
                           {'databank_name': databank_name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
Ejemplo n.º 3
0
Archivo: crawl.py Proyecto: cmbi/whynot
def crawl_lines(databank, filepath, lines):
    """Record an entry for every line of *filepath* matching the regex.

    Lines whose pdbid is already present are skipped; the rest are
    either updated (if a record exists) or inserted.
    """
    databank_name = databank['name']
    present = entries_by_pdbid(get_present_entries(databank_name))
    recorded = entries_by_pdbid(
        storage.find('entries', {'databank_name': databank_name},
                     {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    # A disk file gets its real mtime; anything else (a url) gets "now".
    mtime = os.path.getmtime(filepath) if os.path.isfile(filepath) \
        else time()

    for line in lines:

        match = pattern.search(line)
        if match is None:
            continue  # line doesn't belong to this databank

        pdbid = match.group(1).lower()
        if pdbid in present:
            continue  # already present, nothing to update

        entry = {
            'databank_name': databank_name,
            'pdbid': pdbid,
            'filepath': filepath,
            'mtime': mtime
        }
        if pdbid in recorded:
            storage.update('entries',
                           {'databank_name': databank_name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
Ejemplo n.º 4
0
def remove_changed(databank, lines=None):
    """Remove stored entries whose source file or line has changed.

    An entry is removed when its recorded mtime no longer matches the
    file on disk, when the file is gone / no longer a valid path, or
    (for LINE-crawled databanks) when its pdbid no longer matches any
    of the given lines.

    :param databank: databank record dict; 'name', 'regex' and
        'crawltype' keys are read here.
    :param lines: lines of the crawled file; only consulted when the
        databank's crawltype is LINE.
    """
    # `lines=None` avoids the mutable-default-argument pitfall of the
    # original `lines=[]` signature (one shared list across calls).
    if lines is None:
        lines = []

    pattern = parse_regex(databank['regex'])

    # For LINE databanks, map each matched pdbid to its line so we can
    # check below whether an entry's pdbid still occurs in the file.
    line_matches = {}
    if databank['crawltype'] == LINE:
        for line in lines:
            m = pattern.search(line)
            if m:
                line_matches[m.group(1)] = line

    # Remove entries where the file's mtime has changed or where the
    # actual file/line was removed or doesn't match the pattern anymore:
    for entry in get_present_entries(databank['name']):

        path = entry['filepath']
        if databank['crawltype'] == FILE and \
                (not valid_path(databank['name'], path) or
                 os.path.getmtime(path) != entry['mtime']):

            storage.remove('entries', {
                'databank_name': databank['name'],
                'pdbid': entry['pdbid']
            })

        elif databank['crawltype'] == LINE and \
                (not os.path.isfile(path) or
                 os.path.getmtime(path) != entry['mtime'] or
                 entry['pdbid'] not in line_matches):

            storage.remove('entries', {
                'databank_name': databank['name'],
                'pdbid': entry['pdbid']
            })
Ejemplo n.º 5
0
def crawl_lines(databank, filepath, lines):
    """Record an entry for every line of *filepath* matching the regex.

    Lines whose pdbid is already present are skipped; the rest are
    either updated (if a record exists) or inserted.
    """
    name = databank['name']
    present_bypdbid = entries_by_pdbid(get_present_entries(name))
    recorded_bypdbid = entries_by_pdbid(
        storage.find('entries', {'databank_name': name}, {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    # A disk file gets its real mtime; anything else (a url) gets "now".
    if os.path.isfile(filepath):
        mtime = os.path.getmtime(filepath)
    else:
        mtime = time()

    for line in lines:

        # Only use lines that match the databank's pattern.
        found = pattern.search(line)
        if found is None:
            continue

        pdbid = found.group(1).lower()

        # Present entries are up to date; leave them alone.
        if pdbid in present_bypdbid:
            continue

        entry = {
            'databank_name': name,
            'pdbid': pdbid,
            'filepath': filepath,
            'mtime': mtime
        }
        if pdbid in recorded_bypdbid:
            storage.update('entries',
                           {'databank_name': name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
Ejemplo n.º 6
0
def crawl_files(databank, pathnames):
    """Record an entry for every pathname matching the databank's regex.

    Pathnames whose pdbid is already present are skipped; the rest are
    either updated (if a record exists) or inserted.
    """
    name = databank['name']
    present_bypdbid = entries_by_pdbid(get_present_entries(name))
    recorded_bypdbid = entries_by_pdbid(
        storage.find('entries', {'databank_name': name}, {'pdbid': 1}))
    pattern = parse_regex(databank['regex'])

    for pathname in pathnames:

        # Only use files that match the databank's pattern.
        found = pattern.search(pathname)
        if found is None:
            continue

        # Disk files get their real mtime; anything else (urls) gets "now".
        if os.path.isfile(pathname):
            mtime = os.path.getmtime(pathname)
        else:
            mtime = time()

        pdbid = found.group(1).lower()

        # Present entries are up to date; leave them alone.
        if pdbid in present_bypdbid:
            continue

        entry = {
            'databank_name': name,
            'pdbid': pdbid,
            'filepath': pathname,
            'mtime': mtime
        }
        if pdbid in recorded_bypdbid:
            storage.update('entries',
                           {'databank_name': name, 'pdbid': pdbid},
                           entry)
        else:
            storage.insert('entries', entry)
Ejemplo n.º 7
0
# This is simply a commandline tool for quick listing of the database contents.

usage = 'Usage: %s [DB] [present|missing|valid|obsolete|annotated|unannotated|comment:*|pdbid:????]' % sys.argv[
    0]
if len(sys.argv) < 2:
    print(usage)
    sys.exit(0)

dbname = sys.argv[1]

if len(sys.argv) == 3:
    category = sys.argv[2]

    entries = []
    if category.lower() == 'present':
        entries = get_present_entries(dbname)
    elif category.lower() == 'missing':
        entries = get_missing_entries(dbname)
    elif category.lower() == 'valid':
        entries = get_valid_entries(dbname)
    elif category.lower() == 'obsolete':
        entries = get_obsolete_entries(dbname)
    elif category.lower() == 'annotated':
        entries = get_annotated_entries(dbname)
    elif category.lower() == 'unannotated':
        entries = get_unannotated_entries(dbname)
    elif category.lower().startswith('comment:'):
        entries = get_entries_with_comment(dbname, category[8:].strip())
    elif category.lower().startswith('pdbid:'):
        print(get_entries_with_pdbid(dbname, category[6:].strip()))
    else:
Ejemplo n.º 8
0
# This is simply a commandline tool for quick listing of the database contents.

usage='Usage: %s [DB] [present|missing|valid|obsolete|annotated|unannotated|comment:*|pdbid:????]'%sys.argv[0]
if len(sys.argv) < 2:
    print usage
    sys.exit(0)

dbname=sys.argv[1]

if len(sys.argv) == 3:
    category=sys.argv[2]

    entries=[]
    if category.lower()=='present':
        entries=get_present_entries(dbname)
    elif category.lower()=='missing':
        entries=get_missing_entries(dbname)
    elif category.lower()=='valid':
        entries=get_valid_entries(dbname)
    elif category.lower()=='obsolete':
        entries=get_obsolete_entries(dbname)
    elif category.lower()=='annotated':
        entries=get_annotated_entries(dbname)
    elif category.lower()=='unannotated':
        entries=get_unannotated_entries(dbname)
    elif category.lower().startswith('comment:'):
        entries=get_entries_with_comment(dbname,category[8:].strip())
    elif category.lower().startswith('pdbid:'):
        print get_entries_with_pdbid(dbname,category[6:].strip())
    else: