def get_all_editions(self):
    """Returns all the editions of this list in arbitrary order.

    The return value is an iterator over all the editions. Each entry is a
    dictionary. (Compare the difference with get_editions.)

    This works even for lists with too many seeds as it doesn't try to
    return editions in the order of last-modified.
    """
    rawseeds = self._get_rawseeds()

    def get_edition_keys(seeds):
        d = self._editions_view(seeds, limit=10000, stale="ok")
        return [row['id'] for row in d['rows']]

    keys = set()

    # When there are too many seeds, couchdb-lucene fails because the query URL is too long.
    # Splitting the seeds into groups of 50 to avoid that trouble.
    for seeds in web.group(rawseeds, 50):
        keys.update(get_edition_keys(seeds))

    # Load docs from couchdb now.
    for chunk in web.group(keys, 1000):
        docs = self.get_couchdb_docs(self._get_editions_db(), chunk)
        for doc in docs.values():
            del doc['_id']
            del doc['_rev']
            yield doc
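
# A minimal, self-contained sketch of the batching idiom used throughout these
# examples: web.py's web.group(seq, n) lazily yields lists of up to n items,
# which is how long key lists get split into CouchDB/Solr-sized requests above.
# The keys and the batch size of 3 below are made up for illustration.
import web

keys = ["/books/OL%dM" % i for i in range(1, 8)]
batches = list(web.group(keys, 3))
assert batches == [
    ["/books/OL1M", "/books/OL2M", "/books/OL3M"],
    ["/books/OL4M", "/books/OL5M", "/books/OL6M"],
    ["/books/OL7M"],  # the final batch may be shorter than n
]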
def update_docs(db, all_docs, chunk_size=10000, comment=""):
    now = datetime.datetime.utcnow()

    for chunk in web.group(all_docs, chunk_size):
        print chunk
        d = dict((doc['key'], doc) for doc in chunk)
        rows = get_docs(db, d.keys())

        for row in rows:
            row.doc.update(d[row.key])
            row.doc['revision'] = row.revision + 1
            row.doc['latest_revision'] = row.revision + 1
            row.doc['last_modified']['value'] = now.isoformat()

        data = [web.storage(thing_id=row.id, revision=row.revision + 1, data=simplejson.dumps(row.doc))
                for row in rows]

        author_id = get_thing_id(db, "/user/anand")

        t = db.transaction()
        try:
            tx_id = db.insert("transaction",
                              author_id=author_id,
                              action="bulk_update",
                              ip="127.0.0.1",
                              bot=True,
                              created=now,
                              comment=comment)
            # use a distinct loop variable here so the docs dict `d` is not
            # clobbered; `d.keys()` is needed in the UPDATE query below.
            db.multiple_insert("version",
                               [dict(thing_id=v.thing_id, transaction_id=tx_id, revision=v.revision) for v in data],
                               seqname=False)
            db.multiple_insert("data", data, seqname=False)
            db.query("UPDATE thing set latest_revision=latest_revision+1 WHERE key in $d.keys()", vars=locals())
        except:
            t.rollback()
            raise
        else:
            t.commit()
def request(path, method, data):
    """Fakes the web request. Useful when infobase is not run as a separate process.
    """
    web.ctx.infobase_localmode = True
    web.ctx.infobase_input = data or {}
    web.ctx.infobase_method = method

    def get_class(classname):
        if '.' in classname:
            modname, classname = classname.rsplit('.', 1)
            mod = __import__(modname, None, None, ['x'])
            fvars = mod.__dict__
        else:
            fvars = globals()
        return fvars[classname]

    try:
        # hack to make cache work for local infobase connections
        cache.loadhook()

        for pattern, classname in web.group(app.mapping, 2):
            m = web.re_compile('^' + pattern + '$').match(path)
            if m:
                args = m.groups()
                cls = get_class(classname)
                tocall = getattr(cls(), method)
                return tocall(*args)
        raise web.notfound()
    finally:
        # hack to make cache work for local infobase connections
        cache.unloadhook()
def _process_key(key):
    mapping = ('/l/', '/languages/', '/a/', '/authors/', '/b/', '/books/', '/user/', '/people/')
    for old, new in web.group(mapping, 2):
        if key.startswith(old):
            return new + key[len(old):]
    return key
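
# The flat mapping tuple above is consumed two items at a time:
# web.group(mapping, 2) yields ['/l/', '/languages/'], ['/a/', '/authors/'],
# ['/b/', '/books/'], ['/user/', '/people/'], so each legacy prefix is tried
# against the key in order. A small usage sketch with hypothetical keys,
# assuming the _process_key defined directly above:
assert _process_key('/b/OL1M') == '/books/OL1M'
assert _process_key('/works/OL1W') == '/works/OL1W'  # no legacy prefix, unchanged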
def _process_key(key):
    mapping = ("/l/", "/languages/", "/a/", "/authors/", "/b/", "/books/", "/user/", "/people/")
    for old, new in web.group(mapping, 2):
        if key.startswith(old):
            return new + key[len(old):]
    return key
def preload_redirects(self, keys):
    keys = [k for k in keys if k not in self.redirect_cache]
    if not keys:
        return

    logger.info("preload_redirects %s", keys)
    for chunk in web.group(keys, 100):
        self._preload_redirects0(list(chunk))
async def update_keys(keys):
    if not keys:
        return 0

    # FIXME: Some kind of hack introduced to work around DB connectivity issue
    global args
    logger.debug("Args: %s" % str(args))
    update_work.load_configs(args['ol_url'], args['ol_config'], 'default')

    keys = [
        k
        for k in keys
        if k.count("/") == 2 and k.split("/")[1] in ("books", "authors", "works")
    ]

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        await update_work.do_updates(chunk)

        # Caches should not persist between different calls to update_keys!
        update_work.data_provider.clear_cache()

    if count:
        logger.info("updated %d documents", count)

    return count
def parse():
    states = fips2state()

    shapeid2district = {}
    for lines in web.group(file(DATA_DIR + '/cd99_110a.dat'), 7):
        num, fipscode, distnum, distname, distid, distdesc, ignore = [x.strip().strip('"') for x in lines]
        if not fipscode.strip():
            continue
        shapeid2district[num] = states[fipscode] + '-' + distnum

    out = {}
    for line in file(DATA_DIR + '/cd99_110.dat'):
        nums = line.strip().split()
        if len(nums) == 3:
            shapeid = nums[0]  # other points are the center
            if shapeid in shapeid2district:
                SKIPME = False
                district = shapeid2district[shapeid]
                out.setdefault(district, [])
                out[district].append([])
            else:
                SKIPME = True
        elif len(nums) == 2 and not SKIPME:
            out[district][-1].append((float(nums[0]), float(nums[1])))

    return out
def backupQueueVisitors(self, date, queue_list=None):
    """Back up patient information for the specified queues.

    Args:
        date: the cut-off date
        queue_list: list of queues to restrict the backup to
    """
    where = "registDate < '{0}'".format(date)
    if queue_list:
        where += " AND queueID IN {0}".format(str(web.db.sqlquote(queue_list)))

    # expired_visitor_data = self.db.select("visitor_source_data",
    #                                       where=where).list()
    sql = "SELECT vs.*, vl.status AS localStatus, vl.workStartTime, " \
          "vl.workEndTime FROM visitor_source_data vs INNER JOIN " \
          "(SELECT * FROM visitor_local_data WHERE {0}) vl " \
          "ON vs.id = vl.id".format(where)
    expired_visitor_data = self.db.query(sql).list()

    self.db.printing = False
    if expired_visitor_data:
        visitor_group = web.group(expired_visitor_data, 1000)
        for item in visitor_group:
            self.db.multiple_insert("visitor_backup_data", item)
    self.db.printing = True

    self.db.delete("visitor_source_data", where)
    self.db.delete("visitor_local_data", where)
def _parse_solr_result(self, result, doc_wrapper, facet_wrapper):
    response = result['response']

    doc_wrapper = doc_wrapper or web.storage
    facet_wrapper = facet_wrapper or (lambda name, value, count: web.storage(locals()))

    d = web.storage()
    d.num_found = response['numFound']
    d.docs = [doc_wrapper(doc) for doc in response['docs']]

    if 'facet_counts' in result:
        d.facets = {}
        for k, v in result['facet_counts']['facet_fields'].items():
            d.facets[k] = [
                facet_wrapper(k, value, count)
                for value, count in web.group(v, 2)
            ]

    if 'highlighting' in result:
        d.highlighting = result['highlighting']

    if 'spellcheck' in result:
        d.spellcheck = result['spellcheck']

    return d
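
# Why web.group(v, 2) above: with wt=json, Solr returns each entry of
# facet_counts.facet_fields as a flat list alternating value and count,
# e.g. ["eng", 120, "fre", 7]. A minimal sketch pairing such a list
# (the field values and counts are made up):
import web

flat = ["eng", 120, "fre", 7]
assert [(value, count) for value, count in web.group(flat, 2)] == [("eng", 120), ("fre", 7)]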
def add_urls(module):
    global urls
    module_urls = []
    for path, classname in web.group(module.urls, 2):
        classname = module.__name__ + "." + classname
        module_urls.extend([path, classname])
    urls = urls + tuple(module_urls)
def _get_all_facet_counts(self):
    if not self._facet_counts:
        facets = [
            "library_s", "region_s", "country_s", "ia_collections_id",
            "sponsor_s", "contributor_s", "book_key_s", "author_keys_id",
            "resource_type_s", "subject_facet", "place_facet", "person_facet",
            "time_facet"
        ]

        params = {
            "wt": "json",
            "fq": "type:stats",
            "q": "*:*",
            "rows": 0,
            "facet": "on",
            "facet.mincount": 1,
            "facet.field": facets,
            "facet.limit": 20
        }
        response = self.solr_select(params)
        self._total_loans = response['response']['numFound']
        self._facet_counts = dict(
            (name, web.group(counts, 2))
            for name, counts in response['facet_counts']['facet_fields'].items())
    return self._facet_counts
def get_loan_durations(self):
    params = {
        "wt": "json",
        "q": "*:*",
        "rows": 0,
        "facet": "on",
        "facet.field": ['duration_hours_i']
    }
    response = self.solr_select(params)
    counts = [[int(hr), count] for hr, count in web.group(
        response['facet_counts']['facet_fields']['duration_hours_i'], 2)]

    one_hour = sum(count for hr, count in counts if hr == 0)
    one_day = sum(count for hr, count in counts if 1 <= hr < 24)
    one_week = sum(count for hr, count in counts if 24 <= hr < 24 * 7)
    two_week = sum(count for hr, count in counts if 24 * 7 <= hr < 24 * 14)
    expired = sum(count for hr, count in counts if 24 * 14 <= hr)

    return [{
        "label": "Less than one hour",
        "data": one_hour
    }, {
        "label": "Less than one day",
        "data": one_day
    }, {
        "label": "Less than one week",
        "data": one_week
    }, {
        "label": "More than a week",
        "data": two_week
    }, {
        "label": "Loan expired",
        "data": expired
    }]
def parse():
    shapeid2district = {}
    for lines in web.group(file(DATA_DIR + '/cd99_110a.dat'), 7):
        num, fipscode, distnum, distname, distid, distdesc, ignore = \
            [x.strip().strip('"') for x in lines]
        if not fipscode.strip():
            continue
        shapeid2district[num] = (fipscode, distnum)

    out = {}
    for line in file(DATA_DIR + '/cd99_110.dat'):
        nums = line.strip().split()
        if len(nums) == 3:
            shapeid = nums[0]  # other points are the center
            if shapeid in shapeid2district:
                SKIPME = False
                district = shapeid2district[shapeid]
                out.setdefault(district, [])
                out[district].append([])
            else:
                SKIPME = True
        elif len(nums) == 2 and not SKIPME:
            out[district][-1].append((float(nums[0]), float(nums[1])))

    for (fipscode, distnum), shapes in out.iteritems():
        yield {
            '_type': 'district',
            'state_fipscode': fipscode,
            'district': distnum,
            'shapes': shapes
        }
def update_keys(keys):
    if not keys:
        return 0

    # FIXME: Some kind of hack introduced to work around DB connectivity issue
    global args
    logger.debug("Args: %s" % str(args))
    update_work.load_configs(args.ol_url, args.config, 'default')

    keys = [
        k for k in keys
        if k.count("/") == 2 and k.split("/")[1] in ("books", "authors", "works")
    ]

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.do_updates(chunk)

    if count:
        logger.info("updated %d documents", count)

    return count
def load_identifiers(self, identifiers):
    for chunk in web.group(identifiers, 1000):
        chunk = list(set(chunk))
        result = self.db.query(
            "SELECT identifier FROM bookloader WHERE identifier IN $chunk",
            vars=locals())
        present = set(row.identifier for row in result)
        data = [dict(identifier=id) for id in chunk if id not in present]
        if data:
            self.db.multiple_insert("bookloader", data)
def preload_documents0(self, keys):
    keys = [k for k in keys if k not in self.cache]
    if not keys:
        return

    logger.info("preload_documents0 %s", keys)
    for chunk in web.group(keys, 100):
        docs = web.ctx.site.get_many(list(chunk))
        for doc in docs:
            self.cache[doc['key']] = doc.dict()
def _process_key(self, key):
    mapping = ("/l/", "/languages/", "/a/", "/authors/", "/b/", "/books/", "/user/", "/people/")
    if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
        for old, new in web.group(mapping, 2):
            if key.startswith(old):
                return new + key[len(old):]
    return key
def list_ranges(self):
    r = db.select(self.sql_table,
                  vars=dict(start_id=self.start_id, end_id=self.end_id, step=self.step),
                  what='id, %s as value' % self.sql_field,
                  where='id >= $start_id and id <= $end_id and\
                         ((id-$start_id) % $step = 0 or (id-$start_id+1) % $step = 0 or\
                         id = $end_id)',
                  limit=self.max_size * 2)
    return web.group(r, 2)
def get_editions(self, limit=50, offset=0, _raw=False):
    """Returns the edition objects belonging to this list, ordered by last_modified.

    When _raw=True, the edition dicts are returned instead of edition objects.
    """
    # show at max 10 pages
    MAX_OFFSET = min(self.edition_count, 50 * 10)

    if not self.seeds or offset > MAX_OFFSET:
        return {
            "count": 0,
            "offset": offset,
            "limit": limit,
            "editions": []
        }

    # We don't want to give more than 500 editions for performance reasons.
    if offset + limit > MAX_OFFSET:
        limit = MAX_OFFSET - offset

    key_data = []
    rawseeds = self._get_rawseeds()
    for seeds in web.group(rawseeds, 50):
        key_data += self._get_edition_keys(seeds, limit=MAX_OFFSET)

    keys = [key for key, last_modified in sorted(key_data, key=lambda x: x[1], reverse=True)]
    keys = keys[offset:limit]

    # Get the documents from couchdb
    docs = self.get_couchdb_docs(self._get_editions_db(), keys)

    def get_doc(key):
        doc = docs[key]
        del doc['_id']
        del doc['_rev']
        if not _raw:
            data = self._site._process_dict(common.parse_query(doc))
            doc = client.create_thing(self._site, doc['key'], data)
        return doc

    d = {
        "count": self.edition_count,
        "offset": offset,
        "limit": limit,
        "editions": [get_doc(key) for key in keys]
    }

    if offset + limit < MAX_OFFSET:
        d['next_params'] = {
            'offset': offset + limit
        }

    if offset > 0:
        d['prev_params'] = {
            'offset': max(0, offset - limit)
        }

    return d
def process_changesets(self, changesets, update_seeds=False):
    """Updates the lists databases for given changesets.

    Seeds are updated in the seeds db if update_seeds is True, otherwise
    they are marked for later update.
    """
    logger.info("BEGIN process_changesets")
    ctx = UpdaterContext()
    for chunk in web.group(changesets, 50):
        chunk = list(chunk)
        logger.info("processing changesets %s", [c['id'] for c in chunk])

        works = [work for changeset in chunk
                      for work in self._get_works(changeset)]
        editions = [e for changeset in chunk
                      for e in self._get_editions(changeset)]

        logger.info("found %d works and %d editions", len(works), len(editions))

        keys = [w['key'] for w in works] + [e['works'][0]['key'] for e in editions if e.get('works')]
        keys = list(set(keys))
        self.works_db.db.preload(keys)

        for work in works:
            work = self.works_db.update_work(ctx, work)

        # works have been modified. Commit to update the views.
        logger.info("BEGIN commit works_db")
        self.works_db.db.commit()
        logger.info("END commit works_db")

        self.works_db.update_editions(ctx, editions)
        self.editions_db.update_editions(ctx.editions.values())
        ctx.editions.clear()

        t = datetime.datetime.utcnow().isoformat()
        if ctx.seeds:
            logger.info("BEGIN commit works_db")
            self.works_db.db.commit()
            logger.info("END commit works_db")

            logger.info("BEGIN mark %d seeds for update" % len(ctx.seeds))
            if update_seeds:
                self.seeds_db.update_seeds(ctx.seeds.keys())
            else:
                self.seeds_db.mark_seeds_for_update(ctx.seeds.keys())
            logger.info("END mark %d seeds for update" % len(ctx.seeds))
            ctx.seeds.clear()

        # reset to make sure the size of the cache never grows without limit.
        if len(self.works_db.db.docs) > 1000:
            self.works_db.db.reset()

    self.works_db.db.commit()
    self.works_db.db.reset()
    logger.info("END process_changesets")
def get_many(self, keys):
    """Get multiple documents in a single request as a dictionary."""
    if len(keys) > 100:
        # Process in batches to avoid crossing the URL length limit.
        d = {}
        for chunk in web.group(keys, 100):
            d.update(self._get_many(chunk))
        return d
    else:
        return self._get_many(keys)
def get_editions(self, limit=50, offset=0, _raw=False):
    """Returns the edition objects belonging to this list, ordered by last_modified.

    When _raw=True, the edition dicts are returned instead of edition objects.
    """
    # show at max 10 pages
    MAX_OFFSET = min(self.edition_count, 50 * 10)

    if not self.seeds or offset > MAX_OFFSET:
        return {
            "count": 0,
            "offset": offset,
            "limit": limit,
            "editions": []
        }

    # We don't want to give more than 500 editions for performance reasons.
    if offset + limit > MAX_OFFSET:
        limit = MAX_OFFSET - offset

    key_data = []
    rawseeds = self._get_rawseeds()
    for seeds in web.group(rawseeds, 50):
        key_data += self._get_edition_keys(seeds, limit=MAX_OFFSET)

    keys = [
        key for key, last_modified in sorted(
            key_data, key=lambda x: x[1], reverse=True)
    ]
    keys = keys[offset:limit]

    # Get the documents from couchdb
    docs = self.get_couchdb_docs(self._get_editions_db(), keys)

    def get_doc(key):
        doc = docs[key]
        del doc['_id']
        del doc['_rev']
        if not _raw:
            data = self._site._process_dict(common.parse_query(doc))
            doc = client.create_thing(self._site, doc['key'], data)
        return doc

    d = {
        "count": self.edition_count,
        "offset": offset,
        "limit": limit,
        "editions": [get_doc(key) for key in keys]
    }

    if offset + limit < MAX_OFFSET:
        d['next_params'] = {'offset': offset + limit}

    if offset > 0:
        d['prev_params'] = {'offset': max(0, offset - limit)}

    return d
def update_seeds(self, seeds, chunksize=50):
    big_seeds = self.get_big_seeds()
    seeds2 = sorted(seed for seed in seeds if seed not in big_seeds)

    logging.info("update_seeds %s", len(seeds2))
    logging.info("ignored %d big seeds", len(seeds) - len(seeds2))

    for i, chunk in enumerate(web.group(seeds2, chunksize)):
        chunk = list(chunk)
        logging.info("update_seeds %d %d", i, len(chunk))
        self._update_seeds(chunk)
def find_not_indexed(keys, chunk_size=1000):
    for chunk in web.group(keys, chunk_size):
        chunk = list(chunk)
        q = " OR ".join("key:" + k for k in chunk)
        params = urllib.urlencode({"q": q, "rows": chunk_size, "wt": "json", "fl": "key"})
        url = solr_base_url + "/select"
        d = jsonget(url, params)
        found = set(doc['key'] for doc in d['response']['docs'])
        for k in chunk:
            if k not in found:
                yield k
def get_many(self, keys):
    """Get multiple documents in a single request as a dictionary.
    """
    if len(keys) > 500:
        # get in chunks of 500 to avoid crossing the URL length limit.
        d = {}
        for chunk in web.group(keys, 500):
            d.update(self._get_many(chunk))
        return d
    else:
        return self._get_many(keys)
def _process_key(self, key):
    # Some data in the database still has /b/ instead of /books/.
    # The transformation is still done in software.
    mapping = ("/l/", "/languages/", "/a/", "/authors/", "/b/", "/books/", "/user/", "/people/")
    if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
        for old, new in web.group(mapping, 2):
            if key.startswith(old):
                return new + key[len(old):]
    return key
def _process_key(key):
    mapping = (
        "/l/", "/languages/",
        "/a/", "/authors/",
        "/b/", "/books/",
        "/user/", "/people/"
    )
    for old, new in web.group(mapping, 2):
        if key.startswith(old):
            return new + key[len(old):]
    return key
def update_seeds(self, seeds, chunksize=50):
    big_seeds = self.get_big_seeds()
    seeds2 = sorted(seed for seed in seeds if seed not in big_seeds)

    logger.info("update_seeds %s", len(seeds2))
    logger.info("ignored %d big seeds", len(seeds) - len(seeds2))

    for i, chunk in enumerate(web.group(seeds2, chunksize)):
        chunk = list(chunk)
        logger.info("update_seeds %d %d", i, len(chunk))
        self._update_seeds(chunk)
def _get_docs(self, keys):
    """Returns docs for the specified keys as a dictionary.
    """
    docs = {}
    for keys2 in web.group(keys, 500):
        json = self.infobase_conn.request(
            sitename="openlibrary.org",
            path="/get_many",
            data={"keys": simplejson.dumps(keys2)})
        docs2 = simplejson.loads(json)
        docs.update(docs2)
    return docs
def _process_key(self, key):
    mapping = (
        "/l/", "/languages/",
        "/a/", "/authors/",
        "/b/", "/books/",
        "/user/", "/people/"
    )
    if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
        for old, new in web.group(mapping, 2):
            if key.startswith(old):
                return new + key[len(old):]
    return key
def write_sitemaps(data, outdir, prefix):
    timestamp = datetime.datetime.utcnow().isoformat() + 'Z'

    # maximum permitted entries in one sitemap is 50K.
    for i, rows in enumerate(web.group(data, 50000)):
        filename = "sitemap_%s_%04d.xml.gz" % (prefix, i)
        print("generating", filename, file=sys.stderr)

        sitemap = web.safestr(t_sitemap(rows))

        path = os.path.join(outdir, filename)
        gzwrite(path, sitemap)
        yield filename, timestamp
def write_sitemaps(data, outdir, prefix):
    timestamp = datetime.datetime.utcnow().isoformat() + 'Z'

    # maximum permitted entries in one sitemap is 50K.
    for i, rows in enumerate(web.group(data, 50000)):
        filename = "sitemap_%s_%04d.xml.gz" % (prefix, i)
        print >> sys.stderr, "generating", filename

        sitemap = web.safestr(t_sitemap(rows))

        path = os.path.join(outdir, filename)
        gzwrite(path, sitemap)
        yield filename, timestamp
def update_seeds(self, seeds, chunksize=50):
    # XXX-Anand: temporarily disable updates as the node hosting seeds_db is low on disk
    return

    big_seeds = self.get_big_seeds()
    seeds2 = sorted(seed for seed in seeds if seed not in big_seeds)

    logger.info("update_seeds %s", len(seeds2))
    logger.info("ignored %d big seeds", len(seeds) - len(seeds2))

    for i, chunk in enumerate(web.group(seeds2, chunksize)):
        chunk = list(chunk)
        logger.info("update_seeds %d %d", i, len(chunk))
        self._update_seeds(chunk)
def parse(filename, chunk_size=10000):
    t0 = time.time()
    i = 0
    for chunk in web.group(open(filename), chunk_size):
        print i, time.time() - t0
        d = {}
        for line in chunk:
            key, type, revision, json = line.strip().split("\t")
            d["%s@@%s" % (key, revision)] = json
        i += len(d)
        yield d
    print i, time.time() - t0
def update_keys(keys):
    keys = (k for k in keys if k.count("/") == 2 and k.split("/")[1] in ["books", "authors", "works"])

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.update_keys(chunk, commit=False)

    if count:
        logger.info("updated %d documents", count)

    return count
def load_works(self, filename, author="/user/ImportBot"):
    self.author = author
    root = os.path.dirname(filename)
    editions_file = open(os.path.join(root, 'editions.txt'), 'a')
    try:
        for i, lines in enumerate(web.group(open(filename), 1000)):
            t0 = time.time()
            self.load_works_chunk(lines, editions_file)
            t1 = time.time()
            log(i, "%.3f sec" % (t1 - t0))
    finally:
        editions_file.close()
def load_works(self, filename, author="/user/ImportBot"):
    self.author = author
    root = os.path.dirname(filename)
    editions_file = open(os.path.join(root, 'editions.txt'), 'a')
    try:
        for i, lines in enumerate(web.group(open(filename), 1000)):
            t0 = time.time()
            self.load_works_chunk(lines, editions_file)
            t1 = time.time()
            log(i, "%.3f sec" % (t1 - t0))
    finally:
        editions_file.close()
def process_changesets(self, changesets):
    ctx = UpdaterContext()
    for chunk in web.group(changesets, 50):
        chunk = list(chunk)

        works = [
            work for changeset in chunk
            for work in self._get_works(changeset)
        ]
        editions = [
            e for changeset in chunk
            for e in self._get_editions(changeset)
        ]

        keys = [w['key'] for w in works] + [
            e['works'][0]['key'] for e in editions if e.get('works')
        ]
        keys = list(set(keys))
        self.works_db.db.preload(keys)

        for work in works:
            work = self.works_db.update_work(ctx, work)

        # works have been modified. Commit to update the views.
        logging.info("BEGIN commit works_db")
        self.works_db.db.commit()
        logging.info("END commit works_db")

        self.works_db.update_editions(ctx, editions)
        self.editions_db.update_editions(ctx.editions.values())
        ctx.editions.clear()

        t = datetime.datetime.utcnow().isoformat()
        if ctx.seeds:
            logging.info("BEGIN commit works_db")
            self.works_db.db.commit()
            logging.info("END commit works_db")

            logging.info("BEGIN mark %d seeds for update" % len(ctx.seeds))
            self.seeds_db.mark_seeds_for_update(ctx.seeds.keys())
            logging.info("END mark %d seeds for update" % len(ctx.seeds))
            ctx.seeds.clear()

        # reset to make sure the size of the cache never grows without limit.
        if len(self.works_db.db.docs) > 1000:
            self.works_db.db.reset()

    self.works_db.db.commit()
    self.works_db.db.reset()
def get_handler(methodlist):
    dispatcher = SimpleXMLRPCDispatcher(False, None)
    for method in web.group(methodlist, 2):
        dispatcher.register_function(method[1], method[0])

    class rpc:
        def GET(self):
            web.header('Content-Type', 'text/html')
            print get_doc(dispatcher)

        def POST(self):
            response = dispatcher._marshaled_dispatch(web.webapi.data())
            web.header('Content-Type', 'text/xml')
            web.header('Content-length', str(len(response)))
            print response

    return rpc
def _run_solr_facet_query(self, facet_fields, facet_limit=None):
    params = {
        "wt": "json",
        "fq": "type:stats",
        "q": "*:*",
        "rows": 0,
        "facet": "on",
        "facet.mincount": 1,
        "facet.field": facet_fields
    }
    if facet_limit:
        params["facet.limit"] = facet_limit

    response = self.solr_select(params)
    return dict((name, list(web.group(counts, 2)))
                for name, counts in response['facet_counts']['facet_fields'].items())
def GET(self):
    page = self.page
    content = ''
    admin_urls = []
    for key in mod:
        if hasattr(mod[key], 'urls'):
            p = web.group(mod[key].urls, 2)
            for pair in p:
                if pair[0].startswith('/admin/'):
                    admin_urls.append(pair)
                    content += '<p>' + key + ' <a href="' + pair[0] + '">' + pair[1] + '</a> '
                    content += str(getattr(mod[key], pair[1]).__doc__)
                    content += '</p>'
    web.render('generic.html')
def _process_key(self, key):
    # Some data in the database still has /b/ instead of /books/.
    # The transformation is still done in software.
    mapping = (
        "/l/", "/languages/",
        "/a/", "/authors/",
        "/b/", "/books/",
        "/user/", "/people/"
    )
    if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
        for old, new in web.group(mapping, 2):
            if key.startswith(old):
                return new + key[len(old):]
    return key
def generate_html_index(index_file, outdir):
    data = parse_index_file(index_file)
    data = ((d[0], d[1]) for d in data)

    for i, chunk in enumerate(web.group(data, 1000)):
        back = ".."
        index = t_html_layout(t_html_sitemap(back, chunk))

        path = outdir + "/%02d/%05d.html" % (i / 1000, i)
        write(path, web.safestr(index))

    for f in os.listdir(outdir):
        path = os.path.join(outdir, f)
        if os.path.isdir(path):
            dirindex(path)
    dirindex(outdir, back=".")
def generate_html_index(index_file, outdir):
    data = parse_index_file(index_file)
    data = ((d[0], d[1]) for d in data)

    for i, chunk in enumerate(web.group(data, 1000)):
        back = ".."
        index = t_html_layout(t_html_sitemap(back, chunk))

        path = outdir + "/%02d/%05d.html" % (i / 1000, i)
        write(path, web.safestr(index))

    for f in os.listdir(outdir):
        path = os.path.join(outdir, f)
        if os.path.isdir(path):
            dirindex(path)
    dirindex(outdir, back=".")
def update_keys(keys):
    global args

    keys = (k for k in keys if k.count("/") == 2 and k.split("/")[1] in ["books", "authors", "works"])

    update_work.clear_monkeypatch_cache(max_size=10000)

    print str(args)
    update_work.load_configs(args.ol_url, args.config, 'default')

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.do_updates(chunk)

    if count:
        logger.info("updated %d documents", count)

    return count
def get_loans_per_day(self, resource_type="total"):
    params = {
        "wt": "json",
        "fq": ["type:stats"],
        "q": "*:*",
        "rows": 0,
        "facet": "on",
        "facet.mincount": 1,
        "facet.field": ['start_day_s']
    }
    if resource_type != 'total':
        params['fq'].append("resource_type_s:" + resource_type)

    response = self.solr_select(params)
    counts0 = response['facet_counts']['facet_fields']['start_day_s']
    day_facet = web.group(counts0, 2)
    return [[self.date2timestamp(*self.parse_date(day)) * 1000, count]
            for day, count in day_facet]
def add_oclc_ids(filename):
    """Adds OCLC Ids to OL records.
    """
    for mapping in web.group(read_mapping(filename), 1000):
        mapping = dict(mapping)
        docs = get_docs(mapping.keys())

        # ignore docs that already have the oclc_number that we are about to set
        docs = [doc for doc in docs
                if not has_identifier(doc, "oclc_numbers", mapping[doc['key']])]

        for doc in docs:
            add_identifier(doc, "oclc_numbers", mapping[doc['key']])

        if docs:
            ol.save_many(docs, comment="Added OCLC numbers.")