Example #1
    def get_all_editions(self):
        """Returns all the editions of this list in arbitrary order.
        
        The return value is an iterator over all the editions. Each entry is a dictionary.
        (Compare with get_editions.)
        
        This works even for lists with too many seeds as it doesn't try to
        return editions in the order of last-modified.
        """
        rawseeds = self._get_rawseeds()

        def get_edition_keys(seeds):
            d = self._editions_view(seeds, limit=10000, stale="ok")
            return [row['id'] for row in d['rows']]

        keys = set()

        # When there are too many seeds, couchdb-lucene fails because the query URL is too long.
        # Splitting the seeds into groups of 50 to avoid that trouble.
        for seeds in web.group(rawseeds, 50):
            keys.update(get_edition_keys(seeds))

        # Load docs from couchdb now.
        for chunk in web.group(keys, 1000):
            docs = self.get_couchdb_docs(self._get_editions_db(), chunk)
            for doc in docs.values():
                del doc['_id']
                del doc['_rev']
                yield doc
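
All of the snippets in this listing rely on web.group from web.py (web.utils.group), which splits an iterable into fixed-size chunks. A minimal sketch of the behaviour they depend on, assuming web.py is installed (the sample values are illustrative):

import web

# web.group yields successive lists of at most `size` items;
# the final chunk is shorter when the input doesn't divide evenly.
chunks = list(web.group(range(7), 3))
print(chunks)  # [[0, 1, 2], [3, 4, 5], [6]]
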
Example #2
 def get_all_editions(self):
     """Returns all the editions of this list in arbitrary order.
     
     The return value is an iterator over all the editions. Each entry is a dictionary.
     (Compare with get_editions.)
     
     This works even for lists with too many seeds as it doesn't try to
     return editions in the order of last-modified.
     """
     rawseeds = self._get_rawseeds()
     
     def get_edition_keys(seeds):
         d = self._editions_view(seeds, limit=10000, stale="ok")
         return [row['id'] for row in d['rows']]
         
     keys = set()
     
     # When there are too many seeds, couchdb-lucene fails because the query URL is too long.
     # Splitting the seeds into groups of 50 to avoid that trouble.
     for seeds in web.group(rawseeds, 50):
         keys.update(get_edition_keys(seeds))
     
     # Load docs from couchdb now.
     for chunk in web.group(keys, 1000):
         docs = self.get_couchdb_docs(self._get_editions_db(), chunk)
         for doc in docs.values():
             del doc['_id']
             del doc['_rev']
             yield doc
Example #3
def update_docs(db, all_docs, chunk_size=10000, comment=""):
    now = datetime.datetime.utcnow()

    for chunk in web.group(all_docs, chunk_size):
        print chunk
        d = dict((doc['key'], doc) for doc in chunk)
        rows = get_docs(db, d.keys())

        for row in rows:
            row.doc.update(d[row.key])
            row.doc['revision'] = row.revision + 1
            row.doc['latest_revision'] = row.revision + 1
            row.doc['last_modified']['value'] = now.isoformat()

        data = [web.storage(thing_id=row.id, revision=row.revision+1, data=simplejson.dumps(row.doc)) for row in rows]

    author_id = get_thing_id(db, "/user/anand")

    t = db.transaction()
    try:
        tx_id = db.insert("transaction", author_id=author_id, action="bulk_update", ip="127.0.0.1", bot=True, created=now, comment=comment)
        db.multiple_insert("version", [dict(thing_id=d.thing_id, transaction_id=tx_id, revision=d.revision) for d in data], seqname=False)
        db.multiple_insert("data", data, seqname=False)
        db.query("UPDATE thing set latest_revision=latest_revision+1 WHERE key in $d.keys()", vars=locals())
    except:
        t.rollback()
        raise
    else:
        t.commit()
Example #4
def request(path, method, data):
    """Fakes the web request.
    Useful when infobase is not run as a separate process.
    """
    web.ctx.infobase_localmode = True
    web.ctx.infobase_input = data or {}
    web.ctx.infobase_method = method
    
    def get_class(classname):
        if '.' in classname:
            modname, classname = classname.rsplit('.', 1)
            mod = __import__(modname, None, None, ['x'])
            fvars = mod.__dict__
        else:
            fvars = globals()
        return fvars[classname]

    try:
        # hack to make cache work for local infobase connections
        cache.loadhook()

        for pattern, classname in web.group(app.mapping, 2):
            m = web.re_compile('^' + pattern + '$').match(path)
            if m:
                args = m.groups()
                cls = get_class(classname)
                tocall = getattr(cls(), method)
                return tocall(*args)
        raise web.notfound()
    finally:
        # hack to make cache work for local infobase connections
        cache.unloadhook()
Example #5
def _process_key(key):
    mapping = ('/l/', '/languages/', '/a/', '/authors/', '/b/', '/books/',
               '/user/', '/people/')
    for old, new in web.group(mapping, 2):
        if key.startswith(old):
            return new + key[len(old):]
    return key
Example #6
def _process_key(key):
    mapping = ("/l/", "/languages/", "/a/", "/authors/", "/b/", "/books/",
               "/user/", "/people/")
    for old, new in web.group(mapping, 2):
        if key.startswith(old):
            return new + key[len(old):]
    return key
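
In Examples #5 and #6 the mapping tuple is a flat sequence of old/new prefix pairs, so web.group(mapping, 2) walks it two items at a time. A small illustrative check (hypothetical values, assuming web.py is installed):

import web

mapping = ("/b/", "/books/", "/a/", "/authors/")
# Grouping by 2 turns the flat tuple into [old, new] prefix pairs.
pairs = list(web.group(mapping, 2))
assert pairs == [["/b/", "/books/"], ["/a/", "/authors/"]]
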
Example #7
 def preload_redirects(self, keys):
     keys = [k for k in keys if k not in self.redirect_cache]
     if not keys:
         return
     logger.info("preload_redirects %s", keys)
     for chunk in web.group(keys, 100):
         self._preload_redirects0(list(chunk))
Example #8
async def update_keys(keys):
    if not keys:
        return 0

    # FIXME: Some kind of hack introduced to work around DB connectivity issue
    global args
    logger.debug("Args: %s" % str(args))
    update_work.load_configs(args['ol_url'], args['ol_config'], 'default')

    keys = [
        k for k in keys
        if k.count("/") == 2 and k.split("/")[1] in ("books", "authors",
                                                     "works")
    ]

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        await update_work.do_updates(chunk)

        # Caches should not persist between different calls to update_keys!
        update_work.data_provider.clear_cache()

    if count:
        logger.info("updated %d documents", count)

    return count
Example #9
def update_docs(db, all_docs, chunk_size=10000, comment=""):
    now = datetime.datetime.utcnow()
    
    for chunk in web.group(all_docs, chunk_size):
        print chunk
        d = dict((doc['key'], doc) for doc in chunk)
        rows = get_docs(db, d.keys())
        
        for row in rows:
            row.doc.update(d[row.key])
            row.doc['revision'] = row.revision + 1
            row.doc['latest_revision'] = row.revision + 1
            row.doc['last_modified']['value'] = now.isoformat()
            
        data = [web.storage(thing_id=row.id, revision=row.revision+1, data=simplejson.dumps(row.doc)) for row in rows]
            
    author_id = get_thing_id(db, "/user/anand")
            
    t = db.transaction()
    try:
        tx_id = db.insert("transaction", author_id=author_id, action="bulk_update", ip="127.0.0.1", bot=True, created=now, comment=comment)
        db.multiple_insert("version", [dict(thing_id=d.thing_id, transaction_id=tx_id, revision=d.revision) for d in data], seqname=False)
        db.multiple_insert("data", data, seqname=False)
        db.query("UPDATE thing set latest_revision=latest_revision+1 WHERE key in $d.keys()", vars=locals())
    except:
        t.rollback()
        raise
    else:
        t.commit()
Example #11
def parse():
    states = fips2state()
    
    shapeid2district = {}
    for lines in web.group(file(DATA_DIR + '/cd99_110a.dat'), 7):
        num, fipscode, distnum, distname, distid, distdesc, ignore = [x.strip().strip('"') for x in lines]
        if not fipscode.strip(): continue
        shapeid2district[num] = states[fipscode] + '-' + distnum

    out = {}    
    for line in file(DATA_DIR + '/cd99_110.dat'):
        nums = line.strip().split()
        if len(nums) == 3:
            shapeid = nums[0] # other points are the center
            if shapeid in shapeid2district:
                SKIPME = False
                district = shapeid2district[shapeid]
                out.setdefault(district, [])
                out[district].append([])
            else:
                SKIPME = True
        elif len(nums) == 2 and not SKIPME:
            out[district][-1].append((float(nums[0]), float(nums[1])))

    return out
Example #12
    def backupQueueVisitors(self, date, queue_list=None):
        """备份指定队列的患者信息

        Args:
            date: 日期
            queue_list: 指定队列列表
        """

        where = "registDate < \'{0}\'".format(date)
        if queue_list:
            where += " AND queueID IN {0}".format(
                str(web.db.sqlquote(queue_list)))
        # expired_visitor_data = self.db.select("visitor_source_data",
        #                                          where=where).list()
        sql = "SELECT vs.*, vl.status AS localStatus, vl.workStartTime, " \
              "vl.workEndTime FROM visitor_source_data vs INNER JOIN " \
              "(SELECT * FROM visitor_local_data WHERE {0}) vl " \
              "ON vs.id = vl.id".format(where)
        expired_visitor_data = self.db.query(sql).list()
        self.db.printing = False
        if expired_visitor_data:
            visitor_group = web.group(expired_visitor_data, 1000)
            for item in visitor_group:
                self.db.multiple_insert("visitor_backup_data", item)
        self.db.printing = True
        self.db.delete("visitor_source_data", where)
        self.db.delete("visitor_local_data", where)
Example #13
    def _parse_solr_result(self, result, doc_wrapper, facet_wrapper):
        response = result['response']

        doc_wrapper = doc_wrapper or web.storage
        facet_wrapper = facet_wrapper or (
            lambda name, value, count: web.storage(locals()))

        d = web.storage()
        d.num_found = response['numFound']
        d.docs = [doc_wrapper(doc) for doc in response['docs']]

        if 'facet_counts' in result:
            d.facets = {}
            for k, v in result['facet_counts']['facet_fields'].items():
                d.facets[k] = [
                    facet_wrapper(k, value, count)
                    for value, count in web.group(v, 2)
                ]

        if 'highlighting' in result:
            d.highlighting = result['highlighting']

        if 'spellcheck' in result:
            d.spellcheck = result['spellcheck']

        return d
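
Several of the Solr examples (#13, #15, #16 and #46, for instance) pair entries with web.group(v, 2) because Solr's JSON response returns each facet field as a flat list alternating value and count. A hedged sketch of that decoding step (the response data below is invented):

import web

# Solr facet_fields arrive flattened: [value1, count1, value2, count2, ...]
facet_field = ["eng", 120, "fre", 7, "ger", 3]
pairs = [(value, count) for value, count in web.group(facet_field, 2)]
# pairs == [('eng', 120), ('fre', 7), ('ger', 3)]
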
Example #14
def add_urls(module):
    global urls
    module_urls = []
    for path, classname in web.group(module.urls, 2):
        classname = module.__name__ + "." + classname
        module_urls.extend([path, classname])
    urls = urls + tuple(module_urls)
Example #15
    def _get_all_facet_counts(self):
        if not self._facet_counts:
            facets = [
                "library_s", "region_s", "country_s", "ia_collections_id",
                "sponsor_s", "contributor_s", "book_key_s", "author_keys_id",
                "resource_type_s", "subject_facet", "place_facet",
                "person_facet", "time_facet"
            ]

            params = {
                "wt": "json",
                "fq": "type:stats",
                "q": "*:*",
                "rows": 0,
                "facet": "on",
                "facet.mincount": 1,
                "facet.field": facets,
                "facet.limit": 20
            }
            response = self.solr_select(params)
            self._total_loans = response['response']['numFound']
            self._facet_counts = dict(
                (name, web.group(counts, 2)) for name, counts in
                response['facet_counts']['facet_fields'].items())
        return self._facet_counts
Example #16
 def get_loan_durations(self):
     params = {
         "wt": "json",
         "q": "*:*",
         "rows": 0,
         "facet": "on",
         "facet.field": ['duration_hours_i']
     }
     response = self.solr_select(params)
     counts = [[int(hr), count] for hr, count in web.group(
         response['facet_counts']['facet_fields']['duration_hours_i'], 2)]
     one_hour = sum(count for hr, count in counts if hr == 0)
     one_day = sum(count for hr, count in counts if 1 <= hr < 24)
     one_week = sum(count for hr, count in counts if 24 <= hr < 24 * 7)
     two_week = sum(count for hr, count in counts if 24 * 7 <= hr < 24 * 14)
     expired = sum(count for hr, count in counts if 24 * 14 <= hr)
     return [{
         "label": "Less than one hour",
         "data": one_hour
     }, {
         "label": "Less than one day",
         "data": one_day
     }, {
         "label": "Less than one week",
         "data": one_week
     }, {
         "label": "More than a week",
         "data": two_week
     }, {
         "label": "Loan expired",
         "data": expired
     }]
Example #17
def parse():
    shapeid2district = {}
    for lines in web.group(file(DATA_DIR + '/cd99_110a.dat'), 7):
        num, fipscode, distnum, distname, distid, distdesc, ignore = \
          [x.strip().strip('"') for x in lines]
        if not fipscode.strip(): continue
        shapeid2district[num] = (fipscode, distnum)
    
    out = {}
    for line in file(DATA_DIR + '/cd99_110.dat'):
        nums = line.strip().split()
        if len(nums) == 3:
            shapeid = nums[0] # other points are the center
            if shapeid in shapeid2district:
                SKIPME = False
                district = shapeid2district[shapeid]
                out.setdefault(district, [])
                out[district].append([])
            else:
                SKIPME = True
        elif len(nums) == 2 and not SKIPME:
            out[district][-1].append((float(nums[0]), float(nums[1])))
    
    for (fipscode, distnum), shapes in out.iteritems():
        yield {
          '_type': 'district', 
          'state_fipscode': fipscode, 
          'district': distnum,
          'shapes': shapes
        }
Example #18
def update_keys(keys):
    if not keys:
        return 0

    # FIXME: Some kind of hack introduced to work around DB connectivity issue
    global args
    logger.debug("Args: %s" % str(args))
    update_work.load_configs(args.ol_url, args.config, 'default')

    keys = [
        k for k in keys
        if k.count("/") == 2 and k.split("/")[1] in ("books", "authors",
                                                     "works")
    ]

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.do_updates(chunk)

    if count:
        logger.info("updated %d documents", count)

    return count
Example #19
 def load_identifiers(self, identifiers):
     for chunk in web.group(identifiers, 1000):
         chunk = list(set(chunk))
         result = self.db.query("SELECT identifier FROM bookloader WHERE identifier IN $chunk", vars=locals())
         present = set(row.identifier for row in result)
         data = [dict(identifier=id) for id in chunk if id not in present]
         if data:
             self.db.multiple_insert("bookloader", data)
Example #20
 def preload_documents0(self, keys):
     keys = [k for k in keys if k not in self.cache]
     if not keys:
         return
     logger.info("preload_documents0 %s", keys)
     for chunk in web.group(keys, 100):
         docs = web.ctx.site.get_many(list(chunk))
         for doc in docs:
             self.cache[doc['key']] = doc.dict()
Example #22
    def _process_key(self, key):
        mapping = ("/l/", "/languages/", "/a/", "/authors/", "/b/", "/books/",
                   "/user/", "/people/")

        if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
            for old, new in web.group(mapping, 2):
                if key.startswith(old):
                    return new + key[len(old):]
        return key
Example #23
 def list_ranges(self):
     r = db.select(self.sql_table,
         vars = dict(start_id=self.start_id, end_id=self.end_id, step=self.step),
         what = 'id, %s as value' % self.sql_field,
         where = 'id >= $start_id and id <= $end_id and\
                 ((id-$start_id) % $step = 0 or (id-$start_id+1) % $step = 0 or\
                  id = $end_id)',
         limit = self.max_size * 2)
     return web.group(r, 2)
Example #24
    def get_editions(self, limit=50, offset=0, _raw=False):
        """Returns the editions objects belonged to this list ordered by last_modified. 
        
        When _raw=True, the edtion dicts are returned instead of edtion objects.
        """
        # show at max 10 pages
        MAX_OFFSET = min(self.edition_count, 50 * 10)
        
        if not self.seeds or offset > MAX_OFFSET:
            return {
                "count": 0,
                "offset": offset,
                "limit": limit,
                "editions": []
            }
        
        # We don't want to give more than 500 editions for performance reasons.
        if offset + limit > MAX_OFFSET:
            limit = MAX_OFFSET - offset
            
        key_data = []
        rawseeds = self._get_rawseeds()
        for seeds in web.group(rawseeds, 50):
            key_data += self._get_edition_keys(seeds, limit=MAX_OFFSET)
        keys = [key for key, last_modified in sorted(key_data, key=lambda x: x[1], reverse=True)]
        keys = keys[offset:limit]
        
        # Get the documents from couchdb 
        docs = self.get_couchdb_docs(self._get_editions_db(), keys)

        def get_doc(key):
            doc = docs[key]
            del doc['_id']
            del doc['_rev']
            if not _raw:
                data = self._site._process_dict(common.parse_query(doc))
                doc = client.create_thing(self._site, doc['key'], data)
            return doc
        
        d = {
            "count": self.edition_count,
            "offset": offset,
            "limit": limit,
            "editions": [get_doc(key) for key in keys]
        }
        
        if offset + limit < MAX_OFFSET:
            d['next_params'] = {
                'offset': offset+limit
            }
            
        if offset > 0:
            d['prev_params'] = {
                'offset': max(0, offset-limit)
            }
        return d
Example #25
    def process_changesets(self, changesets, update_seeds=False):
        """Updates the lists databases for given changesets.
        
        Seeds are updated in the seeds db if update_seeds is True, otherwise they are marked for later update.
        """
        logger.info("BEGIN process_changesets")
        ctx = UpdaterContext()
        for chunk in web.group(changesets, 50):
            chunk = list(chunk)
            logger.info("processing changesets %s", [c['id'] for c in chunk])
            
            works = [work for changeset in chunk 
                          for work in self._get_works(changeset)]

            editions = [e for changeset in chunk
                        for e in self._get_editions(changeset)]
                        
            logger.info("found %d works and %d editions", len(works), len(editions))
                        
            keys = [w['key'] for w in works] + [e['works'][0]['key'] for e in editions if e.get('works')] 
            keys = list(set(keys))
            self.works_db.db.preload(keys)
            
            for work in works:
                work = self.works_db.update_work(ctx, work)

            # works have been modified. Commit to update the views.
            logger.info("BEGIN commit works_db")
            self.works_db.db.commit()
            logger.info("END commit works_db")
            
            self.works_db.update_editions(ctx, editions)
            self.editions_db.update_editions(ctx.editions.values())
            ctx.editions.clear()
            
            t = datetime.datetime.utcnow().isoformat()
            if ctx.seeds:
                logger.info("BEGIN commit works_db")
                self.works_db.db.commit()
                logger.info("END commit works_db")
                
                logger.info("BEGIN mark %d seeds for update" % len(ctx.seeds))
                if update_seeds:
                    self.seeds_db.update_seeds(ctx.seeds.keys())
                else:
                    self.seeds_db.mark_seeds_for_update(ctx.seeds.keys())
                logger.info("END mark %d seeds for update" % len(ctx.seeds))
                ctx.seeds.clear()
            
            # Reset so that the cache never grows without limit.
            if len(self.works_db.db.docs) > 1000:
                self.works_db.db.reset()
                
        self.works_db.db.commit()
        self.works_db.db.reset()
        logger.info("END process_changesets")
Example #27
 def get_many(self, keys):
     """Get multiple documents in a single request as a dictionary."""
     if len(keys) > 100:
         # Process in batches to avoid crossing the URL length limit.
         d = {}
         for chunk in web.group(keys, 100):
             d.update(self._get_many(chunk))
         return d
     else:
         return self._get_many(keys)
Example #28
    def get_editions(self, limit=50, offset=0, _raw=False):
        """Returns the editions objects belonged to this list ordered by last_modified. 
        
        When _raw=True, the edtion dicts are returned instead of edtion objects.
        """
        # show at max 10 pages
        MAX_OFFSET = min(self.edition_count, 50 * 10)

        if not self.seeds or offset > MAX_OFFSET:
            return {
                "count": 0,
                "offset": offset,
                "limit": limit,
                "editions": []
            }

        # We don't want to give more than 500 editions for performance reasons.
        if offset + limit > MAX_OFFSET:
            limit = MAX_OFFSET - offset

        key_data = []
        rawseeds = self._get_rawseeds()
        for seeds in web.group(rawseeds, 50):
            key_data += self._get_edition_keys(seeds, limit=MAX_OFFSET)
        keys = [
            key for key, last_modified in sorted(
                key_data, key=lambda x: x[1], reverse=True)
        ]
        keys = keys[offset:limit]

        # Get the documents from couchdb
        docs = self.get_couchdb_docs(self._get_editions_db(), keys)

        def get_doc(key):
            doc = docs[key]
            del doc['_id']
            del doc['_rev']
            if not _raw:
                data = self._site._process_dict(common.parse_query(doc))
                doc = client.create_thing(self._site, doc['key'], data)
            return doc

        d = {
            "count": self.edition_count,
            "offset": offset,
            "limit": limit,
            "editions": [get_doc(key) for key in keys]
        }

        if offset + limit < MAX_OFFSET:
            d['next_params'] = {'offset': offset + limit}

        if offset > 0:
            d['prev_params'] = {'offset': max(0, offset - limit)}
        return d
Example #29
    def update_seeds(self, seeds, chunksize=50):
        big_seeds = self.get_big_seeds()
        seeds2 = sorted(seed for seed in seeds if seed not in big_seeds)

        logging.info("update_seeds %s", len(seeds2))
        logging.info("ignored %d big seeds", len(seeds) - len(seeds2))

        for i, chunk in enumerate(web.group(seeds2, chunksize)):
            chunk = list(chunk)
            logging.info("update_seeds %d %d", i, len(chunk))
            self._update_seeds(chunk)
Example #30
def find_not_indexed(keys, chunk_size=1000):
	for chunk in web.group(keys, chunk_size):
		chunk = list(chunk)
		q=" OR ".join("key:" + k for k in chunk)
		params = urllib.urlencode({"q": q, "rows": chunk_size, "wt": "json", "fl": "key"})
		url = solr_base_url + "/select"
		d = jsonget(url, params)
		found = set(doc['key'] for doc in d['response']['docs'])
		for k in chunk:
			if k not in found:
				yield k
Example #31
 def get_many(self, keys):
     """Get multiple documents in a single request as a dictionary.
     """
     if len(keys) > 500:
         # get in chunks of 500 to avoid crossing the URL length limit.
         d = {}
         for chunk in web.group(keys, 500):
             d.update(self._get_many(chunk))
         return d
     else:
         return self._get_many(keys)
Example #32
    def _process_key(self, key):
        # Some data in the database still has /b/ instead of /books/.
        # The transformation is still done in software.
        mapping = ("/l/", "/languages/", "/a/", "/authors/", "/b/", "/books/",
                   "/user/", "/people/")

        if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
            for old, new in web.group(mapping, 2):
                if key.startswith(old):
                    return new + key[len(old):]
        return key
Example #33
def _process_key(key):
    mapping = (
        "/l/", "/languages/",
        "/a/", "/authors/",
        "/b/", "/books/",
        "/user/", "/people/"
    )
    for old, new in web.group(mapping, 2):
        if key.startswith(old):
            return new + key[len(old):]
    return key
Example #34
    def update_seeds(self, seeds, chunksize=50):
        big_seeds = self.get_big_seeds()
        seeds2 = sorted(seed for seed in seeds if seed not in big_seeds)
        
        logger.info("update_seeds %s", len(seeds2))
        logger.info("ignored %d big seeds", len(seeds)-len(seeds2))

        for i, chunk in enumerate(web.group(seeds2, chunksize)):
            chunk = list(chunk)
            logger.info("update_seeds %d %d", i, len(chunk))
            self._update_seeds(chunk)
Example #35
 def _get_docs(self, keys):
     """Returns docs for the specified keys as a dictionary.
     """
     docs = {}
     
     for keys2 in web.group(keys, 500):
         json = self.infobase_conn.request(
             sitename="openlibrary.org",
             path="/get_many",
             data={"keys": simplejson.dumps(keys2)})
         docs2 = simplejson.loads(json)
         docs.update(docs2)
     return docs    
Example #36
    def _process_key(self, key):
        mapping = (
            "/l/", "/languages/",
            "/a/", "/authors/",
            "/b/", "/books/",
            "/user/", "/people/"
        )

        if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
            for old, new in web.group(mapping, 2):
                if key.startswith(old):
                    return new + key[len(old):]
        return key
Example #37
def write_sitemaps(data, outdir, prefix):
    timestamp = datetime.datetime.utcnow().isoformat() + 'Z'

    # maximum permitted entries in one sitemap is 50K.
    for i, rows in enumerate(web.group(data, 50000)):
        filename = "sitemap_%s_%04d.xml.gz" % (prefix, i)
        print("generating", filename, file=sys.stderr)

        sitemap = web.safestr(t_sitemap(rows))

        path = os.path.join(outdir, filename)
        gzwrite(path, sitemap)
        yield filename, timestamp
Example #38
def write_sitemaps(data, outdir, prefix):
    timestamp = datetime.datetime.utcnow().isoformat() + 'Z'

    # maximum permitted entries in one sitemap is 50K.
    for i, rows in enumerate(web.group(data, 50000)):
        filename = "sitemap_%s_%04d.xml.gz" % (prefix, i)
        print >> sys.stderr, "generating", filename

        sitemap = web.safestr(t_sitemap(rows))

        path = os.path.join(outdir, filename)
        gzwrite(path, sitemap)
        yield filename, timestamp
Example #39
    def update_seeds(self, seeds, chunksize=50):
        # XXX-Anand: temporarily disable updates as the node hosting seeds_db is low on disk
        return
        big_seeds = self.get_big_seeds()
        seeds2 = sorted(seed for seed in seeds if seed not in big_seeds)
        
        logger.info("update_seeds %s", len(seeds2))
        logger.info("ignored %d big seeds", len(seeds)-len(seeds2))

        for i, chunk in enumerate(web.group(seeds2, chunksize)):
            chunk = list(chunk)
            logger.info("update_seeds %d %d", i, len(chunk))
            self._update_seeds(chunk)
Example #40
def parse(filename, chunk_size=10000):
    t0 = time.time()
    i = 0
    for chunk in web.group(open(filename), chunk_size):
        print i, time.time() - t0
        d = {}
        for line in chunk:
            key, type, revision, json = line.strip().split("\t")
            d["%s@@%s" % (key, revision)] = json

        i += len(d)
        yield d
    print i, time.time() - t0
Example #41
def update_keys(keys):
    keys = (k for k in keys if k.count("/") == 2 and k.split("/")[1] in ["books", "authors", "works"])

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.update_keys(chunk, commit=False)

    if count:
        logger.info("updated %d documents", count)

    return count
Example #42
 def load_works(self, filename, author="/user/ImportBot"):
     self.author = author
     
     root = os.path.dirname(filename)
     editions_file = open(os.path.join(root, 'editions.txt'), 'a')
     
     try:
         for i, lines in enumerate(web.group(open(filename), 1000)):
             t0 = time.time()
             self.load_works_chunk(lines, editions_file)
             t1 = time.time()
             log(i, "%.3f sec" % (t1-t0))
     finally:
         editions_file.close()
Example #43
    def load_works(self, filename, author="/user/ImportBot"):
        self.author = author

        root = os.path.dirname(filename)
        editions_file = open(os.path.join(root, 'editions.txt'), 'a')

        try:
            for i, lines in enumerate(web.group(open(filename), 1000)):
                t0 = time.time()
                self.load_works_chunk(lines, editions_file)
                t1 = time.time()
                log(i, "%.3f sec" % (t1 - t0))
        finally:
            editions_file.close()
Example #44
    def process_changesets(self, changesets):
        ctx = UpdaterContext()
        for chunk in web.group(changesets, 50):
            chunk = list(chunk)

            works = [
                work for changeset in chunk
                for work in self._get_works(changeset)
            ]

            editions = [
                e for changeset in chunk for e in self._get_editions(changeset)
            ]

            keys = [w['key'] for w in works] + [
                e['works'][0]['key'] for e in editions if e.get('works')
            ]
            keys = list(set(keys))
            self.works_db.db.preload(keys)

            for work in works:
                work = self.works_db.update_work(ctx, work)

            # works have been modified. Commit to update the views.
            logging.info("BEGIN commit works_db")
            self.works_db.db.commit()
            logging.info("END commit works_db")

            self.works_db.update_editions(ctx, editions)
            self.editions_db.update_editions(ctx.editions.values())
            ctx.editions.clear()

            t = datetime.datetime.utcnow().isoformat()
            if ctx.seeds:
                logging.info("BEGIN commit works_db")
                self.works_db.db.commit()
                logging.info("END commit works_db")

                logging.info("BEGIN mark %d seeds for update" % len(ctx.seeds))
                self.seeds_db.mark_seeds_for_update(ctx.seeds.keys())
                logging.info("END mark %d seeds for update" % len(ctx.seeds))
                ctx.seeds.clear()

            # Reset so that the cache never grows without limit.
            if len(self.works_db.db.docs) > 1000:
                self.works_db.db.reset()

        self.works_db.db.commit()
        self.works_db.db.reset()
Example #45
def get_handler(methodlist):
    dispatcher = SimpleXMLRPCDispatcher(False, None)
    for method in web.group(methodlist, 2):
        dispatcher.register_function(method[1], method[0])
    class rpc:
        def GET(self):
            web.header('Content-Type', 'text/html')
            print get_doc(dispatcher)

        def POST(self):
            response = dispatcher._marshaled_dispatch(web.webapi.data())
            web.header('Content-Type', 'text/xml')
            web.header('Content-length', str(len(response)))
            print response
    return rpc
Example #46
    def _run_solr_facet_query(self, facet_fields, facet_limit=None):
        params = {
            "wt": "json",
            "fq": "type:stats",
            "q": "*:*",
            "rows": 0,
            "facet": "on",
            "facet.mincount": 1,
            "facet.field": facet_fields
        }
        if facet_limit:
            params["facet.limit"] = facet_limit

        response = self.solr_select(params)
        return dict((name, list(web.group(counts, 2))) for name, counts in response['facet_counts']['facet_fields'].items())
Example #47
    def _run_solr_facet_query(self, facet_fields, facet_limit=None):
        params = {
            "wt": "json",
            "fq": "type:stats", 
            "q": "*:*", 
            "rows": 0,
            "facet": "on",
            "facet.mincount": 1,
            "facet.field": facet_fields
        }
        if facet_limit:
            params["facet.limit"] = facet_limit

        response = self.solr_select(params)
        return dict((name, list(web.group(counts, 2))) for name, counts in response['facet_counts']['facet_fields'].items())
Example #48
File: admin.py Project: keizo/kulu
 def GET(self):
     page = self.page
     content = ''
     admin_urls = []
     for key in mod:
         if hasattr(mod[key], 'urls'):
             p = web.group(mod[key].urls,2)
             for pair in p:
                 if pair[0].startswith('/admin/'):
                     admin_urls.append(pair)
                     content += '<p>'+key+' <a href="'+pair[0]+'">'+pair[1]+'</a> '
                     content += str(getattr(mod[key],pair[1]).__doc__)
                     content += '</p>'
                 
     web.render('generic.html')
Example #49
    def _process_key(self, key):
        # Some data in the database still has /b/ instead of /books/.
        # The transformation is still done in software.
        mapping = (
            "/l/", "/languages/",
            "/a/", "/authors/",
            "/b/", "/books/",
            "/user/", "/people/"
        )

        if "/" in key and key.split("/")[1] in ['a', 'b', 'l', 'user']:
            for old, new in web.group(mapping, 2):
                if key.startswith(old):
                    return new + key[len(old):]
        return key
Example #50
def generate_html_index(index_file, outdir):
    data = parse_index_file(index_file)
    data = ((d[0], d[1]) for d in data)

    for i, chunk in enumerate(web.group(data, 1000)):
        back = ".."
        index = t_html_layout(t_html_sitemap(back, chunk))

        path = outdir + "/%02d/%05d.html" % (i / 1000, i)
        write(path, web.safestr(index))

    for f in os.listdir(outdir):
        path = os.path.join(outdir, f)
        if os.path.isdir(path):
            dirindex(path)
    dirindex(outdir, back=".")
Example #51
def generate_html_index(index_file, outdir):
    data = parse_index_file(index_file)
    data = ((d[0], d[1]) for d in data)

    for i, chunk in enumerate(web.group(data, 1000)):
        back = ".."
        index = t_html_layout(t_html_sitemap(back, chunk))

        path = outdir + "/%02d/%05d.html" % (i/1000, i)
        write(path, web.safestr(index))

    for f in os.listdir(outdir):
        path = os.path.join(outdir, f)
        if os.path.isdir(path):
            dirindex(path)
    dirindex(outdir, back=".")
Example #52
def update_keys(keys):
    global args
    keys = (k for k in keys if k.count("/") == 2 and k.split("/")[1] in ["books", "authors", "works"])
    update_work.clear_monkeypatch_cache(max_size=10000)
    print str(args)
    update_work.load_configs(args.ol_url, args.config, 'default')

    count = 0
    for chunk in web.group(keys, 100):
        chunk = list(chunk)
        count += len(chunk)
        update_work.do_updates(chunk)

    if count:
        logger.info("updated %d documents", count)

    return count
Example #53
    def get_loans_per_day(self, resource_type="total"):
        params = {
            "wt": "json",
            "fq": ["type:stats"],
            "q": "*:*", 
            "rows": 0,
            "facet": "on",
            "facet.mincount": 1,
            "facet.field": ['start_day_s']
        }
        if resource_type != 'total':
            params['fq'].append("resource_type_s:" + resource_type)

        response = self.solr_select(params)
        counts0 = response['facet_counts']['facet_fields']['start_day_s']
        day_facet = web.group(counts0, 2)
        return [[self.date2timestamp(*self.parse_date(day))*1000, count] for day, count in day_facet]
Example #54
def add_oclc_ids(filename):
    """Adds OCLC Ids to OL records.
    """
    for mapping in web.group(read_mapping(filename), 1000):
        mapping = dict(mapping)

        docs = get_docs(mapping.keys())

        # ignore docs that already have the oclc_number that we are about to set
        docs = [doc for doc in docs
                if not has_identifier(doc, "oclc_numbers", mapping[doc['key']])]

        for doc in docs:
            add_identifier(doc, "oclc_numbers", mapping[doc['key']])

        if docs:
            ol.save_many(docs, comment="Added OCLC numbers.")