class MongoSessionFactory(object):
    """Load and persist ``Session`` objects in a MongoDB collection.

    Each session is stored as one document holding the session ``data`` and a
    ``last_access`` timestamp that is refreshed on every save.
    """

    def __init__(self, database, collection='sessions', ttl=None, **kwargs):
        """Connect to MongoDB and rebuild the collection's indexes.

        Args:
            database: Name of the database that holds the sessions.
            collection: Name of the collection inside ``database``.
            ttl: Optional lifetime in seconds. When given, an index on
                ``last_access`` is created with ``expireAfterSeconds=ttl``.
            **kwargs: Forwarded unchanged to ``MongoClient``.
        """
        self.collection = MongoClient(**kwargs)[database][collection]
        # Existing indexes are dropped on every start so that a changed (or
        # removed) ttl replaces whatever index an earlier run created.
        self.collection.drop_indexes()
        if ttl is not None:
            # create_index replaces ensure_index, which PyMongo deprecated
            # in 3.0 and removed in 4.0.
            self.collection.create_index('last_access', expireAfterSeconds=ttl)

    def load(self, id=None):
        """Return the stored session for ``id``, or a fresh ``Session``.

        Loading is best effort: a malformed id, an unknown id, a document
        without ``data`` or a failing database call all yield a new session
        instead of raising.

        Args:
            id: String form of the session document's ``_id``, or ``None``.

        Returns:
            A ``Session``; ``data`` and ``id`` are filled in only when a
            matching document was found.
        """
        session = Session()
        if id is None:
            # Nothing to look up; avoids querying for a random new ObjectId.
            return session
        try:
            doc = self.collection.find_one({'_id': ObjectId(id)})
            if doc is not None:
                session.data = doc['data']
                session.id = id
        except Exception:
            # Deliberate fallback to a fresh session. Unlike a bare
            # ``except``, KeyboardInterrupt and SystemExit still propagate.
            pass
        return session

    def save(self, session):
        """Insert or replace the document for ``session``.

        Args:
            session: Object exposing ``data`` and ``id``. ``id`` is ``None``
                for a session that has not been stored yet.

        Returns:
            The document's ``_id`` as a string.
        """
        doc = {
            'data': session.data,
            'last_access': datetime.utcnow(),
        }
        # insert_one / replace_one replace Collection.save, which PyMongo
        # deprecated in 3.0 and removed in 4.0.
        if session.id is None:
            # insert_one adds the generated '_id' to ``doc`` in place.
            self.collection.insert_one(doc)
        else:
            doc['_id'] = ObjectId(session.id)
            self.collection.replace_one({'_id': doc['_id']}, doc, upsert=True)
        return str(doc['_id'])
class cache_session(Session):
    """``Session`` subclass that caches response bodies in MongoDB.

    ``request`` returns the response *text* (not a response object). The text
    is stored under a SHA-1 key derived from the request description and is
    served from the collection on later identical requests.
    """

    def __init__(self, db_uri, dbname='tmp', colname='cache', expire_time=None, disabled=False):
        """Open the cache collection and configure expiry.

        Args:
            db_uri: Connection URI passed to ``MongoClient``.
            dbname: Database holding the cache collection.
            colname: Name of the cache collection.
            expire_time: Optional lifetime in seconds for cached entries;
                applied as ``expireAfterSeconds`` on a ``cache_time`` index.
            disabled: When true, cached entries are never served (responses
                are still fetched and stored).
        """
        self.col = MongoClient(db_uri)[dbname][colname]
        self.disabled = disabled
        if expire_time:
            # If an index named 'cache_time_-1' already exists, drop the
            # indexes first so it is recreated with the requested expiry.
            if self.col.index_information().get('cache_time_-1'):
                self.col.drop_indexes()
            self.col.create_index([("cache_time", DESCENDING)],
                                  expireAfterSeconds=expire_time)
        super(cache_session, self).__init__()

    def request(self, method, url, params=None, data=None, headers=None,
                cookies=None, files=None, auth=None, timeout=None,
                allow_redirects=True, proxies=None, hooks=None, stream=None,
                verify=None, cert=None, json=None):
        """Return the response text for a request, using the cache if possible.

        The arguments have the same meaning as in ``Session.request`` and are
        forwarded to it unchanged on a cache miss.

        Returns:
            The response body text, either from the cache or freshly fetched.
        """
        key_fields = {
            'method': method.upper(),
            'url': url,
            'headers': headers,
            'files': files,
            'data': data or {},
            'json': json,
            'params': params or {},
            'auth': auth,
            'cookies': cookies,
            'hooks': hooks,
        }
        raw_key = '&'.join("%s=%s" % (k, v) for k, v in key_fields.items())
        # sha1 needs bytes: encode text keys explicitly so unicode URLs (and
        # Python 3 strings) hash instead of raising.
        if not isinstance(raw_key, bytes):
            raw_key = raw_key.encode('utf-8')
        key = sha1(raw_key).hexdigest()

        # A disabled cache never serves stored entries, so skip the lookup.
        cached_one = None if self.disabled else self.col.find_one({'key': key})
        if cached_one:
            print('cached')
            return cached_one['html']

        # Forward the caller's arguments; passing literal None/defaults here
        # would silently drop params, data, headers, auth, timeout, etc.
        response = super(cache_session, self).request(
            method, url,
            params=params,
            data=data,
            headers=headers,
            cookies=cookies,
            files=files,
            auth=auth,
            timeout=timeout,
            allow_redirects=allow_redirects,
            proxies=proxies,
            hooks=hooks,
            stream=stream,
            verify=verify,
            cert=cert,
            json=json,
        )
        html = response.text
        self.col.insert_one({'key': key, 'html': html, 'cache_time': datetime.utcnow()})
        return html
return json.load(fp) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("inputPath", type=str, help="path to a wiktionary dump") # parser.add_argument("outputPath", type=str, help="path to the outputed json file") args = parser.parse_args() inputPath = args.inputPath # outputPath = args.outputPath # extractAndDump(inputPath, outputPath) collection = MongoClient()['wiktionary'][os.path.basename(inputPath).split('.')[0]] collection.drop() collection.drop_indexes() collection.create_indexes([ IndexModel([('language', 1), ('synset', 1)]) ]) data = load_json(inputPath) for i, (lang, v) in enumerate(data.items()): for synset, doc in tqdm(v.items(), "Importing {} ({}/{})".format(lang, i, len(data))): collection.insert_one({ 'language': lang, 'synset': synset, **doc })
class cache_session(Session):
    """``Session`` subclass that caches response bodies in MongoDB.

    ``request`` returns the response *text* (not a response object). The text
    is stored under a SHA-1 key and served from the collection on later
    requests that produce the same key.
    """

    def __init__(self, db_uri, dbname='tmp', colname='cache', expire_time=None,
                 disabled=False, url_only=True):
        """Open the cache collection and configure expiry.

        Args:
            db_uri: Connection URI passed to ``MongoClient``.
            dbname: Database holding the cache collection.
            colname: Name of the cache collection.
            expire_time: Optional lifetime in seconds for cached entries;
                applied as ``expireAfterSeconds`` on a ``cache_time`` index.
            disabled: When true, cached entries are never served (responses
                are still fetched and stored).
            url_only: When true, the cache key is derived from the URL alone;
                otherwise method, headers, body, params, auth, cookies and
                hooks are part of the key as well.
        """
        self.col = MongoClient(db_uri)[dbname][colname]
        self.disabled = disabled
        self.url_only = url_only
        if expire_time:
            # If an index named 'cache_time_-1' already exists, drop the
            # indexes first so it is recreated with the requested expiry.
            if self.col.index_information().get('cache_time_-1'):
                self.col.drop_indexes()
            self.col.create_index([("cache_time", DESCENDING)],
                                  expireAfterSeconds=expire_time)
        super(cache_session, self).__init__()

    def request(self, method, url, params=None, data=None, headers=None,
                cookies=None, files=None, auth=None, timeout=None,
                allow_redirects=True, proxies=None, hooks=None, stream=None,
                verify=None, cert=None, json=None):
        """Return the response text for a request, using the cache if possible.

        The arguments have the same meaning as in ``Session.request`` and are
        forwarded to it unchanged on a cache miss.

        Returns:
            The response body text, either from the cache or freshly fetched.
        """
        if self.url_only:
            key_fields = {
                'url': url,
            }
        else:
            key_fields = {
                'method': method.upper(),
                'url': url,
                'headers': headers,
                'files': files,
                'data': data or {},
                'json': json,
                'params': params or {},
                'auth': auth,
                'cookies': cookies,
                'hooks': hooks,
            }
        raw_key = '&'.join("%s=%s" % (k, v) for k, v in key_fields.items())
        # sha1 needs bytes: encode text keys explicitly so unicode URLs (and
        # Python 3 strings) hash instead of raising.
        if not isinstance(raw_key, bytes):
            raw_key = raw_key.encode('utf-8')
        key = sha1(raw_key).hexdigest()

        # A disabled cache never serves stored entries, so skip the lookup.
        cached_one = None if self.disabled else self.col.find_one({'key': key})
        if cached_one:
            print('cached')
            return cached_one['html']

        # Forward the caller's arguments; passing literal None/defaults here
        # would silently drop params, data, headers, auth, timeout, etc.
        response = super(cache_session, self).request(
            method, url,
            params=params,
            data=data,
            headers=headers,
            cookies=cookies,
            files=files,
            auth=auth,
            timeout=timeout,
            allow_redirects=allow_redirects,
            proxies=proxies,
            hooks=hooks,
            stream=stream,
            verify=verify,
            cert=cert,
            json=json,
        )
        html = response.text
        self.col.insert_one({'key': key, 'html': html, 'cache_time': datetime.utcnow()})
        return html