def main(opts, config):
    """Rebuild the 'fulltext' collection using a pool of Copier processes.

    The target collection is dropped first. Every bibcode yielded by
    iterating ``models.FulltextLink`` (capped at ``opts.limit`` when that is
    set) is put on a shared queue, followed by one ``None`` sentinel per
    Copier. The function then waits for all Copier processes to exit before
    reporting completion.

    :param opts: parsed options; reads ``limit`` and ``threads`` here and is
        passed through to each Copier.
    :param config: configuration handed to ``utils.get_session`` and to each
        Copier.
    """
    log = logging.getLogger()
    session = utils.get_session(config)
    to_collection = session.get_collection('fulltext')
    to_collection.drop()

    bibiter = session.iterate(models.FulltextLink)
    bibiter = itertools.imap(lambda x: x.bibcode, bibiter)
    if opts.limit:
        bibiter = itertools.islice(bibiter, opts.limit)

    tasks = Queue()

    # start up our builder threads
    log.debug("Creating %d Copier processes" % opts.threads)
    procs = [Copier(tasks, opts, config) for i in xrange(opts.threads)]
    for p in procs:
        p.start()

    for bib in bibiter:
        tasks.put(bib)

    # add some poison pills to the end of the queue
    log.debug("poisoning our task threads")
    for i in xrange(opts.threads):
        tasks.put(None)

    # Wait for every worker to exit; without this the completion message
    # below would be logged while the queue may still be being processed.
    # NOTE(review): assumes Copier exits when it reads the None sentinel.
    log.debug("Joining the task threads")
    for p in procs:
        p.join()

    log.info("All work complete")
def init_app(self, app, config=None):
    """Initialize the session extension.

    The configuration is taken from the ``config`` argument, else from
    ``self.config``, else from ``app.config``. Missing ADSDATA_MONGO_*
    keys are filled with defaults in place, a session is created from the
    result, stored under ``app.extensions['adsdata']`` and returned.

    Raises ValueError when ``config`` is neither ``None`` nor a dict.
    """
    if config is not None and not isinstance(config, dict):
        raise ValueError("`config` must be an instance of dict or None")

    # Fallback chain: explicit argument -> extension-level -> application.
    if config is None:
        config = self.config
    if config is None:
        config = app.config

    defaults = (
        ("ADSDATA_MONGO_DATABASE", 'adsdata'),
        ("ADSDATA_MONGO_HOST", 'localhost'),
        ("ADSDATA_MONGO_PORT", 27017),
        ("ADSDATA_MONGO_SAFE", True),
        ("ADSDATA_MONGO_USER", "adsdata"),
        ("ADSDATA_MONGO_PASSWORD", None),
    )
    for key, fallback in defaults:
        config.setdefault(key, fallback)

    session = get_session(config)

    if not hasattr(app, 'extensions'):
        app.extensions = {}
    app.extensions['adsdata'] = session
    return session
def load_data(update_args):
    """Load one data file through its model class.

    ``update_args`` is a single ``(model_class, data_file, batch_size)``
    tuple. A new session is created from the module-level ``config`` and
    handed to ``model_class.load_data``.
    """
    model, source_path, chunk_size = update_args
    logging.getLogger().debug(
        "thread '%s' working on %s" % (current_process().name, model))
    model.load_data(utils.get_session(config), source_path,
                    batch_size=chunk_size)
def __init__(self, task_queue, result_queue, do_docs=True, do_metrics=True):
    """Set up the worker process state.

    Stores the two queues and the ``do_docs`` / ``do_metrics`` flags on the
    instance and creates a session from the module-level ``config``.
    """
    Process.__init__(self)
    self.task_queue, self.result_queue = task_queue, result_queue
    self.do_docs, self.do_metrics = do_docs, do_metrics
    self.session = utils.get_session(config)
def sync(opts, config):
    """
    updates the mongo data collections from their data source files

    For every ``(model_class, data_file)`` pair from ``get_models`` that
    reports it needs syncing (or for all of them when ``opts.force`` is
    set), the data file is copied once into ``config['ADSDATA_TMP_DIR']``
    and then loaded with ``load_data``, either through a process pool of
    ``opts.threads`` workers or serially when ``opts.threads`` is not
    positive.
    """
    log = logging.getLogger()
    session = utils.get_session(config)

    update_args = []
    for model_class, data_file in get_models(opts, config):
        if model_class.needs_sync(session, data_file) or opts.force:
            log.info("%s needs synching" % model_class.config_collection_name)
            data_file = copy_source(data_file, config['ADSDATA_TMP_DIR'])
            update_args.append((model_class, data_file,
                                config['ADSDATA_MONGO_DATA_LOAD_BATCH_SIZE']))
        else:
            log.info("%s does not need syncing" % model_class.config_collection_name)

    if opts.threads > 0:
        p = Pool(opts.threads)
        try:
            p.map(load_data, update_args)
        finally:
            # Release the worker processes instead of leaving them to the
            # garbage collector / interpreter shutdown.
            p.close()
            p.join()
    else:
        # update_args already holds the copied file paths, so the files are
        # loaded as-is; copying them again here would duplicate the copy
        # the pool branch never performs.
        for args in update_args:
            load_data(args)
def __init__(self, task_queue, result_queue, do_docs=True, do_metrics=True, publish_to_solr=True):
    """Set up the worker process state.

    Stores the queues and flags, creates a session (from the module-level
    ``config``) named after this object's repr, and initialises two
    bookkeeping dicts: ``self.psql`` (holding a ``psql_session.Session()``
    only when ``do_metrics`` is true) and ``self.rabbit`` (whose 'publish'
    entry is ``publish_to_solr``). Both start with an empty 'payload' list
    and a 'payload_size' of 100.
    """
    Process.__init__(self)
    self.task_queue, self.result_queue = task_queue, result_queue
    self.do_docs, self.do_metrics = do_docs, do_metrics
    self.session = utils.get_session(config, name=self.__repr__())

    # Only open a psql session when metrics are going to be built.
    metrics_session = psql_session.Session() if do_metrics else None

    self.psql = dict(session=metrics_session, payload=[], payload_size=100)
    self.rabbit = dict(publish=publish_to_solr, payload=[], payload_size=100)
def build(opts):
    """Build docs in parallel with ``opts.threads`` Builder processes.

    Optionally drops the existing docs collection (``opts.remove``), starts
    the Builders, queues every bibcode from ``get_bibcodes(opts)`` followed
    by one ``None`` sentinel per Builder, then waits for the task queue and
    the Builder processes to finish.
    """
    task_queue = JoinableQueue()
    result_queue = JoinableQueue()

    if opts.remove:
        log.info("Removing existing docs collection")
        utils.get_session(config).docs.drop()

    # start up our builder threads
    log.info("Creating %d Builder processes" % opts.threads)
    builders = []
    for _ in xrange(opts.threads):
        builders.append(Builder(task_queue, result_queue))
    for builder in builders:
        builder.start()

    # queue up the bibcodes
    for bibcode in get_bibcodes(opts):
        task_queue.put(bibcode)

    # add some poison pills to the end of the queue
    log.info("poisoning our task threads")
    for _ in xrange(opts.threads):
        task_queue.put(None)

    # join the task queue; blocks until every queued item has been
    # marked done
    log.info("Joining the task queue")
    task_queue.join()

    log.info("Joining the task threads")
    for builder in builders:
        builder.join()

    log.info("All work complete")
def __init__(self, task_queue, opts, config):
    """Set up the copier process state.

    ``from_collection`` is the 'docs' collection of the 'solr4ads' database
    on the host given by ``opts.from_mongo``; ``to_collection`` is the
    'fulltext' collection of a session built from ``config``. ``wanted``
    maps each comma-separated name in ``opts.fields`` to 1.
    """
    Process.__init__(self)
    self.task_queue = task_queue

    source_client = MongoClient(host=opts.from_mongo)
    self.from_collection = source_client['solr4ads']['docs']

    self.to_collection = utils.get_session(config).get_collection('fulltext')

    self.wanted = dict.fromkeys(opts.fields.split(','), 1)
def build_synchronous(opts):
    """Generate and store a doc for each bibcode, one at a time.

    Bibcodes for which ``session.generate_doc`` returns ``None`` are
    skipped; the result of each ``store_doc`` call is logged.
    """
    session = utils.get_session(config)
    for bibcode in get_bibcodes(opts):
        doc = session.generate_doc(bibcode)
        if doc is None:
            continue
        stored = session.store_doc(doc)
        log.info("Saved: %s", str(stored))
def delete(opts, config):
    """Remove from the subject collection every id yielded by find_deletions.

    Logs each removed id and, at the end, the number of items removed.
    """
    log.info("Deleting all records from %s that do not appear in %s" % (opts.subject, opts.authority))
    target = utils.get_session(config).get_collection(opts.subject)

    removed = 0
    for bibcode in find_deletions(opts, config):
        log.info("deleting %s" % bibcode)
        target.remove({'_id': bibcode})
        removed += 1

    log.info("done. %d items deleted" % removed)
def test_dereference_manipulator(self):
    """A DBRef in test_b.foo is resolved on read once the manipulator is added."""
    self.session = utils.get_session(self.config, inc_manipulators=False)
    referenced = self.session.get_collection('test_a')
    referring = self.session.get_collection('test_b')

    referenced.insert({"_id": 1, "foo": "bar"})
    referring.insert({"baz": "blah", "foo": DBRef(collection="test_a", id=1)})

    # Register the manipulator only after the fixtures are written.
    self.session.add_manipulator(
        DereferenceManipulator(ref_fields=[('test_b', 'foo')]))

    fetched = referring.find_one({"baz": "blah"})
    self.assertEqual(fetched['foo'], 'bar')
def status(opts, config):
    """
    reports on update status of mongo data collections

    Logs, for each model from ``get_models``, when it was last synced and
    whether it currently needs a sync.
    """
    log = logging.getLogger()
    session = utils.get_session(config)
    for model_class, data_file in get_models(opts, config):
        if model_class.needs_sync(session, data_file):
            needs_sync = 'yes'
        else:
            needs_sync = 'no'
        last_synced = model_class.last_synced(session)
        log.info("%s last synced: %s; needs sync? : %s" % (
            model_class.config_collection_name, last_synced, needs_sync))
def test_digest_manipulator(self):
    """DigestInjector adds '_digest' on insert and keeps a supplied one."""
    self.session = utils.get_session(self.config, inc_manipulators=False)
    self.session.add_manipulator(DigestInjector('ads_test'))
    coll = self.session.get_collection('ads_test')

    # A document inserted without a digest gets one.
    coll.insert({"foo": 1})
    stored = coll.find_one({"foo": 1}, manipulate=False)
    self.assertTrue('_digest' in stored)

    # A digest supplied by the caller is stored unchanged.
    supplied = doc_digest({"bar": 1}, self.session.db)
    coll.insert({"baz": 1, "_digest": supplied})
    stored = coll.find_one({"baz": 1}, manipulate=False)
    self.assertEqual(stored['_digest'], supplied)
def test_digest_manipulator(self):
    """DigestInjector adds '_digest' on insert and keeps a supplied one."""
    self.session = utils.get_session(self.config, inc_manipulators=False)
    self.session.add_manipulator(DigestInjector('ads_test'))
    coll = self.session.get_collection('ads_test')

    # A document inserted without a digest gets one.
    coll.insert({"foo": 1})
    stored = coll.find_one({"foo": 1}, manipulate=False)
    self.assertTrue('_digest' in stored)

    # A digest supplied by the caller is stored unchanged.
    supplied = record_digest({"bar": 1}, self.session.db)
    coll.insert({"baz": 1, "_digest": supplied})
    stored = coll.find_one({"baz": 1}, manipulate=False)
    self.assertEqual(stored['_digest'], supplied)
def build_synchronous(opts):
    """Build docs and/or metrics for each bibcode, one at a time.

    For every bibcode from ``get_bibcodes(opts)``: when 'doc' is in
    ``opts.do`` a doc is generated and stored in ``session.docs``; when
    'metrics' is in ``opts.do`` metrics data is generated and stored in
    ``session.metrics_data``. Results that come back as ``None`` are not
    stored.
    """
    session = utils.get_session(config)

    def _persist(record, collection_attr):
        # Store a generated record unless generation produced nothing.
        if record is not None:
            session.store(record, getattr(session, collection_attr))

    for bibcode in get_bibcodes(opts):
        # NOTE(review): the parallel build() tests for 'docs' (plural);
        # confirm 'doc' matches the values opts.do actually holds.
        if 'doc' in opts.do:
            _persist(session.generate_doc(bibcode), 'docs')
        if 'metrics' in opts.do:
            _persist(session.generate_metrics_data(bibcode), 'metrics_data')
        log.debug("Done building %s", bibcode)
def build_synchronous(opts):
    """Build docs and/or metrics for each bibcode, one at a time.

    For every bibcode from ``get_bibcodes(opts)``: when 'doc' is in
    ``opts.do`` a doc is generated and stored in ``session.docs``; when
    'metrics' is in ``opts.do`` metrics data is generated and stored in
    ``session.metrics_data``. Results that come back as ``None`` are not
    stored.
    """
    session = utils.get_session(config)

    def _persist(record, collection_attr):
        # Store a generated record unless generation produced nothing.
        if record is not None:
            session.store(record, getattr(session, collection_attr))

    for bibcode in get_bibcodes(opts):
        # NOTE(review): the parallel build() tests for 'docs' (plural);
        # confirm 'doc' matches the values opts.do actually holds.
        if 'doc' in opts.do:
            _persist(session.generate_doc(bibcode), 'docs')
        if 'metrics' in opts.do:
            _persist(session.generate_metrics_data(bibcode), 'metrics_data')
        log.info("Done building %s", bibcode)
def test_dt_manipulator(self):
    """DatetimeInjector stamps '_dt' on insert and hides it on normal reads."""
    self.session = utils.get_session(self.config, inc_manipulators=False)
    self.session.add_manipulator(DatetimeInjector('ads_test'))
    coll = self.session.get_collection('ads_test')

    coll.insert({"foo": 1})
    raw = coll.find_one({"foo": 1}, manipulate=False)
    self.assertTrue('_dt' in raw)
    self.assertTrue(isinstance(raw['_dt'], datetime))

    # let the manipulator remove the _dt
    manipulated = coll.find_one({"foo": 1})
    self.assertFalse('_dt' in manipulated)

    # make sure that no '_dt' values are preserved
    supplied_dt = datetime.utcnow().replace(tzinfo=pytz.utc)
    coll.insert({"foo": 1, '_dt': supplied_dt})
    raw = coll.find_one({"foo": 1}, manipulate=False)
    self.assertNotEqual(supplied_dt, raw['_dt'])
def _iter_stripped_lines(stream, close_when_done):
    """Yield each line of *stream* with surrounding whitespace removed.

    When *close_when_done* is true the stream is closed once iteration
    finishes or the generator is closed.
    """
    try:
        for line in stream:
            yield line.strip()
    finally:
        if close_when_done:
            stream.close()


def get_bibcodes(opts):
    """Return a lazy iterator over the bibcodes selected by *opts*.

    * ``opts.infile`` set: one bibcode per line of that file, or of standard
      input when the value is ``'-'``; each line is whitespace-stripped.
      A file opened here is closed once it has been read through.
    * otherwise ``opts.source_model`` set: the ``bibcode`` attribute of every
      item from ``session.iterate`` over the attribute of ``models`` with
      that name.

    Raises Exception when ``opts.source_model`` does not name an attribute
    of ``models`` carrying a ``class_name`` attribute, or when neither
    source is given.
    """
    if opts.infile:
        if opts.infile == '-':
            # stdin is not ours to close
            return _iter_stripped_lines(sys.stdin, False)
        return _iter_stripped_lines(open(opts.infile, 'r'), True)

    if opts.source_model:
        # Plain attribute lookup instead of eval(): the value comes from the
        # command line and must not be executed as code. An explicit check
        # replaces `assert`, which is stripped when Python runs with -O.
        source_model = getattr(models, opts.source_model, None)
        if source_model is None or not hasattr(source_model, 'class_name'):
            raise Exception("Invalid source_model value: %s" % opts.source_model)
        session = utils.get_session(config)
        return (item.bibcode for item in session.iterate(source_model))

    raise Exception("No bibcode source given: set infile or source_model")
def setUp(self):
    """Start an authenticated MongoBox and point the test config at it.

    Creates the admin and test users, loads 'test/adsdata.cfg.test' from
    the directory above this file's directory, overrides the Mongo
    connection settings, opens a session and loads the test data.
    """
    self.box = mongobox.MongoBox(scripting=True, auth=True)
    self.box.start()
    self.boxclient = self.box.client()

    admin_db = self.boxclient['admin']
    admin_db.add_user('foo','bar')
    admin_db.authenticate('foo','bar')
    self.boxclient['test'].add_user('test','test')

    here = os.path.abspath(__file__)
    base_dir = os.path.dirname(os.path.dirname(here))
    config = utils.load_config(os.path.join(base_dir, 'test/adsdata.cfg.test'))

    overrides = (
        ('ADSDATA_MONGO_DATABASE', 'test'),
        ('ADSDATA_MONGO_HOST', 'localhost'),
        ('ADSDATA_MONGO_PORT', self.box.port),
        ('ADSDATA_MONGO_USER', '******'),
        ('ADSDATA_MONGO_PASSWORD', '******'),
    )
    for key, value in overrides:
        config[key] = value

    self.config = config
    self.session = utils.get_session(config)
    load_data(self.config)
def find_deletions(opts, config):
    """Yield the items Worker processes placed on the deletes queue.

    Every ``_id`` of the ``opts.subject`` collection (capped at
    ``opts.limit`` when set) is queued for ``opts.threads`` Worker
    processes, which are given ``opts.authority``. After the task queue has
    been joined, whatever is immediately available on the deletes queue is
    yielded.
    """
    log = logging.getLogger()
    session = utils.get_session(config)
    subject_collection = session.get_collection(opts.subject)

    id_cursor = subject_collection.find({}, {'_id': 1})
    bibiter = (record['_id'] for record in id_cursor)
    if opts.limit:
        bibiter = itertools.islice(bibiter, opts.limit)

    tasks = JoinableQueue()
    deletes = Queue()

    # start up our builder threads
    log.debug("Creating %d Worker processes" % opts.threads)
    workers = []
    for _ in xrange(opts.threads):
        workers.append(Worker(tasks, deletes, config, opts.authority))
    for worker in workers:
        worker.start()

    log.debug("Queueing work")
    queued = 0
    for bibcode in bibiter:
        tasks.put(bibcode)
        queued += 1
    log.info("Subject collection contained %d items" % queued)

    # add some poison pills to the end of the queue
    log.debug("poisoning our task threads")
    for _ in xrange(opts.threads):
        tasks.put(None)

    log.debug("joining task queue")
    tasks.join()

    # Drain the results without blocking; stop at the first empty read.
    while True:
        try:
            pending = deletes.get_nowait()
        except QueueEmpty:
            return
        yield pending
def build(opts):
    """Build docs and/or metrics in parallel with Builder processes.

    Optionally drops the docs and metrics_data collections
    (``opts.remove``), starts ``opts.threads`` Builders configured from
    ``opts.do``, queues every bibcode from ``get_bibcodes(opts)`` followed
    by one ``None`` sentinel per Builder, then waits for the task queue and
    the Builder processes to finish.
    """
    task_queue = JoinableQueue()
    result_queue = JoinableQueue()

    if opts.remove:
        log.info("Removing existing docs and metrics_data collection")
        session = utils.get_session(config)
        for stale in (session.docs, session.metrics_data):
            stale.drop()

    want_docs = 'docs' in opts.do
    want_metrics = 'metrics' in opts.do

    # start up our builder threads
    log.info("Creating %d Builder processes" % opts.threads)
    builders = []
    for _ in xrange(opts.threads):
        builders.append(Builder(task_queue, result_queue, want_docs, want_metrics))
    for builder in builders:
        builder.start()

    # queue up the bibcodes
    for bibcode in get_bibcodes(opts):
        task_queue.put(bibcode)

    # add some poison pills to the end of the queue
    log.info("poisoning our task threads")
    for _ in xrange(opts.threads):
        task_queue.put(None)

    # join the task queue; blocks until every queued item has been
    # marked done
    log.info("Joining the task queue")
    task_queue.join()

    log.info("Joining the task threads")
    for builder in builders:
        builder.join()

    log.info("All work complete")
def init_app(self, app, config=None):
    """Initialize the session extension.

    The configuration is taken from the ``config`` argument, else from
    ``self.config``, else from ``app.config``. Missing ADSDATA_MONGO_*
    keys are filled with defaults in place, a session is created from the
    result, stored under ``app.extensions['adsdata']`` and returned.

    Raises ValueError when ``config`` is neither ``None`` nor a dict.
    """
    if config is not None and not isinstance(config, dict):
        raise ValueError("`config` must be an instance of dict or None")

    # Fallback chain: explicit argument -> extension-level -> application.
    if config is None:
        config = self.config
    if config is None:
        config = app.config

    defaults = (
        ("ADSDATA_MONGO_DATABASE", 'adsdata'),
        ("ADSDATA_MONGO_HOST", 'localhost'),
        ("ADSDATA_MONGO_PORT", 27017),
        ("ADSDATA_MONGO_USER", "adsdata"),
        ("ADSDATA_MONGO_PASSWORD", None),
    )
    for key, fallback in defaults:
        config.setdefault(key, fallback)

    session = get_session(config)

    if not hasattr(app, 'extensions'):
        app.extensions = {}
    app.extensions['adsdata'] = session
    return session
def __init__(self, task_queue, deletes_queue, config, authority):
    """Set up the worker process state.

    Stores both queues and resolves ``authority`` to a collection of a
    session built from ``config``.
    """
    Process.__init__(self)
    self.task_queue, self.deletes_queue = task_queue, deletes_queue
    self.authority_collection = utils.get_session(config).get_collection(authority)