Ejemplo n.º 1
0
def main(opts, config):
    """Repopulate the 'fulltext' collection using a pool of Copier processes.

    Drops the existing 'fulltext' collection, starts ``opts.threads``
    ``Copier`` processes, feeds them the bibcode of every
    ``models.FulltextLink`` record through a shared queue and waits for the
    workers to exit before reporting completion.

    :param opts: parsed options; ``limit`` and ``threads`` are read here and
        the whole object is handed to every ``Copier``.
    :param config: configuration handed to ``utils.get_session`` and to every
        ``Copier``.
    """
    log = logging.getLogger()

    session = utils.get_session(config)
    # start from an empty target collection
    session.get_collection('fulltext').drop()

    bibiter = session.iterate(models.FulltextLink)
    bibiter = itertools.imap(lambda x: x.bibcode, bibiter)
    if opts.limit:
        bibiter = itertools.islice(bibiter, opts.limit)

    tasks = Queue()

    # start up our copier processes
    log.debug("Creating %d Copier processes", opts.threads)
    procs = [Copier(tasks, opts, config) for _ in xrange(opts.threads)]
    for p in procs:
        p.start()

    for bib in bibiter:
        tasks.put(bib)

    # add one poison pill (None) per worker to the end of the queue;
    # presumably each Copier exits when it reads None -- confirm in Copier.run
    log.debug("poisoning our task threads")
    for _ in xrange(opts.threads):
        tasks.put(None)

    # wait for the workers so the completion message below is accurate
    log.debug("Joining the task threads")
    for p in procs:
        p.join()

    log.info("All work complete")
Ejemplo n.º 2
0
    def init_app(self, app, config=None):
        """Create a session for ``app`` and register it as the 'adsdata' extension.

        The configuration is taken from ``config`` when given, otherwise from
        ``self.config``, otherwise from ``app.config``.  Missing
        ``ADSDATA_MONGO_*`` keys are filled in with defaults; this mutates
        the chosen configuration mapping in place.

        :param app: application object; its ``extensions`` dict is created
            if absent.
        :param config: a ``dict`` or ``None``.
        :returns: the session produced by ``get_session``.
        :raises ValueError: if ``config`` is neither ``None`` nor a ``dict``.
        """
        if config is not None and not isinstance(config, dict):
            raise ValueError("`config` must be an instance of dict or None")

        # fall back to the extension's own config, then to the app's
        if config is None:
            config = self.config
        if config is None:
            config = app.config

        defaults = (
            ("ADSDATA_MONGO_DATABASE", 'adsdata'),
            ("ADSDATA_MONGO_HOST", 'localhost'),
            ("ADSDATA_MONGO_PORT", 27017),
            ("ADSDATA_MONGO_SAFE", True),
            ("ADSDATA_MONGO_USER", "adsdata"),
            ("ADSDATA_MONGO_PASSWORD", None),
        )
        for key, value in defaults:
            config.setdefault(key, value)

        session = get_session(config)

        if not hasattr(app, 'extensions'):
            app.extensions = {}
        app.extensions['adsdata'] = session

        return session
Ejemplo n.º 3
0
def load_data(update_args):
    """Load one data file into the collection of its model class.

    :param update_args: a ``(model_class, data_file, batch_size)`` tuple.

    A fresh session is obtained from ``utils.get_session(config)``; note that
    ``config`` is not a parameter and is resolved from the enclosing scope.
    """
    model_class, data_file, batch_size = update_args

    worker_name = current_process().name
    logging.getLogger().debug(
        "thread '%s' working on %s" % (worker_name, model_class))

    db_session = utils.get_session(config)
    model_class.load_data(db_session, data_file, batch_size=batch_size)
Ejemplo n.º 4
0
 def __init__(self, task_queue, result_queue, do_docs=True, do_metrics=True):
     """Initialise the process with its queues, work flags and a session.

     :param task_queue: queue stored as ``self.task_queue``.
     :param result_queue: queue stored as ``self.result_queue``.
     :param do_docs: flag stored as ``self.do_docs``.
     :param do_metrics: flag stored as ``self.do_metrics``.
     """
     Process.__init__(self)
     self.task_queue = task_queue
     self.result_queue = result_queue
     self.do_docs = do_docs
     self.do_metrics = do_metrics
     # `config` is not a parameter; it is resolved from the enclosing scope
     self.session = utils.get_session(config)
Ejemplo n.º 5
0
def sync(opts, config):
    """
    Update the mongo data collections from their data source files.

    For every ``(model_class, data_file)`` pair from ``get_models`` that
    needs syncing (or for all of them when ``opts.force`` is set), the source
    file is copied into ``config['ADSDATA_TMP_DIR']`` and then loaded via
    ``load_data``, either through a process pool of ``opts.threads`` workers
    or sequentially when ``opts.threads`` is not positive.

    :param opts: parsed options; ``force`` and ``threads`` are read here.
    :param config: configuration mapping; ``ADSDATA_TMP_DIR`` and
        ``ADSDATA_MONGO_DATA_LOAD_BATCH_SIZE`` are read for each model that
        is synced.
    """
    log = logging.getLogger()

    session = utils.get_session(config)

    update_args = []
    for model_class, data_file in get_models(opts, config):
        if model_class.needs_sync(session, data_file) or opts.force:
            log.info("%s needs synching" % model_class.config_collection_name)
            data_file = copy_source(data_file, config['ADSDATA_TMP_DIR'])
            update_args.append((model_class, data_file,
                                config['ADSDATA_MONGO_DATA_LOAD_BATCH_SIZE']))
        else:
            log.info("%s does not need syncing" %
                     model_class.config_collection_name)

    if opts.threads > 0:
        pool = Pool(opts.threads)
        pool.map(load_data, update_args)
        # release the worker processes once all loads have finished
        pool.close()
        pool.join()
    else:
        # the files in update_args were already copied to the tmp dir above,
        # so they are loaded as-is rather than being copied a second time
        for args in update_args:
            load_data(args)
Ejemplo n.º 6
0
 def __init__(self,
              task_queue,
              result_queue,
              do_docs=True,
              do_metrics=True,
              publish_to_solr=True):
     """Initialise the process with its queues, flags, session and buffers.

     :param task_queue: queue stored as ``self.task_queue``.
     :param result_queue: queue stored as ``self.result_queue``.
     :param do_docs: flag stored as ``self.do_docs``.
     :param do_metrics: flag stored as ``self.do_metrics``; a
         ``psql_session.Session`` is only created when it is true.
     :param publish_to_solr: stored under ``self.rabbit['publish']``.
     """
     Process.__init__(self)
     self.task_queue = task_queue
     self.result_queue = result_queue
     self.do_docs = do_docs
     self.do_metrics = do_metrics

     # `config` is not a parameter; it is resolved from the enclosing scope
     self.session = utils.get_session(config, name=repr(self))

     self.psql = {
         'session': psql_session.Session() if do_metrics else None,
         'payload': [],
         'payload_size': 100,
     }
     self.rabbit = {
         'publish': publish_to_solr,
         'payload': [],
         'payload_size': 100,
     }
Ejemplo n.º 7
0
def build(opts):
    """Build docs for every bibcode using ``opts.threads`` Builder processes.

    Optionally drops the existing docs collection first (``opts.remove``),
    then starts the builders, queues every bibcode from ``get_bibcodes``
    followed by one ``None`` per builder, and waits for the task queue and
    the builder processes to finish.

    :param opts: parsed options; ``remove`` and ``threads`` are read here and
        the whole object is passed to ``get_bibcodes``.
    """
    tasks = JoinableQueue()
    results = JoinableQueue()

    if opts.remove:
        log.info("Removing existing docs collection")
        utils.get_session(config).docs.drop()

    # create every builder before starting any of them
    log.info("Creating %d Builder processes" % opts.threads)
    builders = []
    for _ in xrange(opts.threads):
        builders.append(Builder(tasks, results))
    for worker in builders:
        worker.start()

    # queue up the bibcodes
    for bibcode in get_bibcodes(opts):
        tasks.put(bibcode)

    # one poison pill (None) per builder goes at the end of the queue
    log.info("poisoning our task threads")
    for _ in xrange(opts.threads):
        tasks.put(None)

    # joining the task queue blocks until every queued item has been
    # marked done
    log.info("Joining the task queue")
    tasks.join()

    log.info("Joining the task threads")
    for worker in builders:
        worker.join()

    log.info("All work complete")
Ejemplo n.º 8
0
 def __init__(self, task_queue, opts, config):
     """Initialise the process with its task queue and both collections.

     :param task_queue: queue stored as ``self.task_queue``.
     :param opts: parsed options; ``from_mongo`` is used as the host of the
         source ``MongoClient`` and ``fields`` is a comma-separated string.
     :param config: configuration handed to ``utils.get_session``.
     """
     Process.__init__(self)
     self.task_queue = task_queue

     source_client = MongoClient(host=opts.from_mongo)
     self.from_collection = source_client['solr4ads']['docs']

     self.to_collection = utils.get_session(config).get_collection('fulltext')

     # map every requested field name to 1
     self.wanted = dict.fromkeys(opts.fields.split(','), 1)
Ejemplo n.º 9
0
def build_synchronous(opts):
    """Generate and store a doc for every bibcode, one at a time.

    Bibcodes for which ``session.generate_doc`` returns ``None`` are skipped.

    :param opts: parsed options passed to ``get_bibcodes``.
    """
    session = utils.get_session(config)
    for bibcode in get_bibcodes(opts):
        doc = session.generate_doc(bibcode)
        if doc is None:
            continue
        saved = session.store_doc(doc)
        log.info("Saved: %s", str(saved))
Ejemplo n.º 10
0
def delete(opts, config):
    """Remove from the subject collection every id yielded by ``find_deletions``.

    :param opts: parsed options; ``subject`` names the collection to delete
        from and ``authority`` is only used in the log message here.
    :param config: configuration handed to ``utils.get_session`` and
        ``find_deletions``.
    """
    log.info("Deleting all records from %s that do not appear in %s" % (opts.subject, opts.authority))
    subject_collection = utils.get_session(config).get_collection(opts.subject)

    deleted = 0
    for bib in find_deletions(opts, config):
        deleted += 1
        log.info("deleting %s" % bib)
        subject_collection.remove({'_id': bib})

    log.info("done. %d items deleted" % deleted)
Ejemplo n.º 11
0
def delete(opts, config):
    """Delete every id produced by ``find_deletions`` from the subject collection.

    :param opts: parsed options; ``subject`` names the collection to delete
        from and ``authority`` is only used in the log message here.
    :param config: configuration handed to ``utils.get_session`` and
        ``find_deletions``.
    """
    log.info("Deleting all records from %s that do not appear in %s" %
             (opts.subject, opts.authority))

    session = utils.get_session(config)
    target = session.get_collection(opts.subject)

    removed = 0
    for bib in find_deletions(opts, config):
        removed += 1
        log.info("deleting %s" % bib)
        target.remove({'_id': bib})

    log.info("done. %d items deleted" % removed)
Ejemplo n.º 12
0
 def test_dereference_manipulator(self):
     # Insert a document into 'test_a' and a document in 'test_b' whose
     # 'foo' field is a DBRef to it, then register a DereferenceManipulator
     # for ('test_b', 'foo') and expect 'foo' to read back as 'bar'.
     self.session = utils.get_session(self.config, inc_manipulators=False)
     referenced = self.session.get_collection('test_a')
     referring = self.session.get_collection('test_b')

     referenced.insert({"_id": 1, "foo": "bar"})
     reference = DBRef(collection="test_a", id=1)
     referring.insert({"baz": "blah", "foo": reference})

     self.session.add_manipulator(
         DereferenceManipulator(ref_fields=[('test_b', 'foo')]))

     fetched = referring.find_one({"baz": "blah"})
     self.assertEqual(fetched['foo'], 'bar')
Ejemplo n.º 13
0
 def test_dereference_manipulator(self):
     # A document in 'test_b' points at a 'test_a' document through a DBRef
     # in its 'foo' field; once a DereferenceManipulator for
     # ('test_b', 'foo') is registered, 'foo' is expected to read as 'bar'.
     self.session = utils.get_session(self.config, inc_manipulators=False)
     target_coll = self.session.get_collection('test_a')
     source_coll = self.session.get_collection('test_b')

     target_coll.insert({"_id": 1, "foo": "bar"})
     pointer = DBRef(collection="test_a", id=1)
     source_coll.insert({"baz": "blah", "foo": pointer})

     deref = DereferenceManipulator(ref_fields=[('test_b', 'foo')])
     self.session.add_manipulator(deref)

     result = source_coll.find_one({"baz": "blah"})
     self.assertEqual(result['foo'], 'bar')
Ejemplo n.º 14
0
def status(opts, config):
    """
    Report the update status of the mongo data collections.

    Logs, for every ``(model_class, data_file)`` pair from ``get_models``,
    when the collection was last synced and whether it needs a sync.

    :param opts: parsed options passed to ``get_models``.
    :param config: configuration handed to ``utils.get_session`` and
        ``get_models``.
    """
    log = logging.getLogger()
    session = utils.get_session(config)
    for model_class, data_file in get_models(opts, config):
        stale = model_class.needs_sync(session, data_file)
        needs_sync = 'yes' if stale else 'no'
        last_synced = model_class.last_synced(session)
        log.info("%s last synced: %s; needs sync? : %s" % (
            model_class.config_collection_name, last_synced, needs_sync))
Ejemplo n.º 15
0
 def test_digest_manipulator(self):
     # With a DigestInjector registered for 'ads_test', an inserted document
     # is expected to carry a '_digest' key when read back unmanipulated.
     self.session = utils.get_session(self.config, inc_manipulators=False)
     self.session.add_manipulator(DigestInjector('ads_test'))
     collection = self.session.get_collection('ads_test')
     collection.insert({"foo": 1})
     entry = collection.find_one({"foo": 1}, manipulate=False)
     # `in` replaces the deprecated dict.has_key (removed in Python 3)
     self.assertTrue('_digest' in entry)

     # a '_digest' supplied by the caller is expected to be stored as given
     digest = doc_digest({"bar": 1}, self.session.db)
     collection.insert({"baz": 1, "_digest": digest})
     entry = collection.find_one({"baz": 1}, manipulate=False)
     self.assertEqual(entry['_digest'], digest)
Ejemplo n.º 16
0
def status(opts, config):
    """
    Report the update status of the mongo data collections.

    For each ``(model_class, data_file)`` pair returned by ``get_models`` a
    single line is logged with the last sync time and a yes/no sync flag.

    :param opts: parsed options passed to ``get_models``.
    :param config: configuration handed to ``utils.get_session`` and
        ``get_models``.
    """
    log = logging.getLogger()
    session = utils.get_session(config)
    for model_class, data_file in get_models(opts, config):
        if model_class.needs_sync(session, data_file):
            needs_sync = 'yes'
        else:
            needs_sync = 'no'
        last_synced = model_class.last_synced(session)
        collection_name = model_class.config_collection_name
        log.info("%s last synced: %s; needs sync? : %s" %
                 (collection_name, last_synced, needs_sync))
Ejemplo n.º 17
0
 def test_digest_manipulator(self):
     # With a DigestInjector registered for 'ads_test', an inserted document
     # is expected to carry a '_digest' key when read back unmanipulated.
     self.session = utils.get_session(self.config, inc_manipulators=False)
     self.session.add_manipulator(DigestInjector('ads_test'))
     collection = self.session.get_collection('ads_test')
     collection.insert({"foo": 1})
     entry = collection.find_one({"foo": 1}, manipulate=False)
     # `in` replaces the deprecated dict.has_key (removed in Python 3)
     self.assertTrue('_digest' in entry)

     # a '_digest' supplied by the caller is expected to be stored as given
     digest = record_digest({"bar": 1}, self.session.db)
     collection.insert({"baz": 1, "_digest": digest})
     entry = collection.find_one({"baz": 1}, manipulate=False)
     self.assertEqual(entry['_digest'], digest)
Ejemplo n.º 18
0
def build_synchronous(opts):
    """Generate and store docs and/or metrics data for every bibcode in turn.

    Which artefacts are built depends on membership tests against
    ``opts.do``; results that come back as ``None`` are not stored.

    :param opts: parsed options; ``do`` is read here and the whole object is
        passed to ``get_bibcodes``.
    """
    session = utils.get_session(config)
    for bibcode in get_bibcodes(opts):
        requested = opts.do

        # NOTE(review): the token checked here is 'doc' -- confirm it matches
        # how opts.do is populated (e.g. 'doc' vs 'docs')
        if 'doc' in requested:
            doc = session.generate_doc(bibcode)
            if doc is not None:
                session.store(doc, session.docs)

        if 'metrics' in requested:
            metrics = session.generate_metrics_data(bibcode)
            if metrics is not None:
                session.store(metrics, session.metrics_data)

        log.debug("Done building %s", bibcode)
Ejemplo n.º 19
0
def build_synchronous(opts):
    """Build docs and/or metrics data for each bibcode without worker processes.

    Membership tests against ``opts.do`` decide what is generated; a
    generated value of ``None`` is skipped rather than stored.

    :param opts: parsed options; ``do`` is read here and the whole object is
        passed to ``get_bibcodes``.
    """
    session = utils.get_session(config)
    for bibcode in get_bibcodes(opts):
        selected = opts.do

        # NOTE(review): the token checked here is 'doc' -- confirm it matches
        # how opts.do is populated (e.g. 'doc' vs 'docs')
        if 'doc' in selected:
            built_doc = session.generate_doc(bibcode)
            if built_doc is not None:
                session.store(built_doc, session.docs)

        if 'metrics' in selected:
            built_metrics = session.generate_metrics_data(bibcode)
            if built_metrics is not None:
                session.store(built_metrics, session.metrics_data)

        log.info("Done building %s", bibcode)
Ejemplo n.º 20
0
 def test_dt_manipulator(self):
     # With a DatetimeInjector registered for 'ads_test', a stored document
     # is expected to carry a datetime '_dt' value.
     self.session = utils.get_session(self.config, inc_manipulators=False)
     self.session.add_manipulator(DatetimeInjector('ads_test'))
     collection = self.session.get_collection('ads_test')
     collection.insert({"foo": 1})
     entry = collection.find_one({"foo": 1}, manipulate=False)
     # `in` replaces the deprecated dict.has_key (removed in Python 3)
     self.assertTrue('_dt' in entry)
     self.assertTrue(isinstance(entry['_dt'], datetime))
     # let the manipulator remove the _dt
     entry = collection.find_one({"foo": 1})
     self.assertFalse('_dt' in entry)
     # make sure that no '_dt' values are preserved
     dt = datetime.utcnow().replace(tzinfo=pytz.utc)
     collection.insert({"foo": 1, '_dt': dt})
     entry = collection.find_one({"foo": 1}, manipulate=False)
     self.assertNotEqual(dt, entry['_dt'])
Ejemplo n.º 21
0
def get_bibcodes(opts):
    """Build an iterator of bibcodes from an input file or a source model.

    If ``opts.infile`` is set, lines are read from that file (or from stdin
    when it is ``'-'``) and stripped of surrounding whitespace.  Otherwise,
    if ``opts.source_model`` is set, ``'models.' + opts.source_model`` is
    evaluated and the ``bibcode`` attribute of every item produced by
    ``session.iterate`` for that model is used.

    NOTE(review): no ``return`` statement is visible in this excerpt;
    presumably ``bibcodes`` is returned further down -- confirm against the
    full source.
    """
    
    if opts.infile:
        if opts.infile == '-':
            stream = sys.stdin
        else:
            # NOTE(review): this handle is not closed here; it is consumed
            # lazily by the imap below.
            stream = open(opts.infile, 'r')
        bibcodes = itertools.imap(lambda x: x.strip(), stream)
    elif opts.source_model:
        try:
            # NOTE(review): eval() executes arbitrary expressions; if
            # opts.source_model can come from untrusted input this should be
            # replaced (e.g. by an attribute lookup on `models`).
            source_model = eval('models.' + opts.source_model)
            # NOTE(review): assert statements are stripped under `python -O`,
            # so this validation would not run there.
            assert hasattr(source_model, 'class_name')
        except AssertionError, e:
            raise Exception("Invalid source_model value: %s" % e)
        # `config` is not a parameter; it is resolved from the enclosing scope
        session = utils.get_session(config)
        bibcodes = itertools.imap(lambda x: x.bibcode, session.iterate(source_model))
Ejemplo n.º 22
0
 def test_dt_manipulator(self):
     # With a DatetimeInjector registered for 'ads_test', a stored document
     # is expected to carry a datetime '_dt' value.
     self.session = utils.get_session(self.config, inc_manipulators=False)
     self.session.add_manipulator(DatetimeInjector('ads_test'))
     collection = self.session.get_collection('ads_test')
     collection.insert({"foo": 1})
     entry = collection.find_one({"foo": 1}, manipulate=False)
     # `in` replaces the deprecated dict.has_key (removed in Python 3)
     self.assertTrue('_dt' in entry)
     self.assertTrue(isinstance(entry['_dt'], datetime))
     # let the manipulator remove the _dt
     entry = collection.find_one({"foo": 1})
     self.assertFalse('_dt' in entry)
     # make sure that no '_dt' values are preserved
     dt = datetime.utcnow().replace(tzinfo=pytz.utc)
     collection.insert({"foo": 1, '_dt': dt})
     entry = collection.find_one({"foo": 1}, manipulate=False)
     self.assertNotEqual(dt, entry['_dt'])
Ejemplo n.º 23
0
def get_bibcodes(opts):
    """Build an iterator of bibcodes from an input file or a source model.

    If ``opts.infile`` is set, lines are read from that file (or from stdin
    when it is ``'-'``) and stripped of surrounding whitespace.  Otherwise,
    if ``opts.source_model`` is set, ``'models.' + opts.source_model`` is
    evaluated and the ``bibcode`` attribute of every item produced by
    ``session.iterate`` for that model is used.

    NOTE(review): no ``return`` statement is visible in this excerpt;
    presumably ``bibcodes`` is returned further down -- confirm against the
    full source.
    """

    if opts.infile:
        if opts.infile == '-':
            stream = sys.stdin
        else:
            # NOTE(review): this handle is not closed here; it is consumed
            # lazily by the imap below.
            stream = open(opts.infile, 'r')
        bibcodes = itertools.imap(lambda x: x.strip(), stream)
    elif opts.source_model:
        try:
            # NOTE(review): eval() executes arbitrary expressions; if
            # opts.source_model can come from untrusted input this should be
            # replaced (e.g. by an attribute lookup on `models`).
            source_model = eval('models.' + opts.source_model)
            # NOTE(review): assert statements are stripped under `python -O`,
            # so this validation would not run there.
            assert hasattr(source_model, 'class_name')
        except AssertionError, e:
            raise Exception("Invalid source_model value: %s" % e)
        # `config` is not a parameter; it is resolved from the enclosing scope
        session = utils.get_session(config)
        bibcodes = itertools.imap(lambda x: x.bibcode,
                                  session.iterate(source_model))
Ejemplo n.º 24
0
 def setUp(self):
     # Start a MongoBox instance with auth enabled and create the users the
     # tests connect with.
     self.box = mongobox.MongoBox(scripting=True, auth=True)
     self.box.start()
     client = self.box.client()
     self.boxclient = client
     client['admin'].add_user('foo','bar')
     client['admin'].authenticate('foo','bar')
     client['test'].add_user('test','test')

     # Load the test configuration and point it at the MongoBox instance.
     this_file = os.path.abspath(__file__)
     base_dir = os.path.dirname(os.path.dirname(this_file))
     config = utils.load_config(os.path.join(base_dir, 'test/adsdata.cfg.test'))
     overrides = (
         ('ADSDATA_MONGO_DATABASE', 'test'),
         ('ADSDATA_MONGO_HOST', 'localhost'),
         ('ADSDATA_MONGO_PORT', self.box.port),
         ('ADSDATA_MONGO_USER', '******'),
         ('ADSDATA_MONGO_PASSWORD', '******'),
     )
     for key, value in overrides:
         config[key] = value

     self.config = config
     self.session = utils.get_session(config)
     load_data(self.config)
Ejemplo n.º 25
0
def find_deletions(opts, config):
    """Yield the items that the Worker processes put on the deletes queue.

    Every ``_id`` of the subject collection (optionally capped at
    ``opts.limit``) is queued for ``opts.threads`` ``Worker`` processes,
    which are constructed with ``opts.authority``.  After the task queue has
    been joined, the deletes queue is drained and its items are yielded.

    This is a generator: nothing runs until it is first iterated.

    :param opts: parsed options; ``subject``, ``limit``, ``threads`` and
        ``authority`` are read here.
    :param config: configuration handed to ``utils.get_session`` and to
        every ``Worker``.
    """
    log = logging.getLogger()

    subject_collection = utils.get_session(config).get_collection(opts.subject)
    id_cursor = subject_collection.find({}, {'_id': 1})
    bibiter = itertools.imap(lambda rec: rec['_id'], id_cursor)
    if opts.limit:
        bibiter = itertools.islice(bibiter, opts.limit)

    tasks = JoinableQueue()
    deletes = Queue()

    # create every worker before starting any of them
    log.debug("Creating %d Worker processes" % opts.threads)
    procs = []
    for _ in xrange(opts.threads):
        procs.append(Worker(tasks, deletes, config, opts.authority))
    for proc in procs:
        proc.start()

    log.debug("Queueing work")
    queued = 0
    for bib in bibiter:
        tasks.put(bib)
        queued += 1

    log.info("Subject collection contained %d items" % queued)

    # one poison pill (None) per worker goes at the end of the queue
    log.debug("poisoning our task threads")
    for _ in xrange(opts.threads):
        tasks.put(None)

    log.debug("joining task queue")
    tasks.join()

    # NOTE(review): if `Queue` is multiprocessing.Queue, items put by the
    # workers may not be visible immediately; confirm get_nowait() cannot
    # miss late results here.
    draining = True
    while draining:
        try:
            yield deletes.get_nowait()
        except QueueEmpty:
            draining = False
Ejemplo n.º 26
0
def build(opts):
    """Build docs and/or metrics data using ``opts.threads`` Builder processes.

    Optionally drops the docs and metrics_data collections first
    (``opts.remove``), then starts the builders, queues every bibcode from
    ``get_bibcodes`` followed by one ``None`` per builder, and waits for the
    task queue and the builder processes to finish.

    :param opts: parsed options; ``remove``, ``do`` and ``threads`` are read
        here and the whole object is passed to ``get_bibcodes``.
    """
    tasks = JoinableQueue()
    results = JoinableQueue()

    if opts.remove:
        log.info("Removing existing docs and metrics_data collection")
        session = utils.get_session(config)
        session.docs.drop()
        session.metrics_data.drop()

    do_docs = 'docs' in opts.do
    do_metrics = 'metrics' in opts.do

    # create every builder before starting any of them
    log.info("Creating %d Builder processes" % opts.threads)
    builders = []
    for _ in xrange(opts.threads):
        builders.append(Builder(tasks, results, do_docs, do_metrics))
    for worker in builders:
        worker.start()

    # queue up the bibcodes
    for bibcode in get_bibcodes(opts):
        tasks.put(bibcode)

    # one poison pill (None) per builder goes at the end of the queue
    log.info("poisoning our task threads")
    for _ in xrange(opts.threads):
        tasks.put(None)

    # joining the task queue blocks until every queued item has been
    # marked done
    log.info("Joining the task queue")
    tasks.join()

    log.info("Joining the task threads")
    for worker in builders:
        worker.join()

    log.info("All work complete")
Ejemplo n.º 27
0
    def init_app(self, app, config=None):
        """Create a session for ``app`` and register it as the 'adsdata' extension.

        The configuration is taken from ``config`` when given, otherwise from
        ``self.config``, otherwise from ``app.config``.  Missing
        ``ADSDATA_MONGO_*`` keys are filled in with defaults; this mutates
        the chosen configuration mapping in place.

        :param app: application object; its ``extensions`` dict is created
            if absent.
        :param config: a ``dict`` or ``None``.
        :returns: the session produced by ``get_session``.
        :raises ValueError: if ``config`` is neither ``None`` nor a ``dict``.
        """
        if config is not None and not isinstance(config, dict):
            raise ValueError("`config` must be an instance of dict or None")

        # fall back to the extension's own config, then to the app's
        if config is None:
            config = self.config
        if config is None:
            config = app.config

        defaults = (
            ("ADSDATA_MONGO_DATABASE", 'adsdata'),
            ("ADSDATA_MONGO_HOST", 'localhost'),
            ("ADSDATA_MONGO_PORT", 27017),
            ("ADSDATA_MONGO_USER", "adsdata"),
            ("ADSDATA_MONGO_PASSWORD", None),
        )
        for key, value in defaults:
            config.setdefault(key, value)

        session = get_session(config)

        if not hasattr(app, 'extensions'):
            app.extensions = {}
        app.extensions['adsdata'] = session

        return session
Ejemplo n.º 28
0
 def __init__(self, task_queue, deletes_queue, config, authority):
     """Initialise the process with its queues and the authority collection.

     :param task_queue: queue stored as ``self.task_queue``.
     :param deletes_queue: queue stored as ``self.deletes_queue``.
     :param config: configuration handed to ``utils.get_session``.
     :param authority: name of the collection stored as
         ``self.authority_collection``.
     """
     Process.__init__(self)
     self.task_queue = task_queue
     self.deletes_queue = deletes_queue
     self.authority_collection = utils.get_session(config).get_collection(authority)