def setUp(self):
    """Build a fresh couch db, a wiped redis test db, and a Backend over test queues."""
    self.config = None  # placeholder
    self.TEST_PROVIDER_CONFIG = [("wikipedia", {})]

    # hacky way to delete the "ti" db, then make it fresh again for each test.
    scratch_dao = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))
    scratch_dao.delete_db(os.getenv("CLOUDANT_DB"))
    self.d = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))

    # do the same thing for the redis db: test database is DB number 8.
    self.r = tiredis.from_url("redis://localhost:6379", db=8)
    self.r.flushdb()

    # one in-process queue per configured provider, keyed by provider name
    queues_by_provider = {
        prov.provider_name: backend.PythonQueue(prov.provider_name + "_queue")
        for prov in ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    }
    self.b = backend.Backend(
        backend.RedisQueue("alias-unittest", self.r),
        queues_by_provider,
        [backend.PythonQueue("couch_queue")],
        self.r)

    # minimal item document used as a fixture by the tests
    self.fake_item = {
        "_id": "1",
        "type": "item",
        "num_providers_still_updating": 1,
        "aliases": {"pmid": ["111"]},
        "biblio": {},
        "metrics": {},
    }
    self.fake_aliases_dict = {"pmid": ["222"]}
    self.tiid = "abcd"
def test_get_providers(self):
    """The factory should build exactly the three configured providers."""
    built = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    built_names = set(p.__class__.__name__ for p in built)
    assert_equals(built_names, set(["Mendeley", "Wikipedia", "Pubmed"]))
def test_get_providers_filters_by_biblio(self):
    """Filtering the factory on "biblio" should leave only Pubmed."""
    biblio_capable = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "biblio")
    names = set(p.__class__.__name__ for p in biblio_capable)
    assert_equals(names, set(["Pubmed"]))
def sniffer(item_aliases, provider_config=default_settings.PROVIDERS):
    """Plan the provider run for an item based on its genre and host.

    Returns a list of batches of (method, provider_name) tuples.  Alias
    providers are emitted one per batch (presumably run sequentially);
    biblio providers and metrics providers are each emitted as a single
    batch.
    """
    genre, host = item_module.decide_genre(item_aliases)
    all_metrics_providers = [
        prov.provider_name
        for prov in ProviderFactory.get_providers(provider_config, "metrics")
    ]

    # pick the alias sequence and the biblio batch per genre/host
    if genre == "article" and host != "arxiv":
        alias_sequence = ["mendeley", "crossref", "pubmed", "altmetric_com"]
        biblio_batch = ["crossref", "pubmed", "mendeley", "webpage"]
    elif host == "arxiv" or "doi" in item_aliases:
        alias_sequence = [host, "altmetric_com"]
        biblio_batch = [host, "mendeley"]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = ["webpage"] if host == "unknown" else [host]
        alias_sequence = relevant_providers
        biblio_batch = relevant_providers

    run = [[("aliases", prov)] for prov in alias_sequence]
    run.append([("biblio", prov) for prov in biblio_batch])
    run.append([("metrics", prov) for prov in all_metrics_providers])
    return run
def setUp(self):
    """Wire up the unittest redis db, a Backend over test queues, and postgres."""
    self.config = None  # placeholder
    self.TEST_PROVIDER_CONFIG = [("wikipedia", {})]
    self.d = None

    # do the same thing for the redis db: point at the dedicated unittest
    # database number and wipe it clean.
    self.r = tiredis.from_url("redis://localhost:6379", db=REDIS_UNITTEST_DATABASE_NUMBER)
    self.r.flushdb()

    # one in-process queue per configured provider, keyed by provider name
    queue_for = {}
    for prov in ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG):
        queue_for[prov.provider_name] = backend.PythonQueue(prov.provider_name + "_queue")

    self.b = backend.Backend(
        backend.RedisQueue("alias-unittest", self.r),
        queue_for,
        [backend.PythonQueue("couch_queue")],
        self.r)

    # minimal item document used as a fixture by the tests
    self.fake_item = {
        "_id": "1",
        "type": "item",
        "num_providers_still_updating": 1,
        "aliases": {"pmid": ["111"]},
        "biblio": {},
        "metrics": {},
        "last_modified": datetime.datetime(2013, 1, 1),
    }
    self.fake_aliases_dict = {"pmid": ["222"]}
    self.tiid = "abcd"
    self.db = setup_postgres_for_unittests(db, app)
def test_get_providers_filters_by_aliases(self):
    """Filtering the factory on "aliases" keeps Pubmed and Mendeley only."""
    alias_capable = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "aliases")
    names = set(p.__class__.__name__ for p in alias_capable)
    assert_equals(names, set(["Pubmed", "Mendeley"]))
def setUp(self):
    """Fresh redis DB 8, a Backend over test queues, and a unittest postgres db."""
    self.config = None  # placeholder
    self.TEST_PROVIDER_CONFIG = [("wikipedia", {})]
    self.d = None

    # set up the test redis database -- we're using DB number 8
    self.r = tiredis.from_url("redis://localhost:6379", db=8)
    self.r.flushdb()

    # one in-process queue per configured provider, keyed by provider name
    provider_queues = {
        prov.provider_name: backend.PythonQueue(prov.provider_name + "_queue")
        for prov in ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    }
    self.b = backend.Backend(
        backend.RedisQueue("alias-unittest", self.r),
        provider_queues,
        [backend.PythonQueue("couch_queue")],
        self.r)

    # minimal item document used as a fixture by the tests
    self.fake_item = {
        "_id": "1",
        "type": "item",
        "num_providers_still_updating": 1,
        "aliases": {"pmid": ["111"]},
        "biblio": {},
        "metrics": {},
        "last_modified": datetime.datetime(2013, 1, 1),
    }
    self.fake_aliases_dict = {"pmid": ["222"]}
    self.tiid = "abcd"
    self.db = setup_postgres_for_unittests(db, app)
def test_03_init_aliases(self):
    """A freshly built ProvidersAliasThread exposes its control API and a queue."""
    pat = ProvidersAliasThread(ProviderFactory.get_providers(self.config), self.d)
    for expected_attr in ("stop", "stopped", "first"):
        assert hasattr(pat, expected_attr)
    assert pat.queue is not None
def get_metric_names(providers_config):
    """Return "<provider>:<metric>" strings for every metric of every configured provider."""
    return [
        provider.provider_name + ':' + metric_name
        for provider in ProviderFactory.get_providers(providers_config)
        for metric_name in provider.metric_names()
    ]
def get_metric_names(providers_config):
    """Collect fully-qualified metric names ("provider:metric") for the given config."""
    qualified_names = []
    for prov in ProviderFactory.get_providers(providers_config):
        prefix = prov.provider_name + ':'
        qualified_names.extend(prefix + name for name in prov.metric_names())
    return qualified_names
def test_get_providers_filters_by_metrics(self):
    # since all the providers do metrics, the "metrics" arg changes nothing.
    metrics_capable = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "metrics")
    names = set(p.__class__.__name__ for p in metrics_capable)
    assert_equals(names, set(["Mendeley", "Wikipedia", "Pubmed"]))
def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
    """Decide which alias, biblio, and metrics providers to run next.

    Returns a dict with keys "aliases", "biblio", "metrics", each a list of
    provider names (possibly empty).  Alias providers run one at a time;
    biblio and metrics only start once aliasing is considered done.
    """
    # default to nothing
    aliases_providers = []
    biblio_providers = []
    metrics_providers = []

    all_metrics_providers = [
        prov.provider_name
        for prov in ProviderFactory.get_providers(provider_config, "metrics")
    ]
    genre, host = item_module.decide_genre(item_aliases)

    has_enough_alias_urls = "url" in item_aliases
    if has_enough_alias_urls and ("doi" in item_aliases):
        # with a doi present, only a dx.doi.org url counts as "enough"
        dx_urls = [u for u in item_aliases["url"] if u.startswith("http://dx.doi.org")]
        has_enough_alias_urls = len(dx_urls) > 0

    if genre == "article":
        # run alias providers one at a time; crossref before pubmed because
        # crossref might tease a doi from a url
        for candidate in ("mendeley", "crossref", "pubmed"):
            if candidate not in aliases_providers_run:
                aliases_providers = [candidate]
                break
        else:
            # all alias providers have run: move on to biblio and metrics
            metrics_providers = all_metrics_providers
            biblio_providers = ["crossref", "pubmed", "webpage"]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = ["webpage"] if host == "unknown" else [host]
        # if all the relevant providers have already run, then all the aliases
        # are done -- or if the item already has enough urls
        if has_enough_alias_urls or set(relevant_providers) == set(aliases_providers_run):
            metrics_providers = all_metrics_providers
            biblio_providers = relevant_providers
        else:
            aliases_providers = relevant_providers

    return {
        "aliases": aliases_providers,
        "biblio": biblio_providers,
        "metrics": metrics_providers,
    }
def test_alias_queue(self):
    # End-to-end: POST a doi, verify the stored item and couch view, check the
    # item lands on the alias queue, run the alias thread once, and verify the
    # resolved title/biblio come back from the API.
    # NOTE(review): Python 2 code (print statements).
    self.d.create_new_db_and_connect(self.testing_db_name)
    providers = ProviderFactory.get_providers(self.app.config["PROVIDERS"])
    response = self.client.post('/item/doi/' + quote_plus(TEST_DRYAD_DOI))
    tiid = json.loads(response.data)
    # now get it back out
    response = self.client.get('/item/' + tiid)
    print tiid
    assert_equals(response.status_code, 200)
    resp_dict = json.loads(response.data)
    # the item document should carry exactly these top-level keys
    assert_equals(
        set(resp_dict.keys()),
        set([u'tiid', u'created', u'last_requested', u'metrics', u'last_modified', u'biblio', u'id', u'aliases'])
    )
    assert_equals(unicode(TEST_DRYAD_DOI), resp_dict["aliases"]["doi"][0])
    # test the view works
    res = self.d.view("aliases")
    assert len(res["rows"]) == 1, res
    assert_equals(TEST_DRYAD_DOI, res["rows"][0]["value"]["aliases"]["doi"][0])
    # see if the item is on the queue
    my_alias_queue = AliasQueue(self.d)
    assert isinstance(my_alias_queue.queue, list)
    assert_equals(len(my_alias_queue.queue), 1)
    # get our item from the queue
    my_item = my_alias_queue.first()
    assert_equals(my_item.aliases.doi[0], TEST_DRYAD_DOI)
    # do the update using the backend
    alias_thread = ProvidersAliasThread(providers, self.d)
    alias_thread.run(run_only_once=True)
    # get the item back out again and bask in the awesome
    response = self.client.get('/item/' + tiid)
    resp_dict = json.loads(response.data)
    print tiid
    print response.data
    # the alias providers should have filled in the dataset title
    assert_equals(
        resp_dict["aliases"]["title"][0],
        "data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides"
    )
    print resp_dict
    assert_equals(resp_dict["biblio"]["data"]["year"], "2010")
def setUp(self):
    """Point a flask test client and a Dao at a throwaway test database."""
    # setup api test client
    self.app = api.app
    self.app.testing = True
    self.client = self.app.test_client()

    # setup the database: swap in a scratch db name, remembering the old one
    self.testing_db_name = "metrics_queue_test"
    self.old_db_name = self.app.config["DB_NAME"]
    self.app.config["DB_NAME"] = self.testing_db_name

    cfg = self.app.config
    self.d = dao.Dao(
        self.testing_db_name,
        cfg["DB_URL"],
        cfg["DB_USERNAME"],
        cfg["DB_PASSWORD"])
    self.providers = ProviderFactory.get_providers(cfg["PROVIDERS"])
def rq_metrics_for_all_live_profiles(args):
    # Enqueue (or run inline with no_rq) a metrics refresh for products of
    # "live" profiles: a single slug/tiid if given, otherwise up to `limit`
    # products of advisors, paying customers, or profiles still in free trial.
    # NOTE(review): Python 2 code (print statements).
    url_slug = args.get("url_slug", None)
    tiid = args.get("tiid", None)
    no_rq = args.get("no_rq", False)
    limit = args.get("limit", 5)
    if url_slug:
        # a single profile was asked for; one product is enough
        limit = 1
    queue_number = 0

    q = db.session.query(Product.tiid).select_from(Profile)
    q = q.filter(Product.removed == None)
    q = q.join(Profile.products)
    if url_slug:
        q = q.filter(Profile.url_slug==url_slug)
    elif tiid:
        q = q.filter(Product.tiid==tiid)
    else:
        # no explicit target: restrict to advisors, stripe customers, and
        # profiles created within the free-trial window
        from totalimpactwebapp.profile import default_free_trial_days
        min_created_date = datetime.datetime.utcnow() - datetime.timedelta(days=default_free_trial_days)
        q = q.filter(or_(Profile.is_advisor!=None, Profile.stripe_id!=None, Profile.created>=min_created_date))
        # q = q.filter(Profile.next_refresh <= datetime.datetime.utcnow())
    q = q.order_by(Product.last_refresh_finished)  # oldest first
    q = q.limit(limit)
    print "q=", q

    all_metrics_provider_names = [p.provider_name for p in ProviderFactory.get_providers(default_settings.PROVIDERS, "metrics")]
    for tiid in q.all():
        print "tiid", tiid
        # fan out: one job per (product, metrics provider) pair
        for provider_name in all_metrics_provider_names:
            print "putting {} on rq queue to run metrics through {}".format(
                tiid, provider_name)
            if no_rq:
                print "asked for no-rq, so calling right now"
                provider_method_wrapper(tiid, provider_name, "metrics")
            else:
                job = ti_queues[queue_number].enqueue_call(
                    func=provider_method_wrapper,
                    args=(tiid, provider_name, "metrics"),
                    timeout=60 * 10,
                    result_ttl=0  # number of seconds
                )
                job.save()
def main():
    """Spawn couch workers and per-provider workers, then block in the backend loop."""
    mydao = None
    redis_conn = tiredis.from_url(os.getenv("REDISTOGO_URL"))
    alias_queue = RedisQueue("aliasqueue", redis_conn)
    # to clear alias_queue:
    #   import redis, os
    #   myredis = redis.from_url(os.getenv("REDISTOGO_URL"))
    #   myredis.delete(["aliasqueue"])

    # these need to match the tiid alphabet defined in models:
    couch_queues = {}
    for ch in "abcdefghijklmnopqrstuvwxyz1234567890":
        couch_queues[ch] = PythonQueue(ch + "_couch_queue")
        CouchWorker(couch_queues[ch], redis_conn, mydao).spawn_and_loop()
        logger.info(u"launched backend couch worker with {i}_couch_queue".format(i=ch))

    polling_interval = 0.1  # how many seconds between polling to talk to provider

    provider_queues = {}
    for provider in ProviderFactory.get_providers(default_settings.PROVIDERS):
        provider_queues[provider.provider_name] = PythonQueue(provider.provider_name + "_queue")
        worker = ProviderWorker(
            provider,
            polling_interval,
            alias_queue,
            provider_queues[provider.provider_name],
            couch_queues,
            ProviderWorker.wrapper,
            redis_conn)
        worker.spawn_and_loop()

    backend = Backend(alias_queue, provider_queues, couch_queues, redis_conn)
    try:
        backend.run_in_loop()  # don't need to spawn this one
    except (KeyboardInterrupt, SystemExit):
        # per http://stackoverflow.com/questions/2564137/python-how-to-terminate-a-thread-when-main-program-ends
        sys.exit()
def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
    """Pick the next alias/biblio/metrics providers to run for an item.

    Returns {"aliases": [...], "biblio": [...], "metrics": [...]}; lists are
    empty for phases that should not run yet.
    """
    # default to nothing
    to_run = {"aliases": [], "biblio": [], "metrics": []}

    all_metrics_providers = [
        provider.provider_name
        for provider in ProviderFactory.get_providers(provider_config, "metrics")
    ]
    (genre, host) = item_module.decide_genre(item_aliases)

    has_enough_alias_urls = ("url" in item_aliases)
    if has_enough_alias_urls and ("doi" in item_aliases):
        # with a doi present, only a dx.doi.org url counts as "enough"
        has_enough_alias_urls = any(
            url.startswith("http://dx.doi.org") for url in item_aliases["url"])

    if genre == "article":
        if "mendeley" not in aliases_providers_run:
            to_run["aliases"] = ["mendeley"]
        elif "crossref" not in aliases_providers_run:
            # crossref before pubmed because it might tease a doi from a url
            to_run["aliases"] = ["crossref"]
        elif "pubmed" not in aliases_providers_run:
            to_run["aliases"] = ["pubmed"]
        else:
            to_run["metrics"] = all_metrics_providers
            to_run["biblio"] = ["crossref", "pubmed", "webpage"]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = [host] if host != "unknown" else ["webpage"]
        # if all the relevant providers have already run, then all the aliases
        # are done -- or if the item already has enough urls
        aliasing_done = has_enough_alias_urls or (
            set(relevant_providers) == set(aliases_providers_run))
        if aliasing_done:
            to_run["metrics"] = all_metrics_providers
            to_run["biblio"] = relevant_providers
        else:
            to_run["aliases"] = relevant_providers

    return to_run
def main():
    """Launch one couch worker per tiid character and one worker per provider,
    then run the backend loop in the foreground."""
    mydao = None
    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"))
    alias_queue = RedisQueue("aliasqueue", myredis)
    # to clear alias_queue:
    #   import redis, os
    #   myredis = redis.from_url(os.getenv("REDISTOGO_URL"))
    #   myredis.delete(["aliasqueue"])

    # these need to match the tiid alphabet defined in models:
    tiid_alphabet = "abcdefghijklmnopqrstuvwxyz1234567890"
    couch_queues = {}
    for letter in tiid_alphabet:
        letter_queue = PythonQueue(letter + "_couch_queue")
        couch_queues[letter] = letter_queue
        CouchWorker(letter_queue, myredis, mydao).spawn_and_loop()
        logger.info(
            u"launched backend couch worker with {i}_couch_queue".format(i=letter))

    polling_interval = 0.1  # how many seconds between polling to talk to provider

    provider_queues = {}
    for provider in ProviderFactory.get_providers(default_settings.PROVIDERS):
        work_queue = PythonQueue(provider.provider_name + "_queue")
        provider_queues[provider.provider_name] = work_queue
        ProviderWorker(
            provider,
            polling_interval,
            alias_queue,
            work_queue,
            couch_queues,
            ProviderWorker.wrapper,
            myredis).spawn_and_loop()

    backend = Backend(alias_queue, provider_queues, couch_queues, myredis)
    try:
        backend.run_in_loop()  # don't need to spawn this one
    except (KeyboardInterrupt, SystemExit):
        # per http://stackoverflow.com/questions/2564137/python-how-to-terminate-a-thread-when-main-program-ends
        sys.exit()
def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
    """Choose the next providers to run: aliases one at a time, then biblio
    and metrics once aliasing is complete.

    Returns {"aliases": [...], "biblio": [...], "metrics": [...]}.
    """
    # default to nothing
    aliases_providers = []
    biblio_providers = []
    metrics_providers = []

    all_metrics_providers = [
        prov.provider_name
        for prov in ProviderFactory.get_providers(provider_config, "metrics")
    ]
    genre, host = ItemFactory.decide_genre(item_aliases)
    has_alias_urls = "url" in item_aliases

    if genre == "article":
        # run pubmed then crossref, one per call, until both have been tried
        pending = [p for p in ("pubmed", "crossref") if p not in aliases_providers_run]
        if pending:
            aliases_providers = pending[:1]
        else:
            metrics_providers = all_metrics_providers
            biblio_providers = ["pubmed", "crossref"]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = ["webpage"] if host == "unknown" else [host]
        # if all the relevant providers have already run, then all the aliases
        # are done -- or if the item already has urls
        if has_alias_urls or set(relevant_providers) == set(aliases_providers_run):
            metrics_providers = all_metrics_providers
            biblio_providers = relevant_providers
        else:
            aliases_providers = relevant_providers

    return {
        "aliases": aliases_providers,
        "biblio": biblio_providers,
        "metrics": metrics_providers,
    }
def test_get_providers_filters_by_biblio(self):
    """Only Pubmed should survive the "biblio" capability filter."""
    survivors = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "biblio")
    survivor_names = set()
    for prov in survivors:
        survivor_names.add(prov.__class__.__name__)
    assert_equals(survivor_names, set(["Pubmed"]))
def test_09_get_providers(self):
    """The factory returns one provider per config entry."""
    built = ProviderFactory.get_providers(self.provider_configs)
    assert len(built) == len(self.provider_configs)
def main(logfile=None):
    # Configure rotating-file logging with the backend's context filter, then
    # start alias threads and per-provider metrics threads, sleeping in the
    # foreground until SIGTERM/Ctrl-C.
    # NOTE(review): Python 2 code (print statements, `except ..., e` syntax).
    logger = logging.getLogger()
    mydao = dao.Dao(
        app.config["DB_NAME"],
        app.config["DB_URL"],
        app.config["DB_USERNAME"],
        app.config["DB_PASSWORD"]
    )

    # Adding this by handle. fileConfig doesn't allow filters to be added
    from totalimpact.backend import ctxfilter
    handler = logging.handlers.RotatingFileHandler(logfile)
    handler.level = logging.DEBUG
    formatter = logging.Formatter("%(asctime)s %(levelname)8s %(item)8s %(thread)s%(provider)s - %(message)s")#,"%H:%M:%S,%f")
    handler.formatter = formatter
    handler.addFilter(ctxfilter)
    logger.addHandler(handler)
    ctxfilter.threadInit()
    logger.debug("test")

    from totalimpact.backend import TotalImpactBackend, ProviderMetricsThread, ProvidersAliasThread, StoppableThread, QueueConsumer
    from totalimpact.providers.provider import Provider, ProviderFactory

    # Start all of the backend processes
    print "Starting alias retrieval thread"
    providers = ProviderFactory.get_providers(app.config["PROVIDERS"])

    alias_threads = []
    thread_count = app.config["ALIASES"]["workers"]
    for idx in range(thread_count):
        at = ProvidersAliasThread(providers, mydao, idx)
        at.thread_id = 'AliasThread(%i)' % idx
        at.start()
        alias_threads.append(at)

    print "Starting metric retrieval threads..."
    # Start each of the metric providers
    metrics_threads = []
    for provider in providers:
        # NOTE(review): this rebind of `providers` inside the loop looks
        # redundant (iteration continues over the original list) -- confirm
        # whether rebuilding the providers here is intentional.
        providers = ProviderFactory.get_providers(app.config["PROVIDERS"])
        thread_count = app.config["PROVIDERS"][provider.provider_name]["workers"]
        print "    ", provider.provider_name
        for idx in range(thread_count):
            thread = ProviderMetricsThread(provider, mydao)
            metrics_threads.append(thread)
            thread.thread_id = thread.thread_id + '(%i)' % idx
            thread.start()

    # Install a signal handler so we'll break out of the main loop
    # on receipt of relevant signals
    class ExitSignal(Exception):
        pass

    def kill_handler(signum, frame):
        raise ExitSignal()

    import signal
    signal.signal(signal.SIGTERM, kill_handler)

    try:
        # idle until a signal or keyboard interrupt arrives
        while True:
            time.sleep(1)
    except (KeyboardInterrupt, ExitSignal), e:
        pass
def test_get_providers(self):
    """All three configured providers come back from the factory."""
    made = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
    assert_equals(
        set(p.__class__.__name__ for p in made),
        set(["Mendeley", "Wikipedia", "Pubmed"]))
def test_get_providers_filters_by_metrics(self):
    # every provider supplies metrics, so the "metrics" filter is a no-op here.
    filtered = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "metrics")
    observed = set()
    for prov in filtered:
        observed.add(prov.__class__.__name__)
    assert_equals(observed, set(["Mendeley", "Wikipedia", "Pubmed"]))
def test_get_providers_filters_by_aliases(self):
    """The "aliases" capability filter keeps just Pubmed and Mendeley."""
    filtered = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "aliases")
    observed = {p.__class__.__name__ for p in filtered}
    assert_equals(observed, set(["Pubmed", "Mendeley"]))