def setUp(self):
        """Per-test fixture: wipe and recreate the CouchDB database, flush the
        Redis test database, and build a Backend wired to in-process queues.

        NOTE(review): body is indented 8 spaces under a column-0 ``def`` --
        presumably lifted out of a TestCase class; bytes kept unchanged.
        """
        self.config = None #placeholder
        self.TEST_PROVIDER_CONFIG = [
            ("wikipedia", {})
        ]
        # hacky way to delete the "ti" db, then make it fresh again for each test.
        temp_dao = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))
        temp_dao.delete_db(os.getenv("CLOUDANT_DB"))
        self.d = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))

        # do the same thing for the redis db, set up the test redis database.  We're using DB Number 8
        self.r = tiredis.from_url("redis://localhost:6379", db=8)
        self.r.flushdb()

        # one in-process queue per provider, keyed by provider name
        provider_queues = {}
        providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
        for provider in providers:
            provider_queues[provider.provider_name] = backend.PythonQueue(provider.provider_name+"_queue")

        # backend under test: a redis-backed alias queue feeding the provider queues
        self.b = backend.Backend(
            backend.RedisQueue("alias-unittest", self.r), 
            provider_queues, 
            [backend.PythonQueue("couch_queue")], 
            self.r)

        # minimal item document used as a fixture by the tests
        self.fake_item = {
            "_id": "1",
            "type": "item",
            "num_providers_still_updating":1,
            "aliases":{"pmid":["111"]},
            "biblio": {},
            "metrics": {}
        }
        self.fake_aliases_dict = {"pmid":["222"]}
        self.tiid = "abcd"
 def test_get_providers(self):
     """With no filter argument, every configured provider is instantiated."""
     found = set()
     for p in ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG):
         found.add(p.__class__.__name__)
     assert_equals(found, set(['Mendeley', 'Wikipedia', "Pubmed"]))
 def test_get_providers_filters_by_biblio(self):
     """Only providers that implement biblio lookup survive the "biblio" filter."""
     names = [p.__class__.__name__
              for p in ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG,
                                                     "biblio")]
     assert_equals(set(names), set(['Pubmed']))
def sniffer(item_aliases, provider_config=default_settings.PROVIDERS):
    """Plan which provider phases to run for an item, from its aliases.

    Returns a list of phases, each phase a list of (method, provider_name)
    tuples.  Alias lookups run one provider per phase (sequential); biblio
    and metrics lookups are each batched into a single phase at the end.
    """
    (genre, host) = item_module.decide_genre(item_aliases)

    metrics_names = [p.provider_name
                     for p in ProviderFactory.get_providers(provider_config,
                                                            "metrics")]

    if (genre == "article") and (host != "arxiv"):
        alias_order = ["mendeley", "crossref", "pubmed", "altmetric_com"]
        biblio_group = ["crossref", "pubmed", "mendeley", "webpage"]
    elif (host == "arxiv") or ("doi" in item_aliases):
        alias_order = [host, "altmetric_com"]
        biblio_group = [host, "mendeley"]
    else:
        # relevant alias and biblio providers are always the same
        alias_order = ["webpage"] if host == "unknown" else [host]
        biblio_group = alias_order

    run = [[("aliases", name)] for name in alias_order]
    run.append([("biblio", name) for name in biblio_group])
    run.append([("metrics", name) for name in metrics_names])
    return run
    def setUp(self):
        """Per-test fixture: flush the dedicated Redis unittest database,
        build a Backend with in-process queues, and reset Postgres."""
        self.config = None  #placeholder
        self.TEST_PROVIDER_CONFIG = [("wikipedia", {})]
        self.d = None

        # do the same thing for the redis db, set up the test redis database.  We're using DB Number 8
        self.r = tiredis.from_url("redis://localhost:6379",
                                  db=REDIS_UNITTEST_DATABASE_NUMBER)
        self.r.flushdb()

        # one in-process queue per provider, keyed by provider name
        provider_queues = {}
        providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
        for provider in providers:
            provider_queues[provider.provider_name] = backend.PythonQueue(
                provider.provider_name + "_queue")

        # backend under test: redis alias queue feeding the provider queues
        self.b = backend.Backend(backend.RedisQueue("alias-unittest",
                                                    self.r), provider_queues,
                                 [backend.PythonQueue("couch_queue")], self.r)

        # minimal item document used as a fixture by the tests
        self.fake_item = {
            "_id": "1",
            "type": "item",
            "num_providers_still_updating": 1,
            "aliases": {
                "pmid": ["111"]
            },
            "biblio": {},
            "metrics": {},
            "last_modified": datetime.datetime(2013, 1, 1)
        }
        self.fake_aliases_dict = {"pmid": ["222"]}
        self.tiid = "abcd"

        # fresh postgres schema for this test run
        self.db = setup_postgres_for_unittests(db, app)
 def test_get_providers_filters_by_aliases(self):
     """The "aliases" filter keeps only providers able to look up aliases."""
     filtered = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG,
                                              "aliases")
     names = set(p.__class__.__name__ for p in filtered)
     assert_equals(names, set(['Pubmed', 'Mendeley']))
    def setUp(self):
        """Per-test fixture: flush Redis test db 8, build a Backend with
        in-process queues, and reset Postgres."""
        self.config = None #placeholder
        self.TEST_PROVIDER_CONFIG = [
            ("wikipedia", {})
        ]
        self.d = None

        # do the same thing for the redis db, set up the test redis database.  We're using DB Number 8
        self.r = tiredis.from_url("redis://localhost:6379", db=8)
        self.r.flushdb()

        # one in-process queue per provider, keyed by provider name
        provider_queues = {}
        providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
        for provider in providers:
            provider_queues[provider.provider_name] = backend.PythonQueue(provider.provider_name+"_queue")

        # backend under test: redis alias queue feeding the provider queues
        self.b = backend.Backend(
            backend.RedisQueue("alias-unittest", self.r), 
            provider_queues, 
            [backend.PythonQueue("couch_queue")], 
            self.r)

        # minimal item document used as a fixture by the tests
        self.fake_item = {
            "_id": "1",
            "type": "item",
            "num_providers_still_updating":1,
            "aliases":{"pmid":["111"]},
            "biblio": {},
            "metrics": {},
            "last_modified": datetime.datetime(2013, 1, 1)
        }
        self.fake_aliases_dict = {"pmid":["222"]}
        self.tiid = "abcd"

        # fresh postgres schema for this test run
        self.db = setup_postgres_for_unittests(db, app)
 def test_03_init_aliases(self):
     """A freshly built ProvidersAliasThread exposes its control API and a queue."""
     pat = ProvidersAliasThread(
         ProviderFactory.get_providers(self.config), self.d)

     for attr_name in ("stop", "stopped", "first"):
         assert hasattr(pat, attr_name)
     assert pat.queue is not None
Exemple #9
0
def get_metric_names(providers_config):
    """Return fully-qualified metric names for every configured provider.

    Each name has the form ``"<provider_name>:<metric_name>"``, in provider
    order, preserving each provider's own metric order.
    """
    # flat double comprehension replaces the manual nested append loop
    return [
        provider.provider_name + ':' + metric_name
        for provider in ProviderFactory.get_providers(providers_config)
        for metric_name in provider.metric_names()
    ]
Exemple #10
0
def get_metric_names(providers_config):
    """Return fully-qualified metric names for every configured provider.

    Each name has the form ``"<provider_name>:<metric_name>"``, in provider
    order, preserving each provider's own metric order.
    """
    # flat double comprehension replaces the manual nested append loop
    return [
        provider.provider_name + ':' + metric_name
        for provider in ProviderFactory.get_providers(providers_config)
        for metric_name in provider.metric_names()
    ]
 def test_get_providers_filters_by_metrics(self):
     # since all the providers do metrics, "metrics" arg changes nought.
     names = [p.__class__.__name__
              for p in ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG,
                                                     "metrics")]
     assert_equals(set(names),
                   set(['Mendeley', 'Wikipedia', "Pubmed"]))
Exemple #12
0
    def sniffer(cls,
                item_aliases,
                aliases_providers_run,
                provider_config=default_settings.PROVIDERS):
        """Decide which providers to run next, given the item's aliases and
        the alias providers that have already run.

        Returns a dict with keys "aliases", "biblio" and "metrics", each a
        (possibly empty) list of provider names.  For articles, alias
        providers run one at a time in a fixed order; once all have run,
        biblio and metrics providers are released together.
        """
        # default to nothing
        aliases_providers = []
        biblio_providers = []
        metrics_providers = []

        all_metrics_providers = [
            provider.provider_name
            for provider in ProviderFactory.get_providers(
                provider_config, "metrics")
        ]
        (genre, host) = item_module.decide_genre(item_aliases)

        # "enough" urls means at least one canonical dx.doi.org url when a doi exists
        has_enough_alias_urls = ("url" in item_aliases)
        if has_enough_alias_urls:
            if ("doi" in item_aliases):
                has_enough_alias_urls = (len([
                    url for url in item_aliases["url"]
                    if url.startswith("http://dx.doi.org")
                ]) > 0)

        if (genre == "article"):
            # PEP 8: "x not in y" instead of "not x in y"
            if "mendeley" not in aliases_providers_run:
                aliases_providers = ["mendeley"]
            elif "crossref" not in aliases_providers_run:
                aliases_providers = [
                    "crossref"
                ]  # do this before pubmed because might tease doi from url
            elif "pubmed" not in aliases_providers_run:
                aliases_providers = ["pubmed"]
            else:
                metrics_providers = all_metrics_providers
                biblio_providers = ["crossref", "pubmed", "webpage"]
        else:
            # relevant alias and biblio providers are always the same
            relevant_providers = [host]
            if relevant_providers == ["unknown"]:
                relevant_providers = ["webpage"]
            # if all the relevant providers have already run, then all the aliases are done
            # or if it already has urls
            if has_enough_alias_urls or (set(relevant_providers)
                                         == set(aliases_providers_run)):
                metrics_providers = all_metrics_providers
                biblio_providers = relevant_providers
            else:
                aliases_providers = relevant_providers

        return {
            "aliases": aliases_providers,
            "biblio": biblio_providers,
            "metrics": metrics_providers
        }
    def test_alias_queue(self):
        """End-to-end alias flow: POST a dryad DOI, verify it lands in couch
        and on the alias queue, run the alias thread once, and check that
        title and biblio data were filled in.  (Python 2: print statements.)"""
        self.d.create_new_db_and_connect(self.testing_db_name)

        providers = ProviderFactory.get_providers(self.app.config["PROVIDERS"])

        # create the item; the response body is its tiid
        response = self.client.post('/item/doi/' + quote_plus(TEST_DRYAD_DOI))
        tiid = json.loads(response.data)


        # now get it back out
        response = self.client.get('/item/' + tiid)
        print tiid
        assert_equals(response.status_code, 200)
        
        resp_dict = json.loads(response.data)
        assert_equals(
            set(resp_dict.keys()),
            set([u'tiid', u'created', u'last_requested', u'metrics', 
                u'last_modified', u'biblio', u'id', u'aliases'])
            )
        assert_equals(unicode(TEST_DRYAD_DOI), resp_dict["aliases"]["doi"][0])

        # test the view works
        res = self.d.view("aliases")
        assert len(res["rows"]) == 1, res
        assert_equals(TEST_DRYAD_DOI, res["rows"][0]["value"]["aliases"]["doi"][0])

        # see if the item is on the queue
        my_alias_queue = AliasQueue(self.d)
        assert isinstance(my_alias_queue.queue, list)
        assert_equals(len(my_alias_queue.queue), 1)
        
        # get our item from the queue
        my_item = my_alias_queue.first()
        assert_equals(my_item.aliases.doi[0], TEST_DRYAD_DOI)

        # do the update using the backend
        alias_thread = ProvidersAliasThread(providers, self.d)
        alias_thread.run(run_only_once=True)

        # get the item back out again and bask in the awesome
        response = self.client.get('/item/' + tiid)
        resp_dict = json.loads(response.data)
        print tiid
        print response.data
        assert_equals(
            resp_dict["aliases"]["title"][0],
            "data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides"
            )
        print resp_dict
        assert_equals(resp_dict["biblio"]["data"]["year"], "2010")
    def setUp(self):
        """Fixture: flask test client plus a dedicated couch test database.

        The previous DB name is saved in self.old_db_name -- presumably
        restored in tearDown; confirm against the rest of the class.
        """
        #setup api test client
        self.app = api.app
        self.app.testing = True 
        self.client = self.app.test_client()

        # setup the database
        self.testing_db_name = "metrics_queue_test"
        self.old_db_name = self.app.config["DB_NAME"]
        self.app.config["DB_NAME"] = self.testing_db_name
        self.d = dao.Dao(self.testing_db_name, self.app.config["DB_URL"],
            self.app.config["DB_USERNAME"], self.app.config["DB_PASSWORD"])

        self.providers = ProviderFactory.get_providers(self.app.config["PROVIDERS"])
def rq_metrics_for_all_live_profiles(args):
    """Enqueue (or run inline) metrics refreshes for products of live profiles.

    args keys: url_slug, tiid, no_rq, limit.  A url_slug or tiid narrows the
    query to a single profile/product; otherwise only "live" profiles
    (advisors, stripe customers, or accounts inside the free-trial window)
    are selected, oldest-refreshed first, up to `limit`.
    (Python 2: print statements.)
    """
    url_slug = args.get("url_slug", None)
    tiid = args.get("tiid", None)
    no_rq = args.get("no_rq", False)
    limit = args.get("limit", 5)
    if url_slug:
        limit = 1

    queue_number = 0

    # SQLAlchemy renders "== None" as IS NULL: keep only non-removed products
    q = db.session.query(Product.tiid).select_from(Profile)
    q = q.filter(Product.removed == None)
    q = q.join(Profile.products)
    if url_slug:
        q = q.filter(Profile.url_slug==url_slug)
    elif tiid:
        q = q.filter(Product.tiid==tiid)
    else:
        from totalimpactwebapp.profile import default_free_trial_days
        min_created_date = datetime.datetime.utcnow() - datetime.timedelta(days=default_free_trial_days)
        q = q.filter(or_(Profile.is_advisor!=None, Profile.stripe_id!=None, Profile.created>=min_created_date))
        # q = q.filter(Profile.next_refresh <= datetime.datetime.utcnow())
        q = q.order_by(Product.last_refresh_finished)  # oldest first
        q = q.limit(limit)
    print "q=", q

    all_metrics_provider_names = [p.provider_name for p in ProviderFactory.get_providers(default_settings.PROVIDERS, "metrics")]

    # fan out: one job per (tiid, metrics provider) pair
    for tiid in q.all():
        print "tiid", tiid

        for provider_name in all_metrics_provider_names:
            print "putting {} on rq queue to run metrics through {}".format(
                tiid, provider_name)
            if no_rq:
                print "asked for no-rq, so calling right now"
                provider_method_wrapper(tiid, provider_name, "metrics")
            else:
                job = ti_queues[queue_number].enqueue_call(
                    func=provider_method_wrapper,
                    args=(tiid, provider_name, "metrics"),
                    timeout=60 * 10,
                    result_ttl=0  # number of seconds
                    )
                job.save()
Exemple #16
0
def main():
    """Start the backend event loop: spawn couch workers (one per tiid
    alphabet character), one worker per provider, then run the Backend
    dispatcher in the main thread until interrupted."""

    mydao = None

    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"))
    alias_queue = RedisQueue("aliasqueue", myredis)
    # to clear alias_queue:
    #import redis, os
    #myredis = redis.from_url(os.getenv("REDISTOGO_URL"))
    #myredis.delete(["aliasqueue"])


    # these need to match the tiid alphabet defined in models:
    couch_queues = {}
    for i in "abcdefghijklmnopqrstuvwxyz1234567890":
        couch_queues[i] = PythonQueue(i+"_couch_queue")
        couch_worker = CouchWorker(couch_queues[i], myredis, mydao)
        couch_worker.spawn_and_loop() 
        logger.info(u"launched backend couch worker with {i}_couch_queue".format(
            i=i))


    polling_interval = 0.1   # how many seconds between polling to talk to provider
    provider_queues = {}
    providers = ProviderFactory.get_providers(default_settings.PROVIDERS)
    for provider in providers:
        provider_queues[provider.provider_name] = PythonQueue(provider.provider_name+"_queue")
        provider_worker = ProviderWorker(
            provider, 
            polling_interval, 
            alias_queue,
            provider_queues[provider.provider_name], 
            couch_queues,
            ProviderWorker.wrapper,
            myredis)
        provider_worker.spawn_and_loop()

    backend = Backend(alias_queue, provider_queues, couch_queues, myredis)
    try:
        backend.run_in_loop() # don't need to spawn this one
    except (KeyboardInterrupt, SystemExit): 
        # this approach is per http://stackoverflow.com/questions/2564137/python-how-to-terminate-a-thread-when-main-program-ends
        sys.exit()
Exemple #17
0
    def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
        """Decide which providers to run next, given the item's aliases and
        the alias providers that have already run.

        Returns a dict with keys "aliases", "biblio" and "metrics", each a
        (possibly empty) list of provider names.  For articles, alias
        providers run one at a time in a fixed order; once all have run,
        biblio and metrics providers are released together.
        """
        # default to nothing
        aliases_providers = []
        biblio_providers = []
        metrics_providers = []

        all_metrics_providers = [provider.provider_name for provider in 
                        ProviderFactory.get_providers(provider_config, "metrics")]
        (genre, host) = item_module.decide_genre(item_aliases)

        # "enough" urls means at least one canonical dx.doi.org url when a doi exists
        has_enough_alias_urls = ("url" in item_aliases)
        if has_enough_alias_urls:
            if ("doi" in item_aliases):
                has_enough_alias_urls = (len([url for url in item_aliases["url"] if url.startswith("http://dx.doi.org")]) > 0)

        if (genre == "article"):
            # PEP 8: "x not in y" instead of "not x in y"
            if "mendeley" not in aliases_providers_run:
                aliases_providers = ["mendeley"]
            elif "crossref" not in aliases_providers_run:
                aliases_providers = ["crossref"]  # do this before pubmed because might tease doi from url
            elif "pubmed" not in aliases_providers_run:
                aliases_providers = ["pubmed"]
            else:
                metrics_providers = all_metrics_providers
                biblio_providers = ["crossref", "pubmed", "webpage"]
        else:
            # relevant alias and biblio providers are always the same
            relevant_providers = [host]
            if relevant_providers == ["unknown"]:
                relevant_providers = ["webpage"]
            # if all the relevant providers have already run, then all the aliases are done
            # or if it already has urls
            if has_enough_alias_urls or (set(relevant_providers) == set(aliases_providers_run)):
                metrics_providers = all_metrics_providers
                biblio_providers = relevant_providers
            else:
                aliases_providers = relevant_providers

        return {
            "aliases": aliases_providers,
            "biblio": biblio_providers,
            "metrics": metrics_providers}
Exemple #18
0
def main():
    """Start the backend event loop: spawn couch workers (one per tiid
    alphabet character), one worker per provider, then run the Backend
    dispatcher in the main thread until interrupted."""

    mydao = None

    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"))
    alias_queue = RedisQueue("aliasqueue", myredis)
    # to clear alias_queue:
    #import redis, os
    #myredis = redis.from_url(os.getenv("REDISTOGO_URL"))
    #myredis.delete(["aliasqueue"])

    # these need to match the tiid alphabet defined in models:
    couch_queues = {}
    for i in "abcdefghijklmnopqrstuvwxyz1234567890":
        couch_queues[i] = PythonQueue(i + "_couch_queue")
        couch_worker = CouchWorker(couch_queues[i], myredis, mydao)
        couch_worker.spawn_and_loop()
        logger.info(
            u"launched backend couch worker with {i}_couch_queue".format(i=i))

    polling_interval = 0.1  # how many seconds between polling to talk to provider
    provider_queues = {}
    providers = ProviderFactory.get_providers(default_settings.PROVIDERS)
    for provider in providers:
        provider_queues[provider.provider_name] = PythonQueue(
            provider.provider_name + "_queue")
        provider_worker = ProviderWorker(
            provider, polling_interval, alias_queue,
            provider_queues[provider.provider_name], couch_queues,
            ProviderWorker.wrapper, myredis)
        provider_worker.spawn_and_loop()

    backend = Backend(alias_queue, provider_queues, couch_queues, myredis)
    try:
        backend.run_in_loop()  # don't need to spawn this one
    except (KeyboardInterrupt, SystemExit):
        # this approach is per http://stackoverflow.com/questions/2564137/python-how-to-terminate-a-thread-when-main-program-ends
        sys.exit()
Exemple #19
0
    def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
        """Decide which providers to run next, given the item's aliases and
        the alias providers that have already run.

        Returns a dict with keys "aliases", "biblio" and "metrics", each a
        (possibly empty) list of provider names.  For articles, pubmed runs
        before crossref; once both have run, biblio and metrics providers
        are released together.
        """
        # default to nothing
        aliases_providers = []
        biblio_providers = []
        metrics_providers = []

        all_metrics_providers = [provider.provider_name for provider in 
                        ProviderFactory.get_providers(provider_config, "metrics")]
        (genre, host) = ItemFactory.decide_genre(item_aliases)
        has_alias_urls = "url" in item_aliases

        if (genre == "article"):
            # PEP 8: "x not in y" instead of "not x in y"
            if "pubmed" not in aliases_providers_run:
                aliases_providers = ["pubmed"]
            elif "crossref" not in aliases_providers_run:
                aliases_providers = ["crossref"]
            else:
                metrics_providers = all_metrics_providers
                biblio_providers = ["pubmed", "crossref"]
        else:
            # relevant alias and biblio providers are always the same
            relevant_providers = [host]
            if relevant_providers == ["unknown"]:
                relevant_providers = ["webpage"]
            # if all the relevant providers have already run, then all the aliases are done
            # or if it already has urls
            if has_alias_urls or (set(relevant_providers) == set(aliases_providers_run)):
                metrics_providers = all_metrics_providers
                biblio_providers = relevant_providers
            else:
                aliases_providers = relevant_providers

        return {
            "aliases": aliases_providers,
            "biblio": biblio_providers,
            "metrics": metrics_providers}
 def test_get_providers_filters_by_biblio(self):
     """Only providers that implement biblio lookup survive the "biblio" filter."""
     filtered = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "biblio")
     names = set(p.__class__.__name__ for p in filtered)
     assert_equals(names, set(['Pubmed']))
 def test_09_get_providers(self):
     """get_providers builds exactly one provider per config entry."""
     providers = ProviderFactory.get_providers(self.provider_configs)
     # removed a dead trailing `pass` statement after the assert
     assert len(providers) == len(self.provider_configs)
Exemple #22
0
def main(logfile=None):
    """Start the legacy backend: alias worker threads plus per-provider
    metrics threads, then idle until SIGTERM or Ctrl-C.
    (Python 2: print statements and `except ..., e:` syntax.)"""

    logger = logging.getLogger()

    mydao = dao.Dao(
        app.config["DB_NAME"],
        app.config["DB_URL"],
        app.config["DB_USERNAME"],
        app.config["DB_PASSWORD"]
    ) 

    # Adding this by handle. fileConfig doesn't allow filters to be added
    from totalimpact.backend import ctxfilter
    handler = logging.handlers.RotatingFileHandler(logfile)
    handler.level = logging.DEBUG
    formatter = logging.Formatter("%(asctime)s %(levelname)8s %(item)8s %(thread)s%(provider)s - %(message)s")#,"%H:%M:%S,%f")
    handler.formatter = formatter
    handler.addFilter(ctxfilter)
    logger.addHandler(handler)
    ctxfilter.threadInit()

    logger.debug("test")

    from totalimpact.backend import TotalImpactBackend, ProviderMetricsThread, ProvidersAliasThread, StoppableThread, QueueConsumer
    from totalimpact.providers.provider import Provider, ProviderFactory

    # Start all of the backend processes
    print "Starting alias retrieval thread"
    providers = ProviderFactory.get_providers(app.config["PROVIDERS"])

    # one alias thread per configured worker slot
    alias_threads = []
    thread_count = app.config["ALIASES"]["workers"]
    for idx in range(thread_count):
        at = ProvidersAliasThread(providers, mydao, idx)
        at.thread_id = 'AliasThread(%i)' % idx
        at.start()
        alias_threads.append(at)

    print "Starting metric retrieval threads..."
    # Start each of the metric providers
    metrics_threads = []
    for provider in providers:
        # NOTE(review): reassigning `providers` inside this loop looks
        # redundant (it does not affect the in-progress iteration) -- confirm
        providers = ProviderFactory.get_providers(app.config["PROVIDERS"])
        thread_count = app.config["PROVIDERS"][provider.provider_name]["workers"]
        print "  ", provider.provider_name
        for idx in range(thread_count):
            thread = ProviderMetricsThread(provider, mydao)
            metrics_threads.append(thread)
            thread.thread_id = thread.thread_id + '(%i)' % idx
            thread.start()

    # Install a signal handler so we'll break out of the main loop
    # on receipt of relevant signals
    class ExitSignal(Exception):
        pass
 
    def kill_handler(signum, frame):
        raise ExitSignal()

    import signal
    signal.signal(signal.SIGTERM, kill_handler)

    # idle loop; worker threads do all the real work
    try:
        while True:
            time.sleep(1)
    except (KeyboardInterrupt, ExitSignal), e:
        pass
 def test_get_providers(self):
     """With no filter argument, every configured provider is instantiated."""
     found = set()
     for p in ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG):
         found.add(p.__class__.__name__)
     assert_equals(found, set(['Mendeley', 'Wikipedia', "Pubmed"]))
 def test_get_providers_filters_by_metrics(self):
     # since all the providers do metrics, "metrics" arg changes nought.
     filtered = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "metrics")
     names = set(p.__class__.__name__ for p in filtered)
     assert_equals(names, set(['Mendeley', 'Wikipedia', "Pubmed"]))
 def test_get_providers_filters_by_aliases(self):
     """The "aliases" filter keeps only providers able to look up aliases."""
     filtered = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG, "aliases")
     names = set(p.__class__.__name__ for p in filtered)
     assert_equals(names, set(['Pubmed', 'Mendeley']))