def import_products(provider_name, import_input):
    if provider_name in ["bibtex", "product_id_strings"]:
        logger.debug(u"in import_products with provider_name {provider_name}".format(
            provider_name=provider_name))
    else:
        logger.debug(u"in import_products with provider_name {provider_name}: {import_input}".format(
            provider_name=provider_name, import_input=import_input))

    aliases = []

    # pull in standard items, if we were passed any of these
    if provider_name == "product_id_strings":
        aliases = get_aliases_from_product_id_strings(import_input)
    elif provider_name == "bibtex":
        provider_module = ProviderFactory.get_provider("bibtex")
        aliases = provider_module.member_items(import_input)
    else:
        try:
            provider_module = ProviderFactory.get_provider(provider_name)
            aliases = provider_module.member_items(import_input)
        except ImportError:
            logger.debug(u"in import_products, got ImportError")

    # logger.debug(u"returning from import_products with aliases {aliases}".format(
    #     aliases=aliases))

    return aliases
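A minimal usage sketch (the inputs below are illustrative assumptions, not values confirmed by this snippet):

# aliases = import_products("bibtex", bibtex_string)         # parse a BibTeX string
# aliases = import_products("pubmed", u"some member query")  # any other provider name
# either way, aliases comes back as a list of alias tuples for building items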
    def setUp(self):
        self.config = None #placeholder
        self.TEST_PROVIDER_CONFIG = [
            ("wikipedia", {})
        ]
        self.d = None

        # set up the test redis database.  We're using DB number 8
        self.r = tiredis.from_url("redis://localhost:6379", db=8)
        self.r.flushdb()

        provider_queues = {}
        providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
        for provider in providers:
            provider_queues[provider.provider_name] = backend.PythonQueue(provider.provider_name+"_queue")

        self.b = backend.Backend(
            backend.RedisQueue("alias-unittest", self.r), 
            provider_queues, 
            [backend.PythonQueue("couch_queue")], 
            self.r)

        self.fake_item = {
            "_id": "1",
            "type": "item",
            "num_providers_still_updating":1,
            "aliases":{"pmid":["111"]},
            "biblio": {},
            "metrics": {},
            "last_modified": datetime.datetime(2013, 1, 1)
        }
        self.fake_aliases_dict = {"pmid":["222"]}
        self.tiid = "abcd"

        self.db = setup_postgres_for_unittests(db, app)
def provider_method_wrapper(tiid, provider, method_name):

    # logger.info(u"{:20}: in provider_method_wrapper with {tiid} {provider_name} {method_name} with {aliases}".format(
    #    "wrapper", tiid=tiid, provider_name=provider.provider_name, method_name=method_name, aliases=input_aliases_dict))


    product = Product.query.get(tiid)

    if not product:
        logger.warning(u"Empty product in provider_run for tiid {tiid}".format(
           tiid=tiid))
        return None

    input_alias_tuples = product.aliases_for_providers
    try:
        method = getattr(provider, method_name)
    except AttributeError:
        # provider may have been passed as a name string; resolve it to an object
        provider = ProviderFactory.get_provider(provider)
        method = getattr(provider, method_name)

    provider_name = provider.provider_name
    worker_name = provider_name+"_worker"

    try:
        method_response = method(input_alias_tuples)
    except ProviderError as e:
        method_response = None

        logger.info(u"{:20}: **ProviderError {tiid} {method_name} {provider_name}, Exception type {exception_type} {exception_arguments}".format(
            worker_name, 
            tiid=tiid, 
            provider_name=provider_name.upper(), 
            method_name=method_name.upper(), 
            exception_type=type(e).__name__, 
            exception_arguments=e.args))
 def test_get_providers_filters_by_aliases(self):
     providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG,
                                               "aliases")
     provider_names = [
         provider.__class__.__name__ for provider in providers
     ]
     assert_equals(set(provider_names), set(['Pubmed', 'Mendeley']))
 def test_get_providers(self):
     providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
     provider_names = [
         provider.__class__.__name__ for provider in providers
     ]
     assert_equals(set(provider_names),
                   set(['Mendeley', 'Wikipedia', "Pubmed"]))
    def setUp(self):
        self.config = None  #placeholder
        self.TEST_PROVIDER_CONFIG = [("wikipedia", {})]
        self.d = None

        # set up the test redis database, using the unittest database number
        self.r = tiredis.from_url("redis://localhost:6379",
                                  db=REDIS_UNITTEST_DATABASE_NUMBER)
        self.r.flushdb()

        provider_queues = {}
        providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
        for provider in providers:
            provider_queues[provider.provider_name] = backend.PythonQueue(
                provider.provider_name + "_queue")

        self.b = backend.Backend(backend.RedisQueue("alias-unittest",
                                                    self.r), provider_queues,
                                 [backend.PythonQueue("couch_queue")], self.r)

        self.fake_item = {
            "_id": "1",
            "type": "item",
            "num_providers_still_updating": 1,
            "aliases": {
                "pmid": ["111"]
            },
            "biblio": {},
            "metrics": {},
            "last_modified": datetime.datetime(2013, 1, 1)
        }
        self.fake_aliases_dict = {"pmid": ["222"]}
        self.tiid = "abcd"

        self.db = setup_postgres_for_unittests(db, app)
Example #7
def importer_post(provider_name):
    """
    Gets aliases associated with a query from a given provider.
    """
    input_string = request.json["input"]

    if provider_name == "pmids":
        provider_name = "pubmed"
    elif provider_name == "dois":
        provider_name = "crossref"
    elif provider_name == "urls":
        provider_name = "webpage"
    try:
        provider = ProviderFactory.get_provider(provider_name)
    except ImportError:
        abort_custom(404, "an importer for provider '{provider_name}' is not found".format(
            provider_name=provider_name))

    try:
        aliases = provider.member_items(input_string)
    except ProviderItemNotFoundError:
        abort_custom(404, "item not found")
    except (ProviderTimeout, ProviderServerError):
        abort_custom(503, "timeout error, might be transient")
    except ProviderError:
        abort(500, "internal error from provider")

    tiids_aliases_map = item_module.create_tiids_from_aliases(aliases, myredis)
    logger.debug(u"in provider_importer_get with {tiids_aliases_map}".format(
        tiids_aliases_map=tiids_aliases_map))

    products_dict = format_into_products_dict(tiids_aliases_map)

    resp = make_response(json.dumps({"products": products_dict}, sort_keys=True, indent=4), 200)
    return resp
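A hedged request/response sketch for this endpoint (the URL path and DOI are illustrative assumptions; only the handler body above is from the source):

# POST /importer/dois  with JSON body: {"input": "10.1371/journal.pone.0000308"}
# -> 200 with body {"products": {...}}, built from the tiid -> aliases map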
Example #8
def prep_collection_items(aliases):
    logger.info("got a list of aliases; creating new items for them.")
    try:
        # remove unprintable characters and change list to tuples
        clean_aliases = [(clean_id(namespace), clean_id(nid)) for [namespace, nid] in aliases]
    except ValueError:
        # log the raw input; clean_aliases is unbound if the comprehension failed
        logger.error(
            "bad input to POST /collection (requires [namespace, id] pairs):{input}".format(input=str(aliases))
        )
        abort(404, "POST /collection requires a list of [namespace, id] pairs.")

    logger.debug(
        "POST /collection got list of aliases; creating new items for {aliases}".format(aliases=str(clean_aliases))
    )

    (tiids, items) = create_or_find_items_from_aliases(clean_aliases)

    logger.debug("POST /collection saving a group of {num} new items: {items}".format(num=len(items), items=str(items)))

    # batch upload the new docs to the db
    # make sure they are there before the provider updates start happening
    for doc in mydao.db.update(items):
        pass

    # for each item, set the number of providers that need to run before the update is done
    # and put them on the update queue
    for item in items:
        myredis.set_num_providers_left(
            item["_id"], ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS)
        )
        myredis.add_to_alias_queue(item["_id"], item["aliases"])

    return tiids
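For reference, a hedged sketch of the expected input and result (values illustrative):

# tiids = prep_collection_items([["doi", "10.123/abc"], ["pmid", "12345"]])
# -> a list of tiids for the newly created (or found) items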
Example #9
def provider_memberitems(provider_name):
    """
    Starts a memberitems update for a specified provider, using a supplied file.

    Returns a hash of the file's contents, which is needed to get memberitems'
    output. To get output, poll GET /provider/<provider_name>/memberitems/<hash>?method=async
    """
    # logger.debug("Query POSTed to {provider_name}/memberitems with request headers '{headers}'".format(
    #    provider_name=provider_name,
    #    headers=request.headers
    # ))

    file = request.files["file"]
    logger.debug("In provider_memberitems got file")
    logger.debug("filename = " + file.filename)
    query = file.read().decode("utf-8")

    provider = ProviderFactory.get_provider(provider_name)
    memberitems = MemberItems(provider, myredis)
    query_hash = memberitems.start_update(query)

    response_dict = {"query_hash": query_hash}
    resp = make_response(json.dumps(response_dict), 201)  # created
    resp.mimetype = "application/json"
    resp.headers["Access-Control-Allow-Origin"] = "*"
    return resp
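A hedged sketch of the flow described in the docstring (provider name and hash value are illustrative):

# POST a file to /provider/bibtex/memberitems  -> 201, {"query_hash": "a1b2c3"}
# then poll GET /provider/bibtex/memberitems/a1b2c3?method=async for the output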
Example #10
def create_item(namespace, nid, myredis, mydao):
    logger.debug("In create_item with alias" + str((namespace, nid)))
    item = make()
    namespace = clean_id(namespace)
    nid = clean_id(nid)
    item["aliases"][namespace] = [nid]
    item["aliases"] = canonical_aliases(item["aliases"])

    # set this so we know when it's still updating later on
    myredis.set_num_providers_left(
        item["_id"],
        ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS)
    )

    mydao.save(item)

    myredis.add_to_alias_queue(item["_id"], item["aliases"])

    logger.info("Created new item '{id}' with alias '{alias}'".format(
        id=item["_id"],
        alias=str((namespace, nid))
    ))

    mixpanel.track("Create:Item", {"Namespace":namespace})

    try:
        return item["_id"]
    except (KeyError, TypeError):  # item may be missing or None
        abort(500)
 def test_get_providers_filters_by_biblio(self):
     providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG,
                                               "biblio")
     provider_names = [
         provider.__class__.__name__ for provider in providers
     ]
     assert_equals(set(provider_names), set(['Pubmed']))
Example #12
def collection_update(cid=""):

    # first, get the tiids in this collection:
    try:
        collection = mydao.get(cid)
        tiids = collection["alias_tiids"].values()
    except Exception:
        logger.exception("couldn't get tiids for collection '{cid}'".format(cid=cid))
        abort(404, "couldn't get tiids for this collection...maybe doesn't exist?")

    # put each of them on the update queue
    for tiid in tiids:
        logger.debug("In update_item with tiid " + tiid)

        # set this so we know when it's still updating later on
        myredis.set_num_providers_left(tiid, ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))

        item_doc = mydao.get(tiid)
        try:
            myredis.add_to_alias_queue(item_doc["_id"], item_doc["aliases"])
        except (KeyError, TypeError):
            logger.debug("couldn't get item_doc for {tiid}. Skipping its update".format(tiid=tiid))

    resp = make_response("true", 200)
    resp.mimetype = "application/json"
    return resp
    def setUp(self):
        self.config = None #placeholder
        self.TEST_PROVIDER_CONFIG = [
            ("wikipedia", {})
        ]
        # hacky way to delete the "ti" db, then make it fresh again for each test.
        temp_dao = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))
        temp_dao.delete_db(os.getenv("CLOUDANT_DB"))
        self.d = dao.Dao("http://localhost:5984", os.getenv("CLOUDANT_DB"))

        # do the same thing for the redis db, set up the test redis database.  We're using DB Number 8
        self.r = tiredis.from_url("redis://localhost:6379", db=8)
        self.r.flushdb()

        provider_queues = {}
        providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG)
        for provider in providers:
            provider_queues[provider.provider_name] = backend.PythonQueue(provider.provider_name+"_queue")

        self.b = backend.Backend(
            backend.RedisQueue("alias-unittest", self.r), 
            provider_queues, 
            [backend.PythonQueue("couch_queue")], 
            self.r)

        self.fake_item = {
            "_id": "1",
            "type": "item",
            "num_providers_still_updating":1,
            "aliases":{"pmid":["111"]},
            "biblio": {},
            "metrics": {}
        }
        self.fake_aliases_dict = {"pmid":["222"]}
        self.tiid = "abcd"
Example #14
def provider_run(aliases_dict, tiid, method_name, provider_name):

    provider = ProviderFactory.get_provider(provider_name)

    # logger.info(u"in provider_run for {provider}".format(
    #    provider=provider.provider_name))

    (success, estimated_wait_seconds) = rate.acquire(provider_name, block=False)
    # add up to 3 random seconds to spread requests out
    estimated_wait_seconds += random.random() * 3
    if not success:
        logger.warning(u"RATE LIMIT HIT in provider_run for {provider} {method_name} {tiid}, retrying".format(
           provider=provider.provider_name, method_name=method_name, tiid=tiid))
        raise provider_run.retry(args=[aliases_dict, tiid, method_name, provider_name],
                countdown=estimated_wait_seconds,
                max_retries=10)

    timeout_seconds = 30
    try:
        with timeout.Timeout(timeout_seconds):
            response = provider_method_wrapper(tiid, aliases_dict, provider, method_name)

    except timeout.Timeout:
        msg = u"TIMEOUT in provider_run for {provider} {method_name} {tiid} after {timeout_seconds} seconds".format(
           provider=provider.provider_name, method_name=method_name, tiid=tiid, timeout_seconds=timeout_seconds)
        # logger.warning(msg)  # message is written elsewhere
        raise ProviderTimeout(msg)

    return response
Example #15
def sniffer(item_aliases, provider_config=default_settings.PROVIDERS):

    (genre, host) = item_module.decide_genre(item_aliases)

    all_metrics_providers = [provider.provider_name for provider in 
                    ProviderFactory.get_providers(provider_config, "metrics")]

    if (genre == "article") and (host != "arxiv"):
        run = [[("aliases", provider)] for provider in ["mendeley", "crossref", "pubmed", "altmetric_com"]]
        run += [[("biblio", provider) for provider in ["crossref", "pubmed", "mendeley", "webpage"]]]
        run += [[("metrics", provider) for provider in all_metrics_providers]]
    elif (host == "arxiv") or ("doi" in item_aliases):
        run = [[("aliases", provider)] for provider in [host, "altmetric_com"]]
        run += [[("biblio", provider) for provider in [host, "mendeley"]]]
        run += [[("metrics", provider) for provider in all_metrics_providers]]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = [host]
        if relevant_providers == ["unknown"]:
            relevant_providers = ["webpage"]
        run = [[("aliases", provider)] for provider in relevant_providers]
        run += [[("biblio", provider) for provider in relevant_providers]]
        run += [[("metrics", provider) for provider in all_metrics_providers]]

    return run
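For orientation, a hedged sketch of the phase structure this returns (provider names illustrative): each aliases provider gets its own single-tuple phase, while the biblio and metrics providers are each grouped into one phase.

# run == [[("aliases", "mendeley")], [("aliases", "crossref")], ...,
#         [("biblio", "crossref"), ("biblio", "pubmed"), ...],
#         [("metrics", "wikipedia"), ("metrics", "mendeley"), ...]]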
Example #16
def get_metric_names(providers_config):
    full_metric_names = []
    providers = ProviderFactory.get_providers(providers_config)
    for provider in providers:
        metric_names = provider.metric_names()
        for metric_name in metric_names:
            full_metric_names.append(provider.provider_name + ':' + metric_name)
    return full_metric_names
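A hedged example of the output, assuming a config that enables wikipedia and pubmed (names follow the provider:metric pattern asserted in the tests below):

# get_metric_names([("wikipedia", {}), ("pubmed", {})])
# -> ["wikipedia:mentions", "pubmed:pmc_citations", ...]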
Example #17
 def test_03_init_aliases(self):
     providers = ProviderFactory.get_providers(self.config)
     pat = ProvidersAliasThread(providers, self.d)
     
     assert hasattr(pat, "stop")
     assert hasattr(pat, "stopped")
     assert hasattr(pat, "first")
     assert pat.queue is not None
Example #18
def get_metric_names(providers_config):
    full_metric_names = []
    providers = ProviderFactory.get_providers(providers_config)
    for provider in providers:
        metric_names = provider.metric_names()
        for metric_name in metric_names:
            full_metric_names.append(provider.provider_name + ':' + metric_name)
    return full_metric_names
Example #19
def providers():
    metadata = ProviderFactory.get_all_metadata()
    metadata_list = []
    for k, v in metadata.iteritems():
        v["name"] = k
        metadata_list.append(v)

    return json_resp_from_thing(metadata_list)
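A hedged sketch of the resulting JSON (fields illustrative; the pubmed url matches a test expectation later in this listing):

# -> [{"name": "pubmed", "url": "http://pubmed.gov", ...},
#     {"name": "wikipedia", ...}]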
 def test_get_providers_filters_by_metrics(self):
     # since all the providers do metrics, the "metrics" arg changes nothing.
     providers = ProviderFactory.get_providers(self.TEST_PROVIDER_CONFIG,
                                               "metrics")
     provider_names = [
         provider.__class__.__name__ for provider in providers
     ]
     assert_equals(set(provider_names),
                   set(['Mendeley', 'Wikipedia', "Pubmed"]))
Example #21
    def sniffer(cls,
                item_aliases,
                aliases_providers_run,
                provider_config=default_settings.PROVIDERS):
        # default to nothing
        aliases_providers = []
        biblio_providers = []
        metrics_providers = []

        all_metrics_providers = [
            provider.provider_name
            for provider in ProviderFactory.get_providers(
                provider_config, "metrics")
        ]
        (genre, host) = item_module.decide_genre(item_aliases)

        has_enough_alias_urls = ("url" in item_aliases)
        if has_enough_alias_urls:
            if ("doi" in item_aliases):
                has_enough_alias_urls = (len([
                    url for url in item_aliases["url"]
                    if url.startswith("http://dx.doi.org")
                ]) > 0)

        if (genre == "article"):
            if not "mendeley" in aliases_providers_run:
                aliases_providers = ["mendeley"]
            elif not "crossref" in aliases_providers_run:
                aliases_providers = [
                    "crossref"
                ]  # do this before pubmed because might tease doi from url
            elif not "pubmed" in aliases_providers_run:
                aliases_providers = ["pubmed"]
            else:
                metrics_providers = all_metrics_providers
                biblio_providers = ["crossref", "pubmed", "webpage"]
        else:
            # relevant alias and biblio providers are always the same
            relevant_providers = [host]
            if relevant_providers == ["unknown"]:
                relevant_providers = ["webpage"]
            # if all the relevant providers have already run, then all the aliases are done
            # or if it already has urls
            if has_enough_alias_urls or (set(relevant_providers)
                                         == set(aliases_providers_run)):
                metrics_providers = all_metrics_providers
                biblio_providers = relevant_providers
            else:
                aliases_providers = relevant_providers

        return ({
            "aliases": aliases_providers,
            "biblio": biblio_providers,
            "metrics": metrics_providers
        })
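A hedged sketch of one return value: for an article whose aliases providers have not run yet, the cascade above schedules only mendeley first.

# -> {"aliases": ["mendeley"], "biblio": [], "metrics": []}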
 def test_get_all_metric_names(self):
     response = ProviderFactory.get_all_metric_names(
         self.TEST_PROVIDER_CONFIG)
     expected = [
         'wikipedia:mentions', 'mendeley:country',
         'pubmed:pmc_citations_reviews', 'mendeley:discipline',
         'pubmed:f1000', 'mendeley:career_stage',
         'pubmed:pmc_citations_editorials', 'mendeley:readers',
         'pubmed:pmc_citations', 'mendeley:groups'
     ]
     assert_equals(response, expected)
Example #23
def provider_memberitems(provider_name):
    """
    Make a descr string (like bibtex) into a dict of strings describing items.
    """

    provider = ProviderFactory.get_provider(provider_name)
    items_dict = provider.parse(request.json["descr"])

    resp = make_response(
        json.dumps({"memberitems": items_dict}, sort_keys=True, indent=4), 200)
    return resp
Example #24
def provider_memberitems(provider_name):
    query = request.values.get('query','')

    logger.debug("In provider_memberitems with " + query)

    provider = ProviderFactory.get_provider(provider_name)
    logger.debug("provider: " + provider.provider_name)

    memberitems = provider.member_items(query, cache_enabled=False)
    
    resp = make_response(json.dumps(memberitems, sort_keys=True, indent=4), 200)
    resp.mimetype = "application/json"
    return resp
Example #25
def provider_biblio(provider_name, id):

    provider = ProviderFactory.get_provider(provider_name)
    if id=="example":
        id = provider.example_id[1]
        url = "http://localhost:8080/" + provider_name + "/biblio?%s"
    else:
        url = None

    biblio = provider.get_biblio_for_id(id, url, cache_enabled=False)
    resp = make_response(json.dumps(biblio, sort_keys=True, indent=4))
    resp.mimetype = "application/json"
    return resp
Example #26
def provider_memberitems(provider_name):
    """
    Make a descr string (like bibtex) into a dict of strings describing items.
    """

    provider = ProviderFactory.get_provider(provider_name)
    items_dict = provider.parse(request.json["descr"])

    resp = make_response(
        json.dumps({"memberitems": items_dict}, sort_keys=True, indent=4),
        200
    )
    return resp
    def test_alias_queue(self):
        self.d.create_new_db_and_connect(self.testing_db_name)

        providers = ProviderFactory.get_providers(self.app.config["PROVIDERS"])

        response = self.client.post('/item/doi/' + quote_plus(TEST_DRYAD_DOI))
        tiid = json.loads(response.data)


        # now get it back out
        response = self.client.get('/item/' + tiid)
        print tiid
        assert_equals(response.status_code, 200)
        
        resp_dict = json.loads(response.data)
        assert_equals(
            set(resp_dict.keys()),
            set([u'tiid', u'created', u'last_requested', u'metrics', 
                u'last_modified', u'biblio', u'id', u'aliases'])
            )
        assert_equals(unicode(TEST_DRYAD_DOI), resp_dict["aliases"]["doi"][0])

        # test the view works
        res = self.d.view("aliases")
        assert len(res["rows"]) == 1, res
        assert_equals(TEST_DRYAD_DOI, res["rows"][0]["value"]["aliases"]["doi"][0])

        # see if the item is on the queue
        my_alias_queue = AliasQueue(self.d)
        assert isinstance(my_alias_queue.queue, list)
        assert_equals(len(my_alias_queue.queue), 1)
        
        # get our item from the queue
        my_item = my_alias_queue.first()
        assert_equals(my_item.aliases.doi[0], TEST_DRYAD_DOI)

        # do the update using the backend
        alias_thread = ProvidersAliasThread(providers, self.d)
        alias_thread.run(run_only_once=True)

        # get the item back out again and bask in the awesome
        response = self.client.get('/item/' + tiid)
        resp_dict = json.loads(response.data)
        print tiid
        print response.data
        assert_equals(
            resp_dict["aliases"]["title"][0],
            "data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides"
            )
        print resp_dict
        assert_equals(resp_dict["biblio"]["data"]["year"], "2010")
    def setUp(self):
        #setup api test client
        self.app = api.app
        self.app.testing = True 
        self.client = self.app.test_client()

        # setup the database
        self.testing_db_name = "metrics_queue_test"
        self.old_db_name = self.app.config["DB_NAME"]
        self.app.config["DB_NAME"] = self.testing_db_name
        self.d = dao.Dao(self.testing_db_name, self.app.config["DB_URL"],
            self.app.config["DB_USERNAME"], self.app.config["DB_PASSWORD"])

        self.providers = ProviderFactory.get_providers(self.app.config["PROVIDERS"])
 def test_get_all_metric_names(self):
     response = ProviderFactory.get_all_metric_names(self.TEST_PROVIDER_CONFIG)
     expected = [
         "wikipedia:mentions",
         "mendeley:country",
         "pubmed:pmc_citations_reviews",
         "mendeley:discipline",
         "pubmed:f1000",
         "mendeley:career_stage",
         "pubmed:pmc_citations_editorials",
         "mendeley:readers",
         "pubmed:pmc_citations",
         "mendeley:groups",
     ]
     assert_equals(response, expected)
def get_metric_value_lists(items):
    (ordered_fieldnames, rows) = make_csv_rows(items)
    metric_values = {}
    for metric_name in ProviderFactory.get_all_metric_names():
        if metric_name in ordered_fieldnames:
            if metric_name in ["tiid", "title", "doi"]:
                pass
            else:
                values = [row[metric_name] for row in rows]
                values = [value if value else 0 for value in values]
                # treat "Yes" as 1 for normalizaations
                values = [1 if value == "Yes" else value for value in values]
                metric_values[metric_name] = sorted(values, reverse=True)
        else:
            metric_values[metric_name] = [0 for row in rows]
    return metric_values
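A hedged sketch of the output shape (metric names and values illustrative): one descending-sorted list of values per known metric name, with zeros where items have no value.

# -> {"wikipedia:mentions": [12, 3, 0], "mendeley:readers": [40, 7, 0], ...}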
Example #32
def rq_metrics_for_all_live_profiles(args):
    url_slug = args.get("url_slug", None)
    tiid = args.get("tiid", None)
    no_rq = args.get("no_rq", False)
    limit = args.get("limit", 5)
    if url_slug:
        limit = 1

    queue_number = 0

    q = db.session.query(Product.tiid).select_from(Profile)
    q = q.filter(Product.removed == None)
    q = q.join(Profile.products)
    if url_slug:
        q = q.filter(Profile.url_slug==url_slug)
    elif tiid:
        q = q.filter(Product.tiid==tiid)
    else:
        from totalimpactwebapp.profile import default_free_trial_days
        min_created_date = datetime.datetime.utcnow() - datetime.timedelta(days=default_free_trial_days)
        q = q.filter(or_(Profile.is_advisor!=None, Profile.stripe_id!=None, Profile.created>=min_created_date))
        # q = q.filter(Profile.next_refresh <= datetime.datetime.utcnow())
        q = q.order_by(Product.last_refresh_finished)  # oldest first
        q = q.limit(limit)
    print "q=", q

    all_metrics_provider_names = [p.provider_name for p in ProviderFactory.get_providers(default_settings.PROVIDERS, "metrics")]

    for (tiid,) in q.all():  # the query selects one column, so each row is a 1-tuple
        print "tiid", tiid

        for provider_name in all_metrics_provider_names:
            print "putting {} on rq queue to run metrics through {}".format(
                tiid, provider_name)
            if no_rq:
                print "asked for no-rq, so calling right now"
                provider_method_wrapper(tiid, provider_name, "metrics")
            else:
                job = ti_queues[queue_number].enqueue_call(
                    func=provider_method_wrapper,
                    args=(tiid, provider_name, "metrics"),
                    timeout=60 * 10,
                    result_ttl=0  # number of seconds
                    )
                job.save()
Example #33
def create_item(namespace, nid):
    logger.debug("In create_item with alias" + str((namespace, nid)))
    item = ItemFactory.make()

    # set this so we know when it's still updating later on
    myredis.set_num_providers_left(item["_id"], ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))

    item["aliases"][namespace] = [nid]
    mydao.save(item)

    myredis.add_to_alias_queue(item["_id"], item["aliases"])

    logger.info("Created new item '{id}' with alias '{alias}'".format(id=item["_id"], alias=str((namespace, nid))))

    try:
        return item["_id"]
    except (KeyError, TypeError):  # item may be missing or None
        abort(500)
Example #34
def provider_memberitems(provider_name):
    """
    Make a file into a dict of strings describing items.
    """

    mixpanel.track("Trigger:Import", {"Provider":provider_name}, request)

    file = request.files['file']
    logger.debug("In"+provider_name+"/memberitems, got file: filename="+file.filename)
    entries_str = file.read().decode("utf-8")

    provider = ProviderFactory.get_provider(provider_name)
    items_dict = provider.parse(entries_str)

    resp = make_response(json.dumps(items_dict), 200)
    resp.mimetype = "application/json"
    resp.headers['Access-Control-Allow-Origin'] = "*"
    return resp
Example #35
def main():

    mydao = None

    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"))
    alias_queue = RedisQueue("aliasqueue", myredis)
    # to clear alias_queue:
    #import redis, os
    #myredis = redis.from_url(os.getenv("REDISTOGO_URL"))
    #myredis.delete(["aliasqueue"])


    # these need to match the tiid alphabet defined in models:
    couch_queues = {}
    for i in "abcdefghijklmnopqrstuvwxyz1234567890":
        couch_queues[i] = PythonQueue(i+"_couch_queue")
        couch_worker = CouchWorker(couch_queues[i], myredis, mydao)
        couch_worker.spawn_and_loop() 
        logger.info(u"launched backend couch worker with {i}_couch_queue".format(
            i=i))


    polling_interval = 0.1   # seconds between polls to each provider
    provider_queues = {}
    providers = ProviderFactory.get_providers(default_settings.PROVIDERS)
    for provider in providers:
        provider_queues[provider.provider_name] = PythonQueue(provider.provider_name+"_queue")
        provider_worker = ProviderWorker(
            provider, 
            polling_interval, 
            alias_queue,
            provider_queues[provider.provider_name], 
            couch_queues,
            ProviderWorker.wrapper,
            myredis)
        provider_worker.spawn_and_loop()

    backend = Backend(alias_queue, provider_queues, couch_queues, myredis)
    try:
        backend.run_in_loop() # don't need to spawn this one
    except (KeyboardInterrupt, SystemExit): 
        # this approach is per http://stackoverflow.com/questions/2564137/python-how-to-terminate-a-thread-when-main-program-ends
        sys.exit()
Example #36
def provider_aliases(provider_name, id):

    provider = ProviderFactory.get_provider(provider_name)
    if id=="example":
        id = provider.example_id[1]
        url = "http://localhost:8080/" + provider_name + "/aliases?%s"
    else:
        url = None

    try:
        new_aliases = provider._get_aliases_for_id(id, url, cache_enabled=False)
    except NotImplementedError:
        new_aliases = []
        
    all_aliases = [(provider.example_id[0], id)] + new_aliases

    resp = make_response(json.dumps(all_aliases, sort_keys=True, indent=4))
    resp.mimetype = "application/json"
    return resp
Example #37
    def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
        # default to nothing
        aliases_providers = []
        biblio_providers = []
        metrics_providers = []

        all_metrics_providers = [provider.provider_name for provider in 
                        ProviderFactory.get_providers(provider_config, "metrics")]
        (genre, host) = item_module.decide_genre(item_aliases)

        has_enough_alias_urls = ("url" in item_aliases)
        if has_enough_alias_urls:
            if ("doi" in item_aliases):
                has_enough_alias_urls = (len([url for url in item_aliases["url"] if url.startswith("http://dx.doi.org")]) > 0)

        if (genre == "article"):
            if not "mendeley" in aliases_providers_run:
                aliases_providers = ["mendeley"]
            elif not "crossref" in aliases_providers_run:
                aliases_providers = ["crossref"]  # do this before pubmed because might tease doi from url
            elif not "pubmed" in aliases_providers_run:
                aliases_providers = ["pubmed"]
            else:
                metrics_providers = all_metrics_providers
                biblio_providers = ["crossref", "pubmed", "webpage"]
        else:
            # relevant alias and biblio providers are always the same
            relevant_providers = [host]
            if relevant_providers == ["unknown"]:
                relevant_providers = ["webpage"]
            # if all the relevant providers have already run, then all the aliases are done
            # or if it already has urls
            if has_enough_alias_urls or (set(relevant_providers) == set(aliases_providers_run)):
                metrics_providers = all_metrics_providers
                biblio_providers = relevant_providers
            else:
                aliases_providers = relevant_providers

        return {
            "aliases": aliases_providers,
            "biblio": biblio_providers,
            "metrics": metrics_providers}
Example #38
def start_item_update(tiids, myredis, mydao, sleep_in_seconds=0):
    # put each of them on the update queue
    for tiid in tiids:
        logger.debug("In start_item_update with tiid " + tiid)

        # set this so we know when it's still updating later on
        myredis.set_num_providers_left(
            tiid,
            ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS)
        )

        item_doc = mydao.get(tiid)
        try:
            myredis.add_to_alias_queue(item_doc["_id"], item_doc["aliases"])
        except (KeyError, TypeError):
            logger.debug("couldn't get item_doc for {tiid}. Skipping its update".format(
                tiid=tiid))

        time.sleep(sleep_in_seconds)
Example #39
def provider_memberitems_get(provider_name, query):

    provider = ProviderFactory.get_provider(provider_name)
    memberitems = MemberItems(provider, myredis)
    method = request.args.get("method", "sync")

    try:
        ret = getattr(memberitems, "get_" + method)(query)
    except ProviderItemNotFoundError:
        abort(404)
    except ProviderError:
        abort(500)

    if ret and ret.get("error"):
        abort(503)  # crossref lookup error, might be transient

    resp = make_response(json.dumps(ret, sort_keys=True, indent=4), 200)
    resp.mimetype = "application/json"
    resp.headers["Access-Control-Allow-Origin"] = "*"
    return resp
Example #40
def provider_memberitems_get(provider_name, query):
    """
    Gets aliases associated with a query from a given provider.
    """
    query = unicode_helpers.remove_nonprinting_characters(query)
    provider = ProviderFactory.get_provider(provider_name)

    try:
        items_dict = provider.member_items(query)

    except ProviderItemNotFoundError:
        abort_custom(404, "item not found")

    except (ProviderTimeout, ProviderServerError):
        abort_custom(503, "crossref lookup error, might be transient")

    except ProviderError:
        abort(500, "internal error from provider")

    resp = make_response(
        json.dumps({"memberitems": items_dict}, sort_keys=True, indent=4), 200)
    return resp
Example #41
def main():

    mydao = None

    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"))
    alias_queue = RedisQueue("aliasqueue", myredis)
    # to clear alias_queue:
    #import redis, os
    #myredis = redis.from_url(os.getenv("REDISTOGO_URL"))
    #myredis.delete(["aliasqueue"])

    # these need to match the tiid alphabet defined in models:
    couch_queues = {}
    for i in "abcdefghijklmnopqrstuvwxyz1234567890":
        couch_queues[i] = PythonQueue(i + "_couch_queue")
        couch_worker = CouchWorker(couch_queues[i], myredis, mydao)
        couch_worker.spawn_and_loop()
        logger.info(
            u"launched backend couch worker with {i}_couch_queue".format(i=i))

    polling_interval = 0.1  # seconds between polls to each provider
    provider_queues = {}
    providers = ProviderFactory.get_providers(default_settings.PROVIDERS)
    for provider in providers:
        provider_queues[provider.provider_name] = PythonQueue(
            provider.provider_name + "_queue")
        provider_worker = ProviderWorker(
            provider, polling_interval, alias_queue,
            provider_queues[provider.provider_name], couch_queues,
            ProviderWorker.wrapper, myredis)
        provider_worker.spawn_and_loop()

    backend = Backend(alias_queue, provider_queues, couch_queues, myredis)
    try:
        backend.run_in_loop()  # don't need to spawn this one
    except (KeyboardInterrupt, SystemExit):
        # this approach is per http://stackoverflow.com/questions/2564137/python-how-to-terminate-a-thread-when-main-program-ends
        sys.exit()
Example #42
def importer_post(provider_name):
    """
    Gets aliases associated with a query from a given provider.
    """
    input_string = request.json["input"]

    if provider_name == "pmids":
        provider_name = "pubmed"
    elif provider_name == "dois":
        provider_name = "crossref"
    elif provider_name == "urls":
        provider_name = "webpage"
    try:
        provider = ProviderFactory.get_provider(provider_name)
    except ImportError:
        abort_custom(
            404,
            "no importer found for provider '{provider_name}'".format(
                provider_name=provider_name))

    try:
        aliases = provider.member_items(input_string)
    except ProviderItemNotFoundError:
        abort_custom(404, "item not found")
    except (ProviderTimeout, ProviderServerError):
        abort_custom(503, "timeout error, might be transient")
    except ProviderError:
        abort(500, "internal error from provider")

    tiids_aliases_map = item_module.create_tiids_from_aliases(aliases, myredis)
    logger.debug(u"in provider_importer_get with {tiids_aliases_map}".format(
        tiids_aliases_map=tiids_aliases_map))

    products_dict = format_into_products_dict(tiids_aliases_map)

    resp = make_response(
        json.dumps({"products": products_dict}, sort_keys=True, indent=4), 200)
    return resp
Example #43
def provider_memberitems_get(provider_name, query):
    """
    Gets aliases associated with a query from a given provider.
    """

    mixpanel.track("Trigger:Import", {"Provider":provider_name}, request)

    try:
        provider = ProviderFactory.get_provider(provider_name)
        ret = provider.member_items(query)
    except ProviderItemNotFoundError:
        abort(404)
    except (ProviderTimeout, ProviderServerError):
        abort(503)  # crossref lookup error, might be transient
    except ProviderError:
        abort(500)

    resp = make_response(
        json.dumps({"memberitems":ret}, sort_keys=True, indent=4),
        200
    )
    resp.mimetype = "application/json"
    resp.headers['Access-Control-Allow-Origin'] = "*"
    return resp
Example #44
 def setUp(self):
     self.provider = ProviderFactory.get_provider(self.provider_name)
     self.old_http_get = Provider.http_get
Example #45
def start_item_update(tiid, aliases_dict, myredis):
    logger.debug(u"In start_item_update with {tiid}, /biblio_print {aliases_dict}".format(
        tiid=tiid, aliases_dict=aliases_dict))
    myredis.set_num_providers_left(tiid,
        ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))
    myredis.add_to_alias_queue(tiid, aliases_dict)
Example #46
def provider():
    ret = ProviderFactory.get_all_metadata()
    resp = make_response(json.dumps(ret, sort_keys=True, indent=4), 200)

    return resp
Example #47
from totalimpact import json_sqlalchemy
from totalimpact import db


# Master lock to ensure that only a single thread can write
# to the DB at one time to avoid document conflicts

import logging
logger = logging.getLogger('ti.item')

# print out extra debugging
#logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)


all_static_meta = ProviderFactory.get_all_static_meta()



class NotAuthenticatedError(Exception):
    pass

def delete_item(tiid):
    item_object = Item.from_tiid(tiid)
    db.session.delete(item_object)
    try:
        db.session.commit()
    except (IntegrityError, FlushError) as e:
        db.session.rollback()
        logger.warning(u"Fails Integrity check in delete_item for {tiid}, rolling back.  Message: {message}".format(
            tiid=tiid, 
 def test_get_all_metadata(self):
     md = ProviderFactory.get_all_metadata(self.TEST_PROVIDER_CONFIG)
     print md["pubmed"]
     assert_equals(md["pubmed"]['url'], 'http://pubmed.gov')
 def test_get_all_static_meta(self):
     sm = ProviderFactory.get_all_static_meta(self.TEST_PROVIDER_CONFIG)
     expected = 'The number of citations by papers in PubMed Central'
     assert_equals(sm["pubmed:pmc_citations"]["description"], expected)
 def test_get_provider(self):
     provider = ProviderFactory.get_provider("wikipedia")
     assert_equals(provider.__class__.__name__, "Wikipedia")