Beispiel #1
0
 def update_item_with_new_aliases(cls, alias_dict, item):
     """Merge ``alias_dict`` into ``item["aliases"]`` if it differs.

     Returns ``None`` when ``alias_dict`` equals the item's current aliases
     (nothing to update); otherwise stores the result of
     ``ItemFactory.merge_alias_dicts`` on the item and returns the item.
     ``cls`` is not used.
     """
     if alias_dict == item["aliases"]:
         return None

     item["aliases"] = ItemFactory.merge_alias_dicts(alias_dict, item["aliases"])
     return item
Beispiel #2
0
def collection_get(cid="", format="json"):
    """Build the HTTP response for a single collection.

    Aborts with 404 when ``mydao.get(cid)`` returns nothing, or when loading
    the collection's items raises ``LookupError``/``AttributeError``.

    When the ``include_items`` query argument is "0", "false" or "False" the
    stored collection document is returned as JSON (csv is refused with 405).
    Otherwise the collection is loaded together with its items and returned
    as csv or JSON depending on ``format``; the status code is 210 while
    something is still updating and 200 otherwise.
    """
    coll = mydao.get(cid)
    if not coll:
        abort(404)

    wants_csv = format == "csv"
    items_excluded = request.args.get("include_items") in ["0", "false", "False"]

    if items_excluded:
        # Without items the collection comes straight from couch; that form
        # cannot be exported as csv.
        if wants_csv:
            abort(405)  # method not supported
        else:
            resp = make_response(json.dumps(coll, sort_keys=True, indent=4), 200)
            resp.mimetype = "application/json"
    else:
        try:
            (coll_with_items, something_currently_updating) = collection.get_collection_with_items_for_client(
                cid, myrefsets, myredis, mydao
            )
        except (LookupError, AttributeError):
            logger.error("couldn't get tiids for collection '{cid}'".format(cid=cid))
            abort(404)  # not found

        # 210 signals that the update is not complete yet
        response_code = 210 if something_currently_updating else 200

        if wants_csv:
            # csv export always goes through the plain clean method (no
            # api-key arguments), so scopus is removed before exporting
            export_rows = [ItemFactory.clean_for_export(entry) for entry in coll_with_items["items"]]
            resp = make_response(collection.make_csv_stream(export_rows), response_code)
            resp.mimetype = "text/csv;charset=UTF-8"
            resp.headers.add("Content-Disposition", "attachment; filename=ti.csv")
            resp.headers.add("Content-Encoding", "UTF-8")
        else:
            api_key = request.args.get("api_key", None)
            coll_with_items["items"] = [
                ItemFactory.clean_for_export(entry, api_key, os.getenv("API_KEY"))
                for entry in coll_with_items["items"]
            ]
            body = json.dumps(coll_with_items, sort_keys=True, indent=4)
            resp = make_response(body, response_code)
            resp.mimetype = "application/json"
    return resp
def put_snaps_in_items():
    """Fold every queued ``metric_snap`` document into its item document.

    Pages through the ``queues/by_type_and_id`` view (500 rows per page)
    between the keys ``["metric_snap", "000000000"]`` and
    ``["metric_snap", "zzzzzzzzzzzzzzzzzzzzzzzz"]``. For each snap, the item
    named by ``snap["tiid"]`` is fetched; if it exists the snap is merged in
    with ``ItemFactory.add_snap_data``, ``last_modified`` becomes the later
    of the snap's ``created`` and the item's ``last_modified``, the item is
    saved and the snap is deleted. Missing items are logged and skipped.

    On ``couchdb.http.ResourceConflict`` the item and the snap are re-read
    from the database before retrying: retrying with the documents already
    in hand would resubmit the same stale revision and conflict forever.
    """
    # Function-scope imports, as in the original; hoisted out of the loop.
    from couch_paginator import CouchPaginator
    from totalimpact.models import ItemFactory

    logger.debug("running put_snaps_in_items() now.")
    starttime = time.time()
    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500

    start_key = ["metric_snap", "000000000"]
    end_key = ["metric_snap", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            if "metric_snap" not in row.key[0]:
                # not a metric_snap row, nothing to do
                continue
            row_count += 1
            snap = row.doc
            item = db.get(snap["tiid"])

            if not item:
                logger.warning("now on snap row %i, couldn't get item %s for snap %s" % 
                    (row_count, snap["tiid"], snap["_id"]))
                continue

            while True:
                try:
                    updated_item = ItemFactory.add_snap_data(item, snap)

                    # the proper last modified date is the most recent of the two
                    updated_item["last_modified"] = max(snap["created"], item["last_modified"])

                    logger.info("now on snap row %i, saving item %s back to db, deleting snap %s" % 
                        (row_count, updated_item["_id"], snap["_id"]))

                    db.save(updated_item)
                    db.delete(snap)
                    break
                except couchdb.http.ResourceConflict:
                    logger.warning("couch conflict.  trying again")
                    # Pick up the current revisions; the ones in hand are
                    # stale and would only conflict again.
                    item = db.get(snap["tiid"])
                    snap = db.get(snap["_id"])
                    if not item or not snap:
                        logger.warning("snap row %i: item or snap no longer in db, skipping" % row_count)
                        break

        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    logger.info("updated {rows} rows in {elapsed} seconds".format(
        rows=row_count, elapsed=round(time.time() - starttime)
    ))
Beispiel #4
0
 def queue(self):
     """Return the item for every id in ``self.queueids``, in that order.

     Each id is loaded with ``ItemFactory.get`` using ``self.dao``,
     ``ProviderFactory.get_provider`` and ``default_settings.PROVIDERS``.
     """
     items = []
     # The original loop bound ``id`` but passed the undefined name
     # ``item_id`` to ItemFactory.get, raising NameError on any non-empty
     # queue; one consistently named loop variable is used now.
     for queued_id in self.queueids:
         my_item = ItemFactory.get(self.dao, queued_id, ProviderFactory.get_provider, default_settings.PROVIDERS)
         items.append(my_item)
     return items
Beispiel #5
0
    def queue(self):
        """Load and return the item for each id in ``self.queueids``.

        The ids are read from ``self.queueids`` (the original comment notes
        this reads from json output because of a couchdb error; see the dao
        view) and each one is resolved through ``ItemFactory.get``.
        """
        return [
            ItemFactory.get(self.dao, queued_id, ProviderFactory.get_provider, default_settings.PROVIDERS)
            for queued_id in self.queueids
        ]
def get_collection_with_items_for_client(cid, myrefsets, myredis, mydao):
    """Load a collection and its items in client-ready form.

    Queries the ``collections_with_items/collections_with_items`` view for
    keys from ``[cid, 0]`` upwards. The first row's doc is the collection;
    its ``ip_address`` (if present) and ``alias_tiids`` keys are removed.
    Every later row's doc is passed through
    ``ItemFactory.build_item_for_client`` and, when the result is truthy,
    appended to ``collection["items"]``. Each kept item gets a
    ``currently_updating`` flag from ``ItemFactory.is_currently_updating``.

    Returns a ``(collection, something_currently_updating)`` tuple.

    NOTE(review): when building an item raises ``KeyError``/``TypeError``
    the failure is logged as "excluding it" but the exception is then
    re-raised, so the item is not actually skipped - confirm which is
    intended.
    """
    view_response = mydao.db.view(
        "collections_with_items/collections_with_items",
        include_docs=True,
        startkey=[cid, 0],
        endkey=[cid, "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"],
    )
    rows = view_response.rows

    # the first row is the collection document
    coll_doc = rows[0].doc
    coll_doc.pop("ip_address", None)
    del coll_doc["alias_tiids"]

    # every row after the first one is an item document
    coll_doc["items"] = []
    for row in rows[1:]:
        item_doc = row.doc
        try:
            item_for_client = ItemFactory.build_item_for_client(item_doc, myrefsets)
        except (KeyError, TypeError):
            logging.info(
                "Couldn't build item {item_doc}, excluding it from the returned collection {cid}".format(
                    item_doc=item_doc, cid=cid
                )
            )
            raise
        if item_for_client:
            coll_doc["items"].append(item_for_client)

    something_currently_updating = False
    for entry in coll_doc["items"]:
        entry["currently_updating"] = ItemFactory.is_currently_updating(entry["_id"], myredis)
        something_currently_updating = something_currently_updating or entry["currently_updating"]

    logging.info("Got items for collection %s" % cid)
    return (coll_doc, something_currently_updating)
    def checkDryad(self):
        """Check the aliases the Dryad provider returns for a known DOI.

        Builds an item with the doi alias ``10.5061/dryad.7898``, asks a
        ``Dryad`` provider for aliases and metrics, and verifies the url and
        title aliases via ``self.check_aliases``.
        """
        dryad_item = ItemFactory.make(self.mydao, app.config["PROVIDERS"])
        dryad_item.aliases.add_alias('doi', '10.5061/dryad.7898')
        alias_list = dryad_item.aliases.get_aliases_list()

        provider = Dryad()
        found_aliases = provider.aliases(alias_list)
        # metrics are requested as in the original, but the result is not checked
        provider.metrics(alias_list)

        expectations = (
            ('dryad.url', ("url", 'http://hdl.handle.net/10255/dryad.7898')),
            ('dryad.title', ("title", 'data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides')),
        )
        for label, expected_alias in expectations:
            self.check_aliases(label, found_aliases, expected_alias)
Beispiel #8
0
def make_item_dict(tiid):
    """Return the dict form of the item ``tiid`` for the /item and /items endpoints.

    The item is loaded with ``ItemFactory.get`` and converted with
    ``as_dict()``. A ``LookupError`` from either step aborts the request
    with 404.
    """
    try:
        found = ItemFactory.get(mydao, tiid, ProviderFactory.get_provider, app.config["PROVIDERS"])
        return found.as_dict()
    except LookupError:
        abort(404)
Beispiel #9
0
    def wrapper(cls, tiid, input_aliases_dict, provider, method_name, aliases_providers_run, callback):
        """Run one provider method for an item and hand the result to ``callback``.

        The provider method named ``method_name`` is called with the alias
        tuples built from ``input_aliases_dict``. A ``ProviderError`` is
        logged and treated as an empty (``None``) response.

        For ``method_name == "aliases"`` the provider name is added to
        ``aliases_providers_run`` and the response is the provider's aliases
        merged with the input aliases (or just the input aliases when the
        provider returned nothing). For any other method the provider's
        response is used as is.

        ``callback(tiid, response, method_name, aliases_providers_run)`` is
        invoked, the matching ``thread_count`` entry is removed if present,
        and the response is returned.
        """
        provider_name = provider.provider_name
        worker_name = provider_name+"_worker"

        input_alias_tuples = ItemFactory.alias_tuples_from_dict(input_aliases_dict)
        provider_method = getattr(provider, method_name)

        try:
            method_response = provider_method(input_alias_tuples)
        except ProviderError:
            method_response = None
            logger.info("{:20}: **ProviderError {tiid} {method_name} {provider_name} ".format(
                worker_name, tiid=tiid, provider_name=provider_name.upper(), method_name=method_name.upper()))

        if method_name != "aliases":
            response = method_response
        else:
            aliases_providers_run += [provider_name]
            if not method_response:
                response = input_aliases_dict
            else:
                # keep the old aliases alongside the newly found ones
                found_aliases = ItemFactory.alias_dict_from_tuples(method_response)
                response = ItemFactory.merge_alias_dicts(found_aliases, input_aliases_dict)

        logger.info("{:20}: RETURNED {tiid} {method_name} {provider_name} : {response}".format(
            worker_name, tiid=tiid, method_name=method_name.upper(), 
            provider_name=provider_name.upper(), response=response))

        callback(tiid, response, method_name, aliases_providers_run)

        try:
            del thread_count[provider_name][tiid+method_name]
        except KeyError:
            # no thread entry exists when wrapper is called from unit tests
            pass

        return response
    def checkWikipedia(self):
        """Check the Wikipedia mentions metric for a known DOI.

        Builds an item with the doi alias ``10.1371/journal.pcbi.1000361``
        and verifies ``wikipedia:mentions`` via ``self.check_metric``.
        Aliases are not requested from this provider.
        """
        wiki_item = ItemFactory.make(self.mydao, app.config["PROVIDERS"])
        wiki_item.aliases.add_alias("doi", "10.1371/journal.pcbi.1000361")
        alias_list = wiki_item.aliases.get_aliases_list()

        provider = Wikipedia()
        found_metrics = provider.metrics(alias_list)

        metric_name = 'wikipedia:mentions'
        self.check_metric(metric_name, found_metrics[metric_name], 1)
Beispiel #11
0
 def first(self):
     """Return a mock item, or ``None`` when none is available yet.

     The first calls, while ``self.none_count`` is below 3, bump the counter
     and return ``None``. After that, ``None`` is also returned when
     ``self.max_items`` is set and ``self.current_item`` exceeds it.
     Otherwise a new item is made with id ``self.current_item`` and the
     alias ``('mock', str(id))``, stored in ``self.items`` and returned.
     """
     if self.none_count < 3:
         self.none_count += 1
         return None

     if self.max_items and self.current_item > self.max_items:
         return None

     # Generate a mock item with initial alias ('mock', id)
     mock_item = ItemFactory.make()
     mock_item.id = self.current_item
     mock_item.aliases['mock'] = str(mock_item.id)
     self.items[self.current_item] = mock_item
     return mock_item
Beispiel #12
0
def get_item_from_tiid(tiid, format=None):
    """Return the item ``tiid`` as a JSON HTTP response.

    Aborts with 404 when ``ItemFactory.get_item`` raises ``LookupError`` or
    ``AttributeError``, or returns a falsy item. The item's
    ``currently_updating`` flag is set from
    ``ItemFactory.is_currently_updating``; the status code is 210 while the
    item is still updating and 200 otherwise. ``format`` is not used yet.
    """
    # TODO check request headers for format as well.
    try:
        item = ItemFactory.get_item(tiid, myrefsets, mydao)
    except (LookupError, AttributeError):
        abort(404)

    if not item:
        abort(404)

    still_updating = bool(ItemFactory.is_currently_updating(tiid, myredis))
    item["currently_updating"] = still_updating
    response_code = 210 if still_updating else 200  # 210: not complete yet

    payload = json.dumps(ItemFactory.clean_for_export(item), sort_keys=True, indent=4)
    resp = make_response(payload, response_code)
    resp.mimetype = "application/json"
    return resp
Beispiel #13
0
    def checkWikipedia(self):
        """Check the Wikipedia mentions metric for a known DOI.

        Builds a simple item with the doi alias
        ``10.1371/journal.pcbi.1000361`` and verifies ``wikipedia:mentions``
        via ``self.check_metric``. Aliases are not requested from this
        provider.
        """
        wiki_item = ItemFactory.make_simple(self.mydao)
        wiki_item.aliases.add_alias("doi", "10.1371/journal.pcbi.1000361")
        alias_list = wiki_item.aliases.get_aliases_list()

        provider = Wikipedia()
        found_metrics = provider.metrics(alias_list)

        metric_name = 'wikipedia:mentions'
        self.check_metric(metric_name, found_metrics[metric_name], 1)
Beispiel #14
0
    def checkDryad(self):
        """Check the aliases the Dryad provider returns for a known DOI.

        Builds a simple item with the doi alias ``10.5061/dryad.7898``, asks
        a ``Dryad`` provider for aliases and metrics, and verifies the url
        and title aliases via ``self.check_aliases``.
        """
        dryad_item = ItemFactory.make_simple(self.mydao)
        dryad_item.aliases.add_alias('doi', '10.5061/dryad.7898')
        alias_list = dryad_item.aliases.get_aliases_list()

        provider = Dryad()
        found_aliases = provider.aliases(alias_list)
        # metrics are requested as in the original, but the result is not checked
        provider.metrics(alias_list)

        expectations = (
            ('dryad.url', ("url", 'http://hdl.handle.net/10255/dryad.7898')),
            ('dryad.title', ("title", 'data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides')),
        )
        for label, expected_alias in expectations:
            self.check_aliases(label, found_aliases, expected_alias)
Beispiel #15
0
    def checkGithub(self):
        """Check the Github provider's member items and metrics for user egonw.

        Verifies the repositories returned by ``member_items("egonw")`` via
        ``self.check_members``, then checks the forks and watchers metrics
        for the alias ``egonw,gtd`` via ``self.check_metric``.
        """
        item = ItemFactory.make_simple(self.mydao)

        github = Github()
        members = github.member_items("egonw")

        # expected repositories, in the order the check expects them
        expected_repos = (
            'blueobelisk.debian',
            'ron',
            'pubchem-cdk',
            'org.openscience.cdk',
            'java-rdfa',
            'cdk',
            'RobotDF',
            'egonw.github.com',
            'knime-chemspider',
            'gtd',
            'cheminfbenchmark',
            'cdk-taverna',
            'groovy-jcp',
            'jnchem',
            'acsrdf2010',
            'Science-3.0',
            'SNORQL',
            'ctr-cdk-groovy',
            'CDKitty',
            'rednael',
            'de.ipbhalle.msbi',
            'collaborative.cheminformatics',
            'xws-taverna',
            'cheminformatics.classics',
            'chembl.rdf',
            'blueobelisk.userscript',
            'ojdcheck',
            'nmrshiftdb-rdf',
            'bioclipse.ons',
            'medea_bmc_article',
        )
        expected_members = [('github', ('egonw', repo)) for repo in expected_repos]
        self.check_members('github.github_user', members, expected_members)

        item.aliases.add_alias("github", "egonw,gtd")
        alias_list = item.aliases.get_aliases_list()

        found_metrics = github.metrics(alias_list)

        for metric_name, expected_value in (('github:forks', 0), ('github:watchers', 7)):
            self.check_metric(metric_name, found_metrics[metric_name], expected_value)
Beispiel #16
0
def create_item(namespace, nid):
    """Create, save and queue a new item with a single alias; return its id.

    A fresh item is made, the number of providers still to run is recorded
    in redis (so it is known later that the item is still updating), the
    alias ``namespace: [nid]`` is set, the item is saved through ``mydao``
    and added to the alias queue. Aborts with 500 if reading the id at the
    end raises ``AttributeError``.
    """
    alias = (namespace, nid)
    logger.debug("In create_item with alias" + str(alias))
    item = ItemFactory.make()

    # set this so we know when it's still updating later on
    providers_left = ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS)
    myredis.set_num_providers_left(item["_id"], providers_left)

    item["aliases"][namespace] = [nid]
    mydao.save(item)

    myredis.add_to_alias_queue(item["_id"], item["aliases"])

    logger.info("Created new item '{id}' with alias '{alias}'".format(id=item["_id"], alias=str(alias)))

    try:
        return item["_id"]
    except AttributeError:
        abort(500)
Beispiel #17
0
def create_item(namespace, id):
    """Create and save a new item with one alias; return its id.

    Utility function to keep single and multiple item creation endpoints
    DRY. Aborts with 500 if the saved item has no ``id`` attribute.

    FIXME (issue 86): this should first look up the namespace and id and,
    if a tiid already exists, return it with a 200. Right now a new item is
    made every time, creating many dupes.
    """
    new_item = ItemFactory.make(mydao, app.config["PROVIDERS"])
    new_item.aliases.add_alias(namespace, id)

    # Aliases are not filtered by whether the namespace can be processed,
    # since it may become processable soon. It's the user's job to not pass
    # in junk.
    new_item.save()

    try:
        return new_item.id
    except AttributeError:
        abort(500)
    def checkGithub(self):
        """Check the Github provider's member items and metrics for user egonw.

        Verifies the repositories returned by ``member_items("egonw")`` via
        ``self.check_members``, then checks the forks and watchers metrics
        for the alias ``egonw,gtd`` via ``self.check_metric``.
        """
        item = ItemFactory.make(self.mydao, app.config["PROVIDERS"])

        github = Github()
        members = github.member_items("egonw")

        # expected repositories, in the order the check expects them
        expected_repos = (
            'blueobelisk.debian',
            'ron',
            'pubchem-cdk',
            'org.openscience.cdk',
            'java-rdfa',
            'cdk',
            'RobotDF',
            'egonw.github.com',
            'knime-chemspider',
            'gtd',
            'cheminfbenchmark',
            'cdk-taverna',
            'groovy-jcp',
            'jnchem',
            'acsrdf2010',
            'Science-3.0',
            'SNORQL',
            'ctr-cdk-groovy',
            'CDKitty',
            'rednael',
            'de.ipbhalle.msbi',
            'collaborative.cheminformatics',
            'xws-taverna',
            'cheminformatics.classics',
            'chembl.rdf',
            'blueobelisk.userscript',
            'ojdcheck',
            'nmrshiftdb-rdf',
            'bioclipse.ons',
            'medea_bmc_article',
        )
        expected_members = [('github', ('egonw', repo)) for repo in expected_repos]
        self.check_members('github.github_user', members, expected_members)

        item.aliases.add_alias("github", "egonw,gtd")
        alias_list = item.aliases.get_aliases_list()

        found_metrics = github.metrics(alias_list)

        for metric_name, expected_value in (('github:forks', 0), ('github:watchers', 7)):
            self.check_metric(metric_name, found_metrics[metric_name], expected_value)
Beispiel #19
0
def create_or_find_items_from_aliases(clean_aliases):
    """Resolve each ``(namespace, nid)`` alias to a tiid, making items as needed.

    For an alias that ``get_tiid_by_alias`` already knows, the existing tiid
    is used. Otherwise a new item is made with that single alias; the item
    is collected (it is not saved here) and its ``_id`` is used as the tiid.

    Returns ``(tiids, items)``: one tiid per input alias, in input order,
    and the list of newly made items.
    """
    tiids = []
    new_items = []
    for alias in clean_aliases:
        namespace, nid = alias
        existing_tiid = get_tiid_by_alias(namespace, nid)

        if not existing_tiid:
            logger.debug("alias {alias} isn't in the db; making a new item for it.".format(alias=alias))
            fresh_item = ItemFactory.make()
            fresh_item["aliases"][namespace] = [nid]
            new_items.append(fresh_item)
            tiids.append(fresh_item["_id"])
            continue

        tiids.append(existing_tiid)
        logger.debug(
            "found an existing tiid ({tiid}) for alias {alias}".format(tiid=existing_tiid, alias=str(alias))
        )
    return (tiids, new_items)
Beispiel #20
0
    def metrics(self, 
            aliases,
            provider_url_template=None, # ignore this because multiple url steps
            cache_enabled=True):
        """Look up metrics for an item, trying doi, then pmid, then title.

        ``aliases`` is converted to a dict with
        ``ItemFactory.alias_dict_from_tuples``. The first doi, then the first
        pmid, is used to fetch a metrics page; if neither yields one, the
        title of the first biblio entry is used to find a uuid and the
        metrics page is fetched by uuid. A missing key at any step
        (``KeyError``) just moves on to the next step.

        Returns ``{}`` when no metrics page was found, otherwise the result
        of ``_get_metrics_and_drilldown_from_metrics_page``.
        ``provider_url_template`` and ``cache_enabled`` are not used here.
        """
        # Only lookup metrics for items with appropriate ids
        from totalimpact.models import ItemFactory
        aliases_dict = ItemFactory.alias_dict_from_tuples(aliases)

        metrics_page = None

        # try lookup by doi first, then by pmid
        id_lookups = (
            ("doi", "metrics_from_doi_template"),
            ("pmid", "metrics_from_pmid_template"),
        )
        for namespace, template_attr in id_lookups:
            if metrics_page:
                break
            try:
                metrics_page = self._get_metrics_lookup_page(getattr(self, template_attr), aliases_dict[namespace][0])
            except KeyError:
                pass

        # try lookup by title
        if not metrics_page:
            try:
                page = self._get_uuid_lookup_page(aliases_dict["biblio"][0]["title"])
                if page:
                    item_uuid = self._get_uuid_from_title(aliases_dict, page)
                    if not item_uuid:
                        logger.debug("Mendeley: couldn't find uuid for %s" %(aliases_dict["biblio"][0]["title"]))
                    else:
                        logger.debug("Mendeley: uuid is %s for %s" %(item_uuid, aliases_dict["biblio"][0]["title"]))
                        metrics_page = self._get_metrics_lookup_page(self.metrics_from_uuid_template, item_uuid)
            except KeyError:
                pass

        # give up!
        if not metrics_page:
            return {}

        return self._get_metrics_and_drilldown_from_metrics_page(metrics_page)
Beispiel #21
0
    def dequeue(self):
        """Return the next queued item this provider has not seen, or ``None``.

        Walks ``self.queueids`` and picks the first id whose
        ``(item_id, self.provider)`` pair is not yet in
        ``metric_queue_seen``, recording the pair as seen. The picked id is
        loaded with ``ItemFactory.get``; ``None`` is returned when every id
        has already been seen.
        """
        found = None

        # Synchronised section: recording that we have seen the item is what
        # takes it out of the queue. ``with`` guarantees the lock is released
        # even if something inside raises (the bare acquire/release pair
        # would have left it held).
        # NOTE(review): assumes metric_queue_lock supports the context
        # manager protocol, as threading locks do - confirm.
        with metric_queue_lock:
            for item_id in self.queueids:
                seen_key = (item_id, self.provider)
                if seen_key not in metric_queue_seen:
                    log.debug("found item %s" % item_id)
                    metric_queue_seen[seen_key] = True
                    found = item_id
                    break

        if found:
            # use the recorded id rather than the leaked loop variable
            return ItemFactory.get(self.dao, found, ProviderFactory.get_provider, default_settings.PROVIDERS)
        return None
Beispiel #22
0
    def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
        """Decide which providers should run next for an item.

        Returns a dict with the provider names to run for ``"aliases"``,
        ``"biblio"`` and ``"metrics"``.

        Articles run the pubmed aliases provider first, then crossref; once
        both are in ``aliases_providers_run``, all metrics providers run and
        biblio comes from pubmed and crossref. For any other genre the only
        relevant provider is the item's host (``webpage`` when the host is
        unknown): it runs for aliases until it has run or the item already
        has a url alias, after which all metrics providers run and it
        supplies biblio.
        """
        all_metrics_providers = [
            metrics_provider.provider_name
            for metrics_provider in ProviderFactory.get_providers(provider_config, "metrics")
        ]
        (genre, host) = ItemFactory.decide_genre(item_aliases)
        has_alias_urls = "url" in item_aliases

        # default to nothing
        to_run = {"aliases": [], "biblio": [], "metrics": []}

        if genre == "article":
            if "pubmed" not in aliases_providers_run:
                to_run["aliases"] = ["pubmed"]
            elif "crossref" not in aliases_providers_run:
                to_run["aliases"] = ["crossref"]
            else:
                to_run["metrics"] = all_metrics_providers
                to_run["biblio"] = ["pubmed", "crossref"]
            return to_run

        # relevant alias and biblio providers are always the same
        relevant_providers = ["webpage"] if host == "unknown" else [host]

        # the aliases are done if the item already has urls, or if all the
        # relevant providers have already run
        if has_alias_urls or (set(relevant_providers) == set(aliases_providers_run)):
            to_run["metrics"] = all_metrics_providers
            to_run["biblio"] = relevant_providers
        else:
            to_run["aliases"] = relevant_providers

        return to_run
Beispiel #23
0
 def update_item_with_new_metrics(cls, metric_name, metrics_method_response, item):
     """Return ``item`` with the metric response added.

     Delegates to ``ItemFactory.add_metrics_data``; ``cls`` is not used.
     """
     return ItemFactory.add_metrics_data(metric_name, metrics_method_response, item)