def update_item_with_new_aliases(cls, alias_dict, item):
    if alias_dict == item["aliases"]:
        item = None
    else:
        merged_aliases = ItemFactory.merge_alias_dicts(alias_dict, item["aliases"])
        item["aliases"] = merged_aliases
    return item
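# Editor's note: a minimal usage sketch (not from the original source) illustrating the return
# contract above: identical aliases signal "nothing to save" by returning None, anything new comes
# back merged into the item. Assumes ItemFactory.merge_alias_dicts unions the two dicts and that
# the class is importable from totalimpact.models, as it is elsewhere in this file.
def _example_update_item_with_new_aliases():
    from totalimpact.models import ItemFactory
    item = {"aliases": {"doi": ["10.5061/dryad.7898"]}}
    # same aliases as the item already has: None means no merge and no save needed
    assert ItemFactory.update_item_with_new_aliases({"doi": ["10.5061/dryad.7898"]}, item) is None
    # a new namespace gets merged into the item's existing aliases
    updated = ItemFactory.update_item_with_new_aliases({"url": ["http://example.org"]}, item)
    assert "url" in updated["aliases"] and "doi" in updated["aliases"]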
def collection_get(cid="", format="json"):
    coll = mydao.get(cid)
    if not coll:
        abort(404)

    # if not include items, then just return the collection straight from couch
    if request.args.get("include_items") in ["0", "false", "False"]:
        # except if format is csv. can't do that.
        if format == "csv":
            abort(405)  # method not supported
        else:
            response_code = 200
            resp = make_response(json.dumps(coll, sort_keys=True, indent=4), response_code)
            resp.mimetype = "application/json"
    else:
        try:
            (coll_with_items, something_currently_updating) = collection.get_collection_with_items_for_client(
                cid, myrefsets, myredis, mydao)
        except (LookupError, AttributeError):
            logger.error("couldn't get tiids for collection '{cid}'".format(cid=cid))
            abort(404)  # not found

        # return success if all reporting is complete for all items
        if something_currently_updating:
            response_code = 210  # update is not complete yet
        else:
            response_code = 200

        if format == "csv":
            # remove scopus before exporting to csv, so don't add magic keep-scopus keys to clean method
            clean_items = [ItemFactory.clean_for_export(item) for item in coll_with_items["items"]]
            csv = collection.make_csv_stream(clean_items)
            resp = make_response(csv, response_code)
            resp.mimetype = "text/csv;charset=UTF-8"
            resp.headers.add("Content-Disposition", "attachment; filename=ti.csv")
            resp.headers.add("Content-Encoding", "UTF-8")
        else:
            api_key = request.args.get("api_key", None)
            clean_if_necessary_items = [ItemFactory.clean_for_export(item, api_key, os.getenv("API_KEY"))
                for item in coll_with_items["items"]]
            coll_with_items["items"] = clean_if_necessary_items
            resp = make_response(json.dumps(coll_with_items, sort_keys=True, indent=4), response_code)
            resp.mimetype = "application/json"

    return resp
def put_snaps_in_items():
    logger.debug("running put_snaps_in_items() now.")
    starttime = time.time()

    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500

    start_key = ["metric_snap", "000000000"]
    end_key = ["metric_snap", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    #for row in view_rows[startkey:endkey]:
    while page:
        for row in page:
            if not "metric_snap" in row.key[0]:
                #print "not a metric_snap so skipping", row.key
                continue
            #print row.key
            row_count += 1
            snap = row.doc
            item = db.get(snap["tiid"])
            if item:
                saving = True
                while saving:
                    try:
                        from totalimpact.models import ItemFactory
                        updated_item = ItemFactory.add_snap_data(item, snap)
                        # to decide the proper last modified date
                        snap_last_modified = snap["created"]
                        item_last_modified = item["last_modified"]
                        updated_item["last_modified"] = max(snap_last_modified, item_last_modified)
                        logger.info("now on snap row %i, saving item %s back to db, deleting snap %s"
                            % (row_count, updated_item["_id"], snap["_id"]))
                        db.save(updated_item)
                        db.delete(snap)
                        saving = False
                    except couchdb.http.ResourceConflict:
                        logger.warning("couch conflict. trying again")
                        pass
            else:
                logger.warning("now on snap row %i, couldn't get item %s for snap %s"
                    % (row_count, snap["tiid"], snap["_id"]))

        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    logger.info("updated {rows} rows in {elapsed} seconds".format(
        rows=row_count, elapsed=round(time.time() - starttime)))
def queue(self):
    res = self.queueids
    items = []
    # using reversed() as a hack...we actually want to use the couchdb
    # descending=true param to get the oldest stuff first, but
    for id in res:
        my_item = ItemFactory.get(self.dao,
            id,
            ProviderFactory.get_provider,
            default_settings.PROVIDERS)
        items.append(my_item)

    return items
def queue(self):
    # due to error in couchdb this reads from json output - see dao view
    res = self.queueids
    items = []
    for id in res:
        my_item = ItemFactory.get(self.dao,
            id,
            ProviderFactory.get_provider,
            default_settings.PROVIDERS)
        items.append(my_item)

    return items
def get_collection_with_items_for_client(cid, myrefsets, myredis, mydao):
    startkey = [cid, 0]
    endkey = [cid, "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"]
    view_response = mydao.db.view("collections_with_items/collections_with_items",
        include_docs=True,
        startkey=startkey,
        endkey=endkey)

    # the first row is the collection document
    first_row = view_response.rows[0]
    collection = first_row.doc
    try:
        del collection["ip_address"]
    except KeyError:
        pass
    del collection["alias_tiids"]

    # start with the 2nd row, since 1st row is the collection document
    collection["items"] = []
    if len(view_response.rows) > 1:
        for row in view_response.rows[1:]:
            item_doc = row.doc
            try:
                item_for_client = ItemFactory.build_item_for_client(item_doc, myrefsets)
            except (KeyError, TypeError):
                logging.info("Couldn't build item {item_doc}, excluding it from the returned collection {cid}".format(
                    item_doc=item_doc, cid=cid))
                item_for_client = None
                raise
            if item_for_client:
                collection["items"] += [item_for_client]

    something_currently_updating = False
    for item in collection["items"]:
        item["currently_updating"] = ItemFactory.is_currently_updating(item["_id"], myredis)
        something_currently_updating = something_currently_updating or item["currently_updating"]

    logging.info("Got items for collection %s" % cid)
    # print json.dumps(collection, sort_keys=True, indent=4)
    return (collection, something_currently_updating)
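# Editor's note: hypothetical caller sketch (not from the original source) showing how the
# (collection, something_currently_updating) tuple above maps onto the 200/210 response codes
# used by collection_get() earlier in this file; cid, myrefsets, myredis and mydao are the same
# module-level objects assumed by the surrounding code.
def _example_collection_status(cid):
    (coll, still_updating) = get_collection_with_items_for_client(cid, myrefsets, myredis, mydao)
    response_code = 210 if still_updating else 200  # 210 = some items still being updated
    return (coll, response_code)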
def checkDryad(self):
    # Test reading data from Dryad
    item = ItemFactory.make(self.mydao, app.config["PROVIDERS"])
    item.aliases.add_alias('doi', '10.5061/dryad.7898')
    item_aliases_list = item.aliases.get_aliases_list()

    dryad = Dryad()
    new_aliases = dryad.aliases(item_aliases_list)
    new_metrics = dryad.metrics(item_aliases_list)

    self.check_aliases('dryad.url', new_aliases, ("url", 'http://hdl.handle.net/10255/dryad.7898'))
    self.check_aliases('dryad.title', new_aliases,
        ("title", 'data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides'))
def make_item_dict(tiid):
    '''Utility function for /item and /items endpoints

    Will cause the request to abort with 404 if item is missing from db'''
    try:
        item = ItemFactory.get(mydao, tiid, ProviderFactory.get_provider, app.config["PROVIDERS"])
        item_dict = item.as_dict()
    except LookupError:
        abort(404)

    return item_dict
def wrapper(cls, tiid, input_aliases_dict, provider, method_name, aliases_providers_run, callback):
    #logger.info("{:20}: **Starting {tiid} {provider_name} {method_name} with {aliases}".format(
    #    "wrapper", tiid=tiid, provider_name=provider.provider_name, method_name=method_name, aliases=aliases))

    provider_name = provider.provider_name
    worker_name = provider_name + "_worker"

    input_alias_tuples = ItemFactory.alias_tuples_from_dict(input_aliases_dict)
    method = getattr(provider, method_name)

    try:
        method_response = method(input_alias_tuples)
    except ProviderError:
        method_response = None
        logger.info("{:20}: **ProviderError {tiid} {method_name} {provider_name} ".format(
            worker_name, tiid=tiid, provider_name=provider_name.upper(), method_name=method_name.upper()))

    if method_name == "aliases":
        # update aliases to include the old ones too
        aliases_providers_run += [provider_name]
        if method_response:
            new_aliases_dict = ItemFactory.alias_dict_from_tuples(method_response)
            response = ItemFactory.merge_alias_dicts(new_aliases_dict, input_aliases_dict)
        else:
            response = input_aliases_dict
    else:
        response = method_response

    logger.info("{:20}: RETURNED {tiid} {method_name} {provider_name} : {response}".format(
        worker_name, tiid=tiid, method_name=method_name.upper(), provider_name=provider_name.upper(),
        response=response))

    callback(tiid, response, method_name, aliases_providers_run)

    try:
        del thread_count[provider_name][tiid + method_name]
    except KeyError:  # thread isn't there when we call wrapper in unit tests
        pass

    return response
def checkWikipedia(self):
    # Test reading data from Wikipedia
    item = ItemFactory.make(self.mydao, app.config["PROVIDERS"])
    item.aliases.add_alias("doi", "10.1371/journal.pcbi.1000361")
    #item.aliases.add_alias("url", "http://cottagelabs.com")
    item_aliases_list = item.aliases.get_aliases_list()

    wikipedia = Wikipedia()
    # No aliases for wikipedia
    #new_aliases = wikipedia.aliases(item_aliases_list)
    new_metrics = wikipedia.metrics(item_aliases_list)

    self.check_metric('wikipedia:mentions', new_metrics['wikipedia:mentions'], 1)
def first(self):
    if self.none_count >= 3:
        if self.max_items:
            if self.current_item > self.max_items:
                return None
        # Generate a mock item with initial alias ('mock', id)
        item = ItemFactory.make()
        item.id = self.current_item
        item.aliases['mock'] = str(item.id)
        self.items[self.current_item] = item
        return item
    else:
        self.none_count += 1
        return None
def get_item_from_tiid(tiid, format=None):
    # TODO check request headers for format as well.

    try:
        item = ItemFactory.get_item(tiid, myrefsets, mydao)
    except (LookupError, AttributeError):
        abort(404)
    if not item:
        abort(404)

    if ItemFactory.is_currently_updating(tiid, myredis):
        response_code = 210  # not complete yet
        item["currently_updating"] = True
    else:
        response_code = 200
        item["currently_updating"] = False

    clean_item = ItemFactory.clean_for_export(item)
    resp = make_response(json.dumps(clean_item, sort_keys=True, indent=4), response_code)
    resp.mimetype = "application/json"

    return resp
def checkWikipedia(self):
    # Test reading data from Wikipedia
    item = ItemFactory.make_simple(self.mydao)
    item.aliases.add_alias("doi", "10.1371/journal.pcbi.1000361")
    #item.aliases.add_alias("url", "http://cottagelabs.com")
    item_aliases_list = item.aliases.get_aliases_list()

    wikipedia = Wikipedia()
    # No aliases for wikipedia
    #new_aliases = wikipedia.aliases(item_aliases_list)
    new_metrics = wikipedia.metrics(item_aliases_list)

    self.check_metric('wikipedia:mentions', new_metrics['wikipedia:mentions'], 1)
def checkDryad(self):
    # Test reading data from Dryad
    item = ItemFactory.make_simple(self.mydao)
    item.aliases.add_alias('doi', '10.5061/dryad.7898')
    item_aliases_list = item.aliases.get_aliases_list()

    dryad = Dryad()
    new_aliases = dryad.aliases(item_aliases_list)
    new_metrics = dryad.metrics(item_aliases_list)

    self.check_aliases('dryad.url', new_aliases, ("url", 'http://hdl.handle.net/10255/dryad.7898'))
    self.check_aliases('dryad.title', new_aliases,
        ("title", 'data from: can clone size serve as a proxy for clone age? an exploration using microsatellite divergence in populus tremuloides'))
def checkGithub(self):
    item = ItemFactory.make_simple(self.mydao)

    github = Github()
    members = github.member_items("egonw")
    self.check_members('github.github_user', members,
        [('github', ('egonw', 'blueobelisk.debian')),
         ('github', ('egonw', 'ron')),
         ('github', ('egonw', 'pubchem-cdk')),
         ('github', ('egonw', 'org.openscience.cdk')),
         ('github', ('egonw', 'java-rdfa')),
         ('github', ('egonw', 'cdk')),
         ('github', ('egonw', 'RobotDF')),
         ('github', ('egonw', 'egonw.github.com')),
         ('github', ('egonw', 'knime-chemspider')),
         ('github', ('egonw', 'gtd')),
         ('github', ('egonw', 'cheminfbenchmark')),
         ('github', ('egonw', 'cdk-taverna')),
         ('github', ('egonw', 'groovy-jcp')),
         ('github', ('egonw', 'jnchem')),
         ('github', ('egonw', 'acsrdf2010')),
         ('github', ('egonw', 'Science-3.0')),
         ('github', ('egonw', 'SNORQL')),
         ('github', ('egonw', 'ctr-cdk-groovy')),
         ('github', ('egonw', 'CDKitty')),
         ('github', ('egonw', 'rednael')),
         ('github', ('egonw', 'de.ipbhalle.msbi')),
         ('github', ('egonw', 'collaborative.cheminformatics')),
         ('github', ('egonw', 'xws-taverna')),
         ('github', ('egonw', 'cheminformatics.classics')),
         ('github', ('egonw', 'chembl.rdf')),
         ('github', ('egonw', 'blueobelisk.userscript')),
         ('github', ('egonw', 'ojdcheck')),
         ('github', ('egonw', 'nmrshiftdb-rdf')),
         ('github', ('egonw', 'bioclipse.ons')),
         ('github', ('egonw', 'medea_bmc_article'))])

    item.aliases.add_alias("github", "egonw,gtd")
    item_aliases_list = item.aliases.get_aliases_list()
    new_metrics = github.metrics(item_aliases_list)

    self.check_metric('github:forks', new_metrics['github:forks'], 0)
    self.check_metric('github:watchers', new_metrics['github:watchers'], 7)
def create_item(namespace, nid):
    logger.debug("In create_item with alias " + str((namespace, nid)))
    item = ItemFactory.make()

    # set this so we know when it's still updating later on
    myredis.set_num_providers_left(item["_id"],
        ProviderFactory.num_providers_with_metrics(default_settings.PROVIDERS))

    item["aliases"][namespace] = [nid]
    mydao.save(item)

    myredis.add_to_alias_queue(item["_id"], item["aliases"])

    logger.info("Created new item '{id}' with alias '{alias}'".format(id=item["_id"], alias=str((namespace, nid))))

    try:
        return item["_id"]
    except AttributeError:
        abort(500)
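# Editor's note: hypothetical usage sketch (not from the original source). create_item() above
# mints a fresh item for a single (namespace, nid) alias, saves it, and queues it in redis for
# alias lookup; the returned tiid is what clients poll with afterwards.
def _example_create_item():
    tiid = create_item("doi", "10.5061/dryad.7898")
    return tiid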
def create_item(namespace, id):
    '''Utility function to keep DRY in single/multiple item creation endpoints'''
    item = ItemFactory.make(mydao, app.config["PROVIDERS"])
    item.aliases.add_alias(namespace, id)

    ## FIXME - see issue 86
    ## Should look up this namespace and id and see if we already have a tiid
    ## If so, return its tiid with a 200.
    # right now this makes a new item every time, creating many dupes
    # does not filter by whether we actually can process the namespace, since
    # we may be able to someday soon. It's user's job to not pass in junk.

    item.save()

    try:
        return item.id
    except AttributeError:
        abort(500)
def checkGithub(self):
    item = ItemFactory.make(self.mydao, app.config["PROVIDERS"])

    github = Github()
    members = github.member_items("egonw")
    self.check_members('github.github_user', members,
        [('github', ('egonw', 'blueobelisk.debian')),
         ('github', ('egonw', 'ron')),
         ('github', ('egonw', 'pubchem-cdk')),
         ('github', ('egonw', 'org.openscience.cdk')),
         ('github', ('egonw', 'java-rdfa')),
         ('github', ('egonw', 'cdk')),
         ('github', ('egonw', 'RobotDF')),
         ('github', ('egonw', 'egonw.github.com')),
         ('github', ('egonw', 'knime-chemspider')),
         ('github', ('egonw', 'gtd')),
         ('github', ('egonw', 'cheminfbenchmark')),
         ('github', ('egonw', 'cdk-taverna')),
         ('github', ('egonw', 'groovy-jcp')),
         ('github', ('egonw', 'jnchem')),
         ('github', ('egonw', 'acsrdf2010')),
         ('github', ('egonw', 'Science-3.0')),
         ('github', ('egonw', 'SNORQL')),
         ('github', ('egonw', 'ctr-cdk-groovy')),
         ('github', ('egonw', 'CDKitty')),
         ('github', ('egonw', 'rednael')),
         ('github', ('egonw', 'de.ipbhalle.msbi')),
         ('github', ('egonw', 'collaborative.cheminformatics')),
         ('github', ('egonw', 'xws-taverna')),
         ('github', ('egonw', 'cheminformatics.classics')),
         ('github', ('egonw', 'chembl.rdf')),
         ('github', ('egonw', 'blueobelisk.userscript')),
         ('github', ('egonw', 'ojdcheck')),
         ('github', ('egonw', 'nmrshiftdb-rdf')),
         ('github', ('egonw', 'bioclipse.ons')),
         ('github', ('egonw', 'medea_bmc_article'))])

    item.aliases.add_alias("github", "egonw,gtd")
    item_aliases_list = item.aliases.get_aliases_list()
    new_metrics = github.metrics(item_aliases_list)

    self.check_metric('github:forks', new_metrics['github:forks'], 0)
    self.check_metric('github:watchers', new_metrics['github:watchers'], 7)
def create_or_find_items_from_aliases(clean_aliases):
    tiids = []
    items = []
    for alias in clean_aliases:
        (namespace, nid) = alias
        existing_tiid = get_tiid_by_alias(namespace, nid)
        if existing_tiid:
            tiids.append(existing_tiid)
            logger.debug("found an existing tiid ({tiid}) for alias {alias}".format(
                tiid=existing_tiid, alias=str(alias)))
        else:
            logger.debug("alias {alias} isn't in the db; making a new item for it.".format(alias=alias))
            item = ItemFactory.make()
            item["aliases"][namespace] = [nid]
            items.append(item)
            tiids.append(item["_id"])

    return (tiids, items)
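# Editor's note: hypothetical usage sketch (not from the original source). Given cleaned
# (namespace, nid) tuples, the function above returns one tiid per alias plus only the newly
# minted item docs, which the caller is still responsible for saving.
def _example_create_or_find_items():
    clean_aliases = [("doi", "10.1371/journal.pcbi.1000361"), ("url", "http://example.org")]
    (tiids, new_items) = create_or_find_items_from_aliases(clean_aliases)
    assert len(tiids) == len(clean_aliases)  # every alias gets a tiid, existing or new
    return (tiids, new_items)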
def metrics(self, aliases,
        provider_url_template=None,  # ignore this because multiple url steps
        cache_enabled=True):

    # Only lookup metrics for items with appropriate ids
    from totalimpact.models import ItemFactory
    aliases_dict = ItemFactory.alias_dict_from_tuples(aliases)

    metrics_page = None

    # try lookup by doi
    try:
        metrics_page = self._get_metrics_lookup_page(self.metrics_from_doi_template, aliases_dict["doi"][0])
    except KeyError:
        pass

    # try lookup by pmid
    if not metrics_page:
        try:
            metrics_page = self._get_metrics_lookup_page(self.metrics_from_pmid_template, aliases_dict["pmid"][0])
        except KeyError:
            pass

    # try lookup by title
    if not metrics_page:
        try:
            page = self._get_uuid_lookup_page(aliases_dict["biblio"][0]["title"])
            if page:
                uuid = self._get_uuid_from_title(aliases_dict, page)
                if uuid:
                    logger.debug("Mendeley: uuid is %s for %s" % (uuid, aliases_dict["biblio"][0]["title"]))
                    metrics_page = self._get_metrics_lookup_page(self.metrics_from_uuid_template, uuid)
                else:
                    logger.debug("Mendeley: couldn't find uuid for %s" % (aliases_dict["biblio"][0]["title"]))
        except KeyError:
            pass

    # give up!
    if not metrics_page:
        return {}

    metrics_and_drilldown = self._get_metrics_and_drilldown_from_metrics_page(metrics_page)

    return metrics_and_drilldown
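# Editor's note: hypothetical usage sketch (not from the original source) of the Mendeley
# metrics() fallback chain above: doi first, then pmid, then a title lookup via uuid; an empty
# dict means none of the identifiers matched. "mendeley_provider" and the example title stand in
# for however the provider instance and aliases are built in the real code base.
def _example_mendeley_metrics(mendeley_provider):
    aliases = [("doi", "10.1371/journal.pcbi.1000361"),
               ("biblio", {"title": "Adventures in Semantic Publishing"})]
    metrics_and_drilldown = mendeley_provider.metrics(aliases)
    return metrics_and_drilldown  # {} when no doi/pmid/title lookup succeeded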
def dequeue(self):
    item_ids = self.queueids
    found = None

    # Synchronised section
    # This takes the item out of the queue by recording that we
    # have seen the item.
    metric_queue_lock.acquire()
    for item_id in item_ids:
        if not metric_queue_seen.has_key((item_id, self.provider)):
            log.debug("found item %s" % item_id)
            metric_queue_seen[(item_id, self.provider)] = True
            found = item_id
            break
    metric_queue_lock.release()

    if found:
        return ItemFactory.get(self.dao, found, ProviderFactory.get_provider, default_settings.PROVIDERS)
    else:
        return None
def sniffer(cls, item_aliases, aliases_providers_run, provider_config=default_settings.PROVIDERS):
    # default to nothing
    aliases_providers = []
    biblio_providers = []
    metrics_providers = []

    all_metrics_providers = [provider.provider_name
        for provider in ProviderFactory.get_providers(provider_config, "metrics")]

    (genre, host) = ItemFactory.decide_genre(item_aliases)
    has_alias_urls = "url" in item_aliases

    if (genre == "article"):
        if not "pubmed" in aliases_providers_run:
            aliases_providers = ["pubmed"]
        elif not "crossref" in aliases_providers_run:
            aliases_providers = ["crossref"]
        else:
            metrics_providers = all_metrics_providers
            biblio_providers = ["pubmed", "crossref"]
    else:
        # relevant alias and biblio providers are always the same
        relevant_providers = [host]
        if relevant_providers == ["unknown"]:
            relevant_providers = ["webpage"]
        # if all the relevant providers have already run, then all the aliases are done
        # or if it already has urls
        if has_alias_urls or (set(relevant_providers) == set(aliases_providers_run)):
            metrics_providers = all_metrics_providers
            biblio_providers = relevant_providers
        else:
            aliases_providers = relevant_providers

    return({
        "aliases": aliases_providers,
        "biblio": biblio_providers,
        "metrics": metrics_providers})
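# Editor's note: hypothetical usage sketch (not from the original source) of the sniffer above.
# "backend" stands in for whatever class exposes sniffer() as a classmethod, and this assumes
# ItemFactory.decide_genre classifies a bare DOI alias as an "article".
def _example_sniffer(backend):
    plan = backend.sniffer({"doi": ["10.1371/journal.pcbi.1000361"]}, aliases_providers_run=[])
    assert plan["aliases"] == ["pubmed"]  # articles go to pubmed first
    assert plan["biblio"] == [] and plan["metrics"] == []  # deferred until aliasing is done
    return plan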
def update_item_with_new_metrics(cls, metric_name, metrics_method_response, item):
    item = ItemFactory.add_metrics_data(metric_name, metrics_method_response, item)
    return item