def get_cdb_searches(today, lapse='month'):
    """Aggregate production-portal search events per publisher.

    Queries ``query_log_master`` for non-download events of the
    ``portal-prod`` client that have a non-empty ``results_by_resource``
    value, and groups them by the "<inst>-<col>" key obtained from
    ``get_inst_col`` for each resource URL.

    Args:
        today: Reference date forwarded to ``add_time_limit``.
        lapse: Time window forwarded to ``add_time_limit``
            (defaults to ``'month'``).

    Returns:
        dict mapping "<inst>-<col>" to a stats dict with the keys
        ``searches``, ``records_searched``, ``list_records_searched``,
        ``url``, ``inst`` and ``col``, as further updated by
        ``get_cdb_stats``.
    """
    query = "select * from query_log_master"
    query += " where client='portal-prod'"
    query += " and type != 'download' and results_by_resource != '{}' and results_by_resource != ''"
    query = add_time_limit(query=query, today=today, lapse=lapse)
    searches = cartodb_query(query)

    pubs = {}
    # get_inst_col issues a remote query (with retries) per call and its
    # result depends only on the URL, so resolve each URL once per run
    # instead of once per (search, URL) pair.
    resolved = {}
    for search in searches:
        # results_by_resource is parsed as a JSON object keyed by resource
        # URL; the values are accumulated as per-resource counts.
        res_count = json.loads(search['results_by_resource'])
        for url, count in res_count.items():
            if url not in resolved:
                resolved[url] = get_inst_col(url)
            inst, col = resolved[url]
            pub = "{0}-{1}".format(inst, col)
            if pub not in pubs:
                pubs[pub] = {
                    'searches': 1,
                    'records_searched': count,
                    'list_records_searched': [count],
                    'url': url,
                    'inst': inst,
                    'col': col
                }
            else:
                pubs[pub]['searches'] += 1
                pubs[pub]['records_searched'] += count
                pubs[pub]['list_records_searched'].append(count)
            pubs[pub] = get_cdb_stats(search, pubs[pub], from_download=False)
    return pubs
def get_cdb_downloads(lapse, today):
    """Fetch the production-portal download events logged in CartoDB.

    Selects every row of ``query_log_master`` whose ``download`` column is
    neither NULL nor empty and whose client is ``portal-prod``, then lets
    ``add_time_limit`` narrow the query using ``today`` and ``lapse``.

    Returns:
        Whatever ``cartodb_query`` returns for the final query.
    """
    # Only downloads coming from the production portal are of interest.
    base_query = (
        "select * from query_log_master where download is not null and download !=''"
        " and client='portal-prod'"
    )
    # Restrict to the requested time window (semantics live in add_time_limit).
    limited_query = add_time_limit(query=base_query, today=today, lapse=lapse)
    return cartodb_query(limited_query)
def get_inst_col(url):
    """Resolve a resource URL to its institution code and collection id.

    Looks up ``icode`` in ``resource_staging`` for the given URL, trying
    up to three times while the query returns no rows.

    Args:
        url: Resource URL; the part following ``'?r='`` is returned as the
            collection identifier.

    Returns:
        ``(inst, col)`` on success, or ``(None, None)`` if no row was
        found after all attempts.

    Raises:
        IndexError: if a row is found but ``url`` contains no ``'?r='``.
    """
    # The query is assembled by string interpolation, so double any single
    # quote to keep a URL containing "'" from terminating the SQL literal.
    safe_url = url.replace("'", "''")
    query = "select icode from resource_staging where url='{0}'".format(safe_url)

    max_retries = 3
    for _ in range(max_retries):
        rows = cartodb_query(query)
        if rows:
            inst = rows[0]['icode']
            col = url.split('?r=')[1]
            return inst, col
    return None, None
def get_all_repos(self):
    """Extract a list of all orgnames and reponames from CartoDB.

    Returns:
        list of ``(github_orgname, github_reponame)`` tuples, one per row
        of ``resource_staging`` with ``ipt`` true and a ``networks`` value
        containing ``VertNet``.
    """
    # Adjacent literals instead of backslash continuations inside the
    # string, so source indentation never leaks into the SQL text.
    query = (
        "select github_orgname, github_reponame "
        "from resource_staging "
        "where ipt is true and networks like '%VertNet%';"
    )
    all_repos = cartodb_query(query)
    logging.info("Got %d repos currently in CartoDB", len(all_repos))
    return [
        (repo['github_orgname'], repo['github_reponame'])
        for repo in all_repos
    ]
def post(self):
    """Store one ``Dataset`` entity per VertNet IPT resource in CartoDB.

    Reads the resource rows from ``resource_staging``, builds a ``Dataset``
    keyed by ``gbifdatasetid`` from each row, saves them in one batch and
    writes a JSON summary with the number of entities stored.
    """
    # Allow slow outbound fetches up to 60 seconds.
    urlfetch.set_default_fetch_deadline(60)
    self.response.headers['Content-Type'] = 'application/json'

    q = ("select gbifdatasetid, icode, orgname, github_orgname, "
         "source_url, github_reponame, url, gbifpublisherid "
         "from resource_staging "
         "where ipt=true and networks like '%VertNet%'")

    # Every selected column is passed through as a Dataset property.
    datasets = [
        Dataset(id=row['gbifdatasetid'], **row)
        for row in cartodb_query(q)
    ]
    stored_keys = ndb.put_multi(datasets)

    summary = {
        "datasets processed": len(stored_keys),
    }
    self.response.write(json.dumps(summary))
def get_events(self):
    """Build the CartoDB query for the current event type and run it.

    Reads ``self.t`` (event type), ``self.table_name`` and ``self.period``.
    On success the rows are stored in ``self.data``.

    Returns:
        0 on success; 1 if CartoDB could not be reached within its retry
        budget, in which case a 504 status and a JSON error body have
        already been written to the response.
    """
    logging.info("Building %s query" % self.t)

    # Column list and FROM clause are shared by both event types.
    select_from = "SELECT cartodb_id, lat, lon, created_at, " \
                  "query AS query_terms, response_records, " \
                  "results_by_resource " \
                  "FROM %s "

    # The octet_length condition keeps overly large queries out.
    if self.t == 'download':
        filters = "WHERE type='download' " \
                  "AND octet_length(query)<=1500 " \
                  "AND download IS NOT NULL " \
                  "AND download !=''"
    else:
        filters = "WHERE left(type, 5)='query' " \
                  "AND octet_length(query)<=1500 " \
                  "AND results_by_resource IS NOT NULL " \
                  "AND results_by_resource != '{}' " \
                  "AND results_by_resource !=''"

    query = (select_from + filters) % self.table_name

    # Production portal events only.
    query += " and client='portal-prod'"

    # The time window is only applied when reading the default table.
    if self.table_name == CDB_TABLE:
        year = int(self.period[:4])
        month = int(self.period[-2:])
        # First day of the period plus 32 days always lands in the
        # following month; that date is handed to add_time_limit.
        queried_date = datetime(year, month, 1) + timedelta(days=32)
        query = add_time_limit(query=query, today=queried_date)

    logging.info("Executing query")
    logging.info(query)

    try:
        data = cartodb_query(query)
    except ApiQueryMaxRetriesExceededError:
        self.error(504)
        failure = {
            "status": "error",
            "message": "Could not retrieve data from CartoDB",
            "data": {
                "period": self.period,
                "event_type": self.t
            }
        }
        self.response.write(json.dumps(failure) + "\n")
        return 1

    # Keep the rows on the instance for later use.
    self.data = data

    logging.info("Extracted %d %s events" % (len(data), self.t))
    return 0