class SocrataClient:
    """Wrapper around a ``Socrata`` client that retries failed reads.

    The connection settings (``DOMAIN``, ``TOKEN``, ``TIMEOUT``), the retry
    budget (``ATTEMPTS``) and the year -> dataset id mapping
    (``DATASET_IDS``) are all read from the module-level ``conf`` object.
    """

    def __init__(self):
        self.client = Socrata(conf.DOMAIN, conf.TOKEN, timeout=conf.TIMEOUT)

    def __del__(self):
        # __init__ may have raised before ``self.client`` was assigned; guard
        # so that finalising a half-built instance does not raise as well.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

    def dataset_id(self, year):
        """Return the dataset id configured for ``year``.

        Raises whatever ``conf.DATASET_IDS[year]`` raises for an unknown year.
        """
        return conf.DATASET_IDS[year]

    def get(self, year, **kwargs):
        """Fetch rows of the dataset for ``year``, retrying on any error.

        ``kwargs`` are forwarded unchanged to ``self.client.get``. The request
        is tried up to ``conf.ATTEMPTS`` times (always at least once); the
        exception from the final failed attempt is propagated to the caller.
        """
        dataset_id = self.dataset_id(year)
        # A budget of zero or less used to skip the request and return None
        # silently; always make at least one attempt.
        attempts = max(1, conf.ATTEMPTS)
        for attempt in range(attempts):
            try:
                return self.client.get(dataset_id, **kwargs)
            except Exception:
                if attempt == attempts - 1:
                    # Out of retries: surface the original error and traceback.
                    raise

    def get_metadata(self, year):
        """Return the metadata of the dataset configured for ``year``."""
        return self.client.get_metadata(self.dataset_id(year))

    def get_datasets(self):
        '''
        Return the dataset listing of the configured domain.

        Search for "MyLA311 Service Request Data" within the response
        to get the dataset ids for each year.
        '''
        return self.client.datasets()
def _create_foreign_tables(self, schema: str, server_id: str, tables: TableInfo) -> List[MountError]:
    """Look up the requested Socrata datasets and mount them as foreign tables.

    :param schema: Schema passed through to the generated mount queries.
    :param server_id: Foreign server id passed through to the mount queries.
    :param tables: Either a list of Socrata dataset ids or a mapping whose
        values carry a ``{"socrata_id": ...}`` dict in position 1. Ignored
        when ``self.tables`` is set.
    :return: Always an empty list.
    :raises RepositoryNotFoundError: If the domain or dataset cannot be found
        (HTML 404 page from Socrata, or an empty result).
    """
    from sodapy import Socrata
    from psycopg2.sql import SQL

    logging.info("Getting Socrata metadata")
    client = Socrata(domain=self.params["domain"], app_token=self.credentials.get("app_token"))

    # The client is only needed for the catalog lookup; close it on every
    # path so the underlying HTTP session is not leaked.
    try:
        tables = self.tables or tables
        if isinstance(tables, list):
            sought_ids = tables
        else:
            sought_ids = [t[1]["socrata_id"] for t in tables.values()]

        try:
            datasets = client.datasets(ids=sought_ids, only=["dataset"])
        except Exception as e:
            if "Unknown response format: text/html" in str(e):
                # If the Socrata dataset/domain isn't found, sodapy doesn't catch it directly
                # and instead stumbles on an unexpected content-type of the 404 page it's served.
                # We catch that and reraise a more friendly message.
                raise RepositoryNotFoundError("Socrata domain or dataset not found!") from e
            raise
    finally:
        client.close()

    if not datasets:
        raise RepositoryNotFoundError("Socrata domain or dataset not found!")

    mount_statements, mount_args = generate_socrata_mount_queries(
        sought_ids, datasets, schema, server_id, tables
    )
    self.engine.run_sql(SQL(";").join(mount_statements), mount_args)
    return []
def socrata_metadata_excel(socrata_dataset, authentication_token):
    """Collect the dataset catalog of every Socrata portal into one DataFrame.

    For each row of the cleaned input, the portal at ``website_url`` is
    queried for its dataset listing and the nested ``resource``,
    ``classification``, ``link``, ``metadata``, ``owner`` and ``permalink``
    columns are expanded and concatenated side by side. The per-portal
    frames are then stacked vertically.

    :param socrata_dataset: DataFrame with (at least) ``website_url`` and
        ``city`` columns; it is passed through ``remove_characters`` first.
    :param authentication_token: App token handed to each ``Socrata`` client.
    :return: The stacked catalog, or an empty DataFrame when no portal
        yielded any datasets.
    """
    print(socrata_dataset.info())
    socrata = remove_characters(socrata_dataset)

    # (nested column of the catalog response, label printed after expanding it)
    nested_columns = [
        ('resource', "Resource Columns:"),
        ('classification', "\n Classification Columns:"),
        ('link', "\n Links Columns:"),
        ('metadata', "\n Metadata Columns:"),
        ('owner', "\n Owner Columns:"),
        ('permalink', "\n Permalink Columns:"),
    ]

    list_datasets = []
    for _, row in socrata.iterrows():
        website_link = row['website_url']
        print("Link:", website_link)
        client = Socrata(website_link, authentication_token)
        try:
            print(row['city'])
            dataframe = pd.DataFrame(client.datasets())
        finally:
            # One client per portal: release its HTTP session once the
            # listing has been fetched.
            client.close()

        if dataframe.empty:
            # A portal without datasets yields a frame with no columns;
            # indexing it below would raise KeyError, so skip it instead.
            continue

        expanded = []
        for column, label in nested_columns:
            # Each cell holds a dict; apply(pd.Series) turns its keys into columns.
            part = dataframe[column].apply(pd.Series)
            print(label, part.columns)
            expanded.append(part)
        print(" All Function are working")
        list_datasets.append(pd.concat(expanded, axis=1))

    if not list_datasets:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(list_datasets, axis=0)
def test_get_datasets():
    """``client.datasets(limit=7)`` returns the seven mocked catalog entries as a list."""
    # The client takes its transport as a {"prefix", "adapter"} mapping.
    session_adapter = {"prefix": PREFIX, "adapter": requests_mock.Adapter()}
    adapter = session_adapter["adapter"]
    client = Socrata(DOMAIN, APPTOKEN, session_adapter=session_adapter)

    # Register the canned response before issuing the request.
    setup_datasets_mock(adapter, "get_datasets.txt", 200, params={"limit": "7"})
    datasets = client.datasets(limit=7)

    assert isinstance(datasets, list)
    assert len(datasets) == 7
class SocrataClient:
    """Wrapper around a ``Socrata`` client that retries failed reads.

    All settings come from the ``'Socrata'`` section of the module-level
    ``config``: ``DOMAIN``, ``TOKEN``, ``TIMEOUT``, ``ATTEMPTS`` and one
    ``AP<year>`` entry per year holding that year's dataset id.
    """

    def __init__(self):
        conf = config['Socrata']
        # NOTE(review): if ``config`` is INI-backed these values are strings;
        # ``get`` coerces ATTEMPTS, TIMEOUT is passed through as configured.
        self.client = Socrata(conf['DOMAIN'], conf['TOKEN'], timeout=conf['TIMEOUT'])
        self.attempts = conf['ATTEMPTS']
        # Keep the whole section: the per-year dataset ids live in it.
        self.years = conf

    def __del__(self):
        # __init__ may have raised before ``self.client`` was assigned; guard
        # so that finalising a half-built instance does not raise as well.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

    def dataset_id(self, year):
        """Return the dataset id stored under the ``AP<year>`` key."""
        return self.years['AP' + str(year)]

    def get(self, year, **kwargs):
        """Fetch rows of the dataset for ``year``, retrying on any error.

        ``kwargs`` are forwarded unchanged to ``self.client.get``. The request
        is tried up to ``self.attempts`` times (always at least once); the
        exception from the final failed attempt is propagated to the caller.
        """
        dataset_id = self.dataset_id(year)
        # int() accepts both a numeric and a textual setting; a budget of zero
        # or less used to skip the request and return None silently.
        attempts = max(1, int(self.attempts))
        for attempt in range(attempts):
            try:
                return self.client.get(dataset_id, **kwargs)
            except Exception:
                if attempt == attempts - 1:
                    # Out of retries: surface the original error and traceback.
                    raise

    def get_metadata(self, year):
        """Return the metadata of the dataset configured for ``year``."""
        return self.client.get_metadata(self.dataset_id(year))

    def get_datasets(self):
        '''
        Return the dataset listing of the configured domain.

        Search for "MyLA311 Service Request Data" within the response
        to get the dataset ids for each year.
        '''
        return self.client.datasets()
def initialize_socrata(data_portal_url, app_token=None) -> tuple:
    """Connect to a Socrata portal and fetch its dataset listing.

    :param data_portal_url: Portal address handed to ``Socrata``.
    :param app_token: Optional app token handed to ``Socrata``.
    :return: ``(client, datasets)`` - the open client and the result of
        ``client.datasets()``.
    """
    portal = Socrata(data_portal_url, app_token)
    return portal, portal.datasets()
# Exploratory script: pulls rows and the dataset catalog from a Socrata
# portal, derives the set of catalog categories, and loads a local JSON file.

# NOTE(review): `df` and `line` are not defined anywhere in this fragment;
# these two statements look like the tail of code that starts above the
# visible chunk - confirm where they belong.
# NOTE(review): pdb.set_trace() is an interactive breakpoint and will halt
# any non-interactive run.
pdb.set_trace()
df.append(line)

# Load initial json data
# readline(1000) reads at most 1000 characters of the first line only; `d`
# is not used again within this fragment.
with open(DATA_FILE, 'r') as f:
    d = f.readline(1000)

# Get json data
client = Socrata(DATA_DOMAIN, APP_TOKEN)
results = client.get(DATA_ID, limit=10000)

# Load json into pandas dataframe
df = pd.DataFrame(results)
# The return value of head() is discarded here (only useful interactively).
df.head()

# Get list of all datasets
chi_datasets = client.datasets()

# Get set of categories
# Flattens every dataset's classification.categories list; the enumerate
# index `k` is only used to look the same element up again and `v` is unused.
categories = [
    i for k, v in enumerate(chi_datasets, 0)
    for i in chi_datasets[k]['classification']['categories']
]
categories = set(categories)

# Stream the top-level array items of test.json into a list. The loop stays
# inside the `with` block because ijson.items reads from the open file lazily.
d_list = list()
with open('test.json', 'r') as f:
    businesses = ijson.items(f, 'item')
    for business in businesses:
        d_list.append(business)
def mount_socrata(
    mountpoint: str,
    server,
    port,
    username,
    password,
    domain: str,
    tables: Optional[Dict[str, Any]] = None,
    app_token: Optional[str] = None,
    batch_size: Optional[int] = 10000,
) -> None:
    """
    Mount a Socrata dataset.

    Mounts a remote Socrata dataset and forwards queries to it
    \b

    :param domain: Socrata domain, for example, data.albanyny.gov. Required.
    :param tables: A dictionary mapping PostgreSQL table names to Socrata table IDs. For example,
        {"salaries": "xzkq-xp2w"}. If skipped, ALL tables in the Socrata endpoint will be mounted.
    :param app_token: Socrata app token. Optional.
    :param batch_size: Amount of rows to fetch from Socrata per request (limit parameter). Maximum 50000.
    """
    from splitgraph.engine import get_engine
    from sodapy import Socrata
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    logging.info("Mounting Socrata domain...")
    server_id = mountpoint + "_server"

    # Only options that were actually supplied are forwarded to the FDW.
    options: Dict[str, Optional[str]] = {
        "wrapper": "splitgraph.ingestion.socrata.fdw.SocrataForeignDataWrapper",
    }
    if domain:
        options["domain"] = domain
    if app_token:
        options["app_token"] = app_token
    if batch_size:
        options["batch_size"] = str(batch_size)

    init_fdw(
        engine, server_id=server_id, wrapper="multicorn", server_options=options,
    )

    engine.run_sql(SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(mountpoint)))

    logging.info("Getting Socrata metadata")
    client = Socrata(domain=domain, app_token=app_token)
    # Materialise the ids: the same collection is sent to Socrata and then
    # handed to the query generator.
    sought_ids = list(tables.values()) if tables else []

    try:
        datasets = client.datasets(ids=sought_ids, only=["dataset"])
    except Exception as e:
        if "Unknown response format: text/html" in str(e):
            # If the Socrata dataset/domain isn't found, sodapy doesn't catch it directly
            # and instead stumbles on an unexpected content-type of the 404 page it's served.
            # We catch that and reraise a more friendly message.
            raise RepositoryNotFoundError("Socrata domain or dataset not found!") from e
        # Any other failure must propagate; swallowing it left `datasets`
        # unbound and surfaced later as an unrelated NameError.
        raise
    finally:
        # The client is only needed for the catalog lookup.
        client.close()

    mount_statements, mount_args = generate_socrata_mount_queries(
        sought_ids, datasets, mountpoint, server_id, tables
    )

    engine.run_sql(SQL(";").join(mount_statements), mount_args)
'contentUrl': contentUrl}) apiSecret = '924eeda9ef2b4b5d9d41013ffdb7cd30' from newsapi import NewsApiClient # Init newsapi = NewsApiClient(api_key=apiSecret) # /v2/top-headlines top_headlines = newsapi.get_top_headlines(q='bitcoin', sources='bbc-news,the-verge', category='business', language='en', country='us') # /v2/everything all_articles = newsapi.get_everything(q='bitcoin', language='en', sort_by='relevancy', page=1) text = 'camp' response = requests.get('http://catalog.data.gov/api/3/action/package_search?q=' +text+ '&rows=50') response_dict = json.loads(response.content)'''
# NOTE(review): the text above ends a triple-quoted string that opens before
# this fragment, so it appears to be disabled code rather than live
# statements. It embeds what looks like a NewsAPI key (`apiSecret`, passed as
# `api_key`); confirm whether that credential needs to be revoked and removed.

from sodapy import Socrata

# Client for the data.kcmo.org Socrata domain, created without an app token.
client = Socrata("data.kcmo.org", None)
# Fetch the domain's dataset listing; the result is not stored.
client.datasets()
class SocrataRepository(HarvestRepository):
    """ Socrata Repository """

    def setRepoParams(self, repoParams):
        """Apply the repository parameters and open the Socrata client."""
        self.metadataprefix = "socrata"
        super(SocrataRepository, self).setRepoParams(repoParams)
        # sodapy doesn't like http/https preceding URLs
        self.socratarepo = Socrata(self.url, self.socrata_app_token)
        self.domain_metadata = []

    def _crawl(self):
        """Register the repository and write a header for every dataset it lists."""
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "socrata",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url,
            "repo_oai_name": self.repo_oai_name
        }
        self.repository_id = self.db.update_repo(**kwargs)
        records = self.socratarepo.datasets()

        item_count = 0
        for rec in records:
            self.db.write_header(rec["resource"]["id"], self.repository_id)
            item_count += 1
            if item_count % self.update_log_after_numitems == 0:
                # +0.1 keeps the rate computation away from a zero divisor.
                tdelta = time.time() - self.tstart + 0.1
                self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(
                    item_count, self.formatter.humanize(tdelta), item_count/tdelta))

        self.logger.info("Found {} items in feed".format(item_count))

    @staticmethod
    def _license_from_custom_fields(socrata_record):
        """Return the licence text/URL stored in ``metadata.custom_fields``, or "".

        Only the first populated section among ``License/Attribution``,
        ``Licence`` and ``Attributes`` is consulted, even if it turns out to
        carry no licence value.
        """
        metadata = socrata_record.get('metadata') or {}
        custom_fields = metadata.get('custom_fields') or {}

        attribution = custom_fields.get('License/Attribution')
        if attribution:
            # Calgary (both spellings of the key occur)
            return attribution.get('License URL') or attribution.get('License-URL') or ""

        licence = custom_fields.get('Licence')
        if licence:
            # Winnipeg
            return licence.get('Licence') or ""

        attributes = custom_fields.get('Attributes')
        if attributes:
            # Strathcona
            return attributes.get('Licence') or ""

        return ""

    @classmethod
    def _extract_rights(cls, socrata_record):
        """Return the rights statement for a record, or "" when none is available.

        The top-level ``license`` entry wins; the custom metadata fields are
        only used when it is missing, blank, or the "See Terms of Use"
        placeholder.
        """
        rights = ""
        license_info = socrata_record.get('license')
        if license_info:
            # Winnipeg, Nova Scotia, PEI
            rights = "\n".join([
                license_info.get("name", ""),
                license_info.get("termsLink", ""),
            ]).strip()
            if rights == "See Terms of Use":
                # Calgary, Edmonton
                rights = ""

        if not rights:
            rights = cls._license_from_custom_fields(socrata_record)
        return rights

    def format_socrata_to_oai(self, socrata_record, local_identifier):
        """Map a Socrata metadata record onto the harvester's record dict.

        :param socrata_record: Metadata dict as returned by ``get_metadata``;
            must contain ``publicationDate``.
        :param local_identifier: Stored unchanged as ``identifier``.
        :return: The record dict; ``rights`` is only present when a rights
            statement was found.
        """
        record = {}
        record["title"] = socrata_record.get("name", "").strip()
        record["description"] = socrata_record.get("description", "")
        record["tags"] = socrata_record.get("tags", "")
        record["identifier"] = local_identifier
        record["creator"] = socrata_record.get("attribution", self.name)
        record["pub_date"] = datetime.fromtimestamp(socrata_record["publicationDate"]).strftime('%Y-%m-%d')
        record["subject"] = socrata_record.get("category", "")
        record["title_fr"] = ""
        record["series"] = ""

        rights = self._extract_rights(socrata_record)
        if rights:
            record["rights"] = rights

        # Continue to default to English for our current Socrata repositories.
        # For Nova Scotia, "fra" language refers to the dataset, not the metadata.
        # Language detection from custom_fields ("Detailed Metadata" /
        # "Dataset Information" -> "Language") is therefore deliberately not done.

        return record

    @rate_limited(5)
    def _update_record(self, record):
        """Fetch, convert and store one record.

        :return: ``True`` to keep harvesting, ``False`` once the error count
            has reached ``abort_after_numerrors``.
        """
        # Bound up front so the failure dump below can tell "never fetched"
        # apart from a fetched record.
        socrata_record = None
        try:
            socrata_record = self.socratarepo.get_metadata(record['local_identifier'])
            oai_record = self.format_socrata_to_oai(socrata_record, record['local_identifier'])
            if oai_record:
                self.db.write_record(oai_record, self)
            return True

        except Exception as e:
            self.logger.error("Updating record {} failed: {}".format(record['local_identifier'], e))
            # Comparison kept as-is so only True/1 enable the dump.
            if self.dump_on_failure == True and socrata_record is not None:
                try:
                    print(socrata_record)
                except Exception:
                    # The dump is a best-effort debugging aid only.
                    pass
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False
class SocrataRepository(HarvestRepository):
    """ Socrata Repository """

    def setRepoParams(self, repoParams):
        """Apply the repository parameters and open the Socrata client."""
        self.metadataprefix = "socrata"
        super(SocrataRepository, self).setRepoParams(repoParams)
        # sodapy doesn't like http/https preceding URLs
        self.socratarepo = Socrata(self.url, self.socrata_app_token)
        self.domain_metadata = []

    def _crawl(self):
        """Register the repository and write a header for every dataset it lists."""
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "socrata",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)
        records = self.socratarepo.datasets()

        item_count = 0
        for rec in records:
            self.db.write_header(rec["resource"]["id"], self.repository_id)
            item_count += 1
            if item_count % self.update_log_after_numitems == 0:
                # +0.1 keeps the rate computation away from a zero divisor.
                tdelta = time.time() - self.tstart + 0.1
                self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(
                    item_count, self.formatter.humanize(tdelta), item_count/tdelta))

        self.logger.info("Found {} items in feed".format(item_count))

    def format_socrata_to_oai(self, socrata_record, local_identifier):
        """Map a Socrata metadata record onto the harvester's record dict.

        ``name`` and ``publicationDate`` are required keys of
        ``socrata_record``; the other fields fall back to defaults.
        """
        record = {}
        record["title"] = socrata_record["name"]
        record["description"] = socrata_record.get("description", "")
        record["tags"] = socrata_record.get("tags", "")
        record["identifier"] = local_identifier
        record["creator"] = socrata_record.get("attribution", self.name)
        record["pub_date"] = datetime.fromtimestamp(socrata_record["publicationDate"]).strftime('%Y-%m-%d')
        record["contact"] = self.contact
        record["series"] = socrata_record.get("category", "")
        return record

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate """
        # Deliberately takes no ``self``: it is called once, inside the class
        # body, to build the decorator applied to ``_update_record``.
        import threading

        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)
        # time.clock() was removed in Python 3.8 and measured CPU time on
        # Unix, so it never reflected the time spent sleeping. Use a
        # wall-clock source, falling back to time.time() where monotonic()
        # is unavailable.
        now = getattr(time, "monotonic", time.time)

        def decorate(func):
            # One-element list so the nested function can update the value.
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                # The lock covers only the wait; `with` guarantees it is
                # released even if the sleep is interrupted.
                with lock:
                    left_to_wait = min_interval - (now() - last_time_called[0])
                    if left_to_wait > 0:
                        time.sleep(left_to_wait)

                ret = func(*args, **kwargs)
                last_time_called[0] = now()
                return ret

            return rate_limited_function

        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        """Fetch, convert and store one record.

        :return: ``True`` to keep harvesting, ``False`` once the error count
            has reached ``abort_after_numerrors``.
        """
        try:
            socrata_record = self.socratarepo.get_metadata(record['local_identifier'])
            oai_record = self.format_socrata_to_oai(socrata_record, record['local_identifier'])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(), self.domain_metadata)
            return True

        except Exception as e:
            self.logger.error("Updating record {} failed: {}".format(record['local_identifier'], e))
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False