def execute(self):
    dataset_id = '{}_{}'.format(self.dataset_url, self.dataset_filename)
    latest_status = self.redis_client.get(dataset_id)
    if latest_status is not None:
        latest_status = latest_status.decode(ENCODING)
        if latest_status == 'COMPLETED' or latest_status == 'ATTEMPT TO REFILL':
            self.redis_client.set(dataset_id, 'ATTEMPT TO REFILL')
            print('ATTEMPT TO REFILL')
            return False
    client = Socrata(self.dataset_url, None)
    self.redis_client.set(dataset_id, 'STARTED')
    for i in range(int(NUMBER_OF_MESSAGES / MESSAGES_PER_FETCH)):
        results = client.get(self.dataset_filename,
                             limit=MESSAGES_PER_FETCH,
                             offset=MESSAGES_PER_FETCH * i)
        results_df = pd.DataFrame.from_records(results)
        current_progress = '{} - {}'.format(
            str(i * MESSAGES_PER_FETCH + 1),
            str((i + 1) * MESSAGES_PER_FETCH))
        self.redis_client.set(dataset_id, current_progress)
        print('Progress {}'.format(current_progress))
        print(results_df)
    self.redis_client.set(dataset_id, 'COMPLETED')
    print('COMPLETED')

def get_population_data():
    client = Socrata("data.cityofnewyork.us", app_token,
                     username=username, password=password)
    result = client.get("27iv-9uub", limit=2000)
    return pd.DataFrame.from_records(result)

def get_311(data_id):
    '''
    Get 2012 and 2017 311 data from the Chicago data portal.

    Input:
        data_id: id of the type of report
    Return:
        pandas dataframe with the columns and dtypes as COL_TYPES
    '''
    # COL_TYPES = {'sr_type': str,
    #              # 'created_date': str,
    #              'zip_code': int,
    #              'longitude': float,
    #              'latitude': float,
    #              'legacy_record': bool}
    # cols = [item for item in COL_TYPES.keys()]
    client = Socrata('data.cityofchicago.org', 'E0eO5nY1aKuEY1pVrunfqFhDz',
                     username='******', password='******')
    conds = '''legacy_sr_number IS NULL'''
    res = client.get(data_id,
                     # where=conds,
                     limit=1000000)
    client.close()
    df = pd.DataFrame.from_records(res)
    # df['created_date'] = pd.to_datetime(df['created_date'])
    # df = df[df['created_date'] > start_year]
    # df = df[df['created_date'] < end_year]
    return df

def get_data():
    client = Socrata(socrata_domain, 'ohJoNJOw8qez7ABLZhzGUja4n')
    lmt = 1000000
    start_at = 0
    all_complaints = []
    while True:
        complaints = client.get(
            socrata_dataset_identifier,
            limit=lmt,
            offset=start_at,
            order="created_date ASC",
            select="created_date, borough, complaint_type, incident_zip",
            where="created_date between '2017-01-01T00:00:00' and '2017-12-31T23:59:59'"
        )
        all_complaints.extend(complaints)
        start_at += lmt
        if len(complaints) < lmt:
            break
    headers = ['created_date', 'borough', 'complaint_type', 'incident_zip']
    with open('complaint.csv', 'w') as f:
        csvwriter = csv.DictWriter(f, headers)
        csvwriter.writeheader()
        csvwriter.writerows(all_complaints)
    print('SUCCESS. Number of complaints downloaded: {}'.format(len(all_complaints)))
    print('Saved to {} file'.format('complaint.csv'))

def fetchSocrata(self, year=2019, querySize=20000, pageSize=20000):
    '''Fetch data from Socrata connection and return pandas dataframe'''
    # Load config files
    print('Retrieving partial Socrata query...')
    fetchTimer = time.time()
    socrata_domain = self.config['Socrata']['DOMAIN']
    socrata_dataset_identifier = self.config['Socrata']['AP' + str(year)]
    socrata_token = self.token
    # Establish connection to Socrata resource
    client = Socrata(socrata_domain, socrata_token)
    # Fetch data: loop for querying dataset one page at a time
    queryDf = None
    for i in range(0, querySize, pageSize):
        # print(i + pageSize)
        results = client.get(socrata_dataset_identifier,
                             offset=i,
                             select="*",
                             order="updateddate DESC",
                             limit=querySize)
        tempDf = pd.DataFrame.from_dict(results)
        if queryDf is None:
            queryDf = tempDf.copy()
        else:
            queryDf = queryDf.append(tempDf)
    self.data = queryDf
    print('%d records retrieved in %.2f minutes' %
          (self.data.shape[0], self.elapsedTimer(fetchTimer)))

def fetch_311_data(zip, max_query_results=None, num_entries_to_search=10000,
                   t_out=10) -> List[Dict[str, Any]]:
    nyc_311_dataset_domain = "data.cityofnewyork.us"
    nyc_311_dataset_identifier = "fhrw-4uyv"
    try:
        nyc_311_dataset_token = get_311_socrata_key()
    except KeyError:
        # Works with None, but fewer requests can be made without a token
        nyc_311_dataset_token = None
    client = Socrata(nyc_311_dataset_domain, nyc_311_dataset_token)
    client.timeout = t_out
    try:
        return client.get(
            nyc_311_dataset_identifier,
            select="created_date, incident_zip, incident_address, city, "
                   "complaint_type, descriptor, status",
            # q=str(zip),  # uncomment to query directly on the server side (may time out)
            order="created_date DESC",
            limit=num_entries_to_search,
        )
    except requests.exceptions.Timeout:
        raise TimeoutError

def fetchSocrata(self, year=2019, querySize=10000):
    '''Fetch data from Socrata connection and return pandas dataframe'''
    # Load config files
    socrata_domain = self.config['Socrata']['DOMAIN']
    socrata_dataset_identifier = self.config['Socrata']['AP' + str(year)]
    socrata_token = self.token
    # Establish connection to Socrata resource
    client = Socrata(socrata_domain, socrata_token)
    # Fetch dataset metadata
    metadata = client.get_metadata(socrata_dataset_identifier)
    # Loop for querying dataset 1000 rows at a time
    queryDf = None
    for i in range(0, querySize, 1000):
        print(i)
        results = client.get(socrata_dataset_identifier,
                             offset=i,
                             select="*",
                             order="updateddate DESC")
        tempDf = pd.DataFrame.from_dict(results)
        if queryDf is None:
            queryDf = tempDf.copy()
        else:
            queryDf = queryDf.append(tempDf)
    self.data = queryDf

def execute(self):
    dataset_id = '{}_{}'.format(self.dataset_url, self.dataset_filename)
    latest_status = self.redis_client.get(dataset_id)
    kafka_server = '{}:{}'.format(
        config_reader.cfg.get('LAB', 'kafka_host'),
        config_reader.cfg.get('LAB', 'kafka_port'))
    producer = KafkaProducer(
        bootstrap_servers=[kafka_server],
        value_serializer=lambda v: json.dumps(v).encode(ENCODING))
    if latest_status == str(States.COMPLETED_STATUS) or \
            latest_status == str(States.ATTEMPT_TO_REFILL_STATUS):
        self.redis_client.set(dataset_id, str(States.ATTEMPT_TO_REFILL_STATUS))
    client = Socrata(self.dataset_url, None)
    self.redis_client.set(dataset_id, str(States.STARTED_STATUS))
    for i in range(NUMBER_OF_MESSAGES):
        results = client.get(self.dataset_filename, limit=1, offset=i)
        results_df = pd.DataFrame.from_records(results)
        current_progress = 'row #{}'.format(str(i))
        self.redis_client.set(dataset_id, current_progress)
        producer.send(config_reader.cfg.get('LAB', 'elastic_search_topic'),
                      results_df.to_dict())
        print("Results {}".format(current_progress))
    self.redis_client.set(dataset_id, str(States.COMPLETED_STATUS))

class SocrataClient:
    def __init__(self, config=None):
        config = config['Socrata']
        domain = config['DOMAIN']
        token = None if config['TOKEN'] == 'None' else config['TOKEN']
        timeout = int(config['TIMEOUT'])
        self.client = Socrata(domain, token, timeout=timeout)
        self.attempts = int(config['ATTEMPTS'])
        self.config = config

    def __del__(self):
        self.client.close()

    def dataset_id(self, year):
        return self.config['AP' + str(year)]

    def get(self, year, **kwargs):
        id = self.dataset_id(year)
        for attempt in range(self.attempts):
            try:
                return self.client.get(id, **kwargs)
            except Exception as e:
                if attempt < self.attempts - 1:
                    continue
                else:
                    raise e

    def get_metadata(self, year):
        id = self.dataset_id(year)
        return self.client.get_metadata(id)

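# A minimal usage sketch for the SocrataClient wrapper above. The config keys
# (DOMAIN, TOKEN, TIMEOUT, ATTEMPTS, AP2019) mirror the ones the class reads,
# but the concrete values here are hypothetical, not from the original source.
if __name__ == '__main__':
    config = {
        'Socrata': {
            'DOMAIN': 'data.lacity.org',
            'TOKEN': 'None',        # parsed to None, i.e. an unauthenticated client
            'TIMEOUT': '60',
            'ATTEMPTS': '3',
            'AP2019': 'aub4-z9pc',  # hypothetical dataset id for year 2019
        }
    }
    sc = SocrataClient(config)
    # kwargs are passed straight through to sodapy's Socrata.get
    rows = sc.get(2019, limit=10, order='updateddate DESC')
    print(len(rows))
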
def PersonToTelefonoCorreo_u5cm_hpr6(m):
    TRX = MaltegoTransform()
    # m.parseArguments(sys.argv)
    # person = sys.argv[1]
    person = m.Value
    try:
        client = Socrata("www.datos.gov.co", None)
        r = client.get("u5mc-hpr6", limit=2000)
        for i in range(len(r)):
            if r[i]['nombre'] == person:
                celular = r[i]['celular']
                email = r[i]['correo_electr_nico']
                break
        ent = TRX.addEntity('maltego.PhoneNumber', celular)
        ent.addAdditionalFields("phonenumber.countrycode", "Country Code", True, "57")
        ent2 = TRX.addEntity('maltego.EmailAddress', email)
    except Exception as e:
        TRX.addUIMessage("Cedula no encontrada en la base de datos")
    TRX.returnOutput()

def get_unique_locations(self):
    # Unauthenticated client only works with public data sets. Note 'None'
    # in place of application token, and no username or password:
    client = Socrata("data.melbourne.vic.gov.au", None)
    # Example authenticated client (needed for non-public datasets):
    # client = Socrata("data.melbourne.vic.gov.au",
    #                  MyAppToken,
    #                  username="*****@*****.**",
    #                  password="******")
    # First 1000 results, returned as JSON from the API / converted to a
    # Python list of dictionaries by sodapy.
    results = client.get("mxb8-wn4w", limit=1000)
    locations = set()
    for value in results:
        for key in value:
            if 'sensor_name' in key:
                locations.add(value[key])
    print(locations)
    return None

def fetch_results(api, order=None, select=None, where=None, rowlimit=2000):
    client = Socrata(api['root'], None)
    return client.get(api['code'], order=order, select=select,
                      where=where, limit=rowlimit)

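# A hedged example of calling fetch_results above. The 'root'/'code' keys
# match what the function reads; the domain, dataset id, and where clause
# are illustrative values, not from the original source.
nyc_311 = {'root': 'data.cityofnewyork.us', 'code': 'erm2-nwe9'}
rows = fetch_results(nyc_311,
                     select='created_date, complaint_type',
                     where="borough = 'BROOKLYN'",
                     rowlimit=100)
print(len(rows))
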
def import_geometries(ds_id, proj=None):
    '''
    Imports the shapefiles from a given data store on the City of Chicago
    data portal and returns a geopandas dataframe linking geometries to
    different attributes.

    Inputs:
        ds_id (str): the data set identifier

    Returns:
        geodf from ds_id

    References:
        geopandas example docs:
        http://geopandas.org/gallery/create_geopandas_from_pandas.html?highlight=regular%20pandas
    '''
    if not proj:
        proj = {'init': 'epsg:4326'}
    # Credentials are assumed to be defined at module level alongside app_token
    client = Socrata('data.cityofchicago.org', app_token,
                     username=username, password=password)
    files = client.get(ds_id)
    df = pd.DataFrame(files)
    df['the_geom'] = df.the_geom.apply(shapely.geometry.shape)
    df = geo_pd.GeoDataFrame(df, geometry='the_geom')
    df.crs = proj
    df.drop(['shape_area', 'shape_len'], axis=1, inplace=True)
    return df

def getData():
    from sodapy import Socrata
    from datetime import datetime
    global columnsList
    # apiEndpoint = "https://data.austintexas.gov/resource/hcup-htgu.csv"
    Date = datetime.now().strftime('%Y%m%d')
    # Set up a connection
    client = Socrata("data.austintexas.gov", None)
    # Retrieve data from the animal shelter resource; returned as a JSON doc
    # from the API, i.e. a Python list of dictionaries via sodapy
    results = client.get("hcup-htgu")
    # Convert to pandas dataframe
    results_df = pd.DataFrame.from_records(results)
    columnsList = list(results_df)
    results_df.to_csv(fileMerged + Date + '.csv', index=False,
                      encoding='utf-8', columns=columnsList)
    return columnsList

def get_driver_license_info(self):
    client = Socrata("www.datossct.gob.mx", None)
    for rec in self:
        try:
            driver_license = client.get('3qhi-59v6', licencia=rec.driver_license)
            license_valid_from = datetime.strptime(
                driver_license[0]['fecha_inicio_vigencia'],
                '%Y-%m-%dT%H:%M:%S.%f')
            license_expiration = datetime.strptime(
                driver_license[0]['fecha_fin_vigencia'],
                '%Y-%m-%dT%H:%M:%S.%f')
            rec.write({
                'license_type': driver_license[0]['categoria_de_la_licencia'],
                'license_valid_from': license_valid_from,
                'license_expiration': license_expiration,
            })
            client.close()
        except Exception:
            client.close()
            raise ValidationError(
                _('The driver license is not in SCT database'))

def PersonToDireccion(m):
    TRX = MaltegoTransform()
    # m.parseArguments(sys.argv)
    # nombre = sys.argv[1]
    nombre = m.Value
    try:
        client = Socrata("www.datos.gov.co", None)
        r = client.get("3ard-sj8g", limit=2000)
        for i in range(len(r)):
            if r[i]['nombre_prestador'] == nombre:
                dir = r[i]['nombre_concejal']
                barrio = r[i]['barrio']
                l = r[i]['localizacion']['coordinates']
                break
        ent = TRX.addEntity('maltego.Location', dir)
        ent.addAdditionalFields("country", "Country", True, "Colombia")
        ent.addAdditionalFields("location.area", "Area", True, barrio)
        ent.addAdditionalFields("streetaddress", "Street Address", True, dir)
        ent.addAdditionalFields("longitude", "Longitude", True, l[0])
        ent.addAdditionalFields("latitude", "Latitude", True, l[1])
    except Exception as e:
        TRX.addUIMessage("Cedula no encontrada en la base de datos")
    TRX.returnOutput()

def indicator_value(self, indicator: str, day: datetime) -> list:
    """
    Populate provinces indicator

    :indicator: The indicator needed (must be a key of the indicator_map)
    :day: The day of interest
    :return: The average value of the indicator
    """
    date_fmt = day.strftime('%Y-%m-%dT00:00.000')
    key = self.indicator_map[indicator]['key']
    param = self.indicator_map[indicator]['param']
    client = Socrata("www.dati.friuliveneziagiulia.it", None)
    sensors_data = client.get(key, limit=10**10, data_misura=date_fmt)
    for province in self.provinces:
        values = [
            x for x in sensors_data
            if x['rete'] == province.name and x['dati_insuff'] == 'False'
        ]
        if len(values) != 0:
            float_values = [float(x[param]) for x in values]
            setattr(province.quality, indicator,
                    round(mean(float_values), 2))

def create_results_df():
    """Create a dataframe based on JSON from the Chicago traffic API

    Args:
        None
    Returns:
        Dataframe object: ``results_df``
    """
    try:
        # First 2000 results, returned as JSON from API / converted to Python
        # list of dictionaries by sodapy.
        # Unauthenticated client only works with public data sets. Note 'None'
        # in place of application token, and no username or password:
        data_client = Socrata("data.cityofchicago.org", None)
        results = data_client.get("8v9j-bter", limit=2000)  # unique id for Chicago traffic data
        # Convert to pandas DataFrame
        results_df = pd.DataFrame.from_records(results)
        logger.info("Successfully created a pandas dataframe!")
        return results_df
    except Exception as e:
        logger.error("Failure to create a pandas dataframe :(")
        raise e

def dataFrame(api_id, record):
    print("Importing dataframe from API...")
    res = requests.get(f'https://opendata.usac.org/resource/{api_id}.json?$select=count(*)')
    jsonRes = res.json()
    s1 = json.dumps(jsonRes)
    d2 = json.loads(s1)
    record_count = int(d2[0]['count'])
    fileName = str(now.strftime("%Y-%m-%d_%H-%M") + "_RHC_" + record + ".csv")
    filePath = os.path.join(os.getcwd(), fileName)
    # Page through the dataset 20,000 rows at a time
    page_size = 20000
    call_range = int(record_count / page_size + 1)
    offset = 0
    for i in range(call_range):
        client = Socrata("opendata.usac.org", None)
        results = client.get(api_id, limit=page_size, offset=offset)
        results_df = pd.DataFrame.from_records(results)
        results_df.to_csv(filePath, sep="|", mode='a', header=False)
        if i == 0:
            print("Converting DataFrame into CSV")
        print(f'Offset #: {offset}')
        offset += page_size

def fillCDC500CitiesData(self):
    # My (John Pham's) access token is QoQet97KEDYpMW4x4Manaflkp
    client = Socrata("chronicdata.cdc.gov", 'QoQet97KEDYpMW4x4Manaflkp')
    censusTractRetrievalString = "place_tractid='3651000-" + self.censusTractNum + "'"
    censusTract = client.get("47z2-4wuh",
                             where=censusTractRetrievalString,
                             content_type="json",
                             order="place_tractid ASC",
                             limit=1000)
    client.close()
    # Model-based estimate for crude prevalence of older adult men aged >=65 years
    # who are up to date on a core set of clinical preventive services: flu shot
    # past year, PPV shot ever, colorectal cancer screening, 2016
    self.coreMenCrudePrev = float(censusTract[0]["corem_crudeprev"])
    # Model-based estimate for crude prevalence of older adult women aged >=65 years
    # who are up to date on a core set of clinical preventive services: flu shot past
    # year, PPV shot ever, colorectal cancer screening, and mammogram past 2 years, 2016
    self.coreWomenCrudePrev = float(censusTract[0]["corew_crudeprev"])
    # Model-based estimate for crude prevalence of fecal occult blood test,
    # sigmoidoscopy, or colonoscopy among adults aged 50-75 years, 2016
    self.colonScreenCrudePrev = float(censusTract[0]["colon_screen_crudeprev"])
    # Model-based estimate for crude prevalence of mammography use among women
    # aged 50-74 years, 2016
    self.mammoUseCrudePrev = float(censusTract[0]["mammouse_crudeprev"])
    # Model-based estimate for crude prevalence of all teeth lost among adults
    # aged >=65 years, 2016
    self.teethLostCrudePrev = float(censusTract[0]["teethlost_crudeprev"])
    # Latitude, longitude of census tract centroid
    self.geolocation = censusTract[0]["geolocation"]
    # Save these values as numbers in the range 0-100
    self.coreMenPercentage = self.coreMenCrudePrev / self.totalMales65Plus * 100
    self.coreWomenPercentage = self.coreWomenCrudePrev / self.totalFemales65Plus * 100
    self.mammoUsePercentage = self.mammoUseCrudePrev / self.totalFemales50To74 * 100
    self.colonScreenPercentage = self.colonScreenCrudePrev / self.totalOlderAdults50To74 * 100
    self.teethLostPercentage = self.teethLostCrudePrev / self.totalOlderAdults65Plus * 100

def DatasetColumns(dataset_id):
    try:
        # Create the Socrata client
        client = Socrata(cfg["web"], cfg["token"],
                         username=cfg["email"], password=cfg["password"])
        data = client.get(dataset_id, limit=2)
        data = sorted(data[0].keys())
        # Get the columns that could be used in the dataframe
        columns = str(data).upper()
        columns = columns.replace("[", "")
        columns = columns.replace("]", "")
        columns = columns.replace("'", "")
        columns = columns.replace(" ", "")
        columns = columns.split(",")
        # Get data to compare with the uploaded data
        for i in range(0, len(data)):
            data[i] = (str(data[i]).upper(), str(data[i]).upper())
        client.close()
    except BaseException as e:
        # If there is an error, reload login with an error message
        error = str(e)
        print('Error description:')
        print(error)
        data = None
        columns = None
        client.close()
    return data, columns

def get_covid_data(zip):
    # Unauthenticated client only works with public data sets. Note 'None'
    # in place of application token, and no username or password:
    client = Socrata("data.sfgov.org", None)
    # Example authenticated client (needed for non-public datasets):
    # client = Socrata("data.sfgov.org",
    #                  MyAppToken,
    #                  username="*****@*****.**",
    #                  password="******")
    # First 2000 results, returned as JSON from API / converted to Python list
    # of dictionaries by sodapy.
    results = client.get("favi-qct6", zip_code=zip, limit=2000)
    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results)
    # User entered an invalid ZIP code
    if len(results) == 0:
        label['text'] = ""
        label2['text'] = ""
        label3['text'] = ("No results for ZIP code '" + str(zip) +
                          "' \n\nPlease enter valid San Francisco ZIP code.")
        label4['text'] = ""
        label5['text'] = ""
    else:
        label['text'] = "As of " + results[0]['data_as_of']
        label2['text'] = "For ZIP Code: " + results[0]['zip_code']
        label3['text'] = "No. of Cases: " + results[0]['count']
        label4['text'] = "Population Size (2017 Census estimate): " + results[0]['acs_population']
        label5['text'] = "Estimated Rate of Cases (per 10k): " + results[0]['rate']

def get_case_study_data():
    """ Function to match complaint narratives to message ids used in case study """
    # Use Socrata client to collect consumer finance data
    client = Socrata("data.consumerfinance.gov", None)
    results = client.get("s6ew-h6mp", limit=99999999)
    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results)
    results_df = results_df[['complaint_id', 'complaint_what_happened']]
    results_df.rename(columns={'complaint_what_happened': 'text'}, inplace=True)
    results_df.complaint_id = results_df.complaint_id.astype('int64')
    # Load case study message ids
    case_study_df = pd.read_csv("case_study_msg_ids.csv")
    case_study_df.complaint_id = case_study_df.complaint_id.astype('int64')
    # Join by msg_id
    case_study_df = case_study_df.merge(results_df, on='complaint_id', how='left')
    # Drop NAs
    case_study_df.dropna(inplace=True)
    return case_study_df

def NombreToCargo(m):
    TRX = MaltegoTransform()
    # m.parseArguments(sys.argv)
    # nombre = sys.argv[1]
    nombre = m.Value
    try:
        client = Socrata("www.datos.gov.co", None)
        r = client.get("2gvv-khi3", limit=2000)
        for i in range(len(r)):
            if r[i]['nombre'] == nombre.upper():
                cargo = r[i]['cargo']
                direccion = r[i]['direccion']
                email = r[i]['email']
                telefono = r[i]['telefono']
                break
        ent = TRX.addEntity('eci.Cargo', cargo)
        ent.addAdditionalFields("properity.direccion", "Direccion", True, direccion)
        ent.addAdditionalFields("properity.email", "Email", True, email)
        ent.addAdditionalFields("properity.telefono", "Telefono", True, telefono)
    except Exception as e:
        TRX.addUIMessage("Nombre no encontrado en la base de datos")
    TRX.returnOutput()

def get_open_data(url, endpoint, api_key, limit=1000, query=''):
    # Input: Socrata endpoint for the data, plus an OPTIONAL where-clause query
    client = Socrata(url, api_key)
    out = []
    idx = 0
    time.sleep(5)  # sleep 5 seconds, to allow time to connect
    data = client.get(endpoint, limit=limit, offset=idx, where=query)
    time.sleep(20)  # sleep 20 seconds, to allow time to connect
    while len(data) > 0:
        # Page through the results, appending to the out list
        out += data
        idx += limit
        data = client.get(endpoint, limit=limit, offset=idx, where=query)
    client.close()
    return out

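# A hedged usage sketch for get_open_data above. The domain, dataset id, and
# the where-clause field are illustrative assumptions; api_key=None assumes a
# public dataset.
rows = get_open_data('data.cityofchicago.org', '8v9j-bter', None,
                     limit=1000, query="_last_updt > '2020-01-01'")
print(len(rows))
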
def run(self):
    # Authenticate to S3
    ses = boto3.session.Session(profile_name='luigi_dpa', region_name='us-west-2')
    s3_resource = ses.resource('s3')
    obj = s3_resource.Bucket(self.bucket)
    print(ses)
    # Authenticate the client
    client = Socrata(settings.get('dburl'),
                     settings.get('apptoken'),
                     username=settings.get('user'),
                     password=settings.get('pass'))
    # Results are returned as a JSON document from the API and converted to a
    # Python list by sodapy
    client.timeout = 1000
    limit = 1000000000
    # Query
    results = client.get(
        "erm2-nwe9",
        limit=limit,
        where=f"created_date between '{self.year}-{self.month}-{self.day}T00:00:00.000' "
              f"and '{self.year}-{self.month}-{self.day}T23:59:59.999'"
    )
    with self.output().open('w') as json_file:
        json.dump(results, json_file)

def load_subset(self, limit, query=False):
    '''Uses the sodapy Socrata client to access the Seattle Public Library
    checkouts dataset.'''
    client = Socrata(self.datasource, None)
    if query is not False:
        try:
            results = client.get(self.ID, query=query)
        except HTTPError:
            print('For queries, terms such as select, groupby, or where',
                  'need to be lowercase. Strings need to be in quotes,',
                  'e.g. "EBOOK", and numbers need to not have quotes.')
        self.results_df = pd.DataFrame.from_records(results)
    else:
        results = client.get("tmmm-ytt6", limit=limit)
        self.results_df = pd.DataFrame.from_records(results)
    return self.results_df.head()

def trafficDataIngestion(datalimit, start_datetime, file_dir):
    # Unauthenticated client only works with public data sets. Note 'None'
    # in place of application token, and no username or password:
    client = Socrata("data.cityofnewyork.us", None)
    # Define the start date for the query
    date = f"data_as_of>'{start_datetime}'"
    date = date.replace(" ", "T")
    print(date)
    # Columns that will be requested from the API
    columns = "data_as_of, id, speed, travel_time, link_name"
    results = client.get("i4gi-tjb9", limit=datalimit, borough="Manhattan",
                         where=date, select=columns)
    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results)
    if results_df.empty:
        return False
    # ------------------------- datetime - time_hour ------------------------- #
    results_df["datetime"] = results_df["data_as_of"].str[:-9] + "00:00"
    results_df["datetime_traffic"] = results_df["data_as_of"].str[:-4]
    results_df["datetime"] = pd.to_datetime(results_df["datetime"])
    results_df["weekday"] = results_df['datetime'].dt.day_name()
    results_df["datetime"] = results_df["datetime"].dt.strftime("%Y-%m-%dT%H:%M:%S")
    # Keep the new columns
    results_df = results_df[["datetime", "datetime_traffic", "weekday", "id",
                             "speed", "travel_time", "link_name"]]
    # Save to the file path passed as a parameter
    results_df.to_csv(file_dir, index=False)
    print(f"TrafficApi: {file_dir}")
    return True

def getDataset(dataset_id):
    table = ''
    try:
        # Create the Socrata client
        client = Socrata(cfg["web"], cfg["token"],
                         username=cfg["email"], password=cfg["password"])
        data = client.get(dataset_id, content_type="json")
        data = str(data)
        data = data.replace("'", "\"")
        data = data.upper()
        # Get data to compare with the uploaded data
        # print(type(data))
        # print(data)
        table = pd.read_json(data)
        # Replace NaN with ''
        table = table.replace(pd.np.nan, '', regex=True)
        table = table.to_html(classes='table-striped " id = "my_table', index=False)
        vistas = client.get_metadata(dataset_id)
        vistas = str(vistas.get("viewCount"))
        client.close()
    except BaseException as e:
        # If there is an error, reload login with an error message
        error = str(e)
        print('Error description:')
        print(error)
        client.close()
    return table, vistas

def retrieve_save_data(**kwargs):
    # Using an authenticated client:
    # API document: https://dev.socrata.com/foundry/data.lacity.org/aub4-z9pc
    client = Socrata("data.lacity.org", MY_APP_TOKEN, USERNAME, PASSWORD)
    # Getting the total number of rows in the dataset:
    # row_count = client.get("aub4-z9pc", select="count(srnumber)")
    # row_count = pd.DataFrame.from_records(row_count)
    # row_count = row_count.count_srnumber[0]
    # Grab the last 10K records for upserting
    row_count = 10000
    logging.info("The number of rows is read successfully. Now it's pulling data.")
    # Retrieve the dataset using the API; limit=row_count caps how many rows are pulled
    dataset = client.get("aub4-z9pc", content_type="csv", limit=row_count)
    logging.info("The dataset is pulled successfully.")
    # Remove the previous file if it exists
    if os.path.exists(filename):
        logging.info(filename + " already exists. Overwriting it")
        os.remove(filename)
    # Write the new file
    with open(filename, 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(dataset)
    logging.info("The dataset is saved successfully, locally at {}.".format(filename))

def getData(self):
    # Get crime records via API
    if self.storage == "remote":
        print("Getting crime data from remote source.\nThis takes a while "
              "(approx. 5 mins)! Please be patient.")
        # API request information
        client_crime = Socrata('data.lacity.org', '7pTgt6f2oTY53aDI1jXNJoNZD')
        offset_temp = 0
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        cur.execute("DROP TABLE IF EXISTS 'Crime2016'")
        # Get the data in a dataframe, then manipulate it before storing it
        # in ReferralCrimeMap.db
        while True:
            results = client_crime.get("7fvc-faax", limit=10000, offset=offset_temp)
            crime_df_temp = pd.DataFrame.from_records(results)
            # This loop stops when the next block of the dataframe is empty
            if crime_df_temp.empty == True:
                break
            # Split location_1 into lat and long
            # Create 'year_rptd' to filter cases reported in 2016
            # Create 'count' for later data analysis
            crime_df_temp['location_1'] = crime_df_temp['location_1'].astype('str')
            crime_df_temp['long'] = crime_df_temp['location_1'].map(
                lambda x: x.split(']')[0].split('[')[-1].split(',')[0])
            crime_df_temp['lat'] = crime_df_temp['location_1'].map(
                lambda x: x.split(']')[0].split('[')[-1].split(',')[-1])
            crime_df_temp['year_rptd'] = crime_df_temp['date_rptd'].map(
                lambda x: x.split('-')[0])
            crime_df_temp['month_rptd'] = crime_df_temp['date_rptd'].map(
                lambda x: x.split('-')[1])
            crime_df_temp['count'] = 1
            crime_df_temp = crime_df_temp[crime_df_temp['year_rptd'] == '2016']
            # Insert the dataframe into ReferralCrimeMap.db
            pd_sql.to_sql(crime_df_temp, 'Crime2016', conn, if_exists='append', index=False)
            offset_temp += 10000
            # Show the percentage of data fetched so far
            if offset_temp % 100000 == 0:
                print(offset_temp / 2000000 * 100, "%")
            else:
                continue
        cur.execute("SELECT * FROM Crime2016")
        print(cur.fetchone())
        conn.close()
    # Load local data if -source is set to local
    else:
        print("Getting crime data from local source.")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        query = "SELECT * FROM Crime"
        try:
            crime = pd.read_sql(query, conn)
            conn.close()
            print(crime.head())
        except Exception as e:
            print("There is an error:", e)
            print("Please set data source as remote.")
            exit()

def gen_data(filepath, api_key, username=None, password=None, output='json'):
    api = Socrata('data.seattle.gov', api_key, username=username, password=password)
    with open(filepath) as fp:
        uid = set([i.strip() for i in fp.readlines()]) | \
              set([i.strip() for i in open('completed.json')])
    for dataset in uid:
        print(dataset, file=open('completed.json', 'a'))
        yield {dataset: api.get('/resource/' + dataset + '.' + output)}

def createBarChart(charttype):
    with open("database_charts/chartconfig.json") as config:
        jsonData = json.load(config)
    chart = jsonData[charttype]
    chartTitle = chart['chart-title']
    dataTitle = chart['data-title']
    yFormat = chart['y-axis-format']
    xaxis = chart['x-axis']
    items = chart['y-axis']
    colors = chart['color']
    with open("database_charts/url_info.json") as urlConfig:
        urlInfo = json.load(urlConfig)
    client = Socrata(urlInfo['url'], None,
                     username=urlInfo['username'],
                     password=urlInfo['password'])
    colorIndex = 0
    data = []
    for item in items:
        series = {
            "type": chart["chart-type"],
            "legendText": item.replace("_", " "),
            "cursor": "pointer" if len(items) > 1 else "default",
            "showInLegend": True,
            'legendMarkerColor': colors[colorIndex],
            "toolTipContent": item.replace("_", " ") + " in year " + "{label}: {y}"
        }
        dataPoints = []
        request = client.get(dataTitle, select=xaxis + ", " + item)
        for r in request:
            if len(r) > 1:
                d = {
                    "label": int(r[xaxis]),
                    "y": int(r[item]),
                    "color": colors[colorIndex],
                }
                if d['label'] is not None:
                    dataPoints.append(d)
        colorIndex += 1
        series["dataPoints"] = dataPoints
        data.append(series)
    # Create chart information
    chartInfo = {}
    chartInfo["chartTitle"] = chartTitle
    chartInfo["data"] = data
    chartInfo["json"] = json.dumps(data)
    chartInfo["valueFormat"] = yFormat
    chartInfo['addClick'] = False if len(items) == 1 else True
    return chartInfo

def createPieChart(self):
    with open("database_charts/chartconfig.json") as config:
        data = json.load(config)
    chartconfig = {}
    for key, value in data.items():
        if key == self.chartType:
            chartconfig = value
    # The position index from the excel file
    xaxis = chartconfig['x-axis']
    # An array of dictionaries with the name of the category and index in the excel file
    yaxis = chartconfig['y-axis']
    charttitle = chartconfig['chart-title']
    datatitle = chartconfig['data-title']
    colors = chartconfig['color']
    colorIndex = 0
    with open("database_charts/url_info.json") as urlConfig:
        urlInfo = json.load(urlConfig)
    client = Socrata(urlInfo['url'], None,
                     username=urlInfo['username'],
                     password=urlInfo['password'])
    request = client.get(datatitle, select=",".join(yaxis),
                         where=xaxis + "=" + str(self.year))[0]
    newData = []
    dic = {
        "type": "pie",
        "showInLegend": True,
        "toolTipContent": "{y} - #percent %",
        "yValueFormatString": "#0.#,,. Million",
        "legendText": "{indexLabel}",
    }
    dataPoints = {}
    for cat in yaxis:
        if cat in request.keys() and int(request[cat]) > 0:
            if cat not in dataPoints.keys():
                dataPoints[cat] = {'y': int(request[cat]),
                                   'indexLabel': cat.replace("_", " "),
                                   'legendMarkerColor': colors[colorIndex],
                                   'color': colors[colorIndex]}
            else:
                dataPoints[cat]['y'] += int(request[cat])
            colorIndex += 1
    for item in dataPoints:
        newData.append(dataPoints[item])
    dic['dataPoints'] = newData
    pieChartInfo = {}
    pieChartInfo['title'] = charttitle + str(self.year)
    pieChartInfo['data'] = dic
    pieChartInfo['json'] = json.dumps(dic)
    return pieChartInfo

def test_get():
    mock_adapter = {}
    mock_adapter["prefix"] = PREFIX
    adapter = requests_mock.Adapter()
    mock_adapter["adapter"] = adapter
    client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter)

    response_data = "get_songs.txt"
    setup_mock(adapter, "GET", response_data, 200)
    response = client.get(DATASET_IDENTIFIER)

    assert isinstance(response, list)
    assert len(response) == 10

    client.close()

def getData(self):
    if self.storage == "remote":
        print("Getting DCFS referral data from remote source.")
        # API request
        client_dcfs = Socrata('data.lacounty.gov', '7pTgt6f2oTY53aDI1jXNJoNZD')
        offset_temp = 0
        dcfs_df = pd.DataFrame()
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        cur.execute('DROP TABLE IF EXISTS dcfs')
        print('Inserting DCFS referral dataframe into ReferralCrimeMap.db.')
        while True:
            results = client_dcfs.get("8vmx-hhtu", limit=5000, offset=offset_temp)
            dcfs_df = pd.DataFrame.from_records(results)
            # Break the loop and stop requesting if the block is empty
            if dcfs_df.empty == True:
                break
            # Insert the dataframe into ReferralCrimeMap.db
            dcfs_df['location'] = dcfs_df['location'].astype('str')
            pd_sql.to_sql(dcfs_df, 'dcfs', conn, if_exists='append', index=False)
            offset_temp += 5000
            # I didn't use time.sleep as this API is unlimited
            # time.sleep(1)
        conn.close()
    else:
        print("Getting DCFS referral data from local source.")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        query = '''
        SELECT *
        FROM dcfs
        '''
        try:
            dcfs = pd.read_sql(query, conn)
            conn.close()
            print(dcfs.head())
        # If the table does not exist it will throw an error.
        except Exception as e:
            print('There is an error:', e)
            print('Please enter remote source.')
            exit()

def fetch_data():
    # Make a connection
    conn = r.connect(host="localhost", port=28015, db="test")
    # You need to register the App Token (manually?)
    client = Socrata("data.sunshinecoast.qld.gov.au", "6MbT9NoWolynKM1ooRzrvm7Fs")
    # API endpoint
    endpoint = "/resource/mn3m-fqri.json"
    off = 0
    while True:
        data = client.get(endpoint, limit=50000, offset=off)
        if len(data) == 0:
            break
        for elem in data:
            try:
                # Only store the record if it has a date
                cur = elem['d_date_rec']
                r.table("planning").insert(elem).run(conn)
            except KeyError:
                pass
        off = off + 50000

def get_resource_data(socrata_resource, since=None, limit=1000):
    client = Socrata(socrata_resource.domain, socrata_resource.token)
    kwargs = {
        'limit': limit,
        'where': socrata_resource.conditions,
    }
    if socrata_resource.unique_key:
        kwargs['order'] = socrata_resource.unique_key
    while True:
        if since:
            kwargs['where'] = "{} and {} > '{}'".format(
                socrata_resource.conditions,
                socrata_resource.unique_key,
                since
            )
        batch = client.get(socrata_resource.endpoint, **kwargs)
        if len(batch) > 0:
            since = batch[-1][socrata_resource.unique_key]
            yield batch
        else:
            return

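# A sketch of consuming the get_resource_data generator above. SocrataResource
# is a hypothetical stand-in for whatever object carries the domain, token,
# endpoint, conditions, and unique_key attributes the generator reads; the
# dataset and where clause are illustrative.
from collections import namedtuple

SocrataResource = namedtuple(
    'SocrataResource',
    ['domain', 'token', 'endpoint', 'conditions', 'unique_key'])

resource = SocrataResource(
    domain='data.cityofnewyork.us',
    token=None,
    endpoint='erm2-nwe9',
    conditions="created_date > '2020-01-01T00:00:00'",
    unique_key='created_date')

# Each yielded batch advances `since` past the last row seen, so paging is
# keyset-based rather than offset-based.
for batch in get_resource_data(resource, limit=1000):
    print('fetched batch of', len(batch))
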
def run_script(): client = Socrata("data.sfgov.org", "wvRAyq5wvCnf9YKGmiuZ7T9y3") fetched_data = client.get("/resource/rqzj-sfat.json", select="objectid,latitude,longitude,fooditems,expirationdate") data = [] for i in fetched_data: a = Node( i.get("objectid", None), i.get("latitude", None), i.get("longitude", None), i.get("fooditems", None), i.get("expirationdate", None), ) print repr(a) data.append(a._asdict()) urlparse.uses_netloc.append("postgres") try: url = urlparse.urlparse(os.environ["DATABASE_URL"]) conn = psycopg2.connect( database=url.path[1:], user=url.username, password=url.password, host=url.hostname, port=url.port ) except: url = urlparse.urlparse("postgresql://localhost/mydb") conn = psycopg2.connect( database=url.path[1:], user=url.username, password=url.password, host=url.hostname, port=url.port ) print "I am unable to connect to the database" cur = conn.cursor() try: cur.executemany( "insert into foodtrucks(objectid,latitude, longitude,fooditems,expirationdate) select %(objectid)s,%(latitude)s,%(longitude)s,%(fooditems)s,%(expirationdate)s where not exists (select 1 from foodtrucks where objectid=%(objectid)s)", data, ) except Exception as e: print "unable to query postgres =>", e conn.commit() return render_template("successScript.html")
def run_script(): client = Socrata("data.sfgov.org","wvRAyq5wvCnf9YKGmiuZ7T9y3") fetched_data = client.get("/resource/rqzj-sfat.json",select="objectid,latitude,longitude,fooditems,expirationdate") for row in fetched_data: print str(row) print "fetched_data", str(fetched_data) data = [] for i in fetched_data: a = Node(i.get('objectid',None),i.get('latitude',None),i.get('longitude',None),i.get('fooditems',None),i.get('expirationdate',None)) print repr(a) data.append(a._asdict()) print data try: conn = psycopg2.connect("dbname='mydb'") except: print "I am unable to connect to the database" cur = conn.cursor() try: cur.executemany("insert into foodtrucks(objectid,latitude, longitude,fooditems,expirationdate) select %(objectid)s,%(latitude)s,%(longitude)s,%(fooditems)s,%(expirationdate)s where not exists (select 1 from foodtrucks where objectid=%(objectid)s)",data) except Exception as e: print "unable to query postgres =>", e conn.commit()
def get(self, request, *args, **kwargs):
    '''Get criminal records from the dataset and save them to the database
    as CriminalRecord objects'''
    ids = CriminalRecord.objects.values_list('case_id', flat=True)
    domain = settings.SOCRATA_DOMAIN
    token = settings.SOCRATA_APP_TOKEN
    endpoint = settings.SOCRATA_DATASET_ENDPOINT
    client = Socrata(domain, token)
    order = "date"
    where = 'latitude IS NOT NULL'
    offset = 0
    limit = 1000
    status = 200
    try:
        data = client.get(endpoint, order=order, where=where,
                          offset=offset, limit=limit)
        while data:
            for record in data:
                if self.to_int(record.get('id')) not in ids:
                    attrs = {
                        'case_id': self.to_int(self.get_from_dict(record, 'id')),
                        'case_number': self.get_from_dict(record, 'case_number'),
                        'date': datetime.strptime(
                            self.get_from_dict(record, 'date'),
                            '%Y-%m-%dT%H:%M:%S'),
                        'block': self.get_from_dict(record, 'block'),
                        'iucr': self.get_from_dict(record, 'iucr'),
                        'primary_type': self.get_from_dict(record, 'primary_type'),
                        'crime_description': self.get_from_dict(record, 'description'),
                        'location_description': self.get_from_dict(
                            record, 'location_description'),
                        'has_arrested': self.get_from_dict(record, 'arrest'),
                        'is_domestic': self.get_from_dict(record, 'domestic'),
                        'beat': self.get_from_dict(record, 'beat'),
                        'district': self.get_from_dict(record, 'district'),
                        'ward': self.to_int(self.get_from_dict(record, 'ward')),
                        'community_area': self.get_from_dict(record, 'community_area'),
                        'fbi_code': self.get_from_dict(record, 'fbi_code'),
                        'x_coordinate': self.to_int(
                            self.get_from_dict(record, 'x_coordinate')),
                        'y_coordinate': self.to_int(
                            self.get_from_dict(record, 'y_coordinate')),
                        'year': self.to_int(self.get_from_dict(record, 'year')),
                        'updated_on': datetime.strptime(
                            self.get_from_dict(record, 'updated_on'),
                            '%Y-%m-%dT%H:%M:%S'),
                        'latitude': float(self.get_from_dict(record, 'latitude')),
                        'longitude': float(self.get_from_dict(record, 'longitude')),
                        'location': Point(
                            float(self.get_from_dict(record, 'longitude')),
                            float(self.get_from_dict(record, 'latitude'))),
                    }
                    CriminalRecord.objects.create(**attrs)
            offset += limit
            data = client.get(endpoint, order=order, where=where,
                              offset=offset, limit=limit)
        client.close()
    except Exception as e:
        print(e)
        status = 400

from sodapy import Socrata

# client = Socrata("sandbox.demo.socrata.com", None)
# print(client.get("nimj-3ivp", limit=10))
# client = Socrata("data.cms.gov/", None)
# print(client.get("97k6-zzx3", limit=10))
# https://resource/97k6-zzx3.json?$limit=5

client = Socrata("data.sfgov.org", "8gffbg1meMZ1e2Z0yOz2OpwZq")
# client.get("cuks-n6tp", limit=1)

assault = client.get("cuks-n6tp", select="category,time,location",
                     where="category='ASSAULT'", limit=10)
print("assault data")
print(assault)

theft = client.get("cuks-n6tp", select="category,time,location",
                   where="category='VEHICLE THEFT'", limit=10)
print("vehicle theft data")
print(theft)

vandalism = client.get("cuks-n6tp", select="category,time,location",
                       where="category='VANDALISM'", limit=10)
print("VANDALISM data")
print(vandalism)

kidnapping = client.get("cuks-n6tp", select="category,time,location",
                        where="category='KIDNAPPING'", limit=10)
print("KIDNAPPING data")
print(kidnapping)

sex = client.get("cuks-n6tp", select="category,time,location",
                 where="category='SEX OFFENSES, FORCIBLE'", limit=10)
print("SEX OFFENSES, FORCIBLE data")
print(sex)

dui = client.get("cuks-n6tp", select="category,time,location",
                 where="category='DRIVING UNDER THE INFLUENCE'", limit=10)
print("DUI data")
print(dui)

client.close()

def call_api():
    """Request data from the Socrata API and get back JSON."""
    client = Socrata("data.seattle.gov", 'SOCRATA_TOKEN')
    data = client.get("ih58-ykqj", content_type="json", limit=49998)
    return data

time_start = time.time()  # Timer start for script

# Set up API calls and initial variables
gmaps = googlemaps.Client(key=g_api_key)
api_setup = Socrata(socrata_api, socrata_app_token, socrata_id, socrata_pw)
max_retry = 3

# Set up a dictionary of found locations and coordinates
location_dict = {}
coord_pets = Pet.objects.filter(loc_lat__isnull=False).order_by('-intake_at')
for x in coord_pets:
    if x.found_location not in location_dict:
        location_dict[x.found_location] = {"lat": x.loc_lat, "lon": x.loc_lon}
print(coord_pets.count())

# Get record counts from AAC
record_count = api_setup.get(intake_endpoint, select="count('')")
record_count_out = api_setup.get(outcome_endpoint, select="count('')")
count = record_count[0]['count'].encode('ascii', 'ignore')
count_out = record_count_out[0]['count'].encode('ascii', 'ignore')

# Functions to update the database
def pull_in_intakes(intakes):
    for x in intakes:
        intime = datetime.strptime(x["intake_date"], '%m/%d/%y')
        if len(Pet.objects.filter(animal_id=x["animal_id"])) == 0:
            # Create a new pet
            new_pet = Pet()
            if "name" in x:
                new_pet.name = x["name"]
            else:
                new_pet.name = "No Name"
            new_pet.animal_id = x["animal_id"]

from sodapy import Socrata
import csv

client = Socrata(site, app_token, username=user, password=passw)
dset = "/resource/xb7i-cvg2.json"
filepath = "grants-trunc.csv"
client.get(dset)

rowlist = []
with open(filepath, "rb") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        rowlist.append(row)

client.replace(dset, rowlist)