def test_toplevel_load_dataset(dw_instances, profile):
    datadotworld.load_dataset('agentid/datasetid', profile=profile)
    assert_that(
        dw_instances[profile].load_dataset,
        called().times(1).with_args(equal_to('agentid/datasetid'),
                                    force_update=equal_to(False)))
    assert_that(
        dw_instances[profile].load_dataset,
        called().times(1).with_args(equal_to('agentid/datasetid'),
                                    auto_update=equal_to(False)))
def get_data(dataset_name, dataframe_name):
    """Request data from the datadotworld API and return a pandas dataframe.

    Additional information on the datadotworld API can be found at:
    https://apidocs.data.world/api

    Parameters
    ----------
    dataset_name: str
        Name assigned to the desired dataset stored with the datadotworld
        service.
    dataframe_name: str
        Key under which the desired table is stored in the dataset's
        dictionary of dataframes.

    Returns
    -------
    pandas DataFrame

    Examples
    --------
    >>> get_data(dataset_name='census2020', dataframe_name='Kansas')
    >>> get_data('performance_indicators', 'public_safety')
    """
    dataworld_obj = dw.load_dataset(dataset_name)
    dataframe = dataworld_obj.dataframes[dataframe_name]
    return dataframe
def import_covid_data(filename, FIPS_name):
    # Load COVID-19 county data using the datadotworld API.
    # Data provided by Johns Hopkins; file provided by the Associated Press.
    dataset = dw.load_dataset(
        "associatedpress/johns-hopkins-coronavirus-case-tracker",
        auto_update=True)
    # The dataset includes multiple dataframes. We will only use #2.
    covid_data = dataset.dataframes["2_cases_and_deaths_by_county_timeseries"]
    # Include only observations for political entities within states
    # (i.e., not territories, etc.); drop any NaN FIPS values by requiring
    # covid_data[FIPS_name] > 0.
    covid_data = covid_data[covid_data[FIPS_name] < 57000]
    covid_data = covid_data[covid_data[FIPS_name] > 0]
    # Transform FIPS codes into integers (not floats).
    covid_data[FIPS_name] = covid_data[FIPS_name].astype(int)
    covid_data.set_index([FIPS_name, "date"], inplace=True)
    # Prepare a column for state abbreviations. We will draw these from a
    # dictionary created in the next step.
    covid_data["state_abr"] = ""
    for state, abr in state_dict.items():
        covid_data.loc[covid_data["state"] == state, "state_abr"] = abr
    # Create "Location", which concatenates county name and state abbreviation.
    covid_data["Location"] = covid_data["location_name"] + ", " + \
        covid_data["state_abr"]
    return covid_data
def import_covid_data(FIPS_name):
    # Load COVID-19 county data using the datadotworld API.
    # Data provided by Johns Hopkins; file provided by the Associated Press.
    dataset = dw.load_dataset(
        "associatedpress/johns-hopkins-coronavirus-case-tracker",
        auto_update=True)
    # The dataset includes multiple dataframes. We will only use #2.
    covid_data = dataset.dataframes[
        "2_cases_and_deaths_by_county_timeseries"]
    covid_data = covid_data[covid_data[FIPS_name] < 57000]
    covid_data = covid_data[covid_data[FIPS_name] > 0]
    # Transform FIPS codes into integers.
    covid_data[FIPS_name] = covid_data[FIPS_name].astype(int)
    covid_data.set_index([FIPS_name, "date"], inplace=True)
    # Prepare a column for state abbreviations. We will draw these
    # from state_dict.
    covid_data["state_abr"] = ""
    for state, abr in state_dict.items():
        # .loc[row(s), col]
        covid_data.loc[covid_data["state"] == state, "state_abr"] = abr
    # Save the location name in the form "Cass, ND".
    covid_data["Location"] = covid_data["location_name"] + ", " + \
        covid_data["state_abr"]
    return covid_data
def load_us_covid_dataset(
    county_level: bool = False,
    death: bool = False,
    cumulative: bool = True,
    start_date: str = '2020-01-23',
    end_date: Optional[str] = None,
    selected_counties: Optional[List[str]] = None,
):
    epi_df = dw.load_dataset(
        dataset_key='covid-19-data-resource-hub/covid-19-case-counts',
        force_update=False,
        auto_update=True,
    ).dataframes['covid_19_activity']
    ctry_col = 'country_short_name'
    state_col = 'province_state_name'
    county_col = 'county_name'
    date_col = 'report_date'
    case_col = f'''people_{f"death{'' if cumulative else '_new'}" if death else f"positive{'' if cumulative else '_new'}_cases"}_count'''
    cdfs, columns = [], []
    if county_level:
        if selected_counties is None:
            epi_df = epi_df.loc[(epi_df[ctry_col] == 'United States')
                                & (epi_df[state_col].isin(state2abbr.keys()))
                                & (epi_df[county_col] != 'Unknown'),
                                [date_col, state_col, county_col, case_col]]
        else:
            states, counties = zip(*[c.split('/') for c in selected_counties])
            selected_states = list(set(states))
            selected_counties = list(set(counties))
            epi_df = epi_df.loc[(epi_df[ctry_col] == 'United States')
                                & (epi_df[state_col].isin(selected_states))
                                & (epi_df[county_col].isin(selected_counties)),
                                [date_col, state_col, county_col, case_col]]
        for (state, county), data in epi_df.groupby([state_col, county_col]):
            data = data.loc[:, [date_col, case_col]].groupby(date_col).sum()
            data.index = pd.to_datetime(data.index)
            cdfs.append(data)
            columns.append(f"{state}/{county}")
    else:
        epi_df = epi_df.loc[(epi_df[ctry_col] == 'United States')
                            & (epi_df[state_col].isin(state2abbr.keys())),
                            [date_col, state_col, county_col, case_col]]
        for state, data in epi_df.groupby(state_col):
            data = data.loc[:, [date_col, case_col]].groupby(date_col).sum()
            cdfs.append(data)
            columns.append(state)
    epi_df = pd.concat(cdfs, axis=1)
    epi_df.columns = columns
    epi_df.index.name = 'date'
    epi_df.index = pd.to_datetime(epi_df.index)
    epi_df.fillna(0.0, inplace=True)
    start_date = pd.to_datetime(start_date)
    if end_date is None:
        end_date = pd.to_datetime(datetime.today().date())
    else:
        end_date = pd.to_datetime(end_date)
    end_date = end_date - pd.Timedelta(1, unit='d')
    epi_df = epi_df.loc[start_date:end_date]
    return epi_df
def last_update():
    cvd_data1 = dw.load_dataset(
        'https://data.world/associatedpress/johns-hopkins-coronavirus-case-tracker',
        auto_update=True)
    cvd_data1 = cvd_data1.dataframes['1_county_level_confirmed_cases']
    cvd_data1['date'] = pd.to_datetime(cvd_data1['last_update'])
    date = cvd_data1.date[0].strftime('%B %d, %Y')
    return date
def dataworld_table_to_csv(data_path, dataworld_path=ZIPCODE_DATASET_PATH):
    dataset = dw.load_dataset(dataworld_path)
    properties = dataset.describe()
    dataset_name = properties["resources"][0]["name"]
    zipcode_df = dataset.dataframes[dataset_name]
    # Zero-pad ZIP codes to five digits (e.g. 501 -> "00501").
    zipcode_df["zip_code"] = zipcode_df["zip_code"].apply(
        lambda x: '{0:0>5}'.format(x))
    zipcode_df.to_csv(data_path)
def read_dtw_csv(project_key, filename, **kwargs):
    '''Reads a dataframe from a raw CSV file on data.world
    (circumventing DTW's preprocessing).'''
    datasets = dw.load_dataset(project_key, force_update=True)
    data_bytes = datasets.raw_data[filename]
    new_file, tmpfilename = tempfile.mkstemp()
    print('Writing CSV to temp file:', tmpfilename)
    os.write(new_file, data_bytes)
    os.close(new_file)
    return pd.read_csv(tmpfilename, **kwargs)
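# Hedged usage sketch for read_dtw_csv above (assumes the helper and its
# imports -- datadotworld as dw, tempfile, os, pandas as pd -- are in scope).
# 'someuser/some-project' and 'data.csv' are hypothetical placeholders; any
# extra keyword arguments are forwarded straight to pd.read_csv.
df = read_dtw_csv('someuser/some-project', 'data.csv', sep=',')
print(df.head())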
def audio_features_tosql():
    past_music = dw.load_dataset('kcmillersean/billboard-hot-100-1958-2017')
    # 'audiio' is the key under which this dataset stores its audio
    # features table.
    audio_features_df = past_music.dataframes['audiio']
    audio_features_df.to_sql('audio_features', con=engine,
                             if_exists='append', chunksize=1500)
def grab_data():
    past_music = dw.load_dataset('kcmillersean/billboard-hot-100-1958-2017')
    features_df = past_music.dataframes['audiio']
    billboard_df = past_music.dataframes['hot_stuff_2']
    # Extract the year from the week identifier (e.g. "2017-07-01" -> 2017).
    billboard_df["year"] = billboard_df["weekid"].str[0:4]
    billboard_df["year"] = billboard_df["year"].astype(int)
    feature_obj = pickle.dumps(features_df)
    billboard_obj = pickle.dumps(billboard_df)
    return feature_obj, billboard_obj
def dw_load_data(path):
    """
    Load a dataset from data.world using their Python connector (REST API).

    1. Takes the dataset path from the user in the form username/dataset_name.
    2. Returns a LocalDataset object which holds the data.
    """
    dataset = dw.load_dataset(path, auto_update=True)
    df_dict = dataset.dataframes
    print("**Dictionary of the dataframes present**")
    print(df_dict)
    return dataset
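# Hedged usage sketch for dw_load_data above; 'username/dataset_name' is a
# hypothetical placeholder path.
dataset = dw_load_data('username/dataset_name')
first_key = next(iter(dataset.dataframes))
print(dataset.dataframes[first_key].head())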
def import_covid_data(filename, fips_name):
    # Load COVID-19 county data using the datadotworld API.
    # Data provided by Johns Hopkins; file provided by the Associated Press.
    dataset = dw.load_dataset(
        "associatedpress/johns-hopkins-coronavirus-case-tracker")
    covid_data = dataset.dataframes["2_cases_and_deaths_by_county_timeseries"]
    covid_data = covid_data[covid_data[fips_name] < 57000]
    covid_data[fips_name] = covid_data[fips_name].astype(int)
    covid_data.set_index([fips_name, "date"], inplace=True)
    covid_data.loc[:, "state_abr"] = ""
    for state, abr in state_dict.items():
        covid_data.loc[covid_data["state"] == state, "state_abr"] = abr
    return covid_data
def read_dtw_excel(project_key, filename, select_sheet=None):
    '''Reads a dataframe from a raw Excel file on data.world
    (circumventing DTW's preprocessing).'''
    datasets = dw.load_dataset(project_key, force_update=True)
    data_bytes = datasets.raw_data[filename]
    new_file, tmpfilename = tempfile.mkstemp()
    print('Writing excel file to temp file:', tmpfilename)
    os.write(new_file, data_bytes)
    os.close(new_file)
    xl = pd.ExcelFile(tmpfilename)
    if select_sheet:
        return xl.parse(select_sheet)
    sheet_names = xl.sheet_names
    if len(sheet_names) == 1:
        return xl.parse(sheet_names[0])
    return dict((name, xl.parse(name)) for name in sheet_names)
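# Hedged usage sketch for read_dtw_excel above; 'someuser/some-project' and
# 'workbook.xlsx' are hypothetical placeholders. The return type depends on
# the workbook: a single dataframe when there is one sheet (or when
# select_sheet is given), otherwise a dict of sheet name -> dataframe.
result = read_dtw_excel('someuser/some-project', 'workbook.xlsx')
if isinstance(result, dict):
    for sheet_name, frame in result.items():
        print(sheet_name, frame.shape)
else:
    print(result.shape)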
def getCovidJSON():
    dataset = dw.load_dataset('markmarkoh/coronavirus-data', auto_update=True)
    dfs = dataset.dataframes
    json_ret = {}
    # Serialize each table with ISO dates in pandas' 'split' orientation.
    for name in ('full_data', 'new_cases', 'total_deaths',
                 'total_cases', 'new_deaths'):
        json_ret[name] = dfs[name].to_json(date_format='iso', orient='split')
    return json_ret
def process(self):
    # self.diagram()
    # print('* ProcessLoad Data.World')
    self.getSummary()[self.get_class_key()] = {}
    self.getSummary()[self.get_class_key()]['before'] = 0
    # import_file_name is the full path and name of the import file;
    # returns the original raw data as a pandas dataframe.
    # Downloads to ~/.dw/cache/{}/latest/data/grb_drains.csv
    self.dataframe = dw.load_dataset(self.import_file_name, auto_update=True)
    fstr = '~/.dw/cache/{}/latest/data/lgrow_current.csv'.format(
        'citizenlabs/lgrow-storm-drains-current')
    # self.dataframe = pd.read_csv(fstr)
    self.addColumns()
    cols = 'columns: '
    for col in self.get_dataframe().columns.values:
        cols += col + ', '
    self.addPath('''
    Overview                            Details
    --------                            -------
    ({})
        |
    [Retrieve Production Dataset]       | source: {} |
        |
        |
    [Get Data from DataWorld]           (response.data: {})
        |
        |
    [Load Production Dataset]           | source: {} |
        |
        |
    [Cache DW data]  <--- ({})
        |
        |
    (production count: {})'''.format(
        self.import_file_name, self.getClassName(), cols,
        self.getClassName(), fstr, len(self.dataframe)))
    # SUMMARIZE
    self.getSummary()[self.get_class_key()]['after'] = len(self.dataframe)
def load_world_covid_dataset(
    death: bool = False,
    cumulative: bool = True,
    n_ctry: Optional[int] = None,
    start_date: str = '2020-01-23',
    end_date: Optional[str] = None,
):
    epi_df = dw.load_dataset(
        dataset_key='covid-19-data-resource-hub/covid-19-case-counts',
        force_update=False,
        auto_update=True,
    ).dataframes['covid_19_activity']
    ctry_col = 'country_short_name'
    date_col = 'report_date'
    case_col = f'''people_{f"death{'' if cumulative else '_new'}" if death else f"positive{'' if cumulative else '_new'}_cases"}_count'''
    if n_ctry is None:
        ref_ctry = ref_countries
    else:
        # Pick the n_ctry countries with the highest latest counts.
        ref_ctry = epi_df.loc[:, [date_col, ctry_col, case_col]]\
            .groupby([ctry_col, date_col]).sum().reset_index()\
            .groupby(ctry_col).last().sort_values(case_col, ascending=False)\
            .head(n_ctry).index.values
    ref_data = epi_df.loc[epi_df[ctry_col].isin(ref_ctry),
                          [date_col, ctry_col, case_col]]
    columns, cdfs = [], []
    for ctry, data in ref_data.groupby(ctry_col):
        data = data[[date_col, case_col]].groupby(date_col).sum()
        data.index = pd.to_datetime(data.index)
        cdfs.append(data)
        columns.append(ctry)
    epi_df = pd.concat(cdfs, axis=1)
    epi_df.columns = columns
    epi_df.index.name = 'date'
    epi_df.index = pd.to_datetime(epi_df.index)
    epi_df.fillna(0.0, inplace=True)
    start_date = pd.to_datetime(start_date)
    if end_date is None:
        end_date = pd.to_datetime(datetime.today().date())
    else:
        end_date = pd.to_datetime(end_date)
    end_date = end_date - pd.Timedelta(1, unit='d')
    epi_df = epi_df.loc[start_date:end_date]
    return epi_df
def fetch_dataframe() -> pd.DataFrame:
    """
    Fetch the raw endpoints dataset as a Pandas dataframe and validate it
    for the required fields.
    """
    dataframe = dw.load_dataset(
        settings.DATADOTWORLD['dataset'],
        auto_update=True).dataframes[settings.DATADOTWORLD['dataframe']]
    supplied_fields = list(dataframe)
    missing_fields = set(REQUIRED_FIELDS) - set(supplied_fields)
    if missing_fields:
        raise ValueError(
            f'The provided endpoints dataset does not include required fields: '
            f'{", ".join(missing_fields)}. '
            f'Fields provided: {", ".join(supplied_fields)}')
    return dataframe.fillna('')
def load_bed_and_population_data():
    beds = dw.load_dataset(
        dataset_key='liz-friedman/hospital-capacity-data-from-hghi',
        force_update=False,
        auto_update=True,
    ).dataframes['20_population']
    beds = beds.loc[:, [
        'hrr', 'total_hospital_beds', 'total_icu_beds',
        'adult_population', 'population_65'
    ]]
    beds[['county', 'state']] = beds.hrr.str.split(', ', expand=True)
    beds = beds.loc[:, [
        'state', 'total_hospital_beds', 'total_icu_beds',
        'adult_population', 'population_65'
    ]].groupby('state').sum()
    geo = pd.read_csv(
        "https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv",
        usecols=['state', 'population', 'pop_density'],
        index_col='state',
    )
    # Back out land area from population and density, then compute the
    # adult population density for each state.
    geo['area'] = geo['population'] / geo['pop_density']
    beds['density'] = beds['adult_population'] / geo.loc[beds.index, 'area']
    return beds
def fetch_dataset(DATASET_URL=DATASET_URL):
    """
    Fetches the data.world dataset from the given URL path using
    dw.load_dataset().

    The load_dataset() function facilitates maintaining copies of datasets
    on the local filesystem. It will download a given dataset's datapackage
    and store it under ~/.dw/cache. When used subsequently, load_dataset()
    will use the copy stored on disk and will work offline, unless it's
    called with force_update=True or auto_update=True. force_update=True
    will overwrite your local copy unconditionally. auto_update=True will
    only overwrite your local copy if a newer version of the dataset is
    available on data.world.

    Returns
    -------
    `datadotworld.models.dataset.LocalDataset` object
    """
    sys.stdout.write("\n> Fetching bookmarks from: https://data.world/"
                     + DATASET_URL + " -> ")
    with Spinner():
        dataset = dw.load_dataset(DATASET_URL, auto_update=True)
    print("\n")
    if args.verbose:
        colorama.init(autoreset=True)
        print(
            colorama.Fore.BLACK + colorama.Back.YELLOW +
            "\n Local Dataset Info: " + "---" * 23, "\n")
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(dataset.describe())
        print("\n", dataset.dataframes)
        print(colorama.Fore.BLACK + colorama.Back.YELLOW + "\n" + "---" * 30)
    return dataset
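# A minimal sketch of the caching behavior described in the docstring above;
# 'someuser/some-dataset' is a hypothetical key.
import datadotworld as dw

ds = dw.load_dataset('someuser/some-dataset')  # cached under ~/.dw/cache; reused offline
ds = dw.load_dataset('someuser/some-dataset', auto_update=True)   # refresh only if a newer version exists
ds = dw.load_dataset('someuser/some-dataset', force_update=True)  # overwrite the local copy unconditionally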
def get_data(key, data_name):
    """
    Return a datadotworld dataset table as a pandas dataframe.

    Parameters
    ----------
    key: str
        Dataset key for the target data.world dataset.
    data_name: str
        Name of the data.world table, i.e. the key under which it is stored
        in the dataset's dataframes dictionary.

    Returns
    -------
    pandas dataframe

    Examples
    --------
    >>> get_data(key='org/division', data_name='employee_history')
    """
    data_obj = dw.load_dataset(dataset_key=key, auto_update=True)
    data = data_obj.dataframes[data_name]
    return data
def import_unemployment_data(filename, FIPS_name):
    # Data provided by the USDA:
    # https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/
    dataset = dw.load_dataset("unemployment.csv", auto_update=True)
    # The dataset includes multiple dataframes. We will only use the 2019
    # unemployment-rate table.
    unemployment_data = dataset.dataframes["Unemployment_rate_2019"]
    # Drop any NaN FIPS values by requiring unemployment_data[FIPS_name] > 0.
    unemployment_data = unemployment_data[unemployment_data[FIPS_name] < 57000]
    unemployment_data = unemployment_data[unemployment_data[FIPS_name] > 0]
    # Transform FIPS codes into integers (not floats).
    unemployment_data[FIPS_name] = unemployment_data[FIPS_name].astype(int)
    unemployment_data.set_index([FIPS_name, "date"], inplace=True)
    # Prepare a column for state abbreviations. We will draw these from a
    # dictionary created in the next step.
    unemployment_data["state_abr"] = ""
    for state, abr in state_dict.items():
        unemployment_data.loc[unemployment_data["state"] == state,
                              "state_abr"] = abr
    # Create "Location", which concatenates county name and state abbreviation.
    unemployment_data["Location"] = unemployment_data["location_name"] + ", " + \
        unemployment_data["state_abr"]
    return unemployment_data
import pandas as pd
import datadotworld as dw

# Pull the data into ~/.dw
d = dw.load_dataset('ian/3-centuries-of-uk-economy-data')

# Overview of the tables and their shapes.
s = pd.Series({k: v.shape for k, v in d.dataframes.items()})

df_orig = d.dataframes['m6_mthly_prices_and_wages']

# Rows 1-4 hold the header metadata; build a column MultiIndex from them.
cols = df_orig.iloc[1:5]
names = cols.iloc[:, 0].values
cols = pd.MultiIndex.from_arrays(cols.iloc[:, 2:].values)
cols.names = names

# The observations start at row 5, indexed by year and month.
df = df_orig.iloc[5:].set_index(['column_a', 'column_b'])
df.index.names = ['year', 'month']
df.columns = cols
df = df.astype(float)
# pp (a pprint.PrettyPrinter) is assumed to be preloaded in this exercise.
queryResults = dw.query('http://data.world/tutorial/sparqltutorial',
                        sparql_query, query_type='sparql')

# Use the dataframe property of the resulting query to create a dataframe
# variable named `houseStark`.
houseStark = queryResults.dataframe

# Use pp.pprint() to print the dataframe to the screen.
pp.pprint(houseStark)

# Import the datadotworld module as dw and the sys module.
import datadotworld as dw
import sys

# Import a dataset.
refugee_dataset = dw.load_dataset('nrippner/refugee-host-nations')

# Get the size of the dataset:
sys.getsizeof(refugee_dataset)

# List all of the data files:
dataframes = refugee_dataset.dataframes
for df in dataframes:
    pp.pprint(df)

# Print all of the files in a dataset:
resources = refugee_dataset.describe()['resources']
pp.pprint('name:')
for r in resources:
    pp.pprint(r['name'])
pp.pprint('\ntype of file:')
# userid: berkj
# Email: [email protected]
# Assignment Number: assignment1
# Honor statement: I pledge on my honor that I have neither given nor
# received unauthorized aid on this assignment.

# Exercise 4:
# The datadotworld module and dataset have already been loaded for you:
import datadotworld as dw
dataset = dw.load_dataset(
    'https://data.world/stephen-hoover/chicago-city-council-votes')

# Use the dataframes property to assign the alderman_votes table to the
# variable votes_dataframe.
votes_dataframe = dataset.dataframes['alderman_votes']

# Use the pandas shape property to get rows/columns size for the
# `votes_dataframe` dataframe.
pp.pprint(votes_dataframe.shape)

# Use the pandas head function to print the first 3 rows of the
# `votes_dataframe` dataframe.
pp.pprint(votes_dataframe.head(3))
'''
Created on Nov 28, 2017

@author: drews
'''
import datadotworld as dw

dataset = dw.load_dataset('data-society/european-soccer-data')
matches = dataset.dataframes['match']

# Drop unneeded columns; keep players, xy coords of players, date, id,
# goals, and home/away team ids.
matches = matches.drop(matches.columns[77:115], axis=1)
matches = matches.drop(matches.columns[0:5], axis=1)
matches = matches.dropna(axis=0, how='any')
matches.to_csv('matches.csv', index=False)
print('wrote matches.csv')

teams = dataset.dataframes['team_attributes']
teams = teams.drop(teams.columns[[0, 1, 6]], axis=1)

# Change string values to integer values.
teams['buildupplayspeedclass'] = teams['buildupplayspeedclass'].replace(
    ['Fast', 'Balanced', 'Slow'], [3, 2, 1])
teams['buildupplaydribblingclass'] = teams[
    'buildupplaydribblingclass'].replace(['Lots', 'Normal', 'Little'],
                                         [3, 2, 1])
teams['buildupplaypassingclass'] = teams['buildupplaypassingclass'].replace(
    ['Long', 'Mixed', 'Short'], [3, 2, 1])  # assumed passing-class labels
import datadotworld as d

td = intro_dataset = d.load_dataset('rfabbri/test1')

q = """SELECT * WHERE {?s ?p ?o}"""
q2 = """
PREFIX po: <http://purl.org/socialparticipation/po/>
SELECT ?s WHERE {?s a po:Participant}
"""
r = d.query('rfabbri/test1', q2, query_type='sparql')
# userid: berkj
# Email: [email protected]
# Assignment Number: assignment1
# Honor statement: I pledge on my honor that I have neither given nor
# received unauthorized aid on this assignment.

# Exercise 5:
# datadotworld module has been imported as dw
import datadotworld as dw

# We've loaded two datasets to use: 'int_dataset' and 'fipsCodes_dataset'.
int_dataset = dw.load_dataset(
    'https://data.world/jonloyens/intermediate-data-world')
fipsCodes_dataset = dw.load_dataset(
    'https://data.world/uscensusbureau/fips-state-codes')

## Create two dataframes: police_shootings from the
## 'fatal_police_shootings_data' table of int_dataset, and state_abbrvs from
## the 'statesfipscodes' table of fipsCodes_dataset.
police_shootings = int_dataset.dataframes['fatal_police_shootings_data']
state_abbrvs = fipsCodes_dataset.dataframes['statesfipscodes']

## Merge the two datasets together on the state and stusab fields.
## Assign to a merged_dataframe variable.
merged_dataframe = police_shootings.merge(state_abbrvs, how='left',
                                          left_on='state', right_on='stusab')

## Add a 'citystate' column to the merged_dataframe dataframe, populating it
## with the concatenated values from the 'city' and 'state_name' columns,
## separated by ', '.
merged_dataframe["citystate"] = merged_dataframe["city"] + ", " + \
    merged_dataframe["state_name"]

## Print first 5 rows of merged_dataframe.
pp.pprint(merged_dataframe.head(5))
""" # %% Imports. import datadotworld as dw import io import pandas as pd # %% Reading, parsing, naming, upsampling and interpolating the data. # Loading the data (CSV) from: https://data.world/aryoryte/ DDWUsrDir = "aryoryte/" DDWUaDir = "meteorological-uppsala-automatic-weather-station-1998-2017" DDWUaDir = DDWUsrDir + DDWUaDir dataPath = "original/Uppsala 1998 till 2017.csv" UaCSV = dw.load_dataset(DDWUaDir).raw_data[dataPath] # Names given to the input variuables/features/... colnames = ['UTC', 'windDir', 'windSpeed', 'airTemp', 'dewPt', 'relHum'] # Reading the data from the CSV file into a Pandas dataframe; headers # are at row 6 (but given the names colnames), trying to skip the last # row which is blank/empty but an initial space - fails..., choosing # parse_dates True to most quickly (C engine) parse the dates of the # UTC/index column. df = pd.read_csv(io.StringIO(UaCSV.decode('utf-8')), header=6, sep=';', names=colnames, index_col='UTC', skipinitialspace=True,
# userid: berkj
# Email: [email protected]
# Assignment Number: assignment1
# Honor statement: I pledge on my honor that I have neither given nor
# received unauthorized aid on this assignment.

# Exercise 1:
# Import the datadotworld module as dw
import datadotworld as dw

# Import the city council votes dataset
dataset = dw.load_dataset('stephen-hoover/chicago-city-council-votes')
def load_children(self):
    if self.data_set is None:
        full_name = self.owner + "/" + self.data_set_name
        self.data_set = dw.load_dataset(full_name)
        for table_name in self.data_set.dataframes.keys():
            self.add_child(DataDotWorldTableNode(self, table_name))
""" Name : c3_27_datadotworld_1.py Book : Hands-on Data Science with Anaconda) Publisher: Packt Publishing Ltd. Author : Yuxing Yan and James Yan Date : 1/15/2018 email : [email protected] [email protected] """ import datadotworld as dw dataset = 'jonloyens/an-intro-to-dataworld-dataset' data = dw.load_dataset(dataset, force_update=True) list(dataset.dataframes)
@classmethod
def from_uri(cls, uri: str, **kwargs) -> "DataDotWorldTable":
    # The dataset key is everything between the scheme and the final path
    # segment; the final segment names the table.
    dataset_name = "/".join(uri.split("/")[2:-1])
    dataset = dw.load_dataset(dataset_name)
    df = dataset.dataframes[uri.split("/")[-1]]
    return cls(inner_data=df, uri=uri, **kwargs)
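# Hedged usage sketch for from_uri above; the scheme and names are
# hypothetical. The last path segment selects the table, and the segments
# between the scheme and the table form the data.world dataset key
# (here 'someuser/some-dataset').
table = DataDotWorldTable.from_uri(
    "datadotworld://someuser/some-dataset/some_table")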
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import datadotworld as dw

dataset = dw.load_dataset('tonatihu/prueba-sngular')
dataset.describe()

dataframedatos = dataset.dataframes['datos']
dataframeprovincia = dataset.dataframes['provincia']

# Join the sales data to the province lookup table on id_provincia.
left = pd.DataFrame(dataframeprovincia)
right = pd.DataFrame(dataframedatos)
mergequery = pd.merge(left, right, on='id_provincia', how='inner')

# Aggregate total sales by province and plot.
prov = mergequery.groupby('provincia')
readytoplot = prov['ventas_totales'].agg(np.sum)
print(readytoplot)

plt.title("Ventas por provincia")
plt.xlabel("Ventas totales")
plt.ylabel("Provincia")
plt.plot(readytoplot)
plt.show()