def _get_data(self, indicator="NY.GNS.ICTR.GN.ZS", country='US',
              start=2002, end=2005):
    if isinstance(country, six.string_types):
        country = [country]
    countries = ';'.join(country)

    # Build URL for API call
    endpoint = "/countries/{countries}/indicators/{indicator}".format(
        countries=countries, indicator=indicator)
    url = self._url(endpoint)
    params = {
        'date': "%s:%s" % (start, end),
        'per_page': 25000,
        'format': 'json',
    }

    # Download
    response = self.session.get(url, params=params)
    data = response.content

    # Check to see if there is a possible problem
    possible_message = json.loads(data)[0]
    if 'message' in possible_message:
        msg = possible_message['message'][0]
        wb_err = ""
        if 'key' in msg:
            wb_err = msg['key'] + "\n "
        if 'value' in msg:
            wb_err += msg['value']
        return None, "Problem with a World Bank Query \n %s" % wb_err

    if possible_message.get('total') == 0:
        return None, "No results from world bank."

    # Parse JSON payload
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso_code = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]

    # Prepare output
    out = pandas.DataFrame([country, iso_code, year, value]).T
    out.columns = ['country', 'iso_code', 'year', indicator]
    return out, "Success"
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country="US",
              start=2002, end=2005):
    if isinstance(country, str):
        country = [country]
    countries = ";".join(country)

    # Build URL for API call
    url = ("http://api.worldbank.org/countries/" + countries +
           "/indicators/" + indicator + "?date=" + str(start) + ":" +
           str(end) + "&per_page=25000&format=json")

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Check to see if there is a possible problem
    possible_message = json.loads(data)[0]
    if "message" in possible_message:
        msg = possible_message["message"][0]
        wb_err = ""
        if "key" in msg:
            wb_err = msg["key"] + "\n "
        if "value" in msg:
            wb_err += msg["value"]
        return None, "Problem with a World Bank Query \n %s" % wb_err

    if possible_message.get("total") == 0:
        return None, "No results from world bank."

    # Parse JSON payload
    data = json.loads(data)[1]
    country = [x["country"]["value"] for x in data]
    iso_code = [x["country"]["id"] for x in data]
    year = [x["date"] for x in data]
    value = [x["value"] for x in data]

    # Prepare output
    out = pandas.DataFrame([country, iso_code, year, value]).T
    out.columns = ["country", "iso_code", "year", indicator]
    return out, "Success"
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US',
              start=2002, end=2005):
    if isinstance(country, str):
        country = [country]
    countries = ';'.join(country)

    # Build URL for API call
    url = ("http://api.worldbank.org/countries/" + countries +
           "/indicators/" + indicator + "?date=" + str(start) + ":" +
           str(end) + "&per_page=25000&format=json")

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Check to see if there is a possible problem
    possible_message = json.loads(data)[0]
    if 'message' in possible_message:
        msg = possible_message['message'][0]
        wb_err = ""
        if 'key' in msg:
            wb_err = msg['key'] + "\n "
        if 'value' in msg:
            wb_err += msg['value']
        return None, "Problem with a World Bank Query \n %s" % wb_err

    if possible_message.get('total') == 0:
        return None, "No results from world bank."

    # Parse JSON payload
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso_code = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]

    # Prepare output
    out = pandas.DataFrame([country, iso_code, year, value]).T
    out.columns = ['country', 'iso_code', 'year', indicator]
    return out, "Success"
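# A minimal usage sketch for the module-level `_get_data` above (hedged: it
# assumes `json`, `pandas`, and `urlopen` are imported as in the surrounding
# snippets and that api.worldbank.org is reachable; the indicator code and
# the `_example_` helper name are illustrative, not part of the original).
def _example_get_data():
    df, status = _get_data(indicator="NY.GDP.MKTP.CD",
                           country=["US", "CA"], start=2000, end=2005)
    if df is None:
        print(status)  # World Bank error text or "No results from world bank."
    else:
        print(df.head())  # columns: country, iso_code, year, <indicator>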
def test_main_classification(mock_parameters, mock_save_results,
                             mock_get_results, mock_fetch_data, method, name):
    # create mock objects from database
    mock_parameters.return_value = {'type': method}
    mock_fetch_data.return_value = fx.inputs_classification(
        include_categorical=True)
    mock_get_results.return_value = None

    main(job_id=None, generate_pfa=True)
    pfa = mock_save_results.call_args[0][0]
    pfa_dict = json.loads(pfa)

    # NOTE: this does not work due to bug in jsonpickle
    # deserialize model
    # estimator = deserialize_sklearn_estimator(pfa_dict['metadata']['estimator'])
    # assert estimator.__class__.__name__ == name

    # make some prediction with PFA
    from titus.genpy import PFAEngine
    engine, = PFAEngine.fromJson(pfa_dict)
    engine.action({
        'stress_before_test1': 10.,
        'iq': 10.,
        'agegroup': '50-59y'
    })
def get_indicators():
    '''Download information about all World Bank data series
    '''
    url = 'http://api.worldbank.org/indicators?per_page=50000&format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)

    # Clean fields
    data.source = [x['value'] for x in data.source]
    fun = lambda x: x.encode('ascii', 'ignore')
    data.sourceOrganization = data.sourceOrganization.apply(fun)

    # Clean topic field
    def get_value(x):
        try:
            return x['value']
        except (TypeError, KeyError):
            return ''

    fun = lambda x: [get_value(y) for y in x]
    data.topics = data.topics.apply(fun)
    data.topics = data.topics.apply(lambda x: ' ; '.join(x))

    # Clean output
    data = data.sort_values(by='id')
    data = data.reset_index(drop=True)
    return data
def get_indicators():
    """Download information about all World Bank data series
    """
    url = "http://api.worldbank.org/indicators?per_page=50000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)

    # Clean fields
    data.source = [x["value"] for x in data.source]
    fun = lambda x: x.encode("ascii", "ignore")
    data.sourceOrganization = data.sourceOrganization.apply(fun)

    # Clean topic field
    def get_value(x):
        try:
            return x["value"]
        except (TypeError, KeyError):
            return ""

    fun = lambda x: [get_value(y) for y in x]
    data.topics = data.topics.apply(fun)
    data.topics = data.topics.apply(lambda x: " ; ".join(x))

    # Clean output
    data = data.sort_values(by="id")
    data = data.reset_index(drop=True)
    return data
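# Hedged example of filtering the frame returned by `get_indicators` (the
# `id` and `name` columns come from the World Bank v1 JSON schema the code
# already relies on; the GDP filter and helper name are illustrative).
def _example_find_gdp_indicators():
    indicators = get_indicators()
    gdp = indicators[indicators['name'].str.contains('GDP', na=False)]
    return gdp[['id', 'name']]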
def read_csv():
    # read the csv file
    df = pd.read_csv(file_name)

    # for each row in the csv...
    for index, row in df.iterrows():
        print('building a person ...')
        root = ET.Element('Person')
        subFirstName = ET.SubElement(root, 'FirstName')
        subFirstName.text = str(row['FirstName'])
        subLastName = ET.SubElement(root, 'LastName')
        subLastName.text = str(row['LastName'])
        subCpr = ET.SubElement(root, 'cprnumber')
        subCpr.text = (str(row['DateOfBirth']).replace('-', '') + '-' +
                       str(random.randint(1000, 9999)))
        subEmail = ET.SubElement(root, 'email')
        subEmail.text = str(row['Email'])

        objectThing = prettify(root)
        print(objectThing)

        # send the xml object to the nemID service to get a nemId
        response = requests.post('http://localhost:8080/nemId',
                                 data=ET.tostring(root),
                                 headers={"Content-Type": "text/xml"})
        nemId = json.loads(response.text)

        # save to MsgPack
        saveToMsgPack(nemId, subCpr.text)
def getOneBatch(location_list, len_of_locations):
    '''
    Retrieve up to 20 requests from the API for the English station names

    :param location_list: List of coordinates for stations
    :param len_of_locations: Number of stations to be searched
    '''
    station_names_en = ['' for i in range(len_of_locations)]
    main_query = ('/v3/place/text?city=' + city_name +
                  '&output=json&offset=1&page=1&key=' + api_key_web_service +
                  '&citylimit=true&language=en&types=150500&location=')

    # Build the batch request body as a plain dict instead of assembling a
    # JSON string by hand
    body = {'ops': [{'url': main_query + location_list[x]}
                    for x in range(len_of_locations)]}

    url = 'https://restapi.amap.com/v3/batch'
    params = {'key': api_key_web_service}
    responseBatchEn = requests.post(url, params=params, json=body)
    dataEn = responseBatchEn.json()

    for x in range(len_of_locations):
        station_names_en[x] = str(
            dataEn[x]['body']['pois'][0]['name']).split('(')[0]
    return station_names_en
def make_json(self):
    # Build a dict mapping each Excel sheet name to a list of row records
    data_parsed = pd.read_excel(self.input_file, None)
    dict_jsons = {}
    for sheet_name, frame in data_parsed.items():
        # convert the DataFrame to a JSON string, then back to objects
        json_obj_list = json.loads(
            frame.to_json(None, orient='records', date_format='iso'))
        for item in json_obj_list:
            for field, value in item.items():
                # replace the cell value with a JSON object if the value is
                # itself a JSON string
                item[field] = get_normalized_value(value)
        dict_jsons[sheet_name] = json_obj_list

    # write the resulting dict to the file at the given output path
    with open(self.output_file, 'w') as result:
        json.dump(dict_jsons, result, ensure_ascii=False, indent=4,
                  separators=(',', ': '))
def send(query: str):
    host = 'http://localhost:19002/query/service'
    data = dict()
    data['statement'] = query
    data = urllib.parse.urlencode(data).encode('utf-8')
    with urllib.request.urlopen(host, data) as handler:
        result = json.loads(handler.read())
        return result['status']
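# Usage sketch for `send` (assumption: an AsterixDB query service listening
# on localhost:19002, as the hard-coded host implies; the SQL++ statement
# and helper name are illustrative).
def _example_send():
    status = send('SELECT VALUE 1;')
    print(status)  # "success" when the statement executes cleanly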
def get_country_data(country):
    print(f'LOG: getting country {country} data')
    base_url = 'https://api.covid19api.com'  # open API
    url = f'{base_url}/country/{country}?'
    payload = {}
    headers = {}
    try:
        response = requests.request("GET", url, headers=headers,
                                    data=payload, timeout=3.0)
    except Timeout:
        print(f'LOG: timeout while requesting country {country} data')
        return pd.DataFrame()

    # json_response = response.json()
    try:
        json_data = json.loads(response.text)
    except ValueError as err:
        print(f'LOG: error while loading json country {country} data ({err})')
        return pd.DataFrame()

    df = json_normalize(json_data)
    if df.empty or response.status_code != 200:
        print(f'LOG: error in request country {country} data')
        return df

    rename = {
        'Country': 'country',
        'CountryCode': 'country_code',
        'Lat': 'lat',
        'Lon': 'lon',
        'Confirmed': 'confirmed',
        'Deaths': 'deaths',
        'Recovered': 'recovered',
        'Active': 'active',
        'Date': 'date',
    }
    columns = [
        'country', 'country_code', 'lat', 'lon', 'confirmed', 'deaths',
        'recovered', 'active', 'date'
    ]
    df = df.rename(columns=rename)
    df = df[columns]
    df['date'] = df['date'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ"))
    return df
def _load_intermediate_data(job_ids):
    jobs_data = [io_helper.get_results(job_id).data for job_id in job_ids]

    # chain all results together, ignore empty results
    data = list(itertools.chain(*[json.loads(d) for d in jobs_data if d]))
    if not data:
        raise errors.UserError(
            'Intermediate jobs {} do not have any data.'.format(job_ids))
    return data
def collect_limit(start_date, end_date):
    log.info("collect start, start date: {}, end date: {}".format(
        start_date, end_date))

    # Fetch basic stock data
    stock_basic = pro.query(
        'stock_basic', exchange='', list_status='L',
        fields='ts_code, symbol, area, industry, list_date')
    calendar_list = pro.query(
        'trade_cal', exchange="SSE", start_date=start_date,
        end_date=end_date, is_open="1",
        fields='exchange, cal_date, is_open, pretrade_date')

    for cal_index, cal_row in calendar_list.iterrows():
        # Fetch the stocks that hit the upper price limit on this day
        limit_list = pro.limit_list(trade_date=cal_row['cal_date'],
                                    limit_type='U')
        limit_list_mg = pd.merge(limit_list, stock_basic, how='left',
                                 on='ts_code')
        limit_list_mg['limit_num'] = 1
        for index, row in limit_list_mg.iterrows():
            # date 30 days ago
            day30 = (datetime.date.today() -
                     datetime.timedelta(30)).strftime("%Y%m%d")
            if 'ST' in row['name'] or row['list_date'] > day30:
                continue
            limit_stock = r.hget(
                STOCK_LIMIT_UP.format(cal_row['pretrade_date']),
                row['ts_code'])
            if limit_stock is not None:
                stock = Stock()
                stock.__dict__ = json.loads(limit_stock)
                row['limit_num'] = stock.limit_num + 1
            r.hset(STOCK_LIMIT_UP.format(row['trade_date']), row['ts_code'],
                   row.to_json(orient='index', force_ascii=False))

        # Fetch the stocks that hit the lower price limit on this day
        limit_list = pro.limit_list(trade_date=cal_row['cal_date'],
                                    limit_type='D')
        limit_list_mg = pd.merge(limit_list, stock_basic, how='left',
                                 on='ts_code')
        for index, row in limit_list_mg.iterrows():
            if 'ST' in row['name']:
                continue
            r.hset(STOCK_LIMIT_DOWN.format(row['trade_date']),
                   row['ts_code'],
                   row.to_json(orient='index', force_ascii=False))
    log.info("collect end")
def test_main_partial(mock_parameters, mock_save_results, mock_get_results,
                      mock_fetch_data, method, name):
    # create mock objects from database
    mock_parameters.return_value = {'type': method}
    mock_fetch_data.return_value = fx.inputs_regression()
    mock_get_results.return_value = None

    main(job_id=None, generate_pfa=False)

    js = json.loads(mock_save_results.call_args[0][0])
    estimator = deserialize_sklearn_estimator(js['estimator'])
    assert estimator.__class__.__name__ == name
def aggregate_stats(job_ids):
    """Get all partial statistics from all nodes and aggregate them.

    :param job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = [
        json.loads(io_helper.get_results(str(job_id)).data)
        for job_id in job_ids
    ]

    corr, columns = _aggregate_results(results)
    _save_corr_heatmap(corr, columns)
def convert_df(raw: str, config: Dict[str, Any]) -> DataFrame:
    """Convert raw string to DataFrame, currently only supports json/csv.

    :param raw: the raw source string in json/csv format, to be converted
        to a DataFrame
    :param config: the config of the data source specified in task config,
        see `configs/*.py`
    :return: the converted `DataFrame`
    """
    ftype = "json" if "file_format" not in config else config["file_format"]
    df = None
    if ftype == "jsonl":
        # one JSON document per line; skip lines too short to be valid JSON
        records = [json.loads(jline)
                   for jline in raw.split("\n") if len(jline) >= 3]
        df = DataFrame(records)
    elif ftype == "json":
        if "json_path" in config:
            extracted_json = json_extract(raw, config["json_path"])
        elif "json_path_nested" in config:
            extracted_json = json_unnest(raw, config["json_path_nested"],
                                         config["fields"], {}, [])
        else:
            extracted_json = raw
        data = pd_json.loads(extracted_json)
        df = pd_json.json_normalize(data)
    elif ftype == "csv":
        if "header" in config:
            df = pd.read_csv(StringIO(raw), names=config["header"])
        else:
            df = pd.read_csv(StringIO(raw))

    # convert timezone according to config
    tz = None
    if "timezone" in config:
        tz = pytz.timezone(config["timezone"])
    elif "country_code" in config:
        tz = get_country_tz(config["country_code"])
        # TODO: support multiple countries/timezones in the future if needed

    if "date_fields" in config:
        for date_field in config["date_fields"]:
            df[date_field] = pd.to_datetime(df[date_field])
        if tz is not None:
            df["tz"] = get_tz_str(tz)
            for date_field in config["date_fields"]:
                df[date_field] = (
                    df[date_field].dt.tz_localize(tz).dt.tz_convert(pytz.utc))
                df[date_field] = df[date_field].astype("datetime64[ns]")
    return df
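# Hedged example of calling `convert_df` with a CSV payload; the config keys
# mirror the ones the function reads, but the values and helper name are
# made up for illustration.
def _example_convert_df():
    raw = "ts,value\n2020-01-01 00:00:00,1\n2020-01-01 01:00:00,2\n"
    config = {
        "file_format": "csv",
        "timezone": "Asia/Taipei",
        "date_fields": ["ts"],
    }
    df = convert_df(raw, config)
    print(df.dtypes)  # ts parsed, localized to Asia/Taipei, normalized to UTC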
def get_countries():
    '''Query information about countries
    '''
    url = 'http://api.worldbank.org/countries/all?format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
def get_countries():
    """Query information about countries
    """
    url = "http://api.worldbank.org/countries/?per_page=1000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x["value"] for x in data.adminregion]
    data.incomeLevel = [x["value"] for x in data.incomeLevel]
    data.lendingType = [x["value"] for x in data.lendingType]
    data.region = [x["value"] for x in data.region]
    data = data.rename(columns={"id": "iso3c", "iso2Code": "iso2c"})
    return data
def get_countries():
    '''Query information about countries
    '''
    url = 'http://api.worldbank.org/countries/?per_page=1000&format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
def _process_json(self, json_string):
    loaded_json = []
    if json_string:
        # repair common glitches in the raw payload before parsing; the
        # patterns are literal strings, so str.replace is clearer than re.sub
        json_string = json_string.replace('",""', ',"')
        json_string = json_string.replace('"\n\t"', '')
        json_string = json_string.replace('}][]', '}]')
        if json_string == "[[][]]":
            loaded_json = []
        else:
            try:
                loaded_json = json.loads(json_string)
            except ValueError:
                loaded_json = None
    return loaded_json
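# Illustration of the repairs `_process_json` performs, using a contrived
# glitched payload of the kind the replacements above target:
#
#   '[{"a": 1}][]'  ->  '[{"a": 1}]'  ->  [{'a': 1}]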
def create(self, path: str):
    query = 'create %s;\n' % self._dataverse
    query += 'use %s;\n' % self._dataverse
    query += 'create type Schema as open{ \n' \
             'id: int64};\n'
    query += 'create dataset %s(Schema) primary key id;\n' % self._dataset
    query += 'LOAD DATASET %s USING localfs\n ' \
             '(("path"="127.0.0.1://%s"),("format"="adm"));\n' % (
                 self._dataset, path)

    host = 'http://localhost:19002/query/service'
    data = {}
    data['statement'] = query
    data = urllib.parse.urlencode(data).encode('utf-8')
    with urllib.request.urlopen(host, data) as handler:
        result = json.loads(handler.read())
        ret_array = result['results']
def _load_intermediate_data(job_ids):
    data = []
    for job_id in job_ids:
        job_result = io_helper.get_results(job_id)

        # log errors (e.g. about missing data), but do not reraise them
        if job_result.error:
            logging.warning(job_result.error)
        else:
            pfa = json.loads(job_result.data)
            data.append(pfa)

    if not data:
        raise errors.UserError('All jobs {} returned an error.'.format(job_ids))

    return data
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', start=2002,
              end=2005):
    # Build URL for API call
    url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000&format=json")

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Parse JSON payload
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso2c = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]

    # Prepare output
    out = pandas.DataFrame([country, iso2c, year, value]).T
    return out
def hello_gcs(event, context):
    """Triggered by a change to a Cloud Storage bucket.

    Args:
        event (dict): Event payload.
        context (google.cloud.functions.Context): Metadata for the event.
    """
    file = event
    bucket = storage.get_bucket(file['bucket'])
    blob = bucket.get_blob(file['name'])
    bstr = str(blob.download_as_string(), 'utf-8')

    for bline in bstr.splitlines():
        json_data = json.loads(bline)
        doc_ref = db.collection(u'data').document(json_data['entityId'])
        doc_ref.set(json_data['data'])

    print(f"bucket : {file['bucket']}")
    print(f"selfLink : {file['selfLink']}")
    print(f"Processing file : {file['name']}.")
def get_data(ticker, start_date=None, end_date=None, index_as_date=True):
    '''Downloads historical stock price data into a pandas data frame

    @param: ticker
    @param: start_date = None
    @param: end_date = None
    @param: index_as_date = True
    '''
    site = build_url(ticker, start_date, end_date)
    resp = requests.get(site)
    html = resp.content.decode()

    # extract the embedded JSON between the two markers
    start = html.index('"HistoricalPriceStore"')
    end = html.index("firstTradeDate")
    needed = html[start:end]
    needed = needed.strip('"HistoricalPriceStore":')
    needed = needed.strip(""","isPending":false,'""")
    needed = needed + "}"

    temp = loads(needed)
    result = json_normalize(temp['prices'])
    result = result[[
        "date", "open", "high", "low", "close", "adjclose", "volume"
    ]]

    # fix date field (epoch seconds -> date); pd.datetime is deprecated, so
    # this assumes `from datetime import datetime` is in scope
    result['date'] = result['date'].map(
        lambda x: datetime.fromtimestamp(x).date())
    result['ticker'] = ticker.upper()
    result = result.dropna()
    result = result.reset_index(drop=True)

    if index_as_date:
        result = result.sort_values("date")
        result.index = result.date.copy()
        del result["date"]

    return result
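# Usage sketch (hedged: depends on `build_url` and the Yahoo Finance page
# layout this scraper targets; the ticker and helper name are illustrative).
def _example_get_data_yahoo():
    prices = get_data("AAPL")
    print(prices[["open", "close", "volume"]].tail())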
def get_countries():
    """Query information about countries

    Provides information such as: country code, region, income level,
    capital city, latitude and longitude
    """
    url = "http://api.worldbank.org/countries/?per_page=1000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x["value"] for x in data.adminregion]
    data.incomeLevel = [x["value"] for x in data.incomeLevel]
    data.lendingType = [x["value"] for x in data.lendingType]
    data.region = [x["value"] for x in data.region]
    data.latitude = [float(x) if x != "" else np.nan for x in data.latitude]
    data.longitude = [float(x) if x != "" else np.nan for x in data.longitude]
    data = data.rename(columns={"id": "iso3c", "iso2Code": "iso2c"})
    return data
def _get_data(ticker):
    """Downloads historical stock price data into a pandas data frame

    Args:
        ticker: stock ticker

    Returns:
        price of stock
    """
    site = _build_url(ticker)
    resp = requests.get(site)
    html = resp.content.decode()

    start = html.index('"HistoricalPriceStore"')
    end = html.index("firstTradeDate")
    needed = html[start:end]
    needed = needed.strip('"HistoricalPriceStore":').strip(
        ""","isPending":false,'""") + "}"

    temp = loads(needed)
    result = json_normalize(temp['prices'])
    return result[[
        "date", "open", "high", "low", "close", "adjclose", "volume"
    ]]['adjclose'][0]
def get_countries():
    '''Query information about countries

    Provides information such as: country code, region, income level,
    capital city, latitude and longitude
    '''
    url = 'http://api.worldbank.org/countries/?per_page=1000&format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data.latitude = [float(x) if x != "" else np.nan for x in data.latitude]
    data.longitude = [float(x) if x != "" else np.nan for x in data.longitude]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
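# Hedged example of pulling coordinates for one country from the frame
# returned above (`name` and `capitalCity` come straight from the World Bank
# response; `iso2c` is produced by the rename in the function; the helper
# name is illustrative).
def _example_country_coords():
    countries = get_countries()
    us = countries[countries['iso2c'] == 'US']
    return us[['name', 'capitalCity', 'latitude', 'longitude']]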
def test_main_distributed(mock_parameters, mock_save_results,
                          mock_get_results, mock_fetch_data, method, name):
    mock_parameters.return_value = {'type': method}
    mock_fetch_data.return_value = fx.inputs_regression()
    mock_get_results.return_value = None

    # run intermediate job
    main(job_id=None, generate_pfa=False)
    mock_get_results.return_value = mock.MagicMock(
        data=mock_save_results.call_args[0][0])

    # generate PFA
    main(job_id='1', generate_pfa=True)
    pfa = mock_save_results.call_args_list[1][0][0]
    pfa_dict = json.loads(pfa)

    # make some prediction with PFA
    from titus.genpy import PFAEngine
    engine, = PFAEngine.fromJson(pfa_dict)
    engine.action({'stress_before_test1': 10., 'iq': 10., 'agegroup': '-50y'})
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', start=2002,
              end=2005):
    # Build URL for API call
    url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000&format=json")

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Parse JSON payload
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso2c = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]

    # Prepare output
    out = pandas.DataFrame([country, iso2c, year, value]).T
    return out
def dayopen():
    nse = Nse()
    print(nse)
    companies_list = nse.get_stock_codes(cached=False)
    my_dict = companies_list

    # dump the symbol/name mapping to a csv file
    with open("stocklist.csv", "w", newline='') as f:
        w = csv.writer(f)
        for key, val in my_dict.items():
            w.writerow([key, val])

    # creates new data frame
    final = pd.DataFrame()

    # dates configuration
    end_day = date.today()
    start_day = end_day - timedelta(365)

    with open('stocklist.csv') as f:
        stock_list = csv.reader(f)
        next(stock_list)
        for company in stock_list:
            try:
                symbol, name = company
                df1 = nse.get_quote(symbol.format(symbol), as_json=True)
                # df10 = nse.get_history(symbol.format(symbol),
                #                        start=start_day, end=end_day,
                #                        as_json=True)
                # datax = pd_json.loads(df10)
                # df11 = pd.json_normalize(datax)
                # df12 = pd.DataFrame(df11)
                data = pd_json.loads(df1)  # load
                df = pd.json_normalize(data)  # normalise
                df2 = pd.DataFrame(df)
            except Exception:
                continue
            selected = df2.iloc[0:, [1, 6, 65, 11, 20, 67]]
            final = pd.concat([final, selected])

    final = final.reset_index(drop=True)
    print(final)
    final.to_csv('dayopendata.csv', index=None, header=True)
def aggregate_kmeans(job_ids):
    """Compute merging of clusters according to least merging error
    (e.g. smallest distance between centroids)

    :param job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    data = [
        json.loads(io_helper.get_results(str(job_id)).data)
        for job_id in job_ids
    ]
    local_centroids = [
        np.array(x['centroids']) for x in data if x['centroids']
    ]
    indep_vars = data[0]['indep_vars']

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    logging.info("Centroids:\n{}".format(remote_centroids))

    # Create fake KMeans estimator and assign it our centroids
    estimator = KMeans()
    estimator.cluster_centers_ = np.array(remote_centroids)

    # Generate PFA for kmeans and add centroids to metadata
    featurizer = _create_featurizer(indep_vars)
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add serialized model as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(np.array(remote_centroids).tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
    logging.info("DONE")
def get_input(self, input_str, **kwargs):
    """Get inputs.

    Main identifiers in the Tempus3 database.

    * Example 1: identifier of a statistical operation. There are three
      codes identifying the operation "Índice de Precios de Consumo (IPC)":

      - internal alphabetic Tempus3 code (IPC)
      - internal numeric Tempus3 code (Id=25)
      - code of the operation in the Inventario de Operaciones
        Estadísticas (IOE30138)

      (available operations:
      https://servicios.ine.es/wstempus/js/ES/OPERACIONES_DISPONIBLES)

    * Example 2: identifier of the variable "Provincias":

      - internal numeric Tempus3 code (Id=115)
      (variables: https://servicios.ine.es/wstempus/js/ES/VARIABLES)

    Identifiers of PcAxis tables

    * Example 1: identifier of the table "Gastos internos totales en
      actividades de I+D por años y sectores/unidad"

      - internal alphanumeric PcAxis code (Id=/t14/p057/a2016/l0/01001.px)
      (see how to obtain a table identifier using INEbase)

    Parameters
    ----------
    input_str : str
    kwargs : dict, optional

    See Also
    --------
    get_function
    """
    endpoint_input = f"{self.endpoint}/{input_str}"
    r = requests.get(endpoint_input, params=kwargs, verify=False)
    r_dict = json.loads(r.text)
    return r_dict
def get_functions(self, function="OPERACIONES_DISPONIBLES"):
    """Get available functions.

    Parameters
    ----------
    function : str
        Function can take the values below:

        * Operations: OPERACIONES_DISPONIBLES, OPERACIÓN...
        * Variables: VARIABLES, VARIABLES_OPERACION...
        * Values: VALORES_VARIABLES, VALORES_VARIABLEOPERACION...
        * Tables: TABLAS_OPERACION, GRUPOS_TABLA...
        * Series: SERIE, SERIES_OPERACION...
        * Publications: PUBLICACIONES, PUBLICACIONES_OPERACION...
        * Data: DATOS_SERIE, DATOS_TABLA...

    Returns
    -------
    function : pandas.DataFrame
    """
    endpoint_function = f"{self.endpoint}/{function}"
    r = requests.get(endpoint_function, verify=False)
    r_dict = json.loads(r.text)
    return pd.DataFrame(r_dict)
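# Usage sketch for the two methods above (hedged: `client` stands for an
# instance whose `endpoint` points at the INE Tempus3 JSON API, e.g.
# https://servicios.ine.es/wstempus/js/ES; the paths are illustrative):
#
#   ops = client.get_functions("OPERACIONES_DISPONIBLES")  # -> DataFrame
#   ipc = client.get_input("OPERACION/IPC")                # -> dict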
def __init__(self, channel: str, arch: str = "linux-64",
             base_url: str = "https://conda.anaconda.org/", ttl=600):
    # setup cache
    self.ttl = ttl

    # normal settings
    logger.info(f"RETRIEVING: {channel}, {arch}")
    if '{channel}' in base_url and '{arch}' in base_url:
        url_prefix = base_url.format(channel=channel, arch=arch)
    elif '{channel}' in base_url:
        url_prefix = base_url.format(channel=channel).rstrip('/') + f"/{arch}"
    else:
        url_prefix = f"{base_url.rstrip('/')}/{channel}/{arch}"

    repodata_url = f"{url_prefix}/repodata.json.bz2"
    data = requests.get(repodata_url)
    repodata = json.loads(bz2.decompress(data.content))

    self.channel = channel
    self.arch = arch
    self.graph = build_repodata_graph(repodata, arch, url_prefix)
    logger.info(f"GRAPH BUILT FOR {repodata_url}")
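# Usage sketch (the owning class is not shown in this snippet, so `RepoData`
# below is a hypothetical stand-in for it):
#
#   rd = RepoData("conda-forge", arch="linux-64")
#   rd.graph  # dependency graph built from the channel's repodata.json.bz2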
def get_countries(self):
    '''Query information about countries
    '''
    # Build URL for API call
    endpoint = '/countries'
    url = self._url(endpoint)
    params = {
        'per_page': 1000,
        'format': 'json'
    }

    # Download
    response = self.session.get(url, params=params)
    data = response.content

    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
def get_indicators(self):
    '''Download information about all World Bank data series
    '''
    # Build URL for API call
    endpoint = '/indicators'
    url = self._url(endpoint)
    params = {
        'per_page': 50000,
        'format': 'json'
    }

    # Download
    response = self.session.get(url, params=params)
    data = response.content

    data = json.loads(data)[1]
    data = pandas.DataFrame(data)

    # Clean fields
    data.source = [x['value'] for x in data.source]
    fun = lambda x: x.encode('ascii', 'ignore')
    data.sourceOrganization = data.sourceOrganization.apply(fun)

    # Clean topic field
    def get_value(x):
        try:
            return x['value']
        except (TypeError, KeyError):
            return ''

    fun = lambda x: [get_value(y) for y in x]
    data.topics = data.topics.apply(fun)
    data.topics = data.topics.apply(lambda x: ' ; '.join(x))

    # Clean output
    data = data.sort_values(by='id')
    data = data.reset_index(drop=True)
    return data
def get_COOPS_json(begin_dt, end_dt, base_url):
    """Function accepts: a base url (API endpoint), and beginning and end
    datetime strings in the form 'YYYYMMDD mm:ss' which are <= 1 year apart,
    passing these to the query_builder function. Returns the hourly
    prediction data as a pandas DataFrame where the returned time becomes
    the datetime index."""
    # import dependencies
    import pandas as pd
    import numpy as np
    from pandas.io.common import urlopen
    from pandas.io import json

    # construct the query
    query, query_dict = query_builder(begin_dt, end_dt, base_url)

    # execute query and read response
    with urlopen(query) as response:
        data = response.read()

    # convert json object to python dictionary and extract time and values
    # for predictions
    data = json.loads(data)['predictions']

    # read into a pandas DataFrame, then manipulate the DataFrame object
    data = pd.DataFrame(data)
    data.columns = ['Date_Time', 'Level']
    data.index = pd.to_datetime(data.Date_Time)
    data = data.drop('Date_Time', axis=1)

    # reindex to fill in any missing time values; this needs work to
    # initialize the range from the data/query instead of hardcoding as it
    # currently stands
    periods, begin, end = dt_periodizer(query_dict)
    begin_string = begin.strftime('%Y-%m-%d %H:%M:%S')
    rng = pd.date_range(begin_string, periods=periods, freq='6min')

    # the reindex itself needs to be reworked for a better fill; a good
    # start might be the median of the points directly above and below the
    # missing dt index. Since these are typically very few points, they are
    # filled with 100 for easy removal later -- better to remove the points
    # than fill in a non-measured value.
    data = data.reindex(rng, fill_value=100)

    # convert value from string to float
    data.Level = data.Level.astype(float)

    # adjust level to account for distance of Carkeek from the NOAA
    # monitoring station (+5%)
    level_adjust = data.Level.values + (.05 * data.Level.values)
    data.Level = np.round(level_adjust, decimals=2)

    # add date column to dataframe for later use with weather data
    data['Date'] = data.index.date

    # add a column for hourly re-sample
    # data['Hour'] = data.index.hour
    # data['Time'] = data.index.time

    return data
for prop in props:
    # big_df = pd.read_pickle(prop + '-O_' + cn + '.pkl')
    print('-------------------')
    big_df = pd.DataFrame()
    for el in elements:
        indexes_to_drop = []
        df = pd.read_pickle(prop + '_cn.pkl')
        comps_to_remove = []
        for i, row in df.iterrows():
            if (el not in row['metadata']['_structure']['elements']
                    or 'O' not in row['metadata']['_structure']['elements']):
                indexes_to_drop.append(i)
            if el in row['metadata']['_structure']['elements']:
                try:
                    # DataFrame.set_value was removed from pandas; .at is
                    # the equivalent scalar setter
                    df.at[i, el + '_cn'] = json.loads(row[cn])[el]
                except KeyError:
                    for sp in json.loads(row[cn]).keys():
                        if el in sp:
                            df.at[i, el + '_cn'] = json.loads(row[cn])[sp]
                except TypeError:
                    pass
            if row['is_ordered'] < 1:
                comps_to_remove.append(row['reduced_cell_formula'])

        df.drop(df.index[indexes_to_drop], inplace=True)
        for comp in comps_to_remove:
            df.drop(df[df['reduced_cell_formula'] == comp].index,
                    inplace=True)

        df_groupby = df.groupby(['reduced_cell_formula', 'is_' + prop],
                                as_index=False).mean()

        # Find all compounds with CN > 8
        # print df_groupby[df_groupby[el + '_cn'] > 8]
        # for i, row in df_groupby[df_groupby[el + '_cn'] > 8].iterrows():
def _get_resee(self, html_re_see):
    obj_re_see = json.loads(html_re_see)
    return obj_re_see
def _get_product_comments(self, html_cont_comments):
    obj_comments = json.loads(html_cont_comments)
    return obj_comments
def _get_product_price(self, html_cont_price):
    json_price = re.findall(r"\[(.+)\]", html_cont_price)[0]
    obj_price = json.loads(json_price)
    return obj_price
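# Illustration of the extraction in `_get_product_price` (the wrapper text
# is a contrived stand-in for the raw response the method expects):
#
#   html_cont_price = 'callback([{"id": "P123", "p": "19.90"}])'
#   re.findall(r"\[(.+)\]", html_cont_price)[0]  # -> '{"id": "P123", "p": "19.90"}'
#   json.loads(...)                              # -> {'id': 'P123', 'p': '19.90'}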