def main(): # Select your transport with a defined url endpoint access_token = os.environ['ACCESS_TOKEN'] transport = AIOHTTPTransport( url=f"https://gis-api.aiesec.org/graphql/?access_token={access_token}") async def getData(): # Create a GraphQL client using the defined transport async with Client(transport=transport, fetch_schema_from_transport=True) as session: # Provide a GraphQL query query = gql(""" query getApplicationList ($limit: Int, $start_date: DateTime, $end_date: DateTime){ allOpportunityApplication(per_page: $limit, filters: {created_at: {from: $start_date, to: $end_date}}) { data { id status created_at date_matched date_pay_by_cash date_approved date_realized experience_start_date experience_end_date date_approval_broken nps_response_completed_at updated_at person { id full_name contact_detail { email phone } home_mc { name } home_lc { name } } host_lc { name } host_mc: home_mc { name } opportunity { id created_at title duration sub_product { name } programme { short_name_display } } standards { option } } } } """) params = { "mc_id": [518], # Bahrain's MC ID "start_date": "2021-01-01", "end_date": "", "limit": 1000 # Could be any large enough number } # Execute the query on the transport results = await session.execute(query, variable_values=params) # print(result) return results print("Executing query off of EXPA ...") apps_data = asyncio.run(getData()) print("Started preprocessing...") # Reduce the dict by 3 Levels apps_data = apps_data['allOpportunityApplication']['data'] # Flatten dictionary and compress keys apps_df = pd.json_normalize(apps_data, sep='_') """ Create new columns for easy comprehension * LC * Department * Partner_MC * Partner_LC """ new_fields = ['department', 'lc', 'partner_mc', 'partner_lc'] def generate_new_fields(row): if row['person_home_mc_name'] == 'Bahrain': values = [ 'o' + row['opportunity_programme_short_name_display'], row['person_home_lc_name'], row['host_mc_name'], row['host_lc_name'] ] else: values = [ 'i' + row['opportunity_programme_short_name_display'], row['host_lc_name'], row['person_home_mc_name'], row['person_home_lc_name'] ] return dict(zip(new_fields, values)) print("Generating new fields and tables ...") apps_df[new_fields] = apps_df.apply(lambda row: generate_new_fields(row), axis=1, result_type='expand') pointless_cols = [ 'opportunity_programme_short_name_display', 'host_mc_name', 'host_lc_name', 'person_home_mc_name', 'person_home_lc_name' ] apps_df.drop(pointless_cols, inplace=True, axis=1) """ Produce Performance Analytics Table * First convert dates from longform to YYYY-MM-DD * Retain Date, LC, Dept, PartnerMC, PartnerLC, and the Status Column like # of Applications, Accepted etc.. 
will be the aggregation """ date_cols = [ 'created_at', 'date_matched', 'date_approved', 'date_realized', 'updated_at' ] multi_indices = ['lc', 'department', 'partner_mc', 'partner_lc'] aggregration_fields = ['id', 'person_id'] # Generate table with these columns only perf_table = apps_df[aggregration_fields + date_cols + multi_indices].copy() # Ensure that dates are uniform and shortened perf_table.loc[:, date_cols] = apps_df[date_cols].applymap(lambda x: x[:-10], na_action='ignore') def get_timeseries_formetric(table: pd.DataFrame, other_fields: list, selected_date_col: str, metric_name: str) -> pd.DataFrame: table = table[[selected_date_col, *other_fields, *aggregration_fields]] _ = table.sort_values([selected_date_col, *other_fields]) _[metric_name] = 1 _.rename(columns={ selected_date_col: "date", "id": "AppID", "person_id": "PersonID" }, inplace=True) return _.dropna(axis=0) apps_per_day = get_timeseries_formetric(perf_table, multi_indices, "created_at", "Applied") acc_per_day = get_timeseries_formetric(perf_table, multi_indices, "date_matched", "Accepted") apd_per_day = get_timeseries_formetric(perf_table, multi_indices, "date_approved", "Approved") perf_analysis_df = pd.concat([apps_per_day, acc_per_day, apd_per_day]) perf_analysis_df.fillna(0, inplace=True, axis=0) # ### Push it to Google Sheets # Credentials from service account file for Google Sheets print("Creating temporary file for service account credentials...") temp = tempfile.NamedTemporaryFile() try: access_creds = os.environ['GOOGLE_CREDS'] write_base64str_obj_to_file(access_creds, temp.name) finally: gc = pygsheets.authorize(service_file=temp.name) temp.close() print("Writing to Google Sheets...") workbook = gc.open_by_key(os.environ["SPREADSHEET_ID"]) perf_worksheet = workbook.worksheet_by_title( os.environ["PerformanceSheet"]) applications_worksheet = workbook.worksheet_by_title( os.environ["ApplicationsSheet"]) # Create handy function to write to sheets set_worksheet_todf = partial(pygsheets.Worksheet.set_dataframe, start="A1", copy_head=True) set_worksheet_todf(perf_worksheet, perf_analysis_df) set_worksheet_todf(applications_worksheet, apps_df) print("Done!")
async def missingfiles(self, human_readable=None, **params): """Show files which are missing from blocks at a node. Parameters ---------- block block name (wildcards) (*) lfn logical file name (*) node node name (wildcards) se storage element. subscribed y or n. whether the block is subscribed to the node or not default is null (either) custodial y or n. filter for custodial responsibility, default is to return either group group name default is to return missing blocks for any group. (*) either block or lfn is required """ resjson = await self.jsonmethod("missingfiles", **params) out = [] if human_readable is not None and type(human_readable) is not bool: print("Wrong human_readable parameter type") df = pandas.json_normalize(out) return df elif human_readable is None or human_readable is False: for _block in resjson["phedex"]["block"]: for _file in _block["file"]: for _missing in _file["missing"]: out.append( { "block_name": _block["name"], "file_name": _file["name"], "checksum": _file["checksum"], "size": _file["bytes"], "created": _file["time_create"], "origin_node": _file["origin_node"], "missing_from": _missing["node_name"], "disk": _missing["se"], "custodial": _missing["custodial"], "subscribed": _missing["subscribed"], } ) df = pandas.json_normalize(out) return format_dates(df, ["created"]) elif human_readable is True: for _block in resjson["phedex"]["block"]: for _file in _block["file"]: for _missing in _file["missing"]: out.append( { "Block Name": _block["name"], "File Name": _file["name"], "checksum": _file["checksum"], "Size of file": _file["bytes"], "Time created": _file["time_create"], "Origin Node": _file["origin_node"], "Missing from": _missing["node_name"], "Disk": _missing["se"], "Custodial?": _missing["custodial"], "Subscribed?": _missing["subscribed"], } ) df = pandas.json_normalize(out) return format_dates(df, ["Time created"])
async def requestlist(self, human_readable=None, **params): """Serve as a simple request search and cache-able catalog of requests to save within a client, which may then use the request ID to obtain further details using TransferRequests or DeletionRequests. Parameters ---------- request * request id type request type, 'xfer' (default) or 'delete' approval approval state, 'approved', 'disapproved', 'mixed', or 'pending' requested_by * requestor's name node * name of the destination node (show requests in which this node is involved) decision decision at the node, 'approved', 'disapproved' or 'pending' group * user group create_since created since this time create_until created until this time decide_since decided since this time decide_until decided until this time dataset * dataset is part of request, or a block from this dataset block * block is part of request, or part of a dataset in request decided_by * name of person who approved the request * could be multiple and/or with wildcard ** when both 'block' and 'dataset' are present, they form a logical disjunction (ie. or) """ resjson = await self.jsonmethod("requestlist", **params) out = [] if human_readable is not None and type(human_readable) is not bool: df = pandas.json_normalize(out) raise Exception("Wrong human_readable parameter type") return df elif human_readable is None or human_readable is False: for _request in resjson["phedex"]["request"]: for _node in _request["node"]: out.append( { "request_id": _request["id"], "time_created": _request["time_create"], "requested_by": _request["requested_by"], "approval": _request["approval"], "node": _node["name"], "time_decided": _node["time_decided"], "decided_by": _node["decided_by"], } ) df = pandas.json_normalize(out) return format_dates(df, ["time_created", "time_decided"]) else: for _request in resjson["phedex"]["request"]: for _node in _request["node"]: out.append( { "Request ID": _request["id"], "Time Created": _request["time_create"], "Requested by": _request["requested_by"], "Approval": _request["approval"], "Node": _node["name"], "Time decided": _node["time_decided"], "Decided by": _node["decided_by"], } ) df = pandas.json_normalize(out) return format_dates(df, ["Time Created", "Time decided"])
def test_value_array_record_prefix(self):
    # GH 21536
    result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
    expected = DataFrame([[1], [2]], columns=["Prefix.0"])
    tm.assert_frame_equal(result, expected)
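# A minimal usage sketch (not part of the test suite) of how record_path, meta
# and record_prefix combine on nested records; the data below is hypothetical:
#
#   data = [{"state": "FL",
#            "counties": [{"name": "Dade", "pop": 12345},
#                         {"name": "Broward", "pop": 40000}]}]
#   flat = json_normalize(data, record_path="counties", meta=["state"],
#                         record_prefix="county.")
#   # -> columns: "county.name", "county.pop", "state"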
class StockTwits_BackUp:
    def open(self, filename):
        self.Text = gopen(filename, 'rt').readlines()
        self.TotalTweets = len(self.Text)


# %% prepare directory files
files_to_analyze = Files(folder_name=Local_Settings.Messages_Folder,
                         category=2, year=2009).FilesList
test = gopen(files_to_analyze[0], 'rt').readlines()
test = pd.read_json(files_to_analyze[0], lines=True)
TweetsDetails = pd.json_normalize(test['data'])

ReviewsSample = Reviews.join([RatingsDetail, ResponseDetails]).drop(
    ['RatingsDetail', 'ResponseDetails'], axis=1)
new_df = pd.concat([pd.DataFrame(json_normalize(x)) for x in df['json']],
                   ignore_index=True)

# %% open Loughran and McDonald's sentiment dictionary
lm_negative = Local_Settings.lm_dictionary.get('Negative')[0].tolist()
lm_positive = Local_Settings.lm_dictionary.get('Positive')[0].tolist()


def loughran_scores(text):
    twords = text.split()
    twords2 = [i.upper() for i in twords]
    len_twords2 = len(twords2)
    negative_found = len([i for i in twords2 if i in lm_negative])
async def get_active_exchange_markets(cls) -> pd.DataFrame: """ *required Returns all currently active BTC trading pairs from Eterbase, sorted by volume in descending order. """ async with aiohttp.ClientSession() as client: async with client.get(f"{constants.REST_URL}/markets") as products_response: products_response: aiohttp.ClientResponse = products_response if products_response.status != 200: raise IOError(f"Error fetching active Eterbase markets. HTTP status is {products_response.status}.") data = await products_response.json() for pair in data: pair["symbol"] = convert_from_exchange_trading_pair(pair["symbol"]) all_markets: pd.DataFrame = pd.DataFrame.from_records(data=data, index="id") all_markets.rename({"base": "baseAsset", "quote": "quoteAsset"}, axis="columns", inplace=True) all_markets = all_markets[(all_markets.state == 'Trading')] ids: List[str] = list(all_markets.index) volumes: List[float] = [] prices: List[float] = [] tickers = None async with client.get(f"{constants.REST_URL}/tickers") as tickers_response: tickers_response: aiohttp.ClientResponse = tickers_response if tickers_response.status == 200: data = await tickers_response.json() tickers: pd.DataFrame = pd.DataFrame.from_records(data=data, index="marketId") else: raise IOError(f"Error fetching tickers on Eterbase. " f"HTTP status is {tickers_response.status}.") for product_id in ids: volumes.append(float(tickers.loc[product_id].volume)) prices.append(float(tickers.loc[product_id].price)) all_markets["volume"] = volumes all_markets["price"] = prices cross_rates = None async with client.get(f"{constants.REST_URL}/tickers/cross-rates") as crossrates_response: crossrates_response: aiohttp.ClientResponse = crossrates_response if crossrates_response.status == 200: data = await crossrates_response.json() cross_rates: pd.DataFrame = pd.json_normalize(data, record_path ='rates', meta = ['base']) else: raise IOError(f"Error fetching cross-rates on Eterbase. " f"HTTP status is {crossrates_response.status}.") usd_volume: List[float] = [] cross_rates_ids: List[str] = list(cross_rates.base) for row in all_markets.itertuples(): quote_name: str = row.quoteAsset quote_volume: float = row.volume quote_price: float = row.price found = False for product_id in cross_rates_ids: if quote_name == product_id: rate: float = cross_rates.loc[(cross_rates['base'] == product_id) & (cross_rates['quote'].str.startswith("USDT"))].iat[0, 1] usd_volume.append(quote_volume * quote_price * rate) found = True break if found is False: usd_volume.append(NaN) cls.logger().error(f"Unable to convert volume to USD for market - {quote_name}.") all_markets["USDVolume"] = usd_volume return all_markets.sort_values(by = ["USDVolume"], ascending = False)
def test_deprecated_import(self):
    with tm.assert_produces_warning(FutureWarning):
        from pandas.io.json import json_normalize

        recs = [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}]
        json_normalize(recs)
    else:
        # current object is already a Linestring
        split_lines.append(river['geometry'].iloc[i])

river_merged = geopandas.GeoSeries(MultiLineString(split_lines))
river_merged.to_file(river_file_merged, driver='GeoJSON')

#-------------------------------------------------------------------------------
# output a file containing total miles by GEOMETRYID
#-------------------------------------------------------------------------------
data = json.load(open(trails_file, 'r'))

# extract the GEOMETRYID and coordinates from the geoJSON
df = json_normalize(data=data['features'])[[
    'properties.GEOMETRYID', 'geometry.coordinates'
]]
df.columns = ['GEOMETRYID', 'coordinates']

# extract the points to rows
df = df.explode('coordinates')

# get the coordinates of the next point
df['next_GEOMETRYID'] = df['GEOMETRYID'].shift(periods=-1)
df['next_coordinates'] = df['coordinates'].shift(periods=-1)
df = df[df['GEOMETRYID'] == df['next_GEOMETRYID']]

# split the lat/lon into columns
df.reset_index(inplace=True)
df[['start_lon', 'start_lat']] = DataFrame(df['coordinates'].tolist())
df[['end_lon', 'end_lat']] = DataFrame(df['next_coordinates'].tolist())
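# A hypothetical sketch of the step implied by the heading above ("total miles
# by GEOMETRYID"): a haversine distance per point pair, then a groupby sum.
# The column names follow the frame built above; the helper itself is an
# assumption, not part of the original script.
import numpy as np


def haversine_miles(lon1, lat1, lon2, lat2):
    """Great-circle distance in miles between two lon/lat points (vectorized)."""
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 3958.8 * np.arcsin(np.sqrt(a))

# df['miles'] = haversine_miles(df['start_lon'], df['start_lat'],
#                               df['end_lon'], df['end_lat'])
# total_miles = df.groupby('GEOMETRYID')['miles'].sum()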
# setup recurring data tables with no dynamic gameID requirements
import requests as rq
import pandas as pd

### conference table
response = rq.get("https://statsapi.web.nhl.com/api/v1/conferences")
conferences = pd.json_normalize(response.json()["conferences"]).set_index("id")
conferences.rename_axis("conferenceID", inplace=True)

### division table
response = rq.get("https://statsapi.web.nhl.com/api/v1/divisions")
divisions = pd.json_normalize(response.json()["divisions"]).set_index("id")
divisions.rename_axis("divisionID", inplace=True)

### team table
response = rq.get("https://statsapi.web.nhl.com/api/v1/teams")
teams = pd.json_normalize(response.json()["teams"]).set_index("id")
teams.rename_axis("teamID", inplace=True)

### team standings
response = rq.get("https://statsapi.web.nhl.com/api/v1/standings")
team_standings = pd.json_normalize(response.json()['records'],
                                   record_path=['teamRecords'],
                                   errors='ignore').set_index("team.id")
team_standings.rename_axis("teamID", inplace=True)

### last played game by teamID
response = rq.get(
    "https://statsapi.web.nhl.com/api/v1/teams?expand=team.schedule.previous")
last = pd.json_normalize(
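# A minimal, hypothetical sketch of joining the lookup tables above back onto
# the team table, assuming the normalized teams frame exposes "division.id" and
# "conference.id" columns from the nested objects in the API response:
#
#   teams_enriched = (teams
#                     .merge(divisions.add_prefix("division_"),
#                            left_on="division.id", right_index=True, how="left")
#                     .merge(conferences.add_prefix("conference_"),
#                            left_on="conference.id", right_index=True, how="left"))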
def df_authors(self, data): """Utility Returns inner json as a pandas dataframe, exposing authors + pubId. List of affiliations per each author are not broken down and are returned as JSON. So in essence you get one row per author. NOTE this method works only for publications searches -and it's clever enough to know if `authors` or `author_affiliations` (deprecated) field is used. Each publication.author_affiliations object has a nested list structure like this: ``` [[{'first_name': 'Laura', 'last_name': 'Pasin', 'orcid': '', 'current_organization_id': '', 'researcher_id': '', 'affiliations': [{'name': 'Department of Anesthesia and Intensive Care, Ospedale S. Antonio, Via Facciolati, 71, Padova, Italy'}]}, {'first_name': 'Sabrina', 'last_name': 'Boraso', 'orcid': '', 'current_organization_id': '', 'researcher_id': '', 'affiliations': [{'name': 'Department of Anesthesia and Intensive Care, Ospedale S. Antonio, Via Facciolati, 71, Padova, Italy'}]}, {'first_name': 'Ivo', 'last_name': 'Tiberio', 'orcid': '', 'current_organization_id': '', 'researcher_id': '', 'affiliations': [{'name': 'Department of Anesthesia and Intensive Care, Ospedale S. Antonio, Via Facciolati, 71, Padova, Italy'}]}]] ``` """ output = pd.DataFrame() if 'publications' in self.good_keys: if exists_key_in_dicts_list(data['publications'], "author_affiliations"): FIELD = "author_affiliations" elif exists_key_in_dicts_list(data['publications'], "authors"): FIELD = "authors" else: FIELD = "" if FIELD == "author_affiliations": # simplify deep nested dict structure for deprecated field for x in data['publications']: if 'author_affiliations' in x and x[ 'author_affiliations']: # if key exists and contents are not empty eg '[]' if type( x['author_affiliations'][0] ) == list: # then break down nested dict structure x['author_affiliations'] = x[ 'author_affiliations'][0] elif type( x['author_affiliations'] [0]) == dict: # = it's already been broken down pass else: # put in default empty element x['author_affiliations'] = [] elif FIELD == "authors": normalize_key("authors", data['publications'], []) if FIELD: output = json_normalize(data['publications'], record_path=[FIELD], meta=['id'], errors='ignore') output.rename(columns={"id": "pub_id"}, inplace=True) else: print( f"[Warning] Dataframe cannot be created as 'publications' were not found in data. Available: {self.good_keys}" ) return output
from sqlalchemy import create_engine
import json
from cook import Archiver
import requests
import tempfile
import os
import pandas as pd

if __name__ == "__main__":
    url = "https://www.nycgovparks.org/bigapps/DPR_CapitalProjectTracker_001.json"
    data = json.loads(requests.get(url).content)
    df = pd.DataFrame(data)
    df = df[["TrackerID", "FMSID", "Title", "TotalFunding", "Locations"]]
    df["Locations"] = df["Locations"].apply(lambda x: x.get("Location"))
    df2 = df.drop(columns=["Locations"]).join(
        df["Locations"].explode().to_frame())
    horiz_exploded = pd.json_normalize(df2["Locations"])
    horiz_exploded.index = df2.index
    df3 = pd.concat([df2, horiz_exploded], axis=1).drop(columns=["Locations"])
    df3 = df3.rename(
        columns={
            "TrackerID": "proj_id",
            "FMSID": "fmsid",
            "Title": "desc",
            "TotalFunding": "total_funding",
            "ParkID": "park_id",
            "Latitude": "lat",
            "Longitude": "lon"
        })
    df3 = df3[[
        "proj_id", "fmsid", "desc", "total_funding", "park_id", "lat", "lon"
    ]]
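# A generic, self-contained sketch of the pattern used above (explode a column
# holding lists of dicts, then json_normalize the pieces back onto the frame).
# The helper name and its arguments are illustrative only, not part of the
# original script.
import pandas as pd


def explode_and_flatten(frame: pd.DataFrame, column: str) -> pd.DataFrame:
    """Explode a list-of-dicts column and flatten each dict into new columns."""
    exploded = frame.drop(columns=[column]).join(frame[column].explode().to_frame())
    flat = pd.json_normalize(exploded[column])
    flat.index = exploded.index
    return pd.concat([exploded, flat], axis=1).drop(columns=[column])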
def df_concepts(self, data, key): """from a list of publications or grants including concepts, return a DF with one line per concept Enrich the dataframe with scores and other metrics. """ FIELD_NAME = "concepts" FIELD_NAME_SCORES = "concepts_scores" ROUNDING = 5 if not ('publications' in self.good_keys) and not ('grants' in self.good_keys): s = f"Dataframe can be created only with searches returning 'publications' or 'grants' . Available: {self.good_keys}" raise Exception(s) concepts = self.df_simple(data, key) if (FIELD_NAME not in concepts.columns) and (FIELD_NAME_SCORES not in concepts.columns): s = f"Dataframe requires raw concepts data, but no 'concepts' or 'concepts_scores' column was not found in: {concepts.columns.to_list()}" raise Exception(s) if not 'id' in concepts.columns: s = f"Dataframe requires an 'id' column for counting concepts, which was not found in: {concepts.columns.to_list()}" raise Exception(s) if FIELD_NAME_SCORES in concepts.columns: # use `concepts_scores` field preferably df = concepts.explode(FIELD_NAME_SCORES) df.dropna(subset=[FIELD_NAME_SCORES], inplace=True) # remove rows if there is no concept df.reset_index(inplace=True, drop=True) original_cols = [ x for x in df.columns.to_list() if x != FIELD_NAME_SCORES ] df = df.drop(FIELD_NAME_SCORES, 1).assign(**pd.json_normalize( df[FIELD_NAME_SCORES])) # unpack dict with new columns df = df[df.relevance != 0] # remove 0-relevance scores df['relevance'] = df['relevance'].round(ROUNDING) df.rename(columns={"relevance": "score"}, inplace=True) df['frequency'] = df.groupby('concept')['concept'].transform( 'count') df['concepts_count'] = df.groupby("id")['concept'].transform( 'size') else: # with traditional 'concepts', scores are simulated df = concepts.explode(FIELD_NAME) original_cols = [ x for x in df.columns.to_list() if x != FIELD_NAME ] df.dropna(subset=[FIELD_NAME], inplace=True) # remove rows if there is no concept df.rename(columns={FIELD_NAME: "concept"}, inplace=True) df['frequency'] = df.groupby('concept')['concept'].transform( 'count') df['concepts_count'] = df.groupby("id")['concept'].transform( 'size') ranks = df.groupby('id').cumcount() + 1 # scores = normalized rank from 0 to 1, where 1 is the highest rank df['score'] = ( (df['concepts_count'] + 1) - ranks) / df['concepts_count'] df['score'] = df['score'].round(ROUNDING) # finally df['score_avg'] = df.groupby('concept')['score'].transform( 'mean').round(ROUNDING) df.reset_index(drop=True, inplace=True) out_cols = original_cols + [ 'concepts_count', 'concept', 'score', 'frequency', 'score_avg' ] return df[out_cols]
def get_gif(access_token: str, min_lon: float, max_lat: float, max_lon: float, min_lat: float, ratio: float, colour: str, backgroundColour: str, alpha: float, activity_type: str, bg_img: str, duration: int): activities = requests.get('https://www.strava.com/api/v3/activities' + '?access_token=' + access_token + '&per_page=200' + '&page=' + str(1)) activities = activities.json() # convert activities to pandas dataframe df = json_normalize(activities) # filter df by type of activity if activity_type == 'Run': df = df[df['type'] == 'Run'] elif activity_type == 'Ride': df = df[df['type'] == 'Ride'] else: df = df[(df['type'] == 'Run') | (df['type'] == 'Ride')] # filter df by start coordinates using the bounding box df_bbox = df[(df['start_latitude'] < max_lat) & (df['start_latitude'] > min_lat) & (df["start_longitude"] < max_lon) & (df["start_longitude"] > min_lon)] df_bbox = df_bbox.sort_values(by=['start_date']) # create imagery based on bg_img if bg_img == 'sat': imagery = GoogleTiles(style='satellite') elif bg_img == 'osm': imagery = OSM() else: imagery = OSM() # create figure to plot routes on fig = plt.figure(figsize=(8, ratio * 8), frameon=False) ax = fig.add_subplot(1, 1, 1, projection=imagery.crs) fig.patch.set_visible(False) ax.set_extent([min_lon, max_lon, min_lat, max_lat]) ax.set_axis_off() # filepaths fp_out = 'image.gif' imgs = [] for i in range(len(df_bbox)): try: lat, lng = zip( *polyline.decode(df_bbox.iloc[i]['map.summary_polyline'])) except: print(i) plt.plot(lng, lat, transform=ccrs.Geodetic(), color=colour, alpha=alpha) imgs.append(fig2img(fig)) # create background image if bg_img == 'none': bg = Image.new(mode='RGBA', size=imgs[0].size, color=ImageColor.getrgb(backgroundColour)) else: fig = plt.figure(figsize=(8, ratio * 8), frameon=False) ax = fig.add_subplot(1, 1, 1, projection=imagery.crs) ax.set_extent([min_lon, max_lon, min_lat, max_lat]) fig.patch.set_visible(False) ax.set_axis_off() # set background imagery if one was sent ax.add_image(imagery, 15) # converting background to image bg = fig2img(fig) imgs = map(lambda img: Image.alpha_composite(bg, img), imgs) bg.save(fp=fp_out, format='GIF', append_images=imgs, save_all=True, duration=duration, loop=0) file = open('image.gif', 'rb') return {'gif': base64.b64encode(file.read())}
def download(self, symbol, start_date=None, end_date=None): if start_date is None: start_date = self._start_date if end_date is None: now = pd.Timestamp.now(self._calendar.tz) end_date = ( self._calendar.previous_close(now) .astimezone(self._calendar.tz) .normalize() ) full_code = self.get_full_code(symbol) url = "http://data.krx.co.kr/comm/bldAttendant/getJsonData.cmd" data = { "bld": self._bld, "isuCd": full_code, "isuCd2": "", "strtDd": start_date.strftime("%Y%m%d"), "endDd": end_date.strftime("%Y%m%d"), "share": "1", "money": "1", "csvxls_isNo": "false", } response = requests.post(url, data, headers=self._headers) df = pd.json_normalize(response.json()["output"]) if df.shape[0] == 0: return None column_names = { "TRD_DD": "Date", "ISU_CD": "Code", "ISU_NM": "Name", "MKT_NM": "Market", "SECUGRP_NM": "SecuGroup", "TDD_CLSPRC": "Close", "FLUC_TP_CD": "UpDown", "CMPPRVDD_PRC": "Change", "FLUC_RT": "ChangeRate", "TDD_OPNPRC": "Open", "TDD_HGPRC": "High", "TDD_LWPRC": "Low", "ACC_TRDVOL": "Volume", "ACC_TRDVAL": "Amount", "MKTCAP": "MarCap", "CMPPREVDD_PRC": "Change", "LIST_SHRS": "Shares", } df.rename(columns=column_names, inplace=True) int_columns = [ "Close", "UpDown", "Change", "ChangeRate", "Open", "High", "Low", "Volume", "Amount", "MarCap", "Shares", ] for col in int_columns: if col in df.columns: df[col] = pd.to_numeric(df[col].str.replace(",", ""), errors="coerce") df["Date"] = pd.to_datetime(df["Date"]) df.set_index("Date", inplace=True) return df
    script = schema['metadata']['github_url']

    if param == 'ALL_SCHEMA':
        table_name_git = '{}{}'.format(
            schema['metadata']['TablePrefix'],
            os.path.basename(schema['metadata']['target_S3URI']).lower())
    else:
        try:
            table_name_git = schema['metadata']['TableName']
        except:
            table_name_git = '{}{}'.format(
                schema['metadata']['TablePrefix'],
                os.path.basename(
                    schema['metadata']['target_S3URI']).lower())

    tb = pd.json_normalize(schema['schema']).to_markdown()
    toc = "{}{}".format(github_link, table_name_git)
    top_readme += '\n- [{0}]({1})'.format(table_name_git, toc)
    README += template.format(table_name_git, DatabaseName, target_S3URI,
                              partition, tb, script)

README = README.format(top_readme)

with open(os.path.join(str(Path(path).parent),
                       '00_data_catalog/README.md'), "w") as outfile:
    outfile.write(README)

### Create toc
### Update TOC in Github
parent_path = str(Path(path).parent)
for p in [
        parent_path,
        if elem == old:
            doubles.append(elem)
            old = None
            continue
        old = elem
    return doubles


# TODO: implement checks:
# no duplicate area ids
# no duplicate area ids
# no duplicate area names
# no duplicate zone ids

with open('zones.yaml', 'r') as f:
    zone_list = pd.json_normalize(yaml.safe_load(f)['zones'])
with open('areas.yaml', 'r') as f:
    area_list = pd.json_normalize(yaml.safe_load(f)['areas'])

dangling_zones = find_zones_without_valid_areas(zone_list, area_list)
if len(dangling_zones) > 0:
    print('ERROR: found zones without area')
    print(dangling_zones)

duplicate_areas = find_duplicate_area_ids(area_list)
if len(duplicate_areas) > 0:
    print('ERROR: found duplicate area ids')
    print(duplicate_areas)

# check_area_refs(area_dataMap['areas'], zone_dataMap['zones'])
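# A hypothetical sketch of what find_zones_without_valid_areas could look like,
# assuming the zone frame carries an "area" column that should match an "id"
# column in the area frame (the real column names may differ):
def find_zones_without_valid_areas(zones, areas):
    valid_area_ids = set(areas['id'])
    return zones[~zones['area'].isin(valid_area_ids)]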
xg_reg_start = xgb.Booster({'nthread': 4})
xg_reg_end = xgb.Booster({'nthread': 4})
xg_reg_start.load_model('Modeling/xgb_trip_starts_py.model')
xg_reg_end.load_model('Modeling/xgb_trip_ends_py.model')

# read in the data
old_data = pd.read_sql("SELECT * FROM last_12 WHERE is_pred = 0", con=conn)
old_data['datetime'] = old_data.datetime.dt.tz_localize('America/New_York')

# read in latest json
station_status = pd.read_json(
    "https://gbfs.citibikenyc.com/gbfs/en/station_status.json")
datetime = pd.to_datetime(station_status['last_updated'], unit='s')\
    .dt.tz_localize('UTC')\
    .dt.tz_convert('America/New_York')[0]
station_status = pd.json_normalize(station_status['data']['stations'])

# only retain relevant columns
data_to_append = station_status[[
    'station_id', 'num_bikes_available', 'num_docks_available'
]].drop_duplicates()
data_to_append['datetime'] = datetime
data_to_append['is_pred'] = 0

# combine data and delete observations > 24 hours old
new_data = old_data.append(data_to_append).drop_duplicates()
new_data['is_pred'] = 0
boolean = pd.to_datetime(new_data.datetime,
                         utc=True).dt.tz_convert('America/New_York') >= (
                             datetime - dt.timedelta(hours=24))
new_data = new_data.loc[boolean, :].reset_index(drop=True)
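# A hypothetical sketch of how the two boosters loaded above would typically be
# scored with xgboost (the real feature matrix used by this script is not shown
# in this snippet, so the column selection below is illustrative only):
#
#   features = new_data[['num_bikes_available', 'num_docks_available']]
#   dmat = xgb.DMatrix(features)
#   predicted_starts = xg_reg_start.predict(dmat)
#   predicted_ends = xg_reg_end.predict(dmat)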
def get_result_table(config: dict, results: list, total_epochs: int, highlight_best: bool, print_colorized: bool, remove_header: int) -> str: """ Log result table """ if not results: return '' tabulate.PRESERVE_WHITESPACE = True trials = json_normalize(results, max_level=1) trials['Best'] = '' if 'results_metrics.winsdrawslosses' not in trials.columns: # Ensure compatibility with older versions of hyperopt results trials['results_metrics.winsdrawslosses'] = 'N/A' trials = trials[['Best', 'current_epoch', 'results_metrics.trade_count', 'results_metrics.winsdrawslosses', 'results_metrics.avg_profit', 'results_metrics.total_profit', 'results_metrics.profit', 'results_metrics.duration', 'loss', 'is_initial_point', 'is_best']] trials.columns = ['Best', 'Epoch', 'Trades', 'W/D/L', 'Avg profit', 'Total profit', 'Profit', 'Avg duration', 'Objective', 'is_initial_point', 'is_best'] trials['is_profit'] = False trials.loc[trials['is_initial_point'], 'Best'] = '* ' trials.loc[trials['is_best'], 'Best'] = 'Best' trials.loc[trials['is_initial_point'] & trials['is_best'], 'Best'] = '* Best' trials.loc[trials['Total profit'] > 0, 'is_profit'] = True trials['Trades'] = trials['Trades'].astype(str) trials['Epoch'] = trials['Epoch'].apply( lambda x: '{}/{}'.format(str(x).rjust(len(str(total_epochs)), ' '), total_epochs) ) trials['Avg profit'] = trials['Avg profit'].apply( lambda x: '{:,.2f}%'.format(x).rjust(7, ' ') if not isna(x) else "--".rjust(7, ' ') ) trials['Avg duration'] = trials['Avg duration'].apply( lambda x: '{:,.1f} m'.format(x).rjust(7, ' ') if not isna(x) else "--".rjust(7, ' ') ) trials['Objective'] = trials['Objective'].apply( lambda x: '{:,.5f}'.format(x).rjust(8, ' ') if x != 100000 else "N/A".rjust(8, ' ') ) trials['Profit'] = trials.apply( lambda x: '{:,.8f} {} {}'.format( x['Total profit'], config['stake_currency'], '({:,.2f}%)'.format(x['Profit']).rjust(10, ' ') ).rjust(25+len(config['stake_currency'])) if x['Total profit'] != 0.0 else '--'.rjust(25+len(config['stake_currency'])), axis=1 ) trials = trials.drop(columns=['Total profit']) if print_colorized: for i in range(len(trials)): if trials.loc[i]['is_profit']: for j in range(len(trials.loc[i])-3): trials.iat[i, j] = "{}{}{}".format(Fore.GREEN, str(trials.loc[i][j]), Fore.RESET) if trials.loc[i]['is_best'] and highlight_best: for j in range(len(trials.loc[i])-3): trials.iat[i, j] = "{}{}{}".format(Style.BRIGHT, str(trials.loc[i][j]), Style.RESET_ALL) trials = trials.drop(columns=['is_initial_point', 'is_best', 'is_profit']) if remove_header > 0: table = tabulate.tabulate( trials.to_dict(orient='list'), tablefmt='orgtbl', headers='keys', stralign="right" ) table = table.split("\n", remove_header)[remove_header] elif remove_header < 0: table = tabulate.tabulate( trials.to_dict(orient='list'), tablefmt='psql', headers='keys', stralign="right" ) table = "\n".join(table.split("\n")[0:remove_header]) else: table = tabulate.tabulate( trials.to_dict(orient='list'), tablefmt='psql', headers='keys', stralign="right" ) return table
# Authenticate ourselves using tweepy so we can scrape the feed of CARES Bot
auth = tweepy.OAuthHandler(config.api_key, config.secret_key)
auth.set_access_token(config.access_token, config.access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

cares_bot_tweets = []

# Iteratively scrape the tweets from CARES Bot
for tweet in tweepy.Cursor(api.user_timeline,
                           screen_name="ExtendCaresUI").items():
    cares_bot_tweets.append(tweet)

cares_bot = pd.Series()

# Since we only want the URL of the image, we'll normalize the JSON data and
# extract the media_url key, which has the image link as a URL
for tweet in cares_bot_tweets:
    try:
        cares_bot = cares_bot.append(pd.json_normalize(
            tweet._json, ["entities", "media"])["media_url"],
            ignore_index=True)
    # It looks like some tweets don't have a media_url (perhaps when Data For
    # Progress launched this bot, they had tweets with no image). Given this,
    # let's skip over these tweets
    except KeyError:
        continue

# Write out the image URLs to a CSV file so we have a static representation of
# the data that was pulled and so the scraper doesn't have to be rerun
cares_bot.to_csv(
    "C:/Users/Rober/DATS_6103/project_2/data/cares_bot_image_urls.csv")
def export_csv_file(config: dict, results: list, total_epochs: int, highlight_best: bool, csv_file: str) -> None: """ Log result to csv-file """ if not results: return # Verification for overwrite if Path(csv_file).is_file(): logger.error(f"CSV file already exists: {csv_file}") return try: io.open(csv_file, 'w+').close() except IOError: logger.error(f"Failed to create CSV file: {csv_file}") return trials = json_normalize(results, max_level=1) trials['Best'] = '' trials['Stake currency'] = config['stake_currency'] base_metrics = ['Best', 'current_epoch', 'results_metrics.trade_count', 'results_metrics.avg_profit', 'results_metrics.total_profit', 'Stake currency', 'results_metrics.profit', 'results_metrics.duration', 'loss', 'is_initial_point', 'is_best'] param_metrics = [("params_dict."+param) for param in results[0]['params_dict'].keys()] trials = trials[base_metrics + param_metrics] base_columns = ['Best', 'Epoch', 'Trades', 'Avg profit', 'Total profit', 'Stake currency', 'Profit', 'Avg duration', 'Objective', 'is_initial_point', 'is_best'] param_columns = list(results[0]['params_dict'].keys()) trials.columns = base_columns + param_columns trials['is_profit'] = False trials.loc[trials['is_initial_point'], 'Best'] = '*' trials.loc[trials['is_best'], 'Best'] = 'Best' trials.loc[trials['is_initial_point'] & trials['is_best'], 'Best'] = '* Best' trials.loc[trials['Total profit'] > 0, 'is_profit'] = True trials['Epoch'] = trials['Epoch'].astype(str) trials['Trades'] = trials['Trades'].astype(str) trials['Total profit'] = trials['Total profit'].apply( lambda x: '{:,.8f}'.format(x) if x != 0.0 else "" ) trials['Profit'] = trials['Profit'].apply( lambda x: '{:,.2f}'.format(x) if not isna(x) else "" ) trials['Avg profit'] = trials['Avg profit'].apply( lambda x: '{:,.2f}%'.format(x) if not isna(x) else "" ) trials['Avg duration'] = trials['Avg duration'].apply( lambda x: '{:,.1f} m'.format(x) if not isna(x) else "" ) trials['Objective'] = trials['Objective'].apply( lambda x: '{:,.5f}'.format(x) if x != 100000 else "" ) trials = trials.drop(columns=['is_initial_point', 'is_best', 'is_profit']) trials.to_csv(csv_file, index=False, header=True, mode='w', encoding='UTF-8') logger.info(f"CSV file created: {csv_file}")
def test_empty_array(self):
    result = json_normalize([])
    expected = DataFrame()
    tm.assert_frame_equal(result, expected)
    df[columnName] = df[columnName].apply(apply_transliteration)
    df[columnName] = df[columnName].apply(to_lowerCase)
    df[columnName] = df[columnName].apply(process_URLs)
    df[columnName] = df[columnName].apply(filter_alpha_numeric)
    df[columnName] = df[columnName].apply(remove_punctuations)
    df[columnName] = df[columnName].apply(remove_non_ascii)
    df[columnName] = df[columnName].apply(trim)
    df[columnName] = df[columnName].apply(strip_whiteSpaces)
    df = remove_empty(df, columnName)
    df = df.reset_index(drop=True)
    print("Processing Complete !!")
    return df

############################ END ###############################

######################## Main Function #########################
if __name__ == "__main__":
    # Read data from command line
    data = sys.argv[1]
    with open(data) as f:
        json_data = json.load(f)
    df = pd.json_normalize(json_data)
    df = df.reindex(columns=list(json_data[0].keys()))

    # Preprocess Dataset
    df = preprocess(df, 'inputText')
    df.to_json(data.split('.')[0] + "_preprocessed.json", orient='records')

###################### END ###############################
def getHistoricData(): start_date = timer() print(start_date) with open('venv/data/search_results_output.jsonl', 'r') as json_file: json_list = list(json_file) search_df = pd.DataFrame(columns=[ 'asin', 'title', 'url', 'rating', 'reviews', 'price_crape', 'search_url', 'crape_date' ]) df_iniciado = False for index, json_str in enumerate(json_list): print("iteracion " + str(index + 1) + " de " + str(len(json_list))) result = json.loads(json_str) print("New Element") asin = re.search("B0[\d\w]{8}", str(result['url'])) # print("asin -> " + asin.group()) # print("title -> " + str(result['title'])) # print("url -> " + str(result['url'])) # print("rating -> " + str(result['rating'])) # print("reviews -> " + str(result['reviews'])) # print("price -> " + str(result['price'])) # print("search_url -> " + str(result['search_url'])) #Eliminar los que contengan picasso en la url ya que son productos promocionados, que pueden no tener nada que ver if "picassoRedirect" not in str(result['url']): new_row = { 'asin': asin.group(), 'title': str(result['title']), 'url': str(result['url']), 'rating': str(result['rating']), 'reviews': str(result['reviews']), 'price_crape': str(result['price']), 'search_url': str(result['search_url']), 'crape_date': str(datetime.now().strftime("%m/%d/%Y")) } try: inter_date_start = timer() json_response = amazonPriceRequest(asin.group()) json_normalized = json_normalize( data=json_response.json(), record_path='price_history', meta=['asin', 'currency', 'price_type']) if df_iniciado == False: json_df = json_normalized df_iniciado = True else: json_df = json_df.append(json_normalized, ignore_index=True, sort=False) search_df = search_df.append(new_row, ignore_index=True) inter_date_end = timer() print(timedelta(seconds=inter_date_end - inter_date_start)) except: print("Response - KO") df_merge = pd.merge(json_df, search_df, on='asin') df_merge.to_csv(r'venv/data/OSFPD.csv', index=False, header=True) end_date = timer() print(timedelta(seconds=end_date - start_date)) pass
def _download_file_make(pl_simple): g = rdflib.ConjunctiveGraph() g.bind("skos", rdflib.namespace.SKOS) # make triples # Base rtype = rdflib.namespace.RDF.type # Type scon = rdflib.namespace.SKOS.Concept # Concept plabel = rdflib.namespace.SKOS.prefLabel # prefLabel alabel = rdflib.namespace.SKOS.altLabel # altLabel broader = rdflib.namespace.SKOS.broader # broader narrower = rdflib.namespace.SKOS.narrower # narrower # add List namel = [] # broader name_bt = [] # narrower name_nw = [] # JSON convert to pandas.DataFrame nm = pd.json_normalize(pl_simple) # JSON query Get Concept, prefLabel and narrower base namelpl = nm.query('term == preferred_label and uri != ""') # get uri and term namelx = namelpl.loc[:, ['term', 'uri']].values for name in namelx: # print('prefLabel:'+str(name[0])+' '+str(name[1])) nameb = [rdflib.URIRef(str(name[1])), rtype, scon] namel.append(nameb) nameb = [ rdflib.URIRef(str(name[1])), plabel, rdflib.Literal(str(name[0])) ] namel.append(nameb) # narrower _add_check_term(name_nw, name[0], name[0], name[1]) # query altLabel namelal = nm.query('term != preferred_label and uri != ""') # get uri and term namelx = namelal.loc[:, ['term', 'uri']].values for name in namelx: # print('altLabel:' + str(name[0])+' '+str(name[1])) nameb = [ rdflib.URIRef(str(name[1])), alabel, rdflib.Literal(str(name[0])) ] namel.append(nameb) # create broader links # query broader_term namelbt = nm.query('broader_term != "" and uri != ""') # get uri and broader_term namelx = namelbt.loc[:, ['broader_term', 'term', 'uri']].values for name in namelx: _add_check_term(name_bt, name[0], name[1], name[2]) for namebt in name_bt: # print('namebt:', str(namebt[0]), # str(namebt[1]), str(namebt[2])) # query prefLabel wkquery =\ 'term == preferred_label and term == "' + str(namebt[0]) + '"' # print(wkquery) namelpl = nm.query(wkquery) # get uri and term namelx = namelpl.loc[:, ['term', 'uri']].values for name in namelx: nameb = [ rdflib.URIRef(str(namebt[2])), broader, rdflib.URIRef(str(name[1])) ] namel.append(nameb) # print('add broader:'+str(name[0])+' '+str(name[1])) # print("--- printing narrower ---") # create narrower links for namenw in name_nw: # query prefLabel wkquery =\ 'term == preferred_label and uri != "" and broader_term == "' +\ str(namenw[0]) + '"' # print(wkquery) namelpl = nm.query(wkquery) # get uri and term namelx = namelpl.loc[:, ['term', 'uri']].values for name in namelx: nameb = [ rdflib.URIRef(str(namenw[2])), narrower, rdflib.URIRef(str(name[1])) ] namel.append(nameb) # print('add narrower:'+str(name[0])+' '+str(name[1])) # Add List to Graph for name in namel: g.add((name[0], name[1], name[2])) return g
async def filereplicas(self, human_readable=None, **params): """Serves the file replicas known to phedex. Parameters ---------- block block name, with '*' wildcards, can be multiple (*). required when no lfn is specified. Block names must follow the syntax /X/Y/Z#, i.e. have three /'s and a '#'. Anything else is rejected. dataset dataset name. Syntax: /X/Y/Z, all three /'s obligatory. Wildcads are allowed. node node name, can be multiple (*) se storage element name, can be multiple (*) update_since unix timestamp, only return replicas updated since this time create_since unix timestamp, only return replicas created since this time complete y or n. if y, return only file replicas from complete block replicas. if n only return file replicas from incomplete block replicas. default is to return either. dist_complete y or n. if y, return only file replicas from blocks where all file replicas are available at some node. if n, return only file replicas from blocks which have file replicas not available at any node. default is to return either. subscribed y or n, filter for subscription. default is to return either. custodial y or n. filter for custodial responsibility. default is to return either. group group name. default is to return replicas for any group. lfn logical file name """ if type(human_readable) is not bool and human_readable is not None: raise Exception("Wrong human_readable parameter type") resjson = await self.jsonmethod("filereplicas", **params) out = [] for _block in resjson["phedex"]["block"]: for _file in _block["file"]: for _replica in _file["replica"]: out.append( { "Block_name": _block["name"], "Files": _block["files"], "Block_size_(GB)": _block["bytes"] / 1000000000.0, "lfn": _file["name"], "Checksum": _file["checksum"], "File_created_on": _file["time_create"], "File_replica_at": _replica["node"], "File_subcribed": _replica["subscribed"], "Custodial": _replica["custodial"], "Group": _replica["group"], "File_in_node_since": _replica["time_create"], } ) df = pandas.json_normalize(out) format_dates(df, ["File_created_on", "File_in_node_since"]) if human_readable is True: mapping = { "Block_name": "Block Name", "Block_size_(GB)": "Block size (GB)", "File_created_on": "File Created On", "File_replica_at": "File Replica At", "File_subcribed": "File Subcribed", "File_in_node_since": "File In Node Since", } df2 = df.rename(columns=mapping) return df2 else: return df
def _download_file_ev_serialize(pl_simple, p_format): # format is csv or xlsx df_json = [] df_json = pd.json_normalize(pl_simple) # print("--- printing "+p_format+" ---") df_org = df_json.copy() # delete word "[","]" df_org['synonym_candidate'] =\ df_org['synonym_candidate'].astype("string") df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].astype("string") df_org['synonym_candidate'] =\ df_org['synonym_candidate'].str.replace('[', '') df_org['synonym_candidate'] =\ df_org['synonym_candidate'].str.replace(']', '') df_org['synonym_candidate'] =\ df_org['synonym_candidate'].str.replace('\'', '') df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].str.replace('[', '') df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].str.replace(']', '') df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].str.replace('\'', '') # delete columns id hidden df_org.drop(columns=['id', 'hidden'], inplace=True) # header change df_org = df_org.rename( columns={ 'term': '用語名', 'preferred_label': '代表語', 'uri': '代表語のURI', 'broader_term': '上位語', 'broader_term_candidate': '上位語候補', 'synonym_candidate': '同義語候補', 'part_of_speech': '品詞', 'position_x': 'x座標値', 'position_y': 'y座標値', 'color1': '色1', 'color2': '色2', 'confirm': '確定済み用語' }) if p_format == 'csv': with tempfile.TemporaryFile("w+") as f: # encoding='utf-8', index=False df_org.to_csv(f, encoding='utf-8', index=False) f.seek(0) response = make_response() response.data = f.read() response.headers['Content-Type'] = 'text/csv' response.headers['Content-Disposition'] =\ 'attachment; filename=test_sample.csv' return response elif p_format == 'xlsx': downloadFileName = 'temp_excel.xlsx' df_org.to_excel(downloadFileName, encoding='utf-8', index=False) response = make_response() response.data = open(downloadFileName, "rb").read() response.headers['Content-Disposition'] = 'attachment;' response.mimetype = XLSX_MIMETYPE os.remove(downloadFileName) return response
async def blocklatency(self, human_readable=None, **params): """Show authentication state and abilities Parameters ---------- ability authorization ability. If passed then the nodes (from TMDB) that the user is allowed to use "ability" for are returned. require_cert if passed then the call will die if the user is not authenticated by certificate require_passwd if passed then the call will die if the user is not authenticated by password """ resjson = await self.jsonmethod("blocklatency", **params) out = [] if human_readable is not None and type(human_readable) is not bool: print("Wrong human_readable parameter type") df = pandas.json_normalize(out) return df elif human_readable is None or human_readable is False: for _block in resjson["phedex"]["block"]: for _destination in _block["destination"]: for _latency in _destination["latency"]: out.append( { "Block": _block["name"], "Block_ID": _block["id"], "Dataset": _block["dataset"], "Size": _block["bytes"], "Time_create": _block["time_create"], "Number_of_files": _block["files"], "Time_update": _block["time_update"], "Destination": _destination["name"], "custodial": _latency["is_custodial"], "last_suspend": _latency["last_suspend"], "last_replica": _latency["last_replica"], "time_subscription": _latency["time_subscription"], "block_closed": _latency["block_close"], "latency": _latency["latency"], } ) df = pandas.json_normalize(out) return format_dates( df, [ "Time_update", "last_suspend", "last_replica", "time_subscription", "block_closed", "Time_create", ], ) elif human_readable is True: for _block in resjson["phedex"]["block"]: for _destination in _block["destination"]: for _latency in _destination["latency"]: out.append( { "Block": _block["name"], "Block ID": _block["id"], "Dataset": _block["dataset"], "Size": _block["bytes"], "Time Create": _block["time_create"], "Number of files": _block["files"], "Time Update": _block["time_update"], "Destination": _destination["name"], "custodial": _latency["is_custodial"], "Last Suspend": _latency["last_suspend"], "Last Replica": _latency["last_replica"], "Time Subscription": _latency["time_subscription"], "Block Closed": _latency["block_close"], "Latency": _latency["latency"], } ) df = pandas.json_normalize(out) return format_dates( df, [ "Time Update", "Last Suspend", "Last Replica", "Time Subscription", "Block Closed", "Time Create", ], )
def main(): global Verbose_Flag parser = optparse.OptionParser() parser.add_option('-v', '--verbose', dest="verbose", default=False, action="store_true", help="Print lots of output to stdout") parser.add_option("--config", dest="config_filename", help="read configuration from FILE", metavar="FILE") parser.add_option( '-C', '--containers', dest="containers", default=False, action="store_true", help="for the container enviroment in the virtual machine") options, remainder = parser.parse_args() Verbose_Flag = options.verbose if Verbose_Flag: print('ARGV :', sys.argv[1:]) print('VERBOSE :', options.verbose) print('REMAINING :', remainder) if options.config_filename: print("Configuration file : {}".format(options.config_filename)) initialize(options) my_courses = list_my_courses() print("len(my_courses) are {0}".format(len(my_courses))) # set up the output write writer = pd.ExcelWriter('users_in_my_courses.xlsx', engine='xlsxwriter') for course in my_courses: if course['name'].find('do not use') >= 0: print("course id={0} name={1} -- skipping".format( course['id'], course['name'])) continue # if not (course['id'] in [16039, 17234]): # for testing only look at these courses # continue if (course['id'] in [ 85, # Canvas at KTH 4996, # Canvas at KTH 2.0 - New structure 5733, # Grunder, resultathantering och attestering för kursledare och examinatorer. (sv/en) 8356, # GDPR@KTH 17839, # Miljöutbildning 18339 # Vårt uppdrag ]): # skip the courses over all KTH faculty and staff continue print("course id={0} name={1}".format(course['id'], course['name'])) users = users_in_course(course['id']) if Verbose_Flag: print("users are: {0}".format(users)) if (users): users_df = pd.json_normalize(users) # below are examples of some columns that might be dropped columns_to_drop = [ 'associated_user_id', 'course_integration_id', 'created_at', 'end_at', 'enrollment_state', 'grades.current_grade', 'grades.current_score', 'grades.final_grade', 'grades.final_score', 'grades.html_url', 'grades.unposted_current_grade', 'grades.unposted_current_score', 'grades.unposted_final_grade', 'grades.unposted_final_score', 'html_url', 'id', 'last_activity_at', 'last_attended_at', 'limit_privileges_to_course_section', 'role', 'role_id', 'root_account_id', 'section_integration_id', 'sis_account_id', 'sis_section_id', 'start_at', 'total_activity_time', 'type', 'updated_at', 'user.created_at', 'user.id', 'user.integration_id' ] # keep the following: # 'sis_course_id', # 'sis_user_id', # 'user.login_id', # 'user.name', # 'user.short_name' # 'user.sis_user_id' # 'user.sortable_name,' # 'user_id' users_df.drop(columns_to_drop, inplace=True, axis=1) # the following was inspired by the section "Using XlsxWriter with Pandas" on http://xlsxwriter.readthedocs.io/working_with_pandas.html course_sheet_name = "{0}".format(course['name']) if (len(course_sheet_name) > 30): course_sheet_name = course_sheet_name[0:29] course_sheet_name = course_sheet_name.replace(':', '-') users_df.to_excel(writer, sheet_name=course_sheet_name) # Close the Pandas Excel writer and output the Excel file. writer.save()
def create_movie_info(num_rows: int): ##Creation of movie_info csv all_movies = pd.read_json("mflix_movies.json").rename(columns={ "_id": "movie_id" }).iloc[0:num_rows, :] #Released is dropped due to incorrect values award_df = pd.json_normalize(all_movies['awards']).rename( columns={ "wins": "award_wins", "nominations": "award_nominations", "text": "award_text" }) imdb_df = pd.json_normalize(all_movies['imdb']).rename(columns={ "rating": "imdb_rating", "votes": "imdb_votes", "id": "imdb_id" }) tomatoes_df = all_movies["tomatoes"] ## fill na values with empty dicts tomatoes_df = tomatoes_df.fillna({i: {} for i in tomatoes_df.index}) tomatoes_df_flat = pd.json_normalize(tomatoes_df) tomatoes_df_final = tomatoes_df_flat.rename( columns={ "viewer.rating": "tomato_viewer_rating", "viewer.numReviews": "tomato_viewer_num_reviews", "viewer.meter": "tomato_viewer_meter", "lastUpdated.$date": "tomato_lastupdated", "fresh": "tomato_fresh", "rotten": "tomato_rotten", "critic.rating": "tomato_critic_rating", "critic.numReviews": "tomato_critic_num_reviews", "critic.meter": "tomato_critic_meter", "dvd.$date": "tomato_dvd_date", "website": "tomato_website", "production": "tomato_production", "consensus": "tomato_consensus" }).drop("dvd.$date.$numberLong", axis=1) #Released is dropped due to incorrect values movies_info = all_movies.drop([ "genres", "directors", "countries", "cast", "writers", "languages", "released", "awards", "imdb", "tomatoes" ], axis=1) movies_info = pd.merge(movies_info, award_df, right_index=True, left_index=True) movies_info = pd.merge(movies_info, imdb_df, right_index=True, left_index=True) movies_info = pd.merge(movies_info, tomatoes_df_final, right_index=True, left_index=True) #change id to correct key value from dict movies_info["movie_id"] = movies_info["movie_id"].apply( lambda x: x["$oid"]) return movies_info, all_movies
    anio_fin = lista_fechas_final[i].year
    mes_fin = str(lista_fechas_final[i].month).zfill(2)
    dia_fin = str(lista_fechas_final[i].day).zfill(2)
    print(anio_ini)
    print(mes_ini)
    print(dia_ini)

    address = "https://ws01.cenace.gob.mx:8082/SWPSC/SIM/" + sistema + "/" + proceso + "/" + str(
        anio_ini) + "/" + str(mes_ini) + "/" + str(
            dia_ini) + "/" + str(anio_fin) + "/" + str(
                mes_fin) + "/" + str(dia_fin) + "/json"
    print(address)

    info = requests.get(address).json()
    proceso = info["proceso"]
    sistema = info["sistema"]
    info = info["Resultados"]

    for i in range(len(info)):
        temporal = pd.json_normalize(info[i], 'Valores')
        temporal['clv_zona_reserva'] = "ZONA " + str(i + 1)
        print(temporal.head(10))
        temporal['proceso'] = proceso
        temporal['sistema'] = sistema
        appended_data_list.append(temporal)

appended_data = pd.concat(appended_data_list)
csv_file = 'cenace_' + str(
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
appended_data.to_csv(csv_file, index=False)