def build_meta(inid):
    """Perform pre-processing for the metadata files.

    Reads the indicator's metadata file, merges the latest git update
    info into the YAML front matter, and writes the result back out.

    Args:
        inid -- str: The indicator id, e.g. '1-1-1'

    Returns:
        bool: Status
    """
    status = True
    # Read and write paths may be different
    fr = indicator_path(inid, ftype='meta', mode='r')
    fw = indicator_path(inid, ftype='meta', mode='w')
    meta = yamlmd.read_yamlmd(fr)
    # meta[0] is the YAML front matter; merge the git info in one call
    # instead of copying key by key.
    git_update = sdg.git.get_git_updates(inid)
    meta[0].update(git_update)
    yamlmd.write_yamlmd(meta, fw)
    return status
def get_git_update(inid, ftype):
    """Get the latest git history for an indicator's file.

    Changes into the working directory of the file (it might be a
    submodule) and reads the most recent commit touching that file.

    Args:
        inid -- str: The indicator id, e.g. '1-1-1'
        ftype -- str: The file type, e.g. 'data' or 'meta'

    Returns:
        dict: date, sha, file (repo-relative path), id, and commit_url
    """
    f = indicator_path(inid, ftype=ftype, mode='r')
    f_dir, f_name = os.path.split(f)
    repo = git.Repo(f_dir, search_parent_directories=True)
    # Need to translate relative to the repo root (this may be a submodule)
    repo_dir = os.path.relpath(repo.working_dir, os.getcwd())
    f = os.path.relpath(f, repo_dir)
    # Most recent commit that touched this file
    commit = next(repo.iter_commits(paths=f, max_count=1))
    git_date = str(commit.committed_datetime.date())
    git_sha = commit.hexsha
    # Turn the remote URL into a commit URL. Use a raw string for the
    # regex: the original '\.'/'\/' escapes are invalid string escapes
    # (SyntaxWarning on modern Python). Handles both SSH (github.com:)
    # and HTTPS (github.com/) remote forms.
    remote = repo.remote().url
    remote_bare = re.sub(r'^.*github\.com(:|/)', '', remote).replace('.git', '')
    commit_url = 'https://github.com/' + remote_bare + '/commit/' + git_sha
    return {
        'date': git_date,
        'sha': git_sha,
        'file': f,
        'id': inid,
        'commit_url': commit_url
    }
def main():
    """Run csv checks on all indicator csvs in the data directory"""
    # Make sure there is somewhere for the output to land.
    # NOTE(review): this hard-codes "data" while the copy below uses
    # indicator_path — confirm the two always agree.
    os.makedirs("data", exist_ok=True)

    inids = sdg.path.get_ids()
    print("Building csvs for " + str(len(inids)) + " indicators...")

    # Build every indicator; overall status is True only if all succeed.
    # Build the full results list first so no indicator is skipped.
    results = [build_csv(inid) for inid in inids]
    status = all(results)

    print("Copying goals info...")
    source_dir = indicator_path(ftype='data', mode='r')
    target_dir = indicator_path(ftype='data', mode='w')
    for goal_csv in glob.glob(os.path.join(source_dir, 'sdg*.csv')):
        shutil.copy(goal_csv, target_dir)

    return status
def build_csv(inid):
    """
    For a given ID pull in the raw data and write out the website csv

    Returns:
        bool: Status
    """
    source = indicator_path(inid, ftype='data', mode='r')
    destination = indicator_path(inid, ftype='data', mode='w')
    try:
        shutil.copy(source, destination)
    except Exception as e:
        # Best-effort: report the failure and carry on with other indicators
        print(inid, e)
        return False
    return True
def compare_reload(inid, which='edges'):
    """Load the original csv and compare to reloading the JSON you wrote out.

    Args:
        inid -- str: The indicator id, e.g. '1-1-1'
        which -- str: 'edges' or 'data'

    Returns:
        bool: True if the csv and the reloaded JSON match
    """
    csv_path = indicator_path(inid, ftype=which, mode='w')
    # Use a context manager so the JSON file handle is closed promptly
    # (the original json.load(open(...)) leaked the handle).
    with open(indicator_path(inid, 'json', mode='w')) as json_file:
        jsn = json.load(json_file)
    df_csv = pd.read_csv(csv_path, encoding='utf-8')
    # JSON null comes back as None; normalise to NaN so frames compare
    df_jsn = pd.DataFrame(jsn[which]).replace({None: np.nan})
    # Account for empty data
    if df_jsn.shape[0] == df_csv.shape[0] == 0:
        return True
    # Align column order with the csv before comparing
    df_jsn = df_jsn[df_csv.columns.values]
    status = isclose_df(df_csv, df_jsn)
    if not status:
        print("reload " + which + " error in " + inid)
    return status
def write_json(inid, orient='list', gz=False):
    """Write out the main csv and edge data as a single json file.

    This can either be as records (orient='records') or as columns
    (orient='list').

    Args:
        inid -- str: The indicator id, e.g. '1-1-1'
        orient -- str: either 'records' for rowwise, or 'list' for colwise
        gz -- bool: if True then compress the output with gzip

    Return:
        status. bool.
    """
    try:
        # Serialise both payloads together so the output is one document.
        # NOTE(review): pd.io.json.dumps is a long-deprecated pandas
        # internal — confirm it still exists in the pinned pandas version.
        payload = pd.io.json.dumps({
            'data': get_main_data(inid, orient=orient),
            'edges': get_edge_data(inid, orient=orient)
        })
        payload = payload.replace("\\/", "/")  # why does it double escape?

        out_path = indicator_path(inid, 'json', mode='w')
        if gz:
            with gzip.open(out_path + '.gz', 'w') as outfile:
                outfile.write(payload.encode('utf-8'))
        else:
            with open(out_path, 'w', encoding='utf-8') as outfile:
                outfile.write(payload)
    except Exception as e:
        print(inid, e)
        return False
    return True
def main():
    """Process the metadata files ready for site build"""
    ids = sdg.path.get_ids()
    print("Building " + str(len(ids)) + " metadata files...")

    # Make sure they have somewhere to go
    os.makedirs(indicator_path(ftype='meta', mode='w'), exist_ok=True)

    status = True
    for inid in ids:
        try:
            # Bitwise & (not `and`) so a failure never short-circuits the
            # call — every indicator still gets built.
            status = status & build_meta(inid)
        except Exception as e:
            # Best-effort: record the failure and keep going
            status = False
            print(inid, e)

    return status
def get_main_data(inid, orient='records'):
    """Read the main csv data and return as a json ready object.

    Args:
        inid --- str. indicator id. e.g. '1-1-1'
        orient --- either 'records' for rowwise, or 'list' for colwise

    Return:
        Depending on orient either a list of dicts (rowwise) or dict of
        lists (colwise). Returns False if the csv cannot be read, and an
        empty list for an empty frame (pre-existing contract).
    """
    csv_file = indicator_path(inid, 'data', mode='w')
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
    except Exception as e:
        print(inid, e)
        return False
    # Empty frames serialise as an empty list rather than a dict of columns
    return list() if df.shape[0] < 1 else df_nan_to_none(df, orient=orient)
def get_edge_data(inid, orient):
    """Read the edge file associated with a main data csv and return as a
    json ready object.

    Args:
        inid --- str. indicator id. e.g. '1-1-1'
        orient --- either 'records' for rowwise, or 'list' for colwise

    Return:
        Depending on orient either a list of dicts (rowwise) or dict of
        lists (colwise). Returns False if the csv cannot be read, and an
        empty list for an empty frame (pre-existing contract).
    """
    edge_file = indicator_path(inid, 'edges', mode='w')
    try:
        edges = pd.read_csv(edge_file, encoding='utf-8')
    except Exception as e:
        print(inid, e)
        return False
    # Empty frames serialise as an empty list rather than a dict of columns
    return list() if edges.shape[0] < 1 else df_nan_to_none(edges, orient=orient)