def test_function_regression(func):
    data = rescale(
        np.load(here(f'tests/regression_files/input_{func.ndim}d.npy')),
        range_in=ValueRange(0, 1),
        range_out=ValueRange(*func.bounds),
    )
    for fid in func.fidelity_names:
        output = np.load(
            here(f'tests/regression_files/output_{func.ndim}d_{func.name}_{fid}.npy')
        )
        np.testing.assert_allclose(func[fid](data), output)

def _find_package_in_our_project(name: str) -> Optional[str]:
    """Search the name in the objects of the package we are developing.

    Args:
        name: package name

    Returns:
        import_string: String required to import the package.
    """
    # Find the package name
    try:
        project_package = os.path.basename(here()).replace("-", "_")
    except RecursionError:  # pragma: no cover
        # I don't know how to make a test that raises this error :(
        # To manually reproduce, follow the steps of
        # https://github.com/lyz-code/autoimport/issues/131
        return None
    package_objects = extract_package_objects(project_package)

    # nocover: as the tests are run inside the autoimport virtualenv, it will
    # always find the objects on that package
    if package_objects is None:  # pragma: nocover
        return None
    try:
        return package_objects[name]
    except KeyError:
        return None

def load_ice_cream():
    data = (
        pd.read_csv(here() / "data/ice_cream_shop.csv", index_col=0)
        .reset_index(drop=True)
        .select_columns(["shopname", "num_customers", "owner_idx", "num_favs"])
    )
    return data

def load_sterilization():
    df = (
        pd.read_csv(
            here() / "data/sterilization.csv",
            na_filter=True,
            na_values=["#DIV/0!"],
        )
        .clean_names()
        .label_encode("treatment")
    )
    mapping = dict(zip(df["treatment"], df["treatment_enc"]))
    return df, mapping

def build_tables(
    args: typing.List[str] = typer.Option(
        [], help='Additional arguments passed to `dbt run`'
    )
):
    """Build tables from base data using dbt"""
    project_dir = pyprojroot.here() / 'dbt'
    profiles_dir = pyprojroot.here() / '.dbt'
    base_args = [
        'run',
        '--profiles-dir', str(profiles_dir),
        '--project-dir', str(project_dir),
    ]

    # NOTE: Python API is not officially supported, so
    # watch out if you change dbt versions...
    typer.secho('Building tables with dbt', fg=typer.colors.BLUE)
    _ = dbt.main.handle_and_check(base_args + list(args))

def download_links(to=here("./data/raw/veneto")):
    for link in get_all_pdf_links():
        out_path = Path(to) / clean_filename(Path(link).name)
        if not out_path.exists():
            logger.info(f'Downloading file {link}')
            r = requests.get(link)
            with open(out_path, "wb") as file:
                file.write(r.content)

def read_mkdocs() -> Dict:
    """Parse mkdocs.yml in project root dir."""
    with open(here() / "mkdocs.yml", "r") as f:
        mkdocs_config = yaml.safe_load(f.read())
    return mkdocs_config

def nb_app():
    nb = here() / "notebooks" / "wealth_of_nederland.ipynb"
    voila_args = [str(nb), "--no-browser"]
    voila_app = VoilaTest.instance()
    voila_app.initialize(voila_args)
    voila_app.start()
    yield voila_app
    voila_app.stop()
    voila_app.clear_instance()

def calh_app():
    calh_notebook = here() / 'calh.ipynb'
    voila_args = [str(calh_notebook), '--no-browser']
    voila_app = VoilaTest.instance()
    voila_app.initialize(voila_args)
    voila_app.start()
    yield voila_app
    voila_app.stop()
    voila_app.clear_instance()

def load_baseball():
    df = pd.read_csv(here() / "data/baseballdb/core/Batting.csv")
    df["AB"] = df["AB"].replace(0, np.nan)
    df = df.dropna()
    df["batting_avg"] = df["H"] / df["AB"]
    df = df[df["yearID"] >= 2016]
    df = df.iloc[0:15]
    return df

def main():
    my_path = str(here('data-processed'))
    try:
        model = sys.argv[1]
    except IndexError:
        model = None

    # check_metadata(my_path, model)
    check_formatting(my_path, model)

def download_file(url, fpth, song_id=None, use_here=True):
    if song_id is None:
        song_id = get_song_id(url)
    if use_here:
        fpth = str(here(fpth))
    r = requests.get(url, stream=True, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        with open(fpth, 'wb') as f:
            f.write(r.content)
    else:
        warnings.warn(f"URL Failed: {url}")

def process_raw_pdf():
    governorates_pages = {
        "Tunis": 40,
        "Ariana": 42,
        "Ben Arous": 44,
        "Manouba": 45,
        "Nabeul": 49,
        "Zagouan": 50,
        "Bizerte": 52,
        "Beja": 55,
        "Jandouba": 56,
        "Kef": 58,
        "Seliana": 62,
        "Sousse": 66,
        "Monastir": 68,
        "Mahdia": 69,
        "Sfax": 71,
        "Kairouan": 73,
        "Kasserine": 75,
        "Sidi Bouzid": 76,
        "Gabes": 79,
        "Médnine": 82,
        "Tataouine": 82,
        "Gafsa": 85,
        "Tozeur": 86,
        "Kebili": 87,
    }
    governorates_dfs = []
    for governorate_name, page in governorates_pages.items():
        table_index = 1 if governorate_name == "Tataouine" else 0
        df = (
            tabula.read_pdf(
                input_pdf_path,
                pages=page,
                multiple_tables=True,
                pandas_options={"header": None},
            )[table_index]
            .pipe(cleaning_pipeline)
            .assign(Gouvernorat=governorate_name)
        )
        governorates_dfs.append(df)

    combined_df = pd.concat(governorates_dfs, ignore_index=True)
    combined_df.to_csv(
        here("data/processed/poverty_rate_Tunisia_2020.csv"), index=False
    )
    return combined_df

def create_dataset():
    base_path = here("./data/raw/veneto")
    for file in base_path.glob('*.pdf'):
        parse_pdf(file)

    processed_path = Path(here("./data/processed/veneto"))
    logger.info('Merging all dataframes and writing to CSV...')
    df = pd.concat(
        (pd.read_csv(csv_file) for csv_file in processed_path.glob('*.csv')),
        sort=False,
    )
    df = df.rename(columns={df.columns[0]: "codice"})
    df.loc[df.codice == 1, 'provincia'] = 'Padova'
    df.loc[df.codice == 2, 'provincia'] = 'Padova'
    veneto = (
        df.groupby(['data', 'provincia'])
        .sum()
        .drop('codice', axis=1)
        .reset_index()
    )

    istat = pd.read_csv(
        here('./data/raw/istat/popolazione/DCIS_POPRES1_06052020222758332.csv')
    )
    istat = istat[
        (istat.STATCIV2 == 99)
        & (istat.ETA1 == 'TOTAL')
        & (istat.SEXISTAT1 == 9)
    ][['ITTER107', 'Territorio', 'Value']].rename(columns={'Value': 'popolazione'})
    province = istat[istat.ITTER107.str.match(r'IT\w\d\d\b')].drop('ITTER107', axis=1)

    merged = veneto.merge(
        province, left_on='provincia', right_on='Territorio'
    ).drop('Territorio', axis=1)

    diffs = merged.groupby('provincia').diff()
    diffs.columns = [
        'nuovi_positivi',
        'variazione_totale_positivi',
        'nuovi_deceduti',
        'nuovi_negativizzati',
        'drop',
    ]
    diffs = diffs.drop('drop', axis=1)

    calculate_per_1M_pop(
        pd.merge(merged, diffs, left_index=True, right_index=True)
    ).to_csv(Path(here("./data/processed/veneto.csv")))

def main():
    """
    Main function to call helper functions to generate paths for each type of
    data in the bucket, derive file_identifier from Sentinel-2, search for
    files in the S2 metadata and ground truth data for the corresponding
    filename, and organize the URIs for the data using pandas.
    """
    # spyder up to find the root
    root = here(project_files=[".here"])

    # append to path
    sys.path.append(str(here()))

    # Construct file paths for worldfloods public Google Cloud storage bucket folder
    paths_train, paths_test, paths_val = construct_worldfloods_public_filepaths()

    # Extract the file identifiers/names of files without the file extension
    # to utilize for search.
    file_id_train = get_file_identifier_from_s2(paths_train)
    file_id_test = get_file_identifier_from_s2(paths_test)
    file_id_val = get_file_identifier_from_s2(paths_val)

    # Create a pandas DataFrame containing image URIs
    df_train = uri_table_from_file_identifier(file_id_train, paths_train)
    df_test = uri_table_from_file_identifier(file_id_test, paths_test)
    df_val = uri_table_from_file_identifier(file_id_val, paths_val)

    # Concatenate the pandas DataFrames along axis=0 to
    # add testing and validation data to the same DataFrame.
    df_train_test_val = pd.concat([df_train, df_test, df_val], axis=0)

    # Save the DataFrame as a comma-separated value (.csv) file for future use.
    df_train_test_val.to_csv(
        Path(root).joinpath("datasets/trials/image_meta_table.csv")
    )  # save it to a bucket

def scvi_impute() -> None:
    fnm: str = "sc_10x_5cl_forimput_cnt.csv"
    save_path: PosixPath = here('./10xGenomics/scRNAseq')
    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)
    vae = VAE(symsim_dataset.nb_genes)
    trainer = UnsupervisedTrainer(
        vae,
        symsim_dataset,
        train_size=1.0,
        use_cuda=use_cuda,
        frequency=5,
    )
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(
        trainer.model, symsim_dataset, indices=np.arange(len(symsim_dataset))
    )
    impute_values = full.sequential().imputation()
    outfnm: str = "scvi_impt.csv"
    out_path = here("./10xGenomics/impt/").joinpath(outfnm)
    np.savetxt(out_path, impute_values, delimiter=",")

def migrate(interactive: bool = True):
    """
    Migrate database to the current schema (as defined in nbs/db.ipynb)
    """
    initialize_db()

    # Get names of tables generated by dbt and exclude them from the migration
    dbt_models_path = pyprojroot.here() / 'dbt' / 'models'
    dbt_tables = [f.stem for f in dbt_models_path.glob('**/*.sql')]

    # Migrate database tables
    typer.secho('Migrating database tables...', fg=typer.colors.BRIGHT_BLACK)
    understatdb.db.DB.evolve(
        ignore_tables=understatdb.db.EVOLVE_IGNORE_TABLES + dbt_tables,
        interactive=interactive,
    )
    typer.secho('Done!', fg=typer.colors.GREEN, bold=True)

def scvi_impute(seed: int = 1, platform: str = "umi") -> None:
    fnm: str = f"sim_{ncell}_{ngene}_{seed}_{platform}_.csv"
    save_path: PosixPath = here('./scVI/data/symsim')
    # fullpath: PosixPath = here('./scVI/data/symsim').joinpath(fnm)
    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)
    vae = VAE(symsim_dataset.nb_genes)
    trainer = UnsupervisedTrainer(
        vae,
        symsim_dataset,
        train_size=1.0,
        use_cuda=use_cuda,
        frequency=5,
    )
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(
        trainer.model, symsim_dataset, indices=np.arange(len(symsim_dataset))
    )
    impute_values = full.sequential().imputation()
    out_path = here("./simutool/jobs/scvi_result").joinpath(fnm)
    np.savetxt(out_path, impute_values, delimiter=",")

def test_here(tmp_path: Path, project_file: str, child_dir: str) -> None:
    """
    This test uses pytest's tmp_path facilities to create a simulated project
    directory, and checks that the path is correct.
    """
    # Create project file
    (tmp_path / project_file).write_text("blah")

    # Create child dirs
    child_path = tmp_path / child_dir
    child_path.mkdir(parents=True)
    chdir(child_path)
    assert Path.cwd() == child_path

    # Verify the project against the current working directory
    assert here() == tmp_path

def test_here(tmpdir, project_files, child_dir):
    """
    This test uses pytest's tmpdir facilities to create a simulated project
    directory, and checks that the path is correct.
    """
    # Create project file
    temp_dir = Path(tmpdir)
    path = temp_dir / project_files
    with path.open("w") as file_path:
        file_path.write("blah")

    # Create child dirs
    (temp_dir / child_dir).mkdir(parents=True)
    chdir(temp_dir / child_dir)
    assert Path().cwd() == (temp_dir / child_dir)

    # Verify the project against the current working directory
    current_path = here()
    assert current_path == temp_dir

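# The two test_here variants above receive `project_file(s)` and `child_dir`
# from fixtures or parametrization defined elsewhere in their test suites.
# A minimal, self-contained sketch of one way such a test could be
# parametrized; the marker-file and directory names below are hypothetical
# and assumed to be among here()'s default root criteria.
import pytest
from os import chdir
from pathlib import Path

from pyprojroot import here


@pytest.mark.parametrize("project_file", [".here", "setup.py"])
@pytest.mark.parametrize("child_dir", ["src", "src/nested/pkg"])
def test_here_parametrized(tmp_path: Path, project_file: str, child_dir: str) -> None:
    # Simulate a project root by creating the marker file
    (tmp_path / project_file).write_text("blah")

    # Change into a nested child directory
    child_path = tmp_path / child_dir
    child_path.mkdir(parents=True)
    chdir(child_path)

    # here() should walk back up to the simulated project root
    assert here() == tmp_path
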
def __init__(self, exp_dir: str, write_mode=True, if_exists: str = 'error'):
    self.project_dir = here()  # will this work if installed as a library somewhere else?
    self.logging_directory = f"{self.project_dir}/models/{exp_dir}"
    self.results_dir = f"{self.logging_directory}/results"
    self.specification_dir = f"{self.logging_directory}/specification"
    if write_mode:
        _check_directory(self.logging_directory, if_exists)
        os.makedirs(self.logging_directory, exist_ok=True)
        os.makedirs(self.results_dir, exist_ok=True)
        os.makedirs(self.specification_dir, exist_ok=True)
    # self.logging_directory = logging_directory
    self.logger = getLogger(str(uuid.uuid4()))
    self.log_path = os.path.join(self.logging_directory, 'log.txt')
    self.logger.addHandler(FileHandler(self.log_path))
    self.logger.setLevel(DEBUG)

def _find_package_in_our_project(name: str) -> Optional[str]:
    """Search the name in the objects of the package we are developing.

    Args:
        name: package name

    Returns:
        import_string: String required to import the package.
    """
    # Find the package name
    project_package = os.path.basename(here()).replace("-", "_")
    package_objects = extract_package_objects(project_package)

    # nocover: as the tests are run inside the autoimport virtualenv, it will
    # always find the objects on that package
    if package_objects is None:  # pragma: nocover
        return None
    try:
        return package_objects[name]
    except KeyError:
        return None

use observable variables.

To execute this module by itself, navigate at the command line to the
project's root directory and type:
`python workflow/testing_images_conditional.py`.
"""
from pathlib import Path

import causal2020.testing.observable_independence as oi
import click
import pandas as pd
from causal2020 import utils
from causal2020.graphs.conditional_independence_example import EXAMPLE_GRAPH
from pyprojroot import here

# Declare paths to data
DATA_PATH = here(
    "data/raw/spring_2016_all_bay_area_long_format_plus_cross_bay_col.csv"
)

# Note the columns of interest in the dataset
MODE_ID_COL = "mode_id"
TIME_COL = "total_travel_time"
COST_COL = "total_travel_cost"
DISTANCE_COL = "total_travel_distance"


def create_conditional_independence_testing_results(
    output_path: str,
    num_permutations: int = 100,
    permuted_color: str = "#a6bddb",
) -> None:
    """
    Computes and stores the results of permutation testing the implication

import autofit as af
import autolens as al
import autolens.plot as aplt

"""
This script simulates _Imaging_ of a strong lens where:

 - The lens galaxy's _MassProfile_ is a *SphericalIsothermal*.
 - The source galaxy's _LightProfile_ is a *SphericalExponential*.

This dataset is used in chapter 2, tutorials 1-3.
"""

"""Setup the path to the autolens_workspace, using a relative directory name."""
from pyprojroot import here

workspace_path = str(here())

"""
The 'dataset_type' describes the type of data being simulated (in this case, _Imaging_ data) and
'dataset_name' gives it a descriptive name. They define the folder the dataset is output to on your hard-disk:

 - The image will be output to '/autolens_workspace/dataset/dataset_type/dataset_name/image.fits'.
 - The noise-map will be output to '/autolens_workspace/dataset/dataset_type/dataset_name/lens_name/noise_map.fits'.
 - The psf will be output to '/autolens_workspace/dataset/dataset_type/dataset_name/psf.fits'.
"""
dataset_type = "chapter_2"
dataset_name = "lens_sis__source_exp"

"""
Create the path where the dataset will be output, which in this case is:
'/autolens_workspace/howtolens/dataset/chapter_2/lens_sis__source_exp/'
"""
dataset_path = af.util.create_path(
    path=workspace_path,

#!/usr/bin/env python
# coding: utf-8
"""makes error_across_days.csv, source data for figure that plots
error across days for Bengalese Finch Song Repository dataset"""
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import pyprojroot
import seaborn as sns
import vak

REPO_ROOT = pyprojroot.here()
BFSONGREPO_RESULTS_ROOT = REPO_ROOT.joinpath(
    'results/BFSongRepository').expanduser().resolve()
ERROR_ACROSS_DAYS_CSV_PATH = BFSONGREPO_RESULTS_ROOT.joinpath(
    'error_across_days.csv')

# used below to determine which columns are metrics,
# and which of those should be converted to error
METRICS = [
    'acc',
    'acc_majority_vote',
    'acc_min_dur_maj_vote',
    'acc_min_segment_dur',
    'levenshtein',
    'levenshtein_majority_vote',
    'levenshtein_min_dur_maj_vote',
    'levenshtein_min_segment_dur',
    'segment_error_rate',

import sys, os

from pyprojroot import here

# spyder up to find the root
root = here(project_files=[".here"])

# append to path
sys.path.append(str(here()))

from datetime import timedelta
from datetime import datetime

import pandas as pd
import geopandas as gpd
from pathlib import Path
import ee

from ml4floods.data import ee_download
from ml4floods.data.copernicusEMS import activations
from ml4floods.data.create_gt import generate_water_cloud_binary_gt
from ml4floods.data.io import save_groundtruth_tiff_rasterio
from ml4floods.data.utils import (
    remove_gcp_prefix,
    get_files_in_directory_gcp,
    read_pickle_from_gcp,
    save_file_to_bucket,
)
from ml4floods.data.utils import GCPPath

from typing import Tuple
from collections import namedtuple

import tqdm

ActivationFile = namedtuple(

import scipy.stats  # noqa: E402
from causal2020.graphs.drive_alone_utility import (  # noqa: E402 noreorder
    DRIVE_ALONE_UTILITY,
    LATENT_DRIVE_ALONE_UTILITY,
)
from causal2020.utils import sample_from_factor_model  # noqa: E402 noreorder
from pyprojroot import here

# -

# ## Set notebook parameters

# +
# Parameters

# Declare paths to data
DATA_PATH = here(
    "data/raw/spring_2016_all_bay_area_long_format_plus_cross_bay_col.csv"
)

# Note that these files are based on using the PPCA model
# of Wang and Blei (2018). W represents global factor
# coefficients and Z represents latent factor loadings
PATH_TO_W_PARAMS = here("data/processed/W_inferred_PPCA.csv")
PATH_TO_Z_PARAMS = here("data/processed/Z_inferred_PPCA.csv")

# Note the columns of interest for this notebook
MODE_ID_COLUMN = "mode_id"
OBS_ID_COLUMN = "observation_id"
TIME_COLUMN = "total_travel_time"
COST_COLUMN = "total_travel_cost"
DISTANCE_COLUMN = "total_travel_distance"
LICENSE_COLUMN = "num_licensed_drivers"
NUM_AUTOS_COLUMN = "num_cars"

Original: check_truth.py in Germany/Poland forecast hub repo

Modifications:
- Replace RKI / MZ truth data with JHU

Note: some unused infrastructure left in to permit adding future truth datasets
"""

import pandas as pd
from pyprojroot import here
import glob
from datetime import datetime

# all possible locations
locations = dict()
locations['JHU'] = pd.read_csv(
    here('./data-truth/JHU/truth_JHU-Incident Deaths.csv')
).location_name.unique()

with open(
    here('./code/validation/check_truth.txt'), 'a', encoding='utf-8'
) as txtfile:
    latest_check = 'Latest check of truth data: {}\n'.format(
        datetime.now().strftime("%Y-%m-%d"))
    txtfile.write(latest_check + '\n')

error_count = 0

for source in ['JHU']:
    list_of_files = glob.glob(
        str(here() / 'data-truth/{}/*Incident*.csv').format(source))

from joblib import load
from pyprojroot import here

python_model = load(here("analysis/data/derived_data/python_model.joblib"))
test_data = load(here("analysis/data/derived_data/test_data.joblib"))

from pyprojroot import here
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

data = pd.read_csv(here('./data/raw/train.csv'))
print(data.shape)
data.head(3)

# checking for null values
data.isnull().sum()

# def split_joke_identifier(df):
#     for index, row in df.iterrows():
#         r = row['Joke_identifier'].split()
#         df['Comedian'] = " ".join(r[:-2])
#         df['State'] = r[-2]
#         df['part'] = r[-1]
#     return df

data['Comedian'] = data.Joke_identifier.apply(lambda x: ' '.join(x.split()[:-2]))
data['State'] = data.Joke_identifier.apply(lambda x: x.split()[-2])
data['Part'] = data.Joke_identifier.apply(lambda x: x.split()[-1])
data.head(3)

data.drop(['Joke_identifier', 'Response_ID'], axis=1, inplace=True)
data.head(3)