Example no. 1
def test_function_regression(func):

    data = rescale(np.load(
        here(f'tests/regression_files/input_{func.ndim}d.npy')),
                   range_in=ValueRange(0, 1),
                   range_out=ValueRange(*func.bounds))

    for fid in func.fidelity_names:
        output = np.load(
            here(
                f'tests/regression_files/output_{func.ndim}d_{func.name}_{fid}.npy'
            ))
        np.testing.assert_allclose(func[fid](data), output)
Example no. 2
    def _find_package_in_our_project(name: str) -> Optional[str]:
        """Search the name in the objects of the package we are developing.

        Args:
            name: package name

        Returns:
            import_string: String required to import the package.
        """
        # Find the package name
        try:
            project_package = os.path.basename(here()).replace("-", "_")
        except RecursionError:  # pragma: no cover
            # I don't know how to make a test that raises this error :(
            # To manually reproduce, follow the steps of
            # https://github.com/lyz-code/autoimport/issues/131
            return None
        package_objects = extract_package_objects(project_package)

        # nocover: as the tests are run inside the autoimport virtualenv, it will
        # always find the objects in that package
        if package_objects is None:  # pragma: nocover
            return None
        try:
            return package_objects[name]
        except KeyError:
            return None
Example no. 3
def load_ice_cream():
    data = (pd.read_csv(here() / "data/ice_cream_shop.csv",
                        index_col=0).reset_index(drop=True).select_columns([
                            "shopname", "num_customers", "owner_idx",
                            "num_favs"
                        ]))
    return data
Example no. 4
def load_sterilization():
    df = (pd.read_csv(
        here() / "data/sterilization.csv",
        na_filter=True,
        na_values=["#DIV/0!"],
    ).clean_names().label_encode("treatment"))
    mapping = dict(zip(df["treatment"], df["treatment_enc"]))
    return df, mapping
Example no. 5
def build_tables(args: typing.List[str] = typer.Option(
    [], help='Additional arguments passed to `dbt run`')):
    """ Build tables from base data using dbt """

    project_dir = pyprojroot.here() / 'dbt'
    profiles_dir = pyprojroot.here() / '.dbt'

    base_args = [
        'run', '--profiles-dir',
        str(profiles_dir), '--project-dir',
        str(project_dir)
    ]

    # NOTE: Python API is not officially supported, so
    # watch out if you change dbt versions...
    typer.secho('Building tables with dbt', fg=typer.colors.BLUE)
    _ = dbt.main.handle_and_check(base_args + list(args))
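Since the note above flags that dbt's Python entry point is not officially supported, a more version-stable variant is to invoke the dbt CLI in a subprocess. A minimal sketch of that alternative (illustrative only, not the project's actual code; it assumes the dbt executable is on PATH):

import subprocess

import pyprojroot


def build_tables_via_cli(extra_args=()):
    project_dir = pyprojroot.here() / 'dbt'
    profiles_dir = pyprojroot.here() / '.dbt'
    cmd = ['dbt', 'run',
           '--profiles-dir', str(profiles_dir),
           '--project-dir', str(project_dir),
           *extra_args]
    # check=True raises CalledProcessError if the dbt run fails
    subprocess.run(cmd, check=True)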
Example no. 6
def download_links(to=here("./data/raw/veneto")):
    for link in get_all_pdf_links():
        out_path = Path(to) / clean_filename(Path(link).name)
        if not out_path.exists():
            logger.info(f'Downloading file {link}')
            r = requests.get(link)
            with open(out_path, "wb") as file:
                file.write(r.content)
Example no. 7
def read_mkdocs() -> Dict:
    """
    Parse mkdocs.yml in project root dir.
    """
    with open(here() / "mkdocs.yml", "r") as f:
        mkdocs_config = yaml.safe_load(f)
    return mkdocs_config
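A possible use of the helper above, assuming a standard mkdocs.yml that defines the required site_name key:

config = read_mkdocs()
print(config.get("site_name"))  # the documentation site's title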
Example no. 8
def nb_app():
    nb = here() / "notebooks" / "wealth_of_nederland.ipynb"
    voila_args = [str(nb), "--no-browser"]
    voila_app = VoilaTest.instance()
    voila_app.initialize(voila_args)
    voila_app.start()
    yield voila_app
    voila_app.stop()
    voila_app.clear_instance()
Example no. 9
def calh_app():
    calh_notebook = here() / 'calh.ipynb'
    voila_args = [str(calh_notebook), '--no-browser']
    voila_app = VoilaTest.instance()
    voila_app.initialize(voila_args)
    voila_app.start()
    yield voila_app
    voila_app.stop()
    voila_app.clear_instance()
Example no. 10
def load_baseball():
    df = pd.read_csv(here() / "data/baseballdb/core/Batting.csv")
    df["AB"] = df["AB"].replace(0, np.nan)
    df = df.dropna()
    df["batting_avg"] = df["H"] / df["AB"]
    df = df[df["yearID"] >= 2016]
    df = df.iloc[0:15]
    df.head(5)
    return df
Example no. 11
def main():
    my_path = str(here('data-processed'))

    try:
        model = sys.argv[1]
    except IndexError:
        model = None

    # check_metadata(my_path, model)
    check_formatting(my_path, model)
Example no. 12
def download_file(url, fpth, song_id=None, use_here=True):
    if song_id is None:
        song_id = get_song_id(url)
    if use_here:
        fpth = str(here(fpth))

    r = requests.get(url, stream=True, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        with open(fpth, 'wb') as f:
            f.write(r.content)
    else:
        warnings.warn(f"URL Failed: {url}")
Example no. 13
def process_raw_pdf():
    governorates_pages = {
        "Tunis": 40,
        "Ariana": 42,
        "Ben Arous": 44,
        "Manouba": 45,
        "Nabeul": 49,
        "Zagouan": 50,
        "Bizerte": 52,
        "Beja": 55,
        "Jandouba": 56,
        "Kef": 58,
        "Seliana": 62,
        "Sousse": 66,
        "Monastir": 68,
        "Mahdia": 69,
        "Sfax": 71,
        "Kairouan": 73,
        "Kasserine": 75,
        "Sidi Bouzid": 76,
        "Gabes": 79,
        "Médnine": 82,
        "Tataouine": 82,
        "Gafsa": 85,
        "Tozeur": 86,
        "Kebili": 87,
    }

    governorates_dfs = []

    for governorate_name, page in governorates_pages.items():
        if governorate_name == "Tataouine":
            table_index = 1
        else:
            table_index = 0
        df = (
            tabula.read_pdf(
                input_pdf_path,
                pages=page,
                multiple_tables=True,
                pandas_options={"header": None},
            )[table_index]
            .pipe(cleaning_pipeline)
            .assign(Gouvernorat=governorate_name)
        )

        governorates_dfs.append(df)
    combined_df = pd.concat(governorates_dfs, ignore_index=True)
    combined_df.to_csv(
        here("data/processed/poverty_rate_Tunisia_2020.csv"), index=False
    )
    return combined_df
Example no. 14
def create_dataset():
    base_path = here("./data/raw/veneto")
    
    for file in base_path.glob('*.pdf'):
        parse_pdf(file)

    processed_path = Path(here("./data/processed/veneto"))

    logger.info('Merging all dataframes and writing to CSV...')
    df = pd.concat((pd.read_csv(csv_file) for csv_file in processed_path.glob('*.csv')), sort=False)
    df = df.rename(columns={df.columns[0]: "codice" })
    df.loc[df.codice == 1, 'provincia'] = 'Padova'
    df.loc[df.codice == 2, 'provincia'] = 'Padova'
    veneto = df.groupby(['data', 'provincia']).sum().drop('codice', axis=1).reset_index()
    istat = pd.read_csv(here('./data/raw/istat/popolazione/DCIS_POPRES1_06052020222758332.csv'))
    istat = istat[(istat.STATCIV2 == 99) & (istat.ETA1 == 'TOTAL') & (istat.SEXISTAT1 == 9)][['ITTER107', 'Territorio', 'Value']].rename(columns={'Value': 'popolazione'})
    province = istat[istat.ITTER107.str.match(r'IT\w\d\d\b')].drop('ITTER107', axis=1)
    merged = veneto.merge(province, left_on='provincia', right_on='Territorio').drop('Territorio', axis=1)
    diffs = merged.groupby('provincia').diff()
    diffs.columns = ['nuovi_positivi', 'variazione_totale_positivi', 'nuovi_deceduti', 'nuovi_negativizzati', 'drop']
    diffs = diffs.drop('drop', axis=1)
    calculate_per_1M_pop(pd.merge(merged, diffs, left_index=True, right_index=True)).to_csv(Path(here("./data/processed/veneto.csv")))
Example no. 15
def main():
    """
    Main function: generate paths for each type of data in the bucket, derive the
    file_identifier from Sentinel-2, search the S2 metadata and ground truth data
    for the corresponding filenames, and organize the URIs using Pandas.
    """

    # spyder up to find the root
    root = here(project_files=[".here"])

    # append to path
    sys.path.append(str(here()))

    # Construct file paths for worldfloods public Google Cloud storage bucket folder
    paths_train, paths_test, paths_val = construct_worldfloods_public_filepaths(
    )

    # Extract the file identifiers/names of files without the file extension to
    # utilize for search.
    file_id_train = get_file_identifier_from_s2(paths_train)
    file_id_test = get_file_identifier_from_s2(paths_test)
    file_id_val = get_file_identifier_from_s2(paths_val)

    # Create a pandas DataFrame containing image URIs
    df_train = uri_table_from_file_identifier(file_id_train, paths_train)
    df_test = uri_table_from_file_identifier(file_id_test, paths_test)
    df_val = uri_table_from_file_identifier(file_id_val, paths_val)

    # Concatenate the pandas DataFrame along axis = 0 to
    # add testing and validation data in the same dataFrame.
    df_train_test_val = pd.concat([df_train, df_test, df_val], axis=0)

    # Save the dataframe as a comma separated value (.csv) file
    # for future use.
    df_train_test_val.to_csv(
        Path(root).joinpath(
            "datasets/trials/image_meta_table.csv"))  #save it to a bucket
Example no. 16
def scvi_impute() -> None:
    fnm: str = "sc_10x_5cl_forimput_cnt.csv"
    save_path: PosixPath = here('./10xGenomics/scRNAseq')

    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)

    vae = VAE(symsim_dataset.nb_genes)

    trainer = UnsupervisedTrainer(vae,
                                  symsim_dataset,
                                  train_size=1.0,
                                  use_cuda=use_cuda,
                                  frequency=5)

    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model,
                                    symsim_dataset,
                                    indices=np.arange(len(symsim_dataset)))
    impute_values = full.sequential().imputation()

    outfnm: str = "scvi_impt.csv"
    out_path = here("./10xGenomics/impt/").joinpath(outfnm)
    np.savetxt(out_path, impute_values, delimiter=",")
Example no. 17
def migrate(interactive: bool = True):
    """ Migrate database to the current schema (as defined in nbs/db.ipynb) """

    initialize_db()

    # Get names of tables generated by dbt and exclude them from the migration
    dbt_models_path = pyprojroot.here() / 'dbt' / 'models'
    dbt_tables = [f.stem for f in dbt_models_path.glob('**/*.sql')]

    # Migrate database tables
    typer.secho('Migrating database tables...', fg=typer.colors.BRIGHT_BLACK)
    understatdb.db.DB.evolve(
        ignore_tables=understatdb.db.EVOLVE_IGNORE_TABLES + dbt_tables,
        interactive=interactive)
    typer.secho('Done!', fg=typer.colors.GREEN, bold=True)
Example no. 18
def scvi_impute(seed: int = 1, platform: str = "umi") -> None:
    fnm: str = f"sim_{ncell}_{ngene}_{seed}_{platform}_.csv"
    save_path: PosixPath = here('./scVI/data/symsim')
    # fullpath:PosixPath = here('./scVI/data/symsim').joinpath(fnm)

    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)

    vae = VAE(symsim_dataset.nb_genes)

    trainer = UnsupervisedTrainer(vae,
                                  symsim_dataset,
                                  train_size=1.0,
                                  use_cuda=use_cuda,
                                  frequency=5)

    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model,
                                    symsim_dataset,
                                    indices=np.arange(len(symsim_dataset)))
    impute_values = full.sequential().imputation()

    out_path = here("./simutool/jobs/scvi_result").joinpath(fnm)
    np.savetxt(out_path, impute_values, delimiter=",")
Example no. 19
def test_here(tmp_path: Path, project_file: str, child_dir: str) -> None:
    """
    This test uses pytest's tmp_path facilities to create a simulated project
    directory, and checks that the path is correct.
    """
    # Create project file
    (tmp_path / project_file).write_text("blah")

    # Create child dirs
    child_path = tmp_path / child_dir
    child_path.mkdir(parents=True)
    chdir(child_path)
    assert Path.cwd() == child_path

    # Verify the project against current work directory
    assert here() == tmp_path
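The decorator that supplies project_file and child_dir is not shown in this snippet; one possible parametrization is sketched below. The concrete marker files and child directories are assumptions for illustration, not pyprojroot's actual test matrix:

import pytest
from os import chdir
from pathlib import Path

from pyprojroot import here


@pytest.mark.parametrize("project_file", [".here", ".git", "setup.py"])
@pytest.mark.parametrize("child_dir", ["src", "src/nested/deeper"])
def test_here_parametrized(tmp_path: Path, project_file: str, child_dir: str) -> None:
    (tmp_path / project_file).write_text("blah")  # project marker at the simulated root
    child_path = tmp_path / child_dir
    child_path.mkdir(parents=True)
    chdir(child_path)  # run the check from a nested working directory
    assert here() == tmp_path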
Example no. 20
def test_here(tmpdir, project_files, child_dir):
    """
    This test uses pytest's tmpdir facilities to create a simulated project
    directory, and checks that the path is correct.
    """
    # Create project file
    temp_dir = Path(tmpdir)
    path = temp_dir / project_files
    with path.open("w") as file_path:
        file_path.write("blah")

    # Create child dirs
    (temp_dir / child_dir).mkdir(parents=True)
    chdir(temp_dir / child_dir)
    assert Path().cwd() == (temp_dir / child_dir)

    # Verify the project against current work directory
    current_path = here()
    assert current_path == temp_dir
Example no. 21
    def __init__(self,
                 exp_dir: str,
                 write_mode=True,
                 if_exists: str = 'error'
                 ):

        self.project_dir = here()  # will this work if installed as library somewhere else?
        self.logging_directory = f"{self.project_dir}/models/{exp_dir}"
        self.results_dir = f"{self.logging_directory}/results"
        self.specification_dir = f"{self.logging_directory}/specification"

        if write_mode:
            _check_directory(self.logging_directory, if_exists)
            os.makedirs(self.logging_directory, exist_ok=True)
            os.makedirs(self.results_dir, exist_ok=True)
            os.makedirs(self.specification_dir, exist_ok=True)
            # self.logging_directory = logging_directory
            self.logger = getLogger(str(uuid.uuid4()))
            self.log_path = os.path.join(self.logging_directory, 'log.txt')
            self.logger.addHandler(FileHandler(self.log_path))
            self.logger.setLevel(DEBUG)
Example no. 22
    def _find_package_in_our_project(name: str) -> Optional[str]:
        """Search the name in the objects of the package we are developing.

        Args:
            name: package name

        Returns:
            import_string: String required to import the package.
        """
        # Find the package name
        project_package = os.path.basename(here()).replace("-", "_")
        package_objects = extract_package_objects(project_package)

        # nocover: as the tests are run inside the autoimport virtualenv, it will
        # always find the objects in that package
        if package_objects is None:  # pragma: nocover
            return None
        try:
            return package_objects[name]
        except KeyError:
            return None
Example no. 23
use observable variables.

To execute this module by itself, navigate at the command line to the project's
root directory and type: `python workflow/testing_images_conditional.py`.
"""
from pathlib import Path

import causal2020.testing.observable_independence as oi
import click
import pandas as pd
from causal2020 import utils
from causal2020.graphs.conditional_independence_example import EXAMPLE_GRAPH
from pyprojroot import here

# Declare paths to data
DATA_PATH = here(
    "data/raw/spring_2016_all_bay_area_long_format_plus_cross_bay_col.csv")

# Note the columns of interest in the dataset
MODE_ID_COL = "mode_id"
TIME_COL = "total_travel_time"
COST_COL = "total_travel_cost"
DISTANCE_COL = "total_travel_distance"


def create_conditional_independence_testing_results(
    output_path: str,
    num_permutations: int = 100,
    permuted_color: str = "#a6bddb",
) -> None:
    """
    Computes and stores the results of permutation testing the implication
Example no. 24
import autofit as af
import autolens as al
import autolens.plot as aplt
"""
This script simulates _Imaging_ of a strong lens where:

 - The lens galaxy's _MassProfile_ is a *SphericalIsothermal*.
 - The source galaxy's _LightProfile_ is a *SphericalExponential*.
    
This dataset is used in chapter 2, tutorials 1-3.
"""
"""Setup the path to the autolens_workspace, using a relative directory name."""
from pyprojroot import here

workspace_path = str(here())
"""
The 'dataset_type' describes the type of data being simulated (in this case, _Imaging_ data) and 'dataset_name' 
gives it a descriptive name. They define the folder the dataset is output to on your hard-disk:

 - The image will be output to '/autolens_workspace/dataset/dataset_type/dataset_name/image.fits'.
 - The noise-map will be output to '/autolens_workspace/dataset/dataset_type/dataset_name/lens_name/noise_map.fits'.
 - The psf will be output to '/autolens_workspace/dataset/dataset_type/dataset_name/psf.fits'.
"""
dataset_type = "chapter_2"
dataset_name = "lens_sis__source_exp"
"""
Create the path where the dataset will be output, which in this case is:
'/autolens_workspace/howtolens/dataset/chapter_2/lens_sis__source_exp/'
"""
dataset_path = af.util.create_path(
    path=workspace_path,
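A minimal pathlib sketch of the output-path layout described in the docstring above; this is purely illustrative, since the script itself builds the path with autolens's af.util.create_path helper:

from pathlib import Path


def dataset_output_path(workspace_path, dataset_type, dataset_name):
    # e.g. <workspace_path>/dataset/chapter_2/lens_sis__source_exp/
    path = Path(workspace_path) / "dataset" / dataset_type / dataset_name
    path.mkdir(parents=True, exist_ok=True)
    return path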
Example no. 25
#!/usr/bin/env python
# coding: utf-8
"""makes error_across_days.csv, source data for figure that plots error across days for Bengalese Finch Song Repository dataset"""
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import pyprojroot
import seaborn as sns

import vak

REPO_ROOT = pyprojroot.here()
BFSONGREPO_RESULTS_ROOT = REPO_ROOT.joinpath(
    'results/BFSongRepository').expanduser().resolve()

ERROR_ACROSS_DAYS_CSV_PATH = BFSONGREPO_RESULTS_ROOT.joinpath(
    'error_across_days.csv')

# used below to determine which columns are metrics, and which of those should be converted to error
METRICS = [
    'acc',
    'acc_majority_vote',
    'acc_min_dur_maj_vote',
    'acc_min_segment_dur',
    'levenshtein',
    'levenshtein_majority_vote',
    'levenshtein_min_dur_maj_vote',
    'levenshtein_min_segment_dur',
    'segment_error_rate',
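A hedged sketch of the conversion mentioned in the comment above the METRICS list: accuracy-like columns are turned into error as 1 - value, while the Levenshtein and segment error rate columns are already error-like. The exact rule used in the original script is an assumption here:

import pandas as pd


def metrics_to_error(df: pd.DataFrame, metric_cols) -> pd.DataFrame:
    df = df.copy()
    for col in metric_cols:
        if col.startswith('acc'):
            df[col] = 1.0 - df[col]  # accuracy -> error
    return df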
Example no. 26
import sys, os
from pyprojroot import here

# spyder up to find the root
root = here(project_files=[".here"])

# append to path
sys.path.append(str(here()))

from datetime import timedelta
from datetime import datetime
import pandas as pd
import geopandas as gpd
from pathlib import Path
import ee
from ml4floods.data import ee_download
from ml4floods.data.copernicusEMS import activations
from ml4floods.data.create_gt import generate_water_cloud_binary_gt
from ml4floods.data.io import save_groundtruth_tiff_rasterio
from ml4floods.data.utils import (
    remove_gcp_prefix,
    get_files_in_directory_gcp,
    read_pickle_from_gcp,
    save_file_to_bucket,
)
from ml4floods.data.utils import GCPPath
from typing import Tuple
from collections import namedtuple
import tqdm

ActivationFile = namedtuple(
Example no. 27
import scipy.stats  # noqa: E402
from causal2020.graphs.drive_alone_utility import (  # noqa: E402 noreorder
    DRIVE_ALONE_UTILITY, LATENT_DRIVE_ALONE_UTILITY,
)
from causal2020.utils import sample_from_factor_model  # noqa: E402 noreorder
from pyprojroot import here

# -

# ## Set notebook parameters

# +
# Parameters

# Declare paths to data
DATA_PATH = here(
    "data/raw/spring_2016_all_bay_area_long_format_plus_cross_bay_col.csv")
# Note that these files are based on using the PPCA model
# of Wang and Blei (2018). W represents global factor
# coefficients and Z represents latent factor loadings
PATH_TO_W_PARAMS = here("data/processed/W_inferred_PPCA.csv")
PATH_TO_Z_PARAMS = here("data/processed/Z_inferred_PPCA.csv")
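As a side note on the PPCA comment above: in a probabilistic PCA factor model the observed features are approximately the latent loadings Z times the global coefficients W, plus Gaussian noise. A minimal sketch of that reconstruction, with the matrix shapes and noise handling as assumptions:

import numpy as np


def ppca_reconstruct(Z, W, noise_std=0.0, seed=None):
    # Z: (n_obs, n_latent) latent factor loadings
    # W: (n_features, n_latent) global factor coefficients
    rng = np.random.default_rng(seed)
    X_hat = Z @ W.T  # (n_obs, n_features)
    if noise_std > 0:
        X_hat = X_hat + rng.normal(scale=noise_std, size=X_hat.shape)
    return X_hat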

# Note the columns of interest for this notebook
MODE_ID_COLUMN = "mode_id"
OBS_ID_COLUMN = "observation_id"

TIME_COLUMN = "total_travel_time"
COST_COLUMN = "total_travel_cost"
DISTANCE_COLUMN = "total_travel_distance"
LICENSE_COLUMN = "num_licensed_drivers"
NUM_AUTOS_COLUMN = "num_cars"
Example no. 28
Original: check_truth.py in Germany/Poland forecast hub repo
Modifications:
    - Replace RKI / MZ truth data with JHU
Note: some unused infrastructure left in to permit adding future truth datasets
"""

import pandas as pd
from pyprojroot import here
import glob
from datetime import datetime

# all possible locations
locations = dict()
locations['JHU'] = pd.read_csv(
    here('./data-truth/JHU/truth_JHU-Incident Deaths.csv')
).location_name.unique()

with open(here('./code/validation/check_truth.txt'), 'a',
          encoding='utf-8') as txtfile:

    latest_check = 'Latest check of truth data: {}\n'.format(
        datetime.now().strftime("%Y-%m-%d"))
    txtfile.write(latest_check + '\n')

    error_count = 0

    for source in ['JHU']:
        list_of_files = glob.glob(
            str(here() / 'data-truth/{}/*Incident*.csv').format(source))
Example no. 29
from joblib import load
from pyprojroot import here

python_model = load(here("analysis/data/derived_data/python_model.joblib"))
test_data = load(here("analysis/data/derived_data/test_data.joblib"))
Example no. 30
from pyprojroot import here
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

data = pd.read_csv(here('./data/raw/train.csv'))
print(data.shape)
data.head(3)

# checking for null values
data.isnull().sum()

# def split_joke_identifier(df):
#     for index, row in df.iterrows():
#         r = row['Joke_identifier'].split()
#         df['Comedian'] = " ".join(r[:-2])
#         df['State'] = r[-2]
#         df['part'] = r[-1]
#     return df


data['Comedian'] = data.Joke_identifier.apply(lambda x : ' '.join(x.split()[:-2]))
data['State'] = data.Joke_identifier.apply(lambda x : x.split()[-2])
data['Part'] = data.Joke_identifier.apply(lambda x : x.split()[-1])
data.head(3)

data.drop(['Joke_identifier', 'Response_ID'], axis=1, inplace=True)
data.head(3)