Esempio n. 1
0
def compute_features(params: Namespace) -> "tuple[pd.DataFrame, list[dict]]":
    """Compute features to use for training.

    Downloads the projects dataset, joins each project's title and
    description into a single ``text`` column, and hands the resulting
    records to ``utils.save_dict`` with the path ``features.json`` under
    ``config.DATA_DIR``.

    Args:
        params (Namespace): Input parameters for operations. Only
            ``params.seed`` is read here.

    Returns:
        A ``(df, features)`` pair: the features DataFrame restricted to the
        columns ``id``, ``created_on``, ``text`` and ``tags``, and the same
        rows as a list of record dicts.
    """
    # Set up
    utils.set_seed(seed=params.seed)

    # Load data
    projects_url = (
        "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/projects.json"
    )
    projects = utils.load_json_from_url(url=projects_url)
    df = pd.DataFrame(projects)

    # Compute features
    df["text"] = df.title + " " + df.description
    # Selecting the columns to keep already discards `title` and
    # `description`, so no separate drop is needed.
    df = df[["id", "created_on", "text", "tags"]]

    # Save
    features = df.to_dict(orient="records")
    df_dict_fp = Path(config.DATA_DIR, "features.json")
    utils.save_dict(d=features, filepath=df_dict_fp)

    return df, features
Esempio n. 2
0
def diff(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag_a: str = "workspace",
    tag_b: str = "",
):  # pragma: no cover, can't be certain what diffs will exist
    """Report how params and performance differ between two release tags.

    When ``tag_b`` is the empty string, the name of the first entry returned
    by the GitHub tags endpoint for ``author/repo`` is used instead.

    Returns:
        A ``(params_diff, performance_diff)`` pair, each produced by
        ``utils.dict_diff``.
    """
    # Resolve the comparison tag from the repository's tag listing
    if tag_b == "":
        repo_tags = utils.load_json_from_url(
            url=f"https://api.github.com/repos/{author}/{repo}/tags"
        )
        tag_b = repo_tags[0]["name"]
    logger.info(f"Comparing {tag_a} with {tag_b}:")

    origin = dict(author=author, repo=repo)

    # Params
    params_diff = utils.dict_diff(
        d_a=params(**origin, tag=tag_a, verbose=False),
        d_b=params(**origin, tag=tag_b, verbose=False),
        d_a_name=tag_a,
        d_b_name=tag_b,
    )
    logger.info(f"Parameter differences: {json.dumps(params_diff, indent=2)}")

    # Performance
    performance_diff = utils.dict_diff(
        d_a=performance(**origin, tag=tag_a, verbose=False),
        d_b=performance(**origin, tag=tag_b, verbose=False),
        d_a_name=tag_a,
        d_b_name=tag_b,
    )
    logger.info(f"Performance differences: {json.dumps(performance_diff, indent=2)}")

    return params_diff, performance_diff
Esempio n. 3
0
def df():
    """Load the remote projects dataset and wrap it in a DataFrame."""
    projects_url = "https://raw.githubusercontent.com/GokuMohandas/madewithml/main/datasets/projects.json"
    return pd.DataFrame(utils.load_json_from_url(url=projects_url))
Esempio n. 4
0
def tags():
    """Return the ``tag`` value of every entry in the remote tags dataset."""
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/madewithml/main/datasets/tags.json"
    entries = utils.load_json_from_url(url=tags_url)
    return [entry["tag"] for entry in entries]
Esempio n. 5
0
def download_data():
    """Fetch the projects and tags datasets and store them locally.

    Note:
        Copying the files from `datasets` would also work, but this
        routine is meant to be reused with other data sources later on.
    """
    # Local file name -> remote location
    sources = {
        "projects.json": "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json",
        "tags.json": "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json",
    }

    # Fetch everything before writing anything
    payloads = {
        file_name: utils.load_json_from_url(url=remote_url)
        for file_name, remote_url in sources.items()
    }

    # Write each payload under the data directory
    for file_name, payload in payloads.items():
        utils.save_dict(d=payload, filepath=Path(config.DATA_DIR, file_name))
    logger.info("✅ Data downloaded!")
Esempio n. 6
0
def download_auxiliary_data():
    """Fetch the tags dataset from its URL and store it as ``tags.json``."""
    # Fetch
    payload = utils.load_json_from_url(
        url="https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/tags.json"
    )

    # Persist under the data directory
    utils.save_dict(d=payload, filepath=Path(config.DATA_DIR, "tags.json"))
    logger.info("✅ Auxiliary data downloaded!")
Esempio n. 7
0
def performance(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag: str = "workspace",
    verbose: bool = True,
):
    """Load the performance metrics recorded for a tag.

    Args:
        author: GitHub account used to build the remote URL.
        repo: Repository name used to build the remote URL.
        tag: ``"workspace"`` reads ``performance.json`` from
            ``config.MODEL_DIR``; any other value is fetched from
            ``model/performance.json`` at that ref on raw.githubusercontent.com.
        verbose: When true, log the metrics as indented JSON.

    Returns:
        The loaded metrics object.
    """
    if tag != "workspace":
        remote_url = f"https://raw.githubusercontent.com/{author}/{repo}/{tag}/model/performance.json"
        metrics = utils.load_json_from_url(url=remote_url)
    else:
        local_fp = Path(config.MODEL_DIR, "performance.json")
        metrics = utils.load_dict(filepath=local_fp)

    if verbose:
        logger.info(json.dumps(metrics, indent=2))
    return metrics
def get_tags(author=config.AUTHOR, repo=config.REPO):
    """Collect params and flattened performance for the workspace and each repo tag.

    Returns:
        A dict keyed by tag name (``"workspace"`` first, then the names
        from the GitHub tags endpoint). Each value holds ``"params"`` and
        ``"performance"``, the latter flattened to dot-separated keys.
    """
    # Tag names: the local workspace plus every tag GitHub reports
    remote_tags = utils.load_json_from_url(
        url=f"https://api.github.com/repos/{author}/{repo}/tags")
    names = ["workspace"]
    names.extend(entry["name"] for entry in remote_tags)

    def flat_performance(name):
        # Nested metrics -> single record with "a.b.c" style keys
        nested = cli.performance(tag=name, verbose=False)
        return pd.json_normalize(nested, sep=".").to_dict(orient="records")[0]

    # NOTE(review): author/repo are not forwarded to cli.params or
    # cli.performance, so those calls use their own defaults; confirm intended.
    return {
        name: {
            "params": cli.params(tag=name, verbose=False),
            "performance": flat_performance(name),
        }
        for name in names
    }
Esempio n. 9
0
def test_load_json_from_url():
    """The remote tags dataset loads and contains a "transformers" tag."""
    url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    loaded = utils.load_json_from_url(url=url)
    by_tag = utils.list_to_dict(loaded, key="tag")
    assert "transformers" in by_tag
Esempio n. 10
0
def tags():
    """Return the list of tag names found in the remote tags dataset."""
    entries = utils.load_json_from_url(
        url="https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/tags.json"
    )
    # Keep only the "tag" field of each entry
    names = []
    for entry in entries:
        names.append(entry["tag"])
    return names
Esempio n. 11
0
def diff(commit_a: str = "workspace",
         commit_b: str = "head"):  # pragma: no cover
    """Compare relevant differences (params, metrics) between commits.

    Inspired by DVC's `dvc metrics diff` but repurposed to
    display diffs pertinent to our experiments.

    Args:
        commit_a (str, optional): Primary commit. Defaults to "workspace".
        commit_b (str, optional): Commit to compare to. Defaults to "head".
            For either argument, "head" / "current" (case-insensitive)
            is treated as "main", and "workspace" reads the local files.

    Raises:
        ValueError: The two commits do not expose the same set of metrics.

    Returns:
        dict: ``{"params": {...}, "metrics": {"improvements": {...},
        "regressions": {...}}}``. Each entry maps a name to the value on
        each commit; metric entries also carry ``"diff"`` (a minus b).
    """
    diffs = {}
    if commit_a.lower() in ("head", "current"):
        commit_a = "main"
    if commit_b.lower() in ("head", "current"):
        commit_b = "main"

    # Get params
    params_fp = Path(config.CONFIG_DIR, "params.json")
    params_a = _load_commit_artifact(commit_a, params_fp, "params.json")
    params_b = _load_commit_artifact(commit_b, params_fp, "params.json")

    # Parameter differences. Walk the union of keys (a's order first) so a
    # parameter present on only one side is reported (as None on the other
    # side) instead of raising KeyError or being silently skipped.
    # Assumes both params objects are dicts -- TODO confirm against utils.
    diffs["params"] = {}
    all_args = list(params_a) + [arg for arg in params_b if arg not in params_a]
    for arg in all_args:
        a = params_a.get(arg)
        b = params_b.get(arg)
        if a != b:
            diffs["params"][arg] = {commit_a: a, commit_b: b}
    logger.info(
        f"Parameter differences:\n{json.dumps(diffs['params'], indent=2)}")

    # Get metrics
    metrics_fp = Path(config.MODEL_DIR, "performance.json")
    metrics_a = _load_commit_artifact(commit_a, metrics_fp, "performance.json")
    metrics_b = _load_commit_artifact(commit_b, metrics_fp, "performance.json")

    # Recursively flatten nested metrics into "a.b.c" style keys
    metrics_a = pd.json_normalize(metrics_a,
                                  sep=".").to_dict(orient="records")[0]
    metrics_b = pd.json_normalize(metrics_b,
                                  sep=".").to_dict(orient="records")[0]
    if metrics_a.keys() != metrics_b.keys():
        # ValueError subclasses Exception, so existing handlers still catch it.
        raise ValueError(
            "Cannot compare these commits because they have different metrics."
        )

    # Metric differences
    diffs["metrics"] = {"improvements": {}, "regressions": {}}
    for metric, value_a in metrics_a.items():
        # Key sets were verified equal above, so this lookup cannot fail.
        value_b = metrics_b[metric]
        if metric.split(".")[-1] == "num_samples":
            continue
        # Both sides must be numeric for the subtraction below to be valid.
        if not (isinstance(value_a, numbers.Number)
                and isinstance(value_b, numbers.Number)):
            continue
        if value_a == value_b:
            continue
        item = {
            commit_a: value_a,
            commit_b: value_b,
            "diff": value_a - value_b,
        }
        bucket = "improvements" if item["diff"] >= 0.0 else "regressions"
        diffs["metrics"][bucket][metric] = item
    logger.info(
        f"Metric differences:\n{json.dumps(diffs['metrics'], indent=2)}")

    return diffs


def _load_commit_artifact(commit, workspace_fp, filename):
    """Load a JSON artifact from the workspace file or from a commit on GitHub."""
    if commit == "workspace":
        return utils.load_dict(filepath=workspace_fp)
    url = f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/{filename}"
    return utils.load_json_from_url(url=url)