def flow(
    output_dir: Optional[str] = None,
    download_url: str = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative",
    created: Optional[datetime] = None,
    catalog_path: Path = public_catalog.get_path(),
) -> Flow:
    output_dir_path: Path = Path(output_dir) if output_dir else Path(tempfile.mkdtemp())

    # Default `created` at call time; a `datetime.utcnow()` default argument
    # would be evaluated once at import time and then reused for every run.
    created = created or datetime.utcnow()
    entry = get_entry(created)

    with Flow(f"gwas-catalog-{entry.artifact.version}") as flow:
        catalog_path = constant(catalog_path, name="catalog_path")
        url = constant(entry.resources["parquet"], name="url")
        entry = constant(
            entry, name=f"entry.key={entry_key_str(entry.key)}", value=False
        )

        # Download and convert to parquet
        local_csv = constant(
            output_dir_path.joinpath("gwas_catalog.csv"), name="local_csv"
        )
        parquet_dir = constant(
            output_dir_path.joinpath("gwas_catalog_parquet"), name="parquet_dir"
        )
        local_csv = download(download_url, local_csv)
        info = convert_to_parquet(local_csv, parquet_dir)

        status = upload(entry, parquet_dir, url, upstream_tasks=[info])
        add_entry(entry, info, catalog_path, upstream_tasks=[status])
        return flow
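
# A minimal usage sketch for the GWAS flow above, assuming the Prefect 1.x API
# in which `Flow.run()` executes the DAG locally; the explicit timestamp and
# scratch directory are illustrative values, not defaults from the source.
from datetime import datetime

gwas_flow = flow(output_dir="/tmp/gwas", created=datetime(2020, 6, 1))
state = gwas_flow.run()  # executes download -> convert -> upload -> add_entry
assert state.is_successful()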
Example 2
def flow(
    output_dir: str = "/tmp/clinvar",
    raw_url: str = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/archive/submission_summary_2020-06.txt.gz",
) -> Flow:
    """Get ClinVar submission summary import flow

    Parameters
    ----------
    output_dir : str
        Directory in which csv/parquet files are stored
    raw_url : str
        Link to ClinVar submission summary CSV (on ftp.ncbi.nlm.nih.gov).
        Note that the version and creation timestamp associated with this
        artifact are inferred from the link since ClinVar has no
        semantic versioning in its releases, and the FTP site provides
        archived files where the date of creation/release is clear.

    Returns
    -------
    Flow
        Prefect Flow
    """
    created = raw_url.split("/")[-1].split("_")[-1].split(".")[0]
    if not created:
        raise ValueError(
            f'Unable to determine archive date from url "{raw_url}"'
        )
    version = f"v{created}"

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    entry = get_entry(version, created)
    catalog_path = catalog.default_urlpath()
    filename = raw_url.split("/")[-1]

    with Flow(f"clinvar-{version}") as flow:
        # Add constants important to the DAG (all others are not visualized)
        catalog_path = constant(catalog_path, name="catalog_path")
        url = constant(entry.resources["parquet"], name="url")  # pylint:disable=unsubscriptable-object
        entry = constant(entry,
                         name=f"entry.key={entry_key_str(entry.key)}",
                         value=False)

        # Download and convert to parquet
        csv_path = constant(str(output_dir / filename), name="csv_path")
        parquet_path = constant(str(output_dir / filename.split(".")[0]) +
                                ".parquet",
                                name="parquet_path")
        csv_path = download(raw_url, csv_path)
        info = convert_to_parquet(csv_path, parquet_path)

        # Upload results
        # pylint:disable=unexpected-keyword-arg
        status = upload(entry, parquet_path, url, upstream_tasks=[info])
        add_entry(entry, info, catalog_path, upstream_tasks=[status])
        return flow
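
# The `constant(...)` calls in these flows register literals as named tasks so
# they appear in the DAG visualization. A plausible sketch of such a helper,
# assuming Prefect 1.x's Constant task; the `value` flag controlling whether
# the literal is echoed into the display name is an assumption based on usage.
from typing import Any
from prefect.tasks.core.constants import Constant

def constant(x: Any, name: str, value: bool = True) -> Constant:
    # Append the wrapped value to the task name only when requested.
    return Constant(x, name=f"{name}={x}" if value else name)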
Example 3
def flow(
    output_dir: str = "/tmp/medgen",
    csv_url: str = "https://ftp.ncbi.nlm.nih.gov/pub/medgen/csv",
    n_mgrel_files: Optional[int] = None,
    created: str = "today",
) -> Flow:
    """Get MedGen import flow
    
    Parameters
    ----------
    output_dir : str
        Directory in which csv/parquet files are stored
    csv_url : str
        Link to MedGen CSV exports (e.g.
        https://ftp.ncbi.nlm.nih.gov/pub/medgen/csv).
        Note that MedGen appears to have no archival or
        release process so both versions and created
        timestamps in artifacts will correspond to a
        year-month (e.g. 2020-06).
    n_mgrel_files : Optional[int]
        Number of MGREL files to download.  These contain
        pairwise concept relationships and are often broken
        up into chunks of < 1M rows each for spreadsheet
        users.  At the time of writing, 2 chunks are present,
        so this can be provided explicitly; if left as None,
        the number of files is inferred by probing successive
        indexes until one fails to exist (see the sketch
        after this flow).
    created: str
        Year-month associated with artifact.  Defaults
        to current year-month.

    Returns
    -------
    Flow
        Prefect Flow
    """
    output_path = Path(output_dir)
    if n_mgrel_files is None:
        n_mgrel_files = get_n_mgrel_files(csv_url)
        if n_mgrel_files <= 0:
            raise ValueError(f"Failed to find any MGREL files at {csv_url}")

    entry = get_entry(created)
    catalog_path = catalog.default_urlpath()

    with Flow(f"medgen-{entry.artifact.version}") as flow:
        catalog_path = constant(catalog_path, name="catalog_path")
        # pylint:disable=unsubscriptable-object
        url = constant(entry.resources["parquet"], name="url")
        entry = constant(entry,
                         name=f"entry.key={entry_key_str(entry.key)}",
                         value=False)

        # Download and convert to parquet
        csv_dir = constant(str(output_path / 'mgrel.csv'), name="csv_dir")
        parquet_dir = constant(str(output_path / 'mgrel.parquet'),
                               name="parquet_dir")
        csv_dir = download(csv_url, csv_dir, n_mgrel_files)
        info = convert_to_parquet(csv_dir, parquet_dir)

        # Upload results
        # pylint:disable=unexpected-keyword-arg
        status = upload(entry, parquet_dir, url, upstream_tasks=[info])
        add_entry(entry, info, catalog_path, upstream_tasks=[status])
        return flow
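
# A plausible sketch of `get_n_mgrel_files`, which probes successive chunk
# indexes until one fails to exist. The MGREL_{i}.csv.gz filename pattern is
# an assumption for illustration; the real export names may differ.
import urllib.request
from urllib.error import HTTPError, URLError

def get_n_mgrel_files(csv_url: str, max_files: int = 100) -> int:
    n = 0
    for i in range(1, max_files + 1):
        req = urllib.request.Request(f"{csv_url}/MGREL_{i}.csv.gz", method="HEAD")
        try:
            urllib.request.urlopen(req)
        except (HTTPError, URLError):
            break  # first missing chunk ends the series
        n = i
    return n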
Example 4
def flow(
    source: str,
    relpath: str,
    convert: bool = True,
    output_dir: str = "/tmp/otpev",
    version: str = "20.06",
    created: Optional[str] = None,
    n_partitions: Optional[int] = None,
) -> Flow:
    """Get OTP evidence import flow

    Parameters
    ----------
    source : str
        OTP evidence source (e.g. eva, l2g, uniprot)
    relpath : str
        Path to the data file or directory, relative to
        `gs://open-targets-data-releases/$VERSION/input/evidence-files`
        (e.g. "progeny-2018-07-23.json.gz" or "evidences_protein_fix/chembl_dataset")
    output_dir : str
        Directory in which temporary json/parquet files are stored
    version : str
        OTP release version
    created: str, optional
        Date at which OTP version was created.  This should NOT
        be a time at which data was collected -- it is intended to
        reflect when OT created the release and should never change
        for the same `version`.  For this reason, `created` will
        default to known release dates (see `OT_VERSION_RELEASE_DATES`).
    n_partitions: int, optional
        Number of partitions used to write parquet result.
        Set as None to use default partitioning.

    Raises
    ------
    KeyError
        If `created` is not provided and no known release date
        was previously recorded for the specified `version`

    Returns
    -------
    Flow
        Prefect Flow
    """
    version = str(version)
    if created is None:
        if version not in OT_VERSION_RELEASE_DATES:
            raise KeyError(
                f'No release date known for version "{version}" '
                "(pass `created` explicitly or add date to `OT_VERSION_RELEASE_DATES`)"
            )
        created = OT_VERSION_RELEASE_DATES[version]

    output_dir = Path(output_dir) / source
    output_dir.mkdir(parents=True, exist_ok=True)

    is_file = relpath.endswith("json.gz")
    src_url = OT_URL_FMT.format(version=version) + f'/{relpath.lstrip("/")}'
    entry = get_entry(
        source,
        version,
        created,
        format="parquet" if is_file else "json.gz",
        type="file" if is_file else "directory",
        properties=None if is_file else dict(compression="gzip"),
    )
    catalog_path = catalog.default_urlpath()

    with Flow(f"otpev-{source}-v{version}") as flow:
        # Add constants important to the DAG (all others are not visualized)
        catalog_path = constant(catalog_path, name="catalog_path")
        dst_url = next(iter(entry.resources.values()))
        entry = constant(
            entry, name=f"entry.key={entry_key_str(entry.key)}", value=False
        )
        n_partitions = constant(n_partitions, name="n_partitions")
        if is_file:
            filename = src_url.split("/")[-1]
            src_url = constant(src_url, name="src_url")
            dst_url = constant(dst_url, name="dst_url")

            # Download and convert to parquet
            json_path = constant(str(output_dir / filename), name="json_path")
            parquet_path = constant(
                str(output_dir / filename.split(".")[0]) + ".parquet",
                name="parquet_path",
            )
            json_path = download(src_url, json_path)
            info = convert_to_parquet(
                json_path, parquet_path, n_partitions=n_partitions
            )

            # Upload results
            # pylint:disable=unexpected-keyword-arg
            status = upload(entry, parquet_path, dst_url, upstream_tasks=[info])
            add_entry(entry, info, catalog_path, upstream_tasks=[status])
        else:
            raise NotImplementedError(
                "Integration of data directories (rather than single files) not yet implemented"
            )

        return flow
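
# A minimal usage sketch for the OTP evidence flow above, assuming the Prefect
# 1.x API where `Flow.run()` executes the DAG locally; source and relpath are
# illustrative values taken from the docstring examples.
otpev_flow = flow(
    source="progeny", relpath="progeny-2018-07-23.json.gz", version="20.06"
)
state = otpev_flow.run()
assert state.is_successful()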