Ejemplos de download en Python, ejemplos de tcga.utils.download en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: test1.py Proyecto: numpde/cbb

    class DATA:
        # Reference sequences for C. elegans, addressed by their Ensembl FTP URLs
        # (release 100, assembly WBcel235 per the file names).
        # NOTE(review): `download(...).now` presumably fetches the file (or reuses a
        # cached copy) when the class body runs -- confirm against tcga.utils.download.

        # Genomic DNA of chromosome I (gzipped FASTA)
        I = download(
            "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/"
            "dna/Caenorhabditis_elegans.WBcel235.dna.chromosome.I.fa.gz").now

        # All cDNA sequences (gzipped FASTA)
        cDNA = download(
            "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/"
            "cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz").now

Ejemplo n.º 2

0

Mostrar archivo

Archivo: test_io_vcf_assumptions.py Proyecto: lblum95/Computational_Biomedicine

 def test_check_all(self):
     # Run every assumption check on each dataset in URLS and print the outcome.
     from idiva.io.ass import check_all
     for (group, url) in URLS.items():
         # `raw` is the binary download; `fd` is whatever open_maybe_gz makes of it
         with download(url).now.open(mode='rb') as raw, open_maybe_gz(raw) as fd:
             for check in check_all(fd):
                 print(group, check)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: z_sources.py Proyecto: numpde/als1

def make_df_meta_ucsc():
    """
    Load the 'meta' table and return it with rewritten sample IDs.

    A `celltype` column is added as a copy of `Renamed_clusternames`, and each
    index label is replaced by "SS2_16_<digits>_<letter+digits>", built from
    the two groups that the pattern below captures in the original label.

    NOTE(review): `tcga.utils.unlist1` presumably insists on exactly one regex
    match per label -- confirm.
    """
    label_pattern = re.compile(r"_([0-9]+)[.]tab[.]([A-Z][0-9]+)")

    with download(URLS['meta']).now.open(mode='r') as fd:
        meta = pandas.read_table(fd, index_col=0)

    meta = meta.assign(celltype=meta.Renamed_clusternames)
    meta.index = [
        "SS2_16_{}_{}".format(*tcga.utils.unlist1(label_pattern.findall(label)))
        for label in meta.index
    ]
    return meta

Ejemplo n.º 4

0

Mostrar archivo

 def test_makes_folder(self):
     # The target folder gets created although the download itself fails,
     # and it is gone again once the temporary directory is cleaned up.
     with TemporaryDirectory() as tmp:
         target = Path(tmp) / "test"
         with self.assertRaises(ValueError):
             download("-").to(abs_path=target).now
         self.assertTrue(target.exists())
     self.assertFalse(target.exists())

Ejemplo n.º 5

0

Mostrar archivo

Archivo: edirect.py Proyecto: numpde/cbb

class param:
    # Entries produced by `lines(...)` from the text of the downloaded genera.txt.
    # The download expression is evaluated when this class body runs.
    genera = list(
        lines(
            download(
                "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/"
                "Assignments/NCBIEDirectAssignment/genera.txt").now.text))

    class urls:
        # Base URL of the NCBI Datasets "assembly descriptors by organism" endpoint
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"

Ejemplo n.º 6

0

Mostrar archivo

def make_df_desc() -> pandas.DataFrame:
    """
    Read the active sheet of the 'sc_description' workbook into a DataFrame.

    The first row of the sheet supplies the column names. The columns
    'GSM ID' and 'annoated cell types' are renamed to "gsm" and "celltype".
    """
    import warnings
    with download(URLS['sc_description']).now.open(mode='rb') as fd:
        # Warnings raised while loading the workbook are suppressed
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            rows = openpyxl.load_workbook(fd).active.values

        # `rows` is consumed as an iterator: take the header first, then the rest
        header = next(rows)
        records = list(rows)

        sheet = pandas.DataFrame(columns=header, data=records)
        return sheet.rename(columns={'GSM ID': "gsm", 'annoated cell types': "celltype"})

Ejemplo n.º 7

0

Mostrar archivo

Archivo: test_clf_df_v0_large.py Proyecto: lblum95/Computational_Biomedicine

 def test_makes_df_ctrl(self):
     # Build the v0 dataframe of the control dataset and compare its length
     # with the reference count.
     # This takes about 20min on v1/v2
     from idiva.clf.df import v0_df
     from idiva.io import ReadVCF
     with download(URLS['ctrl']).now.open(mode='rb') as raw, open_maybe_gz(raw) as fd:
         assert isinstance(fd, io.TextIOBase)
         ctrl = v0_df(ReadVCF(fd))
         self.assertTrue(len(ctrl) > 0)
         self.assertEqual(len(ctrl), ref_len_v2['ctrl'])

Ejemplo n.º 8

0

Mostrar archivo

 def test_count(self):
     # Count the datalines that ReadVCF yields for each dataset and compare
     # them with the reference numbers.
     from idiva.io.vcf import ReadVCF, RawDataline
     # ref_len_v1 = {'ctrl': 2329288, 'case': 2360972}
     ref_len_v2 = {'ctrl': 2227080, 'case': 2258797}
     for (group, url) in URLS.items():
         with download(url).now.open(mode='rb') as raw, open_maybe_gz(raw, mode='r') as fd:
             assert isinstance(fd, io.TextIOBase)
             n_datalines = sum(1 for _ in ReadVCF(fd))
             # print(F"Group {group} has {n_datalines} datalines")
             self.assertEqual(n_datalines, ref_len_v2[group])

Ejemplo n.º 9

0

Mostrar archivo

Archivo: test_clf_df_v0_large.py Proyecto: lblum95/Computational_Biomedicine

    def test_combines(self):
        # Build the v0 dataframe for every dataset in URLS, then join the
        # 'case' and 'ctrl' frames.
        from idiva.io import ReadVCF
        from idiva.clf.df import v0_df, join
        # Dataset key -> dataframe produced by v0_df
        dfs = {}

        for k in URLS:
            with download(URLS[k]).now.open(mode='rb') as fd:
                # `fd` is re-bound to the text stream returned by open_maybe_gz
                with open_maybe_gz(fd) as fd:
                    assert isinstance(fd, io.TextIOBase)
                    dfs[k] = v0_df(ReadVCF(fd))

        # NOTE(review): `df` is not used within the visible part of this test
        df = join(case=dfs['case'], ctrl=dfs['ctrl'])

Ejemplo n.º 10

0

Mostrar archivo

def make_df_expr() -> pandas.DataFrame:
    """
    Load the 'GSE98816' table and return it transposed and sorted by sample.

    The index is named "sample"; the prefix shared by all of its labels is cut
    off, and uniqueness of the index is asserted before and after the cut.
    """
    with download(URLS['GSE98816']).now.open(mode='rb') as fd:
        as_loaded = pandas.read_table(fd, compression='gzip', quotechar='"', index_col=0)

    # Transpose to samples x genes and order by sample ID
    expr = as_loaded.T.sort_index()
    expr.index.name = "sample"
    assert expr.index.is_unique

    # Also remove common prefix to match df_meta
    shared_prefix = os.path.commonprefix(list(expr.index))
    expr.index = expr.index.str.slice(len(shared_prefix))
    assert expr.index.is_unique

    return expr

Ejemplo n.º 11

0

Mostrar archivo

Archivo: test_stat_chi2_chi2_test.py Proyecto: lblum95/Computational_Biomedicine

    def test_chi2_large(self):
        # Run chi2_test on the joined case/control dataframe built from URLS_LARGE.
        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.clf.df import v0_df, join

        # Dataset key -> dataframe produced by v0_df
        dfs = {}

        for k in URLS_LARGE:
            with download(URLS_LARGE[k]).now.open(mode='rb') as fd:
                # `fd` is re-bound to the stream returned by open_maybe_gz
                with open_maybe_gz(fd, mode='r') as fd:
                    dfs[k] = v0_df(ReadVCF(fd))

        df = join(case=dfs['case'], ctrl=dfs['ctrl'])

        # Two lists of column names: ALT0..ALT2 suffixed "_case", then suffixed "_ctrl"
        cols = tuple([F"ALT{n}_{kind}" for n in range(3)]
                     for kind in ['case', 'ctrl'])

        # Only the six ALT columns are passed on.
        # NOTE(review): the meaning of `add=1` is defined by chi2_test (not visible
        # here), and `p` is not used within the visible part of this test.
        p = chi2_test(df[cols[0] + cols[1]], cols, add=1)

Ejemplo n.º 12

0

Mostrar archivo

def make_df_meta() -> pandas.DataFrame:
    """
    Build the per-sample metadata table from the 'GSE98816_miniml' archive.

    The archive is opened as a tar file and its member is parsed as XML.
    Each `Sample` element of the XML root contributes one row.

    Returns a DataFrame with the columns gsm, sra, taxid, biosample, strain,
    tissue, genotype, age, title, accession and sample, where `sample` is the
    sample's description with the prefix common to all descriptions removed.

    NOTE(review): `unlist1` presumably returns the only element of a
    one-element collection and fails otherwise -- confirm against tcga.utils.
    NOTE(review): `os` is not imported in this function and has to be
    available at module level.
    """
    import xml.etree.ElementTree as ET
    import tarfile
    import re
    import pandas as pd
    from tcga.utils import unlist1

    with download(URLS['GSE98816_miniml']).now.open(mode='rb') as tf:
        with tarfile.open(fileobj=tf, mode='r') as tar:
            # Parse the archive member selected by unlist1 and keep the XML root
            et = ET.parse(source=tar.extractfile(unlist1(tar))).getroot()

            # Namespace a la '{http://www.ncbi.nlm.nih.gov/geo/info/MINiML}'
            ns = unlist1(re.findall(r"({.*}).*", et.tag))

            # Annotation only (no assignment): `c1` is the loop variable of the generator below
            c1: ET.Element
            # c1 = first(et.findall(ns + "Sample"))
            df_meta = pd.DataFrame(
                data=(
                    {
                        # Attributes are read from the Sample element and from the
                        # descendants matched by the paths below; text values are stripped
                        'gsm': c1.attrib['iid'],
                        'sra': unlist1(c1.findall("./*/[@type='SRA']")).attrib["target"].strip(),
                        'taxid': unlist1(c1.findall("*/*/[@taxid]")).attrib["taxid"].strip(),
                        'biosample': unlist1(c1.findall("./*/[@type='BioSample']")).attrib["target"].strip(),
                        # These four text fields are additionally lower-cased
                        'strain': unlist1(c1.findall("*/*/[@tag='strain']")).text.strip().lower(),
                        'tissue': unlist1(c1.findall("*/*/[@tag='tissue']")).text.strip().lower(),
                        'genotype': unlist1(c1.findall("*/*/[@tag='genotype']")).text.strip().lower(),
                        'age': unlist1(c1.findall("*/*/[@tag='age']")).text.strip().lower(),
                        # Namespaced direct children of the Sample element
                        'title': unlist1(c1.findall(ns + "Title")).text.strip(),
                        'accession': unlist1(c1.findall(ns + "Accession")).text.strip(),
                        'description': unlist1(c1.findall(ns + "Description")).text.strip(),
                    }
                    for c1 in et.findall(ns + "Sample")
                )
            )

            # Remove common prefix from the description column
            df_meta = df_meta.assign(
                sample=df_meta.description.str.slice(len(os.path.commonprefix(list(df_meta.description))))
            )

            # The shortened copy in `sample` replaces the `description` column
            df_meta = df_meta.drop(columns='description')

        return df_meta

Ejemplo n.º 13

0

Mostrar archivo

    )

    df_markers = df_markers.assign(v=1).pivot_table(
        index='gene', columns='celltype', values='v', fill_value=0,
    )

    df_markers = df_markers.astype(int)

    return df_markers


if __name__ == '__main__':
    from tcga.utils import mkdir

    # Evaluate the metadata of every source; the JSON text itself is discarded
    for url in URLS.values():
        json.dumps(download(url).now.meta, indent=2)

    # Sample metadata, restricted to samples that also appear in the
    # description table, indexed by (unique) sample name
    df_meta = (
        make_df_meta()
        .merge(make_df_desc(), how="inner", on="gsm", suffixes=("", " (desc)"))
        .set_index('sample', verify_integrity=True)
        .sort_index()
    )

    df_expr = make_df_expr()
    df_mrkr = make_df_markers()

    # Metadata and expression tables must have identical indices
    assert df_meta.index.equals(df_expr.index)

    # Write the tab-separated outputs
    df_meta.to_csv(src_dir / "meta.csv.gz", sep='\t', compression='gzip')
    df_expr.to_csv(src_dir / "expr.csv.gz", sep='\t', compression='gzip')
    df_mrkr.to_csv(src_dir / "mrkr.csv", sep='\t')

#

Ejemplo n.º 14

0

Mostrar archivo

Archivo: a_download.py Proyecto: numpde/als1

# `Path` is used below for the output folder; without this import the script
# stops with a NameError at the `out_dir` line.
from pathlib import Path

import pandas as pd

from tcga.utils import download

# Sources: expression matrix and sample metadata (CSV)
URLS = {
    'expr':
    "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/matrix.csv",
    'meta':
    "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/metadata.csv",
}

# Downloads go to a folder named like this script, without the file suffix
out_dir = Path(__file__).with_suffix('')
download = download.to(abs_path=out_dir)

# Evaluate `.now.meta` for every source; the value itself is not used
for url in URLS.values():
    download(url).now.meta

# with download(URLS['expr']).now.open() as fd:
#     df_expr_index = pd.read_csv(fd, sep=',', usecols=[0], index_col=0).index
#     assert (76533 == len(df_expr_index))
#
# with download(URLS['meta']).now.open() as fd:
#     df_meta_index = pd.read_csv(fd, sep=',', index_col=0).index
#     assert (df_expr_index.equals(df_meta_index[0:len(df_expr_index)]))

# Sanity check on the first 10 rows of the expression matrix:
# all values convert to int and there are 50281 data columns
with download(URLS['expr']).now.open() as fd:
    df_expr = pd.read_csv(fd, sep=',', nrows=10, index_col=0).astype(int)
    assert (df_expr.shape == (len(df_expr), 50281))

# The metadata table is read in full
with download(URLS['meta']).now.open() as fd:
    df_meta = pd.read_csv(fd, sep=',', index_col=0)

Ejemplo n.º 15

0

Mostrar archivo

# https://www.rdocumentation.org/packages/affy/versions/1.50.0/topics/expresso
# https://gist.github.com/numpde/772cd596fb5fe6036f7e29736bd1cf15

# Note:
# Potentially useful slides
# https://bioinformatics.mdanderson.org/MicroarrayCourse/Lectures/

import gzip
import re

import pandas as pd
from tcga.utils import download

# Default download directory
download = download.to(rel_path="UV/download")

url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60880/matrix/GSE60880_series_matrix.txt.gz"
with download(url).now.open(mode='rb') as gz:
    # First pass: decompress the whole file and split it into lines
    gz.seek(0)
    with gzip.open(gz, mode='r') as fd:
        all_lines = fd.read().decode().splitlines()

    # Quoted tokens of the "!Sample_title" line (the last one, should there be several)
    sample_title = [
        re.findall(r'"([.\w]+)"', line)
        for line in all_lines
        if line.lower().startswith("!sample_title")
    ].pop()

    # Second pass: rewind first, then parse the table; lines starting with '!' are comments
    gz.seek(0)
    df_expr = pd.read_csv(
        gz, compression="gzip", comment='!', sep='\t', index_col='ID_REF',
    ).sort_index()

    # One title per data column, then use the titles as column names
    assert (len(sample_title) == len(df_expr.columns))
    df_expr.columns = sample_title

# Affymetrix platform info (affyID -> gene names, etc.)

Ejemplo n.º 16

0

Mostrar archivo

"""

import numpy as np
import matplotlib.pyplot as plt

from inclusive import range
from plox import Plox
from tcga.utils import download
from pathlib import Path
from itertools import count

# Store downloads under "cache/download"
# NOTE(review): what `rel_path` is relative to is decided by tcga.utils.download -- confirm
download = download.to(rel_path="cache/download")

# Reference [2]
# Only `.now` is evaluated and its value is discarded, so this statement
# presumably just makes sure the PDF is available locally.
download(
    "https://cpb-us-w2.wpmucdn.com/blog.nus.edu.sg/dist/0/3425/files/2018/10/Understanding-Benjamini-Hochberg-method-2ijolq0.pdf"
).now


def get_obs():
    rs = np.random.RandomState(1)

    # Number of hypothesis tests
    M = 10000

    mus1 = rs.normal(size=M)
    mus2 = mus1 + (np.arange(len(mus1)) > 0.9 * len(mus1))

    # Group sizes
    s1 = 25
    s2 = 25

Ejemplo n.º 17

0

Mostrar archivo

Archivo: download.py Proyecto: lblum95/Computational_Biomedicine

# Project datasets: control and case VCF files (gzipped)
URLS = {
    'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
    'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
}

# Folder for the downloaded files, next to this script
CACHE = Path(__file__).parent / "download_cache"
download = download.to(abs_path=CACHE)

# Folder for the `head` previews, next to this script; created if missing
HEAD = Path(__file__).parent / "head"
HEAD.mkdir(parents=True, exist_ok=True)

# Number of datalines for the `head` preview of VCF
N = 1000

# First pass: evaluate `.now` for every URL (presumably fetches/caches the files)
for url in URLS.values():
    data = download(url).now

for k in URLS:
    data = download(URLS[k]).now
    # The preview is named after the last path component of the recorded source
    head = HEAD / Path(data.meta['source']).name

    # The stack closes every stream entered below when the block is left
    with ExitStack() as stack:
        src = stack.enter_context(data.open(mode='rb'))

        try:
            import gzip
            src = stack.enter_context(gzip.open(src))
        except:
            # NOTE(review): bare `except` that only re-raises -- it has no effect as written
            raise
        else:
            # Drop the last three characters of the name (".gz" for the URLs above)
            head = Path(str(head)[:-3])

Ejemplo n.º 18

0

Mostrar archivo

Archivo: edirect.py Proyecto: numpde/cbb

from tcga.utils import download
from tcga.strings import lines
from urllib.parse import urlencode, quote

download = download.to(rel_path="cache/download")


class param:
    # Entries produced by `lines(...)` from the text of the downloaded genera.txt.
    # The download expression is evaluated when this class body runs.
    genera = list(
        lines(
            download(
                "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/"
                "Assignments/NCBIEDirectAssignment/genera.txt").now.text))

    class urls:
        # Base URL of the NCBI Datasets "assembly descriptors by organism" endpoint
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"


for genus in param.genera:
    # Request the assembly descriptors of this genus (the name is URL-quoted)
    data = download(
        param.urls.descriptors + quote(genus) + "?" + urlencode({
            'returned_content': "COMPLETE",
            'tax_exact_match': False,
        })
    ).now

    # Nothing to report when `data.json` is falsy
    if not data.json:
        continue

    # One row per dataset, ordered by display name
    df = pd.DataFrame(data.json['datasets']).sort_values('display_name')
    print(F"{genus}, estimated genome size:", list(df.estimated_size.astype(int)))

Ejemplo n.º 19

0

Mostrar archivo

Archivo: a_cov.py Proyecto: numpde/cbb

def get_as_df(url, **csv_kwargs) -> pd.DataFrame:
    """
    Download `url` and parse its text as a tab-separated table.

    Additional keyword arguments are forwarded to `pandas.read_csv`.
    """
    text_buffer = io.StringIO(download(url).now.text)
    return pd.read_csv(text_buffer, sep='\t', **csv_kwargs)

Ejemplo n.º 20

0

Mostrar archivo

 def test_fail_bad_url(self):
     # Each malformed URL must fail with its specific exception type.
     cases = (("-", ValueError), ("http://", URLError))
     with TemporaryDirectory() as tmp:
         for (bad_url, error) in cases:
             with self.assertRaises(error):
                 download(bad_url).to(abs_path=tmp).now

Ejemplo n.º 21

0

Mostrar archivo

 def test_rel_or_abs(self):
     # Passing both `rel_path` and `abs_path` is rejected
     with self.assertRaises(RuntimeError):
         download("-").to(rel_path="cache", abs_path="cache")
     # The target path cannot be passed positionally
     with self.assertRaises(TypeError):
         download("-").to("cache")

Ejemplo n.º 22

0

Mostrar archivo

 def test_fail_no_to(self):
     # Without a preceding `.to(...)`, evaluating `.now` must raise.
     with self.assertRaises(RuntimeError):
         download("-").now

Ejemplo n.º 23

0

Mostrar archivo

 def test_silent_accept_bad_url(self):
     # Merely constructing the download object with a bad URL is expected
     # not to raise (`.now` is not evaluated here)
     x = download("-")

Ejemplo n.º 24

0

Mostrar archivo

def get_pstg_seq() -> str:
    """
    Return the sequence of the 'viroid' FASTA record after conversion.

    The record is downloaded (cache folder "cache/download"), parsed as FASTA,
    and its sequence is passed through `dna_to_dna` followed by `dna_to_rna`.
    """
    fasta_text = download(PARAM['viroid']).to(rel_path="cache/download").now.text
    record = SeqIO.read(io.StringIO(fasta_text), format='fasta')
    convert = First(dna_to_dna).then(dna_to_rna)
    return convert(record.seq)

Ejemplo n.º 25

0

Mostrar archivo

Archivo: 00006_download.py Proyecto: numpde/tcga

# RA, 2020-06-25

# Example: download a FASTA record into a cache folder inside the system's
# temporary directory, then inspect its metadata, text and local file.

from pathlib import Path
from tempfile import gettempdir
from tcga.utils import download

# Bind the cache folder once; the re-bound `download` is used for the call below
download = download.to(abs_path=(Path(gettempdir()) / "tcga_download_cache"))
print("Will download to:", download.local_folder)
# Will download to: /tmp/tcga_download_cache

# Lambda phage genome
# NOTE(review): `.again(False)` presumably permits reuse of a cached copy -- confirm
data = download("https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1").again(
    False).now

print(data.meta)  # same as tcga.refs.annotations[data]
# {'source': 'https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1', 'datetime': '2020-06-25 07:18:52.065826+00:00'}

print(data.text[0:42], "...", data.text[330:350], "...")
# >ENA|J02459|J02459.1 Escherichia phage Lam ... CAGGGAATGCCCGTTCTGCG ...

print(data.local_file)
# /tmp/tcga_download_cache/Z9tBKiJCqrfWuYy5BlgrA3zZAWav2CUd4xrPsya93Os=.zip

# Parse the record with Biopython when it is installed; otherwise just say so
try:
    from Bio import SeqIO
except ImportError:
    print("Need `biopython`")
else:
    with data.open(mode='r') as fd:
        print(SeqIO.read(fd, format='fasta'))
# ID: ENA|J02459|J02459.1

Ejemplo n.º 26

0

Mostrar archivo

Archivo: z_sources.py Proyecto: numpde/als1

 def url2df(k):
     """
     Load the gzipped table of source `k` as an integer DataFrame.

     Columns are sorted by label, and every label gets the prefix "<k>_".
     """
     with download(URLS[k]).now.open(mode='rb') as fd:
         table = pandas.read_table(fd, compression='gzip', index_col=0)
     table = table.astype(int).sort_index(axis=1)
     table.columns = [f"{k}_{label}" for label in table.columns]
     return table

Ejemplo n.º 27

0

Mostrar archivo

# RA, 2020-06-27

from tcga.utils import download

# MSigDB release 7.1, "files to download locally" archive (zip)
url = "https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/7.1/msigdb_v7.1_files_to_download_locally.zip"
# Target folder is "original"; only `.now` is evaluated and its value is discarded
# NOTE(review): what `rel_path` is relative to is decided by tcga.utils.download -- confirm
download(url).to(rel_path="original").now

Ejemplo n.º 28

0

Mostrar archivo

Archivo: z_sources.py Proyecto: numpde/als1

def make_df_expr_ucsc():
    """
    Read the gzipped 'expr' table into a DataFrame.

    The first column of the table becomes the index.
    """
    import gzip
    with download(URLS['expr']).now.open(mode='rb') as raw, gzip.open(raw, mode='rb') as unzipped:
        expr = pandas.read_table(unzipped, index_col=0)
        return expr

Ejemplo n.º 29

0

Mostrar archivo

Archivo: a_cov.py Proyecto: numpde/cbb

def load_cov2() -> pd.DataFrame:
    """
    Read the 'GSE CoV2' table from the local download file.

    The local file is opened as a zip archive; its member "data" holds a
    gzip-compressed, tab-separated table whose first column becomes the index.
    """
    import zipfile
    archive_path = download(url=PARAM['GSE CoV2']).now.local_file
    with zipfile.ZipFile(archive_path, mode='r') as zf, zf.open("data") as fd:
        return pd.read_csv(fd, compression="gzip", sep='\t', index_col=0)

Ejemplo n.º 30

0

Mostrar archivo

Archivo: df.py Proyecto: lblum95/Computational_Biomedicine

    def maker_clinvar_clf() -> pandas.DataFrame:
        """
        Build the classification data from the clinvar dataframe.

        Only rows whose 'CLNSIG' value is 'Pathogenic' or 'Benign' are kept
        (categorical classification) before the conversion.
        """
        from idiva.db.clinvar import df_clinvar_to_clf_data
        # If you change this function, change the cache key also.
        is_categorical = df_clinvar['CLNSIG'].isin({'Pathogenic', 'Benign'})
        return df_clinvar_to_clf_data(
            df_clinvar[is_categorical],
            base_string_encoding=base_string_encoding,
        )

    return cache_df(name="clinvar_clf_data", key=[base_string_encoding, "v01"], df_maker=maker_clinvar_clf)


if __name__ == '__main__':
    from pathlib import Path

    from contexttimer import Timer
    from tcga.utils import download

    URLS = {
        'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
        'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
    }

    # The cache folder sits four levels above this file and must already exist
    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()
    download = download.to(abs_path=cache)

    # Time the construction of the v0 dataframe for the control dataset
    with download(URLS['ctrl']).now.open() as fd, Timer() as timer:
        df = v0_df(idiva.io.ReadVCF(fd))

    print(F"This took {timer.elapsed} seconds")