class DATA:
    """C. elegans (WBcel235) FASTA resources from Ensembl release 100."""

    # Chromosome I genomic DNA; the `.now` attribute is evaluated
    # when the class body runs.
    I = download(
        "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/"
        "dna/Caenorhabditis_elegans.WBcel235.dna.chromosome.I.fa.gz"
    ).now

    # The complete cDNA collection.
    cDNA = download(
        "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/"
        "cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz"
    ).now
def test_check_all(self):
    """Run `check_all` on every resource in URLS and print each result."""
    from idiva.io.ass import check_all

    for (group, url) in URLS.items():
        with download(url).now.open(mode='rb') as raw:
            with open_maybe_gz(raw) as fd:
                for check in check_all(fd):
                    print(group, check)
def make_df_meta_ucsc():
    """
    Load the 'meta' table and relabel its index.

    The `Renamed_clusternames` column is copied to `celltype`, and
    each index label is rewritten as "SS2_16_<number>_<well>" using
    the two groups captured by the pattern below.
    """
    with download(URLS['meta']).now.open(mode='r') as fd:
        meta = pandas.read_table(fd, index_col=0)

    meta = meta.assign(celltype=meta.Renamed_clusternames)

    pattern = re.compile(r"_([0-9]+)[.]tab[.]([A-Z][0-9]+)")

    def relabel(label):
        # `unlist1` is expected to unwrap the single match (a 2-tuple)
        groups = tcga.utils.unlist1(pattern.findall(label))
        return "SS2_16_{}_{}".format(*groups)

    meta.index = [relabel(label) for label in meta.index]

    return meta
def test_makes_folder(self):
    """The target folder exists after a failed download attempt."""
    with TemporaryDirectory() as tempdir:
        target = Path(tempdir) / "test"

        with self.assertRaises(ValueError):
            download("-").to(abs_path=target).now

        # The folder is there although the download was rejected
        self.assertTrue(target.exists())

    # The folder lived inside the temporary directory, which is gone now
    self.assertFalse(target.exists())
class param:
    """Static parameters: the list of genera and the NCBI endpoint."""

    # One entry per item produced by `lines` from the downloaded text
    genera = [
        *lines(
            download(
                "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/"
                "Assignments/NCBIEDirectAssignment/genera.txt"
            ).now.text
        )
    ]

    class urls:
        # Base URL; the organism name is appended by the caller
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"
def make_df_desc() -> pandas.DataFrame:
    """
    Read the active worksheet of the 'sc_description' workbook.

    The first row provides the column names. The columns 'GSM ID'
    and 'annoated cell types' are renamed to 'gsm' and 'celltype'.
    """
    import warnings

    with download(URLS['sc_description']).now.open(mode='rb') as fd:
        with warnings.catch_warnings():
            # Silence whatever openpyxl warns about while loading
            warnings.simplefilter("ignore")
            rows = openpyxl.load_workbook(fd).active.values

        # Note: the header must be taken off the iterator
        # before the remaining rows are collected
        header = next(rows)
        body = list(rows)
        desc = pandas.DataFrame(columns=header, data=body)

    return desc.rename(columns={'GSM ID': "gsm", 'annoated cell types': "celltype"})
def test_makes_df_ctrl(self):
    """Build the v0 dataframe for 'ctrl' and check its row count."""
    # This takes about 20min on v1/v2
    from idiva.clf.df import v0_df
    from idiva.io import ReadVCF

    with download(URLS['ctrl']).now.open(mode='rb') as raw:
        with open_maybe_gz(raw) as fd:
            assert isinstance(fd, io.TextIOBase)
            df = v0_df(ReadVCF(fd))

    nrows = len(df)
    self.assertGreater(nrows, 0)
    # NOTE(review): `ref_len_v2` is not defined inside this method --
    # presumably a module-level dict; confirm.
    self.assertEqual(nrows, ref_len_v2['ctrl'])
def test_count(self):
    """Count what `ReadVCF` yields per group and compare to the reference."""
    from idiva.io.vcf import ReadVCF, RawDataline

    # ref_len_v1 = {'ctrl': 2329288, 'case': 2360972}
    ref_len_v2 = {'ctrl': 2227080, 'case': 2258797}

    for (group, url) in URLS.items():
        with download(url).now.open(mode='rb') as raw:
            with open_maybe_gz(raw, mode='r') as fd:
                assert isinstance(fd, io.TextIOBase)
                nlines = sum(1 for __ in ReadVCF(fd))

        # print(F"Group {group} has {nlines} datalines")
        self.assertEqual(nlines, ref_len_v2[group])
def test_combines(self):
    """Build a v0 dataframe per entry of URLS, then join case with ctrl."""
    from idiva.io import ReadVCF
    from idiva.clf.df import v0_df, join

    def load(url):
        # One dataframe from one (possibly gzipped) resource
        with download(url).now.open(mode='rb') as raw:
            with open_maybe_gz(raw) as fd:
                assert isinstance(fd, io.TextIOBase)
                return v0_df(ReadVCF(fd))

    dfs = {k: load(url) for (k, url) in URLS.items()}

    df = join(case=dfs['case'], ctrl=dfs['ctrl'])
def make_df_expr() -> pandas.DataFrame:
    """
    Load the GSE98816 table as samples x genes.

    Rows are sorted by sample ID, and the prefix shared by all
    sample IDs is cut off. The index is named "sample".
    """
    with download(URLS['GSE98816']).now.open(mode='rb') as fd:
        genes_by_samples = pandas.read_table(fd, compression='gzip', quotechar='"', index_col=0)

    # Transpose to samples x genes, ordered by sample ID
    expr = genes_by_samples.T.sort_index()
    expr.index.name = "sample"
    assert expr.index.is_unique

    # Also remove common prefix to match df_meta
    shared_prefix = os.path.commonprefix(list(expr.index))
    expr.index = expr.index.str.slice(len(shared_prefix))
    assert expr.index.is_unique

    return expr
def test_chi2_large(self):
    """Run `chi2_test` on the joined case/ctrl frame of the large dataset."""
    from idiva.io import ReadVCF, open_maybe_gz
    from idiva.clf.df import v0_df, join

    dfs = {}
    for (k, url) in URLS_LARGE.items():
        with download(url).now.open(mode='rb') as raw:
            with open_maybe_gz(raw, mode='r') as fd:
                dfs[k] = v0_df(ReadVCF(fd))

    df = join(case=dfs['case'], ctrl=dfs['ctrl'])

    def alt_columns(kind):
        # ALT0_<kind>, ALT1_<kind>, ALT2_<kind>
        return [F"ALT{n}_{kind}" for n in range(3)]

    cols = (alt_columns('case'), alt_columns('ctrl'))
    (case_cols, ctrl_cols) = cols

    p = chi2_test(df[case_cols + ctrl_cols], cols, add=1)
def make_df_meta() -> pandas.DataFrame:
    """
    Build a per-sample metadata table from the GSE98816 MINiML archive.

    The archive is expected to hold exactly one member (an XML file).
    Each "Sample" element contributes one row. The 'description' field
    is replaced by a 'sample' column that has the prefix common to all
    descriptions removed.
    """
    import xml.etree.ElementTree as ET
    import tarfile
    import re
    import pandas as pd
    from tcga.utils import unlist1

    with download(URLS['GSE98816_miniml']).now.open(mode='rb') as tf:
        with tarfile.open(fileobj=tf, mode='r') as tar:
            et = ET.parse(source=tar.extractfile(unlist1(tar))).getroot()

    # Namespace a la '{http://www.ncbi.nlm.nih.gov/geo/info/MINiML}'
    ns = unlist1(re.findall(r"({.*}).*", et.tag))

    def only(sample, path):
        # The single element matching `path` below `sample`
        return unlist1(sample.findall(path))

    def record(sample):
        # One row of the metadata table
        return {
            'gsm': sample.attrib['iid'],
            'sra': only(sample, "./*/[@type='SRA']").attrib["target"].strip(),
            'taxid': only(sample, "*/*/[@taxid]").attrib["taxid"].strip(),
            'biosample': only(sample, "./*/[@type='BioSample']").attrib["target"].strip(),
            'strain': only(sample, "*/*/[@tag='strain']").text.strip().lower(),
            'tissue': only(sample, "*/*/[@tag='tissue']").text.strip().lower(),
            'genotype': only(sample, "*/*/[@tag='genotype']").text.strip().lower(),
            'age': only(sample, "*/*/[@tag='age']").text.strip().lower(),
            'title': only(sample, ns + "Title").text.strip(),
            'accession': only(sample, ns + "Accession").text.strip(),
            'description': only(sample, ns + "Description").text.strip(),
        }

    df_meta = pd.DataFrame(data=(record(sample) for sample in et.findall(ns + "Sample")))

    # Remove common prefix from the description column
    prefix_len = len(os.path.commonprefix(list(df_meta.description)))
    df_meta = df_meta.assign(sample=df_meta.description.str.slice(prefix_len))

    return df_meta.drop(columns='description')
) df_markers = df_markers.assign(v=1).pivot_table( index='gene', columns='celltype', values='v', fill_value=0, ) df_markers = df_markers.astype(int) return df_markers if __name__ == '__main__': from tcga.utils import mkdir for (_, url) in URLS.items(): json.dumps(download(url).now.meta, indent=2) df_meta = make_df_meta() df_meta = df_meta.merge(make_df_desc(), how="inner", on="gsm", suffixes=("", " (desc)")) df_meta = df_meta.set_index('sample', verify_integrity=True).sort_index() df_expr = make_df_expr() df_mrkr = make_df_markers() assert df_meta.index.equals(df_expr.index) df_meta.to_csv(src_dir / "meta.csv.gz", compression='gzip', sep='\t') df_expr.to_csv(src_dir / "expr.csv.gz", compression='gzip', sep='\t') df_mrkr.to_csv(src_dir / "mrkr.csv", sep='\t') #
import pandas as pd

from tcga.utils import download

URLS = {
    'expr': "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/matrix.csv",
    'meta': "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/metadata.csv",
}

# Downloads go to a folder named after this file (without the suffix)
out_dir = Path(__file__).with_suffix('')
download = download.to(abs_path=out_dir)

# Touch every resource once
for (k, url) in URLS.items():
    download(url).now.meta

# Disabled consistency check between the two indices:
#
# with download(URLS['expr']).now.open() as fd:
#     df_expr_index = pd.read_csv(fd, sep=',', usecols=[0], index_col=0).index
#     assert (76533 == len(df_expr_index))
#
# with download(URLS['meta']).now.open() as fd:
#     df_meta_index = pd.read_csv(fd, sep=',', index_col=0).index
#     assert (df_expr_index.equals(df_meta_index[0:len(df_expr_index)]))

# Only the first few rows of the expression matrix are read
with download(URLS['expr']).now.open() as fd:
    df_expr = pd.read_csv(fd, sep=',', nrows=10, index_col=0).astype(int)
    assert df_expr.shape == (len(df_expr), 50281)

with download(URLS['meta']).now.open() as fd:
    df_meta = pd.read_csv(fd, sep=',', index_col=0)
# https://www.rdocumentation.org/packages/affy/versions/1.50.0/topics/expresso
# https://gist.github.com/numpde/772cd596fb5fe6036f7e29736bd1cf15

# Note:
# Potentially useful slides
# https://bioinformatics.mdanderson.org/MicroarrayCourse/Lectures/

import re
import gzip

import pandas as pd

from tcga.utils import download

# Default download directory
download = download.to(rel_path="UV/download")

url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60880/matrix/GSE60880_series_matrix.txt.gz"

with download(url).now.open(mode='rb') as gz:
    # First pass: pick the quoted sample titles off the "!Sample_title" line
    gz.seek(0)
    with gzip.open(gz, mode='r') as fd:
        sample_title = [
            re.findall(r'"([.\w]+)"', line)
            for line in fd.read().decode().splitlines()
            if line.lower().startswith("!sample_title")
        ].pop()

    # Second pass over the same handle: rewind before parsing the table
    gz.seek(0)  # !
    df_expr = pd.read_csv(gz, compression="gzip", comment='!', sep='\t', index_col='ID_REF').sort_index()

# Replace the column labels by the sample titles
assert len(sample_title) == len(df_expr.columns)
df_expr.columns = sample_title

# Affymetrix platform info (affyID -> gene names, etc.)
""" import numpy as np import matplotlib.pyplot as plt from inclusive import range from plox import Plox from tcga.utils import download from pathlib import Path from itertools import count download = download.to(rel_path="cache/download") # Reference [2] download( "https://cpb-us-w2.wpmucdn.com/blog.nus.edu.sg/dist/0/3425/files/2018/10/Understanding-Benjamini-Hochberg-method-2ijolq0.pdf" ).now def get_obs(): rs = np.random.RandomState(1) # Number of hypothesis tests M = 10000 mus1 = rs.normal(size=M) mus2 = mus1 + (np.arange(len(mus1)) > 0.9 * len(mus1)) # Group sizes s1 = 25 s2 = 25
URLS = { 'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz", 'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz", } CACHE = Path(__file__).parent / "download_cache" download = download.to(abs_path=CACHE) HEAD = Path(__file__).parent / "head" HEAD.mkdir(parents=True, exist_ok=True) # Number of datalines for the `head` preview of VCF N = 1000 for url in URLS.values(): data = download(url).now for k in URLS: data = download(URLS[k]).now head = HEAD / Path(data.meta['source']).name with ExitStack() as stack: src = stack.enter_context(data.open(mode='rb')) try: import gzip src = stack.enter_context(gzip.open(src)) except: raise else: head = Path(str(head)[:-3])
from tcga.utils import download
from tcga.strings import lines

from urllib.parse import urlencode, quote

download = download.to(rel_path="cache/download")


class param:
    # One entry per item produced by `lines` from the downloaded text
    genera = [
        *lines(
            download(
                "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/"
                "Assignments/NCBIEDirectAssignment/genera.txt"
            ).now.text
        )
    ]

    class urls:
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"


# Query the assembly descriptors genus by genus
for genus in param.genera:
    data = download(
        param.urls.descriptors + quote(genus) + "?" + urlencode({
            'returned_content': "COMPLETE",
            'tax_exact_match': False,
        })
    ).now

    # Nothing to report if the response carries no JSON payload
    if data.json:
        df = pd.DataFrame(data.json['datasets'])
        df = df.sort_values('display_name')
        print(F"{genus}, estimated genome size:", list(df.estimated_size.astype(int)))
def get_as_df(url, **csv_kwargs) -> pd.DataFrame:
    """
    Download `url` and parse its text as a tab-separated table.

    Any extra keyword arguments are forwarded to `pd.read_csv`.
    """
    buffer = io.StringIO(download(url).now.text)
    return pd.read_csv(buffer, sep='\t', **csv_kwargs)
def test_fail_bad_url(self):
    """Malformed URLs are rejected when the download is attempted."""
    expectations = [
        ("-", ValueError),
        ("http://", URLError),
    ]

    with TemporaryDirectory() as tempdir:
        for (bad_url, error) in expectations:
            with self.assertRaises(error):
                download(bad_url).to(abs_path=tempdir).now
def test_rel_or_abs(self):
    """`to` rejects both paths given together and a positional path."""
    # Relative and absolute path at the same time
    self.assertRaises(RuntimeError, lambda: download("-").to(rel_path="cache", abs_path="cache"))

    # The path has to be passed by keyword
    self.assertRaises(TypeError, lambda: download("-").to("cache"))
def test_fail_no_to(self):
    """Accessing `.now` without a prior `.to(...)` raises RuntimeError."""
    self.assertRaises(RuntimeError, lambda: download("-").now)
def test_silent_accept_bad_url(self):
    """Merely creating a download object for a bad URL does not raise."""
    download("-")
def get_pstg_seq() -> str:
    """
    Download the viroid FASTA record and convert its sequence.

    The sequence is passed through `dna_to_dna` and then `dna_to_rna`.
    """
    fasta_text = download(PARAM['viroid']).to(rel_path="cache/download").now.text
    record = SeqIO.read(io.StringIO(fasta_text), format='fasta')

    convert = First(dna_to_dna).then(dna_to_rna)
    return convert(record.seq)
# RA, 2020-06-25

from pathlib import Path
from tempfile import gettempdir

from tcga.utils import download

# Keep the downloads in a subfolder of the system's temp directory
cache_dir = Path(gettempdir()) / "tcga_download_cache"
download = download.to(abs_path=cache_dir)

print("Will download to:", download.local_folder)
# Will download to: /tmp/tcga_download_cache

# Lambda phage genome
data = download("https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1").again(False).now

print(data.meta)  # same as tcga.refs.annotations[data]
# {'source': 'https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1', 'datetime': '2020-06-25 07:18:52.065826+00:00'}

print(data.text[0:42], "...", data.text[330:350], "...")
# >ENA|J02459|J02459.1 Escherichia phage Lam ... CAGGGAATGCCCGTTCTGCG ...

print(data.local_file)
# /tmp/tcga_download_cache/Z9tBKiJCqrfWuYy5BlgrA3zZAWav2CUd4xrPsya93Os=.zip

# Parsing the record is optional: it needs biopython
try:
    from Bio import SeqIO
except ImportError:
    print("Need `biopython`")
else:
    with data.open(mode='r') as fd:
        print(SeqIO.read(fd, format='fasta'))
        # ID: ENA|J02459|J02459.1
def url2df(k):
    """
    Load the gzipped table registered under URLS[k].

    Values are cast to int, columns are sorted, and each column
    label gets the prefix "<k>_".
    """
    with download(URLS[k]).now.open(mode='rb') as fd:
        table = pandas.read_table(fd, compression='gzip', index_col=0)

    table = table.astype(int).sort_index(axis=1)
    table.columns = [f"{k}_{c}" for c in table.columns]

    return table
# RA, 2020-06-27

from tcga.utils import download

# MSigDB v7.1 bundle
url = (
    "https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp"
    "?filePath=/msigdb/release/7.1/msigdb_v7.1_files_to_download_locally.zip"
)

# Store it in the folder "original" (given as a relative path)
download(url).to(rel_path="original").now
def make_df_expr_ucsc():
    """Read the gzipped 'expr' table, using its first column as the index."""
    import gzip

    with download(URLS['expr']).now.open(mode='rb') as fd, gzip.open(fd, mode='rb') as gz:
        return pandas.read_table(gz, index_col=0)
def load_cov2() -> pd.DataFrame:
    """
    Load the 'GSE CoV2' table.

    The downloaded local file is opened as a zip archive, and its
    member "data" is read as a gzipped, tab-separated table whose
    first column becomes the index.
    """
    import zipfile

    archive = download(url=PARAM['GSE CoV2']).now.local_file

    with zipfile.ZipFile(archive, mode='r') as zf, zf.open("data") as fd:
        return pd.read_csv(fd, compression="gzip", sep='\t', index_col=0)
def maker_clinvar_clf() -> pandas.DataFrame: from idiva.db.clinvar import df_clinvar_to_clf_data # If you change this function, change the cache key also. # Preparing the clinvar dataframe for categorical classification: df_clinvar_reduced = df_clinvar[df_clinvar['CLNSIG'].isin({'Pathogenic', 'Benign'})] return df_clinvar_to_clf_data(df_clinvar_reduced, base_string_encoding=base_string_encoding) return cache_df(name="clinvar_clf_data", key=[base_string_encoding, "v01"], df_maker=maker_clinvar_clf) if __name__ == '__main__': URLS = { 'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz", 'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz", } from tcga.utils import download from pathlib import Path from contexttimer import Timer cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve() assert cache.is_dir() download = download.to(abs_path=cache) with download(URLS['ctrl']).now.open() as fd: with Timer() as timer: df = v0_df(idiva.io.ReadVCF(fd)) print(F"This took {timer.elapsed} seconds")