# RA, 2020-11-14

from idiva import log

import pandas
import io

from unittest import TestCase
from pathlib import Path
from tcga.utils import download

from idiva.io import open_maybe_gz

# Folder for downloaded files: three directory levels above this file,
# under "input/download_cache". The folder has to exist already.
download_cache = Path(__file__).parent.parent.parent.joinpath("input/download_cache")
assert download_cache.is_dir()

# Rebind the downloader to that folder
download = download.to(abs_path=download_cache)

# Control and case VCF files (v2)
URLS = dict(
    ctrl="https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
    case="https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
)

# ref_len_v1 = {'ctrl': 2329288, 'case': 2360972}
ref_len_v2 = dict(ctrl=2227080, case=2258797)


class TestDf(TestCase):
    def test_makes_df_case(self):
        # This takes about 20min on v1/v2
        from idiva.clf.df import v0_df
Usage: from z_sources import df_meta, df_expr, df_mrkr """ from tcga.utils import download import tcga.utils import pandas import pathlib import os import json import openpyxl src_dir = tcga.utils.mkdir(pathlib.Path(__file__).with_suffix('')) download = download.to(abs_path=(src_dir / "download_cache")) URLS = { 'GSE98816': "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE98nnn/GSE98816/suppl/GSE98816_Brain_samples_raw_read_counts_matrix.txt.gz", 'GSE98816_miniml': "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE98nnn/GSE98816/miniml/GSE98816_family.xml.tgz", # https://figshare.com/collections/_/4077260 # The file contains the description for 3436 single cells from mouse brain and 1504 single cells from mouse lung 'sc_description': "https://ndownloader.figshare.com/files/11188505", } # https://www.nature.com/articles/nature25739/figures/1 # 1c markers = { 'PC': "Pdgfrb Cspg4 Anpep Rgs5 Cd248 Abcc9 Vtn S1pr3", 'SMC': "Acta2 Tagln Myh11 Myl9 Mylk Sncg Cnn1 Pln",
In the first part of the assignment, you must identify which organism has the largest number of genomes in the assembly database. You should use one of the edirect scripts provided on the AWS image to complete that task as shown in the manual. In the second part of the assignment, you must calculate the AVERAGE (mean) genome size of the genomes associated with “Prevotella buccalis”. Hint: There is a command called countfasta.py that will likely help you with this step! """ import pandas as pd from tcga.utils import download from tcga.strings import lines from urllib.parse import urlencode, quote download = download.to(rel_path="cache/download") class param: genera = list( lines( download( "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/" "Assignments/NCBIEDirectAssignment/genera.txt").now.text)) class urls: descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/" for genus in param.genera: data = download(param.urls.descriptors + quote(genus) + "?" +
from itertools import chain
from collections import Counter

import pandas as pd

from tcga.utils import download

# Remote CSV tables: the expression matrix and its metadata
URLS = dict(
    expr="https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/matrix.csv",
    meta="https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/metadata.csv",
)

# Downloads go to a folder named like this file, without its suffix
out_dir = Path(__file__).with_suffix("")
download = download.to(abs_path=out_dir)

# Access every URL once up front
# (presumably this makes sure each file is in the local folder -- confirm)
for (k, url) in URLS.items():
    download(url).now.meta

# with download(URLS['expr']).now.open() as fd:
#     df_expr_index = pd.read_csv(fd, sep=',', usecols=[0], index_col=0).index
#     assert (76533 == len(df_expr_index))
#
# with download(URLS['meta']).now.open() as fd:
#     df_meta_index = pd.read_csv(fd, sep=',', index_col=0).index
#     assert (df_expr_index.equals(df_meta_index[0:len(df_expr_index)]))

# Read only the first ten data rows of the expression matrix as integers
with download(URLS["expr"]).now.open() as fd:
    df_expr = pd.read_csv(
        fd,
        sep=",",
        nrows=10,
        index_col=0,
    ).astype(int)

# The table is expected to have 50281 columns
assert df_expr.shape == (len(df_expr), 50281)
(Initial draft) """ import io import gzip import numpy as np import pandas as pd from collections import Counter from Bio import SeqIO from tcga.utils import download from tcga.utils import First from tcga.strings import triplets download = download.to(rel_path="../20200608-Downloads/cache") class PARAM: class DATA: I = download( "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.chromosome.I.fa.gz" ).now with gzip.open(io.BytesIO(PARAM.DATA.I.bytes)) as fd: rec: SeqIO.SeqRecord rec = SeqIO.read(io.TextIOWrapper(fd), format='fasta') from tcga.codons import standard_rna as rna_to_aa from tcga.complements import dna_to_rna, dna_to_dna
# Note:
# The values from the GSE "Series" are ~ those of 'expresso'
# (Goes from raw probe intensities to expression values):
# https://www.rdocumentation.org/packages/affy/versions/1.50.0/topics/expresso
# https://gist.github.com/numpde/772cd596fb5fe6036f7e29736bd1cf15

# Note:
# Potentially useful slides
# https://bioinformatics.mdanderson.org/MicroarrayCourse/Lectures/

import re
import gzip

import pandas as pd

from tcga.utils import download

# Default download directory
download = download.to(rel_path="UV/download")

url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60880/matrix/GSE60880_series_matrix.txt.gz"

with download(url).now.open(mode='rb') as gz:
    # First pass over the gzipped file: collect the quoted names
    # from the "!sample_title" header line (the last one, if several)
    gz.seek(0)
    with gzip.open(gz, mode='r') as fd:
        sample_title = [
            re.findall(r'"([.\w]+)"', line)
            for line in fd.read().decode().splitlines()
            if line.lower().startswith("!sample_title")
        ][-1]

    # Second pass: rewind first, because the scan above consumed the stream
    gz.seek(0)
    df_expr = pd.read_csv(gz, compression="gzip", comment='!', sep='\t', index_col='ID_REF')
    df_expr = df_expr.sort_index()

# One title per expression column
assert len(sample_title) == len(df_expr.columns)
# RA, 2021-03-16

"""
This file only prepares the downloader
for the “Mouse Whole Cortex and Hippocampus 10x”
dataset from Allen Brain, 2020.

https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x
https://celltypes.brain-map.org/rnaseq/mouse_ctx-hip_10x

The full download (done by a_download.py) takes
a few hours and requires over 4GB local storage.

10x protocol:
https://portal.brain-map.org/atlases-and-data/rnaseq/protocols-mouse-cortex-and-hippocampus#single_cell_sorting
https://www.protocols.io/view/10xv2-rnaseq-sample-processing-ynxfvfn/abstract
"""

from tcga.utils import download
from bugs import *

# Files go to "<this file without suffix>/download_cache".
# NOTE(review): `Path` is not imported explicitly here;
# presumably the star import from `bugs` provides it -- confirm.
download = download.to(
    abs_path=Path(__file__).with_suffix("").joinpath("download_cache")
)

# Remote CSV tables: the expression matrix and its metadata
URLS = dict(
    expr="https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/matrix.csv",
    meta="https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/metadata.csv",
)
# RA, 2020-06-25

from pathlib import Path
from tempfile import gettempdir

from tcga.utils import download

# Keep downloaded files in a subfolder of the system temp directory
download = download.to(abs_path=Path(gettempdir()).joinpath("tcga_download_cache"))

print("Will download to:", download.local_folder)
# Will download to: /tmp/tcga_download_cache

# Lambda phage genome
data = (
    download("https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1")
    .again(False)
    .now
)

print(data.meta)  # same as tcga.refs.annotations[data]
# {'source': 'https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1', 'datetime': '2020-06-25 07:18:52.065826+00:00'}

print(data.text[:42], "...", data.text[330:350], "...")
# >ENA|J02459|J02459.1 Escherichia phage Lam ... CAGGGAATGCCCGTTCTGCG ...

print(data.local_file)
# /tmp/tcga_download_cache/Z9tBKiJCqrfWuYy5BlgrA3zZAWav2CUd4xrPsya93Os=.zip

# Parsing the record is optional: it needs biopython
try:
    from Bio import SeqIO
except ImportError:
    print("Need `biopython`")
else:
    with data.open(mode='r') as fd:
        print(SeqIO.read(fd, format='fasta'))
    # ID: ENA|J02459|J02459.1
# RA, 2020-10-05

import io

from contextlib import ExitStack
from pathlib import Path

from tcga.utils import download

# Control and case VCF files (v2)
URLS = dict(
    ctrl="https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
    case="https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
)

# Downloaded files are kept next to this script
CACHE = Path(__file__).parent.joinpath("download_cache")
download = download.to(abs_path=CACHE)

# Output folder for the previews, created if missing
HEAD = Path(__file__).parent.joinpath("head")
HEAD.mkdir(parents=True, exist_ok=True)

# Number of datalines for the `head` preview of VCF
N = 1000

# Access every file once before processing
# (presumably this fetches them into the cache -- confirm)
for url in URLS.values():
    data = download(url).now

for (k, url) in URLS.items():
    data = download(url).now

    # The preview file is named after the last component of the source URL
    head = HEAD / Path(data.meta['source']).name

    with ExitStack() as stack:
        src = stack.enter_context(data.open(mode='rb'))
ROOT = Path(__file__).parent

PARAM = {
    # Transcriptome of response to SARS-CoV-2 infection
    'GSE CoV2': "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE147507&format=file&file=GSE147507%5FRawReadCounts%5FHuman%2Etsv%2Egz",

    # "111 expression datasets profiling the transcriptional response to a ligand"
    # https://zenodo.org/record/3260758
    'txn_response_ref': "https://zenodo.org/record/3260758/files/expression_settings.rds?download=1",

    'intercell': "https://omnipathdb.org/intercell",
}

# Setup default folder for downloads
download = download.to(abs_path=(ROOT / "UV/download"))

# Load the transcriptional response datasets
# Collected by NicheNet authors for optimization/validation
import tempfile

# R reads the RDS file by name, so the downloaded bytes are
# first copied into a named temporary file
with tempfile.NamedTemporaryFile() as tf:
    tf.write(download(PARAM['txn_response_ref']).now.bytes)

    # The file object is buffered: without a flush, R could open
    # the file by name while part of the data is still in memory
    tf.flush()

    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri

    pandas2ri.activate()

    # Read while the temporary file still exists (it is removed on exit)
    df = (robjects.r['readRDS'])(tf.name)

    # One entry per expression dataset
    assert (len(df) == 111)

    for (k, v) in df.items():
        print(k)
        # print(v)