# RA, 2020-11-14

from idiva import log

import pandas
import io

from unittest import TestCase
from pathlib import Path
from tcga.utils import download
from idiva.io import open_maybe_gz

# Cache folder for downloads: "input/download_cache" under the directory
# two levels above this file's own directory.
download_cache = (Path(__file__).parent.parent.parent / "input/download_cache")
# The folder is expected to exist already; fail at import time otherwise.
assert download_cache.is_dir()
# Rebind `download` to the object returned by `download.to(...)` for that folder.
# NOTE(review): presumably downloaded files are stored/cached there — confirm in `tcga.utils`.
download = download.to(abs_path=download_cache)

# Gzipped VCF inputs (v2 files), keyed by cohort label.
URLS = {
    'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
    'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
}

# Reference lengths per cohort label for the v2 files (v1 values kept for the record).
# NOTE(review): presumably the expected number of dataframe rows — confirm against the test body.
# ref_len_v1 = {'ctrl': 2329288, 'case': 2360972}
ref_len_v2 = {
    'ctrl': 2227080,
    'case': 2258797,
}


class TestDf(TestCase):
    def test_makes_df_case(self):
        # This takes about 20min on v1/v2
        from idiva.clf.df import v0_df
Beispiel #2
0
Usage:
    from z_sources import df_meta, df_expr, df_mrkr


"""

from tcga.utils import download
import tcga.utils
import pandas
import pathlib
import os
import json
import openpyxl

# Working folder named after this file (suffix removed), passed through `tcga.utils.mkdir`.
# NOTE(review): presumably `mkdir` creates the folder and returns its path — confirm.
src_dir = tcga.utils.mkdir(pathlib.Path(__file__).with_suffix(''))
# Rebind `download` so that it targets the "download_cache" subfolder of `src_dir`.
download = download.to(abs_path=(src_dir / "download_cache"))

# Remote data sources, keyed by a short identifier.
URLS = {
    # GEO series GSE98816: raw read counts matrix of the brain samples (gzipped text)
    'GSE98816': "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE98nnn/GSE98816/suppl/GSE98816_Brain_samples_raw_read_counts_matrix.txt.gz",
    # GEO series GSE98816: MINiML "family" XML, packed as .tgz
    'GSE98816_miniml': "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE98nnn/GSE98816/miniml/GSE98816_family.xml.tgz",

    # https://figshare.com/collections/_/4077260
    # The file contains the description for 3436 single cells from mouse brain and 1504 single cells from mouse lung
    'sc_description': "https://ndownloader.figshare.com/files/11188505",
}

# https://www.nature.com/articles/nature25739/figures/1
# 1c
markers = {
    'PC': "Pdgfrb Cspg4 Anpep Rgs5 Cd248 Abcc9 Vtn S1pr3",
    'SMC': "Acta2 Tagln Myh11 Myl9 Mylk Sncg Cnn1 Pln",
Beispiel #3
0
In the first part of the assignment, you must identify which organism has the largest number of genomes in the assembly database.
You should use one of the edirect scripts provided on the AWS image to complete that task, as shown in the manual.

In the second part of the assignment, you must calculate the AVERAGE (mean) genome size of the genomes associated with “Prevotella buccalis”.

Hint: There is a command called countfasta.py that will likely help you with this step!
"""

import pandas as pd

from tcga.utils import download
from tcga.strings import lines
from urllib.parse import urlencode, quote

# Rebind `download` to the object returned by `download.to(...)` for the relative folder "cache/download".
download = download.to(rel_path="cache/download")


class param:
    # Container for the script's parameters; attributes are read directly from the class.

    # Genus names from the assignment's `genera.txt`: the text of the download is passed
    # through `lines(...)` and collected into a list when this class body executes.
    # NOTE(review): presumably `lines` yields one string per text line — confirm in `tcga.strings`.
    genera = list(
        lines(
            download(
                "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/"
                "Assignments/NCBIEDirectAssignment/genera.txt").now.text))

    class urls:
        # Base URL of the NCBI Datasets (v1alpha) "assembly_descriptors/organism" endpoint.
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"


for genus in param.genera:
    data = download(param.urls.descriptors + quote(genus) + "?" +
Beispiel #4
0
from itertools import chain
from collections import Counter

import pandas as pd

from tcga.utils import download

# Remote CSV files: expression matrix ('expr') and the accompanying metadata table ('meta').
URLS = {
    'expr': "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/matrix.csv",
    'meta': "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/metadata.csv",
}

# Downloads go to a folder named after this file (suffix removed).
out_dir = Path(__file__).with_suffix('')
download = download.to(abs_path=out_dir)

# Touch every remote file once by accessing `.now.meta`.
# Only the URLs are needed, so iterate over the values rather than
# unpacking keys that are never used.
# NOTE(review): presumably `.now` fetches the file unless it is already cached — confirm.
for url in URLS.values():
    download(url).now.meta

# with download(URLS['expr']).now.open() as fd:
#     df_expr_index = pd.read_csv(fd, sep=',', usecols=[0], index_col=0).index
#     assert (76533 == len(df_expr_index))
#
# with download(URLS['meta']).now.open() as fd:
#     df_meta_index = pd.read_csv(fd, sep=',', index_col=0).index
#     assert (df_expr_index.equals(df_meta_index[0:len(df_expr_index)]))

# Quick sanity check on the expression matrix: parse only the first 10 data rows.
with download(URLS['expr']).now.open() as fd:
    # First CSV column becomes the index; all remaining values are cast to int.
    df_expr = pd.read_csv(fd, sep=',', nrows=10, index_col=0).astype(int)
    # The preview must have exactly 50281 columns (the row count is whatever was read).
    assert (df_expr.shape == (len(df_expr), 50281))
Beispiel #5
0
(Initial draft)
"""

import io
import gzip
import numpy as np
import pandas as pd

from collections import Counter

from Bio import SeqIO
from tcga.utils import download
from tcga.utils import First
from tcga.strings import triplets

# Rebind `download` to the object returned by `download.to(...)` for this relative cache folder.
download = download.to(rel_path="../20200608-Downloads/cache")


class PARAM:
    # Container for script parameters.
    class DATA:
        # Gzipped FASTA of C. elegans (WBcel235) chromosome I, Ensembl release 100.
        # The `.now` attribute is evaluated when this class body executes.
        # NOTE(review): presumably that is where the file is fetched or read from cache — confirm.
        I = download(
            "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.chromosome.I.fa.gz"
        ).now


# Decompress the downloaded bytes in memory and parse them as a FASTA record.
with gzip.open(io.BytesIO(PARAM.DATA.I.bytes)) as fd:
    # Annotation only: declares the type of `rec` assigned on the next line.
    rec: SeqIO.SeqRecord
    # `gzip.open` is in its default binary mode here, so wrap the stream for text parsing.
    rec = SeqIO.read(io.TextIOWrapper(fd), format='fasta')

from tcga.codons import standard_rna as rna_to_aa
from tcga.complements import dna_to_rna, dna_to_dna
Beispiel #6
0
# Note:
# The values from the GSE "Series" are ~ those of 'expresso'
# (Goes from raw probe intensities to expression values):
# https://www.rdocumentation.org/packages/affy/versions/1.50.0/topics/expresso
# https://gist.github.com/numpde/772cd596fb5fe6036f7e29736bd1cf15

# Note:
# Potentially useful slides
# https://bioinformatics.mdanderson.org/MicroarrayCourse/Lectures/

import re, gzip
import pandas as pd
from tcga.utils import download

# Default download directory
download = download.to(rel_path="UV/download")

# GEO series matrix file for GSE60880 (gzipped, tab-separated text with '!'-prefixed header lines).
url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60880/matrix/GSE60880_series_matrix.txt.gz"
# The same binary handle is read twice below: once for the header, once for the table.
with download(url).now.open(mode='rb') as gz:
    gz.seek(0)
    with gzip.open(gz, mode='r') as fd:
        # For every line starting with "!sample_title" (case-insensitive), collect the
        # double-quoted tokens made of word characters and dots; `.pop()` keeps the list
        # from the last such line and raises IndexError if there is none.
        sample_title = [
            re.findall(r'"([.\w]+)"', line)
            for line in fd.read().decode().splitlines()
            if line.lower().startswith("!sample_title")
        ].pop()

    # Rewind: the gzip reader above has consumed the underlying stream.
    gz.seek(0)  # !
    # Parse the table itself: text from '!' onward is ignored, rows are indexed by 'ID_REF' and sorted.
    df_expr = pd.read_csv(gz, compression="gzip", comment='!', sep='\t', index_col='ID_REF').sort_index()

    # One extracted sample title per expression column.
    assert (len(sample_title) == len(df_expr.columns))
Beispiel #7
0
# RA, 2021-03-16

"""
This file only prepares the downloader for the
“Mouse Whole Cortex and Hippocampus 10x” dataset
from Allen Brain, 2020.
https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x
https://celltypes.brain-map.org/rnaseq/mouse_ctx-hip_10x

The full download (done by a_download.py)
takes a few hours and requires over 4GB local storage.

10x protocol:
https://portal.brain-map.org/atlases-and-data/rnaseq/protocols-mouse-cortex-and-hippocampus#single_cell_sorting
https://www.protocols.io/view/10xv2-rnaseq-sample-processing-ynxfvfn/abstract
"""

from tcga.utils import download
from bugs import *

# Rebind `download` to target "download_cache" inside a folder named after this file (suffix removed).
# NOTE(review): `Path` is not imported explicitly in the visible lines; presumably it comes
# from `from bugs import *` — confirm.
download = download.to(abs_path=(Path(__file__).with_suffix('') / "download_cache"))

# Remote CSV files of the dataset: expression matrix ('expr') and metadata table ('meta').
URLS = {
    'expr': "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/matrix.csv",
    'meta': "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/metadata.csv",
}
Beispiel #8
0
# RA, 2020-06-25

from pathlib import Path
from tempfile import gettempdir
from tcga.utils import download

# Keep downloaded files in a dedicated folder under the system temp directory
download = download.to(abs_path=Path(gettempdir()).joinpath("tcga_download_cache"))
print("Will download to:", download.local_folder)
# Will download to: /tmp/tcga_download_cache

# Fetch the lambda phage genome (FASTA) from ENA
data = download("https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1").again(False).now

# Metadata recorded for the download
print(data.meta)  # same as tcga.refs.annotations[data]
# {'source': 'https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1', 'datetime': '2020-06-25 07:18:52.065826+00:00'}

# Two short excerpts of the text content
print(data.text[:42], "...", data.text[330:350], "...")
# >ENA|J02459|J02459.1 Escherichia phage Lam ... CAGGGAATGCCCGTTCTGCG ...

# Location of the local copy
print(data.local_file)
# /tmp/tcga_download_cache/Z9tBKiJCqrfWuYy5BlgrA3zZAWav2CUd4xrPsya93Os=.zip

# Parsing the record is optional: it needs Biopython
try:
    from Bio import SeqIO
except ImportError:
    print("Need `biopython`")
else:
    with data.open(mode='r') as fd:
        print(SeqIO.read(fd, format='fasta'))
# ID: ENA|J02459|J02459.1
# RA, 2020-10-05

import io
from contextlib import ExitStack
from pathlib import Path
from tcga.utils import download

# Gzipped VCF inputs, keyed by cohort label.
URLS = {
    'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
    'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
}

# Download cache folder next to this file; `download` is rebound to target it.
CACHE = Path(__file__).parent / "download_cache"
download = download.to(abs_path=CACHE)

# Output folder next to this file, created if missing.
# NOTE(review): presumably it receives the `head` previews — confirm in the code below.
HEAD = Path(__file__).parent / "head"
HEAD.mkdir(parents=True, exist_ok=True)

# Number of datalines for the `head` preview of VCF
N = 1000

# Access `.now` for every URL up front; `data` keeps only the last result.
# NOTE(review): presumably this fetches each file into the cache — confirm in `tcga.utils`.
for url in URLS.values():
    data = download(url).now

for k in URLS:
    data = download(URLS[k]).now
    head = HEAD / Path(data.meta['source']).name

    with ExitStack() as stack:
        src = stack.enter_context(data.open(mode='rb'))
Beispiel #10
0
# Folder containing this file; other paths are built relative to it.
ROOT = Path(__file__).parent

# Remote data sources, keyed by a short label.
PARAM = {
    # Transcriptome of response to SARS-CoV-2 infection
    'GSE CoV2':
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE147507&format=file&file=GSE147507%5FRawReadCounts%5FHuman%2Etsv%2Egz",

    # "111 expression datasets profiling the transcriptional response to a ligand"
    # https://zenodo.org/record/3260758
    'txn_response_ref':
    "https://zenodo.org/record/3260758/files/expression_settings.rds?download=1",
    # OmniPath "intercell" endpoint
    'intercell': "https://omnipathdb.org/intercell",
}

# Setup default folder for downloads
download = download.to(abs_path=(ROOT / "UV/download"))

# Load the transcriptional response datasets
# Collected by NicheNet authors for optimization/validation
import tempfile
with tempfile.NamedTemporaryFile() as tf:
    # Stage the downloaded RDS bytes in a named temporary file so that R can open it by path.
    tf.write(download(PARAM['txn_response_ref']).now.bytes)
    # Push Python's write buffer to disk before R opens the file by name;
    # otherwise the tail of the file may be missing when `readRDS` reads it.
    tf.flush()

    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    # Turn on rpy2's pandas conversion before calling into R.
    pandas2ri.activate()
    # Read the staged file with R's `readRDS`.
    df = (robjects.r['readRDS'])(tf.name)
    # The reference collection is expected to hold 111 datasets.
    assert (len(df) == 111)
    # List the dataset names.
    for (k, v) in df.items():
        print(k)
        # print(v)