def test_fetch_subdirs():
    path = fetch_file(URL, decompress=True, subdir="datacache")
    assert path.endswith(FASTA_FILENAME)
    # if we change the subdir then data should end up in
    # something like /Users/me/Library/Caches/epitopes_test/
    other_path = fetch_file(URL, decompress=True, subdir="datacache_test")
    assert other_path.endswith(FASTA_FILENAME)
    assert other_path != path, other_path
def test_fetch_decompress():
    path1 = fetch_file(URL, decompress=True, subdir="datacache")
    assert path1.endswith(FASTA_FILENAME)
    with open(path1, 'r') as f1:
        s1 = f1.read()
    assert "TCAATTTCGTGCCAG" in s1
def test_fetch_decompress():
    for use_wget_if_available in [True, False]:
        for timeout in [None, 10**6]:
            path1 = fetch_file(
                URL,
                decompress=True,
                subdir="datacache",
                use_wget_if_available=use_wget_if_available,
                timeout=timeout)
            assert path1.endswith(FASTA_FILENAME)
            with open(path1, 'r') as f1:
                s1 = f1.read()
            assert "TCAATTTCGTGCCAG" in s1
def transcript_id_to_transcript_name(transcript_id, _table_cache=[None]):
    # the mutable default argument is deliberate: it persists across
    # calls, so the BioMart table is downloaded and parsed only once
    if _table_cache[0] is None:
        print("Fetching Ensembl ID mappings from BioMart %s" %
              _BIOMART_URL_TRANSCRIPT_ID_TO_TRANSCRIPT_NAME)
        biomart_filename = datacache.fetch_file(
            _BIOMART_URL_TRANSCRIPT_ID_TO_TRANSCRIPT_NAME,
            "biomart_transcript_name.tsv")
        df = pd.read_csv(biomart_filename, sep='\t')
        transcript_ids = df['Ensembl Transcript ID']
        transcript_names = df['Associated Transcript Name']
        mapping = dict(zip(transcript_ids, transcript_names))
        _table_cache[0] = mapping
    mapping = _table_cache[0]
    return mapping[transcript_id]
def transcript_id_to_gene_id(transcript_id, _table_cache=[None]):
    if _table_cache[0] is None:
        print("Fetching Ensembl ID mappings from BioMart %s" %
              _BIOMART_URL_TRANSCRIPT_ID_TO_GENE_ID)
        biomart_filename = datacache.fetch_file(
            _BIOMART_URL_TRANSCRIPT_ID_TO_GENE_ID,
            "biomart_transcript_gene.tsv")
        df = pd.read_csv(biomart_filename, sep='\t')
        gene_ids = df['Ensembl Gene ID']
        transcript_ids = df['Ensembl Transcript ID']
        mapping = dict(zip(transcript_ids, gene_ids))
        _table_cache[0] = mapping
    mapping = _table_cache[0]
    return mapping[transcript_id]
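# The two BioMart lookups above differ only in their URL, cached
# filename, and the pair of columns they join. A minimal sketch of how
# the shared logic could be factored out; the helper name
# `_biomart_mapping` and its parameters are illustrative, not part of
# the original module. Like `_table_cache=[None]` above, it relies on a
# mutable default argument surviving across calls, so each TSV is
# downloaded and parsed at most once per process.
def _biomart_mapping(url, filename, key_column, value_column, _cache={}):
    if url not in _cache:
        biomart_filename = datacache.fetch_file(url, filename)
        df = pd.read_csv(biomart_filename, sep='\t')
        _cache[url] = dict(zip(df[key_column], df[value_column]))
    return _cache[url]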
import os
import subprocess

def make_blastdb(url, name=None, filename=None, overwrite=False):
    """Download protein sequences and make a BLAST database from them.

    Uses the datacache module."""
    import datacache
    cachedir = datacache.get_data_dir()
    blastdb = os.path.join(cachedir, name)
    # reuse an existing database unless the caller asks for a rebuild
    if os.path.exists(blastdb + '.phr') and not overwrite:
        return blastdb
    filename = datacache.fetch_file(
        url, filename=filename, decompress=True, subdir=None)
    cmd = 'makeblastdb -dbtype prot -in %s -out %s' % (filename, blastdb)
    subprocess.check_output(cmd, shell=True)
    return blastdb
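# A hypothetical invocation of make_blastdb, sketched under two
# assumptions not in the original code: the URL is a placeholder, and
# the `makeblastdb` executable from NCBI BLAST+ is on PATH.
def example_make_blastdb():
    db_prefix = make_blastdb(
        "https://example.org/proteomes/yeast.fasta.gz",  # placeholder URL
        name="yeast_proteins")
    # `db_prefix` is the path prefix you would pass to blastp as -db
    return db_prefix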
import gzip

def fetch_fasta_dict(path_or_url):
    """Download (if needed) a FASTA file and parse it into a dict
    mapping sequence name to sequence."""
    path = fetch_file(path_or_url)
    d = {}
    value_buffer = []
    key = None
    # transparently handle gzipped FASTA files
    if path.endswith(".gz") or path.endswith(".gzip"):
        f = gzip.open(path, "r")
    else:
        f = open(path, "r")
    for line in f.readlines():
        # gzip.open returns bytes, plain open returns str
        if type(line) is bytes:
            line = line.decode("ascii")
        if line.startswith(">"):
            # a new header line: flush the previous record
            if key is not None:
                d[key] = "".join(value_buffer)
                value_buffer = []
            # the name is the first whitespace-delimited token after ">"
            key = line.split()[0][1:]
        else:
            value_buffer.append(line.strip())
    if key and value_buffer:
        d[key] = "".join(value_buffer)
    f.close()
    return d
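# A hypothetical invocation of fetch_fasta_dict (the URL is a
# placeholder): the returned dict maps each record's name to its full
# sequence, with gzipped input handled transparently.
def example_fetch_fasta_dict():
    seqs = fetch_fasta_dict("https://example.org/data/proteins.fa.gz")
    for name, sequence in seqs.items():
        print(name, len(sequence))
    return seqs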
def fetch_file(url, decompress=True):
    return datacache.fetch_file(url, decompress=decompress, subdir="immuno")
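# A hypothetical call through the wrapper above (placeholder URL): the
# file is downloaded once, decompressed, cached under the "immuno"
# subdirectory of datacache's cache directory, and the same local path
# is returned on subsequent calls.
def example_fetch_file():
    return fetch_file("https://example.org/data/iedb_tcell.csv.gz")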