def download_test():
    import os
    import academictorrents as at
    at.get("test.torrent")
    R = os.path.isfile(
        "Introduction to the Special Topic on Grammar Induction, Representation of Language and Language Learning.pdf"
    )
    return R
def test_redownload_only_one_file(self):
    path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec')  # test torrent
    files = os.listdir(path)
    self.assertTrue(len(files) == 174)
    datastore = os.getcwd() + "/datastore/"
    os.remove(datastore + "/BreastCancerCell_dataset/ytma55_030603_benign2.TIF")
    files = os.listdir(path)
    self.assertTrue(len(files) == 173)
    path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec')  # test torrent
    files = os.listdir(path)
    self.assertTrue(len(files) == 174)
def test_redownload_only_one_file(self):
    path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec', use_timestamp=False)  # test torrent
    files = os.listdir(path)
    self.assertTrue(len(files) == 174)
    os.remove(path + "/ytma55_030603_benign2.TIF")
    files = os.listdir(path)
    self.assertTrue(len(files) == 173)
    path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec', use_timestamp=False)  # test torrent
    files = os.listdir(path)
    self.assertTrue(len(files) == 174)
def test_different_datastore(self):
    filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb',
                      datastore='~/.academictorrent-datastore/alt/',
                      use_timestamp=False)
    assert filename == os.path.expanduser('~/.academictorrent-datastore/alt/mnist.pkl.gz')
    self.assertTrue(os.path.isfile(filename))
def load_data(self):
    h5_file = h5py.File(at.get(self.at_hash_or_path))
    self.node_names = np.array(h5_file['gene_names'])
    self.df = pd.DataFrame(np.array(np.array(h5_file['graph_data']).astype('float32')))
    self.df.columns = self.node_names
    self.df.index = self.node_names
    self.nx_graph = nx.from_pandas_adjacency(self.df)
def load_data(self):
    self.nx_graph = nx.OrderedGraph(
        nx.readwrite.gpickle.read_gpickle(
            at.get(self.at_hash, datastore=self.datastore)))

    # Randomize
    if self.randomize:
        self.nx_graph = nx.relabel.relabel_nodes(self.nx_graph, randmap(self.nx_graph.nodes))
def load_data(self):
    # You could replace the value of self.at_hash_or_path with a path to a local
    # copy of your graph and AT can handle that.
    h5_file = h5py.File(at.get(self.at_hash_or_path))
    self.node_names = np.array(h5_file['gene_names'])
    self.df = pd.DataFrame(np.array(np.array(h5_file['graph_data']).astype('float32')))
    self.df.columns = self.node_names
    self.df.index = self.node_names
    self.nx_graph = nx.from_pandas_adjacency(self.df)
def test_urls(self):
    path = at.get("323a0048d87ca79b68f12a6350a57776b6a3b7fb",
                  urls=["http://host1.academictorrents.com/share/mnist.pkl.gz"],
                  use_timestamp=False)
    self.assertTrue(os.path.isfile(path))
    mnist = gzip.open(path, 'rb')
    train_set, validation_set, test_set = pickle.load(mnist)
    mnist.close()
def _prepare(self):
    self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop", default=False)
    cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
    self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
    self.datadir = os.path.join(self.root, "data")
    self.txt_filelist = os.path.join(self.root, "filelist.txt")
    self.expected_length = 50000
    if not edu.is_prepared(self.root):
        # prep
        self.logger.info("Preparing dataset {} in {}".format(self.NAME, self.root))

        datadir = self.datadir
        if not os.path.exists(datadir):
            path = os.path.join(self.root, self.FILES[0])
            if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]:
                import academictorrents as at
                atpath = at.get(self.AT_HASH, datastore=self.root)
                assert atpath == path

            self.logger.info("Extracting {} to {}".format(path, datadir))
            os.makedirs(datadir, exist_ok=True)
            with tarfile.open(path, "r:") as tar:
                tar.extractall(path=datadir)

            vspath = os.path.join(self.root, self.FILES[1])
            if not os.path.exists(vspath) or not os.path.getsize(vspath) == self.SIZES[1]:
                download(self.VS_URL, vspath)

            with open(vspath, "r") as f:
                synset_dict = f.read().splitlines()
                synset_dict = dict(line.split() for line in synset_dict)

            self.logger.info("Reorganizing into synset folders")
            synsets = np.unique(list(synset_dict.values()))
            for s in synsets:
                os.makedirs(os.path.join(datadir, s), exist_ok=True)
            for k, v in synset_dict.items():
                src = os.path.join(datadir, k)
                dst = os.path.join(datadir, v)
                shutil.move(src, dst)

        filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
        filelist = [os.path.relpath(p, start=datadir) for p in filelist]
        filelist = sorted(filelist)
        filelist = "\n".join(filelist) + "\n"
        with open(self.txt_filelist, "w") as f:
            f.write(filelist)

        edu.mark_prepared(self.root)
def load_data(self):
    savefile = os.path.join(self.datastore, "graphs", self.graph_name + ".adjlist.gz")
    if os.path.isfile(savefile):
        print(" loading from cache file " + savefile)
        self.nx_graph = nx.read_adjlist(savefile)
    else:
        self.nx_graph = nx.OrderedGraph(
            nx.readwrite.gpickle.read_gpickle(
                at.get(self.at_hash, datastore=self.datastore)))
        print(" writing graph")
        nx.write_adjlist(self.nx_graph, savefile)
def load_data(self):
    # You could replace the value of self.at_hash_or_path with a path to a local
    # copy of your graph and AT can handle that.
    self.file_path = at.get(self.at_hash_or_path)
    self.file = h5py.File(self.file_path, 'r')
    self.data = np.array(self.file['expression_data'][:self.nb_examples])
    self.nb_nodes = self.data.shape[1]
    self.labels = self.file['labels_data']
    self.sample_names = self.file['sample_names']
    self.node_names = np.array(self.file['gene_names'])
    self.df = pd.DataFrame(self.data)
    self.df.columns = self.node_names[:len(self.df.columns)]
    self.label_name = self.node_names[len(self.df.columns) + 1:]
    if self.labels.shape != self.labels[:].reshape(-1).shape:
        print("Converting one-hot labels to integers")
        self.labels = np.argmax(self.labels[:], axis=1)
def _prepare(self):
    self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop", default=True)
    cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
    self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
    self.datadir = os.path.join(self.root, "data")
    self.txt_filelist = os.path.join(self.root, "filelist.txt")
    self.expected_length = 1281167
    if not edu.is_prepared(self.root):
        # prep
        self.logger.info("Preparing dataset {} in {}".format(self.NAME, self.root))

        datadir = self.datadir
        if not os.path.exists(datadir):
            path = os.path.join(self.root, self.FILES[0])
            if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]:
                import academictorrents as at
                atpath = at.get(self.AT_HASH, datastore=self.root)
                assert atpath == path

            self.logger.info("Extracting {} to {}".format(path, datadir))
            os.makedirs(datadir, exist_ok=True)
            with tarfile.open(path, "r:") as tar:
                tar.extractall(path=datadir)

            self.logger.info("Extracting sub-tars.")
            subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
            for subpath in tqdm(subpaths):
                subdir = subpath[:-len(".tar")]
                os.makedirs(subdir, exist_ok=True)
                with tarfile.open(subpath, "r:") as tar:
                    tar.extractall(path=subdir)

        filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
        filelist = [os.path.relpath(p, start=datadir) for p in filelist]
        filelist = sorted(filelist)
        filelist = "\n".join(filelist) + "\n"
        with open(self.txt_filelist, "w") as f:
            f.write(filelist)

        edu.mark_prepared(self.root)
def load_data(self):
    csv_file = at.get(self.at_hash, datastore=self.datastore)
    hdf_file = csv_file.split(".gz")[0] + ".hdf5"
    if not os.path.isfile(hdf_file):
        print("We are converting a CSV dataset of TCGA to HDF5. Please wait a minute, "
              "this only happens the first time you use the TCGA dataset.")
        df = pd.read_csv(csv_file, compression="gzip", sep="\t")
        df = df.set_index('Sample')
        df = df.transpose()
        df.to_hdf(hdf_file, key="data", complevel=5)
    self.df = pd.read_hdf(hdf_file)
    self.df.rename(symbol_map(self.df.columns), axis="columns", inplace=True)
    self.df = self.df - self.df.mean(axis=0)
    # self.df = self.df / self.df.variance()
    self.sample_names = self.df.index.values.tolist()
    self.node_names = np.array(self.df.columns.values.tolist()).astype("str")
    self.nb_nodes = self.df.shape[1]
    self.labels = [0 for _ in range(self.df.shape[0])]
def get_nih_data_paths():
    '''Return the paths to the NIH images, the patient CSV file, and the bbox CSV file.'''
    path = at.get(data_hash_dict.get('NIH'))
    to_path = os.path.dirname(path)
    path_to_tar = os.path.join(path, "images-224.tar")
    with tarfile.open(path_to_tar, 'r') as tar:
        tar.extractall(path=to_path)
    x_ray_data_path = os.path.join(path, "Data_Entry_2017.csv")
    bbox_data_path = os.path.join(path, "BBox_List_2017.csv")
    image_path = os.path.join(to_path, "images-224")
    return image_path, x_ray_data_path, bbox_data_path
def download(self, chunksize=100):
    try:
        import gzip
        import shutil
        import pandas as pd
        from six.moves import urllib
        import academictorrents as at
    except ImportError as exception:
        raise ImportError('{0}. To use the TCGA dataset, you need to '
                          'install the necessary dependencies with '
                          '`pip install torchmeta[tcga]`.'.format(exception))

    clinical_matrices_folder = os.path.join(self.root, 'clinicalMatrices')
    if not os.path.exists(clinical_matrices_folder):
        os.makedirs(clinical_matrices_folder)

    for cancer in self.cancers:
        filename = self.clinical_matrix_filename.format(cancer)
        rawpath = os.path.join(clinical_matrices_folder, '{0}.gz'.format(filename))
        filepath = os.path.join(clinical_matrices_folder, '{0}.tsv'.format(filename))

        if os.path.isfile(filepath):
            continue

        if not os.path.exists(rawpath):
            print('Downloading `{0}.gz`...'.format(filename))
            url = self.clinical_matrix_url.format(cancer)
            urllib.request.urlretrieve(url, rawpath)

        print('Extracting `{0}.gz`...'.format(filename))
        with gzip.open(rawpath, 'rb') as gzf:
            with open(filepath, 'wb') as f:
                shutil.copyfileobj(gzf, f)

    gene_expression_file = os.path.join(self.root, self.gene_expression_filename)
    if not os.path.isfile(gene_expression_file):
        from tqdm import tqdm
        print('Downloading `{0}` using `academictorrents`...'.format(
            self.gene_expression_filename))
        csv_file = at.get(self.gene_expression_torrent, datastore=self.root)
        print('Downloaded to: `{0}`'.format(csv_file))

        print('Converting TCGA CSV dataset to HDF5. This may take a while, '
              'but only happens on the first run.')
        reader = pd.read_csv(csv_file, compression='gzip', sep='\t',
                             header=0, index_col=0, chunksize=chunksize)
        shape = (10459, 20530)

        with tqdm(total=shape[1]) as pbar:
            with h5py.File(gene_expression_file, 'w') as f:
                dataset = f.create_dataset('expression_data', shape=shape, dtype='f4')
                gene_ids = []
                for idx, chunk in enumerate(reader):
                    slice_ = slice(idx * chunksize, (idx + 1) * chunksize)
                    dataset[:, slice_] = chunk.T
                    gene_ids.extend(chunk.index)
                    pbar.update(chunk.shape[0])
                all_sample_ids = chunk.columns.tolist()

        gene_ids_file = os.path.join(self.root, 'gene_ids.json')
        with open(gene_ids_file, 'w') as f:
            json.dump(gene_ids, f)

        all_sample_ids_file = os.path.join(self.root, 'all_sample_ids.json')
        with open(all_sample_ids_file, 'w') as f:
            json.dump(all_sample_ids, f)

        if os.path.isfile(csv_file):
            os.remove(csv_file)

        print('Done')

    self._process_clinical_matrices()

    # Create label files
    for split in ['train', 'val', 'test']:
        filename = os.path.join(self.root, self.filename_tasks.format(split))
        data = get_asset(self.folder, '{0}.json'.format(split), dtype='json')
        with open(filename, 'w') as f:
            labels = sorted([key.split('|', 1) for key in data])
            json.dump(labels, f)

    # Clean up
    for cancer in self.cancers:
        filename = self.clinical_matrix_filename.format(cancer)
        rawpath = os.path.join(clinical_matrices_folder, '{0}.gz'.format(filename))
        if os.path.isfile(rawpath):
            os.remove(rawpath)
def test_get_single_file(self):
    filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb', use_timestamp=False)
    self.assertTrue(os.path.isfile(filename))
def test_get_file_http(self):
    filename = at.get('55a8925a8d546b9ca47d309ab438b91f7959e77f')
    self.assertTrue(os.path.isfile(filename))
def test_get_multiple_files(self):
    path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec', use_timestamp=False)
    files = os.listdir(path)
    self.assertTrue(len(files) == 174)
def test_get_file_http(self):
    filename = at.get('55a8925a8d546b9ca47d309ab438b91f7959e77f', use_timestamp=False)
    self.assertTrue(os.path.isfile(filename))
    time.sleep(3)
def _download(data_dir, cancers):
    import academictorrents as at
    from six.moves import urllib
    import gzip

    # download files
    try:
        os.makedirs(os.path.join(data_dir, 'clinicalMatrices'))
    except OSError as e:
        if e.errno == 17:
            pass
        else:
            raise

    for cancer in cancers:
        filename = '{}_clinicalMatrix'.format(cancer)
        file_path = os.path.join(data_dir, 'clinicalMatrices', filename)
        decompressed_file_path = file_path.replace('.gz', '')

        if os.path.isfile(file_path):
            continue

        file_path += '.gz'
        url = 'https://tcga.xenahubs.net/download/TCGA.{}.sampleMap/{}_clinicalMatrix.gz'.format(
            cancer, cancer)
        print('Downloading ' + url)

        data = urllib.request.urlopen(url)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        with open(decompressed_file_path, 'wb') as out_f, gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

        if os.stat(decompressed_file_path).st_size == 0:
            os.remove(decompressed_file_path)
            error = IOError('Downloading {} from {} failed.'.format(filename, url))
            error.strerror = 'Downloading {} from {} failed.'.format(filename, url)
            error.errno = 5
            error.filename = decompressed_file_path
            raise error

    hdf_file = os.path.join(data_dir, "TCGA_HiSeqV2.hdf5")
    # csv_file = os.path.join(data_dir, 'HiSeqV2.gz')
    gene_ids_file = os.path.join(data_dir, 'gene_ids')
    all_sample_ids_file = os.path.join(data_dir, 'all_sample_ids')

    print('Downloading or checking for TCGA_HiSeqV2 using Academic Torrents')
    csv_file = at.get("e4081b995625f9fc599ad860138acf7b6eb1cf6f", datastore=data_dir)

    if not os.path.isfile(hdf_file) and os.path.isfile(csv_file):
        print("Downloaded to: " + csv_file)
        print("Converting TCGA CSV dataset to HDF5. This only happens on first run.")
        df = pd.read_csv(csv_file, compression="gzip", sep="\t")
        df = df.set_index('Sample')
        df = df.transpose()

        gene_ids = df.columns.values.tolist()
        all_sample_ids = df.index.values.tolist()

        with open(gene_ids_file, "w") as text_file:
            for gene_id in gene_ids:
                text_file.write('{}\n'.format(gene_id))

        with open(all_sample_ids_file, "w") as text_file:
            for sample_id in all_sample_ids:
                text_file.write('{}\n'.format(sample_id))

        f = h5py.File(hdf_file, "w")
        f.create_dataset("dataset", data=df.values, compression="gzip")
        f.close()
print "About to import the library" import academictorrents as at print "About to start a download" filename = at.get("323a0048d87ca79b68f12a6350a57776b6a3b7fb") print "About to open the file" import cPickle, gzip import sys, os, time import numpy as np mnist = gzip.open(filename, 'rb') train_set, validation_set, test_set = cPickle.load(mnist) mnist.close()
import academictorrents as at

at.get("85a5bd50e4c365f8df70240ffd4ecc7dec59912b")
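A minimal follow-up sketch, assuming only that at.get() returns the local path of the download, as the tests elsewhere on this page rely on; the hash is the same one used in the snippet above:

import os
import academictorrents as at

# at.get() downloads (or reuses) the data and returns its local path
path = at.get("85a5bd50e4c365f8df70240ffd4ecc7dec59912b")
print(path)
print(os.path.exists(path))  # the returned path points at the downloaded file or directory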
def test_find_downloaded_torrent(self):
    filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb')
    self.assertTrue(os.path.isfile(filename))
def load_data(self):
    self.nx_graph = nx.OrderedGraph(
        nx.readwrite.gpickle.read_gpickle(
            at.get(self.at_hash, datastore=self.datastore)))
import academictorrents as at
import sys
import argparse

parser = argparse.ArgumentParser(description='AT simple command-line tool')
parser.add_argument('-hash', type=str, nargs='?', required=True,
                    help='Hash of the torrent to download')
parser.add_argument('-name', type=str, nargs='?', default=None,
                    help='Name of the subfolder for the file')
parser.add_argument('-datastore', type=str, nargs='?', default=".",
                    help='Location in which to place the files')
args = parser.parse_args()

filename = at.get(args.hash, datastore=args.datastore, name=args.name)
print("Done")
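A hypothetical invocation of the command-line tool above; the script name at_cli.py is a placeholder, and the hash is the MNIST example used elsewhere on this page:

# python at_cli.py -hash 323a0048d87ca79b68f12a6350a57776b6a3b7fb -datastore ./datastore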
def test_different_datastore(self):
    filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb',
                      datastore=os.getcwd() + '/datastore/alt/')
    assert filename == os.getcwd() + '/datastore/alt/mnist.pkl.gz'
    self.assertTrue(os.path.isfile(filename))