Example #1
def download_test():
    import os
    import academictorrents as at
    at.get("test.torrent")
    return os.path.isfile(
        "Introduction to the Special Topic on Grammar Induction, Representation of Language and Language Learning.pdf"
    )
Example #2
 def test_redownload_only_one_file(self):
     path = at.get(
         'b79869ca12787166de88311ca1f28e3ebec12dec')  # test torrent
     files = os.listdir(path)
     self.assertTrue(len(files) == 174)
     datastore = os.path.join(os.getcwd(), "datastore")
     os.remove(os.path.join(
         datastore, "BreastCancerCell_dataset", "ytma55_030603_benign2.TIF"))
     files = os.listdir(path)
     self.assertTrue(len(files) == 173)
     path = at.get(
         'b79869ca12787166de88311ca1f28e3ebec12dec')  # test torrent
     files = os.listdir(path)
     self.assertTrue(len(files) == 174)
Example #3
    def test_redownload_only_one_file(self):
        path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec',
                      use_timestamp=False)  # test torrent
        files = os.listdir(path)
        self.assertTrue(len(files) == 174)
        os.remove(path + "/ytma55_030603_benign2.TIF")

        files = os.listdir(path)
        self.assertTrue(len(files) == 173)

        path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec',
                      use_timestamp=False)  # test torrent
        files = os.listdir(path)
        self.assertTrue(len(files) == 174)
Example #4
 def test_different_datastore(self):
     filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb',
                       datastore='~/.academictorrent-datastore/alt/',
                       use_timestamp=False)
     assert filename == os.path.expanduser(
         '~/.academictorrent-datastore/alt/mnist.pkl.gz')
     self.assertTrue(os.path.isfile(filename))
Example #5
 def load_data(self):
     h5_file = h5py.File(at.get(self.at_hash_or_path), 'r')
     self.node_names = np.array(h5_file['gene_names'])
     self.df = pd.DataFrame(np.array(h5_file['graph_data']).astype('float32'))
     self.df.columns = self.node_names
     self.df.index = self.node_names
     self.nx_graph = nx.from_pandas_adjacency(self.df)
Example #6
 def load_data(self):
     self.nx_graph = nx.OrderedGraph(
         nx.readwrite.gpickle.read_gpickle(
             at.get(self.at_hash, datastore=self.datastore)))
     # Randomize
     if self.randomize:
         self.nx_graph = nx.relabel.relabel_nodes(
             self.nx_graph, randmap(self.nx_graph.nodes))
Example #7
 def load_data(self):
     # You can set self.at_hash_or_path to either an Academic Torrents hash
     # or a path to a local copy of your graph; at.get() handles both.
     h5_file = h5py.File(at.get(self.at_hash_or_path), 'r')
     self.node_names = np.array(h5_file['gene_names'])
     self.df = pd.DataFrame(np.array(h5_file['graph_data']).astype('float32'))
     self.df.columns = self.node_names
     self.df.index = self.node_names
     self.nx_graph = nx.from_pandas_adjacency(self.df)
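As the comment above notes, at.get() accepts either an Academic Torrents hash or a path to an existing local file. A minimal sketch of both call forms (the hash is the MNIST torrent used elsewhere on this page; the local path is a hypothetical placeholder):

import academictorrents as at

# Fetch by hash: downloads the torrent, or reuses a cached copy.
path = at.get("323a0048d87ca79b68f12a6350a57776b6a3b7fb")

# Pass a local path instead: per the comment above, at.get() resolves to
# the existing file rather than downloading anything.
path = at.get("/path/to/local/copy/graph.hdf5")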
Example #8
 def test_urls(self):
     path = at.get(
         "323a0048d87ca79b68f12a6350a57776b6a3b7fb",
         urls=["http://host1.academictorrents.com/share/mnist.pkl.gz"],
         use_timestamp=False)
     self.assertTrue(os.path.isfile(path))
     mnist = gzip.open(path, 'rb')
     train_set, validation_set, test_set = pickle.load(mnist, encoding='latin1')
     mnist.close()
Example #9
    def _prepare(self):
        self.random_crop = retrieve(self.config,
                                    "ImageNetValidation/random_crop",
                                    default=False)
        cachedir = os.environ.get("XDG_CACHE_HOME",
                                  os.path.expanduser("~/.cache"))
        self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 50000
        if not edu.is_prepared(self.root):
            # prep
            self.logger.info("Preparing dataset {} in {}".format(
                self.NAME, self.root))

            datadir = self.datadir
            if not os.path.exists(datadir):
                path = os.path.join(self.root, self.FILES[0])
                if not os.path.exists(path) or not os.path.getsize(
                        path) == self.SIZES[0]:
                    import academictorrents as at
                    atpath = at.get(self.AT_HASH, datastore=self.root)
                    assert atpath == path

                self.logger.info("Extracting {} to {}".format(path, datadir))
                os.makedirs(datadir, exist_ok=True)
                with tarfile.open(path, "r:") as tar:
                    tar.extractall(path=datadir)

                vspath = os.path.join(self.root, self.FILES[1])
                if not os.path.exists(vspath) or not os.path.getsize(
                        vspath) == self.SIZES[1]:
                    download(self.VS_URL, vspath)

                with open(vspath, "r") as f:
                    synset_dict = f.read().splitlines()
                    synset_dict = dict(line.split() for line in synset_dict)

                self.logger.info("Reorganizing into synset folders")
                synsets = np.unique(list(synset_dict.values()))
                for s in synsets:
                    os.makedirs(os.path.join(datadir, s), exist_ok=True)
                for k, v in synset_dict.items():
                    src = os.path.join(datadir, k)
                    dst = os.path.join(datadir, v)
                    shutil.move(src, dst)

            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
            filelist = sorted(filelist)
            filelist = "\n".join(filelist) + "\n"
            with open(self.txt_filelist, "w") as f:
                f.write(filelist)

            edu.mark_prepared(self.root)
Example #10
 def load_data(self):
     savefile = os.path.join(self.datastore, "graphs", self.graph_name + ".adjlist.gz")
     if os.path.isfile(savefile):
         print("Loading graph from cache file " + savefile)
         self.nx_graph = nx.read_adjlist(savefile)
     else:
         self.nx_graph = nx.OrderedGraph(
             nx.readwrite.gpickle.read_gpickle(at.get(self.at_hash, datastore=self.datastore)))
         print("Writing graph to " + savefile)
         nx.write_adjlist(self.nx_graph, savefile)
Example #11
    def load_data(self):
        # You could replace the value of self.at_hash_or_path with a path to a local copy of your graph and AT can handle that.
        self.file_path = at.get(self.at_hash_or_path)
        self.file = h5py.File(self.file_path, 'r')
        self.data = np.array(self.file['expression_data'][:self.nb_examples])
        self.nb_nodes = self.data.shape[1]
        self.labels = self.file['labels_data']
        self.sample_names = self.file['sample_names']
        self.node_names = np.array(self.file['gene_names'])
        self.df = pd.DataFrame(self.data)
        self.df.columns = self.node_names[:len(self.df.columns)]
        self.label_name = self.node_names[len(self.df.columns) + 1:]

        if self.labels.shape != self.labels[:].reshape(-1).shape:
            print("Converting one-hot labels to integers")
            self.labels = np.argmax(self.labels[:], axis=1)
Example #12
    def _prepare(self):
        self.random_crop = retrieve(self.config,
                                    "ImageNetTrain/random_crop",
                                    default=True)
        cachedir = os.environ.get("XDG_CACHE_HOME",
                                  os.path.expanduser("~/.cache"))
        self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 1281167
        if not edu.is_prepared(self.root):
            # prep
            self.logger.info("Preparing dataset {} in {}".format(
                self.NAME, self.root))

            datadir = self.datadir
            if not os.path.exists(datadir):
                path = os.path.join(self.root, self.FILES[0])
                if not os.path.exists(path) or not os.path.getsize(
                        path) == self.SIZES[0]:
                    import academictorrents as at
                    atpath = at.get(self.AT_HASH, datastore=self.root)
                    assert atpath == path

                self.logger.info("Extracting {} to {}".format(path, datadir))
                os.makedirs(datadir, exist_ok=True)
                with tarfile.open(path, "r:") as tar:
                    tar.extractall(path=datadir)

                self.logger.info("Extracting sub-tars.")
                subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
                for subpath in tqdm(subpaths):
                    subdir = subpath[:-len(".tar")]
                    os.makedirs(subdir, exist_ok=True)
                    with tarfile.open(subpath, "r:") as tar:
                        tar.extractall(path=subdir)

            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
            filelist = sorted(filelist)
            filelist = "\n".join(filelist) + "\n"
            with open(self.txt_filelist, "w") as f:
                f.write(filelist)

            edu.mark_prepared(self.root)
Example #13
 def load_data(self):
     csv_file = at.get(self.at_hash, datastore=self.datastore)
     hdf_file = csv_file.split(".gz")[0] + ".hdf5"
     if not os.path.isfile(hdf_file):
         print("We are converting a CSV dataset of TCGA to HDF5. Please wait a minute, this only happens the first "
               "time you use the TCGA dataset.")
         df = pd.read_csv(csv_file, compression="gzip", sep="\t")
         df = df.set_index('Sample')
         df = df.transpose()
         df.to_hdf(hdf_file, key="data", complevel=5)
     self.df = pd.read_hdf(hdf_file)
     self.df.rename(symbol_map(self.df.columns), axis="columns", inplace=True)
     self.df = self.df - self.df.mean(axis=0)
     #self.df = self.df / self.df.variance()
     self.sample_names = self.df.index.values.tolist()
     self.node_names = np.array(self.df.columns.values.tolist()).astype("str")
     self.nb_nodes = self.df.shape[1]
     self.labels = [0 for _ in range(self.df.shape[0])]
Example #14
def get_nih_data_paths():
    '''
    Returns the paths to the NIH images, the patient CSV file,
    and the bounding-box CSV file.
    '''
    path = at.get(data_hash_dict.get('NIH'))
    to_path = os.path.dirname(path)
    path_to_tar = os.path.join(path, "images-224.tar")

    # The with-block closes the archive; no explicit tar.close() is needed.
    with tarfile.open(path_to_tar, 'r') as tar:
        tar.extractall(path=to_path)

    x_ray_data_path = os.path.join(path, "Data_Entry_2017.csv")
    bbox_data_path = os.path.join(path, "BBox_List_2017.csv")
    image_path = os.path.join(to_path, "images-224")

    return image_path, x_ray_data_path, bbox_data_path
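A minimal usage sketch for the helper above (data_hash_dict is assumed to be a module-level mapping from dataset names to Academic Torrents hashes, as the call inside the function implies):

image_path, x_ray_data_path, bbox_data_path = get_nih_data_paths()
print("NIH images extracted to: " + image_path)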
Example #15
    def download(self, chunksize=100):
        try:
            import gzip
            import shutil
            import pandas as pd
            from six.moves import urllib
            import academictorrents as at
        except ImportError as exception:
            # Exceptions have no `.message` attribute in Python 3; format
            # the exception object itself instead.
            raise ImportError('{0}. To use the TCGA dataset, you need to '
                              'install the necessary dependencies with '
                              '`pip install torchmeta[tcga]`.'.format(exception))

        clinical_matrices_folder = os.path.join(self.root, 'clinicalMatrices')
        if not os.path.exists(clinical_matrices_folder):
            os.makedirs(clinical_matrices_folder)

        for cancer in self.cancers:
            filename = self.clinical_matrix_filename.format(cancer)
            rawpath = os.path.join(clinical_matrices_folder,
                                   '{0}.gz'.format(filename))
            filepath = os.path.join(clinical_matrices_folder,
                                    '{0}.tsv'.format(filename))

            if os.path.isfile(filepath):
                continue

            if not os.path.exists(rawpath):
                print('Downloading `{0}.gz`...'.format(filename))
                url = self.clinical_matrix_url.format(cancer)
                urllib.request.urlretrieve(url, rawpath)

            print('Extracting `{0}.gz`...'.format(filename))
            with gzip.open(rawpath, 'rb') as gzf:
                with open(filepath, 'wb') as f:
                    shutil.copyfileobj(gzf, f)

        gene_expression_file = os.path.join(self.root,
                                            self.gene_expression_filename)
        if not os.path.isfile(gene_expression_file):
            from tqdm import tqdm
            print('Downloading `{0}` using `academictorrents`...'.format(
                self.gene_expression_filename))
            csv_file = at.get(self.gene_expression_torrent,
                              datastore=self.root)
            print('Downloaded to: `{0}`'.format(csv_file))

            print(
                'Converting TCGA CSV dataset to HDF5. This may take a while, '
                'but only happens on the first run.')
            reader = pd.read_csv(csv_file,
                                 compression='gzip',
                                 sep='\t',
                                 header=0,
                                 index_col=0,
                                 chunksize=chunksize)
            shape = (10459, 20530)

            with tqdm(total=shape[1]) as pbar:
                with h5py.File(gene_expression_file, 'w') as f:
                    dataset = f.create_dataset('expression_data',
                                               shape=shape,
                                               dtype='f4')
                    gene_ids = []
                    for idx, chunk in enumerate(reader):
                        slice_ = slice(idx * chunksize, (idx + 1) * chunksize)
                        dataset[:, slice_] = chunk.T
                        gene_ids.extend(chunk.index)
                        pbar.update(chunk.shape[0])
                    all_sample_ids = chunk.columns.tolist()

            gene_ids_file = os.path.join(self.root, 'gene_ids.json')
            with open(gene_ids_file, 'w') as f:
                json.dump(gene_ids, f)

            all_sample_ids_file = os.path.join(self.root,
                                               'all_sample_ids.json')
            with open(all_sample_ids_file, 'w') as f:
                json.dump(all_sample_ids, f)

            if os.path.isfile(csv_file):
                os.remove(csv_file)

            print('Done')

        self._process_clinical_matrices()

        # Create label files
        for split in ['train', 'val', 'test']:
            filename = os.path.join(self.root,
                                    self.filename_tasks.format(split))
            data = get_asset(self.folder,
                             '{0}.json'.format(split),
                             dtype='json')

            with open(filename, 'w') as f:
                labels = sorted([key.split('|', 1) for key in data])
                json.dump(labels, f)

        # Clean up
        for cancer in self.cancers:
            filename = self.clinical_matrix_filename.format(cancer)
            rawpath = os.path.join(clinical_matrices_folder,
                                   '{0}.gz'.format(filename))
            if os.path.isfile(rawpath):
                os.remove(rawpath)
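A note on the conversion step above: reading the CSV with chunksize and writing each transposed chunk into a column slice of the HDF5 dataset keeps peak memory proportional to a single chunk rather than the full 10459 x 20530 expression matrix.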
Example #16
 def test_get_single_file(self):
     filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb',
                       use_timestamp=False)
     self.assertTrue(os.path.isfile(filename))
Example #17
 def test_get_file_http(self):
     filename = at.get('55a8925a8d546b9ca47d309ab438b91f7959e77f')
     self.assertTrue(os.path.isfile(filename))
Example #18
 def test_get_multiple_files(self):
     path = at.get('b79869ca12787166de88311ca1f28e3ebec12dec',
                   use_timestamp=False)
     files = os.listdir(path)
     self.assertTrue(len(files) == 174)
Example #19
 def test_get_file_http(self):
     filename = at.get('55a8925a8d546b9ca47d309ab438b91f7959e77f',
                       use_timestamp=False)
     self.assertTrue(os.path.isfile(filename))
     time.sleep(3)
Example #20
def _download(data_dir, cancers):
    import academictorrents as at
    from six.moves import urllib
    import gzip

    # download files
    try:
        os.makedirs(os.path.join(data_dir, 'clinicalMatrices'))
    except OSError as e:
        if e.errno == 17:
            pass
        else:
            raise

    for cancer in cancers:
        filename = '{}_clinicalMatrix'.format(cancer)
        file_path = os.path.join(data_dir, 'clinicalMatrices', filename)
        decompressed_file_path = file_path.replace('.gz', '')

        if os.path.isfile(file_path):
            continue

        file_path += '.gz'

        url = 'https://tcga.xenahubs.net/download/TCGA.{}.sampleMap/{}_clinicalMatrix.gz'.format(
            cancer, cancer)

        print('Downloading ' + url)
        data = urllib.request.urlopen(url)

        with open(file_path, 'wb') as f:
            f.write(data.read())
        with open(decompressed_file_path,
                  'wb') as out_f, gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

        if os.stat(decompressed_file_path).st_size == 0:
            os.remove(decompressed_file_path)
            error = IOError('Downloading {} from {} failed.'.format(
                filename, url))
            error.strerror = 'Downloading {} from {} failed.'.format(
                filename, url)
            error.errno = 5
            error.filename = decompressed_file_path
            raise error

    hdf_file = os.path.join(data_dir, "TCGA_HiSeqV2.hdf5")
    #csv_file = os.path.join(data_dir, 'HiSeqV2.gz')
    gene_ids_file = os.path.join(data_dir, 'gene_ids')
    all_sample_ids_file = os.path.join(data_dir, 'all_sample_ids')

    print('Downloading or checking for TCGA_HiSeqV2 using Academic Torrents')
    csv_file = at.get("e4081b995625f9fc599ad860138acf7b6eb1cf6f",
                      datastore=data_dir)
    if not os.path.isfile(hdf_file) and os.path.isfile(csv_file):
        print("Downloaded to: " + csv_file)
        print(
            "Converting TCGA CSV dataset to HDF5. This only happens on first run."
        )
        df = pd.read_csv(csv_file, compression="gzip", sep="\t")
        df = df.set_index('Sample')
        df = df.transpose()
        gene_ids = df.columns.values.tolist()
        all_sample_ids = df.index.values.tolist()
        with open(gene_ids_file, "w") as text_file:
            for gene_id in gene_ids:
                text_file.write('{}\n'.format(gene_id))
        with open(all_sample_ids_file, "w") as text_file:
            for sample_id in all_sample_ids:
                text_file.write('{}\n'.format(sample_id))

        f = h5py.File(hdf_file, "w")
        f.create_dataset("dataset", data=df.values, compression="gzip")
        f.close()
Example #21
print "About to import the library"

import academictorrents as at

print "About to start a download"

filename = at.get("323a0048d87ca79b68f12a6350a57776b6a3b7fb")

print "About to open the file"

import cPickle, gzip
import sys, os, time
import numpy as np

mnist = gzip.open(filename, 'rb')
train_set, validation_set, test_set = cPickle.load(mnist)
mnist.close()
Example #22
import academictorrents as at
at.get("85a5bd50e4c365f8df70240ffd4ecc7dec59912b")
Example #23
 def test_find_downloaded_torrent(self):
     filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb')
     self.assertTrue(os.path.isfile(filename))
Example #24
 def load_data(self):
     self.nx_graph = nx.OrderedGraph(
         nx.readwrite.gpickle.read_gpickle(
             at.get(self.at_hash, datastore=self.datastore)))
Example #25
import academictorrents as at
import sys
import argparse

parser = argparse.ArgumentParser(description='AT Simple command line tool')
parser.add_argument('-hash',
                    type=str,
                    nargs='?',
                    required=True,
                    help='Hash of torrent to download')
parser.add_argument('-name',
                    type=str,
                    nargs='?',
                    default=None,
                    help='Name of subfolder for file')
parser.add_argument('-datastore',
                    type=str,
                    nargs='?',
                    default=".",
                    help='Location which to place the files')
args = parser.parse_args()

filename = at.get(args.hash, datastore=args.datastore, name=args.name)

print("Done")
Example #26
 def test_different_datastore(self):
     filename = at.get('323a0048d87ca79b68f12a6350a57776b6a3b7fb',
                       datastore=os.getcwd() + '/datastore/alt/')
     assert filename == os.getcwd() + '/datastore/alt/mnist.pkl.gz'
     self.assertTrue(os.path.isfile(filename))