Esempio n. 1
0
def load_data2(params, seed, shuffle=True, n_cols=None):
    """Load the P1B2 train and test tables as feature and one-hot label arrays.

    Both CSV files are resolved through ``get_file`` from fixed URLs, read
    with pandas, optionally shuffled, and split into features and labels.

    Parameters
    ----------
    params : object
        Not used by this function.
    seed : object
        Forwarded as ``random_state`` when the rows are shuffled.
    shuffle : bool
        When true, the rows of both tables are reordered with
        ``DataFrame.sample(frac=1)``.
    n_cols : int or None
        When truthy, only the first ``n_cols`` columns of each file are
        read; otherwise every column is read.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test))``. Each ``X`` holds the
        values of every column from the third one onward; each ``y`` is the
        ``pd.get_dummies`` encoding of the ``cancer_type`` column.
    """
    sources = (
        ('P1B2.train.csv',
         'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.train.csv'),
        ('P1B2.test.csv',
         'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.test.csv'),
    )

    # Resolve both files first, then read them, keeping train before test.
    paths = [get_file(name, origin=url) for name, url in sources]

    if n_cols:
        usecols = list(range(n_cols))
    else:
        usecols = None

    frames = [pd.read_csv(path, engine='c', usecols=usecols) for path in paths]

    if shuffle:
        frames = [frame.sample(frac=1, random_state=seed) for frame in frames]

    def _features_and_labels(frame):
        # The first two columns are excluded from the feature matrix.
        features = frame.iloc[:, 2:].values
        labels = pd.get_dummies(frame[['cancer_type']]).values
        return features, labels

    # NOTE(review): train and test labels are encoded independently, so their
    # column counts match only if both files contain the same set of
    # cancer_type values.
    train_pair, test_pair = (_features_and_labels(frame) for frame in frames)
    return train_pair, test_pair
Esempio n. 2
0
def fetch_file(link, subdir, untar=False, md5_hash=None):
    """ Turn a URL into a local file path, downloading the file
        when it is not already present in the specified cache.

        Parameters
        ----------
        link : link path
            URL of the file to download.
        subdir : directory path
            Local path to check for a cached copy of the file.
        untar : boolean
            Whether the downloaded file should be decompressed as well.
            (default: False, no decompression)
        md5_hash : MD5 hash
            Checksum used to verify data integrity.
            Verification is carried out only if a hash is provided.
            (default: None, no verification)

        Return
        ----------
        local path to the downloaded, or cached, file.
    """

    download_options = {
        'origin': link,
        'untar': untar,
        'md5_hash': md5_hash,
        'cache_subdir': subdir,
    }
    # The local file is named after the last path component of the URL.
    return get_file(os.path.basename(link), **download_options)
Esempio n. 3
0
def fetch_file(link, subdir, untar=False, md5_hash=None):
    """Delegate to ``get_file`` for the file found at ``link``.

    The file name passed on is the last path component of ``link``. The
    arguments ``subdir``, ``untar`` and ``md5_hash`` are forwarded as
    ``cache_subdir``, ``untar`` and ``md5_hash`` respectively, and whatever
    ``get_file`` returns is returned unchanged.
    """
    local_name = os.path.basename(link)
    result = get_file(
        local_name,
        origin=link,
        untar=untar,
        md5_hash=md5_hash,
        cache_subdir=subdir,
    )
    return result
import os
from file_utils import get_file


# Base URL of the remote collection and the two remote file names in it.
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_3/ml_ready_pathology_reports/'
metadata = 'ml_ready_raw_text_histo_metadata.csv'
reports = 'ml_ready_raw_text_pathology_reports.tar.gz'

# Full download URL of each remote file.
metadata_url = os.path.join(modac_collection_path, metadata)
reports_url = os.path.join(modac_collection_path, reports)

# Echo both URLs before any download is requested.
for url in (metadata_url, reports_url):
    print(url)

# Fetch each file under its local name, passing 'data' as datadir.
# The reports archive is additionally requested with untar=True.
get_file('histo_metadata.csv', metadata_url, datadir='data')
get_file('features_full.tar.gz', reports_url, datadir='data', untar=True)


Esempio n. 5
0
 def set_es_filehandle(self):
     """Store the result of ``file_utils.get_file`` for the ``es`` corpus.

     The call receives ``self.es_corpus_filename`` and the mode string
     ``"r"``; its result is kept in ``self.es_corpus_file``.
     NOTE(review): presumably this is an open read handle on the Spanish
     corpus -- confirm against ``file_utils.get_file``.
     """
     corpus_name = self.es_corpus_filename
     handle = file_utils.get_file(corpus_name, "r")
     self.es_corpus_file = handle
Esempio n. 6
0
            print len(self.en_lines), len(self.es_lines), len(self.es_uniq_words), \
                    len(self.english_words['resumption']), len(self.es_uniq), \
                    len(self.tfe), self.tfe["reanudación resumption"], "reanudación resumption"
        print 'Initialization done: ', time.time() - start, ' seconds'
        start = time.time()
        if self.model_no == 1:
            self.EM_algo()
        if self.model_no == 2:
            self.EM_algo2()
        print 'EM algorithm done: ', time.time() - start, ' seconds'
        start = time.time()
        if self.model_no == 1:
            file_utils.write_json_gzip(self.tfe, "ibm_model_1.gzip")
        if self.model_no == 2:
            file_utils.write_json_gzip(self.q, "ibm_model_2_q.gzip")
            file_utils.write_json_gzip(self.tfe, "ibm_model_2_tfe.gzip")
        print 'IBM Model', self.model_no, 'written to a file: ', time.time() - start, ' seconds'

if __name__ == "__main__":
    #model = IBM_model("corpus.en", "corpus.es", 2)
    #model.do_EM_algo()
    model = IBM_model("test.en", "test.es", 2)
    out_f = file_utils.get_file("alignment_test.p2.out", "w")
    start = time.time()
    file_utils.write_itr(model.use_model2(), out_f)
    print 'Alignments are done: ', time.time() - start, ' seconds'
    #out_f = file_utils.get_file("alignment_test.p1.out", "w")
    #start = time.time()
    #file_utils.write_itr(model.use_model1(), out_f)
    #print 'Alignments are done: ', time.time() - start, ' seconds'
Esempio n. 7
0
import os
from file_utils import get_file

# Base URL of the remote collection and the name of the model file in it.
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_3/multitask_cnn/'
model = 'mt_cnn_model.h5'

# Full download URL of the model file, echoed before the download request.
model_url = os.path.join(modac_collection_path, model)
print(model_url)

# The local file name is the same as the remote one; '.' is passed as datadir.
get_file(model, model_url, datadir='.')
Esempio n. 8
0
def show_image(data_dir, class_id, image_id):
    """Display one image of a class, chosen by index.

    ``get_files(data_dir, class_id)`` is indexed with ``image_id`` to pick an
    image name. ``get_file`` then maps that name to the value handed to
    ``io.imread``, and the loaded image is passed to ``io.imshow``.
    """
    available = get_files(data_dir, class_id)
    chosen_name = available[image_id]
    image_file = get_file(data_dir, class_id, chosen_name)
    pixels = io.imread(image_file)
    io.imshow(pixels)
import os, sys

# Directory containing this script, with symlinks resolved.
file_path = os.path.dirname(os.path.realpath(__file__))
# Absolute path of the `common` directory two levels above this script.
lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
# Make that directory importable; presumably it is where `file_utils`
# (imported right after this) lives -- confirm against the repository layout.
sys.path.append(lib_path2)

from file_utils import get_file

# Base URL of the remote collection (no trailing slash) and the names of the
# two remote files to download from it.
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_1/combination_drug_response_predictor'

model = 'combo.model.h5'
weights = 'combo.weights.h5'

# Join URL parts with an explicit '/'. os.path.join is meant for filesystem
# paths: on Windows it would insert a backslash here because the base has no
# trailing slash, producing an invalid URL. On POSIX the result is unchanged.
model_url = '/'.join((modac_collection_path.rstrip('/'), model))
weights_url = '/'.join((modac_collection_path.rstrip('/'), weights))

# NOTE(review): the remote files are named combo.* but are stored locally as
# uq.* -- confirm that this renaming is intended.
get_file('uq.model.h5', model_url, datadir='.')
get_file('uq.weights.h5', weights_url, datadir='.')
Esempio n. 10
0
def get_file(url):
    """Delegate to ``file_utils.get_file`` using the 'Pilot1' cache subdirectory.

    The file name passed on is the last path component of ``url``; the
    return value of ``file_utils.get_file`` is returned unchanged.
    """
    return file_utils.get_file(
        os.path.basename(url),
        origin=url,
        cache_subdir='Pilot1',
    )
import os
from file_utils import get_file

# Base URL of the remote collection (no trailing slash) and the name of the
# model archive to download from it.
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_3/pathology-reports-hierarchical-self-attention-network-hisan'
model = 'HiSAN_model.tar.gz'

# Join URL parts with an explicit '/'. os.path.join is meant for filesystem
# paths: on Windows it would insert a backslash here because the base has no
# trailing slash, producing an invalid URL. On POSIX the result is unchanged.
model_url = '/'.join((modac_collection_path.rstrip('/'), model))

print(model_url)

# Fetch the archive under its local name, passing 'hisan-trained-model' as
# datadir and requesting that it be unpacked (untar=True).
get_file('hisan-trained-model.tar.gz',
         model_url,
         datadir='hisan-trained-model',
         untar=True)