def load_data2(params, seed, shuffle=True, n_cols=None):
    """Download and load the P1B2 train/test sets.

    Parameters
    ----------
    params : dict
        Benchmark parameter dictionary (not read here; kept for
        interface compatibility with the other data loaders).
    seed : int
        Random seed used when shuffling the rows.
    shuffle : bool, optional
        If True (default), shuffle both dataframes row-wise.
    n_cols : int or None, optional
        If given, read only the first ``n_cols`` columns of each CSV.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test))`` where the ``X`` arrays
        hold the feature columns (column index 2 onward) and the ``y``
        arrays hold one-hot encoded ``cancer_type`` labels.
    """
    train_path = get_file(
        'P1B2.train.csv',
        origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.train.csv')
    test_path = get_file(
        'P1B2.test.csv',
        origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.test.csv')

    usecols = list(range(n_cols)) if n_cols else None
    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    # The first two columns are identifiers; the rest are features.
    X_train = df_train.iloc[:, 2:].values
    X_test = df_test.iloc[:, 2:].values

    # One-hot encode the class label column.
    y_train = pd.get_dummies(df_train[['cancer_type']]).values
    y_test = pd.get_dummies(df_test[['cancer_type']]).values

    return (X_train, y_train), (X_test, y_test)
def fetch_file(link, subdir, untar=False, md5_hash=None):
    """Download ``link`` into the local cache unless already present.

    Parameters
    ----------
    link : str
        URL of the file to download.
    subdir : str
        Cache subdirectory checked for (and used to store) the file.
    untar : bool, optional
        When True, decompress the downloaded archive.
        (default: False, no decompression)
    md5_hash : str, optional
        If provided, verify the download against this MD5 checksum.
        (default: None, no verification)

    Returns
    -------
    str
        Local path to the downloaded, or cached, file.
    """
    basename = os.path.basename(link)
    return get_file(basename,
                    origin=link,
                    untar=untar,
                    md5_hash=md5_hash,
                    cache_subdir=subdir)
def fetch_file(link, subdir, untar=False, md5_hash=None):
    """Fetch ``link`` via the cached downloader, storing it under ``subdir``.

    ``untar`` and ``md5_hash`` are forwarded to ``get_file`` unchanged.
    Returns the local path to the downloaded (or cached) file.
    """
    local_name = os.path.basename(link)
    return get_file(
        local_name, origin=link, untar=untar,
        md5_hash=md5_hash, cache_subdir=subdir)
import os
import posixpath

from file_utils import get_file

# MoDaC collection holding the ML-ready pathology reports (Pilot 3).
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_3/ml_ready_pathology_reports/'
metadata = 'ml_ready_raw_text_histo_metadata.csv'
reports = 'ml_ready_raw_text_pathology_reports.tar.gz'

# Build the URLs with posixpath so the separator is always '/';
# os.path.join would insert a backslash on Windows and break the URL.
metadata_url = posixpath.join(modac_collection_path, metadata)
reports_url = posixpath.join(modac_collection_path, reports)
print(metadata_url)
print(reports_url)

# Download (or reuse cached copies of) the metadata and the report archive.
get_file('histo_metadata.csv', metadata_url, datadir='data')
get_file('features_full.tar.gz', reports_url, datadir='data', untar=True)
def set_es_filehandle(self):
    """Open the Spanish corpus file for reading and keep the handle on self."""
    corpus_handle = file_utils.get_file(self.es_corpus_filename, "r")
    self.es_corpus_file = corpus_handle
# NOTE(review): this chunk starts mid-method -- the statements below
# reference ``self`` and appear to be the tail of an IBM_model training
# routine whose ``def`` line is outside this view; confirm against the
# full file before refactoring. Python 2 syntax (print statements).

# Debug dump: corpus sizes, vocabulary sizes, and one sample t(f|e) entry.
print len(self.en_lines), len(self.es_lines), len(self.es_uniq_words), \
    len(self.english_words['resumption']), len(self.es_uniq), \
    len(self.tfe), self.tfe["reanudación resumption"], "reanudación resumption"
print 'Initialization done: ', time.time() - start, ' seconds'
start = time.time()
# Run the EM training loop matching the selected IBM model number.
if self.model_no == 1:
    self.EM_algo()
if self.model_no == 2:
    self.EM_algo2()
print 'EM algorithm done: ', time.time() - start, ' seconds'
start = time.time()
# Persist the learned parameters: t(f|e) for model 1; both the alignment
# parameters q and t(f|e) for model 2.
if self.model_no == 1:
    file_utils.write_json_gzip(self.tfe, "ibm_model_1.gzip")
if self.model_no == 2:
    file_utils.write_json_gzip(self.q, "ibm_model_2_q.gzip")
    file_utils.write_json_gzip(self.tfe, "ibm_model_2_tfe.gzip")
print 'IBM Model', self.model_no, 'written to a file: ', time.time() - start, ' seconds'


if __name__ == "__main__":
    #model = IBM_model("corpus.en", "corpus.es", 2)
    #model.do_EM_algo()
    # Train an IBM Model 2 on the small test corpus and write its
    # alignments to a file, timing the alignment pass.
    model = IBM_model("test.en", "test.es", 2)
    out_f = file_utils.get_file("alignment_test.p2.out", "w")
    start = time.time()
    file_utils.write_itr(model.use_model2(), out_f)
    print 'Alignments are done: ', time.time() - start, ' seconds'
    #out_f = file_utils.get_file("alignment_test.p1.out", "w")
    #start = time.time()
    #file_utils.write_itr(model.use_model1(), out_f)
    #print 'Alignments are done: ', time.time() - start, ' seconds'
import os
import posixpath

from file_utils import get_file

# MoDaC collection holding the trained multitask CNN (Pilot 3).
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_3/multitask_cnn/'
model = 'mt_cnn_model.h5'

# posixpath.join guarantees a '/' separator in the URL on every platform;
# os.path.join would produce a backslash on Windows and break the URL.
model_url = posixpath.join(modac_collection_path, model)
print(model_url)

# Download (or reuse a cached copy of) the trained model file.
get_file('mt_cnn_model.h5', model_url, datadir='.')
def show_image(data_dir, class_id, image_id):
    """Display one image of class ``class_id`` found under ``data_dir``.

    ``image_id`` indexes into the class's file listing as returned by
    ``get_files``.
    """
    class_files = get_files(data_dir, class_id)
    selected_name = class_files[image_id]
    image_path = get_file(data_dir, class_id, selected_name)
    io.imshow(io.imread(image_path))
import os
import posixpath
import sys

# Make the shared 'common' helpers importable before importing file_utils.
file_path = os.path.dirname(os.path.realpath(__file__))
lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
sys.path.append(lib_path2)

from file_utils import get_file

# MoDaC collection with the Pilot 1 combination drug response predictor.
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_1/combination_drug_response_predictor'
model = 'combo.model.h5'
weights = 'combo.weights.h5'

# posixpath.join keeps the URL separator a '/' on every platform;
# os.path.join would insert a backslash on Windows and break the URL.
model_url = posixpath.join(modac_collection_path, model)
weights_url = posixpath.join(modac_collection_path, weights)

# NOTE(review): the remote files are combo.* but are saved locally as
# uq.* -- confirm this rename is intentional.
get_file('uq.model.h5', model_url, datadir='.')
get_file('uq.weights.h5', weights_url, datadir='.')
def get_file(url):
    """Download ``url`` into the Pilot1 cache and return its local path.

    The cached filename is taken from the basename of the URL.
    """
    local_name = os.path.basename(url)
    return file_utils.get_file(local_name, origin=url, cache_subdir='Pilot1')
import os
import posixpath

from file_utils import get_file

# MoDaC collection with the trained HiSAN pathology-report classifier.
modac_collection_path = 'https://modac.cancer.gov/api/v2/dataObject/NCI_DOE_Archive/JDACS4C/JDACS4C_Pilot_3/pathology-reports-hierarchical-self-attention-network-hisan'
model = 'HiSAN_model.tar.gz'

# posixpath.join guarantees '/' as the URL separator on all platforms;
# os.path.join would insert a backslash on Windows and break the URL.
model_url = posixpath.join(modac_collection_path, model)
print(model_url)

# Download and unpack (untar) the trained model archive.
get_file('hisan-trained-model.tar.gz', model_url,
         datadir='hisan-trained-model', untar=True)