def test_luna_patches_3d():
    image_dir = utils.get_dir_path('analysis', pathfinder.METADATA_PATH)
    image_dir = image_dir + '/test_luna/'
    utils.auto_make_dir(image_dir)

    id2zyxd = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)

    luna_data_paths = utils_lung.get_patient_data_paths(pathfinder.LUNA_DATA_PATH)
    luna_data_paths = [p for p in luna_data_paths if '.mhd' in p]
    # pid = '1.3.6.1.4.1.14519.5.2.1.6279.6001.138080888843357047811238713686'
    # luna_data_paths = [pathfinder.LUNA_DATA_PATH + '/%s.mhd' % pid]

    for k, p in enumerate(luna_data_paths):
        img, origin, pixel_spacing = utils_lung.read_mhd(p)
        # img = data_transforms.hu2normHU(img)
        id = os.path.basename(p).replace('.mhd', '')
        print id
        annotations = id2zyxd[id]
        print annotations
        for zyxd in annotations:
            img_out, mask = config().data_prep_function_train(img,
                                                              pixel_spacing=pixel_spacing,
                                                              p_transform=config().p_transform,
                                                              p_transform_augment=config().p_transform_augment,
                                                              patch_center=zyxd,
                                                              luna_annotations=annotations,
                                                              luna_origin=origin)
            try:
                plot_slice_3d_2(img_out, mask, 0, id)
                plot_slice_3d_2(img_out, mask, 1, id)
                plot_slice_3d_2(img_out, mask, 2, id)
            except:
                pass
            print '------------------------------------------'
def sample_augmentation_parameters(transformation):
    # TODO: bad thing to mix fixed and random params!!!
    if set(transformation.keys()) == {'patch_size', 'mm_patch_size'} or \
            set(transformation.keys()) == {'patch_size', 'mm_patch_size', 'mask_roi'}:
        return None

    shift_x = config().rng.uniform(*transformation.get('translation_range_x', [0., 0.]))
    shift_y = config().rng.uniform(*transformation.get('translation_range_y', [0., 0.]))
    translation = (shift_x, shift_y)
    rotation = config().rng.uniform(*transformation.get('rotation_range', [0., 0.]))
    shear = config().rng.uniform(*transformation.get('shear_range', [0., 0.]))
    roi_scale = config().rng.uniform(*transformation.get('roi_scale_range', [1., 1.]))
    z = config().rng.uniform(*transformation.get('zoom_range', [1., 1.]))
    zoom = (z, z)

    if 'do_flip' in transformation:
        if type(transformation['do_flip']) == tuple:
            flip_x = config().rng.randint(2) > 0 if transformation['do_flip'][0] else False
            flip_y = config().rng.randint(2) > 0 if transformation['do_flip'][1] else False
        else:
            flip_x = config().rng.randint(2) > 0 if transformation['do_flip'] else False
            flip_y = False
    else:
        flip_x, flip_y = False, False

    sequence_shift = config().rng.randint(30) if transformation.get('sequence_shift', False) else 0

    return namedtuple('Params', ['translation', 'rotation', 'shear', 'zoom',
                                 'roi_scale',
                                 'flip_x', 'flip_y',
                                 'sequence_shift'])(translation, rotation, shear, zoom,
                                                    roi_scale,
                                                    flip_x, flip_y,
                                                    sequence_shift)
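# Illustrative only: a minimal sketch of the kind of `transformation` dict this
# function reads, based on the keys accessed above (patch sizes, per-axis
# translation ranges, rotation/shear/zoom ranges, optional 'do_flip' and
# 'sequence_shift'). The values are hypothetical and not taken from any real
# config; sampling only works after a configuration has been set, since the
# rng lives on config().
example_transformation = {
    'patch_size': (64, 64, 64),
    'mm_patch_size': (64, 64, 64),
    'translation_range_x': [-10., 10.],
    'translation_range_y': [-10., 10.],
    'rotation_range': [-15., 15.],
    'shear_range': [0., 0.],
    'zoom_range': [0.9, 1.1],
    'do_flip': (True, False),  # flip x only, never y
}
# params = sample_augmentation_parameters(example_transformation)
# params.translation, params.rotation, params.zoom, params.flip_x, ...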
def import_cash_crops(ccrops_list):
    sql = "INSERT INTO cash_crops(cash_crop) VALUES(%s)"
    conn = None
    try:
        # read database configuration
        params = configuration.config()
        # connect to the PostgreSQL database
        conn = psycopg2.connect(**params)
        # create a new cursor
        cur = conn.cursor()
        # execute the INSERT statement
        cur.executemany(sql, ccrops_list)
        # commit the changes to the database
        conn.commit()
        # close communication with the database
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
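# Illustrative usage sketch (not part of the original module): psycopg2's
# executemany() expects a sequence of parameter tuples, so each crop name has
# to be wrapped in a one-element tuple. The crop names below are made up.
#
#   import_cash_crops([('maize',), ('coffee',), ('cotton',)])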
def test_luna3d():
    image_dir = utils.get_dir_path('analysis', pathfinder.METADATA_PATH)
    image_dir = image_dir + '/test_luna/'
    utils.auto_make_dir(image_dir)

    id2zyxd = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)

    luna_data_paths = utils_lung.get_patient_data_paths(pathfinder.LUNA_DATA_PATH)
    luna_data_paths = [p for p in luna_data_paths if '.mhd' in p]
    # luna_data_paths = [
    #     pathfinder.LUNA_DATA_PATH + '/1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644280690737019247886.mhd']
    luna_data_paths = [
        '/mnt/sda3/data/kaggle-lung/luna_test_patient/1.3.6.1.4.1.14519.5.2.1.6279.6001.943403138251347598519939390311.mhd']

    for k, p in enumerate(luna_data_paths):
        img, origin, pixel_spacing = utils_lung.read_mhd(p)
        id = os.path.basename(p).replace('.mhd', '')
        print id
        annotations = id2zyxd[id]

        img_out, mask, annotations_out = config().data_prep_function(img,
                                                                     pixel_spacing=pixel_spacing,
                                                                     luna_annotations=annotations,
                                                                     luna_origin=origin)
        mask[mask == 0.] = 0.1
        print annotations_out

        for zyxd in annotations_out:
            plot_slice_3d_2(img_out, mask, 0, id, idx=zyxd)
            plot_slice_3d_2(img_out, mask, 1, id, idx=zyxd)
            plot_slice_3d_2(img_out, mask, 2, id, idx=zyxd)
def __init__(self):
    self.w = wheels.wheels()
    self.s = Sensors()
    self.config = configuration.config()
    self.w.setConfig(self.config)
    self.s.setConfig(self.config)
    self.s.calibration = True
def start_interface():
    interface = None

    for i in configuration.config('Interfaces').get_sections():
        if configuration.config('Interfaces')(i).get_bool('enabled'):
            # Exactly one interface must be enabled
            if interface:
                raise exceptions.MultipleInterfacesError()
            else:
                interface = sys.modules['outspline.interfaces.' + i]

    # Exactly one interface must be enabled
    if interface:
        interface.loop()
    else:
        raise exceptions.InterfaceNotFoundError()
def count_proportion():
    id2zyxd = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)

    luna_data_paths = utils_lung.get_patient_data_paths(pathfinder.LUNA_DATA_PATH)
    luna_data_paths = [p for p in luna_data_paths if '.mhd' in p]
    n_white = 0
    n_black = 0
    for k, p in enumerate(luna_data_paths):
        img, origin, pixel_spacing = utils_lung.read_mhd(p)
        img = data_transforms.hu2normHU(img)
        id = os.path.basename(p).replace('.mhd', '')
        print id
        annotations = id2zyxd[id]

        img_out, annotations_out = data_transforms.transform_scan3d(img,
                                                                    pixel_spacing=pixel_spacing,
                                                                    p_transform=config().p_transform,
                                                                    p_transform_augment=None,  # config().p_transform_augment,
                                                                    luna_annotations=annotations,
                                                                    luna_origin=origin)
        mask = data_transforms.make_3d_mask_from_annotations(img_out.shape, annotations_out, shape='sphere')
        n_white += np.sum(mask)
        n_black += mask.shape[0] * mask.shape[1] * mask.shape[2] - np.sum(mask)
        print 'white', n_white
        print 'black', n_black
def connect(self):
    """ Connect to the PostgreSQL database server """
    try:
        # read connection parameters
        params = config()

        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        self.conn = psycopg2.connect(**params)

        # create a cursor
        cur = self.conn.cursor()

        # execute a statement
        print('PostgreSQL database version:')
        cur.execute('SELECT version()')

        # display the PostgreSQL database server version
        db_version = cur.fetchone()
        print(db_version)

        # close the communication with the PostgreSQL
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
def get_s3_resource():
    """
    The calls to AWS STS AssumeRole must be signed with the access key ID and
    secret access key of an existing IAM user. The credentials can be in
    environment variables or in a configuration file and will be discovered
    automatically by the boto3.client() function. For more information, see the
    Python SDK documentation:
    http://boto3.readthedocs.io/en/latest/reference/services/sts.html#client

    Output: S3 resource object
    """
    if not hasattr(get_s3_resource, 's3_resource'):
        get_s3_resource.s3_resource = boto3.resource('s3')
        configuration = config()
        if configuration['other'].getboolean('cross_account_access'):
            sts_client = boto3.client('sts')
            response = sts_client.assume_role(
                RoleArn=configuration['other']['cross_account_access_role'],
                RoleSessionName="AssumeRoleSession")
            get_s3_resource.s3_resource = boto3.resource(
                's3',
                aws_access_key_id=response['Credentials']['AccessKeyId'],
                aws_secret_access_key=response['Credentials']['SecretAccessKey'],
                aws_session_token=response['Credentials']['SessionToken'],
            )
    return get_s3_resource.s3_resource
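# Illustrative usage sketch (not in the original module). Whether the resource
# is built with temporary STS credentials is decided by the
# 'cross_account_access' option read above; the bucket name here is
# hypothetical.
#
#   s3 = get_s3_resource()
#   bucket = s3.Bucket('my-example-bucket')
#   for obj in bucket.objects.limit(10):
#       print(obj.key)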
def connect_to_postgresql():
    # read database configuration
    params = configuration.config()
    # connect to the PostgreSQL database
    conn = psycopg2.connect(**params)
    return conn
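# Illustrative usage sketch (not in the original module): the returned
# connection is left open on purpose, so closing it is the caller's
# responsibility.
#
#   conn = connect_to_postgresql()
#   with conn.cursor() as cur:
#       cur.execute('SELECT version()')
#       print(cur.fetchone())
#   conn.close()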
def preprocess_with_augmentation(patient_data, result, index, augment=True, metadata=None, testaug=False):
    """
    Load the resulting data, augment it if needed, and put it in result at the correct index
    :param patient_data:
    :param result:
    :param index:
    :return:
    """
    if augment:
        augmentation_parameters = sample_augmentation_parameters()
    else:
        augmentation_parameters = None

    for tag, data in patient_data.iteritems():
        metadata_tag = metadata[tag]
        desired_shape = result[tag][index].shape
        # try to fit data into the desired shape
        if tag.startswith("sliced:data:singleslice"):
            cleaning_processes = getattr(config(), 'cleaning_processes', [])
            data = clean_images([patient_data[tag]], metadata=metadata_tag,
                                cleaning_processes=cleaning_processes)
            patient_4d_tensor, zoom_ratios = resize_and_augment(data, output_shape=desired_shape[-2:],
                                                                augment=augmentation_parameters)[0]
            if "area_per_pixel:sax" in result:
                result["area_per_pixel:sax"][index] = zoom_ratios[0] * np.prod(metadata_tag["PixelSpacing"])
            put_in_the_middle(result[tag][index], patient_4d_tensor)
        elif tag.startswith("sliced:data"):
            # put time dimension first, then axis dimension
            data = clean_images(patient_data[tag], metadata=metadata_tag)
            patient_4d_tensor, zoom_ratios = resize_and_augment(data, output_shape=desired_shape[-2:],
                                                                augment=augmentation_parameters)
            if "area_per_pixel:sax" in result:
                result["area_per_pixel:sax"][index] = zoom_ratios[0] * np.prod(metadata_tag[0]["PixelSpacing"])
            if "noswitch" not in tag:
                patient_4d_tensor = np.swapaxes(patient_4d_tensor, 1, 0)
            put_in_the_middle(result[tag][index], patient_4d_tensor)
        if tag.startswith("sliced:data:shape"):
            result[tag][index] = patient_data[tag]
        if tag.startswith("sliced:meta:"):
            # TODO: this probably doesn't work very well yet
            result[tag][index] = patient_data[tag]
    return
def download_s3_folder(lob_data_bucket, day_folder, keys):
    configuration = config()
    raw_data_folder = configuration['folders']['raw_lob_data']
    with futures.ThreadPoolExecutor(max_workers=100) as executor:
        future_to_key = {
            executor.submit(download_S3_object, lob_data_bucket, key, f'{raw_data_folder}/tmp'): key
            for key in keys
        }
        # Drain the futures as they complete; the downloads are run for their
        # side effects, so the results are not used.
        for future in futures.as_completed(future_to_key):
            future_to_key[future]
def evaluate(individual):
    Config = config()
    value_list = Config.decode_list(individual)
    y1 = value_list[0]**2 + value_list[1]**2 + value_list[2]**2 + 10
    y2 = (value_list[3] - 1)**2 + (value_list[4] - 1)**2 + 15
    y3 = (value_list[5] - 1)**2 + 20
    t_y1 = ()
    t_y2 = ()
    t_y3 = ()
    for i in range(20):
        t_y1 = t_y1 + (y1 + random.random() * 5, )
        t_y2 = t_y2 + (y2 + random.random() * 5, )
        t_y3 = t_y3 + (y3 + random.random() * 5, )
    return t_y1, t_y2, t_y3  # , y3, y3, y3, y3, y3, y3, y3, y3, y3, y3, y3, y3, y3
def sample_test_augmentation_parameters():
    global quasi_random_generator

    augm = config().augmentation_params_test if hasattr(config(), 'augmentation_params_test') \
        else config().augmentation_params
    if "translation" in augm:
        newdict = dict()
        if "translation" in augm:
            newdict["translate_x"] = augm["translation"]
            newdict["translate_y"] = augm["translation"]
        if "shear" in augm:
            newdict["shear"] = augm["shear"]
        if "flip_vert" in augm:
            newdict["flip_vert"] = augm["flip_vert"]
        if "roll_time" in augm:
            newdict["roll_time"] = augm["roll_time"]
        if "flip_time" in augm:
            newdict["flip_time"] = augm["flip_time"]
        augmentation_params = dict(DEFAULT_AUGMENTATION_PARAMETERS, **newdict)
    else:
        augmentation_params = dict(DEFAULT_AUGMENTATION_PARAMETERS, **augm)

    if quasi_random_generator is None:
        quasi_random_generator = quasi_random.scrambled_halton_sequence_generator(
            dimension=len(augmentation_params), permutation='Braaten-Weller')
    res = dict()
    try:
        sample = quasi_random_generator.next()
    except ValueError:
        quasi_random_generator = quasi_random.scrambled_halton_sequence_generator(
            dimension=len(augmentation_params), permutation='Braaten-Weller')
        sample = quasi_random_generator.next()

    for rand, (key, (a, b)) in izip(sample, augmentation_params.iteritems()):
        # res[key] = config().rng.uniform(a, b)
        res[key] = a + rand * (b - a)
    return res
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    # logging.info('Beginning scraper for {}'.format(host))

    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            # logger.info('Article fetched!!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
def _fetch_article(news_site_uid, host, link):
    # logger.info('Start fetching article at {}'.format(link))
    article = None
    try:
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except (HTTPError, ConnectionError, MaxRetryError) as e:
        # logger.warning('Error while fetching the article', exc_info=False)
        pass

    if article and not article.body:
        # logger.warning('Invalid article. There is no body')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return '{}{}'.format(host, link)
    else:
        return '{host}/{uri}'.format(host=host, uri=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    news_site_choices = list(config()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='The news site that you want to scrape',
                        type=str,
                        choices=news_site_choices)

    args = parser.parse_args()
    _news_scraper(args.news_site)
def predict_model(expid, mfile=None):
    metadata_path = MODEL_PATH + "%s.pkl" % (expid if not mfile else mfile)
    prediction_path = INTERMEDIATE_PREDICTIONS_PATH + "%s.pkl" % expid
    submission_path = SUBMISSION_PATH + "%s.csv" % expid

    if theano.config.optimizer != "fast_run":
        print "WARNING: not running in fast mode!"

    print "Using"
    print " %s" % metadata_path
    print "To generate"
    print " %s" % prediction_path
    print " %s" % submission_path

    print "Build model"
    interface_layers = config().build_model()

    output_layers = interface_layers["outputs"]
    input_layers = interface_layers["inputs"]
    top_layer = lasagne.layers.MergeLayer(
        incomings=output_layers.values()
    )
    all_layers = lasagne.layers.get_all_layers(top_layer)
    num_params = lasagne.layers.count_params(top_layer)
    print " number of parameters: %d" % num_params
    print string.ljust(" layer output shapes:", 36),
    print string.ljust("#params:", 10),
    print "output shape:"
    for layer in all_layers[:-1]:
        name = string.ljust(layer.__class__.__name__, 32)
        num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
        num_param = string.ljust(num_param.__str__(), 10)
        print " %s %s %s" % (name, num_param, layer.output_shape)

    xs_shared = {
        key: lasagne.utils.shared_empty(dim=len(l_in.output_shape), dtype='float32')
        for (key, l_in) in input_layers.iteritems()
    }
    idx = T.lscalar('idx')

    givens = dict()
    for key in input_layers.keys():
        if key == "sunny":
            givens[input_layers[key].input_var] = xs_shared[key][idx * config().sunny_batch_size:(idx + 1) * config().sunny_batch_size]
        else:
            givens[input_layers[key].input_var] = xs_shared[key][idx * config().batch_size:(idx + 1) * config().batch_size]

    network_outputs = [
        lasagne.layers.helper.get_output(network_output_layer, deterministic=True)
        for network_output_layer in output_layers.values()
    ]

    iter_test = theano.function([idx], network_outputs + theano_printer.get_the_stuff_to_print(),
                                givens=givens, on_unused_input="ignore",
                                # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
                                )

    print "Load model parameters for resuming"
    resume_metadata = np.load(metadata_path)
    lasagne.layers.set_all_param_values(top_layer, resume_metadata['param_values'])

    num_batches_chunk = config().batches_per_chunk
    num_batches = get_number_of_test_batches()
    num_chunks = int(np.ceil(num_batches / float(config().batches_per_chunk)))

    chunks_train_idcs = range(1, num_chunks + 1)

    data_loader.filter_patient_folders()

    create_test_gen = partial(config().create_test_gen,
                              required_input_keys=xs_shared.keys(),
                              required_output_keys=["patients", "classification_correction_function"],
                              )

    print "Generate predictions with this model"
    start_time = time.time()
    prev_time = start_time

    predictions = [{"patient": i + 1,
                    "systole": np.zeros((0, 600)),
                    "diastole": np.zeros((0, 600))
                    } for i in xrange(NUM_PATIENTS)]

    for e, test_data in izip(itertools.count(start=1), buffering.buffered_gen_threaded(create_test_gen())):
        print " load testing data onto GPU"

        for key in xs_shared:
            xs_shared[key].set_value(test_data["input"][key])

        patient_ids = test_data["output"]["patients"]
        classification_correction = test_data["output"]["classification_correction_function"]
        print " patients:", " ".join(map(str, patient_ids))
        print " chunk %d/%d" % (e, num_chunks)

        for b in xrange(num_batches_chunk):
            iter_result = iter_test(b)
            network_outputs = tuple(iter_result[:len(output_layers)])
            network_outputs_dict = {output_layers.keys()[i]: network_outputs[i] for i in xrange(len(output_layers))}
            kaggle_systoles, kaggle_diastoles = config().postprocess(network_outputs_dict)
            kaggle_systoles, kaggle_diastoles = kaggle_systoles.astype('float64'), kaggle_diastoles.astype('float64')
            for idx, patient_id in enumerate(patient_ids[b * config().batch_size:(b + 1) * config().batch_size]):
                if patient_id != 0:
                    index = patient_id - 1
                    patient_data = predictions[index]
                    assert patient_id == patient_data["patient"]

                    kaggle_systole = kaggle_systoles[idx:idx + 1, :]
                    kaggle_diastole = kaggle_diastoles[idx:idx + 1, :]
                    assert np.isfinite(kaggle_systole).all() and np.isfinite(kaggle_diastole).all()
                    kaggle_systole = classification_correction[b * config().batch_size + idx](kaggle_systole)
                    kaggle_diastole = classification_correction[b * config().batch_size + idx](kaggle_diastole)
                    assert np.isfinite(kaggle_systole).all() and np.isfinite(kaggle_diastole).all()
                    patient_data["systole"] = np.concatenate((patient_data["systole"], kaggle_systole), axis=0)
                    patient_data["diastole"] = np.concatenate((patient_data["diastole"], kaggle_diastole), axis=0)

        now = time.time()
        time_since_start = now - start_time
        time_since_prev = now - prev_time
        prev_time = now
        est_time_left = time_since_start * (float(num_chunks - (e + 1)) / float(e + 1 - chunks_train_idcs[0]))
        eta = datetime.now() + timedelta(seconds=est_time_left)
        eta_str = eta.strftime("%c")
        print " %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev)
        print " estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str)
        print

    already_printed = False
    for prediction in predictions:
        if prediction["systole"].size > 0 and prediction["diastole"].size > 0:
            average_method = getattr(config(), 'tta_average_method', partial(np.mean, axis=0))
            prediction["systole_average"] = average_method(prediction["systole"])
            prediction["diastole_average"] = average_method(prediction["diastole"])
            try:
                test_if_valid_distribution(prediction["systole_average"])
                test_if_valid_distribution(prediction["diastole_average"])
            except:
                if not already_printed:
                    print "WARNING: These distributions are not distributions"
                    already_printed = True
                prediction["systole_average"] = make_monotone_distribution(prediction["systole_average"])
                prediction["diastole_average"] = make_monotone_distribution(prediction["diastole_average"])
                test_if_valid_distribution(prediction["systole_average"])
                test_if_valid_distribution(prediction["diastole_average"])

    print "Calculating training and validation set scores for reference"
    validation_dict = {}
    for patient_ids, set_name in [(validation_patients_indices, "validation"),
                                  (train_patients_indices, "train")]:
        errors = []
        for patient in patient_ids:
            prediction = predictions[patient - 1]
            if "systole_average" in prediction:
                assert patient == regular_labels[patient - 1, 0]
                error = CRSP(prediction["systole_average"], regular_labels[patient - 1, 1])
                errors.append(error)
                error = CRSP(prediction["diastole_average"], regular_labels[patient - 1, 2])
                errors.append(error)
        if len(errors) > 0:
            errors = np.array(errors)
            estimated_CRSP = np.mean(errors)
            print " %s kaggle loss: %f" % (string.rjust(set_name, 12), estimated_CRSP)
            validation_dict[set_name] = estimated_CRSP
        else:
            print " %s kaggle loss: not calculated" % (string.rjust(set_name, 12))

    print "dumping prediction file to %s" % prediction_path
    with open(prediction_path, 'w') as f:
        pickle.dump({
            'metadata_path': metadata_path,
            'prediction_path': prediction_path,
            'submission_path': submission_path,
            'configuration_file': config().__name__,
            'git_revision_hash': utils.get_git_revision_hash(),
            'experiment_id': expid,
            'time_since_start': time_since_start,
            'param_values': lasagne.layers.get_all_param_values(top_layer),
            'predictions': predictions,
            'validation_errors': validation_dict,
        }, f, pickle.HIGHEST_PROTOCOL)
    print "prediction file dumped"

    print "dumping submission file to %s" % submission_path
    with open(submission_path, 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(['Id'] + ['P%d' % i for i in xrange(600)])
        for prediction in predictions:
            # the submission only has patients 501 to 700
            if prediction["patient"] in data_loader.test_patients_indices:
                if "diastole_average" not in prediction or "systole_average" not in prediction:
                    raise Exception("Not all test-set patients were predicted")
                csvwriter.writerow(["%d_Diastole" % prediction["patient"]] + ["%.18f" % p for p in prediction["diastole_average"].flatten()])
                csvwriter.writerow(["%d_Systole" % prediction["patient"]] + ["%.18f" % p for p in prediction["systole_average"].flatten()])
    print "submission file dumped"

    return
# predictions path
predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH)
outputs_path = predictions_dir + '/' + expid
if valid_tta_feat or test_tta_feat or all_tta_feat or train_tta_feat:
    outputs_path += '/features'
utils.auto_make_dir(outputs_path)

if dump:
    prediction_dump = os.path.join(outputs_path, expid + "_" + args.eval + "_predictions.p")

print('Build model')
model = config().build_model()
model.l_out.load_state_dict(metadata['param_values'])
model.l_out.cuda()
model.l_out.eval()
criterion = config().build_objective()

if test:
    data_iterator = config().test_data_iterator
elif feat:
    data_iterator = config().feat_data_iterator


def get_preds_targs(data_iterator):
    print('Data')
    print('n', sys.argv[2], ': %d' % data_iterator.nsamples)
def get_trade_data(pair, date_start, date_end, frequency=timedelta(seconds=10)):
    '''
    Function that returns a dataframe of resampled trade data, ready to be
    concatenated to a quotes dataframe with depth (Level = -1)

    Arguments:
    pair -- string, currency pair to return (e.g. 'USDT_BTC')
    date_start -- string, timeseries start
    date_end -- string, timeseries end
    frequency -- timedelta, the minimum time granularity (e.g. timedelta(seconds=10))
    '''
    print(f'Checking for cached trade data from {date_start} to {date_end}')
    configuration = config()
    raw_data_folder = configuration['folders']['raw_trade_data']
    resampled_data_folder = configuration['folders']['resampled_data']
    date_start = datetime.strptime(date_start, '%Y-%m-%d')
    date_end = datetime.strptime(date_end, '%Y-%m-%d')
    freq = f'{int(frequency.total_seconds())}s'
    os.makedirs(f'{resampled_data_folder}/{pair}/trades/{freq}', exist_ok=True)
    data = []
    # Loop through day folders
    date_to_process = date_start
    while date_to_process <= date_end:
        resampled_file_path = f'{resampled_data_folder}/{pair}/trades/{freq}/{datetime.strftime(date_to_process, "%Y-%m-%d")}.csv.gz'
        if os.path.isfile(resampled_file_path):
            print(f'Found {resampled_file_path}')
        else:
            print(f'Generating {resampled_file_path}')
            raw_file_name = f'{pair}-{datetime.strftime(date_to_process, "%Y%m%d")}.csv.gz'
            raw_file_path = f'{raw_data_folder}/{pair}/{raw_file_name}'
            if not os.path.isfile(raw_file_path):
                s3_resource = get_s3_resource()
                trade_data_bucket = s3_resource.Bucket(configuration['buckets']['trade_data'])
                trade_data_bucket.download_file(f'{pair}/{raw_file_name}', f'{raw_file_path}')
                print(f'Downloaded {raw_file_name} from S3')
            day_data = pd.read_csv(raw_file_path, parse_dates=['date'])
            df_trades_grp = day_data.groupby([pd.Grouper(key='date', freq=freq), 'type']).agg({
                'amount': 'sum',
                'rate': 'mean'
            }).reset_index()
            df_trades_piv = df_trades_grp.pivot(values=['amount', 'rate'], columns='type',
                                                index='date').reset_index()
            df_trades_piv.columns = list(map("_".join, df_trades_piv.columns))  # "flatten" column names
            df_trades_piv.rename(columns={
                'date_': 'Datetime',
                'amount_buy': 'Ask_Size',
                'amount_sell': 'Bid_Size',
                'rate_buy': 'Ask_Price',
                'rate_sell': 'Bid_Price'
            }, inplace=True)
            # fill gaps with no trades - MAYBE we need something similar for quotes as a data integrity check
            start_dt = datetime(date_to_process.year, date_to_process.month, date_to_process.day, 0, 0, 0)
            end_dt = datetime(date_to_process.year, date_to_process.month, date_to_process.day, 23, 59, 59)
            # to ensure each timestep is covered
            date_range_reindex = pd.DataFrame(pd.date_range(start_dt, end_dt, freq=freq), columns=['Datetime'])
            df_trades_piv = pd.merge(df_trades_piv, date_range_reindex, right_on='Datetime',
                                     left_on='Datetime', how='right').sort_values('Datetime')
            # impute NAs - zero for size and last px for price
            df_trades_piv.loc[:, ['Ask_Size', 'Bid_Size']] = df_trades_piv.loc[:, ['Ask_Size', 'Bid_Size']].fillna(0)
            df_trades_piv.loc[:, ['Ask_Price', 'Bid_Price']] = df_trades_piv.loc[:, ['Ask_Price', 'Bid_Price']].fillna(method='ffill')
            # impute NAs for the first rows of the dataframes
            try:
                # check if previous day exists and assign last value of previous day df
                prev_day = date_to_process + timedelta(days=-1)
                prev_day_data = pd.read_csv(
                    f'{resampled_data_folder}/{pair}/trades/{freq}/{datetime.strftime(prev_day, "%Y-%m-%d")}.csv.gz')
                prev_file_ask_px = prev_day_data.iloc[-1]['Ask_Price']
                prev_file_bid_px = prev_day_data.iloc[-1]['Bid_Price']
            except Exception as e:
                # if previous day not in the database, use first available future value - not ideal
                print(e)
                print(f'Non-continuous data being processed. Imputing values for bid or ask prices at the beginning of {date_to_process}')
                # NOT ideal because we are leaking information
                prev_file_ask_px = df_trades_piv['Ask_Price'].dropna().iloc[0]
                prev_file_bid_px = df_trades_piv['Bid_Price'].dropna().iloc[0]
            df_trades_piv.loc[:, 'Bid_Price'] = df_trades_piv.loc[:, 'Bid_Price'].fillna(prev_file_bid_px)
            df_trades_piv.loc[:, 'Ask_Price'] = df_trades_piv.loc[:, 'Ask_Price'].fillna(prev_file_ask_px)
            # level -1 to keep it separate from order book depth
            df_trades_piv['Level'] = -1
            df_trades_piv.to_csv(resampled_file_path, compression='gzip')
        date_to_process += timedelta(days=1)  # the most nested folder is a day of the month
        data.append(resampled_file_path)
    return dd.read_csv(data, compression='gzip')
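# Illustrative usage sketch (not part of the original module): get_trade_data
# returns a dask dataframe assembled from the per-day resampled CSVs, so it is
# evaluated lazily. The pair comes from the docstring example; the dates are
# hypothetical and assume the config file points at valid folder and bucket
# locations.
#
#   trades = get_trade_data('USDT_BTC', '2018-01-01', '2018-01-07',
#                           frequency=timedelta(seconds=30))
#   print(trades.head())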
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    required = parser.add_argument_group('required arguments')
    required.add_argument('-c', '--config',
                          help='configuration to run',
                          required=True)
    args = parser.parse_args()
    set_configuration(args.config)

    expid = utils.generate_expid(args.config)

    log_file = LOGS_PATH + "%s.log" % expid
    with print_to_file(log_file):
        print "Running configuration:", config().__name__
        print "Current git version:", utils.get_git_revision_hash()

        train_model(expid)
        print "log saved to '%s'" % log_file

        predict_model(expid)
        print "log saved to '%s'" % log_file
assert config_name == metadata['configuration']
if 'subconfiguration' in metadata:
    set_subconfiguration(metadata['subconfiguration'])
set_configuration(config_name)

# predictions paths
prediction_dir = utils.get_dir_path('predictions', pathfinder.METADATA_PATH)
prediction_path = prediction_dir + "/%s-%s-%s-%s.pkl" % (metadata['experiment_id'], set, n_tta_iterations, mean)

# submissions paths
submission_dir = utils.get_dir_path('submissions', pathfinder.METADATA_PATH)
submission_path = submission_dir + "/%s-%s-%s-%s.csv" % (metadata['experiment_id'], set, n_tta_iterations, mean)

print "Build model"
model = config().build_model()
all_layers = nn.layers.get_all_layers(model.l_top)
all_params = nn.layers.get_all_params(model.l_top)
num_params = nn.layers.count_params(model.l_top)
print ' number of parameters: %d' % num_params
nn.layers.set_all_param_values(model.l_top, metadata['param_values'])

xs_shared = [nn.utils.shared_empty(dim=len(l.shape)) for l in model.l_ins]

givens_in = {}
for l_in, x in izip(model.l_ins, xs_shared):
    givens_in[l_in.input_var] = x

iter_test_det = theano.function([], [nn.layers.get_output(l, deterministic=True) for l in model.l_outs],
                                givens=givens_in,
                                on_unused_input='warn')

if set == 'train':
expid = utils.generate_expid(config_name)
print
print "Experiment ID: %s" % expid
print

# metadata
metadata_dir = utils.get_dir_path('train', pathfinder.METADATA_PATH)
metadata_path = metadata_dir + '/%s.pkl' % expid

# logs
logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH)
sys.stdout = logger.Logger(logs_dir + '/%s.log' % expid)
sys.stderr = sys.stdout

print 'Build model'
model = config().build_model()
all_layers = nn.layers.get_all_layers(model.l_top)
all_params = nn.layers.get_all_params(model.l_top)
num_params = nn.layers.count_params(model.l_top)
print ' number of parameters: %d' % num_params
print string.ljust(' layer output shapes:', 36),
print string.ljust('#params:', 10),
print 'output shape:'
for layer in all_layers[:-1]:
    name = string.ljust(layer.__class__.__name__, 32)
    num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
    num_param = string.ljust(num_param.__str__(), 10)
    print ' %s %s %s' % (name, num_param, layer.output_shape)

train_loss = config().build_objective(model)
expid = utils.generate_expid(config_name)
print
print "Experiment ID: %s" % expid
print

# metadata
metadata_dir = utils.get_dir_path('models', pathfinder.METADATA_PATH)
metadata_path = metadata_dir + '/%s.pkl' % expid

# logs
logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH)
sys.stdout = logger.Logger(logs_dir + '/%s.log' % expid)
sys.stderr = sys.stdout

print 'Build model'
model = config().build_model()
all_layers = nn.layers.get_all_layers(model.l_out)
all_params = nn.layers.get_all_params(model.l_out)
num_params = nn.layers.count_params(model.l_out)
print ' number of parameters: %d' % num_params
print string.ljust(' layer output shapes:', 36),
print string.ljust('#params:', 10),
print 'output shape:'
for layer in all_layers:
    name = string.ljust(layer.__class__.__name__, 32)
    num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
    num_param = string.ljust(num_param.__str__(), 10)
    print ' %s %s %s' % (name, num_param, layer.output_shape)

train_loss = config().build_objective(model, deterministic=False)
train_loss2 = config().build_objective2(model, deterministic=False)
config_name = sys.argv[1]
set_configuration('configs_luna_size_scan', config_name)

# predictions path
predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH)
outputs_path = predictions_dir + '/%s' % config_name
utils.auto_make_dir(outputs_path)

# logs
logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH)
sys.stdout = logger.Logger(logs_dir + '/%s.log' % config_name)
sys.stderr = sys.stdout

# builds model and sets its parameters
model = config().build_model()

x_shared = nn.utils.shared_empty(dim=len(model.l_in.shape))

givens_valid = {}
givens_valid[model.l_in.input_var] = x_shared

get_predictions_patch = theano.function([],
                                        nn.layers.get_output(model.l_out, deterministic=True),
                                        givens=givens_valid,
                                        on_unused_input='ignore')

data_iterator = config().data_iterator

# existing_preds = [f.rsplit('.') for f in os.listdir(outputs_path)]
# print existing_preds
def train_model(expid):
    metadata_path = MODEL_PATH + "%s.pkl" % expid

    if theano.config.optimizer != "fast_run":
        print "WARNING: not running in fast mode!"

    data_loader.filter_patient_folders()

    print "Build model"
    interface_layers = config().build_model()

    output_layers = interface_layers["outputs"]
    input_layers = interface_layers["inputs"]
    top_layer = lasagne.layers.MergeLayer(
        incomings=output_layers.values()
    )
    all_layers = lasagne.layers.get_all_layers(top_layer)
    all_params = lasagne.layers.get_all_params(top_layer, trainable=True)

    if "cutoff_gradients" in interface_layers:
        submodel_params = [param for value in interface_layers["cutoff_gradients"]
                           for param in lasagne.layers.get_all_params(value)]
        all_params = [p for p in all_params if p not in submodel_params]

    if "pretrained" in interface_layers:
        for config_name, layers_dict in interface_layers["pretrained"].iteritems():
            pretrained_metadata_path = MODEL_PATH + "%s.pkl" % config_name.split('.')[1]
            pretrained_resume_metadata = np.load(pretrained_metadata_path)
            pretrained_top_layer = lasagne.layers.MergeLayer(
                incomings=layers_dict.values()
            )
            lasagne.layers.set_all_param_values(pretrained_top_layer, pretrained_resume_metadata['param_values'])

    num_params = sum([np.prod(p.get_value().shape) for p in all_params])

    print string.ljust(" layer output shapes:", 36),
    print string.ljust("#params:", 10),
    print string.ljust("#data:", 10),
    print "output shape:"
    for layer in all_layers[:-1]:
        name = string.ljust(layer.__class__.__name__, 32)
        num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
        num_param = string.ljust(int(num_param).__str__(), 10)
        num_size = string.ljust(np.prod(layer.output_shape[1:]).__str__(), 10)
        print " %s %s %s %s" % (name, num_param, num_size, layer.output_shape)
    print " number of parameters: %d" % num_params

    obj = config().build_objective(interface_layers)

    train_loss_theano = obj.get_loss()
    kaggle_loss_theano = obj.get_kaggle_loss()
    segmentation_loss_theano = obj.get_segmentation_loss()

    validation_other_losses = collections.OrderedDict()
    validation_train_loss = obj.get_loss(average=False, deterministic=True, validation=True,
                                         other_losses=validation_other_losses)
    validation_kaggle_loss = obj.get_kaggle_loss(average=False, deterministic=True, validation=True)
    validation_segmentation_loss = obj.get_segmentation_loss(average=False, deterministic=True, validation=True)

    xs_shared = {
        key: lasagne.utils.shared_empty(dim=len(l_in.output_shape), dtype='float32')
        for (key, l_in) in input_layers.iteritems()
    }

    # contains target_vars of the objective! Not the output layers desired values!
    # There can be more output layers than are strictly required for the objective
    # e.g. for debugging
    ys_shared = {
        key: lasagne.utils.shared_empty(dim=target_var.ndim, dtype='float32')
        for (key, target_var) in obj.target_vars.iteritems()
    }

    learning_rate_schedule = config().learning_rate_schedule
    learning_rate = theano.shared(np.float32(learning_rate_schedule[0]))
    idx = T.lscalar('idx')

    givens = dict()
    for key in obj.target_vars.keys():
        if key == "segmentation":
            givens[obj.target_vars[key]] = ys_shared[key][idx * config().sunny_batch_size:(idx + 1) * config().sunny_batch_size]
        else:
            givens[obj.target_vars[key]] = ys_shared[key][idx * config().batch_size:(idx + 1) * config().batch_size]

    for key in input_layers.keys():
        if key == "sunny":
            givens[input_layers[key].input_var] = xs_shared[key][idx * config().sunny_batch_size:(idx + 1) * config().sunny_batch_size]
        else:
            givens[input_layers[key].input_var] = xs_shared[key][idx * config().batch_size:(idx + 1) * config().batch_size]

    updates = config().build_updates(train_loss_theano, all_params, learning_rate)

    # grad_norm = T.sqrt(T.sum([(g**2).sum() for g in theano.grad(train_loss_theano, all_params)]))
    # theano_printer.print_me_this("Grad norm", grad_norm)

    iter_train = theano.function([idx], [train_loss_theano, kaggle_loss_theano, segmentation_loss_theano] + theano_printer.get_the_stuff_to_print(),
                                 givens=givens, on_unused_input="ignore", updates=updates,
                                 # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
                                 )
    iter_validate = theano.function([idx], [validation_train_loss, validation_kaggle_loss, validation_segmentation_loss] + [v for _, v in validation_other_losses.items()] + theano_printer.get_the_stuff_to_print(),
                                    givens=givens, on_unused_input="ignore")

    num_chunks_train = int(config().num_epochs_train * NUM_TRAIN_PATIENTS / (config().batch_size * config().batches_per_chunk))
    print "Will train for %d chunks" % num_chunks_train
    if config().restart_from_save and os.path.isfile(metadata_path):
        print "Load model parameters for resuming"
        resume_metadata = np.load(metadata_path)
        lasagne.layers.set_all_param_values(top_layer, resume_metadata['param_values'])
        start_chunk_idx = resume_metadata['chunks_since_start'] + 1
        chunks_train_idcs = range(start_chunk_idx, num_chunks_train)

        # set lr to the correct value
        current_lr = np.float32(utils.current_learning_rate(learning_rate_schedule, start_chunk_idx))
        print " setting learning rate to %.7f" % current_lr
        learning_rate.set_value(current_lr)
        losses_train = resume_metadata['losses_train']
        losses_eval_valid = resume_metadata['losses_eval_valid']
        losses_eval_train = resume_metadata['losses_eval_train']
        losses_eval_valid_kaggle = []  # resume_metadata['losses_eval_valid_kaggle']
        losses_eval_train_kaggle = []  # resume_metadata['losses_eval_train_kaggle']
    else:
        chunks_train_idcs = range(num_chunks_train)
        losses_train = []
        losses_eval_valid = []
        losses_eval_train = []
        losses_eval_valid_kaggle = []
        losses_eval_train_kaggle = []

    create_train_gen = partial(config().create_train_gen,
                               required_input_keys=xs_shared.keys(),
                               required_output_keys=ys_shared.keys()  # + ["patients"],
                               )
    create_eval_valid_gen = partial(config().create_eval_valid_gen,
                                    required_input_keys=xs_shared.keys(),
                                    required_output_keys=ys_shared.keys()  # + ["patients"]
                                    )
    create_eval_train_gen = partial(config().create_eval_train_gen,
                                    required_input_keys=xs_shared.keys(),
                                    required_output_keys=ys_shared.keys()
                                    )

    print "Train model"
    start_time = time.time()
    prev_time = start_time

    num_batches_chunk = config().batches_per_chunk

    for e, train_data in izip(chunks_train_idcs, buffering.buffered_gen_threaded(create_train_gen())):
        print "Chunk %d/%d" % (e + 1, num_chunks_train)
        epoch = (1.0 * config().batch_size * config().batches_per_chunk * (e + 1) / NUM_TRAIN_PATIENTS)
        print " Epoch %.1f" % epoch

        for key, rate in learning_rate_schedule.iteritems():
            if epoch >= key:
                lr = np.float32(rate)
                learning_rate.set_value(lr)
        print " learning rate %.7f" % lr

        if config().dump_network_loaded_data:
            pickle.dump(train_data, open("data_loader_dump_train_%d.pkl" % e, "wb"))

        for key in xs_shared:
            xs_shared[key].set_value(train_data["input"][key])

        for key in ys_shared:
            ys_shared[key].set_value(train_data["output"][key])

        # print "train:", sorted(train_data["output"]["patients"])
        losses = []
        kaggle_losses = []
        segmentation_losses = []
        for b in xrange(num_batches_chunk):
            iter_result = iter_train(b)
            loss, kaggle_loss, segmentation_loss = tuple(iter_result[:3])
            utils.detect_nans(loss, xs_shared, ys_shared, all_params)

            losses.append(loss)
            kaggle_losses.append(kaggle_loss)
            segmentation_losses.append(segmentation_loss)

        mean_train_loss = np.mean(losses)
        print " mean training loss:\t\t%.6f" % mean_train_loss
        losses_train.append(mean_train_loss)

        print " mean kaggle loss:\t\t%.6f" % np.mean(kaggle_losses)
        print " mean segment loss:\t\t%.6f" % np.mean(segmentation_losses)

        if ((e + 1) % config().validate_every) == 0:
            print
            print "Validating"
            if config().validate_train_set:
                subsets = ["validation", "train"]
                gens = [create_eval_valid_gen, create_eval_train_gen]
                losses_eval = [losses_eval_valid, losses_eval_train]
                losses_kaggle = [losses_eval_valid_kaggle, losses_eval_train_kaggle]
            else:
                subsets = ["validation"]
                gens = [create_eval_valid_gen]
                losses_eval = [losses_eval_valid]
                losses_kaggle = [losses_eval_valid_kaggle]

            for subset, create_gen, losses_validation, losses_kgl in zip(subsets, gens, losses_eval, losses_kaggle):

                vld_losses = []
                vld_kaggle_losses = []
                vld_segmentation_losses = []
                vld_other_losses = {k: [] for k, _ in validation_other_losses.items()}
                print " %s set (%d samples)" % (subset, get_number_of_validation_samples(set=subset))

                for validation_data in buffering.buffered_gen_threaded(create_gen()):
                    num_batches_chunk_eval = config().batches_per_chunk

                    if config().dump_network_loaded_data:
                        pickle.dump(validation_data, open("data_loader_dump_valid_%d.pkl" % e, "wb"))

                    for key in xs_shared:
                        xs_shared[key].set_value(validation_data["input"][key])

                    for key in ys_shared:
                        ys_shared[key].set_value(validation_data["output"][key])

                    # print "validate:", validation_data["output"]["patients"]

                    for b in xrange(num_batches_chunk_eval):
                        losses = tuple(iter_validate(b)[:3 + len(validation_other_losses)])
                        loss, kaggle_loss, segmentation_loss = losses[:3]
                        other_losses = losses[3:]
                        vld_losses.extend(loss)
                        vld_kaggle_losses.extend(kaggle_loss)
                        vld_segmentation_losses.extend(segmentation_loss)
                        for k, other_loss in zip(validation_other_losses, other_losses):
                            vld_other_losses[k].extend(other_loss)

                vld_losses = np.array(vld_losses)
                vld_kaggle_losses = np.array(vld_kaggle_losses)
                vld_segmentation_losses = np.array(vld_segmentation_losses)
                for k in validation_other_losses:
                    vld_other_losses[k] = np.array(vld_other_losses[k])

                # now select only the relevant section to average
                sunny_len = get_lenght_of_set(name="sunny", set=subset)
                regular_len = get_lenght_of_set(name="regular", set=subset)
                num_valid_samples = get_number_of_validation_samples(set=subset)

                # print losses[:num_valid_samples]
                # print kaggle_losses[:regular_len]
                # print segmentation_losses[:sunny_len]

                loss_to_save = obj.compute_average(vld_losses[:num_valid_samples])
                print " mean training loss:\t\t%.6f" % loss_to_save
                print " mean kaggle loss:\t\t%.6f" % np.mean(vld_kaggle_losses[:regular_len])
                print " mean segment loss:\t\t%.6f" % np.mean(vld_segmentation_losses[:sunny_len])
                # print " acc:\t%.2f%%" % (acc * 100)
                for k, v in vld_other_losses.items():
                    print " mean %s loss:\t\t%.6f" % (k, obj.compute_average(v[:num_valid_samples], loss_name=k))
                print

                losses_validation.append(loss_to_save)

                kaggle_to_save = np.mean(vld_kaggle_losses[:regular_len])
                losses_kgl.append(kaggle_to_save)

        now = time.time()
        time_since_start = now - start_time
        time_since_prev = now - prev_time
        prev_time = now
        est_time_left = time_since_start * (float(num_chunks_train - (e + 1)) / float(e + 1 - chunks_train_idcs[0]))
        eta = datetime.now() + timedelta(seconds=est_time_left)
        eta_str = eta.strftime("%c")
        print " %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev)
        print " estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str)
        print

        if ((e + 1) % config().save_every) == 0:
            print
            print "Saving metadata, parameters"

            with open(metadata_path, 'w') as f:
                pickle.dump({
                    'metadata_path': metadata_path,
                    'configuration_file': config().__name__,
                    'git_revision_hash': utils.get_git_revision_hash(),
                    'experiment_id': expid,
                    'chunks_since_start': e,
                    'losses_train': losses_train,
                    'losses_eval_train': losses_eval_train,
                    'losses_eval_train_kaggle': losses_eval_train_kaggle,
                    'losses_eval_valid': losses_eval_valid,
                    'losses_eval_valid_kaggle': losses_eval_valid_kaggle,
                    'time_since_start': time_since_start,
                    'param_values': lasagne.layers.get_all_param_values(top_layer)
                }, f, pickle.HIGHEST_PROTOCOL)

            print " saved to %s" % metadata_path
            print

        # store all known outputs from last batch:
        if config().take_a_dump:
            all_theano_variables = [train_loss_theano, kaggle_loss_theano, segmentation_loss_theano] + theano_printer.get_the_stuff_to_print()
            for layer in all_layers[:-1]:
                all_theano_variables.append(lasagne.layers.helper.get_output(layer))

            iter_train = theano.function([idx], all_theano_variables,
                                         givens=givens, on_unused_input="ignore", updates=updates,
                                         # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
                                         )
            train_data["intermediates"] = iter_train(0)
            pickle.dump(train_data, open(metadata_path + "-dump", "wb"))

    return
def test_dsb():
    image_dir = utils.get_dir_path('analysis', pathfinder.METADATA_PATH)
    image_dir = image_dir + '/test_1/'
    utils.auto_make_dir(image_dir)

    patient_data_paths = utils_lung.get_patient_data_paths(pathfinder.DATA_PATH)
    print len(patient_data_paths)
    patient_data_paths = [pathfinder.DATA_PATH + '/01de8323fa065a8963533c4a86f2f6c1']

    for k, p in enumerate(patient_data_paths):
        pid = utils_lung.extract_pid_dir(p)
        # sid2data, sid2metadata = utils_lung.get_patient_data(p)
        # sids_sorted = utils_lung.sort_sids_by_position(sid2metadata)
        # sids_sorted_jonas = utils_lung.sort_slices_jonas(sid2metadata)
        # sid2position = utils_lung.slice_location_finder(sid2metadata)
        #
        # jonas_slicethick = []
        # for i in xrange(len(sids_sorted_jonas) - 1):
        #     s = np.abs(sid2position[sids_sorted_jonas[i + 1]] - sid2position[sids_sorted_jonas[i]])
        #     jonas_slicethick.append(s)
        #
        # img = np.stack([data_transforms.ct2HU(sid2data[sid], sid2metadata[sid]) for sid in sids_sorted])
        # xx = (jonas_slicethick[0],
        #       sid2metadata[sids_sorted[0]]['PixelSpacing'][0],
        #       sid2metadata[sids_sorted[0]]['PixelSpacing'][1])
        # pixel_spacing = np.asarray(xx)
        img, pixel_spacing = utils_lung.read_dicom_scan(p)

        mask = lung_segmentation.segment_HU_scan_ira(img)
        print pid
        print pixel_spacing
        print '===================================='
        img_out, transform_matrix, mask_out = data_transforms.transform_scan3d(img,
                                                                                pixel_spacing=pixel_spacing,
                                                                                p_transform=config().p_transform,
                                                                                p_transform_augment=None,
                                                                                lung_mask=mask)

        for i in xrange(100, img_out.shape[0], 5):
            plot_slice_3d_2(img_out, mask_out, 0, str(pid) + str(i), idx=np.array([i, 200, 200]))
        plot_slice_3d_2(img_out, mask_out, 0, pid, idx=np.array(img_out.shape) / 2)
        plot_slice_3d_2(mask_out, img_out, 0, pid, idx=np.array(img_out.shape) / 4)
        plot_slice_3d_2(mask_out, img_out, 0, pid, idx=np.array(img_out.shape) / 8)
config_name = sys.argv[1]
set_configuration('configs_seg_scan', config_name)

# predictions path
predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH)
outputs_path = predictions_dir + '/%s' % config_name
utils.auto_make_dir(outputs_path)

# logs
logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH)
sys.stdout = logger.Logger(logs_dir + '/%s.log' % config_name)
sys.stderr = sys.stdout

# builds model and sets its parameters
model = config().build_model()

x_shared = nn.utils.shared_empty(dim=len(model.l_in.shape))

idx_z = T.lscalar('idx_z')
idx_y = T.lscalar('idx_y')
idx_x = T.lscalar('idx_x')

window_size = config().window_size
stride = config().stride
n_windows = config().n_windows

givens = {}
givens[model.l_in.input_var] = x_shared

get_predictions_patch = theano.function([],
                                        nn.layers.get_output(model.l_out, deterministic=True),
def predict_slice_model(expid, outfile, mfile=None): metadata_path = MODEL_PATH + "%s.pkl" % (expid if not mfile else mfile) if theano.config.optimizer != "fast_run": print "WARNING: not running in fast mode!" print "Build model" interface_layers = config().build_model() output_layers = interface_layers["outputs"] input_layers = interface_layers["inputs"] top_layer = lasagne.layers.MergeLayer( incomings=output_layers.values() ) _check_slicemodel(input_layers) # Print the architecture _print_architecture(top_layer) xs_shared = { key: lasagne.utils.shared_empty(dim=len(l_in.output_shape), dtype='float32') for (key, l_in) in input_layers.iteritems() } idx = T.lscalar('idx') givens = dict() for key in input_layers.keys(): if key=="sunny": givens[input_layers[key].input_var] = xs_shared[key][idx*config().sunny_batch_size:(idx+1)*config().sunny_batch_size] else: givens[input_layers[key].input_var] = xs_shared[key][idx*config().batch_size:(idx+1)*config().batch_size] network_outputs = [ lasagne.layers.helper.get_output(network_output_layer, deterministic=True) for network_output_layer in output_layers.values() ] iter_test = theano.function([idx], network_outputs + theano_printer.get_the_stuff_to_print(), givens=givens, on_unused_input="ignore", # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) ) print "Load model parameters for resuming" resume_metadata = np.load(metadata_path) lasagne.layers.set_all_param_values(top_layer, resume_metadata['param_values']) num_batches_chunk = config().batches_per_chunk num_batches = get_number_of_test_batches() num_chunks = int(np.ceil(num_batches / float(config().batches_per_chunk))) chunks_train_idcs = range(1, num_chunks+1) create_test_gen = partial(config().create_test_gen, required_input_keys = xs_shared.keys(), required_output_keys = ["patients", "slices"], ) print "Generate predictions with this model" start_time = time.time() prev_time = start_time predictions = [{"patient": i+1, "slices": { slice_id: { "systole": np.zeros((0,600)), "diastole": np.zeros((0,600)) } for slice_id in data_loader.get_slice_ids_for_patient(i+1) } } for i in xrange(NUM_PATIENTS)] # Loop over data and generate predictions for e, test_data in izip(itertools.count(start=1), buffering.buffered_gen_threaded(create_test_gen())): print " load testing data onto GPU" for key in xs_shared: xs_shared[key].set_value(test_data["input"][key]) patient_ids = test_data["output"]["patients"] slice_ids = test_data["output"]["slices"] print " patients:", " ".join(map(str, patient_ids)) print " chunk %d/%d" % (e, num_chunks) for b in xrange(num_batches_chunk): iter_result = iter_test(b) network_outputs = tuple(iter_result[:len(output_layers)]) network_outputs_dict = {output_layers.keys()[i]: network_outputs[i] for i in xrange(len(output_layers))} kaggle_systoles, kaggle_diastoles = config().postprocess(network_outputs_dict) kaggle_systoles, kaggle_diastoles = kaggle_systoles.astype('float64'), kaggle_diastoles.astype('float64') for idx, (patient_id, slice_id) in enumerate( zip(patient_ids[b*config().batch_size:(b+1)*config().batch_size], slice_ids[b*config().batch_size:(b+1)*config().batch_size])): if patient_id != 0: index = patient_id-1 patient_data = predictions[index] assert patient_id==patient_data["patient"] patient_slice_data = patient_data["slices"][slice_id] patient_slice_data["systole"] = np.concatenate((patient_slice_data["systole"], kaggle_systoles[idx:idx+1,:]),axis=0) patient_slice_data["diastole"] = np.concatenate((patient_slice_data["diastole"], 
kaggle_diastoles[idx:idx+1,:]),axis=0) now = time.time() time_since_start = now - start_time time_since_prev = now - prev_time prev_time = now est_time_left = time_since_start * (float(num_chunks - (e + 1)) / float(e + 1 - chunks_train_idcs[0])) eta = datetime.now() + timedelta(seconds=est_time_left) eta_str = eta.strftime("%c") print " %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev) print " estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str) print # Average predictions already_printed = False for prediction in predictions: for prediction_slice_id in prediction["slices"]: prediction_slice = prediction["slices"][prediction_slice_id] if prediction_slice["systole"].size>0 and prediction_slice["diastole"].size>0: average_method = getattr(config(), 'tta_average_method', partial(np.mean, axis=0)) prediction_slice["systole_average"] = average_method(prediction_slice["systole"]) prediction_slice["diastole_average"] = average_method(prediction_slice["diastole"]) try: test_if_valid_distribution(prediction_slice["systole_average"]) test_if_valid_distribution(prediction_slice["diastole_average"]) except: if not already_printed: print "WARNING: These distributions are not distributions" already_printed = True prediction_slice["systole_average"] = make_monotone_distribution(prediction_slice["systole_average"]) prediction_slice["diastole_average"] = make_monotone_distribution(prediction_slice["diastole_average"]) print "Calculating training and validation set scores for reference" # Add CRPS scores to the predictions # Iterate over train and validation sets for patient_ids, set_name in [(validation_patients_indices, "validation"), (train_patients_indices, "train")]: # Iterate over patients in the set for patient in patient_ids: prediction = predictions[patient-1] # Iterate over the slices for slice_id in prediction["slices"]: prediction_slice = prediction["slices"][slice_id] if "systole_average" in prediction_slice: assert patient == regular_labels[patient-1, 0] error_sys = CRSP(prediction_slice["systole_average"], regular_labels[patient-1, 1]) prediction_slice["systole_CRPS"] = error_sys prediction_slice["target_systole"] = regular_labels[patient-1, 1] error_dia = CRSP(prediction_slice["diastole_average"], regular_labels[patient-1, 2]) prediction_slice["diastole_CRPS"] = error_dia prediction_slice["target_diastole"] = regular_labels[patient-1, 2] prediction_slice["CRPS"] = 0.5 * error_sys + 0.5 * error_dia print "dumping prediction file to %s" % outfile with open(outfile, 'w') as f: pickle.dump({ 'metadata_path': metadata_path, 'configuration_file': config().__name__, 'git_revision_hash': utils.get_git_revision_hash(), 'experiment_id': expid, 'time_since_start': time_since_start, 'param_values': lasagne.layers.get_all_param_values(top_layer), 'predictions_per_slice': predictions, }, f, pickle.HIGHEST_PROTOCOL) print "prediction file dumped" return
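# Hedged usage sketch (not part of the original script; the experiment id and output path below
# are made-up placeholders). predict_slice_model() loads "<expid>.pkl" from MODEL_PATH, runs the
# network over the whole test generator and dumps, for every slice of every patient, the stacked
# 600-bin systole/diastole distributions plus their averages and CRPS scores to `outfile`.
if __name__ == '__main__':
    predict_slice_model(expid='example_slice_config',
                        outfile='/tmp/example_slice_config_predictions.pkl')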
expid = utils.generate_expid(config_name) print print "Experiment ID: %s" % expid print # metadata metadata_dir = utils.get_dir_path('models', pathfinder.METADATA_PATH) metadata_path = metadata_dir + '/%s.pkl' % expid # logs logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH) sys.stdout = logger.Logger(logs_dir + '/%s.log' % expid) sys.stderr = sys.stdout print 'Build model' model = config().build_model() all_layers = nn.layers.get_all_layers(model.l_out) all_params = nn.layers.get_all_params(model.l_out) num_params = nn.layers.count_params(model.l_out) print ' number of parameters: %d' % num_params print string.ljust(' layer output shapes:', 36), print string.ljust('#params:', 10), print 'output shape:' for layer in all_layers: name = string.ljust(layer.__class__.__name__, 32) num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()]) num_param = string.ljust(num_param.__str__(), 10) print ' %s %s %s' % (name, num_param, layer.output_shape) train_loss = config().build_objective(model, deterministic=False) valid_loss = config().build_objective(model, deterministic=True)
# predictions path predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH) output_pkl_file = predictions_dir + '/%s-%s.pkl' % (expid, set) submissions_dir = utils.get_dir_path('submissions', pathfinder.METADATA_PATH) output_csv_file = submissions_dir + '/%s-%s.csv' % (expid, set) # if os.path.isfile(output_pkl_file): # pid2prediction = utils.load_pkl(output_pkl_file) # utils_lung.write_submission(pid2prediction, output_csv_file) # print 'saved csv' # print output_csv_file # sys.exit(0) print 'Build model' model = config().build_model() all_layers = nn.layers.get_all_layers(model.l_out) all_params = nn.layers.get_all_params(model.l_out) num_params = nn.layers.count_params(model.l_out) print ' number of parameters: %d' % num_params print string.ljust(' layer output shapes:', 36), print string.ljust('#params:', 10), print 'output shape:' for layer in all_layers: name = string.ljust(layer.__class__.__name__, 32) num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()]) num_param = string.ljust(num_param.__str__(), 10) print ' %s %s %s' % (name, num_param, layer.output_shape) nn.layers.set_all_param_values(model.l_out, metadata['param_values'])
config_name = sys.argv[1] n_tta_iterations = int(sys.argv[2]) if len(sys.argv) >= 3 else 100 mean = sys.argv[3] if len(sys.argv) >= 4 else "geometric" print "Make %s tta predictions for %s set using %s mean" % (n_tta_iterations, "valid and test", mean) metadata_dir = utils.get_dir_path("train", METADATA_PATH) metadata_path = utils.find_model_metadata(metadata_dir, config_name) metadata = utils.load_pkl(metadata_path) assert config_name == metadata["configuration"] if "subconfiguration" in metadata: set_subconfiguration(metadata["subconfiguration"]) set_configuration(config_name) # predictions paths jonas_prediction_path = PREDICTIONS_PATH + "/ira_%s.pkl" % config().__name__ prediction_dir = utils.get_dir_path("predictions", METADATA_PATH) valid_prediction_path = prediction_dir + "/%s-%s-%s-%s.pkl" % ( metadata["experiment_id"], "valid", n_tta_iterations, mean, ) test_prediction_path = prediction_dir + "/%s-%s-%s-%s.pkl" % (metadata["experiment_id"], "test", n_tta_iterations, mean) # submissions paths submission_dir = utils.get_dir_path("submissions", METADATA_PATH) submission_path = submission_dir + "/%s-%s-%s-%s.csv" % (metadata["experiment_id"], "test", n_tta_iterations, mean) # logs logs_dir = utils.get_dir_path("logs", METADATA_PATH)
def load_addon(faddon, reqversion, tablenames): ''' Poss. cases | BASE | DEPENDENCY | OPTIONAL ------------+--------------------+--------------------+-------------------- NOT FOUND | impossible | critical exception | debug message ------------+--------------------+--------------------+-------------------- DISABLED | debug message | critical exception | debug message ------------+--------------------+--------------------+-------------------- VERSION | impossible | critical exception | critical exception ------------+--------------------+--------------------+-------------------- TABLES | critical exception | critical exception | critical exception ''' try: folder, addon = faddon.split('.') except ValueError: # Check core version # Get only the major version number instversion = int(info.core.version.split(".", 1)[0]) if reqversion is not False and instversion != reqversion: raise exceptions.AddonVersionError(instversion) else: section, logname = { 'extensions': ('Extensions', 'extension'), 'interfaces': ('Interfaces', 'interface'), 'plugins': ('Plugins', 'plugin'), }[folder] mfaddon = '.'.join(('outspline', faddon)) # An addon may list a dependency that is not installed # This check must be done before the other ones, in fact if the addon # is not installed it's impossible to read its info if addon not in configuration.config(section).get_sections(): raise exceptions.AddonNotFoundError() # This check must be done before the version or the provided tables # ones, in fact if an addon is disabled these problems shouldn't matter if not configuration.config(section)(addon).get_bool('enabled'): raise exceptions.AddonDisabledError() ainfo = importlib.import_module(".".join(("outspline", "info", folder, addon))) # Get only the major version number # This version check must be done before the 'mfaddon not in # sys.modules' one, otherwise it's not always performed; for example # two different addons may require the same addon with different # versions, and if the first one required the correct version, when # checking the second one no exception would be raised instversion = int(ainfo.version.split(".", 1)[0]) if reqversion is not False and instversion != reqversion: raise exceptions.AddonVersionError(instversion) # This check must be done after the version one, see the comment there # for the reason if mfaddon not in sys.modules: if section == 'Extensions': ptables = {table: faddon for table in ainfo.provides_tables if table} test = [table for table in set(tablenames) & set(ptables) if tablenames[table] != ptables[table]] if test: raise exceptions.ExtensionProvidedTablesError(test, [tablenames[table] for table in test]) tablenames.update(ptables) try: ainfo.dependencies except AttributeError: pass else: for dep, ver in ainfo.dependencies: try: load_addon(dep, int(ver), tablenames=tablenames) # If I wanted to silently disable an addon in case one of # its dependencies is not satisfied (not found, # disabled...) 
I should disable the addon in the # configuration to prevent the following bug: an enabled # addon is activated since all its dependencies are # enabled; that addon also has an optional dependency which # is also enabled and activated; this optional dependency, # though, has a dependency which is not enabled, so it is # not imported by this load_addon() function; however, # since in the configuration it is enabled, it's imported # by the main addon anyway with # coreaux_api.import_optional_extension_api(), thus # breaking the application, since the dependency for the # optional dependency is still missing # Note that this change won't be written in the # configuration file, since it's updated with # config.export_add() #except ...: # configuration.config(section)(addon)['enabled'] = 'off' except exceptions.AddonNotFoundError: log.error('{} depends on {} which however cannot be ' 'found'.format(faddon, dep)) # Raise a different exception, otherwise it may be # caught by start_addons() raise exceptions.AddonDependencyError() except exceptions.AddonDisabledError: log.error('{} depends on {} which however is ' 'disabled'.format(faddon, dep)) # Raise a different exception, otherwise it will be # caught by start_addons() raise exceptions.AddonDependencyError() except exceptions.AddonVersionError as err: log.error('{} depends on {} {} which however is ' 'installed with version {}'.format( faddon, dep, ver, err.version)) # Raise a different exception, otherwise it may be # caught by start_addons() raise exceptions.AddonDependencyError() except exceptions.ExtensionProvidedTablesError as err: log.error('{} depends on {} which provides tables {} ' 'that are already provided by {}'.format( faddon, dep, ', '.join(err.tables), ', '.join(err.extensions))) # Raise a different exception, otherwise it will be # caught by start_addons() raise exceptions.AddonDependencyError() try: ainfo.optional_dependencies except AttributeError: pass else: for opt, ver in ainfo.optional_dependencies: try: load_addon(opt, int(ver), tablenames=tablenames) except exceptions.AddonNotFoundError: log.debug('{} optionally depends on {} which however ' 'cannot be found'.format(faddon, opt)) except exceptions.AddonDisabledError: log.debug('{} optionally depends on {} which however ' 'is disabled'.format(faddon, opt)) except exceptions.AddonVersionError as err: log.error('{} optionally depends on {} {} which ' 'however is installed with version {}'.format( faddon, opt, ver, err.version)) # Just crash the application, in fact it's not easy to # handle this case, as the same addon may be required # by another addon with the correct version, but still # this addon should *not* use this dependency # Raise a different exception, otherwise it will be # caught by start_addons() raise exceptions.AddonDependencyError() except exceptions.ExtensionProvidedTablesError as err: log.error('{} optionally depends on {} which provides ' 'tables {} that are already provided by ' '{}'.format(faddon, opt, ', '.join(err.tables), ', '.join(err.extensions))) # Just crash the application, in fact it's not easy to # handle this case, as the same addon may be required # by another addon with the correct version, but still # this addon should *not* use this dependency # Raise a different exception, otherwise it will be # caught by start_addons() raise exceptions.AddonDependencyError() mod = importlib.import_module(mfaddon) # Interfaces must have a main() fnuction if hasattr(mod, 'main') or folder == 'interfaces': mod.main() enabled_addons[section].add(addon) 
log.info('Loaded {}: {}'.format(logname, addon))
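# Hedged driving sketch (an assumption, not the project's actual start-up code): load every
# configured extension as a BASE addon (reqversion=False), collecting the tables each one
# provides. Per the table in the docstring above, a disabled base addon only warrants a debug
# message, while version or provided-table conflicts raised by its dependencies still propagate.
def _load_enabled_extensions_sketch():
    tablenames = {}
    for addon in configuration.config('Extensions').get_sections():
        try:
            load_addon('extensions.' + addon, False, tablenames)
        except exceptions.AddonDisabledError:
            log.debug('extension {} is disabled, not loading it'.format(addon))
    return tablenames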
def parse_cli_args(): # Options -h and --help are automatically created cliparser = argparse.ArgumentParser(description=_DESCRIPTION) cliparser.add_argument('-c', '--config', default=None, metavar='FILE', dest='configfile', help='set the configuration file name: a relative ' 'or full path can be specified (default: {})' ''.format(_USER_CONFIG_FILE)) cliparser.add_argument('-l', '--logfile', default=None, metavar='FILE', dest='logfile', help='set the log file name: a relative or full ' 'path can be specified (default: {}, see also ' '--loglevel option)' ''.format(os.path.expanduser(config('Log' )['log_file']))) cliparser.add_argument('-L', '--loglevel', default=None, metavar='NN', dest='loglevel', help='a 2-digit number (in base 4, from 00 to 33) ' 'whose digits define the verbosity of, ' 'respectively, stdout and file log messages; ' '0) disabled; 1) essential reports; 2) normal ' 'verbosity; 3) debug mode; digits different ' 'from 0,1,2,3 will default to the respective ' 'value set in the configuration file ' '(default: {}{}, see also --logfile option)' ''.format(config('Log')['log_level_stdout'], config('Log')['log_level_file'])) cliparser.add_argument('-u', '--config-update', action='store_true', dest='updonly', help='only create or update the configuration ' 'file, then exit') cliparser.add_argument('-v', '--version', action=ShowVersion, nargs=0, dest='version', help='show program\'s version number, copyright ' 'and license information, then exit') cliparser.add_argument('--about', action=ShowAbout, nargs=0, dest='about', help='show information on the installed components ' 'and addons, then exit') return cliparser.parse_args()
import numpy as np
import scipy.misc
import tensorflow as tf
import tensorflow.contrib.gan as tfgan
import tensorflow.contrib.slim as slim
from tensorflow.contrib.gan.python import namedtuples

import configuration
import data_provider

tf.reset_default_graph()
conf = configuration.config()
initializer = None
batch_norm_params = {
    'decay': conf.batch_norm_decay,
    'epsilon': conf.epsilon,
    'updates_collections': tf.GraphKeys.UPDATE_OPS,
    'is_training': conf.is_training,
    'zero_debias_moving_mean': True
}

# training parameters
global_step = tf.train.get_or_create_global_step()
generator_loss_fn = tfgan.losses.modified_generator_loss
discriminator_loss_fn = tfgan.losses.modified_discriminator_loss
weights_initializer = tf.initializers.random_normal(mean=0, stddev=0.02)
gen_lr = tf.train.exponential_decay(conf.gen_lr, global_step, conf.decay_steps, 0.5, "generator_learning_rate")
tf.summary.scalar("gen_learning_rate", gen_lr)
generator_optimizer = tf.train.AdamOptimizer(learning_rate=gen_lr, beta1=0.5)
metadata_path = sys.argv[1] metadata_dir = utils.get_dir_path('train', METADATA_PATH) metadata = utils.load_pkl(metadata_dir + '/%s' % metadata_path) config_name = metadata['configuration'] if 'subconfiguration' in metadata: set_subconfiguration(metadata['subconfiguration']) set_configuration(config_name) # predictions paths prediction_dir = utils.get_dir_path('predictions', METADATA_PATH) prediction_path = prediction_dir + "/%s.pkl" % metadata['experiment_id'] prediction_mu_std_path = prediction_dir + "/%s_mu_sigma.pkl" % metadata['experiment_id'] print "Build model" model = config().build_model() all_layers = nn.layers.get_all_layers(model.l_top) all_params = nn.layers.get_all_params(model.l_top) num_params = nn.layers.count_params(model.l_top) print ' number of parameters: %d' % num_params nn.layers.set_all_param_values(model.l_top, metadata['param_values']) xs_shared = [nn.utils.shared_empty(dim=len(l.shape)) for l in model.l_ins] givens_in = {} for l_in, x in izip(model.l_ins, xs_shared): givens_in[l_in.input_var] = x iter_test_det = theano.function([], [nn.layers.get_output(l, deterministic=True) for l in model.l_outs], givens=givens_in, on_unused_input='warn') iter_mu = theano.function([], [nn.layers.get_output(l, deterministic=True) for l in model.mu_layers], givens=givens_in,
def build_nesterov_updates(train_loss, all_params, learning_rate): updates = lasagne.updates.nesterov_momentum(train_loss, all_params, learning_rate, config().momentum) return updates
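# Hedged usage sketch with a toy model (this is not the project's network; config().momentum is
# assumed to be defined by the active configuration, as in the training scripts above). The
# updates dict returned by build_nesterov_updates() plugs straight into theano.function, so each
# call of train_fn performs one Nesterov-momentum step on the parameters.
import numpy as np
import theano
import theano.tensor as T
import lasagne

x = T.matrix('x')
t = T.matrix('t')
l_in = lasagne.layers.InputLayer((None, 10), input_var=x)
l_out = lasagne.layers.DenseLayer(l_in, num_units=1, nonlinearity=None)

train_loss = T.mean((lasagne.layers.get_output(l_out) - t) ** 2)
all_params = lasagne.layers.get_all_params(l_out, trainable=True)
learning_rate = theano.shared(np.float32(1e-3))

updates = build_nesterov_updates(train_loss, all_params, learning_rate)
train_fn = theano.function([x, t], train_loss, updates=updates)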
expid = utils.generate_expid(config_name)
print()
print("Experiment ID: %s" % expid)
print()

# metadata
metadata_dir = utils.get_dir_path('models', pathfinder.METADATA_PATH)
metadata_path = metadata_dir + '/%s.pkl' % expid

# logs
logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH)
sys.stdout = logger.Logger(logs_dir + '/%s.log' % expid)
sys.stderr = sys.stdout

print('Build model')
model = config().build_model()
all_layers = nn.layers.get_all_layers(model.l_out)
all_params = nn.layers.get_all_params(model.l_out)
num_params = nn.layers.count_params(model.l_out)
print(' number of parameters: %d' % num_params)
print(' layer output shapes:'.ljust(36) + '#params:'.ljust(10) + 'output shape:')
for layer in all_layers:
    name = layer.__class__.__name__.ljust(32)
    num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
    num_param = str(num_param).ljust(10)
    print(' %s %s %s' % (name, num_param, layer.output_shape))

train_loss = config().build_objective(model, deterministic=False)
valid_loss = config().build_objective(model, deterministic=True)
def get_lob_data(pair, date_start, date_end, frequency=timedelta(seconds=10), lob_depth=10): ''' Function to get limit orde book snapshots time series Arguments: pair -- string, curency pair to return (e.g.'USDT_BTC') date_start -- string, timeseries start date_end -- string, timeseries end frequency -- timedelta, the minimum time granularity (e.g. timedelta(seconds=10)) lob_depth -- number of ob levels analyzed Returns: Dask data frame ''' print(f'Checking for cached LOB data from {date_start} to {date_end}') #TODO assert if date_end is yesterday or earlier assert frequency >= timedelta( seconds=1), 'Frequency must be equal to or greater than 1 second' configuration = config() raw_data_folder = configuration['folders']['raw_lob_data'] resampled_data_folder = configuration['folders']['resampled_data'] date_start = datetime.strptime(date_start, '%Y-%m-%d') date_end = datetime.strptime(date_end, '%Y-%m-%d') freq = f'{int(frequency.total_seconds())}s' os.makedirs( f'{resampled_data_folder}/{pair}/{lob_depth}_levels/original_frequency', exist_ok=True) os.makedirs(f'{resampled_data_folder}/{pair}/{lob_depth}_levels/{freq}', exist_ok=True) data = [] # Loop through day folders date_to_process = date_start while date_to_process <= date_end: day_folder = datetime.strftime(date_to_process, '%Y/%m/%d') day_cache_file_name = f'{datetime.strftime(date_to_process, "%Y-%m-%d")}.csv.gz' resampled_file_path = f'{resampled_data_folder}/{pair}/{lob_depth}_levels/{freq}/{day_cache_file_name}' if os.path.isfile(resampled_file_path): print(f'Found {resampled_file_path}') else: print(f'Generating {resampled_file_path}') original_file_name = f'{resampled_data_folder}/{pair}/{lob_depth}_levels/original_frequency/{day_cache_file_name}' if os.path.isfile(original_file_name): day_data = pd.read_csv(original_file_name, parse_dates=['Datetime']) else: # empty json and nested list every new day processed raw_data = {} # empty dict to update with incoming json processed_data = [] if not os.path.isdir(f'{raw_data_folder}/{pair}/{day_folder}'): s3_resource = get_s3_resource() lob_data_bucket = s3_resource.Bucket( configuration['buckets']['lob_data']) os.makedirs(f'{raw_data_folder}/tmp/{pair}/{day_folder}', exist_ok=True) keys = [] for obj in lob_data_bucket.objects.filter( Prefix=f'{pair}/{day_folder}'): keys.append(obj.key) download_s3_folder(lob_data_bucket, day_folder, keys) shutil.move(f'{raw_data_folder}/tmp/{pair}/{day_folder}', f'{raw_data_folder}/{pair}/{day_folder}') # Load all files in to a dictionary for file_name in os.listdir( f'{raw_data_folder}/{pair}/{day_folder}'): try: with gzip.open( f'{raw_data_folder}/{pair}/{day_folder}/{file_name}', 'r') as f: json_string = f.read().decode('utf-8') frozen = json_string.count('"isFrozen": "1"') if frozen > 0: print(f'Frozen {frozen} snapshots') raw_data_temp = load_lob_json(json_string) except Exception as e: print(e.errno) print(e) raw_data.update(raw_data_temp) # number of seconds in a day / frequencey in seconds snapshot_count_day = int(24 * 60 * 60 / frequency.total_seconds()) if len(raw_data) != snapshot_count_day: diff = snapshot_count_day - len(raw_data) if diff > 0: print(f'{diff} gaps in {original_file_name}') else: print( f'{diff * -1} additional data points in {original_file_name}' ) #del(raw_data['BTC_XRP-20200404_000000']) #TODO fix sequence order raw_data_frame = pd.DataFrame.from_dict(raw_data, orient='index') raw_data_frame.reset_index(inplace=True) raw_data_frame['index'] = raw_data_frame['index'].str[-15:] raw_data_frame['index'] = pd.to_datetime( 
raw_data_frame['index'], format='%Y%m%d_%H%M%S') raw_data_frame.set_index('index', drop=True, inplace=True) raw_data_frame.sort_index(inplace=True) idx_start = date_to_process idx_end = date_to_process + timedelta(days=1) - timedelta( seconds=1) idx = pd.date_range(idx_start, idx_end, freq='1s') raw_data_frame = raw_data_frame.reindex(idx).ffill().fillna( method='bfill' ) # forward fill gaps and back fill first item if missing # Convert hierarchical json data in to tabular format levels = list(range(lob_depth)) for row in raw_data_frame.itertuples(): ask_price, ask_volume = zip(*row.asks[0:lob_depth]) bid_price, bid_volume = zip(*row.bids[0:lob_depth]) sequences = [row.seq] * lob_depth datetimes = [row.Index] * lob_depth processed_data.append( list( zip(ask_price, ask_volume, bid_price, bid_volume, levels, sequences, datetimes))) # unravel nested structure and force data types day_data = pd.DataFrame( [y for x in processed_data for y in x], #flatten the list of lists structure columns=[ 'Ask_Price', 'Ask_Size', 'Bid_Price', 'Bid_Size', 'Level', 'Sequence', 'Datetime' ]) day_data['Ask_Price'] = day_data['Ask_Price'].astype('float64') day_data['Bid_Price'] = day_data['Bid_Price'].astype('float64') day_data['Sequence'] = day_data['Sequence'].astype('int64') day_data.to_csv(original_file_name, compression='gzip') # resample dataframe to the wanted frequency resampled_day_data = day_data.groupby([ pd.Grouper(key='Datetime', freq=freq), pd.Grouper(key='Level') ]).last().reset_index() resampled_day_data.to_csv(resampled_file_path, compression='gzip') date_to_process += timedelta( days=1) # the most nested folder is a day of the month data.append(resampled_file_path) # computed = df.compute() # df = df.repartition(npartitions=1) # df.to_csv(f'{root_caching_folder}/{pair}/{output_file_name}', compression='gzip', single_file = True) # df.to_parquet(f'/tmp/10-seconds.parquet', compression='gzip', engine='pyarrow', write_index=False) return dd.read_csv(data, compression='gzip')
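# Hedged usage example (pair and dates are placeholders): fetch ten levels of order book
# snapshots for the first week of April 2020, resampled to 10 seconds. The result is a Dask
# dataframe backed by the per-day gzip caches written by the function above.
from datetime import timedelta

lob_ddf = get_lob_data(pair='USDT_BTC', date_start='2020-04-01', date_end='2020-04-07',
                       frequency=timedelta(seconds=10), lob_depth=10)
print(lob_ddf.head())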
metadata = utils.load_pkl(metadata_path)
expid = metadata['experiment_id']

# logs
logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH)
sys.stdout = logger.Logger(logs_dir + '/%s-test.log' % expid)
sys.stderr = sys.stdout

# predictions path
predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH)
outputs_path = predictions_dir + '/' + expid
utils.auto_make_dir(outputs_path)

print('Build model')
model = config().build_model()
all_layers = nn.layers.get_all_layers(model.l_out)
all_params = nn.layers.get_all_params(model.l_out)
num_params = nn.layers.count_params(model.l_out)
print(' number of parameters: %d' % num_params)
print(' layer output shapes:'.ljust(36) + '#params:'.ljust(10) + 'output shape:')
for layer in all_layers:
    name = layer.__class__.__name__.ljust(32)
    num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
    num_param = str(num_param).ljust(10)
    print(' %s %s %s' % (name, num_param, layer.output_shape))
nn.layers.set_all_param_values(model.l_out, metadata['param_values'])
def import_px_data(frequency, pair, date_start, date_end, lob_depth, norm_type, roll):
    '''
    Function that loads preprocessed data ready to be shaped/used for the model to train.
    The experiment folder is the path where data has been cached. The other parameters are part
    of the unique cached file nomenclature. If the file does not exist, it is generated from the
    input data in the "else" block

    Arguments:
    frequency -- timedelta, the minimum time granularity (e.g. timedelta(seconds=10))
    pair -- string, currency pair to return (e.g. 'USDT_BTC')
    date_start -- string, timeseries start
    date_end -- string, timeseries end
    lob_depth -- integer, how many levels of the order book to be considered
    norm_type -- string, can assume values of 'z' or 'dyn' for z-score or dynamic z-score
    roll -- integer, function of the granularity provided
    '''
    configuration = config()
    resampled_data_folder = configuration['folders']['resampled_data']
    frequency_seconds = int(frequency.total_seconds())

    # Data import - needs to be adjusted importing from several files using Dask
    quotes_file_name = f'{pair}--{lob_depth}lev--{frequency_seconds}sec--{date_start}--{date_end}.csv.gz'
    standardized_train_file = f'{resampled_data_folder}/{pair}/TRAIN--{norm_type}-{roll}--{quotes_file_name}'
    standardized_test_file = f'{resampled_data_folder}/{pair}/TEST--{norm_type}-{roll}--{quotes_file_name}'
    top_ob_train_file = f'{resampled_data_folder}/{pair}/TRAIN_TOP--{quotes_file_name}'
    top_ob_test_file = f'{resampled_data_folder}/{pair}/TEST_TOP--{quotes_file_name}'

    # standardized test file contains both trades and quotes
    if os.path.isfile(standardized_test_file):  # testing for one of cache files, assuming all were saved
        # Import cached standardized data
        print(f'Reading cached {standardized_train_file}')
        train_dyn_df = pd.read_csv(standardized_train_file)  #, index_col=1)
        train_dyn_df.drop('Unnamed: 0', axis=1, inplace=True)

        print(f'Reading cached {standardized_test_file}')
        test_dyn_df = pd.read_csv(standardized_test_file)  #, index_col=1)
        test_dyn_df.drop('Unnamed: 0', axis=1, inplace=True)

        print(f'Reading cached {top_ob_train_file}')
        top_ob_train = pd.read_csv(top_ob_train_file)  #, index_col=[0,1])

        print(f'Reading cached {top_ob_test_file}')
        top_ob_test = pd.read_csv(top_ob_test_file)  #, index_col=[0,1])

    else:
        # check separately for quotes and trades input files
        quotes_data_input = get_lob_data(pair, date_start, date_end, frequency, lob_depth)
        quotes_data_input['Datetime'] = dd.to_datetime(quotes_data_input['Datetime'])

        trades_data_input = get_trade_data(pair, date_start, date_end, frequency)
        trades_data_input['Datetime'] = dd.to_datetime(trades_data_input['Datetime'])

        # once input files have been correctly read from the input folder, it's time to create
        # a single standardized cache for trades and quotes
        # TODO - concatenate Dask dataframes
        quotes_data_input_pd = quotes_data_input.compute()
        trades_data_input_pd = trades_data_input.compute()
        data = pd.concat([trades_data_input_pd, quotes_data_input_pd]).sort_values(by=['Datetime', 'Level'])

        roll = roll  #+ 1 # +1 from extra level trades(level -1)
        stdz_depth = lob_depth + 1

        train_dyn_df, test_dyn_df, top_ob_train, top_ob_test = standardized_data_cache(
            data, roll, stdz_depth, standardized_train_file, standardized_test_file,
            top_ob_train_file, top_ob_test_file)

    # reset indexes, cast datetime type and clean unwanted columns
    print(f'train_dyn_df {train_dyn_df.head(3)}')
    print(f'test_dyn_df {test_dyn_df.head(3)}')
    print(f'top_ob_train {top_ob_train.head(3)}')
    print(f'top_ob_test {top_ob_test.head(3)}')
    #train_dyn_df = train_dyn_df.reset_index()
    train_dyn_df['Datetime'] = pd.to_datetime(train_dyn_df['Datetime'])

    #test_dyn_df = test_dyn_df.reset_index()
    test_dyn_df['Datetime'] = pd.to_datetime(test_dyn_df['Datetime'])
    #test_dyn_df.set_index('index', inplace=True)

    #top_ob_train = top_ob_train.reset_index()
    top_ob_train['Datetime'] = pd.to_datetime(top_ob_train['Datetime'])
    top_ob_train.drop('Unnamed: 0', axis=1, inplace=True)

    #top_ob_test = top_ob_test.reset_index()
    top_ob_test['Datetime'] = pd.to_datetime(top_ob_test['Datetime'])
    top_ob_test.drop('Unnamed: 0', axis=1, inplace=True)

    return train_dyn_df, test_dyn_df, top_ob_train, top_ob_test
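# Hedged usage example (all parameter values are illustrative): load the cached, standardized
# train/test frames plus the top-of-book references for a 10-level USDT_BTC book at a 10-second
# frequency, using dynamic z-score normalization ('dyn') with a 3600-row rolling window.
from datetime import timedelta

train_dyn, test_dyn, top_train, top_test = import_px_data(
    frequency=timedelta(seconds=10), pair='USDT_BTC', date_start='2020-04-01',
    date_end='2020-04-07', lob_depth=10, norm_type='dyn', roll=3600)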
config_name = sys.argv[1] set_configuration('configs_luna_props_scan', config_name) # predictions path predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH) outputs_path = predictions_dir + '/%s' % config_name utils.auto_make_dir(outputs_path) # logs logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH) sys.stdout = logger.Logger(logs_dir + '/%s.log' % config_name) sys.stderr = sys.stdout # builds model and sets its parameters model = config().build_model() x_shared = nn.utils.shared_empty(dim=len(model.l_in.shape)) givens_valid = {} givens_valid[model.l_in.input_var] = x_shared get_predictions_patch = theano.function([], nn.layers.get_output(model.l_out, deterministic=True), givens=givens_valid, on_unused_input='ignore') data_iterator = config().data_iterator #existing_preds = [f.rsplit('.') for f in os.listdir(outputs_path)] #print existing_preds
def preprocess_normscale(patient_data, result, index, augment=True, metadata=None, normscale_resize_and_augment_function=normscale_resize_and_augment, testaug=False): """Normalizes scale and augments the data. Args: patient_data: the data to be preprocessed. result: dict to store the result in. index: index indicating in which slot the result dict the data should go. augment: flag indicating wheter augmentation is needed. metadata: metadata belonging to the patient data. """ if augment: if testaug: augmentation_params = sample_test_augmentation_parameters() else: augmentation_params = sample_augmentation_parameters() else: augmentation_params = None zoom_factor = None # Iterate over different sorts of data for tag, data in patient_data.iteritems(): if tag in metadata: metadata_tag = metadata[tag] desired_shape = result[tag][index].shape cleaning_processes = getattr(config(), 'cleaning_processes', []) cleaning_processes_post = getattr(config(), 'cleaning_processes_post', []) if tag.startswith("sliced:data:singleslice"): # Cleaning data before extracting a patch data = clean_images( [patient_data[tag]], metadata=metadata_tag, cleaning_processes=cleaning_processes) # Augment and extract patch # Decide which roi to use. shift_center = (None, None) if getattr(config(), 'use_hough_roi', False): shift_center = metadata_tag["hough_roi"] patient_3d_tensor = normscale_resize_and_augment_function( data, output_shape=desired_shape[-2:], augment=augmentation_params, pixel_spacing=metadata_tag["PixelSpacing"], shift_center=shift_center[::-1])[0] if augmentation_params is not None: zoom_factor = augmentation_params["zoom_x"] * augmentation_params["zoom_y"] else: zoom_factor = 1.0 # Clean data further patient_3d_tensor = clean_images( patient_3d_tensor, metadata=metadata_tag, cleaning_processes=cleaning_processes_post) if "area_per_pixel:sax" in result: raise NotImplementedError() if augmentation_params and not augmentation_params.get("change_brightness", 0) == 0: patient_3d_tensor = augment_brightness(patient_3d_tensor, augmentation_params["change_brightness"]) put_in_the_middle(result[tag][index], patient_3d_tensor, True) elif tag.startswith("sliced:data:randomslices"): # Clean each slice separately data = [ clean_images([slicedata], metadata=metadata, cleaning_processes=cleaning_processes)[0] for slicedata, metadata in zip(data, metadata_tag)] # Augment and extract patches shift_centers = [(None, None)] * len(data) if getattr(config(), 'use_hough_roi', False): shift_centers = [m["hough_roi"] for m in metadata_tag] patient_3d_tensors = [ normscale_resize_and_augment_function( [slicedata], output_shape=desired_shape[-2:], augment=augmentation_params, pixel_spacing=metadata["PixelSpacing"], shift_center=shift_center[::-1])[0] for slicedata, metadata, shift_center in zip(data, metadata_tag, shift_centers)] if augmentation_params is not None: zoom_factor = augmentation_params["zoom_x"] * augmentation_params["zoom_y"] else: zoom_factor = 1.0 # Clean data further patient_3d_tensors = [ clean_images([patient_3d_tensor], metadata=metadata, cleaning_processes=cleaning_processes_post)[0] for patient_3d_tensor, metadata in zip(patient_3d_tensors, metadata_tag)] patient_4d_tensor = _make_4d_tensor(patient_3d_tensors) if augmentation_params and not augmentation_params.get("change_brightness", 0) == 0: patient_4d_tensor = augment_brightness(patient_4d_tensor, augmentation_params["change_brightness"]) if "area_per_pixel:sax" in result: raise NotImplementedError() put_in_the_middle(result[tag][index], 
patient_4d_tensor, True) elif tag.startswith("sliced:data:sax:locations"): pass # will be filled in by the next one elif tag.startswith("sliced:data:sax:is_not_padded"): pass # will be filled in by the next one elif tag.startswith("sliced:data:sax"): # step 1: sort (data, metadata_tag) with slice_location_finder slice_locations, sorted_indices, sorted_distances = slice_location_finder({i: metadata for i,metadata in enumerate(metadata_tag)}) data = [data[idx] for idx in sorted_indices] metadata_tag = [metadata_tag[idx] for idx in sorted_indices] slice_locations = np.array([slice_locations[idx]["relative_position"] for idx in sorted_indices]) slice_locations = slice_locations - (slice_locations[-1] + slice_locations[0])/2.0 data = [ clean_images([slicedata], metadata=metadata, cleaning_processes=cleaning_processes)[0] for slicedata, metadata in zip(data, metadata_tag)] # Augment and extract patches shift_centers = [(None, None)] * len(data) if getattr(config(), 'use_hough_roi', False): shift_centers = [m["hough_roi"] for m in metadata_tag] patient_3d_tensors = [ normscale_resize_and_augment_function( [slicedata], output_shape=desired_shape[-2:], augment=augmentation_params, pixel_spacing=metadata["PixelSpacing"], shift_center=shift_center[::-1])[0] for slicedata, metadata, shift_center in zip(data, metadata_tag, shift_centers)] if augmentation_params is not None: zoom_factor = augmentation_params["zoom_x"] * augmentation_params["zoom_y"] else: zoom_factor = 1.0 # Clean data further patient_3d_tensors = [ clean_images([patient_3d_tensor], metadata=metadata, cleaning_processes=cleaning_processes_post)[0] for patient_3d_tensor, metadata in zip(patient_3d_tensors, metadata_tag)] patient_4d_tensor = _make_4d_tensor(patient_3d_tensors) if augmentation_params and not augmentation_params.get("change_brightness", 0) == 0: patient_4d_tensor = augment_brightness(patient_4d_tensor, augmentation_params["change_brightness"]) # Augment sax order if augmentation_params and augmentation_params.get("flip_sax", 0) > 0.5: patient_4d_tensor = patient_4d_tensor[::-1] slice_locations = slice_locations[::-1] # Put data (images and metadata) in right location put_in_the_middle(result[tag][index], patient_4d_tensor, True) if "sliced:data:sax:locations" in result: eps_location = 1e-7 is_padded = np.array([False]*len(result["sliced:data:sax:locations"][index])) put_in_the_middle(result["sliced:data:sax:locations"][index], slice_locations + eps_location, True, is_padded) if "sliced:data:sax:distances" in result: eps_location = 1e-7 sorted_distances.append(0.0) # is easier for correct padding is_padded = np.array([False]*len(result["sliced:data:sax:distances"][index])) put_in_the_middle(result["sliced:data:sax:distances"][index], np.array(sorted_distances) + eps_location, True, is_padded) if "sliced:data:sax:is_not_padded" in result: result["sliced:data:sax:is_not_padded"][index] = np.logical_not(is_padded) elif tag.startswith("sliced:data:chanzoom:2ch"): # step 1: sort (data, metadata_tag) with slice_location_finder slice_locations, sorted_indices, sorted_distances = slice_location_finder({i: metadata for i,metadata in enumerate(metadata_tag[2])}) top_slice_metadata = metadata_tag[2][sorted_indices[0]] bottom_slice_metadata = metadata_tag[2][sorted_indices[-1]] ch2_metadata = metadata_tag[1] ch4_metadata = metadata_tag[0] trf_2ch, trf_4ch = get_chan_transformations( ch2_metadata=ch2_metadata, ch4_metadata=ch4_metadata, top_point_metadata = top_slice_metadata, bottom_point_metadata = bottom_slice_metadata, 
output_width=desired_shape[-1] ) ch4_3d_patient_tensor, ch2_3d_patient_tensor = [], [] ch4_data = data[0] ch2_data = data[1] if ch4_data is None and ch2_data is not None: ch4_data = ch2_data ch4_metadata = ch2_metadata if ch2_data is None and ch4_data is not None: ch2_data = ch4_data ch2_metadata = ch4_metadata for ch, ch_result, transform, metadata in [(ch4_data, ch4_3d_patient_tensor, trf_4ch, ch4_metadata), (ch2_data, ch2_3d_patient_tensor, trf_2ch, ch2_metadata)]: tform_shift_center, tform_shift_uncenter = build_center_uncenter_transforms(desired_shape[-2:]) zoom_factor = np.sqrt(np.abs(np.linalg.det(transform.params[:2,:2])) * np.prod(metadata["PixelSpacing"])) normalise_zoom_transform = build_augmentation_transform(zoom_x=zoom_factor, zoom_y=zoom_factor) if augmentation_params: augment_tform = build_augmentation_transform(**augmentation_params) total_tform = tform_shift_uncenter + augment_tform + normalise_zoom_transform + tform_shift_center + transform else: total_tform = tform_shift_uncenter + normalise_zoom_transform + tform_shift_center + transform ch_result[:] = [fast_warp(c, total_tform, output_shape=desired_shape[-2:]) for c in ch] # print "zoom factor:", zoom_factor if augmentation_params is not None: zoom_factor = augmentation_params["zoom_x"] * augmentation_params["zoom_y"] else: zoom_factor = 1.0 # Clean data further ch4_3d_patient_tensor = clean_images(np.array([ch4_3d_patient_tensor]), metadata=ch4_metadata, cleaning_processes=cleaning_processes_post)[0] ch2_3d_patient_tensor = clean_images(np.array([ch2_3d_patient_tensor]), metadata=ch2_metadata, cleaning_processes=cleaning_processes_post)[0] # Put data (images and metadata) in right location put_in_the_middle(result["sliced:data:chanzoom:2ch"][index], ch2_3d_patient_tensor, True) put_in_the_middle(result["sliced:data:chanzoom:4ch"][index], ch4_3d_patient_tensor, True) elif tag.startswith("sliced:data:shape"): raise NotImplementedError() elif tag.startswith("sliced:data"): # put time dimension first, then axis dimension data = clean_images(patient_data[tag], metadata=metadata_tag) patient_4d_tensor, zoom_ratios = resize_and_augment(data, output_shape=desired_shape[-2:], augment=augmentation_parameters) if "area_per_pixel:sax" in result: result["area_per_pixel:sax"][index] = zoom_ratios[0] * np.prod(metadata_tag[0]["PixelSpacing"]) if "noswitch" not in tag: patient_4d_tensor = np.swapaxes(patient_4d_tensor,1,0) put_in_the_middle(result[tag][index], patient_4d_tensor) elif tag.startswith("sliced:meta:all"): # TODO: this probably doesn't work very well yet result[tag][index] = patient_data[tag] elif tag.startswith("sliced:meta:PatientSex"): result[tag][index][0] = -1. if patient_data[tag]=='M' else 1. elif tag.startswith("sliced:meta:PatientAge"): number, letter = patient_data[tag][:3], patient_data[tag][-1] letter_rescale_factors = {'D': 365.25, 'W': 52.1429, 'M': 12., 'Y': 1.} result[tag][index][0] = float(patient_data[tag][:3]) / letter_rescale_factors[letter] if augmentation_params and zoom_factor: label_correction_function = lambda x: x * zoom_factor classification_correction_function = lambda x: utils.zoom_array(x, 1./zoom_factor) return label_correction_function, classification_correction_function else: return lambda x: x, lambda x: x
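# Descriptive note on the return values above (the variable names in the example are
# hypothetical): preprocess_normscale() fills result[tag][index] in place and returns two
# correction functions tied to the zoom applied during augmentation. The first multiplies a
# label by zoom_factor (zoom_x * zoom_y), the second resamples a classification target with
# utils.zoom_array(x, 1. / zoom_factor); when no augmentation or zoom was applied, both are
# the identity.
#   label_corr, clf_corr = preprocess_normscale(patient_data, result, index, metadata=metadata)
#   corrected_label = label_corr(raw_label)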
metadata = utils.load_pkl(metadata_path) expid = metadata['experiment_id'] # logs logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH) sys.stdout = logger.Logger(logs_dir + '/%s-test.log' % expid) sys.stderr = sys.stdout # predictions path predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH) outputs_path = predictions_dir + '/' + expid utils.auto_make_dir(outputs_path) print 'Build model' model = config().build_model() all_layers = nn.layers.get_all_layers(model.l_out) all_params = nn.layers.get_all_params(model.l_out) num_params = nn.layers.count_params(model.l_out) print ' number of parameters: %d' % num_params print string.ljust(' layer output shapes:', 36), print string.ljust('#params:', 10), print 'output shape:' for layer in all_layers: name = string.ljust(layer.__class__.__name__, 32) num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()]) num_param = string.ljust(num_param.__str__(), 10) print ' %s %s %s' % (name, num_param, layer.output_shape) nn.layers.set_all_param_values(model.l_out, metadata['param_values'])
expid = utils.generate_expid(config_name)
print()
print("Experiment ID: %s" % expid)
print()

# metadata
metadata_dir = utils.get_dir_path('models', pathfinder.METADATA_PATH)
metadata_path = metadata_dir + '/%s.pkl' % expid

# logs
logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH)
sys.stdout = logger.Logger(logs_dir + '/%s.log' % expid)
sys.stderr = sys.stdout

print('Build model')
model = config().build_model()
all_layers = nn.layers.get_all_layers(model.l_out)
all_params = nn.layers.get_all_params(model.l_out)
num_params = nn.layers.count_params(model.l_out)
print(' number of parameters: %d' % num_params)
print(' layer output shapes:'.ljust(36) + '#params:'.ljust(10) + 'output shape:')
for layer in all_layers:
    name = layer.__class__.__name__.ljust(32)
    num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
    num_param = str(num_param).ljust(10)
    print(' %s %s %s %s' % (name, num_param, layer.output_shape, layer.name))

train_loss = config().build_objective(model, deterministic=False)
from configuration import set_configuration, config import utils_plots import numpy as np set_configuration('configs_seg_scan', 'luna_s_local') data_iter = config().valid_data_iterator for (x, y, lung_mask, annotations, transform_matrices, pid) in data_iter.generate(): predictions_scan = lung_mask * x for nodule_n, zyxd in enumerate(annotations): utils_plots.plot_slice_3d_4(input=x[0, 0], lung_mask=lung_mask[0, 0], prediction=predictions_scan[0, 0], mask=y[0, 0], axis=0, pid='-'.join([str(nodule_n), str(pid)]), idx=zyxd)
expid = utils.generate_expid(config_name) print print "Experiment ID: %s" % expid print # metadata metadata_dir = utils.get_dir_path('models', pathfinder.METADATA_PATH) metadata_path = metadata_dir + '/%s.pkl' % expid # logs logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH) sys.stdout = logger.Logger(logs_dir + '/%s.log' % expid) sys.stderr = sys.stdout print 'Build model' model = config().build_model() all_layers = nn.layers.get_all_layers(model.l_out) all_params = nn.layers.get_all_params(model.l_out) num_params = nn.layers.count_params(model.l_out) print ' number of parameters: %d' % num_params print string.ljust(' layer output shapes:', 36), print string.ljust('#params:', 10), print 'output shape:' for layer in all_layers: name = string.ljust(layer.__class__.__name__, 32) num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()]) num_param = string.ljust(num_param.__str__(), 10) print ' %s %s %s %s' % (name, num_param, layer.output_shape, layer.name) train_loss = config().build_objective(model, deterministic=False) valid_loss = config().build_objective(model, deterministic=True)
sys.exit("Usage: test_luna_scan.py <configuration_name>") config_name = sys.argv[1] set_configuration('configs_seg_scan', config_name) # predictions path predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH) outputs_path = predictions_dir + '/%s' % config_name utils.auto_make_dir(outputs_path) # logs logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH) sys.stdout = logger.Logger(logs_dir + '/%s.log' % config_name) sys.stderr = sys.stdout data_iterator = config().train_data_iterator print print 'Data' print 'n samples: %d' % data_iterator.nsamples start_time = time.time() n_pos = 0 tp = 0 for n, (x, y, lung_mask, annotations, tf_matrix, pid) in enumerate(data_iterator.generate()): print '-------------------------------------' print n, pid n_pos += annotations.shape[0] n_pid_tp = 0 annotations = np.int32(annotations) for i in xrange(annotations.shape[0]):
blobs = np.asarray(blobs_original_voxel_coords) print blobs.shape utils.save_pkl(blobs, outputs_path + '/%s.pkl' % pid) jobs = [] theano.config.warn_float64 = 'raise' if len(sys.argv) < 3: sys.exit("Usage: test_seg_scan_dsb.py <configuration_name> <data_iterator_part>") config_name = sys.argv[1] set_configuration('configs_seg_scan', config_name) data_iterator_part = int(sys.argv[2]) # start from 0 assert data_iterator_part < len(config().data_iterators) # predictions path predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH) outputs_path = predictions_dir + '/%s' % config_name utils.auto_make_dir(outputs_path) # logs logs_dir = utils.get_dir_path('logs', pathfinder.METADATA_PATH) sys.stdout = logger.Logger(logs_dir + '/%s.log' % config_name) sys.stderr = sys.stdout # builds model and sets its parameters model = config().build_model() x_shared = nn.utils.shared_empty(dim=len(model.l_in.shape))
def deploy(module_name=None, resume=False, lock_root=False): app = flask.discover(module_name) lpath = os.path.realpath(os.path.join(env.flask_dir, app.name)) # create new server server = Provider.load(env.provider) server.name = "-".join((app.name, uuid4().hex)) server.create().wait() # prepare system software api.run('apt-get install -y openssh-server') # set up firewall api.run('mkdir /etc/iptables') api.put(config('iptables'), '/etc/iptables/rules') api.put(config('iptables.sh'), '/etc/network/if-pre-up.d/iptables') api.run('chmod +x /etc/network/if-pre-up.d/iptables') # install software with open(config('packages.txt')) as requirements: for requirement in requirements: print("installing requirement `{0}`...".format(requirement)) api.run('apt-get install -y {0}'.format(requirement.strip())) # install python packages api.put(config('requirements.txt'), 'requirements.txt') api.run('pip3 install -r requirements.txt') api.run('rm requirements.txt') # deploy application rpath = os.path.join(SRV_ROOT, app.name) api.put(lpath, SRV_ROOT, use_sudo=True) api.run('chown -R www-data {0}'.format(rpath)) api.run('chmod -R 500 {0}'.format(rpath)) api.put(config('uwsgi.ini'), rpath, use_sudo=True) # extract socket_name with open(config('uwsgi.ini')) as ini: conf = ConfigParser.RawConfigParser() conf.readfp(ini) socket_name = conf.get('uwsgi', 'socket') # configure web server render('nginx.conf', '/etc/nginx/nginx.conf', socket_name = socket_name, static_directories = [ ( x.static_url_path, os.path.join( rpath, os.path.realpath(x.static_folder).split(lpath)[1][1:] )) for x in chain([app], app.blueprints.values()) if x.static_url_path ] ) # configure supervisord api.run('pip install supervisor') render('supervisord.conf', '/etc/supervisord.conf', flask_dir = module_name, location = SRV_ROOT, ini_file = os.path.join(rpath, 'uwsgi.ini'), ) # start web service api.run('/usr/local/bin/supervisord') api.run('service nginx start') if lock_root: # prepare admin user api.run('addgroup admin') api.run('adduser admin --quiet --ingroup admin --gecos ""') api.run('sudo -u admin mkdir /home/admin/.ssh') api.put('~/.ssh/id_rsa.pub', '/home/admin/.ssh/authorized_keys') api.run('chown admin:admin /home/admin/.ssh/authorized_keys') # lock down SSH api.put('build/sshd_config', '/etc/ssh/sshd_config') api.run('service ssh restart')
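# Hedged usage note (assuming this module is loaded as a Fabric fabfile and env.provider /
# env.flask_dir are configured): the task above could then be invoked from the shell, e.g.
#   fab deploy:module_name=myapp,lock_root=True
# which provisions a new server through the Provider, installs the packages listed in
# packages.txt / requirements.txt, deploys the Flask app under SRV_ROOT behind nginx + uwsgi,
# and optionally creates an admin user and locks down SSH.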