def do_makesamples(args):
    """Generate new sample indexes for a job and write them to args.output
    (or stdout), preceded by comment lines describing the block and
    condition layouts.

    args must provide: num_samples, block, condition, schema, infile, output.
    """
    settings = Settings(
        num_samples=args.num_samples,
        block_variables=args.block,
        condition_variables=args.condition,
        stat='f')
    schema = load_schema(args.schema)
    # limit=1: presumably only the header is needed to build the layouts,
    # not the full data table — TODO confirm against Input.from_raw_file.
    raw_input_data = Input.from_raw_file(args.infile, schema, limit=1)
    job = Job(input=raw_input_data, settings=settings, schema=schema)
    res = an.new_sample_indexes(job)

    # Fall back to stdout when no output destination was given.
    output = sys.stdout if args.output is None else args.output

    output.write("# Block layout: " + str(job.block_layout) + "\n")
    output.write("# Condition layout: " + str(job.condition_layout) + "\n")

    test = GroupSymbols(job.condition_layout)
    for row in res:
        for x in row:
            output.write(' {:3d}'.format(x))
        # Annotate each index row with its group-symbol rendering.
        output.write(" # " + test(np.array(row)) + "\n")
def copy_input(path, input_path, schema, settings, job_id):
    """Load the raw input from input_path and persist the input table,
    settings, and schema into an HDF5 file at path.

    Creates datasets "table", "feature_ids", "tuning_params" (and
    optionally "equalize_means_ids"), and stores the remaining settings
    as file-level attributes.
    """
    logging.info("Loading input for job from {0}".format(input_path))
    input_data = Input.from_raw_file(input_path, schema)

    logging.info("Saving input, settings, and schema to " + str(path))
    with h5py.File(path, 'w') as db:

        # Save the input object
        ids = input_data.feature_ids

        # Feature ids are strings, so the dataset needs a
        # variable-length string dtype.
        dt = h5py.special_dtype(vlen=str)
        db.create_dataset("table", data=input_data.table)
        db.create_dataset("feature_ids", (len(ids),), dt)
        for i, fid in enumerate(ids):
            db['feature_ids'][i] = fid

        # Save the settings object
        db.create_dataset("tuning_params", data=settings.tuning_params)
        db.attrs['job_id'] = job_id
        # Note: no trailing commas here — the originals stored 1-tuples
        # instead of scalar attribute values.
        db.attrs['stat'] = settings.stat
        db.attrs['glm_family'] = settings.glm_family
        db.attrs['num_bins'] = settings.num_bins
        db.attrs['num_samples'] = settings.num_samples
        db.attrs['sample_from_residuals'] = settings.sample_from_residuals
        db.attrs['sample_with_replacement'] = settings.sample_with_replacement
        # list(...) so h5py stores the materialized values; under
        # Python 3, map() returns a lazy iterator h5py cannot store.
        db.attrs['condition_variables'] = list(map(str, settings.condition_variables))
        if settings.block_variables:
            db.attrs['block_variables'] = settings.block_variables
        db.attrs['summary_min_conf'] = settings.summary_min_conf
        db.attrs['summary_step_size'] = settings.summary_step_size
        db.attrs['equalize_means'] = settings.equalize_means
        db.attrs['shrink'] = settings.shrink

        # Save the schema object
        schema_str = StringIO()
        schema.save(schema_str)
        db.attrs['schema'] = str(schema_str.getvalue())

        if settings.equalize_means_ids is not None:
            db['equalize_means_ids'] = settings.equalize_means_ids
# NOTE(review): this is a byte-for-byte duplicate of the copy_input defined
# immediately above; at import time this second definition silently replaces
# the first. One of the two should be deleted.
def copy_input(path, input_path, schema, settings, job_id):
    """Load the raw input from input_path and persist the input table,
    settings, and schema into an HDF5 file at path.

    Creates datasets "table", "feature_ids", "tuning_params" (and
    optionally "equalize_means_ids"), and stores the remaining settings
    as file-level attributes.
    """
    logging.info("Loading input for job from {0}".format(input_path))
    input_data = Input.from_raw_file(input_path, schema)

    logging.info("Saving input, settings, and schema to " + str(path))
    with h5py.File(path, 'w') as db:

        # Save the input object
        ids = input_data.feature_ids

        # Feature ids are strings, so the dataset needs a
        # variable-length string dtype.
        dt = h5py.special_dtype(vlen=str)
        db.create_dataset("table", data=input_data.table)
        db.create_dataset("feature_ids", (len(ids),), dt)
        for i, fid in enumerate(ids):
            db['feature_ids'][i] = fid

        # Save the settings object
        db.create_dataset("tuning_params", data=settings.tuning_params)
        db.attrs['job_id'] = job_id
        # Note: no trailing commas here — the originals stored 1-tuples
        # instead of scalar attribute values.
        db.attrs['stat'] = settings.stat
        db.attrs['glm_family'] = settings.glm_family
        db.attrs['num_bins'] = settings.num_bins
        db.attrs['num_samples'] = settings.num_samples
        db.attrs['sample_from_residuals'] = settings.sample_from_residuals
        db.attrs['sample_with_replacement'] = settings.sample_with_replacement
        # list(...) so h5py stores the materialized values; under
        # Python 3, map() returns a lazy iterator h5py cannot store.
        db.attrs['condition_variables'] = list(map(str, settings.condition_variables))
        if settings.block_variables:
            db.attrs['block_variables'] = settings.block_variables
        db.attrs['summary_min_conf'] = settings.summary_min_conf
        db.attrs['summary_step_size'] = settings.summary_step_size
        db.attrs['equalize_means'] = settings.equalize_means
        db.attrs['shrink'] = settings.shrink

        # Save the schema object
        schema_str = StringIO()
        schema.save(schema_str)
        db.attrs['schema'] = str(schema_str.getvalue())

        if settings.equalize_means_ids is not None:
            db['equalize_means_ids'] = settings.equalize_means_ids
def load_input(db):
    """Reconstruct an Input object from an open HDF5 job database.

    Reads the "table" and "feature_ids" datasets in full (via the
    Ellipsis index) and passes them to the Input constructor.
    """
    table = db['table'][...]
    feature_ids = db['feature_ids'][...]
    return Input(table, feature_ids)