def test_export_block_matrices(self):
    data = [np.random.rand(11 * 12), np.random.rand(5 * 17)]
    arrs = [data[0].reshape((11, 12)), data[1].reshape((5, 17))]
    bms = [
        hl.linalg.BlockMatrix._create(11, 12, data[0].tolist(), block_size=4),
        hl.linalg.BlockMatrix._create(5, 17, data[1].tolist(), block_size=8)
    ]

    with hl.TemporaryDirectory() as prefix:
        hl.experimental.export_block_matrices(bms, f'{prefix}/files')
        for i in range(len(bms)):
            a = arrs[i]
            a2 = np.loadtxt(hl.current_backend().fs.open(f'{prefix}/files/{i}.tsv'))
            self.assertTrue(np.array_equal(a, a2))

    with hl.TemporaryDirectory() as prefix2:
        custom_names = ["nameA", "inner/nameB.tsv"]
        hl.experimental.export_block_matrices(bms, f'{prefix2}/files', custom_filenames=custom_names)
        for i in range(len(bms)):
            a = arrs[i]
            a2 = np.loadtxt(hl.current_backend().fs.open(f'{prefix2}/files/{custom_names[i]}'))
            self.assertTrue(np.array_equal(a, a2))
def test_hadoop_mkdir_p(self):
    test_text = "HELLO WORLD"

    with hadoop_open(resource('./some/foo/bar.txt'), 'w') as out:
        out.write(test_text)

    self.assertTrue(hl.hadoop_exists(resource('./some/foo/bar.txt')))

    with hadoop_open(resource('./some/foo/bar.txt')) as f:
        assert f.read() == test_text

    hl.current_backend().fs.rmtree(resource('./some'))
def test_to_from_numpy(self):
    n_rows = 10
    n_cols = 11
    data = np.random.rand(n_rows * n_cols)

    bm = BlockMatrix._create(n_rows, n_cols, data.tolist(), block_size=4)
    a = data.reshape((n_rows, n_cols))

    with hl.TemporaryFilename() as bm_f, hl.TemporaryFilename() as a_f:
        bm.tofile(bm_f)
        a.tofile(a_f)

        a1 = bm.to_numpy()
        a2 = BlockMatrix.from_numpy(a, block_size=5).to_numpy()
        a3 = np.frombuffer(
            hl.current_backend().fs.open(bm_f, mode='rb').read()
        ).reshape((n_rows, n_cols))
        a4 = BlockMatrix.fromfile(a_f, n_rows, n_cols, block_size=3).to_numpy()
        a5 = BlockMatrix.fromfile(bm_f, n_rows, n_cols).to_numpy()

        self._assert_eq(a1, a)
        self._assert_eq(a2, a)
        self._assert_eq(a3, a)
        self._assert_eq(a4, a)
        self._assert_eq(a5, a)

    bmt = bm.T
    at = a.T

    with hl.TemporaryFilename() as bmt_f, hl.TemporaryFilename() as at_f:
        bmt.tofile(bmt_f)
        at.tofile(at_f)

        at1 = bmt.to_numpy()
        at2 = BlockMatrix.from_numpy(at).to_numpy()
        at3 = np.frombuffer(
            hl.current_backend().fs.open(bmt_f, mode='rb').read()
        ).reshape((n_cols, n_rows))
        at4 = BlockMatrix.fromfile(at_f, n_cols, n_rows).to_numpy()
        at5 = BlockMatrix.fromfile(bmt_f, n_cols, n_rows).to_numpy()

        self._assert_eq(at1, at)
        self._assert_eq(at2, at)
        self._assert_eq(at3, at)
        self._assert_eq(at4, at)
        self._assert_eq(at5, at)

    self._assert_eq(bm.to_numpy(_force_blocking=True), a)
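# Hedged illustration (not part of the original suite): the core numpy round trip that
# the test above exercises, reduced to a standalone sketch. The shape and block_size are
# arbitrary example values, and the sketch assumes the same `np`/`BlockMatrix` imports
# used by the surrounding tests. Since no arithmetic is performed, the float64 values
# should round-trip exactly.
def _example_numpy_round_trip():
    a = np.random.rand(4, 3)
    bm = BlockMatrix.from_numpy(a, block_size=2)
    assert np.array_equal(bm.to_numpy(), a)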
def save(self):
    fs = hl.current_backend().fs
    try:
        backup_path = self.save_path + '.bak'
        if fs.exists(self.save_path):
            fs.copy(self.save_path, backup_path)
        with fs.open(self.save_path, 'w') as out:
            json.dump(self, out, indent=2, cls=Encoder)
        if fs.exists(backup_path):
            fs.remove(backup_path)
    except OSError as e:
        # These messages are printed directly because there is no guarantee that the
        # Hail context is in a sane state if any of the above operations fail.
        print(f'Failed saving {self.__class__.__name__} state at {self.save_path}')
        print(f'An attempt was made to copy {self.save_path} to {backup_path}')
        print('An old version of this state may be there.')
        print('Dumping current state as json to standard output, you may wish '
              'to save this output in order to resume the combiner.')
        json.dump(self, sys.stdout, indent=2, cls=Encoder)
        print()
        raise e
def set_query_name(request):
    backend = current_backend()
    if isinstance(backend, ServiceBackend):
        backend.batch_attributes = dict(name=request.node.name)
        yield
        backend.batch_attributes = dict()
    else:
        yield
def load(path) -> 'VariantDatasetCombiner':
    fs = hl.current_backend().fs
    with fs.open(path) as stream:
        combiner = json.load(stream, cls=Decoder)
        if combiner.save_path != path:
            warning('path/save_path mismatch in loaded VariantDatasetCombiner, using '
                    f'{path} as the new save_path for this combiner')
            combiner.save_path = path
        return combiner
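# Hedged usage sketch (not from the original source): resuming a combine from a saved
# plan file. 'plan_path' is a hypothetical caller-supplied path; `load_combiner` is the
# loader used elsewhere in this code, and `run()` is assumed to drive the loaded
# VariantDatasetCombiner to completion, saving its plan as it goes.
def _example_resume_combine(plan_path: str) -> 'VariantDatasetCombiner':
    combiner = load_combiner(plan_path)
    combiner.run()
    return combiner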
def _assert_rectangles_eq(self, expected, rect_path, export_rects, binary=False):
    for (i, r) in enumerate(export_rects):
        piece_path = rect_path + '/rect-' + str(i) + '_' + '-'.join(map(str, r))
        with hl.current_backend().fs.open(piece_path, mode='rb' if binary else 'r') as file:
            expected_rect = expected[r[0]:r[1], r[2]:r[3]]
            if binary:
                actual_rect = np.reshape(
                    np.frombuffer(file.read()),
                    (r[1] - r[0], r[3] - r[2]))
            else:
                actual_rect = np.loadtxt(file, ndmin=2)
            self._assert_eq(expected_rect, actual_rect)
def is_resource_available(self) -> bool:
    """
    Check if this resource is available from the selected source.

    :return: True if the resource is available.
    """
    path = self.path
    # Hail Tables, MatrixTables, and BlockMatrices are directories.
    # For those, check for the existence of the _SUCCESS object.
    path_to_test = (
        f"{path}/_SUCCESS"
        if any(path.endswith(ext) for ext in (".ht", ".mt", ".bm"))
        else path
    )
    return hl.current_backend().fs.exists(path_to_test)
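# Hedged illustration (not part of the original): the _SUCCESS convention used above,
# applied to hypothetical paths. Directory-backed formats (.ht, .mt, .bm) are probed via
# the _SUCCESS marker written on a successful write; flat files are probed directly.
def _example_success_convention():
    def path_to_test(path):
        return f'{path}/_SUCCESS' if any(path.endswith(ext) for ext in ('.ht', '.mt', '.bm')) else path

    assert path_to_test('bucket/data.ht') == 'bucket/data.ht/_SUCCESS'
    assert path_to_test('bucket/data.tsv') == 'bucket/data.tsv'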
def test_backward_compatability(self):
    import os

    def backward_compatible_same(current, old):
        if isinstance(current, hl.Table):
            current = current.select_globals(*old.globals)
            current = current.select(*old.row_value)
        else:
            current = current.select_globals(*old.globals)
            current = current.select_rows(*old.row_value)
            current = current.select_cols(*old.col_value)
            current = current.select_entries(*old.entry)
        return current._same(old)

    all_values_table, all_values_matrix_table = create_all_values_datasets()

    resource_dir = resource('backward_compatability')
    fs = hl.current_backend().fs
    versions = [os.path.basename(x['path']) for x in fs.ls(resource_dir)]

    n = 0
    for v in versions:
        table_dir = os.path.join(resource_dir, v, 'table')
        i = 0
        f = os.path.join(table_dir, '{}.ht'.format(i))
        while fs.exists(f):
            ds = hl.read_table(f)
            assert backward_compatible_same(all_values_table, ds)
            i += 1
            f = os.path.join(table_dir, '{}.ht'.format(i))
            n += 1

        matrix_table_dir = os.path.join(resource_dir, v, 'matrix_table')
        i = 0
        f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
        while fs.exists(f):
            ds = hl.read_matrix_table(f)
            assert backward_compatible_same(all_values_matrix_table, ds)
            i += 1
            f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
            n += 1

    assert n == 72
def test_block_matrices_tofiles(self):
    data = [np.random.rand(11 * 12), np.random.rand(5 * 17)]
    arrs = [data[0].reshape((11, 12)), data[1].reshape((5, 17))]
    bms = [
        hl.linalg.BlockMatrix._create(11, 12, data[0].tolist(), block_size=4),
        hl.linalg.BlockMatrix._create(5, 17, data[1].tolist(), block_size=8)
    ]

    with hl.TemporaryDirectory() as prefix:
        hl.experimental.block_matrices_tofiles(bms, f'{prefix}/files')
        for i in range(len(bms)):
            a = data[i]
            a2 = np.frombuffer(
                hl.current_backend().fs.open(f'{prefix}/files/{i}', mode='rb').read())
            self.assertTrue(np.array_equal(a, a2))
def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombiner]:
    if force:
        return None
    fs = hl.current_backend().fs
    if fs.exists(save_path):
        try:
            combiner = load_combiner(save_path)
            warning(f'found existing combiner plan at {save_path}, using it')
            # we overwrite these values as they are serialized, but not part of the
            # hash for an autogenerated name and we want users to be able to overwrite
            # these when resuming a combine (a common reason to need to resume a combine
            # is a failure due to branch factor being too large)
            combiner.branch_factor = branch_factor
            combiner.target_records = target_records
            combiner.gvcf_batch_size = batch_size
            return combiner
        except (ValueError, TypeError, OSError, KeyError):
            warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting')
    return None
def test_top_level_functions_are_do_not_error(self):
    hl.current_backend()
    hl.debug_info()
def setupAnnotationDBTests(cls):
    startTestHailContext()
    backend = hl.current_backend()
    if isinstance(backend, ServiceBackend):
        backend.batch_attributes = dict(name='setupAnnotationDBTests')

    t = hl.utils.range_table(10)
    t = t.key_by(locus=hl.locus('1', t.idx + 1))
    t = t.annotate(annotation=hl.str(t.idx))

    cls.tempdir_manager = hl.TemporaryDirectory()
    d = cls.tempdir_manager.__enter__()
    fname = d + '/f.mt'
    t.write(fname)

    if isinstance(backend, ServiceBackend):
        backend.batch_attributes = dict()

    cls.db_json = {
        'unique_dataset': {
            'description': 'now with unique rows!',
            'url': 'https://example.com',
            'annotation_db': {'key_properties': ['unique']},
            'versions': [{
                'url': {"aws": {"eu": fname, "us": fname},
                        "gcp": {"eu": fname, "us": fname}},
                'version': 'v1',
                'reference_genome': 'GRCh37'
            }]
        },
        'nonunique_dataset': {
            'description': 'non-unique rows :(',
            'url': 'https://example.net',
            'annotation_db': {'key_properties': []},
            'versions': [{
                'url': {"aws": {"eu": fname, "us": fname},
                        "gcp": {"eu": fname, "us": fname}},
                'version': 'v1',
                'reference_genome': 'GRCh37'
            }]
        }
    }
def test_count_range():
    assert isinstance(hl.current_backend(), ServiceBackend)
    assert hl.utils.range_table(1000)._force_count() == 1000
def test_remove_and_rmtree(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir

    fs = hl.current_backend().fs
    dir = f'{prefix}foo/'
    subdir1 = f'{dir}foo/'
    subdir1subdir1 = f'{subdir1}foo/'
    subdir1subdir2 = f'{subdir1}bar/'
    subdir1subdir3 = f'{subdir1}baz/'

    def touch(filename):
        with fs.open(filename, 'w') as fobj:
            fobj.write('hello world')

    fs.mkdir(dir)
    touch(f'{dir}a')
    touch(f'{dir}b')

    fs.mkdir(subdir1)
    touch(f'{subdir1}a')
    fs.mkdir(subdir1subdir1)
    touch(f'{subdir1subdir1}a')
    fs.mkdir(subdir1subdir2)
    touch(f'{subdir1subdir2}a')
    fs.mkdir(subdir1subdir3)
    touch(f'{subdir1subdir3}a')

    try:
        fs.remove(subdir1subdir2)
    except (FileNotFoundError, IsADirectoryError):
        pass
    except FatalError as err:
        java_nio_error_message = 'DirectoryNotEmptyException: Cannot delete a non-empty directory'
        hadoop_error_message = f'Directory {subdir1subdir2.rstrip("/")} is not empty'
        assert java_nio_error_message in err.args[0] or hadoop_error_message in err.args[0]
    else:
        assert False

    fs.remove(f'{subdir1subdir2}a')

    assert fs.exists(dir)
    assert fs.exists(f'{dir}a')
    assert fs.exists(f'{dir}b')
    assert fs.exists(subdir1)
    assert fs.exists(f'{subdir1}a')
    assert fs.exists(subdir1subdir1)
    assert fs.exists(f'{subdir1subdir1}a')
    # subdir1subdir2: will exist in cloud, but not local, so do not test for it
    assert not fs.exists(f'{subdir1subdir2}a')
    assert fs.exists(subdir1subdir3)
    assert fs.exists(f'{subdir1subdir3}a')

    fs.rmtree(subdir1subdir1)

    assert fs.exists(dir)
    assert fs.exists(f'{dir}a')
    assert fs.exists(f'{dir}b')
    assert fs.exists(subdir1)
    assert fs.exists(f'{subdir1}a')
    assert not fs.exists(subdir1subdir1)
    assert not fs.exists(f'{subdir1subdir1}a')
    # subdir1subdir2: will exist in cloud, but not local, so do not test for it
    assert not fs.exists(f'{subdir1subdir2}a')
    assert fs.exists(subdir1subdir3)
    assert fs.exists(f'{subdir1subdir3}a')

    fs.rmtree(subdir1)

    assert fs.exists(dir)
    assert fs.exists(f'{dir}a')
    assert fs.exists(f'{dir}b')
    assert not fs.exists(subdir1)
    assert not fs.exists(f'{subdir1}a')
    assert not fs.exists(subdir1subdir1)
    assert not fs.exists(f'{subdir1subdir1}a')
    assert not fs.exists(subdir1subdir2)
    assert not fs.exists(f'{subdir1subdir2}a')
    assert not fs.exists(subdir1subdir3)
    assert not fs.exists(f'{subdir1subdir3}a')
def test_subdirs(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir

    fs = hl.current_backend().fs
    dir = f'{prefix}foo/'
    subdir1 = f'{dir}foo/'
    subdir1subdir1 = f'{subdir1}foo/'
    subdir1subdir2 = f'{subdir1}bar/'
    subdir1subdir3 = f'{subdir1}baz/'
    subdir1subdir4_empty = f'{subdir1}qux/'
    subdir2 = f'{dir}bar/'
    subdir3 = f'{dir}baz/'
    subdir4_empty = f'{dir}qux/'

    def touch(filename):
        with fs.open(filename, 'w') as fobj:
            fobj.write('hello world')

    fs.mkdir(dir)
    touch(f'{dir}a')
    touch(f'{dir}b')

    fs.mkdir(subdir1)
    fs.mkdir(subdir1subdir1)
    fs.mkdir(subdir1subdir2)
    fs.mkdir(subdir1subdir3)
    fs.mkdir(subdir1subdir4_empty)
    fs.mkdir(subdir2)
    fs.mkdir(subdir3)
    fs.mkdir(subdir4_empty)

    for subdir in [dir, subdir1, subdir2, subdir3, subdir1subdir1, subdir1subdir2, subdir1subdir3]:
        for i in range(30):
            touch(f'{subdir}a{i:02}')

    assert fs.is_dir(dir)
    assert fs.is_dir(subdir1)
    assert fs.is_dir(subdir1subdir1)
    assert fs.is_dir(subdir1subdir2)
    assert fs.is_dir(subdir1subdir3)
    # subdir1subdir4_empty: in cloud fses, empty dirs do not exist and thus are not dirs
    assert fs.is_dir(subdir2)
    assert fs.is_dir(subdir3)
    # subdir4_empty: in cloud fses, empty dirs do not exist and thus are not dirs

    fs.rmtree(subdir1subdir2)

    assert fs.is_dir(dir)
    assert fs.is_file(f'{dir}a')
    assert fs.is_file(f'{dir}b')
    assert fs.is_dir(subdir1)
    assert fs.is_file(f'{subdir1}a00')
    assert fs.is_dir(subdir1subdir1)
    assert fs.is_file(f'{subdir1subdir1}a00')
    assert not fs.is_dir(subdir1subdir2)
    assert not fs.is_file(f'{subdir1subdir2}a00')
    assert fs.is_dir(subdir1subdir3)
    assert fs.is_file(f'{subdir1subdir3}a00')
    assert fs.is_dir(subdir2)
    assert fs.is_file(f'{subdir2}a00')
    assert fs.is_dir(subdir3)
    assert fs.is_file(f'{subdir3}a00')

    fs.rmtree(dir)
    assert not fs.is_dir(dir)