def mmultfile_ata_piece(a_filename, offset, work_index=0, work_count=1, log_frequency=-1, force_python_only=False):
    """Compute one piece of A.T x A for the float64 matrix stored in *a_filename*.

    The sid (column) range is split into *work_count* batches. This call returns
    a C-ordered ndarray of shape (sid_count - start, stop - start), where
    [start, stop) is the column range of batch *work_index*.

    :param a_filename: file readable via SnpMemMap, containing an
        iid_count x sid_count float64 matrix in Fortran (column-major) order.
    :param offset: unused here; the data offset is taken from the SnpMemMap
        itself (kept for interface compatibility).
    :param work_index: 0-based index of the batch this piece starts at.
    :param work_count: total number of batches the sid range is split into.
    :param log_frequency: log progress every this-many batches; <= 0 disables
        progress logging.
    :param force_python_only: if True, use the pure-NumPy implementation
        instead of the compiled mmultfile_atax kernel.
    """
    t0_gtg = time.time()
    if log_frequency > 0:
        logging.info("ata_piece: Working on piece {0} of {1}.".format(
            work_index, work_count))
    a = SnpMemMap(a_filename)

    def debatch_closure(work_index2):
        # First sid index of batch `work_index2` (even split over work_count).
        return a.sid_count * work_index2 // work_count

    start = debatch_closure(work_index)
    stop = debatch_closure(work_index + 1)
    ata_piece = np.zeros((a.sid_count - start, stop - start), order='C')
    if force_python_only:
        with open(a.filename, "rb") as fp:
            # Seek to the first column of this piece (8 bytes per float64;
            # columns are contiguous because the data is Fortran-ordered).
            fp.seek(a.offset + start * a.iid_count * 8)
            # The fixed right-hand factor: columns [start, stop).
            # (Renamed from `slice`, which shadowed the builtin.)
            base_slice = np.fromfile(fp,
                                     dtype=np.float64,
                                     count=a.iid_count * (stop - start)).reshape(
                                         a.iid_count, stop - start, order="F")
            # BUG FIX: `xrange` is Python 2 only; use `range`.
            for i in range(work_index, work_count):
                starti = debatch_closure(i)
                stopi = debatch_closure(i + 1)
                if i > work_index:
                    # Later batches are read sequentially from the file.
                    slicei = np.fromfile(fp,
                                         dtype=np.float64,
                                         count=a.iid_count *
                                         (stopi - starti)).reshape(a.iid_count,
                                                                   stopi - starti,
                                                                   order="F")
                else:
                    slicei = base_slice
                # BUG FIX: test `log_frequency > 0` FIRST so that a
                # log_frequency of 0 cannot raise ZeroDivisionError in `i % 0`.
                if log_frequency > 0 and i % log_frequency == 0:
                    logging.info("{0}/{1}".format(i, work_count))
                ata_piece[starti - start:stopi - start, :] = np.dot(
                    slicei.T, base_slice)
    else:
        # Compiled kernel fills `ata_piece` in place with the same block.
        mmultfile_atax(a_filename, a.offset, a.iid_count, a.sid_count,
                       work_index, work_count, ata_piece,
                       num_threads=get_num_threads(),
                       log_frequency=log_frequency)
    if log_frequency > 0:
        logging.info("ata_piece {0} of {1}: clocktime {2}".format(
            work_index, work_count, format_delta(time.time() - t0_gtg)))
    return ata_piece
idx = G0_memmap.pos[:,0]!=2 inonzero = np.array([True]*sum(idx)) inonzero[0] = False logging.info("Generating random {0}x{0}".format(sum(idx))) if sum(idx) < 10000: np.random.seed(0) SVinv3 = np.random.random((sum(idx),sum(idx))) else: SVinv3 = np.zeros((sum(idx),sum(idx))) np.fill_diagonal(SVinv3,val=1) logging.info("Done Generating random") local_fn_U = r"d:\deldir\local_fn_U.memmap" t0 = time.time() U_memmap = post_svd(local_fn_U, G0_memmap, idx, SVinv3, inonzero, memory_factor, runner, do_original=do_original,force_python_only=force_python_only,log_frequency=log_frequency) print(U_memmap.val) logging.info("clocktime {0}".format(format_delta(time.time()-t0))) print("done") elif test_ata: if do_fast: filename = r"D:\deldir\test\_Storage\very_small_0.8\G0_data.memmap" else: filename = r"D:\deldir\scratch\escience\carlk\cachebio\genetics\onemil\fc\bigsyn0\U1.memmap" work_count = 3 force_python_only = False a = SnpMemMap(filename) def memmap_lambda():
def _simple_open_read(self, simple_file_name, updater=None):
    """Yield a local path for *simple_file_name*, copying it from a peer first if needed.

    Generator (context-manager style): the caller receives the local path at
    the single ``yield`` and the local copy stays locked/valid until the
    generator is resumed.

    Steps (from the original design notes):
      * Try to use an existing registered local copy whose size matches the
        main file.
      * Otherwise, wait for our turn (dibs), then try each candidate source
        copy in sequence until one copies successfully.
      * If disk space is short, delete local files that are not registered in
        the directory.
      * Register the new local copy with the origin directory.
    """
    logging.debug("open_read('{0}')".format(simple_file_name))
    dir_path = self.common_directory.join(simple_file_name)
    unique_name, root = self.id_and_path_function()
    local_path = root + "/" + simple_file_name
    pstutil.create_directory_if_necessary(local_path, isfile=True)
    copy_name = "copy_{0}.txt".format(unique_name)
    main_path = self._robust_load_main(dir_path)
    file_size = os.path.getsize(main_path)

    if os.path.exists(local_path):
        if main_path == local_path or dir_path.file_exists(copy_name):
            logging.info("\tfound local")
            assert file_size == os.path.getsize(
                local_path
            ), "Local file doesn't have the same size as the main file"
            yield local_path
            return
        else:
            # A local file exists but is not registered: treat it as stale.
            logging.info("\tlost local found. Will remove")
            os.remove(local_path)
            pstutil.create_directory_if_necessary(local_path, isfile=True)

    dib_lib = _DibLib(unique_name, dir_path, dir_path, "dibs")
    try:
        dib_lib.wait_for_turn()
        #Now we can copy. Choose a source at random
        for storage_path, storage_count in self._far_file_sequence(
                dir_path
        ):  # In a loop because in the future may want to handle copies being deleted.
            try:  #If something goes wrong, try the next one
                self._net_use(storage_path)
                assert file_size == os.path.getsize(
                    storage_path
                ), "File to copy from doesn't have the same size as the main file"
                if psutil.disk_usage(
                        os.path.dirname(local_path)
                ).free - file_size < self.leave_space:
                    # Clean up: remove every local file that is not registered
                    # in the directory (e.g. a temp file from other programs).
                    local_dir = os.path.split(local_path)[0]
                    for other_file in os.listdir(local_dir):
                        full_file = local_dir + "/" + other_file
                        if os.path.isfile(
                                full_file
                        ) and not self._simple_file_exists(other_file):
                            logging.info(
                                "\tNeed space and file isn't in directory so removing it. '{0}'"
                                .format(full_file))
                            os.remove(full_file)
                logging.info("\tshutil.copyfile('{0}','{1}')".format(
                    storage_path, local_path))
                then = datetime.datetime.now()
                shutil.copyfile(storage_path, local_path)
                shutil.copystat(storage_path, local_path)
                self._copy_time_stamp(storage_path, local_path)
                dir_path.save(copy_name, local_path)
                delta_sec = max(
                    (datetime.datetime.now() - then).total_seconds(), 1.)
                # BUG FIX: the message has two placeholders but only one
                # argument was supplied, so this always raised IndexError and
                # fell through to the fallback message. Supply both values.
                try:  #The 'try' stops this logging message from getting a div by zero error some times
                    logging.info(
                        "Copy time is {0}. Copy speed is {1} Mbps".format(
                            format_delta(delta_sec),
                            _mbps(file_size, delta_sec)))
                except Exception:
                    logging.info(
                        "Copy time is {0}. Copy speed can't be calculated Mbps"
                        .format(format_delta(delta_sec)))
                break
            except Exception as e:
                if os.path.exists(local_path):
                    logging.warning(
                        "If a local file was created, but something went wrong (e.g. the source disappeared part way through the copying), so we remove it"
                    )
                    os.remove(local_path)
                logging.warning("Ignore exception {0}".format(e))
        assert file_size == os.path.getsize(
            local_path), "File did not copy or did not copy completely."
    finally:
        dib_lib.remove_dibs()
    yield local_path
    # BUG FIX: the assert message referenced an undefined name `path`, which
    # turned an assertion failure into a NameError; name the file checked.
    assert dir_path._simple_file_exists(
        "main.txt"), "File doesn't exist: '{0}'".format("main.txt")
def mmultfile_ata_piece(a_filename, offset, work_index=0, work_count=1, log_frequency=-1, force_python_only=False):
    """Return one piece of A.T x A for the float64 matrix memory-mapped in *a_filename*.

    The sid (column) range is divided into *work_count* batches. With
    [start, stop) being the column range of batch *work_index*, the result is
    a C-ordered ndarray of shape (sid_count - start, stop - start).

    When *force_python_only* is False the compiled `file_dot_piece` kernel is
    used; otherwise a pure-NumPy implementation reads the Fortran-ordered
    float64 data directly from the file. The internal `do_both` switch (off by
    default) runs both paths and asserts that they agree.
    """
    clock_start = time.time()
    if log_frequency > 0:
        logging.info("ata_piece: Working on piece {0} of {1}.".format(
            work_index, work_count))
    snp_mem_map = SnpMemMap(a_filename)

    def batch_start(batch_index):
        # First sid index of the given batch (even split over work_count).
        return snp_mem_map.sid_count * batch_index // work_count

    start = batch_start(work_index)
    stop = batch_start(work_index + 1)
    ata_piece = np.zeros((snp_mem_map.sid_count - start, stop - start),
                         order='C')

    do_both = False  # debugging switch: run both paths and compare results
    run_python = force_python_only or do_both
    run_native = (not force_python_only) or do_both

    if run_python:
        with open(snp_mem_map.filename, "rb") as handle:
            # Position at the first column of this piece (8 bytes per float64;
            # columns are contiguous because the data is Fortran-ordered).
            handle.seek(snp_mem_map.offset +
                        start * snp_mem_map.iid_count * 8)
            # Fixed right-hand factor: columns [start, stop).
            lead_block = np.fromfile(
                handle,
                dtype=np.float64,
                count=snp_mem_map.iid_count * (stop - start)).reshape(
                    snp_mem_map.iid_count, stop - start, order="F")
            current = lead_block
            for batch in range(work_index, work_count):
                lo = batch_start(batch)
                hi = batch_start(batch + 1)
                if batch > work_index:
                    # Later batches are read sequentially from the file.
                    current = np.fromfile(
                        handle,
                        dtype=np.float64,
                        count=snp_mem_map.iid_count * (hi - lo)).reshape(
                            snp_mem_map.iid_count, hi - lo, order="F")
                if log_frequency > 0 and batch % log_frequency == 0:
                    logging.info("{0}/{1}".format(batch, work_count))
                ata_piece[lo - start:hi - start, :] = np.dot(
                    current.T, lead_block)
        if do_both:
            # Keep the Python answer aside and recompute natively for comparison.
            ata_piece_python = ata_piece
            ata_piece = np.zeros(
                (snp_mem_map.sid_count - start, stop - start), order='C')

    if run_native:
        file_dot_piece(str(a_filename),
                       snp_mem_map.offset,
                       snp_mem_map.iid_count,
                       start,
                       ata_piece,
                       num_threads=get_num_threads(None),
                       log_frequency=log_frequency)
        if do_both and not np.abs(ata_piece_python - ata_piece).max() < 1e-12:
            raise AssertionError(
                "Expect Python and Rust to get the same file_dot answer")

    if log_frequency > 0:
        logging.info("ata_piece {0} of {1}: clocktime {2}".format(
            work_index, work_count, format_delta(time.time() - clock_start)))
    return ata_piece