def create_from_ensemble_map(cls, ensemble_map, path):
    """Create a sharded dataset at *path* from *ensemble_map*.

    Args:
        cls: the Sharded-like class to instantiate (keyed on 'ensemble').
        ensemble_map: mapping from ensemble name to whatever
            ``en.parse_ensemble`` consumes for that name.
        path: destination path for the sharded dataset.

    If metadata for a partially-written dataset already exists on disk,
    writing resumes from the first unwritten shard instead of restarting.
    """
    sharded = cls(path, ['ensemble'])
    num_shards = sharded.get_num_shards()
    # Check if already partly written. If so, resume from there.
    metadata_path = sharded._get_metadata()
    if os.path.exists(metadata_path):
        metadata = pd.read_hdf(metadata_path, 'atom3d/data/metadata')
        num_written = len(metadata['shard_num'].unique())
    else:
        num_written = 0

    shard_ranges = _get_shard_ranges(len(ensemble_map), num_shards)
    shard_size = shard_ranges[0, 1] - shard_ranges[0, 0]
    logging.info(f'Ensembles per shard: {shard_size}')

    # Hoisted out of the loop: the key order is invariant across shards,
    # so sort once instead of once per shard.
    names = sorted(ensemble_map.keys())
    for shard_num in tqdm.trange(num_written, num_shards):
        start, stop = shard_ranges[shard_num]
        dfs = [en.parse_ensemble(name, ensemble_map[name])
               for name in names[start:stop]]
        df = dt.merge_dfs(dfs)
        sharded._write_shard(shard_num, df)
def reshard(input_sharded, output_sharded, shuffle_buffer=0):
    """Rebalance dataset, optionally shuffling.

    If shuffle_buffer is not 0, then we perform a streaming shuffle across
    shuffle_buffer number of output shards.
    """
    dirname = os.path.dirname(output_sharded.path)
    if not os.path.exists(dirname) and dirname != '':
        os.makedirs(dirname, exist_ok=True)

    num_structures = input_sharded.get_num_keyed()
    output_num_shards = output_sharded.get_num_shards()
    input_num_shards = input_sharded.get_num_shards()

    shard_ranges = sh._get_shard_ranges(num_structures, output_num_shards)
    shard_sizes = shard_ranges[:, 1] - shard_ranges[:, 0]

    # Buffer this many examples before emitting each output shard.
    if shuffle_buffer != 0:
        buffer_size = shuffle_buffer * shard_sizes[0]
    else:
        buffer_size = 1

    t = tqdm.trange(output_num_shards)
    next_output_shard_num, next_input_shard_num = 0, 0
    to_write, to_consume = [], []
    df = None
    while True:
        while len(to_consume) < buffer_size and \
                (next_input_shard_num != input_num_shards):
            # Read next shard if need more examples.
            df = input_sharded.read_shard(next_input_shard_num)
            to_consume += [y for (_, y) in
                           dt.split_df(df, input_sharded.get_keys())]
            # BUG FIX: was `if shuffle_buffer is not None`, which shuffled
            # unconditionally — shuffle_buffer is an int (default 0), so it
            # is never None. Per the docstring, only shuffle when a
            # shuffle buffer was actually requested.
            if shuffle_buffer != 0:
                random.shuffle(to_consume)
            next_input_shard_num += 1

        if len(to_consume) != 0:
            to_write.append(to_consume.pop(0))

        if len(to_write) == shard_sizes[next_output_shard_num]:
            # Write output shard if have number needed.
            if len(to_write) == 0:
                # Insert empty dataframe if nothing to write.
                # NOTE(review): df may still be None here if no input shard
                # was ever read — presumably input always has >= 1 shard.
                to_write = [df.iloc[0:0]]
            output_sharded._write_shard(next_output_shard_num,
                                        dt.merge_dfs(to_write))
            to_write = []
            next_output_shard_num += 1
            t.update(1)
            if next_output_shard_num == output_num_shards:
                break
def read_scores(scores_dir, targets):
    """
    Return a pandas DataFrame containing scores of all decoys
    for all targets in <targets>. Search in <scores_dir> for the
    label files.
    """
    frames = []
    for target in targets:
        path = os.path.join(scores_dir, f'{target}.dat')
        # Raw string for the regex: '\s+' is an invalid escape sequence
        # (SyntaxWarning on modern CPython); the pattern itself is unchanged.
        df = pd.read_csv(path, delimiter=r'\s+', engine='python').dropna()
        frames.append(df)
    scores_df = dt.merge_dfs(frames)
    return scores_df