Code example #1
    # Classmethod on the sharded-dataset class; assumes module-level imports
    # of os, logging, pandas as pd, and tqdm, plus the project's ensemble
    # (en) and datatypes (dt) helper modules.
    @classmethod
    def create_from_ensemble_map(cls, ensemble_map, path):
        sharded = cls(path, ['ensemble'])

        num_shards = sharded.get_num_shards()

        # Check if already partly written.  If so, resume from there.
        metadata_path = sharded._get_metadata()
        if os.path.exists(metadata_path):
            metadata = pd.read_hdf(metadata_path, 'metadata')
            num_written = len(metadata['shard_num'].unique())
        else:
            num_written = 0

        # Compute balanced [start, stop) ranges over the sorted ensembles.
        shard_ranges = _get_shard_ranges(len(ensemble_map), num_shards)
        shard_size = shard_ranges[0, 1] - shard_ranges[0, 0]

        logging.info(f'Ensembles per shard: {shard_size}')
        for shard_num in tqdm.trange(num_written, num_shards):
            start, stop = shard_ranges[shard_num]

            # Parse each ensemble assigned to this shard and merge the
            # resulting dataframes into one shard dataframe.
            dfs = []
            for name in sorted(ensemble_map.keys())[start:stop]:
                df = en.parse_ensemble(name, ensemble_map[name])
                dfs.append(df)
            df = dt.merge_dfs(dfs)

            sharded._write_shard(shard_num, df)
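
A minimal usage sketch: the Sharded class name, the @4 shard-count suffix in the path, and the ensemble map contents below are assumptions for illustration, not taken from the snippet above.

# Hypothetical usage; Sharded, the path convention, and the file names are
# assumptions.
ensemble_map = {
    'ensemble_a': ['data/a_model1.pdb', 'data/a_model2.pdb'],
    'ensemble_b': ['data/b_model1.pdb'],
}
# Writes all shards under the given prefix, resuming if partially written.
Sharded.create_from_ensemble_map(ensemble_map, 'out/structures@4')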
Code example #2
# Assumes module-level imports of os, random, and tqdm, plus the project's
# shard (sh) and datatypes (dt) helper modules.
def reshard(input_sharded, output_sharded, shuffle_buffer=0):
    """
    Rebalance dataset, optionally shuffling.

    If shuffle_buffer is not 0, we perform a streaming shuffle over a buffer
    holding shuffle_buffer output shards' worth of entries.
    """
    # Ensure the output directory exists.
    dirname = os.path.dirname(output_sharded.path)
    if dirname != '':
        os.makedirs(dirname, exist_ok=True)

    num_structures = input_sharded.get_num_keyed()
    output_num_shards = output_sharded.get_num_shards()
    input_num_shards = input_sharded.get_num_shards()

    # Compute balanced [start, stop) ranges for each output shard.
    shard_ranges = sh._get_shard_ranges(num_structures, output_num_shards)
    shard_sizes = shard_ranges[:, 1] - shard_ranges[:, 0]

    if shuffle_buffer != 0:
        # Buffer shuffle_buffer output shards' worth of entries.
        buffer_size = shuffle_buffer * shard_sizes[0]
    else:
        buffer_size = 1

    t = tqdm.trange(output_num_shards)
    next_output_shard_num, next_input_shard_num = 0, 0
    to_write, to_consume = [], []
    df = None
    while True:
        while len(to_consume) < buffer_size and \
                (next_input_shard_num != input_num_shards):
            # Read the next input shard when more examples are needed.
            df = input_sharded.read_shard(next_input_shard_num)
            to_consume += [y for (_, y) in
                           dt.split_df(df, input_sharded.get_keys())]

            # Shuffle the buffered entries only when a shuffle is requested.
            if shuffle_buffer != 0:
                random.shuffle(to_consume)
            next_input_shard_num += 1

        if len(to_consume) != 0:
            # Move one entry from the buffer to the pending output shard.
            to_write.append(to_consume.pop(0))

        if len(to_write) == shard_sizes[next_output_shard_num]:
            # Write the output shard once it has the required number of
            # entries.
            if len(to_write) == 0:
                # Insert an empty dataframe if this shard gets no entries.
                to_write = [df.iloc[0:0]]

            output_sharded._write_shard(next_output_shard_num,
                                        dt.merge_dfs(to_write))
            to_write = []
            next_output_shard_num += 1
            t.update(1)

            if next_output_shard_num == output_num_shards:
                break
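
A sketch of how this resharding might be invoked, assuming the same Sharded objects and @N shard-count path convention as in the sketch under code example #1 (all names and paths here are illustrative):

# Hypothetical usage; the Sharded class, keys, and paths are assumptions.
input_sharded = Sharded('out/structures@4', ['ensemble'])
output_sharded = Sharded('out/structures_resharded@8', ['ensemble'])
# Rebalance 4 shards into 8, shuffling over a 2-output-shard buffer.
reshard(input_sharded, output_sharded, shuffle_buffer=2)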
Code example #3
# Assumes module-level imports of os and pandas as pd, plus the project's
# datatypes (dt) helper module.
def read_scores(scores_dir, targets):
    """
    Return a pandas DataFrame containing the scores of all decoys for every
    target in <targets>, reading the label files from <scores_dir>.
    """
    frames = []
    for target in targets:
        # Each target has a whitespace-delimited <target>.dat label file.
        df = pd.read_csv(os.path.join(scores_dir, '{}.dat'.format(target)),
                         sep=r'\s+', engine='python').dropna()
        frames.append(df)
    scores_df = dt.merge_dfs(frames)
    return scores_df
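
A hedged usage example, assuming whitespace-delimited <target>.dat label files under a scores directory (the directory name and target IDs are illustrative):

# Hypothetical usage; the directory and target names are assumptions.
scores_df = read_scores('labels/scores', ['T0759', 'T0760'])
print(scores_df.head())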