Example #1
import glob

# load_npz_to_data_list is assumed to be imported from the project's npz data
# utilities (the same helper used throughout the examples below).


def load_data(npz_dir):
    """Load data from multiple npz files for a given folder."""
    files = glob.glob('%s/*.npz' % npz_dir)
    data_list = []
    for f in files:
        data_list += load_npz_to_data_list(f)
    return data_list
Example #2
def _load_npz_data_path(self, data_path):
    """Load every .npz file under data_path (sorted by filename) into one data list."""
    data_list = []
    files = [f for f in os.listdir(data_path) if f.endswith('.npz')]
    files = sorted(files)
    for f in files:
        data_list += load_npz_to_data_list(join(data_path, f))
    return data_list

def _load_npz_data(self, data_path):
    """Load every .npz file under data_path into one data list (directory order)."""
    data_list = []
    files = [
        file for file in os.listdir(data_path) if file.endswith('.npz')
    ]
    for file in files:
        data_list += load_npz_to_data_list(join(data_path, file))
    return data_list
Example #4
def test_data_list_to_npz(self):
    """Round-trip a data list through save_data_list_to_npz / load_npz_to_data_list."""
    data_list = [{"a": np.array([1, 23, 4])}, {"a": np.array([2, 34, 5])}]
    npz_file = 'tmp.npz'
    save_data_list_to_npz(data_list, npz_file)
    reload_data_list = load_npz_to_data_list(npz_file)
    self.assertEqual(len(data_list), len(reload_data_list))
    for d1, d2 in zip(data_list, reload_data_list):
        self.assertEqual(len(d1), len(d2))
        for key in d1:
            self.assertTrue((d1[key] == d2[key]).all())
Example #5
def __len__(self):
    """Total number of examples across all npz files, computed once and cached."""
    if self.cached_len is not None:
        return self.cached_len
    n = 0
    for file in self.files:
        data_list = load_npz_to_data_list(file)
        n += len(data_list)
    self.cached_len = n
    return n
Example #6
def __iter__(self):
    """Shuffle the npz files and yield their examples one by one."""
    random.shuffle(self.files)
    for file in self.files:
        data_list = load_npz_to_data_list(file)
        if self.subset_selector is not None:
            data_list = self.subset_selector(data_list)
        for data in data_list:
            if self.max_protein_len > 0:
                # Pad (or truncate) protein_token_ids to a fixed length,
                # filling with the tokenizer's padding token id.
                protein_token_ids = np.zeros(self.max_protein_len, dtype=np.int64) \
                        + ProteinTokenizer.padding_token_id
                n = min(self.max_protein_len,
                        data['protein_token_ids'].size)
                protein_token_ids[:n] = data['protein_token_ids'][:n]
                data['protein_token_ids'] = protein_token_ids
            yield data
Example #7
def _load_npz_data_files(self, data_files):
    """Load the given .npz files into one data list."""
    data_list = []
    for f in data_files:
        data_list += load_npz_to_data_list(f)
    return data_list

def _load_npz_data(self, data_path):
    """Yield examples from every .npz file under data_path, one file at a time."""
    files = [file for file in os.listdir(data_path) if file.endswith('.npz')]
    for file in files:
        data_list = load_npz_to_data_list(join(data_path, file))
        for data in data_list:
            yield data
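
All of the examples above revolve around the save_data_list_to_npz / load_npz_to_data_list pair. For readers without the library source at hand, here is a minimal sketch of how such a pair could be implemented with plain NumPy: arrays that share a key are concatenated and their per-item lengths are stored under an extra key, so the list of dicts can be split apart again on load. The '.seq_len' suffix and the packing scheme are illustrative assumptions, not necessarily the library's actual format.

import numpy as np

def save_data_list_to_npz(data_list, npz_file):
    """Pack a list of {str: np.ndarray} dicts into a single .npz file (sketch).

    Assumes every dict has the same keys and arrays with a leading axis.
    """
    merged = {}
    for key in data_list[0]:
        # Illustrative packing scheme: concatenate values and remember lengths.
        merged[key] = np.concatenate([data[key] for data in data_list])
        merged[key + '.seq_len'] = np.array(
            [len(data[key]) for data in data_list], dtype=np.int64)
    np.savez_compressed(npz_file, **merged)

def load_npz_to_data_list(npz_file):
    """Inverse of the sketch above: rebuild the list of dicts from one .npz."""
    npz = np.load(npz_file)
    keys = [k for k in npz.files if not k.endswith('.seq_len')]
    num_items = len(npz[keys[0] + '.seq_len'])
    data_list = [{} for _ in range(num_items)]
    for key in keys:
        lens = npz[key + '.seq_len']
        chunks = np.split(npz[key], np.cumsum(lens)[:-1])
        for data, chunk in zip(data_list, chunks):
            data[key] = chunk
    return data_list

With this sketch, the round trip in Example #4 behaves as expected: saving [{"a": [1, 23, 4]}, {"a": [2, 34, 5]}] stores the concatenated array [1, 23, 4, 2, 34, 5] plus the lengths [3, 3], and loading splits it back into the original two dicts.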