def encode_record(data):
    features = {
        'tensor': Feature(bytes_list=BytesList(value=[data.tobytes()])),
    }
    features = Features(feature=features)
    example = tf.train.Example(features=features)
    return example.SerializeToString()
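# A possible decoding counterpart for encode_record, sketched as an assumption:
# the function name, the dtype argument, and the fact that the caller knows the
# original tensor shape are not part of the listing above.
def decode_record(serialized, dtype=tf.float32):
    parsed = tf.io.parse_single_example(
        serialized, features={'tensor': tf.io.FixedLenFeature([], tf.string)})
    # decode_raw recovers the flat byte buffer; the caller must reshape it
    # back to the tensor shape that was passed to encode_record.
    return tf.io.decode_raw(parsed['tensor'], out_type=dtype)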
def add_data_block(self, partition_id, x, y):
    dbm = self._dbms[partition_id]
    builder = DataBlockBuilder(
        common.data_source_data_block_dir(self._data_source),
        self._data_source.data_source_meta.name,
        partition_id,
        dbm.get_dumped_data_block_count(),
        dj_pb.WriterOptions(output_writer="TF_RECORD"),
        None)
    builder.set_data_block_manager(dbm)
    for i in range(x.shape[0]):
        feat = {}
        exam_id = '{}'.format(i).encode()
        feat['example_id'] = Feature(
            bytes_list=BytesList(value=[exam_id]))
        feat['event_time'] = Feature(
            int64_list=Int64List(value=[i]))
        feat['x'] = Feature(float_list=FloatList(value=list(x[i])))
        if y is not None:
            feat['y'] = Feature(int64_list=Int64List(value=[y[i]]))
        example = Example(features=Features(feature=feat))
        builder.append_item(TfExampleItem(example.SerializeToString()), i, 0)
    return builder.finish_data_block()
def _create_local_data(self, xl, xf, y):
    N = 10
    chunk_size = xl.shape[0] // N
    leader_worker_path = os.path.join(output_path, "data/leader")
    follower_worker_path = os.path.join(output_path, "data/follower")
    data_path = os.path.join(output_path, "data")
    if gfile.Exists(data_path):
        gfile.DeleteRecursively(data_path)
    os.makedirs(leader_worker_path)
    os.makedirs(follower_worker_path)
    for i in range(N):
        filename_l = os.path.join(leader_worker_path, '%02d.tfrecord' % i)
        filename_f = os.path.join(follower_worker_path, '%02d.tfrecord' % i)
        fl = tf.io.TFRecordWriter(filename_l)
        ff = tf.io.TFRecordWriter(filename_f)
        for j in range(chunk_size):
            idx = i * chunk_size + j
            features_l = {}
            features_l['example_id'] = Feature(bytes_list=BytesList(
                value=[str(idx).encode('utf-8')]))
            features_l['y'] = Feature(int64_list=Int64List(value=[y[idx]]))
            features_l['x'] = Feature(float_list=FloatList(
                value=list(xl[idx])))
            fl.write(
                Example(features=Features(
                    feature=features_l)).SerializeToString())
            features_f = {}
            features_f['example_id'] = Feature(bytes_list=BytesList(
                value=[str(idx).encode('utf-8')]))
            features_f['x'] = Feature(float_list=FloatList(
                value=list(xf[idx])))
            ff.write(
                Example(features=Features(
                    feature=features_f)).SerializeToString())
        fl.close()
        ff.close()
def _create_data_block(self, data_source, partition_id, x, y):
    data_block_metas = []
    dbm = data_block_manager.DataBlockManager(data_source, partition_id)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    N = 200
    chunk_size = x.shape[0] // N
    leader_index = 0
    follower_index = N * chunk_size * 10
    for i in range(N):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(data_source),
            data_source.data_source_meta.name,
            partition_id, i,
            dj_pb.WriterOptions(output_writer="TF_RECORD"), None
        )
        builder.set_data_block_manager(dbm)
        for j in range(chunk_size):
            feat = {}
            idx = i * chunk_size + j
            exam_id = '{}'.format(idx).encode()
            feat['example_id'] = Feature(
                bytes_list=BytesList(value=[exam_id]))
            evt_time = random.randint(1, 1000)
            feat['event_time'] = Feature(
                int64_list=Int64List(value=[evt_time]))
            feat['x'] = Feature(float_list=FloatList(value=list(x[idx])))
            if y is not None:
                feat['y'] = Feature(int64_list=Int64List(value=[y[idx]]))
            feat['leader_index'] = Feature(
                int64_list=Int64List(value=[leader_index]))
            feat['follower_index'] = Feature(
                int64_list=Int64List(value=[follower_index]))
            example = Example(features=Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            leader_index += 1
            follower_index += 1
        data_block_metas.append(builder.finish_data_block())
    self.max_index = follower_index
    return data_block_metas
def write_records(data, output_file):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with TFRecordWriter(output_file) as writer:
        for datum in data:
            # data_feat, data_size = feature_array(datum.astype(np.int64))
            data_size = feature_int64([datum.shape[0]])
            data_feat = feature_int64_list(datum)
            assert datum.shape[0] > 0
            assert datum.ndim == 1
            sequence_features = {'data_feat': data_feat}
            context_features = {'data_size': data_size}
            # example = Example(features=Features(feature=feature))
            example = tf.train.SequenceExample(
                context=Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(
                    feature_list=sequence_features),
            )
            writer.write(example.SerializeToString())
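# feature_int64 and feature_int64_list are used throughout this listing but
# never defined here. A minimal sketch consistent with how they are called
# (the real implementations may differ): feature_int64 builds a context
# Feature, feature_int64_list builds a FeatureList with one int64 Feature per
# sequence element, as required by tf.train.FeatureLists.
def feature_int64(values):
    return Feature(int64_list=Int64List(value=list(values)))

def feature_int64_list(values):
    return tf.train.FeatureList(
        feature=[Feature(int64_list=Int64List(value=[int(v)])) for v in values])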
def _example_from_image_annotation(image_path: Path,
                                   annotation: ROCOAnnotation) -> Example:
    image_name = image_path.name
    encoded_jpg = image_path.read_bytes()
    key = hashlib.sha256(encoded_jpg).hexdigest()
    width, height = annotation.w, annotation.h
    xmin, ymin, xmax, ymax, classes, classes_text = [], [], [], [], [], []
    for obj in annotation.objects:
        x1 = max(0.0, obj.box.x1 / width)
        y1 = max(0.0, obj.box.y1 / height)
        x2 = min(1.0, obj.box.x2 / width)
        y2 = min(1.0, obj.box.y2 / height)
        if x1 >= x2 or y1 >= y2:
            continue
        xmin.append(x1)
        ymin.append(y1)
        xmax.append(x2)
        ymax.append(y2)
        classes_text.append(obj.type.name.lower().encode("utf8"))
        classes.append(label_map.id_of(obj.type.name.lower()))
    return Example(features=Features(
        feature={
            "image/filename": bytes_feature(image_name.encode("utf8")),
            "image/source_id": bytes_feature(image_name.encode("utf8")),
            "image/height": int64_feature(height),
            "image/width": int64_feature(width),
            "image/key/sha256": bytes_feature(key.encode("utf8")),
            "image/encoded": bytes_feature(encoded_jpg),
            "image/format": bytes_feature("jpeg".encode("utf8")),
            "image/object/bbox/xmin": float_list_feature(xmin),
            "image/object/bbox/xmax": float_list_feature(xmax),
            "image/object/bbox/ymin": float_list_feature(ymin),
            "image/object/bbox/ymax": float_list_feature(ymax),
            "image/object/class/text": bytes_list_feature(classes_text),
            "image/object/class/label": int64_list_feature(classes),
        }))
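# The helpers referenced above (bytes_feature, int64_feature,
# float_list_feature, bytes_list_feature, int64_list_feature) are not defined
# in this listing. A sketch of the conventional scalar/list wrappers they
# presumably follow; this is an assumption, not code from the original source.
def bytes_feature(value):
    return Feature(bytes_list=BytesList(value=[value]))

def int64_feature(value):
    return Feature(int64_list=Int64List(value=[value]))

def float_list_feature(values):
    return Feature(float_list=FloatList(value=values))

def bytes_list_feature(values):
    return Feature(bytes_list=BytesList(value=values))

def int64_list_feature(values):
    return Feature(int64_list=Int64List(value=values))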
def write_records_raw(sentences, output_file, charmap, chunksize=1000, total=None):
    with ShardRecordWriter(path_fmt=output_file, chunksize=chunksize) as writer:
        for sentence in tqdm(sentences, total=total, desc='Writing Records'):
            datum = encode_words(sentence, charmap)
            data_size = feature_int64([datum.shape[0]])
            data_feat = feature_int64_list(datum)
            assert datum.shape[0] > 0
            assert datum.ndim == 1
            sequence_features = {'text': data_feat}
            context_features = {SENTENCE_LENGTH: data_size}
            example = tf.train.SequenceExample(
                context=Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(
                    feature_list=sequence_features),
            )
            writer.write(example.SerializeToString())
def write_records_parsed_v2(sentences: Iterable[List[Word]], output_file: str,
                            vocabmaps: Dict[str, Dict[str, int]],
                            int_fields=INT_FIELDS, text_fields=TEXT_FIELDS,
                            chunksize=1000, max_length=None, total=None):
    count = 0
    with ShardRecordWriter(path_fmt=output_file, chunksize=chunksize) as writer:
        for sentence in tqdm(sentences, desc="Writing Records", total=total):
            if max_length is None or len(sentence) <= max_length:
                count += 1
                int_field_data = {
                    field: feature_int64_list(
                        [int(getattr(word, field)) for word in sentence])
                    for field in int_fields
                }
                text_field_data = {
                    field: feature_int64_list(
                        encode_words(
                            words=[getattr(word, field) for word in sentence],
                            wordmap=vocabmaps[field]))
                    for field in text_fields
                }
                sentence_length = feature_int64([len(sentence)])
                sequence_features = dict()
                sequence_features.update(int_field_data)
                sequence_features.update(text_field_data)
                context_features = {SENTENCE_LENGTH: sentence_length}
                example = tf.train.SequenceExample(
                    context=Features(feature=context_features),
                    feature_lists=tf.train.FeatureLists(
                        feature_list=sequence_features),
                )
                writer.write(example.SerializeToString())
    print("Wrote [{}] records out of [{}]".format(count, total))
def write_records_parsed(sentences: Iterable[List[Word]], output_file,
                         wordmap, tagmap, total=None):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with TFRecordWriter(output_file) as writer:
        for sentence in tqdm(sentences, desc="Writing Records", total=total):
            indices = [word.index for word in sentence]
            text = [
                wordmap[word.text] if word.text in wordmap else wordmap[UNK]
                for word in sentence
            ]
            tags = [
                tagmap[word.tag] if word.tag in tagmap else tagmap[UNK]
                for word in sentence
            ]
            heads = [word.head for word in sentence]
            indices_feat = feature_int64_list(indices)
            text_feat = feature_int64_list(text)
            tags_feat = feature_int64_list(tags)
            heads_feat = feature_int64_list(heads)
            data_size = feature_int64([len(sentence)])
            sequence_features = {
                'indices': indices_feat,
                'text': text_feat,
                'tags': tags_feat,
                'heads': heads_feat
            }
            context_features = {'data_size': data_size}
            example = tf.train.SequenceExample(
                context=Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(
                    feature_list=sequence_features),
            )
            writer.write(example.SerializeToString())
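# A hedged sketch of how records written by write_records_parsed could be read
# back. The feature names mirror the writer above; the function name and the
# choice of dense int64 parsing are assumptions, not code from the original.
def parse_parsed_record(serialized):
    context, sequences = tf.io.parse_single_sequence_example(
        serialized,
        context_features={'data_size': tf.io.FixedLenFeature([], tf.int64)},
        sequence_features={
            'indices': tf.io.FixedLenSequenceFeature([], tf.int64),
            'text': tf.io.FixedLenSequenceFeature([], tf.int64),
            'tags': tf.io.FixedLenSequenceFeature([], tf.int64),
            'heads': tf.io.FixedLenSequenceFeature([], tf.int64),
        })
    return context, sequences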
def generate(shape, window_size=10, start=0, offset=5, noise_samples=0,
             classes=None, damaged_subjects=None):
    if classes is None:
        classes = ["left-fist", "right-fist"]
    if damaged_subjects is None:
        damaged_subjects = ["S088", "S089", "S092", "S100", "S104"]
    if type(shape) not in (tuple, list):
        shape = [shape]
    dataset_dir = os.path.join(
        PHYSIONET_DIR, "normalized-by-sample",
        _get_window_folder_name(window_size, start, offset, noise_samples),
        f"{window_size}x{'x'.join(str(dim) for dim in shape)}",
        "-".join(classes))
    # Select the EDF executions that contain the requested classes
    executions = []
    if any([c in classes for c in ["eyes-closed"]]):
        executions.append("R02")
    if any([c in classes for c in ["left-fist", "right-fist"]]):
        for execution in ["R04", "R08", "R12"]:
            executions.append(execution)
    if any([c in classes for c in ["both-fists", "both-feet"]]):
        for execution in ["R06", "R10", "R14"]:
            executions.append(execution)
    # Map each requested class to a consecutive integer label
    label_value = 0
    labels = {}
    if "eyes-closed" in classes:
        labels["eyes-closed"] = label_value
        label_value += 1
    if "left-fist" in classes:
        labels["left-fist"] = label_value
        label_value += 1
    if "right-fist" in classes:
        labels["right-fist"] = label_value
        label_value += 1
    if "both-fists" in classes:
        labels["both-fists"] = label_value
        label_value += 1
    if "both-feet" in classes:
        labels["both-feet"] = label_value
        label_value += 1
    fsh.recreate_dir(dataset_dir)
    regex_executions = f"({'|'.join(executions)})"
    info = {
        "n_samples_by_subject": 0
    }
    subjects = filter(lambda s: s not in damaged_subjects,
                      sorted(os.listdir(RAW_EDF_FILES_DIR)))
    for subject in filter(lambda f: re.match("S(\\d+)", f), subjects):
        print(f"Generating TFRecord file from the subject {subject} ...")
        X_segments = np.empty((0, 64))
        y = np.empty(0, dtype=np.int64)
        edf_subject_path_dir = os.path.join(RAW_EDF_FILES_DIR, subject)
        edf_file_names = sorted(os.listdir(edf_subject_path_dir))
        for edf_file_name in filter(
                lambda f: re.match(f"^{subject}{regex_executions}\\.edf$", f),
                edf_file_names):
            edf_file = EdfFile(edf_subject_path_dir, edf_file_name)
            events_windows = [
                next(group)
                for key, group in groupby(enumerate(edf_file.labels),
                                          key=itemgetter(1))
            ]
            n_events = len(events_windows)
            for index, (event_start_index, event) in enumerate(events_windows):
                if event == "rest":
                    continue
                event_start_index += start
                X = edf_file.data[event_start_index:] if index + 1 == n_events \
                    else edf_file.data[event_start_index:events_windows[index + 1][0]]
                n_segments = 0
                # Slice the event into fixed-size windows, optionally adding
                # Gaussian-noise copies of each window
                for (start_segment, end_segment) in _windows(X, window_size, offset):
                    x_segment = X[start_segment:end_segment]
                    X_segments = np.vstack((X_segments, x_segment))
                    y = np.append(y, labels[event])
                    for _ in range(noise_samples):
                        noise = np.random.normal(0, 1, x_segment.shape)
                        X_segments = np.vstack((X_segments, x_segment + noise))
                        y = np.append(y, labels[event])
                    n_segments += 1
                print(f"X{X.shape} split into {n_segments} segments of "
                      f"{window_size} samples with offset of {offset} "
                      f"plus {noise_samples} noise samples")
            edf_file.close()
        if len(y) > info["n_samples_by_subject"]:
            info["n_samples_by_subject"] = len(y)
        print("Labels: ", len(y), len(y[y == 0]), len(y[y == 1]))
        X_segments = X_segments.reshape((-1, window_size, 64))
        print("Has nan: ", np.isnan(X_segments).any())
        tfrecord_subject_filepath = os.path.join(dataset_dir, f"{subject}.tfrecord")
        options = tf.io.TFRecordOptions(compression_type="GZIP")
        # Serialize each window as a raw tensor plus its integer label
        with tf.io.TFRecordWriter(tfrecord_subject_filepath, options) as writer:
            for n_segment in range(len(y)):
                X_segment = np.array(list(map(
                    lambda x: _process_record(x, shape), X_segments[n_segment])))
                eeg_example = Example(
                    features=Features(
                        feature={
                            "X": Feature(bytes_list=BytesList(
                                value=[tf.io.serialize_tensor(X_segment).numpy()])),
                            "y": Feature(int64_list=Int64List(value=[y[n_segment]]))
                        }
                    )
                )
                writer.write(eeg_example.SerializeToString())
    print("n_samples_by_subject: ", info["n_samples_by_subject"])
    info_filepath = os.path.join(dataset_dir, "info.pkl")
    with open(info_filepath, "wb") as fp:
        pickle.dump(info, fp, protocol=4)
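# Reading the per-subject files back is not shown in the original. A minimal
# sketch, assuming the files were written with GZIP compression as above and
# that "X" was serialized from a float64 numpy array (adjust out_type if
# _process_record produces a different dtype); the function name is hypothetical.
def load_subject_dataset(tfrecord_path):
    def _parse(serialized):
        parsed = tf.io.parse_single_example(serialized, {
            "X": tf.io.FixedLenFeature([], tf.string),
            "y": tf.io.FixedLenFeature([], tf.int64),
        })
        X = tf.io.parse_tensor(parsed["X"], out_type=tf.float64)
        return X, parsed["y"]
    return tf.data.TFRecordDataset(tfrecord_path,
                                   compression_type="GZIP").map(_parse)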
for i in range(N):
    filename_l = os.path.join(current_dir, 'data/leader/%02d.tfrecord' % i)
    filename_f = os.path.join(current_dir, 'data/follower/%02d.tfrecord' % i)
    fl = tf.io.TFRecordWriter(filename_l)
    ff = tf.io.TFRecordWriter(filename_f)
    for j in range(chunk_size):
        idx = i * chunk_size + j
        features_l = {}
        features_l['example_id'] = \
            Feature(bytes_list=BytesList(value=[str(idx).encode()]))
        features_l['y'] = \
            Feature(int64_list=Int64List(value=[random.randint(0, 1)]))
        for k in range(512):
            features_l['x_{0}'.format(k)] = \
                Feature(int64_list=Int64List(value=[random.randint(0, 100)]))
        fl.write(
            Example(features=Features(feature=features_l)).SerializeToString())
        features_f = {}
        features_f['example_id'] = \
            Feature(bytes_list=BytesList(value=[str(idx).encode()]))
        for k in range(512):
            features_f['x_{0}'.format(k)] = \
                Feature(int64_list=Int64List(value=[random.randint(0, 100)]))
        ff.write(
            Example(features=Features(feature=features_f)).SerializeToString())
    fl.close()
    ff.close()
def generate(dataset_root_dir, events_enum, labels_enum, labels, classes, window_size):
    dataset_dir = os.path.join(dataset_root_dir, "normalized-by-sample",
                               f"window-{window_size}", "-".join(classes).lower())
    fsh.recreate_dir(dataset_dir)
    info = {"n_samples_by_file": 0}
    subjects_labels_counts = LabelsCounts()
    subjects_session_labels_counts = LabelsCounts()
    gdf_files_dir = os.path.join(dataset_root_dir, "gdf-files")
    gdf_file_names = filter(lambda f: re.match(".*.gdf", f),
                            sorted(os.listdir(gdf_files_dir)))
    for gdf_file_name in gdf_file_names:
        gdf_file = mne.io.read_raw_gdf(os.path.join(gdf_files_dir, gdf_file_name),
                                       preload=True)
        groups_gdf_file = re.match("(.)(\\d{2})(\\d{0,2})([ET])\\.gdf",
                                   gdf_file_name).groups()
        database_prefix = groups_gdf_file[0]
        subject = groups_gdf_file[1]
        session = groups_gdf_file[2]
        session_type = groups_gdf_file[3]
        labels_filepath = os.path.join(
            gdf_files_dir, "labels",
            f"{database_prefix}{subject}{session}{session_type}.mat")
        labels_file = io.loadmat(labels_filepath)["classlabel"]
        annotations = gdf_file.annotations
        start_trials_indexes = [
            event_index
            for event_index in range(len(annotations.description))
            if events_enum.get(annotations.description[event_index]) == "NEW_TRIAL"
        ]
        indexes_channels_eeg = [
            index for index, _ in enumerate(
                filter(lambda ch: "EEG" in ch, gdf_file.ch_names))
        ]
        n_channels = len(indexes_channels_eeg)
        frequency = int(gdf_file.info["sfreq"])
        # should consider the complete motor imagery period (4s),
        # not only the cue exhibition period (1.25s)
        duration_event = window_size // frequency
        n_samples = frequency * duration_event
        rejected_trials = 0
        ignored_trials = 0
        X = np.empty((0, n_channels))
        y = np.empty(0, dtype=np.int64)
        for n_trial, event_start_trial_index in enumerate(start_trials_indexes):
            cue_event_index = event_start_trial_index + 1
            onset_event = annotations.onset[cue_event_index]
            onset_index = int(np.ceil(onset_event * frequency))
            end_index = int(np.ceil((onset_event + duration_event) * frequency))
            event_samples = end_index - onset_index
            if event_samples != n_samples:
                end_index += n_samples - event_samples
            # The event corresponding to the trial is the one following
            # the start-trial event
            if events_enum[annotations.description[cue_event_index]] == "REJECTED_TRIAL":
                rejected_trials += 1
                continue
            label = labels_enum[labels_file[n_trial][0]]
            if label not in classes:
                ignored_trials += 1
                continue
            # Index 0 returns the data array of the gdf_file,
            # index 1 returns the times array of the gdf_file
            x = gdf_file[indexes_channels_eeg, onset_index:end_index][0].T
            x = (x - np.mean(x)) / np.std(x)
            X = np.vstack((X, x))
            y = np.append(y, labels[label])
        gdf_file.close()
        X = X.reshape((-1, n_samples, n_channels))
        tfrecord_filepath = os.path.join(
            dataset_dir,
            f"{database_prefix}{subject}{session}{session_type}.tfrecord")
        options = tf.io.TFRecordOptions(compression_type="GZIP")
        with tf.io.TFRecordWriter(tfrecord_filepath, options) as writer:
            for n_segment in range(len(y)):
                eeg_example = Example(features=Features(
                    feature={
                        "X": Feature(bytes_list=BytesList(value=[
                            tf.io.serialize_tensor(X[n_segment]).numpy()
                        ])),
                        "y": Feature(int64_list=Int64List(value=[y[n_segment]]))
                    }))
                writer.write(eeg_example.SerializeToString())
        valid_trials = len(y)
        if valid_trials > info["n_samples_by_file"]:
            info["n_samples_by_file"] = valid_trials
        labels_counts = np.unique(y, return_counts=True)[1]
        subjects_labels_counts.put(subject, labels_counts)
        subjects_session_labels_counts.put(subject + session_type, labels_counts)
        print("Info from file " + gdf_file_name)
        print(f"Labels counts: {labels_counts}")
        print("Valid Trials: " + str(valid_trials))
        print("Ignored Trials: " + str(ignored_trials))
        print("Rejected Trials: " + str(rejected_trials))
        print("Total Trials: " + str(valid_trials + ignored_trials + rejected_trials))
    print("Subjects Labels Counts:")
    print(subjects_labels_counts)
    print("Subjects Session Counts:")
    print(subjects_session_labels_counts)
    print("Dataset generation ended, saving info data ...")
    print("info[n_samples_by_file]=", info["n_samples_by_file"])
    info_filepath = os.path.join(dataset_dir, "info.pkl")
    with open(info_filepath, "wb") as fp:
        pickle.dump(info, fp, protocol=4)
def _create_example_from_features(features):
    return Example(features=Features(feature=features))