def __init__(self):
    super().__init__()
    fext = FeatureExtractor(remove_entities=self.remove_entities,
                            remove_nonalpha=self.remove_nonalpha,
                            remove_stopwords=self.remove_stopwords)
    self._X = fext.transform(self._X)
def TestExtractDayFeature():
    measurements = DataGetter.GetAllMeasurements(g_file_name,
                                                 i_patient_id_column_num=1,
                                                 i_datetime_column_num=2,
                                                 i_glucose_level_column_num=3)
    time_separator = datetime.datetime.strptime("05:00:00", "%H:%M:%S").time()
    measurements_per_day = FeatureExtractor.SeparateMeasurementsForDays(
        measurements, time_separator)
    day_measurement = measurements_per_day[1]
    day_feature = FeatureExtractor.ExtractDayFeature(day_measurement)
    assert day_feature
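
# Note: the test above assumes the module-level `g_file_name` points at a
# spreadsheet whose columns 1-3 hold patient id, datetime, and glucose level;
# it can be run with e.g. `pytest -k TestExtractDayFeature`.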
def numpy_to_tfrecord(path_to_files, output_folder='', feature_names='cepstrum', label_names='transcript'):
    """
    :param path_to_files: (str) path to folder with subfolders of numpy files
    :param output_folder: (str) path to folder with output tfrecord files
    :param feature_names: sequence of symbols that can be used as a common identifier for feature files
    :param label_names: sequence of symbols that can be used as a common identifier for label files
    :return: None
    """
    path_to_files = os.path.normpath(path_to_files)
    folder_structure_gen = os.walk(path_to_files)  # ('path_to_current_folder', [subfolders], ['files', ...])

    for folder in folder_structure_gen:
        path, subfolders, files = folder
        if not files:
            continue
        feat_file_names = [f for f in files if feature_names in f]
        label_file_names = [f for f in files if label_names in f]
        print(path.split("\\"))  # note: path splitting assumes Windows-style separators
        if output_folder and isinstance(output_folder, str):
            output_path = os.path.join(os.path.normpath(output_folder), *path.split("\\")[-3:])
            os.makedirs(os.path.split(output_path)[0], exist_ok=True)
        else:
            output_path = os.path.splitext(path)[0]
        print(output_path)
        num_feats = len(feat_file_names)
        num_labels = len(label_file_names)
        assert num_feats == num_labels, 'There are {} feature files and {} label files (counts must match).'.format(
            num_feats, num_labels)
        tfrecord_path = output_path + '.tfrecord'
        writer = tf.io.TFRecordWriter(tfrecord_path)
        for i in range(num_feats):
            feat_load_path = os.path.join(path, feat_file_names[i])
            label_load_path = os.path.join(path, label_file_names[i])
            feat, _ = FeatureExtractor.load_cepstra(feat_load_path)
            label, _ = DataLoader.load_labels(label_load_path)
            # print(feat[0][0].shape, label[0][0].shape)
            serialized = serialize_array(feat[0][0], label[0][0])
            writer.write(serialized)
        writer.close()
        print("Data written to {}".format(tfrecord_path))
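
# `serialize_array` is not shown here. A minimal sketch of what it might look
# like, assuming each (feature, label) pair is stored as two serialized
# tensors in a single tf.train.Example; the project's real helper may differ:
def serialize_array(feat, label):
    def _bytes_feature(array):
        # serialize the numpy array to a byte string and wrap it in a BytesList
        raw = tf.io.serialize_tensor(tf.convert_to_tensor(array)).numpy()
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

    example = tf.train.Example(features=tf.train.Features(feature={
        'feature': _bytes_feature(feat),
        'label': _bytes_feature(label),
    }))
    return example.SerializeToString()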
def _read_next_file(self):
    assert self.m_next_file_index < len(self.m_files)
    src_file = self.m_files[self.m_next_file_index]
    con_ms = DataGetter.GetAllMeasurements(src_file,
                                           i_patient_id_column_num=1,
                                           i_datetime_column_num=2,
                                           i_glucose_level_column_num=3)
    self.m_cur_day_measurements = FeatureExtractor.SeparateMeasurementsForDays(
        con_ms, DayMeasurementsIterator.g_ts)
    self.m_next_file_index += 1
    self.m_next_day_measurement_index = 0
def processor(self):
    pre_processor = PreProcessor()
    feature_extractor = FeatureExtractor()
    feature_selector = FeatureSelector()
    accuracy_checker = AccuracyChecker()

    y_train, x_train_sj, y_train_sj, x_train_iq, y_train_iq, x_test_sj, x_test_iq = self.read_data()

    # impute redundant features for both cities
    x_train_sj = pre_processor.impute_redundant_features(x_train_sj, self.impute_columns)
    x_train_iq = pre_processor.impute_redundant_features(x_train_iq, self.impute_columns)
    x_test_sj = pre_processor.impute_redundant_features(x_test_sj, self.impute_columns)
    x_test_iq = pre_processor.impute_redundant_features(x_test_iq, self.impute_columns)

    # fill missing values with per-city mean imputers
    imputer_sj = Imputer(strategy='mean')
    x_train_sj = pre_processor.impute_missing_values(x_train_sj, self.features, imputer_sj)
    x_test_sj = pre_processor.impute_missing_values(x_test_sj, self.features, imputer_sj)
    imputer_iq = Imputer(strategy='mean')
    x_train_iq = pre_processor.impute_missing_values(x_train_iq, self.features, imputer_iq)
    x_test_iq = pre_processor.impute_missing_values(x_test_iq, self.features, imputer_iq)

    # add rolling time-series features (wider window for San Juan)
    x_train_sj = feature_extractor.add_time_series_features(x_train_sj, window=100)
    x_train_iq = feature_extractor.add_time_series_features(x_train_iq, window=30)
    x_test_sj = feature_extractor.add_time_series_features(x_test_sj, window=100)
    x_test_iq = feature_extractor.add_time_series_features(x_test_iq, window=30)

    x_train_sj = feature_selector.drop_unnecessary_features(x_train_sj, self.drop_features, self.time_series_features)
    x_train_iq = feature_selector.drop_unnecessary_features(x_train_iq, self.drop_features, self.time_series_features)
    x_test_sj = feature_selector.drop_unnecessary_features(x_test_sj, self.drop_features, self.time_series_features)
    x_test_iq = feature_selector.drop_unnecessary_features(x_test_iq, self.drop_features, self.time_series_features)

    # normalize original and engineered features column-wise
    features_to_normalize = self.features + self.new_features
    x_train_sj[features_to_normalize] = x_train_sj[features_to_normalize].apply(pre_processor.normalize, axis=0)
    x_train_iq[features_to_normalize] = x_train_iq[features_to_normalize].apply(pre_processor.normalize, axis=0)
    x_test_sj[features_to_normalize] = x_test_sj[features_to_normalize].apply(pre_processor.normalize, axis=0)
    x_test_iq[features_to_normalize] = x_test_iq[features_to_normalize].apply(pre_processor.normalize, axis=0)

    x_train = pd.concat([x_train_sj, x_train_iq], axis=0)
    x_train.set_index('index', inplace=True)
    x_sj, y_sj = x_train.loc[x_train.city == 'sj', :], y_train.loc[x_train.city == 'sj', :]
    x_iq, y_iq = x_train.loc[x_train.city == 'iq', :], y_train.loc[x_train.city == 'iq', :]

    # hold out 20% per city, stratified by week of year
    x_train_sj, x_cross_sj, y_train_sj, y_cross_sj = train_test_split(
        x_sj, y_sj, test_size=0.2, stratify=x_sj.weekofyear)
    x_train_iq, x_cross_iq, y_train_iq, y_cross_iq = train_test_split(
        x_iq, y_iq, test_size=0.2, stratify=x_iq.weekofyear)

    x_train_sj = feature_selector.select_features(x_train_sj, self.features, self.new_features)
    x_train_iq = feature_selector.select_features(x_train_iq, self.features, self.new_features)
    x_cross_sj = feature_selector.select_features(x_cross_sj, self.features, self.new_features)
    x_cross_iq = feature_selector.select_features(x_cross_iq, self.features, self.new_features)

    reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, n_estimators=500, random_state=67)
    reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, n_estimators=300, random_state=67)
    reg_sj_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)
    reg_iq_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)

    y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(
        reg_sj_gb, reg_iq_gb, x_train_sj, y_train_sj, x_train_iq, y_train_iq,
        x_cross_sj, x_cross_iq, "gb")
    y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(
        reg_sj_rf, reg_iq_rf, x_train_sj, y_train_sj, x_train_iq, y_train_iq,
        x_cross_sj, x_cross_iq, "rf")
    y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2,
                                               y_iq_pred_m1, y_iq_pred_m2, 5, 3)

    print("San Juan:")
    accuracy_checker.cross_validate_out_of_sample(y_sj_pred, y_cross_sj.total_cases)
    print("Iquitos:")
    accuracy_checker.cross_validate_out_of_sample(y_iq_pred, y_cross_iq.total_cases)

    predict_sj = x_test_sj[self.keys].copy()
    predict_iq = x_test_iq[self.keys].copy()

    # retrain on the full per-city training data for the final test predictions
    x_sj = feature_selector.select_features(x_sj, self.features, self.new_features)
    x_iq = feature_selector.select_features(x_iq, self.features, self.new_features)
    x_test_sj = feature_selector.select_features(x_test_sj, self.features, self.new_features)
    x_test_iq = feature_selector.select_features(x_test_iq, self.features, self.new_features)

    reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, n_estimators=500, random_state=67)
    reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, n_estimators=300, random_state=67)
    reg_sj_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)
    reg_iq_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)

    y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(
        reg_sj_gb, reg_iq_gb, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq, "gb")
    y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(
        reg_sj_rf, reg_iq_rf, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq, "rf")
    y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2,
                                               y_iq_pred_m1, y_iq_pred_m2, 5, 3)

    predict_sj['total_cases'] = y_sj_pred.round().astype(int)
    predict_iq['total_cases'] = y_iq_pred.round().astype(int)
    predict_df = pd.concat([predict_sj, predict_iq], axis=0)
    predict_df.loc[predict_df.total_cases < 0, 'total_cases'] = 0
    self.write_results(predict_df)
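
# `ensemble_model` is defined elsewhere in the class. A hedged sketch of the
# blending it presumably performs, assuming the trailing 5 and 3 arguments are
# weights for the gradient-boosting and random-forest predictions (the real
# method may differ):
#
# def ensemble_model(self, y_sj_m1, y_sj_m2, y_iq_m1, y_iq_m2, w1, w2):
#     y_sj = (w1 * np.asarray(y_sj_m1) + w2 * np.asarray(y_sj_m2)) / (w1 + w2)
#     y_iq = (w1 * np.asarray(y_iq_m1) + w2 * np.asarray(y_iq_m2)) / (w1 + w2)
#     return y_sj, y_iq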
def feature_length_range(load_dir, save_dir, min_frame_length=100, max_frame_length=3000,
                         mode='copy', feature_names='cepstrum', label_names='transcript'):
    """
    Check individual files (features and their labels) in load_dir and copy/move those
    which satisfy the condition: min_frame_length <= feature_frame_len <= max_frame_length

    :param load_dir: folder from which to load features and their labels
    :param save_dir: folder to which the files satisfying the condition above are copied/moved
    :param min_frame_length: lower bound of the feature frame length condition
    :param max_frame_length: upper bound of the feature frame length condition
    :param mode: 'copy'/'move' - files satisfying the condition are copied/moved from load_dir to save_dir
    :param feature_names: sequence of symbols that can be used as a common identifier for feature files
    :param label_names: sequence of symbols that can be used as a common identifier for label files
    :return: None
    """
    # normalize the save directory path
    save_path = os.path.normpath(save_dir)
    folder_structure_gen = os.walk(load_dir)  # ('path_to_current_folder', [subfolders], ['files', ...])

    for folder in folder_structure_gen:
        path, subfolders, files = folder
        feat_file_names = [f for f in files if feature_names in f]
        label_file_names = [f for f in files if label_names in f]
        num_feats = len(feat_file_names)
        num_labels = len(label_file_names)
        assert num_feats == num_labels, 'There are {} feature files and {} label files (counts must match).'.format(
            num_feats, num_labels)

        # relative position of the current subdirectory with regard to load_dir
        rel_path = os.path.relpath(path, load_dir)
        # folder/subfolder in save_dir to which the files are saved
        save_full_path = os.path.join(save_path, rel_path)

        # make subdirectories in save_dir
        os.makedirs(save_full_path, exist_ok=True)

        for i in range(num_feats):
            feat_load_path = os.path.join(path, feat_file_names[i])
            label_load_path = os.path.join(path, label_file_names[i])
            feat_save_path = os.path.join(save_full_path, feat_file_names[i])
            label_save_path = os.path.join(save_full_path, label_file_names[i])
            feat, _ = FeatureExtractor.load_cepstra(feat_load_path)
            feat_frame_len = feat[0][0].shape[0]
            if min_frame_length <= feat_frame_len <= max_frame_length:
                if mode == 'copy':
                    shutil.copy2(feat_load_path, feat_save_path)
                    print("Copied {} to {}".format(feat_load_path, feat_save_path))
                    shutil.copy2(label_load_path, label_save_path)
                    print("Copied {} to {}".format(label_load_path, label_save_path))
                elif mode == 'move':
                    os.rename(feat_load_path, feat_save_path)
                    print("Moved {} to {}".format(feat_load_path, feat_save_path))
                    os.rename(label_load_path, label_save_path)
                    print("Moved {} to {}".format(label_load_path, label_save_path))
                else:
                    raise ValueError("argument mode must be either 'copy' or 'move'")
    print("Finished.")
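
# Example invocation (hypothetical directory layout): keep only utterances
# whose feature matrices have between 100 and 3000 frames, copying them from
# 'data/train' into 'data/train_filtered':
#
# feature_length_range('data/train', 'data/train_filtered',
#                      min_frame_length=100, max_frame_length=3000, mode='copy')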
    words = sentence.split(" ")
    return sentence


if __name__ == '__main__':
    # set logging to only show errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # print("INITIALIZING LANGUAGE MODEL PIPELINE".center(50, "_"))
    # lm_mask_pipeline = masked_pipeline_from_trained_model(PREDICTION_FLAGS.models['lm_path'])

    print("INITIALIZING FEATURE EXTRACTOR".center(50, "_"))
    extractor = FeatureExtractor(PREDICTION_FLAGS.recording['rate'],
                                 feature_type=PREDICTION_FLAGS.features['type'],
                                 energy=PREDICTION_FLAGS.features['energy'],
                                 deltas=PREDICTION_FLAGS.features['deltas'])

    print("RECORDING AUDIO".center(50, "_"))
    # record audio from the microphone (the argument is presumably the duration in seconds)
    timespan, frames, stream = record_audio(5)

    print("CONVERTING TO FEATURE REPRESENTATION".center(50, "_"))
    features = extractor.transform_data([np.array(frames)])[0]

    print("PREDICTING FROM SAVED MODEL".center(50, "_"))
    predictions = predict_from_saved_model(PREDICTION_FLAGS.models['am_path'], features)

    print("TRANSCRIBING TO STRINGS".center(50, "_"))
    string_predictions = convert_to_strings(predictions,
from FeatureExtraction import SeriesModifier
from FeatureExtraction import FeatureExtractor

src_path = u'D:/GDrive/Диплом 2/DataPreparation/output/Patient#7/Period_from_2000-07-16__9-52.xlsx'
measurements = DataGetter.GetAllMeasurements(src_path,
                                             i_patient_id_column_num=1,
                                             i_datetime_column_num=2,
                                             i_glucose_level_column_num=3)
smooth_measurements = SeriesModifier.Smooth(measurements)
extremal_indexes, bla = ExtremaFilter.FindExtremalMeasurements(smooth_measurements)

time_separator = datetime.datetime.strptime("05:00:00", "%H:%M:%S").time()
measurements_per_day = FeatureExtractor.SeparateMeasurementsForDays(measurements, time_separator)

rise_features = []
for day_measurement in measurements_per_day:
    day_feature = FeatureExtractor.ExtractDayFeature(day_measurement)
    if day_feature:
        rise_features.extend(day_feature.GetRiseFeatures())

# list comprehensions instead of bare map(): in Python 3, np.array(map(...))
# would wrap the un-evaluated map iterator instead of building a value array
gl_b_level = np.array([x.GetBeforeMax().GetGlucoseLevel() for x in rise_features])
dt_b = np.array([x.GetBeforeMax().GetDateTime() for x in rise_features])
gl_a_level = np.array([x.GetAfterMax().GetGlucoseLevel() for x in rise_features])
dt_a = np.array([x.GetAfterMax().GetDateTime() for x in rise_features])
gl_m_level = np.array(
                          SentenceBlockMedianPooler, SentenceLevelAveragePooler,
                          SentenceLevelMedianPooler, SentenceLevelMaximumPooler,
                          StringConcatenationEncoder)

# Hypothesis 1: input clean-up improves performance
datasets = [TitleTextDataset]
hypothesis = 'fext'
pipelines = [('nb_baseline', Pipeline([('vec', None),
                                       ('cls', MultinomialNB())])),
             ('svm_baseline', Pipeline([('vec', None),
                                        ('cls', LinearSVC())])),
             ('log_baseline', Pipeline([('vec', None),
                                        ('cls', LogisticRegression())])),
             ('nb_fext', Pipeline([('fext', FeatureExtractor()),
                                   ('vec', None),
                                   ('cls', MultinomialNB())])),
             ('svm_fext', Pipeline([('fext', FeatureExtractor()),
                                    ('vec', None),
                                    ('cls', LinearSVC())])),
             ('log_fext', Pipeline([('fext', FeatureExtractor()),
                                    ('vec', None),
                                    ('cls', LogisticRegression())]))]
HYPOTHESIS_1 = (hypothesis, datasets, pipelines)

# Hypothesis 2: title information improves performance
datasets = [TextDataset, TitleDataset, TitleTextDataset]
hypothesis = 'title'
pipelines = [('nb_baseline', Pipeline([('vec', None),
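
# The 'vec' steps above are left as None placeholders; an experiment runner is
# presumably expected to fill them in before fitting, e.g. via set_params
# (hypothetical usage, assuming scikit-learn's Pipeline):
#
# from sklearn.feature_extraction.text import CountVectorizer
# name, pipe = HYPOTHESIS_1[2][0]
# pipe.set_params(vec=CountVectorizer())
# pipe.fit(X_train, y_train)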
def prepare_data(files, save_folder, dataset="pdtsc", label_max_duration=10.0,
                 speeds=(0.9, 1.0, 1.1), feature_type="MFSC", bigrams=False,
                 repeated=False, energy=True, deltas=(0, 0), nbanks=40,
                 filter_nan=True, sort=True):
    cepstra_length_list = []
    file_names = get_file_names(files)
    for speed in speeds:
        LOGGER.info(f"Create audio_transformer for speed {speed}")
        audio_transformer = (AudioEffectsChain().speed(speed))
        save_path = os.path.join(save_folder, f"{speed}/")
        LOGGER.debug(f"Current save_path: {save_path}")
        for i, file in enumerate(files):
            if dataset == "pdtsc":
                pdtsc = PDTSCLoader([file[0]], [file[1]], bigrams, repeated)
                labels = pdtsc.transcripts_to_labels()  # list of lists of 1D numpy arrays
                labels = labels[0]  # flatten label list
                audio_list, fs = pdtsc.load_audio()
                audio = audio_list[0]
                fs = fs[0]
                LOGGER.debug(f"Loaded PDTSC with fs {fs} from:\n"
                             f"\t audio_path: {file[0]}\n"
                             f"\t transcript_path: {file[1]}")
            elif dataset == "oral":
                oral = OralLoader([file[0]], [file[1]], bigrams, repeated)
                # Dict['file_name': Tuple[sents_list, starts_list, ends_list]]
                label_dict = oral.transcripts_to_labels(label_max_duration)
                audio_dict, fs_dict = oral.load_audio()  # Dicts['file_name']
                labels = label_dict[file_names[i]]
                audio = audio_dict[file_names[i]]
                fs = fs_dict[file_names[i]]
                LOGGER.debug(f"Loaded ORAL with fs {fs} from:\n"
                             f"\t audio_path: {file[0]}\n"
                             f"\t transcript_path: {file[1]}")
            else:
                raise ValueError("'dataset' argument must be either 'pdtsc' or 'oral'")

            full_save_path = os.path.join(save_path, file_names[i])
            LOGGER.info(f"\tApplying SoX transformation on audio from {full_save_path}")
            for ii in range(len(audio)):
                LOGGER.debug(f"\t\t input.shape: {audio[ii].shape}")
                audio[ii] = audio_transformer(audio[ii])
                LOGGER.debug(f"\t\t output.shape: {audio[ii].shape}")

            LOGGER.info("\tApplying FeatureExtractor on audio")
            feat_ext = FeatureExtractor(fs, feature_type=feature_type,
                                        energy=energy, deltas=deltas, nbanks=nbanks)
            cepstra = feat_ext.transform_data(audio)  # list of 2D arrays

            # filter out cepstra which contain nan values
            if filter_nan:
                LOGGER.info("\tFiltering out NaN values")
                # boolean list where False marks cepstra in which at least one nan value is present
                mask_nan = [not np.isnan(cepstrum).any() for cepstrum in cepstra]
                # mask out cepstra and their corresponding labels with nan values
                cepstra = list(compress(cepstra, mask_nan))
                labels = list(compress(labels, mask_nan))

            # SAVE cepstra to files (features)
            LOGGER.info("\tSaving cepstra to files")
            FeatureExtractor.save_cepstra(cepstra, full_save_path, exist_ok=True)
            LOGGER.debug(f"\t\tfull_save_path: {full_save_path}")

            # SAVE transcripts to files (labels)
            LOGGER.info("\tSaving transcripts to files")
            if dataset == 'pdtsc':
                pdtsc.save_labels([labels], save_path, exist_ok=True)
            elif dataset == 'oral':
                label_dict[file_names[i]] = labels
                oral.save_labels(label_dict, save_path, exist_ok=True)
            else:
                raise ValueError("'dataset' argument must be either 'pdtsc' or 'oral'")

            LOGGER.info("\tChecking SAVE/LOAD consistency")
            loaded_cepstra, loaded_cepstra_paths = FeatureExtractor.load_cepstra(full_save_path)
            loaded_labels, loaded_label_paths = DataLoader.load_labels(full_save_path)

            # flatten the lists
            loaded_cepstra, loaded_cepstra_paths, loaded_labels, loaded_label_paths = (
                loaded_cepstra[0], loaded_cepstra_paths[0], loaded_labels[0], loaded_label_paths[0])

            for j in range(len(cepstra)):
                if np.any(np.not_equal(cepstra[j], loaded_cepstra[j])):
                    raise UserWarning("Saved and loaded cepstra are not value consistent.")
                if dataset == 'pdtsc':
                    if np.any(np.not_equal(labels[j], loaded_labels[j])):
                        raise UserWarning("Saved and loaded labels are not value consistent.")
                elif dataset == 'oral':
                    if np.any(np.not_equal(labels[j][0], loaded_labels[j])):
                        raise UserWarning("Saved and loaded labels are not value consistent.")

                # add (cepstrum_path, label_path, cepstrum_length) tuple into collective list for sorting
                cepstra_length_list.append((loaded_cepstra_paths[j], loaded_label_paths[j],
                                            loaded_cepstra[j].shape[0]))
            LOGGER.debug(f'files from {file_names[i]} transformed and saved into {os.path.abspath(save_path)}.')

    # sort cepstra and labels by time length (number of frames)
    if sort:
        LOGGER.info("Sorting cepstra and labels by time length (number of frames)")
        # indices which sort the lists by cepstra length
        sort_indices = np.argsort([c[2] for c in cepstra_length_list])
        cepstra_length_list = [cepstra_length_list[i] for i in sort_indices]  # sort the cepstra list

        num_digits = len(str(len(cepstra_length_list)))

        for idx, file in enumerate(cepstra_length_list):
            cepstrum_path, label_path, _ = file
            os.rename(cepstrum_path, "{0}/cepstrum-{1:0{2}d}.npy".format(save_path, idx, num_digits))
            os.rename(label_path, "{0}/transcript-{1:0{2}d}.npy".format(save_path, idx, num_digits))

        subfolders = next(os.walk(save_path))[1]
        for folder in subfolders:
            try:
                os.rmdir(os.path.join(save_path, folder))
            except OSError:
                LOGGER.warning("Folder {} is not empty! Can't delete.".format(
                    os.path.join(save_path, folder)))
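
# Example call (hypothetical file pairs and output folder; PDTSC audio with
# matching transcripts, no speed perturbation):
#
# files = [('audio/rec001.wav', 'transcripts/rec001.txt'),
#          ('audio/rec002.wav', 'transcripts/rec002.txt')]
# prepare_data(files, 'data/prepared', dataset='pdtsc', speeds=(1.0,),
#              feature_type='MFSC', nbanks=40)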