def __init__(self):
    super().__init__()
    fext = FeatureExtractor(remove_entities=self.remove_entities,
                            remove_nonalpha=self.remove_nonalpha,
                            remove_stopwords=self.remove_stopwords)
    self._X = fext.transform(self._X)
def TestExtractDayFeature():
    measurements = DataGetter.GetAllMeasurements(g_file_name,
                                                 i_patient_id_column_num=1,
                                                 i_datetime_column_num=2,
                                                 i_glucose_level_column_num=3)
    time_separator = datetime.datetime.strptime("05:00:00", "%H:%M:%S").time()
    measurements_per_day = FeatureExtractor.SeparateMeasurementsForDays(
        measurements, time_separator)
    day_measurement = measurements_per_day[1]
    day_feature = FeatureExtractor.ExtractDayFeature(day_measurement)
    assert day_feature
def numpy_to_tfrecord(path_to_files, output_folder='',
                      feature_names='cepstrum', label_names='transcript'):
    """

    :param path_to_files: (str) path to folder with subfolders of numpy files
    :param output_folder: (str)path to folder with output tfrecord files
    :param feature_names: sequence of symbols that can be used as common identifier for feature files
    :param label_names: sequence of symbols that can be used as common identifier for label files
    :return: None
    """

    path_to_files = os.path.normpath(path_to_files)
    folder_structure_gen = os.walk(path_to_files)  # ('path_to_current_folder', [subfolders], ['files', ...])

    for folder in folder_structure_gen:
        path, subfolders, files = folder
        if not files:
            continue
        feat_file_names = [f for f in files if feature_names in f]
        label_file_names = [f for f in files if label_names in f]

        print(path.split("\\"))

        if output_folder and isinstance(output_folder, str):
            output_path = os.path.join(os.path.normpath(output_folder), *path.split("\\")[-3:])
            os.makedirs(os.path.split(output_path)[0], exist_ok=True)
        else:
            output_path = os.path.splitext(path)[0]
        print(output_path)

        num_feats = len(feat_file_names)
        num_labels = len(label_file_names)

        assert num_feats == num_labels, \
            'There are {} feature files and {} label files (counts must match).'.format(num_feats, num_labels)

        tfrecord_path = output_path + '.tfrecord'
        writer = tf.io.TFRecordWriter(tfrecord_path)

        for i in range(num_feats):
            feat_load_path = os.path.join(path, feat_file_names[i])
            label_load_path = os.path.join(path, label_file_names[i])

            feat, _ = FeatureExtractor.load_cepstra(feat_load_path)
            label, _ = DataLoader.load_labels(label_load_path)

            #            print(feat[0][0].shape, label[0][0].shape)

            serialized = serialize_array(feat[0][0], label[0][0])
            writer.write(serialized)

        writer.close()

        print("Data written to {}".format(tfrecord_path))
def _read_next_file(self):
    assert self.m_next_file_index < len(self.m_files)
    src_file = self.m_files[self.m_next_file_index]
    con_ms = DataGetter.GetAllMeasurements(src_file,
                                           i_patient_id_column_num=1,
                                           i_datetime_column_num=2,
                                           i_glucose_level_column_num=3)
    self.m_cur_day_measurements = FeatureExtractor.SeparateMeasurementsForDays(
        con_ms, DayMeasurementsIterator.g_ts)
    self.m_next_file_index += 1
    self.m_next_day_measurement_index = 0
# Example #5
    def processor(self):
        pre_processor = PreProcessor()
        feature_extractor = FeatureExtractor()
        feature_selector = FeatureSelector()
        accuracy_checker = AccuracyChecker()
        y_train, x_train_sj, y_train_sj, x_train_iq, y_train_iq, x_test_sj, x_test_iq = self.read_data(
        )

        x_train_sj = pre_processor.impute_redundant_features(
            x_train_sj, self.impute_columns)
        x_train_iq = pre_processor.impute_redundant_features(
            x_train_iq, self.impute_columns)

        x_test_sj = pre_processor.impute_redundant_features(
            x_test_sj, self.impute_columns)
        x_test_iq = pre_processor.impute_redundant_features(
            x_test_iq, self.impute_columns)

        imputer_sj = Imputer(strategy='mean')
        x_train_sj = pre_processor.impute_missing_values(
            x_train_sj, self.features, imputer_sj)
        x_test_sj = pre_processor.impute_missing_values(
            x_test_sj, self.features, imputer_sj)

        imputer_iq = Imputer(strategy='mean')
        x_train_iq = pre_processor.impute_missing_values(
            x_train_iq, self.features, imputer_iq)
        x_test_iq = pre_processor.impute_missing_values(
            x_test_iq, self.features, imputer_iq)

        x_train_sj = feature_extractor.add_time_series_features(x_train_sj,
                                                                window=100)
        x_train_iq = feature_extractor.add_time_series_features(x_train_iq,
                                                                window=30)
        x_test_sj = feature_extractor.add_time_series_features(x_test_sj,
                                                               window=100)
        x_test_iq = feature_extractor.add_time_series_features(x_test_iq,
                                                               window=30)

        x_train_sj = feature_selector.drop_unnecessary_features(
            x_train_sj, self.drop_features, self.time_series_features)
        x_train_iq = feature_selector.drop_unnecessary_features(
            x_train_iq, self.drop_features, self.time_series_features)
        x_test_sj = feature_selector.drop_unnecessary_features(
            x_test_sj, self.drop_features, self.time_series_features)
        x_test_iq = feature_selector.drop_unnecessary_features(
            x_test_iq, self.drop_features, self.time_series_features)

        features_to_normalize = self.features + self.new_features

        x_train_sj[features_to_normalize] = x_train_sj[
            features_to_normalize].apply(pre_processor.normalize, axis=0)
        x_train_iq[features_to_normalize] = x_train_iq[
            features_to_normalize].apply(pre_processor.normalize, axis=0)
        x_test_sj[features_to_normalize] = x_test_sj[
            features_to_normalize].apply(pre_processor.normalize, axis=0)
        x_test_iq[features_to_normalize] = x_test_iq[
            features_to_normalize].apply(pre_processor.normalize, axis=0)

        x_train = pd.concat([x_train_sj, x_train_iq], axis=0)
        x_train.set_index('index', inplace=True)

        x_sj, y_sj = x_train.loc[x_train.city == 'sj', :], y_train.loc[
            x_train.city == 'sj', :]
        x_iq, y_iq = x_train.loc[x_train.city == 'iq', :], y_train.loc[
            x_train.city == 'iq', :]

        x_train_sj, x_cross_sj, y_train_sj, y_cross_sj = train_test_split(
            x_sj, y_sj, test_size=0.2, stratify=x_sj.weekofyear)

        x_train_iq, x_cross_iq, y_train_iq, y_cross_iq = train_test_split(
            x_iq, y_iq, test_size=0.2, stratify=x_iq.weekofyear)

        x_train_sj = feature_selector.select_features(x_train_sj,
                                                      self.features,
                                                      self.new_features)
        x_train_iq = feature_selector.select_features(x_train_iq,
                                                      self.features,
                                                      self.new_features)
        x_cross_sj = feature_selector.select_features(x_cross_sj,
                                                      self.features,
                                                      self.new_features)
        x_cross_iq = feature_selector.select_features(x_cross_iq,
                                                      self.features,
                                                      self.new_features)

        reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=5,
                                              n_estimators=500,
                                              random_state=67)
        reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=3,
                                              n_estimators=300,
                                              random_state=67)

        reg_sj_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)
        reg_iq_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)

        y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(
            reg_sj_gb, reg_iq_gb, x_train_sj, y_train_sj, x_train_iq,
            y_train_iq, x_cross_sj, x_cross_iq, "gb")
        y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(
            reg_sj_rf, reg_iq_rf, x_train_sj, y_train_sj, x_train_iq,
            y_train_iq, x_cross_sj, x_cross_iq, "rf")

        y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2,
                                                   y_iq_pred_m1, y_iq_pred_m2,
                                                   5, 3)
        print("San Juan:")
        accuracy_checker.cross_validate_out_of_sample(y_sj_pred,
                                                      y_cross_sj.total_cases)
        print("Iquitos:")
        accuracy_checker.cross_validate_out_of_sample(y_iq_pred,
                                                      y_cross_iq.total_cases)

        predict_sj = x_test_sj[self.keys].copy()
        predict_iq = x_test_iq[self.keys].copy()

        x_sj = feature_selector.select_features(x_sj, self.features,
                                                self.new_features)
        x_iq = feature_selector.select_features(x_iq, self.features,
                                                self.new_features)
        x_test_sj = feature_selector.select_features(x_test_sj, self.features,
                                                     self.new_features)
        x_test_iq = feature_selector.select_features(x_test_iq, self.features,
                                                     self.new_features)

        reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=5,
                                              n_estimators=500,
                                              random_state=67)
        reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=3,
                                              n_estimators=300,
                                              random_state=67)

        reg_sj_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)
        reg_iq_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)

        y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(
            reg_sj_gb, reg_iq_gb, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq,
            "gb")
        y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(
            reg_sj_rf, reg_iq_rf, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq,
            "rf")
        y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2,
                                                   y_iq_pred_m1, y_iq_pred_m2,
                                                   5, 3)
        predict_sj['total_cases'] = y_sj_pred.round().astype(int)
        predict_iq['total_cases'] = y_iq_pred.round().astype(int)

        predict_df = pd.concat([predict_sj, predict_iq], axis=0)
        predict_df.loc[predict_df.total_cases < 0, 'total_cases'] = 0

        self.write_results(predict_df)
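
    # model_trainor and ensemble_model are defined elsewhere in this class. As a
    # hedged sketch only: with the weights (5, 3) passed above, ensemble_model
    # could be a simple weighted average of the two models' predictions.
    def ensemble_model_sketch(self, y_sj_m1, y_sj_m2, y_iq_m1, y_iq_m2, w1, w2):
        # weighted average of gradient boosting (m1) and random forest (m2) outputs
        y_sj_pred = (w1 * y_sj_m1 + w2 * y_sj_m2) / (w1 + w2)
        y_iq_pred = (w1 * y_iq_m1 + w2 * y_iq_m2) / (w1 + w2)
        return y_sj_pred, y_iq_pred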
def feature_length_range(load_dir,
                         save_dir,
                         min_frame_length=100,
                         max_frame_length=3000,
                         mode='copy',
                         feature_names='cepstrum',
                         label_names='transcript'):
    """ Check individual files (features and their labels) in load_dir and copy/move those which satisfy the condition:
    min_frame_length <= feature_frame_len <= max_frame_length

    :param load_dir: folder from which to load features and their labels
    :param save_dir: folder to which copy/move the files which satisfy the condition above
    :param min_frame_length: lower bound of the feature frame length condition
    :param max_frame_length: upper bound of the feature frame length condition
    :param mode: 'copy'/'move' - condition satisfying files are copied/moved from load_dir to save_dir
    :param feature_names: sequence of symbols that can be used as common identifier for feature files
    :param label_names: sequence of symbols that can be used as common identifier for label files
    :return: None
    """

    # normalize the save directory path
    save_path = os.path.normpath(save_dir)

    folder_structure_gen = os.walk(
        load_dir)  # ('path_to_current_folder', [subfolders], ['files', ...])

    for folder in folder_structure_gen:
        path, subfolders, files = folder
        feat_file_names = [f for f in files if feature_names in f]
        label_file_names = [f for f in files if label_names in f]

        num_feats = len(feat_file_names)
        num_labels = len(label_file_names)

        assert num_feats == num_labels, 'There are {} feature files and {} label files (counts must match).'.format(
            num_feats, num_labels)

        rel_path = os.path.relpath(
            path, load_dir
        )  # relative position of current subdirectory in regards to load_dir
        save_full_path = os.path.join(
            save_path,
            rel_path)  # folder/subfolder to which save files in save_dir

        # make subdirectories in save_dir
        os.makedirs(save_full_path, exist_ok=True)

        for i in range(num_feats):
            feat_load_path = os.path.join(path, feat_file_names[i])
            label_load_path = os.path.join(path, label_file_names[i])
            feat_save_path = os.path.join(save_full_path, feat_file_names[i])
            label_save_path = os.path.join(save_full_path, label_file_names[i])

            feat, _ = FeatureExtractor.load_cepstra(feat_load_path)
            feat_frame_len = feat[0][0].shape[0]

            if min_frame_length <= feat_frame_len <= max_frame_length:
                if mode == 'copy':
                    shutil.copy2(feat_load_path, feat_save_path)
                    print("Copied {} to {}".format(feat_load_path,
                                                   feat_save_path))
                    shutil.copy2(label_load_path, label_save_path)
                    print("Copied {} to {}".format(label_load_path,
                                                   label_save_path))
                elif mode == 'move':
                    os.rename(feat_load_path, feat_save_path)
                    print("Moved {} to {}".format(feat_load_path,
                                                  feat_save_path))
                    os.rename(label_load_path, label_save_path)
                    print("Moved {} to {}".format(label_load_path,
                                                  label_save_path))
                else:
                    raise ValueError(
                        "argument 'mode' must be either 'copy' or 'move'")

        print("Finished.")
# Example #7
        words = sentence.split(" ")

    return sentence


if __name__ == '__main__':
    # set logging to only show errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    #    print("INITIALIZING LANGUAGE MODEL PIPELINE".center(50, "_")
    #    lm_mask_pipeline = masked_pipeline_from_trained_model(PREDICTION_FLAGS.models['lm_path'])

    print("INITIALIZING FEATURE EXTRACTOR".center(50, "_"))
    extractor = FeatureExtractor(
        PREDICTION_FLAGS.recording['rate'],
        feature_type=PREDICTION_FLAGS.features['type'],
        energy=PREDICTION_FLAGS.features['energy'],
        deltas=PREDICTION_FLAGS.features['deltas'])

    print("RECORDING AUDIO".center(50, "_"))
    timespan, frames, stream = record_audio(5)

    print("CONVERTING TO FEATURE REPRESENTATION".center(50, "_"))
    features = extractor.transform_data([np.array(frames)])[0]

    print("PREDICTING FROM SAVED MODEL".center(50, "_"))
    predictions = predict_from_saved_model(PREDICTION_FLAGS.models['am_path'],
                                           features)

    print("TRANSCRIBING TO STRINGS".center(50, "_"))
    string_predictions = convert_to_strings(predictions,
# Example #8
from FeatureExtraction import SeriesModifier
from FeatureExtraction import FeatureExtractor

src_path = u'D:/GDrive/Диплом 2/DataPreparation/output/Patient#7/Period_from_2000-07-16__9-52.xlsx'

measurements = DataGetter.GetAllMeasurements(src_path,
                                             i_patient_id_column_num=1,
                                             i_datetime_column_num=2,
                                             i_glucose_level_column_num=3)

smooth_measurements = SeriesModifier.Smooth(measurements)
extremal_indexes, bla = ExtremaFilter.FindExtremalMeasurements(
    smooth_measurements)

time_separator = datetime.datetime.strptime("05:00:00", "%H:%M:%S").time()
measurements_per_day = FeatureExtractor.SeparateMeasurementsForDays(
    measurements, time_separator)
rise_features = []
for day_measurement in measurements_per_day:
    day_feature = FeatureExtractor.ExtractDayFeature(day_measurement)
    if day_feature:
        rise_features.extend(day_feature.GetRiseFeatures())

# wrap map() in list() so np.array builds a proper numeric array under Python 3
gl_b_level = np.array(
    list(map(lambda x: x.GetBeforeMax().GetGlucoseLevel(), rise_features)))
dt_b = np.array(list(map(lambda x: x.GetBeforeMax().GetDateTime(), rise_features)))

gl_a_level = np.array(
    list(map(lambda x: x.GetAfterMax().GetGlucoseLevel(), rise_features)))
dt_a = np.array(list(map(lambda x: x.GetAfterMax().GetDateTime(), rise_features)))

gl_m_level = np.array(
# Example #9
    SentenceBlockMedianPooler, SentenceLevelAveragePooler,
    SentenceLevelMedianPooler, SentenceLevelMaximumPooler,
    StringConcatenationEncoder)

# Hypothesis 1: input clean-up improves performance

datasets = [TitleTextDataset]
hypothesis = 'fext'

pipelines = [('nb_baseline', Pipeline([('vec', None),
                                       ('cls', MultinomialNB())])),
             ('svm_baseline', Pipeline([('vec', None), ('cls', LinearSVC())])),
             ('log_baseline',
              Pipeline([('vec', None), ('cls', LogisticRegression())])),
             ('nb_fext',
              Pipeline([('fext', FeatureExtractor()), ('vec', None),
                        ('cls', MultinomialNB())])),
             ('svm_fext',
              Pipeline([('fext', FeatureExtractor()), ('vec', None),
                        ('cls', LinearSVC())])),
             ('log_fext',
              Pipeline([('fext', FeatureExtractor()), ('vec', None),
                        ('cls', LogisticRegression())]))]

HYPOTHESIS_1 = (hypothesis, datasets, pipelines)
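
# The ('vec', None) placeholder steps above are presumably filled in later by the
# experiment runner. A minimal sketch of one way to do that (an assumption, not
# the project's actual runner) is to supply vectorizers through a parameter grid:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

example_param_grid = {'vec': [CountVectorizer(), TfidfVectorizer(sublinear_tf=True)]}
# e.g. GridSearchCV(pipelines[0][1], example_param_grid, cv=5).fit(texts, labels)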

# Hypothesis 2: title information improves performance

datasets = [TextDataset, TitleDataset, TitleTextDataset]
hypothesis = 'title'
pipelines = [('nb_baseline', Pipeline([('vec', None),
# Example #10
def prepare_data(files,
                 save_folder,
                 dataset="pdtsc",
                 label_max_duration=10.0,
                 speeds=(0.9, 1.0, 1.1),
                 feature_type="MFSC",
                 bigrams=False,
                 repeated=False,
                 energy=True,
                 deltas=(0, 0),
                 nbanks=40,
                 filter_nan=True,
                 sort=True):
    cepstra_length_list = []

    file_names = get_file_names(files)

    for speed in speeds:
        LOGGER.info(f"Create audio_transormer for speed {speed}")
        audio_transformer = (AudioEffectsChain().speed(speed))
        save_path = os.path.join(save_folder, f"{speed}/")
        LOGGER.debug(f"Current save_path: {save_path}")
        for i, file in enumerate(files):
            if dataset == "pdtsc":
                pdtsc = PDTSCLoader([file[0]], [file[1]], bigrams, repeated)
                labels = pdtsc.transcripts_to_labels(
                )  # list of lists of 1D numpy arrays
                labels = labels[0]  # flatten label list
                audio_list, fs = pdtsc.load_audio()
                audio = audio_list[0]
                fs = fs[0]
                LOGGER.debug(
                    f"Loaded PDTSC with fs {fs} from:\n \t audio_path: {file[0]}\n \t transcript_path: {file[1]}"
                )
            elif dataset == "oral":
                oral = OralLoader([file[0]], [file[1]], bigrams, repeated)
                label_dict = oral.transcripts_to_labels(
                    label_max_duration
                )  # Dict['file_name':Tuple[sents_list, starts_list, ends_list]]
                audio_dict, fs_dict = oral.load_audio()  # Dicts['file_name']

                labels = label_dict[file_names[i]]
                audio = audio_dict[file_names[i]]
                fs = fs_dict[file_names[i]]
                LOGGER.debug(
                    f"Loaded ORAL with fs {fs} from:\n \t audio_path: {file[0]}\n \t transcript_path: {file[1]}"
                )
            else:
                raise ValueError(
                    "'dataset' argument must be either 'pdtsc' or 'oral'")

            full_save_path = os.path.join(save_path, file_names[i])

            LOGGER.info(
                f"\tApplying SoX transformation on audio from {full_save_path}")
            for ii in range(len(audio)):
                LOGGER.debug(f"\t\t input.shape: {audio[ii].shape}")
                audio[ii] = audio_transformer(audio[ii])
                LOGGER.debug(f"\t\t output.shape: {audio[ii].shape}")

            LOGGER.info(f"\tApplying FeatureExtractor on audio")
            feat_ext = FeatureExtractor(fs,
                                        feature_type=feature_type,
                                        energy=energy,
                                        deltas=deltas,
                                        nbanks=nbanks)
            cepstra = feat_ext.transform_data(audio)  # list of 2D arrays

            # filter out cepstra which are containing nan values
            if filter_nan:
                LOGGER.info(f"\tFiltering out NaN values")
                # boolean list where False marks cepstra in which there is at least one nan value present
                mask_nan = [
                    not np.isnan(cepstrum).any() for cepstrum in cepstra
                ]

                # mask out cepstra and their corresponding labels with nan values
                cepstra = list(compress(cepstra, mask_nan))
                labels = list(compress(labels, mask_nan))

            # SAVE Cepstra to files (features)
            LOGGER.info(f"\tSaving cepstra to files")
            FeatureExtractor.save_cepstra(cepstra,
                                          full_save_path,
                                          exist_ok=True)
            LOGGER.debug(f"\t\tfull_save_path: {full_save_path}")

            # SAVE Transcripts to files (labels)
            LOGGER.info(f"\tSaving transcripts to files")
            if dataset == 'pdtsc':
                pdtsc.save_labels([labels], save_path, exist_ok=True)
            elif dataset == 'oral':
                label_dict[file_names[i]] = labels
                oral.save_labels(label_dict, save_path, exist_ok=True)
            else:
                raise ValueError(
                    "'dataset' argument must be either 'pdtsc' or 'oral'")

            LOGGER.info(f"\tChecking SAVE/LOAD consistency")
            loaded_cepstra, loaded_cepstra_paths = FeatureExtractor.load_cepstra(
                full_save_path)
            loaded_labels, loaded_label_paths = DataLoader.load_labels(
                full_save_path)

            # flatten the lists
            loaded_cepstra, loaded_cepstra_paths, loaded_labels, loaded_label_paths = (
                loaded_cepstra[0], loaded_cepstra_paths[0], loaded_labels[0],
                loaded_label_paths[0])

            for j in range(len(cepstra)):
                if np.any(np.not_equal(cepstra[j], loaded_cepstra[j])):
                    raise UserWarning(
                        "Saved and loaded cepstra are not value consistent.")
                if dataset == 'pdtsc':
                    if np.any(np.not_equal(labels[j], loaded_labels[j])):
                        raise UserWarning(
                            "Saved and loaded labels are not value consistent."
                        )
                elif dataset == 'oral':
                    if np.any(np.not_equal(labels[j][0], loaded_labels[j])):
                        raise UserWarning(
                            "Saved and loaded labels are not value consistent."
                        )

                # add (cepstrum_path, label_path, cepstrum_length) tuple into collective list for sorting
                cepstra_length_list.append(
                    (loaded_cepstra_paths[j], loaded_label_paths[j],
                     loaded_cepstra[j].shape[0]))

            LOGGER.debug(
                f'files from {file_names[i]} transformed and saved into {os.path.abspath(save_path)}.'
            )

        # sort cepstra and labels by time length (number of frames)
        if sort:
            LOGGER.info(
                f"Sorting cepstra and labels by time length (number of frames)"
            )
            sort_indices = np.argsort([
                c[2] for c in cepstra_length_list
            ])  # indices which sort the lists by cepstra length
            cepstra_length_list = [
                cepstra_length_list[i] for i in sort_indices
            ]  # sort the cepstra list

            num_digits = len(str(len(cepstra_length_list)))

            for idx, file in enumerate(cepstra_length_list):
                cepstrum_path, label_path, _ = file
                os.rename(
                    cepstrum_path, "{0}/cepstrum-{1:0{2}d}.npy".format(
                        save_path, idx, num_digits))
                os.rename(
                    label_path, "{0}/transcript-{1:0{2}d}.npy".format(
                        save_path, idx, num_digits))
            subfolders = next(os.walk(save_path))[1]
            for folder in subfolders:
                try:
                    os.rmdir(os.path.join(save_path, folder))
                except OSError:
                    LOGGER.warning(
                        "Folder {} is not empty! Can't delete.".format(
                            os.path.join(save_path, folder)))
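
# Hypothetical invocation of prepare_data; the (audio, transcript) file pairs and
# the save folder below are placeholders, not paths from the original project.
if __name__ == '__main__':
    example_files = [('data/raw/utt_0001.wav', 'data/raw/utt_0001.txt')]
    prepare_data(example_files,
                 save_folder='data/prepared',
                 dataset='pdtsc',
                 speeds=(1.0,))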