Example #1
    def infer_embedding(self, partition):
        """infer embeddings given documents using trained model
        """
        infer_docs = []

        labels = dict()
        _, _, level_dev, level_train = load_label()
        labels['train'] = level_train
        labels['dev'] = level_dev

        with smart_open(self.data_config['transcript_preproc'][partition],
                        'rb',
                        encoding='utf-8') as all_data:
            for line_no, line in enumerate(all_data):
                tokens = gensim.utils.to_unicode(line).split()
                words = tokens
                tags = [line_no]
                sentiment = [labels[partition][line_no]]
                infer_docs.append(
                    self.interview_transcript(words, tags, sentiment))

        infer_vecs = [
            self.model.infer_vector(doc.words, alpha=.1) for doc in infer_docs
        ]
        infer_labels = [doc.sentiment for doc in infer_docs]

        # save inferred vectors and labels
        print("\nsaving inferred vectors and labels to file")
        if os.path.isdir(self.save_dir):
            np.save(os.path.join(self.save_dir, 'vectors_%s' % partition),
                    infer_vecs)
            np.save(os.path.join(self.save_dir, 'labels_%s' % partition),
                    infer_labels)
Example #2
    def prepare_data(self, corpus):
        """prepared training data
        """
        labels = dict()
        _, _, level_dev, level_train = load_label()
        labels['train'] = level_train
        labels['dev'] = level_dev

        # skip the test partition here, as it has no labels
        for partition in ['train', 'dev']:
            with smart_open(self.data_config['transcript_preproc'][partition],
                            'rb',
                            encoding='utf-8') as all_data:
                for line_no, line in enumerate(all_data):
                    tokens = gensim.utils.to_unicode(line).split()
                    words = tokens
                    tags = [line_no]
                    # partition is only 'train' or 'dev' here, both labelled
                    sentiment = [labels[partition][line_no]]
                    self.all_docs.append(
                        self.interview_transcript(words, tags, sentiment))
        # use an additional Turkish corpus for a performance boost
        if corpus:
            with smart_open(self.data_config['turkish_corpus_proc'],
                            'rb',
                            encoding='utf-8') as all_data:
                for line_no, line in enumerate(all_data):
                    tokens = gensim.utils.to_unicode(line).split()
                    words = tokens
                    tags = [line_no]
                    sentiment = [None]
                    self.all_docs.append(
                        self.interview_transcript(words, tags, sentiment))
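Note: once all_docs is filled, a doc2vec model can be trained on it directly,
since gensim's Doc2Vec only requires a `words` and a `tags` attribute per
document. A minimal sketch, with all_docs standing in for self.all_docs and
illustrative hyperparameters:

    import gensim

    model = gensim.models.Doc2Vec(vector_size=100, min_count=2, epochs=20)
    model.build_vocab(all_docs)
    model.train(all_docs,
                total_examples=model.corpus_count,
                epochs=model.epochs)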
Example #3
    def RF(self):
        print(
            "\nrunning RF on RF-selected features with doc2vec embeddings")

        # read the list of fused feature sets, closing the file afterwards
        with smart_open('./pre-trained/fusion/feature_list.txt',
                        'rb',
                        encoding='utf-8') as feature_path:
            feature_list = [str(line).replace('\n', '') for line in feature_path]

        # repeat the experiment to average over RF randomness
        for _ in range(3):
            for feature in feature_list:
                _, _, y_dev, y_train = load_label()
                y_train = y_train.astype('int')
                y_dev = y_dev.astype('int')

                X_train = np.load(
                    os.path.join('pre-trained', 'fusion', feature,
                                 'X_train.npy'))
                X_dev = np.load(
                    os.path.join('pre-trained', 'fusion', feature,
                                 'X_dev.npy'))

                random_forest = RandomForest(feature,
                                             X_train,
                                             y_train,
                                             X_dev,
                                             y_dev,
                                             test=False)
                random_forest.run()
                y_pred_train, y_pred_dev = random_forest.evaluate()
                get_UAR(y_pred_train,
                        y_train,
                        np.array([]),
                        'RF',
                        feature,
                        'multiple',
                        train_set=True,
                        test=False)
                get_UAR(y_pred_dev,
                        y_dev,
                        np.array([]),
                        'RF',
                        feature,
                        'multiple',
                        test=False)
Example #4
    def test_multi_task_dnn(self):
        X_train, y_train, inst_train, X_dev, y_dev, inst_dev = load_proc_baseline_feature('BoAW', verbose=True)
        ymrs_dev, ymrs_train, _, _ = load_label()

        self.assertEqual(X_train.shape[1], X_dev.shape[1])

        num_classes = max(max(y_train), max(y_dev))
        test_dnn = MultiTaskDNN('BoAW', X_train.shape[1], num_classes)

        y_dev_r = test_dnn.prepare_regression_label(ymrs_dev.values[:, 1], inst_dev)
        y_train_r = test_dnn.prepare_regression_label(ymrs_train.values[:, 1], inst_train)

        self.assertEqual(len(y_dev_r), len(y_dev))
        self.assertEqual(len(y_train_r), len(y_train))

        test_dnn.build_model()
        test_dnn.train_model(X_train, y_train, y_train_r, X_dev, y_dev, y_dev_r)
        test_dnn.evaluate_model(X_train, y_train, y_train_r, X_dev, y_dev, y_dev_r)
Example #5
def get_late_fusion_UAR(model_name,
                        feature_name_1,
                        feature_name_2,
                        baseline=False):
    """
    apply late fusion strategy on posterior probabilities of two modalities
    ---
    # para model_name: str
        given model name
    # para feature_name_1: str
        given 1st feature name
    # para feature_name_2: str
        given 2nd feature name
    # para baseline: bool
        whether to get baseline performance or not
    """
    prob_dev_1 = load_post_probability(model_name, feature_name_1)
    prob_dev_2 = load_post_probability(model_name, feature_name_2)

    assert prob_dev_1.shape == prob_dev_2.shape
    # both probability matrices have shape (3, 60): 3 classes x 60 dev instances

    _, _, level_dev, _ = load_label()
    y_dev = level_dev.values[:, 1]
    # get the shape
    (_, num_inst) = prob_dev_1.shape
    y_pred = np.array([0] * num_inst)

    for i in range(num_inst):
        prob = prob_dev_1[:, i] + prob_dev_2[:, i]
        # fuse by summing (i.e. averaging) the two modalities' class posteriors
        y_pred[i] = np.argmax(prob) + 1

    get_UAR(y_pred,
            y_dev,
            np.array([]),
            model_name,
            feature_name_1 + feature_name_2,
            'multiple',
            baseline=baseline,
            fusion=True)
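Note: a hypothetical call, assuming posterior probabilities for both modalities
were saved beforehand under these names (the model and feature names are
illustrative, not taken from the repository):

    # fuse text and audio-visual posteriors on the dev partition
    get_late_fusion_UAR('RF', 'doc2vec', 'DDAE', baseline=False)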
Example #6
    def DNN(self):
        print(
            "\nrunning multi-task DNN on RF-selected features with doc2vec embeddings"
        )

        # read the list of fused feature sets, closing the file afterwards
        with smart_open('./pre-trained/fusion/feature_list.txt',
                        'rb',
                        encoding='utf-8') as feature_path:
            feature_list = [str(line).replace('\n', '') for line in feature_path]

        feature = feature_list[0]

        X_train = np.load(
            os.path.join('pre-trained', 'fusion', feature, 'X_train_tree.npy'))
        X_dev = np.load(
            os.path.join('pre-trained', 'fusion', feature, 'X_dev_tree.npy'))

        y_dev_r, y_train_r, y_dev, y_train = load_label()
        y_train = y_train.astype('int')
        y_dev = y_dev.astype('int')
        num_classes = 3

        # hard-coded toggle left in the source: the multi-task branch is
        # currently disabled, so the single-task DNN below runs instead
        multi_task = False
        if multi_task:
            multi_dnn = MultiTaskDNN(feature, X_train.shape[1], num_classes)
            multi_dnn.build_model()
            multi_dnn.train_model(X_train, y_train, y_train_r, X_dev, y_dev,
                                  y_dev_r)
            multi_dnn.evaluate_model(X_train, y_train, y_train_r, X_dev, y_dev,
                                     y_dev_r)
        else:
            single_dnn = SingleTaskDNN(feature, X_train.shape[1], num_classes)
            single_dnn.build_model()
            single_dnn.train_model(X_train, y_train, X_dev, y_dev)
            single_dnn.evaluate_model(X_dev, y_dev)
Example #7
def get_UAR(y_pred,
            y_dev,
            inst,
            model_name,
            feature_name,
            modality,
            frame=True,
            session=True,
            baseline=False,
            train_set=False,
            fusion=False,
            test=False):
    """
    get UAR metric for both frame-level and session-level
    ---
    # para y_pred: np.array()
        predicted mania level for each frame
    # para y_dev: np.array()
        actual mania level for each frame
    # para inst: np.array()
        session mappings of frames
    # para model_name: str
        given model name
    # para feature_name: str
        given feature name
    # para modality: str
        either single or multiple
    # para frame: bool
        whether to get frame-level UAR or not
    # para session: bool
        whether to get session-level UAR or not
    # para baseline: bool
        whether to get baseline performance or not
    # para train_set: bool
        whether to get UAR on training set or not
    # para fusion: bool
        whether to fuse UAR or not
    # para test: bool
        whether the evaluation runs on the test partition (results are then not saved)
    """
    frame_res, session_res, precision, fscore = 0.0, 0.0, 0.0, 0.0
    modality = 'baseline' if baseline else modality

    # UAR for session-level only (AU features)
    if not inst.any():
        # get recalls for three classes
        recall = [0] * 3
        for i in range(3):
            index, = np.where(y_dev == (i + 1))
            index_pred, = np.where(y_pred[index] == (i + 1))
            recall[i] = len(index_pred) / len(index)  # TP / (TP + FN)
        session_res = np.mean(recall)
        np.save(
            os.path.join('pre-trained', 'baseline',
                         '%s_%s_results.npy' % (model_name, feature_name)),
            y_pred)
        if not fusion:
            if train_set:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (training set) is %.3f and %.3f (sklearn)"
                    % (feature_name, session_res,
                       recall_score(y_dev, y_pred, average='macro')))
            else:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (development set) is %.3f and %.3f (sklearn)"
                    % (feature_name, session_res,
                       recall_score(y_dev, y_pred, average='macro')))
                if not test:
                    session_res = recall_score(y_dev, y_pred, average='macro')
                    precision, _, fscore, _ = precision_recall_fscore_support(
                        y_dev, y_pred, average='macro')
                    save_UAR_results(frame_res, session_res, precision, fscore,
                                     model_name, feature_name, modality)
            print(
                classification_report(
                    y_dev,
                    y_pred,
                    target_names=['depression', 'hypo-mania', 'mania']))

        else:
            print(
                "\nUAR (mean of recalls) using fusion based on session-level is %.3f and %.3f"
                % (session_res, recall_score(y_dev, y_pred, average='macro')))
            if not test:
                session_res = recall_score(y_dev, y_pred, average='macro')
                precision, _, fscore, _ = precision_recall_fscore_support(
                    y_dev, y_pred, average='macro')
                save_UAR_results(frame_res, session_res, precision, fscore,
                                 model_name, 'fusion', modality)

    else:
        # UAR for frame-level
        if frame:
            # get recalls for three classes
            recall = [0] * 3
            for i in range(3):
                index, = np.where(y_dev == (i + 1))
                index_pred, = np.where(y_pred[index] == (i + 1))
                recall[i] = len(index_pred) / len(index)  # TP / (TP + FN)
            frame_res = np.mean(recall)
            if train_set:
                print(
                    "\nUAR (mean of recalls) using %s feature based on frame-level (training set) is %.3f"
                    % (feature_name, frame_res))
            else:
                print(
                    "\nUAR (mean of recalls) using %s feature based on frame-level (development set) is %.3f"
                    % (feature_name, frame_res))
            print(
                classification_report(
                    y_dev,
                    y_pred,
                    target_names=['depression', 'hypo-mania', 'mania']))

        # UAR for session-level
        if session:
            # get majority-voting for each session
            decision = np.zeros(inst.max(), dtype=int)
            for j in range(len(decision)):
                index, = np.where(inst == (j + 1))
                count = [0] * 3
                for k in range(3):
                    index_pred, = np.where(y_pred[index] == (k + 1))
                    count[k] = len(index_pred)
                decision[j] = np.argmax(count) + 1

            np.save(
                os.path.join('pre-trained', 'baseline',
                             '%s_%s_results.npy' % (model_name, feature_name)),
                decision)

            # get recalls for three classes
            recall = [0] * 3
            _, _, level_dev, _ = load_label()
            labels = level_dev
            labels = np.array(labels, dtype=np.int8)
            for i in range(3):
                index, = np.where(labels == (i + 1))
                index_pred, = np.where(decision[index] == (i + 1))
                recall[i] = len(index_pred) / len(index)  # TP / (TP + FN)
            session_res = np.mean(recall)
            if train_set:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (training set) is %.3f"
                    % (feature_name, session_res))

            else:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (development set) is %.3f"
                    % (feature_name, session_res))

        if not train_set and not test:
            precision, _, fscore, _ = precision_recall_fscore_support(
                y_dev, y_pred, average='macro')
            save_UAR_results(frame_res, session_res, precision, fscore,
                             model_name, feature_name, modality)

    return frame_res, session_res
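Note: get_UAR computes UAR by hand as the mean of per-class recalls and
cross-checks it against sklearn. A self-contained sketch showing that the two
computations agree on synthetic labels:

    import numpy as np
    from sklearn.metrics import recall_score

    y_true = np.array([1, 1, 2, 2, 3, 3, 3])
    y_hat = np.array([1, 2, 2, 2, 3, 1, 3])

    # mean of per-class recalls, mirroring the loop in get_UAR
    recall = []
    for i in range(3):
        index, = np.where(y_true == (i + 1))
        index_pred, = np.where(y_hat[index] == (i + 1))
        recall.append(len(index_pred) / len(index))

    # identical to sklearn's macro-averaged recall
    assert np.isclose(np.mean(recall),
                      recall_score(y_true, y_hat, average='macro'))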
Example #8
    def test_load_label(self):
        _, _, y_dev, y_train = load_label(verbose=True)
        y_dev = y_dev.values[:, 1]
        y_train = y_train.values[:, 1]
Example #9
    def RF_CV(self):
        print(
            "\nrunning 10-fold CV RF on RF-selected features with doc2vec embeddings"
        )

        # read the list of fused feature sets, closing the file afterwards
        with smart_open('./pre-trained/fusion/feature_list.txt',
                        'rb',
                        encoding='utf-8') as feature_path:
            feature_list = [str(line).replace('\n', '') for line in feature_path]

        from sklearn.metrics import precision_recall_fscore_support

        cv_results_UAR = dict()
        cv_results_UAP = dict()

        for feature in feature_list:
            cv_results_UAR[feature] = []
            cv_results_UAP[feature] = []

            _, _, y_dev, y_train = load_label()
            y_train = y_train.astype('int')
            y_dev = y_dev.astype('int')

            X_train = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_train.npy'))
            X_dev = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_dev.npy'))

            X = np.vstack((X_train, X_dev))
            y = np.hstack((y_train, y_dev))

            cv_ids = k_fold_cv(len(X))

            for cv_id in cv_ids:
                X_train = X[cv_id[0]]
                y_train = y[cv_id[0]]
                X_dev = X[cv_id[1]]
                y_dev = y[cv_id[1]]

                print('train on %d test on %d' % (len(y_train), len(y_dev)))

                random_forest = RandomForest(feature,
                                             X_train,
                                             y_train,
                                             X_dev,
                                             y_dev,
                                             test=False)
                random_forest.run()
                _, y_pred = random_forest.evaluate()
                precision, recall, _, _ = precision_recall_fscore_support(
                    y_dev, y_pred, average='macro')
                cv_results_UAR[feature].append(recall)
                cv_results_UAP[feature].append(precision)

            assert len(cv_results_UAR[feature]) == len(
                cv_results_UAP[feature]) == 10

        # dump both dicts as a single object so the file stays valid JSON
        # (two consecutive top-level objects in one file would not parse)
        with open(os.path.join('results', 'cross-validation.json'),
                  'w',
                  encoding='utf-8') as outfile:
            json.dump({'UAR': cv_results_UAR, 'UAP': cv_results_UAP}, outfile)
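Note: k_fold_cv is not shown in these examples; given the assertion of exactly
10 folds per feature, a plausible sketch built on sklearn's KFold (a
hypothetical helper, signature inferred from the call site):

    import numpy as np
    from sklearn.model_selection import KFold

    def k_fold_cv(n_samples, n_splits=10, seed=0):
        # return (train_idx, test_idx) pairs: cv_id[0]/cv_id[1] in the caller
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        return list(kf.split(np.arange(n_samples)))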
Example #10
    def FUSION(self):
        print(
            "\nrunning early fusion strategy on audio-visual-textual modalities"
        )
        # read the lists of pre-trained audio-visual and textual models
        with smart_open('./pre-trained/DDAE/model_list.txt',
                        'rb',
                        encoding='utf-8') as model_path_AV:
            model_list_AV = [str(line).replace('\n', '') for line in model_path_AV]
        with smart_open('./pre-trained/doc2vec/model_list.txt',
                        'rb',
                        encoding='utf-8') as model_path_T:
            model_list_T = [str(line).replace('\n', '') for line in model_path_T]

        _, _, y_dev, y_train = load_label()
        y_train = y_train.astype('int')
        y_dev = y_dev.astype('int')

        for AV in model_list_AV:
            for T in model_list_T:
                # derive a fusion feature name from the two model path suffixes
                feature_name = AV[19:-2] + T[22:]
                if os.path.isfile(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'X_train_tree.npy')) and os.path.isfile(
                                         os.path.join('pre-trained', 'fusion',
                                                      feature_name,
                                                      'X_dev_tree.npy')):
                    X_train_tree = np.load(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'X_train_tree.npy'))
                    X_dev_tree = np.load(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'X_dev_tree.npy'))

                else:
                    X_train_AV = np.load(
                        os.path.join(AV[:-2],
                                     'X_train_tree_%d.npy' % int(AV[-2:])))
                    X_dev_AV = np.load(
                        os.path.join(AV[:-2],
                                     'X_dev_tree_%d.npy' % int(AV[-2:])))
                    X_train_txt = np.load(os.path.join(T, 'vectors_train.npy'))
                    X_dev_txt = np.load(os.path.join(T, 'vectors_dev.npy'))

                    assert X_train_AV.shape[0] == X_train_txt.shape[0] == len(
                        y_train)
                    assert X_dev_AV.shape[0] == X_dev_txt.shape[0] == len(
                        y_dev)

                    X_train = np.hstack((X_train_AV, X_train_txt))
                    X_dev = np.hstack((X_dev_AV, X_dev_txt))

                    # the directory may already exist from a partial run
                    os.makedirs(os.path.join('pre-trained', 'fusion',
                                             feature_name),
                                exist_ok=True)
                    np.save(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'X_train'), X_train)
                    np.save(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'X_dev'), X_dev)

                    from sklearn.ensemble import RandomForestClassifier

                    model = RandomForestClassifier(n_estimators=800,
                                                   criterion='entropy')

                    df = pd.DataFrame(np.vstack((X_train, X_dev)))
                    feature_names = [
                        'feature_%d' % i for i in range(len(X_train[0]))
                    ]
                    df.columns = feature_names
                    y = np.hstack((y_train, y_dev))

                    model.fit(df, y)
                    importances = model.feature_importances_
                    print("\nfeature importance ranking")
                    indices = np.argsort(importances)[::-1]
                    for f in range(100):
                        print("%d. feature %d %s (%f)" %
                              (f + 1, indices[f], feature_names[indices[f]],
                               importances[indices[f]]))

                    indices = indices[:100]
                    np.save(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'feature_list'), indices)

                    X_train_df = pd.DataFrame(X_train)
                    X_train_df.columns = [
                        'feature_%d' % i for i in range(len(X_train[0]))
                    ]
                    X_train_tree = X_train_df.iloc[:, indices]

                    X_dev_df = pd.DataFrame(X_dev)
                    X_dev_df.columns = [
                        'feature_%d' % i for i in range(len(X_dev[0]))
                    ]
                    X_dev_tree = X_dev_df.iloc[:, indices]

                    np.save(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'X_train_tree'), X_train_tree)
                    np.save(
                        os.path.join('pre-trained', 'fusion', feature_name,
                                     'X_dev_tree'), X_dev_tree)
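Note on the design: this early-fusion step concatenates the audio-visual
representation with the doc2vec text vectors, ranks the combined dimensions
with an 800-tree entropy random forest, keeps the 100 most important ones, and
caches them as X_train_tree.npy / X_dev_tree.npy so the RF and DNN examples
above can reuse the reduced feature set.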
Example #11
def preprocess_BOXW(verbose=False):
    """preprocess Bags of X Words representations
    """
    # load directory from configuration file
    A_input_dir = data_config['data_path_local']['baseline']['BoAW']
    V_input_dir = data_config['data_path_local']['baseline']['BoVW']
    A_output_dir = data_config['baseline_preproc']['BoAW']
    V_output_dir = data_config['baseline_preproc']['BoVW']
    # load length from configuration file
    length = dict()
    length['train'] = data_config['length']['train']
    length['dev'] = data_config['length']['dev']
    length['test'] = data_config['length']['test']
    # load labels from configuration file
    _, _, level_dev, level_train = load_label()
    label_train, label_dev = level_train.values, level_dev.values
    labels = dict()
    labels['train'] = label_train[:, 1]
    labels['dev'] = label_dev[:, 1]

    for partition in ['train', 'dev']:
        # write handle
        A_label_f = smart_open(A_output_dir['%s_label' % partition],
                               'a+',
                               encoding='utf-8')
        V_label_f = smart_open(V_output_dir['%s_label' % partition],
                               'a+',
                               encoding='utf-8')
        A_inst_f = smart_open(A_output_dir['%s_inst' % partition],
                              'a+',
                              encoding='utf-8')
        V_inst_f = smart_open(V_output_dir['%s_inst' % partition],
                              'a+',
                              encoding='utf-8')
        A_data, V_data = None, None
        label = labels[partition]

        for i in range(length[partition]):

            A_feature = load_baseline_feature('BoAW', partition, (i + 1))
            V_feature = load_baseline_feature('BoVW', partition, (i + 1))
            A_t, _ = A_feature.shape
            V_t, _ = V_feature.shape
            # ensure timesteps match between audio and video
            timestep = min(A_t, V_t)
            A_feature = A_feature.iloc[:timestep, 2:]
            V_feature = V_feature.iloc[:timestep, 2:]
            # concatenate features
            A_data = A_feature.copy() if not i else pd.concat(
                [A_data, A_feature])
            V_data = V_feature.copy() if not i else pd.concat(
                [V_data, V_feature])
            # write labels and instances
            A_label_f.write(('%d,' % label[i]) * timestep)
            V_label_f.write(('%d,' % label[i]) * timestep)
            A_inst_f.write(('%d,' % (i + 1)) * timestep)
            V_inst_f.write(('%d,' % (i + 1)) * timestep)

            if verbose:
                print(A_feature.shape, V_feature.shape)
                print(A_data.shape, V_data.shape)

        # save to external files
        A_data.to_csv(A_output_dir['%s_data' % partition],
                      header=None,
                      index=None)
        V_data.to_csv(V_output_dir['%s_data' % partition],
                      header=None,
                      index=None)

        A_label_f.close()
        V_label_f.close()
        A_inst_f.close()
        V_inst_f.close()
Example #12
def preproc_baseline_feature(feature_name, verbose=False):
    """pre-process the baseline features (LLDs)
    """
    # para feature_name: which feature to pre-process
    # para verbose: whether or not to output more results
    no_train = data_config['train_len']
    no_dev = data_config['dev_len']
    # keep one instance in every # instances
    keep = data_config['keepinstance']

    def remove_if_exist(filename):
        if os.path.isfile(filename):
            os.remove(filename)

    # load output filenames
    train_data = data_config['baseline_preproc'][feature_name]['train_data']
    train_label = data_config['baseline_preproc'][feature_name]['train_label']
    train_inst = data_config['baseline_preproc'][feature_name]['train_inst']
    dev_data = data_config['baseline_preproc'][feature_name]['dev_data']
    dev_label = data_config['baseline_preproc'][feature_name]['dev_label']
    dev_inst = data_config['baseline_preproc'][feature_name]['dev_inst']

    # remove file if exists
    remove_if_exist(train_data)
    remove_if_exist(train_label)
    remove_if_exist(train_inst)
    remove_if_exist(dev_data)
    remove_if_exist(dev_label)
    remove_if_exist(dev_inst)

    # load the labels
    ymrs_train, ymrs_dev, level_dev, level_train = load_label()

    for partition in ['train', 'dev']:
        index_range = no_train if partition == 'train' else no_dev
        if verbose:
            print("\n----preprocessing on %s, dataset %s----" %
                  (feature_name, partition))

        if partition == 'train':
            data_loc, label_loc, inst_loc = train_data, train_label, train_inst
        else:
            data_loc, label_loc, inst_loc = dev_data, dev_label, dev_inst

        dataf = smart_open(data_loc, 'a+', encoding='utf-8')
        labelf = smart_open(label_loc, 'a+', encoding='utf-8')
        instf = smart_open(inst_loc, 'a+', encoding='utf-8')
        # count NaN frames dropped across the whole partition (initialising it
        # inside the sample loop would reset the total reported at the end)
        count_nan = 0

        for id in range(1, index_range + 1):
            sample = get_sample(partition, id)

            if partition == 'train':
                ymrs_sample = ymrs_train[ymrs_train.Instance_name ==
                                         sample].iat[0, 1]
                level_sample = level_train[level_train.Instance_name ==
                                           sample].iat[0, 1]
            else:
                ymrs_sample = ymrs_dev[ymrs_dev.Instance_name == sample].iat[0,
                                                                             1]
                level_sample = level_dev[level_dev.Instance_name ==
                                         sample].iat[0, 1]

            if verbose:
                print("YMRS score for %s is %d" % (sample, ymrs_sample))
                print("Mania level for %s is %d" % (sample, level_sample))

            feat = load_baseline_feature(feature_name, partition, id)
            no_frame, _ = feat.shape

            for i in range(0, no_frame, keep):
                if verbose:
                    print("\n----processing no. %d frame----" % i)
                data = feat.iloc[i, :]
                data = data[1:]  # remove name
                if data.isnull().values.any():
                    print("----NAN, DROP FEATURE----")
                    count_nan += 1
                    continue

                data_str = data.to_string(header=False, index=False)
                data_str = data_str.replace("\n", ",").replace(" ", "")

                # write baseline features to external file
                dataf.write(data_str)
                dataf.write("\n")
                # write baseline labels and instance to external file
                if id == 1 and i == 0:
                    labelf.write("%d" % level_sample)
                    instf.write("%d" % id)
                else:
                    labelf.write(",%d" % level_sample)
                    instf.write(",%d" % id)

            if verbose:
                print("\n----next feature----")
        if verbose:
            print("\n----%s partition done----" % partition)
            print("\n----ALL NAN DROPPED %d----" % count_nan)

        # close file handles
        dataf.close()
        labelf.close()
        instf.close()
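Note: load_label itself is not part of these examples. Judging from the call
sites (unpacking ymrs/level DataFrames and indexing .values[:, 1] or
.Instance_name), a plausible sketch with placeholder CSV paths follows; some
call sites unpack the YMRS pair in the opposite order, so treat the exact
ordering as illustrative only:

    import pandas as pd

    def load_label(verbose=False):
        # hypothetical reconstruction: four DataFrames with columns
        # [Instance_name, score]; the actual paths come from the data config
        ymrs_train = pd.read_csv('labels/ymrs_train.csv')    # placeholder
        ymrs_dev = pd.read_csv('labels/ymrs_dev.csv')        # placeholder
        level_train = pd.read_csv('labels/level_train.csv')  # placeholder
        level_dev = pd.read_csv('labels/level_dev.csv')      # placeholder
        if verbose:
            print(ymrs_train.shape, ymrs_dev.shape,
                  level_dev.shape, level_train.shape)
        return ymrs_train, ymrs_dev, level_dev, level_train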