def get_data_by_id(feature_table, gender=False):
    """Join a feature table onto the train/dev/test splits by participant ID.

    Args:
        feature_table: name of the DB table holding the features to fetch.
        gender: when False, return (train, dev, test) merged frames.
            When True, split each set by the 'Gender' column first
            (1 = male, 0 = female) and return
            (train_male, dev_male, test_male,
             train_female, dev_female, test_female).
    """
    handler = SqlHandler()
    features = handler.get_df(feature_table)
    # Coerce ID to numeric so the merge keys line up with the split tables.
    features['ID'] = features['ID'].apply(pd.to_numeric)

    train = handler.get_df(config.tbl_train_ros_set)
    dev = handler.get_df(config.tbl_develop_set)
    test = handler.get_df(config.tbl_test_set)

    if not gender:
        # Merge features onto each split (same join as merge_df_by_id elsewhere).
        return (merge_df_by_id(train, features),
                merge_df_by_id(dev, features),
                merge_df_by_id(test, features))

    # Gender-aware variant: filter each split by gender code, then merge.
    merged = []
    for gender_code in (1, 0):  # 1 = male first, then 0 = female
        for split in (train, dev, test):
            subset = split[split['Gender'] == gender_code]
            merged.append(merge_df_by_id(subset, features))
    return tuple(merged)
def get_data_multi_modality(tables, gender=False):
    """Gather data from different tables in every modality and generate
    train and dev sets for each of them.

    Args:
        tables: iterable of table names; each is routed to the audio, video,
            or text modality based on AUDIO_TABLE / VIDEO_TABLE / TEXT_TABLE.
        gender: when True, additionally split train/dev by the 'Gender'
            column (1 = male, 0 = female) and nest the result under
            'male'/'female' keys.

    Returns:
        dict mapping '<modality>_train' / '<modality>_dev' to merged frames
        (nested one level deeper under 'male'/'female' when gender=True).
        NOTE(review): the 'vedio_*' key spelling is kept as-is — callers
        depend on these exact keys.
    """
    handler = SqlHandler()
    audio_frames, video_frames, text_frames = [], [], []
    for table in tables:
        # Route each table to its modality bucket; unknown tables are skipped.
        if table in AUDIO_TABLE:
            audio_frames.append(handler.get_df(table))
        elif table in VIDEO_TABLE:
            video_frames.append(handler.get_df(table))
        elif table in TEXT_TABLE:
            text_frames.append(handler.get_df(table))

    audio_all = merge_dfs_by_id(audio_frames)
    video_all = merge_dfs_by_id(video_frames)
    text_all = merge_dfs_by_id(text_frames)

    train = handler.get_df(config.tbl_train_ros_set)
    dev = handler.get_df(config.tbl_develop_set)

    def build(train_split, dev_split):
        # One train/dev bundle for every modality.
        return {
            'audio_train': merge_df_by_id(train_split, audio_all),
            'audio_dev': merge_df_by_id(dev_split, audio_all),
            'vedio_train': merge_df_by_id(train_split, video_all),
            'vedio_dev': merge_df_by_id(dev_split, video_all),
            'text_train': merge_df_by_id(train_split, text_all),
            'text_dev': merge_df_by_id(dev_split, text_all)
        }

    if not gender:
        return build(train, dev)

    return {
        'male': build(train[train['Gender'] == 1], dev[dev['Gender'] == 1]),
        'female': build(train[train['Gender'] == 0], dev[dev['Gender'] == 0])
    }
def get_data_by_id(feature_table, gender=False):
    """Join a feature table onto the train/dev splits by participant ID.

    NOTE(review): this redefines get_data_by_id from earlier in this file
    (the earlier version also returns a test split and reads
    config.tbl_train_ros_set); being later, this definition shadows it —
    confirm which one is intended and rename one of them.

    Args:
        feature_table: name of the DB table holding the features to fetch.
        gender: when False, return (train_set, dev_set).
            When True, split by the 'Gender' column (1 = male, 0 = female)
            and return (train_male, dev_male, train_female, dev_female).
    """
    sql_handler = SqlHandler()
    feature = sql_handler.get_df(feature_table)
    # Coerce ID to numeric so the merge keys line up with the split tables.
    feature['ID'] = feature['ID'].apply(pd.to_numeric)
    train = sql_handler.get_df(config.tbl_training_set)
    dev = sql_handler.get_df(config.tbl_develop_set)

    if not gender:
        return merge_df_by_id(train, feature), merge_df_by_id(dev, feature)

    train_male = merge_df_by_id(train[train['Gender'] == 1], feature)
    train_female = merge_df_by_id(train[train['Gender'] == 0], feature)
    # BUG FIX: the dev splits were previously filtered from `train`
    # (dev_male = train[train['Gender'] == 1]), so the "dev" gender subsets
    # were actually training data. Filter from `dev` instead.
    dev_male = merge_df_by_id(dev[dev['Gender'] == 1], feature)
    dev_female = merge_df_by_id(dev[dev['Gender'] == 0], feature)
    return train_male, dev_male, train_female, dev_female
def createCodebook(feature_name):
    """Stack per-participant LLD files and learn an openXBOW codebook.

    For a random subset of train+dev participants, reads the raw low-level
    descriptor CSVs (COVAREP, FORMANT, FAUs, or gaze+pose), cleans them,
    stacks them into one CSV under `codebook_out`, then runs openXBOW
    (kmeans++, 100 codewords) over the stacked file to write a codebook.

    Equivalent manual invocation:
        java -jar openXBOW.jar -i llds.csv -o bow.csv -a 1 -log \
            -c kmeans++ -size 100 -B codebook -writeName -writeTimeStamp

    Args:
        feature_name: one of 'faus', 'gaze_pose', 'covarep'; anything else
            is treated as FORMANT.
    """
    sqlhandler = SqlHandler()
    train = sqlhandler.get_df(config.tbl_training_set)
    dev = sqlhandler.get_df(config.tbl_develop_set)
    trainDev = np.hstack([train['Participant_ID'].values,
                          dev['Participant_ID'].values])
    # Sample a subset of participants to keep clustering tractable
    # (use 50 for video features, 20 for audio).
    folds = np.random.choice(trainDev, 20, replace=False)

    window_size = 4
    hop_size = 1
    openxbow = 'java -jar E:/openXBOW/openXBOW.jar '
    openxbow_options = '-writeName -writeTimeStamp -t ' + str(
        window_size) + ' ' + str(hop_size)
    codebook_out = 'E:/openXBOW/codebooks/'
    openxbow_options_codebook = f'-size 100 -a 1 -log -B {codebook_out}{feature_name}_codebook '

    def _run_openxbow(stacked_csv):
        # Launch openXBOW once on the fully stacked LLD file. Previously the
        # command ran inside each fold loop, redoing the clustering per fold;
        # the final codebook is the same but the extra runs were wasted work.
        os.system(openxbow + f'-standardizeInput -i {codebook_out}{stacked_csv} '
                  + openxbow_options_codebook + openxbow_options
                  + ' -c kmeans++' + f' -o {codebook_out}temp.csv')

    if feature_name == 'faus':
        for fold in folds:
            path = config.data_dir + str(fold) + '_P/' + str(
                fold) + '_' + SUFFIX['au']
            feature = np.loadtxt(path, delimiter=',', skiprows=1)
            # Keep only rows where column 3 == 1 (presumably the OpenFace
            # 'success' flag — confirm against the CSV header), then take
            # timestamp + AU columns and drop the two flag columns.
            success = feature[:, 3] == 1
            feature = feature[success, 1:18]
            feature = np.delete(feature, [1, 2], axis=1)
            save_features(codebook_out + 'fausTrainDevRandom.csv',
                          feature, append=True, instname=str(fold))
        _run_openxbow('fausTrainDevRandom.csv')
    elif feature_name == 'gaze_pose':
        for fold in folds:
            path1 = config.data_dir + str(fold) + '_P/' + str(
                fold) + '_' + SUFFIX['gaze']
            path2 = config.data_dir + str(fold) + '_P/' + str(
                fold) + '_' + SUFFIX['pose']
            gaze_data = pd.read_csv(path1)
            pose_data = pd.read_csv(path2)
            if fold in [367, 396, 432]:
                # Participants 367/396/432 have ' -1.#IND' junk values in the
                # pose file; drop those rows, then strip the leading space and
                # parse the last 6 (pose) columns as floats.
                # NOTE(review): the mask is computed on pose_data but applied
                # to the merged frame — this assumes the merge keeps rows
                # aligned 1:1 with pose_data; verify.
                temp = np.all(pose_data.values != ' -1.#IND', axis=1)
                # merge key = frame, timestamp, confidence, success
                data = pd.merge(gaze_data, pose_data)
                data = data[temp]
                data.iloc[:, -6:] = data.iloc[:, -6:].applymap(
                    lambda x: float(x[1:]))
            else:
                data = pd.merge(gaze_data, pose_data)
            success = data[' success'] == 1
            data = data.values[:, 1:]
            data = np.delete(data, [1, 2], axis=1)
            data = data[success]
            # BUG FIX: this file was written as 'gazePoseTrainDevRandom.csv'
            # but openXBOW was pointed at 'gazeposeTrainDevRandom.csv' —
            # a different file on any case-sensitive filesystem. The two
            # names now match.
            save_features(codebook_out + 'gazePoseTrainDevRandom.csv',
                          data, append=True, instname=str(fold))
        _run_openxbow('gazePoseTrainDevRandom.csv')
    elif feature_name == 'covarep':
        for fold in folds:
            path = config.data_dir + str(fold) + '_P/' + str(
                fold) + '_' + SUFFIX['covarep']
            data = np.loadtxt(path, delimiter=',')
            # COVAREP files carry no timestamp column; synthesize one at
            # 100 frames/second.
            timestamp = np.arange(0, data.shape[0]).reshape(data.shape[0], 1)
            timestamp = timestamp / 100
            data = np.hstack([timestamp, data])
            # Keep rows whose column 2 == 1 (presumably the VUV/validity
            # flag — confirm against the COVAREP spec), then drop the flag.
            data = data[data[:, 2] == 1]
            data = np.delete(data, 2, axis=1)
            # Zero out NaN/Inf so openXBOW standardization doesn't blow up.
            data[np.isnan(data)] = 0
            data[np.isinf(data)] = 0
            save_features(codebook_out + 'covarepTrainDevRandom.csv',
                          data, append=True, instname=str(fold))
        _run_openxbow('covarepTrainDevRandom.csv')
    else:
        # Fallback: FORMANT features (also timestamp-less, 100 Hz).
        for fold in folds:
            path = config.data_dir + str(fold) + '_P/' + str(
                fold) + '_' + SUFFIX['formant']
            data = np.loadtxt(path, delimiter=',')
            timestamp = np.arange(0, data.shape[0]).reshape(data.shape[0], 1)
            timestamp = timestamp / 100
            data = np.hstack([timestamp, data])
            data[np.isnan(data)] = 0
            data[np.isinf(data)] = 0
            save_features(codebook_out + 'formantTrainDevRandom.csv',
                          data, append=True, instname=str(fold))
        _run_openxbow('formantTrainDevRandom.csv')