import configparser
import os
import sys
import time

import numpy as np
import torch

# Helper routines such as dict_fea_lab_arch, compute_cw_max, load_chunk, load_dataset,
# context_window and is_sequential_dict are assumed to be defined elsewhere in this module.


def read_lab_fea(cfg_file, fea_only, shared_list, output_folder):

    # Reading the chunk-specific cfg file (first argument, mandatory)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n' % cfg_file)
        sys.exit(0)
    else:
        config = configparser.ConfigParser()
        config.read(cfg_file)

    # Reading some cfg parameters
    to_do = config['exp']['to_do']

    if to_do == 'train':
        max_seq_length = int(config['batches']['max_seq_length_train'])  # *(int(info_file[-13:-10])+1) # increasing over the epochs

    if to_do == 'valid':
        max_seq_length = int(config['batches']['max_seq_length_valid'])

    if to_do == 'forward':
        max_seq_length = -1  # to avoid breaking forward sentences

    [fea_dict, lab_dict, arch_dict] = dict_fea_lab_arch(config, fea_only)
    [cw_left_max, cw_right_max] = compute_cw_max(fea_dict)

    fea_index = 0
    cnt_fea = 0
    for fea in fea_dict.keys():

        # reading the features
        fea_scp = fea_dict[fea][1]
        fea_opts = fea_dict[fea][2]
        cw_left = int(fea_dict[fea][3])
        cw_right = int(fea_dict[fea][4])

        cnt_lab = 0

        # Production case: we don't have labels (lab_name = none)
        if fea_only:
            lab_dict.update({'lab_name': 'none'})

        for lab in lab_dict.keys():

            # Production case: we don't have labels (lab_name = none)
            if fea_only:
                lab_folder = None
                lab_opts = None
            else:
                lab_folder = lab_dict[lab][1]
                lab_opts = lab_dict[lab][2]

            [data_name_fea, data_set_fea, data_end_index_fea] = load_chunk(
                fea_scp, fea_opts, lab_folder, lab_opts, cw_left, cw_right, max_seq_length, output_folder, fea_only)

            # making the dimension the same for all the features (compensating for different context windows)
            labs_fea = data_set_fea[cw_left_max - cw_left:data_set_fea.shape[0] - (cw_right_max - cw_right), -1]
            data_set_fea = data_set_fea[cw_left_max - cw_left:data_set_fea.shape[0] - (cw_right_max - cw_right), 0:-1]
            data_end_index_fea = data_end_index_fea - (cw_left_max - cw_left)
            data_end_index_fea[-1] = data_end_index_fea[-1] - (cw_right_max - cw_right)

            if cnt_fea == 0 and cnt_lab == 0:
                data_set = data_set_fea
                labs = labs_fea
                data_end_index = data_end_index_fea
                data_name = data_name_fea

                fea_dict[fea].append(fea_index)
                fea_index = fea_index + data_set_fea.shape[1]
                fea_dict[fea].append(fea_index)
                fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])
            else:
                if cnt_fea == 0:
                    labs = np.column_stack((labs, labs_fea))

                if cnt_lab == 0:
                    data_set = np.column_stack((data_set, data_set_fea))
                    fea_dict[fea].append(fea_index)
                    fea_index = fea_index + data_set_fea.shape[1]
                    fea_dict[fea].append(fea_index)
                    fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])

                # Checks that the sentence ids are the same for all the features
                if not (data_name == data_name_fea):
                    sys.stderr.write('ERROR: different sentence ids are detected for the different features. Please check the input feature lists!\n')
                    sys.exit(0)

                # Checks that the end indexes are the same for all the features
                if not (data_end_index == data_end_index_fea).all():
                    sys.stderr.write('ERROR: end_index must be the same for all the sentences!\n')
                    sys.exit(0)

            cnt_lab = cnt_lab + 1

        cnt_fea = cnt_fea + 1

    cnt_lab = 0
    if not fea_only:
        for lab in lab_dict.keys():
            lab_dict[lab].append(data_set.shape[1] + cnt_lab)
            cnt_lab = cnt_lab + 1

    data_set = np.column_stack((data_set, labs))

    # automatically check whether the model is sequential
    seq_model = is_sequential_dict(config, arch_dict)

    # Randomize if the model is not sequential
    if not seq_model and to_do != 'forward':
        np.random.shuffle(data_set)

    # Split the dataset into many parts. If the dataset is too big, copying it into
    # shared memory can fail (due to pickle limits)
    # N_split = 10
    # data_set = np.array_split(data_set, N_split)

    # Adding all the elements to the shared list
    shared_list.append(data_name)
    shared_list.append(data_end_index)
    shared_list.append(fea_dict)
    shared_list.append(lab_dict)
    shared_list.append(arch_dict)
    shared_list.append(data_set)
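
# Usage sketch (illustrative, not part of the original module): the `shared_list`
# argument suggests read_lab_fea is meant to run in a worker process that hands its
# results back through shared memory. A minimal driver, assuming a chunk-specific
# cfg file already exists on disk, could look like this; the file names below are
# hypothetical.

def _example_read_lab_fea_in_subprocess():
    from multiprocessing import Manager, Process

    manager = Manager()
    shared_list = manager.list()  # proxy list visible to both processes

    p = Process(target=read_lab_fea,
                args=('exp/chunk_0.cfg',  # hypothetical chunk cfg file
                      False,              # fea_only: labels are available
                      shared_list,
                      'exp/output'))      # hypothetical output folder
    p.start()
    p.join()

    # read_lab_fea appends its results to the shared list in a fixed order
    data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set = shared_list
    return data_set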
# Fragment from the chunk-preparation step of the training loop (`config`, `to_do`,
# `fea_dict`, `lab_dict`, `seq_model`, `save_gpumem`, `use_cuda` and `is_production`
# are defined by the enclosing routine).

if to_do == 'train':
    max_seq_length = int(config['batches']['max_seq_length_train'])  # *(int(info_file[-13:-10])+1) # increasing over the epochs
    batch_size = int(config['batches']['batch_size_train'])

if to_do == 'valid':
    max_seq_length = int(config['batches']['max_seq_length_valid'])
    batch_size = int(config['batches']['batch_size_valid'])

if to_do == 'forward':
    max_seq_length = -1  # to avoid breaking forward sentences
    batch_size = 1

start_time = time.time()

# Compute the maximum context window in the feature dict
[cw_left_max, cw_right_max] = compute_cw_max(fea_dict)

# Reading all the features and labels
[data_name, data_set, data_end_index] = read_lab_fea(fea_dict, lab_dict, cw_left_max, cw_right_max, max_seq_length, is_production)

# Randomize if the model is not sequential
if not seq_model and to_do != 'forward':
    np.random.shuffle(data_set)

elapsed_time_reading = time.time() - start_time

# converting numpy tensors into pytorch tensors and putting them on the GPU if specified
start_time = time.time()
if not save_gpumem and use_cuda:
    data_set = torch.from_numpy(data_set).float().cuda()
else:
    data_set = torch.from_numpy(data_set).float()
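
# Sketch (an assumption, not from the snippet above): when save_gpumem is set, the
# full data_set stays in CPU memory and only the current minibatch is moved to the
# GPU, trading a per-batch transfer for a smaller resident GPU footprint. The helper
# name below is hypothetical.

def _example_fetch_minibatch(data_set, beg, end, use_cuda):
    minibatch = data_set[beg:end]
    if isinstance(minibatch, np.ndarray):
        minibatch = torch.from_numpy(minibatch).float()
    if use_cuda:
        minibatch = minibatch.cuda()  # move just this batch to the GPU
    return minibatch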
def _read_features_and_labels(fea_dict, lab_dict, max_seq_length, fea_only, output_folder):

    def _get_fea_config_from_dict(fea_dict_entr):
        fea_scp = fea_dict_entr[1]
        fea_opts = fea_dict_entr[2]
        cw_left = int(fea_dict_entr[3])
        cw_right = int(fea_dict_entr[4])
        return fea_scp, fea_opts, cw_left, cw_right

    def _get_lab_config_from_dict(lab_dict_entr, fea_only):
        if fea_only:
            lab_folder = None
            lab_opts = None
        else:
            lab_folder = lab_dict_entr[1]
            lab_opts = lab_dict_entr[2]
        return lab_folder, lab_opts

    def _compensate_for_different_context_windows(data_set_fea, data_set_lab, cw_left_max, cw_left, cw_right_max, cw_right, data_end_index_fea, data_end_index_lab):
        data_set_lab = np.take(data_set_lab, range(cw_left_max - cw_left, data_set_lab.shape[0] - (cw_right_max - cw_right)), axis=0, mode='clip')
        data_set_fea = np.take(data_set_fea, range(cw_left_max - cw_left, data_set_fea.shape[0] - (cw_right_max - cw_right)), axis=0, mode='clip')
        data_end_index_fea = data_end_index_fea - (cw_left_max - cw_left)
        data_end_index_lab = data_end_index_lab - (cw_left_max - cw_left)
        data_end_index_fea[-1] = data_end_index_fea[-1] - (cw_right_max - cw_right)
        data_end_index_lab[-1] = data_end_index_lab[-1] - (cw_right_max - cw_right)
        return data_set_lab, data_set_fea, data_end_index_fea, data_end_index_lab

    def _update_data(data_set, labs, fea_dict, fea, fea_index, data_set_fea, labs_fea, cnt_fea, cnt_lab):
        # Note: the conditions below mirror the if/else logic of read_lab_fea; the
        # second and third branches test cnt_lab, otherwise they would be unreachable.
        if cnt_fea == 0 and cnt_lab == 0:
            data_set = data_set_fea
            labs = labs_fea
            fea_dict[fea].append(fea_index)
            fea_index = fea_index + data_set_fea.shape[1]
            fea_dict[fea].append(fea_index)
            fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])
        elif cnt_fea == 0 and (not cnt_lab == 0):
            labs = np.column_stack((labs, labs_fea))
        elif (not cnt_fea == 0) and cnt_lab == 0:
            data_set = np.column_stack((data_set, data_set_fea))
            fea_dict[fea].append(fea_index)
            fea_index = fea_index + data_set_fea.shape[1]
            fea_dict[fea].append(fea_index)
            fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])
        return data_set, labs, fea_dict, fea_index

    def _check_consistency(data_name, data_name_fea, data_end_index_fea_ini, data_end_index_fea, data_end_index_lab_ini, data_end_index_lab):
        if not (data_name == data_name_fea):
            sys.stderr.write('ERROR: different sentence ids are detected for the different features. Please check the input feature lists!\n')
            sys.exit(0)
        if not (data_end_index_fea_ini == data_end_index_fea).all():
            sys.stderr.write('ERROR: end_index must be the same for all the sentences!\n')
            sys.exit(0)
        if not (data_end_index_lab_ini == data_end_index_lab).all():
            sys.stderr.write('ERROR: end_index must be the same for all the sentences!\n')
            sys.exit(0)

    def _update_lab_dict(lab_dict, data_set):
        cnt_lab = 0
        for lab in lab_dict.keys():
            lab_dict[lab].append(data_set.shape[1] + cnt_lab)
            cnt_lab = cnt_lab + 1
        return lab_dict

    def _load_chunk_refac01(fea_scp, fea_opts, lab_folder, lab_opts, left, right, max_sequence_length, output_folder, fea_only=False):
        [data_name, data_set, data_lab, end_index_fea, end_index_lab] = load_dataset(
            fea_scp, fea_opts, lab_folder, lab_opts, left, right, max_sequence_length, output_folder, fea_only)

        # TODO: this function currently only works well if no context window is given
        # or if fea and lab have the same time dimensionality

        # Context window
        if left != 0 or right != 0:
            data_set = context_window(data_set, left, right)
        end_index_fea = end_index_fea - left
        end_index_lab = end_index_lab - left
        end_index_fea[-1] = end_index_fea[-1] - right
        end_index_lab[-1] = end_index_lab[-1] - right

        # mean and variance normalization
        data_set = (data_set - np.mean(data_set, axis=0)) / np.std(data_set, axis=0)

        # Label processing
        data_lab = data_lab - data_lab.min()
        if right > 0:
            data_lab = data_lab[left:-right]
        else:
            data_lab = data_lab[left:]

        if len(data_set.shape) == 1:
            data_set = np.expand_dims(data_set, -1)

        return [data_name, data_set, data_lab, end_index_fea, end_index_lab]

    cw_left_max, cw_right_max = compute_cw_max(fea_dict)
    fea_index = 0
    cnt_fea = 0
    data_name = None
    data_end_index_fea_ini = None
    data_end_index_lab_ini = None
    data_set = None
    labs = None
    for fea in fea_dict.keys():
        fea_scp, fea_opts, cw_left, cw_right = _get_fea_config_from_dict(fea_dict[fea])
        cnt_lab = 0
        if fea_only:
            lab_dict.update({'lab_name': 'none'})
        for lab in lab_dict.keys():
            lab_folder, lab_opts = _get_lab_config_from_dict(lab_dict[lab], fea_only)
            data_name_fea, data_set_fea, data_set_lab, data_end_index_fea, data_end_index_lab = _load_chunk_refac01(
                fea_scp, fea_opts, lab_folder, lab_opts, cw_left, cw_right, max_seq_length, output_folder, fea_only)
            labs_fea, data_set_fea, data_end_index_fea, data_end_index_lab = _compensate_for_different_context_windows(
                data_set_fea, data_set_lab, cw_left_max, cw_left, cw_right_max, cw_right, data_end_index_fea, data_end_index_lab)
            if cnt_fea == 0 and cnt_lab == 0:
                data_end_index_fea_ini = data_end_index_fea
                data_end_index_lab_ini = data_end_index_lab
                data_name = data_name_fea
            data_set, labs, fea_dict, fea_index = _update_data(
                data_set, labs, fea_dict, fea, fea_index, data_set_fea, labs_fea, cnt_fea, cnt_lab)
            _check_consistency(data_name, data_name_fea, data_end_index_fea_ini, data_end_index_fea, data_end_index_lab_ini, data_end_index_lab)
            cnt_lab = cnt_lab + 1
        cnt_fea = cnt_fea + 1
    if not fea_only:
        lab_dict = _update_lab_dict(lab_dict, data_set)
    return data_name, data_end_index_fea_ini, data_end_index_lab_ini, fea_dict, lab_dict, data_set, labs
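
# Toy illustration (not part of the module): two feature streams with different context
# windows yield arrays of different lengths; trimming (cw_left_max - cw_left) frames at
# the head and (cw_right_max - cw_right) at the tail aligns them, which is what
# _compensate_for_different_context_windows does with np.take(..., mode='clip').

def _example_context_window_alignment():
    cw_left_max, cw_right_max = 5, 5       # widest context over all feature streams
    cw_left, cw_right = 2, 2               # context of this particular stream
    frames = np.arange(20).reshape(20, 1)  # fake per-frame features

    aligned = np.take(frames,
                      range(cw_left_max - cw_left, frames.shape[0] - (cw_right_max - cw_right)),
                      axis=0, mode='clip')
    # 20 frames minus 3 at the head and 3 at the tail -> 14 aligned frames
    assert aligned.shape[0] == 20 - (cw_left_max - cw_left) - (cw_right_max - cw_right)
    return aligned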