print('using cpu')  # NOTE(review): looks like the tail of a conditional that starts before this excerpt


def iterate_data(h5_file):
    """Yield each child of the root node of ``h5_file``.

    Flickr does not need to be split at the root node, so the children of
    the root are yielded exactly as they are stored.
    """
    yield from h5_file.root


# Collect the nodes of both feature files into lists.
f_nodes_mfcc = list(iterate_data(data_file))
f_nodes_flickr = list(iterate_data(flickr_file))

# Split the database into train, validation and test sets. The default
# settings use the JSON file with the Karpathy split.
train, val, test = split_data_flickr(f_nodes_flickr, args.split_loc)

# NOTE(review): absolute, machine-specific path; this only resolves on the
# original author's machine.
textcp = pd.read_csv(
    "/Users/sebastiaanscholten/Documents/speech2image-master/vgsexperiments/experiments/Results_isolated_word_recognition/documents/textcp.csv"
)

# Presumably the target words of the isolated word recognition experiment
# (judging by the results path above) - TODO confirm.
testlist = [
    "dog", "man", "boy", "girl", "woman", "people", "dogs",
    "shirt", "child", "ball", "person", "children", "men", "girls",
    "bike", "rock", "camera", "boys", "hat", "player", "jacket",
    "basketball", "swing", "car", "wall", "hair", "football", "sunglasses",
    "head", "shorts", "dress", "table", "water", "grass", "bench",
    "snow", "air", "field", "street", "mouth", "dirt", "mountain",
    "pool", "ocean", "sand", "building", "soccer", "park", "face",
]
# Check if CUDA is available and the user wants to run on the GPU.
cuda = args.cuda and torch.cuda.is_available()
if cuda:
    print('using gpu')
else:
    print('using cpu')


def iterate_data(h5_file):
    """Yield each child of the root node of ``h5_file``.

    Flickr does not need to be split at the root node, so the children of
    the root are yielded exactly as they are stored.
    """
    yield from h5_file.root


f_nodes = list(iterate_data(data_file))

# Split the database into train, validation and test sets. The default
# settings use the JSON file with the Karpathy split.
# Unpacked as (train, val, test): that is the order used by the other
# split_data_flickr call sites in this file. The previous (train, test, val)
# order swapped the validation and test sets.
# NOTE(review): confirm against the return statement of split_data_flickr.
train, val, test = split_data_flickr(f_nodes, args.split_loc)

############################### Neural network setup #################################################
# network modules
img_net = img_encoder(image_config)
cap_net = audio_rnn_encoder(audio_config)

# Adam optimiser over the parameters of both encoders. I found SGD to work
# terribly and could not find appropriate parameter settings for it.
# NOTE(review): a learning rate of 1 is far above Adam's default; presumably a
# scheduler attached further down rescales it - confirm.
optimizer = torch.optim.Adam(
    list(img_net.parameters()) + list(cap_net.parameters()),
    lr=1,
)

#plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.9, patience = 100,
#                                                   threshold = 0.0001, min_lr = 1e-8, cooldown = 100)
#step_scheduler = lr_scheduler.StepLR(optimizer, 1000, gamma=0.1, last_epoch=-1)
# NOTE(review): the line below is a whitespace-collapsed fragment (its original
# newlines were lost); because it starts with '#', Python currently reads the
# whole line as a single comment. It is left byte-identical because it begins
# inside a generator body and ends part-way through audio_rnn_encoder.__init__,
# so it cannot be re-indented safely from this excerpt alone.
# Contents, in order:
#   1. Tail of a generator: reshapes `images` to
#      (images_shape[0], images_shape[2]), converts it with np.float64 and
#      yields (images, speech, caption, lengths).
#   2. Script setup: loads the word frequency dictionary from dict_path, keeps
#      the words returned by select(f_dict, 50, 1000, 3), opens data_loc with
#      tables.open_file(..., mode='r+'), collects the nodes produced by
#      iterate_flickr and splits them into train/val/test with
#      split_data_flickr.
#   3. Start of audio_rnn_encoder(nn.Module): __init__ reads the 'conv', 'rnn'
#      and 'att' entries of config and builds self.Conv as an nn.Conv1d from
#      the 'conv' entries.
# NOTE(review): only reads of data_file are visible here; if nothing writes to
# the file, mode='r' would avoid opening it writable - confirm.
# images should be shape (batch_size, 1024). images_shape[1] is collapsed as the original features are of shape (1,1024) images = np.float64( np.reshape(images, (images_shape[0], images_shape[2]))) yield images, speech, caption, lengths # load the word frequency dictionary f_dict = load_obj(dict_path) # select words which occur between 50 and a 1000 times and are over 3 characters long words = select(f_dict, 50, 1000, 3) vocab_size = len(words) # open and load the data data_file = tables.open_file(data_loc, mode='r+') f_nodes = [node for node in iterate_flickr(data_file)] # split the data train, val, test = split_data_flickr(f_nodes, split_loc) ################################network config################################## # rnn encoder for audio (mfcc, mbn etc.) class audio_rnn_encoder(nn.Module): def __init__(self, config): super(audio_rnn_encoder, self).__init__() conv = config['conv'] rnn = config['rnn'] att = config['att'] self.Conv = nn.Conv1d(in_channels=conv['in_channels'], out_channels=conv['out_channels'], kernel_size=conv['kernel_size'], stride=conv['stride'], padding=conv['padding'])