def perform_visualization(ids_dict, metadata_file, params_path_data,
                          params_path_clusters, params_list_data,
                          params_list_clusters, audio_path):
    curr_params_data = unpack_params(params_path_data, params_list_data)
    curr_params_clusters = unpack_params(params_path_clusters,
                                         params_list_clusters)
    dim = curr_params_data['components']

    # Build the output file name from the clustering and dataset parameters.
    save_to = '2D_visualization_labels' if dim == 2 else (
        '3D_visualization_labels' if dim == 3 else 'invalid')
    save_to += '_(' + str(params_list_clusters[0])
    for i in params_list_clusters[1:-1]:
        save_to += '_' + str(i)
    save_to += ')_' + str(params_list_clusters[-1])
    save_to += '_data_(' + str(params_list_data[0])
    for i in params_list_data[1:]:
        save_to += '_' + str(i)
    save_to += ').html'
    save_to = os.path.join('visualizations', save_to)

    # Locate the reduced dataset produced for these dataset parameters.
    load_from_data = 'small_dataset'
    for i in params_list_data:
        load_from_data += '_' + str(i)
    load_from_data += '.csv'
    load_from_data = os.path.join(params_path_data[-1], load_from_data)

    # Locate the cluster labels produced for these clustering parameters.
    load_from_clusters = 'cluster_labels'
    load_from_clusters += '_(' + str(params_list_clusters[0])
    for i in params_list_clusters[1:-1]:
        load_from_clusters += '_' + str(i)
    load_from_clusters += ')_' + str(params_list_clusters[-1])
    load_from_clusters += '.csv'
    load_from_clusters = os.path.join(params_path_clusters[-1],
                                      load_from_clusters)

    print("Creating visualization...")
    print("Using dataset parameters: " + str(curr_params_data))
    print("Using clustering parameters: " + str(curr_params_clusters))
    print("Saving at: " + save_to)
    print("Loading dataset from: " + load_from_data)
    print("Loading clusters from: " + load_from_clusters)
    if os.path.exists(save_to):
        user_confirmation(
            "Visualization for these parameters already done, overwrite? "
            "enter 'Y' to confirm,")

    # Create visualization using the preferred parameters
    if dim == 2:
        bokeh_2d(ids_dict, save_to, load_from_data, load_from_clusters,
                 metadata_file, audio_path, **curr_params_data)
    elif dim == 3:
        plotly_3d(ids_dict, save_to, load_from_data, load_from_clusters,
                  metadata_file, audio_path, **curr_params_data)

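# The 2-D and 3-D backends (bokeh_2d, plotly_3d) are defined elsewhere in the repo.
# Purely as a hedged sketch of what the 3-D path boils down to -- assuming the
# small_dataset CSV holds an id plus three components and the cluster_labels CSV
# holds one label per row -- a Plotly version could look like this (the real
# plotly_3d also wires in metadata tooltips and audio playback):
def plotly_3d_sketch(save_to, load_from_data, load_from_clusters):
    import pandas as pd
    import plotly.express as px

    # Column names below are assumptions, not the repo's actual schema.
    data = pd.read_csv(load_from_data, names=['UniqueID', 'c1', 'c2', 'c3'])
    labels = pd.read_csv(load_from_clusters, names=['label'])
    data['label'] = labels['label'].astype(str)
    fig = px.scatter_3d(data, x='c1', y='c2', z='c3', color='label',
                        hover_name='UniqueID')
    fig.write_html(save_to)  # standalone interactive HTML
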
def reduce_to_small_dimension(params_path, params_list):
    save_to = 'small_dataset'
    for i in params_list:
        save_to += '_' + str(i)
    save_to += '.csv'
    save_to = os.path.join(params_path[-1], save_to)

    load_from = 'mid_dataset'
    for i in params_list[:-1]:
        load_from += '_' + str(i)
    load_from += '.csv'
    load_from = os.path.join(params_path[-2], load_from)

    curr_params = unpack_params(params_path, params_list)
    print("Performing small dimensionality reduction...")
    print("Using parameters: " + str(curr_params))
    print("Saving at: " + save_to)
    print("Loading from: " + load_from)
    if os.path.exists(save_to):
        print("Small dimensionality reduction for these parameters already done!")
        return

    # Perform dimensionality reduction using the preferred parameters
    if curr_params['small_algorithm'] == "tsne":
        tsne_small(save_to, load_from, **curr_params)
    elif curr_params['small_algorithm'] == "pca":
        pca_small(save_to, load_from, **curr_params)

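# tsne_small and pca_small live elsewhere; since they are called as
# pca_small(save_to, load_from, **curr_params), they must swallow the whole
# parameter dict as keyword arguments. A minimal sketch of the PCA branch under
# that assumption (the 'components' key is taken from perform_visualization above,
# the CSV layout is a guess):
def pca_small_sketch(save_to, load_from, components=2, **kwargs):
    import pandas as pd
    from sklearn.decomposition import PCA

    df = pd.read_csv(load_from, header=None)          # first column assumed to be UniqueID
    ids, feats = df.iloc[:, 0], df.iloc[:, 1:].values
    reduced = PCA(n_components=components).fit_transform(feats)
    out = pd.concat([ids, pd.DataFrame(reduced)], axis=1)
    out.to_csv(save_to, index=False, header=False)
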
def perform_clustering(params_path, params_list):
    params_len = len(params_list) - 1

    save_to = 'cluster_labels'
    save_to += '_(' + str(params_list[0])
    for i in params_list[1:-1]:
        save_to += '_' + str(i)
    save_to += ')_' + str(params_list[-1])
    save_to += '.csv'
    save_to = os.path.join(params_path[-1], save_to)

    # Clustering can run on the full, mid, or small dataset depending on how
    # many parameters precede the clustering one.
    load_from = 'full_dataset' if params_len == 2 else (
        'mid_dataset' if params_len == 3 else 'small_dataset')
    for i in params_list[:-1]:
        load_from += '_' + str(i)
    load_from += '.csv'
    load_from = os.path.join(params_path[-2], load_from)

    curr_params = unpack_params(params_path, params_list)
    print("Performing clustering...")
    print("Using parameters: " + str(curr_params))
    print("Saving at: " + save_to)
    print("Loading from: " + load_from)
    if os.path.exists(save_to):
        print("Clustering for these parameters already done!")
        return

    # Perform clustering using the preferred parameters
    if curr_params['clustering_algorithm'] == "kmeans":
        inp = user_confirmation(
            "Enter 'select' to plot kmeans inertia for various k, \n"
            "'c' to continue,")
        if inp == 'select':
            select_kmeans(load_from, **curr_params)
        cluster_kmeans(save_to, load_from, **curr_params)

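# select_kmeans is only referenced here; judging by the prompt it plots k-means
# inertia over a range of k (the elbow method). A hedged sketch of that idea --
# the k range and CSV layout are assumptions:
def select_kmeans_sketch(load_from, k_max=15, **kwargs):
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans

    feats = pd.read_csv(load_from, header=None).iloc[:, 1:].values
    ks = range(2, k_max + 1)
    inertias = [KMeans(n_clusters=k, n_init=10).fit(feats).inertia_ for k in ks]
    plt.plot(list(ks), inertias, marker='o')
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.show()
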
def detect_poss_duplicates(ids_dict, params_path_data, params_list_data,
                           n_neighbors=5):
    curr_params_data = unpack_params(params_path_data, params_list_data)

    load_from_data = 'small_dataset'
    for i in params_list_data:
        load_from_data += '_' + str(i)
    load_from_data += '.csv'
    load_from_data = os.path.join(params_path_data[-1], load_from_data)

    save_to = 'possible_duplicates'
    save_to += '_data_(' + str(params_list_data[0])
    for i in params_list_data[1:]:
        save_to += '_' + str(i)
    save_to += ').csv'

    print("Creating possible duplicates file...")
    print("Using dataset parameters: " + str(curr_params_data))
    print("Saving at: " + save_to)
    print("Loading dataset from: " + load_from_data)

    # Find the nearest neighbours of every clip in the 2D embedding.
    df_dups = pd.read_csv(load_from_data, names=['UniqueID', 'f1', 'f2'])
    matrix_2d = df_dups.values[:, 1:]
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(matrix_2d)
    dist_mat, idx_mat = neigh.kneighbors(matrix_2d)

    df_dups['name'] = [
        os.path.split(get_key(df_dups.iloc[j, 0], ids_dict))[1]
        for j in range(idx_mat.shape[0])
    ]
    # For each neighbour, record its id, file name, distance in the embedding,
    # name distance (1 - Jaro-Winkler similarity) and whether one name is a
    # prefix/suffix of the other.
    for i in range(n_neighbors):
        df_dups['uid_' + str(i)] = [df_dups.iloc[x, 0] for x in idx_mat[:, i]]
        df_dups['name_' + str(i)] = [
            os.path.split(get_key(df_dups.iloc[idx_mat[j, i], 0], ids_dict))[1]
            for j in range(idx_mat.shape[0])
        ]
        df_dups['sound_dist_' + str(i)] = dist_mat[:, i]
        df_dups['name_dist_' + str(i)] = [
            1 - jellyfish.jaro_winkler(df_dups['name'][j],
                                       df_dups['name_' + str(i)][j])
            for j in range(idx_mat.shape[0])
        ]
        df_dups['is_psfix_' + str(i)] = [
            is_psfix(df_dups['name'][j], df_dups['name_' + str(i)][j])
            for j in range(idx_mat.shape[0])
        ]

    to_drop = ['f1', 'f2']
    df_dups = df_dups.drop(to_drop, axis=1)
    df_dups.to_csv(save_to, index=False, header=True, encoding='utf-8')

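# The resulting CSV has, per clip, its own name plus n_neighbors blocks of columns
# (uid_i, name_i, sound_dist_i, name_dist_i, is_psfix_i). A hypothetical way to
# surface the strongest duplicate candidates from it afterwards (the thresholds
# are arbitrary and not from the repo):
def rank_duplicate_candidates_sketch(dups_csv, sound_thresh=0.5, name_thresh=0.2):
    import pandas as pd

    dups = pd.read_csv(dups_csv)
    # Neighbour 0 is the clip itself (distance 0), so neighbour 1 is the closest other clip.
    mask = (dups['sound_dist_1'] < sound_thresh) & (dups['name_dist_1'] < name_thresh)
    return dups.loc[mask, ['name', 'name_1', 'sound_dist_1', 'name_dist_1', 'is_psfix_1']]
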
def perform_feature_extraction(ids_dict, params_path, params_list, audio_path):
    save_to = 'full_dataset'
    for i in params_list:
        save_to += '_' + str(i)
    save_to += '.csv'
    save_to = os.path.join(params_path[-1], save_to)

    curr_params = unpack_params(params_path, params_list)
    print("Performing feature extraction...")
    print("Using parameters: " + str(curr_params))
    print("Saving at: " + save_to)
    if os.path.exists(save_to):
        print("Feature extraction for these parameters already done!")
        return

    # Extract features for a clip using the preferred parameters
    if curr_params['method'] == "mfcc":
        extract_mfcc(ids_dict, save_to, audio_path, **curr_params)
    elif curr_params['method'] == "spectrogram":
        extract_spectrogram(save_to, audio_path, **curr_params)

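# Taken together, these functions form a cache-as-you-go pipeline: each stage derives its
# input/output file names from the parameter lists and skips work whose output already
# exists. A hypothetical driver under that assumption (the mid-dimensional reduction step
# and the exact contents of the parameter lists live elsewhere in the repo):
def run_pipeline_sketch(ids_dict, metadata_file, audio_path,
                        params_path_extract, params_list_extract,
                        params_path_reduce, params_list_reduce,
                        params_path_clusters, params_list_clusters):
    perform_feature_extraction(ids_dict, params_path_extract, params_list_extract,
                               audio_path)
    # (a mid-dimensional reduction step sits between these two in the full pipeline)
    reduce_to_small_dimension(params_path_reduce, params_list_reduce)
    perform_clustering(params_path_clusters, params_list_clusters)
    perform_visualization(ids_dict, metadata_file, params_path_reduce,
                          params_path_clusters, params_list_reduce,
                          params_list_clusters, audio_path)
    detect_poss_duplicates(ids_dict, params_path_reduce, params_list_reduce)
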
                       num_heads, n_classes, n_layers,
                       dropouti=config['dropouti'],
                       dropouth=config['dropouth'],
                       dropouta=config.get('dropouta', 0.1),
                       dropoutc=config['dropoutc'],
                       rel_pos=config['rel_pos'])
    # load embedding
    model.embed.lut.weight = nn.Parameter(TEXT.vocab.vectors)
    device = th.device('cuda:0')
    model = model.to(device)
    embed_params, other_params, wd_params = unpack_params(
        model.named_parameters())
    optimizer = get_wrapper(config['opt_wrapper'])(optim.Adam([{
        'params': embed_params,
        'lr': 0
    }, {
        'params': other_params,
        'lr': config.get('lr', 1e-3)
    }, {
        'params': wd_params,
        'lr': config.get('lr', 1e-3),
        'weight_decay':
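# unpack_params here (unlike the one in the audio pipeline above) splits
# model.named_parameters() into three groups: embedding weights, ordinary weights,
# and weights that should receive weight decay, so each group can get its own
# optimizer settings. A hypothetical sketch of that grouping -- the selection
# rules below are assumptions, not the repo's actual implementation:
def unpack_params_sketch(named_params):
    embed_params, other_params, wd_params = [], [], []
    for name, p in named_params:
        if not p.requires_grad:
            continue
        if 'embed' in name:
            embed_params.append(p)        # embedding table: separate lr
        elif 'norm' in name or 'bias' in name:
            other_params.append(p)        # norms/biases: no weight decay
        else:
            wd_params.append(p)           # everything else: weight decay
    return embed_params, other_params, wd_params
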
def run(proc_id, n_gpus, devices, config):
    th.manual_seed(config['seed'])
    np.random.seed(config['seed'])
    th.cuda.manual_seed_all(config['seed'])
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)

    TEXT = data.Field(lower=True, batch_first=True)
    train, dev, test = get_lm_dataset(config['dataset']).splits(TEXT,
                                                                root='./data')
    TEXT.build_vocab(train)
    train = LMDataset(train, max_length=config['length'],
                      part=(proc_id, n_gpus))
    dev = LMDataset(dev, max_length=config['length'])
    test = LMDataset(test, max_length=config['length'])
    batcher = LMBatcher(TEXT, fully=config['fully'])
    train_loader = DataLoader(dataset=train,
                              batch_size=config['batch_size'] // n_gpus,
                              collate_fn=batcher,
                              shuffle=True,
                              num_workers=6)
    dev_loader = DataLoader(dataset=dev,
                            batch_size=config['dev_batch_size'],
                            collate_fn=batcher,
                            shuffle=False,
                            num_workers=0)
    test_loader = DataLoader(dataset=test,
                             batch_size=config['batch_size'],
                             collate_fn=batcher,
                             shuffle=False,
                             num_workers=0)

    dim_embed = config['dim_embed']
    dim_model = config['dim_model']
    dim_ff = config['dim_ff']
    num_heads = config['num_heads']
    n_layers = config['n_layers']
    vocab_size = len(TEXT.vocab)
    model = make_model(vocab_size, dim_embed, dim_model, dim_ff, num_heads,
                       vocab_size, n_layers,
                       dropouti=config['dropouti'],
                       dropouth=config['dropouth'],
                       dropouta=config.get('dropouta', 0.1),
                       dropoutc=config['dropoutc'],
                       rel_pos=config['rel_pos'],
                       ffn=config['ffn'])

    # tie weights
    if dim_embed == dim_model:
        model.generator.proj.weight = model.embed.lut.weight

    device = th.device(dev_id)
    model = model.to(device)
    embed_params, other_params, wd_params = unpack_params(
        model.named_parameters())
    optimizer = get_wrapper(config['opt_wrapper'])(optim.Adam([{
        'params': embed_params + other_params,
        'lr': config.get('lr', 1e-3)
    }, {
        'params': wd_params,
        'lr': config.get('lr', 1e-3),
        'weight_decay': 5e-5
    }]))

    best_val = 1e9
    best_test = 0
    for _ in range(config['n_epochs']):
        if proc_id == 0:
            print('training...')
        model.train()
        n_tokens = 0
        sum_loss = 0
        hit = 0
        tic = time.time()
        for i, batch in enumerate(train_loader):
            batch.y = batch.y.to(device)
            batch.g.edata['etype'] = batch.g.edata['etype'].to(device)
            batch.g.ndata['x'] = batch.g.ndata['x'].to(device)
            batch.g.ndata['pos'] = batch.g.ndata['pos'].to(device)
            out = model(batch)
            loss = F.nll_loss(out, batch.y)
            optimizer.zero_grad()
            loss.backward()
            if n_gpus > 1:
                for param in model.parameters():
                    if param.requires_grad and param.grad is not None:
                        th.distributed.all_reduce(
                            param.grad.data,
                            op=th.distributed.ReduceOp.SUM)
                        param.grad.data /= n_gpus
            nn.utils.clip_grad_norm_(model.parameters(), 2)
            optimizer.step()
            n = len(batch.y)
            n_tokens += n
            sum_loss += loss.item() * n
            hit += (out.max(dim=-1)[1] == batch.y).sum().item()
            if (i + 1) % config['log_interval'] == 0 and proc_id == 0:
                mem = th.cuda.max_memory_cached()
                print('ppl: ', np.exp(sum_loss / n_tokens),
                      ' acc: ', hit * 1.0 / n_tokens,
                      ' #tokens/s: ',
                      config['batch_size'] * config['log_interval'] *
                      config['length'] / (time.time() - tic),
                      ' #mem: ', mem / 1024 / 1024 / 1024)
                tic = time.time()
                n_tokens, sum_loss, hit = 0, 0, 0

        if n_gpus > 1:
            th.distributed.barrier()

        if proc_id == 0:
            print('evaluating...')
        model.eval()
        n_tokens = 0
        sum_loss = 0
        hit = 0
        for batch in dev_loader:
            batch.y = batch.y.to(device)
            batch.g.edata['etype'] = batch.g.edata['etype'].to(device)
            batch.g.ndata['x'] = batch.g.ndata['x'].to(device)
            batch.g.ndata['pos'] = batch.g.ndata['pos'].to(device)
            with th.no_grad():
                out = model(batch)
                loss = F.nll_loss(out, batch.y, reduction='sum')
            n = len(batch.y)
            n_tokens += n
            sum_loss += loss.item()
            hit += (out.max(dim=-1)[1] == batch.y).sum().item()

        if proc_id == 0:
            if config['dataset'] == 'enwik8' or config['dataset'] == 'text8':
                print('bpc: ', (sum_loss / n_tokens) / np.log(2),
                      ' acc: ', hit * 1.0 / n_tokens)
            else:
                print('ppl: ', np.exp(sum_loss / n_tokens),
                      ' acc: ', hit * 1.0 / n_tokens)

        optimizer.adjust_lr(np.exp(sum_loss / n_tokens))
        val_ppl = np.exp(sum_loss / n_tokens)

        if proc_id == 0:
            print('testing...')
        model.eval()
        n_tokens = 0
        sum_loss = 0
        hit = 0
        for batch in test_loader:
            batch.y = batch.y.to(device)
            batch.g.edata['etype'] = batch.g.edata['etype'].to(device)
            batch.g.ndata['x'] = batch.g.ndata['x'].to(device)
            batch.g.ndata['pos'] = batch.g.ndata['pos'].to(device)
            with th.no_grad():
                out = model(batch)
                loss = F.nll_loss(out, batch.y, reduction='sum')
            n = len(batch.y)
            n_tokens += n
            sum_loss += loss.item()
            hit += (out.max(dim=-1)[1] == batch.y).sum().item()

        if proc_id == 0:
            if config['dataset'] == 'enwik8' or config['dataset'] == 'text8':
                print('bpc: ', (sum_loss / n_tokens) / np.log(2),
                      ' acc: ', hit * 1.0 / n_tokens)
            else:
                print('ppl: ', np.exp(sum_loss / n_tokens),
                      ' acc: ', hit * 1.0 / n_tokens)

        if val_ppl < best_val:
            best_val = val_ppl
            best_test = np.exp(sum_loss / n_tokens)

        if proc_id == 0:
            if config['dataset'] == 'enwik8' or config['dataset'] == 'text8':
                print('best val: %.2f ' % np.log2(best_val),
                      'best test: %.2f ' % np.log2(best_test))
            else:
                print('best val: %.2f ' % best_val,
                      'best test: %.2f ' % best_test)

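# Both evaluation loops report bits-per-character for the character-level corpora
# (enwik8, text8) and perplexity otherwise; both are simple transforms of the mean
# token negative log-likelihood accumulated above. A small illustrative helper
# (not part of the original script) that restates those formulas:
def lm_metric_sketch(sum_nll, n_tokens, char_level):
    import numpy as np

    mean_nll = sum_nll / n_tokens          # average NLL per token, in nats
    if char_level:
        return mean_nll / np.log(2)        # bits per character
    return np.exp(mean_nll)                # perplexity
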
def run(proc_id, n_gpus, devices, config, checkpoint, eval_mode):
    th.manual_seed(config['seed'])
    np.random.seed(config['seed'])
    th.cuda.manual_seed_all(config['seed'])
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)

    TEXT = data.Field(batch_first=True)
    train, dev, test = get_lm_dataset(config['dataset']).splits(TEXT,
                                                                root='./data')
    TEXT.build_vocab(train)
    train = LMDataset(train, max_length=config['length'],
                      part=(proc_id, n_gpus))
    eval_length = config['eval_length']
    dev = LMDataset(dev, max_length=eval_length, test=True)
    test = LMDataset(test, max_length=eval_length, test=True)
    batcher = LMBatcher(TEXT,
                        graph_type=config['graph_type'],
                        **config.get('graph_attrs', {}))

    if not eval_mode:
        train_loader = DataLoader(dataset=train,
                                  batch_size=config['batch_size'] // n_gpus,
                                  collate_fn=batcher,
                                  shuffle=True,
                                  num_workers=6)
    dev_loader = DataLoader(dataset=dev,
                            batch_size=config['dev_batch_size'] // n_gpus,
                            collate_fn=batcher,
                            shuffle=False,
                            num_workers=6)
    test_loader = DataLoader(dataset=test,
                             batch_size=config['batch_size'] // n_gpus,
                             collate_fn=batcher,
                             shuffle=False,
                             num_workers=6)

    dim_embed = config['dim_embed']
    dim_model = config['dim_model']
    dim_ff = config['dim_ff']
    num_heads = config['num_heads']
    n_layers = config['n_layers']
    vocab_size = len(TEXT.vocab)
    dim_pos = config.get('dim_pos', 1)

    model = make_model(vocab_size, dim_embed, dim_model, dim_ff, num_heads,
                       vocab_size, n_layers,
                       dropouti=config['dropouti'],
                       dropouth=config['dropouth'],
                       dropouta=config.get('dropouta', 0.1),
                       dropoutc=config['dropoutc'],
                       rel_pos=config['rel_pos'],
                       dim_pos=dim_pos)

    if checkpoint != -1:
        with open('checkpoints/{}.pkl'.format(checkpoint), 'rb') as f:
            state_dict = th.load(f,
                                 map_location=lambda storage, loc: storage)
            model.load_state_dict(state_dict)

    # tie weights
    if dim_embed == dim_model:
        model.embed.lut.weight = model.generator.proj.weight

    device = th.device(dev_id)
    th.cuda.set_device(device)
    model = model.to(device)
    embed_params, other_params, wd_params = unpack_params(
        model.named_parameters())
    optimizer = get_wrapper(config['opt_wrapper'])(optim.Adam([{
        'params': embed_params + other_params,
        'lr': config.get('lr', 1e-3),
        'betas': (0.9, 0.98)
    }, {
        'params': wd_params,
        'lr': config.get('lr', 1e-3),
        'betas': (0.9, 0.98)
    }]), **config.get('opt_attrs', {}))

    if not eval_mode:
        # advance the wrapped optimizer (and its lr schedule) to the step count
        # that corresponds to the loaded checkpoint
        for _ in range(checkpoint + 1):
            for _ in range(len(train_loader)):
                optimizer.step()

    best_val = 1e9
    best_test = 0
    last_epoch = checkpoint + 2 if eval_mode else config['n_epochs']
    eval_interval = config.get('eval_interval', 1)
    for epoch in range(checkpoint + 1, last_epoch):
        if not eval_mode:
            if proc_id == 0:
                print('epoch {} starts'.format(epoch))
                print('training...')
            model.train()
            n_tokens = 0
            sum_loss = 0
            hit = 0
            tic = time.time()
            for i, batch in enumerate(train_loader):
                batch.y = batch.y.to(device)
                batch.g.edata['etype'] = batch.g.edata['etype'].to(device)
                batch.g.ndata['x'] = batch.g.ndata['x'].to(device)
                if dim_pos == 1:
                    batch.g.ndata['pos'] = batch.g.ndata['pos'].to(device)
                else:
                    for k in range(dim_pos):
                        batch.g.ndata['pos_{}'.format(k)] = batch.g.ndata[
                            'pos_{}'.format(k)].to(device)
                aux = (epoch * 1.0 / config['n_epochs']
                       ) if config['dataset'] in char_lm else None
                out = model(batch, aux=aux)
                if aux is None:
                    loss = F.nll_loss(out, batch.y)
                else:
                    loss = 0
                    for out_l in out:
                        loss = loss + F.nll_loss(out_l, batch.y)
                    loss /= len(out)
                optimizer.zero_grad()
                loss.backward()
                if n_gpus > 1:
                    # average gradients across workers
                    for param in model.parameters():
                        if param.requires_grad and param.grad is not None:
                            th.distributed.all_reduce(
                                param.grad.data,
                                op=th.distributed.ReduceOp.SUM)
                            param.grad.data /= n_gpus
                nn.utils.clip_grad_norm_(model.parameters(), 0.25)
                optimizer.step()
                n = len(batch.y)
                n_tokens += n
                sum_loss += loss.item() * n
                if aux is None:
                    hit += (out.max(dim=-1)[1] == batch.y).sum().item()
                else:
                    hit += (out[-1].max(dim=-1)[1] == batch.y).sum().item()
                if (i + 1) % config['log_interval'] == 0 and proc_id == 0:
                    mem = th.cuda.max_memory_cached()
                    print('ppl: ', np.exp(sum_loss / n_tokens),
                          ' acc: ', hit * 1.0 / n_tokens,
                          ' #tokens/s: ',
                          config['batch_size'] * config['log_interval'] *
                          config['length'] / (time.time() - tic),
                          ' #mem: ', mem / 1024 / 1024 / 1024)
                    tic = time.time()
                    n_tokens, sum_loss, hit = 0, 0, 0

            if n_gpus > 1:
                th.distributed.barrier()

            if proc_id == 0:
                print('evaluating...')
                if not os.path.exists('checkpoints'):
                    os.mkdir('checkpoints')
                with open('checkpoints/{}.pkl'.format(epoch), 'wb') as f:
                    th.save(model.state_dict(), f)

        if (epoch + 1) % eval_interval > 0 and not eval_mode:
            continue

        model.eval()
        n_tokens = 0
        sum_loss = 0
        hit = 0
        for batch in dev_loader:
            batch.y = batch.y.to(device)
            batch.g.edata['etype'] = batch.g.edata['etype'].to(device)
            batch.g.ndata['x'] = batch.g.ndata['x'].to(device)
            if dim_pos == 1:
                batch.g.ndata['pos'] = batch.g.ndata['pos'].to(device)
            else:
                for k in range(dim_pos):
                    batch.g.ndata['pos_{}'.format(k)] = batch.g.ndata[
                        'pos_{}'.format(k)].to(device)
            with th.no_grad():
                out = model(batch)
                loss = F.nll_loss(out, batch.y, reduction='sum')
            n = len(batch.y)
            n_tokens += n
            sum_loss += loss.item()
            hit += (out.max(dim=-1)[1] == batch.y).sum().item()

        if proc_id == 0:
            if config['dataset'] in char_lm:
                print('bpc: ', (sum_loss / n_tokens) / np.log(2),
                      ' acc: ', hit * 1.0 / n_tokens)
            else:
                print('ppl: ', np.exp(sum_loss / n_tokens),
                      ' acc: ', hit * 1.0 / n_tokens)

        optimizer.adjust_lr(np.exp(sum_loss / n_tokens))
        val_ppl = np.exp(sum_loss / n_tokens)

        if proc_id == 0:
            print('testing...')
        model.eval()
        n_tokens = 0
        sum_loss = 0
        hit = 0
        for batch in test_loader:
            batch.y = batch.y.to(device)
            batch.g.edata['etype'] = batch.g.edata['etype'].to(device)
            batch.g.ndata['x'] = batch.g.ndata['x'].to(device)
            if dim_pos == 1:
                batch.g.ndata['pos'] = batch.g.ndata['pos'].to(device)
            else:
                for k in range(dim_pos):
                    batch.g.ndata['pos_{}'.format(k)] = batch.g.ndata[
                        'pos_{}'.format(k)].to(device)
            with th.no_grad():
                out = model(batch)
                loss = F.nll_loss(out, batch.y, reduction='sum')
            n = len(batch.y)
            n_tokens += n
            sum_loss += loss.item()
            hit += (out.max(dim=-1)[1] == batch.y).sum().item()

        if proc_id == 0:
            if config['dataset'] in char_lm:
                print('bpc: ', (sum_loss / n_tokens) / np.log(2),
                      ' acc: ', hit * 1.0 / n_tokens)
            else:
                print('ppl: ', np.exp(sum_loss / n_tokens),
                      ' acc: ', hit * 1.0 / n_tokens)

        if val_ppl < best_val:
            best_val = val_ppl
            best_test = np.exp(sum_loss / n_tokens)

        if proc_id == 0:
            if config['dataset'] in char_lm:
                print('best val: %.2f ' % np.log2(best_val),
                      'best test: %.2f ' % np.log2(best_test))
            else:
                print('best val: %.2f ' % best_val,
                      'best test: %.2f ' % best_test)

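# run takes (proc_id, n_gpus, devices, ...) and initializes NCCL itself, so it is
# meant to be spawned once per GPU. A hypothetical launcher under that assumption
# (argument parsing and config loading omitted; not the script's actual entry point):
def launch_sketch(config, checkpoint=-1, eval_mode=False):
    import torch as th
    import torch.multiprocessing as mp

    devices = list(range(th.cuda.device_count()))
    n_gpus = len(devices)
    if n_gpus == 1:
        run(0, n_gpus, devices, config, checkpoint, eval_mode)
    else:
        procs = []
        for proc_id in range(n_gpus):
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, devices, config,
                                 checkpoint, eval_mode))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()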