# Shared imports for the run() variants in this section. Project-local helpers
# (cluster_accuracy, purity_score, TextDataset, init_model, train, evaluate)
# are assumed to be importable from elsewhere in the repo.
import os
import pickle
from pprint import pprint

import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (AutoModel, AutoModelForMaskedLM, AutoTokenizer,
                          get_linear_schedule_with_warmup)
from umap import UMAP


def run(n_init, max_features, umap_n_components, dataset, val_dataset, labels,
        result_dir, random_state):
    # Set random states
    np.random.seed(random_state)

    # map the requested label names to 20 newsgroups target indices
    target_names = fetch_20newsgroups().target_names
    idx = [target_names.index(l) for l in labels]

    # load data
    train_df = pd.read_csv(dataset)
    train_df = train_df.query("labels in @idx")
    print(train_df.shape)
    train_texts = train_df['texts'].to_numpy()
    train_labels = train_df['labels'].to_numpy()
    print(train_texts.shape)

    val_df = pd.read_csv(val_dataset)
    val_df = val_df.query("labels in @idx")
    print(val_df.shape)
    val_texts = val_df['texts'].to_numpy()
    val_labels = val_df['labels'].to_numpy()
    print(val_texts.shape)

    # TF-IDF features
    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(train_texts)
    X_test = tfidf.transform(val_texts)

    # optional UMAP reduction (disabled here; umap_n_components is unused)
    # umap = UMAP(n_components=umap_n_components)
    # X_train = umap.fit_transform(X_train.toarray())
    # X_test = umap.transform(X_test.toarray())

    print(len(np.unique(train_labels)))

    # cluster the training set, then assign validation documents to clusters
    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
    kmeans.fit(X_train)
    predicted_labels = kmeans.predict(X_test)

    # evaluate against the true validation labels
    best_matching, accuracy = cluster_accuracy(val_labels, predicted_labels)
    ari = adjusted_rand_score(val_labels, predicted_labels)
    nmi = normalized_mutual_info_score(val_labels, predicted_labels)
    purity = purity_score(y_true=val_labels, y_pred=predicted_labels)

    run_results = {}
    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    run_results['purity'] = purity  # use purity to compare with microsoft paper

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, '20newsgroups_samples-kmeans.csv'),
                     index=False)
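# Note: cluster_accuracy and purity_score above are project helpers defined
# elsewhere in the repo. For reference only, a common way to compute these
# metrics is sketched below; the *_sketch names are hypothetical and the repo's
# actual implementations may differ.
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix


def cluster_accuracy_sketch(y_true, y_pred):
    # contingency matrix: rows = true labels, columns = predicted cluster ids
    cm = confusion_matrix(y_true, y_pred)
    # Hungarian algorithm: match each cluster to the label maximizing overlap
    row_ind, col_ind = linear_sum_assignment(-cm)
    best_matching = dict(zip(col_ind, row_ind))
    accuracy = cm[row_ind, col_ind].sum() / cm.sum()
    return best_matching, accuracy


def purity_score_sketch(y_true, y_pred):
    # for each cluster count its most frequent true label, then normalize
    cm = confusion_matrix(y_true, y_pred)
    return cm.max(axis=0).sum() / cm.sum()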
def run(n_init, max_features, umap_n_components, dataset, result_dir,
        random_state, do_umap):
    # Set random states
    np.random.seed(random_state)

    # load data
    train_df = pd.read_csv(dataset)
    texts = train_df['texts'].to_numpy()
    labels = train_df['labels'].to_numpy()

    le = LabelEncoder()
    labels = le.fit_transform(labels)

    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(texts)

    if do_umap:
        umap = UMAP(n_components=umap_n_components)
        X_train = umap.fit_transform(X_train.toarray())

    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(labels)))
    predicted_labels = kmeans.fit_predict(X_train)

    best_matching, accuracy = cluster_accuracy(labels, predicted_labels)
    ari = adjusted_rand_score(labels, predicted_labels)
    nmi = normalized_mutual_info_score(labels, predicted_labels)
    purity = purity_score(y_true=labels, y_pred=predicted_labels)

    run_results = {}
    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    run_results['purity'] = purity  # use purity to compare with microsoft paper

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, 'trec6-kmeans.csv'), index=False)
def run(n_init, models, embedding_extractor, batch_size, dataset,
        train_idx_file, val_idx_file, result_dir, random_state, device):
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    os.makedirs(result_dir, exist_ok=True)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))
    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))
    all_idx = np.concatenate((train_idx, val_idx))

    # k-means is fit on train + val, evaluated on val only
    df_train = df.iloc[all_idx].copy()
    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()
    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=batch_size,
                                   shuffle=False)

    df_val = df.iloc[val_idx].copy()
    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()
    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=batch_size,
                                 shuffle=False)

    results = []
    for model in models:
        # init lm model & tokenizer
        lm_model = AutoModel.from_pretrained(model,
                                             return_dict=True,
                                             output_hidden_states=True)
        tokenizer = AutoTokenizer.from_pretrained(model,
                                                  return_dict=True,
                                                  output_hidden_states=True)
        lm_model.to(device)

        # extract embeddings for the train split
        train_embeddings = []
        train_labels = []
        for batch_texts, batch_labels in tqdm(
                train_data_loader, desc="Extracting train embeddings"):
            inputs = tokenizer(list(batch_texts),
                               return_tensors='pt',
                               padding=True,
                               truncation=True)
            inputs = inputs.to(device)
            with torch.no_grad():
                outputs = lm_model(**inputs)
            extracted_embeddings = embedding_extractor(outputs).cpu().detach().numpy()
            train_embeddings.append(extracted_embeddings)
            train_labels.extend(batch_labels.numpy().astype('int'))
        X_train = np.vstack(train_embeddings)
        train_labels = np.array(train_labels)

        # extract embeddings for the val split
        test_embeddings = []
        val_labels = []
        for batch_texts, batch_labels in tqdm(
                val_data_loader, desc="Extracting val embeddings"):
            inputs = tokenizer(list(batch_texts),
                               return_tensors='pt',
                               padding=True,
                               truncation=True)
            inputs = inputs.to(device)
            with torch.no_grad():
                outputs = lm_model(**inputs)
            extracted_embeddings = embedding_extractor(outputs).cpu().detach().numpy()
            test_embeddings.append(extracted_embeddings)
            val_labels.extend(batch_labels.numpy().astype('int'))
        X_test = np.vstack(test_embeddings)
        val_labels = np.array(val_labels)

        # cluster the embeddings and evaluate on the val split
        kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
        kmeans.fit(X_train)
        predicted_labels = kmeans.predict(X_test)

        best_matching, accuracy = cluster_accuracy(val_labels, predicted_labels)
        ari = adjusted_rand_score(val_labels, predicted_labels)
        nmi = normalized_mutual_info_score(val_labels, predicted_labels)
        purity = purity_score(y_true=val_labels, y_pred=predicted_labels)

        run_results = {}
        run_results['model'] = model
        run_results['best_matching'] = best_matching
        run_results['accuracy'] = accuracy
        run_results['ari'] = ari
        run_results['nmi'] = nmi
        run_results['purity'] = purity  # use purity to compare with microsoft paper
        results.append(run_results)

        # cache the extracted embeddings for later runs
        with open(os.path.join(result_dir,
                               f'{model.replace("/", "_")}_embeddings.h'),
                  'wb') as f:
            pickle.dump([X_train, train_labels, X_test, val_labels], f)

    result_df = pd.DataFrame.from_records(results)
    result_df.to_csv(os.path.join(result_dir,
                                  'ag_news_subset5-sbert-embeddings-kmeans.csv'),
                     index=False)
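# Note: embedding_extractor is a callable supplied by the repo's config that
# maps a transformers model output to one fixed-size vector per document. Its
# actual definition lives elsewhere; a minimal mean-pooling sketch (assuming
# the model was loaded with output_hidden_states=True) could look like this:
def mean_pooling_extractor_sketch(outputs):
    # average the last hidden layer over the token dimension
    # (ignores padding; a mask-aware mean would weight real tokens only)
    return outputs.hidden_states[-1].mean(dim=1)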
def run(n_epochs, lr, train_batch_size, val_batch_size, base_model,
        clustering_loss_weight, embedding_extractor, annealing_alphas, dataset,
        val_dataset, result_dir, early_stopping, early_stopping_tol, device,
        random_state):
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    train_df = pd.read_csv(dataset)
    train_texts = train_df['texts'].to_numpy()
    train_labels = train_df['labels'].to_numpy()
    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=train_batch_size,
                                   shuffle=False)

    val_df = pd.read_csv(val_dataset)
    val_texts = val_df['texts'].to_numpy()
    val_labels = val_df['labels'].to_numpy()
    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=val_batch_size,
                                 shuffle=False)

    # init lm model & tokenizer
    lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                    return_dict=True,
                                                    output_hidden_states=True)
    tokenizer = AutoTokenizer.from_pretrained(base_model,
                                              return_dict=True,
                                              output_hidden_states=True)
    lm_model.to(device)

    # init clustering model
    model, initial_centroids, initial_embeddings = init_model(
        lm_model=lm_model,
        tokenizer=tokenizer,
        data_loader=train_data_loader,
        embedding_extractor=embedding_extractor,
        n_clusters=np.unique(train_labels).shape[0],
        device=device)

    # init optimizer & scheduler
    opt = torch.optim.RMSprop(
        params=model.parameters(),
        lr=lr,  # 2e-5, 5e-7,
        eps=1e-8)

    total_steps = len(train_data_loader) * n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer=opt,
        num_warmup_steps=int(len(train_data_loader) * 0.5),
        num_training_steps=total_steps)

    # train the model
    hist = train(n_epochs=n_epochs,
                 model=model,
                 optimizer=opt,
                 scheduler=scheduler,
                 annealing_alphas=annealing_alphas,
                 train_data_loader=train_data_loader,
                 eval_data_loader=val_data_loader,
                 clustering_loss_weight=clustering_loss_weight,
                 early_stopping=early_stopping,
                 early_stopping_tol=early_stopping_tol,
                 verbose=True)

    # do eval
    run_results = {}
    predicted_labels, true_labels = evaluate(model=model,
                                             eval_data_loader=val_data_loader,
                                             verbose=True)

    best_matching, accuracy = cluster_accuracy(true_labels, predicted_labels)
    ari = adjusted_rand_score(true_labels, predicted_labels)
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    purity = purity_score(y_true=true_labels, y_pred=predicted_labels)

    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    run_results['purity'] = purity  # use purity to compare with microsoft paper

    # save results, train hist & model
    os.makedirs(result_dir, exist_ok=True)

    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, '20_newsgroups-distilbert.csv'),
                     index=False)

    with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as f:
        pickle.dump(hist, file=f)

    torch.save(model, os.path.join(result_dir, 'model.bin'))
def run(n_init, max_features, umap_n_components, dataset, train_idx_file,
        val_idx_file, result_dir, random_state, do_umap=True):
    # Set random states
    np.random.seed(random_state)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))
    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))
    all_idx = np.concatenate((train_idx, val_idx))

    df_train = df.iloc[all_idx].copy()
    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()

    df_val = df.iloc[val_idx].copy()
    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()

    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(train_texts)
    X_test = tfidf.transform(val_texts)

    if do_umap:
        umap = UMAP(n_components=umap_n_components)
        X_train = umap.fit_transform(X_train.toarray())
        X_test = umap.transform(X_test.toarray())

    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
    kmeans.fit(X_train)
    predicted_labels = kmeans.predict(X_test)

    best_matching, accuracy = cluster_accuracy(val_labels, predicted_labels)
    ari = adjusted_rand_score(val_labels, predicted_labels)
    nmi = normalized_mutual_info_score(val_labels, predicted_labels)
    purity = purity_score(y_true=val_labels, y_pred=predicted_labels)

    run_results = {}
    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    run_results['purity'] = purity  # use purity to compare with microsoft paper

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, 'ag_news_subset5-kmeans.csv'),
                     index=False)
def run(n_epochs, hyperparam_grid, batch_size, val_batch_size, base_model,
        dataset, train_idx_file, val_idx_file, result_dir, early_stopping,
        early_stopping_tol, device, random_state):
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))
    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))
    all_idx = np.append(train_idx, val_idx)

    df_train = df.iloc[all_idx]
    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()
    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=batch_size,
                                   shuffle=False)

    df_val = df.iloc[val_idx]
    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()
    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=val_batch_size,
                                 shuffle=False)

    results = []
    param_grid = ParameterGrid(hyperparam_grid)
    for run_idx, params in enumerate(param_grid):
        print(f'Run: {run_idx + 1}/{len(list(param_grid))}')
        print("Running with params:")
        pprint(params)

        # init lm model & tokenizer
        lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                        return_dict=True,
                                                        output_hidden_states=True)
        tokenizer = AutoTokenizer.from_pretrained(base_model,
                                                  return_dict=True,
                                                  output_hidden_states=True)
        lm_model.to(device)

        # init clustering model
        model, initial_centroids, initial_embeddings = init_model(
            lm_model=lm_model,
            tokenizer=tokenizer,
            data_loader=train_data_loader,
            embedding_extractor=params['embedding_extractor'],
            n_clusters=np.unique(train_labels).shape[0],
            device=device)

        # init optimizer & scheduler
        opt = torch.optim.RMSprop(params=model.parameters(),
                                  lr=params['lr'],
                                  eps=1e-8)

        total_steps = len(train_data_loader) * n_epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer=opt,
            num_warmup_steps=int(len(train_data_loader) * 0.5),
            num_training_steps=total_steps)

        # train the model
        hist = train(n_epochs=n_epochs,
                     model=model,
                     optimizer=opt,
                     scheduler=scheduler,
                     annealing_alphas=params['annealing_alphas'],
                     train_data_loader=train_data_loader,
                     eval_data_loader=val_data_loader,
                     clustering_loss_weight=params['clustering_loss_weight'],
                     early_stopping=early_stopping,
                     early_stopping_tol=early_stopping_tol,
                     verbose=True)

        # do eval
        run_results = {f'param_{key}': value for key, value in params.items()}

        predicted_labels, true_labels = evaluate(model=model,
                                                 eval_data_loader=val_data_loader,
                                                 verbose=True)

        best_matching, accuracy = cluster_accuracy(true_labels, predicted_labels)
        ari = adjusted_rand_score(true_labels, predicted_labels)
        nmi = normalized_mutual_info_score(true_labels, predicted_labels)
        purity = purity_score(y_true=true_labels, y_pred=predicted_labels)

        run_results['best_matching'] = best_matching
        run_results['accuracy'] = accuracy
        run_results['ari'] = ari
        run_results['nmi'] = nmi
        run_results['purity'] = purity  # use purity to compare with microsoft paper

        # save results & train hist after every run
        os.makedirs(result_dir, exist_ok=True)

        results.append(run_results)
        result_df = pd.DataFrame.from_records(results)
        result_df.to_csv(os.path.join(result_dir,
                                      'opt_results_ag_news_subset10.csv'),
                         index=False)

        with open(os.path.join(result_dir, f'train_hist_run{run_idx}.h'),
                  'wb') as f:
            pickle.dump(hist, file=f)
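# Usage sketch for the grid-search run() above. All values below are
# illustrative assumptions rather than the settings used in the repo;
# ParameterGrid expands the dict into one run per combination
# (2 x 2 x 1 x 1 = 4 runs here).
example_hyperparam_grid = {
    'lr': [2e-5, 5e-7],
    'clustering_loss_weight': [0.5, 1.0],
    'annealing_alphas': [list(range(1, 21))],            # assumed: one alpha per epoch
    'embedding_extractor': [mean_pooling_extractor_sketch],
}

run(n_epochs=20,
    hyperparam_grid=example_hyperparam_grid,
    batch_size=16,
    val_batch_size=16,
    base_model='distilbert-base-uncased',
    dataset='data/ag_news_subset10.csv',                 # hypothetical path
    train_idx_file='data/ag_news_subset10_train_idx.txt',  # hypothetical path
    val_idx_file='data/ag_news_subset10_val_idx.txt',      # hypothetical path
    result_dir='results/opt_ag_news_subset10',
    early_stopping=True,
    early_stopping_tol=0.01,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    random_state=42)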