def find_pseudo_labels(
    self,
    labeled_file_path: str,
    unlabeled_file_path: str,
    temperature: int = 10,
    batch_size: int = None,
    **kwargs
):
    labeled_data = load_data_jsonl(labeled_file_path)
    unlabeled_data = load_data_jsonl(unlabeled_file_path)

    # Default to a single batch covering the whole unlabeled set
    if not batch_size:
        batch_size = len(unlabeled_data)
    unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
    n_batches = len(range(0, len(unlabeled_data), batch_size))

    all_recovered = list()

    # Labeled sentences are embedded once and reused for every batch
    labeled_embeddings = np.array(
        self.embedder.embed_sentences([d['sentence'] for d in labeled_data]))

    for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
        logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
        unlabeled_embeddings = np.array(
            self.embedder.embed_sentences([d['sentence'] for d in unlabeled_data_chunk]))
        embeddings = np.concatenate((labeled_embeddings, unlabeled_embeddings), axis=0)

        # Dense cosine-similarity matrix over labeled + unlabeled samples
        w = (1 - pairwise_distances(embeddings, embeddings, metric='cosine')).astype(np.float32)

        all_recovered += get_nKNN_pseudo_labels(
            w, labeled_data, unlabeled_data_chunk, temperature=temperature)

    return all_recovered
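# NOTE: hypothetical sketch, not part of the original source. get_nKNN_pseudo_labels is
# called above but not shown here; based on the scoring used in the hierarchical variant
# below, one plausible implementation averages each unlabeled sample's similarity to the
# labeled samples of every class and applies a temperature-scaled softmax. The Vocab
# helper and the labeled-first row/column layout of `w` are assumptions.
import numpy as np

def get_nKNN_pseudo_labels(w, labeled_data, unlabeled_data_chunk, temperature=10):
    labels_vocab = Vocab([d['label'] for d in labeled_data])
    # Column indices of labeled samples, grouped by class
    label_columns = {
        label: [ix for ix, d in enumerate(labeled_data) if d['label'] == label]
        for label in labels_vocab.labels
    }
    recovered = list()
    for chunk_ix, d in enumerate(unlabeled_data_chunk):
        row = len(labeled_data) + chunk_ix  # unlabeled rows follow the labeled ones
        # Mean similarity to each class, then temperature-scaled softmax
        z = np.array([w[row, label_columns[label]].mean() for label in labels_vocab.labels])
        z_bar = np.exp(z * temperature)
        z_bar /= z_bar.sum()
        best_ix = int(z_bar.argmax())
        recovered.append(dict(
            data=d.copy(),
            pseudo_label=labels_vocab.labels[best_ix],
            pseudo_label_score=float(z_bar[best_ix]),
        ))
    return recovered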
def find_pseudo_labels(
    self,
    labeled_file_path: str,
    unlabeled_file_path: str,
    temperature: int = 10,
    batch_size: int = None,
    **kwargs
):
    labeled_data = load_data_jsonl(labeled_file_path)
    unlabeled_data = load_data_jsonl(unlabeled_file_path)

    # Default to a single batch covering the whole unlabeled set
    if not batch_size:
        batch_size = len(unlabeled_data)
    unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
    n_batches = len(range(0, len(unlabeled_data), batch_size))

    all_recovered = list()

    labeled_embeddings = np.array(
        self.embedder.embed_sentences([d['sentence'] for d in labeled_data]))

    for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
        logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
        unlabeled_embeddings = np.array(
            self.embedder.embed_sentences([d['sentence'] for d in unlabeled_data_chunk]))
        embeddings = np.concatenate((labeled_embeddings, unlabeled_embeddings), axis=0)

        # 10-NN graph over the embeddings, symmetrized (union of directed edges)
        # into a binary adjacency W
        nn = NearestNeighbors(n_neighbors=10, metric='cosine')
        nn.fit(embeddings)
        graph = nn.kneighbors_graph().toarray()
        w = (graph.T + graph > 0).astype(int)

        # Degree matrix D and D^{-1/2}
        d = np.diag(w.sum(0))
        d_half = fractional_matrix_power(d, -0.5)

        # Symmetric normalized Laplacian: L_sym = I - D^{-1/2} W D^{-1/2}
        l_sym = np.eye(len(w)) - d_half @ w @ d_half

        # Eigen decomposition: keep the non-trivial, smoothest components (indices 1..31)
        # and weight eigenvectors by the inverse square root of their eigenvalues
        eigs = eigh(l_sym, subset_by_index=(1, min(31, len(l_sym) - 1)))
        normed_eigs = eigs[1] / np.sqrt(eigs[0])

        # W': spectral similarity matrix used in place of raw cosine similarities
        w_prime = (normed_eigs @ normed_eigs.T).astype(np.float32)

        all_recovered += get_nKNN_pseudo_labels(
            w_prime, labeled_data, unlabeled_data_chunk, temperature=temperature)

    return all_recovered
def find_pseudo_labels(
    self,
    labeled_file_path: str,
    unlabeled_file_path: str,
    batch_size: int = None,
    **kwargs
):
    # Fit the classifier on the labeled file, then score the unlabeled data in batches
    self.fit(labeled_file_path)
    unlabeled_data = load_data_jsonl(unlabeled_file_path)

    if not batch_size:
        batch_size = len(unlabeled_data)
    unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
    n_batches = len(range(0, len(unlabeled_data), batch_size))

    recovered = list()
    for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
        logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
        X = self.embedder.embed_sentences([str(d['sentence']) for d in unlabeled_data_chunk])
        predictions = self.logreg.predict_proba(X)

        # Pseudo-label = most probable class; score = its predicted probability
        pseudo_labels = predictions.argmax(1)
        pseudo_labels_scores = predictions.max(1)

        for original_data, pseudo_label, pseudo_label_score in zip(
                unlabeled_data_chunk, pseudo_labels, pseudo_labels_scores):
            recovered.append(dict(
                data=original_data.copy(),
                pseudo_label=self.labels_vocab(pseudo_label, rev=True),
                pseudo_label_score=float(pseudo_label_score)
            ))
    return recovered
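# NOTE: hypothetical sketch, not part of the original source. The chunks helper used by
# the pseudo-labelers above is assumed to be a plain batching utility that yields
# successive slices of `batch_size` items.
def chunks(items, batch_size):
    """Yield successive batch_size-sized slices from items."""
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]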
def fit(self, path):
    # Embed the training sentences and fit a logistic-regression classifier on top
    train_data = load_data_jsonl(path)
    X = self.embedder.embed_sentences([str(d['sentence']) for d in train_data])
    self.logreg = LogisticRegression(C=100.0)
    self.labels_vocab = Vocab([d['label'] for d in train_data])
    y = [self.labels_vocab(d['label']) for d in train_data]
    self.logreg.fit(X, y)
def fit(self, path):
    # TF-IDF baseline: vectorize the training sentences and fit a logistic regression
    train_data = load_data_jsonl(path)
    self.tfidf = TfidfVectorizer()
    X = self.tfidf.fit_transform([str(d['sentence']) for d in train_data])
    self.logreg = LogisticRegression(C=100.0)
    self.labels_vocab = Vocab([d['label'] for d in train_data])
    y = [self.labels_vocab(d['label']) for d in train_data]
    self.logreg.fit(X, y)
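# NOTE: hypothetical sketch, not part of the original source. Vocab is used above as a
# bidirectional label <-> index mapping: calling it with a label returns its index, and
# calling it with rev=True maps an index back to its label. The exact original
# implementation may differ.
class Vocab:
    def __init__(self, labels):
        # Deduplicate while keeping a stable, sorted ordering
        self.labels = sorted(set(labels))
        self._label_to_ix = {label: ix for ix, label in enumerate(self.labels)}

    def __call__(self, item, rev=False):
        # rev=False: label -> index; rev=True: index -> label
        return self.labels[int(item)] if rev else self._label_to_ix[item]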
def find_pseudo_labels(
    self,
    labeled_file_path: str,
    unlabeled_file_path: str,
    temperature: int = 10,
    batch_size: int = None,
    **kwargs
):
    labeled_data = load_data_jsonl(labeled_file_path)
    unlabeled_data = load_data_jsonl(unlabeled_file_path)

    if not batch_size:
        batch_size = len(unlabeled_data)
    unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
    n_batches = len(range(0, len(unlabeled_data), batch_size))

    all_recovered = list()

    labeled_embeddings = np.array(
        self.embedder.embed_sentences([d['sentence'] for d in labeled_data], detached=True))

    for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
        logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
        unlabeled_embeddings = np.array(
            self.embedder.embed_sentences([d['sentence'] for d in unlabeled_data_chunk], detached=True))

        # Unlabeled samples get an empty-string placeholder label
        labels = [d['label'] for d in labeled_data] + ['' for _ in unlabeled_data_chunk]
        labels_vocab = Vocab([d['label'] for d in labeled_data])
        embeddings = np.concatenate((labeled_embeddings, unlabeled_embeddings), axis=0)

        # Build similarity matrix
        w = (1 - pairwise_distances(embeddings, embeddings, metric='cosine')).astype(np.float32)

        # Extract splits of W for each label; used later to compute the score
        w_label = dict()
        for label in labels_vocab.labels:
            labelled_global_indices = [ix for ix, d in enumerate(labeled_data) if d['label'] == label]
            w_label[label] = w[:, labelled_global_indices]

        # Build hierarchical tree, bottom to top
        Z = linkage(embeddings, 'ward')
        root_tree = to_tree(Z)

        # Split tree, top to bottom, into subtrees whose labeled samples share one label
        trees = get_unique_label_trees(root_tree=root_tree, labels=labels)

        # Recover data
        recovered = list()
        for tree, path in trees:
            output = list()

            # Get all leaf indices of the subtree
            order = tree.pre_order()
            tree_labels = [labels[ix] for ix in order]

            # Case when all elements of the subtree are unlabeled: nothing to propagate
            if set(tree_labels) == {''}:
                recovered += output
                continue

            # Case when samples are mixed (labeled & unlabeled) but share a unique label:
            # take that label as the pseudo-label for the whole subtree
            pseudo_label = [l for o, l in zip(order, tree_labels) if len(labels[o])][0]

            # Iterate over items
            for ix in order:
                # Case if item is unlabeled
                if labels[ix] == '':
                    # Score: mean similarity to each class, temperature-scaled softmax
                    z_i = np.array([
                        w_label[label][ix].mean()
                        for label in labels_vocab.labels
                    ])
                    z_i *= temperature
                    z_i_bar = np.exp(z_i)
                    z_i_bar /= z_i_bar.sum()
                    pseudo_label_score = float(z_i_bar[labels_vocab(pseudo_label)])

                    # Output
                    dat = unlabeled_data_chunk[ix - len(labeled_data)].copy()
                    output.append(dict(
                        data=dat,
                        pseudo_label=pseudo_label,
                        pseudo_label_score=pseudo_label_score
                    ))
            recovered += output

        all_recovered += recovered

    return all_recovered
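# NOTE: hypothetical sketch, not part of the original source. get_unique_label_trees is
# called above but not shown; based on how its output is consumed, it plausibly walks the
# scipy ClusterNode tree top-down and returns the largest subtrees whose labeled leaves all
# carry a single label (subtrees with only unlabeled leaves are kept as-is). The (tree, path)
# return shape matches the loop above; everything else is an assumption.
def get_unique_label_trees(root_tree, labels, path=()):
    results = list()
    leaf_labels = {labels[ix] for ix in root_tree.pre_order() if labels[ix] != ''}
    # Stop splitting once the subtree is label-pure (or fully unlabeled, or a single leaf)
    if len(leaf_labels) <= 1 or root_tree.is_leaf():
        return [(root_tree, path)]
    # Otherwise recurse into both children
    results += get_unique_label_trees(root_tree.get_left(), labels, path + ('L',))
    results += get_unique_label_trees(root_tree.get_right(), labels, path + ('R',))
    return results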
def run_proto(train_path, model_name_or_path, test_input_path=None, test_output_path=None, refined=False):
    import numpy as np
    from util.data import load_data_jsonl
    import random
    import collections
    import os
    import pickle
    import torch
    import tqdm

    if test_output_path:
        os.makedirs(os.path.dirname(test_output_path), exist_ok=True)

    if test_input_path:
        test_sentences = [
            line.strip() for line in open(test_input_path, 'r').readlines()
            if len(line.strip())
        ]
    else:
        test_sentences = list()

    # Load model
    bert = BERTEncoder(model_name_or_path)
    net = Protonet(encoder=bert)
    optimizer = torch.optim.Adam(net.parameters(), lr=2e-5)

    # Load data
    data = load_data_jsonl(train_path)
    print("Data loaded")

    # Group sentences by label and shuffle within each class
    data_dict = collections.defaultdict(list)
    for d in data:
        data_dict[d['label']].append(d['sentence'])
    data_dict = dict(data_dict)
    for k, d in data_dict.items():
        random.shuffle(d)

    # Split the label set in half: one half for training episodes, one for validation
    labels = sorted(data_dict.keys())
    random.shuffle(labels)
    labels_train = labels[:int(len(labels) / 2)]
    labels_valid = labels[int(len(labels) / 2):]
    print(f"Train Labels ({len(labels_train)}) {labels_train}")
    print(f"Valid Labels ({len(labels_valid)}) {labels_valid}")

    train_data_dict = {label: data_dict[label] for label in labels_train}
    valid_data_dict = {label: data_dict[label] for label in labels_valid}
    print("Data split. Starting training")

    accs = list()
    n_eval_since_last_best = 0
    best_valid_acc = 0.0

    for step in range(10000):
        loss, loss_dict = train_step(net, optimizer, train_data_dict, refined=refined)
        accs.append(loss_dict['acc'])

        # Evaluate every 100 episodes
        if (step + 1) % 100 == 0:
            train_acc = np.mean(accs)
            valid_acc = test_step(net, valid_data_dict, refined=refined)

            if valid_acc > best_valid_acc:
                print(f"Train acc={train_acc:.4f} | Valid acc={valid_acc:.4f} (better)")
                n_eval_since_last_best = 0
                best_valid_acc = valid_acc

                # Dump test-set embeddings whenever validation accuracy improves
                if test_input_path:
                    embeddings = list()
                    for i in tqdm.tqdm(range(0, len(test_sentences), 16)):
                        net.eval()
                        with torch.no_grad():
                            embeddings.append(
                                net.encoder.forward(test_sentences[i:i + 16]).cpu().detach().numpy())
                    with open(test_output_path, "wb") as file:
                        pickle.dump(embeddings, file)
            else:
                n_eval_since_last_best += 1
                print(f"Train acc={train_acc:.4f} | Valid acc={valid_acc:.4f} (worse, {n_eval_since_last_best})")
                if n_eval_since_last_best >= 5:
                    print("Early-stopping.")
                    break
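# NOTE: hypothetical sketch, not part of the original source. train_step and test_step are
# called above but not shown; a standard prototypical-network training episode looks roughly
# like the following. The episode sizes (n_way/n_support/n_query), the encoder call signature,
# and the handling of the `refined` flag are assumptions; the original may differ.
import random
import torch

def train_step(net, optimizer, data_dict, n_way=5, n_support=5, n_query=5, refined=False):
    net.train()
    classes = random.sample(sorted(data_dict.keys()), min(n_way, len(data_dict)))

    support, query, query_targets = [], [], []
    for class_ix, label in enumerate(classes):
        sentences = random.sample(data_dict[label], n_support + n_query)
        support += sentences[:n_support]
        query += sentences[n_support:]
        query_targets += [class_ix] * n_query

    # Encode support and query sentences with the shared encoder
    z_support = net.encoder.forward(support)   # (n_way * n_support, dim)
    z_query = net.encoder.forward(query)       # (n_way * n_query, dim)

    # Prototypes: mean of each class's support embeddings
    prototypes = z_support.view(len(classes), n_support, -1).mean(dim=1)

    # Classify queries by negative squared Euclidean distance to the prototypes
    logits = -torch.cdist(z_query, prototypes) ** 2
    targets = torch.tensor(query_targets, device=logits.device)
    loss = torch.nn.functional.cross_entropy(logits, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    acc = (logits.argmax(dim=1) == targets).float().mean().item()
    return loss, {'loss': loss.item(), 'acc': acc}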