import argparse
import os
import sys

import joblib
import numpy as np
import torch
from sklearn.linear_model import LinearRegression

# Project-local helpers defined elsewhere in the repository: load_bert,
# get_repr_from_layer, batch_generator, text_data_generator,
# lng_data_generator, cosine_distances, euklid_distances,
# recall_at_k_from_distances.


def repr_for_text_file(text_file, model, tokenizer, layer, mean_pool):
    with torch.no_grad():
        vectors = [
            get_repr_from_layer(
                model, sentence_tensor, layer, mean_pool=mean_pool)
            for sentence_tensor in batch_generator(
                text_data_generator(text_file, tokenizer), 64)]
    return torch.cat(vectors, dim=0).numpy()
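# Sketch only: get_repr_from_layer is defined elsewhere in the project. This
# is a minimal reimplementation of what the calls above appear to assume it
# does, written against the HuggingFace transformers API (an assumption on
# our part): take the hidden states of one encoder layer and either
# mean-pool over non-padding positions or return the first ([CLS]) vector.
def get_repr_from_layer_sketch(model, token_ids, layer, pad_token_id=0,
                               mean_pool=False):
    mask = (token_ids != pad_token_id)
    outputs = model(token_ids, attention_mask=mask.float(),
                    output_hidden_states=True)
    layer_states = outputs.hidden_states[layer]  # (batch, seq_len, hidden)
    if mean_pool:
        # Average over real tokens only, ignoring padding positions.
        token_mask = mask.unsqueeze(2).float()
        return (layer_states * token_mask).sum(1) / token_mask.sum(1)
    return layer_states[:, 0]  # vector at the [CLS] position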
def load_and_batch_data(txt, lng, tokenizer, lng2idx, batch_size=32, epochs=1):
    text_batches = batch_generator(
        text_data_generator(txt, tokenizer, epochs=epochs, max_len=110),
        size=batch_size, padding=True)
    lng_batches = batch_generator(
        lng_data_generator(lng, lng2idx, epochs=epochs),
        size=batch_size, padding=False)
    return zip(text_batches, lng_batches)
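# Usage sketch for load_and_batch_data (file names and train_step are
# hypothetical): the sentence file and the language-label file are expected
# to be line-aligned, and lng2idx maps language codes to integer class ids.
lng2idx = {"en": 0, "de": 1, "cs": 2}
for token_batch, label_batch in load_and_batch_data(
        "train.txt", "train.lng", tokenizer, lng2idx,
        batch_size=32, epochs=1):
    # token_batch: padded batch of token ids; label_batch: language ids.
    train_step(token_batch, label_batch)  # hypothetical training step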
def repr_for_txt_file(filename, tokenizer, model, device, layer,
                      center_lng=True, mean_pool=True):
    print(f"Processing {filename} ... ", file=sys.stderr, end="", flush=True)
    with torch.no_grad():
        vectors = [
            get_repr_from_layer(
                model, sentence_tensor.to(device), layer,
                mean_pool=mean_pool).cpu()
            for sentence_tensor in batch_generator(
                text_data_generator(filename, tokenizer), 32)]
        lng_repr = torch.cat(vectors, dim=0)
        if center_lng:
            lng_repr = lng_repr - lng_repr.mean(0, keepdim=True)
    print("Done.", file=sys.stderr)
    return lng_repr
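# Usage sketch (hypothetical file names): embed two line-aligned files and
# check how often the nearest neighbour across languages is the correct
# sentence. cosine_distances is assumed to return a torch distance matrix,
# consistent with how it is used in main() below.
en_repr = repr_for_txt_file("data/en.txt", tokenizer, model, device, layer=6)
de_repr = repr_for_txt_file("data/de.txt", tokenizer, model, device, layer=6)
nearest = cosine_distances(en_repr, de_repr).min(dim=1)[1]
accuracy = (nearest == torch.arange(en_repr.shape[0])).float().mean()
print(f"Retrieval precision at 1: {accuracy:.5f}")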
# The following main() evaluates cross-lingual sentence retrieval (recall@k)
# between line-aligned corpora, optionally with per-language projections.
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("bert_model", type=str,
                        help="Variant of pre-trained model.")
    parser.add_argument(
        "layer", type=int,
        help="Layer from which the representation is taken.")
    parser.add_argument("data", type=str, nargs="+",
                        help="Sentences with language for training.")
    parser.add_argument("--distance", choices=["cosine", "euklid"],
                        default="cosine")
    parser.add_argument("--skip-tokenization", default=False,
                        action="store_true",
                        help="Only split on spaces, skip wordpieces.")
    parser.add_argument(
        "--mean-pool", default=False, action="store_true",
        help="If true, use mean-pooling instead of [CLS] vector.")
    parser.add_argument(
        "--center-lng", default=False, action="store_true",
        help="Center languages to be around coordinate origin.")
    parser.add_argument(
        "--projections", default=None, nargs="+",
        help="List of sklearn projections for particular languages.")
    parser.add_argument("--em-iterations", default=None, type=int,
                        help="Iterations of projection self-learning.")
    parser.add_argument("--num-threads", type=int, default=4)
    args = parser.parse_args()

    if args.center_lng and args.projections is not None:
        print("You cannot do projections and centering at once.",
              file=sys.stderr)
        exit(1)
    if (args.projections is not None
            and len(args.projections) != len(args.data)):
        print("You must have a projection for each data file.",
              file=sys.stderr)
        exit(1)
    if args.projections is not None and args.em_iterations is not None:
        print("You either have pre-trained projections or self-train them.",
              file=sys.stderr)
        exit(1)

    projections = None
    if args.projections is not None:
        projections = []
        for proj_str in args.projections:
            if proj_str == "None":
                projections.append(None)
            else:
                projections.append(joblib.load(proj_str))

    distance_fn = None
    if args.distance == "cosine":
        distance_fn = cosine_distances
    elif args.distance == "euklid":
        distance_fn = euklid_distances
    else:
        raise ValueError("Unknown distance function.")

    torch.set_num_threads(args.num_threads)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = load_bert(args.bert_model, device)[:2]

    representations = []
    with torch.no_grad():
        for i, text_file in enumerate(args.data):
            print(f"Processing {text_file}")
            vectors = [
                get_repr_from_layer(model, sentence_tensor, args.layer,
                                    tokenizer.pad_token_id,
                                    mean_pool=args.mean_pool)
                for sentence_tensor in batch_generator(
                    text_data_generator(text_file, tokenizer), 64, tokenizer)]
            lng_repr = torch.cat(vectors, dim=0)

            if args.center_lng:
                lng_repr = lng_repr - lng_repr.mean(0, keepdim=True)
            if projections is not None and projections[i] is not None:
                proj = projections[i]
                lng_repr = torch.from_numpy(proj.predict(lng_repr.numpy()))
            representations.append(lng_repr)

    mutual_projections = None
    if args.em_iterations is not None:
        print("EM training ...")
        new_mutual_projections = {}
        for i in range(args.em_iterations):
            print(f" ... iteration {i + 1}")
            for lng1, repr1 in zip(args.data, representations):
                for lng2, repr2 in zip(args.data, representations):
                    # Apply the projection from the previous iteration to a
                    # copy, so repr1 is not mutated and projections do not
                    # compound across the inner loop.
                    projected_repr1 = repr1
                    if mutual_projections is not None:
                        proj = mutual_projections[(lng1, lng2)]
                        projected_repr1 = torch.from_numpy(
                            proj.predict(repr1.numpy()))
                    distances = distance_fn(projected_repr1, repr2)
                    retrieved = repr2[distances.min(dim=1)[1]]
                    # Re-fit the projection from the original source space
                    # to the retrieved targets, so each iteration's
                    # projection can be applied to repr1 directly.
                    proj = LinearRegression()
                    proj.fit(repr1.numpy(), retrieved.numpy())
                    new_mutual_projections[(lng1, lng2)] = proj
            mutual_projections = new_mutual_projections

    data_len = representations[0].shape[0]
    assert all(r.shape[0] == data_len for r in representations)

    print()
    for k in [1, 5, 10, 20, 50, 100]:
        print(f"Recall at {k}, random baseline {k / data_len:.5f}")
        print("--", end="\t")
        for lng in args.data:
            print(lng[-6:-4], end="\t")
        print()

        recalls_to_avg = []
        for lng1, repr1 in zip(args.data, representations):
            print(lng1[-6:-4], end="\t")
            for lng2, repr2 in zip(args.data, representations):
                # Project into the target language space without mutating
                # repr1 across the inner loop.
                projected_repr1 = repr1
                if mutual_projections is not None:
                    proj = mutual_projections[(lng1, lng2)]
                    projected_repr1 = torch.from_numpy(
                        proj.predict(repr1.numpy()))
                distances = distance_fn(projected_repr1, repr2)
                recall = recall_at_k_from_distances(distances, k)
                print(f"{recall.numpy():.5f}", end="\t")
                if lng1 != lng2:
                    recalls_to_avg.append(recall.numpy())
            print()
        print(f"On average: {np.mean(recalls_to_avg):.5f}")
        print()
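# Sketch only: recall_at_k_from_distances is defined elsewhere. Given a
# square distance matrix between line-aligned corpora, the assumed semantics
# is: the fraction of rows whose true counterpart (same index) appears among
# the k nearest columns.
def recall_at_k_from_distances_sketch(distances, k):
    _, nearest_k = distances.topk(k, dim=1, largest=False)
    targets = torch.arange(distances.shape[0]).unsqueeze(1)
    return (nearest_k == targets).any(dim=1).float().mean()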
# A separate script's main(): computes a mean-pooled centroid vector per
# language and saves the centroids as an npz file.
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("bert_model", type=str,
                        help="Variant of pre-trained model.")
    parser.add_argument(
        "layer", type=int,
        help="Layer from which the representation is taken.")
    parser.add_argument("language_list", type=str,
                        help="TSV file with available languages.")
    parser.add_argument("data", type=str, help="Directory with txt files.")
    parser.add_argument("target", type=str,
                        help="npz file with saved centroids.")
    parser.add_argument("--num-threads", type=int, default=4)
    parser.add_argument(
        "--mean-pool", default=False, action="store_true",
        help="If true, use mean-pooling instead of [CLS] vector.")
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--batch-count", type=int, default=200)
    args = parser.parse_args()

    torch.set_num_threads(args.num_threads)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = load_bert(args.bert_model, device)[:2]

    language_names = []
    centroids = []
    with open(args.language_list) as lng_f:
        for line in lng_f:
            name, code = line.strip().split("\t")
            data_file = os.path.join(args.data, f"{code}.txt")
            data = text_data_generator(data_file, tokenizer)
            batches = batch_generator(data, args.batch_size, tokenizer)
            print(f"Data iterator initialized: {data_file}")

            with torch.no_grad():
                representations = []
                for _, txt in zip(range(args.batch_count), batches):
                    batch_repr = get_repr_from_layer(
                        model, txt.to(device), args.layer,
                        tokenizer.pad_token_id,
                        mean_pool=args.mean_pool).cpu().numpy()
                    # Skip batches that produced NaNs.
                    if not np.any(np.isnan(batch_repr)):
                        representations.append(batch_repr)

            if representations:
                language_names.append(name)
                centroid = np.concatenate(representations, axis=0).mean(0)
                centroids.append(centroid)

    print("Centroids computed.")
    np.savez(args.target, languages=language_names, centroids=centroids)
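# Usage sketch (hypothetical path): load the saved centroids and identify
# the language of a new sentence as the nearest centroid by Euclidean
# distance.
saved = np.load("centroids.npz")
languages, centroids = saved["languages"], saved["centroids"]

# Placeholder: replace with a real sentence vector from get_repr_from_layer.
sentence_repr = np.zeros(centroids.shape[1])
dists = np.linalg.norm(centroids - sentence_repr, axis=1)
print("Detected language:", languages[np.argmin(dists)])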