Code Example #1
import torch

# text_data_generator, batch_generator and get_repr_from_layer are
# project helpers defined elsewhere (see the sketch after this example).
def repr_for_text_file(text_file, model, tokenizer, layer, mean_pool):
    """Return one representation vector per sentence of a text file."""
    with torch.no_grad():
        vectors = [
            get_repr_from_layer(
                model, sentence_tensor, layer,
                mean_pool=mean_pool)
            for sentence_tensor in batch_generator(
                text_data_generator(text_file, tokenizer), 64)]
        return torch.cat(vectors, dim=0).numpy()
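The three helpers used above are not part of this collection. The following is a minimal sketch of what they might look like, assuming a HuggingFace-style tokenizer and a model that can return all hidden states; the real signatures in the project may differ:

import torch

def text_data_generator(path, tokenizer, epochs=1, max_len=None):
    # Yield one tensor of token ids per non-empty line of the file.
    for _ in range(epochs):
        with open(path, encoding="utf-8") as f:
            for line in f:
                ids = tokenizer.encode(line.strip())
                if max_len is not None:
                    ids = ids[:max_len]
                if ids:
                    yield torch.tensor(ids)

def batch_generator(tensors, size, tokenizer=None, padding=True):
    # Group tensors into batches: pad variable-length sentences,
    # or just stack fixed-size items (e.g. labels).
    pad_id = tokenizer.pad_token_id if tokenizer is not None else 0

    def make_batch(batch):
        if padding:
            return torch.nn.utils.rnn.pad_sequence(
                batch, batch_first=True, padding_value=pad_id)
        return torch.stack(batch)

    batch = []
    for tensor in tensors:
        batch.append(tensor)
        if len(batch) == size:
            yield make_batch(batch)
            batch = []
    if batch:
        yield make_batch(batch)

def get_repr_from_layer(model, batch, layer, pad_id=0, mean_pool=False):
    # Take the hidden states of the requested layer and either
    # mean-pool over non-padding tokens or return the [CLS] vector.
    mask = (batch != pad_id).float()
    hidden = model(batch, attention_mask=mask,
                   output_hidden_states=True).hidden_states[layer]
    if mean_pool:
        return (hidden * mask.unsqueeze(2)).sum(1) / mask.sum(1, keepdim=True)
    return hidden[:, 0]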
Code Example #2
def load_and_batch_data(txt, lng, tokenizer, lng2idx, batch_size=32, epochs=1):
    """Zip padded text batches with the matching language-label batches."""
    text_batches = batch_generator(text_data_generator(txt,
                                                       tokenizer,
                                                       epochs=epochs,
                                                       max_len=110),
                                   size=batch_size,
                                   padding=True)
    lng_batches = batch_generator(lng_data_generator(lng,
                                                     lng2idx,
                                                     epochs=epochs),
                                  size=batch_size,
                                  padding=False)
    return zip(text_batches, lng_batches)
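`lng_data_generator` is not shown in this collection either. A plausible sketch, assuming `lng` is a file with one language code per line and `lng2idx` maps codes to integer ids:

import torch

def lng_data_generator(path, lng2idx, epochs=1):
    # Yield a scalar language-id tensor per line, repeated for each epoch.
    for _ in range(epochs):
        with open(path, encoding="utf-8") as f:
            for line in f:
                yield torch.tensor(lng2idx[line.strip()])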
Code Example #3
import sys

import torch

def repr_for_txt_file(filename,
                      tokenizer,
                      model,
                      device,
                      layer,
                      center_lng=True,
                      mean_pool=True):
    print(f"Processing {filename} ... ", file=sys.stderr, end="", flush=True)
    with torch.no_grad():
        vectors = [
            get_repr_from_layer(model,
                                sentence_tensor.to(device),
                                layer,
                                mean_pool=mean_pool).cpu()
            for sentence_tensor in batch_generator(
                text_data_generator(filename, tokenizer), 32)
        ]

        lng_repr = torch.cat(vectors, dim=0)
        if center_lng:
            # Subtract the language centroid so that each language's
            # representations are centered around the origin.
            lng_repr = lng_repr - lng_repr.mean(0, keepdim=True)
    print("Done.", file=sys.stderr)
    return lng_repr
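A hypothetical call, assuming `load_bert` returns `(tokenizer, model)` as in Examples #4 and #5; the file name and layer index here are made up:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer, model = load_bert("bert-base-multilingual-cased", device)[:2]
cs_repr = repr_for_txt_file("data/cs.txt", tokenizer, model, device,
                            layer=8, center_lng=True, mean_pool=True)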
Code Example #4
import argparse
import sys

import joblib
import numpy as np
import torch
from sklearn.linear_model import LinearRegression

# load_bert, batch_generator, text_data_generator, get_repr_from_layer,
# recall_at_k_from_distances, cosine_distances and euklid_distances are
# project helpers defined elsewhere.
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("bert_model",
                        type=str,
                        help="Variant of pre-trained model.")
    parser.add_argument(
        "layer",
        type=int,
        help="Layer from which the representation is taken.")
    parser.add_argument("data",
                        type=str,
                        nargs="+",
                        help="Sentences with language for training.")
    parser.add_argument("--distance",
                        choices=["cosine", "euklid"],
                        default="cosine")
    parser.add_argument("--skip-tokenization",
                        default=False,
                        action="store_true",
                        help="Only split on spaces, skip wordpieces.")
    parser.add_argument(
        "--mean-pool",
        default=False,
        action="store_true",
        help="If true, use mean-pooling instead of [CLS] vector.")
    parser.add_argument(
        "--center-lng",
        default=False,
        action="store_true",
        help="Center languages to be around coordinate origin.")
    parser.add_argument(
        "--projections",
        default=None,
        nargs="+",
        help="List of sklearn projections for particular languages.")
    parser.add_argument("--em-iterations",
                        default=None,
                        type=int,
                        help="Iterations of projection self-learning.")
    parser.add_argument("--num-threads", type=int, default=4)
    args = parser.parse_args()

    if args.center_lng and args.projections is not None:
        print("You cannot do projections and centering at once.",
              file=sys.stderr)
        sys.exit(1)
    if (args.projections is not None
            and len(args.projections) != len(args.data)):
        print("You must have a projection for each data file.",
              file=sys.stderr)
        sys.exit(1)
    if args.projections is not None and args.em_iterations is not None:
        print("You can either use pre-trained projections or self-train "
              "them, not both.",
              file=sys.stderr)
        sys.exit(1)

    projections = None
    if args.projections is not None:
        projections = []
        for proj_str in args.projections:
            if proj_str == "None":
                projections.append(None)
            else:
                projections.append(joblib.load(proj_str))

    distance_fn = None
    if args.distance == "cosine":
        distance_fn = cosine_distances
    elif args.distance == "euklid":
        distance_fn = euklid_distances
    else:
        raise ValueError("Unknown distance function.")

    torch.set_num_threads(args.num_threads)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer, model = load_bert(args.bert_model, device)[:2]

    representations = []

    with torch.no_grad():
        # Pre-compute one representation per sentence for each input file.
        for i, text_file in enumerate(args.data):
            print(f"Processing {text_file}")
            vectors = [
                get_repr_from_layer(model,
                                    sentence_tensor,
                                    args.layer,
                                    tokenizer.pad_token_id,
                                    mean_pool=args.mean_pool)
                for sentence_tensor in batch_generator(
                    text_data_generator(text_file, tokenizer), 64, tokenizer)
            ]

            lng_repr = torch.cat(vectors, dim=0)
            if args.center_lng:
                lng_repr = lng_repr - lng_repr.mean(0, keepdim=True)

            if projections is not None and projections[i] is not None:
                proj = projections[i]
                lng_repr = torch.from_numpy(proj.predict(lng_repr.numpy()))

            representations.append(lng_repr)

        mutual_projections = None
        if args.em_iterations is not None:
            print(f"EM training ...")
            new_mutual_projections = {}
            for i in range(args.em_iterations):
                print(f" ... iteration {i + 1}")
                for lng1, repr1 in zip(args.data, representations):
                    for lng2, repr2 in zip(args.data, representations):
                        if mutual_projections is not None:
                            proj = mutual_projections[(lng1, lng2)]
                            repr1 = torch.from_numpy(
                                proj.predict(repr1.numpy()))

                        distances = distance_fn(repr1, repr2)
                        retrieved = repr2[distances.min(dim=1)[1]]
                        proj = LinearRegression()
                        proj.fit(repr1.numpy(), retrieved.numpy())
                        new_mutual_projections[(lng1, lng2)] = proj
                mutual_projections = new_mutual_projections

        data_len = representations[0].shape[0]
        assert all(r.shape[0] == data_len for r in representations)
        print()
        for k in [1, 5, 10, 20, 50, 100]:
            print(f"Recall at {k}, random baseline {k / data_len:.5f}")
            print("--", end="\t")
            for lng in args.data:
                # Language codes are taken from file names "<code>.txt".
                print(lng[-6:-4], end="\t")
            print()

            recalls_to_avg = []

            for lng1, repr1 in zip(args.data, representations):
                print(lng1[-6:-4], end="\t")
                for lng2, repr2 in zip(args.data, representations):

                    # Project a copy of repr1 so that later language
                    # pairs still see the unprojected representations.
                    proj_repr1 = repr1
                    if mutual_projections is not None:
                        proj = mutual_projections[(lng1, lng2)]
                        proj_repr1 = torch.from_numpy(
                            proj.predict(repr1.numpy()))

                    distances = distance_fn(proj_repr1, repr2)

                    recall = recall_at_k_from_distances(distances, k)
                    print(f"{recall.numpy():.5f}", end="\t")

                    if lng1 != lng2:
                        recalls_to_avg.append(recall.numpy())
                print()
            print(f"On average: {np.mean(recalls_to_avg):.5f}")
            print()
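The distance and recall helpers are also project-level. Below is a minimal torch sketch consistent with how they are called above: a pairwise distance matrix that supports `.min(dim=1)`, and a 0-dim recall tensor. The assumption that a row's true counterpart sits at the same index matches the `k / data_len` random baseline printed above.

import torch

def cosine_distances(a, b):
    # Pairwise cosine distance between rows of a and rows of b.
    a_norm = a / a.norm(dim=1, keepdim=True)
    b_norm = b / b.norm(dim=1, keepdim=True)
    return 1 - a_norm @ b_norm.t()

def euklid_distances(a, b):
    # Pairwise Euclidean distance between rows of a and rows of b.
    return torch.cdist(a, b)

def recall_at_k_from_distances(distances, k):
    # Fraction of rows whose true counterpart (the same index)
    # is among the k nearest columns.
    _, top_k = distances.topk(k, dim=1, largest=False)
    targets = torch.arange(distances.shape[0]).unsqueeze(1)
    return (top_k == targets).any(dim=1).float().mean()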
Code Example #5
import argparse
import os

import numpy as np
import torch

# load_bert, text_data_generator, batch_generator and get_repr_from_layer
# are project helpers defined elsewhere.
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("bert_model",
                        type=str,
                        help="Variant of pre-trained model.")
    parser.add_argument(
        "layer",
        type=int,
        help="Layer from which the representation is taken.")
    parser.add_argument("language_list",
                        type=str,
                        help="TSV file with available languages.")
    parser.add_argument("data", type=str, help="Directory with txt files.")
    parser.add_argument("target",
                        type=str,
                        help="npz file with saved centroids.")
    parser.add_argument("--num-threads", type=int, default=4)
    parser.add_argument(
        "--mean-pool",
        default=False,
        action="store_true",
        help="If true, use mean-pooling instead of [CLS] vecotr.")
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--batch-count", type=int, default=200)
    args = parser.parse_args()

    torch.set_num_threads(args.num_threads)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer, model = load_bert(args.bert_model, device)[:2]

    language_names = []
    centroids = []

    with open(args.language_list) as lng_f:
        for line in lng_f:
            name, code = line.strip().split("\t")
            data_file = os.path.join(args.data, f"{code}.txt")

            data = text_data_generator(data_file, tokenizer)
            batches = batch_generator(data, args.batch_size, tokenizer)
            print(f"Data iterator initialized: {data_file}")

            with torch.no_grad():
                representations = []
                # Use at most batch-count batches per language.
                for _, txt in zip(range(args.batch_count), batches):
                    batch_repr = get_repr_from_layer(
                        model,
                        txt.to(device),
                        args.layer,
                        tokenizer.pad_token_id,
                        mean_pool=args.mean_pool).cpu().numpy()
                    # Skip batches that produced NaNs.
                    if not np.any(np.isnan(batch_repr)):
                        representations.append(batch_repr)

                if representations:
                    language_names.append(name)
                    # The language centroid: mean over all sentence vectors.
                    centroid = np.concatenate(representations, axis=0).mean(0)
                    centroids.append(centroid)

    print("Centroids computed.")

    np.savez(args.target, languages=language_names, centroids=centroids)
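The saved archive can later be read back with numpy; a brief sketch, where the file name stands in for whatever was passed as `target`:

import numpy as np

saved = np.load("centroids.npz")
print(saved["languages"])        # array of language names
print(saved["centroids"].shape)  # (n_languages, hidden_size)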