Example #1
    def __init__(
        self,
        n_tokens,
        d_model=512,
        n_encoder_layers=2,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
    ):
        super(Seq2SeqLSTM, self).__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}
        assert self.config["pad_id"] is not None

        d_rep = d_model  # so the same embedding can be used by the encoder and decoder

        # Encoder
        self.encoder = CodeEncoderLSTM(
            n_tokens=n_tokens,
            d_model=d_model,
            d_rep=d_rep,
            n_encoder_layers=n_encoder_layers,
            dropout=dropout,
            pad_id=pad_id,
            project="hidden"
        )

        # Decoder
        self.decoder = nn.LSTM(input_size=d_model, hidden_size=d_rep, num_layers=1,
                               bidirectional=False, dropout=dropout)
        self.decoder_c_0 = nn.Parameter(torch.zeros(1, 1, d_rep))
Example #2
    def __init__(
        self,
        n_tokens,
        n_output_tokens,
        d_model=512,
        d_out_projection=512,
        n_hidden_output=1,
        d_rep=128,
        n_head=8,
        n_encoder_layers=6,
        d_ff=2048,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
        encoder_type="transformer",
    ):
        super(TypeTransformer, self).__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}

        # Encoder and output for type prediction
        assert encoder_type in ["transformer", "lstm"]
        if encoder_type == "transformer":
            self.encoder = CodeEncoder(n_tokens,
                                       d_model,
                                       d_rep,
                                       n_head,
                                       n_encoder_layers,
                                       d_ff,
                                       dropout,
                                       activation,
                                       norm,
                                       pad_id,
                                       project=False)
            # TODO: Try LeakyReLU
            self.output = nn.Sequential(nn.Linear(d_model, d_model), nn.ReLU(),
                                        nn.Linear(d_model, n_output_tokens))
        elif encoder_type == "lstm":
            self.encoder = CodeEncoderLSTM(
                n_tokens=n_tokens,
                d_model=d_model,
                d_rep=d_rep,
                n_encoder_layers=n_encoder_layers,
                dropout=dropout,
                pad_id=pad_id,
                project=False,
            )
            layers = []
            layers.append(nn.Linear(d_model * 2, d_out_projection))
            if n_hidden_output > 1:
                layers.append(nn.Dropout(dropout))
            layers.append(nn.ReLU())
            for hidden_idx in range(n_hidden_output - 1):
                layers.append(nn.Linear(d_out_projection, d_out_projection))
                layers.append(nn.Dropout(dropout))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(d_out_projection, n_output_tokens))
            self.output = nn.Sequential(*layers)
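
The LSTM branch above assembles its output head dynamically: a first projection from the concatenated bidirectional features (d_model * 2) down to d_out_projection, an optional stack of Linear/Dropout/ReLU hidden blocks, and a final projection to n_output_tokens. A minimal standalone sketch of the same construction pattern, with illustrative dimensions and a hypothetical build_head helper that is not part of the original code:

import torch
import torch.nn as nn

def build_head(d_in, d_hidden, n_hidden, n_out, dropout=0.1):
    # Mirrors the head construction in the LSTM branch above.
    layers = [nn.Linear(d_in, d_hidden)]
    if n_hidden > 1:
        layers.append(nn.Dropout(dropout))
    layers.append(nn.ReLU())
    for _ in range(n_hidden - 1):
        layers += [nn.Linear(d_hidden, d_hidden), nn.Dropout(dropout), nn.ReLU()]
    layers.append(nn.Linear(d_hidden, n_out))
    return nn.Sequential(*layers)

head = build_head(d_in=2 * 512, d_hidden=512, n_hidden=2, n_out=1000)
logits = head(torch.randn(8, 2 * 512))  # [8, 1000]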
Example #3
    def make_encoder(self,
                     n_tokens,
                     d_model,
                     d_rep,
                     pad_id=None,
                     encoder_type="transformer",
                     lstm_project_mode="hidden",
                     n_encoder_layers=6,
                     dropout=0.1,
                     **kwargs):
        if encoder_type == "transformer":
            return CodeEncoder(n_tokens,
                               project=True,
                               pad_id=pad_id,
                               d_model=d_model,
                               d_rep=d_rep,
                               n_encoder_layers=n_encoder_layers,
                               **kwargs)
        elif encoder_type == "lstm":
            return CodeEncoderLSTM(
                n_tokens=n_tokens,
                d_model=d_model,
                d_rep=d_rep,
                n_encoder_layers=n_encoder_layers,
                dropout=dropout,
                pad_id=pad_id,
                project=lstm_project_mode,
            )
        else:
            raise ValueError(f"Unknown encoder_type: {encoder_type}")
Example #4
    def __init__(self,
                 n_tokens,
                 d_model=512,
                 pad_id=None,
                 encoder_type="transformer",
                 **encoder_args):
        super().__init__()
        self.n_tokens = n_tokens
        self.d_model = d_model
        if encoder_type == "transformer":
            self.encoder = CodeEncoder(n_tokens,
                                       project=False,
                                       pad_id=pad_id,
                                       d_model=d_model,
                                       **encoder_args)
            self.head_in = d_model
        elif encoder_type == "lstm":
            self.encoder = CodeEncoderLSTM(n_tokens=n_tokens,
                                           d_model=d_model,
                                           pad_id=pad_id,
                                           project=False,
                                           **encoder_args)
            self.head_in = 2 * d_model
        else:
            raise ValueError(f"Unknown encoder_type: {encoder_type}")

        self.head = nn.Sequential(nn.Linear(self.head_in, d_model), nn.ReLU(),
                                  nn.LayerNorm(d_model))
Example #5
class Seq2SeqLSTM(nn.Module):
    def __init__(
        self,
        n_tokens,
        d_model=512,
        n_encoder_layers=2,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
    ):
        super(Seq2SeqLSTM, self).__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}
        assert self.config["pad_id"] is not None

        d_rep = d_model  # so the same embedding can be used by the encoder and decoder

        # Encoder
        self.encoder = CodeEncoderLSTM(
            n_tokens=n_tokens,
            d_model=d_model,
            d_rep=d_rep,
            n_encoder_layers=n_encoder_layers,
            dropout=dropout,
            pad_id=pad_id,
            project="hidden",
        )

        # Decoder
        self.decoder = nn.LSTM(input_size=d_model,
                               hidden_size=d_rep,
                               num_layers=1,
                               bidirectional=False,
                               dropout=dropout)
        self.decoder_c_0 = nn.Parameter(torch.zeros(1, 1, d_rep))
        # self.decoder_proj = nn.Sequential(nn.Linear(d_rep, d_model), nn.ReLU())

    def forward(self, src_tok_ids, tgt_tok_ids, src_lengths, tgt_lengths):
        r"""
        Arguments:
            src_tok_ids: [B, L] long tensor
            tgt_tok_ids: [B, T] long tensor
        """
        if src_tok_ids.size(0) != tgt_tok_ids.size(0):
            raise RuntimeError(
                "the batch number of src_tok_ids and tgt_tok_ids must be equal"
            )

        # Encode
        oh_0 = self.encoder(src_tok_ids, src_lengths)  # B x d_rep
        oh_0 = oh_0.unsqueeze(0)  # 1 x B x d_rep

        # Decode, using the same embedding as the encoder
        # TODO: Try a different subword vocab, or a non-subword vocab
        tgt_emb = self.encoder.embedding(tgt_tok_ids).transpose(
            0, 1) * math.sqrt(self.config["d_model"])
        tgt_emb_packed = torch.nn.utils.rnn.pack_padded_sequence(
            tgt_emb, tgt_lengths - 1, enforce_sorted=False
        )  # subtract 1 from lengths since targets are expected to be shifted
        output, _ = self.decoder(
            tgt_emb_packed,
            (oh_0, self.decoder_c_0.expand_as(oh_0)))  # [T, B, d_rep] (packed)
        # output = self.decoder_proj(output)  # [T, B, d_model] (packed)
        # print("Prior to pading output, shapes:")
        # print("oh_0.shape", oh_0.shape)
        # print("src_tok_ids.shape", src_tok_ids.shape)
        # print("tgt_tok_ids.shape", tgt_tok_ids.shape)
        # print("src_lengths.shape", src_lengths.shape)
        # print("src_length min", src_lengths.min())
        # print("src_length max", src_lengths.max())
        # print("tgt_lengths.shape", tgt_lengths.shape)
        # print("tgt_length min", tgt_lengths.min())
        # print("tgt_length max", tgt_lengths.max())
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(
            output, batch_first=True,
            total_length=tgt_tok_ids.size(1))  # [B, T, d_model]
        # print("After packing", output.shape)
        logits = torch.matmul(output,
                              self.encoder.embedding.weight.transpose(
                                  0, 1))  # [B, T, ntok]
        return logits
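
The final matmul reuses the encoder's token embedding matrix as the output projection (weight tying), so the decoder needs no separate vocabulary-sized output layer. A minimal standalone sketch of that step, with illustrative dimensions:

import torch
import torch.nn as nn

n_tokens, d_model, B, T = 1000, 512, 4, 16
embedding = nn.Embedding(n_tokens, d_model)

decoder_out = torch.randn(B, T, d_model)  # e.g. padded decoder LSTM outputs
logits = torch.matmul(decoder_out, embedding.weight.transpose(0, 1))  # [B, T, n_tokens]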
Example #6
    def __init__(
        self,
        n_tokens,
        d_model=512,
        n_head=8,
        n_encoder_layers=6,
        d_ff=2048,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
        encoder_type="transformer",
        critic_type="bilinear_identity",
        bilinear_rank=None,
    ):
        super().__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}

        # Encoder and output for type prediction
        assert encoder_type in ["transformer", "lstm"]
        if encoder_type == "transformer":
            d_critic_rep = d_model  # Per token dimension, then take mean
            self.encoder = CodeEncoder(
                n_tokens=n_tokens,
                d_model=d_model,
                n_head=n_head,
                n_encoder_layers=n_encoder_layers,
                d_ff=d_ff,
                dropout=dropout,
                activation=activation,
                norm=norm,
                pad_id=pad_id,
                project=False,
            )
        elif encoder_type == "lstm":
            d_critic_rep = 4 * d_model  # 4 * d_model for 2 layer bidirectional LSTM
            self.encoder = CodeEncoderLSTM(
                n_tokens=n_tokens,
                d_model=d_model,
                n_encoder_layers=n_encoder_layers,
                dropout=dropout,
                pad_id=pad_id,
                project=False,
            )

        if critic_type == "bilinear_diagonal":
            self.output_weight = nn.Parameter(torch.randn(d_critic_rep),
                                              requires_grad=True)
        elif critic_type == "bilinear_symmetric":
            self.output_weight = nn.Parameter(torch.randn(
                d_critic_rep, d_critic_rep),
                                              requires_grad=True)
        elif critic_type == "bilinear_symmetric_plus_identity":
            W = torch.randn(d_critic_rep,
                            d_critic_rep) + torch.eye(d_critic_rep)
            self.output_weight = nn.Parameter(W, requires_grad=True)
        elif critic_type == "bilinear_identity":
            self.output_weight = None
        elif critic_type == "bilinear_lowrank":
            assert bilinear_rank
            W = torch.randn(bilinear_rank, d_critic_rep)
            self.output_weight = nn.Parameter(W, requires_grad=True)
        else:
            raise ValueError(f"Unknown critic_type: {critic_type}")
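
The critic_type options above store the weight of a bilinear comparison between two encoder representations: a per-feature diagonal, a full matrix, an implicit identity, or a low-rank factor of shape [bilinear_rank, d_critic_rep]. The forward pass is not shown in this example, so the exact usage is an assumption; a standalone sketch of plausible diagonal, low-rank (u^T P^T P v), and identity scores:

import torch

d, rank, B = 128, 16, 4
u, v = torch.randn(B, d), torch.randn(B, d)

# Diagonal W: one weight per feature
w_diag = torch.randn(d)
score_diag = (u * w_diag * v).sum(dim=-1)                # [B]

# Low-rank W = P^T P with P of shape [rank, d] (assumed parameterization)
P = torch.randn(rank, d)
score_lowrank = ((u @ P.t()) * (v @ P.t())).sum(dim=-1)  # [B]

# Identity W: plain dot product
score_identity = (u * v).sum(dim=-1)                     # [B]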
Example #7
def embed_augmented(
    # Data
    data_filepath: str,
    output_dir: str,
    spm_filepath: str,
    num_workers=1,
    max_seq_len=-1,
    min_alternatives=2,
    # Model
    encoder_type: str = "lstm",
    pretrain_resume_path: str = "",
    pretrain_resume_encoder_name: str = "encoder_q",  # encoder_q, encoder_k, encoder
    pretrain_resume_project: bool = False,
    # no_output_attention: bool = False,
    n_encoder_layers: int = 2,
    d_model: int = 512,
    # Loss
    subword_regularization_alpha: float = 0,
    # Computational
    use_cuda: bool = True,
    seed: int = 0,
):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    config = locals()
    logger.info(f"Config: {config}")

    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"

    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    pad_id = sp.PieceToId("[PAD]")
    mask_id = sp.PieceToId("[MASK]")

    # Create model
    if encoder_type == "lstm":
        encoder = CodeEncoderLSTM(
            n_tokens=sp.GetPieceSize(),
            d_model=d_model,
            d_rep=256,
            n_encoder_layers=n_encoder_layers,
            dropout=0.1,
            pad_id=pad_id,
            project=False,
        )
        encoder.config["project"] = "hidden"
        logger.info(f"Created CodeEncoderLSTM with {count_parameters(encoder)} params")
    elif encoder_type == "transformer":
        encoder = CodeEncoder(sp.GetPieceSize(), d_model, 256, 8, n_encoder_layers, 2048, 0.1, "relu", True, pad_id, project=False)
        logger.info(f"Created CodeEncoder with {count_parameters(encoder)} params")
    else:
        raise ValueError(f"Unknown encoder_type: {encoder_type}")

    # Load pretrained checkpoint
    if pretrain_resume_path:
        logger.info(
            f"Resuming training from pretraining checkpoint {pretrain_resume_path}, pretrain_resume_encoder_name={pretrain_resume_encoder_name}"
        )
        checkpoint = torch.load(pretrain_resume_path)
        pretrained_state_dict = checkpoint["model_state_dict"]

        for key in pretrained_state_dict.keys():
            print("Pretrained state dict:", key)
        for key in encoder.state_dict().keys():
            print("Encoder state dict:", key)

        encoder_state_dict = {}
        assert pretrain_resume_encoder_name in ["encoder_k", "encoder_q", "encoder"]

        for key, value in pretrained_state_dict.items():
            if key.startswith(pretrain_resume_encoder_name + ".") and "project_layer" not in key:
                remapped_key = key[len(pretrain_resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
        encoder.load_state_dict(encoder_state_dict)
        logger.info(f"Loaded state dict from {pretrain_resume_path}")

    # Parallelize across GPUs
    encoder = nn.DataParallel(encoder)
    encoder = encoder.cuda() if use_cuda else encoder

    # Load batches consisting of augmented variants of the same program
    sp = spm.SentencePieceProcessor()
    sp.Load(config["spm_filepath"])
    pad_id = sp.PieceToId("[PAD]")

    def pad_collate(batch):
        assert len(batch) == 1
        X = batch[0]
        B = len(X)

        # Create tensor of sequence lengths, [B] or [2B]
        lengths = torch.tensor([len(x) for x in X], dtype=torch.long)

        # Create padded tensor for batch, [B, T]
        X = pad_sequence(X, batch_first=True, padding_value=pad_id)

        return X, lengths

    dataset = PrecomputedDataset(
        data_filepath,
        min_alternatives=min_alternatives,
        program_mode="all_alternatives",
        limit_size=-1,
        sp=sp,
        subword_regularization_alpha=subword_regularization_alpha,
        max_length=max_seq_len,
    )

    loader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=True, collate_fn=pad_collate, num_workers=num_workers, drop_last=False, pin_memory=False,
    )

    representations = []
    encoder.eval()
    os.makedirs(output_dir, exist_ok=True)
    with torch.no_grad():
        # Evaluate metrics
        logger.info(f"Evaluating encoder...")
        pbar = tqdm.tqdm(loader, desc="evaluate")
        for X, lengths in pbar:
            rep = encoder(X.cuda(), lengths.cuda(), None)  # [B, n_layers*n_directions*d_model]
            if encoder_type == "transformer":
                assert len(rep.shape) == 3
                rep = rep.mean(dim=0)  # rep is [T, B, dimension], so take mean across sequence
            rep = rep.cpu().numpy()
            X = X.cpu().numpy()
            print("rep", type(rep), "X", type(X))
            print("rep", rep.shape, "X", X.shape)
            representations.append((X, rep))

            if len(representations) and len(representations) % 100 == 0:
                path = os.path.join(output_dir, f"tokens_and_embeddings_{len(representations):06d}.pth")
                logger.info(f"Saving representations to {path}")
                # with open(path, "wb") as f:
                #     pickle.dump(representations, f)
                # torch.save(path, representations)
                torch.save(representations, path)
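
Each periodic torch.save above writes the list of (token ids, embedding) pairs accumulated so far, both stored as NumPy arrays. A minimal sketch of reading one of these files back; the path is illustrative:

import torch

pairs = torch.load("output_dir/tokens_and_embeddings_000100.pth")
for token_ids, embeddings in pairs:
    print(token_ids.shape, embeddings.shape)  # [B, T] token ids, [B, D] embeddings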