Code Example #1
File: transformer.py  Project: stevew00ds/contracode
    def __init__(
        self,
        n_tokens,
        d_model=512,
        d_rep=128,
        n_head=8,
        n_encoder_layers=6,
        d_ff=2048,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
        n_decoder_layers=6,
    ):
        super(TransformerModel, self).__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}

        # Encoder
        self.encoder = CodeEncoder(
            n_tokens, d_model, d_rep, n_head, n_encoder_layers, d_ff, dropout, activation, norm, pad_id, project=False
        )

        # Decoder
        decoder_layer = nn.TransformerDecoderLayer(d_model, n_head, d_ff, dropout, activation)
        decoder_norm = nn.LayerNorm(d_model) if norm else None
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_decoder_layers, norm=decoder_norm)
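
The decoder half of this model is just PyTorch's stock transformer decoder stack. A minimal, self-contained sketch of that construction with the snippet's default hyperparameters (toy batch and sequence sizes assumed; CodeEncoder is omitted):

import torch
import torch.nn as nn

# Stand-alone sketch of the decoder stack built above (defaults from the snippet).
d_model, n_head, d_ff, dropout, n_decoder_layers = 512, 8, 2048, 0.1, 6
decoder_layer = nn.TransformerDecoderLayer(d_model, n_head, d_ff, dropout, "relu")
decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_decoder_layers,
                                norm=nn.LayerNorm(d_model))

# PyTorch transformer modules expect sequence-first inputs: [T, B, d_model].
tgt = torch.randn(10, 2, d_model)     # decoder input embeddings
memory = torch.randn(20, 2, d_model)  # encoder output ("memory")
print(decoder(tgt, memory).shape)     # torch.Size([10, 2, 512])
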
Code Example #2
    def __init__(
        self,
        n_tokens,
        n_output_tokens,
        d_model=512,
        d_out_projection=512,
        n_hidden_output=1,
        d_rep=128,
        n_head=8,
        n_encoder_layers=6,
        d_ff=2048,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
        encoder_type="transformer",
    ):
        super(TypeTransformer, self).__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}

        # Encoder and output for type prediction
        assert encoder_type in ["transformer", "lstm"]
        if encoder_type == "transformer":
            self.encoder = CodeEncoder(n_tokens,
                                       d_model,
                                       d_rep,
                                       n_head,
                                       n_encoder_layers,
                                       d_ff,
                                       dropout,
                                       activation,
                                       norm,
                                       pad_id,
                                       project=False)
            # TODO: Try LeakyReLU
            self.output = nn.Sequential(nn.Linear(d_model, d_model), nn.ReLU(),
                                        nn.Linear(d_model, n_output_tokens))
        elif encoder_type == "lstm":
            self.encoder = CodeEncoderLSTM(
                n_tokens=n_tokens,
                d_model=d_model,
                d_rep=d_rep,
                n_encoder_layers=n_encoder_layers,
                dropout=dropout,
                pad_id=pad_id,
                project=False,
            )
            layers = []
            layers.append(nn.Linear(d_model * 2, d_out_projection))
            if n_hidden_output > 1:
                layers.append(nn.Dropout(dropout))
            layers.append(nn.ReLU())
            for hidden_idx in range(n_hidden_output - 1):
                layers.append(nn.Linear(d_out_projection, d_out_projection))
                layers.append(nn.Dropout(dropout))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(d_out_projection, n_output_tokens))
            self.output = nn.Sequential(*layers)
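
In the LSTM branch, the first Linear of the output head takes d_model * 2 features because a bidirectional LSTM concatenates forward and backward states per token. A tiny illustration with toy sizes (CodeEncoderLSTM itself is defined in the contracode repository and not reproduced here):

import torch
import torch.nn as nn

# Why in_features = d_model * 2 for the head's first Linear (toy sizes).
d_model = 8
lstm = nn.LSTM(input_size=4, hidden_size=d_model, bidirectional=True, batch_first=True)
out, _ = lstm(torch.randn(2, 5, 4))   # input: [B, T, input_size]
print(out.shape)                      # torch.Size([2, 5, 16]) == [B, T, 2 * d_model]
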
Code Example #3
File: code_moco.py  Project: ncoop57/contracode
    def make_encoder(self,
                     n_tokens,
                     d_model,
                     d_rep,
                     pad_id=None,
                     encoder_type="transformer",
                     lstm_project_mode="hidden",
                     n_encoder_layers=6,
                     dropout=0.1,
                     **kwargs):
        if encoder_type == "transformer":
            return CodeEncoder(n_tokens,
                               project=True,
                               pad_id=pad_id,
                               d_model=d_model,
                               d_rep=d_rep,
                               n_encoder_layers=n_encoder_layers,
                               **kwargs)
        elif encoder_type == "lstm":
            return CodeEncoderLSTM(
                n_tokens=n_tokens,
                d_model=d_model,
                d_rep=d_rep,
                n_encoder_layers=n_encoder_layers,
                dropout=dropout,
                pad_id=pad_id,
                project=lstm_project_mode,
            )
        else:
            raise ValueError
Code Example #4
File: code_mlm.py  Project: patilanup246/contracode
    def __init__(self,
                 n_tokens,
                 d_model=512,
                 pad_id=None,
                 encoder_type="transformer",
                 **encoder_args):
        super().__init__()
        self.n_tokens = n_tokens
        self.d_model = d_model
        if encoder_type == "transformer":
            self.encoder = CodeEncoder(n_tokens,
                                       project=False,
                                       pad_id=pad_id,
                                       d_model=d_model,
                                       **encoder_args)
            self.head_in = d_model
        elif encoder_type == "lstm":
            self.encoder = CodeEncoderLSTM(n_tokens=n_tokens,
                                           d_model=d_model,
                                           pad_id=pad_id,
                                           project=False,
                                           **encoder_args)
            self.head_in = 2 * d_model
        else:
            raise ValueError

        self.head = nn.Sequential(nn.Linear(self.head_in, d_model), nn.ReLU(),
                                  nn.LayerNorm(d_model))
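
The projection head is identical for both encoder types; only its input width (self.head_in) differs: d_model for the transformer, 2 * d_model for the bidirectional LSTM. A quick shape check with assumed sizes:

import torch
import torch.nn as nn

d_model = 512
for head_in in (d_model, 2 * d_model):   # transformer case, then LSTM case
    head = nn.Sequential(nn.Linear(head_in, d_model), nn.ReLU(), nn.LayerNorm(d_model))
    feats = torch.randn(4, 7, head_in)   # e.g. [B, T, head_in] encoder features
    print(head(feats).shape)             # torch.Size([4, 7, 512]) in both cases
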
Code Example #5
File: transformer.py  Project: ncoop57/contracode
class TransformerModel(nn.Module):
    def __init__(
        self,
        n_tokens,
        d_model=512,
        d_rep=128,
        n_head=8,
        n_encoder_layers=6,
        d_ff=2048,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
        n_decoder_layers=6,
    ):
        super(TransformerModel, self).__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}

        # Encoder
        self.encoder = CodeEncoder(n_tokens,
                                   d_model,
                                   d_rep,
                                   n_head,
                                   n_encoder_layers,
                                   d_ff,
                                   dropout,
                                   activation,
                                   norm,
                                   pad_id,
                                   project=False)

        # Decoder
        decoder_layer = nn.TransformerDecoderLayer(d_model, n_head, d_ff,
                                                   dropout, activation)
        decoder_norm = nn.LayerNorm(d_model) if norm else None
        self.decoder = nn.TransformerDecoder(decoder_layer,
                                             num_layers=n_decoder_layers,
                                             norm=decoder_norm)

    def forward(self,
                src_tok_ids,
                tgt_tok_ids,
                src_lengths=None,
                tgt_lengths=None):
        r"""
        Arguments:
            src_tok_ids: [B, L] long tensor
            tgt_tok_ids: [B, T] long tensor
        """
        if src_tok_ids.size(0) != tgt_tok_ids.size(0):
            raise RuntimeError(
                "the batch number of src_tok_ids and tgt_tok_ids must be equal"
            )

        # Encode
        memory = self.encoder(src_tok_ids)

        # Decode, using the same embedding and positional encoding as the encoder
        tgt_emb = self.encoder.embedding(tgt_tok_ids).transpose(
            0, 1) * math.sqrt(self.config["d_model"])
        tgt_emb = self.encoder.pos_encoder(tgt_emb)
        tgt_mask = self.generate_square_subsequent_mask(
            tgt_tok_ids.size(1)).to(tgt_tok_ids.device)
        if self.config["pad_id"] is None:
            assert False
            tgt_key_padding_mask = None
        else:
            tgt_key_padding_mask = tgt_tok_ids == self.config["pad_id"]
        output = self.decoder(tgt_emb,
                              memory,
                              tgt_mask=tgt_mask,
                              memory_mask=None,
                              tgt_key_padding_mask=tgt_key_padding_mask)

        logits = torch.matmul(output,
                              self.encoder.embedding.weight.transpose(
                                  0, 1))  # [T, B, ntok]
        return torch.transpose(logits, 0, 1)  # [B, T, ntok]

    def generate_square_subsequent_mask(self, sz):
        r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
        Unmasked positions are filled with float(0.0).
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(
            mask == 1, float(0.0))
        return mask
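
generate_square_subsequent_mask produces the usual causal mask: position t may attend to positions <= t, while future positions get -inf so they contribute nothing after the softmax. Tracing it by hand for sz = 4:

import torch

sz = 4
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, 0.0)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
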
Code Example #6
File: clone.py  Project: dabaier/contracode
    def __init__(
        self,
        n_tokens,
        d_model=512,
        n_head=8,
        n_encoder_layers=6,
        d_ff=2048,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
        encoder_type="transformer",
        critic_type="bilinear_identity",
        bilinear_rank=None,
    ):
        super().__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}

        # Encoder and output for type prediction
        assert encoder_type in ["transformer", "lstm"]
        if encoder_type == "transformer":
            d_critic_rep = d_model  # Per token dimension, then take mean
            self.encoder = CodeEncoder(
                n_tokens=n_tokens,
                d_model=d_model,
                n_head=n_head,
                n_encoder_layers=n_encoder_layers,
                d_ff=d_ff,
                dropout=dropout,
                activation=activation,
                norm=norm,
                pad_id=pad_id,
                project=False,
            )
        elif encoder_type == "lstm":
            d_critic_rep = 4 * d_model  # 4 * d_model for 2 layer bidirectional LSTM
            self.encoder = CodeEncoderLSTM(
                n_tokens=n_tokens,
                d_model=d_model,
                n_encoder_layers=n_encoder_layers,
                dropout=dropout,
                pad_id=pad_id,
                project=False,
            )

        if critic_type == "bilinear_diagonal":
            self.output_weight = nn.Parameter(torch.randn(d_critic_rep),
                                              requires_grad=True)
        elif critic_type == "bilinear_symmetric":
            self.output_weight = nn.Parameter(torch.randn(
                d_critic_rep, d_critic_rep),
                                              requires_grad=True)
        elif critic_type == "bilinear_symmetric_plus_identity":
            W = torch.randn(d_critic_rep,
                            d_critic_rep) + torch.eye(d_critic_rep)
            self.output_weight = nn.Parameter(W, requires_grad=True)
        elif critic_type == "bilinear_identity":
            self.output_weight = None
        elif critic_type == "bilinear_lowrank":
            assert bilinear_rank
            W = torch.randn(bilinear_rank, d_critic_rep)
            self.output_weight = nn.Parameter(W, requires_grad=True)
        else:
            raise ValueError
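
This constructor only creates the critic parameters; the scoring code is not part of the excerpt. As a rough illustration of how bilinear weights of these shapes could score a pair of pooled representations x and y (an assumption for exposition, not code from the repository):

import torch

d, r = 16, 4                            # d_critic_rep and bilinear_rank, assumed
x, y = torch.randn(d), torch.randn(d)

w_diag = torch.randn(d)                 # "bilinear_diagonal": x^T diag(w) y
score_diag = (x * w_diag * y).sum()

W_sym = torch.randn(d, d)               # "bilinear_symmetric": x^T W y
score_sym = x @ W_sym @ y

score_id = x @ y                        # "bilinear_identity": plain dot product

W_low = torch.randn(r, d)               # "bilinear_lowrank": (W x)^T (W y)
score_low = (W_low @ x) @ (W_low @ y)
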
Code Example #7
File: embed_for_tsne.py  Project: dabaier/contracode
def embed_augmented(
    # Data
    data_filepath: str,
    output_dir: str,
    spm_filepath: str,
    num_workers=1,
    max_seq_len=-1,
    min_alternatives=2,
    # Model
    encoder_type: str = "lstm",
    pretrain_resume_path: str = "",
    pretrain_resume_encoder_name: str = "encoder_q",  # encoder_q, encoder_k, encoder
    pretrain_resume_project: bool = False,
    # no_output_attention: bool = False,
    n_encoder_layers: int = 2,
    d_model: int = 512,
    # Loss
    subword_regularization_alpha: float = 0,
    # Computational
    use_cuda: bool = True,
    seed: int = 0,
):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    config = locals()
    logger.info(f"Config: {config}")

    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"

    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    pad_id = sp.PieceToId("[PAD]")
    mask_id = sp.PieceToId("[MASK]")

    # Create model
    if encoder_type == "lstm":
        encoder = CodeEncoderLSTM(
            n_tokens=sp.GetPieceSize(),
            d_model=d_model,
            d_rep=256,
            n_encoder_layers=n_encoder_layers,
            dropout=0.1,
            pad_id=pad_id,
            project=False,
        )
        encoder.config["project"] = "hidden"
        logger.info(f"Created CodeEncoderLSTM with {count_parameters(encoder)} params")
    elif encoder_type == "transformer":
        encoder = CodeEncoder(sp.GetPieceSize(), d_model, 256, 8, n_encoder_layers, 2048, 0.1, "relu", True, pad_id, project=False)
        logger.info(f"Created CodeEncoder with {count_parameters(encoder)} params")

    # Load pretrained checkpoint
    if pretrain_resume_path:
        logger.info(
            f"Resuming training from pretraining checkpoint {pretrain_resume_path}, pretrain_resume_encoder_name={pretrain_resume_encoder_name}"
        )
        checkpoint = torch.load(pretrain_resume_path)
        pretrained_state_dict = checkpoint["model_state_dict"]

        for key in pretrained_state_dict.keys():
            print("Pretrained state dict:", key)
        for key in encoder.state_dict().keys():
            print("Encoder state dict:", key)

        encoder_state_dict = {}
        assert pretrain_resume_encoder_name in ["encoder_k", "encoder_q", "encoder"]

        for key, value in pretrained_state_dict.items():
            if key.startswith(pretrain_resume_encoder_name + ".") and "project_layer" not in key:
                remapped_key = key[len(pretrain_resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
        encoder.load_state_dict(encoder_state_dict)
        logger.info(f"Loaded state dict from {pretrain_resume_path}")

    # Parallelize across GPUs
    encoder = nn.DataParallel(encoder)
    encoder = encoder.cuda() if use_cuda else encoder

    # Load batches consisting of augmented variants of the same program
    sp = spm.SentencePieceProcessor()
    sp.Load(config["spm_filepath"])
    pad_id = sp.PieceToId("[PAD]")

    def pad_collate(batch):
        assert len(batch) == 1
        X = batch[0]
        B = len(X)

        # Create tensor of sequence lengths, [B] or [2B]
        lengths = torch.tensor([len(x) for x in X], dtype=torch.long)

        # Create padded tensor for batch, [B, T]
        X = pad_sequence(X, batch_first=True, padding_value=pad_id)

        return X, lengths

    dataset = PrecomputedDataset(
        data_filepath,
        min_alternatives=min_alternatives,
        program_mode="all_alternatives",
        limit_size=-1,
        sp=sp,
        subword_regularization_alpha=subword_regularization_alpha,
        max_length=max_seq_len,
    )

    loader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=True, collate_fn=pad_collate, num_workers=num_workers, drop_last=False, pin_memory=False,
    )

    representations = []
    encoder.eval()
    os.makedirs(output_dir, exist_ok=True)
    with torch.no_grad():
        # Evaluate metrics
        logger.info(f"Evaluating encoder...")
        pbar = tqdm.tqdm(loader, desc="evalaute")
        for X, lengths in pbar:
            rep = encoder(X.cuda(), lengths.cuda(), None)  # [B, n_layers*n_directions*d_model]
            if encoder_type == "transformer":
                assert len(rep.shape) == 3
                rep = rep.mean(dim=0)  # rep is [T, B, dimension], so take mean across sequence
            rep = rep.cpu().numpy()
            X = X.cpu().numpy()
            print("rep", type(rep), "X", type(X))
            print("rep", rep.shape, "X", X.shape)
            representations.append((X, rep))

            if len(representations) and len(representations) % 100 == 0:
                path = os.path.join(output_dir, f"tokens_and_embeddings_{len(representations):06d}.pth")
                logger.info(f"Saving representations to {path}")
                # with open(path, "wb") as f:
                #     pickle.dump(representations, f)
                # torch.save(path, representations)
                torch.save(representations, path)
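
pad_collate above receives one pre-grouped batch of augmented variants of a single program and right-pads the token sequences to a common length. A small stand-alone example of that padding step (pad_id = 0 assumed here for illustration):

import torch
from torch.nn.utils.rnn import pad_sequence

pad_id = 0
X = [torch.tensor([5, 6, 7, 8]), torch.tensor([9, 10])]        # two variants of one program
lengths = torch.tensor([len(x) for x in X], dtype=torch.long)  # tensor([4, 2])
X = pad_sequence(X, batch_first=True, padding_value=pad_id)
print(X)
# tensor([[ 5,  6,  7,  8],
#         [ 9, 10,  0,  0]])
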