Example #1
def test_bert_encoder_backward(gpu, default_implementation, sdfg_name):
    batch_size = 2
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])
    ptmodel = BertLayer(BertConfig(hidden_act="relu")).eval()

    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            backward=True,
                            sdfg_name=sdfg_name,
                            apply_strict=True)

    ptinput = torch.clone(input)
    ptinput.requires_grad = True
    ptmodel(ptinput)[0].sum().backward()

    dace_input = torch.clone(input)
    dace_input.requires_grad = True
    dace_model(dace_input).sum().backward()

    diff = np.abs(dace_input.grad.detach().numpy() -
                  ptinput.grad.detach().numpy())

    assert np.max(diff) < 1e-4
Example #2
def test_bert_cf(sdfg_name):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            train=False,
                            sdfg_name=sdfg_name,
                            dummy_inputs=(input.clone(), ),
                            auto_optimize=False)

    # run again with constant folding
    dace_model.reset_sdfg()
    dace_model.prepend_post_onnx_hook(
        "cf",
        lambda onnx_model: onnx_model.sdfg.apply_transformations_repeated(
            [ConstantFolding, RedundantSecondArray],
            validate_all=True,
            strict=True))
    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs1.detach().numpy() -
                  pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5
Example #3
    def __init__(self, device):
        super(Model, self).__init__()
        self.device = device
        self.num_labels = 2
        self.config = BertConfig.from_pretrained('./roberta_pretrain/bert_config.json')
        self.embeddings = BertEmbeddings(self.config)

        num_layers = 3
        self.layer = nn.ModuleList([BertLayer(self.config) for _ in range(num_layers)])
        self.output = nn.Linear(self.config.hidden_size, self.num_labels)   # classification head
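A minimal usage sketch, not taken from the original project: it shows how a stack of BertLayer modules like the one built above can be driven directly on embedded hidden states. The import path assumes a transformers 4.x layout, and indexing the layer output at [0] relies on BertLayer returning a tuple whose first element is the updated hidden states.

import torch
import torch.nn as nn
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertLayer  # transformers >= 4.x path

config = BertConfig()  # hidden_size defaults to 768
layers = nn.ModuleList([BertLayer(config) for _ in range(3)])

# (batch, seq_len, hidden_size) tensor standing in for the embedding output
hidden_states = torch.randn(2, 16, config.hidden_size)
for layer in layers:
    hidden_states = layer(hidden_states)[0]  # element 0 of the tuple is the hidden states
print(hidden_states.shape)  # torch.Size([2, 16, 768])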
Example #4
    def __init__(self, count, config, num_labels):
        super(HSUM, self).__init__()
        self.count = count
        self.num_labels = num_labels
        self.pre_layers = torch.nn.ModuleList()
        self.crf_layers = torch.nn.ModuleList()
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        for i in range(count):
            self.pre_layers.append(BertLayer(config))
            self.crf_layers.append(CRF(num_labels))
Example #5
    def __init__(self, count, config, num_labels):
        super(HSUM, self).__init__()
        self.count = count
        self.num_labels = num_labels
        self.pre_layers = torch.nn.ModuleList()
        self.loss_fct = torch.nn.ModuleList()
        self.pooler = BertPooler(config)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        for i in range(count):
            self.pre_layers.append(BertLayer(config))
            self.loss_fct.append(torch.nn.CrossEntropyLoss(ignore_index=-1))
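Neither HSUM constructor above includes a forward method, so the following is only a hedged sketch of the hierarchical-sum pattern the layer lists suggest: per-layer encoder outputs are accumulated from the top down, refined by the extra BertLayer modules, and classified at every step. The shapes, the accumulation order, and the classifier call are assumptions, not code from the original repositories.

import torch
import torch.nn as nn
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertLayer

config = BertConfig()
count, num_labels = 3, 5
pre_layers = nn.ModuleList([BertLayer(config) for _ in range(count)])
classifier = nn.Linear(config.hidden_size, num_labels)

# Stand-ins for the per-layer hidden states of a BERT encoder
# (e.g. BertModel(..., output_hidden_states=True)); random data for illustration.
layer_outputs = [torch.randn(2, 16, config.hidden_size) for _ in range(count)]

output = torch.zeros(2, 16, config.hidden_size)
logits_per_layer = []
for i, layer in enumerate(pre_layers):
    output = output + layer_outputs[-(i + 1)]  # accumulate from the top layer down
    output = layer(output)[0]                  # refine the running sum
    logits_per_layer.append(classifier(output))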
Example #6
    def __init__(self, bert, config, args):
        super(BertAttentiveKeywordsClassification, self).__init__()
        self.bert = bert
        self.hidden_size = config.hidden_size
        self.transformer = BertLayer(config)

        self.dropout = nn.Dropout(args.dropout)
        self.seq = nn.Sequential(
            nn.Dropout(args.dropout),
            nn.Linear(6 * config.hidden_size, config.hidden_size),
            nn.Dropout(args.dropout),
            nn.Linear(config.hidden_size, 2)
        )
Example #7
    def __init__(self, config):
        super(Net, self).__init__()
        self.config = config
        self.bert_dim = 768
        self.rel_num = self.config.rel_num
        self.max_len = self.config.max_len
        self.device = self.config.device
        self.lr = self.config.learning_rate
        self.id2rel = json.load(open(self.config.rel2id, encoding="utf8"))[0]

        self.bert_encoder = BertModel.from_pretrained("bert-base-chinese")
        self.conv = nn.Conv1d(in_channels=self.bert_dim,
                              out_channels=self.rel_num,
                              kernel_size=self.config.conv_kernel)
        self.pool = nn.MaxPool1d(self.config.pool_kernel)
        self.lstm = nn.LSTM(input_size=self.config.lstm_in,
                            hidden_size=self.config.lstm_out,
                            batch_first=True,
                            bidirectional=self.config.if_bidirectional)
        self.lstms = LSTMS(self.config)
        # self.w = nn.Linear(in_features=self.max_len -
        #                    self.config.conv_kernel + 1, out_features=128)
        self.w = nn.Linear(in_features=self.max_len - self.config.conv_kernel +
                           1,
                           out_features=384)
        self.linears = nn.Linear(in_features=self.config.lstm_out,
                                 out_features=self.config.tag_num)
        self.rel2tag = nn.Linear(in_features=self.max_len -
                                 self.config.conv_kernel + 1,
                                 out_features=1)
        self.softmax = nn.Softmax(-1)
        self.layernorm = nn.LayerNorm(
            [self.config.max_len, self.config.lstm_in])

        self.matrix = Variable(torch.randn(128, self.bert_dim),
                               requires_grad=True).to(self.device)
        self.bertlayer1 = BertLayer(BertConfig(vocab_size=21128))
        self.bertlayer2 = BertLayer(BertConfig(vocab_size=21128))
Example #8
def test_bert_encoder(gpu, default_implementation, sdfg_name):
    if not gpu and default_implementation == 'onnxruntime':
        pytest.skip("combination is tested below")

    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            sdfg_name=sdfg_name,
                            apply_strict=True,
                            dummy_inputs=(input.clone(), ))

    if gpu:
        for name, _ in dace_model.model.named_parameters():
            parameter_to_transient(dace_model, name)

    dace_outputs0 = dace_model(input.clone())

    diff = np.abs(dace_outputs0.detach().numpy() -
                  pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5

    if default_implementation == "pure":
        ort_nodes = [
            n for n, _ in dace_model.sdfg.all_nodes_recursive()
            if hasattr(n, "environments") and any("onnx" in e.lower()
                                                  for e in n.environments)
        ]
        if len(ort_nodes) > 0:
            assert False, f"expected pure graph, found ORT nodes: {ort_nodes} "

        # check that cuBLAS is being used
        if gpu:
            assert any(
                (hasattr(n, "environments") and "cuBLAS" in n.environments or
                 hasattr(n, "implementation") and n.implementation == "cuBLAS")
                for n, _ in dace_model.sdfg.all_nodes_recursive())
Example #9
    def __init__(self, bertconfig, config):
        super(BERT_Seq2SeqModel, self).__init__()
        self.encoder = BertModel.from_pretrained(config.model_path, config=bertconfig)
        self.num_labels = bertconfig.num_labels
        self.l2_reg_lambda = bertconfig.l2_reg_lambda
        self.dropout = nn.Dropout(bertconfig.hidden_dropout_prob)
        vocab_size = config.vocab_size
        self.ner_classifier = nn.Linear(config.enc_hidden_size, vocab_size)

        self.span_layer = BertLayer(config=bertconfig)
        self.w = nn.Parameter(torch.Tensor([0.5, 0.5]))
        self.gamma = nn.Parameter(torch.ones(1))

        dec_att_type = int(config.dec_att_type)
        self.rel_size = config.rel_size
        self.decoder = Decoder(config.dec_inp_size, config.dec_hidden_size, 1,
                               config.drop_rate, config.max_trg_len,
                               dec_att_type, self.rel_size)
        self.relation_embeddings = nn.Embedding(config.rel_size, config.dec_inp_size)
        self.dropout_di = nn.Dropout(config.drop_rate)
Example #10
    def __init__(self, config, **kwargs):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels
        self.cus_config = kwargs['cus_config']
        self.type = self.cus_config.type  # a,b,c,d

        self.usr_embed = nn.Embedding(self.cus_config.num_usrs,
                                      self.cus_config.attr_dim)
        self.usr_embed.weight.requires_grad = True
        init.uniform_(self.usr_embed.weight, a=-0.25, b=0.25)

        self.prd_embed = nn.Embedding(self.cus_config.num_prds,
                                      self.cus_config.attr_dim)
        self.prd_embed.weight.requires_grad = True
        init.uniform_(self.prd_embed.weight, a=-0.25, b=0.25)

        if self.type not in ['b', 'a']:
            self.text = nn.Parameter(torch.Tensor(1, self.cus_config.attr_dim))
            # init.normal_(self.text)
            init.uniform_(self.text, a=-0.25, b=0.25)
            self.ATrans_decoder = nn.ModuleList([
                MAALayer(config, self.cus_config)
                for _ in range(self.cus_config.n_mmalayer)
            ])
            self.classifier = BERTClassificationHead(config)
        elif self.type == 'a':
            self.fusion = Fusion(self.config.hidden_size,
                                 self.cus_config.attr_dim)
            self.layer = nn.ModuleList(
                [BertLayer(config) for _ in range(self.cus_config.n_mmalayer)])
            self.classifier = BERTClassificationHead(config)
        else:
            self.classifier = BERTClassificationHeadWithAttribute(
                self.cus_config)

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.init_weights()
Example #11
def test_bert_encoder(gpu, apply_strict):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel, cuda=gpu, train=False)
    dace_outputs0 = dace_model(input.clone())

    dace_model.dace_model.sdfg.apply_transformations_repeated(
        [ConstantFolding, RedundantSecondArray], validate_all=True)

    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs0 - pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5
    assert np.allclose(dace_outputs1, dace_outputs0)
Example #12
    def __init__(self):
        super(Model, self).__init__()
        self.config = BertConfig.from_pretrained(
            './roberta_pretrain/bert_config.json')
        self.ques_encoder = BertModel.from_pretrained(
            './roberta_pretrain/pytorch_model.bin', config=self.config)
        self.context_encoder = BertModel.from_pretrained(
            './roberta_pretrain/pytorch_model.bin', config=self.config)

        self.basicblocks = nn.ModuleList()
        self.n_layers = 3
        trans_heads = 8
        trans_drop = 0.1
        bert_config = BertConfig(hidden_size=self.config.hidden_size,
                                 num_attention_heads=trans_heads,
                                 attention_probs_dropout_prob=trans_drop)

        for layer in range(self.n_layers):
            self.basicblocks.append(BertLayer(bert_config))

        self.num_labels = 2
        self.output = nn.Linear(self.config.hidden_size, self.num_labels)
Example #13
    def __init__(self, config):
        super().__init__()
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
Example #14
def _compute_pytorch(
    model_names,
    batch_sizes,
    slice_sizes,
    dictionary,
    average_over,
    device,
    torchscript,
    fp16,
    no_speed,
    no_memory,
    verbose,
    num_hashes
):

    hidden_size = 64
    num_attention_heads = 2
    intermediate_size = 128

    chunk_length = 64
    num_hashes = num_hashes

    hidden_states = floats_tensor((1, 2 ** 16, hidden_size))

    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")

        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {},
            "memory": {},
        }
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:

            for slice_size in slice_sizes:

                num_buckets = int(2 * slice_size / chunk_length)
                if num_buckets > chunk_length:
                    factorized_num_buckets = num_buckets // 32
                    num_buckets = [32, factorized_num_buckets]

                bert_config = BertConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_dropout_prob=0.0,
                    attention_probs_dropout_prob=0.0,
                )

                reformer_config = ReformerConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    chunk_length=chunk_length,
                    num_hashes=num_hashes,
                    num_buckets=num_buckets
                )

                layers = {
                    'ReformerLayer': ReformerLayer(reformer_config), 
                    'BertLayer': BertLayer(bert_config)
                }
                model = layers[model_name]

                if fp16:
                    model.half()
                model.to(device)
                model.eval()

                if False:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = (
                        hidden_states[0, :slice_size, :]
                        .to(device=device)
                        .repeat(batch_size, 1, 1)
                    )
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            if model_name == "ReformerLayer":
                                inference(sequence, sequence)
                            else:
                                inference(sequence)

                        if not no_memory:
                            # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)

                            trace = start_memory_tracing("transformers")
                            if model_name == "ReformerLayer":
                                inference(sequence, sequence)
                            else:
                                inference(sequence)
                            summary = stop_memory_tracing(trace)

                            if verbose:
                                print_summary_statistics(summary)

                            dictionary[model_name]["memory"][batch_size][
                                slice_size
                            ] = str(summary.total)
                        else:
                            dictionary[model_name]["memory"][batch_size][
                                slice_size
                            ] = "N/A"

                        if not no_speed:
                            print(
                                "Going through model with sequence of shape",
                                sequence.shape,
                            )
                            if model_name == "ReformerLayer":
                                runtimes = timeit.repeat(
                                    lambda: inference(sequence, sequence),
                                    repeat=average_over,
                                    number=3,
                                )
                            else:
                                runtimes = timeit.repeat(
                                    lambda: inference(sequence),
                                    repeat=average_over,
                                    number=3,
                                )
                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                            dictionary[model_name]["results"][batch_size][
                                slice_size
                            ] = average_time
                        else:
                            dictionary[model_name]["results"][batch_size][
                                slice_size
                            ] = "N/A"

                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][
                            slice_size
                        ] = "N/A"
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
    return dictionary
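A hypothetical driver for _compute_pytorch above. The keyword names follow the signature shown; every value is illustrative, and the model name must match a key of the layers dict built inside the function ("BertLayer" or "ReformerLayer").

import torch

results = _compute_pytorch(
    model_names=["BertLayer"],  # must be keys of the layers dict inside the function
    batch_sizes=[1, 2],
    slice_sizes=[128, 256],
    dictionary={},
    average_over=3,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    torchscript=False,
    fp16=False,
    no_speed=False,
    no_memory=True,             # skip the memory-tracing pass
    verbose=False,
    num_hashes=1,
)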
Example #15
    def __init__(self):
        super(BertTokenSoftmaxClf, self).__init__()
        self.bert = BertLayer(BertConfig(hidden_act="relu")).eval()
        self.sm = nn.LogSoftmax(dim=-1)
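The excerpt above stops at the constructor. A minimal hedged completion follows, assuming the module consumes an already-embedded (batch, seq, hidden_size) tensor and that BertLayer's tuple output is indexed at 0; the forward method is an assumption, not part of the original code.

import torch
import torch.nn as nn
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertLayer

class BertTokenSoftmaxClf(nn.Module):
    def __init__(self):
        super(BertTokenSoftmaxClf, self).__init__()
        self.bert = BertLayer(BertConfig(hidden_act="relu")).eval()
        self.sm = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # assumed forward: refine the token representations, then log-softmax per token
        return self.sm(self.bert(x)[0])

model = BertTokenSoftmaxClf()
out = model(torch.randn(2, 8, 768))  # (batch, seq, hidden_size=768)
print(out.shape)  # torch.Size([2, 8, 768])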