def load_model(self, path):
        device = torch.device('cpu')
        V = len(self.vocab.char2id)
        d_model = 64
        d_ff = 256
        h = 4
        n_encoders = 4

        self_attn = MultiHeadedAttention(h=h,
                                         d_model=d_model,
                                         d_k=d_model // h,
                                         d_v=d_model // h,
                                         dropout=0.1)
        feed_forward = FullyConnectedFeedForward(d_model=d_model, d_ff=d_ff)
        position = PositionalEncoding(d_model, dropout=0.1)
        embedding = nn.Sequential(Embeddings(d_model=d_model, vocab=V),
                                  position)

        encoder = Encoder(self_attn=self_attn,
                          feed_forward=feed_forward,
                          size=d_model,
                          dropout=0.1)
        generator = Generator(d_model=d_model, vocab_size=V)
        model = Bert(encoder=encoder,
                     embedding=embedding,
                     generator=generator,
                     n_layers=n_encoders)
        model = model.to(device)
        model_save_path = path
        checkpoint = torch.load(model_save_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])

        return model
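
# load_model() above expects the checkpoint at `path` to contain a 'model_state_dict'
# entry. A minimal sketch (not part of the original code) of how a compatible
# checkpoint could be written; the extra 'epoch' field is hypothetical.
import torch

def save_checkpoint(model, path, epoch=None):
    torch.save({'model_state_dict': model.state_dict(),
                'epoch': epoch}, path)
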
def main():
    BERT_MODEL_PATH = '../../models/bert_jp/'

    # start bert server
    commands = [
        'bert-serving-start', '-model_dir', BERT_MODEL_PATH, '-num_worker=1',
        '-cpu'
    ]
    p = subprocess.Popen(commands,
                         shell=False,
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)

    # start bert client
    bert = Bert(bert_model_path=BERT_MODEL_PATH, client_ip='0.0.0.0')

    # build train features
    train_dataset = pd.read_csv('../../data/processed/train_dataset.csv')
    train_vectors, train_targets = build_features(train_dataset, bert)
    np.save('../../data/features/train_vectors', train_vectors)
    np.save('../../data/features/train_targets', train_targets)

    # build test features
    test_dataset = pd.read_csv('../../data/processed/test_dataset.csv')
    test_vectors, test_targets = build_features(test_dataset, bert)
    np.save('../../data/features/test_vectors', test_vectors)
    np.save('../../data/features/test_targets', test_targets)

    p.terminate()
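
# build_features() used above is not shown. A minimal sketch of what it could look
# like, assuming the dataset has hypothetical 'text' and 'label' columns; the sketch
# talks to the bert-serving server directly via bert_serving.client.BertClient and
# takes the client IP instead of the custom Bert wrapper passed in the call above.
import numpy as np
from bert_serving.client import BertClient

def build_features(dataset, client_ip='0.0.0.0'):
    bc = BertClient(ip=client_ip)                  # connect to the running bert-serving-start server
    vectors = bc.encode(dataset['text'].tolist())  # one fixed-size sentence vector per row
    targets = dataset['label'].to_numpy()
    return vectors, targets
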
def train():
    config = BertConfig()

    logger = get_logger(config.log_path)

    model = Bert(config)

    device = config.device

    train_dataset = BertDataSet(config.base_config.train_data_path)
    dev_dataset = BertDataSet(config.base_config.dev_data_path)

    train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    dev_dataloader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss()

    model.to(device)
    model.train()

    best_acc = 0.

    for epoch in range(config.epochs):
        for i, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            input_ids, token_type_ids, attention_mask, labels = (
                batch[0].to(device), batch[1].to(device),
                batch[2].to(device), batch[3].to(device))
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                preds = torch.argmax(logits, dim=1)
                acc = torch.sum(preds == labels)*1. / len(labels)
                logger.info("TRAIN: epoch: {} step: {} acc: {}, loss: {}".format(epoch, i, acc, loss.item()))

        acc, cls_report = dev(model, dev_dataloader, config)
        logger.info("DEV: epoch: {} acc: {}".format(epoch, acc))
        logger.info("DEV classification report:\n{}".format(cls_report))

        if acc > best_acc:
            torch.save(model.state_dict(), config.model_path)
            best_acc = acc

    test_dataset = BertDataSet(config.base_config.test_data_path)
    test_dataloader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)
    best_model = Bert(config)
    best_model.load_state_dict(torch.load(config.model_path))
    acc, cls_report = dev(best_model, test_dataloader, config)
    logger.info("TEST: ACC:{}".format(acc))
    logger.info("TEST classification report:\n{}".format(cls_report))
def test_bert(self, user_account, threshold, test_df, bert):
    if not bert:
        bert = Bert(model_path, class_names)
        bert.load_model()
    prediction = bert.predict(user_account, threshold, test_df)
    return prediction
Example #5
                                 or metrics['f1'] < f1):
                break
            if metrics['f1'] > f1:
                f1 = metrics['f1']

        torch.save(model.state_dict(), config.model_save_path)


def inference(config, model):
    res, inputs_tags = evaluate(config, model)
    print(res)
    for i in inputs_tags:
        print(i)


if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = '6'
    torch.set_num_threads(10)
    set_seed()
    config = Config()
    model = Bert(config)

    if config.inference or config.resume_train:
        checkpoint = torch.load(config.model_save_base)
        model.load_state_dict(checkpoint)

    if config.inference:
        inference(config, model)
    else:
        train(config, model)
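
# set_seed() above is not shown. A minimal sketch of a typical seeding helper; the
# default seed of 42 is a hypothetical choice.
import random
import numpy as np
import torch

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
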
def run(self):
    global bert
    bert = Bert(self.model_path, self.class_names)
    bert.load_model()
    print("BERT LOADED")
    self.signals.result.emit(True)
Example #7
def train():
    config = KDConfig()

    logger = get_logger(config.log_path, "train_KD")

    device = config.device

    # Load the BERT model to act as the teacher
    logger.info("load bert .....")
    bert = Bert(config.bert_config)
    bert.load_state_dict(torch.load(config.bert_config.model_path))
    bert.to(device)
    bert.eval()

    # Freeze the BERT parameters
    for name, p in bert.named_parameters():
        p.requires_grad = False

    # Load the TextCNN model to act as the student
    textcnn = TextCNN(config.textcnn_config)
    textcnn.to(device)
    textcnn.train()

    # Load the train/dev datasets
    logger.info("load train/dev data .....")
    train_loader = DataLoader(KDdataset(config.base_config.train_data_path),
                              batch_size=config.batch_size,
                              shuffle=True)
    dev_loader = DataLoader(KDdataset(config.base_config.dev_data_path),
                            batch_size=config.batch_size,
                            shuffle=False)

    optimizer = Adam(textcnn.parameters(), lr=config.lr)

    # Start training
    logger.info("start training .....")
    best_acc = 0.
    for epoch in range(config.epochs):
        for i, batch in enumerate(train_loader):
            cnn_ids, labels, input_ids, token_type_ids, attention_mask = (
                batch[0].to(device), batch[1].to(device), batch[2].to(device),
                batch[3].to(device), batch[4].to(device))
            optimizer.zero_grad()
            students_output = textcnn(cnn_ids)
            teacher_output = bert(input_ids, token_type_ids, attention_mask)
            loss = loss_fn_kd(students_output, labels, teacher_output,
                              config.T, config.alpha)
            loss.backward()
            optimizer.step()

            # Log progress
            if i % 100 == 0:
                labels = labels.data.cpu().numpy()
                preds = torch.argmax(students_output, dim=1)
                preds = preds.data.cpu().numpy()
                acc = np.sum(preds == labels) * 1. / len(preds)
                logger.info(
                    "TRAIN: epoch: {} step: {} acc: {} loss: {} ".format(
                        epoch + 1, i, acc, loss.item()))

        acc, table = dev(textcnn, dev_loader, config)

        logger.info("DEV: acc: {} ".format(acc))
        logger.info("DEV classification report: \n{}".format(table))

        if acc > best_acc:
            torch.save(textcnn.state_dict(), config.model_path)
            best_acc = acc

    logger.info("start testing ......")
    test_loader = DataLoader(KDdataset(config.base_config.test_data_path),
                             batch_size=config.batch_size,
                             shuffle=False)
    best_model = TextCNN(config.textcnn_config)
    best_model.load_state_dict(torch.load(config.model_path))
    acc, table = dev(best_model, test_loader, config)

    logger.info("TEST acc: {}".format(acc))
    logger.info("TEST classification report:\n{}".format(table))
Example #8
from bert import Bert

bert = Bert()
verb, probability = bert.getBestPredicateAndProbability("She", "the ball")
print("'" + verb + "'" + " " + str(probability))

item, probability = bert.combineTo("iron", "hammer")
print("'" + item + "'" + " " + str(probability))
Example #10
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs=1):

        self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        self.device = 'cuda' if options.enable_gpu else ''

        # Load ELMo if the option is set
        if options.elmo is not None:
            from elmo import ELMo
            self.elmo = ELMo(options.elmo, options.elmo_gamma,
                             options.elmo_learn_gamma)
            self.elmo.init_weights(model)
        else:
            self.elmo = None

        # Load Albert if the option is set
        if options.albert:
            from albert import Albert
            self.albert = Albert(
                pretrained_model=options.albert_pretrained_model)
        else:
            self.albert = None

        # Load Bert if the option is set
        if options.bert:
            from bert import Bert
            self.bert = Bert(options.bert_pretrained_model,
                             options.bert_pretrained_config,
                             options.bert_tokenizer)
        else:
            self.bert = None

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        self.irels = rels
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        # This part got ugly - TODO: refactor
        if not options.predict:
            self.external_embedding = defaultdict(lambda: {})

            if options.ext_word_emb_file and options.word_emb_size > 0:
                # Load pre-trained word embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=self.words.keys())
                    self.external_embedding["words"].update(embeddings)

            if options.ext_char_emb_file and options.char_emb_size > 0:
                # Load pre-trained character embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)

            if options.ext_emb_dir:
                # For every language, load the data for the word and character
                # embeddings from a directory.
                for lang in langs:
                    if options.word_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.words.keys())
                        self.external_embedding["words"].update(embeddings)

                    if options.char_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.chars,
                            chars=True)
                        self.external_embedding["chars"].update(embeddings)

            self.init_lookups(options)

        # Sartiano
        elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
        albert_emb_size = self.albert.emb_dim if self.albert else 0
        bert_emb_size = self.bert.emb_dim if self.bert else 0

        self.pretrained_embeddings_size = (elmo_emb_size + albert_emb_size +
                                           bert_emb_size)

        self.lstm_input_size = (
            options.word_emb_size + self.pretrained_embeddings_size +
            options.pos_emb_size + options.tbank_emb_size + 2 *
            (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        )
        print("Word-level LSTM input size: " + str(self.lstm_input_size),
              file=sys.stderr)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[
            1] if options.word_emb_size > 0 else None
        paddingElmoVec = dy.inputTensor(np.zeros(
            (self.elmo.emb_dim, 1)), self.device) if self.elmo else None
        paddingAlbertVec = dy.inputTensor(np.zeros(
            (self.albert.emb_dim, 1)), self.device) if self.albert else None
        paddingBertVec = dy.inputTensor(np.zeros(
            (self.bert.emb_dim, 1)), self.device) if self.bert else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr(
        ) if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[
            0] if options.tbank_emb_size > 0 else None

        paddings = dy.concatenate(
            list(
                filter(None, [
                    paddingWordVec, paddingElmoVec, paddingAlbertVec,
                    paddingBertVec, paddingPosVec, paddingCharVec,
                    paddingTbankVec
                ])))
        self.paddingVec = dy.tanh(self.word2lstm.expr() * paddings +
                                  self.word2lstmbias.expr())

        self.empty = self.paddingVec if self.nnvecs == 1 else\
            dy.concatenate([self.paddingVec for _ in range(self.nnvecs)])

    def getWordEmbeddings(self,
                          sentence,
                          train,
                          options,
                          test_embeddings=defaultdict(lambda: {})):
        """
        Fills root.vec of tokens in :param sentence: with corresponding embedding.

        :param train: boolean whether training or predicting.
        :return: a Sentence object representing the sentence.
        """

        sentence_representation = None

        if self.elmo:
            # Get full text of sentence - excluding root, which is loaded differently
            # for transition and graph-based parsers.
            if options.graph_based:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[:-1]])

            sentence_representation = \
                self.elmo.get_sentence_representation(sentence_text)

        if self.albert:
            # Get full text of sentence - excluding root, which is loaded differently
            # for transition and graph-based parsers.
            if options.graph_based:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[:-1]])

            sentence_representation = \
                self.albert.get_sentence_representation(sentence_text)

        if self.bert:
            # Get full text of sentence - excluding root, which is loaded differently
            # for transition and graph-based parsers.
            if options.graph_based:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[:-1]])

            sentence_representation = \
                self.bert.get_sentence_representation(sentence_text)

        for i, root in enumerate(sentence):
            root.vecs = defaultdict(
                lambda: None
            )  # all vecs are None by default (possibly a little risky?)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 +
                                                               word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[
                            root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm], self.device)
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup

                if (treebank_id not in self.treebanks
                        and treebank_id in utils.reverse_iso_dict
                        and utils.reverse_iso_dict[treebank_id] in self.treebanks):
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                if treebank_id is not None:
                    root.vecs["treebank"] = self.treebank_lookup[
                        self.treebanks[treebank_id]]
            # lookahead
            # self.pretrained_embeddings = ''
            if self.elmo:
                # lookahead
                # self.pretrained_embeddings = 'elmo'
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["elmo"] = sentence_representation[i]
                else:
                    # TODO
                    root.vecs["elmo"] = dy.inputTensor(
                        np.zeros((self.elmo.emb_dim, 1)), self.device)

            if self.albert:
                # lookahead
                # self.pretrained_embeddings = 'albert'
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["albert"] = sentence_representation[i]
                else:
                    # TODO
                    root.vecs["albert"] = dy.inputTensor(
                        np.zeros((self.albert.emb_dim, 1)), self.device)

            if self.bert:
                # lookahead
                # self.pretrained_embeddings = 'bert'
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["bert"] = sentence_representation[i]
                else:
                    # TODO
                    # dy.zeros() doesn't have a device='cuda' parameter
                    #root.vecs["bert"] = dy.zeros(self.bert.emb_dim)
                    root.vecs["bert"] = dy.inputTensor(
                        np.zeros((self.bert.emb_dim, 1)), self.device)

            root.vec = dy.concatenate(
                list(
                    filter(None, [
                        root.vecs["word"], root.vecs["elmo"],
                        root.vecs["albert"], root.vecs["bert"],
                        root.vecs["pos"], root.vecs["char"],
                        root.vecs["treebank"]
                    ])))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

        return sentence_representation

    def get_char_vector(self, root, train, test_embeddings_chars={}):

        if root.char_rep == "*root*":  # no point running a character analysis over this placeholder token
            return self.charPadding.expr(
            )  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(
                        dy.inputVector(test_embeddings_chars[char],
                                       self.device))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):

        if self.external_embedding["words"]:
            print('Initialising %i word vectors with external embeddings' %
                  len(self.external_embedding["words"]),
                  file=sys.stderr)
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"]
                       [word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print(
                'No word external embeddings found: all vectors initialised randomly',
                file=sys.stderr)

        if self.external_embedding["chars"]:
            print('Initialising %i char vectors with external embeddings' %
                  len(self.external_embedding["chars"]),
                  file=sys.stderr)
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"]
                       [char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print(
                'No character external embeddings found: all vectors initialised randomly',
                file=sys.stderr)
h = 8

self_attn = MultiHeadedAttention(h=h,
                                 d_model=d_model,
                                 d_k=d_model // h,
                                 d_v=d_model // h,
                                 dropout=0.)
feed_forward = FullyConnectedFeedForward(d_model=d_model, d_ff=1024)
embedding = Embeddings(d_model=d_model, vocab=V)

encoder = Encoder(self_attn=self_attn,
                  feed_forward=feed_forward,
                  size=d_model,
                  dropout=0.)
generator = Generator(d_model=d_model, vocab_size=V)
model = Bert(encoder=encoder,
             embedding=embedding,
             generator=generator,
             n_layers=4)

data_iter = create_batch(30, 5)
for i, batch in enumerate(data_iter):
    x = embedding(batch.src)
    y = self_attn(x, x, x, batch.src_mask)

    masked_src = batch.src.masked_fill(
        batch.src_mask.squeeze(-2) == 0, mask_token)
    x2 = embedding(masked_src)
    y2 = self_attn(x2, x2, x2, batch.src_mask)
    print(y)
Example #12
import os
from os.path import join, dirname
from datetime import datetime

import spacy
import neuralcoref
from dotenv import load_dotenv
from bert import Bert
from chatbot import Chatbot
from controller import Controller
from flask import Flask, request, abort, jsonify, render_template
from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search
from spacy.lang.en.stop_words import STOP_WORDS

app = Flask(__name__)

# Load all environment variables
dotenv_path = join(dirname(__file__), '.env')
load_dotenv(dotenv_path)

bert = Bert()
chatbot = Chatbot(os.getenv('ChatbotModelName'), os.getenv('ChatbotDataFile'),
                  int(os.getenv('ChatbotNbIterations')))
controller = Controller()

nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)

ES_HOST = os.getenv('Host')
ES_PORT = os.getenv('Port')
ES_INDEX = os.getenv('Index')

sessions = []


@app.route('/', methods=['POST'])