Example #1
def maximize_coverage(source: CoNLL2003Dataset, target: CoNLL2003Dataset,
                      n: int, tokenizer: BertTokenizer) -> CoNLL2003Dataset:
    MAX_SEQ_LEN = 150

    target_vocab = set()
    for document in target.documents:
        for sentence in document:
            for token, ner_tag in sentence:
                if token is None:
                    continue
                word_pieces = tokenizer.tokenize(token)
                target_vocab.update(word_pieces)

    annotated_train_sentences = []
    for document in source.documents:
        annotated_train_sentences += document

    tokenized_train_sentences = []
    for i, annotated_sentence in enumerate(annotated_train_sentences):
        sentence_word_pieces = []
        for token, _ in annotated_sentence:
            if token is None:
                continue
            word_pieces = tokenizer.tokenize(token)
            sentence_word_pieces += word_pieces
        sentence_word_pieces = set(sentence_word_pieces[:MAX_SEQ_LEN])
        coverage = len(target_vocab & sentence_word_pieces)
        tokenized_train_sentences.append({
            "id": i,
            "set": sentence_word_pieces,
            "coverage": coverage,
        })

    selected_train_sentences = []
    for i in range(n):
        tokenized_train_sentences.sort(key=lambda s: s["coverage"])
        best_sentence = tokenized_train_sentences.pop()
        selected_train_sentences.append(
            annotated_train_sentences[best_sentence["id"]])
        new_word_pieces = target_vocab & best_sentence["set"]
        for new_word_piece in new_word_pieces:
            target_vocab.remove(new_word_piece)
            for j in range(len(tokenized_train_sentences)):
                if new_word_piece in tokenized_train_sentences[j]["set"]:
                    tokenized_train_sentences[j]["set"].remove(new_word_piece)
                    tokenized_train_sentences[j]["coverage"] -= 1

    output = deepcopy(source)
    output.documents = [[sentence] for sentence in selected_train_sentences]
    return output
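
maximize_coverage above is a greedy maximum-coverage selection: at each step it takes the source sentence whose word pieces cover the most still-uncovered target word pieces, then removes those pieces from further consideration. A minimal self-contained sketch of that idea (hypothetical helper name, not part of the original code):

def greedy_select(candidate_sets, target_vocab, n):
    """Greedily pick up to n candidate sets that cover the most of target_vocab."""
    remaining = set(target_vocab)
    pool = list(enumerate(candidate_sets))
    chosen = []
    for _ in range(min(n, len(pool))):
        # pick the candidate covering the most still-uncovered items
        best_idx, best_set = max(pool, key=lambda p: len(set(p[1]) & remaining))
        chosen.append(best_idx)
        remaining -= set(best_set)
        pool = [p for p in pool if p[0] != best_idx]
    return chosen

print(greedy_select([{"a", "b"}, {"b", "c", "d"}, {"d"}], {"a", "b", "c", "d"}, 2))
# -> [1, 0]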
Example #2
class NERDataSet(Dataset):
    def __init__(self, data_path, config, add_cls=False, add_sep=False):
        self.config = config
        self.sents, self.tags = load_tsv(data_path,
                                         add_cls=add_cls,
                                         add_sep=add_sep)
        self.tokenizer = BertTokenizer(vocab_file=config.vocab_path,
                                       do_lower_case=False)
        self.tokenize()

    def __len__(self):
        return len(self.sents)

    def tokenize(self):
        alltok_sents, alltok_tags = [], []
        for sent_words, sent_tags in zip(self.sents, self.tags):
            tok_sent, tok_tag = [], []
            for w, t in zip(sent_words, sent_tags):  # tokenize the words
                tokens = self.tokenizer.tokenize(w)
                tok_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                tok_tags = [t] + [self.config.piece_tag] * (len(tokens) - 1)
                ttags_ids = [self.config.tag2idx[tt] for tt in tok_tags]
                tok_sent.extend(tok_ids)
                tok_tag.extend(ttags_ids)
            alltok_sents.append(tok_sent)
            alltok_tags.append(tok_tag)
        self.tok_sents = alltok_sents
        self.tok_tags = alltok_tags

    def __getitem__(self, idx):
        return self.tok_sents[idx], self.tok_tags[idx]
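
The key step in NERDataSet.tokenize is aligning word-level tags with word pieces: the first piece of each word keeps the original tag and every continuation piece gets a filler tag (config.piece_tag above). A small sketch of that rule, assuming the Hugging Face transformers package and using 'X' as a stand-in filler tag:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
words = ["Johanson", "visited", "Berlin"]
tags = ["B-PER", "O", "B-LOC"]

tok_ids, tok_tags = [], []
for word, tag in zip(words, tags):
    pieces = tokenizer.tokenize(word)                   # e.g. ['Johan', '##son']
    tok_ids.extend(tokenizer.convert_tokens_to_ids(pieces))
    tok_tags.extend([tag] + ["X"] * (len(pieces) - 1))  # one tag per word piece
print(list(zip(tokenizer.convert_ids_to_tokens(tok_ids), tok_tags)))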
Example #3
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'PATH_TO_BERTJPN'

        self.cp = 'checkpoint/jp/cp_step_710000.pt'
        self.opt = 'checkpoint/jp/opt_step_710000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, String type article
        :return: type list, punctuation seperated sentences
        """
        def remove_newline(x):
            x = x.replace('\n', '')
            return x

        def remove_blank(x):
            x = x.replace(' ', '')
            return x

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x

        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        # split on sentence-ending punctuation (。！？) unless it is followed by a closing quote 」
        src_line = re.split('。(?!」)|！(?!」)|？(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation seperated sentences
        :return: token: type list, numberized tokens
                 token_id: type list, tokens
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
Example #4
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False,
                 is_tokenized=False):
        self.juman_tokenizer = JumanTokenizer()
        self.model = BertModel.from_pretrained(bert_path)
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False, do_basic_tokenize=False)
        self.use_cuda = use_cuda
        self.is_tokenized = is_tokenized

    def _preprocess_text(self, text):
        return text.replace(" ", "")  # for Juman


    def get_sentence_embedding(self, text, pooling_layer=-2, pooling_strategy="REDUCE_MEAN"):

        if not self.is_tokenized:
            preprocessed_text = self._preprocess_text(text)
            tokens = self.juman_tokenizer.tokenize(preprocessed_text)
            bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        else:
            bert_tokens = self.bert_tokenizer.tokenize(" ".join(text))

        ids = self.bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + bert_tokens[:126] + ["[SEP]"]) # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        self.model.eval()
        with torch.no_grad():
            all_encoder_layers, _ = self.model(tokens_tensor)

        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError("specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}")
Example #5
def get_sample_bert_token_id_seq(bert_tokenizer: BertTokenizer, left_seq_str,
                                 right_seq_str, max_seq_len):
    left_bert_token_seq = bert_tokenizer.tokenize(left_seq_str)
    right_bert_token_seq = bert_tokenizer.tokenize(right_seq_str)

    if len(right_bert_token_seq) + 3 > max_seq_len:
        right_bert_token_seq = right_bert_token_seq[:max_seq_len - 3]

    if len(right_bert_token_seq) + len(left_bert_token_seq) + 3 > max_seq_len:
        left_bert_token_seq = left_bert_token_seq[:max_seq_len -
                                                  len(right_bert_token_seq) -
                                                  3]

    bert_token_seq = ['[CLS]'] + left_bert_token_seq + [
        '[SEP]'
    ] + right_bert_token_seq + ['[SEP]']
    # print(bert_token_seq)
    bert_token_id_seq = bert_tokenizer.convert_tokens_to_ids(bert_token_seq)
    return bert_token_id_seq
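
A hypothetical usage sketch of the function above, assuming a Hugging Face BertTokenizer; it shows the [CLS] left [SEP] right [SEP] layout and the truncation that favours the right-hand sequence:

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
ids = get_sample_bert_token_id_seq(tok,
                                   "what is the capital of france",
                                   "paris is the capital of france",
                                   max_seq_len=16)
print(tok.convert_ids_to_tokens(ids))
# ['[CLS]', 'what', ..., '[SEP]', 'paris', ..., '[SEP]']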
Example #6
class BertWithJumanModel():
    """学習済みBertを使うやつ Fork:https://github.com/yagays/pytorch_bert_japanese"""
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        self.juman_tokenizer = JumanTokenizer()
        self.model = BertModel.from_pretrained(bert_path)
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        return text.replace(" ", "")

    def get_sentence_embedding(self,
                               text,
                               pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        preprocessed_text = self._preprocess_text(text)
        # split long input into chunks of at most 2048 characters so Juman can handle it
        chunk_len = 2048
        result = [
            preprocessed_text[idx:idx + chunk_len]
            for idx in range(0, len(preprocessed_text), chunk_len)
        ]
        tokens = []
        for t in result:
            tokens += self.juman_tokenizer.tokenize(t)
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        self.model.eval()
        with torch.no_grad():
            all_encoder_layers, _ = self.model(tokens_tensor)

        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError(
                "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}"
            )
Example #7
def get_words_for_blank_slow_decode(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer):
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)


    mask_positions = []
    tokenized_text = tokenizer.tokenize(text)
    top_words_all = []
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    while mask_positions:
        top_words = []
        # Convert tokens to vocab indices
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([token_ids])

        # Call BERT to calculate unnormalized probabilities for all pos
        model.eval()
        predictions = model(tokens_tensor)

        # get predictions
        mask_preds = predictions[0, mask_positions, :]

        candidates = [] #(word, prob)
        for mask_pos in mask_positions:
            mask_preds = predictions[0, mask_pos, :]

            top_idxs = mask_preds.detach().numpy().argsort()[::-1]
            top_idx = top_idxs[0]
            top_prob = mask_preds[top_idx]
            top_word = tokenizer.ids_to_tokens[top_idx]
            candidates.append((top_word, top_prob.detach().item()))
            top_words_pos = []
            for i in top_idxs[:20]:
                top_words_pos.append((tokenizer.ids_to_tokens[i], mask_preds[i].detach().item()))
            top_words.append(top_words_pos)
        best_candidate = max(candidates, key = lambda x: x[1])
        best_pos = mask_positions[candidates.index(best_candidate)]

        tokenized_text[best_pos] = best_candidate[0]
        mask_positions = [i for i in mask_positions if i != best_pos]

        top_words_all.append(top_words[candidates.index(best_candidate)])

    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    return (pred_sent, top_words_all)
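
For comparison, a minimal masked-word prediction sketch written against the current Hugging Face transformers API (an assumption; the snippet above follows the older pytorch-pretrained-bert convention where the model returns raw logits directly):

import torch
from transformers import BertForMaskedLM, BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
mlm = BertForMaskedLM.from_pretrained("bert-base-uncased")
mlm.eval()

inputs = tok("The capital of France is [MASK] .", return_tensors="pt")
with torch.no_grad():
    logits = mlm(**inputs).logits
mask_pos = (inputs["input_ids"][0] == tok.mask_token_id).nonzero(as_tuple=True)[0]
top_ids = logits[0, mask_pos].argmax(dim=-1).tolist()
print(tok.convert_ids_to_tokens(top_ids))  # most likely token for each [MASK]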
Example #8
class FedPredictDataset(Dataset):
    def __init__(self,
                 texts,
                 vocab_path,
                 max_seq_length=512,
                 vocab='finance-uncased'):
        self.texts = texts
        self.dict_labels = {'lower': 0, 'maintain': 1, 'raise': 2}

        self.max_seq_length = max_seq_length
        self.vocab = vocab
        if self.vocab == 'finance-uncased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=True,
                                           do_basic_tokenize=True)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        tokenized_review = self.tokenizer.tokenize(self.texts[index])

        if len(tokenized_review) > self.max_seq_length:
            tokenized_review = tokenized_review[:self.max_seq_length]

        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review)

        mask_input = [1] * len(ids_review)

        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding

        input_type = [0] * self.max_seq_length

        assert len(ids_review) == self.max_seq_length
        assert len(mask_input) == self.max_seq_length
        assert len(input_type) == self.max_seq_length

        ids_review = torch.tensor(ids_review)
        mask_input = torch.tensor(mask_input)
        input_type = torch.tensor(input_type)

        input_feature = {
            "token_type_ids": input_type,
            "attention_mask": mask_input,
            "input_ids": ids_review
        }

        return input_feature
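
The padding scheme in __getitem__ is worth isolating: real token positions get 1 in the attention mask, padding positions get 0, and both lists are padded to the same fixed length. A toy sketch (not from the original code):

def pad_and_mask(ids, max_seq_length):
    ids = ids[:max_seq_length]
    mask = [1] * len(ids)
    pad = [0] * (max_seq_length - len(ids))
    return ids + pad, mask + pad

ids, mask = pad_and_mask([101, 2023, 2003, 102], 8)
print(ids)   # [101, 2023, 2003, 102, 0, 0, 0, 0]
print(mask)  # [1, 1, 1, 1, 0, 0, 0, 0]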
Example #9
def _bert_embed_sentence(sentence, bert_model: BertModel, bert_tokenizer: BertTokenizer):
    text = "[CLS] {} [SEP]".format(sentence)
    tokenized_text = bert_tokenizer.tokenize(text)
    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * len(indexed_tokens)
    segments_tensors = torch.tensor([segments_ids])

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokens_tensor = tokens_tensor.to(device)
    segments_tensors = segments_tensors.to(device)

    with torch.no_grad():
        encoded_layers, _ = bert_model(tokens_tensor, segments_tensors, output_all_encoded_layers=False)

    # Embedding of the [CLS] token
    return encoded_layers[0][0]
Example #10
def convert_data2(path1, path2, max_length, number, seq1, seq2):
    """Convert to IDs, pad, and then add the [CLS]/[SEP] tokens."""
    tokenizer = BertTokenizer('./model/bert-base-chinese/vocab.txt')
    input_id = []
    input_mask = []
    segment_id = []
    # number = 0
    print(len(seq1))

    for i in range(number):
        tokens_a = tokenizer.tokenize(seq1[i])
        tokens_b = tokenizer.tokenize(seq2[i])
        # print(seq2[i])
        # print(tokens_b)
        while True:
            if (len(tokens_a) + len(tokens_b)) <= max_length - 3:
                break
            else:
                # print(tokens_b)
                # tokens_b.pop()
                tokens_a = tokens_a[: int((max_length - 3) * len(tokens_a)/(len(tokens_a) + len(tokens_b)))]
                tokens_b = tokens_b[: int((max_length - 3) * len(tokens_b)/(len(tokens_a) + len(tokens_b)))]
        # add [CLS] and [SEP] at the start and end
        tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
        tokens = tokens_a + tokens_b + ['[SEP]']
        input_id_ = tokenizer.convert_tokens_to_ids(tokens)
        segment_id_ = [0] * len(tokens_a) + [1] * (len(tokens_b) + 1)
        input_mask_ = [1] * len(tokens)
        # segment_id distinguishes tokens_a from tokens_b
        # input_mask distinguishes real tokens from padding
        padding_ = [0] * (max_length - len(tokens))
        # every input fed to BERT must be padded to max_length
        input_id_ += padding_
        segment_id_ += padding_
        input_mask_ += padding_
        # append each sequence; final shape is [sentence_num, max_length]
        input_id.append(input_id_)
        input_mask.append(input_mask_)
        segment_id.append(segment_id_)

    return input_id, input_mask, segment_id
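
The while loop above shrinks the pair proportionally to each side's length until it fits in max_length - 3 (three slots are reserved for [CLS] and the two [SEP] tokens). A self-contained sketch of that rule, computing the total once per iteration (a slight simplification of the inline version):

def truncate_pair(tokens_a, tokens_b, max_length):
    while len(tokens_a) + len(tokens_b) > max_length - 3:
        total = len(tokens_a) + len(tokens_b)
        tokens_a = tokens_a[:int((max_length - 3) * len(tokens_a) / total)]
        tokens_b = tokens_b[:int((max_length - 3) * len(tokens_b) / total)]
    return tokens_a, tokens_b

a, b = truncate_pair(list("abcdefghij"), list("klmnopqrst"), 13)
print(len(a), len(b))  # 5 5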
Example #11
def predict_word(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer, tgt_word: str, tgt_pos: int):
    # print('Template sentence: ', text)
    mask_positions = []

    # insert mask tokens
    tokenized_text = tokenizer.tokenize(text)

    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    # Convert tokens to vocab indices
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([token_ids])

    # Call BERT to calculate unnormalized probabilities for all pos
    model.eval()
    predictions = model(tokens_tensor)

    # normalize by softmax
    predictions = F.softmax(predictions, dim=2)

    # For the target word position, get probabilities for each word of interest
    normalized = predictions[0, tgt_pos, :]
    out_prob = normalized[tokenizer.vocab[tgt_word]].item()

    # Also, fill in all blanks by max prob, and print for inspection
    for mask_pos in mask_positions:
        predicted_index = torch.argmax(predictions[0, mask_pos, :]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        tokenized_text[mask_pos] = predicted_token

    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"
    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    # print(pred_sent)
    return out_prob, pred_sent
Example #12
class mod_eventclass(BasePlugin):
    """ Web Scraping plugin: mod_eventclass
    For classifying news events.
    """
    minArticleLengthInChars = 400
    pluginType = Types.MODULE_DATA_PROCESSOR  # implies data post-processor

    dataFrame = None
    device = None
    model = None
    sentencesColList = [
        'url', 'sentence', 'sentence_no', 'neutral_prob', 'positive_prob',
        'negative_prob'
    ]
    sentencesRec = None

    def __init__(self):
        """ Initialize the object
        """
        super().__init__()

    def additionalConfig(self, sessionHistoryObj):
        """ Perform additional configuration that is specific to this plugin.

        :param sessionHistoryObj: The session history object to be used by this plugin
         for putting items into the data processing competed queue.
        :return:
        """
        self.workDir = self.app_config.data_dir
        self.sessionHistDB = sessionHistoryObj
        self.pretuned_modelfile = self.app_config.checkAndSanitizeConfigString(
            'plugins', 'mod_eventclass_modelfile')
        self.model_weights_path = self.app_config.checkAndSanitizeConfigString(
            'plugins', 'mod_eventclass_weightspath')
        self.vocab_path = self.app_config.checkAndSanitizeConfigString(
            'plugins', 'mod_eventclass_vocab_path')
        self.labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
        # TODO: fix model load error:
        self.setupModel()
        self.sentencesRec = pd.DataFrame(np.zeros(
            (1, len(self.sentencesColList)), dtype=np.unicode_),
                                         columns=self.sentencesColList)
        # convert last 4 into float32 dtype
        for colname in [
                "sentence_no", "neutral_prob", "positive_prob", "negative_prob"
        ]:
            self.sentencesRec[colname] = pd.to_numeric(
                self.sentencesRec[colname])

    def setupModel(self):
        """ Load the classification model.
        """
        num_labels = len(self.labels)
        vocab_type = "finance-uncased"
        self.max_seq_length = 256
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.model = BertClassification(weight_path=self.model_weights_path,
                                        num_labels=num_labels,
                                        vocab=vocab_type)
        self.model.load_state_dict(
            torch.load(self.pretuned_modelfile, map_location=self.device))
        self.model.to(self.device)
        self.tokenizer = BertTokenizer(vocab_file=self.vocab_path,
                                       do_lower_case=True,
                                       do_basic_tokenize=True)

    def processDataObj(self, newsEventObj):
        """ Process given data object by this plugin.

        :param newsEventObj: The NewsEvent object to be classified.
        :type newsEventObj: NewsEvent
        """
        assert type(newsEventObj) == NewsEvent
        # Do not proceed if the article has already been classified, i.e. it contains scores
        if newsEventObj.getClassification() is None:
            # TODO: lock file to avoid conflicting writes, release lock at the end of the method
            logger.debug(
                f"Started news event classification for data in: {newsEventObj.getFileName()}"
            )
            classificationObj = self.classifyText(newsEventObj.getText(),
                                                  newsEventObj.getURL())
            # put classification field in NewsEvent document:
            newsEventObj.setClassification(classificationObj)
            # prepare filename:
            fileNameWOExt = newsEventObj.getFileName().replace('.json', '')
            # save document to file:
            newsEventObj.writeFiles(fileNameWOExt, '', saveHTMLFile=False)
            logger.info(
                f"Completed classifying news event in: {fileNameWOExt} as: {classificationObj}"
            )

    def classifyText(self, textValue, url):
        """
        Examine and classify the text from the document and return classification scores text.

        :param textValue: Text to be examined and classified.
        :type textValue: str
        :return: Classification scores
        :rtype: dict{str:float}
        """
        sentenceDF = None
        classificationScores = {
            'positive': 0.0,
            'neutral': 0.0,
            'negative': 0.0
        }
        try:
            logger.debug(
                f'Classifying using finbert model for text of length {len(textValue)}'
            )
            if len(textValue) > self.minArticleLengthInChars:
                thisRec = self.sentencesRec.copy(deep=True)
                thisRec['url'] = url
                sentences = sent_tokenize(textValue.lower())
                self.model.eval()
                for index, sent in enumerate(sentences):
                    thisRec['sentence'] = sent
                    thisRec['sentence_no'] = index
                    # apply model on the sentence to get classification scores
                    [neutralProb, positiveProb,
                     negativeProb] = self.classifySentences(sent)
                    thisRec['neutral_prob'] = neutralProb
                    thisRec['positive_prob'] = positiveProb
                    thisRec['negative_prob'] = negativeProb
                    if sentenceDF is None:
                        sentenceDF = thisRec
                    else:
                        sentenceDF = sentenceDF.append(thisRec)
                aggscores = sentenceDF.groupby('url').agg({
                    'neutral_prob': 'sum',
                    'positive_prob': 'sum',
                    'negative_prob': 'sum'
                })
                classificationScores = {
                    'positive': aggscores['positive_prob'][0],
                    'neutral': aggscores['neutral_prob'][0],
                    'negative': aggscores['negative_prob'][0]
                }
        except Exception as e:
            print("Error getting sentence classification:", e)
        return (classificationScores)

    def classifySentences(self, sent):
        """ Classify one text sentence at a time.
        """
        tokenized_sent = self.tokenizer.tokenize(sent)
        if len(tokenized_sent) > self.max_seq_length:
            tokenized_sent = tokenized_sent[:self.max_seq_length]
        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_sent)
        mask_input = [1] * len(ids_review)
        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding
        input_type = [0] * self.max_seq_length
        input_ids = torch.tensor(ids_review).to(self.device).reshape(-1, 256)
        attention_mask = torch.tensor(mask_input).to(self.device).reshape(
            -1, 256)
        token_type_ids = torch.tensor(input_type).to(self.device).reshape(
            -1, 256)
        with torch.set_grad_enabled(False):
            outputs = self.model(input_ids, token_type_ids, attention_mask)
            outputs = F.softmax(outputs, dim=1)
            # print('\n FinBERT predicted sentiment: ', labels[torch.argmax(outputs).item()])
            return ([i.item() for i in outputs.data[0]])
Example #13
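        # Fragment: this is the body of a loop over input texts; the models,
        # vocabularies, `device`, `final_prob`, `i2t`, and `results` used below
        # are defined in surrounding code that is not shown here.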
        cnn_sentence = word_tokenize(text.lower())
        cnn_sentence = [cnn_vocabulary.w2i[w] for w in cnn_sentence]
        cnn_sentence += [cnn_vocabulary.w2i['[SEP]']]
        cnn_sentence = [cnn_vocabulary.w2i['[CLS]']] + cnn_sentence
        sent_len = [len(cnn_sentence)]
        cnn_sentence = torch.tensor([cnn_sentence]).type(torch.LongTensor).to(device)

        cnn_prob = cnn_model(cnn_sentence, (cnn_sentence > 0))
        final_prob += cnn_prob

        lstm_prob = lstm_model(cnn_sentence, sent_len)
        final_prob += lstm_prob

        bert_sentence = tokenizer.tokenize(text)
        bert_sentence = [bert_vocabulary.w2i[w] for w in bert_sentence]
        bert_sentence += [bert_vocabulary.w2i['[SEP]']]
        bert_sentence = [bert_vocabulary.w2i['[CLS]']] + bert_sentence
        bert_sentence = torch.tensor([bert_sentence]).type(torch.LongTensor).to(device)

        bert_sent_prob = bert_sent_model(bert_sentence)
        final_prob += bert_sent_prob
        bert_word_prob = bert_word_model(bert_sentence)
        final_prob += bert_word_prob

        _, pred_topic = torch.max(final_prob, 1)
        pred_topic = pred_topic.cpu().numpy()[0]
        results.append(i2t[pred_topic])
Example #14
def evaluate(args: Dict):
    model_root = args['--model-root'] if args['--model-root'] else './models'
    print("load model from {}".format(model_root), file=sys.stderr)

    dataLoader = sentence.Sentence(args['--test-src'])

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

    output_model_file = os.path.join(model_root, "model_file.bin")
    output_config_file = os.path.join(model_root, "config_file.bin")
    output_vocab_file = os.path.join(model_root, "vocab.txt")
    config = BertConfig.from_json_file(output_config_file)
    model = BertForTokenClassification(config,num_labels=len(dataLoader.tag2idx))
    state_dict = torch.load(output_model_file)
    model.load_state_dict(state_dict)
    tokenizer = BertTokenizer(output_vocab_file, do_lower_case=False)

    tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences]

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    MAX_LEN = int(args['--max-len'])

    input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags_test = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels],
                         maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks_test = [[float(i > 0) for i in ii] for ii in input_ids_test]

    for i, inp in enumerate(input_ids_test):
        if 102 not in inp:  # 102 is the [SEP] token id in the standard BERT vocab
            inp[-1] = 102
            tags_test[i][-1] = dataLoader.tag2idx.get("O")

    te_inputs = torch.tensor(input_ids_test).to(torch.int64)
    te_tags = torch.tensor(tags_test).to(torch.int64)
    te_masks = torch.tensor(attention_masks_test)

    test_data = TensorDataset(te_inputs, te_masks, te_tags)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=int(args['--batch-size']))

    model.eval()
    predictions = []
    true_labels = []
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)

        logits = logits.detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    pred_tags = [[dataLoader.tags_vals[p_i] for p_i in p] for p in predictions]
    test_tags = [[dataLoader.tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]

    tags_test_fin = list()
    for l in tags_test:
        temp_tag = list()
        for l_i in l:
            temp_tag.append(dataLoader.tags_vals[l_i])
        tags_test_fin.append(temp_tag)

    print("Test loss: {}".format(eval_loss / nb_eval_steps))
    print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps))
    print("Test F1-Score: {}".format(f1_score(tags_test_fin, pred_tags)))

    print(classification_report(tags_test_fin, pred_tags))

    print("Number of Test sentences: ", len(tags_test_fin))
Example #15
class Vocabulary:
    """Vocubulary for Dataset 
    
    Read the train dataset's words into vocabulary;
    Conduct necessary preprocessing;
    Obtain word2index and topic2index dictionary;

    """
    def __init__(self):
        self.tokenizer = BertTokenizer(
            vocab_file=os.path.join(main_dir, 'pretrained_bert',
                                    'uncased_L-12_H-768_A-12', 'vocab.txt'))

        # generate w2i, t2i, and train data
        self.get_vocab()

        # self.get_dataset(split='train')

    def get_num_words(self):
        return len(self.w2i)

    def get_num_topics(self):
        return len(self.t2i)

    def get_dataset(self, split):
        if split == 'train':
            try:
                return self.train_data
            except:
                self.train_data = self.read_dataset(
                    os.path.join(main_dir, 'data/topicclass_train.txt'))
                return self.train_data
        elif split == 'valid':
            try:
                return self.valid_data
            except:
                self.valid_data = self.read_dataset(
                    os.path.join(main_dir, 'data/topicclass_valid.txt'))
                return self.valid_data
        elif split == 'test':
            try:
                return self.test_data
            except:
                self.test_data = self.read_dataset(
                    os.path.join(main_dir, 'data/topicclass_test.txt'))
                return self.test_data
        else:
            raise ValueError("Unkown split, split must in train/valid/test!")

    def get_vocab(self):
        """ Generate vocabulary from train dataset """
        # create word2index and topic2index dict
        w2i = defaultdict(lambda: len(w2i))
        filename = os.path.join(
            main_dir, 'pretrained_bert/uncased_L-12_H-768_A-12/vocab.txt')

        with open(filename, "r") as f:
            for word in f:
                w2i[word.rstrip('\n')]  # looking up a new key assigns it the next index

        UNK = w2i['[UNK]']
        # freeze word2index so any new words in the valid and test datasets map to [UNK]
        self.w2i = defaultdict(lambda: UNK, w2i)

        # self.t2i
        self.t2i = defaultdict(lambda: len(self.t2i))

        filename = os.path.join(main_dir, 'data/topicclass_train.txt')
        self.train_data = []

        with open(filename, "r") as f:
            for line in tqdm(f):
                topic, text = line.lower().strip().split(" ||| ")
                sentence = self.tokenizer.tokenize(text)
                sentence = [self.w2i[w] for w in sentence]
                sentence += [self.w2i['[SEP]']]
                sentence = [self.w2i['[CLS]']] + sentence
                # make train data
                self.train_data.append((sentence, self.t2i[topic]))

    def read_dataset(self, filename):
        """ Read rawdata using word2index and topic2index """
        data = []
        logger.info("Reading {} into dataset...".format(filename))
        with open(filename, "r") as f:
            for line in tqdm(f):
                topic, text = line.lower().strip().split(" ||| ")
                sentence = self.tokenizer.tokenize(text)
                sentence = [self.w2i[w] for w in sentence]
                sentence += [self.w2i['[SEP]']]
                sentence = [self.w2i['[CLS]']] + sentence
                data.append((sentence, self.t2i[topic]))
        return data
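
The defaultdict trick in get_vocab deserves a note: merely looking up a new key assigns it the next index, and re-wrapping the dict with a constant default then freezes it so unseen words map to [UNK]. A toy demonstration (not part of the original class):

from collections import defaultdict

w2i = defaultdict(lambda: len(w2i))
for word in ["[PAD]", "[UNK]", "hello", "world"]:
    w2i[word]                            # touching the key assigns an index
UNK = w2i["[UNK]"]
w2i = defaultdict(lambda: UNK, w2i)      # freeze: unknown words map to [UNK]
print(w2i["hello"], w2i["never-seen"])   # 2 1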
Example #16
def bert_sentence_pair_preprocessing(dataset: pd.DataFrame, tokenizer: BertTokenizer, max_sequence_length=64):
    max_bert_input_length = 70

    dataset_input_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_token_type_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_attention_masks = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_lengths = torch.empty((len(dataset), 1), dtype=torch.long)
    dataset_labels = torch.empty((len(dataset), 1), dtype=torch.long)
    dataset_other_type_ids = torch.empty((len(dataset), 18), dtype=torch.long)
    # dataset_input_tensors = torch.empty(len(dataset), 4, max_bert_input_length, dtype=torch.float)

    for idx, data in dataset.iterrows():
        tokens = []
        input_type_ids = []

        # collect the additional ("other type") features in a fixed order
        other_type_ids = [
            data[col] for col in [
                'addr0', 'addr1', 'addr2', 'addr3', 'addr4', 'addr5',
                'phone0', 'phone1', 'phone2', 'phone3',
                'cate0', 'cate1', 'cate2', 'cate3', 'cate4',
                'cname0', 'cname1', 'cname2'
            ]
        ]

        dataset_other_type_ids[idx] = torch.tensor(other_type_ids, dtype=torch.long)

        sentence_1_tokenized, sentence_2_tokenized = tokenizer.tokenize(data['full_placename1']), tokenizer.tokenize(data['full_placename2'])

        tokens.append("[CLS]")
        input_type_ids.append(0)

        for token in sentence_1_tokenized:
            tokens.append(token)
            input_type_ids.append(0)

        tokens.append("[SEP]")
        input_type_ids.append(0)

        for token in sentence_2_tokenized:
            tokens.append(token)
            input_type_ids.append(1)

        tokens.append("[SEP]")
        input_type_ids.append(1)

        # convert the preprocessed tokens to vocabulary indices
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # build the attention mask
        attention_masks = [1] * len(input_ids)

        # store the length of input_ids
        dataset_lengths[idx] = torch.tensor(len(input_ids), dtype=torch.long)

        while len(input_ids) < max_bert_input_length:
            input_ids.append(0)
            attention_masks.append(0)
            input_type_ids.append(0)

        dataset_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long)
        dataset_token_type_ids[idx] = torch.tensor(input_type_ids, dtype=torch.long)
        dataset_attention_masks[idx] = torch.tensor(attention_masks, dtype=torch.long)

        dataset_labels[idx] = torch.tensor(data['label'], dtype=torch.long)

    return dataset_input_ids, dataset_token_type_ids, dataset_attention_masks, dataset_other_type_ids, dataset_lengths, dataset_labels
Example #17
class for_BERT():
    def __init__(self, mode='training'):
        self.mode = mode

        with open(dir_path + '/data/tag2idx.json', 'r') as f:
            self.tag2idx = json.load(f)

        self.idx2tag = dict(zip(self.tag2idx.values(), self.tag2idx.keys()))

        # load pretrained BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)

        # load BERT tokenizer with untokenizing frames
        never_split_tuple = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
        added_never_split = []
        added_never_split.append('<tgt>')
        added_never_split.append('</tgt>')
        added_never_split_tuple = tuple(added_never_split)
        never_split_tuple += added_never_split_tuple
        vocab_file_path = dir_path + '/data/bert-multilingual-cased-dict-add-frames'
        self.tokenizer_with_frame = BertTokenizer(
            vocab_file_path,
            do_lower_case=False,
            max_len=256,
            never_split=never_split_tuple)

    def idx2tag(self, predictions):
        # note: the self.idx2tag dict assigned in __init__ shadows this method
        pred_tags = [self.idx2tag[p_i] for p in predictions for p_i in p]
        return pred_tags

    # BERT-tokenize the text and map each original token to its first word piece
    def bert_tokenizer(self, text):
        orig_tokens = text.split(' ')
        bert_tokens = []
        orig_to_tok_map = []
        bert_tokens.append("[CLS]")
        for orig_token in orig_tokens:
            orig_to_tok_map.append(len(bert_tokens))
            bert_tokens.extend(self.tokenizer_with_frame.tokenize(orig_token))
        bert_tokens.append("[SEP]")

        return orig_tokens, bert_tokens, orig_to_tok_map

    def convert_to_bert_input(self, input_data):
        tokenized_texts, args = [], []
        orig_tok_to_maps = []
        for i in range(len(input_data)):
            data = input_data[i]
            text = ' '.join(data[0])
            orig_tokens, bert_tokens, orig_to_tok_map = self.bert_tokenizer(
                text)
            orig_tok_to_maps.append(orig_to_tok_map)
            tokenized_texts.append(bert_tokens)

            if self.mode == 'training':
                ori_args = data[2]
                arg_sequence = []
                for i in range(len(bert_tokens)):
                    if i in orig_to_tok_map:
                        idx = orig_to_tok_map.index(i)
                        ar = ori_args[idx]
                        arg_sequence.append(ar)
                    else:
                        arg_sequence.append('X')
                args.append(arg_sequence)

        input_ids = pad_sequences([
            self.tokenizer.convert_tokens_to_ids(txt)
            for txt in tokenized_texts
        ],
                                  maxlen=MAX_LEN,
                                  dtype="long",
                                  truncating="post",
                                  padding="post")
        orig_tok_to_maps = pad_sequences(orig_tok_to_maps,
                                         maxlen=MAX_LEN,
                                         dtype="long",
                                         truncating="post",
                                         padding="post",
                                         value=-1)

        if self.mode == 'training':
            arg_ids = pad_sequences([[self.tag2idx.get(ar) for ar in arg]
                                     for arg in args],
                                    maxlen=MAX_LEN,
                                    value=self.tag2idx["X"],
                                    padding="post",
                                    dtype="long",
                                    truncating="post")

        attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
        data_inputs = torch.tensor(input_ids)
        data_orig_tok_to_maps = torch.tensor(orig_tok_to_maps)
        data_masks = torch.tensor(attention_masks)

        if self.mode == 'training':
            data_args = torch.tensor(arg_ids)
            bert_inputs = TensorDataset(data_inputs, data_orig_tok_to_maps,
                                        data_args, data_masks)
        else:
            bert_inputs = TensorDataset(data_inputs, data_orig_tok_to_maps,
                                        data_masks)
        return bert_inputs
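
The orig_to_tok_map built by bert_tokenizer records, for each original whitespace token, the index of its first word piece in the [CLS]-prefixed sequence; convert_to_bert_input then uses it to place labels on first pieces only. A toy sketch with a pretend tokenizer output:

pieces_per_token = [["Frame"], ["semantic", "##s"]]   # pretend word-piece output

bert_tokens, orig_to_tok_map = ["[CLS]"], []
for pieces in pieces_per_token:
    orig_to_tok_map.append(len(bert_tokens))          # index of the first piece
    bert_tokens.extend(pieces)
bert_tokens.append("[SEP]")
print(bert_tokens)      # ['[CLS]', 'Frame', 'semantic', '##s', '[SEP]']
print(orig_to_tok_map)  # [1, 2]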
Example #18
class NER:
    def __init__(self,
                 encoding,
                 base_model="bert-base-uncased",
                 num_ner=0,
                 tag_dropout=0.3,
                 pos_dropout=0.3,
                 ner_dropout=None,
                 tag_dropout_2=0.3,
                 pos_dropout_2=0.3,
                 ner_dropout_2=None,
                 architecture="simple",
                 ner=False,
                 middle_layer=None):
        """ There are only two base_model options allowed: "bert-base-uncased" and "finbert-uncased" """
        # Fine Tuning parameters
        self.ner = ner
        self.num_ner = num_ner
        self.ner_dropout = ner_dropout
        self.architecture = architecture
        self.middle_layer = middle_layer
        self.tag_dropout = tag_dropout
        self.pos_dropout = pos_dropout
        self.tag_dropout_2 = tag_dropout_2
        self.pos_dropout_2 = pos_dropout_2
        self.ner_dropout_2 = ner_dropout_2

        # configuration
        self.config = config

        # Accuracies and Losses
        self.list_train_losses = []
        self.list_test_losses = []
        self.list_tag_acc = []
        self.list_pos_acc = []

        # std means standardized, in our case the tags are replaced by integers for the classification
        self.pos_std = None
        self.tag_std = None
        self.device = None

        # define the encoding of the dataframe
        if "utf" in encoding.lower():
            self.encoding = "utf-8"
        elif "latin-1" in encoding.lower():
            self.encoding = "latin-1"
        else:
            self.encoding = encoding

        # be sure the model's name follows the correct structure
        self.base_model = base_model.replace("_", "-")

        # Fix the tokenizer and special tokens
        if base_model == "bert-base-uncased":
            self.tokenizer = BertTokenizer(
                vocab_file=config.BERT_UNCASED_VOCAB,
                do_lower_case=True,
                do_basic_tokenize=True)
            self.special_tokens_dict = special_tokens_dict(
                config.BERT_UNCASED_VOCAB)
        elif base_model == "finbert-uncased":
            self.tokenizer = BertTokenizer(
                vocab_file=config.FINBERT_UNCASED_VOCAB,
                do_lower_case=True,
                do_basic_tokenize=True)
            self.special_tokens_dict = special_tokens_dict(
                config.FINBERT_UNCASED_VOCAB)

    def training(self, saving=True):
        logger.info("Preprocessing data ...")
        # We preprocess and normalize (as categories) the data and output it as np.arrays/ pd.series
        sentences, pos, tag, self.pos_std, self.tag_std = preprocess_data_BERT(
            self.config.TRAINING_FILE, self.encoding)
        logger.info("Data has been preprocessed")

        # Checkpoint for the standardized pos and tag. tag <-> integer value
        logger.info("Making checkpoint for the preprocessed data ...")
        if saving:
            data_check_pt = {"pos_std": self.pos_std, "tag_std": self.tag_std}
            joblib.dump(value=data_check_pt,
                        filename=config.CHECKPOINTS_META_PATH)
        else:
            pass

        # Save the number of classes per classification problem
        num_tag = len(list(self.tag_std.classes_))
        num_pos = len(list(self.pos_std.classes_))
        data4 = np.array(num_pos)
        np.savez(join(config.BASE_DATA_PATH, "num_pos"), data4)
        data3 = np.array(num_tag)
        np.savez(join(config.BASE_DATA_PATH, "num_tag"), data3)

        # Split training set with skl
        logger.info(" Splitting data and creating data sets ...")
        self.train_sentences, self.test_sentences, self.train_pos, self.test_pos, self.train_tag, self.test_tag \
            = train_test_split(sentences, pos, tag, random_state=42, test_size=0.2)

        # Format based on Entities_dataset: getitem outputs pandas dataframes
        self.train = dataset.Entities_dataset(
            texts=self.train_sentences,
            pos=self.train_pos,
            tags=self.train_tag,
            tokenizer=self.tokenizer,
            special_tokens=self.special_tokens_dict,
            model_name=self.base_model)

        self.test = dataset.Entities_dataset(
            texts=self.test_sentences,
            pos=self.test_pos,
            tags=self.test_tag,
            tokenizer=self.tokenizer,
            special_tokens=self.special_tokens_dict,
            model_name=self.base_model)

        # torch DataLoaders: format the data for PyTorch and fix the batch size;
        # num_workers is the number of CPU data-loading subprocesses, not GPUs
        self.train_data_loader = DataLoader(
            self.train, batch_size=self.config.TRAIN_BATCH_SIZE, num_workers=4)
        self.test_data_loader = DataLoader(
            self.test, batch_size=self.config.VALID_BATCH_SIZE, num_workers=4)

        # Load model to device and hyperparameters
        logger.info("Moving model to cuda ...")
        self.model_device(phase="train", num_tag=num_tag, num_pos=num_pos)
        self.hyperparameters()

        # initialize the loss
        best_loss = np.inf
        best_tag_acc = 0
        best_pos_acc = 0

        # EPOCHS
        logger.info("Starting Fine-tuning ...")
        for epoch in range(self.config.EPOCHS):

            # Training
            logger.info("Start epoch {}".format(epoch + 1))
            train_loss = train_val_loss.train(self.train_data_loader,
                                              self.model, self.optimizer,
                                              self.device, self.scheduler)
            test_loss, tag_acc, pos_acc = train_val_loss.validation(
                self.test_data_loader, self.model, self.device)

            # Accuracies and Losses
            logger.info("Train Loss = {}".format(train_loss))
            logger.info("Test Loss = {}".format(test_loss))
            logger.info("Accuracy for tags is = {}".format(tag_acc))
            logger.info("Accuracy for pos is = {}".format(pos_acc))
            self.list_train_losses.append(float(train_loss))
            self.list_test_losses.append(float(test_loss))
            self.list_tag_acc.append(float(tag_acc))
            self.list_pos_acc.append(float(pos_acc))
            logger.info("End epoch {}".format(epoch + 1))
            logger.info("Testing epoch {}".format(epoch + 1))
            if test_loss < best_loss:
                torch.save(self.model.state_dict(),
                           self.config.CHECKPOINTS_MODEL_PATH)
                best_loss = test_loss
            if pos_acc > best_pos_acc:
                best_pos_acc = pos_acc
            if tag_acc > best_tag_acc:
                best_tag_acc = tag_acc
            logger.info("End epoch {} with loss {} asnd best loss {}".format(
                epoch + 1, test_loss, best_loss))

        logger.info("Fine-tuning finished")
        logger.info("With training losses: {}".format(self.list_train_losses))
        logger.info("With test losses: {}".format(self.list_test_losses))

        # plotting
        losses_accuracies = {
            "Tag accuracy": self.list_tag_acc,
            "Pos accuracy": self.list_pos_acc,
            "Train loss": self.list_train_losses,
            "Test loss": self.list_test_losses
        }
        name = "model=" + self.base_model + "_epochs=" + str(
            config.EPOCHS) + "_test_batch="
        name += str(config.VALID_BATCH_SIZE) + "_train_batch=" + str(
            config.TRAIN_BATCH_SIZE) + "_max_len="
        name += str(config.MAX_LEN) + "_dropouts=" + str(
            self.tag_dropout) + "_" + str(self.pos_dropout)
        name += "_" + str(self.ner_dropout) + "_architecture=" + str(
            self.architecture)
        name += '_POS=' + str(best_pos_acc) + '_TAG=' + str(best_tag_acc)
        ploter(output_path=config.BASE_DATA_PATH,
               name=name,
               num_epochs=self.config.EPOCHS,
               **losses_accuracies)

        # Saving results
        data_pos = np.array(self.list_pos_acc)
        np.savez(join(config.BASE_DATA_PATH, "pos_accuracies_" + name),
                 data_pos)
        data_tag = np.array(self.list_tag_acc)
        np.savez(join(config.BASE_DATA_PATH, "tag_accuracies_" + name),
                 data_tag)
        data1 = np.array(self.list_train_losses)
        np.savez(join(config.BASE_DATA_PATH, "train_losses_" + name), data1)
        data2 = np.array(self.list_test_losses)
        np.savez(join(config.BASE_DATA_PATH, "test_losses_" + name), data2)
        return best_loss

    def predict(self, text):
        """ Given a example text it predicts and prints the tokens and their labels for tag and pos"""

        # Loading the results
        num_tag = np.load(join(config.BASE_DATA_PATH, "num_tag.npz"))
        num_tag = num_tag.f.arr_0
        num_pos = np.load(join(config.BASE_DATA_PATH, "num_pos.npz"))
        num_pos = num_pos.f.arr_0

        # check pos and tag
        if self.pos_std is None:
            std_data = joblib.load(config.CHECKPOINTS_META_PATH)
            self.pos_std = std_data["pos_std"]
            self.tag_std = std_data["tag_std"]
        else:
            pass

        # preprocessing
        sentence = text.split()

        # tokenizing
        tokenized_text = self.tokenizer.tokenize(text)

        # converting into iterable input for the model
        test_text = dataset.Entities_dataset(
            texts=[sentence],
            pos=[[0] * len(sentence)],
            tags=[[0] * len(sentence)],
            tokenizer=self.tokenizer,
            special_tokens=self.special_tokens_dict,
            model_name=self.base_model)

        # move the model to the device and disable gradient updates since this is a prediction
        self.model_device(phase="predict", num_tag=num_tag, num_pos=num_pos)

        with torch.no_grad():
            data = test_text[0]
            for k, v in data.items():
                data[k] = v.to(self.device).unsqueeze(0)
            tag, pos, _ = self.model(**data)

            # argmax over axis 2 (the class distribution); .cpu().numpy() moves the tensor to CPU as a numpy array
            print(tokenized_text)
            print(
                self.tag_std.inverse_transform(
                    tag.argmax(2).cpu().numpy().reshape(-1))
                [1:len(tokenized_text) + 1])
            print(
                self.pos_std.inverse_transform(
                    pos.argmax(2).cpu().numpy().reshape(-1))
                [1:len(tokenized_text) + 1])

    def model_device(self, phase, num_tag, num_pos):
        """ Use GPU, load model and move it there -- device or cpu if cuda is not available """
        self.device = check_device()
        self.model = BERT_NER(num_tag=num_tag,
                              num_pos=num_pos,
                              num_ner=self.num_ner,
                              base_model=self.base_model,
                              tag_dropout=self.tag_dropout,
                              pos_dropout=self.pos_dropout,
                              ner_dropout=self.ner_dropout,
                              tag_dropout_2=self.tag_dropout_2,
                              pos_dropout_2=self.pos_dropout_2,
                              ner_dropout_2=self.ner_dropout_2,
                              architecture=self.architecture,
                              ner=self.ner,
                              middle_layer=self.middle_layer)
        if phase == "train":
            self.model.to(self.device)
        elif phase == "predict":
            self.model.load_state_dict(
                torch.load(self.config.CHECKPOINTS_MODEL_PATH))
            self.model.to(self.device)
        else:
            pass

    def hyperparameters(self):
        """ This method fix the parameters and makes a filter over to exclude LayerNorm and biases """

        # nn.module list of parameters: all parameters from BERT plus the pos and tag layer
        self.param_optimizer = list(self.model.named_parameters())

        #  exclude LayerNorm and biases
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [{
            "params": [
                p for n, p in self.param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001
        }, {
            "params": [
                p for n, p in self.param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        }]

        num_train_steps = int(
            len(self.train_sentences) / self.config.TRAIN_BATCH_SIZE *
            self.config.EPOCHS)
        self.optimizer = AdamW(optimizer_parameters, lr=3e-5)

        # Scheduler
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_train_steps)
Example #19
bert.eval()

# ## Setup tokenizer

# In[ ]:

tokenizer = BertTokenizer(
    vocab_file='../input/torch-bert-weights/bert-base-uncased-vocab.txt')

# ## Make prediction

# In[ ]:

# let's tokenize some text (I intentionally misspelled 'plastic' to check BERT's subword handling)
text = 'hi my name is Dieter and I like wearing my yellow pglastic hat while coding.'
tokens = tokenizer.tokenize(text)
tokens

# In[ ]:

# add start and end tokens and convert to ids
tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

# In[ ]:

# put input on gpu and make prediction
bert_output = bert(torch.tensor([input_ids]).cuda())
bert_output
Example #20
                    help='show top k predictions')
if __name__ == '__main__':
    args = parser.parse_args()
    bert_tokenizer = BertTokenizer(
        vocab_file='/media/lonelyprince7/mydisk/NLP-dataset/bert_models/bert-base-uncased-vocab.txt')
    bert_model = BertForMaskedLM.from_pretrained(
        '/media/lonelyprince7/mydisk/NLP-dataset/bert_models/bert-base-uncased.tar.gz')
    sentences, res = read_data()
    print(res)
    predict_res = []
    mask_cnt = 0
    for sentence in sentences:
        sentence = sentence.strip()
        sentence = sentence.replace('_', '[MASK]')
        # print(sentence)
        tokens = bert_tokenizer.tokenize(sentence)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
            logits = bert_model(token_idx, segment_idx,
                                mask, masked_lm_labels=None)
        logits = logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1)
        for idx, token in enumerate(tokens):
            if token == MASK:
                mask_cnt += 1
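
to_bert_input is referenced above but not defined in this snippet; for a single unpadded sentence under the pytorch-pretrained-bert call convention, a plausible implementation (an assumption, not taken from this repository) is:

import torch

def to_bert_input(tokens, bert_tokenizer):
    # hypothetical helper: token ids, all-zero segment ids, all-ones attention mask
    token_idx = torch.tensor([bert_tokenizer.convert_tokens_to_ids(tokens)])
    segment_idx = torch.zeros_like(token_idx)
    mask = torch.ones_like(token_idx)
    return token_idx, segment_idx, mask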
Example #21
def main(annotated_text_file: str,
         text_file: str,
         vocab_file: str,
         word_file: FileType,
         threshold: int,
         missing_tokens_file: str,
         output_file: str):
    nlp = spacy.load('en_core_web_lg',
                     disable=['tokenizer', 'tagger', 'ner', 'textcat'])

    ###### Find missing tokens
    # subprocess.call("1_extract_vocab.sh", shell = True)
    subprocess.check_call("1_extract_vocab.sh -i %s -o %s" %
                          (annotated_text_file, missing_tokens_file),
                          shell=True)
    vocab_file = expanduser(vocab_file)
    tokenizer = BertTokenizer(vocab_file, do_lower_case=False)
    f = open(missing_tokens_file, "w+")
    print('count,original,splitted', file=f)  # file header
    for line in tqdm(word_file, 'words'):
        c_word = line.strip().split()
        if len(c_word) == 1:  # word is a space
            continue

        count, word = c_word
        count = int(count)
        if count < threshold:
            break

        tokens = tokenizer.tokenize(word)
        if len(tokens) > 1:  # we have subwords
            if len(tokens) == 2 and tokens[1] == '##s':
                continue
            print(count,
                  word,
                  '#'.join(t.strip('#') for t in tokens),
                  sep=',',
                  file=f)  # create csv from that output
    ###### 2nd stage
    count_unused = 0
    vocab = []
    count = 0
    f = open(missing_tokens_file, "r")
    for x in f:
        ### the first line is a warning from bert
        #   if count > 0:
        vocab.append(x.replace("\n", "").split(','))
        count += 1
    new_vocab = pd.DataFrame(vocab[1:], columns=vocab[0])
    new_vocab['count'] = new_vocab['count'].apply(int)
    new_vocab.sort_values('count', ascending=False).to_csv('new_vocab.csv')
    missing_tokens = pd.read_csv('new_vocab.csv')
    with open(output_file, 'w') as write:
        with open(vocab_file, 'r') as read:
            for line in tqdm(read):
                if '[unused' in line and count_unused < missing_tokens.shape[0]:
                    write.write(missing_tokens.iloc[count_unused]['original'] +
                                '\n')
                    count_unused += 1
                else:
                    write.write(line)

    ### TO SEPARATE THE FULL TEXT INTO DOCUMENTS

    df = pd.read_csv(text_file, delimiter="\n\n", header=None)
    docs = df[0].apply(lambda x: x.replace('Operator', ""))

    #### LOAD THE ANNOTATED DATA

    with open(annotated_text_file) as json_file:
        data_annotated = json.load(json_file)
    content = []
    sentiment = []
    for i in range(len(data_annotated['data'])):
        try:
            content.append(data_annotated['data'][i]['content'])

            try:
                sentiment.append(
                    data_annotated['data'][i]['annotation']['sentiment'])

            except:
                print('problem reading sentiment')
                del content[i]
        except:
            print('problem reading content')
        continue

    docs2 = pd.Series(content)
    docs = pd.concat([docs, docs2])

    # vocab = []
    # count = 0
    # f = open("new_vocab.txt", "r")
    # for x in f:
    #     vocab.append(x.replace("\n", "").split(','))
    #     count += 1
    #
    # ###### ADDING NEW VOCABULARY
    # new_vocab = pd.DataFrame(vocab[1:], columns=vocab[0])
    # new_vocab['count'] = new_vocab['count'].apply(int)
    # new_vocab.sort_values('count', ascending=False).to_csv('new_vocab.csv')

    documents_liste = docs.tolist()

    ## Write each document to its own file; the 99%/1% split below keeps the test set small and speeds up execution

    for i, document in enumerate(documents_liste):
        if i < len(documents_liste) * 0.99:
            output_file = 'data/transcript_' + str(i) + '.txt'
        else:
            output_file = 'test/transcript_' + str(i) + '.txt'

        new_file = open(output_file, mode="w+", encoding="utf-8")
        new_file.write(document)
        new_file.close()
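# A minimal, self-contained sketch of the vocab-patching idea above: lines that
# look like '[unusedN]' are overwritten with missing domain tokens, and every
# other line is copied through unchanged. The toy vocab and tokens are made up.
def patch_vocab(vocab_lines, missing_tokens):
    patched, spare = [], iter(missing_tokens)
    for line in vocab_lines:
        if line.startswith('[unused'):
            patched.append(next(spare, line))  # fall back to the original slot
        else:
            patched.append(line)
    return patched


toy_vocab = ['[PAD]', '[unused0]', '[unused1]', '[CLS]', '[SEP]', 'the']
print(patch_vocab(toy_vocab, ['ebitda', 'guidance']))
# ['[PAD]', 'ebitda', 'guidance', '[CLS]', '[SEP]', 'the']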
class WordPieceVectorizer1D(AbstractVectorizer):
    """Define a Baseline Vectorizer that can do WordPiece with BERT tokenizer

    If you use tokens=wordpiece, this vectorizer is used, which adds a
    dependency on pytorch_pretrained_bert
    """
    def __init__(self, **kwargs):
        """Loads a BertTokenizer using bert_pretrained_pytorch

        :param kwargs:
        """
        super(WordPieceVectorizer1D, self).__init__(kwargs.get('transform_fn'))
        from pytorch_pretrained_bert import BertTokenizer
        self.max_seen = 128
        handle = kwargs.get('embed_file')
        custom_vocab = kwargs.get('vocab_file')
        if custom_vocab is None:
            self.tokenizer = BertTokenizer.from_pretrained(handle,
                                                           do_lower_case=True)
        else:
            special_tokens = tuple(kwargs.get('special_tokens') or ())
            never_split = ('[UNK]', '[SEP]', '[PAD]', '[CLS]',
                           '[MASK]') + special_tokens
            self.tokenizer = BertTokenizer(custom_vocab,
                                           do_basic_tokenize=True,
                                           never_split=never_split)
        self.mxlen = kwargs.get('mxlen', -1)

    @property
    def vocab(self):
        return self.tokenizer.vocab

    def count(self, tokens):
        seen = 0
        counter = Counter()
        for tok in self.iterable(tokens):
            counter[tok] += 1
            seen += 1
        self.max_seen = max(self.max_seen, seen)
        return counter

    def iterable(self, tokens):
        for tok in tokens:
            if tok == '<unk>':
                yield '[UNK]'
            elif tok == '<EOS>':
                yield '[SEP]'
            else:
                for subtok in self.tokenizer.tokenize(tok):
                    yield subtok

    def _next_element(self, tokens, vocab):
        for atom in self.iterable(tokens):
            value = vocab.get(atom)
            if value is None:
                value = vocab['[UNK]']
            yield value

    def run(self, tokens, vocab):
        if self.mxlen < 0:
            self.mxlen = self.max_seen
        vec1d = np.zeros(self.mxlen, dtype=np.long)
        for i, atom in enumerate(self._next_element(tokens, vocab)):
            if i == self.mxlen:
                i -= 1
                break
            vec1d[i] = atom
        valid_length = i + 1
        return vec1d, valid_length

    def get_dims(self):
        return self.mxlen,
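# A standalone sketch of the token-mapping rule in iterable() above: '<unk>' and
# '<EOS>' map to BERT's special tokens, everything else is split into word
# pieces. The splitter below is a stub standing in for BertTokenizer.tokenize.
def to_wordpieces(tokens, wordpiece_fn):
    for tok in tokens:
        if tok == '<unk>':
            yield '[UNK]'
        elif tok == '<EOS>':
            yield '[SEP]'
        else:
            yield from wordpiece_fn(tok)


stub_tokenize = lambda w: [w[:4], '##' + w[4:]] if len(w) > 4 else [w]
print(list(to_wordpieces(['hello', 'worldly', '<unk>', '<EOS>'], stub_tokenize)))
# ['hell', '##o', 'worl', '##dly', '[UNK]', '[SEP]']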
class text_dataset(Dataset):
    def __init__(self,
                 x_y_list,
                 vocab_path,
                 max_seq_length=256,
                 vocab='base-cased',
                 transform=None):
        self.max_seq_length = max_seq_length
        self.x_y_list = x_y_list
        self.vocab = vocab
        if self.vocab == 'base-cased':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-cased', do_lower_case=False, do_basic_tokenize=True)
        elif self.vocab == 'finance-cased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=False,
                                           do_basic_tokenize=True)
        elif self.vocab == 'base-uncased':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-uncased',
                do_lower_case=True,
                do_basic_tokenize=True)
        elif self.vocab == 'finance-uncased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=True,
                                           do_basic_tokenize=True)

    def __getitem__(self, index):
        tokenized_review = self.tokenizer.tokenize(self.x_y_list[0][index])

        if len(tokenized_review) > self.max_seq_length:
            tokenized_review = tokenized_review[:self.max_seq_length]

        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review)

        mask_input = [1] * len(ids_review)

        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding

        input_type = [0] * self.max_seq_length

        assert len(ids_review) == self.max_seq_length
        assert len(mask_input) == self.max_seq_length
        assert len(input_type) == self.max_seq_length

        ids_review = torch.tensor(ids_review)
        mask_input = torch.tensor(mask_input)
        input_type = torch.tensor(input_type)

        sentiment = self.x_y_list[1][index]
        list_of_labels = [torch.from_numpy(np.array(sentiment))]

        input_feature = {
            "token_type_ids": input_type,
            "attention_mask": mask_input,
            "input_ids": ids_review
        }

        return input_feature, list_of_labels[0]

    def __len__(self):
        return len(self.x_y_list[0])
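# A small standalone sketch of the padding and attention-mask logic used in
# __getitem__ above; the token ids are made up and max_seq_length is shortened
# to 8 for readability.
max_seq_length = 8
ids_review = [101, 7592, 2088, 102]         # hypothetical token ids
mask_input = [1] * len(ids_review)
padding = [0] * (max_seq_length - len(ids_review))
ids_review += padding
mask_input += padding
print(ids_review)   # [101, 7592, 2088, 102, 0, 0, 0, 0]
print(mask_input)   # [1, 1, 1, 1, 0, 0, 0, 0]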
Ejemplo n.º 24
0
class Preprocess:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.rouge_calculator = RougeNCalc()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.trim_input = 0
        self.trim_clss = 0

    def __call__(self, data_dic, length):
        self.src_body = data_dic['body']
        self.src_summary = data_dic['summary'].split('<sep>')
        self._init_data()

        if self.src_body == '':
            raise ValueError('Empty data')

        # step 1. article to lines
        self._split_line()
        # step 2. pick extractive summary by rouge
        self._rougematch()
        # step 3. tokenize
        self._tokenize()
        # step 4. clss process
        self._prep_clss()
        # step 5. segs process
        self._prep_segs()
        # step 6. trim length for input
        self._set_length(length)

        return {
            'src': self.tokenid,
            'labels': self.label,
            'segs': self.segs,
            'mask': self.mask,
            'mask_cls': self.mask_cls,
            'clss': self.clss,
            'src_str': self.src_line
        }

    def _init_data(self):
        self.src_line = []
        self.label = []
        self.tokenid = []
        self.token = []
        self.clss = []
        self.segs = []
        self.mask = []
        self.mask_cls = []

    # step 1.
    def _split_line(self):
        # regex note: (?!...) is a negative lookahead
        # e.g. /foo(?!bar)/ matches the "foo" in "foobaz" but not the one in "foobar"
        self.src_line = re.split('。(?!」)|!(?!」)|?(?!」)', self.src_body)
        self.src_line = [x for x in self.src_line if x != '']

    # step 2.
    def _rougematch(self):
        self.label = [0] * len(self.src_line)
        for summ in self.src_summary:
            scores = [self.rouge_calculator(x, summ) for x in self.src_line]
            self.label[scores.index(max(scores))] = 1

    # step 3.
    def _tokenize(self):
        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in self.src_line:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            self.token += tokens
            self.tokenid += ids

    # step 4.
    def _prep_clss(self):
        self.clss = [
            i for i, x in enumerate(self.tokenid)
            if x == self.bert_tokenizer.vocab['[CLS]']
        ]

    # step 5.
    def _prep_segs(self):
        flag = 1
        for idx in self.tokenid:
            if idx == self.bert_tokenizer.vocab['[CLS]']:
                flag = not flag
            self.segs.append(int(flag))

    # step 6.
    def _set_length(self, n):
        self.__trim_data(n)
        self.__add_mask(n)

    def __trim_data(self, n):
        if len(self.tokenid) > n:
            # If the last sentence starts after position 512
            if self.clss[-1] > 512:
                for i, idx in enumerate(self.clss):
                    if idx > n:
                        # Index of the last [SEP] within the first n positions
                        self.trim_input = self.clss[i - 1] - 1
                        # Index of the last [CLS] within clss
                        self.trim_clss = i - 2
                        break
            # If src is longer than 512 but the last sentence starts before 512
            else:
                self.trim_input = self.clss[len(self.clss) - 1] - 1
                self.trim_clss = len(self.clss) - 2
        # Do nothing if length < n
        if self.trim_clss * self.trim_input == 0:
            return
        self.tokenid = self.tokenid[:(self.trim_input + 1)]
        self.segs = self.segs[:(self.trim_input + 1)]
        self.clss = self.clss[:(self.trim_clss + 1)]
        self.label = self.label[:(self.trim_clss + 1)]
        self.src_line = self.src_line[:(self.trim_clss + 1)]

    def __add_mask(self, n):
        # pad the token ids and segment ids up to length n
        pad_len = (n - len(self.tokenid))
        self.tokenid = self.tokenid + ([self.bert_tokenizer.vocab['[MASK]']] *
                                       pad_len)
        self.segs = self.segs + ([int(not self.segs[-1])] * pad_len)
Ejemplo n.º 25
0
def annotate_example_for_bert(
        example: Dict,
        table: Dict,
        bert_tokenizer: BertTokenizer,
        table_representation_method: Optional[str] = 'canonical'):
    e_id = example['id']

    # sub-tokenize the question
    question_tokens = example['tokens']
    example['original_tokens'] = question_tokens
    token_position_map = OrderedDict(
    )  # map of token index before and after sub-tokenization

    question_feature = example['features']

    cur_idx = 0
    new_question_feature = []
    question_subtokens = []
    for old_idx, token in enumerate(question_tokens):
        if token == '<DECODE>': token = '[MASK]'
        if token == '<START>': token = '[MASK]'

        sub_tokens = bert_tokenizer.tokenize(token)
        question_subtokens.extend(sub_tokens)

        token_new_idx_start = cur_idx
        token_new_idx_end = cur_idx + len(sub_tokens)
        token_position_map[old_idx] = (token_new_idx_start, token_new_idx_end)
        new_question_feature.extend([question_feature[old_idx]] *
                                    len(sub_tokens))

        cur_idx = token_new_idx_end

    token_position_map[len(question_tokens)] = (len(question_subtokens),
                                                len(question_subtokens))

    example['tokens'] = question_subtokens
    example['features'] = new_question_feature

    for entity in example['entities']:
        old_token_start = entity['token_start']
        old_token_end = entity['token_end']

        new_token_start = token_position_map[old_token_start][0]
        new_token_end = token_position_map[old_token_end][0]

        entity['token_start'] = new_token_start
        entity['token_end'] = new_token_end

    if table_representation_method == 'concate':
        columns, column_info = get_columns_concate(example, table,
                                                   bert_tokenizer)
    elif table_representation_method == 'canonical':
        columns, column_info = get_columns_canonical(example, table)
    else:
        raise RuntimeError('Unknown table representation')

    # gather table data
    for column in columns:
        column.name_tokens = bert_tokenizer.tokenize(str(column.name))
        column.sample_value_tokens = bert_tokenizer.tokenize(
            str(column.sample_value))

    rows = [table['kg'][row_id] for row_id in sorted(table['kg'])]
    valid_rows = []
    untokenized_rows = []
    for row in rows:
        valid_row = {}
        untokenized_row = {}
        for col in columns:
            cell_val = row.get(col.raw_name, [])
            if cell_val:
                cell_val = str(cell_val[0])
                untokenized_row[col.name] = cell_val
                cell_tokens = bert_tokenizer.tokenize(cell_val)
            else:
                cell_tokens = []
                untokenized_row[col.name] = ''

            valid_row[col.name] = cell_tokens

        valid_rows.append(valid_row)
        untokenized_rows.append(untokenized_row)

    table = Table(id=example['context'],
                  header=columns,
                  data=valid_rows,
                  column_info=column_info)
    untokenized_table = Table(id=example['context'],
                              header=columns,
                              data=untokenized_rows)

    example['table'] = table
    example['untokenized_table'] = untokenized_table

    return example
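# A hedged, self-contained sketch of the position-map idea above: after
# sub-tokenization, each original token index maps to a (start, end) range of
# word pieces, which is then used to move entity offsets onto the new index
# space. The splitter is a stub; the real code uses bert_tokenizer.tokenize.
def build_position_map(tokens, subtokenize):
    pos_map, subtokens, cur = {}, [], 0
    for old_idx, token in enumerate(tokens):
        pieces = subtokenize(token)
        subtokens.extend(pieces)
        pos_map[old_idx] = (cur, cur + len(pieces))
        cur += len(pieces)
    pos_map[len(tokens)] = (len(subtokens), len(subtokens))
    return pos_map, subtokens


stub_split = lambda t: [t] if len(t) < 6 else [t[:3], '##' + t[3:]]
pos_map, pieces = build_position_map(['which', 'regions', 'grew'], stub_split)
print(pieces)   # ['which', 'reg', '##ions', 'grew']
print(pos_map)  # {0: (0, 1), 1: (1, 3), 2: (3, 4), 3: (4, 4)}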
Ejemplo n.º 26
0
def home():

    #   global model, BERT_FP, bert, tokenizer, nlp
    model = torch.load('model_sciBERT_CRF10.pth')
    BERT_FP = 'scibert_scivocab_uncased'
    bert = BertModel.from_pretrained(BERT_FP)
    tokenizer = BertTokenizer(vocab_file=BERT_FP + '/vocab.txt')
    nlp = en_core_web_sm.load()
    datatowrite = []
    result = ''
    if (request.method == 'POST'):
        token_indices = []
        file_raw = request.form.get('abstract')
        actual_file = open('abstract_str/abstract.txt', 'w')
        actual_file.write(file_raw)
        actual_file.close()
        file = file_raw.lower()
        tokens_list = tokenizer.tokenize(file)
        n = 0
        for i, item in enumerate(tokens_list):
            try:
                start_index = file.index(item.strip('#'))
            except:
                start_index = 100
            if ((start_index < 5 or unk == 1) and item != '[UNK]'):
                token_indices.append(
                    (start_index + n, n + start_index + len(item.strip('#'))))

                n = token_indices[-1][-1]
                file = file[start_index + len(item.strip('#')):]
            else:
                token_indices.append((-1, -1))

                if (item != '[UNK]'):
                    n += len(item.strip('#'))
                    file = file[len(item.strip('#')):]

        with torch.no_grad():
            inputs = tokenizer.convert_tokens_to_ids(tokens_list)
            inputs = bert(torch.tensor([inputs]))[0]
            for j in range(len(inputs)):
                inputs[j] = inputs[j].numpy()
            inputs = torch.tensor(np.array(inputs))
            prediction = model(inputs.permute(1, 2, 0, 3).squeeze(0))
            output = prediction[0]

        dic = {}
        dataarr = file_raw
        tagsarr = output
        indicesarr = token_indices

        indicesdata = []
        datatowrite = []
        for j in range(len(tagsarr)):
            if (tagsarr[j] == 0 or tagsarr[j] == 4):
                indicesdata.append(list(indicesarr[j]))
            if (tagsarr[j] == 1 or tagsarr[j] == 2):
                indicesdata[-1][1] = indicesarr[j][1]

        indicestowrite = indicesdata

        ind_temp = []
        data_temp = []
        for j in indicestowrite:
            ind_temp.append(j)
            data_temp.append(dataarr[j[0]:j[1]])

        indicestowrite = []
        datatowrite = []
        for j in range(len(ind_temp)):
            temp = nlp(data_temp[j])
            count = 0
            for k in temp:
                count += 1

            if (count == 1):
                ind = [
                    [k.start() + 1,
                     k.start() + 1 + len(data_temp[j])] for k in re.finditer(
                         '[^a-z]' + re.escape(data_temp[j].lower()) +
                         '[^a-z]', dataarr.lower())
                    if [k.start() +
                        1, k.start() + 1 + len(data_temp[j])] not in ind_temp
                    and [k.start() +
                         1, k.start() + 1 +
                         len(data_temp[j])] not in indicestowrite
                ]
                temp_ind = []
                dat = []
                for l in ind:
                    if (dataarr[l[0]:l[1]].lower() != dataarr[l[0]:l[1]]):
                        dat.append(dataarr[l[0]:l[1]])
                        temp_ind.append(l)
                indicestowrite += temp_ind
                datatowrite += dat

        ind_temp = ind_temp + indicestowrite
        data_temp = data_temp + datatowrite
        indicestowrite = []
        datatowrite = []

        for j in range(len(data_temp)):
            temp_2 = nlp(data_temp[j])
            temp = []
            for word in temp_2:
                temp.append((len(word.text), word.text))

            if (len(temp) == 1):
                if (str(temp[0][1]).lower() != str(temp[0][1])
                        or re.match('^[a-z]+$', temp[0][1]) == None
                        or len(temp[0][1]) > 3):
                    indicestowrite.append(ind_temp[j])
                    datatowrite.append(data_temp[j])
            else:
                indicestowrite.append(ind_temp[j])
                datatowrite.append(data_temp[j])
        indicestowrite = sorted(indicestowrite, key=lambda x: x[0])
        if (len(indicestowrite) == 0):
            return render_template("index.html", keyphrases=file_raw)
        print(indicestowrite)
        annotation_file = open('abstract_str/abstract.ann', 'w')
        for qwe in range(len(indicestowrite)):
            annotation_file.write(
                'T' + str(qwe + 1) + '\t' + 'Process ' +
                str(indicestowrite[qwe][0]) + ' ' +
                str(indicestowrite[qwe][1]) + '\t' +
                file_raw[indicestowrite[qwe][0]:indicestowrite[qwe][1]] + '\n')
        annotation_file.close()
        X_test, y_test_gold, _, test_entities = read_and_map(
            'abstract_str', mapper)
        loaded_model = pickle.load(open('finalized_model_joined.sav', 'rb'))
        predictions = loaded_model.predict(X_test)
        y_values = ['Process', 'Material', 'Task']
        document_abbr = {}
        asd = os.listdir('abstract_str')
        for i in range(len(asd)):
            document_abbr[asd[i][:-4]] = {}

        for i in range(len(predictions)):
            if (test_entities[i].string == test_entities[i].string.upper()
                    and len(test_entities[i].string) > 1):
                if (y_values[predictions[i]] == "Material"):
                    predictions[i] = y_values.index("Process")

            if (test_entities[i].string
                    == test_entities[i].string.capitalize()
                    and len(test_entities[i].string) == 2):
                predictions[i] = y_values.index("Material")

            tmp = test_entities[i].string.split(" ")
            if (len(tmp) == 1):
                if (test_entities[i].string == test_entities[i].string.upper()
                        and hasNumbers(test_entities[i].string)):
                    predictions[i] = y_values.index("Material")

            if (test_entities[i].string == test_entities[i].string.upper()):
                try:
                    predictions[i] = document_abbr[test_entities[i].docid][
                        test_entities[i].string]
                except:
                    obracket = test_entities[i].start - 1
                    cbracket = test_entities[i].end
                    file = open(
                        'abstract_str/' + test_entities[i].docid + '.txt',
                        'r').read()
                    if (file[obracket] == '(' and file[cbracket] == ')'):
                        if (test_entities[i].start -
                                test_entities[i - 1].end == 2):
                            # print(test_entities[i].string, '\t',test_entities[i-1].string ,'\t' ,test_entities[i].start, '\t',test_entities[i-1].end )
                            document_abbr[test_entities[i].docid][
                                test_entities[i].string] = predictions[i - 1]
                            predictions[i] = predictions[i - 1]

            for j in range(len(tmp)):
                if (len(tmp[j]) == 1 and tmp[j] == tmp[j].upper()):
                    predictions[i] = y_values.index("Material")

        # print(predictions)

        n = 0
        result = []
        last_closing = 0
        for i in range(len(indicestowrite)):
            qwe_temp = file_raw[n:indicestowrite[i][0]]
            if (qwe_temp != ''):
                result.append(qwe_temp)
            temp = ''
            if (predictions[i] == 0):
                temp = '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i]
                    [1]] + '</strong></span>'
            elif (predictions[i] == 1):
                temp = '<span style="background-color:rgba(252, 152, 3, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i]
                    [1]] + '</strong></span>'
            elif (predictions[i] == 2):
                temp = '<span style="background-color:rgba(3, 152, 252, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i]
                    [1]] + '</strong></span>'

            if (indicestowrite[i][1] > last_closing):
                result.append(temp)
                last_closing = indicestowrite[i][1]
                n = indicestowrite[i][1]
            # else:
            #     ov_string = file_raw[indicestowrite[i][0]:indicestowrite[i][1]]
            #     temp_start = result[-1].index(ov_string)
            #     result[-1] = result[-1][:temp_start] + temp + result[-1][ temp_start+indicestowrite[i][1] - indicestowrite[i][0]:]

            # result += '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' +  file_raw[i[0]:i[1]] + '</strong></span>'

        result += file_raw[n:]
        # print(result)
        result = "".join(result)
    return render_template("index.html", keyphrases=result)
class for_BERT():
    def __init__(self, mode='training', language='ko', version=1.0):
        version = str(version)
        self.mode = mode
        if language == 'en':
            data_path = dir_path + '/koreanframenet/resource/info/fn' + version + '_'
        else:
            data_path = dir_path + '/koreanframenet/resource/info/kfn' + version + '_'
        with open(data_path + 'lu2idx.json', 'r') as f:
            self.lu2idx = json.load(f)
        if version == '1.5':
            fname = dir_path + '/koreanframenet/resource/info/fn1.5_frame2idx.json'
        else:
            fname = dir_path + '/koreanframenet/resource/info/fn1.7_frame2idx.json'
        with open(fname, 'r') as f:
            #self.sense2idx = json.load(f)
            self.frame2idx = json.load(f)
        with open(data_path + 'lufrmap.json', 'r') as f:
            #self.lusensemap = json.load(f)
            self.lufrmap = json.load(f)
        with open(dir_path + '/koreanframenet/resource/info/fn1.7_fe2idx.json',
                  'r') as f:
            self.arg2idx = json.load(f)
        with open(
                dir_path + '/koreanframenet/resource/info/fn1.7_frargmap.json',
                'r') as f:
            self.frargmap = json.load(f)
        with open(
                dir_path +
                '/koreanframenet/resource/info/fn1.7_bio_fe2idx.json',
                'r') as f:
            self.bio_arg2idx = json.load(f)
        with open(
                dir_path +
                '/koreanframenet/resource/info/fn1.7_bio_frargmap.json',
                'r') as f:
            self.bio_frargmap = json.load(f)

        self.idx2frame = dict(
            zip(self.frame2idx.values(), self.frame2idx.keys()))
        self.idx2lu = dict(zip(self.lu2idx.values(), self.lu2idx.keys()))
        self.idx2arg = dict(zip(self.arg2idx.values(), self.arg2idx.keys()))
        self.idx2bio_arg = dict(
            zip(self.bio_arg2idx.values(), self.bio_arg2idx.keys()))

        # load pretrained BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)

        # load BERT tokenizer with untokenizing frames
        never_split_tuple = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
        added_never_split = []
        added_never_split.append('<tgt>')
        added_never_split.append('</tgt>')
        #         for frame in self.frame2idx:
        #             added_never_split.append('['+frame+']')
        added_never_split_tuple = tuple(added_never_split)
        never_split_tuple += added_never_split_tuple
        vocab_file_path = dir_path + '/data/bert-multilingual-cased-dict-add-frames'
        self.tokenizer_with_frame = BertTokenizer(
            vocab_file_path,
            do_lower_case=False,
            max_len=512,
            never_split=never_split_tuple)

    def idx2tag(self, predictions, model='frameid'):
        if model == 'frameid':
            pred_tags = [self.idx2frame[p_i] for p in predictions for p_i in p]
        elif model == 'argclassification':
            pred_tags = [self.idx2arg[p_i] for p in predictions for p_i in p]
        elif model == 'argid':
            pred_tags = [
                self.idx2bio_arg[p_i] for p in predictions for p_i in p
            ]
        return pred_tags

    def get_masks(self, datas, model='frameid'):
        if model == 'frameid':
            mapdata = self.lufrmap
            num_label = len(self.frame2idx)
        elif model == 'argclassification':
            mapdata = self.frargmap
            num_label = len(self.arg2idx)
        elif model == 'argid':
            mapdata = self.bio_frargmap
            num_label = len(self.bio_arg2idx)
        masks = []
        for idx in datas:
            mask = torch.zeros(num_label)
            try:
                candis = mapdata[str(int(idx[0]))]
            except KeyboardInterrupt:
                raise
            except:
                candis = mapdata[int(idx[0])]
            for candi_idx in candis:
                mask[candi_idx] = 1
            masks.append(mask)
        masks = torch.stack(masks)
        return masks

    # bert tokenizer and assign to the first token
    def bert_tokenizer(self, text):
        orig_tokens = text.split(' ')
        bert_tokens = []
        orig_to_tok_map = []
        bert_tokens.append("[CLS]")
        for orig_token in orig_tokens:
            orig_to_tok_map.append(len(bert_tokens))
            bert_tokens.extend(self.tokenizer_with_frame.tokenize(orig_token))
        bert_tokens.append("[SEP]")

        return orig_tokens, bert_tokens, orig_to_tok_map

    def convert_to_bert_input_frameid(self, input_data):
        tokenized_texts, lus, frames = [], [], []

        for i in range(len(input_data)):
            data = input_data[i]
            text = ' '.join(data[0])
            orig_tokens, bert_tokens, orig_to_tok_map = self.bert_tokenizer(
                text)
            tokenized_texts.append(bert_tokens)

            ori_lus = data[1]
            lu_sequence = []
            for i in range(len(bert_tokens)):
                if i in orig_to_tok_map:
                    idx = orig_to_tok_map.index(i)
                    l = ori_lus[idx]
                    lu_sequence.append(l)
                else:
                    lu_sequence.append('_')
            lus.append(lu_sequence)

            if self.mode == 'training':
                ori_frames = data[2]
                frame_sequence = []
                for i in range(len(bert_tokens)):
                    if i in orig_to_tok_map:
                        idx = orig_to_tok_map.index(i)
                        l = ori_frames[idx]
                        frame_sequence.append(l)
                    else:
                        frame_sequence.append('_')
                frames.append(frame_sequence)

        input_ids = pad_sequences([
            self.tokenizer.convert_tokens_to_ids(txt)
            for txt in tokenized_texts
        ],
                                  maxlen=MAX_LEN,
                                  dtype="long",
                                  truncating="post",
                                  padding="post")

        tgt_seq, lu_seq, frame_seq = [], [], []
        for sent_idx in range(len(lus)):
            lu_items = lus[sent_idx]
            tgt, lu = [], []
            for idx in range(len(lu_items)):
                if lu_items[idx] != '_':
                    if len(tgt) == 0:
                        tgt.append(idx)
                        lu.append(self.lu2idx[lu_items[idx]])
            tgt_seq.append(tgt)
            lu_seq.append(lu)

            if self.mode == 'training':
                frame_items = frames[sent_idx]
                frame = []
                for idx in range(len(frame_items)):
                    if frame_items[idx] != '_':
                        if len(frame) == 0:
                            frame.append(self.frame2idx[frame_items[idx]])
                frame_seq.append(frame)

        attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
        data_inputs = torch.tensor(input_ids)
        data_tgt_idx = torch.tensor(tgt_seq)
        data_lus = torch.tensor(lu_seq)
        data_frames = torch.tensor(frame_seq)
        data_masks = torch.tensor(attention_masks)

        if self.mode == 'training':
            bert_inputs = TensorDataset(data_inputs, data_tgt_idx, data_lus,
                                        data_frames, data_masks)
        else:
            bert_inputs = TensorDataset(data_inputs, data_tgt_idx, data_lus,
                                        data_masks)
        return bert_inputs
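# A hedged, standalone sketch of the orig_to_tok_map built by bert_tokenizer()
# above: each original token maps to the index of its first word piece inside
# the [CLS] ... [SEP] sequence. The splitter is a stub, not the real
# multilingual tokenizer.
def align(orig_tokens, subtokenize):
    bert_tokens, orig_to_tok_map = ["[CLS]"], []
    for tok in orig_tokens:
        orig_to_tok_map.append(len(bert_tokens))
        bert_tokens.extend(subtokenize(tok))
    bert_tokens.append("[SEP]")
    return bert_tokens, orig_to_tok_map


stub_split = lambda t: [t] if len(t) < 3 else [t[:2], '##' + t[2:]]
print(align(['그는', '북한의', '지도자이다'], stub_split))
# (['[CLS]', '그는', '북한', '##의', '지도', '##자이다', '[SEP]'], [1, 2, 4])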
    def __init__(self,
                 path: str,
                 fields: List[Tuple[str, tt.data.Field]],
                 tokenizer: BertTokenizer,
                 max_length: int = 512,
                 include_features=False,
                 **kwargs):
        max_length = max_length - 3  # Count without special tokens

        with open(path) as dataf:
            data_json = json.load(dataf)
            examples = []
            # BERT inputs normally have at most two segments; here we build three:
            # [CLS] source post [SEP] previous post [SEP] examined post [SEP]

            for example in data_json["Examples"]:
                make_ids = lambda x: tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(x))
                text = make_ids(example["spacy_processed_text"])
                prev = make_ids(example["spacy_processed_text_prev"])
                src = make_ids(example["spacy_processed_text_src"])
                segment_A = src
                segment_C = prev
                segment_B = text
                text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                           [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]

                # truncate if exceeds max length
                if len(text_ids) > max_length:
                    # Truncate segment C (the previous post) first
                    segment_C = segment_C[:max_length // 2]
                    text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                               [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
                    if len(text_ids) > max_length:
                        # Truncate segment A
                        segment_A = segment_A[:max_length // 2]
                        text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                                   [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
                        if len(text_ids) > max_length:
                            # Truncate also segment B
                            segment_B = segment_B[:max_length // 2]
                            text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                                       [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]

                segment_ids = [0] * (len(segment_A) + 2) + [2] * (
                    len(segment_C) + 1) + [1] * (len(segment_B) + 1)
                # example_list = list(example.values())[:-3] + [text_ids, segment_ids]
                if include_features:
                    example_list = list(
                        example.values()) + [text_ids, segment_ids]
                else:
                    example_list = [
                        example["id"], example["branch_id"],
                        example["tweet_id"], example["stance_label"],
                        example["veracity_label"], "\n-----------\n".join([
                            example["raw_text_src"], example["raw_text_prev"],
                            example["raw_text"]
                        ]), example["issource"]
                    ] + [text_ids, segment_ids]

                examples.append(Example.fromlist(example_list, fields))
            super(RumourEval2019Dataset_BERTTriplets_3Segments,
                  self).__init__(examples, fields, **kwargs)
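# A compact sketch of the [CLS] A [SEP] C [SEP] B [SEP] layout and segment ids
# built above, using made-up token ids (101 = [CLS], 102 = [SEP]):
CLS_ID, SEP_ID = 101, 102
segment_A = [11, 12]       # source post
segment_C = [21]           # previous post
segment_B = [31, 32, 33]   # post being classified
text_ids = [CLS_ID] + segment_A + [SEP_ID] + segment_C + [SEP_ID] + segment_B + [SEP_ID]
segment_ids = [0] * (len(segment_A) + 2) + [2] * (len(segment_C) + 1) + [1] * (len(segment_B) + 1)
print(text_ids)     # [101, 11, 12, 102, 21, 102, 31, 32, 33, 102]
print(segment_ids)  # [0, 0, 0, 0, 2, 2, 1, 1, 1, 1]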
Ejemplo n.º 29
0
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so Japanese text can be segmented before feeding it to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # Load the tokenizer of the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag for whether to use a CUDA GPU
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: remove half-width spaces from the text
        try:
            return text.replace(" ", "")  # for Juman
        except:
            return ''

    def get_sentence_embedding(self,
                               text,
                               pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        # Remove half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text with Juman into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run them through the BERT tokenizer
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The input is capped at 128 tokens, so build [CLS] + up to 126 tokens + [SEP]
        # Convert the tokens to ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Move the input tensor and the model to the GPU
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        # Switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            # Disable autograd (saves memory and speeds things up)
            # Compute vector representations from the id sequence
            all_encoder_layers, _ = self.model(tokens_tensor)

            # Average-pool the vectors along the time axis, reportedly the same way as SWEM
            # The sequence length varies per sentence, so pool along that axis to get a fixed-size vector
            # https://yag-ays.github.io/project/swem/
            embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
            if pooling_strategy == "REDUCE_MEAN":
                return np.mean(embedding, axis=0)
            elif pooling_strategy == "REDUCE_MAX":
                return np.max(embedding, axis=0)
            elif pooling_strategy == "REDUCE_MEAN_MAX":
                return np.r_[np.max(embedding, axis=0),
                             np.mean(embedding, axis=0)]
            elif pooling_strategy == "CLS_TOKEN":
                return embedding[0]
            else:
                raise ValueError(
                    "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}"
                )
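# A standalone sketch of the pooling strategies above, with a random
# (seq_len, hidden_size) matrix standing in for a real BERT layer output:
import numpy as np

embedding = np.random.rand(5, 768)                  # 5 tokens, hidden size 768
reduce_mean = np.mean(embedding, axis=0)            # (768,)
reduce_max = np.max(embedding, axis=0)              # (768,)
reduce_mean_max = np.r_[reduce_max, reduce_mean]    # (1536,)
cls_token = embedding[0]                            # (768,)
print(reduce_mean.shape, reduce_max.shape, reduce_mean_max.shape, cls_token.shape)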
if __name__ == '__main__':
    args = parser.parse_args()
    assert os.path.exists(args.bert_model), '{} does not exist'.format(
        args.bert_model)
    assert os.path.exists(args.bert_vocab), '{} does not exist'.format(
        args.bert_vocab)
    assert args.topk > 0, '{} should be positive'.format(args.topk)

    print('Initialize BERT vocabulary from {}...'.format(args.bert_vocab))
    bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab)
    print('Initialize BERT model from {}...'.format(args.bert_model))
    bert_model = BertForMaskedLM.from_pretrained(args.bert_model)

    while True:
        message = input('Enter your message: ').strip()
        tokens = bert_tokenizer.tokenize(message)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
            logits = bert_model(token_idx,
                                segment_idx,
                                mask,
                                masked_lm_labels=None)
        logits = logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1)