Example 1
    def _dict_to_samples(self, dictionary, all_dicts=None):
        assert len(
            all_dicts
        ) > 1, "Need at least 2 documents to sample random sentences from"
        doc = dictionary["doc"]
        samples = []

        # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
        for idx in range(len(doc) - 1):
            tokenized = {}
            if self.next_sent_pred:
                text_a, text_b, is_next_label = get_sentence_pair(
                    doc, all_dicts, idx)
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": text_b,
                    "nextsentence_label": is_next_label,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer)
                tokenized["text_b"] = tokenize_with_metadata(
                    text_b, self.tokenizer)
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], tokenized["text_b"][
                        seq_name], _ = truncate_sequences(
                            seq_a=tokenized["text_a"][seq_name],
                            seq_b=tokenized["text_b"][seq_name],
                            tokenizer=self.tokenizer,
                            max_seq_len=self.max_seq_len)
                samples.append(
                    Sample(id=None,
                           clear_text=sample_in_clear_text,
                           tokenized=tokenized))
            # if we don't do next sentence prediction, we should feed in a single sentence
            else:
                text_a = doc[idx]
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": None,
                    "nextsentence_label": None,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer)
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                        seq_a=tokenized["text_a"][seq_name],
                        seq_b=None,
                        tokenizer=self.tokenizer,
                        max_seq_len=self.max_seq_len)
                samples.append(
                    Sample(id=None,
                           clear_text=sample_in_clear_text,
                           tokenized=tokenized))
        return samples
Example 2
def test_fast_tokenizer_with_metadata_with_examples(caplog, model_name):
    fast_tokenizer = Tokenizer.load(model_name,
                                    lower_case=False,
                                    use_fast=True)
    tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)

    for text in TEXTS:
        # our tokenizer with metadata on "whitespace tokenized words"
        tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)
        fast_tokenized_meta = tokenize_with_metadata(text=text,
                                                     tokenizer=fast_tokenizer)

        # verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
        assert tokenized_meta == fast_tokenized_meta, f"Failed using {tokenizer.__class__.__name__}"
Example 3
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     if len(tokenized["tokens"]) == 0:
         text = dictionary["text"]
         logger.warning(
             f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}"
         )
         return []
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(
             seq_a=tokenized[seq_name],
             seq_b=None,
             tokenizer=self.tokenizer,
             max_seq_len=self.max_seq_len)
     # Samples don't have labels during Inference mode
     for task_name, task in self.tasks.items():
         if task_name in dictionary:
             label = float(dictionary[task_name])
             scaled_label = (label -
                             task["label_list"][0]) / task["label_list"][1]
             dictionary[task_name] = scaled_label
     if self.features:
         feats_embed = dictionary.pop("features")
         return [
             FeaturesEmbeddingSample(id=None,
                                     clear_text=dictionary,
                                     tokenized=tokenized,
                                     feat_embeds=feats_embed)
         ]
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
Example 4
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     if len(tokenized["tokens"]) == 0:
         text = dictionary["text"]
         logger.warning(
             f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}"
         )
         return []
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(
             seq_a=tokenized[seq_name],
             seq_b=None,
             tokenizer=self.tokenizer,
             max_seq_len=self.max_seq_len)
     # Samples don't have labels during Inference mode
     for task_name, task in self.tasks.items():
         if task_name in dictionary:
             scaled_dict_labels = []
             for label in dictionary[task_name]:
                 label = float(label)
                 scaled_label = (
                     label - task["label_list"][0]) / task["label_list"][1]
                 scaled_dict_labels.append(scaled_label)
             dictionary[task_name] = scaled_dict_labels
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
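The regression examples above rescale each label with (label - label_list[0]) / label_list[1]. A minimal sketch of that arithmetic, assuming label_list holds an (offset, scale) pair such as a training-set mean and standard deviation (the values below are made up):

# Hypothetical illustration of the label scaling used in _dict_to_samples above.
# Assumption: task["label_list"] stores (offset, scale), e.g. a training-set mean and std.
task = {"label_list": [3.0, 1.5]}      # offset = 3.0, scale = 1.5 (made-up values)
raw_labels = [1.5, 3.0, 6.0]
scaled = [(float(label) - task["label_list"][0]) / task["label_list"][1] for label in raw_labels]
print(scaled)  # [-1.0, 0.0, 2.0]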
Example 5
def test_bert_tokenizer_all_meta(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    # original tokenizer from transformer repo
    tokenized = tokenizer.tokenize(basic_text)
    assert tokenized == [
        'Some', 'Text', 'with', 'never', '##see', '##nto', '##ken', '##s',
        'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-',
        'token', '_', 'with', '/', 'ch', '##ars'
    ]

    # ours with metadata
    tokenized_meta = tokenize_with_metadata(text=basic_text,
                                            tokenizer=tokenizer)
    assert tokenized_meta["tokens"] == tokenized
    assert tokenized_meta["offsets"] == [
        0, 5, 10, 15, 20, 23, 26, 29, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58,
        59, 64, 65, 69, 70, 72
    ]
    assert tokenized_meta["start_of_word"] == [
        True, True, True, True, False, False, False, False, True, True, False,
        False, False, False, True, True, True, False, False, False, False,
        False, False, False
    ]
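A brief sketch of what the offsets enable, reusing tokenized_meta and basic_text from the test above: strip the WordPiece continuation prefix and slice the original text at each offset. This assumes the tokens keep the original characters, which holds for bert-base-cased on this text but not for tokens mapped to [UNK].

import re

for tok, offset in zip(tokenized_meta["tokens"], tokenized_meta["offsets"]):
    clean_tok = re.sub(r"^##", "", tok)  # drop the "##" continuation prefix
    assert basic_text[offset:offset + len(clean_tok)] == clean_tok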
Example 6
def test_all_tokenizer_on_special_cases(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
    tokenizers = []
    for lang_name in lang_names:
        t = Tokenizer.load(lang_name, lower_case=False)
        tokenizers.append(t)

    texts = [
        "This is a sentence",
        "Der entscheidende Pass",
        "This      is a sentence with multiple spaces",
        "力加勝北区ᴵᴺᵀᵃছজটডণত",
        "Thiso text is included tolod makelio sure Unicodeel is handled properly:",
        "This is a sentence...",
        "Let's see all on this text and. !23# neverseenwordspossible",
        """This is a sentence.
    With linebreak""",
        "This is a sentence with    tab",
    ]

    for tokenizer in tokenizers:
        for text in texts:
            # Important: we don't assume whitespace to be preserved after tokenization.
            # This means: \t, \n, "  " etc. will all resolve to a single " ".
            # This doesn't make a difference for BERT + XLNet, but it does for RoBERTa

            # 1. original tokenize function from transformer repo on full sentence
            standardized_whitespace_text = ' '.join(text.split()) # remove multiple whitespaces

            tokenized = tokenizer.tokenize(standardized_whitespace_text)
            tokenized_by_word = []
            # 2. original tokenize function from transformer repo on "whitespace tokenized words"
            for i, tok in enumerate(text.split(" ")):
                if i == 0:
                    tokenized_tok = tokenizer.tokenize(tok)
                else:
                    try:
                        tokenized_tok = tokenizer.tokenize(tok, add_prefix_space=True)
                    except TypeError:
                        tokenized_tok = tokenizer.tokenize(tok)
                tokenized_by_word.extend(tokenized_tok)
            assert tokenized == tokenized_by_word

            # 3. our tokenizer with metadata on "whitespace tokenized words"
            tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)

            # verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
            assert tokenized_meta["tokens"] == tokenized, f"Failed using {tokenizer.__class__.__name__}"

            # verify that offsets align back to original text
            if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
                # contains [UNK] that are impossible to match back to original text space
                continue
            for tok, offset in zip(tokenized_meta["tokens"], tokenized_meta["offsets"]):
                # subword tokens carry model-specific prefix chars; strip them to align with the original text
                tok = re.sub(r"^(##|Ġ|▁)", "", tok)
                #tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
                original_tok = text[offset:offset+len(tok)]
                assert tok == original_tok, f"Offset alignment wrong for {tokenizer.__class__.__name__} and text '{text}'"
Example 7
 def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
     # Samples don't have labels during Inference mode
     if "label" in dict:
         dict["label"] = float(dict["label"])
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example 8
def split_text_token_wise_with_metadata(text,
                                        tokenizer,
                                        min_chunk_size=30,
                                        max_chunk_size=100):
    tokenized_text = tokenize_with_metadata(text, tokenizer)

    token_len = len(tokenized_text["tokens"])
    chunk_size = random.randint(min_chunk_size, max_chunk_size)

    # calculate nr of even chunks with chunksize < chunk_size
    nr_of_chunks = math.ceil(token_len / chunk_size)

    chunks = []

    for i, key in enumerate(tokenized_text.keys()):
        key_chunks = np.array_split(np.array(tokenized_text[key]),
                                    nr_of_chunks)

        # update each dict with chunked key
        for j in range(nr_of_chunks):
            if len(chunks) > j:
                chunks[j][key] = key_chunks[j].tolist()
            else:
                chunks.append({key: key_chunks[j].tolist()})

    # reconstruct clear text from offsets
    for k in range(nr_of_chunks):
        chunks[k]["clear_text"] = text[
            chunks[k]["offsets"][0]:chunks[k]["offsets"][-1] + 1]

    return chunks
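A minimal usage sketch for the chunking helper above, assuming a FARM-style Tokenizer as loaded in the other examples (long_text is just a stand-in string):

# Usage sketch: chunk a longer text into token-wise pieces with aligned metadata.
tokenizer = Tokenizer.load("bert-base-cased", do_lower_case=False)
long_text = " ".join(["This is one of many sentences in a longer document."] * 100)
chunks = split_text_token_wise_with_metadata(long_text, tokenizer,
                                             min_chunk_size=30,
                                             max_chunk_size=100)
for chunk in chunks:
    # each chunk carries "tokens", "offsets", "start_of_word" plus the reconstructed "clear_text"
    print(len(chunk["tokens"]), chunk["clear_text"][:40])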
Example 9
def test_fast_bert_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False,
                               use_fast=True)

    #deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt")
    tokenizer.add_tokens(new_tokens=["neverseentokens"])

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    # original tokenizer from transformer repo
    tokenized = tokenizer.tokenize(basic_text)
    assert tokenized == [
        'Some', 'Text', 'with', 'neverseentokens', 'plus', '!', '215', '?',
        '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch',
        '##ars'
    ]

    # ours with metadata
    tokenized_meta = tokenize_with_metadata(text=basic_text,
                                            tokenizer=tokenizer)
    assert tokenized_meta["tokens"] == tokenized
    assert tokenized_meta["offsets"] == [
        0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69,
        70, 72
    ]
    assert tokenized_meta["start_of_word"] == [
        True, True, True, True, True, True, False, False, False, False, True,
        True, True, False, False, False, False, False, False, False
    ]
Example 10
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None, tokenizer=self.tokenizer,
                                             max_seq_len=self.max_seq_len)
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
Example 11
 def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
     # Samples don't have labels during Inference mode
     if "label" in dict:
         label = float(dict["label"])
         scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
         dict["label"] = scaled_label
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example 12
    def apply_tokenization(self, dictionary):
        """ This performs tokenization on all documents and questions. The result is a list (unnested)
        where each entry is a dictionary for one document-question pair (potentially multiple answers). """

        raw_baskets = []
        document_text = dictionary["context"]
        document_tokenized = tokenize_with_metadata(document_text, self.tokenizer)
        document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]]
        questions = dictionary["qas"]
        for question in questions:
            answers = []
            # For training and dev where labelled samples are read in from a SQuAD style file
            try:
                squad_id = question["id"]
                question_text = question["question"]
                for answer in question["answers"]:
                    a = {"text": answer["text"],
                         "offset": answer["answer_start"]}
                    answers.append(a)
            # For inference where samples are read in as dicts without an id or answers
            except TypeError:
                squad_id = None
                question_text = question
            question_tokenized = tokenize_with_metadata(question_text, self.tokenizer)
            question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]]

            if "is_impossible" not in question:
                is_impossible = False
            else:
                is_impossible = question["is_impossible"]
            raw = {"document_text": document_text,
                   "document_tokens": document_tokenized["tokens"],
                   "document_offsets": document_tokenized["offsets"],
                   "document_start_of_word": document_start_of_word,
                   "question_text": question_text,
                   "question_tokens": question_tokenized["tokens"],
                   "question_offsets": question_tokenized["offsets"],
                   "question_start_of_word": question_start_of_word,
                   "answers": answers,
                   "is_impossible": is_impossible,
                   "squad_id": squad_id}
            raw_baskets.append(raw)
        return raw_baskets
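For reference, a rough sketch of the SQuAD-style dictionary shape that apply_tokenization above reads (the field names follow the keys accessed in the code; the processor instance and all values are hypothetical):

dictionary = {
    "context": "FARM stands for Framework for Adapting Representation Models.",
    "qas": [
        {
            "id": "hypothetical-id-0",
            "question": "What does FARM stand for?",
            "answers": [{"text": "Framework for Adapting Representation Models",
                         "answer_start": 16}],
            "is_impossible": False,
        }
    ],
}
raw_baskets = processor.apply_tokenization(dictionary)  # one dict per document-question pair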
Example 13
    def apply_tokenization(self, dictionary):
        """ This performs tokenization on all documents and questions. The result is a list (unnested)
        where each entry is a dictionary for one document-question pair (potentially multiple answers). """

        raw_baskets = []
        document_text = dictionary["context"]
        document_tokenized = tokenize_with_metadata(document_text,
                                                    self.tokenizer)
        document_start_of_word = [
            int(x) for x in document_tokenized["start_of_word"]
        ]
        questions = dictionary["qas"]
        for question in questions:
            squad_id = question["id"]
            question_text = question["question"]
            question_tokenized = tokenize_with_metadata(
                question_text, self.tokenizer)
            question_start_of_word = [
                int(x) for x in question_tokenized["start_of_word"]
            ]
            answers = []
            for answer in question["answers"]:
                a = {"text": answer["text"], "offset": answer["answer_start"]}
                answers.append(a)
            if "is_impossible" not in question:
                is_impossible = False
            else:
                is_impossible = question["is_impossible"]
            raw = {
                "document_text": document_text,
                "document_tokens": document_tokenized["tokens"],
                "document_offsets": document_tokenized["offsets"],
                "document_start_of_word": document_start_of_word,
                "question_text": question_text,
                "question_tokens": question_tokenized["tokens"],
                "question_offsets": question_tokenized["offsets"],
                "question_start_of_word": question_start_of_word,
                "answers": answers,
                "is_impossible": is_impossible,
                "squad_id": squad_id
            }
            raw_baskets.append(raw)
        return raw_baskets
Example 14
    def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
        if "paragraphs" not in dictionary:  # TODO change this inference mode hack
            dictionary = self._convert_rest_api_dict(infer_dict=dictionary)
        samples = create_samples_squad(entry=dictionary)
        for sample in samples:
            tokenized = tokenize_with_metadata(text=" ".join(
                sample.clear_text["doc_tokens"]),
                                               tokenizer=self.tokenizer)
            sample.tokenized = tokenized

        return samples
Example 15
def create_samples_sentence_pairs(baskets, tokenizer, max_seq_len):
    """Creates examples for Language Model Finetuning that consist of two sentences and the isNext label indicating if
     the two are subsequent sentences from one doc"""
    all_docs = [b.raw["doc"] for b in baskets]
    for basket in tqdm(baskets):
        doc = basket.raw["doc"]
        basket.samples = []
        for idx in range(len(doc) - 1):
            id = "%s-%s" % (basket.id, idx)
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_docs, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "is_next_label": is_next_label,
            }
            tokenized = {}
            tokenized["text_a"] = tokenize_with_metadata(text_a, tokenizer, max_seq_len)
            tokenized["text_b"] = tokenize_with_metadata(text_b, tokenizer, max_seq_len)
            basket.samples.append(Sample(id=id, clear_text=sample_in_clear_text, tokenized=tokenized))
    return baskets
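A rough sketch of the input this function iterates over; only the basket attributes it actually touches (.id, .raw["doc"], .samples) are mimicked here with a hypothetical stand-in class, and the tokenizer is loaded as in the other examples:

class _Basket:
    # minimal stand-in for a FARM SampleBasket
    def __init__(self, id, doc):
        self.id = id
        self.raw = {"doc": doc}   # "doc" = list of sentences from one document
        self.samples = None

tokenizer = Tokenizer.load("bert-base-cased", do_lower_case=False)
baskets = [
    _Basket("doc-0", ["First sentence of doc 0.", "Second sentence.", "Third sentence."]),
    _Basket("doc-1", ["Another document starts here.", "And it continues here."]),
]
baskets = create_samples_sentence_pairs(baskets, tokenizer, max_seq_len=128)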
Example 16
def test_save_load(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
    tokenizers = []
    for lang_name in lang_names:
        t = Tokenizer.load(lang_name, lower_case=False)
        t.add_tokens(new_tokens=["neverseentokens"])
        tokenizers.append(t)

    basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    for tokenizer in tokenizers:
        save_dir = f"testsave"
        tokenizer_type = tokenizer.__class__.__name__
        tokenizer.save_pretrained(save_dir)
        tokenizer_loaded = Tokenizer.load(save_dir, tokenizer_class=tokenizer_type)
        tokenized_before = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
        tokenized_after = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer_loaded)
        assert tokenized_before == tokenized_after
Example 17
 def _dict_to_samples(cls, dict, all_dicts=None):
     doc = dict["doc"]
     samples = []
     for idx in range(len(doc) - 1):
         text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
         sample_in_clear_text = {
             "text_a": text_a,
             "text_b": text_b,
             "is_next_label": is_next_label,
         }
         tokenized = {}
         tokenized["text_a"] = tokenize_with_metadata(
             text_a, cls.tokenizer, cls.max_seq_len
         )
         tokenized["text_b"] = tokenize_with_metadata(
             text_b, cls.tokenizer, cls.max_seq_len
         )
         samples.append(
             Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized)
         )
     return samples
Example 18
def _get_predictions_inner(sentence, tokenizer, model, device):
    meta = tokenize_with_metadata(sentence, tokenizer)
    sent_tokens, offsets, start_of_words = meta["tokens"], meta[
        "offsets"], meta["start_of_word"]
    indexed_tokens = tokenizer.convert_tokens_to_ids(sent_tokens)
    # create 1 * T input token tensor
    tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
    tokens_tensor = tokens_tensor.to(device)
    with torch.no_grad():
        log_probs = model(tokens_tensor)[0].log_softmax(dim=2).squeeze()
    return list(
        zip(sent_tokens, indexed_tokens, (None, ) + log_probs.unbind(),
            offsets, start_of_words))
Example 19
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None,
                                                        tokenizer=self.tokenizer,
                                                        max_seq_len=self.max_seq_len)
     # Samples don't have labels during Inference mode
     if "label" in dictionary:
         label = float(dictionary["label"])
         scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
         dictionary["label"] = scaled_label
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
Example 20
    def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
        # TODO split samples that are too long in this function, related to todo in self._sample_to_features
        if "paragraphs" not in dict:  # TODO change this inference mode hack
            dict = cls._convert_inference(infer_dict=dict)
        samples = create_samples_squad(entry=dict)
        for sample in samples:
            tokenized = tokenize_with_metadata(
                text=" ".join(sample.clear_text["doc_tokens"]),
                tokenizer=cls.tokenizer,
                max_seq_len=cls.max_seq_len,
            )
            sample.tokenized = tokenized

        return samples
Example 21
def test_detokenization_in_fast_tokenizers(model_name):
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name,
                               use_fast=True)
    for text in TEXTS:
        tokens_with_metadata = tokenize_with_metadata(text, tokenizer)
        tokens = tokens_with_metadata["tokens"]

        detokenized = " ".join(tokens)
        detokenized = re.sub(r"(^|\s+)(##)", "", detokenized)

        detokenized_ids = tokenizer(detokenized,
                                    add_special_tokens=False)["input_ids"]
        detokenized_tokens = [
            tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids
        ]

        assert tokens == detokenized_tokens
Example 22
 def _dict_to_samples(cls, dict, all_dicts=None):
     """
     Converts a dict with a document to a sample (which will subsequently be featurized). It is used during prediction.
     
     This is a modified version of BertStyleLMProcessor._dict_to_samples from farm/data_handler/processor.py. It has been modified to create samples with just a single text, rather than two, as is the case for a normal BERT model.
     """
     doc = dict["doc"]
     samples = []
     for idx in range(len(doc) - 1):
         tokenized = {}
         tokenized["text_a"] = tokenize_with_metadata(
             doc[idx], cls.tokenizer, cls.max_seq_len)
         samples.append(
             Sample(id=None,
                    clear_text={"doc": doc[idx]},
                    tokenized=tokenized))
     return samples
Example 23
def get_sequence_pair(doc,
                      chunk,
                      chunk_clear_text,
                      all_baskets,
                      tokenizer,
                      max_num_tokens,
                      prob_next_sentence=0.5):
    """
    Get one sample from corpus consisting of two sequences. A sequence can consist of more than one sentence.
    With prob. 50% these are two subsequent sequences from one doc. With 50% the second sequence will be a
    random one from another document.

    :param doc: The current document.
    :type doc: [str]
    :param chunk: List of subsequent, tokenized sentences.
    :type chunk: [dict]
    :param chunk_clear_text: List of subsequent sentences.
    :type chunk_clear_text: [str]
    :param all_baskets: SampleBaskets containing multiple other docs from which we can sample the second sequence
    if we need a random one.
    :type all_baskets: [dict]
    :param tokenizer: Used to split a sentence (str) into tokens.
    :param max_num_tokens: Samples are truncated after this many tokens.
    :type max_num_tokens: int
    :return: (list, list, dict, int)
        tokenized seq a,
        tokenized seq b,
        sample in clear text with label,
        number of unused sentences in chunk
    """
    sequence_a = []
    sequence_b = []
    sample_in_clear_text = {"text_a": "", "text_b": ""}
    # determine how many segments from chunk go into sequence_a
    len_sequence_a = 0
    a_end = 1
    if len(chunk) >= 2:
        a_end = random.randrange(1, len(chunk))
    for i in range(a_end):
        sequence_a.append(chunk[i])
        sample_in_clear_text["text_a"] += f"{chunk_clear_text[i]} "
        len_sequence_a += len(chunk[i]["tokens"])
    sample_in_clear_text["text_a"].strip()

    # actual next sequence
    if (random.random() > prob_next_sentence) and (len(chunk) > 1):
        label = True
        for i in range(a_end, len(chunk)):
            sequence_b.append(chunk[i])
            sample_in_clear_text["text_b"] += f"{chunk_clear_text[i]} "
        sample_in_clear_text["text_b"].strip()
        sample_in_clear_text["nextsentence_label"] = True
        num_unused_segments = 0
    # edge case: split sequence in half
    elif (len(chunk) == 1) and len_sequence_a >= max_num_tokens:
        sequence_a = {}
        sequence_b = {}
        if int(len(chunk[0]["tokens"]) / 2) >= max_num_tokens:
            boundary = int(max_num_tokens / 2)
        else:
            boundary = int(len(chunk[0]["tokens"]) / 2)
        sequence_a["tokens"] = chunk[0]["tokens"][:boundary]
        sequence_a["offsets"] = chunk[0]["offsets"][:boundary]
        sequence_a["start_of_word"] = chunk[0]["start_of_word"][:boundary]
        sequence_b["tokens"] = chunk[0]["tokens"][boundary:]
        sequence_b["start_of_word"] = chunk[0]["start_of_word"][boundary:]
        # get offsets for sequence_b right
        seq_b_offset_start = chunk[0]["offsets"][boundary]
        sequence_b["offsets"] = [
            offset - seq_b_offset_start
            for offset in chunk[0]["offsets"][boundary:]
        ]
        # get clear text
        clear_text_boundary = chunk[0]["offsets"][boundary]
        sample_in_clear_text["text_a"] = chunk_clear_text[
            0][:clear_text_boundary]
        sample_in_clear_text["text_b"] = chunk_clear_text[0][
            clear_text_boundary:]
        sample_in_clear_text["text_a"].strip()
        sample_in_clear_text["text_b"].strip()
        sample_in_clear_text["nextsentence_label"] = True
        return [sequence_a], [sequence_b], sample_in_clear_text, 0
    # random next sequence
    else:
        label = False
        sequence_b_length = 0
        target_b_length = max_num_tokens - len_sequence_a
        random_doc = _get_random_doc(all_baskets, forbidden_doc=doc)

        random_start = random.randrange(len(random_doc))
        for i in range(random_start, len(random_doc)):
            current_sentence_tokenized = tokenize_with_metadata(
                random_doc[i], tokenizer)
            sequence_b.append(current_sentence_tokenized)
            sample_in_clear_text["text_b"] += f"{random_doc[i]} "
            sequence_b_length += len(current_sentence_tokenized["tokens"])
            if sequence_b_length >= target_b_length:
                break

        sample_in_clear_text["text_b"].strip()
        sample_in_clear_text["nextsentence_label"] = False

        # We didn't use all of the segments in chunk => put them back
        num_unused_segments = len(chunk) - a_end

    assert len(sequence_a) > 0
    assert len(sequence_b) > 0
    return sequence_a, sequence_b, sample_in_clear_text, num_unused_segments
Example 24
 def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example 25
 def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example 26
def fit_s3e_on_corpus(processor,
                      model,
                      corpus,
                      n_clusters=10,
                      mean_removal=True,
                      pca_removal=True,
                      pca_n_components=300,
                      pca_n_top_components=10,
                      default_token_weight=1,
                      min_token_occurrences=0,
                      svd_postprocessing=False,
                      use_gpu=False,
                      batch_size=50):
    """
    Pooling of word/token embeddings as described by Wang et al in the paper
    "Efficient Sentence Embedding via Semantic Subspace Analysis"
    (https://arxiv.org/abs/2002.09620)
    Adjusted their implementation from here: https://github.com/BinWang28/Sentence-Embedding-S3E

    This method fits the "model" on a custom corpus. This includes the derivation of token_weights depending on
    token occurrences in the corpus, creation of the semantic clusters via k-means and a couple of
    pre-/post-processing steps to normalize the embeddings.

    The resulting objects can be saved or directly passed to the Inferencer to get the actual embeddings for your sentences.
    Note: Some operations like `mean_removal` imply changes on the AdaptiveModel or Processor. That's why we return them.

    :param processor: FARM Processor with a Tokenizer used for reading the corpus (e.g. Inference Processor)
    :param model: FARM AdaptiveModel with an embedding layer in the LM (currently only supporting 'WordEmbedding_LM' as a language model)
    :param corpus: Path to a text file, or the corpus itself as a str
    :param n_clusters: Number of clusters for S3E. The more clusters, the higher the dimensionality of the resulting embeddings.
    :param mean_removal: Bool, whether to remove the mean from the token embeddings (preprocessing) 
    :param pca_removal: Bool, whether to remove pca components from the token embeddings (preprocessing)
    :param pca_n_components: int, how many PCA components to fit if `pca_removal` is enabled 
    :param pca_n_top_components: int, how many top PCA components to remove if `pca_removal` is enabled 
    :param default_token_weight: float, what weight to assign for tokens that are in vocab but not in corpus
    :param min_token_occurrences: int, minimum number of token occurrences in the corpus for keeping it in the vocab.
                                  Helps to shrink the model & speed it up.
    :param svd_postprocessing: Bool, whether to remove the top truncated SVD / LSA components from the sentence embeddings (postprocessing).
                               Note: Requires creating all sentence embeddings once for the corpus slowing down this method substantially.
                                     Doesn't impact later inference speed though.
    :param use_gpu: bool, whether to use a GPU
    :param batch_size: int, size of batch for the inferencer (only needed when `svd_postprocessing` is enabled)
    :return: model, processor, s3e_stats
    """

    from farm.infer import Inferencer
    from farm.modeling.tokenization import tokenize_with_metadata

    # Get tokens of corpus
    if isinstance(corpus, Path):
        logger.info("Reading corpus for fitting S3E ")
        with open(corpus, "r") as f:
            corpus = f.read()
    else:
        assert type(corpus) == str, "`corpus` must be of type str or Path()"

    tokenized_corpus = tokenize_with_metadata(corpus,
                                              processor.tokenizer)["tokens"]
    token_counts = dict(Counter(tokenized_corpus))
    n_tokens = sum(token_counts.values())

    # Trim vocab & embeddings to most frequent tokens (only to improve speed & ram consumption)
    model.language_model.trim_vocab(token_counts,
                                    processor,
                                    min_threshold=min_token_occurrences)

    # Normalize embeddings
    model.language_model.normalize_embeddings(
        zero_mean=mean_removal,
        pca_removal=pca_removal,
        pca_n_components=pca_n_components,
        pca_n_top_components=pca_n_top_components)
    normalized_word_embs = model.language_model.model.embeddings.cpu().numpy()

    # Get token weights
    token_weights = {}
    eps = 1e-3
    for word, id in processor.tokenizer.vocab.items():
        if word in token_counts:
            token_weights[id] = eps / (eps + token_counts[word] / n_tokens)
        else:
            # words that are in vocab but not present in corpus get the default weight
            token_weights[id] = default_token_weight

    # Construct Cluster
    weight_list = np.array(list(token_weights.values()))
    logger.info('Creating clusters for S3E embeddings')
    kmeans = KMeans(n_clusters=n_clusters,
                    random_state=42).fit(normalized_word_embs,
                                         sample_weight=weight_list)

    s3e_stats = {
        "token_to_cluster": kmeans.labels_,
        "centroids": kmeans.cluster_centers_,
        "token_weights": token_weights,
        "svd_components": None
    }

    if svd_postprocessing:
        logger.info(
            'Post processing sentence embeddings using principal component removal'
        )

        # Input
        sentences = [{
            "text": s
        } for s in corpus.split("\n") if len(s.strip()) > 0]

        # Get embeddings
        try:
            inferencer = Inferencer(model=model,
                                    processor=processor,
                                    task_type="embeddings",
                                    gpu=use_gpu,
                                    batch_size=batch_size,
                                    extraction_strategy="s3e",
                                    extraction_layer=-1,
                                    s3e_stats=s3e_stats)
            result = inferencer.inference_from_dicts(dicts=sentences)
        finally:
            inferencer.close_multiprocessing_pool()
        sentence_embeddings = [s["vec"] for s in result]
        sentence_embeddings = np.vstack(sentence_embeddings)

        # Principal Component Removal
        svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
        svd.fit(sentence_embeddings)
        s3e_stats["svd_components"] = svd.components_

    return model, processor, s3e_stats
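A hedged usage sketch for the fitting routine above; the processor and model construction (a FARM Processor plus an AdaptiveModel wrapping a WordEmbedding_LM) is assumed to happen elsewhere, and the corpus path is hypothetical:

from pathlib import Path

# processor and model are assumed to be built elsewhere with FARM's APIs.
model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                model=model,
                                                corpus=Path("data/my_corpus.txt"),  # hypothetical path
                                                n_clusters=10,
                                                svd_postprocessing=True,
                                                use_gpu=False)
# s3e_stats (cluster assignments, centroids, token weights, SVD components) can then be
# passed to an Inferencer with extraction_strategy="s3e", as done inside the function above.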
Example 27
 def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     words = re.findall(r"<t>(.*?)</t>", dict["text"], flags=0)
     word_one = words[0]
     term_one_idx = -1
     term_two_idx = -1
     term_one_idxs = [m.start() for m in re.finditer(re.escape(word_one), dict["text"])]
     for idx, k in enumerate(term_one_idxs):
         try:
             if dict["text"][k - 3:k] == '<t>':
                 term_one_idx = idx
         except:
             pass
     if len(words) > 1:
         word_two = words[1]
         word_two_tokenized = tokenize_with_metadata(word_two, self.tokenizer, self.max_seq_len)['tokens']
         term_two_idxs = [m.start() for m in re.finditer(re.escape(word_two), dict["text"])]
         for idx, k in enumerate(term_two_idxs):
             try:
                 if dict["text"][k - 3:k] == '<t>':
                     term_two_idx = idx
             except:
                 pass
     dict["text"] = re.sub(r'<t>', '', dict["text"])
     dict["text"] = re.sub(r'</t>', '', dict["text"])
     tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
     word_one_tokenized = tokenize_with_metadata(word_one, self.tokenizer, self.max_seq_len)['tokens']
     x1, y = [], []
     for token in tokenized['tokens']:
         if token == '[CLS]':
             x1.append(5)
             y.append('[CLS]')
         elif token == '[SEP]':
             x1.append(4)
             y.append('[SEP]')
         else:
             x1.append(0)
             y.append('N')
     idx = find_overlap(word_one_tokenized, tokenized['tokens'], term_one_idx)
     if idx > -1:
         for x in range(0, len(word_one_tokenized)):
             x1[idx + x] = 1
             y[idx + x] = 'Y'
     else:
         print("-1--")
         print(word_one_tokenized)
         print(tokenized['tokens'])
         x1, y = [], []
         for token in tokenized['tokens']:
             if token == '[CLS]':
                 x1.append(5)
                 y.append('[CLS]')
             elif token == '[SEP]':
                 x1.append(4)
                 y.append('[SEP]')
             else:
                 x1.append(0)
                 y.append('N')
     if len(words) > 1:
         idx = find_overlap(word_two_tokenized, tokenized['tokens'], term_two_idx)
         if idx > -1:
             for x in range(0, len(word_two_tokenized)):
                 y[idx + x] = 'Y'
                 x1[idx + x] = 1
         else:
             print("-2--")
             print(word_two_tokenized)
             print(tokenized['tokens'])
             x1, y = [], []
             for token in tokenized['tokens']:
                 if token == '[CLS]':
                     x1.append(5)
                     y.append('[CLS]')
                 elif token == '[SEP]':
                     x1.append(4)
                     y.append('[SEP]')
                 else:
                     x1.append(0)
                     y.append('N')
     tokenized['custom_data'] = x1
     tokenized['ner_label'] = y
     dict['custom_data'] = x1
     dict['ner_label'] = y
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]