Example 1
def xlnet_tokenization_siamese(tokenizer, left, right):
    input_ids, input_mask, all_seg_ids, s_tokens = [], [], [], []
    for i in range(len(left)):
        tokens = tokenize_fn(transformer_textcleaning(left[i]), tokenizer)
        tokens_right = tokenize_fn(transformer_textcleaning(right[i]),
                                   tokenizer)
        segment_ids = [SEG_ID_A] * len(tokens)
        tokens.append(SEP_ID)
        # note: s_tokens keeps only the left-side pieces (plus the first SEP).
        s_tokens.append([tokenizer.IdToPiece(tid) for tid in tokens])
        segment_ids.append(SEG_ID_A)

        tokens.extend(tokens_right)
        segment_ids.extend([SEG_ID_B] * len(tokens_right))
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_B)

        tokens.append(CLS_ID)
        segment_ids.append(SEG_ID_CLS)

        cur_input_ids = tokens
        # XLNet mask convention: 0 marks a real token, 1 marks padding (added by padding_sequence below).
        cur_input_mask = [0] * len(cur_input_ids)
        assert len(tokens) == len(cur_input_mask)
        assert len(tokens) == len(segment_ids)
        input_ids.append(tokens)
        input_mask.append(cur_input_mask)
        all_seg_ids.append(segment_ids)

    maxlen = max([len(i) for i in input_ids])
    input_ids = padding_sequence(input_ids, maxlen)
    input_mask = padding_sequence(input_mask, maxlen, pad_int=1)
    all_seg_ids = padding_sequence(all_seg_ids, maxlen, pad_int=4)
    return input_ids, input_mask, all_seg_ids, s_tokens
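Note: `padding_sequence`, `tokenize_fn`, `transformer_textcleaning`, and the `SEG_ID_*` / `SEP_ID` / `CLS_ID` constants are module-level helpers that the snippet above assumes. A minimal sketch of what `padding_sequence` is assumed to do (right-pad every sequence to a common length with a configurable pad value; the real helper may differ):

def padding_sequence(seqs, maxlen, pad_int=0):
    # Assumed behaviour: truncate each sequence to `maxlen`, then right-pad
    # with `pad_int`, matching how the snippets call it (pad_int=1 for masks,
    # pad_int=4 for XLNet segment ids).
    padded = []
    for seq in seqs:
        seq = list(seq)[:maxlen]
        padded.append(seq + [pad_int] * (maxlen - len(seq)))
    return padded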
Example 2
def xlnet_tokenization(tokenizer, texts):
    input_ids, input_masks, segment_ids, s_tokens = [], [], [], []
    for text in texts:
        text = transformer_textcleaning(text)
        tokens_a = tokenize_fn(text, tokenizer)[:MAXLEN]
        tokens = []
        segment_id = []
        for token in tokens_a:
            tokens.append(token)
            segment_id.append(SEG_ID_A)

        tokens.append(SEP_ID)
        segment_id.append(SEG_ID_A)
        tokens.append(CLS_ID)
        segment_id.append(SEG_ID_CLS)

        input_id = tokens
        input_mask = [0] * len(input_id)

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        s_tokens.append([tokenizer.IdToPiece(i) for i in tokens])

    maxlen = max([len(i) for i in input_ids])
    input_ids = padding_sequence(input_ids, maxlen)
    input_masks = padding_sequence(input_masks, maxlen, pad_int=1)
    segment_ids = padding_sequence(segment_ids, maxlen, pad_int=SEG_ID_PAD)

    return input_ids, input_masks, segment_ids, s_tokens
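The segment and special-token ids used above are plain integers. The values below follow the original XLNet code base and are shown only as an assumption; the `<sep>`/`<cls>` ids in particular depend on the SentencePiece vocabulary actually loaded, and MAXLEN is a hypothetical truncation limit:

# Assumed XLNet conventions (not part of the snippet above).
SEG_ID_A = 0     # first segment
SEG_ID_B = 1     # second segment
SEG_ID_CLS = 2   # the trailing <cls> token
SEG_ID_PAD = 4   # padding positions (matches pad_int=4 in the siamese variant)
SEP_ID = 4       # id of '<sep>' in the standard XLNet SentencePiece vocab
CLS_ID = 3       # id of '<cls>' in the standard XLNet SentencePiece vocab
MAXLEN = 512     # hypothetical cap on subword tokens before the special tokens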
Example 3
def bert_tokenization_siamese(tokenizer, left, right):
    input_ids, input_masks, segment_ids, s_tokens = [], [], [], []
    a, b = [], []
    for i in range(len(left)):
        tokens_a = tokenizer.tokenize(transformer_textcleaning(left[i]))
        logging.debug(tokens_a)
        tokens_b = tokenizer.tokenize(transformer_textcleaning(right[i]))
        logging.debug(tokens_b)
        a.append(tokens_a)
        b.append(tokens_b)

    for i in range(len(left)):
        tokens_a = a[i]
        tokens_b = b[i]

        tokens = []
        segment_id = []
        tokens.append('[CLS]')
        segment_id.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_id.append(0)

        tokens.append('[SEP]')
        s_tokens.append(tokens[:])
        segment_id.append(0)
        for token in tokens_b:
            tokens.append(token)
            segment_id.append(1)
        tokens.append('[SEP]')
        segment_id.append(1)
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)

    maxlen = max([len(i) for i in input_ids])
    input_ids = padding_sequence(input_ids, maxlen)
    input_masks = padding_sequence(input_masks, maxlen)
    segment_ids = padding_sequence(segment_ids, maxlen)

    return input_ids, input_masks, segment_ids, s_tokens
Example 4
    def paraphrase(
        self, string: str, beam_search: bool = True, split_fullstop: bool = True
    ):
        """
        Paraphrase a string.

        Parameters
        ----------
        string : str
        beam_search : bool, optional (default=True)
            If True, use beam search decoder, else use greedy decoder.
        split_fullstop: bool, optional (default=True)
            If True, will generate a paraphrase for each sentence split by full stop.

        Returns
        -------
        result: str
        """

        if split_fullstop:

            splitted_fullstop = split_into_sentences(
                transformer_textcleaning(string)
            )

            results, batch, mapping = [], [], {}
            for no, splitted in enumerate(splitted_fullstop):
                if len(splitted.split()) < 4:
                    results.append(splitted)
                else:
                    mapping[len(batch)] = no
                    results.append('REPLACE-ME')
                    batch.append(splitted)

            if len(batch):
                output = self._paraphrase(batch, beam_search=beam_search)
                for no in range(len(output)):
                    results[mapping[no]] = output[no]

            return ' '.join(results)

        else:
            return self._paraphrase([string], beam_search=beam_search)[0]
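The `mapping` dict records where each batched sentence must be written back, while 'REPLACE-ME' keeps short sentences in their original positions. A self-contained sketch of the same re-assembly trick, with a stub standing in for `self._paraphrase`:

def fake_paraphrase(batch):
    # Stand-in for the model call; just tags each sentence.
    return ['<paraphrased: %s>' % s for s in batch]

sentences = ['Ok.', 'Saya suka makan nasi goreng setiap pagi.', 'Bagus!']
results, batch, mapping = [], [], {}
for no, sentence in enumerate(sentences):
    if len(sentence.split()) < 4:
        results.append(sentence)        # too short, keep verbatim
    else:
        mapping[len(batch)] = no        # batch index -> position in results
        results.append('REPLACE-ME')
        batch.append(sentence)

output = fake_paraphrase(batch)
for no in range(len(output)):
    results[mapping[no]] = output[no]   # write each output back in place

print(' '.join(results))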
Example 5
def bert_tokenization(tokenizer, texts):
    input_ids, input_masks, segment_ids, s_tokens = [], [], [], []
    for text in texts:
        text = transformer_textcleaning(text)
        tokens_a = tokenizer.tokenize(text)[:MAXLEN]
        tokens = ['[CLS]'] + tokens_a + ['[SEP]']
        segment_id = [0] * len(tokens)
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        s_tokens.append(tokens)

    maxlen = max([len(i) for i in input_ids])
    input_ids = padding_sequence(input_ids, maxlen)
    input_masks = padding_sequence(input_masks, maxlen)
    segment_ids = padding_sequence(segment_ids, maxlen)

    return input_ids, input_masks, segment_ids, s_tokens
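bert_tokenization only needs a tokenizer exposing `tokenize` and `convert_tokens_to_ids` (plus the module-level `transformer_textcleaning`, `MAXLEN`, and `padding_sequence` helpers shown earlier). A usage sketch with a toy whitespace tokenizer, purely to make the expected interface explicit; the tokenizer and vocabulary here are made up:

class ToyTokenizer:
    # Hypothetical stand-in for a WordPiece tokenizer: whitespace split only.
    def __init__(self, vocab):
        self.vocab = vocab

    def tokenize(self, text):
        return text.lower().split()

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(t, self.vocab['[UNK]']) for t in tokens]

vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, 'saya': 4, 'suka': 5, 'makan': 6}
ids, masks, segments, tokens = bert_tokenization(ToyTokenizer(vocab), ['Saya suka makan'])
# Assuming transformer_textcleaning leaves this text unchanged:
# ids      -> [[2, 4, 5, 6, 3]]   ([CLS] saya suka makan [SEP])
# masks    -> [[1, 1, 1, 1, 1]]
# segments -> [[0, 0, 0, 0, 0]]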
Example 6
def parse_bert_tagging(left, tokenizer):
    left = transformer_textcleaning(left)
    bert_tokens = ['[CLS]'] + tokenizer.tokenize(left) + ['[SEP]']
    input_mask = [1] * len(bert_tokens)
    return tokenizer.convert_tokens_to_ids(
        bert_tokens), input_mask, bert_tokens
Example 7
def rake(
    string: str,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model that has an `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate n-grams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        Return top-k results.
    atleast: int, optional (default=1)
        Minimum count in the string to accept a phrase as a candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], a List[str], or a Tuple[str],
        used by the automatic n-gram generator.

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if model is not None:
        if not hasattr(model, 'attention'):
            raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if model:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in attention:
            d[k] += v

    else:
        d = None

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)
    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores)

    sortedKeywords = sorted(keywordcandidates.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

    total = sum([i[1] for i in sortedKeywords])

    ranked_sentences = [(i[1] / total, i[0]) for i in sortedKeywords
                        if vocab[i[0]] >= atleast]
    return ranked_sentences[:top_k]
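`rake_function.calculate_word_scores` is not shown here. Classic RAKE scores each word as degree/frequency over the candidate phrases, optionally weighted by attention; a hedged sketch of that scoring step (the real Malaya helper may differ in details):

from collections import defaultdict

def calculate_word_scores(phrase_list, attentions=None):
    # RAKE-style scoring: score(word) = degree(word) / frequency(word), where
    # degree counts the word itself plus its co-occurring words in each phrase.
    freq, degree = defaultdict(float), defaultdict(float)
    for phrase in phrase_list:
        words = phrase.split()
        for word in words:
            weight = attentions.get(word, 1.0) if attentions else 1.0
            freq[word] += weight
            degree[word] += weight * (len(words) - 1)
    return {w: (degree[w] + freq[w]) / max(freq[w], 1e-9) for w in freq}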
Example 8
def similarity_transformer(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    use_maxsum: bool = False,
    use_mmr: bool = False,
    diversity: float = 0.5,
    nr_candidates: int = 20,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Sentence embedding VS keyword embedding similarity.
    https://github.com/MaartenGr/KeyBERT/blob/master/keybert/model.py

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has a `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate n-grams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        Return top-k results.
    atleast: int, optional (default=1)
        Minimum count in the string to accept a phrase as a candidate.
    use_maxsum: bool, optional (default=False)
        Whether to use Max Sum Similarity.
    use_mmr: bool, optional (default=False)
        Whether to use Maximal Marginal Relevance (MMR).
    diversity: float, optional (default=0.5)
        The diversity of results between 0 and 1 if use_mmr is True.
    nr_candidates: int, optional (default=20)
        The number of candidates to consider if use_maxsum is set to True.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], a List[str], or a Tuple[str].

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if nr_candidates < top_k:
        raise ValueError('nr_candidates must be bigger than top_k')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])

    if use_mmr:
        # https://github.com/MaartenGr/KeyBERT/blob/master/keybert/mmr.py

        word_doc_similarity = cosine_similarity(vectors_keywords,
                                                vectors_string)
        word_similarity = cosine_similarity(vectors_keywords)
        keywords_idx = [np.argmax(word_doc_similarity)]
        candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
        for _ in range(top_k - 1):
            candidate_similarities = word_doc_similarity[candidates_idx, :]
            target_similarities = np.max(
                word_similarity[candidates_idx][:, keywords_idx], axis=1)

            mmr = (
                1 - diversity
            ) * candidate_similarities - diversity * target_similarities.reshape(
                -1, 1)
            mmr_idx = candidates_idx[np.argmax(mmr)]

            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)
        ranked_sentences = [(word_doc_similarity.reshape(1, -1)[0][idx],
                             words[idx]) for idx in keywords_idx]

    elif use_maxsum:
        # https://github.com/MaartenGr/KeyBERT/blob/master/keybert/maxsum.py

        distances = cosine_similarity(vectors_string, vectors_keywords)
        distances_words = cosine_similarity(vectors_keywords, vectors_keywords)
        words_idx = list(distances.argsort()[0][-nr_candidates:])
        words_vals = [words[index] for index in words_idx]
        candidates = distances_words[np.ix_(words_idx, words_idx)]
        min_sim = 100_000
        candidate = None
        for combination in itertools.combinations(range(len(words_idx)),
                                                  top_k):
            sim = sum([
                candidates[i][j] for i in combination for j in combination
                if i != j
            ])
            if sim < min_sim:
                candidate = combination
                min_sim = sim

        ranked_sentences = [(distances[0][idx], words_vals[idx])
                            for idx in candidate]

    else:
        distances = cosine_similarity(vectors_string, vectors_keywords)
        ranked_sentences = [(distances[0][index], words[index])
                            for index in distances.argsort()[0]][::-1]

    ranked_sentences = [i for i in ranked_sentences if vocab[i[1]] >= atleast]
    return ranked_sentences[:top_k]
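To see the MMR branch in isolation, here is a toy run on random embeddings using sklearn's cosine_similarity. It follows the same selection loop as above and is only meant to illustrate how `diversity` trades relevance against redundancy; the words and vectors are made up:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
words = ['nasi goreng', 'nasi', 'kereta api', 'kereta', 'cuaca panas']
vectors_keywords = rng.normal(size=(len(words), 8))
vectors_string = rng.normal(size=(1, 8))

diversity, top_k = 0.7, 3
word_doc_similarity = cosine_similarity(vectors_keywords, vectors_string)
word_similarity = cosine_similarity(vectors_keywords)

keywords_idx = [int(np.argmax(word_doc_similarity))]
candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
for _ in range(top_k - 1):
    candidate_similarities = word_doc_similarity[candidates_idx, :]
    target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
    mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
    mmr_idx = candidates_idx[int(np.argmax(mmr))]
    keywords_idx.append(mmr_idx)
    candidates_idx.remove(mmr_idx)

print([words[idx] for idx in keywords_idx])  # selected keywords, most relevant first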
Example 9
def attention(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has an `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate n-grams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        Return top-k results.
    atleast: int, optional (default=1)
        Minimum count in the string to accept a phrase as a candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], a List[str], or a Tuple[str].

    Returns
    -------
    result: List[Tuple[float, str]]
    """

    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    attention = model.attention([string])[0]
    d = defaultdict(float)
    for k, v in attention:
        d[k] += v

    scores = []
    for k in vocab.keys():
        scores.append(sum([d.get(w, 0) for w in k.split()]))

    total = sum(scores)

    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
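The attention-based ranking only needs the list of (token, weight) pairs returned by `model.attention`; the rest is dictionary arithmetic. A toy run with a hand-made attention output (the weights and phrases below are made up):

from collections import defaultdict

attention = [('kerajaan', 0.30), ('umum', 0.05), ('bantuan', 0.25),
             ('kewangan', 0.20), ('rakyat', 0.20)]
vocab = {'kerajaan': 1, 'bantuan kewangan': 1, 'rakyat': 1}

d = defaultdict(float)
for k, v in attention:
    d[k] += v

scores = [sum(d.get(w, 0) for w in phrase.split()) for phrase in vocab]
total = sum(scores)
ranked = sorted(((scores[i] / total, phrase) for i, phrase in enumerate(vocab)), reverse=True)
print(ranked)  # phrases that collect more attention mass rank first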
Example 10
def similarity(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Sentence embedding VS keyword embedding similarity.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has a `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate n-grams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        Return top-k results.
    atleast: int, optional (default=1)
        Minimum count in the string to accept a phrase as a candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], a List[str], or a Tuple[str].

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])

    distances = cosine_similarity(vectors_string, vectors_keywords)
    ranked_sentences = [(distances[0][index], words[index])
                        for index in distances.argsort()[0]][::-1]

    ranked_sentences = [i for i in ranked_sentences if vocab[i[1]] >= atleast]
    return ranked_sentences[:top_k]
Example 11
def parse_bert_tagging(left, tokenizer, space_after_punct=False):
    left = transformer_textcleaning(left, space_after_punct=space_after_punct)
    bert_tokens = ['[CLS]'] + tokenizer.tokenize(left) + ['[SEP]']
    input_mask = [1] * len(bert_tokens)
    logging.debug(bert_tokens)
    return tokenizer.convert_tokens_to_ids(bert_tokens), input_mask, bert_tokens
Example 12
def rake(string: str,
         model=None,
         top_k: int = 5,
         auto_ngram: bool = True,
         ngram_method: str = 'bow',
         ngram: Tuple[int, int] = (1, 1),
         atleast: int = 1,
         stop_words: List[str] = STOPWORDS,
         **kwargs):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model that has an `attention` method.
    top_k: int, optional (default=5)
        Return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using suitable n-grams. Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. Supported n-gram generators:

        * ``'bow'`` - bag-of-words.
        * ``'skipgram'`` - bag-of-words with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-gram size.
    atleast: int, optional (default=1)
        Minimum count in the string to accept a phrase as a candidate.
    stop_words: list, (default=malaya.text.function.STOPWORDS)
        List of stop words to remove.

    Returns
    -------
    result: List[Tuple[float, str]]
    """

    if model is not None:
        if not hasattr(model, 'attention'):
            raise ValueError('model must have an `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in ('bow', 'skipgram'):
        raise ValueError("ngram_method must be in ['bow', 'skipgram']")
    if auto_ngram and not len(stop_words):
        raise ValueError('insert stop_words if auto_ngram')

    if model:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in attention:
            d[k] += v

    else:
        d = None

    if auto_ngram:
        vocab = _auto_ngram(string, stop_words)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stop_words=stop_words,
                      **kwargs)
    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores)

    sortedKeywords = sorted(keywordcandidates.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

    total = sum([i[1] for i in sortedKeywords])

    ranked_sentences = [(i[1] / total, i[0]) for i in sortedKeywords
                        if vocab[i[0]] >= atleast]
    return ranked_sentences[:top_k]
Example 13
def attention(string: str,
              model,
              top_k: int = 5,
              auto_ngram: bool = True,
              ngram_method: str = 'bow',
              ngram: Tuple[int, int] = (1, 1),
              atleast: int = 1,
              stop_words: List[str] = STOPWORDS,
              **kwargs):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has an `attention` method.
    top_k: int, optional (default=5)
        Return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using suitable n-grams. Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. Supported n-gram generators:

        * ``'bow'`` - bag-of-words.
        * ``'skipgram'`` - bag-of-words with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-gram size.
    atleast: int, optional (default=1)
        Minimum count in the string to accept a phrase as a candidate.
    stop_words: list, (default=malaya.text.function.STOPWORDS)
        List of stop words to remove.

    Returns
    -------
    result: List[Tuple[float, str]]
    """

    if not hasattr(model, 'attention'):
        raise ValueError('model must have an `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in ('bow', 'skipgram'):
        raise ValueError("ngram_method must be in ['bow', 'skipgram']")
    if auto_ngram and not len(stop_words):
        raise ValueError('insert stop_words if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stop_words)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stop_words=stop_words,
                      **kwargs)

    attention = model.attention([string])[0]
    d = defaultdict(float)
    for k, v in attention:
        d[k] += v

    scores = []
    for k in vocab.keys():
        scores.append(sum([d.get(w, 0) for w in k.split()]))

    total = sum(scores)

    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]