Ejemplo n.º 1
0
    def anno_mining_token(self, alia_base_emo=True) -> List[Example]:
        """
        Use `SO_PMI` together with `Doc_Distance` to annotate suspicious
        mining-span tokens drawn from `self.alter_tks`.

        `Doc_Distance` is defined as:
            doc_dist(w) = NDoc_pos(w) - NDoc_neg(w)

        Parameters
        ----------
        alia_base_emo : bool
            If True, merge the base emotion vocabularies
            (`self.base_pos_words` / `self.base_neg_words`) into the
            positive / negative word sets before scoring.

        Returns
        -------
        List[Example]
            Candidate tokens whose combined SO-PMI * doc-distance score is
            strictly positive; the same list is stored on `self.new_tks`.
        """
        # self.alter_tks holds indexable items (token first); keep token only
        alter_tks = [x[0] for x in self.alter_tks]

        # alias (previously mined) emotion words, scored +1 / -1
        pos_words = Vocab.gene_from_list(self.alia_pos_words,
                                         Vocab().postive_name, 1)
        neg_words = Vocab.gene_from_list(self.alia_neg_words,
                                         Vocab().negtive_name, -1)

        if alia_base_emo:
            pos_words += self.base_pos_words
            neg_words += self.base_neg_words

        emo_vocab = pos_words + neg_words

        # filter base emo tokens: drop candidates already known as emotion words
        alter_tks = [tk for tk in alter_tks if tk not in emo_vocab.tk2idx]

        # create each mat: append remaining candidates as a zero-scored group
        emo_vocab += Vocab.gene_from_list(alter_tks,
                                          name=Vocab().alters_name,
                                          score=0)

        emo_mat = token_emotion_mat(emo_vocab)
        label_mat = doc_label_mat(self.doc_labels)
        doc_mat = doc_onehot_mat(self.doc_tokens, emo_vocab)

        # vocab indices of the candidate ("alters") group
        alter_idx = emo_vocab.get_group(emo_vocab.alters_name)

        # so_pmi
        so_pmi_scores_obj = pair_pmi(doc_mat, emo_mat, emo_vocab)
        so_pmi_scores = [exam.label for exam in so_pmi_scores_obj]

        # doc_distance
        # NOTE(review): indexing doc_mat rows by vocab-group indices assumes
        # doc_mat is token-major (token x doc) — confirm against doc_onehot_mat.
        doc_dist = np.sum(doc_mat[alter_idx] * label_mat, axis=1)
        # NOTE(review): element-wise product relies on len(so_pmi_scores)
        # matching len(doc_dist), i.e. pair_pmi presumably returns one
        # Example per candidate token — verify.
        pmi_dist_scores = so_pmi_scores * doc_dist

        # only score greater than 0 selected
        res_idx = np.where(pmi_dist_scores > 0)[0]
        res_exam = [so_pmi_scores_obj[idx] for idx in res_idx]

        print(f"mining new span token {len(res_exam)}")

        self.new_tks = res_exam

        return res_exam
Ejemplo n.º 2
0
    def __init__(self,
                 stop_words: Union[List[Text], Vocab] = None,
                 base_pos_words: Union[List[Text], Vocab] = None,
                 base_neg_words: Union[List[Text], Vocab] = None):
        """
        Parameters
        ----------
        stop_words : Union[List[Text], Vocab], optional
            Stop-word list; a plain list is wrapped into a neutral Vocab
            (score 0), a ready-made Vocab (or None) passes through as-is.
        base_pos_words : Union[List[Text], Vocab], optional
            Base positive words; a plain list is wrapped with score +1.
        base_neg_words : Union[List[Text], Vocab], optional
            Base negative words; a plain list is wrapped with score -1.
        """
        # isinstance() checks the builtin `list`: `typing.List` is deprecated
        # as an isinstance target and behaves identically here.
        self.stop_words = (Vocab.gene_from_list(stop_words, score=0)
                           if isinstance(stop_words, list) else stop_words)
        self.base_pos_words = (Vocab.gene_from_list(base_pos_words,
                                                    name=Vocab().postive_name,
                                                    score=1)
                               if isinstance(base_pos_words, list)
                               else base_pos_words)
        self.base_neg_words = (Vocab.gene_from_list(base_neg_words,
                                                    name=Vocab().negtive_name,
                                                    score=-1)
                               if isinstance(base_neg_words, list)
                               else base_neg_words)

        # populated later by the seed-word mining step
        self.seedwords = None
Ejemplo n.º 3
0
    def __init__(
        self,
        examples: List[Example],
        seed_tokens: List[Example],
        extreme_words: Union[List[Text], Vocab],
        deny_words: Union[List[Text], Vocab],
        base_pos_words: Union[List[Text], Vocab] = None,
        base_neg_words: Union[List[Text], Vocab] = None,
    ):
        """
        Parameters
        ----------
        examples : List[Example]
            Docs to mine; each example exposes `get("tokens")` and `label`.
        seed_tokens : List[Example]
            Seed tokens previously mined.
        extreme_words : Union[List[Text], Vocab]
            Extreme (intensifier) words; a plain list is wrapped with score 2.
        deny_words : Union[List[Text], Vocab]
            Deny (negation) words.
        base_pos_words : Union[List[Text], Vocab], optional
            Base positive words (score +1) if needed.
        base_neg_words : Union[List[Text], Vocab], optional
            Base negative words (score -1) if needed.
        """

        self.doc_tokens = [exam.get("tokens") for exam in examples]
        self.doc_labels = [exam.label for exam in examples]
        self.doc_size = len(self.doc_tokens)

        self.seed_tokens = seed_tokens
        # split the seed tokens into alias positive / negative word lists
        self.alia_pos_words, self.alia_neg_words = self._alia_emo_words()

        # Wrap plain lists into Vocab objects; ready-made Vocabs pass through.
        # isinstance() checks the builtin `list`: `typing.List` is deprecated
        # as an isinstance target and behaves identically here.
        self.extreme_words = (Vocab.gene_from_list(extreme_words, score=2)
                              if isinstance(extreme_words, list)
                              else extreme_words)
        self.deny_words = (Vocab.gene_from_list(deny_words)
                           if isinstance(deny_words, list)
                           else deny_words)
        self.span_words = self.extreme_words + self.deny_words  # vocab

        self.base_pos_words = (Vocab.gene_from_list(base_pos_words,
                                                    name=Vocab().postive_name,
                                                    score=1)
                               if isinstance(base_pos_words, list)
                               else base_pos_words)
        self.base_neg_words = (Vocab.gene_from_list(base_neg_words,
                                                    name=Vocab().negtive_name,
                                                    score=-1)
                               if isinstance(base_neg_words, list)
                               else base_neg_words)

        # filled in by later mining steps
        self.alter_tks = None
        self.new_tks = None
Ejemplo n.º 4
0
    def anno_seed_word(self, doc_tokens: List[List[Text]],
                       seed_words: List[Text]) -> List[Example]:
        """Automatically annotate the selected seed words via `PMI`.

        The polarity score is computed as
            so_pmi(word) = mean(PMI(word, Pw)) - mean(PMI(word, Nw))

        A seed word is tagged positive when so_pmi(word) > 0, neutral when
        it equals 0, and negative when so_pmi(word) < 0.
        """
        # group the seed words under the "alters" name with a neutral score,
        # then merge them with the base positive / negative vocabularies
        candidates = Vocab.gene_from_list(seed_words, Vocab().alters_name, 0)
        merged_vocab = self.base_pos_words + self.base_neg_words + candidates

        # build the doc one-hot and token-emotion matrices over that vocab
        onehot_docs = doc_onehot_mat(doc_tokens, merged_vocab)
        polarity_mat = token_emotion_mat(merged_vocab)

        return pair_pmi(onehot_docs, polarity_mat, merged_vocab)
Ejemplo n.º 5
0
# seed_words = [Example(text = "哈哈哈", label = 1), Example(text = "卧槽", label = -1)]
# # 1.2 field new word mining
# new_word_op = spanNewWordMining(dataset,
#                                 seed_words,
#                                 extreme_word_dict,
#                                 deny_words_dict,
#                                 base_posword_dict,
#                                 base_negword_dict)
# new_word_op.run(min_window = 2, max_window = 3, alia_base_emo = False)

# ==================
# test for sentence score
from src.sentence_score.sentence_score import *

# Wrap the raw word lists into scored Vocab objects.
# NOTE(review): base_posword_dict / base_negword_dict / deny_words_dict /
# extreme_word_dict are presumably defined earlier in the file — confirm.
# NOTE(review): `postive_name` / `negtive_name` are read off the Vocab class
# here, while other call sites use `Vocab().postive_name` — confirm both
# resolve to the same value.
pos_emo_dict = Vocab.gene_from_list(base_posword_dict,
                                    name=Vocab.postive_name,
                                    score=1)
neg_emo_dict = Vocab.gene_from_list(base_negword_dict,
                                    name=Vocab.negtive_name,
                                    score=-1)
deny_emo_dict = Vocab.gene_from_list(deny_words_dict, score=0)
# extreme_word_dict is used as-is — presumably already a Vocab; verify
ext_emo_dict = extreme_word_dict
emo_dict = pos_emo_dict + neg_emo_dict

# Sentence scorer: token-level deny/extreme scoring combined with
# sequence-level transition / hypothesis / tail-punctuation adjustments.
score_op = totalSentenceScore(tok_method=DenyExtremeTokenScore(
    emo_dict=emo_dict, ext_dict=ext_emo_dict, deny_dict=deny_emo_dict),
                              seq_methods=[
                                  transitionSentenceScore(),
                                  hypothesisSentenceScore(),
                                  tailpuncSentenceScore()
                              ])