Example #1
def test_empty_vocab():
    """
    Nothing is present in an empty word list
    """
    vocab = Vocab([])
    assert vocab.as_list() == []
    assert not vocab.has("sheep")
Example #2
def token_emotion_mat(vocab: Vocab):
    """pass"""
    emotion_mat = np.zeros(shape=(vocab.size()))
    emotion_mat[vocab.get_group(vocab.postive_name)] = 1
    emotion_mat[vocab.get_group(vocab.negtive_name)] = -1

    return emotion_mat
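For reference, a minimal self-contained sketch of what `token_emotion_mat` produces, with the `Vocab` group lookups replaced by hypothetical group indices:

import numpy as np

# Hypothetical three-token vocabulary: index 0 is positive, index 1 negative, index 2 neutral.
pos_idx, neg_idx, vocab_size = [0], [1], 3

emotion_mat = np.zeros(shape=(vocab_size,))
emotion_mat[pos_idx] = 1
emotion_mat[neg_idx] = -1
print(emotion_mat)  # [ 1. -1.  0.]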
Example #3
    def _load_force_tks(self, force_tks: Union[List, Vocab]):
        if force_tks is None:
            return Vocab()
        elif isinstance(force_tks, List):
            vocab = Vocab()
            vocab.add_seq(force_tks)
            return vocab
        else:
            return force_tks
Example #4
    def create_onehot_mat(self, doc_tokens: List[List[Text]]) -> Tuple[np.ndarray, Vocab]:
        """Build a document one-hot matrix and the Vocab derived from `doc_tokens`."""
        vocab = Vocab()
        tks = list(set(chain.from_iterable(doc_tokens)))
        vocab.add_seq(tks)

        doc_mat = doc_onehot_mat(doc_tokens, vocab)

        # drop the sink row used for tokens absent from a document
        return doc_mat[:-1], vocab
Example #5
def sample_data(data_path, basedir, specified_index=None):
    """
    Print a sample of the processed data produced by data.py.
    Args:
        data_path: path to train.p|valid.p
        basedir: project base directory
        specified_index: optional index of the entry to inspect (random if None)
    """

    with open(data_path, 'rb') as f:
        entries = pickle.load(f)

    # Choose a sample (random unless an index was specified)
    rand_index = (specified_index if specified_index is not None
                  else random.randint(0, len(entries) - 1))

    # Prepare vocab
    vocab_file = os.path.join(basedir, 'data/processed_reviews/vocab.txt')
    vocab = Vocab(vocab_file, verbose=False)

    # Sample
    (processed_review, review_seq_len, label) = entries[rand_index]

    print("==> Number of entries:", len(entries))
    print("==> Random index:", rand_index)
    print("==> Processed Review:", processed_review)
    print("==> Review Len:", review_seq_len)
    print("==> Label:", label)
    print("==> See if processed review makes sense:",
          ids_to_tokens(
              processed_review,
              vocab=vocab,
          ))
Example #6
def test_small_vocab():
    l = ["eeny", "moe", "miney", "meeny"]
    vocab = Vocab(l)
    assert vocab.has("moe")
    assert vocab.has("eeny")
    assert vocab.has("miney")
    assert vocab.has("meeny")
    assert not vocab.has("many")
    assert sorted(vocab.as_list()) == sorted(l)
Example #7
    def anno_seed_word(self, doc_tokens: List[List[Text]],
                       seed_words: List[Text]) -> List[Example]:
        """auto annotation for seed words selected through `PMI`,
        where `so_pmi` would calculated from
            so_pmi(word) = mean(PMI(word, Pw)) - mean(PMI(word, Nw))

        if so_pmi(word) > 0, the seed word would tagged as positive
        if so_pmi(word) = 0, tagged as neutral
        if so_pmi(word) < 0, tagged as negative
        """
        _seed_words_vocab = Vocab.gene_from_list(seed_words,
                                                 Vocab().alters_name, 0)
        emo_vocab = self.base_pos_words + self.base_neg_words + _seed_words_vocab

        emo_mat = token_emotion_mat(emo_vocab)
        doc_mat = doc_onehot_mat(doc_tokens, emo_vocab)

        so_pmi_score = pair_pmi(doc_mat, emo_mat, emo_vocab)

        return so_pmi_score
Example #8
    def __init__(self,
                 stop_words: Union[List[Text], Vocab] = None,
                 base_pos_words: Union[List[Text], Vocab] = None,
                 base_neg_words: Union[List[Text], Vocab] = None):
        """
        Parameters
        ----------
        examples : List[Example], token list for each sequence of doc
        stop_words: Union[List[Text], None], stop words list
        base_neg_words : Union[List[Text], None], base negative words
        base_pos_words : Union[List[Text], None], base positive words
        """
        self.stop_words = Vocab.gene_from_list(stop_words, score = 0) if \
            isinstance(stop_words, List) else stop_words
        self.base_pos_words = Vocab.gene_from_list(base_pos_words, name = Vocab().postive_name, score = 1) if \
            isinstance(base_pos_words, List) else base_pos_words
        self.base_neg_words = Vocab.gene_from_list(base_neg_words, name = Vocab().negtive_name, score = -1) if \
            isinstance(base_neg_words, List) else base_neg_words

        self.seedwords = None
Example #9
class RuPosIndexer:
    """Индексирует датасет и хранит словари"""
    def __init__(self):
        self.token_vocab = Vocab(lowercase=True, paddings=True)
        self.pos_vocab = Vocab(paddings=True)
        self.gram_vocab = Vocab(paddings=True)

    def index_dataset(self, dataset: List[Sentence]):
        """
        Заполняет словари по датасету
        """
        for sentence in dataset:
            self.token_vocab.fill(sentence.tokens)
            self.pos_vocab.fill(sentence.pos_tags)
            self.gram_vocab.fill(sentence.grammems)

    def sentence_to_indexes(
            self,
            sentence: Sentence) -> Tuple[List[int], List[int], List[int]]:
        """
        Переводит предложение в индексы
        """
        tokens = [self.token_vocab[token] for token in sentence.tokens]
        pos_tags = [self.pos_vocab[pos] for pos in sentence.pos_tags]
        grammemes = [self.gram_vocab[gram] for gram in sentence.grammems]

        return tokens, pos_tags, grammemes
Example #10
def doc_onehot_mat(doc_tokens: List[List[Text]], vocab: Vocab):
    """pass"""

    tk2idx = vocab.tk2idx
    all_tks = list(tk2idx.keys())

    onehot_mat = np.zeros(shape=(vocab.size() + 1, len(doc_tokens)),
                          dtype=np.int8)

    for doc_id, doc in enumerate(doc_tokens):
        # tokens absent from this document are routed to the sink row (-1)
        tks = list(map(lambda tk: tk2idx[tk] if tk in doc else -1, all_tks))
        onehot_mat[tks, doc_id] = 1

    return onehot_mat
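For reference, a self-contained sketch of the layout this function produces, with `vocab.tk2idx` replaced by a plain dict and hypothetical tokens:

import numpy as np

doc_tokens = [["good", "movie"], ["bad"]]
tk2idx = {"good": 0, "bad": 1, "movie": 2}  # stands in for vocab.tk2idx

onehot_mat = np.zeros(shape=(len(tk2idx) + 1, len(doc_tokens)), dtype=np.int8)
for doc_id, doc in enumerate(doc_tokens):
    rows = [tk2idx[tk] if tk in doc else -1 for tk in tk2idx]
    onehot_mat[rows, doc_id] = 1

print(onehot_mat[:-1])  # [[1 0] [0 1] [1 0]] -- one column per document; the sink row is dropped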
Example #11
def pair_pmi(doc_mat: Union[np.array, List[List]],
             emo_mat: Union[np.array, List],
             vocab: Vocab,
             if_sign=True) -> List[Example]:
    """pass"""
    seed_idx, pos_idx, neg_idx = vocab.get_all_group()

    doc_mat = np.asarray(doc_mat).squeeze()
    emo_mat = np.asarray(emo_mat).flatten()

    if len(doc_mat.shape) != 2:
        raise ValueError("doc_mat must have dimension of 2")

    pair_seed_pos_idx = list(product(seed_idx, pos_idx))
    pair_seed_neg_idx = list(product(seed_idx, neg_idx))

    scores = defaultdict(float)

    pbar = tqdm(total=len(pair_seed_neg_idx) + len(pair_seed_pos_idx),
                desc="so pmi annotation calling")
    for group in [pair_seed_pos_idx, pair_seed_neg_idx]:
        for seed, emo in group:
            sub_count = doc_mat[[seed, emo]].sum(axis=1)
            co_curr = (sub_count == 2).sum()
            seed_curr, emo_curr = sub_count
            scores[seed] += pmi_score(co_curr, seed_curr,
                                      emo_curr) * emo_mat[emo]
            pbar.update(1)
    pbar.close()

    scores = np.asarray([scores[idx] for idx in seed_idx])
    scores = np.sign(scores) if if_sign else scores

    return [
        Example(text=vocab.get_tk(tk), label=sco)
        for tk, sco in zip(seed_idx, scores)
    ]
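`pmi_score` itself is not shown in these examples. Below is a minimal sketch consistent with the three-argument call above, using the standard definition PMI(x, y) = log2(p(x, y) / (p(x) p(y))) with counts normalized by a document total; the `n_docs` argument and the smoothing constant are assumptions:

import numpy as np

def pmi_score(co_count, x_count, y_count, n_docs=1.0, eps=1e-8):
    """Pointwise mutual information estimated from (co-)occurrence counts (hypothetical helper)."""
    p_xy = co_count / n_docs
    p_x = x_count / n_docs
    p_y = y_count / n_docs
    return np.log2((p_xy + eps) / (p_x * p_y + eps))

print(pmi_score(co_count=5, x_count=10, y_count=10, n_docs=100))  # ~2.32: the pair co-occurs 5x more than chance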
Example #12
def create_vocab(pos_words: Union[List[Text], Vocab],
                 neg_words: Union[List[Text], Vocab],
                 seed_words: Union[List[Text], Vocab]) -> Vocab:
    """Merge positive, negative and seed words into a single grouped emotion Vocab."""

    _convert_func = lambda x: list(x.tk2idx.keys()) if isinstance(x, Vocab) else x

    pos_words = _convert_func(pos_words)
    neg_words = _convert_func(neg_words)
    seed_words = _convert_func(seed_words)

    emo_vocab = Vocab()
    emo_vocab.add_seq(pos_words, emo_vocab.postive_name, 1)
    emo_vocab.add_seq(neg_words, emo_vocab.negtive_name, -1)
    emo_vocab.add_seq(seed_words, emo_vocab.alters_name, 0)
    return emo_vocab
Example #13
    def anno_mining_token(self, alia_base_emo=True) -> List[Example]:
        """
        Use `SO_PMI` and `Doc_Distance` to annotate suspicious mining span tokens,

        where `Doc_Distance` is defined as:
            doc_dist(w) = NDoc_pos(w) - NDoc_neg(w)

        Parameters
        ----------
        alia_base_emo: bool, whether to include the base emotion vocab
        """
        alter_tks = [x[0] for x in self.alter_tks]

        pos_words = Vocab.gene_from_list(self.alia_pos_words,
                                         Vocab().postive_name, 1)
        neg_words = Vocab.gene_from_list(self.alia_neg_words,
                                         Vocab().negtive_name, -1)

        if alia_base_emo:
            pos_words += self.base_pos_words
            neg_words += self.base_neg_words

        emo_vocab = pos_words + neg_words

        # filter base emo tokens
        alter_tks = [tk for tk in alter_tks if tk not in emo_vocab.tk2idx]

        # create each mat
        emo_vocab += Vocab.gene_from_list(alter_tks,
                                          name=Vocab().alters_name,
                                          score=0)

        emo_mat = token_emotion_mat(emo_vocab)
        label_mat = doc_label_mat(self.doc_labels)
        doc_mat = doc_onehot_mat(self.doc_tokens, emo_vocab)

        alter_idx = emo_vocab.get_group(emo_vocab.alters_name)

        # so_pmi
        so_pmi_scores_obj = pair_pmi(doc_mat, emo_mat, emo_vocab)
        so_pmi_scores = [exam.label for exam in so_pmi_scores_obj]

        # doc_distance
        doc_dist = np.sum(doc_mat[alter_idx] * label_mat, axis=1)
        pmi_dist_scores = so_pmi_scores * doc_dist

        # only score greater than 0 selected
        res_idx = np.where(pmi_dist_scores > 0)[0]
        res_exam = [so_pmi_scores_obj[idx] for idx in res_idx]

        print(f"mining new span token {len(res_exam)}")

        self.new_tks = res_exam

        return res_exam
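As a worked illustration of the `Doc_Distance` term computed above, assuming `doc_label_mat` yields +1 for positive documents and -1 for negative ones (which is what the element-wise product with the one-hot rows implies):

import numpy as np

# Hypothetical token present in documents 0, 2 and 3; document labels +1, -1, -1, +1.
token_row = np.array([1, 0, 1, 1])
label_mat = np.array([1, -1, -1, 1])

doc_dist = np.sum(token_row * label_mat)
print(doc_dist)  # 1 -> in 2 positive docs and 1 negative doc: NDoc_pos(w) - NDoc_neg(w) = 1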
Example #14
def test_from_simulated_file():
    from io import StringIO
    l = StringIO(initial_value="""
        #comment
        # another comment line
        sheep

        rats
        #comment
        squirrels
        """)
    vocab = Vocab(l)
    assert sorted(vocab.as_list()) == ["rats", "sheep", "squirrels"]
    assert vocab.has("sheep")
    assert vocab.has("rats")
    assert vocab.has("squirrels")
    assert not vocab.has("#comment")
Example #15
    def __init__(
        self,
        examples: List[Example],
        seed_tokens: List[Example],
        extreme_words: Union[List[Text], Vocab],
        deny_words: Union[List[Text], Vocab],
        base_pos_words: Union[List[Text], Vocab] = None,
        base_neg_words: Union[List[Text], Vocab] = None,
    ):
        """
        Parameters
        ----------
        examples: List[Example], each example could extract `tokens`
        seed_tokens: List[Example], seed tokens mined
        extreme_words: List[Text], a set of extreme words
        deny_words: List[Text], a set of deny words
        base_pos_words: base positive words if needed
        base_neg_words: base negative words if needed
        """

        self.doc_tokens = [exam.get("tokens") for exam in examples]
        self.doc_labels = [exam.label for exam in examples]
        self.doc_size = len(self.doc_tokens)

        self.seed_tokens = seed_tokens
        self.alia_pos_words, self.alia_neg_words = self._alia_emo_words()

        self.extreme_words = Vocab.gene_from_list(
            extreme_words, score=2) if isinstance(extreme_words,
                                                  List) else extreme_words
        self.deny_words = Vocab.gene_from_list(deny_words) if isinstance(
            deny_words, List) else deny_words
        self.span_words = self.extreme_words + self.deny_words  # vocab

        self.base_pos_words = Vocab.gene_from_list(base_pos_words, name = Vocab().postive_name, score = 1) if \
            isinstance(base_pos_words, List) else base_pos_words
        self.base_neg_words = Vocab.gene_from_list(base_neg_words, name = Vocab().negtive_name, score = -1) if \
            isinstance(base_neg_words, List) else base_neg_words

        self.alter_tks = None
        self.new_tks = None
Example #16
###
# Globals
###
app = flask.Flask(__name__)

CONFIG = config.configuration()
app.secret_key = CONFIG.SECRET_KEY  # Should allow using session variables

#
# One shared 'Vocab' object, read-only after initialization,
# shared by all threads and instances.  Otherwise we would have to
# store it in the browser and transmit it on each request/response cycle,
# or else read it from the file on each request/response cycle,
# neither of which would be suitable for responding keystroke by keystroke.

WORDS = Vocab(CONFIG.VOCAB)
NUM = min(len(WORDS.as_list()), CONFIG.SUCCESS_AT_COUNT)

###
# Pages
###


@app.route("/")
@app.route("/index")
def index():
    """The main page of the application"""
    flask.g.vocab = WORDS.as_list()
    flask.session["target_count"] = min(len(flask.g.vocab),
                                        CONFIG.SUCCESS_AT_COUNT)
    flask.session["jumble"] = jumbled(flask.g.vocab,
Example #17
def test_single_vocab():
    vocab = Vocab(["moe"])
    assert vocab.as_list() == ["moe"]
    assert vocab.has("moe")
    assert not vocab.has("meeny")
Example #18
    Patience limit:           {args.patience_limit}
##############################\n""",
      file=stdout)

# Fix the seeds for random number generators.
if args.seed is not None: fix_random_seeds(args.seed)

# Read the data.
data_path = args.root + "datasets/%s_en_data/" % args.language
print(f"Reading training data from {data_path} ...", file=stdout)
(src_train_sents, tgt_train_sents, src_dev_sents, tgt_dev_sents,
 src_test_sents, tgt_test_sents) = get_data(data_path, args.language)

# Build a vocabulary of source and target language.
vocab_file = "vocab_%s_en.json" % args.language
vocab = Vocab.build(src_train_sents, tgt_train_sents, args.vocab_size,
                    args.freq_cutoff)
vocab.save(vocab_file)

# Build a model object.
model = NMT(word_embed_size=args.word_embed_size,
            char_embed_size=args.char_embed_size,
            hidden_size=args.hidden_size,
            vocab=vocab,
            dropout_rate=args.dropout_rate,
            kernel_size=args.kernel_size,
            padding=args.padding)

# Train the model.
train_data = list(zip(src_train_sents, tgt_train_sents))
dev_data = list(zip(src_dev_sents, tgt_dev_sents))
dataset = {"train_data": train_data, "dev_data": dev_data}
Example #19
    fitlog.add_hyper({'model': args.w, 'fold': args.fold})

    # set cuda
    config.use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if config.use_cuda:
        torch.cuda.set_device(args.gpu)
        config.device = torch.device("cuda", args.gpu)
    else:
        config.device = torch.device("cpu")
    logging.info("Use cuda: %s, gpu id: %d.", config.use_cuda, args.gpu)

    # vocab
    cache_name = "./save/vocab/" + str(args.fold) + ".pickle"
    if Path(cache_name).exists():
        vocab_file = open(cache_name, 'rb')
        vocab = pickle.load(vocab_file)
        logging.info('Load vocab from ' + cache_name + ', words %d, labels %d.' % (vocab.word_size, vocab.label_size))
    else:
        vocab = Vocab(config.train_file)
        file = open(cache_name, 'wb')
        pickle.dump(vocab, file)
        logging.info('Cache vocab to ' + cache_name)

    # model
    model = Model(config, vocab)

    # trainer
    trainer = Trainer(model, config, vocab, fitlog)
    trainer.train()
    trainer.test()
Example #20
def train(FLAGS, basedir):
    """
    Train a previous or new model.
    """
    # Data paths
    vocab_path = os.path.join(basedir, 'data/processed_reviews/vocab.txt')
    train_data_path = os.path.join(basedir, 'data/processed_reviews/train.p')
    validation_data_path = os.path.join(basedir,
                                        'data/processed_reviews/validation.p')
    vocab = Vocab(vocab_path)
    FLAGS.num_classes = 2

    # Load embeddings (if using GloVe)
    if FLAGS.embedding == 'glove':
        with open(os.path.join(basedir, 'data/processed_reviews/embeddings.p'),
                  'rb') as f:
            embeddings = pickle.load(f)
        FLAGS.vocab_size = len(embeddings)

    # Start tensorflow session
    with tf.Session() as sess:

        # Create|reload model
        imdb_model = create_model(sess, FLAGS, len(vocab), basedir)

        # Metrics
        metrics = {
            "train_loss": [],
            "valid_loss": [],
            "train_acc": [],
            "valid_acc": [],
        }

        # Store attention score history for few samples
        attn_history = {
            "sample_0": {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": []
            },
            "sample_1": {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": []
            },
            "sample_2": {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": []
            },
            "sample_3": {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": []
            },
            "sample_4": {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": []
            },
        }

        # Start training
        for train_epoch_num, train_epoch in \
            enumerate(generate_epoch(
                train_data_path, FLAGS.num_epochs, FLAGS.batch_size)):

            print("==> EPOCH:", train_epoch_num)

            for train_batch_num, (batch_features, batch_seq_lens) in \
                enumerate(train_epoch):

                batch_reviews, batch_labels = batch_features
                batch_review_lens, = batch_seq_lens

                # Display shapes once
                if (train_epoch_num == 0 and train_batch_num == 0):
                    print("Reviews: ", np.shape(batch_reviews))
                    print("Labels: ", np.shape(batch_labels))
                    print("Review lens: ", np.shape(batch_review_lens))

                _, train_logits, train_loss, train_acc, lr, attn_scores = \
                    imdb_model.train(
                        sess=sess,
                        batch_reviews=batch_reviews,
                        batch_labels=batch_labels,
                        batch_review_lens=batch_review_lens,
                        embeddings=embeddings,
                        keep_prob=FLAGS.keep_prob,
                        )

            for valid_epoch_num, valid_epoch in \
                enumerate(generate_epoch(
                    data_path=validation_data_path,
                    num_epochs=1,
                    batch_size=FLAGS.batch_size,
                    )):

                for valid_batch_num, (valid_batch_features, valid_batch_seq_lens) in \
                    enumerate(valid_epoch):

                    valid_batch_reviews, valid_batch_labels = valid_batch_features
                    valid_batch_review_lens, = valid_batch_seq_lens

                    valid_logits, valid_loss, valid_acc = imdb_model.eval(
                        sess=sess,
                        batch_reviews=valid_batch_reviews,
                        batch_labels=valid_batch_labels,
                        batch_review_lens=valid_batch_review_lens,
                        embeddings=embeddings,
                        keep_prob=1.0,  # no dropout for val|test
                    )

            print ("[EPOCH]: %i, [LR]: %.6e, [TRAIN ACC]: %.3f, [VALID ACC]: %.3f " \
                   "[TRAIN LOSS]: %.6f, [VALID LOSS]: %.6f" % (
                train_epoch_num, lr, train_acc, valid_acc, train_loss, valid_loss))

            # Store the metrics
            metrics["train_loss"].append(train_loss)
            metrics["valid_loss"].append(valid_loss)
            metrics["train_acc"].append(train_acc)
            metrics["valid_acc"].append(valid_acc)

            # Store attn history
            for i in range(5):
                sample = "sample_%i" % i
                attn_history[sample]["review"] = batch_reviews[i]
                attn_history[sample]["label"] = batch_labels[i]
                attn_history[sample]["review_len"] = batch_review_lens[i]
                attn_history[sample]["attn_scores"].append(attn_scores[i])

            # Save the model (maybe)
            if ((train_epoch_num == (FLAGS.num_epochs - 1))
                    or ((train_epoch_num % FLAGS.save_every == 0) and
                        (train_epoch_num > 0))):

                # Make parents ckpt dir if it does not exist
                if not os.path.isdir(
                        os.path.join(basedir, FLAGS.data_dir, 'ckpt')):
                    os.makedirs(os.path.join(basedir, FLAGS.data_dir, 'ckpt'))

                # Make child ckpt dir for this specific model
                if not os.path.isdir(os.path.join(basedir, FLAGS.ckpt_dir)):
                    os.makedirs(os.path.join(basedir, FLAGS.ckpt_dir))

                checkpoint_path = \
                    os.path.join(
                        basedir, FLAGS.ckpt_dir, "%s.ckpt" % FLAGS.model_name)

                print("==> Saving the model.")
                imdb_model.saver.save(sess,
                                      checkpoint_path,
                                      global_step=imdb_model.global_step)

    # Save the metrics
    metrics_file = os.path.join(basedir, FLAGS.ckpt_dir, 'metrics.p')
    with open(metrics_file, 'wb') as f:
        pickle.dump(metrics, f)

    # Save the attention scores
    attn_history_file = os.path.join(basedir, FLAGS.ckpt_dir, 'attn_history.p')
    with open(attn_history_file, 'wb') as f:
        pickle.dump(attn_history, f)
Example #21
###
# Globals
###
app = flask.Flask(__name__)

CONFIG = config.configuration()
app.secret_key = CONFIG.SECRET_KEY  # Should allow using session variables

#
# One shared 'Vocab' object, read-only after initialization,
# shared by all threads and instances.  Otherwise we would have to
# store it in the browser and transmit it on each request/response cycle,
# or else read it from the file on each request/response cycle,
# neither of which would be suitable for responding keystroke by keystroke.

WORDS = Vocab(CONFIG.VOCAB)

###
# Pages
###


@app.route("/")
@app.route("/index")
def index():
    """The main page of the application"""
    flask.g.vocab = WORDS.as_list()
    flask.session["target_count"] = min(len(flask.g.vocab),
                                        CONFIG.SUCCESS_AT_COUNT)
    flask.session["jumble"] = jumbled(flask.g.vocab,
                                      flask.session["target_count"])
Example #22
from src.jtnn_vae import JTNNVAE
from src.vocab import Vocab
lg = rdkit.RDLogger.logger()
lg.setLevel(rdkit.RDLogger.CRITICAL)

parser = argparse.ArgumentParser()
parser.add_argument('--nsample', type=int, required=True)
parser.add_argument('--vocab', required=True)
parser.add_argument('--model', required=True)
parser.add_argument('--output', required=True)
parser.add_argument('--config', required=True)

args = parser.parse_args()
config = load_json_config(args.config)
vocab = get_vocab(args.vocab)
vocab = Vocab(vocab)
model = JTNNVAE(vocab, config['hidden_size'], config['latent_size'],
                config['depthT'], config['depthG'])
train_model_params = paddle.load(args.model)
model.set_state_dict(train_model_params)
model.eval()

res = []
for i in range(args.nsample):
    smi = model.sample_prior()
    print(i, smi)
    res.append(smi)
with open(args.output, 'w') as f:
    for smi in res:
        f.write(smi + '\n')
Example #23
    def __init__(self):
        self.token_vocab = Vocab(lowercase=True, paddings=True)
        self.pos_vocab = Vocab(paddings=True)
        self.gram_vocab = Vocab(paddings=True)
Example #24
    fold2data(fold_num)

    # convert each fold data
    for fold in range(9, fold_num):
        cache_name = "./save/vocab/" + str(fold) + ".pickle"
        train = "train_" + str(fold)
        dev = "dev_" + str(fold)
        files = [train, dev]

        # build vocab
        if Path(cache_name).exists():
            vocab_file = open(cache_name, 'rb')
            vocab = pickle.load(vocab_file)
            vocab_name = "./save/vocab/vocab.txt"
            vocab.dump(vocab_name)
            print('Load vocab from ' + cache_name)
        else:
            vocab = Vocab('./data/' + train + '.pickle')
            file = open(cache_name, 'wb')
            pickle.dump(vocab, file)
            print('Save vocab to ' + cache_name)

        for file in files:
            pass

            # data 2 word2vec
            # convert_data_word2vec(file)

            # data 2 bert
            # convert_data_bert_pretrain(train)
Example #25
    def _truncate(val: np.array, trun_count: int, vocab: Vocab = None):
        sort_index = np.argsort(val, axis=None)[::-1][:trun_count]
        if vocab:
            return vocab.get_tks(sort_index)
        else:
            return sort_index
Example #26
# test for load data
from src.utils import read_data_from_csv, read_line_from_txt, Example
from src.vocab import Vocab
dataset = read_data_from_csv("../corpus/weibo_senti_100k.csv",
                             label_map={
                                 "1": 1,
                                 "0": -1
                             })
dataset = dataset[:10] + dataset[-10:]
dataset_size = len(dataset)

stop_word_dict = read_line_from_txt("../dict/stopword.txt")
base_posword_dict = read_line_from_txt("../dict/ntusd/NTUSD_positive.txt")
base_negword_dict = read_line_from_txt("../dict/ntusd/NTUSD_negative.txt")
extreme_words = read_line_from_txt("../dict/hownet/extreme.txt")
extreme_word_dict = Vocab()
for word in extreme_words:
    if not word.startswith("-"):
        _lb, _tt = word.strip().split(',')
        _lb = int(_lb)
        extreme_word_dict.add(_tt, _lb)
deny_words_dict = read_line_from_txt("../dict/deny.txt")

# =================
# # test for label/emoji distribution
# from src.utils import label_distribution_viewer, emoji_distribution_viewer
# label_distribution_viewer(dataset, label_map={"1": "pos", "0": "neg"}, verbose=True)
# emoji_distribution_viewer(dataset, drop_df = 5)

# =================
# test normalizer