Example #1
	def cleaner_to_file(self, filename):
		from underthesea import sent_tokenize

		filename = str(filename)
		with open(filename, "r", encoding='utf-8') as f:
			lines = f.read()

		# Split the file contents into sentences, clean each one and collect the words
		for line in sent_tokenize(lines):
			cleaned_text = self.text_cleaner(line)
			for word in cleaned_text:
				self.words.append(word)
Example #2
def parse_sent_true(content):
    regex_day = r'(\d+\/\d+)|(\d+\-\d+)'
    regex_num = r'\s\d+\s'
    regex_BN = r'BN\d+'

    arr_sents = []
    ind = content.find(':')
    content = content[ind + 2:]
    sents_token = []
    arr_sents_token = sent_tokenize(content)
    for elem in arr_sents_token:
        find = re.findall(';', elem)  # look for ';' in each sentence
        if len(find) == 1:
            sents_token += elem.split(';')
        else:
            sents_token.append(elem)
    for sent in sents_token:
        txt = sent
        date = [m.span() for m in re.finditer(regex_day, txt)]
        num = [m.span() for m in re.finditer(regex_num, txt)]
        BN = [m.span() for m in re.finditer(regex_BN, txt)]
        # Keep only sentences that mention a number or a patient id (BN...)
        if len(num) > 0 or len(BN) > 0:
            arr = token_sent(txt)
            vec_sent = get_w2v_sent(arr).tolist()
            arr_sents.append([sent, date, num, BN, vec_sent])
    return arr_sents
Example #3
def extract_info(text):
    places_temp = []
    items_temp = []

    sents = sent_tokenize(text)
    for sent in sents:
        sent = sent.replace('.', '')
        sent = sent.replace(',', '')
        sent = sent.replace(':', '')
        sent = " ".join(sent.split())
        words = word_tokenize(sent)
        while len(words) > 0:
            # Rebuild the remaining text and look it up with check()
            t = ' '.join(words)
            s, code = check(t)
            if s != '':
                print('------------>' + s)
                if code == 'item':
                    items_temp.append(s)
                elif code == 'place':
                    places_temp.append(s)
                else:
                    continue
                # Remove the matched phrase and re-tokenize the remainder
                t = t.replace(s, '', 1)
                words = word_tokenize(t)
            else:
                # No match: drop the leading word and try again
                t = t.replace(words[0], '', 1)
                words = word_tokenize(t)

    return places_temp, items_temp
Example #4
    def tokenize(self, text, never_split=None, with_info=False, **kwargs):
        """Tokenizes a piece of text."""
        never_split = self.never_split + (never_split
                                          if never_split is not None else [])
        text = unicodedata.normalize('NFKC', text)

        tokens = []
        token_infos = []

        cursor = 0
        for line in sent_tokenize(text):
            if line == 'EOS':
                if self.preserve_spaces and len(text[cursor:]) > 0:
                    tokens.append(text[cursor:])
                    token_infos.append(None)

                break
            token = line
            token_start = text.index(token, cursor)
            token_end = token_start + len(token)
            if self.preserve_spaces and cursor < token_start:
                tokens.append(text[cursor:token_start])

            if self.do_lower_case and token not in never_split:
                token = token.lower()

            tokens.append(token)

            cursor = token_end

        return tokens
Example #5
def convert_mode_veryshort(input_file, output_file, encoding):
    with open(input_file, 'r', encoding=encoding) as stream:
        squad = json.load(stream)

    convertedData = []

    # Remove _ symbol in title
    for data in squad['data']:
        data['title'] = " ".join(data['title'].split('_'))

    # Format 2: Sentence as Text
    for data in tqdm(squad['data']):
        for paragraph in data['paragraphs']:
            # Get paragraph split by sentences & determine its start index for easier processing
            para_context = sent_tokenize(
                paragraph['context'])  # Context split into list of sentences
            para_sent_startidxs = [
                paragraph['context'].index(sentence)
                for sentence in para_context
            ]

            # Process question-answer pairs
            for qas in paragraph['qas']:
                # Prepare data to save
                zaloQAS = {
                    'id': qas['id'],
                    'question': qas['question'],
                    'title': data['title'],
                    'label': not qas['is_impossible']
                }
                _question_len = get_word_count(qas['question'])

                # Loop & get answer text for each qa pair
                if len(qas['answers']) != 0 and qas['is_impossible'] is False \
                        and qas['answers'][0]['answer_start'] != -1:
                    # Only 1 answer, but rephrased
                    answer = qas['answers'][0]

                    # Find the sentence & sentence index that contains the answer
                    _text = None
                    for idx in range(len(para_context)):
                        if para_sent_startidxs[idx] > answer['answer_start']:
                            continue
                        elif para_sent_startidxs[idx] < answer['answer_start'] \
                                < para_sent_startidxs[idx] + len(para_context[idx]):
                            _text = para_context[idx]
                            break
                        else:
                            break
                    zaloQAS['text'] = "" if _text is None else _text
                else:
                    zaloQAS['text'] = para_context[random.randrange(len(para_context))] if len(para_context) >= 1 \
                        else ""

                # Add data instance
                convertedData.append(zaloQAS)

    # Export converted data
    with open(output_file, 'w', encoding=encoding) as stream:
        stream.write(json.dumps(convertedData, ensure_ascii=False))
Example #6
def _get_paragraph_bert_sentences(p):
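    # Keep only the sentences of paragraph p that pass _is_valid_sent, one sentence per line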
    res = ""

    for sent in sent_tokenize(p):
        if _is_valid_sent(sent):
            res += sent + "\n"

    return res.strip()
Example #7
 def test_1(self):
     text = "Taylor cho biết lúc đầu cô cảm thấy ngại với cô bạn thân Amanda nhưng rồi mọi thứ trôi qua nhanh chóng. Amanda cũng thoải mái với mối quan hệ này."
     actual = sent_tokenize(text)
     expected = [
         "Taylor cho biết lúc đầu cô cảm thấy ngại với cô bạn thân Amanda nhưng rồi mọi thứ trôi qua nhanh chóng.",
         "Amanda cũng thoải mái với mối quan hệ này."
     ]
     self.assertEqual(actual, expected)
Example #8
def process_part(text):
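    # Sentence-split each line, then word-segment every sentence with the external
    # 'annotator' object (defined elsewhere; its tokenize() returns a list of token lists)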
    result = []
    for line in text.split('\n'):
        sentences = sent_tokenize(line)
        for s in sentences:
            words = annotator.tokenize(s)[0]
            result.append(' '.join(words))

    return result
Example #9
def ner():
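    # Flask endpoint: sentence-split the posted text, POS-tag each word,
    # run the CRF model on the extracted features and return words + labels as JSON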
    a = request.form['content']
    corpus = sent_tokenize(a)
    b = []
    for sen in corpus:
        x_test = []
        for word in word_tokenize(sen, format="text").split(' '):
            x_test.extend(pos_tag(word))
        b.append(x_test)
    b1 = [ner_train.get_features(s) for s in b]
    c = crf.predict(b1)
    return json.dumps([b, c])
Example #10
def extract_relation(text):
    """
    return all re of text
    :param text: list of sent
    :return:
    """
    list_sent = sent_tokenize(text)
    relation_text = []
    for sent in list_sent:
        relation_sent = extract_re_sent(sent)
        if len(relation_sent) > 0:
            relation_text.append(relation_sent)
    return relation_text
Example #11
        def _truncate_seq_pair(ques, text, max_length):
            """Truncates a sequence pair in place to the maximum length."""
            ques_1 = ques
            sens = sent_tokenize(text)
            sens_t = []
            sen_tokens = []
            ques_tokens = tokenizer.tokenize(ques_1)
            for sen in sens:
                tokens = tokenizer.tokenize(sen)
                sen_tokens.append(tokens)
                sens_t.append(' '.join(tokens))
            ques_in = ' '.join(ques_tokens)

            def ranking_ques_sentences(ques, sentences):
                corpus = [ques]
                corpus.extend(sentences)
                vectorizer = CountVectorizer()
                X = vectorizer.fit_transform(corpus)
                X = X.toarray()
                ques = np.array([(len(X) - 1) * X[0]])
                sens = np.array(X[1:])
                rank_list = cosine_similarity(ques, sens).reshape(-1).tolist()
                return sorted(range(len(rank_list)),
                              key=lambda k: rank_list[k])

            # Drop whole sentences from the passage, starting with those least similar
            # to the question (bag-of-words cosine similarity), until the combined
            # question + passage fits within max_length.
            rl = None
            i = 0
            while True:
                total_length = sum([len(sen)
                                    for sen in sen_tokens]) + len(ques_tokens)
                if total_length <= max_length:
                    break
                else:
                    if i == 0:
                        rl = ranking_ques_sentences(ques_in, sens_t)
                    try:
                        sen_tokens[rl[i]] = []
                        i = i + 1
                    except IndexError:
                        return None, None

            tokens_b = []
            for sen_ in sen_tokens:
                if len(sen_) != 0:
                    tokens_b += sen_

            return ques_tokens, tokens_b
Example #12
def doc2words(file):
    with open(file) as f:
        text = f.read()

    doc_words = []
    for line in text.split('\n'):
        sentences = sent_tokenize(line)
        for s in sentences:
            sent_words = annotator.tokenize(s)[0]
            for w in sent_words:
                doc_words.append(w.lower())

    return doc_words
Example #13
def tokenize_sentence(text, crude=None):
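    # Re-check each sentence from sent_tokenize for missed punctuation boundaries
    # and split it further if needed (the 'crude' fallback below is disabled)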
    sens = sent_tokenize(text)
    final = []
    for sen in sens:
        retok = _check_missing_punct(sen)
        final += retok

    # if crude is not None and len(final) < 2:
    #     for p in crude:
    #         final = []
    #         for sen in sens:
    #             final += sen.split('.')
    #         final = [s for s in final if len(s) > 0]
    return final
Example #14
def fil_content(news):
    # Tokenize every sentence and pool the words
    word_li = []
    for sentence in sent_tokenize(news):
        word_li.extend(word_tokenize(sentence))

    # li_1 is a keyword list defined elsewhere; flag text containing 2+ of them
    count = 0
    for word in li_1:
        if word in word_li:
            count += 1
    if count >= 2:
        print("Negative content")
    else:
        print("It's ok")
Example #15
def clean(file):
    print(file)
    with open(join(RAW_FOLDER, file)) as f:
        content = f.read()
    with open(join(CLEANED_FOLDER, file), 'w') as out_file:
        for line in content.split("\n"):
            if not check_line(line):
                continue
            sents = sent_tokenize(line)
            for sent in sents:
                if not check_line(sent):
                    continue
                out_file.write(sent.strip() + '\n')
    return
Example #16
 def underthesea_annotate(self, text, mode):
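     # Dispatch to the underthesea function that matches the requested mode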
     if mode == 'sent_tokenize':
         return sent_tokenize(text)
     elif mode == 'word_tokenize':
         return word_tokenize(text)
     elif mode == 'pos_tag':
         return pos_tag(text)
     elif mode == 'chunk':
         return chunk(text)
     elif mode == 'ner':
         return ner(text)
     elif mode == 'classify':
         return classify(text)
     elif mode == 'sentiment':
         return sentiment(text)
     else:
         raise Exception("Wrong request, please check your request")
Example #17
 def read_raw_file(self) -> list:
     noun_list = []
     adj_list = []
     # Read the input file line by line and collect nouns and adjectives per line
     with open("test_a1000.txt", "r") as f:
         for line in f:
             result = pos_tag(line)
             print(result)
             print('\n')
             record_n = []
             record_adj = []
             self.sentences.extend(sent_tokenize(line))
             for item in result:
                 if self.is_noun(item[1]) and self.one_word_prune(item[0]):
                     record_n.append(str(item[0]).lower())
                 if item[1] == 'A' or item[1] == 'AP':
                     record_adj.append(str(item[0]).lower())
             noun_list.append(record_n)
             adj_list.append(record_adj)
     self.transaction = noun_list
     return noun_list
Example #18
def phantich():
    paragraph = request.form['query']
    list_sents = sent_tokenize(paragraph)
    text_output = ""
    for sent in list_sents:
        example_token = ViTokenizer.tokenize(sent)
        x_example = []
        for word in example_token.split(" "):
            try:
                x_example.append(word2idx[word])
            except KeyError:
                # Out-of-vocabulary words map to the UNK index
                x_example.append(word2idx["UNK"])
        x_example = pad_sequences(maxlen=max_len,
                                  sequences=[x_example],
                                  padding="post",
                                  value=word2idx["PADword"])
        output = model.predict(np.array(x_example))
        output = np.argmax(output, axis=-1)[0]
        s = ""
        for index, w in enumerate(example_token.split(" ")):
            w = w.replace("_", " ")
            if "PER" in tags[output[index]]:
                s += "<a style=\"color:red;\">" + w + "</a>" + " "
            elif "LOC" in tags[output[index]]:
                s += "<a style=\"color:green;\">" + w + "</a>" + " "
            elif "ORG" in tags[output[index]]:
                s += "<a style=\"color:yellow;\">" + w + "</a>" + " "
            elif "MISC" in tags[output[index]]:
                s += "<a style=\"color:blue;\">" + w + "</a>" + " "
            else:
                s += w + " "
        text_output += s.strip() + " "
    text_output = text_output.replace(" , ",
                                      ", ").replace(" . ", ". ").replace(
                                          " ; ", "; ").strip()
    print(text_output)
    return render_template("result.html",
                           data=[{
                               "label": text_output,
                               "query": paragraph
                           }])
Example #19
def load_corpus(corpus_file='truyen_kieu.txt', dictionary='dictionary.txt'):
    """
    :param nb_sentences: Use if all brown sentences are too many
    :return: index2word (list of string)
    """
    corpus = load_data(corpus_file)
    print('Building vocab ...')

    corpus = sent_tokenize(corpus)
    for i, sentence in enumerate(corpus):
        corpus[i] = word_tokenize(sentence)

    vocab = list(
        set([word.replace(' ', '_') for sent in corpus for word in sent]))
    with open(dictionary, 'w', encoding='utf8') as f:
        f.write('\n'.join(vocab))

    # ids: list of (list of word-id); look words up via the '_'-joined form used in the vocab
    word2id = {w: i for i, w in enumerate(vocab)}
    ids = [[word2id[w.replace(' ', '_')] for w in sent
            if w.replace(' ', '_') in word2id] for sent in corpus]

    return ids, vocab
Example #20
def parse_sent_one(content):
    regex_day = r'(\d+\/\d+)|(\d+\-\d+)'
    regex_num = r'\s\d+\s'
    regex_BN = r'BN\d+'

    arr_sents = []
    arr_sents_token = sent_tokenize(content)
    for sent in arr_sents_token:
        txt = sent
        date = [m.span() for m in re.finditer(regex_day, txt)]
        num = [m.span() for m in re.finditer(regex_num, txt)]
        BN = [m.span() for m in re.finditer(regex_BN, txt)]
        # Keep only sentences that mention a number or a patient id (BN...)
        if len(num) > 0 or len(BN) > 0:
            arr = token_sent(txt)
            vec_sent = get_w2v_sent(arr).tolist()
            arr_sents.append([sent, date, num, BN, vec_sent])
    return arr_sents
Example #21
def create_data(file=None, save_to=None):
    """
    text file to csv file: columns=['SENT#', 'WORD', 'POS', 'CHUNK', 'NER']
    """
    with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as fdata:
        text = fdata.read()

    # Tag every sentence with underthesea.ner and stack the per-sentence frames,
    # labelling each token with its sentence index
    frames = []
    for i, sent in enumerate(underthesea.sent_tokenize(text)):
        tdf = pd.DataFrame(underthesea.ner(sent),
                           columns=['WORD', 'POS', 'CHUNK', 'NER'])
        tdf.insert(loc=0, column='SENT#', value=[i] * len(tdf))
        frames.append(tdf)
    # DataFrame.append was removed in recent pandas; use concat instead
    df = pd.concat(frames, ignore_index=True) if frames \
        else pd.DataFrame(columns=['SENT#', 'WORD', 'POS', 'CHUNK', 'NER'])

    df = df.drop(columns=['CHUNK'])
    df.to_csv(save_to, index=False)
    print('saved to ' + save_to)
Example #22
def read_file(file_path, language='vi', sentence_segment=False, sen=False):
    res = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        res = [re.sub('\n|\u200b', '', line).strip() for line in lines]
        res = [line for line in res if len(line) > 0]
        if sen:
            return res[0]
        else:
            if sentence_segment:
                res_segmented = []
                if language == 'vi':
                    for line in res:
                        res_segmented += sent_tokenize(line)
                else:
                    for line in res:
                        res_segmented += sentence_tokenize(line)
                # punctuation = '[!"#$%&\'()*+,./;<=>?@[\\]^_`{|}~]'
                # res = [re.sub(punctuation, '', line) for line in res]
                return res_segmented
            else:
                return res
Example #23
def get_sentences_from_file(paths, idx, thres=10):
    global base_vocab

    content = ""
    num = 0
    error = 0
    print("---------------------------------------------- " + str(idx) +
          " ------------------------------------------------")
    vocab = set()
    for path in paths:
        print(str(idx) + "---------------" + path)
        try:
            with open(path, encoding="utf-8") as fs:
                data = fs.read()
                if (is_tcvn3_encoding(data)):
                    data = convert_tcvn3_to_unicode(data)
                tmp = sent_tokenize(data)

                for index in range(20, len(tmp) - 20):
                    if (len(tmp[index].split()) > 50):
                        tags = pos_tag(tmp[index])
                        sen = " ".join(
                            [tag[0] for tag in tags if tag[1] != "CH"])
                        line = formatSentence(sen)[0]
                        line = " ".join([
                            word if word in base_vocab else "<UNKNOWN>"
                            for word in line.split()
                        ])
                        print(line)
                        if (occurrence_counter(line) <= thres):
                            content += line + "\n"
                            # vocab.update(line.split())
                            num += 1

        except Exception:
            error += 1

    store_gz(content, "data_train/data_" + str(idx) + ".gz")
Example #24
 def __call__(self, text: str):
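     # Text normalization front end: standardize the input, split it into sentences,
     # then expand acronyms/teen code and convert each word with the g2p replacer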
     # print(text)
     text = UniStd(text)
     for pre_process in self._pre_processes:
         text = pre_process(text)
     # custom regex
     text = self._custom_regex_replacer(text)
     # text = vn_norm(text)
     result = []
     sents = sent_tokenize(text)
     for sent in sents:
         # print(sent)
         sent_result = []
         depends = word_tokenize(sent)
         if self._end_punctuation and depends[-1] not in punctuations:
             depends.append('.')
             # depends.append(('.', 9, 'punct'))
         for words in depends:
             # words, _, word_type = depend
             if len(words) == 1 and words in punctuations:
                 sent_result.append(words)
             else:
                 words = self._custom_simple_replacer(words)
                 words = self._acronym_replacer(words)
                 words = self._teen_code_replacer(words)
                 words = vn_norm(words)
                 word_split = words.split()
                 for word in word_split:
                     word = self._custom_simple_replacer(word)
                     word = self._acronym_replacer(word)
                     word = self._teen_code_replacer(word)
                     word = self._g2p_vn_replacer(word.lower(),
                                                  try_other=self._try_other)
                     sent_result.extend(word.split())
         sent_result = ' '.join(sent_result)
         result.append(sent_result)
     return result
Example #25
    def analyze(self, text, keyword, lower=False):
        """Main function to analyze text"""
        doc = sent_tokenize(text)
        doc = [self.filtering_sentence(sent, self.stopwords, keyword, lower) for sent in doc]

        # Filter sentences
        sentences = self.sentence_segment(doc, lower) # list of list of words
        
        # Build vocabulary
        vocab = get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = get_token_pairs(sentences, self.window_size)
        
        # Get normalized matrix
        g = get_matrix(vocab, token_pairs)
        
        # Initialization of weights (PageRank values)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight
        return get_keywords(self.node_weight, self.num_keywords)
Example #26
def extract_info(paragraph=None, time_public=None, model=None):
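    # Run NER over each sentence and collect per-patient records
    # ([id, age, sex, address, nationality]) plus (patient, relation, entity, time) triplets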
    if not model:
        model = load_model(model_dir + 'covid_ner.job')

    entity_list = ['HOS', 'LOC', 'FLIGHT', 'BN']
    BN_list = list()
    triplets = list()
    tmp_BNid = None
    BNid_set = set()
    BNS_bool = False

    paragraph = preprocess_raw(raw_text=paragraph)
    for sent in underthesea.sent_tokenize(paragraph):
        relation_list = list()
        cur_time = None

        my_ner_sent = ner_sent2(sent, model)

        idx = -1
        for it in my_ner_sent:
            idx += 1
            if it[1] == 'BN' and it[0] == 'BN':
                pass
            elif it[1] == 'BN' and 'BN' in it[0] and len(relation_list) == 0:
                tmp_BNid = it[0]
                if it[0] not in BNid_set:
                    # print('new ' + str(tmp_BNid))
                    # create and add BN to list
                    BNid_set.add(it[0])
                    myBN = [None] * 5
                    myBN[0] = tmp_BNid
                    BN_list.append(myBN)
            elif it[1] == 'BNS' and 'BN' in it[0]:
                BNS_bool = True
            elif it[1] == 'SEX':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[2] is None:
                            bn[2] = it[0]
                        break
            elif it[1] == 'AGE':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[1] is None:
                            bn[1] = it[0]
                        break
            elif it[1] == 'ADD':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[3] is None:
                            bn[3] = it[0]
                        break
            elif it[1] == 'NAT':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[4] is None: bn[4] = it[0]
                        break
            elif it[1] == 'TIME':
                cur_time = it
            elif it[1] == 'STATUS':
                time_x = None
                if cur_time:
                    time_x = cur_time
                else:  # find next time
                    for idx1 in range(idx + 1, len(my_ner_sent)):
                        it1 = my_ner_sent[idx1]
                        if it1[1] == 'TIME':
                            time_x = it1
                            break

                # "hiện tại", "hiện nay", "hiện" ("currently") -> use time_public instead
                if time_x and re.search(pattern=re_cur_time,
                                        string=time_x[0],
                                        flags=flags) and time_public:
                    time_x = (str(time_public), 'TIME')

                if BNS_bool:
                    for id in BNid_set:
                        triplets.append([(id, 'BN'), (it[0], 'R'),
                                         ('SARS-CoV-2', 'E'), time_x])
                elif tmp_BNid:
                    triplets.append([(tmp_BNid, 'BN'), (it[0], 'R'),
                                     ('SARS-CoV-2', 'E'), time_x])

            elif it[1] == 'R':
                relation_list.append(it)

            elif it[1] in entity_list:
                time_x = None
                if cur_time:
                    time_x = cur_time
                else:  # find next time
                    for idx1 in range(idx + 1, len(my_ner_sent)):
                        it1 = my_ner_sent[idx1]
                        if it1[1] == 'TIME':
                            time_x = it1
                            break

                # "hiện tại", "hiện nay", "hiện" ("currently") -> use time_public instead
                if time_x and re.search(pattern=re_cur_time,
                                        string=time_x[0],
                                        flags=flags) and time_public:
                    time_x = (str(time_public), 'TIME')

                if BNS_bool:
                    if len(relation_list) == 0:
                        tmp_relation = ('trên chuyến bay',
                                        'R') if it[1] == 'FLIGHT' else (
                                            'liên quan đến', 'R')
                        for id in BNid_set:
                            triplets.append([(id, 'BN'), tmp_relation, it,
                                             time_x])
                    else:
                        for id in BNid_set:
                            for relation in relation_list:
                                triplets.append([(id, 'BN'), relation, it,
                                                 time_x])

                elif tmp_BNid:
                    if len(relation_list) == 0:
                        tmp_relation = ('trên chuyến bay',
                                        'R') if it[1] == 'FLIGHT' else (
                                            'liên quan đến', 'R')
                        triplets.append([(tmp_BNid, 'BN'), tmp_relation, it,
                                         time_x])
                    else:
                        for relation in relation_list:
                            triplets.append([(tmp_BNid, 'BN'), relation, it,
                                             time_x])
                relation_list.clear()

    return BN_list, triplets
Example #27
 def __call__(self, text):
     return sent_tokenize(text)
Example #28
 def preprocess(self, text):
     sentences = sent_tokenize(text)
     features = self.convert_sentences_to_features(sentences)
     data = NERdataset(features, self.device)
     return DataLoader(data, batch_size=self.batch_size)
Example #29
def wordless_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wordless_text_utils.check_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        lang_texts = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other Languages
            'other': 'english'
        }

        sentences = nltk.sent_tokenize(text, language=lang_texts[lang])
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        sentences = [sentence.text for sentence in doc.sents]
    # Chinese & Japanese
    elif sentence_tokenizer in [
            main.tr('Wordless - Chinese Sentence Tokenizer'),
            main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        for line in text.splitlines():
            sentence_start = 0

            for i, char in enumerate(line):
                if i >= sentence_start and char in ['。', '！', '？', '!', '?']:
                    for j, char in enumerate(line):
                        if j > i and char not in [
                                '。', '！', '？', '!', '?', '’', '”', '）', ')'
                        ]:
                            sentences.append(line[sentence_start:j])

                            sentence_start = j

                            break

            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Thai
    elif sentence_tokenizer == 'PyThaiNLP - Thai Sentence Tokenizer':
        sentences = pythainlp.tokenize.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == 'Wordless - Tibetan Sentence Tokenizer':
        sentences = text.split()
    # Vietnamese
    elif sentence_tokenizer == 'Underthesea - Vietnamese Sentence Tokenizer':
        sentences = underthesea.sent_tokenize(text)

    sentences = wordless_text_utils.record_boundary_sentences(sentences, text)

    return sentences
Example #30
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # Input of SudachiPy cannot be more than 49149 BYTES
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Around 300 tokens per line, 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(
            text,
            section_size=main.settings_custom['files']['misc']
            ['read_files_in_chunks'])

    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }

            sentences.extend(
                nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0

                for i, char in enumerate(line):
                    if i >= sentence_start and char in [
                            '。', '！', '？', '!', '?'
                    ]:
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in [
                                    '。', '！', '？', '!', '?', '’', '”', '）', ')'
                            ]:
                                sentences.append(line[sentence_start:j])

                                sentence_start = j

                                break

                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(
                    wl_word_detokenization.wl_word_detokenize(
                        main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')

            tokens = main.botok_word_tokenizer.tokenize(section)

            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces
    sentences = [
        sentence_non_empty for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]