def cleaner_to_file(self, filename):
    filename = str(filename)
    with open(filename, "r", encoding='utf-8') as f:
        lines = f.read()
    from underthesea import sent_tokenize
    sentences = sent_tokenize(lines)
    if len(sentences) > 0:
        for line in sentences:
            cleaned_text = self.text_cleaner(line)
            for word in cleaned_text:
                self.words.append(word)
def parse_sent_true(content):
    regex_day = r'(\d+\/\d+)|(\d+\-\d+)'
    regex_num = r'\s\d+\s'
    regex_BN = r'BN\d+'
    arr_sents = []
    ind = content.find(':')
    content = content[ind + 2:]
    sents_token = []
    arr_sents_token = sent_tokenize(content)
    for elem in arr_sents_token:
        find = re.findall(';', elem)  # look for ';' in each sentence
        if len(find) == 1:
            sents_token += elem.split(';')
        else:
            sents_token.append(elem)
    for sent in sents_token:
        # txt = sent.strip()
        txt = sent
        date = [m.span() for m in re.finditer(regex_day, txt)]
        num = [m.span() for m in re.finditer(regex_num, txt)]
        BN = [m.span() for m in re.finditer(regex_BN, txt)]
        if len(num) > 0 or len(BN) > 0:
            arr = token_sent(txt)
            vec_sent = get_w2v_sent(arr).tolist()
            arr_sents.append([sent, date, num, BN, vec_sent])
        # else:
        #     vec_sent = []
        #     arr_sents.append([sent, date, num, BN, vec_sent])
    return arr_sents
def extract_info(text):
    places_temp = []
    items_temp = []
    sents = sent_tokenize(text)
    for sent in sents:
        sent = sent.replace('.', '')
        sent = sent.replace(',', '')
        sent = sent.replace(':', '')
        sent = " ".join(sent.split())
        words = word_tokenize(sent)
        while len(words) > 0:
            t = ''
            for w in words:
                t = t + ' ' + w
            t = t.lstrip()
            s, code = check(t)
            if s != '':
                print('------------>' + s)
                if code == 'item':
                    items_temp.append(s)
                elif code == 'place':
                    places_temp.append(s)
                else:
                    continue
                t = t.replace(s, '', 1)
                words = word_tokenize(t)
            else:
                t = t.replace(words[0], '', 1)
                words = word_tokenize(t)
    return places_temp, items_temp
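# A small, self-contained sketch of the control flow in extract_info above. KNOWN and
# toy_check() are hypothetical stand-ins for the project's real check() helper, which is
# defined elsewhere. The idea: join the remaining words into a candidate string, ask the
# checker for a known place/item inside it, remove the match, and re-split until no
# words are left.
KNOWN = {"central market": "place", "face mask": "item"}

def toy_check(t):
    for phrase, code in KNOWN.items():
        if phrase in t:
            return phrase, code
    return '', ''

words = "bought a face mask at the central market".split()
places, items = [], []
while len(words) > 0:
    t = ' '.join(words)
    s, code = toy_check(t)
    if s != '':
        (items if code == 'item' else places).append(s)
        t = t.replace(s, '', 1)
    else:
        t = t.replace(words[0], '', 1)
    words = t.split()
print(places, items)  # ['central market'] ['face mask']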
def tokenize(self, text, never_split=None, with_info=False, **kwargs):
    """Tokenizes a piece of text."""
    never_split = self.never_split + (never_split if never_split is not None else [])
    text = unicodedata.normalize('NFKC', text)

    tokens = []
    token_infos = []
    cursor = 0
    for line in sent_tokenize(text):
        if line == 'EOS':
            if self.preserve_spaces and len(text[cursor:]) > 0:
                tokens.append(text[cursor:])
                token_infos.append(None)
            break

        token = line
        token_start = text.index(token, cursor)
        token_end = token_start + len(token)
        if self.preserve_spaces and cursor < token_start:
            tokens.append(text[cursor:token_start])
        if self.do_lower_case and token not in never_split:
            token = token.lower()
        tokens.append(token)
        cursor = token_end

    return tokens
def convert_mode_veryshort(input_file, output_file, encoding):
    with open(input_file, 'r', encoding=encoding) as stream:
        squad = json.load(stream)
    convertedData = []

    # Remove _ symbol in title
    for data in squad['data']:
        data['title'] = " ".join(data['title'].split('_'))

    # Format 2: Sentence as Text
    for data in tqdm(squad['data']):
        for paragraph in data['paragraphs']:
            # Split the paragraph into sentences & record each sentence's start index
            # for easier processing
            para_context = sent_tokenize(paragraph['context'])  # Context split into a list of sentences
            para_sent_startidxs = [
                paragraph['context'].index(sentence) for sentence in para_context
            ]

            # Process question-answer pairs
            for qas in paragraph['qas']:
                # Prepare data to save
                zaloQAS = {
                    'id': qas['id'],
                    'question': qas['question'],
                    'title': data['title'],
                    'label': False if qas['is_impossible'] else True
                }
                _question_len = get_word_count(qas['question'])

                # Loop & get answer text for each qa pair
                if len(qas['answers']) != 0 and qas['is_impossible'] is False \
                        and qas['answers'][0]['answer_start'] != -1:
                    # Only 1 answer, but rephrased
                    answer = qas['answers'][0]

                    # Find the sentence & sentence index that contains the answer
                    _text = None
                    for idx in range(len(para_context)):
                        if para_sent_startidxs[idx] > answer['answer_start']:
                            continue
                        elif para_sent_startidxs[idx] < answer['answer_start'] \
                                < para_sent_startidxs[idx] + len(para_context[idx]):
                            _text = para_context[idx]
                            break
                        else:
                            break
                    zaloQAS['text'] = "" if _text is None else _text
                else:
                    # Negative example: pick a random sentence from the paragraph
                    zaloQAS['text'] = para_context[random.randint(0, len(para_context) - 1)] \
                        if len(para_context) >= 1 else ""

                # Add data instance
                convertedData.append(zaloQAS)

    # Export converted data
    with open(output_file, 'w', encoding=encoding) as stream:
        stream.write(json.dumps(convertedData, ensure_ascii=False))
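# Self-contained sketch (made-up English strings, not from the converter above) of the
# offset bookkeeping used there: record each sentence's start index within the paragraph,
# then pick the sentence whose character span contains the answer's answer_start offset.
context = "The cat sleeps. The dog barks loudly. Birds sing."
para_context = ["The cat sleeps.", "The dog barks loudly.", "Birds sing."]
para_sent_startidxs = [context.index(sentence) for sentence in para_context]
answer_start = context.index("barks")
containing_sentence = next(
    sent for sent, start in zip(para_context, para_sent_startidxs)
    if start <= answer_start < start + len(sent))
print(containing_sentence)  # -> "The dog barks loudly."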
def _get_paragraph_bert_sentences(p):
    res = ""
    for sent in sent_tokenize(p):
        if _is_valid_sent(sent):
            res += sent + "\n"
    return res.strip()
def test_1(self):
    text = "Taylor cho biết lúc đầu cô cảm thấy ngại với cô bạn thân Amanda nhưng rồi mọi thứ trôi qua nhanh chóng. Amanda cũng thoải mái với mối quan hệ này."
    actual = sent_tokenize(text)
    expected = [
        "Taylor cho biết lúc đầu cô cảm thấy ngại với cô bạn thân Amanda nhưng rồi mọi thứ trôi qua nhanh chóng.",
        "Amanda cũng thoải mái với mối quan hệ này."
    ]
    self.assertEqual(actual, expected)
def process_part(text):
    result = []
    for line in text.split('\n'):
        sentences = sent_tokenize(line)
        for s in sentences:
            words = annotator.tokenize(s)[0]
            result.append(' '.join(words))
    return result
def ner():
    a = request.form['content']
    corpus = sent_tokenize(a)
    b = []
    for sen in corpus:
        x_test = []
        for word in word_tokenize(sen, format="text").split(' '):
            x_test.extend(pos_tag(word))
        b.append(x_test)
    b1 = [ner_train.get_features(s) for s in b]
    c = crf.predict(b1)
    return json.dumps([b, c])
def extract_relation(text):
    """
    Return all relations extracted from the text.
    :param text: raw text (split into sentences internally)
    :return: list of per-sentence relation lists
    """
    list_sent = sent_tokenize(text)
    relation_text = []
    for sent in list_sent:
        relation_sent = extract_re_sent(sent)
        if len(relation_sent) > 0:
            relation_text.append(relation_sent)
    return relation_text
def _truncate_seq_pair(ques, text, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    ques_1 = ques
    sens = sent_tokenize(text)
    sens_t = []
    sen_tokens = []
    ques_tokens = tokenizer.tokenize(ques_1)
    for sen in sens:
        tokens = tokenizer.tokenize(sen)
        sen_tokens.append(tokens)
        sens_t.append(' '.join(tokens))
    ques_in = ' '.join(ques_tokens)

    def ranking_ques_sentences(ques, sentences):
        corpus = [ques]
        corpus.extend(sentences)
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
        X = X.toarray()
        ques = np.array([(len(X) - 1) * X[0]])
        sens = np.array(X[1:])
        rank_list = cosine_similarity(ques, sens).reshape(-1).tolist()
        return sorted(range(len(rank_list)), key=lambda k: rank_list[k])

    # Heuristic: rank the sentences by cosine similarity to the question and drop whole
    # sentences, least similar first, until the combined length fits within max_length.
    rl = None
    i = 0
    while True:
        total_length = sum([len(sen) for sen in sen_tokens]) + len(ques_tokens)
        if total_length <= max_length:
            break
        else:
            if i == 0:
                rl = ranking_ques_sentences(ques_in, sens_t)
            try:
                sen_tokens[rl[i]] = []
                i = i + 1
            except IndexError:
                return None, None

    tokens_b = []
    for sen_ in sen_tokens:
        if len(sen_) != 0:
            tokens_b += sen_
    return ques_tokens, tokens_b
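# Stand-alone illustration (toy English data; CountVectorizer and cosine_similarity come
# from scikit-learn, as in the helper above) of the ranking idea inside
# ranking_ques_sentences: vectorize the question together with the candidate sentences,
# score each sentence by cosine similarity to the question, and return sentence indices
# sorted from least to most similar so the least relevant ones are dropped first.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

question = "where is the train station"
sentences = [
    "the train station is next to the park",
    "it rained for most of the day",
    "tickets are sold at the station",
]
X = CountVectorizer().fit_transform([question] + sentences).toarray()
scores = cosine_similarity(np.array([X[0]]), np.array(X[1:])).reshape(-1).tolist()
drop_order = sorted(range(len(scores)), key=lambda k: scores[k])  # least similar first
print(drop_order)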
def doc2words(file):
    with open(file) as f:
        text = f.read()
    doc_words = []
    for line in text.split('\n'):
        sentences = sent_tokenize(line)
        for s in sentences:
            sent_words = annotator.tokenize(s)[0]
            for w in sent_words:
                doc_words.append(w.lower())
    return doc_words
def tokenize_sentence(text, crude=None):
    sens = sent_tokenize(text)
    final = []
    for sen in sens:
        retok = _check_missing_punct(sen)
        final += retok
    # if crude is not None and len(final) < 2:
    #     for p in crude:
    #         final = []
    #         for sen in sens:
    #             final += sen.split('.')
    #         final = [s for s in final if len(s) > 0]
    return final
def fil_content(news):
    sentence_li = sent_tokenize(news)
    for sentence in sentence_li:
        word_li = word_tokenize(sentence)
        count = 0
        for word in li_1:
            if word in word_li:
                count += 1
            else:
                continue
        if count >= 2:
            print("Negative content")
        else:
            print("It's ok")
def clean(file):
    print(file)
    with open(join(RAW_FOLDER, file)) as f:
        content = f.read()
    with open(join(CLEANED_FOLDER, file), 'w') as out_file:
        out_file.write('')
    with open(join(CLEANED_FOLDER, file), 'a') as out_file:
        for line in content.split("\n"):
            if not check_line(line):
                continue
            sents = sent_tokenize(line)
            for sent in sents:
                if not check_line(sent):
                    continue
                out_file.write(sent.strip() + '\n')
    return
def underthesea_annotate(self, text, mode):
    if mode == 'sent_tokenize':
        return sent_tokenize(text)
    elif mode == 'word_tokenize':
        return word_tokenize(text)
    elif mode == 'pos_tag':
        return pos_tag(text)
    elif mode == 'chunk':
        return chunk(text)
    elif mode == 'ner':
        return ner(text)
    elif mode == 'classify':
        return classify(text)
    elif mode == 'sentiment':
        return sentiment(text)
    else:
        raise Exception("Wrong request, please check your request")
def read_raw_file(self) -> list:
    noun_list = []
    adj_list = []
    with open("test_a1000.txt", "r+") as f:
        for line in f:
            result = pos_tag(line)
            print(result)
            print('\n')
            record_n = []
            record_adj = []
            self.sentences.extend(sent_tokenize(line))
            for item in result:
                if self.is_noun(item[1]) and self.one_word_prune(item[0]):
                    record_n.append(str(item[0]).lower())
                if item[1] == 'A' or item[1] == 'AP':
                    record_adj.append(str(item[0]).lower())
            noun_list.append(record_n)
            adj_list.append(record_adj)
    self.transaction = noun_list
    return noun_list
def phantich():
    paragraph = request.form['query']
    list_sents = sent_tokenize(paragraph)
    text_output = ""
    for sent in list_sents:
        example_token = ViTokenizer.tokenize(sent)
        x_example = []
        for word in example_token.split(" "):
            try:
                x_example.append(word2idx[word])
            except KeyError:
                x_example.append(word2idx["UNK"])
        x_example = pad_sequences(maxlen=max_len, sequences=[x_example],
                                  padding="post", value=word2idx["PADword"])
        output = model.predict(np.array(x_example))
        output = np.argmax(output, axis=-1)[0]
        s = ""
        for index, w in enumerate(example_token.split(" ")):
            w = w.replace("_", " ")
            if "PER" in tags[output[index]]:
                s += "<a style=\"color:red;\">" + w + "</a>" + " "
            elif "LOC" in tags[output[index]]:
                s += "<a style=\"color:green;\">" + w + "</a>" + " "
            elif "ORG" in tags[output[index]]:
                s += "<a style=\"color:yellow;\">" + w + "</a>" + " "
            elif "MISC" in tags[output[index]]:
                s += "<a style=\"color:blue;\">" + w + "</a>" + " "
            else:
                s += w + " "
        text_output += s.strip() + " "
    text_output = text_output.replace(" , ", ", ").replace(" . ", ". ").replace(" ; ", "; ").strip()
    print(text_output)
    return render_template("result.html", data=[{
        "label": text_output,
        "query": paragraph
    }])
def load_corpus(corpus_file='truyen_kieu.txt', dictionary='dictionary.txt'):
    """
    :param corpus_file: path to the raw text corpus
    :param dictionary: path to write the extracted vocabulary to
    :return: ids (list of lists of word ids), vocab (list of words)
    """
    corpus = load_data(corpus_file)

    print('Building vocab ...')
    corpus = sent_tokenize(corpus)
    for i, sentence in enumerate(corpus):
        corpus[i] = word_tokenize(sentence)
    vocab = list(
        set([word.replace(' ', '_') for sent in corpus for word in sent]))
    with open(dictionary, 'w', encoding='utf8') as f:
        f.write('\n'.join(vocab))

    # ids: list of (list of word-id)
    ids = [[vocab.index(w) for w in sent if w in vocab] for sent in corpus]
    return ids, vocab
def parse_sent_one(content):
    regex_day = r'(\d+\/\d+)|(\d+\-\d+)'
    regex_num = r'\s\d+\s'
    regex_BN = r'BN\d+'
    arr_sents = []
    arr_sents_token = sent_tokenize(content)
    for sent in arr_sents_token:
        # txt = sent.strip()
        txt = sent
        date = [m.span() for m in re.finditer(regex_day, txt)]
        num = [m.span() for m in re.finditer(regex_num, txt)]
        BN = [m.span() for m in re.finditer(regex_BN, txt)]
        if len(num) > 0 or len(BN) > 0:
            arr = token_sent(txt)
            vec_sent = get_w2v_sent(arr).tolist()
            arr_sents.append([sent, date, num, BN, vec_sent])
        # else:
        #     vec_sent = []
        #     arr_sents.append([sent, date, num, BN, vec_sent])
    return arr_sents
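# Quick, self-contained demo (invented sample sentence) of what the three regexes in
# parse_sent_one / parse_sent_true pick up: regex_day matches day/month strings such as
# "21/7" or "21-7", regex_BN matches patient identifiers such as "BN412", and regex_num
# matches standalone numbers surrounded by spaces.
import re

sample = "Ngày 21/7 ghi nhận BN412 và 3 ca khác."
print([m.group() for m in re.finditer(r'(\d+\/\d+)|(\d+\-\d+)', sample)])  # day/month strings
print([m.group() for m in re.finditer(r'BN\d+', sample)])                  # patient IDs
print([m.group() for m in re.finditer(r'\s\d+\s', sample)])                # standalone numbers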
def create_data(file=None, save_to=None):
    """
    Convert a text file to a csv file with columns ['SENT#', 'WORD', 'POS', 'CHUNK', 'NER']
    (the CHUNK column is dropped before saving).
    """
    text = ''
    with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as fdata:
        lines = fdata.readlines()
        for line in lines:
            text += line
    df = pd.DataFrame(columns=['SENT#', 'WORD', 'POS', 'CHUNK', 'NER'])
    i = 0
    for sent in underthesea.sent_tokenize(text):
        tdf = pd.DataFrame(underthesea.ner(sent),
                           columns=['WORD', 'POS', 'CHUNK', 'NER'])
        tdf.insert(loc=0, column='SENT#', value=[i] * len(tdf))
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df = pd.concat([df, tdf], ignore_index=True)
        i += 1
    df = df.drop(columns=['CHUNK'])
    df.to_csv(save_to, index=False)
    print('saved to ' + save_to)
def read_file(file_path, language='vi', sentence_segment=False, sen=False):
    res = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        res = [re.sub('\n|\u200b', '', line).strip() for line in lines]
        res = [line for line in res if len(line) > 0]
    if sen:
        return res[0]
    else:
        if sentence_segment:
            res_segmented = []
            if language == 'vi':
                for line in res:
                    res_segmented += sent_tokenize(line)
            else:
                for line in res:
                    res_segmented += sentence_tokenize(line)
            # punctuation = '[!"#$%&\'()*+,./;<=>?@[\\]^_`{|}~]'
            # res = [re.sub(punctuation, '', line) for line in res]
            return res_segmented
        else:
            return res
def get_sentences_from_file(paths, idx, thres=10):
    global base_vocab
    content = ""
    num = 0
    error = 0
    print("---------------------------------------------- " + str(idx) +
          " ------------------------------------------------")
    vocab = set()
    for path in paths:
        print(str(idx) + "---------------" + path)
        try:
            with open(path, encoding="utf-8") as fs:
                data = fs.read()
                if is_tcvn3_encoding(data):
                    data = convert_tcvn3_to_unicode(data)
                tmp = sent_tokenize(data)
                for index in range(20, len(tmp) - 20):
                    if len(tmp[index].split()) > 50:
                        tags = pos_tag(tmp[index])
                        sen = " ".join(
                            [tag[0] for tag in tags if tag[1] != "CH"])
                        line = formatSentence(sen)[0]
                        line = " ".join([
                            word if word in base_vocab else "<UNKNOWN>"
                            for word in line.split()
                        ])
                        print(line)
                        if occurrence_counter(line) <= thres:
                            content += line + "\n"
                            # vocab.update(line.split())
                            num += 1
        except Exception:
            error += 1
    store_gz(content, "data_train/data_" + str(idx) + ".gz")
def __call__(self, text: str):
    # print(text)
    text = UniStd(text)
    for pre_process in self._pre_processes:
        text = pre_process(text)

    # custom regex
    text = self._custom_regex_replacer(text)
    # text = vn_norm(text)

    result = []
    sents = sent_tokenize(text)
    for sent in sents:
        # print(sent)
        sent_result = []
        depends = word_tokenize(sent)
        if self._end_punctuation and depends[len(depends) - 1] not in punctuations:
            depends.append('.')
            # depends.append(('.', 9, 'punct'))
        for words in depends:
            # words, _, word_type = depend
            if len(words) == 1 and words in punctuations:
                sent_result.append(words)
            else:
                words = self._custom_simple_replacer(words)
                words = self._acronym_replacer(words)
                words = self._teen_code_replacer(words)
                words = vn_norm(words)
                word_split = words.split()
                for word in word_split:
                    word = self._custom_simple_replacer(word)
                    word = self._acronym_replacer(word)
                    word = self._teen_code_replacer(word)
                    word = self._g2p_vn_replacer(word.lower(), try_other=self._try_other)
                    sent_result.extend(word.split())
        sent_result = ' '.join(sent_result)
        result.append(sent_result)
    return result
def analyze(self, text, keyword, lower=False):
    """Main function to analyze text"""
    doc = sent_tokenize(text)
    # Filter sentences
    doc = [self.filtering_sentence(sent, self.stopwords, keyword, lower) for sent in doc]
    sentences = self.sentence_segment(doc, lower)  # list of list of words

    # Build vocabulary
    vocab = get_vocab(sentences)

    # Get token_pairs from windows
    token_pairs = get_token_pairs(sentences, self.window_size)

    # Get normalized matrix
    g = get_matrix(vocab, token_pairs)

    # Initialization of weights (PageRank values)
    pr = np.array([1] * len(vocab))

    # Iteration
    previous_pr = 0
    for epoch in range(self.steps):
        pr = (1 - self.d) + self.d * np.dot(g, pr)
        if abs(previous_pr - sum(pr)) < self.min_diff:
            break
        else:
            previous_pr = sum(pr)

    # Get weight for each node
    node_weight = dict()
    for word, index in vocab.items():
        node_weight[word] = pr[index]
    self.node_weight = node_weight

    return get_keywords(self.node_weight, self.num_keywords)
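# Minimal stand-alone sketch of the damped PageRank-style update used in analyze above.
# Assumptions: g is a column-normalized word co-occurrence matrix over a tiny 3-word
# vocabulary and d is the damping factor; the values below are made up for illustration.
import numpy as np

d, steps, min_diff = 0.85, 10, 1e-5
g = np.array([[0.0, 0.5, 1.0],
              [0.5, 0.0, 0.0],
              [0.5, 0.5, 0.0]])   # each column sums to 1
pr = np.array([1.0, 1.0, 1.0])   # initial weight for every word
previous_pr = 0
for _ in range(steps):
    pr = (1 - d) + d * np.dot(g, pr)
    if abs(previous_pr - sum(pr)) < min_diff:
        break
    previous_pr = sum(pr)
print(pr)  # larger values mark more central words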
def extract_info(paragraph=None, time_public=None, model=None):
    if not model:
        model = load_model(model_dir + 'covid_ner.job')
    entity_list = ['HOS', 'LOC', 'FLIGHT', 'BN']
    BN_list = list()
    triplets = list()
    tmp_BNid = None
    BNid_set = set()
    BNS_bool = False
    paragraph = preprocess_raw(raw_text=paragraph)
    for sent in underthesea.sent_tokenize(paragraph):
        relation_list = list()
        cur_time = None
        my_ner_sent = ner_sent2(sent, model)
        idx = -1
        for it in my_ner_sent:
            idx += 1
            if it[1] == 'BN' and it[0] == 'BN':
                pass
            elif it[1] == 'BN' and 'BN' in it[0] and len(relation_list) == 0:
                tmp_BNid = it[0]
                if it[0] not in BNid_set:
                    # print('new ' + str(tmp_BNid))
                    # create and add BN to list
                    BNid_set.add(it[0])
                    myBN = [None] * 5
                    myBN[0] = tmp_BNid
                    BN_list.append(myBN)
            elif it[1] == 'BNS' and 'BN' in it[0]:
                BNS_bool = True
            elif it[1] == 'SEX':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[2] is None:
                            bn[2] = it[0]
                        break
            elif it[1] == 'AGE':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[1] is None:
                            bn[1] = it[0]
                        break
            elif it[1] == 'ADD':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[3] is None:
                            bn[3] = it[0]
                        break
            elif it[1] == 'NAT':
                for bn in BN_list:
                    if bn[0] == tmp_BNid:
                        if bn[4] is None:
                            bn[4] = it[0]
                        break
            elif it[1] == 'TIME':
                cur_time = it
            elif it[1] == 'STATUS':
                time_x = None
                if cur_time:
                    time_x = cur_time
                else:
                    # find next time
                    for idx1 in range(idx + 1, len(my_ner_sent)):
                        it1 = my_ner_sent[idx1]
                        if it1[1] == 'TIME':
                            time_x = it1
                            break
                # "hien tai", "hien nay", "hien" ("currently") -> use time_public instead
                if time_x and re.search(pattern=re_cur_time, string=time_x[0], flags=flags) and time_public:
                    time_x = (str(time_public), 'TIME')
                if BNS_bool:
                    for id in BNid_set:
                        triplets.append([(id, 'BN'), (it[0], 'R'), ('SARS-CoV-2', 'E'), time_x])
                elif tmp_BNid:
                    triplets.append([(tmp_BNid, 'BN'), (it[0], 'R'), ('SARS-CoV-2', 'E'), time_x])
            elif it[1] == 'R':
                relation_list.append(it)
            elif it[1] in entity_list:
                time_x = None
                if cur_time:
                    time_x = cur_time
                else:
                    # find next time
                    for idx1 in range(idx + 1, len(my_ner_sent)):
                        it1 = my_ner_sent[idx1]
                        if it1[1] == 'TIME':
                            time_x = it1
                            break
                # "hien tai", "hien nay", "hien" ("currently") -> use time_public instead
                if time_x and re.search(pattern=re_cur_time, string=time_x[0], flags=flags) and time_public:
                    time_x = (str(time_public), 'TIME')
                if BNS_bool:
                    if len(relation_list) == 0:
                        tmp_relation = ('trên chuyến bay', 'R') if it[1] == 'FLIGHT' else (
                            'liên quan đến', 'R')
                        for id in BNid_set:
                            triplets.append([(id, 'BN'), tmp_relation, it, time_x])
                    else:
                        for id in BNid_set:
                            for relation in relation_list:
                                triplets.append([(id, 'BN'), relation, it, time_x])
                elif tmp_BNid:
                    if len(relation_list) == 0:
                        tmp_relation = ('trên chuyến bay', 'R') if it[1] == 'FLIGHT' else (
                            'liên quan đến', 'R')
                        triplets.append([(tmp_BNid, 'BN'), tmp_relation, it, time_x])
                    else:
                        for relation in relation_list:
                            triplets.append([(tmp_BNid, 'BN'), relation, it, time_x])
                relation_list.clear()
    return BN_list, triplets
def __call__(self, text):
    return sent_tokenize(text)
def preprocess(self, text):
    sentences = sent_tokenize(text)
    features = self.convert_sentences_to_features(sentences)
    data = NERdataset(features, self.device)
    return DataLoader(data, batch_size=self.batch_size)
def wordless_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'
    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization']['sentence_tokenizers'][lang]

    wordless_text_utils.check_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        lang_texts = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other languages
            'other': 'english'
        }
        sentences = nltk.sent_tokenize(text, language=lang_texts[lang])
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True
        sentences = [sentence.text for sentence in doc.sents]
    # Chinese & Japanese
    elif sentence_tokenizer in [
            main.tr('Wordless - Chinese Sentence Tokenizer'),
            main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        for line in text.splitlines():
            sentence_start = 0
            for i, char in enumerate(line):
                if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                    for j, char in enumerate(line):
                        if j > i and char not in [
                                '。', '!', '?', '!', '?', '’', '”', ')', ')'
                        ]:
                            sentences.append(line[sentence_start:j])
                            sentence_start = j
                            break
            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Thai
    elif sentence_tokenizer == 'PyThaiNLP - Thai Sentence Tokenizer':
        sentences = pythainlp.tokenize.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == 'Wordless - Tibetan Sentence Tokenizer':
        sentences = text.split()
    # Vietnamese
    elif sentence_tokenizer == 'Underthesea - Vietnamese Sentence Tokenizer':
        sentences = underthesea.sent_tokenize(text)

    sentences = wordless_text_utils.record_boundary_sentences(sentences, text)
    return sentences
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'
    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization']['sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # Input of SudachiPy cannot be more than 49149 BYTES
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Around 300 tokens per line, 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(
            text,
            section_size=main.settings_custom['files']['misc']['read_files_in_chunks'])

    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }
            sentences.extend(
                nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)
            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)
            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0
                for i, char in enumerate(line):
                    if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in [
                                    '。', '!', '?', '!', '?', '’', '”', ')', ')'
                            ]:
                                sentences.append(line[sentence_start:j])
                                sentence_start = j
                                break
                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(
                    wl_word_detokenization.wl_word_detokenize(
                        main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')
            tokens = main.botok_word_tokenizer.tokenize(section)
            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces and drop empty sentences
    sentences = [
        sentence_non_empty
        for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]

    return sentences