class NERTokenizer:
    """Tokenize raw text into sentences of whitespace-free tokens for NER."""

    def __init__(self):
        # Word-level tokenizer: camel-case splitting on, plain-string tokens.
        self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        """Split *text* into sentences.

        Returns a list of ``(tokens, [])`` pairs — the empty list is a
        placeholder for per-token labels. Sentences that come back empty
        from the splitter are dropped.
        """
        tokens = self._word_tokenizer.tokenize_paragraph(text)
        result = []
        for sentence in self._sentence_splitter.split(tokens):
            # Strip any internal spaces the tokenizer left inside a token.
            cleaned = [token.replace(" ", "") for token in sentence]
            if cleaned:
                result.append((cleaned, []))
        return result
def get_sents(texts):
    """Sentence-split each text in *texts* and return the cleaned sentences.

    Each input text is tokenized, split into sentences, and every sentence
    is re-joined and normalized with ``clean`` (German, URLs/digits/
    punctuation/line breaks removed). Returns one list of sentence strings
    per input text.
    """
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    splitter = SentenceSplitter(is_tuple=False)
    results = []
    for text in texts:
        sentences = splitter.split(tokenizer.tokenize_paragraph(text))
        results.append([
            clean(' '.join(sentence), no_urls=True, no_digits=True,
                  no_punct=True, no_line_breaks=True, lang='de')
            for sentence in sentences
        ])
    return results
def predict(input_text, model=learner):
    """Predict sentence boundaries for *input_text*.

    The language is detected with the spaCy pipeline ``nlp``; English and
    German inputs are tokenized with the matching tokenizer configuration and
    sentence punctuation tokens (``, . ? !``) are dropped, while any other
    language falls back to a regex strip of those characters. A one-row CSV
    is written for the prediction data loader, and the model's token-level
    labels are folded back into the result string.

    Parameters
    ----------
    input_text : str
        Raw text to segment.
    model : optional
        Prediction object exposing ``predict(dataloader)``; defaults to the
        module-level ``learner``.

    Returns
    -------
    str
        The reconstructed string with predicted boundaries, or *input_text*
        unchanged (falsy) when nothing survives punctuation stripping.
    """
    detected = nlp(input_text)._.language['language']

    if 'en' in detected:
        tokenizer = Tokenizer(language="en")
        input_txt = ' '.join(tok for tok in tokenizer.tokenize_paragraph(input_text)
                             if tok not in [',', '.', '?', '!'])
    elif 'de' in detected:
        tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        input_txt = ' '.join(tok for tok in tokenizer.tokenize_paragraph(input_text)
                             if tok not in [',', '.', '?', '!'])
    else:
        # French and all other languages were byte-identical branches in the
        # original: English tokenizer for label sizing, regex punctuation strip.
        tokenizer = Tokenizer(language="en")
        input_txt = re.sub(r'[,.?!]', '', input_text).strip()

    # One 'BOS ' placeholder label per token of the cleaned text.
    labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))

    if not input_txt:
        return input_txt

    # NOTE(review): language is hard-coded regardless of detection — confirm
    # whether the downstream loader actually consumes this column.
    language = 'English'

    X = pd.DataFrame([(input_txt, labels, language)],
                     columns=['Sentences', 'labels', 'language'])
    X.to_csv('/data/vchordia/sen_boundary/X.csv', index=False)

    dl = get_data_loader_for_predict(data, df_path="/data/vchordia/sen_boundary/X.csv")
    # Fix: honor the `model` parameter — the original ignored it and always
    # called the global `learner`, so a caller-supplied model had no effect.
    preds = model.predict(dl)
    pred_tokens, pred_labels = bert_labels2tokens(dl, preds[0])
    return final_str(pred_tokens, pred_labels)