def __init__(self):
    self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    self._sentence_splitter = SentenceSplitter()
def process_text_line(line):
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize(line)

    # sentence_splitter = SentenceSplitter()
    # sentences = sentence_splitter.split(tokens)
    # With the sentence splitter disabled, wrap the token list so the whole
    # line is treated as a single sentence (joining a bare token string would
    # otherwise insert spaces between its characters).
    sentences = [tokens]

    result = []
    for s in sentences:
        if PROCESS_DISCUSSION:
            s = remove_discussion_suffix(s)
        if len(s) >= 4:
            sentence_string = " ".join(s)
            if PROCESS_DISCUSSION:
                # check if this line still contains a dirty comment:
                if "( CEST )" not in sentence_string and "( CET )" not in sentence_string:
                    result.append(sentence_string)
            else:
                result.append(sentence_string)
    return result
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes, args.extra_info, args.language)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info, args.language)
    if is_xml:
        if args.parallel > 1:
            logging.warning("Parallel tokenization of XML files is currently not supported.")
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokenized_paragraphs = [tokenizer.tokenize_xml(args.FILE)]
        if args.split_sentences:
            tokenized_paragraphs = list(sentence_splitter.split_xml(tokenized_paragraphs[0], eos_tags))
    else:
        if args.paragraph_separator == "empty_lines":
            paragraphs = utils.get_paragraphs(args.FILE)
        elif args.paragraph_separator == "single_newlines":
            paragraphs = (line for line in args.FILE if line.strip() != "")
        if args.parallel > 1:
            pool = multiprocessing.Pool(min(args.parallel, multiprocessing.cpu_count()))
            tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
        else:
            tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
        tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
        if args.split_sentences:
            tokenized_paragraphs = map(sentence_splitter.split, tokenized_paragraphs)
            tokenized_paragraphs = (s for tp in tokenized_paragraphs for s in tp)
    if args.token_classes or args.extra_info:
        if is_xml:
            tokenized_paragraphs = ([(l[0],) if l[1] is None else l for l in tp] for tp in tokenized_paragraphs)
        tokenized_paragraphs = (["\t".join(t) for t in tp] for tp in tokenized_paragraphs)
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" % (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
def SentenceSplit(text):
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    tokens = tokenizer.tokenize(text)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
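A minimal usage sketch for the SentenceSplit helper above, assuming the pre-1.x SoMaJo API used throughout these snippets (Tokenizer.tokenize returns a flat list of token strings and SentenceSplitter.split groups them into per-sentence lists); the German sample sentence is made up:

for sentence_tokens in SentenceSplit("Das ist ein Satz. Und hier kommt noch einer."):
    # each item is one sentence as a list of token strings
    print(" ".join(sentence_tokens))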
def __init__(self, language='en'):
    self.language = language
    if language == 'en':
        self.tokenizer = TreebankTokenizer()
    elif language == 'de':
        self.tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    else:
        raise NotImplementedError
def build_list(filename):
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    gazetteers = set()
    f = open(filename, 'r', encoding='utf-8')
    for line in f.readlines():
        gazetteers.add(' '.join(tokenizer.tokenize(line.strip())))
    f.close()
    print('read {}'.format(filename))
    return gazetteers
def get_sents(texts):
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    results = []
    for text in texts:
        # text = clean(text, lang='de', lower=False)
        tokens = tokenizer.tokenize_paragraph(text)
        sentences = sentence_splitter.split(tokens)
        cleaned = [clean(' '.join(s), no_urls=True, no_digits=True, no_punct=True,
                         no_line_breaks=True, lang='de')
                   for s in sentences]
        results.append(cleaned)
    return results
class NERTokenizer:

    def __init__(self):
        self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        tokens = self._word_tokenizer.tokenize_paragraph(text)
        sentences_tokenized = self._sentence_splitter.split(tokens)

        sentences = []
        for sen in sentences_tokenized:
            sen = [tok.replace(" ", "") for tok in sen]
            if len(sen) == 0:
                continue
            sentences.append((sen, []))
        return sentences
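A hedged usage sketch for the NERTokenizer wrapper above; the sample text and the print format are illustrative and not part of the original snippet:

ner_tokenizer = NERTokenizer()
# parse_text returns (token_list, label_list) pairs, one per detected sentence;
# the label lists start out empty and are meant to be filled by a later NER step.
for tokens, labels in ner_tokenizer.parse_text("Angela Merkel besuchte Berlin. Danach reiste sie weiter."):
    print(tokens, labels)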
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes, args.extra_info)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info)
    if args.paragraph_separator == "empty_lines":
        paragraphs = get_paragraphs(args.FILE)
    elif args.paragraph_separator == "single_newlines":
        paragraphs = (line for line in args.FILE if line.strip() != "")
    if args.parallel > 1:
        pool = multiprocessing.Pool(min(args.parallel, multiprocessing.cpu_count()))
        tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
    else:
        tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
    tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
    if args.split_sentences:
        tokenized_paragraphs = map(sentence_splitter.split, tokenized_paragraphs)
        tokenized_paragraphs = (s for tp in tokenized_paragraphs for s in tp)
    if args.token_classes or args.extra_info:
        tokenized_paragraphs = (["\t".join(t) for t in tp] for tp in tokenized_paragraphs)
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" % (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
class TestTokenizer(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(language="de_CMC", split_camel_case=True)

    def _equal(self, raw, tokenized):
        """"""
        if isinstance(tokenized, str):
            tokenized = tokenized.split()
        dll = DLL([Token(raw, first_in_sentence=True, last_in_sentence=True)])
        tokens = self.tokenizer._tokenize(dll)
        self.assertEqual([t.text for t in tokens], tokenized)

    def _equal_xml(self, raw, tokenized):
        """"""
        if isinstance(tokenized, str):
            tokenized = tokenized.split()
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        token_lists = utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags)
        token_dlls = map(DLL, token_lists)
        chunks = map(self.tokenizer._tokenize, token_dlls)
        complete = list(itertools.chain.from_iterable(chunks))
        complete = utils.escape_xml_tokens(complete)
        self.assertEqual([t.text for t in complete], tokenized)
class TestTokenizer(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())

    def _equal_xml(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize_xml(raw, is_file=False), tokenized.split())

    def _fail_means_improvement(self, raw, tokenized):
        """"""
        self.assertNotEqual(self.tokenizer.tokenize(raw), tokenized.split())
class TestTokenizer(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())
class TestTokenizerExtra(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True, extra_info=True)

    def _equal(self, raw, tokenized):
        """"""
        tokens, extra_info = zip(*self.tokenizer.tokenize(raw))
        self.assertEqual(list(tokens), tokenized.split())
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
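A hedged example of how a test case might call the _equal helper above; the sentence and the expected segmentation are illustrative, not taken from the original test suite:

def test_two_sentences(self):
    # each expected sentence is given as a space-joined token string
    self._equal("Das ist ein Satz. Und noch einer.",
                ["Das ist ein Satz .", "Und noch einer ."])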
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """"""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
class WordTokenizer(object):

    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = TreebankTokenizer()
        elif language == 'de':
            self.tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        else:
            raise NotImplementedError

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)
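An illustrative use of the WordTokenizer wrapper above, assuming TreebankTokenizer and somajo's Tokenizer are importable as in the original project; the sample sentences are made up:

de_tokenizer = WordTokenizer(language='de')
print(de_tokenizer.tokenize("Heute ist ein schöner Tag."))

en_tokenizer = WordTokenizer(language='en')
print(en_tokenizer.tokenize("This sentence goes through the Treebank tokenizer."))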
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = Tokenizer(split_camel_case=True)
    self.sentence_splitter = SentenceSplitter()
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = Tokenizer(language="en_PTB", split_camel_case=True)
import pprint

from pydash import py_
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from somajo import Tokenizer

scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('easy-deutsch.json', scope)
gc = gspread.authorize(credentials)
sheet = gc.open("Deutsch Wörter").worksheet('Expressions')

tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)

data = py_(sheet.get_all_values()).filter(lambda r: r[0]).map(lambda r: py_.compact(r)).map(
    lambda r: [py_.capitalize(r[0], strict=False), *r[1:]]
).map(
    lambda r, i: dict(id=i, de=r[0], low=r[0].lower(), tokens=tokenizer.tokenize(r[0].lower()), rest=r[1:])
).value()

token_index = {}
for tokens in py_.pluck(data, 'tokens'):
    for token in tokens:
        if len(token) <= 1:
            continue
        t = token.lower()
        if t not in token_index:
#!/usr/bin/env python
# This Python file uses the following encoding: utf-8
from somajo import Tokenizer
import json
import io
from collections import Counter
from nltk.corpus import stopwords

tokenizer = Tokenizer(split_camel_case=False, token_classes=True)
count_all = Counter()
count_hashtags = Counter()

twStop = set(io.open('resources/german_stopwords.txt', encoding='utf-8').read().splitlines())
stop = set(stopwords.words('german'))

with io.open("data/fluechtlinge.json", encoding='utf-8') as jsonFile:
    for line in jsonFile:
        tweet = json.loads(line)
        text = tweet['text'].encode('utf-8').replace('ö', 'oe').replace('ä', 'ae').replace('ü', 'ue')
        regular = [
            token.token for token in tokenizer.tokenize(text.lower())
            if token.token_class == "regular" and token.token not in twStop
        ]
        hashtag = [
            token.token for token in tokenizer.tokenize(tweet['text'].lower())
            if token.token_class == "hashtag"
        ]
        count_all.update(regular)
        count_hashtags.update(hashtag)
def b2():
    t = time()
    _ = Tokenizer().tokenize("".join(random.choices(string.printable, k=1000000)))
    print(time() - t)
def predict(input_text, model=learner):
    # input_txt = ""
    doc = nlp(input_text)
    if 'en' in doc._.language['language']:
        tokenizer = Tokenizer(language="en")
        input_txt = ' '.join(token for token in tokenizer.tokenize_paragraph(input_text)
                             if token not in [',', '.', '?', '!'])
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))
    elif 'de' in doc._.language['language']:
        tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        input_txt = ' '.join(token for token in tokenizer.tokenize_paragraph(input_text)
                             if token not in [',', '.', '?', '!'])
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))
    elif 'fr' in doc._.language['language']:
        tokenizer = Tokenizer(language="en")
        input_txt = re.sub(r'[,.?!]', '', input_text).strip()
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))
    else:
        tokenizer = Tokenizer(language="en")
        input_txt = re.sub(r'[,.?!]', '', input_text).strip()
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))

    if not input_txt:
        return input_txt

    ## Assigning random language
    language = 'English'
    X = pd.DataFrame([(input_txt, labels, language)], columns=['Sentences', 'labels', 'language'])
    X.to_csv('/data/vchordia/sen_boundary/X.csv', index=False)
    dl = get_data_loader_for_predict(data, df_path="/data/vchordia/sen_boundary/X.csv")
    preds = learner.predict(dl)
    pred_tokens, pred_labels = bert_labels2tokens(dl, preds[0])
    res_str = final_str(pred_tokens, pred_labels)
    return res_str
def tokenSplit(text):
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    tokens = tokenizer.tokenize(text)
    return tokens
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = Tokenizer(split_camel_case=True)