Example #1
    def __init__(self):

        self._word_tokenizer = Tokenizer(split_camel_case=True,
                                         token_classes=False,
                                         extra_info=False)

        self._sentence_splitter = SentenceSplitter()
Example #2
def process_text_line(line):
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize(line)

    #sentence_splitter = SentenceSplitter()
    #sentences = sentence_splitter.split(tokens)
    sentences = tokens
    result = []

    for s in sentences:

        if PROCESS_DISCUSSION:
            s = remove_discussion_suffix(s)

        if len(s) >= 4:
            sentence_string = " ".join(s)

            if PROCESS_DISCUSSION:
                # check if this line still contains a dirty comment:
                if "( CEST )" not in sentence_string and "( CET )" not in sentence_string:
                    result.append(sentence_string)
            else:
                result.append(sentence_string)

    return result
Example #3
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes,
                          args.extra_info, args.language)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info,
                                         args.language)
    if is_xml:
        if args.parallel > 1:
            logging.warning(
                "Parallel tokenization of XML files is currently not supported."
            )
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split(
            )
        eos_tags = set(eos_tags)
        tokenized_paragraphs = [tokenizer.tokenize_xml(args.FILE)]
        if args.split_sentences:
            tokenized_paragraphs = list(
                sentence_splitter.split_xml(tokenized_paragraphs[0], eos_tags))
    else:
        if args.paragraph_separator == "empty_lines":
            paragraphs = utils.get_paragraphs(args.FILE)
        elif args.paragraph_separator == "single_newlines":
            paragraphs = (line for line in args.FILE if line.strip() != "")
        if args.parallel > 1:
            pool = multiprocessing.Pool(
                min(args.parallel, multiprocessing.cpu_count()))
            tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs,
                                             250)
        else:
            tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
        tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
        if args.split_sentences:
            tokenized_paragraphs = map(sentence_splitter.split,
                                       tokenized_paragraphs)
            tokenized_paragraphs = (s for tp in tokenized_paragraphs
                                    for s in tp)
    if args.token_classes or args.extra_info:
        if is_xml:
            tokenized_paragraphs = ([(l[0], ) if l[1] is None else l
                                     for l in tp]
                                    for tp in tokenized_paragraphs)
        tokenized_paragraphs = (["\t".join(t) for t in tp]
                                for tp in tokenized_paragraphs)
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                 (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
Example #4
def SentenceSplit(text):

    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    tokens = tokenizer.tokenize(text)

    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
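
A minimal usage sketch for the helper above (the sample text is illustrative); it assumes the legacy SoMaJo 1.x API used throughout these examples, where tokenize() returns a list of token strings and SentenceSplitter.split() returns a list of such lists:

# hypothetical call of SentenceSplit() from the example above
for sentence in SentenceSplit("Das ist ein Satz. Hier kommt noch einer."):
    # each sentence is a list of token strings
    print(" ".join(sentence))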
Example #5
    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = TreebankTokenizer()
        elif language == 'de':
            self.tokenizer = Tokenizer(split_camel_case=True,
                                       token_classes=False,
                                       extra_info=False)
        else:
            raise NotImplementedError
Example #6
def build_list(filename):
    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    gazetteers = set()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            gazetteers.add(' '.join(tokenizer.tokenize(line.strip())))
    print('read {}'.format(filename))
    return gazetteers
Example #7
def get_sents(texts):
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    
    results = []
    for text in texts:
        # text = clean(text, lang='de', lower=False)
        tokens = tokenizer.tokenize_paragraph(text)
        sentences = sentence_splitter.split(tokens)
        cleaned = [clean(' '.join(s), no_urls=True, no_digits=True, no_punct=True, no_line_breaks=True, lang='de') for s in sentences]
        results.append(cleaned)
    return results
Example #8
class NERTokenizer:
    def __init__(self):

        self._word_tokenizer = Tokenizer(split_camel_case=True,
                                         token_classes=False,
                                         extra_info=False)

        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        tokens = self._word_tokenizer.tokenize_paragraph(text)

        sentences_tokenized = self._sentence_splitter.split(tokens)

        sentences = []
        for sen in sentences_tokenized:

            sen = [tok.replace(" ", "") for tok in sen]

            if len(sen) == 0:
                continue

            sentences.append((sen, []))

        return sentences
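
A short usage sketch for the class above (the sample text is made up); parse_text() returns one (token_list, label_list) pair per sentence, with the label list initially empty:

# hypothetical usage of the NERTokenizer class defined above
ner_tokenizer = NERTokenizer()
for tokens, labels in ner_tokenizer.parse_text("Das ist ein Satz. Hier kommt noch einer."):
    print(tokens, labels)  # labels is always [] at this stage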
Example #9
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes,
                          args.extra_info)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info)
    if args.paragraph_separator == "empty_lines":
        paragraphs = get_paragraphs(args.FILE)
    elif args.paragraph_separator == "single_newlines":
        paragraphs = (line for line in args.FILE if line.strip() != "")
    if args.parallel > 1:
        pool = multiprocessing.Pool(
            min(args.parallel, multiprocessing.cpu_count()))
        tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
    else:
        tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
    tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
    if args.split_sentences:
        tokenized_paragraphs = map(sentence_splitter.split,
                                   tokenized_paragraphs)
        tokenized_paragraphs = (s for tp in tokenized_paragraphs for s in tp)
    if args.token_classes or args.extra_info:
        tokenized_paragraphs = (["\t".join(t) for t in tp]
                                for tp in tokenized_paragraphs)

    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                 (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
Example #10
class TestTokenizer(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(language="de_CMC", split_camel_case=True)

    def _equal(self, raw, tokenized):
        """"""
        if isinstance(tokenized, str):
            tokenized = tokenized.split()
        dll = DLL([Token(raw, first_in_sentence=True, last_in_sentence=True)])
        tokens = self.tokenizer._tokenize(dll)
        self.assertEqual([t.text for t in tokens], tokenized)

    def _equal_xml(self, raw, tokenized):
        """"""
        if isinstance(tokenized, str):
            tokenized = tokenized.split()
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        token_lists = utils.xml_chunk_generator(raw, is_file=False, eos_tags=eos_tags)
        token_dlls = map(DLL, token_lists)
        chunks = map(self.tokenizer._tokenize, token_dlls)
        complete = list(itertools.chain.from_iterable(chunks))
        complete = utils.escape_xml_tokens(complete)
        self.assertEqual([t.text for t in complete], tokenized)
Example #11
class TestTokenizer(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())

    def _equal_xml(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize_xml(raw, is_file=False), tokenized.split())

    def _fail_means_improvement(self, raw, tokenized):
        """"""
        self.assertNotEqual(self.tokenizer.tokenize(raw), tokenized.split())
Example #12
class TestTokenizer(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())
Example #13
class TestTokenizerExtra(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True, extra_info=True)

    def _equal(self, raw, tokenized):
        """"""
        tokens, extra_info = zip(*self.tokenizer.tokenize(raw))
        self.assertEqual(list(tokens), tokenized.split())
Example #14
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
Example #15
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """"""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
Example #16
class WordTokenizer(object):
    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = TreebankTokenizer()
        elif language == 'de':
            self.tokenizer = Tokenizer(split_camel_case=True,
                                       token_classes=False,
                                       extra_info=False)
        else:
            raise NotImplementedError

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)
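
A brief usage sketch for the wrapper above, exercising only the German branch (the sample sentence is illustrative):

# hypothetical usage of the WordTokenizer wrapper defined above
word_tokenizer = WordTokenizer(language='de')
print(word_tokenizer.tokenize("Das ist ein BeispielSatz mit CamelCase."))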
Example #17
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()
Example #18
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(language="en_PTB", split_camel_case=True)
Example #19
import pprint

from pydash import py_

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from somajo import Tokenizer

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

credentials = ServiceAccountCredentials.from_json_keyfile_name('easy-deutsch.json', scope)
gc = gspread.authorize(credentials)
sheet = gc.open("Deutsch Wörter").worksheet('Expressions')

tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
data = py_(sheet.get_all_values()).filter(lambda r: r[0]).map(lambda r: py_.compact(r)).map(
    lambda r: [py_.capitalize(r[0], strict=False), *r[1:]]
).map(
    lambda r, i: dict(id=i, de=r[0], low=r[0].lower(), tokens=tokenizer.tokenize(r[0].lower()), rest=r[1:])
).value()

token_index = {}

for tokens in py_.pluck(data, 'tokens'):
    for token in tokens:
        if len(token) <= 1:
            continue

        t = token.lower()
        if t not in token_index:
Example #20
#!/usr/bin/env python
# This Python file uses the following encoding: utf-8
from somajo import Tokenizer
import json
import io
from collections import Counter
from nltk.corpus import stopwords

tokenizer = Tokenizer(split_camel_case=False, token_classes=True)
count_all = Counter()
count_hashtags = Counter()
twStop = set(
    io.open('resources/german_stopwords.txt',
            encoding='utf-8').read().splitlines())
stop = set(stopwords.words('german'))

with io.open("data/fluechtlinge.json", encoding='utf-8') as jsonFile:
    for line in jsonFile:
        tweet = json.loads(line)
        # transliterate umlauts (ö -> oe, ä -> ae, ü -> ue) before tokenizing
        text = tweet['text'].replace('ö', 'oe').replace(
            'ä', 'ae').replace('ü', 'ue')
        regular = [
            token.token for token in tokenizer.tokenize(text.lower())
            if token.token_class == "regular" and token.token not in twStop
        ]
        hashtag = [
            token.token for token in tokenizer.tokenize(tweet['text'].lower())
            if token.token_class == "hashtag"
        ]
        count_all.update(regular)
        count_hashtags.update(hashtag)
Example #21
def b2():
    t = time()
    _ = Tokenizer().tokenize("".join(
        random.choices(string.printable, k=1000000)))
    print(time() - t)
Example #22
def predict(input_text, model = learner):
    # input_txt = ""
    doc = nlp(input_text)
    if 'en' in doc._.language['language']:
        tokenizer = Tokenizer(language="en")
        input_txt = ' '.join(token for token in tokenizer.tokenize_paragraph(input_text) if token not in [',', '.', '?', '!'])
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))

    elif 'de' in doc._.language['language']:
        tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        input_txt = ' '.join(token for token in tokenizer.tokenize_paragraph(input_text) if token not in [',', '.', '?', '!'])
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))

    elif 'fr' in doc._.language['language']:
        tokenizer = Tokenizer(language="en")
        input_txt = re.sub(r'[,.?!]', '', input_text).strip()
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))
    else:
        tokenizer = Tokenizer(language="en")
        input_txt = re.sub(r'[,.?!]', '', input_text).strip()
        labels = 'BOS ' * len(tokenizer.tokenize_paragraph(input_txt))

    if not input_txt:
        return input_txt

    ## Assigning random language
    language = 'English'

    X = pd.DataFrame([(input_txt, labels, language)], columns=['Sentences', 'labels', 'language'])
    X.to_csv('/data/vchordia/sen_boundary/X.csv', index=False)
    dl = get_data_loader_for_predict(data, df_path="/data/vchordia/sen_boundary/X.csv")
    preds = learner.predict(dl)
    pred_tokens, pred_labels = bert_labels2tokens(dl, preds[0])
    res_str = final_str(pred_tokens, pred_labels)
    return res_str
Example #23
def tokenSplit(text):
    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    tokens = tokenizer.tokenize(text)
    return tokens
Example #24
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)