def process_text_line(line):
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize(line)

    # Sentence splitting is disabled here; treat the whole tokenized line as
    # a single sentence so that the loop below works on a list of token lists.
    # sentence_splitter = SentenceSplitter()
    # sentences = sentence_splitter.split(tokens)
    sentences = [tokens]
    result = []

    for s in sentences:

        if PROCESS_DISCUSSION:
            s = remove_discussion_suffix(s)

        if len(s) >= 4:
            sentence_string = " ".join(s)

            if PROCESS_DISCUSSION:
                # skip sentences that still contain a discussion-page signature timestamp (CEST/CET):
                if "( CEST )" not in sentence_string and "( CET )" not in sentence_string:
                    result.append(sentence_string)
            else:
                result.append(sentence_string)

    return result
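
A minimal usage sketch, assuming SoMaJo 1.x (from somajo import Tokenizer) and that the module-level PROCESS_DISCUSSION flag and the remove_discussion_suffix helper referenced above are defined elsewhere:

# Hypothetical call; the input line and printed output are illustrative only.
for sentence in process_text_line("Das ist ein Beispielsatz. Er wird nur tokenisiert."):
    print(sentence)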
Example #2
class TestTokenizer(unittest.TestCase):
    """Tests for the Tokenizer class."""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """Tokenizing raw should yield the tokens in tokenized."""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())

    def _equal_xml(self, raw, tokenized):
        """Tokenizing the XML string raw should yield the tokens in tokenized."""
        self.assertEqual(self.tokenizer.tokenize_xml(raw, is_file=False), tokenized.split())

    def _fail_means_improvement(self, raw, tokenized):
        """Known shortcoming: a failure of this test would mean an improvement."""
        self.assertNotEqual(self.tokenizer.tokenize(raw), tokenized.split())
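
A hypothetical test method built on the _equal helper above; the input sentence and its expected tokenization are illustrative, not taken from the library's own test suite:

    def test_simple_sentence(self):
        # Illustrative only: the sentence-final period is split off.
        self._equal("Das ist ein Test.", "Das ist ein Test .")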
Example #3
class TestTokenizer(unittest.TestCase):
    """Tests for the Tokenizer class."""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """Tokenizing raw should yield the tokens in tokenized."""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())
Example #4
class TestTokenizerExtra(unittest.TestCase):
    """Tests for the Tokenizer class with extra_info=True."""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True, extra_info=True)

    def _equal(self, raw, tokenized):
        """The token strings for raw (ignoring extra info) should match tokenized."""
        tokens, extra_info = zip(*self.tokenizer.tokenize(raw))
        self.assertEqual(list(tokens), tokenized.split())
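
As above, a hypothetical test method; with extra_info=True the tokenizer yields pairs, which is why _equal unzips them before comparing. The example is illustrative:

    def test_simple_sentence(self):
        # Illustrative only: only the token strings are compared,
        # the extra info is discarded by _equal above.
        self._equal("Das ist ein Test.", "Das ist ein Test .")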
Example #5
def SentenceSplit(text):

    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    tokens = tokenizer.tokenize(text)

    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
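
A usage sketch for SentenceSplit; each returned sentence is a list of token strings. The input text is illustrative:

# Hypothetical call.
for sentence in SentenceSplit("Das ist ein Satz. Das ist noch einer."):
    print(" ".join(sentence))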
Example #6
def build_list(filename):
    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    gazetteers = set()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            gazetteers.add(' '.join(tokenizer.tokenize(line.strip())))
    print('read {}'.format(filename))
    return gazetteers
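
A sketch of how build_list might be called; the file name is a placeholder for a list with one gazetteer entry per line:

# Hypothetical usage.
cities = build_list('gazetteers/cities.txt')
print(len(cities), 'entries')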
Example #7
class TestSentenceSplitter(unittest.TestCase):
    """Tests for the SentenceSplitter class."""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """Tokenizing and sentence-splitting raw should yield tokenized_sentences."""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
Example #8
class TestSentenceSplitter(unittest.TestCase):
    """Tests for the SentenceSplitter class."""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """Tokenizing and sentence-splitting raw should yield tokenized_sentences."""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """XML-aware sentence splitting of raw should yield tokenized_sentences."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
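
A hypothetical test method on top of the _equal helper above; the expected sentence split is illustrative:

    def test_two_sentences(self):
        # Illustrative only: two sentences, each joined back into a
        # whitespace-separated string by _equal above.
        self._equal("Das ist ein Satz. Das ist noch einer.",
                    ["Das ist ein Satz .", "Das ist noch einer ."])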
Example #9
class WordTokenizer(object):
    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            # TreebankTokenizer is assumed to be imported elsewhere in the
            # original module (an NLTK Treebank-style word tokenizer).
            self.tokenizer = TreebankTokenizer()
        elif language == 'de':
            self.tokenizer = Tokenizer(split_camel_case=True,
                                       token_classes=False,
                                       extra_info=False)
        else:
            raise NotImplementedError("only 'en' and 'de' are supported")

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)
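
A usage sketch for the wrapper; the output is a list of token strings and the input sentence is illustrative:

# Hypothetical usage.
de_tokenizer = WordTokenizer(language='de')
print(de_tokenizer.tokenize("Das ist ein Test."))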
Example #10
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from pydash import py_
from somajo import Tokenizer

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

credentials = ServiceAccountCredentials.from_json_keyfile_name('easy-deutsch.json', scope)
gc = gspread.authorize(credentials)
sheet = gc.open("Deutsch Wörter").worksheet('Expressions')

tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
# Keep rows with a non-empty first cell, drop empty cells, capitalize the
# expression and build one record per row (id, text, lowercased text, tokens, rest).
data = py_(sheet.get_all_values()).filter(lambda r: r[0]).map(lambda r: py_.compact(r)).map(
    lambda r: [py_.capitalize(r[0], strict=False), *r[1:]]
).map(
    lambda r, i: dict(id=i, de=r[0], low=r[0].lower(), tokens=tokenizer.tokenize(r[0].lower()), rest=r[1:])
).value()

# Inverted index: token -> ids of all expressions whose tokenization contains it.
token_index = {}

for tokens in py_.pluck(data, 'tokens'):
    for token in tokens:
        if len(token) <= 1:
            continue

        t = token.lower()
        if t not in token_index:
            token_index[t] = dict(
                key=t,
                ids=py_(data).filter(lambda d: t in d['tokens']).pluck('id').value()
            )
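
A sketch of how the resulting token_index might be queried; the lookup token is a placeholder:

# Hypothetical lookup: ids of all expressions whose tokenization contains "haus".
entry = token_index.get("haus")
if entry is not None:
    print(entry["ids"])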
Example #11

def tokenSplit(text):
    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    tokens = tokenizer.tokenize(text)
    return tokens
Example #12
import io
import json
from collections import Counter

from nltk.corpus import stopwords
from somajo import Tokenizer

tokenizer = Tokenizer(split_camel_case=False, token_classes=True)
count_all = Counter()
count_hashtags = Counter()
twStop = set(
    io.open('resources/german_stopwords.txt',
            encoding='utf-8').read().splitlines())
stop = set(stopwords.words('german'))

with io.open("data/fluechtlinge.json", encoding='utf-8') as jsonFile:
    for line in jsonFile:
        tweet = json.loads(line)
        # Transliterate umlauts before counting regular tokens.
        text = (tweet['text'].replace('ö', 'oe')
                .replace('ä', 'ae').replace('ü', 'ue'))
        regular = [
            token.token for token in tokenizer.tokenize(text.lower())
            if token.token_class == "regular" and token.token not in twStop
        ]
        hashtag = [
            token.token for token in tokenizer.tokenize(tweet['text'].lower())
            if token.token_class == "hashtag"
        ]
        count_all.update(regular)
        count_hashtags.update(hashtag)
        print(text)
        for token in tokenizer.tokenize(text):
            print(token.token + " is " + token.token_class)

print("Most frequent words: ")
for word in count_all.most_common(10):