Example #1
import os

from somajo import SoMaJo


def split(list_of_text, thread_number, TMP_DIR):
    """
    Splits the text into sentences.
    Writes one sentence per line with a leading space (for BPE).
    Documents are separated by a blank line.
    """
    out_path = os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number))
    print(out_path)
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    with open(out_path, "w") as outF:
        for part in list_of_text:
            sentences = tokenizer.tokenize_text([part])
            for sentence in sentences:
                output = ""
                for token in sentence:
                    if (token.space_after and not token.last_in_sentence
                            and not token.first_in_sentence):
                        output += token.text + ' '
                    elif token.first_in_sentence:
                        # Leading space marks the start of a sentence (for BPE).
                        output += ' ' + token.text + ' '
                    else:
                        output += token.text
                outF.write(output)
                outF.write("\n")
            # Blank line between documents.
            outF.write("\n")

    return thread_number
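A minimal invocation sketch using the function above (the chunking and multiprocessing driver of the original script is not shown; the documents and the temporary directory here are made up for illustration):

os.makedirs("tmp", exist_ok=True)
split(["Das ist ein Test. Noch ein Satz.", "Zweites Dokument."], 0, "tmp")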
Example #2
import unittest

from somajo import SoMaJo


class TestSentenceSplitter(unittest.TestCase):
    """Tests for SoMaJo's sentence splitting."""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)

    def _equal(self, raw, tokenized_sentences):
        """Tokenize raw text and compare the resulting sentences with tokenized_sentences."""
        sentences = self.tokenizer.tokenize_text([raw])
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """Tokenize XML input and compare the resulting sentences with tokenized_sentences."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.tokenizer.tokenize_xml(raw, eos_tags)
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml_strip(self, raw, tokenized_sentences):
        """Tokenize XML input with strip_tags=True and compare the resulting sentences with tokenized_sentences."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.tokenizer.tokenize_xml(raw, eos_tags, strip_tags=True)
        sentences = [" ".join([t.text for t in s]) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
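The excerpt only contains the helper methods; an actual test case (not part of the excerpt, the example sentence is illustrative) would call them like this:

    def test_two_sentences(self):
        self._equal("Das ist ein Satz. Das ist noch ein Satz.",
                    ["Das ist ein Satz .", "Das ist noch ein Satz ."])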
Example #3
from typing import List

from somajo import SoMaJo


class SoMaJoSentenceTokenizer(Tokenizer):
    # `Tokenizer` is the surrounding project's base class and is not part of this snippet.
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name)

    def tokenize(self, text: str) -> List[str]:
        out_sentences = []
        sentences = list(self.tokenizer.tokenize_text([text]))

        for i, sentence in enumerate(sentences):
            text = ""

            for token in sentence:
                if "SpaceAfter=No" in token.extra_info:
                    whitespace = ""
                else:
                    whitespace = " "

                text += token.text + whitespace

            if i == len(sentences) - 1:
                text = text.rstrip()

            out_sentences.append(text)

        return out_sentences
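A brief usage sketch, assuming the project's Tokenizer base class can be constructed without arguments; the model name follows SoMaJo's conventions ("de_CMC" for German, "en_PTB" for English):

splitter = SoMaJoSentenceTokenizer("de_CMC")
print(splitter.tokenize("Das ist ein Satz. Hier kommt noch einer."))
# e.g. ['Das ist ein Satz. ', 'Hier kommt noch einer.']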
Example #4
from somajo import SoMaJo


def tokenize(text):
    """Tokenize a list of strings in place; each entry becomes a list of token strings."""
    tokenizer = SoMaJo(language="de_CMC")
    for i in range(len(text)):
        # Each whitespace-separated chunk is passed to SoMaJo as its own paragraph.
        text[i] = text[i].split()
        tok = tokenizer.tokenize_text(text[i])
        tok_sent = []
        for sent in tok:
            for word in sent:
                tok_sent.append(word.text)
        text[i] = tok_sent
    return text
Example #5
    def tokenizer(self, text):
        """Tokenize `text` with SoMaJo and record token texts and token classes."""
        tokenizer = SoMaJo("en_PTB")
        tokenized_object = tokenizer.tokenize_text([text])
        sentences = []
        types = []
        for sent in tokenized_object:
            sentence = []
            for token in sent:
                sentence.append(token.text)
                types.append(token.token_class)
            sentences.append(sentence)
        self.output['tokens'] = sentences
        self.output['types'] = types
        return sentences, types
Example #6
from somajo import SoMaJo


def replace_hashtags_tokenizer(text):
    """Replace #hashtags in each line with their camel-case-split word sequence."""
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for i in range(len(text)):
        line = text[i].split()
        for j in range(len(line)):
            if line[j].startswith('#'):
                hashtag = [line[j].replace('#', "")]
                tok_hashtag = tokenizer.tokenize_text(hashtag)
                # Put the camel-case-split tokens back in place of the hashtag.
                line[j] = " ".join(t.text for tok in tok_hashtag for t in tok)
        text[i] = " ".join(line)
    return text
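A quick illustration of the intended behaviour of the rewritten function (the hashtag is made up; the expected output assumes SoMaJo's camel-case splitting):

print(replace_hashtags_tokenizer(["Tolles Wetter #SonnigerTag"]))
# -> ['Tolles Wetter Sonniger Tag']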
Example #7
    def make(self, prerequisite_data):
        paragraphs = prerequisite_data['paragraph']
        tokenizer = SoMaJo("de_CMC", split_camel_case=True)
        # tokenize_text returns a generator, so it cannot be indexed or len()'d.
        sentences = tokenizer.tokenize_text(paragraphs)

        tokens = []
        sentence_alignment = []

        for i, s in enumerate(sentences):
            tokens += [token.text for token in s]
            sentence_alignment += [i] * len(s)

        return {
            'token-somajo': tokens,
            'sentence-somajo': sentence_alignment,
            'token': tokens,
            'sentence': sentence_alignment
        }
Example #8
class SoMaJoTokenizer(Tokenizer):
    # `Tokenizer` and `Token` are classes of the surrounding project (not shown
    # here); `Token` bundles a token's text with the whitespace that follows it.
    # `processes` is accepted but unused in this excerpt.
    def __init__(self, language, processes=None):
        from somajo import SoMaJo

        tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
        self.tokenizer = SoMaJo(tokenizer_type,
                                split_camel_case=True,
                                split_sentences=True)

    def _tokenize_text(self, text):
        sentences = []

        if len(text) == 0:
            return sentences

        for sentence in self.tokenizer.tokenize_text([text]):
            sentences.append([
                Token(token.text, " " if token.space_after else "")
                for token in sentence
            ])

        if not text[-1].isspace():
            sentences[-1][-1] = Token(sentences[-1][-1].text, "")

        return sentences

    def split(self, texts, verbose=False):
        bar = None
        if verbose:
            from tqdm.auto import tqdm

            bar = tqdm(total=len(texts))

        # pool.imap leaks memory for some reason
        for sentences in map(self._tokenize_text, texts):
            yield sentences

            if verbose:
                bar.update(1)
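A short usage sketch, under the assumption that the project's Tokenizer base class and Token(text, whitespace) class are importable and behave as described above:

tok = SoMaJoTokenizer("de")
for sents in tok.split(["Erster Satz. Zweiter Satz.", "Noch ein Text."]):
    print([[t.text for t in sent] for sent in sents])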
Example #9
import random
from typing import List

from somajo import SoMaJo


class SoMaJoWordTokenizer(Tokenizer):
    # `Tokenizer` is the surrounding project's base class and is not part of this snippet.
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name, split_sentences=False)

    def tokenize(self, text: str) -> List[str]:
        out_tokens = []
        tokens = next(self.tokenizer.tokenize_text([text]))

        for i, token in enumerate(tokens):
            if "SpaceAfter=No" in token.extra_info or i == len(tokens) - 1:
                whitespace = ""
            else:
                whitespace = " "

            # Occasionally append more than one space so the model learns to handle irregular spacing.
            while random.random() < 0.05:
                whitespace += " "

            out_tokens.append(token.text + whitespace)

        return [x for x in out_tokens if len(x) > 0]
Example #10
import os
import pathlib
import re
import string

from nltk.stem.snowball import SnowballStemmer
from somajo import SoMaJo
# Depending on the SoMaJo version, SentenceSplitter may have to be imported
# from somajo.sentence_splitter instead.
from somajo import SentenceSplitter

# `ngram_splitter` and `ngram_probs` are project-local helpers (n-gram
# generation and the CharSplit n-gram statistics); they are not shown here.


class GeRouge:
    """
    Computes ROUGE scores on German texts.

    Args:
        alpha: Weighting factor between recall and precision; between 0 and 1.
        stemming: Boolean. Whether stemming is applied.
        split_compounds: Boolean. Whether compound words are split.
        minimal_mode: Boolean. Skip time-consuming steps for a quick calculation.
                        TODO: specify what exactly is skipped.
    """

    def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
        self.tokenizer = SoMaJo('de_CMC')
        self.sentence_splitter = SentenceSplitter(is_tuple=False)
        self.alpha = alpha
        self.stemming = stemming
        self.split_compounds = split_compounds
        self.stemmer = SnowballStemmer('german')
        self.minimal_mode = minimal_mode
        self.base_path = pathlib.Path(__file__).parent.absolute()

        self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
        self.remove_chars.extend(list(string.punctuation))
        self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

        self.stop = set()
        with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
            for line in f:
                self.stop.add(line.strip())
        if not minimal_mode:
            self.smart_stop = set()
            with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
                for line in f:
                    word = line.strip().lower()
                    self.smart_stop.add(word)
                    for replace_char in self.replace_chars:
                        word = word.replace(replace_char[0], replace_char[1])
            self.lemmas = {}
            with open(os.path.join(self.base_path, 'data', 'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
                for line in f:
                    l = line.strip().split('\t')
                    l[0] = l[0].strip().lower()
                    l[1] = l[1].strip().lower()
                    for replace_char in self.replace_chars:
                        l[0] = l[0].replace(replace_char[0], replace_char[1])
                        l[1] = l[1].replace(replace_char[0], replace_char[1])
                    self.lemmas[l[0]] = l[1]

    def tokenize_sents(self, text):
        # SoMaJo now splits sentences simultaneously
        sents = list(self.tokenizer.tokenize_text([text]))
        length = sum([len(sent) for sent in sents])
        transformed_sents = [list(self.transform_sent(sent)) for sent in sents]
        transformed_sents = [[token for token in sent if token is not None and token != ''] for sent in
                             transformed_sents]

        return transformed_sents, length

    @staticmethod
    def create_ngrams(transformed_sents, n=1):
        ngrams_sents = [ngram_splitter(sent, n) for sent in transformed_sents if len(sent) >= n]
        ngrams = set([token for sent in ngrams_sents for token in sent])

        return ngrams

    def transform_sent(self, sent):
        for token in sent:
            token, splitted = self.transform_token(token.text)
            if splitted:
                for partial_token in token:
                    yield partial_token
            else:
                yield token

    def transform_token(self, token):
        if not self.minimal_mode and token.lower().strip() in self.lemmas:
            token = self.lemmas[token.lower().strip()]

        compound_candidates = self.split_compound(token)
        if self.split_compounds and compound_candidates is not None and compound_candidates[0][0] > 0.5 and \
                compound_candidates[0][1] != token:
            return_tokens = []
            for token in compound_candidates[0][1:]:
                if len(token) > 0:
                    tokens, splitted = self.transform_token(token)
                    if splitted:
                        return_tokens.extend(tokens)
                    else:
                        return_tokens.append(tokens)
            return return_tokens, True
        else:
            token = token.lower().strip()

            for remove_char in self.remove_chars:
                token = token.replace(remove_char, '')
            for replace_char in self.replace_chars:
                token = token.replace(replace_char[0], replace_char[1])

            if (token in self.stop or
                    (not self.minimal_mode and token in self.smart_stop) or
                    bool(re.search(r'\d', token))):
                token = ''
            elif self.stemming:
                if not self.minimal_mode and token in self.lemmas:
                    token = self.lemmas[token]
                token = self.stemmer.stem(token)

            return token, False

    def rouge_n(self, reference, summary, ngrams=(1, 2)):
        """
        Computes Rouge-N scores based on n-grams.
        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :param ngrams: For which n-grams to calculate scores. Can be arbitrarily many.
        :return: List of (precision, recall, F1) tuples for each individual n-gram length.
        """
        reference_tokenized, reference_length = self.tokenize_sents(reference)
        summary_tokenized, summary_length = self.tokenize_sents(summary)
        return self.rouge_n_partial(reference_tokenized, summary_tokenized, ngrams)

    def rouge_l(self, reference, summary):
        """
        Calculates Rouge-L based on the longest common sub-sequence.
        :param reference: Ground truth summary.
        :param summary: Generated prediction summary.
        :return: Tuple with (precision, recall, F1) values for Rouge-L.
        """
        reference_tokenized, _ = self.tokenize_sents(reference)
        summary_tokenized, _ = self.tokenize_sents(summary)
        return self.computeL(summary_tokenized, reference_tokenized)

    def rouge_n_partial(self, reference_tokenized, summary_tokenized, ngrams):
        rouge_n = []

        for n in ngrams:
            if n < 1:
                rouge_n.append((0, 0, 0))
                continue

            reference = self.create_ngrams(reference_tokenized, n=n)
            summary = self.create_ngrams(summary_tokenized, n=n)

            if len(reference) == 0 or len(summary) == 0:
                rouge_n.append((0, 0, 0))
                continue

            matches = sum(
                [sum([ngram_reference == ngram_summary for ngram_summary in summary]) for ngram_reference in reference])
            rouge_p = matches / len(summary)
            rouge_r = matches / len(reference)
            denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
            if denominator != 0:
                rouge_f1 = (rouge_p * rouge_r) / denominator
            else:
                rouge_f1 = 0.0
            rouge_n.append((rouge_p, rouge_r, rouge_f1))

        return rouge_n

    def computeL(self, sys, ref):
        unionLCS = set()
        ref_size = sum([len(l) for l in ref])
        sys_size = sum([len(l) for l in sys])
        for r in ref:
            for s in sys:
                seq1 = GeRouge.lcs(r, s)
                seq2 = GeRouge.lcs(s, r)
                seq = seq1 if len(seq1) > len(seq2) else seq2
                unionLCS.update(seq)

        if ref_size > 0:
            rouge_r = len(unionLCS) / ref_size
        else:
            rouge_r = 0
        if sys_size > 0:
            rouge_p = len(unionLCS) / sys_size
        else:
            rouge_p = 0
        denominator = (rouge_r * self.alpha) + (rouge_p * (1 - self.alpha))
        if denominator != 0:
            rouge_f1 = (rouge_p * rouge_r) / denominator
        else:
            rouge_f1 = 0.0

        return rouge_p, rouge_r, rouge_f1

    @staticmethod
    def split_compound(word: str):
        """
        Code adapted from: https://github.com/dtuggener/CharSplit
        Return list of possible splits, best first
        :param word: Word to be split
        :return: List of all splits
        """
        word = word.lower()

        # If there is a hyphen in the word, return part of the word behind the last hyphen
        if '-' in word:
            return [[1., '-'.join((word.split('-'))[:-1]).title(), word.split('-')[-1].title()]]

        scores = []  # Score for each possible split position
        # Iterate through characters, start at the fourth character, go to the 3rd last
        for n in range(3, len(word) - 2):

            pre_slice = word[:n]

            # Cut off the Fugen-S
            if pre_slice.endswith('ts') or pre_slice.endswith('gs') or pre_slice.endswith('ks') \
                    or pre_slice.endswith('hls') or pre_slice.endswith('ns'):
                if len(word[:n - 1]) > 2: pre_slice = word[:n - 1]

            # Start, in, and end probabilities
            pre_slice_prob = []
            in_slice_prob = []
            start_slice_prob = []

            # Extract all ngrams
            for k in range(len(word) + 1, 2, -1):

                # Probability of first compound, given by its ending prob
                if pre_slice_prob == [] and k <= len(pre_slice):
                    end_ngram = pre_slice[-k:]  # Look backwards
                    pre_slice_prob.append(ngram_probs.suffix.get(end_ngram, -1))  # Punish unlikely pre_slice end_ngram

                # Probability of ngram in word, if high, split unlikely
                in_ngram = word[n:n + k]
                in_slice_prob.append(ngram_probs.infix.get(in_ngram, 1))  # Favor ngrams not occurring within words

                # Probability of word starting
                if start_slice_prob == []:
                    ngram = word[n:n + k]
                    # Cut Fugen-S
                    if ngram.endswith('ts') or ngram.endswith('gs') or ngram.endswith('ks') \
                            or ngram.endswith('hls') or ngram.endswith('ns'):
                        if len(ngram[:-1]) > 2:
                            ngram = ngram[:-1]
                    start_slice_prob.append(ngram_probs.prefix.get(ngram, -1))

            if pre_slice_prob == [] or start_slice_prob == []: continue

            start_slice_prob = max(start_slice_prob)
            pre_slice_prob = max(pre_slice_prob)  # Highest, best preslice
            in_slice_prob = min(in_slice_prob)  # Lowest, punish splitting of good ingrams
            score = start_slice_prob - in_slice_prob + pre_slice_prob
            scores.append([score, word[:n].title(), word[n:].title()])

        if not scores:
            scores = [[0, word.title(), word.title()]]
        return sorted(scores, reverse=True)

    @staticmethod
    def lcs(a, b):
        # Greedy left-to-right common-subsequence matcher (an approximation of
        # the longest common subsequence, not the full dynamic-programming LCS).
        lcsWords = []
        start = 0

        for word1 in a:
            for i in range(start, len(b)):
                word2 = b[i]
                if word1 == word2:
                    lcsWords.append(word2)
                    start = i + 1

        return lcsWords
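A minimal usage sketch, assuming the stop-word and base-form data files as well as the project-local ngram_probs/ngram_splitter helpers are available (the example sentences are made up):

rouge = GeRouge(alpha=0.5)
reference = "Der Hund läuft durch den Park."
summary = "Ein Hund rennt durch den Park."
print(rouge.rouge_n(reference, summary, ngrams=(1, 2)))  # [(p1, r1, f1), (p2, r2, f2)]
print(rouge.rouge_l(reference, summary))                 # (precision, recall, F1)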
Example #11
import unicodedata


def _is_punctuation(char):
    """Return True if `char` belongs to a Unicode punctuation category."""
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


# =============================================================================
# SoMaJo taken from https://github.com/tsproisl/SoMaJo
# =============================================================================
if False:  # this block is disabled in the original script
    from tqdm import tqdm

    sen_out = []
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in tqdm(raw_text):
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            word_list = [token.text for token in sentence]
            output = " ".join(word_list[:-1])
            output += word_list[-1]
            sen_out.append(output)

    _is_punctuation(raw_text[-1][-1])

    stripped = []
    for index, part in tqdm(enumerate(sen_out)):
        reordered = ""
        for char in part:
            if not _is_punctuation(char):
                reordered += char
            else:
Example #12

import argparse
import gzip

import orjson
from tqdm import tqdm

# `tokenizer` (a SoMaJo instance) and `detokenize` are defined elsewhere in the
# original script; a possible sketch of both follows the example.

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args()
    input_filename = args.filename

    with gzip.open(input_filename, 'r') as f, \
            gzip.open(input_filename + '-out.gz', 'wt') as output_file:

        # total appears to be a pre-counted number of lines in the input file
        with tqdm(total=2980314) as pbar:
            for line in f:
                pbar.update(1)
                line_dict = orjson.loads(line)
                content = line_dict['raw_content']
                language = line_dict['language']
                if language == 'de':
                    sentences = tokenizer.tokenize_text([content], parallel=1)
                    for s in sentences:
                        sentence_string = detokenize(s)
                        output_file.write(sentence_string + '\n')

                    # split documents?
                    #output_file.write('\n')
                else:
                    print('###################')
                    print(language)
                    print(content)
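The module-level tokenizer and detokenize used above are not part of the excerpt. A minimal sketch that is consistent with the call sites, rebuilding whitespace from SoMaJo's space_after attribute (names and details are assumptions):

from somajo import SoMaJo

tokenizer = SoMaJo("de_CMC", split_camel_case=True)


def detokenize(tokens):
    """Join the SoMaJo tokens of one sentence back into a string using space_after."""
    out = ""
    for token in tokens:
        out += token.text + (" " if token.space_after else "")
    return out.strip()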
Example #13
from somajo import SoMaJo


def SentenceSplit(text):
    """Split `text` (a list of paragraph strings, not a single string) into sentences.

    Returns a generator that yields one list of SoMaJo Token objects per sentence.
    """
    tokenizer = SoMaJo("de_CMC")
    tokens = tokenizer.tokenize_text(text)
    return tokens
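A brief usage sketch showing how the returned generator might be consumed:

for sentence in SentenceSplit(["Das ist ein Test. Und noch ein Satz."]):
    print(" ".join(token.text for token in sentence))
# Das ist ein Test .
# Und noch ein Satz .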