Ejemplos de sent_tokenize en Python, ejemplos de pythainlp.tokenize.sent_tokenize en Python

Ejemplo n.º 1

0

Mostrar archivo

 def test_sent_tokenize(self):
     """Check sent_tokenize on empty input and on space-separated text."""
     sample = "รักน้ำ  รักปลา  "

     # Both None and the empty string are expected to give an empty list.
     for blank in (None, ""):
         self.assertEqual(sent_tokenize(blank), [])

     # With the "whitespace" engine a trailing empty token is expected.
     whitespace_tokens = sent_tokenize(sample, engine="whitespace")
     self.assertEqual(whitespace_tokens, ["รักน้ำ", "รักปลา", ""])

     # The default engine is expected to return just the two segments.
     default_tokens = sent_tokenize(sample)
     self.assertEqual(default_tokens, ["รักน้ำ", "รักปลา"])

Ejemplo n.º 2

0

Mostrar archivo

Archivo: __init__.py Proyecto: wannaphongcom/pythainlp

 def test_sent_tokenize(self):
     """Verify sent_tokenize for empty input and a space-separated sample."""
     spaced_text = "รักน้ำ  รักปลา  "
     two_segments = ["รักน้ำ", "รักปลา"]

     # Empty-like input: an empty list is expected in both cases.
     for nothing in (None, ""):
         self.assertEqual(sent_tokenize(nothing), [])

     # "whitespace" engine: the two segments plus a trailing empty string.
     self.assertEqual(
         sent_tokenize(spaced_text, engine="whitespace"), two_segments + [""]
     )

     # Default engine: only the two segments.
     self.assertEqual(sent_tokenize(spaced_text), two_segments)

Ejemplo n.º 3

0

Mostrar archivo

 def test_sent_tokenize(self):
     """Exercise sent_tokenize with whitespace, crfcut and default engines."""
     # Empty-like input is expected to give an empty list.
     for blank in (None, ""):
         self.assertEqual(sent_tokenize(blank), [])

     spaced = "รักน้ำ  รักปลา  "
     self.assertEqual(
         sent_tokenize(spaced, engine="whitespace"),
         ["รักน้ำ", "รักปลา", ""],
     )
     self.assertEqual(
         sent_tokenize(spaced, engine="whitespace+newline"),
         ["รักน้ำ", "รักปลา"],
     )

     # One text expected to stay whole, one expected to split in two.
     single = "วันนี้ฉันกินข้าว และโดดเรียน"
     double = "น้ำพึ่งเรือ แต่เสือพึ่งป่า"
     double_parts = ["น้ำพึ่งเรือ ", "แต่เสือพึ่งป่า"]

     # Explicit "crfcut" engine.
     self.assertEqual(sent_tokenize(single, engine="crfcut"), [single])
     self.assertEqual(sent_tokenize(double, engine="crfcut"), double_parts)

     # The default engine is expected to segment the same way.
     self.assertEqual(sent_tokenize(single), [single])
     self.assertEqual(sent_tokenize(double), double_parts)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: pyir.py Proyecto: wannaphong/IR

 def load(self, filename, encoding="utf-8"):
     """Read a text file, split it into sentences and index them.

     The raw text is stored in ``self.data`` and the stripped sentences in
     ``self.content``; the sentences are then handed to ``self.build_data``.
     Afterwards the file is registered in ``self.name_docs`` under
     ``self.idx - 1`` and ``self.idx2docs`` is rebuilt as the inverse mapping.

     :param filename: path of the text file to load
     :param encoding: text encoding used to read the file (default "utf-8")
     """
     # Honour the caller-supplied encoding (it used to be ignored).
     with open(filename, "r", encoding=encoding) as f:
         self.data = f.read()
     self.content = [sentence.strip() for sentence in sent_tokenize(self.data)]
     self.build_data(self.content)
     # NOTE(review): presumably build_data advances self.idx, so idx - 1 is
     # the id just assigned to this document -- confirm against build_data.
     self.name_docs[filename] = self.idx - 1
     self.idx2docs = {doc_id: name for name, doc_id in self.name_docs.items()}

Ejemplo n.º 5

0

Mostrar archivo

Archivo: __init__.py Proyecto: phattharachon/sentiment_api

 def summarize(self, text, n,tokenize):
     """Return the sentences of ``text`` selected by ``self._rank``.

     Sentences are word-tokenized, word frequencies are computed with
     ``self._compute_frequencies`` and stored in ``self._freq``, and each
     sentence is scored by summing the stored frequencies of its words.

     :param text: text to summarize
     :param n: passed to ``self._rank`` together with the scores
     :param tokenize: second positional argument given to ``word_tokenize``
     :return: list of the selected sentences
     """
     sentences = sent_tokenize(text)
     tokenized = [word_tokenize(sentence, tokenize) for sentence in sentences]
     self._freq = self._compute_frequencies(tokenized)

     # Only sentences with at least one known word get an entry.
     scores = defaultdict(int)
     for position, words in enumerate(tokenized):
         for word in words:
             if word not in self._freq:
                 continue
             scores[position] += self._freq[word]

     chosen = self._rank(scores, n)
     return [sentences[k] for k in chosen]

Ejemplo n.º 6

0

Mostrar archivo

Archivo: thaipbs.py Proyecto: wannaphong/sandy_nsc2020

 def get_today(self):
     """Collect the sentence-split text of news items dated today (UTC+7).

     Each entry of ``self.news`` is indexed as ``item[0]`` (an object with a
     ``.date()`` method), ``item[1]`` (text passed through ``self.clean``) and
     ``item[2]`` (text placed on the first line). Matching items are
     sentence-tokenized and joined with newlines.

     :return: ``self.today``, the list of newline-joined sentence strings
     """
     self.today = []
     offset = datetime.timezone(datetime.timedelta(hours=7))
     # Take "today" once so every item is compared against the same date,
     # even if the loop happens to run across midnight.
     current_date = datetime.datetime.now(offset).date()
     for item in self.news:
         if current_date == item[0].date():
             self.today.append('\n'.join(
                 sent_tokenize(item[2] + "\n" + self.clean(item[1]))))
     return self.today

Ejemplo n.º 7

0

Mostrar archivo

Archivo: feature_option.py Proyecto: Jirayut558/Seniorproject

def phrasecut(article):
    """Split ``article`` on whitespace and glue numbers to the next token.

    The text is tokenized with the "whitespace" engine. Whenever a token
    consists only of digits it is concatenated with the token that follows
    it; all other tokens are kept unchanged.

    :param article: text to split
    :return: list of tokens with numeric tokens merged into their successor
    """
    list_in = sent_tokenize(article, engine='whitespace')
    list_out = []
    total = len(list_in)
    i = 0
    while i < total:
        token = list_in[i]
        # A numeric token at the very end has nothing to merge with; keep it
        # as-is instead of indexing past the end of the list.
        if token.isdigit() and i + 1 < total:
            list_out.append(token + list_in[i + 1])
            i += 2
        else:
            list_out.append(token)
            i += 1
    return list_out

Ejemplo n.º 8

0

Mostrar archivo

Archivo: freq.py Proyecto: wannaphongcom/pythainlp

    def summarize(self, text: str, n: int, tokenizer: str = "newmm") -> List[str]:
        """Return the sentences of ``text`` selected by the frequency ranking.

        :param text: text to summarize
        :param n: passed to the ranking helper together with the scores
        :param tokenizer: engine name forwarded to ``word_tokenize``
        :return: list of the selected sentences
        """
        sents = sent_tokenize(text)
        tokens_per_sent = [
            word_tokenize(sentence, engine=tokenizer) for sentence in sents
        ]
        self.__freq = self.__compute_frequencies(tokens_per_sent)

        # Score each sentence by the summed frequency of its known words;
        # sentences without any known word get no entry at all.
        scores = defaultdict(int)
        for position, words in enumerate(tokens_per_sent):
            for word in words:
                if word not in self.__freq:
                    continue
                scores[position] += self.__freq[word]

        picked = self.__rank(scores, n)
        return [sents[k] for k in picked]

Ejemplo n.º 9

0

Mostrar archivo

    def summarize(self, text: str, n: int, tokenizer: str):
        """Return the sentences of ``text`` selected by the frequency ranking.

        :param text: text to summarize
        :param n: passed to the ranking helper together with the scores
        :param tokenizer: second positional argument given to ``word_tokenize``
        :return: list of the selected sentences
        """
        sents = sent_tokenize(text)
        tokens_per_sent = [word_tokenize(sentence, tokenizer) for sentence in sents]
        self.__freq = self.__compute_frequencies(tokens_per_sent)

        # Sum the stored frequency of every known word, sentence by sentence;
        # sentences without any known word get no entry at all.
        scores = defaultdict(int)
        for position, words in enumerate(tokens_per_sent):
            for word in words:
                if word not in self.__freq:
                    continue
                scores[position] += self.__freq[word]

        picked = self.__rank(scores, n)
        return [sents[k] for k in picked]

Ejemplo n.º 10

0

Mostrar archivo

Archivo: __init__.py Proyecto: unsuthee/pythainlp

def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
    """
    Summarize Thai text.

    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
    :param str engine: text summarization engine
    :param str tokenizer: word tokenizer
    :return List[str] summary: list of selected sentences
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # Unknown engine: fall back to the first n sentences.
    return sent_tokenize(text)[:n]

Ejemplo n.º 11

0

Mostrar archivo

 def process_thai(self, content, sent_file):
     """
     Process Thai text and append the accepted sentences to a file.

     English is removed with ``self.delete_english``, the rest is split into
     sentences, and a sentence is written (one per line) only if it is
     non-empty, has 7 to 15 word tokens, passes the ``contain_number``
     check and has not already been written during this call.

     :param content: text to process
     :param sent_file: path of the file the sentences are appended to
     :return: None
     """
     seen = set()
     content = self.delete_english(content)  # remove English
     with open(sent_file, 'a', encoding='utf8') as out:
         for candidate in sent_tokenize(content):
             if not candidate:
                 continue
             word_count = len(word_tokenize(candidate))
             if word_count < 7 or word_count > 15:
                 continue
             if contain_number(candidate):
                 continue
             if candidate in seen:
                 continue
             seen.add(candidate)
             out.write(candidate + "\n")

Ejemplo n.º 12

0

Mostrar archivo

Archivo: freq.py Proyecto: totaeza31/pythainlp

    def summarize(self,
                  text: str,
                  n: int,
                  tokenizer: str = "newmm") -> List[str]:
        """Return the sentences of ``text`` selected by the frequency ranking.

        Sentences are obtained with the "whitespace+newline" engine.

        :param text: text to summarize
        :param n: passed to the ranking helper together with the scores
        :param tokenizer: engine name forwarded to ``word_tokenize``
        :return: list of the selected sentences
        """
        sents = sent_tokenize(text, engine="whitespace+newline")
        tokens_per_sent = [
            word_tokenize(sentence, engine=tokenizer) for sentence in sents
        ]
        self.__freq = self.__compute_frequencies(tokens_per_sent)

        # Score each sentence by the summed frequency of its known words;
        # sentences without any known word get no entry at all.
        scores = defaultdict(int)
        for position, words in enumerate(tokens_per_sent):
            for word in words:
                if word not in self.__freq:
                    continue
                scores[position] += self.__freq[word]

        picked = self.__rank(scores, n)
        return [sents[k] for k in picked]

Ejemplo n.º 13

0

Mostrar archivo

Archivo: __init__.py Proyecto: wannaphongcom/pythainlp

def summarize(
    text: str, n: int, engine: str = "frequency", tokenizer: str = "newmm"
) -> List[str]:
    """
    Summarize Thai text.

    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
    :param str engine: text summarization engine
    :param str tokenizer: word tokenizer
    :return List[str] summary: list of selected sentences
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # Unknown engine: fall back to the first n sentences.
    return sent_tokenize(text)[:n]

Ejemplo n.º 14

0

Mostrar archivo

Archivo: core.py Proyecto: nadachr/thai_sentiment_analysis

def summarize(
    text: str,
    n: int = 1,
    engine: str = DEFAULT_SUMMARIZE_ENGINE,
    tokenizer: str = "newmm",
) -> List[str]:
    """
    Summarize text with the selected engine.

    With the default engine, this function first tokenizes sentences from
    the given text with :func:`pythainlp.tokenize.sent_tokenize`.
    Then, computes frequencies of tokenized words
    (with :func:`pythainlp.tokenize.word_tokenize`) in all sentences
    and normalized with maximum word frequency. The words with normalized
    frequency that are less than 0.1 or greater than 0.9 will be
    filtered out from frequency dictionary. Finally, it picks *n* sentences
    with highest sum of normalized frequency from all words
    in the sentence and also appear in the frequency dictionary.

    An empty or non-string ``text`` yields an empty list. An engine name
    that is not recognized falls back to the first *n* sentences produced
    by the "whitespace+newline" sentence tokenizer.

    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
                  By default, n is *1* (used by the frequency engine and
                  by the unknown-engine fallback; not passed to mT5)
    :param str engine: text summarization engine (By default: *frequency*).
    :param str tokenizer: word tokenizer engine name (refer to
                          :func:`pythainlp.tokenize.word_tokenize`).
                          By default, tokenizer is *newmm*
                          (effective for frequency engine only)

    :return: list of selected sentences

    **Options for engine**
        * *frequency* (default) - frequency of words
        * *mt5* - mT5-small model
        * *mt5-small* - mT5-small model
        * *mt5-base* - mT5-base model
        * *mt5-large* - mT5-large model
        * *mt5-xl* - mT5-xl model
        * *mt5-xxl* - mT5-xxl model

    :Example:
    ::

        from pythainlp.summarize import summarize

        text = '''
                ทำเนียบท่าช้าง หรือ วังถนนพระอาทิตย์
                ตั้งอยู่บนถนนพระอาทิตย์ เขตพระนคร กรุงเทพมหานคร
                เดิมเป็นบ้านของเจ้าพระยามหาโยธา (ทอเรียะ คชเสนี)
                บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์ (พญาเจ่ง)
                ต้นสกุลคชเสนี เชื้อสายมอญ เจ้าพระยามหาโยธา (ทอเรีย)
                เป็นปู่ของเจ้าจอมมารดากลิ่นในพระบาทสมเด็จพระจอมเกล้าเจ้าอยู่หัว
                และเป็นมรดกตกทอดมาถึง พระเจ้าบรมวงศ์เธอ กรมพระนเรศรวรฤทธิ์
                (พระองค์เจ้ากฤดาภินิหาร)
                ต่อมาในรัชสมัยพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัวโปรดเกล้าฯ
                ให้สร้างตำหนัก 2 ชั้น
                เป็นที่ประทับของพระเจ้าบรมวงศ์เธอ
                กรมพระนเรศวรฤทิธิ์และเจ้าจอมมารดา
                ต่อมาเรียกอาคารหลักนี้ว่า ตำหนักเดิม
            '''

        summarize(text, n=1)
        # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์']

        summarize(text, n=3)
        # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์',
        # 'เดิมเป็นบ้านของเจ้าพระยามหาโยธา',
        # 'เจ้าพระยามหาโยธา']

        summarize(text, engine="mt5-small")
        # output: ['<extra_id_0> ท่าช้าง หรือ วังถนนพระอาทิตย์
        # เขตพระนคร กรุงเทพมหานคร ฯลฯ ดังนี้:
        # ที่อยู่ - ศิลปวัฒนธรรม']
    """
    if not text or not isinstance(text, str):
        return []
    sents = []

    if engine == DEFAULT_SUMMARIZE_ENGINE:
        sents = FrequencySummarizer().summarize(text, n, tokenizer)
    elif engine.startswith('mt5-') or engine == "mt5":
        # Bare "mt5" is documented as the small model; previously it was
        # forwarded as model_size="mt5" because there was no prefix to strip.
        size = "small" if engine == "mt5" else engine[len('mt5-'):]
        # Imported lazily, only when an mT5 engine is requested.
        from .mt5 import mT5Summarizer
        sents = mT5Summarizer(model_size=size).summarize(text)
    else:  # if engine not found, return first n sentences
        sents = sent_tokenize(text, engine="whitespace+newline")[:n]

    return sents

Ejemplo n.º 15

0

Mostrar archivo

Archivo: __init__.py Proyecto: veer66/pythainlp

def summarize(text: str,
              n: int,
              engine: str = "frequency",
              tokenizer: str = "newmm") -> List[str]:
    """
    Summarize text based on the frequency of its words.

    Under the hood, this function first tokenizes sentences from the given
    text with :func:`pythainlp.tokenize.sent_tokenize`.
    Then, computes frequencies of tokenized words
    (with :func:`pythainlp.tokenize.word_tokenize`) in all sentences
    and normalized with maximum word frequency. The words with normalized
    frequency that are less than 0.1 or greater than 0.9 will be
    filtered out from frequency dictionary. Finally, it picks *n* sentences
    with highest sum of normalized frequency from all words
    in the sentence and also appear in the frequency dictionary.

    Any engine name other than *frequency* falls back to returning the
    first *n* sentences of the text.

    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
    :param str engine: text summarization engine (By default: *frequency*).
                       There is only one engine currently.
    :param str tokenizer: word tokenizer engine name (refer to
                          :func:`pythainlp.tokenize.word_tokenize`).
                          By default, *tokenizer* is set to *newmm*

    :return: list of selected sentences
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.summarize import summarize

        text = '''
                ทำเนียบท่าช้าง หรือ วังถนนพระอาทิตย์
                ตั้งอยู่บนถนนพระอาทิตย์ เขตพระนคร กรุงเทพมหานคร
                เดิมเป็นบ้านของเจ้าพระยามหาโยธา (ทอเรียะ คชเสนี)
                บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์ (พญาเจ่ง)
                ต้นสกุลคชเสนี เชื้อสายมอญ เจ้าพระยามหาโยธา (ทอเรีย)
                เป็นปู่ของเจ้าจอมมารดากลิ่นในพระบาทสมเด็จพระจอมเกล้าเจ้าอยู่หัว
                และเป็นมรดกตกทอดมาถึง พระเจ้าบรมวงศ์เธอ กรมพระนเรศรวรฤทธิ์
                (พระองค์เจ้ากฤดาภินิหาร)
                ต่อมาในรัชสมัยพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัวโปรดเกล้าฯ
                ให้สร้างตำหนัก 2 ชั้น
                เป็นที่ประทับของพระเจ้าบรมวงศ์เธอ
                กรมพระนเรศวรฤทิธิ์และเจ้าจอมมารดา
                ต่อมาเรียกอาคารหลักนี้ว่า ตำหนักเดิม
            '''

        summarize(text, n=1)
        # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์']

        summarize(text, n=3)
        # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์',
        # 'เดิมเป็นบ้านของเจ้าพระยามหาโยธา',
        # 'เจ้าพระยามหาโยธา']
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # Unknown engine: fall back to the first n sentences.
    return sent_tokenize(text)[:n]

Ejemplo n.º 16

0

Mostrar archivo

Archivo: test_tokenize.py Proyecto: hopedataannotations/pythainlp

    def test_sent_tokenize(self):
        """Check sent_tokenize across whitespace, crfcut and default engines."""
        # Empty-like input is expected to give an empty list.
        for blank in (None, ""):
            self.assertEqual(sent_tokenize(blank), [])

        spaced = "รักน้ำ  รักปลา  "
        self.assertEqual(
            sent_tokenize(spaced, engine="whitespace"),
            ["รักน้ำ", "รักปลา", ""],
        )
        self.assertEqual(
            sent_tokenize(spaced, engine="whitespace+newline"),
            ["รักน้ำ", "รักปลา"],
        )

        sent_1 = "ฉันไปโรงเรียน เธอไปโรงพยาบาล"
        sent_2 = "วันนี้ฉันกินข้าว และโดดเรียน"

        # The third sample is assembled from three clauses so the expected
        # tokens can reuse exactly the same pieces.
        clause_a = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
            "จากผลงานวิจัยที่เคยทำมาในอดีต"
        )
        clause_b = "มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
        clause_c = "จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        sent_3 = clause_a + " " + clause_b + " " + clause_c

        cases = [
            (sent_1, ["ฉันไปโรงเรียน ", "เธอไปโรงพยาบาล"]),
            (sent_2, [sent_2]),
            (sent_3, [clause_a + " ", clause_b + " ", clause_c]),
        ]

        # First with the explicit "crfcut" engine, then with the default one.
        for extra in ({"engine": "crfcut"}, {}):
            for sample, expected in cases:
                self.assertEqual(sent_tokenize(sample, **extra), expected)

        without_spaces = sent_tokenize(
            sent_1,
            keep_whitespace=False,
            engine="whitespace",
        )
        self.assertIsNotNone(without_spaces)

        # Tokenize again for the membership check, as a separate call.
        second_pass = sent_tokenize(
            sent_1,
            engine="whitespace",
            keep_whitespace=False,
        )
        self.assertFalse(" " in second_pass)

        with self.assertRaises(ValueError):
            sent_tokenize("ฉันไป กิน", engine="XX")  # engine does not exist

Ejemplo n.º 17

0

Mostrar archivo

from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp import pos_tag
# Demo: split a short Thai text into sentences and POS-tag each one.
text = "ผมชื่อต้นตาล ผมอายุ 40 ปี ผมเล่นเกม"
sent = sent_tokenize(text)

# Report how many sentences were found.
print("จำนวนประโยค : {}".format(len(sent)))

# Print the tagged words of every sentence, numbered from 1.
for number, sentence in enumerate(sent, start=1):
    tagged = pos_tag(word_tokenize(sentence), corpus='orchid_ud')
    print("Sentence {} is '{}'".format(number, tagged))

Ejemplo n.º 18

0

Mostrar archivo

# -*- coding: utf-8 -*-

from pythainlp.tokenize import sent_tokenize, word_tokenize

# Demo of sentence and word tokenization; the lists in the comments are the
# outputs recorded by the original author.
text = "ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย "
print(text)

sentences = sent_tokenize(text)
print(sentences)
# ['ฉันรักภาษาไทย', 'เพราะฉันใช้ภาษาไทย', '']

words = word_tokenize(text)
print(words)
# ['ฉัน', 'รัก', 'ภาษาไทย', ' ', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย', ' ']

words_without_spaces = word_tokenize(text, whitespaces=False)
print(words_without_spaces)
# ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย']

text2 = "กฎหมายแรงงาน"
print(text2)

default_words = word_tokenize(text2)
print(default_words)
# ['กฎหมายแรงงาน']

longest_matching_words = word_tokenize(text2, engine="longest-matching")
print(longest_matching_words)
# ['กฎหมาย', 'แรงงาน']

Ejemplo n.º 19

0

Mostrar archivo

Archivo: test_tokenize.py Proyecto: preenet/pythainlp

    def test_sent_tokenize(self):
        """Check sent_tokenize across whitespace, crfcut and default engines."""
        # Empty-like input is expected to give an empty list.
        for blank in (None, ""):
            self.assertEqual(sent_tokenize(blank), [])

        spaced = "รักน้ำ  รักปลา  "
        self.assertEqual(
            sent_tokenize(spaced, engine="whitespace"),
            ["รักน้ำ", "รักปลา", ""],
        )
        self.assertEqual(
            sent_tokenize(spaced, engine="whitespace+newline"),
            ["รักน้ำ", "รักปลา"],
        )

        sent_1 = "ฉันไปโรงเรียน เธอไปโรงพยาบาล"
        sent_2 = "วันนี้ฉันกินข้าว และโดดเรียน"
        cases = [
            (sent_1, ["ฉันไปโรงเรียน ", "เธอไปโรงพยาบาล"]),
            (sent_2, [sent_2]),
        ]

        # First with the explicit "crfcut" engine, then with the default one.
        for extra in ({"engine": "crfcut"}, {}):
            for sample, expected in cases:
                self.assertEqual(sent_tokenize(sample, **extra), expected)

        without_spaces = sent_tokenize(
            sent_1,
            keep_whitespace=False,
            engine="whitespace",
        )
        self.assertIsNotNone(without_spaces)

        # Tokenize again for the membership check, as a separate call.
        second_pass = sent_tokenize(
            sent_1,
            engine="whitespace",
            keep_whitespace=False,
        )
        self.assertFalse(" " in second_pass)

        with self.assertRaises(ValueError):
            sent_tokenize("ฉันไป กิน", engine="XX")  # engine does not exist

Ejemplo n.º 20

0

Mostrar archivo

Archivo: data_processing.py Proyecto: nakhunchumpolsathien/kenlm

def sent_tokenziation(text):
    """Return ``text`` with its sentences joined one per line.

    Sentences come from ``sent_tokenize`` called with
    ``keep_whitespace=False``.
    """
    sentences = sent_tokenize(text, keep_whitespace=False)
    return "\n".join(sentences)

Ejemplo n.º 21

0

Mostrar archivo

Archivo: check_health.py Proyecto: vistec-AI/thai_websites_crawler

    df['missing_en'] = df.en_text.isna()
    df['missing_th'] = df.th_text.isna()

    #characters
    df['per_en'] = df.en_text.map(lambda x: char_percent(r'[a-zA-Z0-9]',str(x)))
    df['per_th'] = df.th_text.map(lambda x: char_percent(r'[ก-๙0-9]',str(x)))
    df['th_in_en'] = df.en_text.map(lambda x: 1 if char_percent(r'[ก-๙]',str(x)) else 0)

    #tokens
    df['en_tokens'] = df.en_text.map(lambda x: len(str(x).split()))
    df['th_tokens'] = df.th_text.map(lambda x: len(word_tokenize(str(x))))
    df['e2t_tokens'] = df.en_tokens / df.th_tokens

    #sentences
    df['en_sentences'] = df.en_text.map(lambda x: len(str(x).split('.')))
    df['th_sentences'] = df.th_text.map(lambda x: len(sent_tokenize(str(x))))
    
    print(f'''
    {args.input_path}
    shape: {df.shape}
    missing en: {df.missing_en.sum()} segments
    missing th: {df.missing_th.sum()} segments
    en duplicates: {df.en_text.count() - df.en_text.nunique()} segments
    th duplicates: {df.th_text.count() - df.th_text.nunique()} segments
    th charcters in en texts: {df.th_in_en.sum()} segments
    en char (mean, median, min, max): {df.per_en.mean():.2f}, {df.per_en.median():.2f} ({df.per_en.min():.2f}-{df.per_en.max():.2f})
    th char (mean, median, min, max): {df.per_th.mean():.2f}, {df.per_th.median():.2f} ({df.per_th.min():.2f}-{df.per_th.max():.2f})
    en tokens (mean, median, min, max): {df.en_tokens.mean():.2f}, {df.en_tokens.median()} ({df.en_tokens.min()}-{df.en_tokens.max()})
    th tokens (mean, median, min, max): {df.th_tokens.mean():.2f}, {df.th_tokens.median()} ({df.th_tokens.min()}-{df.th_tokens.max()})
    en-to-th tokens ratio (mean, median, min, max): {df.e2t_tokens.mean():.2f}, {df.e2t_tokens.median():.2f} ({df.e2t_tokens.min():.2f}-{df.e2t_tokens.max():.2f})
    en sentences (mean, median, min, max): {df.en_sentences.mean():.2f}, {df.en_sentences.median()} ({df.en_sentences.min()}-{df.en_sentences.max()})