Use sentence_splitter if supported, else use polyglot.text.Text
"""
# pylint: disable=broad-except

from typing import List, Optional
from tqdm.auto import tqdm

from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences

from logzero import logger

# Turn off the polyglot.text.Detector warning by raising the polyglot
# logger's threshold to ERROR.
polyglot_logger.setLevel("ERROR")

# fmt: off
# Two-letter language codes for which sentence_splitter is used
# (per the module docstring, other languages fall back to polyglot.text.Text).
LANG_S = ["ca", "cs", "da", "nl", "en", "fi", "fr", "de", "el", "hu", "is", "it", "lv", "lt", "no", "pl", "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr"]


def seg_text(
    text: str, lang: Optional[str] = None, qmode: bool = False, maxlines: int = 1000
) -> List[str]:
import twint # import nest_asyncio # nest_asyncio.apply() import time from kafka import KafkaProducer import json import cld3 from polyglot.detect import Detector import fasttext from polyglot.detect.base import logger as polyglot_logger from langdetect import detect, DetectorFactory from statistics import mode, StatisticsError polyglot_logger.setLevel("ERROR") # to stop polyglot from showing the warning import warnings import string import re warnings.filterwarnings("ignore") fasttext.FastText.eprint = lambda x: None # keywords = ["mcchicken","汉堡包","mekdi","ramly","fastfood","fast food" ,"makanan segera","makan mcd","hamburger","burger", # "fries","hotdog","fried chicken","pizza","hashbrown","mcd happy meal","nuggets", # "onion rings","coke","softdrink","pepsi","potato bowl","filet o fish", # "GCB","mcchicken deluxe","bigmac","cheese burger","mcflurry","sundae cone", "A&W", # "薯条","热肠","炸鸡","披萨","汽水","薯饼","炸鸡块","洋葱圈","mcdonald","US pizza", # "Pizza Hut","KFC","Texas Chicken","Domino's Pizza","Domino","mcd","wendy's", # "Spade's burger","4 fingers","four fingers","subway","jollibee","marrybrown","麦当劳","必胜客", # "肯德基","达美乐","祖乐比","全家桶","鸡肉汉堡","牛肉汉堡","薯泥","百事可乐","芝士"] keywords = [ "mcchicken", "fastfood", "fast food", "hamburger", "burger", "fries", "hotdog", "fried chicken", "pizza", "hashbrown", "mcd happy meal",