コード例 #1
0
ファイル: seg_text.py プロジェクト: ffreemt/vector-search
Use sentence_splitter if supported,
else use polyglot.text.Text
"""
# pylint: disable=broad-except

from typing import List, Optional

from tqdm.auto import tqdm
from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences

from logzero import logger

# turn off polyglot.text.Detector warning
polyglot_logger.setLevel("ERROR")


# fmt: off
# Language codes for which sentence_splitter is used.
LANG_S = (
    "ca cs da nl en fi fr de "
    "el hu is it lv lt no pl "
    "pt ro ru sk sl es sv tr"
).split()


def seg_text(
        text: str,
        lang: Optional[str] = None,
        qmode: bool = False,
        maxlines: int = 1000
) -> List[str]:
コード例 #2
0
ファイル: q1.py プロジェクト: zxtanzxzx/de_twitter
import twint
# import nest_asyncio
# nest_asyncio.apply()
import time
from kafka import KafkaProducer
import json
import cld3
from polyglot.detect import Detector
import fasttext
from polyglot.detect.base import logger as polyglot_logger
from langdetect import detect, DetectorFactory
from statistics import mode, StatisticsError
polyglot_logger.setLevel("ERROR")  # to stop polyglot from showing the warning
import warnings
import string
import re
# Install a catch-all "ignore" filter so warnings raised through the standard
# warnings module are not shown.
warnings.filterwarnings("ignore")
# Replace fastText's eprint hook with a no-op; presumably this silences the
# notices fastText prints to stderr -- TODO confirm for the installed version.
fasttext.FastText.eprint = lambda x: None

# keywords = ["mcchicken","汉堡包","mekdi","ramly","fastfood","fast food" ,"makanan segera","makan mcd","hamburger","burger",
#             "fries","hotdog","fried chicken","pizza","hashbrown","mcd happy meal","nuggets",
#             "onion rings","coke","softdrink","pepsi","potato bowl","filet o fish",
#             "GCB","mcchicken deluxe","bigmac","cheese burger","mcflurry","sundae cone", "A&W",
#             "薯条","热肠","炸鸡","披萨","汽水","薯饼","炸鸡块","洋葱圈","mcdonald","US pizza",
#             "Pizza Hut","KFC","Texas Chicken","Domino's Pizza","Domino","mcd","wendy's",
#             "Spade's burger","4 fingers","four fingers","subway","jollibee","marrybrown","麦当劳","必胜客",
#             "肯德基","达美乐","祖乐比","全家桶","鸡肉汉堡","牛肉汉堡","薯泥","百事可乐","芝士"]

keywords = [
    "mcchicken", "fastfood", "fast food", "hamburger", "burger", "fries",
    "hotdog", "fried chicken", "pizza", "hashbrown", "mcd happy meal",