Ejemplos de get_stoplists en Python

Lenguaje de programación: Python

Namespace/Package Name: justext

Método / Función: get_stoplists

Ejemplos en hotexamples.com: 6

Python get_stoplists - 6 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de justext.get_stoplists extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

def get_all_stop_words():
    """
    Collect the stop words of every language jusText ships a stoplist for.

    Used for the language-independent variant of jusText: the result is a
    single ``set`` holding the union of all per-language stoplists.
    """
    per_language = map(justext.get_stoplist, justext.get_stoplists())
    # set().union() with no arguments yields an empty set, matching the
    # original behaviour when no stoplists are available.
    return set().union(*per_language)

Ejemplo n.º 2

Mostrar archivo

Archivo: true_lg.py Proyecto: AntoineOrgerit/Web-Scrapping

def jt_truelg_treatement(input_file, output_file, file_name):
    """
    Run jusText on ``input_file`` and write the resulting paragraphs to ``output_file``.

    The language is the "true" one recorded for the document: it is looked up
    by the base name of ``file_name`` in ``../../resources/doc_lg.json``. When
    jusText has no stoplist for that language, "English" is used instead.
    Every paragraph returned by jusText is written as one ``<p>...</p>`` line,
    with newlines inside the paragraph text replaced by spaces.

    An input whose whole content is a single space is treated as empty: a
    single space is written to ``output_file`` and nothing else happens.

    :param input_file: readable file object whose content is passed to jusText.
    :param output_file: writable file object receiving the output.
    :param file_name: name or path whose base name keys the language mapping.
    :raises KeyError: if the base name is not present in the language mapping.
    """
    # Read once and reuse the content instead of read / seek(0) / read again.
    content = input_file.read()
    if content == " ":
        output_file.write(" ")
        return

    # Context manager so the mapping file is closed deterministically.
    with open("../../resources/doc_lg.json") as mapping_file:
        languages = json.load(mapping_file)

    language = languages[os.path.basename(file_name)]
    if language not in justext.get_stoplists():
        language = "English"

    paragraphs = justext.justext(content, justext.get_stoplist(language))

    for paragraph in paragraphs:
        output_file.write("<p>" + paragraph.text.replace("\n", " ") +
                          "</p>\n")

Ejemplo n.º 3

Mostrar archivo

def jt_langid_treatement(input_file, output_file):
    """
    Run jusText on ``input_file`` and write the resulting paragraphs to ``output_file``.

    The language is detected with ``langid.classify``; the first element of
    its result is resolved to a language name through
    ``languages.get(alpha2=...)``. Any name containing "Greek" is normalised
    to "Greek", and a name with no jusText stoplist falls back to "English".
    Every paragraph returned by jusText is written as one ``<p>...</p>`` line,
    with newlines inside the paragraph text replaced by spaces.

    An input whose whole content is a single space is treated as empty: a
    single space is written to ``output_file`` and nothing else happens.

    :param input_file: readable file object whose content is classified and
        passed to jusText.
    :param output_file: writable file object receiving the output.
    """
    # Read once and reuse the content; the original re-read the file three
    # times, rewinding with seek(0) between reads.
    content = input_file.read()
    if content == " ":
        output_file.write(" ")
        return

    detected = langid.classify(content)
    # NOTE(review): `languages` is defined outside this block (presumably
    # pycountry.languages) - confirm against the module imports.
    language = languages.get(alpha2=detected[0]).name
    if "Greek" in language:
        language = "Greek"
    if language not in justext.get_stoplists():
        language = "English"

    paragraphs = justext.justext(content, justext.get_stoplist(language))

    for paragraph in paragraphs:
        output_file.write("<p>" + paragraph.text.replace("\n", " ") +
                          "</p>\n")

Ejemplo n.º 4

Mostrar archivo

# MUFFLE_FLAG records whether contextlib.redirect_stderr could be imported:
# True when the import succeeds, False when it raises ImportError.
try:
    from contextlib import redirect_stderr
    MUFFLE_FLAG = True
except ImportError:
    MUFFLE_FLAG = False

# third-party
from lxml import etree, html
from readability import Document
from readability.readability import Unparseable

# Optional dependency: when justext can be imported, merge the stoplists of
# every language it provides into a single set, JT_STOPLIST.
try:
    import justext
    JT_STOPLIST = set()
    for language in justext.get_stoplists():
        JT_STOPLIST.update(justext.get_stoplist(language))
except ImportError:
    # justext is unavailable: both the module name and the stoplist are None.
    justext = JT_STOPLIST = None

# own
from .htmlprocessing import convert_tags, prune_html
from .settings import JUSTEXT_LANGUAGES, MANUALLY_STRIPPED
from .utils import trim, HTML_PARSER
from .xml import TEI_VALID_TAGS

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# XPath union expression matching the listed elements (aside, audio, button,
# ..., svg, time) anywhere in a document.
# NOTE(review): presumably the elements removed during sanitizing - confirm
# against the code that uses this constant.
SANITIZED_XPATH = '//aside|//audio|//button|//fieldset|//figure|//footer|//iframe|//img|//image|//input|//label|//link|//nav|//noindex|//noscript|//object|//option|//select|//source|//svg|//time'

Ejemplo n.º 5

Mostrar archivo

Archivo: bot.py Proyecto: mill7r/AnchorBot

def guess_language(html):
    """
    Guess the language of ``html`` from jusText stoplist overlap.

    ``str(html)`` is split on single spaces into a set of tokens; for each
    language jusText provides a stoplist for, the number of tokens that also
    appear in that stoplist is counted. The language with the highest count
    is returned.
    """
    tokens = set(str(html).split(" "))
    overlap = {
        name: len(tokens.intersection(justext.get_stoplist(name)))
        for name in justext.get_stoplists()
    }
    return max(overlap, key=overlap.get)

Ejemplo n.º 6

Mostrar archivo

Archivo: bot.py Proyecto: MuslimConditions/AnchorBot

def guess_language(html):
    """
    Return the jusText language whose stoplist best matches ``html``.

    The text of ``str(html)`` is cut at single spaces and de-duplicated; each
    available stoplist is then scored by how many of those words it contains,
    and the best-scoring language name is returned.
    """
    words = set(str(html).split(" "))
    scores = {}
    for candidate in justext.get_stoplists():
        shared = words.intersection(justext.get_stoplist(candidate))
        scores[candidate] = len(shared)
    return max(scores, key=scores.get)