Ejemplo n.º 1
0
def get_all_stop_words():
    """
    Build the stop-word set for the language-independent version of jusText.

    Every stoplist name reported by ``justext.get_stoplists()`` is loaded with
    ``justext.get_stoplist`` and all the words are merged into one set.

    Returns:
        set: the union of the words of every available stoplist.
    """
    return {
        word
        for stoplist_name in justext.get_stoplists()
        for word in justext.get_stoplist(stoplist_name)
    }
Ejemplo n.º 2
0
def jt_truelg_treatement(input_file, output_file, file_name):
    """
    Defines the specific JusText treatment to perform from the input file to the output file.
    It uses true language identification, linked to a specific file, to detect the language to
    use in JusText module.

    The language is looked up under ``os.path.basename(file_name)`` in the JSON mapping
    stored at ``../../resources/doc_lg.json`` (resolved against the current working
    directory). If that language has no jusText stoplist, "English" is used instead.

    Every paragraph returned by ``justext.justext`` is written to ``output_file`` as one
    ``<p>...</p>`` line, newlines inside the paragraph text being replaced by spaces.
    An input whose whole content is a single space is treated as empty: a single space
    is written and nothing else is done.

    Args:
        input_file: readable file-like object holding the document to process.
        output_file: writable file-like object receiving the result.
        file_name: name (or path) of the processed document; only its base name is
            used as key in the language mapping.

    Raises:
        KeyError: if the base name of ``file_name`` is not a key of the mapping.
    """
    # Read the input once and reuse it, so the stream does not have to be seekable.
    content = input_file.read()
    if content == " ":
        output_file.write(" ")
        return

    # Context manager so the mapping file is closed even if parsing fails.
    with open("../../resources/doc_lg.json") as mapping_file:
        languages = json.load(mapping_file)

    language = languages[os.path.basename(file_name)]
    if language not in justext.get_stoplists():
        language = "English"

    paragraphs = justext.justext(content, justext.get_stoplist(language))

    for paragraph in paragraphs:
        output_file.write("<p>" + paragraph.text.replace("\n", " ") +
                          "</p>\n")
Ejemplo n.º 3
0
def jt_langid_treatement(input_file, output_file):
    """
    Defines the specific JusText treatment to perform from the input file to the output file.
    It uses the langid module to detect the language to use in JusText module.

    The first element of the ``langid.classify`` result is passed as ``alpha2`` to
    ``languages.get`` and the ``name`` of the returned entry is used as stoplist name
    (``languages`` is presumably ``pycountry.languages`` -- confirm against the file's
    imports). Any name containing "Greek" is normalised to "Greek"; a name without a
    jusText stoplist falls back to "English".

    Every paragraph returned by ``justext.justext`` is written to ``output_file`` as one
    ``<p>...</p>`` line, newlines inside the paragraph text being replaced by spaces.
    An input whose whole content is a single space is treated as empty: a single space
    is written and nothing else is done.

    Args:
        input_file: readable file-like object holding the document to process.
        output_file: writable file-like object receiving the result.
    """
    # Read the input once; the same text feeds both langid and jusText, so the
    # stream is neither re-read nor required to be seekable.
    content = input_file.read()
    if content == " ":
        output_file.write(" ")
        return

    detected_code = langid.classify(content)[0]
    language = languages.get(alpha2=detected_code).name
    if "Greek" in language:
        language = "Greek"
    if language not in justext.get_stoplists():
        language = "English"

    paragraphs = justext.justext(content, justext.get_stoplist(language))

    for paragraph in paragraphs:
        output_file.write("<p>" + paragraph.text.replace("\n", " ") +
                          "</p>\n")
Ejemplo n.º 4
0
# contextlib.redirect_stderr does not exist on older interpreters: record in
# MUFFLE_FLAG whether it could be imported (how the flag is used is not
# visible in this excerpt).
try:
    from contextlib import redirect_stderr
    MUFFLE_FLAG = True
except ImportError:
    MUFFLE_FLAG = False

# third-party dependencies
from lxml import etree, html
from readability import Document
from readability.readability import Unparseable

# justext is an optional dependency: when it is installed, JT_STOPLIST is the
# union of the stoplists of every language it provides; when it is missing,
# both `justext` and `JT_STOPLIST` are set to None.
try:
    import justext
    JT_STOPLIST = set()
    for language in justext.get_stoplists():
        JT_STOPLIST.update(justext.get_stoplist(language))
except ImportError:
    justext = JT_STOPLIST = None

# imports from this package
from .htmlprocessing import convert_tags, prune_html
from .settings import JUSTEXT_LANGUAGES, MANUALLY_STRIPPED
from .utils import trim, HTML_PARSER
from .xml import TEI_VALID_TAGS

# Module-level logger.
# NOTE(review): `logging` is not imported in the visible lines -- it must be
# imported earlier in the file.
LOGGER = logging.getLogger(__name__)

# XPath union expression matching a fixed list of element names anywhere in the
# tree (presumably the elements stripped during sanitization -- confirm at the
# place where it is used).
SANITIZED_XPATH = '//aside|//audio|//button|//fieldset|//figure|//footer|//iframe|//img|//image|//input|//label|//link|//nav|//noindex|//noscript|//object|//option|//select|//source|//svg|//time'

Ejemplo n.º 5
0
def guess_language(html):
    """
    Guess the language of ``html`` from the jusText stoplists.

    The string form of ``html`` is split on single spaces and, for every
    stoplist name reported by ``justext.get_stoplists()``, the number of
    distinct tokens that also appear in that stoplist is counted.

    Returns:
        The stoplist name with the highest count (on a tie, the first one
        encountered while iterating over the stoplist names).
    """
    tokens = set(str(html).split(" "))
    overlap_by_language = {
        name: len(set(justext.get_stoplist(name)).intersection(tokens))
        for name in justext.get_stoplists()
    }
    return max(overlap_by_language, key=overlap_by_language.get)
Ejemplo n.º 6
0
def guess_language(html):
    """
    Pick the jusText stoplist that shares the most words with ``html``.

    ``html`` is converted with ``str`` and split on single spaces; each
    stoplist name from ``justext.get_stoplists()`` is scored by how many
    distinct tokens it has in common with the document.

    Returns:
        The best-scoring stoplist name; ties go to the name seen first.
    """
    document_words = set(str(html).split(" "))

    def shared_word_count(stoplist_name):
        # Number of distinct document tokens present in this stoplist.
        stop_words = set(justext.get_stoplist(stoplist_name))
        return len(stop_words.intersection(document_words))

    scores = {}
    for stoplist_name in justext.get_stoplists():
        scores[stoplist_name] = shared_word_count(stoplist_name)
    return max(scores, key=scores.get)