def try_justext(tree, url, target_language):
    '''Second safety net: fall back on the generic justext algorithm.

    The stoplist is chosen from ``target_language`` when that language is a
    key of ``JUSTEXT_LANGUAGES``; otherwise the combined ``JT_STOPLIST`` is
    used.

    Returns a ``body`` element holding one ``p`` child per paragraph that is
    not flagged as boilerplate, or ``None`` if ``custom_justext`` raises a
    ``ValueError`` (the error is logged together with ``url``).
    '''
    result_body = etree.Element('body')

    # choose the stoplist: language-specific when known, generic otherwise
    if target_language is None or target_language not in JUSTEXT_LANGUAGES:
        justext_stoplist = JT_STOPLIST
    else:
        justext_stoplist = get_stoplist(JUSTEXT_LANGUAGES[target_language])

    # run the extraction; only this call is guarded
    try:
        paragraphs = custom_justext(tree, justext_stoplist)
    except ValueError as err:
        # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        return None

    # keep the text of every paragraph that is not boilerplate
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        elem = etree.Element('p')
        elem.text = paragraph.text
        result_body.append(elem)
    return result_body
def test_get_real_stoplist(self):
    # Looking up the "Slovak" stoplist must yield at least one entry.
    tools.assert_true(len(get_stoplist("Slovak")) > 0)
def jt_stoplist_init():
    '''Merge every stoplist listed by ``get_stoplists()`` into one set.

    Each language name yielded by ``get_stoplists()`` is passed to
    ``get_stoplist`` and all resulting entries are collected in a single
    ``set``, which is returned.
    '''
    return {
        entry
        for language in get_stoplists()
        for entry in get_stoplist(language)
    }
def test_get_missing_stoplist(self):
    # Requesting the stoplist for "Klingon" is expected to raise ValueError.
    unknown_language = "Klingon"
    with pytest.raises(ValueError):
        get_stoplist(unknown_language)
def test_get_real_stoplist(self):
    # Looking up the "Slovak" stoplist must yield at least one entry.
    assert len(get_stoplist("Slovak")) > 0