def get_paragraphs_JT(str_text, mode, file_name=''):
    """
    using Justext
    """
    if mode == "_english":
        stop = justext.get_stoplist("English")
    elif mode == 'lang_detect':
        lang = get_langid(str_text)
        if lang == "Chinese":
            stop = set()
        else:
            stop = justext.get_stoplist(lang)
    # mode where we use the 'true' language recorded in the doc_lg.json file
    elif mode == 'lang_specified' and file_name != '':
        with open(DOC_LG_PATH, mode='r', encoding='utf-8',
                  errors='ignore') as lang_code_file:
            json_data = json.load(
                lang_code_file)  # load our language codes
            lang = json_data[file_name]  # look up this file's language
            if lang == "Chinese":
                stop = set()
            else:
                stop = justext.get_stoplist(lang)

    else:
        stop = frozenset()

    if len(stop) == 0:
        any_lang_stop_words = get_all_stop_words()
        paragraphs = justext.justext(str_text, any_lang_stop_words)
    else:
        paragraphs = justext.justext(str_text, stop)
    list_paragraphs = [x.text for x in paragraphs if not x.is_boilerplate]
    return list_paragraphs
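get_langid(), DOC_LG_PATH and get_all_stop_words() are project-specific helpers that are not shown in this snippet. As a rough sketch, a merged stoplist along the lines of get_all_stop_words() could be built from the stoplists bundled with jusText; this assumes justext.get_stoplists() is available, as in recent releases:

import justext

def get_all_stop_words():
    # Hypothetical sketch: union of every stoplist that ships with jusText.
    stop_words = set()
    for language in justext.get_stoplists():  # names such as "English", "German", ...
        stop_words.update(justext.get_stoplist(language))
    return frozenset(stop_words)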
Example #2
 def apply_justext_boilerplate_stripper(self, r, stoplist):
     index_key = "index_{}_{}_{}".format(r["country"], r["website"], r["feed_name"]) 
     index_key = "{}.csv".format(self.escape_filename(index_key))
     w = self.escape_filename(r["website"])
     feed_name = self.escape_filename(r["feed_name"])
     original_html_path = os.path.join(self.raw_dir, w, feed_name, r["original_html_file"])
     xml_dir = os.path.join(self.proc_dir, w, feed_name)
     try:
         os.makedirs(xml_dir)
     except IOError:
         pass
     processed_xml_path = os.path.join(xml_dir, r["original_html_file"].replace(".html", ".xml"))
     try:
         with open(original_html_path, "r", encoding="utf-8") as h:
             text = h.read()
     except FileNotFoundError:
         text = None
         self.index_df[index_key].loc[r.name, "downloaded"] = False
         self.index_df[index_key].loc[r.name, "processed"] = False
         self.index_df[index_key].loc[r.name, "justext_comment"] = np.nan
     if text:
         try:
             paragraphs = justext.justext(text, justext.get_stoplist("English"))
         except ValueError:  # e.g. if unable to get stoplist in pyinstaller compiled version
             paragraphs = justext.justext(text, stoplist=stoplist)
         to_keep = []
         bp_count = 0
         for paragraph in paragraphs:
             if not paragraph.is_boilerplate:
                 to_keep.append(paragraph)
             else:
                 bp_count += 1
         if to_keep:
             root = etree.Element("text")
             tree = etree.ElementTree(root)
             for paragraph in to_keep:
                 p_elem = etree.Element("p")
                 p_elem.text = paragraph.text
                 root.append(p_elem)
             xml_str = etree.tounicode(tree)
             try:
                 tree.write(processed_xml_path, pretty_print=True, encoding='utf-8', xml_declaration=True)
             except IOError as e:
                 print("WARNING: Could not write XML file:", e)
                 self.index_df[index_key].loc[r.name, "processed"] = False
             else:
                 self.index_df[index_key].loc[r.name, "processed"] = True
         else:
             print("WARNING: No non-boilerplate code found for", original_html_path)
         self.index_df[index_key].loc[r.name, "justext_comment"] = "{}/{}".format(len(to_keep), bp_count)
         self.index_df[index_key].loc[r.name, "extraction_method"] = "jusText"
Example #3
    def getTextFromWeb(self):
        num_results = 10
        search_list = ["bbc", "Little Red Riding Hood"]
        sites = [] 
        text = []
        results = []
        while len(search_list)!=0 and len(results) < num_results:
            search = search_list.pop()
            results = results + google.google(search,nltk.word_tokenize)

        for d in results:
            sites.append(d)
            if len(sites) == num_results:
                break
  
        for url in sites:
            print url
            try:
                page = urllib2.urlopen(url).read()
            except urllib2.HTTPError, e:
                print "Search failed: %s" % e 
                continue
            paragraphs = justext.justext(page, justext.get_stoplist('English'))
            if len(text) < 50:
                for paragraph in paragraphs:
                    if paragraph['class'] == 'good' and len(text) < 50:
                        sentences = self.segment_sentences(paragraph['text'].encode('utf8'))
                        for s in sentences:
                            if not text.__contains__(s):
                                text.append(s)
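This example (like several of the older snippets further down) uses the early jusText interface, where paragraphs behaved like dicts and were queried via paragraph['class'] and paragraph['text']. With the current API the same filtering is written against Paragraph attributes; a minimal sketch, with the URL only as a placeholder:

import requests
import justext

# Current jusText API: paragraphs are objects rather than dicts.
page = requests.get("https://www.bbc.com").content
paragraphs = justext.justext(page, justext.get_stoplist("English"))
good_text = [p.text for p in paragraphs       # was: paragraph['text']
             if p.class_type == "good"]       # was: paragraph['class'] == 'good'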
Example #4
def get_document_text(input_url_response):
    DOCUMENT_LENGTH = 0
    paragraphs = justext.justext(input_url_response.content,
                                 justext.get_stoplist("English"))
    for paragraph in paragraphs:
        DOCUMENT_LENGTH += len(paragraph.text)
    return DOCUMENT_LENGTH
Example #5
def try_justext(tree, url, target_language):
    '''Second safety net: try with the generic algorithm justext'''
    result_body = etree.Element('body')
    justtextstring = html.tostring(tree, pretty_print=False, encoding='utf-8')
    # determine language
    if target_language is not None and target_language in JUSTEXT_LANGUAGES:
        langsetting = JUSTEXT_LANGUAGES[target_language]
        justext_stoplist = justext.get_stoplist(langsetting)
    else:
        #justext_stoplist = justext.get_stoplist(JUSTEXT_DEFAULT)
        justext_stoplist = JT_STOPLIST
    # extract
    try:
        paragraphs = justext.justext(justtextstring, justext_stoplist, 50, 200,
                                     0.1, 0.2, 0.2, 200, True)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        result_body = None
    else:
        for paragraph in [p for p in paragraphs if not p.is_boilerplate]:
            #if duplicate_test(paragraph) is not True:
            elem = etree.Element('p')
            elem.text = paragraph.text
            result_body.append(elem)
    return result_body
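The positional values 50, 200, 0.1, 0.2, 0.2, 200, True in the call above are jusText's classification thresholds. Assuming the usual parameter order (length_low, length_high, stopwords_low, stopwords_high, max_link_density, max_heading_distance, no_headings; the names match the keyword call in Example #28 below), the same call can be written more readably with keyword arguments:

# Same call as above, with the assumed parameter names spelled out.
paragraphs = justext.justext(
    justtextstring,
    justext_stoplist,
    length_low=50,             # length below which a paragraph counts as short
    length_high=200,           # length above which long paragraphs are favoured
    stopwords_low=0.1,         # lower stopword-density threshold
    stopwords_high=0.2,        # upper stopword-density threshold
    max_link_density=0.2,      # more linked text than this suggests boilerplate
    max_heading_distance=200,  # how far (in chars) a heading may sit from good content
    no_headings=True,          # no special treatment of headings
)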
def remove_boilerplate(page_str, lang, relaxed=False):
    """
    Removes boilerplate from HTML documents.

    Uses JusText library.

    NOTE: quality dependent on correct language detection.

    :param page_str: str HTML page source.
    :param lang: str Google Translate language code.
    :param relaxed: boolean If True, the span between the first and the last good/near-good paragraph
        is returned; short and bad segments in between are kept.
    :return: list List of non-boilerplate segments/paragraphs.
    """
    if lang not in GTRANS_JUSTEXT_LANG_MAP:
        #raise AttributeError("Can not remove boilerplate for language code lang='%s'." % lang)
        return []

    jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang]

    paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang))

    if relaxed:
        good_indexes = [i for i, p in enumerate(paragraphs) if p.class_type in ['near-good', 'good']]

        if len(good_indexes) == 0:
            return []

        return [paragraph.text for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]]
    else:
        return [paragraph.text for paragraph in paragraphs if paragraph.class_type in ['near-good', 'good', 'short']]
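GTRANS_JUSTEXT_LANG_MAP is defined elsewhere in the project this snippet comes from; it maps Google Translate language codes to the stoplist names jusText understands. A purely illustrative mapping (these entries are assumptions, not the project's actual table) might look like:

# Illustrative only: Google Translate codes -> jusText stoplist names.
GTRANS_JUSTEXT_LANG_MAP = {
    'en': 'English',
    'de': 'German',
    'fr': 'French',
    'es': 'Spanish',
    'et': 'Estonian',
    'uk': 'Ukrainian',
}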
Example #7
def get_article(articles, i, output):
    for article in tqdm(articles):
        try:
            a = newspaper.Article(article)
            a.download()
            a.parse()
            a.nlp()

            paragraphs = justext.justext(a.html,
                                         justext.get_stoplist("English"))
            text = '\n\n'.join(
                [p.text for p in paragraphs if not p.is_boilerplate])

            if (len(text) > len(a.text) + 50):
                a.set_text(text)

            h = html2text.HTML2Text()
            h.ignore_links = True
            h.ignore_images = True

            a.set_html(h.handle(a.html))

        except Exception as e:
            print(e)
            continue

        # TODO: config option?
        if len(a.text) < 400:
            continue

        output.append(a)
Example #8
def get_url_article2(link, lang):
    '''
    TO BE DONE : error handling : http://www.voidspace.org.uk/python/articles/urllib2.shtml#handling-exceptions        
    '''
    ### encoding bug
    if len(link) < 5:
        return False
    try:
        #l = link.decode("utf-8",  errors='ignore')
        log.info("Retrieving : " + link)
        #hdr = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        hdr = 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0'
        headers = {'User-Agent': hdr}
        resp = requests.get(link, headers=headers)
        resp.raise_for_status()
        page = resp.text
        #log.info(page)
        contents = ''
        #print(justext.get_stoplist())
        paragraphs = justext.justext(page, justext.get_stoplist(lang))
        for paragraph in paragraphs:
            if paragraph.class_type == 'good':
                #and re.search(r'Facebook connect|cliquez|Envoyer cet article par email|D.couvrez tous nos packs|d.j.un|recevoirnos|nosoffres|acc.dezà|cliquez ici|En poursuivant votre navigation sur ce site|accédezà|pasencore|Veuillez cliquer|créez gratuitement votre compte]',paragraph.text)== None:
                contents = contents + "\n" + paragraph.text
        cts = remove_control_characters(contents)
        if len(cts) == 0:
            log.warning("No contents for :" + link)  # + " " + page
        return cts
    except requests.exceptions.RequestException as e:
        log.warning("Exception : " + str(e))
        return False
Example #9
def simple_text_extractor(html, stopwords='English'):
    import corpkit
    """extract text from html/xml files using justext"""
    import requests
    import justext
    import os
    import copy
    # if on hard disk:
    if type(html) != list:
        html_files = [copy.deepcopy(html)]
    else:
        html_files = copy.deepcopy(html)
    output = []
    for html in html_files:
        if os.path.isfile(html):
            with open(html) as f:
                raw_html_text = f.read()
        # if it's a web address
        elif html.startswith('http'):
            response = requests.get(html)
            raw_html_text = response.content
        # if it's already html text:
        else:
            raw_html_text = copy.deepcopy(html)
        paragraphs = justext.justext(raw_html_text,
                                     justext.get_stoplist(stopwords))
        text = []
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:
                text.append(paragraph.text)
        text = '\n'.join(text)
        metadata = os.path.basename(html)
        tup = (text, metadata)
        output.append(tup)
    return output
def fetch(keyword, url, rank, articles, totalNumber):
    searchKeywords = keyword.split('" OR "') # We are going to check the article text for our keywords after being run through JusText
    response = requests.get(url)
    paragraphs = justext.justext(response.text, justext.get_stoplist("English"))
    empty = True
    containsKeyword = False
    minMentions = 3
    mentions = 0
    searchKeyword = searchKeywords[0].replace('"', '').strip().split(' ', 1)[0] # Get the first word of the search term
    articleParagraphs = []
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            if searchKeyword in paragraph.text:
                mentions += 1 #paragraph.text.count(searchKeyword)
                articleParagraphs.append(paragraph.text)
    if (mentions < minMentions):
        #print("A website (" + url + ") did not have the keyword enough times! Removed.")
        return
    '''for searchKeyword in searchKeywords:
        searchKeyword = searchKeyword.replace('"', '').strip().split(' ', 1)[0] # Get the first word of the search term
        if searchKeyword in article:
            containsKeyword = True
            break
    if (containsKeyword == False):
        print("A website (" + url + ") does not contain the keyword! Removed.")
        return '''
    articles.append(Article.Article(articleParagraphs, url, rank))
    print("\r" + str(len(articles)) + " / " + str(totalNumber) + " articles crawled to for keyword " + keyword, end=' ')
    sys.stdout.flush() 
Example #11
def get_document(url):
    ''' This function will check if the url is valid and then
    proceed to parse it to produce a clean text (no html) which
    can be used as input to a recommendation engine.

    Arguments:
        url  -- input url that needs to be checked and parsed
    '''
    try:
        r = requests.head(url, allow_redirects=True)
    except requests.exceptions.ConnectionError as e:
        raise URLRetrievalError(url, 'Could not connect', e)
    if r.status_code != requests.codes.ok:
        raise URLRetrievalError(
            url, 'Invalid response code from remote server: {}'.format(
                r.status_code))
    if r.headers["content-type"].split(';')[0] not in [
            "text/html", "text/plain"
    ]:
        raise URLRetrievalError(
            url, 'Document has invalid MIME type: {}'.format(
                r.headers["content-type"]))
    raw = requests.get(url)
    paragraphs = justext.justext(raw.content, justext.get_stoplist("English"))
    text_only = ''
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            text_only += ' ' + paragraph.text
    if len(text_only) == 0:
        raise DocumentParsingError('Length of document is zero')
    return text_only
def get_text(html):
    paragraphs = justext.justext(html, justext.get_stoplist('English'))
    text = ""
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate: # and not paragraph.is_header:
            text = text + paragraph.text + ". "
    return text
Example #14
def __get_keywords(file, bnc_frequencies, keyword_dict={}, ignore_capitalized=False):
	f = codecs.open(file, "r", encoding="utf-8").read()
	paragraphs = justext.justext(f, justext.get_stoplist("English"))
	freqs = {}
	text_freqs = {}
	for paragraph in paragraphs:
		if not paragraph.is_boilerplate:
			tokens = nltk.word_tokenize(clean_text(paragraph.text, not ignore_capitalized))
			for token in tokens:
				if ignore_capitalized and token != token.lower():
					continue
				if token not in text_freqs:
					text_freqs[token] = 0
				if token in freqs:
					text_freqs[token] += 1
					continue
				elif token in bnc_frequencies:
					freqs[token] = bnc_frequencies[token]
					text_freqs[token] += 1
				else:
					freqs[token] = 0
					text_freqs[token] += 1
	for f_key, f_value in text_freqs.iteritems():
		if f_value < 2:
			del freqs[f_key]
	x = len(freqs.keys())/10
	for i in range(x):
		min_word = min(freqs, key=freqs.get)
		if min_word not in keyword_dict:
			keyword_dict[min_word] = 0
		keyword_dict[min_word] += text_freqs[min_word]
		del freqs[min_word]
Example #15
    def content(self):
        """
        :return: Text content of the given document
        """
        try:
            from os import path
            if path.isfile(self.document_location):
                import codecs
                with codecs.open(self.document_location, 'r', 'utf-8') as input_document:
                    content = input_document.read()
                    text = justext.justext(content, justext.get_stoplist("English"))
                    res = []
                    # total_length = 0
                    for paragraph in text:
                        if not paragraph.is_boilerplate:
                            res.append(paragraph.text)
                            # total_length += len(paragraph.text)
                        # if total_length > 10000:
                        #     break

                    res = '\n'.join(res)
                    return res
                    # return extract_text(content)
            else:
                logger.warning("Document not found: " + str(self.document_location))
        except Exception as exc:
            logger.warning(exc)
        return ""
Example #16
def try_justext(tree, filecontent, record_id):
    '''safety net: try with justext'''
    result_body = etree.Element('body')
    justtextstring = html.tostring(tree,
                                   pretty_print=False,
                                   encoding='unicode')
    LOGGER.info('raw length: %s (file) %s (tostring) ', len(filecontent),
                len(justtextstring))
    try:
        # paragraphs = custom_justext(tree)
        paragraphs = justext.justext(justtextstring, JUSTEXT_STOPLIST)
    except ValueError as err:  # ValueError: Input object is not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, record_id)
        return None
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            # if lrutest.has_key(paragraph.text) is False or lrutest[paragraph.text] <= 2:
            if duplicate_test(paragraph, justext_switch=True) is not True:
                elem = etree.Element('p')
                elem.text = paragraph.text
                result_body.append(elem)
            # jt += paragraph.text + '</p><p>'
    # jt += '</p>'
    # temp_jt = u' '.join(jt.itertext())
    # temp_jt = jt
    return result_body
Example #17
def get_text(link):
    response = requests.get(link)
    print(response)
    paragraphs = justext.justext(response.content,
                                 justext.get_stoplist("English"))
    text = "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate])
    return text
Example #18
def overallSentiment(urls, verbose=False):
    """
    Guesses the overall sentiment of the given articles
    :param urls: List of URLs of articles to read
    :param verbose: Print status updates and specific verdicts
    :return: The proportion of articles that are positive
    """
    sentiments = []

    for url in urls:
        try:
            if verbose: print "Downloading", url + "..."
            response = requests.get(url)
            paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
            allText = "\n".join([paragraph.text for paragraph in paragraphs])
            if verbose: print "Reading..."
            sentiment = guessSentiment(allText)
            if verbose: print "Verdict:", sentiment
            sentiments.append(sentiment)
        except:
            if verbose: print "Failed to download", url


    positiveCount = len(filter(lambda x: x == "Positive", sentiments))
    return float(positiveCount) / len(urls)
def get_output_justext(input_data):
    result = []
    paragraphs = justext.justext(input_data, justext.get_stoplist('English'))
    for paragraph in paragraphs:
        result.append(paragraph.text)

    return "\n".join(result)
Example #22
    def __init__(self, url):
        np_extract = Article(url)
        np_extract.download()
        if np_extract.download_state == 2:
            try:
                np_extract.parse()
                np_text = np_extract.text
            except:
                np_text = ''
        else:
            np_text = ''

        jt_text = ''
        try:
            response = requests.get(url)
            paragraphs = justext.justext(response.content,
                                         justext.get_stoplist("English"))
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    jt_text = jt_text + str(paragraph.text)
        except:
            jt_text = ''

        if len(np_text) > len(jt_text):
            self.text = np_text
        else:
            self.text = jt_text

        self.original_title = np_extract.title
        self.tok = nltk.word_tokenize(self.text)
        self.img = list(np_extract.images)
        self.vid = list(np_extract.movies)
        self.url = url
        self.nchar = len(self.text)
        self.nword = len(self.tok)
    def extract_paragraphs(content, is_html):
        if is_html:
            return [
                p.text for p in justext.justext(content.encode('utf-8'), hr_stoplist, encoding='utf-8') if not p.dom_path.endswith('pre')
            ]

        return [p.strip() for p in content.split('\n')]
Example #24
def get_text_without_boilerplate(htmlcontent):

    # htmlcontent = htmlcontent.replace('\n', ' ')

    try:
        paragraphs = justext(htmlcontent, get_stoplist("English"))
    except Exception as e:
        raise SnippetGenerationError(
            "failed to process document using justext", original_exception=e)

    allparatext = ""

    for paragraph in paragraphs:

        try:
            if not paragraph.is_boilerplate:
                allparatext += " {}".format(paragraph.text)
        except Exception as e:
            raise SnippetGenerationError(
                "failed to process document using justext",
                original_exception=e)

    if allparatext == "":

        for paragraph in paragraphs:

            try:
                allparatext += "{}".format(paragraph.text)
            except Exception as e:
                raise SnippetGenerationError(
                    "failed to process document using justext",
                    original_exception=e)

    return allparatext
def boilerplate_remove(inp_text, stopwordlist, entry_str):
    warc1, warc2, text = inp_text.split(b'\r\n\r\n', maxsplit=2)
    warc1 = warc1.decode('UTF-8').replace('\r\n', '\n')
    warc2 = warc2.decode('UTF-8').replace('\r\n', '\n')
    length = len(text)
    if length <= 13:  # Threshold minimum: '<html></html>' is 13 long
        skip_action(warc1, warc2, 'LengthError({0})'.format(length), entry_str)
        return None
    try:
        paragraphs = justext.justext(text, stopwordlist)
    # TypeError JusText bug, AssertionError, ValueError JusText bug on comment...
    except (ParserError, UnicodeDecodeError, TypeError, AssertionError,
            ValueError) as err:
        # Do not distinguish between the different errors
        skip_action(warc1, warc2, err.__class__.__name__ + str(length),
                    entry_str)
        return None

    # Escape paragraph for parsable XML
    text_removed = '\n\n'.join(
        ('<p>\n{0}\n</p>'.format(xml.sax.saxutils.escape(paragraph.text))
         for paragraph in paragraphs if not paragraph.is_boilerplate))
    if len(text_removed) == 0:
        skip_action(warc1, warc2, 'JusTextBadError({0})'.format(length),
                    entry_str)
        return None

    filename, domain, url, warc_file, offset_str, length_str, response, mime_type = entry_str.split(
        ' ', maxsplit=8)
    filename = filename.replace('.gz', '')
    return '<doc domain="{0}" index="{1}" url="{2}" warc-file="{3}" offset="{4}" length="{5}" response="{6}"' \
           ' mime-type="{7}">\n<meta>\n<request>\n{8}\n</request>\n' \
           '<response>\n{9}\n</response>\n</meta>\n{10}\n</doc>\n\n\n'.\
        format(domain, filename, url, warc_file, offset_str, length_str, response, mime_type, warc1, warc2,
               text_removed).encode('UTF-8')
Example #26
def process(record):
    response = requests.get(record['WARC-Target-URI'])
    first = True
    if response.text:
        paragraphs = justext.justext(response.content,
                                     justext.get_stoplist("English"))
        heading = ""
        body = ""
        for paragraph in paragraphs:
            if first and paragraph.is_heading:
                #words = filter(lambda word: not word in stopword_set, paragraph.text.split())
                #heading = (' ').join(words)
                heading = paragraph.text
                first = False
            elif not paragraph.is_boilerplate and paragraph.class_type == 'good':
                #words = filter(lambda word: not word in stopword_set, paragraph.text.split())
                #body += (' ').join(words)
                body += " " + paragraph.text
        if body != "":
            body = body.replace('"', "---")
            body = body.replace('\n', "")
            #records.append({"URL":record['WARC-Target-URI'], "Title":heading, "Sentences": body})
            file.write(("{\"URL\":\"" + record['WARC-Target-URI'] +
                        "\",\"Title\":\"" + heading + "\",\"Sentences\":\"" +
                        body + "\"").encode('utf-8').strip())
            file.write('\n')
Example #27
    def documents(self, doc_id_start=1, max_doc_id=None):
        if max_doc_id is None:
            max_doc_idx = self.count_doc()
        else:
            max_doc_idx = max_doc_id

        for i in range(doc_id_start, max_doc_idx):
            doc = self.doc_retrieve(i)
            if not doc.mime_type() == Wire.MIME_TEXT_HTML:
                continue

            text = self.retrieve_text_by_docid(i)
            try:
                paragraphs = justext.justext(text, self.stopwords)
            except lxml.etree.XMLSyntaxError:
                #print idx.url_by_docid(i), "bad html"
                continue
            except lxml.etree.ParserError:
                #print idx.url_by_docid(i), "bad html"
                continue
            except TypeError:
                #print idx.url_by_docid(i), "caused error"
                continue

            good_text = filter(lambda x: x['class'] == 'good', paragraphs)
            if not good_text:
                continue
            content = [unescape(p['text']) for p in good_text]

            soup = BeautifulSoup(text)
            title_node = soup.find('title')

            if title_node:
                title = unescape(title_node.getText().strip())
            else:
                title = ''

            meta_nodes = soup.findAll('meta')
            description = ''

            for m in meta_nodes:
                try:
                    if m['name'] == 'description' and m['content']:
                        description = m['content']
                        break
                except KeyError:
                    continue
            #print meta_nodes
            '''
            if meta_nodes:
                description = meta_nodes[0]['content']
            else:
                description = ''
            '''
            #description = ''

            url = self.url_by_docid(i).decode('ascii', 'ignore')
            site = url.split('/')[0]

            doc_data = {'title': unicode(title), 'url': unicode(url), 'site': site,
                        'content': content, 'description': description}
            yield doc_data
Example #28
def get_article(item, source, reprocess=False):
    """Take the initial set of listings and enrich the content."""
    article = dict()
    encoded = item.get('link').encode('utf-8')
    article['uuid'] = hashlib.sha256(encoded).hexdigest()
    processed = is_found(article['uuid'])
    if processed and not reprocess:
        return {'article': processed, 'from_store': True}
    article['title'] = item.get('title', None)
    href = item.get('link', None)
    article['href'] = strip_google(href)
    article['source'] = derive_source(article['href'])
    article['collected'] = now_time()
    article['published'] = item.get('published', None)
    article['summary'] = item.get('summary', None)

    page_content = get_page_content(article['href'])
    if not page_content:
        logger.debug("No content found: %s" % article['href'])
        return {'article': None, 'from_store': True}
    paragraphs = justext.justext(page_content,
                                 justext.get_stoplist("English"),
                                 no_headings=True,
                                 max_heading_distance=150,
                                 length_high=140,
                                 max_link_density=0.4,
                                 stopwords_low=0.2,
                                 stopwords_high=0.3)
    text_content = list()
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        text_content.append(paragraph.text)
    text_content = '\n'.join(text_content)
    tokens = get_tokens(text_content)

    article['word_count'] = len(tokens)
    article['read_time'] = round(float(article['word_count']) / 250, 2)
    clean = cleaned_tokens(tokens)
    article['tokens'] = [{
        t[0]: t[1]
    } for t in nltk.FreqDist(clean).most_common(100)]
    article['tags'] = [list(x.keys())[0] for x in article['tokens'][0:7]]
    article['sentiment'] = get_sentiment(text_content)
    article['feed_source'] = source.replace('www.google.com', 'google.com')
    articles = mongo.db[app.config['ARTICLES_COLLECTION']]
    if not reprocess:
        try:
            articles.insert(article)
        except:
            pass
    else:
        if not processed:
            try:
                articles.insert(article)
            except:
                pass
        articles.update({'_id': ObjectId(processed['_id'])}, {'$set': article})
    return {'article': article, 'from_store': False}
Example #29
def remove_boilerplate(html, language="English"):
    try:
        paragraphs = justext.justext(html, justext.get_stoplist(language))
    except:
        return html  # TODO alternative to justext
    tag = lambda p: ("%s\n----\n" if p.is_heading else "%s\n\n") % p.text
    content = "".join([tag(p) for p in paragraphs if not p.is_boilerplate])
    return content
Example #30
def html_to_text_justext(html_content_in_byte):
    paragraphs = justext.justext(html_content_in_byte,
                                 justext.get_stoplist("English"))
    boilerplate_free = [
        paragraph.text for paragraph in paragraphs
        if not paragraph.is_boilerplate
    ]
    return "".join(boilerplate_free)
Example #31
 def remove_boilerplate(self, text):
     """
     Removes website artifacts: "Skip to Main Content", "About Us", etc.
     """
     jtext = justext.justext(text, justext.get_stoplist("English"))
     cleaned = [line.text for line in jtext if not line.is_boilerplate]
     cleaned_text = " ".join(cleaned) if cleaned else ""
     return cleaned_text
Example #32
def webScraper(url):
    response = requests.get(url)
    paragraphs = justext.justext(response.content,
                                 justext.get_stoplist('English'))
    returningParagraphs = list()
    for item in paragraphs:
        returningParagraphs.append(item.text)
    return (returningParagraphs)
Example #33
def get_doc_contents(filepath):
    contents = bytearray()
    with open(filepath,'rb') as f:
        paragraphs = justext.justext(f.read(), justext.get_stoplist('English'))
    for para in paragraphs:
        if not para.is_boilerplate:
            contents.extend(para.text.encode('UTF8'))
    return cleanup(contents.decode('UTF8'))  # LIST OF CLEANED TOKENS
Example #35
def getCorpus(html, stopwords, lmin, lmax):	
	full_text = []
	paragraphs = justext.justext(html, stopwords, lmin, lmax) 
	for paragraph in paragraphs:
		if paragraph.cf_class == 'good':
			real_text = ''.join("%s" % i.encode('utf-8') for i in paragraph.text_nodes)
			full_text.append(real_text)			
	return ' '.join(full_text)
def read_files(path, file_name, langue):
  contenu = codecs.open(path + file_name,'r',encoding='utf-8').read()
  paragraphs = justext.justext(contenu, justext.get_stoplist(langue))
  chaine = ""
  for paragraph in paragraphs:
    if not paragraph.is_boilerplate:
      chaine+= paragraph.text+"\n"
  return chaine
Example #38
def cleanHtml(html):
    # raw = nltk.clean_html(html) // was removed in nltk 3.0
    # If you do not install justext, use beautifulsoup:
    # soup = BeautifulSoup(html)
    # raw = soup.get_text()
    # This will do a better job once you install justext
    paragraphs = justext.justext(html, justext.get_stoplist('English'))
    return "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
Example #40
 def get_content(self, html):
     # I should refactor the other get_content when this fails into here
     lang_mapping = {'nl': 'Dutch', 'en': 'English', 'com': 'English'}
     if self.detected_language not in lang_mapping:
         return ''
     lang = lang_mapping[self.detected_language]
     body_content = [x.text for x in justext.justext(html, justext.get_stoplist(lang))
                     if not x.is_boilerplate and not x.is_heading]
     return body_content
Example #41
def run_justext(htmlstring):
    '''try with the generic algorithm justext'''
    valid = list()
    paragraphs = justext.justext(htmlstring, justext.get_stoplist("German"), 50, 200,
                                 0.1, 0.2, 0.2, 200, True)  # stop_words
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            valid.append(paragraph.text)
    result = ' '.join(valid)
    return result # sanitize(result)
Example #42
def jt_treatement(input_file, output_file):
    """
    Defines the specific JusText treatment to perform from the input file to the output file.
    """
    paragraphs = justext.justext(input_file.read(),
                                 justext.get_stoplist('English'))

    for paragraph in paragraphs:
        output_file.write("<p>" + paragraph.text.replace("\n", " ") + "</p>\n")
Example #43
def run_justext(htmlstring):
    '''try with the generic algorithm justext'''
    valid = list()
    paragraphs = justext.justext(htmlstring, justext.get_stoplist("German"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            valid.append(paragraph.text)
    result = ' '.join(valid)
    return result
def parseHtmlToText(htmlContent):
    try:
        justextContent = justext.justext(htmlContent.encode("utf-8"), justext.get_stoplist('Estonian'))
#         text = getText(getParagraphs(justextContent))
    except Exception:
        justextContent = ""
    text = getText(getParagraphs(justextContent))
    #logger.info("Text length:"+len(text))
    return text
Example #45
 def read_dial(self):
     response = requests.get(
         f'https://pidru4niki.com/15780506/filosofiya/osnovni_zakoni_dialektiki_svitoglyadne_metodologichne_znachennya'
     )
     paragraphs = justext.justext(response.content,
                                  justext.get_stoplist("Ukrainian"))
     prs = [pp for pp in paragraphs if not pp.is_boilerplate]
     chosen_p = random.choice(list(prs))
     self.speaker.tell_ua(chosen_p.text)
Example #46
    def text(self):
        if not self._text:
            if self._article.is_valid_body():
                self._text = self._article.text
            else:
                self._text = '\n'.join(p.text for p in justext.justext(
                    self._article.html, justext.get_stoplist("English")))

        return self._text
Example #47
def toJustText(webContent):
    print 'Entree dans toJustText'

    txt=''
    paragraphs = justext.justext(webContent, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        #if not paragraph.is_boilerplate:
        txt+= smart_str(paragraph.text.encode('utf-8'))
    return txt
Example #48
def clean_jusText_localFile(filename, language, outputfile) : 
  try : 
    with codecs.open(filename, "r", "utf-8") as f:
      with open(outputfile, "w") as output:
        content = f.read()
        paragraphs = justext.justext(content, justext.get_stoplist(CODE_LANG[language]))
        for paragraph in paragraphs:
          if not paragraph.is_boilerplate:
            output.write(paragraph.text.encode('utf-8')+"\n")
  except ValueError :
    print "[jusText] Stopwords list not available for "+language
Example #49
 def getText(self):
     text = ''
     try:
         response = requests.get(JusTextWrapper.iUrl)
         paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
         for paragraph in paragraphs:
             if not paragraph.is_boilerplate:
                 text += " "+paragraph.text
         return text
     except:
         return ""
Example #50
def text(request):
  page = urllib2.urlopen(request.GET.get('url','')).read()
  paragraphs = justext.justext(page, justext.get_stoplist('English'))
  text = []
  for paragraph in paragraphs:
    if paragraph['class'] == 'good':
        p = {}
        p['content'] = paragraph['text']
        p['heading'] = paragraph['heading']
        text.append(p)

  return HttpResponse(simplejson.dumps(text), 'application/json')
Example #51
def get_url(webpage):
    doctext = bytearray()
    try:
        response = requests.get(webpage)
    except requests.exceptions.MissingSchema:
        webpage = 'http://' + webpage
        response = requests.get(webpage)
    paragraphs = justext.justext(response.content, justext.get_stoplist('English'))
    for para in paragraphs:
        if not para.is_boilerplate:
            doctext.extend(para.text.encode('UTF-8'))
    return cleanup(doctext.decode('UTF-8'))
Example #52
 def crawl_url(self, url):
     content = Content('','')
     try:
         request = urllib2.Request(url)
         page = urllib2.urlopen(request).read()
         if page:
             paragraphs = justext.justext(page, [], stopwords_high=0, stopwords_low = 0, length_low=LENGTH_LOW_DEFAULT)
             text = [para.text for para in paragraphs if not para.is_boilerplate]
             content = Content(url, '\n'.join(text))
     except Exception as e:
         pass   
     return content
Example #53
def get_text_from_reuters(link):
    response = requests.get(link)
    resText = response.content.decode("UTF-8", 'ignore')
    soup = BeautifulSoup(resText, 'html.parser')
    tmp = [x.extract() for x in soup.find_all(class_= "Edition_items_293of")]
    for tag in soup.find_all(["script", "meta", "head", "style", "noscript"]):
        tag.decompose()
    for tag in soup.find_all(True, class_= ["Attribution_content_27_rw", "Image_container_1tVQo"]):
        tag.decompose()
    paragraphs = justext.justext(soup.prettify(), justext.get_stoplist("English"))
    text = "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate])
    return text
Example #54
 def solo_texto(self):
     '''
     Called from trae_datos, which stores the result in self.texto.
     Fetches the content of each individual URL (each news item) using
     urllib.request.urlopen() and BeautifulSoup.
     '''
     if self.link:
         con_ac = 'áéíóúüñ'
         sin_ac = 'aeiouun'
         conv = str.maketrans(con_ac, sin_ac)
         self.link = self.link.translate(conv)
     texto = ''     
     try:
         user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
         headers={'User-Agent':user_agent,} 
         req = urllib.request.Request(self.link, None, headers)
         art1 = urllib.request.urlopen(req)
         # signal.alarm(0) 
     except:
         print("<a href = '"+self.link+"'>Sin conexion (solo_texto) al link</a>")
         return False
     art2 = art1.read()
     art1.close()
     try:
         #metas = parseString(art2)
         #print(1)  
         try:         
             paras = justext.justext(art2, justext.get_stoplist('Spanish'))
         except:
             print("Error en justext")
         for para in paras:
             if para['class'] == 'good':
                 parra = para['text']
                 parra = self.cambia_acentos(parra, self.acentos)
                 parra = parra.replace('Ã', 'Ó')
                 if parra.endswith('.'):
                     texto += " " + parra
                 else:
                     texto += " " +parra + "."
         if not texto:
             print("<a href='"+self.link+"'>No hay texto recibido en trae_articulo"  + self.fuente+"</a>")
         else:
             self.articulo = bs(art2)
             #print(2)
             if (self.articulo):
                 self.busca_fotos()
                 #print(3) 
             return texto
     except:
         print("<a href = '"+self.link+"'>Errores en justext para link </a>")
         return False
def scrape(url, title):
    text = str()
    try:
        page = requests.get(url)
        paragraphs = justext.justext(page.content,
                                     justext.get_stoplist('English'))
        for par in paragraphs:
            if par['class'] == 'good':
                text += par['text']
        return text
    #Generic error catching is bad
    #As are printed log statements....
    except Exception:
        print 'Something went wrong...'
Example #56
def fetch(url):
    print 'Fetching: %s' % url
    if not redis_client.sismember('htmlcache:fetched', url):
        naked_url = protocol_re.sub('', url)
        long_filename = nonword_re.sub('-', naked_url)
        filename = long_filename[:255]
        html = requests.get(url, timeout=10).text
        text = u'\n'.join(p['text'] for p in justext.justext(html, stopwords) if p['class'] == 'good')
        with open('%s/%s' % (opts.directory, filename), 'w') as fp:
            fp.write(text.encode('utf8'))
        redis_client.sadd('htmlcache:fetched', url)
        percent = (100.0 * len(text)) / (len(html) + 1)
        print '  Size reduced: %d -> %d (%0.2f%%)' % (len(html), len(text), percent)
    else:
        print '  Already fetched'
Example #57
def remove_bad_by_classifier(doc):
    ps = justext.justext(
        doc, justext.get_stoplist('English'))
    to_delete = []
    good = []
    for p in ps:
        if p['class'] == 'bad':
            for el in doc.xpath(p['xpath']):
                to_delete.append((el, p['xpath']))
        elif p['class'] == 'good':
            good.append(p['xpath'])

    for el, xp in reversed(to_delete):
        if el.getparent() is not None and not any(xp in g for g in good):
            el.drop_tree()
Example #58
	def parse(self, response):
		hxs = HtmlXPathSelector(response)
		titulo = hxs.select('/html/head/title/text()').extract()
		rules = (Rule(SgmlLinkExtractor(allow='.*'),follow=True,callback='parse'))
		corpo = justext.justext(response.body, justext.get_stoplist('Portuguese'))
		texto = ''
		for paragrafo in corpo:
			if paragrafo['class'] == 'good':
				texto += paragrafo['text']
		item = Pagina()
		item['url'] = response.url
		item['titulo'] = unicode(titulo[0])
		item['texto'] = unicode(texto)
		item['tipo'] = self.name
		return item