Example 1
def get_paragraphs_JT(str_text, mode, file_name=''):
    """
    using Justext
    """
    if mode == "_english":
        stop = justext.get_stoplist("English")
    elif mode == 'lang_detect':
        lang = get_langid(str_text)
        if lang == "Chinese":
            stop = set()
        else:
            stop = justext.get_stoplist(lang)
    # mode where we detect the 'true' language provided by the doc_lg.json file
    elif mode == 'lang_specified' and file_name != '':
        with open(DOC_LG_PATH, mode='r', encoding='utf-8',
                  errors='ignore') as lang_code_file:
            json_data = json.load(
                lang_code_file)  # load the language codes
            lang = json_data[file_name]  # look up this file's language
            if lang == "Chinese":
                stop = set()
            else:
                stop = justext.get_stoplist(lang)

    else:
        stop = frozenset()

    if len(stop) == 0:
        any_lang_stop_words = get_all_stop_words()
        paragraphs = justext.justext(str_text, any_lang_stop_words)
    else:
        paragraphs = justext.justext(str_text, stop)
    list_paragraphs = [x.text for x in paragraphs if not x.is_boilerplate]
    return list_paragraphs
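
A minimal usage sketch of the helper above (the input file name is illustrative; get_langid, get_all_stop_words and DOC_LG_PATH are assumed to be defined in the surrounding module, as the function implies):

# Illustrative only; "page.html" and its entry in doc_lg.json are hypothetical.
with open("page.html", encoding="utf-8") as f:
    html = f.read()

paras_english = get_paragraphs_JT(html, "_english")                                # fixed English stoplist
paras_detected = get_paragraphs_JT(html, "lang_detect")                            # stoplist chosen via get_langid()
paras_declared = get_paragraphs_JT(html, "lang_specified", file_name="page.html")  # language read from doc_lg.json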
Example 2
File: utils.py Project: DenXX/aqqu
    def content(self):
        """
        :return: Text content of the given document
        """
        try:
            from os import path
            if path.isfile(self.document_location):
                import codecs
                with codecs.open(self.document_location, 'r', 'utf-8') as input_document:
                    content = input_document.read()
                    text = justext.justext(content, justext.get_stoplist("English"))
                    res = []
                    # total_length = 0
                    for paragraph in text:
                        if not paragraph.is_boilerplate:
                            res.append(paragraph.text)
                            # total_length += len(paragraph.text)
                        # if total_length > 10000:
                        #     break

                    res = '\n'.join(res)
                    return res
                    # return extract_text(content)
            else:
                logger.warning("Document not found: " + str(self.document_location))
        except Exception as exc:
            logger.warning(exc)
        return ""
Example 3
    def getTextFromWeb(self):
        num_results = 10
        search_list = ["bbc", "Little Red Riding Hood"]
        sites = [] 
        text = []
        results = []
        while len(search_list)!=0 and len(results) < num_results:
            search = search_list.pop()
            results = results + google.google(search,nltk.word_tokenize)

        for d in results:
            sites.append(d)
            if len(sites) == num_results:
                break
  
        for url in sites:
            print url
            try:
                page = urllib2.urlopen(url).read()
            except urllib2.HTTPError, e:
                print "Search failed: %s" % e 
                continue
            paragraphs = justext.justext(page, justext.get_stoplist('English'))
            if len(text) < 50:
                for paragraph in paragraphs:
                    if paragraph['class'] == 'good' and len(text) < 50:
                        sentences = self.segment_sentences(paragraph['text'].encode('utf8'))
                        for s in sentences:
                            if not text.__contains__(s):
                                text.append(s)
Example 4
def get_document_text(input_url_response):
    DOCUMENT_LENGTH = 0
    paragraphs = justext.justext(input_url_response.content,
                                 justext.get_stoplist("English"))
    for paragraph in paragraphs:
        DOCUMENT_LENGTH += len(paragraph.text)
    return DOCUMENT_LENGTH
Example 5
def try_justext(tree, url, target_language):
    '''Second safety net: try with the generic algorithm justext'''
    result_body = etree.Element('body')
    justtextstring = html.tostring(tree, pretty_print=False, encoding='utf-8')
    # determine language
    if target_language is not None and target_language in JUSTEXT_LANGUAGES:
        langsetting = JUSTEXT_LANGUAGES[target_language]
        justext_stoplist = justext.get_stoplist(langsetting)
    else:
        #justext_stoplist = justext.get_stoplist(JUSTEXT_DEFAULT)
        justext_stoplist = JT_STOPLIST
    # extract
    try:
        paragraphs = justext.justext(justtextstring, justext_stoplist, 50, 200,
                                     0.1, 0.2, 0.2, 200, True)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        result_body = None
    else:
        for paragraph in [p for p in paragraphs if not p.is_boilerplate]:
            #if duplicate_test(paragraph) is not True:
            elem = etree.Element('p')
            elem.text = paragraph.text
            result_body.append(elem)
    return result_body
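
JT_STOPLIST and JUSTEXT_LANGUAGES are defined outside this snippet. A plausible sketch of the language-independent fallback stoplist, built along the same lines as get_all_stop_words in Example 25 (an assumption, not the project's actual definition):

import justext

# Assumption: JT_STOPLIST is the union of every stoplist shipped with justext,
# so the fallback works whatever the document's language turns out to be.
JT_STOPLIST = frozenset(
    word
    for language in justext.get_stoplists()
    for word in justext.get_stoplist(language)
)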
Example 6
def remove_boilerplate(page_str, lang, relaxed=False):
    """
    Removes boilerplate from HTML documents.

    Uses JusText library.

    NOTE: quality dependent on correct language detection.

    :param page_str: str HTML page source.
    :param lang: str Google Translate language code.
    :param relaxed: boolean If True, the span between the first and last good/near-good paragraph
        is returned; short and bad segments in between are kept.
    :return: list List of non-boilerplate segments/paragraphs.
    """
    if lang not in GTRANS_JUSTEXT_LANG_MAP:
        #raise AttributeError("Can not remove boilerplate for language code lang='%s'." % lang)
        return []

    jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang]

    paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang))

    if relaxed:
        good_indexes = [paragraphs.index(p) for p in paragraphs if p.class_type in ['near-good', 'good']]

        if len(good_indexes) == 0:
            return []

        return [paragraph.text for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]]
    else:
        return [paragraph.text for paragraph in paragraphs if paragraph.class_type in ['near-good', 'good', 'short']]
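
A usage sketch, assuming GTRANS_JUSTEXT_LANG_MAP maps Google Translate codes such as 'en' to justext stoplist names such as 'English':

# Illustrative only; the 'en' -> 'English' mapping is an assumption about
# GTRANS_JUSTEXT_LANG_MAP, which is defined elsewhere in the project.
with open("article.html", encoding="utf-8") as f:
    page_str = f.read()

strict_segments = remove_boilerplate(page_str, "en")                 # good/near-good/short paragraphs only
relaxed_segments = remove_boilerplate(page_str, "en", relaxed=True)  # full span between first and last good/near-good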
Example 7
def get_document(url):
    ''' This function will check if the url is valid and then
    proceed to parse it to produce a clean text (no html) which
    can be used as input to a recommendation engine.

    Arguments:
        url  -- input url that needs to be checked and parsed
    '''
    try:
        r = requests.head(url, allow_redirects=True)
    except requests.exceptions.ConnectionError as e:
        raise URLRetrievalError(url, 'Could not connect', e)
    if r.status_code != requests.codes.ok:
        raise URLRetrievalError(
            url, 'Invalid response code from remote server: {}'.format(
                r.status_code))
    if r.headers["content-type"].split(';')[0] not in [
            "text/html", "text/plain"
    ]:
        raise URLRetrievalError(
            url, 'Document has invalid MIME type: {}'.format(
                r.headers["content-type"]))
    raw = requests.get(url)
    paragraphs = justext.justext(raw.content, justext.get_stoplist("English"))
    text_only = ''
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            text_only += ' ' + paragraph.text
    if len(text_only) == 0:
        raise DocumentParsingError('Length of document is zero')
    return text_only
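
A brief usage sketch; URLRetrievalError and DocumentParsingError are the project-specific exceptions raised above:

# Hypothetical caller; the URL is illustrative.
try:
    text = get_document("https://example.com/some-article")
except (URLRetrievalError, DocumentParsingError) as err:
    print("Could not extract document:", err)
    text = ""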
Example 8
def get_document(url):
    ''' This function will check if the url is valid and then
    proceed to parse it to produce a clean text (no html) which
    can be used as input to a recommendation engine.

    Arguments:
        url  -- input url that needs to be checked and parsed
    '''
    try:
        r = requests.head(url, allow_redirects = True)
    except requests.exceptions.ConnectionError as e:
        raise URLRetrievalError(url, 'Could not connect', e)
    if r.status_code != requests.codes.ok:
        raise URLRetrievalError(url, 'Invalid response code from remote server: {}'
                                .format(r.status_code))
    if r.headers["content-type"].split(';')[0] not in ["text/html",
                                                       "text/plain"]:
        raise URLRetrievalError(url, 'Document has invalid MIME type: {}'
                                .format(r.headers["content-type"]))
    raw = requests.get(url)
    paragraphs = justext.justext(raw.content, justext.get_stoplist("English"))
    text_only = ''
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            text_only += ' ' + paragraph.text
    if len(text_only) == 0:
        raise DocumentParsingError('Length of document is zero')
    return text_only
Example 9
def simple_text_extractor(html, stopwords = 'English'):
    import corpkit
    """extract text from html/xml files using justext"""
    import requests
    import justext
    import os
    import copy
    # if on hard disk:
    if type(html) != list:
        html_files = [copy.deepcopy(html)]
    else:
        html_files = copy.deepcopy(html)
    output = []
    for html in html_files:
        if os.path.isfile(html):
            f = open(html)
            raw_html_text = f.read()
        # if it's a web address
        elif html.startswith('http'):
            response = requests.get(html)
            raw_html_text = response.content
        # if it's already html text:
        else:
            raw_html_text = copy.deepcopy(html)
        paragraphs = justext.justext(raw_html_text, justext.get_stoplist(stopwords))
        text = []
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:
                text.append(paragraph.text)
        text = '\n'.join(text)
        metadata = os.path.basename(html)
        tup = (text, metadata)
        output.append(tup)
    return output
Example 10
def get_text(link):
    response = requests.get(link)
    print(response)
    paragraphs = justext.justext(response.content,
                                 justext.get_stoplist("English"))
    text = "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate])
    return text
Example 11
def get_text(html):
    paragraphs = justext.justext(html, justext.get_stoplist('English'))
    text = ""
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:  # and not paragraph.is_heading:
            text = text + paragraph.text + ". "
    return text
Example 12
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    try:
        logging.info('Acquiring stopword list for {}...'.format(
            args.boilerplate_language))
        stoplist = justext.get_stoplist(args.boilerplate_language)
        logging.info('Number of stopwords: {}'.format(len(stoplist)))
    except ValueError as e:
        logging.error('Invalid stopword language {}.'.format(
            args.boilerplate_language))
        exit(1)

    if not op.isdir(args.output_dir):
        os.makedirs(args.output_dir)
    os.nice(20)  # Play nice

    index_files = os.listdir(args.index_dir)
    fn = functools.partial(process,
                           index_dir=args.index_dir,
                           warc_dir=args.warc_dir,
                           output_dir=args.output_dir,
                           stoplist=stoplist)

    with Pool(args.processes) as pool:
        pool.map(fn, index_files)

    logging.info('Done.')
Example 13
    def getTextFromWeb(self):
        num_results = 10
        search_list = ["bbc", "Little Red Riding Hood"]
        sites = []
        text = []
        results = []
        while len(search_list) != 0 and len(results) < num_results:
            search = search_list.pop()
            results = results + google.google(search, nltk.word_tokenize)

        for d in results:
            sites.append(d)
            if len(sites) == num_results:
                break

        for url in sites:
            print url
            try:
                page = urllib2.urlopen(url).read()
            except urllib2.HTTPError, e:
                print "Search failed: %s" % e
                continue
            paragraphs = justext.justext(page, justext.get_stoplist('English'))
            if len(text) < 50:
                for paragraph in paragraphs:
                    if paragraph['class'] == 'good' and len(text) < 50:
                        sentences = self.segment_sentences(
                            paragraph['text'].encode('utf8'))
                        for s in sentences:
                            if not text.__contains__(s):
                                text.append(s)
Example 14
def overallSentiment(urls, verbose=False):
    """
    Guesses the overall sentiment of the given articles
    :param urls: List of URLs of articles to read
    :param verbose: Print status updates and specific verdicts
    :return: The proportion of articles that are positive
    """
    sentiments = []

    for url in urls:
        try:
            if verbose: print "Downloading", url + "..."
            response = requests.get(url)
            paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
            allText = "\n".join([paragraph.text for paragraph in paragraphs])
            if verbose: print "Reading..."
            sentiment = guessSentiment(allText)
            if verbose: print "Verdict:", sentiment
            sentiments.append(sentiment)
        except:
            if verbose: print "Failed to download", url


    positiveCount = len(filter(lambda x: x == "Positive", sentiments))
    return float(positiveCount) / len(urls)
Example 15
def __get_keywords(file, bnc_frequencies, keyword_dict={}, ignore_capitalized=False):
	f = codecs.open(file, "r", encoding="utf-8").read()
	paragraphs = justext.justext(f, justext.get_stoplist("English"))
	freqs = {}
	text_freqs = {}
	for paragraph in paragraphs:
		if not paragraph.is_boilerplate:
			tokens = nltk.word_tokenize(clean_text(paragraph.text, not ignore_capitalized))
			for token in tokens:
				if ignore_capitalized and token != token.lower():
					continue
				if token not in text_freqs:
					text_freqs[token] = 0
				if token in freqs:
					text_freqs[token] += 1
					continue
				elif token in bnc_frequencies:
					freqs[token] = bnc_frequencies[token]
					text_freqs[token] += 1
				else:
					freqs[token] = 0
					text_freqs[token] += 1
	for f_key, f_value in text_freqs.iteritems():
		if f_value < 2:
			del freqs[f_key]
	x = len(freqs.keys())/10
	for i in range(x):
		min_word = min(freqs, key=freqs.get)
		if min_word not in keyword_dict:
			keyword_dict[min_word] = 0
		keyword_dict[min_word] += text_freqs[min_word]
		del freqs[min_word]
Example 16
def simple_text_extractor(html, stopwords='English'):
    import corpkit
    """extract text from html/xml files using justext"""
    import requests
    import justext
    import os
    import copy
    # if on hard disk:
    if type(html) != list:
        html_files = [copy.deepcopy(html)]
    else:
        html_files = copy.deepcopy(html)
    output = []
    for html in html_files:
        if os.path.isfile(html):
            f = open(html)
            raw_html_text = f.read()
        # if it's a web address
        elif html.startswith('http'):
            response = requests.get(html)
            raw_html_text = response.content
        # if it's already html text:
        else:
            raw_html_text = copy.deepcopy(html)
        paragraphs = justext.justext(raw_html_text,
                                     justext.get_stoplist(stopwords))
        text = []
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:
                text.append(paragraph.text)
        text = '\n'.join(text)
        metadata = os.path.basename(html)
        tup = (text, metadata)
        output.append(tup)
    return output
Example 17
def get_url_article2(link, lang):
    '''
    TO BE DONE : error handling : http://www.voidspace.org.uk/python/articles/urllib2.shtml#handling-exceptions        
    '''
    ### encoding bug
    if len(link) < 5:
        return False
    try:
        #l = link.decode("utf-8",  errors='ignore')
        log.info("Retrieving : " + link)
        #hdr = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        hdr = 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0'
        headers = {'User-Agent': hdr}
        resp = requests.get(link, headers=headers)
        resp.raise_for_status()
        page = resp.text
        #log.info(page)
        contents = ''
        #print(justext.get_stoplist())
        paragraphs = justext.justext(page, justext.get_stoplist(lang))
        for paragraph in paragraphs:
            if paragraph.class_type == 'good':
                #and re.search(r'Facebook connect|cliquez|Envoyer cet article par email|D.couvrez tous nos packs|d.j.un|recevoirnos|nosoffres|acc.dezà|cliquez ici|En poursuivant votre navigation sur ce site|accédezà|pasencore|Veuillez cliquer|créez gratuitement votre compte]',paragraph.text)== None:
                contents = contents + "\n" + paragraph.text
        cts = remove_control_characters(contents)
        if len(cts) == 0:
            log.warning("No contents for :" + link)  # + " " + page
        return cts
    except requests.exceptions.RequestException as e:
        log.warning("Exception : " + str(e))
        return False
Example 18
    def __init__(self, url):
        np_extract = Article(url)
        np_extract.download()
        if np_extract.download_state == 2:
            try:
                np_extract.parse()
                np_text = np_extract.text
            except:
                np_text = ''
        else:
            np_text = ''

        jt_text = ''
        try:
            response = requests.get(url)
            paragraphs = justext.justext(response.content,
                                         justext.get_stoplist("English"))
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    jt_text = jt_text + str(paragraph.text)
        except:
            jt_text = ''

        if len(np_text) > len(jt_text):
            self.text = np_text
        else:
            self.text = jt_text

        self.original_title = np_extract.title
        self.tok = nltk.word_tokenize(self.text)
        self.img = list(np_extract.images)
        self.vid = list(np_extract.movies)
        self.url = url
        self.nchar = len(self.text)
        self.nword = len(self.tok)
Example 19
def get_text_without_boilerplate(htmlcontent):

    # htmlcontent = htmlcontent.replace('\n', ' ')

    try:
        paragraphs = justext(htmlcontent, get_stoplist("English"))
    except Exception as e:
        raise SnippetGenerationError(
            "failed to process document using justext", original_exception=e)

    allparatext = ""

    for paragraph in paragraphs:

        try:
            if not paragraph.is_boilerplate:
                allparatext += " {}".format(paragraph.text)
        except Exception as e:
            raise SnippetGenerationError(
                "failed to process document using justext",
                original_exception=e)

    if allparatext == "":

        for paragraph in paragraphs:

            try:
                allparatext += "{}".format(paragraph.text)
            except Exception as e:
                raise SnippetGenerationError(
                    "failed to process document using justext",
                    original_exception=e)

    return allparatext
Example 20
def process(record):
    response = requests.get(record['WARC-Target-URI'])
    first = True
    if response.text:
        paragraphs = justext.justext(response.content,
                                     justext.get_stoplist("English"))
        heading = ""
        body = ""
        for paragraph in paragraphs:
            if first and paragraph.is_heading:
                #words = filter(lambda word: not word in stopword_set, paragraph.text.split())
                #heading = (' ').join(words)
                heading = paragraph.text
                first = False
            elif not paragraph.is_boilerplate and paragraph.class_type == 'good':
                #words = filter(lambda word: not word in stopword_set, paragraph.text.split())
                #body += (' ').join(words)
                body += " " + paragraph.text
        if body != "":
            body = body.replace('"', "---")
            body = body.replace('\n', "")
            #records.append({"URL":record['WARC-Target-URI'], "Title":heading, "Sentences": body})
            file.write(("{\"URL\":\"" + record['WARC-Target-URI'] +
                        "\",\"Title\":\"" + heading + "\",\"Sentences\":\"" +
                        body + "\"").encode('utf-8').strip())
            file.write('\n')
Example 21
def get_article(articles, i, output):
    for article in tqdm(articles):
        try:
            a = newspaper.Article(article)
            a.download()
            a.parse()
            a.nlp()

            paragraphs = justext.justext(a.html,
                                         justext.get_stoplist("English"))
            text = '\n\n'.join(
                [p.text for p in paragraphs if not p.is_boilerplate])

            if (len(text) > len(a.text) + 50):
                a.set_text(text)

            h = html2text.HTML2Text()
            h.ignore_links = True
            h.ignore_images = True

            a.set_html(h.handle(a.html))

        except Exception as e:
            print(e)
            continue

        # TODO: config option?
        if len(a.text) < 400:
            continue

        output.append(a)
Example 22
def fetch(keyword, url, rank, articles, totalNumber):
    searchKeywords = keyword.split('" OR "') # We are going to check the article text for our keywords after being run through JusText
    response = requests.get(url)
    paragraphs = justext.justext(response.text, justext.get_stoplist("English"))
    empty = True
    containsKeyword = False
    minMentions = 3
    mentions = 0
    searchKeyword = searchKeywords[0].replace('"', '').strip().split(' ', 1)[0] # Get the first word of the search term
    articleParagraphs = []
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            if searchKeyword in paragraph.text:
                mentions += 1 #paragraph.text.count(searchKeyword)
                articleParagraphs.append(paragraph.text)
    if (mentions < minMentions):
        #print("A website (" + url + ") did not have the keyword enough times! Removed.")
        return
    '''for searchKeyword in searchKeywords:
        searchKeyword = searchKeyword.replace('"', '').strip().split(' ', 1)[0] # Get the first word of the search term
        if searchKeyword in article:
            containsKeyword = True
            break
    if (containsKeyword == False):
        print("A website (" + url + ") does not contain the keyword! Removed.")
        return '''
    articles.append(Article.Article(articleParagraphs, url, rank))
    print("\r" + str(len(articles)) + " / " + str(totalNumber) + " articles crawled to for keyword " + keyword, end=' ')
    sys.stdout.flush() 
Example 23
def get_article(item, source, reprocess=False):
    """Take the initial set of listings and enrich the content."""
    article = dict()
    encoded = item.get('link').encode('utf-8')
    article['uuid'] = hashlib.sha256(encoded).hexdigest()
    processed = is_found(article['uuid'])
    if processed and not reprocess:
        return {'article': processed, 'from_store': True}
    article['title'] = item.get('title', None)
    href = item.get('link', None)
    article['href'] = strip_google(href)
    article['source'] = derive_source(article['href'])
    article['collected'] = now_time()
    article['published'] = item.get('published', None)
    article['summary'] = item.get('summary', None)

    page_content = get_page_content(article['href'])
    if not page_content:
        logger.debug("No content found: %s" % article['href'])
        return {'article': None, 'from_store': True}
    paragraphs = justext.justext(page_content,
                                 justext.get_stoplist("English"),
                                 no_headings=True,
                                 max_heading_distance=150,
                                 length_high=140,
                                 max_link_density=0.4,
                                 stopwords_low=0.2,
                                 stopwords_high=0.3)
    text_content = list()
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        text_content.append(paragraph.text)
    text_content = '\n'.join(text_content)
    tokens = get_tokens(text_content)

    article['word_count'] = len(tokens)
    article['read_time'] = round(float(article['word_count']) / 250, 2)
    clean = cleaned_tokens(tokens)
    article['tokens'] = [{
        t[0]: t[1]
    } for t in nltk.FreqDist(clean).most_common(100)]
    article['tags'] = [list(x.keys())[0] for x in article['tokens'][0:7]]
    article['sentiment'] = get_sentiment(text_content)
    article['feed_source'] = source.replace('www.google.com', 'google.com')
    articles = mongo.db[app.config['ARTICLES_COLLECTION']]
    if not reprocess:
        try:
            articles.insert(article)
        except:
            pass
    else:
        if not processed:
            try:
                articles.insert(article)
            except:
                pass
        articles.update({'_id': ObjectId(processed['_id'])}, {'$set': article})
    return {'article': article, 'from_store': False}
Example 24
def remove_boilerplate(html, language="English"):
    try:
        paragraphs = justext.justext(html, justext.get_stoplist(language))
    except:
        return html  # TODO alternative to justext
    tag = lambda p: ("%s\n----\n" if p.is_heading else "%s\n\n") % p.text
    content = "".join([tag(p) for p in paragraphs if not p.is_boilerplate])
    return content
Example 25
def get_all_stop_words():
  """
  For the language independent version of Justext
  """
  stop_words = set()
  for language in justext.get_stoplists():
      stop_words.update(justext.get_stoplist(language))
  return stop_words
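
The combined set can be passed straight to justext when the document language is unknown, mirroring the fallback branch of Example 1; a minimal sketch:

import justext

# "page.html" is an illustrative input file.
with open("page.html", "rb") as f:
    html_source = f.read()

any_lang_stop_words = get_all_stop_words()
paragraphs = justext.justext(html_source, any_lang_stop_words)
clean_text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)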
Example 26
 def remove_boilerplate(self, text):
     """
     Removes website artifacts: "Skip to Main Content", "About Us", etc.
     """
     jtext = justext.justext(text, justext.get_stoplist("English"))
     cleaned = [line.text for line in jtext if not line.is_boilerplate]
     cleaned_text = " ".join(cleaned) if cleaned else ""
     return cleaned_text
Example 27
def cleanHtml(html):
    # raw = nltk.clean_html(html) // was removed in nltk 3.0
    # If you do not install justext, use beautifulsoup:
    # soup = BeautifulSoup(html)
    # raw = soup.get_text()
    # This will do a better job once you install justext
    paragraphs = justext.justext(html, justext.get_stoplist('English'))
    return "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
Example 28
def webScraper(url):
    response = requests.get(url)
    paragraphs = justext.justext(response.content,
                                 justext.get_stoplist('English'))
    returningParagraphs = list()
    for item in paragraphs:
        returningParagraphs.append(item.text)
    return (returningParagraphs)
Example 29
def get_doc_contents(filepath):
    contents = bytearray()
    with open(filepath,'rb') as f:
        paragraphs = justext.justext(f.read(), justext.get_stoplist('English'))
    for para in paragraphs:
        if not para.is_boilerplate:
            contents.extend(para.text.encode('UTF8'))
    return cleanup(str(contents))  # LIST OF CLEANED TOKENS
Example 31
def read_files(path, file_name, langue):
  contenu = codecs.open(path + file_name,'r',encoding='utf-8').read()
  paragraphs = justext.justext(contenu, justext.get_stoplist(langue))
  chaine = ""
  for paragraph in paragraphs:
    if not paragraph.is_boilerplate:
      chaine+= paragraph.text+"\n"
  return chaine
Example 34
def html_to_text_justext(html_content_in_byte):
    paragraphs = justext.justext(html_content_in_byte,
                                 justext.get_stoplist("English"))
    boilerplate_free = [
        paragraph.text for paragraph in paragraphs
        if not paragraph.is_boilerplate
    ]
    return "".join(boilerplate_free)
Example 35
 def read_dial(self):
     response = requests.get(
         f'https://pidru4niki.com/15780506/filosofiya/osnovni_zakoni_dialektiki_svitoglyadne_metodologichne_znachennya'
     )
     paragraphs = justext.justext(response.content,
                                  justext.get_stoplist("Ukrainian"))
     prs = [pp for pp in paragraphs if not pp.is_boilerplate]
     chosen_p = random.choice(list(prs))
     self.speaker.tell_ua(chosen_p.text)
Example 36
def run_justext(htmlstring):
    '''try with the generic algorithm justext'''
    valid = list()
    paragraphs = justext.justext(htmlstring, justext.get_stoplist("German"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            valid.append(paragraph.text)
    result = ' '.join(valid)
    return result
Example 37
def print_file(filename):
	f = open(filename, "rU")
	text = f.read() # string
	htmlcontent = justext.justext(text, justext.get_stoplist("English"))
	htmlcontent = htmlcontent[3:]
	htmlcontent = filter( lambda x: not x.text.isnumeric(), htmlcontent)
	
	for i in htmlcontent:
		print(i.text)
Example 38
 def get_content(self, html):
     # I should refactor the other get_content when this fails into here
     lang_mapping = {'nl': 'Dutch', 'en': 'English', 'com': 'English'}
     if self.detected_language not in lang_mapping:
         return ''
     lang = lang_mapping[self.detected_language]
     body_content = [x.text for x in justext.justext(html, justext.get_stoplist(lang))
                     if not x.is_boilerplate and not x.is_heading]
     return body_content
Example 39
 def extract_content_using_justext(self, raw_page):
     paragraphs = justext.justext(raw_page, justext.get_stoplist("German"))
     content = ''
     for paragraph in paragraphs:
         if not paragraph.is_boilerplate:
             if len(content) > 0:
                 content += '\n'
             content += paragraph.text    
     return content   
Example 40
def parseHtmlToText(htmlContent):
    try:
        justextContent = justext.justext(htmlContent.encode("utf-8"), justext.get_stoplist('Estonian'))
#         text = getText(getParagraphs(justextContent))
    except Exception:
        justextContent = ""
    text = getText(getParagraphs(justextContent))
    #logger.info("Text length:"+len(text))
    return text
Example 41
    def text(self):
        if not self._text:
            if self._article.is_valid_body():
                self._text = self._article.text
            else:
                self._text = '\n'.join(p.text for p in justext.justext(
                    self._article.html, justext.get_stoplist("English")))

        return self._text
Example 42
def jt_treatement(input_file, output_file):
    """
    Defines the specific JusText treatment to perform from the input file to the output file.
    """
    paragraphs = justext.justext(input_file.read(),
                                 justext.get_stoplist('English'))

    for paragraph in paragraphs:
        output_file.write("<p>" + paragraph.text.replace("\n", " ") + "</p>\n")
Example 43
def toJustText(webContent):
    print 'Entree dans toJustText'

    txt=''
    paragraphs = justext.justext(webContent, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        #if not paragraph.is_boilerplate:
        txt+= smart_str(paragraph.text.encode('utf-8'))
    return txt
Example 44
def run_justext(htmlstring):
    '''try with the generic algorithm justext'''
    valid = list()
    paragraphs = justext.justext(htmlstring, justext.get_stoplist("German")) , 50, 200, 0.1, 0.2, 0.2, 200, True)  # stop_words
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            valid.append(paragraph.text)
    result = ' '.join(valid)
    return result # sanitize(result)
Example 45
 def getText(self):
     text = ''
     try:
         response = requests.get(JusTextWrapper.iUrl)
         paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
         for paragraph in paragraphs:
             if not paragraph.is_boilerplate:
                 text += " "+paragraph.text
         return text
     except:
         return ""
Example 46
def clean_jusText_localFile(filename, language, outputfile) : 
  try : 
    with codecs.open(filename, "r", "utf-8") as f:
      with open(outputfile, "w") as output:
        content = f.read()
        paragraphs = justext.justext(content, justext.get_stoplist(CODE_LANG[language]))
        for paragraph in paragraphs:
          if not paragraph.is_boilerplate:
            output.write(paragraph.text.encode('utf-8')+"\n")
  except ValueError :
    print "[jusText] Stopwords list not available for "+language
Example 47
def get_url(webpage):
    doctext = bytearray()
    try:
        response = requests.get(webpage)
    except requests.exceptions.MissingSchema:
        webpage = 'http://' + webpage
        response = requests.get(webpage)
    paragraphs = justext.justext(response.content, justext.get_stoplist('English'))
    for para in paragraphs:
        if not para.is_boilerplate:
            doctext.extend(para.text.encode('UTF-8'))
    return cleanup(str(doctext))
Example 48
def get_text_from_reuters(link):
    response = requests.get(link)
    resText = response.content.decode("UTF-8", 'ignore')
    soup = BeautifulSoup(resText, 'html.parser')
    tmp = [x.extract() for x in soup.find_all(class_= "Edition_items_293of")]
    for tag in soup.find_all(["script", "meta", "head", "style", "noscript"]):
        tag.decompose()
    for tag in soup.find_all(True, class_= ["Attribution_content_27_rw", "Image_container_1tVQo"]):
        tag.decompose()
    paragraphs = justext.justext(soup.prettify(), justext.get_stoplist("English"))
    text = "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate])
    return text
Example 49
def text(request):
  page = urllib2.urlopen(request.GET.get('url','')).read()
  paragraphs = justext.justext(page, justext.get_stoplist('English'))
  text = []
  for paragraph in paragraphs:
    if paragraph['class'] == 'good':
        p = {}
        p['content'] = paragraph['text']
        p['heading'] = paragraph['heading']
        text.append(p)

  return HttpResponse(simplejson.dumps(text), 'application/json')
Example 50
 def solo_texto(self):
     '''
     Called from trae_datos, where the result goes to self.texto.
     Fetches the content of each individual url (each news item) via urllib.request.urlopen() and BeautifulSoup.
     '''
     if self.link:
         con_ac = 'áéíóúüñ'
         sin_ac = 'aeiouun'
         conv = str.maketrans(con_ac, sin_ac)
         self.link = self.link.translate(conv)
     texto = ''     
     try:
         user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
         headers={'User-Agent':user_agent,} 
         req = urllib.request.Request(self.link, None, headers)
         art1 = urllib.request.urlopen(req)
         # signal.alarm(0) 
     except:
         print("<a href = '"+self.link+"'>Sin conexion (solo_texto) al link</a>")
         return False
     art2 = art1.read()
     art1.close()
     try:
         #metas = parseString(art2)
         #print(1)  
         try:         
             paras = justext.justext(art2, justext.get_stoplist('Spanish'))
         except:
             print("Error en justext")
         for para in paras:
             if para['class'] == 'good':
                 parra = para['text']
                 parra = self.cambia_acentos(parra, self.acentos)
                 parra = parra.replace('Ã', 'Ó')
                 if parra.endswith('.'):
                     texto += " " + parra
                 else:
                     texto += " " +parra + "."
         if not texto:
             print("<a href='"+self.link+"'>No hay texto recibido en trae_articulo"  + self.fuente+"</a>")
         else:
             self.articulo = bs(art2)
             #print(2)
             if (self.articulo):
                 self.busca_fotos()
                 #print(3) 
             return texto
     except:
         print("<a href = '"+self.link+"'>Errores en justext para link </a>")
         return False
Example 51
 def apply_justext_boilerplate_stripper(self, r, stoplist):
     index_key = "index_{}_{}_{}".format(r["country"], r["website"], r["feed_name"]) 
     index_key = "{}.csv".format(self.escape_filename(index_key))
     w = self.escape_filename(r["website"])
     feed_name = self.escape_filename(r["feed_name"])
     original_html_path = os.path.join(self.raw_dir, w, feed_name, r["original_html_file"])
     xml_dir = os.path.join(self.proc_dir, w, feed_name)
     try:
         os.makedirs(xml_dir)
     except IOError:
         pass
     processed_xml_path = os.path.join(xml_dir, r["original_html_file"].replace(".html", ".xml"))
     try:
         with open(original_html_path, "r", encoding="utf-8") as h:
             text = h.read()
     except FileNotFoundError:
         text = None
         self.index_df[index_key].loc[r.name, "downloaded"] = False
         self.index_df[index_key].loc[r.name, "processed"] = False
         self.index_df[index_key].loc[r.name, "justext_comment"] = np.nan
     if text:
         try:
             paragraphs = justext.justext(text, justext.get_stoplist("English"))
         except ValueError:  # e.g. if unable to get stoplist in pyinstaller compiled version
             paragraphs = justext.justext(text, stoplist=stoplist)
         to_keep = []
         bp_count = 0
         for paragraph in paragraphs:
             if not paragraph.is_boilerplate:
                 to_keep.append(paragraph)
             else:
                 bp_count += 1
         if to_keep:
             root = etree.Element("text")
             tree = etree.ElementTree(root)
             for paragraph in to_keep:
                 p_elem = etree.Element("p")
                 p_elem.text = paragraph.text
                 root.append(p_elem)
             xml_str = etree.tounicode(tree)
             try:
                 tree.write(processed_xml_path, pretty_print=True, encoding='utf-8', xml_declaration=True)
             except IOError as e:
                 print("WARNING: Could not write XML file:", e)
                 self.index_df[index_key].loc[r.name, "processed"] = False
             else:
                 self.index_df[index_key].loc[r.name, "processed"] = True
         else:
             print("WARNING: No non-boilerplate code found for", original_html_path)
         self.index_df[index_key].loc[r.name, "justext_comment"] = "{}/{}".format(len(to_keep), bp_count)
         self.index_df[index_key].loc[r.name, "extraction_method"] = "jusText"
Example 52
def scrape(url, title):
    text = str()
    try:
        page = requests.get(url)
        paragraphs = justext.justext(page.content,
                                     justext.get_stoplist('English'))
        for par in paragraphs:
            if par['class'] == 'good':
                text += par['text']
        return text
    #Generic error catching is bad
    #As are printed log statements....
    except Exception:
        print 'Something went wrong...'
Example 53
def doImposter(seed,out,mainlang,imposters):
	
	# We find all the TXT of the LANG directory 
	# /PATH/LANG/*.TXT
	#path    = seed+mainlang+"*/*.txt"

	path = seed+"/*.txt"
	files = glob.glob(path)

	# Number of files to be chosen. These files are mixed to get random words.
	file_choice = 3
	# Number of words to be chosen to build the query.
	word_choice = 3

	words   = []
	selection = lang[mainlang]

	# Random selection of the files to be mixed
	randomfiles = np.random.choice(files, file_choice)

	for single_file in randomfiles:
		textwords = ''.join( [line.strip() for line in codecs.open(single_file,'r','utf-8')] ).split()
		words = words + textwords

	stopwords =  justext.get_stoplist(selection['lang'])
	
	# After choosing the texts, we eliminate all the stop words from the word list.
	cleanwords = [word for word in words if word not in set(stopwords)]	

	# Creation of output directory
	# output = os.path.join(out,mainlang)
	output = out 
	if not os.path.exists(output):
		os.makedirs(output)
	# ERASE 
	#else: 
	#	shutil.rmtree(out)
	#	os.makedirs(output)		
		

	created = 0
	print "Max imposters : %s" % imposters
	while created <= int(imposters):
		query = ' '.join( np.random.choice( cleanwords, word_choice) )
		try:
			doSearch(query, selection, stopwords, output)
		except:
			print "Error"
		created = len(glob.glob(output+"/*.txt"))
Example 54
	def parse(self, response):
		hxs = HtmlXPathSelector(response)
		titulo = hxs.select('/html/head/title/text()').extract()
		rules = (Rule(SgmlLinkExtractor(allow='.*'),follow=True,callback='parse'))
		corpo = justext.justext(response.body, justext.get_stoplist('Portuguese'))
		texto = ''
		for paragrafo in corpo:
			if paragrafo['class'] == 'good':
				texto += paragrafo['text']
		item = Pagina()
		item['url'] = response.url
		item['titulo'] = unicode(titulo[0])
		item['texto'] = unicode(texto)
		item['tipo'] = self.name
		return item
Example 55
def remove_bad_by_classifier(doc):
    ps = justext.justext(
        doc, justext.get_stoplist('English'))
    to_delete = []
    good = []
    for p in ps:
        if p['class'] == 'bad':
            for el in doc.xpath(p['xpath']):
                to_delete.append((el, p['xpath']))
        elif p['class'] == 'good':
            good.append(p['xpath'])

    for el, xp in reversed(to_delete):
        if el.getparent() is not None and not any(xp in g for g in good):
            el.drop_tree()
Example 56
def get_url_text(tweet):
    urls = re.findall(r'(https?://\S+)', tweet)
    good_text = ''
    if urls:
        try:
            # response = requests.get(urls[0])
            s = requests.Session()
            s.mount(urls[0], HTTPAdapter(max_retries=1))
            response = s.get(urls[0])
            paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    good_text += ' ' + paragraph.text
        except:
            pass
    return good_text
Example 57
 def extract(self):
     html = self.data_instance.get_raw_html()
     html = html.encode(self.data_instance.raw_encoding,'ignore')
     paragraphs = justext.justext(html, justext.get_stoplist('English'),
                          encoding = self.data_instance.raw_encoding)    
     good_paragraphs = []
     for para in paragraphs:
         if para['class'] == 'good':
             paragraph_text = para['text']
             # this assertion makes sure we catch string and unicode only
             assert isinstance(paragraph_text, basestring)
             if type(paragraph_text) == unicode:
                 good_paragraphs.append(paragraph_text.encode('utf8', 'ignore'))
             else:
                 good_paragraphs.append(paragraph_text)
         
     return '\n\n'.join(good_paragraphs)
Example 58
def crawl_page(url, parenturl=None):
    '''
    Crawl single page and add it to db

    :param url: URL of webpage to be crawled
    :type url: str
    :param parenturl: URL of parent website, obtained automatically if omitted
    :type parenturl: str
    '''
    if not newurl(url):
        return
    # if not parenturl:
    #     parsed = urlparse.urlparse(url)
    #     parenturl = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed)
    res = []
    urls = []
    try:
        response = urllib2.urlopen(url)
    except Exception:
        return urls
    if response.info().maintype == 'text':
        print url
        body = response.read()
        response.close()
        soup = BeautifulSoup(body)

        for tag in soup.findAll('a', href=True):
            tag['href'] = urlparse.urljoin(url, tag['href'])
            urls.append(tag['href'])

        if not body:
            return urls
        paragraphs = justext.justext(body, justext.get_stoplist("Czech"))
        for para in paragraphs:
            if not para.is_boilerplate:
                res.append(para.text)
        btext = ' '.join(res)
        isedu = client.classify_edu(btext)[0]
        page = {'url': url, 'btext': btext}
        print isedu
        if isedu == 2:  # is educational
            cls = client.classify(btext)
            addtodb(page, parenturl, cls)
    return urls
Example 59
 def get_paragraphs(self, url):
     """From given url address returns content of the page without boilerplate.
     (Boilerplate is semanticaly unimportant page content.)
     
     Returns list of paragraphs. Library justext is used.
     See https://code.google.com/p/justext/.
     max_link_density is set higher than default because links are very
     frequently used in the Wikipedia.
     """
     page = urllib2.urlopen(url).read()
     # 'Czech' is stoplist in justext library.
     stoplist = justext.get_stoplist('Czech')
     paragraphs = justext.justext(page, stoplist, no_headings=True,
                                  max_link_density=0.9)
     # Removes paragraphs with too few or too many words.
     useful = [p['text'] for p in paragraphs if p['class']=='good' and p['word_count'] >= 6 and p['word_count'] < 350]  
     # Encode from unicode to utf-8
     useful = [par.encode('utf-8','replace') for par in useful]
     return useful