Example #1
def extract_article_information_from_html(html):
    """
    This methods gets a website the HTML as string and extracts the text of
    the article

    :param html: a HTML object from package requests
    :return: the article information
    """
    article_information = {}

    # run with newspaper
    article_newspaper = Article('')
    article_newspaper.set_html(html)
    article_newspaper.parse()

    article_information["summary"] = article_newspaper.summary
    article_information["author"] = str(article_newspaper.authors).strip('[]')
    article_information["tags"] = article_newspaper.tags
    article_information["title"] = article_newspaper.title

    newspaper_text = article_newspaper.text
    # run with newsplease
    # article_newsplease = NewsPlease.from_html(html)
    # newsplease_text = article_newsplease.cleaned_text
    # run with goose
    goose_extractor = Goose()
    goose_extractor = goose_extractor.extract(raw_html=html)
    article_goose = goose_extractor.cleaned_text
    if len(newspaper_text.split(" ")) > len(article_goose.split(" ")):
        article_information["text"] = newspaper_text
    else:
        article_information["text"] = article_goose
    return article_information
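
A minimal usage sketch for the function above, assuming Article (newspaper) and Goose are already imported and that requests (hypothetical here) is used to fetch the page:

import requests

response = requests.get("https://example.com/some-article")  # placeholder URL
info = extract_article_information_from_html(response.text)  # pass the raw HTML string
print(info["title"])
print(info["author"])
print(info["text"][:200])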
Example #2
    def extract(self, item):
        """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
        parsing the HTML-Code.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name()

        article = Article('')
        article.set_html(item['spider_response'].body)
        article.parse()
        article_candidate.title = article.title
        article_candidate.description = article.meta_description
        article_candidate.text = article.text
        article_candidate.topimage = article.top_image
        article_candidate.author = article.authors
        if article.publish_date:
            try:
                article_candidate.publish_date = article.publish_date.strftime(
                    '%Y-%m-%d %H:%M:%S')
            except ValueError:
                self.log.debug(
                    '%s: Newspaper failed to extract the date in the supported format, '
                    'publishing date set to None' % item['url'])
        article_candidate.language = article.meta_lang

        return article_candidate
def read_other_article(htmltext):
    """
        Processes articles other than the ones for which specific rules have been written
    :param htmltext: the htmltext of the article
    :return:
    """
    article = Article('')  # so that you can use local files with newspaper3k
    article.set_html(htmltext)
    article.parse()
    authors = article.authors  # sometimes it extracts stuff like "Reporter for Fox News. Follow her on Twitter..."

    date = article.publish_date  # TODO: date not extracted here properly
    if date is not None:
        date = article.publish_date.strftime('%d/%m/%Y')

    text = article.text
    # text = "".join(i for i in text if i != '\n')
    title = article.title

    publisher = 'other'
    if determine_publisher(htmltext) == ArticleType.CNN:
        publisher = 'cnn'

    result_dict = {
        'title': title,
        'authors': authors,
        'text': text,
        'date': date,
        'publisher': publisher
    }
    return result_dict
        def check_url_get_content(url):
            """This function takes url as argument, extracts text content and other information using Article class
            from newspaper library and returns result as a dictionary.
            """

            result = {}
            try:
                # async with session.get(url, timeout=600) as resp:
                with urllib.request.urlopen(url, timeout=600) as resp:
                    # content = await resp.read()
                    content = resp.read()
                    # if content:
                    article = Article(url)
                    article.set_html(content)
                    article.parse()
                    article.nlp()
                    text = article.text
                    keywords = article.keywords
                    status_code = resp.status

                    # else:
                    #     text = 'none'
                    #     keywords = 'none'
                    #     status_code = 'none'
            except Exception as e:
                text = 'none'
                keywords = 'none'
                status_code = 'none'

            result['Text'] = text
            result['Keywords'] = keywords
            result['status_code'] = status_code

            return result
    def extract(self, item):
        """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
        parsing the HTML-Code.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name()

        article = Article('')
        article.set_html(item['spider_response'].body)
        article.parse()
        article_candidate.title = article.title
        article_candidate.description = article.meta_description
        article_candidate.text = article.text
        article_candidate.topimage = article.top_image
        article_candidate.author = article.authors
        if article.publish_date is not None:
            try:
                article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError:
                self.log.debug('%s: Newspaper failed to extract the date in the supported format, '
                               'publishing date set to None' % item['url'])
        article_candidate.language = article.meta_lang

        return article_candidate
def handle(task, progress):
	url = task.url
	progress.set_status("Requesting page...")
	resp = http_downloader.page_text(url, json=False)
	if not resp:
		return False

	config = Config()
	config.memoize_articles = False
	config.verbose = False
	article = Article(url='', config=config)

	article.download()
	article.set_html(resp)
	article.parse()
	if not article.top_image:
		return None

	src = article.top_image
	if 'http' not in src:
		if 'https' in url:
			src = 'https://' + src.lstrip('/ ').strip()
		else:
			src = 'http://' + src.lstrip('/ ').strip()

	progress.set_status("Downloading image...")

	return http_downloader.download_binary(src, task.file, prog=progress, handler_id=tag)
Example #7
    def parse_item(self, response):

        tag = ""

        for value in response.url.split("/")[3:]:
            if str(value).isdigit():
                continue
            tag = value
            break

        item = EnPItem()
        art_parser = Article(response.url, language='en', fetch_images=False)
        # a.download()
        art_parser.set_html(response.text)
        art_parser.parse()

        item["home"] = response.url
        item["title"] = art_parser.title

        item["content"] = art_parser.text
        item["authors"] = art_parser.authors
        try:
            item["publish_date"] = art_parser.publish_date.strftime(
                '%Y-%m-%d %H:%M:%S')
        except:
            pass
        item["images"] = list(art_parser.images)
        item["keywords"] = art_parser.keywords
        item["meta_keywords"] = art_parser.meta_keywords
        item["tags"] = tag  #list(art_parser.tags)

        print(item)
        save_mess("%s.txt" % self.name,
                  json.dumps(dict(item), ensure_ascii=False))
Example #8
    def process_html(self, html):
        # fetch page content and parse html using newspaper
        article = Article(url="")
        article.set_html(html)
        article.parse()

        return article
Example #9
def generic(url):
    article = Article(url)
    r = requests.get(url, stream=True)
    # article.download()
    article.set_html(r.raw.read(MAX_DATA, decode_content=True))
    article.parse()
    return article
Example #10
def retrieve_article(url):
    try:
        config = Configuration()
        config.fetch_images = False

        req = urllib.request.Request(
            url,
            headers={
                'User-Agent':
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])

        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])

        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])

        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return text
    except Exception as e:
        print(e)
        return False
Example #11
 def parse(cls, from_url: str, resolved_url: str, http_status: int,
           html: str) -> List[model.Page]:
     a = Article(resolved_url)
     a.set_html(html)
     a.parse()
     try:
         parsed = model.Parsed(keywords=[
             s.strip() for s in a.meta_data['news_keywords'].split(",")
         ])
     except:
         parsed = None
     return [
         model.Page(
             from_url=from_url,
             resolved_url=resolved_url,
             http_status=http_status,
             article_metadata=dict(a.meta_data),
             article_published_at=a.publish_date,
             article_title=a.title,
             article_text=a.text,
             article_summary=a.meta_data['description'],
             parsed=parsed,
             fetched_at=datetime.datetime.now(),
         )
     ]
Example #12
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # raw.append(sentences)
        
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
Example #13
def analyse():
    article = Article('')
    article.set_html(html=request.data)
    article.parse()
    return jsonify(source_url=article.source_url,
                   url=article.url,
                   title=article.title,
                   top_img=article.top_img,
                   meta_img=article.meta_img,
                   imgs=list(article.imgs),
                   movies=article.movies,
                   text=article.text,
                   keywords=article.keywords,
                   meta_keywords=article.meta_keywords,
                   tags=list(article.tags),
                   authors=article.authors,
                   publish_date=article.publish_date,
                   summary=article.summary,
                   article_html=article.article_html,
                   meta_description=article.meta_description,
                   meta_lang=article.meta_lang,
                   meta_favicon=article.meta_favicon,
                   meta_data=article.meta_data,
                   canonical_link=article.canonical_link,
                   additional_data=article.additional_data)
Example #14
 def parse_content(self, response):
     """extract content of news by newspaper"""
     item = response.meta['item']
     is_special, content = self._handle_special_site(response)
     if not is_special:
         # not a special site
         article = Article(item['url'], language='zh')
         article.set_html(response.body)
         article.is_downloaded = True
         article.parse()
         item['pic'] = article.top_image
         item['content'] = str(article.text)
         publish_date = article.publish_date
         if publish_date:
             item['publish_date'] = publish_date.strftime(
                 "%Y-%m-%d %H:%M:%S")
         else:
             item['publish_date'] = "null"
     else:
         item['pic'] = ""
         item['content'] = content
     # extract content failed
     if item['content'] == '':
         logging.error("empty content in: " + response.url)
         yield item
         # raw_content = response.xpath("//body//p/text()").extract()
         # item['content'] = ''.join(raw_content)
     item['content'] = item['content'].strip().replace(u"\xa0", "").replace(u"\u3000", "").replace("|", "")\
         .replace("用微信扫码二维码分享至好友和朋友圈", "").strip("您当前的位置 :").strip("您所在的位置:").strip("提示:点击上方").strip(">").strip()
     yield item
Example #15
    def parse_item(self, response):

        tags = response.xpath(
            '//*[@id="bread-nav"]/a[position()>=1]/text()').extract()
        item = EnPItem()
        art_parser = Article(response.url, language='en', fetch_images=False)
        # a.download()
        art_parser.set_html(response.text)
        art_parser.parse()

        item["home"] = response.url
        item["title"] = art_parser.title

        item["content"] = art_parser.text
        item["authors"] = art_parser.authors
        try:
            item["publish_date"] = art_parser.publish_date.strftime(
                '%Y-%m-%d %H:%M:%S')
        except:
            pass
        item["images"] = list(art_parser.images)
        item["keywords"] = art_parser.keywords
        item["meta_keywords"] = art_parser.meta_keywords
        item["tags"] = tags  #list(art_parser.tags)

        save_mess("daly_people.txt", json.dumps(dict(item),
                                                ensure_ascii=False))
def get_article_from_html(article_html):
    # Returns a `newspaper` Article object from article HTML
    article = Article('', keep_article_html=True)
    article.set_html(article_html)
    article.parse()
    attach_links(article)
    return article
Example #17
    def parse(self, from_url: str, resp: aiohttp.ClientResponse, html: str) -> es.Page:
        article = Article(str(resp.url))
        article.set_html(html)
        article.parse()

        if article.clean_top_node is not None:
            parsed = Parsed(
                keywords=article.meta_keywords,
                tickers=_parse_tickers(article.clean_top_node))
            article_html = etree.tostring(
                article.clean_top_node, encoding='utf-8').decode('utf-8')
        else:
            parsed = Parsed(keywords=article.meta_keywords, tickers=[])
            article_html = None

        page = es.Page(
            from_url=from_url,
            resolved_url=str(resp.url),
            http_status=resp.status,
            article_metadata=json.dumps(article.meta_data),
            article_published_at=article.publish_date,
            article_title=article.title,
            article_text=article.text,
            article_html=article_html,
            parsed=json.dumps(dataclasses.asdict(parsed)),
            fetched_at=datetime.datetime.now(),)
        page.save()
        return page
Example #18
    def parse_item(self, response):

        tags = response.xpath(
            '//*[@class="row-fluid crumbs"]//text()').extract()

        item = EnPItem()
        art_parser = Article(response.url, language='en', fetch_images=False)
        # a.download()
        art_parser.set_html(response.text)
        art_parser.parse()

        item["home"] = response.url
        item["title"] = art_parser.title

        item["content"] = art_parser.text
        item["authors"] = art_parser.authors
        try:
            item["publish_date"] = art_parser.publish_date.strftime(
                '%Y-%m-%d %H:%M:%S')
        except:
            pass
        item["images"] = list(art_parser.images)
        item["keywords"] = art_parser.keywords
        item["meta_keywords"] = art_parser.meta_keywords
        item["tags"] = tags  #list(art_parser.tags)

        print(item)
        save_mess("%s.txt" % self.name,
                  json.dumps(dict(item), ensure_ascii=False))
Example #19
def extract(results):
    try:
        config = Configuration()
        config.fetch_images = False

        req = urllib.request.Request(
            results["url"],
            headers={
                'User-Agent':
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])

        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])

        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])

        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

        print("=", end='', flush=True)
        return (results["url"], results["title"], text, article.publish_date)
    except Exception as e:
        print(e)
        return (results["url"], results["title"], None, None)
Example #20
	def scrap(self):
		url = self.get_url()
		print("retrieve page: {}".format(url))
		# check if we are allowed to crawl this page
		if not self.is_allowed():
			print("Retrieval is not allowed by robots.txt")
			return False

		# Get page content and headers
		try:
			response = requests.get(url, headers={'User-Agent': settings.USER_AGENT})
		except (requests.ConnectTimeout, requests.HTTPError, requests.ReadTimeout, requests.Timeout, requests.ConnectionError):
			return

		self.content_type = response.headers['content-type'] if 'content-type' in response.headers else ""  # usually "text/html"

		# don't store page content if it's not html
		self.raw_content = response.text
		if self.content_type.find("text/html") == -1:
			print("we don't process none html pages yet.")
			return False

		# store article title and content
		article = Article(url)
		article.set_html(self.raw_content)
		article.parse()
		self.article_title = article.title
		self.article_content = article.text
		self.article_top_image = article.top_image
		article.nlp()
		self.article_excerpt = article.summary
		self.article_keywords = article.keywords

		# parse html page
		soup = BeautifulSoup(self.raw_content, "html5lib")

		# Images
		for img in soup.findAll("img"):
			img_url = img.get('src', '')
			img_alt = img.get('alt', '')
			img_title = img.get('title', '')

			image_site_url, image_path = get_site_path(img_url)
			if self.site.site_url == image_site_url:
				img_site = self.site
			else:
				img_site = Site.objects.get_or_create(site_url=image_site_url)[0]
			# get image object
			image = Image.objects.get_or_create(path=image_path, site=img_site)[0]
			img_detail = ImageDetail.objects.get_or_create(image=image, page=self)[0]
			img_detail.title = img_title
			img_detail.alt = img_alt
			img_detail.save()

		# HTML Title
		self.page_title = soup.title.string
		self.save()

		return soup  # for crawling
Example #21
 def extract_with_newspaper(self, html):
     '''Parses HTML using Newspaper.'''
     article = Article(self.url)
     article.set_html(html)
     with catch_warnings():
         filterwarnings('ignore', category=DeprecationWarning)
         article.parse()
     return article.__dict__
def parse(response):
    print(type(response))
    # print(response.body)
    article = Article(url=response.url, language="es")
    article.set_html(response.body)
    article.parse()
    # article = self.articleProcessor.extractAll(response.body)
    print(article.title)
    print(article.publish_date)
    print(response.url)
Example #23
 def parse_article_page(response):
     article = Article(url=response.request.url)
     article.set_html(response.text)
     article.parse()
     if article.title and article.text:
         item = NewsArticle()
         item['title'] = article.title
         item['text'] = article.text
         yield item
def load_and_parse_full_article_text_and_image(url: str) -> Article:
    config = Config()
    config.MAX_SUMMARY_SENT = 8

    article = Article(url, config=config)
    article.set_html(load_page_safe(url))  # safer than article.download()
    article.parse()

    return article
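
A minimal usage sketch, assuming the surrounding module provides the load_page_safe helper that the function relies on:

article = load_and_parse_full_article_text_and_image("https://example.com/some-article")  # placeholder URL
print(article.title)
print(article.top_image)
print(article.text[:200])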
def handle(url, data, log):
    try:
        log.out(0, 'Downloading article...')
        resp = requests.get(url, headers={'User-Agent': data['user_agent']})
        if resp.status_code != 200:
            return False  #!cover

        config = Config()
        config.memoize_articles = False
        config.verbose = False
        article = Article(url='', config=config)
        log.out(0, 'Parsing article...')

        article.download()
        article.set_html(resp.text)
        article.parse()
        if article.top_image:
            src = article.top_image
            if 'http' not in src:  #!cover
                if 'https' in url:
                    src = 'https://' + src.lstrip('/ ').strip()
                else:
                    src = 'http://' + src.lstrip('/ ').strip()
            log.out(0, 'Newspaper located image: %s' % src)

            r = requests.get(src,
                             headers={'User-Agent': data['user_agent']},
                             stream=True)
            if r.status_code == 200:
                content_type = r.headers['content-type']
                ext = mimetypes.guess_extension(content_type)
                if not ext or ext == '':  #!cover
                    log.out(
                        1, 'NewsPaper Error locating file MIME Type: %s' % url)
                    return False
                if '.jp' in ext:
                    ext = '.jpg'  #!cover
                path = data['single_file'] % ext
                if not os.path.isfile(path):
                    if not os.path.isdir(data['parent_dir']):  #!cover
                        log.out(1, ("+Building dir: %s" % data['parent_dir']))
                        os.makedirs(
                            data['parent_dir']
                        )  # Parent dir for the full filepath is supplied already.
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                return path
            else:  #!cover
                log.out(
                    0, ('\t\tError Reading Image: %s responded with code %i!' %
                        (url, r.status_code)))
                return False
    except Exception as e:
        log.out(0, ('"Newspaper" Generic handler failed. ' + (str(e).strip())))
    return False  #!cover
Example #27
class ParserNewsPaper(Parser):
    _extractor = None

    def parse_news_text(self, page_html: str, url: str) -> dict:
        if self._extractor is None:
            self._extractor = Article("", language="en")
        self._extractor.set_html(page_html)
        self._extractor.parse()
        news_text = re.sub(r'\s+', r' ', self._extractor.text)
        return {'url': url, 'text': news_text}
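
A minimal usage sketch, assuming the Parser base class can be instantiated without arguments:

parser = ParserNewsPaper()
result = parser.parse_news_text(
    "<html><body><p>Some article text to normalize.</p></body></html>",
    "https://example.com/news")  # placeholder URL
print(result["text"])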
def get_body_from_html(url, html, cache=False):
    if cache:
        dc = DownloadCache(url)
        if not dc.is_cached():
            dc.cache(html)

    narticle = NArticle(url, fetch_images=False)
    narticle.set_html(html)
    narticle.parse()
    return narticle.text
def top_image_from_html(url, html):
    try:
        article = Article(url=url)
        article.set_html(html)
        article.parse()
        return article.top_image
    except Exception as e:
        logger.error("error reading article " + url)

    return {}
Example #30
def clean(html_content):
    config = Configuration()
    config.fetch_images = False

    # TODO: allow URL passing
    article = Article("http://example.com", config=config)
    article.set_html(html_content)
    article.is_downloaded = True
    article.parse()

    return article.text
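
A minimal usage sketch with an inline HTML string; any pre-downloaded page HTML would work the same way:

html_content = "<html><body><article><p>First paragraph.</p><p>Second paragraph.</p></article></body></html>"
print(clean(html_content))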
Example #31
def get_article(driver, url):
    driver.get(url)

    article = Article("")
    article.set_html(driver.page_source)
    article.parse()

    text = article.text
    text = re.sub(r"[\n ]+", " ", text, flags=re.M)

    return text
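
A minimal usage sketch, assuming a Selenium WebDriver is available to render the page:

from selenium import webdriver

driver = webdriver.Chrome()
try:
    text = get_article(driver, "https://example.com/some-article")  # placeholder URL
    print(text[:200])
finally:
    driver.quit()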
Example #32
def parse_article(url) -> Tuple[str, List[str]]:
    """Parse article using newspaper3k to get summary and keywords."""
    if not url:
        return "", []
    article = Article(url)
    html_content = load_page_safe(url)
    if not html_content:
        return "", []
    article.set_html(html_content)
    article.parse()
    article.nlp()
    return article.summary, list(set(article.keywords))
Example #33
def main(argv):
    try:
        r = redis.StrictRedis('localhost', 6379, 0)

        article = Article(url='', fetch_images=False, language='fr')
        article.set_html(r.get('scraped'))
        article.parse()

        print(json.dumps(article.text))

    except Exception:
        print(json.dumps(None))
Example #34
def read_nyt_article(htmltext):
    """

    uses the string of the new york times article which is passed to it to extract the important information

    :param htmltext: a string which contains the html of the new york times article

    :return:   returns a dict which stores the extracted result
    """
    soup = BeautifulSoup(htmltext, 'lxml')
    title = soup.html.head.title.text  # extracts the title
    ps = soup.body.find_all('p')
    i = 0

    article = Article('')  # so that you can use local files with newspaper3k
    article.set_html(htmltext)
    article.parse()
    authors = article.authors

    date = article.publish_date  # TODO: date not extracted here properly
    if date is not None:
        date = article.publish_date.strftime('%d/%m/%Y')

    # used to find where the article text starts - it always starts with an em dash ('—')
    while '—' not in ps[i].text:
        i += 1
    ps = ps[i:]

    # gets rid of useless sections
    ps = [i for i in ps if i.text != '']
    ps = [i for i in ps if i.text != 'Advertisement']
    ps = [i for i in ps if 'Now in print:' not in i.text]
    ps = [i for i in ps if 'And here\'s our email' not in i.text]
    ps = [i for i in ps if 'The Times is committed' not in i.text]
    ps = [i for i in ps if 'We\'d like to hear' not in i.text]
    ps = [i for i in ps if 'Follow The New York Times' not in i.text]
    ps = [i for i in ps if 'on Twitter: @' not in i.text]
    ps = [i for i in ps if 'on Twitter at' not in i.text]
    ps = [i for i in ps if 'contributed reporting' not in i.text]
    ps = [i for i in ps if 'contributed research' not in i.text]
    text = "\n ".join([" ".join(i.text.split()) for i in ps])

    result_dict = {
        'title': title,
        'authors': authors,
        'text': text,
        'date': date,
        'publisher': 'nytimes'
    }
    return result_dict
Example #35
    async def enrich(self, result):
        # none of the following lines will work if we couldn't make soup
        if not self.soup:
            return result

        sanitized = sanitize_html(self.response.body)
        if not sanitized:
            return result

        article = Article(self.url, config=FixedArticleConfig())
        article.config.fetch_images = False
        article.set_html(sanitized)
        article.parse()

        result.set('title', article.title, 2, 'textlength')
        if len(article.meta_description) > 0:
            result.set('subtitle', article.meta_description, 2, 'textlength')

        if len(article.article_html) > 0:
            sanitized = sanitize_html(article.article_html)
            result.set('content', sanitized, 0, 'textlength')
        elif article.top_node is not None:
            sanitized = sanitize_html(tostring(article.top_node))
            result.set('content', sanitized, 2)

        if article.authors:
            result.set('authors', article.authors, 2)
        if article.publish_date and len(str(article.publish_date)) > 0:
            result.set('published_at', article.publish_date, 2)
        result.add('keywords', list(article.keywords))
        result.add('keywords', list(article.tags))
        result.add('_candidate_images', list(article.imgs))
        # Primary image guess is actually pretty crappy
        if article.top_image:
            result.add('_candidate_images', [article.top_img])

        text = ""
        for paragraph in article.text.split("\n"):
            paragraph = paragraph.strip()
            # this is done to get rid of cases where a stray heading
            # like "Photographs" ends up as a paragraph
            if Summarizer.has_sentence(paragraph):
                text += " " + paragraph

        if len(text) > 0:
            result.set('_text', text, 2)

        return result
 def _parse_article(self, key, url):
     a = Article('')
     html = Google().cache(url)
     a.set_html(html)
     a.parse()
     a.nlp()
     article = {"summary":a.summary,
               "publish_date":a.publish_date,
               "images":a.images,
               "top_image":a.top_image,
               "title":a.title,
               "authors":a.authors,
               "keywords":a.keywords,
               "text":a.text}
     # update
     #conn = r.connect(db="clearspark")
     conn = r.connect(**rethink_conn.conn())
Example #37
def clean_source(url, source):
    """ Parse a pre-downloaded article using newspaper.

    Args:
        url (str): The url where the article was sourced (necessary for the
                newspaper API).

        source (str): Html source of the article page.

    Returns:
        Dictionary providing cleaned article and extracted content
        (see `construct_result`), or `None` if newspaper could not extract
        the article.
    """
    article = Article(url)
    article.set_html(source)
    article.parse()

    if article.top_node is None:
        return None

    return construct_result(article)
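
A minimal usage sketch, assuming construct_result is defined in the same module and the page HTML was fetched beforehand (here, hypothetically, with requests):

import requests

url = "https://example.com/some-article"  # placeholder URL
result = clean_source(url, requests.get(url).text)
if result is not None:
    print(result)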
Example #38
import sys, json
from newspaper import Article

htmlStr = ""

for line in sys.stdin:
    htmlStr = htmlStr + line

#obj = json.loads(jsonStr)
article = Article('')
article.set_html(htmlStr)
article.parse()
article.nlp()
ret = json.dumps(article.keywords)
print(ret)
Example #39
def extract_data(fname, loadp, savep):
    ######################
    # initialize process #
    ######################
    stream = GzipFile(loadp + fname)
    protocol = TBinaryProtocol.TBinaryProtocol(TTransport.TBufferedTransport(stream))

    data = {'data': []}
    count = 0

    ####################
    # begin extraction #
    ####################
    while True:
        page = WikiLinkItem()
        try:
            page.read(protocol)
            count += 1
        except:
            stream.close()
            break

        print '- processing FILE {0} ENTRY # {1}'.format(fname, count)
        print '\t $ URL: {0}'.format(page.url)

        #####################
        # initial filtering #
        #####################
        if page.url[:3] == 'ftp':
            print '\t\t ###### Ftp prefix detected (ignore) ###### \n'
            continue
        if page.url[len(page.url) - 4:] != 'html':
            print '\t\t ###### Non-html suffix detected (ignore) ###### \n'
            continue
        if page.content.dom == None:
            print '\t\t ###### Empty dom detected (ignore) ###### \n'
            continue

        #######################
        # secondary filtering #
        #######################
        entities = extract_entities(page.mentions)
        if len(entities) < 2:
            print '\t\t ###### Single entity found (discard) ###### \n'
            continue

        print '\t $ # Entities:', len(entities)

        #########################
        # alignment and parsing #
        #########################
        html = mark_dom(page.content.dom, entities)

        news = Article(page.url, language = 'en')
        try:
            news.set_html(html)
            news.parse()
        except:
            print '\t\t ###### Parsing failed (discard) ###### \n'
            continue

        ################
        # tokenization #
        ################
        text = None
        try:
            text = ftfy.fix_text(news.text)
            text = text.encode('ascii', 'ignore')
            text = seperate_delimiter(word_tokenize(text))
        except:
            print '\t\t ###### Tokenization failed (discard) ###### \n'
            continue

        #######################
        # save processed data #
        #######################
        print '\t $ Entry # {0} Saved \n'.format(count)
        data['data'].append({'text': text, 'dict': entities})

    #####################
    # save as json file #
    #####################
    print '****** {0}.json saved ******\n'.format(fname[:3])
    f = open(savep + '{0}.json'.format(fname[:3]), 'w')
    json.dump(data, f, indent = 4)
    f.close()
Example #40
 def prepare(self, response):
     article = Article(url=response.url)
     article.set_html(response.text)
     article.parse()
     return article
Example #41
def parser_nlp(fname, html):
  Ts = timeit.default_timer()
  raw_html = html
  # basic info
  fid = int(fname.split('_')[0].split('/')[1])
  pm = parse_machine()
  html = pm.fix_html(html)
  link_stats = pm.parse_links(html)
  link_factors = [t for t in list(set(" ".join(link_stats.keys()).lower().split())) if (len(t) > 3)]
  doc = db.articles(
    fid           = fid,
    html          = html,
    html_cnt      = len(html),
    link_stats    = link_stats,
    link_factors  = link_factors,
    rand          = random.random(),
    # extra
    lines         = raw_html.count('\n'),
    spaces        = raw_html.count(' '),
    tabs          = raw_html.count('\t'),
    braces        = raw_html.count('{'),
    brackets      = raw_html.count('['),
    quesmarks     = raw_html.count('?'),
    exclamarks    = raw_html.count('!'),
    words         = len(re.split(r'\s+', raw_html)),
  )
  # check empty
  if (doc.html is None) or (len(doc.html.replace(r'\s', '')) < 10):
    doc.empty = True
    return doc
  try:
  # if True:
    pd = Article('', fetch_images=False)
    pd.set_html(doc.html)
    pd.parse()
    pd.nlp()
  except Exception as e:
    print("-"*60)
    print("[parser_nlp %s]: %s" % (doc.fid, e)) 
    print(doc.html[:500])
    print("-"*60) 
    return doc #"%s: %s" % (e, doc.id)
  # select cleaned_text
  cleaned_text = " ".join(pd.text.lower().split())
  if (len(cleaned_text) < 140):
    soup = bs(doc.html)
    if soup.body: 
      cleaned_text = soup.body.text
    if (len(cleaned_text) < 140): 
      cleaned_text = soup.text
  cleaned_text = sanitize_txt(cleaned_text, lower=True)
  bow = nlp.nlp().txt2words(cleaned_text or '', False)
  # save results 
  try:
    opengraph = pd.meta_data.get('og', {}) if pd.meta_data else {}
    top_image = opengraph.get('image') or (pd.top_image if pd.top_image else None)
    if isinstance(top_image, dict): top_image = top_image.get('identifier')
    if isinstance(opengraph.get('locale'), dict): opengraph['locale'] = opengraph.get('locale').get('identifier')
    publish_date = pm.process_date(opengraph.get('updated_time') or pd.publish_date)
    # canonical_link & domain
    domain = canonical_link = str(opengraph.get('url') or pd.canonical_link)
    if '//' in domain: domain = domain.split('//')[1]
    if '?' in domain: domain = domain.split('?')[0]
    domain = '/'.join(domain.split('/')[0:1])
    # update
    # doc.update(
    doc = db.articles(
      fid               = doc.fid,
      html              = doc.html,
      link_stats        = doc.link_stats,
      link_factors      = doc.link_factors,
      rand              = doc.rand,
      html_cnt          = doc.html_cnt,
      #
      lines             = doc.lines,
      spaces            = doc.spaces,
      tabs              = doc.tabs,
      braces            = doc.braces,
      brackets          = doc.brackets,
      quesmarks         = doc.quesmarks,
      exclamarks        = doc.exclamarks,
      words             = doc.words,
      #
      title             = str(opengraph.get('title') or pd.title)[:500],
      # cleaned_text      = str(cleaned_text),
      bow               = bow,
      tags              = [t.lower() for t in pd.tags],
      # opengraph         = {sanitize_txt(k): sanitize_txt(v) for k,v in opengraph.items()},
      # summary           = str(pd.summary),
      keywords          = pd.keywords,
      top_image         = str(top_image),
      movies            = pd.movies,
      publish_date      = publish_date,
      meta_site_name    = str(opengraph.get('site_name')),
      meta_lang         = str(opengraph.get('locale') or pd.meta_lang),
      meta_description  = str(opengraph.get('description') or pd.meta_description),
      meta_keywords     = pd.meta_keywords,
      canonical_link    = canonical_link,
      domain            = domain,
      authors           = [n.lower().replace(' ', '_') for n in pd.authors],
    )
  except Exception as e:
    print("-"*60)
    print("[Error] while [%s] in parser_nlp: %s" % (doc.id, e))
    data = {
      "title"     : str(opengraph.get('title') or pd.title)[:500],
      "text"      : cleaned_text[:140],
      "tags"      : [t.lower() for t in pd.tags],
      "opengraph" : opengraph,
      "summary"   : str(pd.summary),
      "keywords"  : pd.keywords,
      "top_image" : str(top_image),
      "movies"    : pd.movies,
      "date"      : publish_date, #opengraph.get('updated_time') or pd.publish_date,
      "site_name" : str(opengraph.get('site_name')),
      "locale"    : str(opengraph.get('locale') or pd.meta_lang),
      "desc"      : str(opengraph.get('description') or pd.meta_description),
      "keywords"  : pd.meta_keywords,
      "url"       : canonical_link,
      "authors"   : pd.authors,
    }
    for k,v in data.items():
      print(k, v, v.__class__)
    print("-"*60)
  return doc
# -*- coding: utf-8 -*-
from newspaper import Article
from goose import Goose
import requests
import json
import sys

article = Article(sys.argv[1])

article.download()
if not article.html:
  r = requests.get(sys.argv[1], verify=False, headers={ 'User-Agent': 'Mozilla/5.0' })
  article.set_html(r.text)

article.parse()
article.nlp()

published = ''
if article.publish_date:
  published = article.publish_date.strftime("%Y-%m-%d %H:%M:%S")

# Get body with goose
g = Goose()
goose_article = g.extract(raw_html=article.html)
body = goose_article.cleaned_text
summary = goose_article.meta_description

# Maybe use https://github.com/xiaoxu193/PyTeaser
if not summary:
  summary = article.summary