def getArticle(self, url, rawHTML, language=None):
    """Extract an article from *rawHTML* with Goose.

    When *language* is given, the extractor is pinned to that language
    and the page's own <meta> language declaration is ignored.
    Image fetching is always disabled (text-only extraction).
    """
    config = Configuration()
    if language:
        config.targetLanguage = language
        # (sic) attribute name is misspelled in the Goose library itself
        config.useMetaLanguge = False
    config.enableImageFetching = False
    extractor = Goose(config=config)
    return extractor.extractContent(url=url, rawHTML=rawHTML)
def getArticle(self, url, rawHTML):
    """Extract an article from *rawHTML* with Goose, images disabled."""
    config = Configuration()
    config.enableImageFetching = False
    extractor = Goose(config=config)
    return extractor.extractContent(url=url, rawHTML=rawHTML)
import re import time import pymongo from goose.Goose import Goose from bs4 import BeautifulSoup from pymongo import MongoClient client = MongoClient('localhost',27017) db = client.news collection = db.it_news crawl_list = ['business-issues','consumer-electronics','telecommunication','internet','networking','peripherals','security','services','software','storage','virtualization','hardware-systems-0','government','business-issues','cousumer-electronics','legal','it-management'] page_max = 400 item_per_page = 10 go = Goose() url_base = 'http://www.itnews.com/' for tag1 in crawl_list: print "TAG:::::::" + tag1 for page in range(0,page_max): new_articles = [] if page == 0: url_page = '' else: url_page = '?page=' + str(page) url_full = url_base + tag1 + url_page print "URL::::::" + url_full time.sleep(1) res = requests.get(url_full)
def ext_goosepy(html):
    """Run Goose on raw HTML and return the cleaned article body text."""
    extractor = Goose()
    extractor.config.enableImageFetching = False  # text only; skip image download
    # The API requires a URL argument; extraction itself operates on rawHTML,
    # so a placeholder address is supplied.
    result = extractor.extractContent(url="http://www.example.com/test.html", rawHTML=html)
    return result.cleanedArticleText