def clean(html_content):
    """Return the main article text extracted from an HTML string.

    A fresh Goose extractor is built for every call, with image fetching
    disabled on its configuration.

    Args:
        html_content: Raw HTML markup of the page to clean.

    Returns:
        The ``cleaned_text`` of the article Goose extracts, or an empty
        string when ``html_content`` is empty or ``None``.
    """
    # Nothing to extract from. Bail out before handing a falsy ``raw_html``
    # to Goose.
    # NOTE(review): python-goose appears to treat a falsy ``raw_html`` as
    # "not supplied" and then tries to crawl a URL that was never given --
    # confirm against the installed Goose version.
    if not html_content:
        return ""

    config = Configuration()
    config.enable_image_fetching = False
    extractor = Goose(config=config)
    article = extractor.extract(raw_html=html_content)
    return article.cleaned_text
def getArticle(self, url, raw_html, language=None):
    """Run Goose over ``raw_html`` and return the extracted article.

    Image fetching is always disabled. ``url`` is accepted but is not
    forwarded to the extractor by this variant.

    Args:
        url: Unused here.
        raw_html: HTML markup handed to ``Goose.extract``.
        language: Optional value; when truthy it is stored as the
            configuration's ``target_language``.

    Returns:
        The article object produced by ``Goose.extract``.
    """
    settings = Configuration()
    settings.enable_image_fetching = False
    if language:
        settings.target_language = language
        # NOTE(review): the original line was collapsed; this override is
        # assumed to belong to the ``if language`` branch -- confirm.
        settings.use_meta_language = False
    return Goose(config=settings).extract(raw_html=raw_html)
def getConfig(self):
    """Return a new Goose ``Configuration`` with image fetching disabled."""
    settings = Configuration()
    settings.enable_image_fetching = False
    return settings
def getArticle(self, url, raw_html, language=None):
    """Run Goose over ``raw_html`` for ``url`` and return the article.

    Image fetching is disabled on the configuration. ``language`` is
    accepted but not used by this variant.

    Args:
        url: Passed to ``Goose.extract`` as ``url``.
        raw_html: Passed to ``Goose.extract`` as ``raw_html``.
        language: Unused here.

    Returns:
        The article object produced by ``Goose.extract``.
    """
    settings = Configuration()
    settings.enable_image_fetching = False
    return Goose(config=settings).extract(url=url, raw_html=raw_html)
def getConfig(self):
    """Return a new Goose ``Configuration`` with image fetching enabled."""
    settings = Configuration()
    settings.enable_image_fetching = True
    return settings
import os,sys import time import subprocess import signal from httplib import IncompleteRead from gevent.pool import Pool import gevent.socket as socket from gevent.event import Event from goose import Goose from goose.configuration import Configuration from goose.text import StopWordsChinese import chardet import random goose_config = Configuration() goose_config.enable_image_fetching = False goose_config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7" #goose_config.parser_class = 'soup' goose_config.stopwords_class = StopWordsChinese g = Goose(config=goose_config) url_file = '/data/algorithm/urlcontent' address = ('192.168.32.5', 10888) class Worker(object): ''' 子进程运行的代码,通过起一个协程来和主进程通信 包括接受任务分配请求,退出信号(零字节包),及反馈任务执行进度 然后主协程等待停止信号并中止进程(stop_event用于协程间同步)。