Example #1
0
def clean(html_content):
    """Return the cleaned article text extracted from ``html_content``.

    A fresh ``Configuration`` is created with image fetching switched off,
    a ``Goose`` extractor is built from it, and the ``cleaned_text`` of the
    extracted article is returned.
    """
    settings = Configuration()
    settings.enable_image_fetching = False
    return Goose(config=settings).extract(raw_html=html_content).cleaned_text
Example #2
0
 def getArticle(self, url, raw_html, language=None):
     """Extract an article from ``raw_html`` with image fetching disabled.

     When ``language`` is truthy it is stored as the configuration's
     ``target_language`` and ``use_meta_language`` is switched off.
     ``url`` is accepted but is not passed on to the extractor.
     """
     settings = Configuration()
     settings.enable_image_fetching = False
     if language:
         settings.target_language = language
         settings.use_meta_language = False
     return Goose(config=settings).extract(raw_html=raw_html)
Example #3
0
 def getConfig(self):
     """Return a new ``Configuration`` whose image fetching is disabled."""
     settings = Configuration()
     settings.enable_image_fetching = False
     return settings
Example #4
0
 def getArticle(self, url, raw_html, language=None):
     """Extract an article for ``url`` from ``raw_html``, without images.

     Both ``url`` and ``raw_html`` are forwarded to the extractor.
     ``language`` is accepted but is not used by this implementation.
     """
     settings = Configuration()
     settings.enable_image_fetching = False
     extractor = Goose(config=settings)
     return extractor.extract(url=url, raw_html=raw_html)
Example #5
0
 def getConfig(self):
     """Return a new ``Configuration`` whose image fetching is enabled."""
     settings = Configuration()
     settings.enable_image_fetching = True
     return settings
Example #6
0
import os,sys
import time
import subprocess
import signal
from httplib import IncompleteRead
from gevent.pool import Pool
import gevent.socket as socket
from gevent.event import Event
from goose import Goose
from goose.configuration import Configuration
from goose.text import StopWordsChinese
import chardet
import random

# Module-wide Goose configuration used to build the extractor instance below.
goose_config = Configuration()
# Turn off Goose's image fetching.
goose_config.enable_image_fetching = False
# Present a desktop Safari (Mac OS X 10.7.2) user-agent string.
goose_config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7"
#goose_config.parser_class = 'soup'
# Use the Chinese stop-word class imported from goose.text.
goose_config.stopwords_class = StopWordsChinese

# Extractor instance created once, at import time, from the configuration above.
g = Goose(config=goose_config)

# Filesystem path; how it is used is not visible in this part of the file.
url_file = '/data/algorithm/urlcontent'

# (host, port) pair -- presumably the endpoint of the main process that the
# workers communicate with; TODO confirm against the socket code.
address = ('192.168.32.5', 10888)

class Worker(object):
    '''
    子进程运行的代码,通过起一个协程来和主进程通信
    包括接受任务分配请求,退出信号(零字节包),及反馈任务执行进度
    然后主协程等待停止信号并中止进程(stop_event用于协程间同步)。