Example no. 1
File: main.py Project: baixl/baigit
# imports assumed above this fragment: json, pickle, time,
# kafka's KafkaConsumer / KafkaProducer, and the project-local extractHtml helper
    content_extractor = pickle.load(f)  # f: open handle to the pickled content extractor (the open() call is cut off above)

# Kafka: the consumer reads raw crawl records from topic "test6";
# the producer targets the same brokers (its use is not shown in this fragment)
consumer = KafkaConsumer("test6", group_id="group",
                         bootstrap_servers=['172.16.129.43:9092'])
producer = KafkaProducer(bootstrap_servers=['172.16.129.43:9092'])
print "start sk parser!"
for message in consumer:
    if message is not None:
        try:
            jsonValue = json.loads(message.value)
            # extract the main body text
            content = content_extractor.analyze(jsonValue["html"])
            for useParser in ["lxml", "html5lib", "html.parser"]:
                # parse the title, publish date, body paragraphs, etc.
                parseHtml = extractHtml(jsonValue["html"], content, useParser)
                parseTitle = parseHtml.title()
                parsePublishDate = parseHtml.publishDate()
                parseContent = parseHtml.mainContent()
            if len(parseContent[0]) == 0:
                parseContent[0] = content
            if parsePublishDate is None or len(parsePublishDate) == 0 or parsePublishDate == " ":
                # if no publish date could be parsed, fall back to the crawl time
                parsePublishDate = jsonValue["crawletime"]
            if len(parseContent[0]) > 0:
                dictData = {
                    "type": "none",
                    "url": jsonValue["url"],
                    "keywords": " ",
                    "description": " ",
                    "title": parseTitle,
Example no. 2
# test: newspaper-based parsing
print "start newspaper parser!"


for message in consumer:
    if message is not None:
        print "xxx"
        try:
            jsonValue = json.loads(message.value)
            html = jsonValue["html"]
            contentWithOutTag = fulltext(html, language="zh")  # newspaper: strip markup and extract the article body
            for useParser in ["lxml"]:
                # feed the tag-free body text back into the HTML parser
                parseHtml = extractHtml(html, contentWithOutTag, useParser)
                parseTitle = parseHtml.title()
                parsePublishDate = parseHtml.publishDate()
                parseContent = parseHtml.mainContent()
                if len(parseContent[0]) == 0:
                    parseContent[0] = contentWithOutTag
                if parsePublishDate is None or len(parsePublishDate) == 0 or parsePublishDate == " ":
                    # if no publish date could be parsed, fall back to the current time
                    parsePublishDate = time.asctime(time.localtime(time.time()))
               
                print "\n-----------------------------------------------------------------------------\n"
                print "url:\t", jsonValue["url"]
                print "标题:\t", parseTitle
                print "正文:\t", parseContent[0]
                print "发布时间:\t", parsePublishDate
                print "\n-----------------------------------------------------------------------------\n"