Example #1
"""
__date__ = '2018/5/29'

__QQ__ = '376205871'

"""
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from corpus_health.items import CorpusHealthItem
from scrapy_redis.spiders import RedisCrawlSpider
import urllib.parse
from math import floor
import re

from corpus_health.Util.LogHandler import LogHandler
logger = LogHandler(__name__, stream=True)


class Ask120Spider(RedisCrawlSpider):
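    # Pass 404/403/500 responses through to the spider callbacks instead of
    # letting Scrapy's default handling filter them out.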
    handle_httpstatus_list = [404, 403, 500]
    name = 'ask120'
    allowed_domains = ['120ask.com']
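    # As a RedisCrawlSpider, this spider pulls its start URLs from a Redis
    # queue (by default the '<name>:start_urls' key), which is why the
    # hard-coded start_urls below are kept only as commented-out reference.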
    # start_urls = [
    #     'http://www.120ask.com/list/gaoxueya/',
    #     'http://www.120ask.com/list/gaoxueya/all/2/',
    #     'http://www.120ask.com/list/tangniaobing/',
    #     'http://www.120ask.com/list/guanxinbing/',
    #     'http://www.120ask.com/list/ganmao/',
    #     'http://www.120ask.com/list/jingzhuibing/',
    #     'https://www.120ask.com/list/zhifanggan/',
    #     'http://www.120ask.com/list/tongfeng/',
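
Since the spider above is a RedisCrawlSpider, a crawl is normally started by pushing seed URLs into its Redis queue rather than by listing them in start_urls. A minimal seeding sketch, assuming a local Redis instance and the default scrapy_redis key pattern '<spider name>:start_urls' (the key name and URL here are illustrative):

import redis

# Hypothetical seeding script: push one list page onto the queue that
# scrapy_redis polls for the 'ask120' spider.
r = redis.StrictRedis(host='localhost', port=6379, db=0)
r.lpush('ask120:start_urls', 'http://www.120ask.com/list/gaoxueya/')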
Example #2
import pymysql
import pymongo
import os
import urllib
from datetime import datetime
import re
from urllib import parse
from scrapy.exceptions import DropItem
import pandas as pd
from corpus_health import settings
from corpus_health.items import CorpusHealthItem
from corpus_health.items import MedicineItem
from corpus_health.items import NewsItem

from corpus_health.Util.LogHandler import LogHandler
logger = LogHandler(__name__, stream=False)


class CorpusHealthPipeline(object):
    def __init__(self):
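        # Open a MySQL connection using the credentials defined in the
        # project's settings module.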
        self.connect = pymysql.connect(host=settings.MYSQL_HOST,
                                       db=settings.MYSQL_DBNAME,
                                       user=settings.MYSQL_USER,
                                       passwd=settings.MYSQL_PASSWD,
                                       charset='utf8',
                                       use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
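        # Record the processing time, then dispatch on the concrete item class below.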
        cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if item.__class__ == CorpusHealthItem: