Code Example #1
class ChangzhiserverNewsPipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        # no persistence in this pipeline; pass the item through unchanged
        return item

    def close_spider(self, spider):
        self.session.close()
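
Every pipeline in these examples opens a DBSession in open_spider and closes it in close_spider. DBSession (and the module-level engine used by the raw engine.execute calls further down) comes from the projects' model.config module, which is not shown; a minimal sketch of what it is typically bound to, assuming SQLAlchemy and a placeholder MySQL URL:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# placeholder connection string; the real one lives in model/config.py
engine = create_engine("mysql://user:password@localhost/mediacloud", echo=False)
DBSession = sessionmaker(bind=engine)
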
Code Example #2
File: pipelines.py Project: icexia/mgtv
class MysqlPipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        # now=datetime.utcnow()
        # print item['page_url']
        # print 333333333333333333
        # if spider.crawler.stats.get_value('item_scraped_count')>10:
        # return
        media = MediaInfo(
            title=self.list_format(item["title"], spider),
            page_url=item["page_url"],
            eName=self.list_format(item["eName"], spider),
            otherName=self.list_format(item["otherName"], spider),
            adaptor=self.list_format(item["adaptor"], spider),
            director=self.list_format(item["director"], spider),
            leader=self.list_format(item["leader"], spider),
            kind=self.list_format(item["kind"], spider),
            language=self.list_format(item["language"], spider),
            duration=self.list_format(item["duration"], spider),
            story=self.list_format(item["story"], spider),
            keyWord=self.list_format(item["keyWord"], spider),
            productPerson=self.list_format(item["productPerson"], spider),
            dubbing=self.list_format(item["dubbing"], spider),
            executiver=self.list_format(item["executiver"], spider),
            original=self.list_format(item["original"], spider),
            productColtd=self.list_format(item["productColtd"], spider),
            productionTime=self.list_format(item["productionTime"], spider),
            licence=self.list_format(item["licence"], spider),
            registration=self.list_format(item["registration"], spider),
            distributColtd=self.list_format(item["distributColtd"], spider),
            totalNumber=self.list_format(item["totalNumber"], spider),
            updateInfo=self.list_format(item["updateInfo"], spider),
            area=self.list_format(item["area"], spider),
            playTime=self.list_format(item["playTime"], spider),
            television=self.list_format(item["television"], spider),
            producer=self.list_format(item["producer"], spider),
            source=item["source"],
            # createTime=self.getNowTime()
        )
        if media.title:
            self.session.add(media)
            # self.session.commit()
            try:
                self.session.flush()  # flush so the generated primary key is populated
                media_id = media.id
                self.session.commit()

                # index the saved record into Elasticsearch
                json_data = json.dumps(media.to_dict(), ensure_ascii=False)  # serialize to JSON for ES
                import_to_es("mediacloud", "mc_mediainfo", media_id, json_data)
            except Exception as e:
                self.write_file_content(settings["LOG_FILE"], "---------- save failed, info: " + str(e), "a+")
                self.session.rollback()
        return item
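
import_to_es and MediaInfo.to_dict() are project helpers that are not shown here. Assuming the project uses the elasticsearch-py client (a 2.x-6.x style client that still accepts doc_type), a sketch of what import_to_es might reduce to, with the connection details as placeholders:

from elasticsearch import Elasticsearch

def import_to_es(index, doc_type, doc_id, json_data):
    # index the pre-serialized JSON document under the given id
    es = Elasticsearch(["http://localhost:9200"])
    es.index(index=index, doc_type=doc_type, id=doc_id, body=json_data)
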
Code Example #3
class ChangzhiserverPipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        a = ChangzhiServerSection(title=item['title'],
                                  url=item['url'],
                                  runningDate=datetime.now())
        self.session.add(a)
        self.session.commit()
        return item

    def close_spider(self, spider):
        self.session.close()
Code Example #4
class DataBasePipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        a = Article(title=item["title"].encode("utf-8"),
                    url=item["url"],
                    body=item["body"].encode("utf-8"),
                    publish_time=item["publish_time"].encode("utf-8"),
                    source_site=item["source_site"].encode("utf-8"))
        self.session.add(a)
        self.session.commit()
        return item

    def close_spider(self, spider):
        self.session.close()
Code Example #5
File: pipelines.py Project: 605258778/newsScrapy
class DataBasePipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        a = Article(title=item["title"].encode("utf-8"),
                    url=item["url"],
                    content=item["content"].encode("utf-8"),
                    publish_time=item["publish_time"].encode("utf-8"),
                    publish_user=item["publish_user"].encode("utf-8"),
                    folder_id=2)
        self.session.add(a)
        self.session.commit()
        return item

    def close_spider(self, spider):
        self.session.close()
Code Example #6
class DataBasePipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        a = Article(title=item["title"].encode("utf-8"),
                    url=item["url"],
                    body=item["body"].encode("utf-8"),
                    publish_time=item["publish_time"].encode("utf-8"),
                    source_site=item["source_site"].encode("utf-8"))
        self.session.add(a)
        self.session.commit()
        return item

    def close_spider(self, spider):
        self.session.close()
Code Example #7
class SeprojectPipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()
        sql2 = "DELETE FROM `RawData` WHERE `spider` = '" + spider.name + "'"
        engine.execute(sql2)

    def process_item(self, item, spider):
        if (isinstance(item, Article)):
            content1 = item["content"].encode("utf-8")
            f = Filter()
            text = f.filterResult(item["title"], item["content"])

            if text:
                # print "text:"+text[1]+text[2]+text[3]
                money1 = text[2]
                org1 = text[1]
                date1 = text[3]
                date2 = date(date1[0], date1[1], date1[2])
                a = RawData(title=text[0],
                            content=item["content"].encode("utf-8"),
                            spider=item["spider"].encode("utf-8"),
                            org=org1,
                            money=money1,
                            date=date2)
                self.session.add(a)
                self.session.commit()
            id1 = item["id"]
            name1 = item["spider"].encode("utf-8")
            title1 = item["title"].encode("utf-8")
            currentURL = item["currentURL"].encode("utf-8")
            sql1 = "INSERT INTO `status` (`crawler_id`, `current_title` ,`current_url`, `name`) VALUES (" + str(
                id1
            ) + ",'" + title1 + "','" + currentURL + "','" + name1 + "') ON DUPLICATE KEY UPDATE `current_title`='" + title1 + "',`current_url`='" + currentURL + "',`name`='" + name1 + "',`timestamp`=NOW()"
            engine.execute(sql1)
        else:
            currentPage = item["currentPage"].encode("utf-8")
            id = item["id"]
            name = item["name"].encode("utf-8")
            sql = "INSERT INTO `status` (`crawler_id`, `current_page`) VALUES (" + str(
                id
            ) + ",'" + currentPage + "') ON DUPLICATE KEY UPDATE `current_page`='" + currentPage + "'"
            engine.execute(sql)

    def close_spider(self, spider):
        self.session.close()
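
The status upsert above is built by string concatenation, which breaks on quotes in titles or URLs and is open to SQL injection. A minimal sketch of the same INSERT ... ON DUPLICATE KEY UPDATE with bound parameters, assuming SQLAlchemy's text() construct and the same status table:

from sqlalchemy import text

upsert = text(
    "INSERT INTO `status` (`crawler_id`, `current_title`, `current_url`, `name`) "
    "VALUES (:id, :title, :url, :name) "
    "ON DUPLICATE KEY UPDATE `current_title`=:title, `current_url`=:url, "
    "`name`=:name, `timestamp`=NOW()"
)
engine.execute(upsert, id=id1, title=title1, url=currentURL, name=name1)
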
Code Example #8
class SeprojectPipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()
        sql2 = "DELETE FROM `RawData` WHERE `spider` = '" + spider.name + "'"
        engine.execute(sql2)


    def process_item(self, item, spider):
        if(isinstance(item, Article)):
            content1 = item["content"].encode("utf-8")
            f = Filter()
            text = f.filterResult(item["title"], item["content"])

            if text:
                # print "text:"+text[1]+text[2]+text[3]
                money1 = text[2]
                org1 = text[1]
                date1 = text[3]
                date2 = date(date1[0], date1[1], date1[2])
                a = RawData(title=text[0],
                            content=item["content"].encode("utf-8"),
                            spider=item["spider"].encode("utf-8"),
                            org=org1,
                            money=money1,
                            date=date2)
                self.session.add(a)
                self.session.commit()
            id1 = item["id"]
            name1 = item["spider"].encode("utf-8")
            title1 = item["title"].encode("utf-8")
            currentURL = item["currentURL"].encode("utf-8")
            sql1 = "INSERT INTO `status` (`crawler_id`, `current_title` ,`current_url`, `name`) VALUES (" + str(id1) + ",'" + title1 + "','" + currentURL +"','" + name1 +"') ON DUPLICATE KEY UPDATE `current_title`='" + title1 + "',`current_url`='" + currentURL +"',`name`='" + name1 + "',`timestamp`=NOW()"
            engine.execute(sql1)
        else:
            currentPage = item["currentPage"].encode("utf-8")
            id = item["id"]
            name = item["name"].encode("utf-8")
            sql = "INSERT INTO `status` (`crawler_id`, `current_page`) VALUES (" + str(id) + ",'" + currentPage + "') ON DUPLICATE KEY UPDATE `current_page`='" + currentPage + "'"
            engine.execute(sql)

    def close_spider(self, spider):
        self.session.close()
Code Example #9
File: pipelines.py Project: jiler/getdata
class DataBasePipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        a = Article(title=item["title"].encode("utf-8"),
                    url=item["url"],
                    body=item["body"].encode("utf-8"),
                    publish_time=item["publish_time"],
                    source_site=item["source_site"].encode("utf-8"),
                    spider_from=1,
                    crawler_time=time.strftime("%Y-%m-%d %H:%M:%S"),
                    project_id=item["project_id"],
                    site_property_id=item["site_property_id"])
        self.session.add(a)
        self.session.commit()
        #print 'process_item----',a.project_id,'--------',a.title
        return item

    def close_spider(self, spider):
        self.session.close()
Code Example #10
settings.set(
    "USER_AGENT",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
)
settings.set(
    "ITEM_PIPELINES",
    {
        # 'pipelines.DuplicatesPipeline': 200,
        # 'pipelines.CountDropPipline': 100,
        'pipelines.SeprojectPipeline': 300
    })
# settings.set("LOG_LEVEL","INFO")

# process = CrawlerProcess(settings)

db = DBSession()
rules = db.query(Rule).filter(Rule.enable == 1)

for rule in rules:
    crawler = Crawler(settings)
    spider = BeijingSpider(rule)  # instantiate every spider using rule
    RUNNING_CRAWLERS.append(spider)

    # stop reactor when spider closes
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

# blocks process so always keep as the last statement
reactor.run()
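
This launcher uses the pre-1.0 Crawler/configure() API and calls crawler.start() inside the loop before blocking on the reactor. On current Scrapy the same multi-spider run is usually expressed with CrawlerRunner (or CrawlerProcess, as in the later examples); a minimal sketch, assuming the same Rule model and BeijingSpider:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

runner = CrawlerRunner(settings)
db = DBSession()
for rule in db.query(Rule).filter(Rule.enable == 1):
    runner.crawl(BeijingSpider, rule)   # schedule one crawl per enabled rule

d = runner.join()                       # fires when every scheduled crawl has finished
d.addBoth(lambda _: reactor.stop())     # then stop the reactor
reactor.run()                           # blocks until all crawls are done
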
Code Example #11
 def open_spider(self, spider):
     self.session = DBSession()
     sql2 = "DELETE FROM `RawData` WHERE `spider` = '" + spider.name + "'"
     engine.execute(sql2)
Code Example #12
File: pipelines.py Project: icexia/mgtv
 def open_spider(self, spider):
     self.session = DBSession()
Code Example #13
File: run.py Project: jiler/getdata
time_now = time.strftime("%Y-%m-%d %H:%M:%S")
print 'current time:', time_now


# logging setup
#log.start(loglevel=log.DEBUG)

settings = Settings()
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES" , {
    'wuchong.pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'wuchong.pipelines.DataBasePipeline': 300
})
dbp = DBSession()
nump = dbp.query(Project).filter(Project.status == 1).distinct().count()
if nump == 0:
    print 'No runnable projects at the moment'
    dbp.commit()
    dbp.close()
    exit()

dbr = DBSession()
numr = dbr.query(Rules).filter(Rules.enable == 1).distinct().count()
if numr == 0:
    print 'No runnable rule sites at the moment'
    dbr.commit()
    dbr.close()
    exit()
Code Example #14
File: pipelines.py Project: icexia/iqiyi
# despite its name, this pipeline writes MediaInfo rows to MySQL through a SQLAlchemy session
class MongoDBPipeline(object):
	# def __init__(self,dbpool):
	# 	self.dbpool=dbpool

	# @classmethod
	# def from_settings(cls, settings):
	# 	dbargs = dict(
	# 		host=settings['MYSQL_HOST'],
	# 		db=settings['MYSQL_DBNAME'],
	# 		user=settings['MYSQL_USER'],
	# 		passwd=settings['MYSQL_PWD'],
	# 		charset='utf8',
	# 		cursorclass = MySQLdb.cursors.DictCursor,
	# 		use_unicode= True,
	# 	)
	# 	dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
	# 	return cls(dbpool)
	def open_spider(self,spider):
		self.session=DBSession()

	def process_item(self,item,spider):
		now=datetime.utcnow()
		media=MediaInfo(
			self.list_format(item['title']),
			self.list_format(item['eName']),
			self.list_format(item['otherName']),
			self.list_format(item['adaptor']),
			self.list_format(item['director']),
			self.list_format(item['leader']),
			self.list_format(item['kind']),
			self.list_format(item['language']),
			self.list_format(item['duration']),
			self.list_format(item['story']),
			self.list_format(item['keyWord']),
			self.list_format(item['productPerson']),
			self.list_format(item['dubbing']),
			self.list_format(item['executiver']),
			self.list_format(item['original']),
			self.list_format(item['productColtd']),
			self.list_format(item['productionTime']),
			self.list_format(item['licence']),
			self.list_format(item['registration']),
			self.list_format(item['distributColtd']),
			'爱奇艺',
			now
			)
		self.session.add(media)
		self.session.commit()

	def close_spider(self,spider):
		self.session.close()

	def _mediainfo_insert(self,conn,item,spider):
		now=datetime.utcnow()
		if item['title']!='':
			#//conn.execute("insert into MC_MediaInfo(title,leader,kind,source) values('%s,'%s','%s',%s)",item['title'],self.list_format(item['leader']),self.list_format(item['kind']),'爱奇艺')
			# print "insert into MC_MediaInfo(title,eName,otherName,adaptor,director,leader,kind,language,duration,story,keyWord,productPerson,dubbing,executiver,original,productColtd,productionTime,licence,registration,distributColtd,source,createTime)\
			# 	 values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" %(
			# 		self.list_format(item['title']),
			# 		self.list_format(item['eName']),
			# 		self.list_format(item['otherName']),
			# 		self.list_format(item['adaptor']),
			# 		self.list_format(item['director']),
			# 		self.list_format(item['leader']),
			# 		self.list_format(item['kind']),
			# 		self.list_format(item['language']),
			# 		self.list_format(item['duration']),
			# 		self.list_format(item['story']),
			# 		self.list_format(item['keyWord']),
			# 		self.list_format(item['productPerson']),
			# 		self.list_format(item['dubbing']),
			# 		self.list_format(item['executiver']),
			# 		self.list_format(item['original']),
			# 		self.list_format(item['productColtd']),
			# 		self.list_format(item['productionTime']),
			# 		self.list_format(item['licence']),
			# 		self.list_format(item['registration']),
			# 		self.list_format(item['distributColtd']),
			# 		'爱奇艺',
			# 		now)
			conn.execute(\
				"insert into MC_MediaInfo(title,eName,otherName,adaptor,director,leader,kind,language,duration,story,keyWord,productPerson,dubbing,executiver,original,productColtd,productionTime,licence,registration,distributColtd,source,createTime)\
				 values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(
					self.list_format(item['title']),
					self.list_format(item['eName']),
					self.list_format(item['otherName']),
					self.list_format(item['adaptor']),
					self.list_format(item['director']),
					self.list_format(item['leader']),
					self.list_format(item['kind']),
					self.list_format(item['language']),
					self.list_format(item['duration']),
					self.list_format(item['story']),
					self.list_format(item['keyWord']),
					self.list_format(item['productPerson']),
					self.list_format(item['dubbing']),
					self.list_format(item['executiver']),
					self.list_format(item['original']),
					self.list_format(item['productColtd']),
					self.list_format(item['productionTime']),
					self.list_format(item['licence']),
					self.list_format(item['registration']),
					self.list_format(item['distributColtd']),
					'爱奇艺',
					now))

	def handle_error(self,e):
		log.err(e)

	def list_format(self,input):
		return ','.join(input)
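
The commented-out constructor and from_settings above hint at a Twisted adbapi connection pool that was never wired up to the _mediainfo_insert and handle_error methods that do exist. A sketch of how those pieces would fit together, reusing both methods unchanged and assuming the same settings keys (MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PWD); the class name here is hypothetical:

import MySQLdb.cursors
from twisted.enterprise import adbapi

class AsyncMediaInfoPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(adbapi.ConnectionPool('MySQLdb', **dbargs))

    def process_item(self, item, spider):
        # run the blocking insert on the pool's worker thread; log failures via handle_error
        d = self.dbpool.runInteraction(self._mediainfo_insert, item, spider)
        d.addErrback(self.handle_error)
        return item

    # _mediainfo_insert, handle_error and list_format would be carried over
    # verbatim from the MongoDBPipeline shown above.
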
Code Example #15
 def start_requests(self):
     db = DBSession()
     sections = db.query(ChangzhiServerSection).all()
     for section in sections:
         yield Request(section.url, self.parse, meta={'section_id': section.id})
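
The meta dict set here survives into the callback, so the matching parse method (not shown) can read the section id back from response.meta; a minimal sketch under that assumption:

    def parse(self, response):
        # recover the section id attached in start_requests
        section_id = response.meta['section_id']
        self.log("parsing section %s: %s" % (section_id, response.url))
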
Code Example #16
File: run.py Project: icexia/mgtv
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings()

# crawl settings
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES" , {
	# 'pipelines.DuplicatesPipeline': 200,
	# 'pipelines.CountDropPipline': 100,
	'pipelines.MysqlPipeline': 300,
})

process = CrawlerProcess(settings)

db = DBSession()
rules = db.query(Rule).filter(Rule.status == 1)
for rule in rules:
	if rule.allowed_domains=='v.baidu.com':
		process.crawl(BaiduSpider,rule)
	elif rule.allowed_domains == 'iqiyi.com':
		process.crawl(IqiyiSpider,rule)
	elif rule.allowed_domains == 'youku.com':
		process.crawl(YoukuSpider,rule)
	elif rule.allowed_domains == 'douban.com':
		process.crawl(DoubanSpider,rule)
	else:
		process.crawl(MgtvSpider,rule)
process.start()

Code Example #17
 def open_spider(self, spider):
     self.session = DBSession()
     sql2 = "DELETE FROM `RawData` WHERE `spider` = '" + spider.name + "'"
     engine.execute(sql2)
Code Example #18
 def open_spider(self, spider):
     self.session = DBSession()
Code Example #19
# -*- coding: utf-8 -*-
from spiders.deep_spider import DeepSpider
from model.config import DBSession
from model.rule import Rule
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings()

# crawl settings
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES" , {
    'pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'pipelines.DataBasePipeline': 300,
})

process = CrawlerProcess(settings)

db = DBSession()
rules = db.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    process.crawl(DeepSpider,rule)
process.start()
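
The DuplicatesPipeline enabled here at priority 200 is not shown in any of the examples above. A minimal sketch of what such a pipeline typically looks like, following the dedup pattern from the Scrapy documentation; keying on item["url"] is an assumption:

from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    def __init__(self):
        self.seen_urls = set()

    def process_item(self, item, spider):
        # drop any item whose URL was already seen during this run
        if item["url"] in self.seen_urls:
            raise DropItem("Duplicate item found: %s" % item["url"])
        self.seen_urls.add(item["url"])
        return item
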