class MysqlPipeline(object):
    """Persist MediaInfo records to MySQL and mirror each saved row into ES.

    Relies on helpers provided elsewhere in the project:
    ``list_format`` (flattens list-valued item fields) and
    ``write_file_content`` (appends a line to the log file).
    """

    def open_spider(self, spider):
        # One DB session for the whole crawl.
        self.session = DBSession()

    def process_item(self, item, spider):
        """Build a MediaInfo row from the item, save it, then index it in ES."""
        media = MediaInfo(
            title=self.list_format(item["title"], spider),
            page_url=item["page_url"],
            eName=self.list_format(item["eName"], spider),
            otherName=self.list_format(item["otherName"], spider),
            adaptor=self.list_format(item["adaptor"], spider),
            director=self.list_format(item["director"], spider),
            leader=self.list_format(item["leader"], spider),
            kind=self.list_format(item["kind"], spider),
            language=self.list_format(item["language"], spider),
            duration=self.list_format(item["duration"], spider),
            story=self.list_format(item["story"], spider),
            keyWord=self.list_format(item["keyWord"], spider),
            productPerson=self.list_format(item["productPerson"], spider),
            dubbing=self.list_format(item["dubbing"], spider),
            executiver=self.list_format(item["executiver"], spider),
            original=self.list_format(item["original"], spider),
            productColtd=self.list_format(item["productColtd"], spider),
            productionTime=self.list_format(item["productionTime"], spider),
            licence=self.list_format(item["licence"], spider),
            registration=self.list_format(item["registration"], spider),
            distributColtd=self.list_format(item["distributColtd"], spider),
            totalNumber=self.list_format(item["totalNumber"], spider),
            updateInfo=self.list_format(item["updateInfo"], spider),
            area=self.list_format(item["area"], spider),
            playTime=self.list_format(item["playTime"], spider),
            television=self.list_format(item["television"], spider),
            producer=self.list_format(item["producer"], spider),
            source=item["source"],
        )
        # Items without a title carry no usable record: skip them.
        if media.title:
            self.session.add(media)
            try:
                # flush() assigns the auto-increment primary key before commit.
                self.session.flush()
                media_id = media.id
                self.session.commit()
                # Mirror the committed row into Elasticsearch as JSON.
                json_data = json.dumps(media.to_dict(), ensure_ascii=False)
                import_to_es("mediacloud", "mc_mediainfo", media_id, json_data)
            except Exception as e:
                # Log the failure and roll back so the session stays usable.
                self.write_file_content(settings["LOG_FILE"], "----------保存失败,info" + str(e), "a+")
                self.session.rollback()
        # Scrapy contract: hand the item on to any later pipeline stage.
        return item

    def close_spider(self, spider):
        # Release the DB session when the crawl ends (was missing: session leak).
        self.session.close()
class ChangzhiserverPipeline(object):
    """Persist one ChangzhiServerSection row per scraped item."""

    def open_spider(self, spider):
        # One DB session per spider run.
        self.session = DBSession()

    def process_item(self, item, spider):
        """Insert the item as a ChangzhiServerSection stamped with the crawl time."""
        section = ChangzhiServerSection(
            title=item['title'],
            url=item['url'],
            runningDate=datetime.now(),
        )
        self.session.add(section)
        self.session.commit()
        # Scrapy contract: process_item must return the item (or raise
        # DropItem) so later pipeline stages still receive it.
        return item

    def close_spider(self, spider):
        self.session.close()
class DataBasePipeline(object):
    """Save each scraped article (title/url/body/publish info) to the DB."""

    def open_spider(self, spider):
        # One DB session per spider run.
        self.session = DBSession()

    def process_item(self, item, spider):
        """Persist the item as an Article row and pass the item on."""
        article = Article(
            title=item["title"].encode("utf-8"),
            url=item["url"],
            body=item["body"].encode("utf-8"),
            publish_time=item["publish_time"].encode("utf-8"),
            source_site=item["source_site"].encode("utf-8"),
        )
        self.session.add(article)
        self.session.commit()
        # Scrapy contract: process_item must return the item (or raise
        # DropItem) so later pipeline stages still receive it.
        return item

    def close_spider(self, spider):
        self.session.close()
class DataBasePipeline(object):
    """Save each scraped article (title/content/publisher) into one folder."""

    def open_spider(self, spider):
        # One DB session per spider run.
        self.session = DBSession()

    def process_item(self, item, spider):
        """Persist the item as an Article row and pass the item on."""
        article = Article(
            title=item["title"].encode("utf-8"),
            url=item["url"],
            content=item["content"].encode("utf-8"),
            publish_time=item["publish_time"].encode("utf-8"),
            publish_user=item["publish_user"].encode("utf-8"),
            # NOTE(review): magic constant — presumably the target folder for
            # this spider's articles; confirm against the folder table.
            folder_id=2,
        )
        self.session.add(article)
        self.session.commit()
        # Scrapy contract: process_item must return the item (or raise
        # DropItem) so later pipeline stages still receive it.
        return item

    def close_spider(self, spider):
        self.session.close()
class DataBasePipeline(object):
    """Save each scraped article (title/url/body/publish info) to the DB."""

    def open_spider(self, spider):
        # One DB session per spider run.
        self.session = DBSession()

    def process_item(self, item, spider):
        """Persist the item as an Article row and pass the item on.

        The leftover debug print ('ccccccccccc') was removed.
        """
        article = Article(
            title=item["title"].encode("utf-8"),
            url=item["url"],
            body=item["body"].encode("utf-8"),
            publish_time=item["publish_time"].encode("utf-8"),
            source_site=item["source_site"].encode("utf-8"),
        )
        self.session.add(article)
        self.session.commit()
        # Scrapy contract: process_item must return the item (or raise
        # DropItem) so later pipeline stages still receive it.
        return item

    def close_spider(self, spider):
        self.session.close()
class SeprojectPipeline(object):
    """Filter scraped Articles into RawData rows and record crawler status.

    All SQL now uses DB-API parameter binding (%s placeholders, the same
    style as the raw inserts elsewhere in this file) instead of string
    concatenation, closing the SQL-injection / quote-breakage hole.
    """

    def open_spider(self, spider):
        self.session = DBSession()
        # Drop this spider's results from the previous run.
        engine.execute("DELETE FROM `RawData` WHERE `spider` = %s", (spider.name,))

    def process_item(self, item, spider):
        """Store filter hits as RawData and update the `status` heartbeat."""
        if isinstance(item, Article):
            # Run the extraction filter; only matching articles are stored.
            f = Filter()
            text = f.filterResult(item["title"], item["content"])
            if text:
                org1 = text[1]
                money1 = text[2]
                date1 = text[3]
                row = RawData(title=text[0],
                              content=item["content"].encode("utf-8"),
                              spider=item["spider"].encode("utf-8"),
                              org=org1,
                              money=money1,
                              date=date(date1[0], date1[1], date1[2]))
                self.session.add(row)
                self.session.commit()
            # Heartbeat: record which article the crawler is currently on.
            # NOTE(review): the original collapsed formatting is ambiguous
            # about whether this ran only on filter hits; updating per item
            # matches the `timestamp`=NOW() heartbeat intent — confirm
            # against whatever reads the `status` table.
            engine.execute(
                "INSERT INTO `status` (`crawler_id`, `current_title`, `current_url`, `name`)"
                " VALUES (%s, %s, %s, %s)"
                " ON DUPLICATE KEY UPDATE `current_title`=VALUES(`current_title`),"
                " `current_url`=VALUES(`current_url`), `name`=VALUES(`name`), `timestamp`=NOW()",
                (item["id"],
                 item["title"].encode("utf-8"),
                 item["currentURL"].encode("utf-8"),
                 item["spider"].encode("utf-8")))
        else:
            # Non-article items carry paging progress for the status table.
            engine.execute(
                "INSERT INTO `status` (`crawler_id`, `current_page`) VALUES (%s, %s)"
                " ON DUPLICATE KEY UPDATE `current_page`=VALUES(`current_page`)",
                (item["id"], item["currentPage"].encode("utf-8")))
        # Scrapy contract: hand the item on to any later pipeline stage.
        return item

    def close_spider(self, spider):
        self.session.close()
class SeprojectPipeline(object):
    """Filter scraped Articles into RawData rows and track crawler status.

    Duplicate of the pipeline above; kept in sync. SQL uses DB-API
    parameter binding (%s placeholders) instead of string concatenation,
    which removes the SQL-injection / quote-breakage risk.
    """

    def open_spider(self, spider):
        self.session = DBSession()
        # Clear this spider's previous results before the new run.
        engine.execute("DELETE FROM `RawData` WHERE `spider` = %s", (spider.name,))

    def process_item(self, item, spider):
        """Store filter hits as RawData and update the `status` heartbeat."""
        if isinstance(item, Article):
            # Only articles matching the keyword filter are persisted.
            flt = Filter()
            text = flt.filterResult(item["title"], item["content"])
            if text:
                parsed_date = text[3]
                row = RawData(title=text[0],
                              content=item["content"].encode("utf-8"),
                              spider=item["spider"].encode("utf-8"),
                              org=text[1],
                              money=text[2],
                              date=date(parsed_date[0], parsed_date[1], parsed_date[2]))
                self.session.add(row)
                self.session.commit()
            # Heartbeat row for the dashboard.
            # NOTE(review): the collapsed original is ambiguous about whether
            # this ran only on filter hits; per-item updates match the
            # `timestamp`=NOW() heartbeat intent — confirm with the reader of
            # the `status` table.
            engine.execute(
                "INSERT INTO `status` (`crawler_id`, `current_title`, `current_url`, `name`)"
                " VALUES (%s, %s, %s, %s)"
                " ON DUPLICATE KEY UPDATE `current_title`=VALUES(`current_title`),"
                " `current_url`=VALUES(`current_url`), `name`=VALUES(`name`), `timestamp`=NOW()",
                (item["id"],
                 item["title"].encode("utf-8"),
                 item["currentURL"].encode("utf-8"),
                 item["spider"].encode("utf-8")))
        else:
            # Non-article items report paging progress only.
            engine.execute(
                "INSERT INTO `status` (`crawler_id`, `current_page`) VALUES (%s, %s)"
                " ON DUPLICATE KEY UPDATE `current_page`=VALUES(`current_page`)",
                (item["id"], item["currentPage"].encode("utf-8")))
        # Scrapy contract: hand the item on to any later pipeline stage.
        return item

    def close_spider(self, spider):
        self.session.close()
class DataBasePipeline(object):
    """Pipeline stage that writes every scraped item into the Article table."""

    def open_spider(self, spider):
        """Open one database session for the lifetime of the spider."""
        self.session = DBSession()

    def process_item(self, item, spider):
        """Store the item as an Article row, then hand the item onward."""
        utf8 = lambda key: item[key].encode("utf-8")
        article = Article(
            title=utf8("title"),
            url=item["url"],
            body=utf8("body"),
            publish_time=item["publish_time"],
            source_site=utf8("source_site"),
            spider_from=1,
            crawler_time=time.strftime("%Y-%m-%d %H:%M:%S"),
            project_id=item["project_id"],
            site_property_id=item["site_property_id"],
        )
        self.session.add(article)
        self.session.commit()
        return item

    def close_spider(self, spider):
        """Dispose of the session once the spider finishes."""
        self.session.close()
#定义日志信息 #log.start(loglevel=log.DEBUG) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36") settings.set("ITEM_PIPELINES" , { 'wuchong.pipelines.DuplicatesPipeline': 200, # 'pipelines.CountDropPipline': 100, 'wuchong.pipelines.DataBasePipeline': 300 }) dbp = DBSession() nump = dbp.query(Project).filter(Project.status == 1).distinct().count() if nump==0: print '暂时没有可运行的项目' dbp.commit() dbp.close() exit() dbr = DBSession() numr = dbr.query(Rules).filter(Rules.enable == 1).distinct().count() if numr==0: print '暂时没有可运行的规则网站' dbr.commit() dbr.close() exit() # 查询数据库,传参数,开始运行。 db1 = DBSession() projects = db1.query(Project).filter(Project.status == 1)
class MongoDBPipeline(object):
    """Persist iQiyi ('爱奇艺') media metadata rows via SQLAlchemy.

    (Despite the name, this pipeline writes through DBSession, not MongoDB;
    the dead adbapi/MySQLdb bootstrap code that used to be commented out
    here has been removed.)
    """

    def open_spider(self, spider):
        # One DB session per crawl.
        self.session = DBSession()

    def process_item(self, item, spider):
        """Flatten the list-valued item fields and save one MediaInfo row."""
        now = datetime.utcnow()
        # NOTE(review): MediaInfo is constructed positionally here; the
        # argument order must match the model's __init__ — confirm against
        # the MediaInfo model definition.
        media = MediaInfo(
            self.list_format(item['title']),
            self.list_format(item['eName']),
            self.list_format(item['otherName']),
            self.list_format(item['adaptor']),
            self.list_format(item['director']),
            self.list_format(item['leader']),
            self.list_format(item['kind']),
            self.list_format(item['language']),
            self.list_format(item['duration']),
            self.list_format(item['story']),
            self.list_format(item['keyWord']),
            self.list_format(item['productPerson']),
            self.list_format(item['dubbing']),
            self.list_format(item['executiver']),
            self.list_format(item['original']),
            self.list_format(item['productColtd']),
            self.list_format(item['productionTime']),
            self.list_format(item['licence']),
            self.list_format(item['registration']),
            self.list_format(item['distributColtd']),
            '爱奇艺',
            now
        )
        self.session.add(media)
        self.session.commit()
        # Scrapy contract: pass the item along the pipeline chain.
        return item

    def close_spider(self, spider):
        self.session.close()

    def _mediainfo_insert(self, conn, item, spider):
        """Raw-SQL insert fallback (not called by the pipeline hooks above).

        Uses DB-API %s parameter binding; skips items with an empty title.
        """
        now = datetime.utcnow()
        if item['title'] != '':
            conn.execute(
                "insert into MC_MediaInfo(title,eName,otherName,adaptor,director,leader,kind,"
                "language,duration,story,keyWord,productPerson,dubbing,executiver,original,"
                "productColtd,productionTime,licence,registration,distributColtd,source,createTime)"
                " values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (
                    self.list_format(item['title']),
                    self.list_format(item['eName']),
                    self.list_format(item['otherName']),
                    self.list_format(item['adaptor']),
                    self.list_format(item['director']),
                    self.list_format(item['leader']),
                    self.list_format(item['kind']),
                    self.list_format(item['language']),
                    self.list_format(item['duration']),
                    self.list_format(item['story']),
                    self.list_format(item['keyWord']),
                    self.list_format(item['productPerson']),
                    self.list_format(item['dubbing']),
                    self.list_format(item['executiver']),
                    self.list_format(item['original']),
                    self.list_format(item['productColtd']),
                    self.list_format(item['productionTime']),
                    self.list_format(item['licence']),
                    self.list_format(item['registration']),
                    self.list_format(item['distributColtd']),
                    '爱奇艺',
                    now))

    def handle_error(self, e):
        # Presumably a Twisted errback for adbapi interactions — confirm.
        log.err(e)

    def list_format(self, input):
        """Join a list of strings into one comma-separated string."""
        return ','.join(input)