def __init__(self, siteinfo=None, callback=None):
    self.siteinfo = siteinfo
    self.callBack = callback
    self.globalSettings = Settings()
    self.file = FileIOMiddleware()
    self.doraemon = Doraemon()
    self.getSettings()
def __init__(self):
    settings = Settings()
    settings.CreateCommonSettings()
    self.file = FileIOMiddleware()
    # Shared Redis connection backing all bloom filters below.
    self.rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT)
    self.bf_urls = BloomFilter(self.rconn, settings.BLOOMFILTER_URLS)
    self.bf_content = BloomFilter(self.rconn, settings.BLOOMFILTER_CONTENT)
    self.bf_authors = BloomFilter(self.rconn, settings.BLOOMFILTER_AUTHORS)
    self.disable_restart_interval = settings.DISABLE_RESTART_INTERVAL
    self.bf_weixin_url = BloomFilter(self.rconn, settings.FINISHED_WEIXIN_URL_ARTICLE)
    self.bf_weixin_content = BloomFilter(self.rconn, settings.FINISHED_WEIXIN_CONTENT_ARTICLE)
    self.bf_weixin_id = BloomFilter(self.rconn, settings.FINISHED_WEIXIN_URL_ID)
    self.bf_finished_image_id = BloomFilter(self.rconn, settings.FINISHED_IMAGE_ID)
    self.bf_finished_temp_weixin = BloomFilter(self.rconn, settings.FINISHED_TEMP_WEIXIN)
    self.md5 = hashlib.md5()
    # Concurrency throttling for the url crawlers.
    self.max_concurrency = settings.MAX_CONCURRENCY
    self.concurrency_file = settings.CONCURRENCY_FILE
    self.concurrency_refresh_file = settings.CONCURRENCY_REFRESH_FILE
    self.refresh_concurrency_interval = settings.REFRESH_CONCURRENCY_INTERVAL
    # Concurrency throttling for the content spiders.
    self.max_concurrency_spider = settings.MAX_CONCURRENCY_SPIDER
    self.concurrency_file_spider = settings.CONCURRENCY_FILE_SPIDER
    self.concurrency_refresh_file_spider = settings.CONCURRENCY_REFRESH_FILE_SPIDER
    self.refresh_concurrency_interval_spider = settings.REFRESH_CONCURRENCY_INTERVAL_SPIDER
    self.bf_huxiu_nlp = BloomFilter(self.rconn, settings.FINISHED_HUXIU_NLP)
    self.sites_info = settings.SITES_INFO
    self.sites_debug = settings.SITES_DEBUG
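# A minimal usage sketch of the bloom-filter deduplication wired up above,
# judging by how bf_urls and bf_content are consumed elsewhere in this module
# (the class appears to be Doraemon). isDuplicated and storeFinished are the
# helper methods the spiders below call; whether isDuplicated records the key
# on a miss is an implementation detail of Doraemon, and the sample titles
# are illustrative only.
def dedup_demo():
    doraemon = Doraemon()
    for title in ['sample title A', 'sample title A', 'sample title B']:
        if doraemon.isDuplicated(doraemon.bf_urls, title):
            print 'Skip duplicated title: {0}'.format(title)
        else:
            print 'New title to crawl: {0}'.format(title)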
def __init__(self, settingName, callback=None):
    self.settingName = settingName
    self.callBack = callback
    self.globalSettings = Settings()
    self.getSettings()
    self.file = FileIOMiddleware()
    self.doraemon = Doraemon()
def __init__(self):
    self.settings = Settings()
    self.file = FileIOMiddleware()
    self.doraemon = Doraemon()
    self.root = '/home/dev/Data/rsyncData/prd4/sites'
    self.dest = '/home/dev/Data/rsyncData/prd4/local'
    self.resume = '/home/dev/Repository/news/Tegenaria/tSpider/tSpider/dataRecovery/resume.txt'
def __init__(self):
    self.doraemon = Doraemon()
    self.file = FileIOMiddleware()
    self.settings = Settings()
    self.cache_file = self.settings.TIMEOUT_CACHE_FILE
    self.timeout = self.settings.PROCESS_TIMEOUT
    self.timeout_content = self.settings.PROCESS_TIMEOUT_CONTENT
def __init__(self, fileDirectory=None, fileName=None, bucketName=None, bucketFolderName=None):
    self.settings = Settings()
    self.fileDirectory = fileDirectory
    self.fileName = fileName
    self.bucketName = bucketName
    self.bucketFolderName = bucketFolderName
    # The AccessKey of an Aliyun primary account grants access to every API,
    # which is high risk. It is strongly recommended to create and use a RAM
    # account for API access and daily operations; log in at
    # https://ram.console.aliyun.com to create one.
    auth = oss2.Auth(self.settings.ALI_OSS_INFO.ip, self.settings.ALI_OSS_INFO.password)
    # The endpoint is region-specific; oss-cn-beijing is used here, adjust it
    # for other regions.
    self.bucket = oss2.Bucket(auth, 'http://oss-cn-beijing.aliyuncs.com', '{0}'.format(self.bucketName))
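# A hedged sketch of how an upload through the bucket handle above might look.
# put_object_from_file(key, filename) is the standard oss2 upload call; the
# helper name, key layout and sample path are illustrative assumptions, not
# names from this project.
import os

def upload_to_bucket(bucket, bucketFolderName, localFilePath):
    # Store the file under <folder>/<basename> in the bucket.
    key = '{0}/{1}'.format(bucketFolderName, os.path.basename(localFilePath))
    result = bucket.put_object_from_file(key, localFilePath)
    print 'Upload status: {0}'.format(result.status)  # 200 on success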
class NoNameBone(object):
    def __init__(self, settingName, callback=None):
        self.settingName = settingName
        self.callBack = callback
        self.globalSettings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.settingName)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.author_path = self.settings.AUTHORS_PATH
        self.name = self.settings.NAME

    def store(self):
        result = self.callBack()
        if result is None:
            return
        print 'Start to store authors for page: {0}'.format(result.page_url)
        if len(result.authors) == 0:
            message1 = 'No author for page: {0}'.format(result.page_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in result.authors:
            is_author_empty = self.doraemon.isEmpty(item)
            if (is_author_empty is False) and (self.doraemon.isDuplicated(self.doraemon.bf_authors, item) is False):
                message2 = 'Start to store author: {0} for page: {1}.'.format(item, result.page_url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeTxtAdd(self.author_path, item, self.settingName)
                message3 = 'Success to store author: {0} for page: {1}.'.format(item, result.page_url)
                self.file.logger(self.log_path, message3)
                print message3
            else:
                if is_author_empty is True:
                    message4 = 'Empty author for {0}'.format(result.page_url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    message5 = 'Duplicated author for {0}'.format(result.page_url)
                    self.file.logger(self.log_path, message5)
                    print message5
        print 'End to store author for page: {0}.'.format(result.page_url)
        del result
        gc.collect()
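# A usage sketch for NoNameBone (hedged): the callback contract implied by
# store() is an object exposing page_url and authors, or None when a page
# yields nothing. The PageResult class, the setting name and the sample
# values are illustrative assumptions.
class PageResult(object):
    def __init__(self, page_url, authors):
        self.page_url = page_url
        self.authors = authors

def fetch_authors():
    return PageResult('http://example.com/article/1', ['author-a', 'author-b'])

bone = NoNameBone('some_site', callback=fetch_authors)
bone.store()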
def __init__(self):
    self.settings = Settings()
    self.getSettings()
    self.file = FileIOMiddleware()
    self.doraemon = Doraemon()
    self.doraemon.createFilePath(self.work_path_prd2)
def __init__(self):
    self.settings = Settings()
    self.file = FileIOMiddleware()
    self.doraemon = Doraemon()
def __init__(self, siteinfo=None):
    self.siteinfo = siteinfo
    self.globalSettings = Settings()
    self.doraemon = Doraemon()
    self.getSettings()
    self.file = FileIOMiddleware()
class UpdateMonitorFiles(object):
    def __init__(self, siteinfo=None):
        self.siteinfo = siteinfo
        self.globalSettings = Settings()
        self.doraemon = Doraemon()
        self.getSettings()
        self.file = FileIOMiddleware()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.work_path_prd4 = self.settings.WORK_PATH_PRD1
        self.work_path_prd3 = self.settings.WORK_PATH_PRD2
        self.content_backup_path = self.settings.FINISHED_BACKUP_PATH
        self.content_backup_post_path = self.settings.FINISHED_BACKUP_POST_PATH
        self.url_backup_path = self.settings.URL_BACKUP_PATH
        self.url_backup_post_path = self.settings.URL_BACKUP_POST_PATH
        self.monitor_site_template_path = self.globalSettings.MONITOR_SITE_TEMPLATE_PATH
        self.monitor_spiders_template_path = self.globalSettings.MONITOR_SPIDERS_TEMPLATE_PATH
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.monitor_site_webserver0 = self.globalSettings.MONITOR_SITE_HTML_WEBSERVER0
        self.monitor_site_url = self.globalSettings.MONITOR_SITE_URL
        self.monitor_upload_webserver0 = self.globalSettings.MONITOR_UPLOAD_PATH_WEBSERVER0

    def updateSpiders(self, siteName, ycount1, tcount1, turl1, diff1, ycount2, tcount2, turl2, diff2):
        # Render one table row of the spiders summary page.
        return '<tr>' + \
               '<th align="center" valign="middle">{0}</th>'.format(siteName) + \
               '<td align="center" valign="middle">{0}</td>'.format(ycount1) + \
               '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl1, tcount1) + \
               '<td align="center" valign="middle">{0}</td>'.format(diff1) + \
               '<td align="center" valign="middle">{0}</td>'.format(ycount2) + \
               '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl2, tcount2) + \
               '<td align="center" valign="middle">{0}</td>'.format(diff2) + \
               '</tr>'

    def updateSite(self, number, title, url):
        # Render one table row of the per-site page.
        return '<tr>' + \
               '<td align="center" valign="middle">{0}</td>'.format(number) + \
               '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(url, title) + \
               '</tr>'

    def uploadFile(self, fromFile, toFile):
        # Retry the upload until it succeeds or raises.
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(self.globalSettings.IP_WEBSERVER0,
                                           self.globalSettings.PORT_WEBSERVER0,
                                           self.globalSettings.USER_ROOT_WEBSERVER0,
                                           self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0,
                                           fromFile, toFile):
                    print 'Succeeded in uploading monitor file: {0}'.format(fromFile)
                    return True
            except Exception as e:
                print 'Exception {0} when uploading monitor site file: {1}'.format(e.message, fromFile)
                return False

    def updateSingleSite(self, preBackupPath, postBackupPath, siteName):
        singleSiteData = singleSiteDto(self.siteinfo.name, 0, 0, None, 0)
        isPreBackupFileExists = os.path.exists(preBackupPath)
        isPostBackupFileExists = os.path.exists(postBackupPath)
        preCsvContent = None
        if isPreBackupFileExists:
            print "Start to read url back up file: {0}".format(self.settings.NAME)
            preCsvContent = self.file.readColsFromCSV(preBackupPath, ['title', 'url'])
            singleSiteData.tcount = len(preCsvContent.values)
        else:
            print "Url back up file does not exist: {0}".format(self.settings.NAME)
            singleSiteData.tcount = 0
        if isPostBackupFileExists:
            print "Start to read post url back up file: {0}".format(self.settings.NAME)
            postCsvContent = self.file.readColsFromCSV(postBackupPath, ['title', 'url'])
            singleSiteData.ycount = len(postCsvContent.values)
        else:
            print "Post url back up file does not exist: {0}".format(self.settings.NAME)
            singleSiteData.ycount = 0
        singleSiteData.diff = singleSiteData.tcount - singleSiteData.ycount
        if preCsvContent is not None:
            if preCsvContent.empty:
                print "No new back up url: {0}".format(self.settings.NAME)
            else:
                template = self.file.readFromTxt(self.monitor_site_template_path)
                finalContent = ''
                number = 1
                for item in preCsvContent.values:
                    finalContent = "{0}{1}".format(finalContent, self.updateSite(number, item[1], item[0]))
                    number += 1
                template = template.replace('UpdateTime', self.doraemon.getCurrentLocalTime())
                template = template.replace('ServerName', siteName)
                template = template.replace('SiteName', self.siteinfo.name)
                template = template.replace('MainContent', finalContent)
                turl = '{0}{1}_{2}.html'.format(self.monitor_site_url, self.settings.NAME, siteName)
                singleSiteData.turl = turl
                uploadLocalHtmlPath = '{0}/{1}_{2}.html'.format(self.monitor_upload_local, self.settings.NAME, siteName)
                self.file.writeToHtmlCover(uploadLocalHtmlPath, template)
        return singleSiteData

    def processAllSites(self, allSitesData=None):
        template = self.file.readFromTxt(self.monitor_spiders_template_path)
        mainContent = ''
        t = totalDto(0, 0, 0, 0, 0, 0)
        for data in allSitesData:
            mainContent = '{0}{1}'.format(mainContent,
                                          self.updateSpiders(data.prd3.sitename,
                                                             data.prd3.ycount, data.prd3.tcount,
                                                             data.prd3.turl, data.prd3.diff,
                                                             data.prd4.ycount, data.prd4.tcount,
                                                             data.prd4.turl, data.prd4.diff))
            t.prd3ytotal += data.prd3.ycount
            t.prd3ttotal += data.prd3.tcount
            t.prd4ytotal += data.prd4.ycount
            t.prd4ttotal += data.prd4.tcount
        t.prd3difftotal = t.prd3ttotal - t.prd3ytotal
        t.prd4difftotal = t.prd4ttotal - t.prd4ytotal
        mainContent = '{0}{1}'.format(mainContent,
                                      self.updateSpiders('Summary',
                                                         t.prd3ytotal, t.prd3ttotal, '', t.prd3difftotal,
                                                         t.prd4ytotal, t.prd4ttotal, '', t.prd4difftotal))
        template = template.replace('UpdateTime', self.doraemon.getCurrentLocalTime())
        template = template.replace('MainContent', mainContent)
        localHtmlPath = '{0}/index.html'.format(self.monitor_upload_local)
        self.file.writeToHtmlCover(localHtmlPath, template)
        self.doraemon.tar(self.monitor_upload_local)
        fromFile = '{0}.tar.gz'.format(self.monitor_upload_local)
        self.uploadFile(fromFile, '{0}/monitor.tar.gz'.format(self.monitor_upload_webserver0))
        os.remove(fromFile)

    def processSingleSite(self):
        spidersContent = allSitesDto(None, None)
        spidersContent.prd3 = self.updateSingleSite(self.url_backup_path, self.url_backup_post_path, 'prd3')
        spidersContent.prd4 = self.updateSingleSite(self.content_backup_path, self.content_backup_post_path, 'prd4')
        return spidersContent
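# A hedged driver sketch for UpdateMonitorFiles: processSingleSite() collects
# prd3/prd4 counts per site, and processAllSites() renders and uploads the
# aggregate page. The siteinfos argument stands in for the per-site objects
# the project builds from settings.SITES_INFO; its exact shape is an
# assumption here.
def refresh_monitor(siteinfos):
    allSitesData = []
    for siteinfo in siteinfos:
        updater = UpdateMonitorFiles(siteinfo)
        allSitesData.append(updater.processSingleSite())
    # Any instance can render the aggregate page once all sites are collected.
    if len(allSitesData) > 0:
        UpdateMonitorFiles(siteinfos[0]).processAllSites(allSitesData)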
def __init__(self):
    self.settings = Settings()
    self.file = FileIOMiddleware()
    self.doraemon = Doraemon()
    self.log_path = self.settings.LOG_PATH
    self.doraemon.createFilePath(self.log_path)
class CamelBone(object):
    def __init__(self, siteinfo=None, callback=None):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd2 = self.settings.WORK_PATH_PRD2
        self.mongo = self.settings.MONGO_URLS
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_URL
        self.urls = self.settings.URLS
        self.max_concurrency = self.globalSettings.MAX_CONCURRENCY
        self.concurrency_file = self.globalSettings.CONCURRENCY_FILE
        self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH
        self.url_timeout = self.settings.URL_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.url_backup_folder_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        results = self.callBack(current_url, html)
        if len(results) == 0:
            message1 = 'No url for page: {0}'.format(current_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in results:
            is_title_empty = self.doraemon.isEmpty(item.title)
            if (is_title_empty is False) and (self.doraemon.isDuplicated(self.doraemon.bf_urls, item.title) is False):
                message2 = 'Start to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeMongodb(self.mongo, self.doraemon.createCamelMongoJson(item))
                message3 = 'End to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message3)
                print message3
                self.file.logger(self.log_path, 'Done for {0}'.format(item.url))
            else:
                if is_title_empty is True:
                    message4 = 'Empty title for {0}'.format(item.url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    print 'Finished title for {0}'.format(item.url)
        print 'End to parse {0}'.format(current_url)
        del current_url, results, html
        gc.collect()

    def start(self, isdebug=False):
        if self.doraemon.isCamelReadyToRun(self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')
        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])
        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls, self.url_timeout, self.max_pool_size,
                                       self.log_path, None, callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file, self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7
        del new_urls, content, url_list, request
        gc.collect()
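# A usage sketch for CamelBone (hedged): the callback receives the current url
# and the lxml html tree, and returns items carrying title and url, which
# parse() dedups against bf_urls and stores to MongoDB. UrlItem, the xpath
# expression and the helper name are illustrative assumptions.
class UrlItem(object):
    def __init__(self, title, url):
        self.title = title
        self.url = url

def parse_list_page(current_url, html):
    results = []
    for a in html.xpath('//a[@class="article-link"]'):
        results.append(UrlItem(a.text, a.get('href')))
    return results

def run_camel(siteinfo):
    camel = CamelBone(siteinfo, callback=parse_list_page)
    camel.start(isdebug=True)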
class SpiderBone(object):
    def __init__(self, siteinfo=None, callback=None):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd1 = self.settings.WORK_PATH_PRD1
        self.finished_txt_path = self.settings.FINISHED_TXT_PATH
        self.finished_html_path = self.settings.FINISHED_HTML_PATH
        self.finished_image_path = self.settings.FINISHED_IMG_PATH
        self.template_path = self.globalSettings.TEMPLATE_PATH
        self.article_url = self.globalSettings.ARTICLE_URL
        self.ali_domain = self.globalSettings.ALI_DOMAIN
        self.ali_domain_deepinews = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS
        self.ali_domain_deepinews_img = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS_IMG
        self.ip_webserver0 = self.globalSettings.IP_WEBSERVER0
        self.port_webserver0 = self.globalSettings.PORT_WEBSERVER0
        self.user_root_webserver0 = self.globalSettings.USER_ROOT_WEBSERVER0
        self.user_root_password_webserver0 = self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0
        self.html_webserver0 = self.globalSettings.HTML_WEBSERVER0
        self.mongo = self.settings.MONGO
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_CONTENT
        self.url_path = self.settings.URL_PATH
        self.is_open_cache = self.settings.IS_OPEN_CACHE
        self.finished_backup_folder_path = self.settings.FINISHED_BACKUP_FOLDER_PATH
        self.max_concurrency_spider = self.globalSettings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = self.globalSettings.CONCURRENCY_FILE_SPIDER
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.local_html_path = self.globalSettings.LOCAL_HTML_PATH
        self.content_timeout = self.settings.CONTENT_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.finished_backup_folder_path)
        self.doraemon.createFilePath(self.monitor_upload_local)
        self.doraemon.createFilePath(self.local_html_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        request_title = response['request_title']
        print 'Start to parse: {0}'.format(current_url)
        page_source = response['response'].page_source
        html = etree.HTML(page_source)
        results = None
        dataToMongo = None
        try:
            results = self.callBack(current_url, html, page_source)
            if results is None:
                message1 = 'No content for: {0}'.format(current_url)
                print message1
                self.file.logger(self.log_path, message1)
                return
            dataToMongo = self.doraemon.createSpiderMongoJson(results)
        except Exception as e:
            message1 = 'Exception when parse: {0} for {1}'.format(current_url, e.message)
            print message1
            self.file.logger(self.log_path, message1)
        print 'End to parse: {0}'.format(current_url)
        if results is None:
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
            print 'No data for {0}'.format(request_title)
        else:
            message2 = 'Start to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message2)
            print message2
            self.doraemon.storeMongodb(self.mongo, dataToMongo)
            message3 = 'End to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message3)
            print message3
            self.doraemon.storeTxt(results.id, results.content, self.finished_txt_path, self.name)
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
    def start(self):
        if self.doraemon.isSpiderReadyToRun() is False:
            message4 = 'It is not ready to run spider: {0}'.format(self.name)
            print message4
            return
        message5 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message5)
        print message5
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        if len(new_url_titles) == 0:
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
            message6 = 'No new url for {0}'.format(self.name)
            self.file.logger(self.log_path, message6)
            print message6
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles, self.content_timeout, self.max_pool_size,
                                       self.log_path, None, callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
        message7 = 'End requests for {0}'.format(str(len(content)))
        self.file.logger(self.log_path, message7)
        print message7
        del content, new_url_titles, request
        gc.collect()
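# A usage sketch for SpiderBone (hedged): the content callback receives the
# url, the lxml tree and the raw page source, and returns an object with at
# least id, url and content (the fields parse() reads above), or None when
# nothing could be extracted. The Article class, the xpath expression and the
# md5-based id are illustrative assumptions.
class Article(object):
    def __init__(self, article_id, url, content):
        self.id = article_id
        self.url = url
        self.content = content

def parse_content_page(current_url, html, page_source):
    paragraphs = html.xpath('//div[@class="article-body"]//p/text()')
    if len(paragraphs) == 0:
        return None
    return Article(hashlib.md5(current_url).hexdigest(), current_url, '\n'.join(paragraphs))

def run_spider(siteinfo):
    spider = SpiderBone(siteinfo, callback=parse_content_page)
    spider.start()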