# -*- coding: utf-8 -*-
# Shared dependencies for the modules in this excerpt (Python 2). The project
# helpers (Settings, FileIOMiddleware, Doraemon, AliUpload, BrowserRequest,
# matchRules, imgInfo, updateNode) are assumed to be importable from the
# surrounding package; their exact module paths are not shown here.
import copy
import gc
import os
import re
import time
import urlparse

from bs4 import BeautifulSoup, NavigableString
from lxml import etree
from PIL import Image


# Uploads the locally exported mongo data file to the remote web server over
# SSH and deletes the local copy once the transfer succeeds.
class UploadMongoData():
    def __init__(self):
        self.settings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.log_path = self.settings.LOG_PATH
        self.doraemon.createFilePath(self.log_path)

    def startUpload(self):
        fromFile = self.settings.LOCAL_MONGO_DATA_PATH
        toFile = self.settings.REMOTE_MONGO_DATA_PATH
        if not os.path.exists(fromFile):
            print 'No mongo data file to upload'
            return
        # Keep retrying until the local file has been uploaded and deleted.
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(
                        self.settings.IP_WEBSERVER0,
                        self.settings.PORT_WEBSERVER0,
                        self.settings.USER_ROOT_WEBSERVER0,
                        self.settings.USER_ROOT_PASSWORD_WEBSERVER0,
                        fromFile, toFile):
                    self.doraemon.deleteFile(fromFile)
                    message1 = 'Succeeded in uploading mongo data file: {0}'.format(fromFile)
                    print message1
                    self.file.logger(self.log_path, message1)
            except Exception as e:
                message2 = 'Exception {0} while uploading mongo data file: {1}'.format(e.message, fromFile)
                print message2
                self.file.logger(self.log_path, message2)
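# --- Hedged usage sketch (illustrative, not part of the original module) ---
# A minimal way to drive UploadMongoData, assuming this class can be executed
# as a script and that Settings points at valid local/remote mongo data paths.
# Nothing below is a confirmed entry point of the project.
if __name__ == '__main__':
    uploader = UploadMongoData()
    uploader.startUpload()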
# Stores newly discovered author names for a site into a text file, skipping
# empty entries and duplicates already tracked in bf_authors.
class NoNameBone():
    def __init__(self, settingName, callback=callable):
        self.settingName = settingName
        self.callBack = callback
        self.globalSettings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.settingName)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.author_path = self.settings.AUTHORS_PATH
        self.name = self.settings.NAME

    def store(self):
        result = self.callBack()
        if result is None:
            return
        print 'Start to store authors for page: {0}'.format(result.page_url)
        if len(result.authors) == 0:
            message1 = 'No author for page: {0}'.format(result.page_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in result.authors:
            is_author_empty = self.doraemon.isEmpty(item)
            if (is_author_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_authors, item) is False):
                message2 = 'Start to store author: {0} for page: {1}.'.format(item, result.page_url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeTxtAdd(self.author_path, item, self.settingName)
                message3 = 'Succeeded in storing author: {0} for page: {1}.'.format(item, result.page_url)
                self.file.logger(self.log_path, message3)
                print message3
            else:
                if is_author_empty is True:
                    message4 = 'Empty author for {0}'.format(result.page_url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    message5 = 'Duplicated author for {0}'.format(result.page_url)
                    self.file.logger(self.log_path, message5)
                    print message5
        print 'Finished storing authors for page: {0}.'.format(result.page_url)
        del result
        gc.collect()
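# --- Hedged usage sketch (illustrative, not part of the original module) ---
# NoNameBone expects a zero-argument callback that returns either None or an
# object exposing page_url and an authors list. The AuthorResult class and the
# 'some_site' setting name below are hypothetical stand-ins for whatever the
# real site module supplies.
class AuthorResult(object):
    def __init__(self, page_url, authors):
        self.page_url = page_url
        self.authors = authors

def extract_authors():
    # A real callback would scrape a page; this one just returns fixed data.
    return AuthorResult('http://example.com/article', ['Alice', 'Bob'])

if __name__ == '__main__':
    bone = NoNameBone('some_site', callback=extract_authors)
    bone.store()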
# Rebuilds a crawled article as a self-hosted HTML page: it locates the article
# body with a CSS-style rule, rewrites paragraphs/headings/images into the
# project's template markup, optionally re-hosts images via AliUpload, and
# writes the resulting HTML file locally.
class StoreFiles():
    def __init__(self, htmlpath=None, imagepath=None, templatepath=None,
                 articleurl=None, alidomain=None, alidomaindeepinews=None,
                 alidomaindeepinewsimg=None, ipwebserver0=None,
                 portwebserver0=None, userrootwebserver0=None,
                 userrootpasswordwebserver0=None, htmlwebserver0=None,
                 needselfimage=None, needselfhtml=None, localhtmlpath=None,
                 logpath=None):
        self.doraemon = Doraemon()
        self.file = FileIOMiddleware()
        self.image_count = 0
        self.htmlpath = htmlpath
        self.imagepath = imagepath
        self.templatepath = templatepath
        self.articleurl = articleurl
        self.alidomain = alidomain
        self.alidomaindeepinews = alidomaindeepinews
        self.alidomaindeepinewsimg = alidomaindeepinewsimg
        self.ipwebserver0 = ipwebserver0
        self.portwebserver0 = portwebserver0
        self.userrootwebserver0 = userrootwebserver0
        self.userrootpasswordwebserver0 = userrootpasswordwebserver0
        self.htmlwebserver0 = htmlwebserver0
        self.needselfimage = needselfimage
        self.needselfhtml = needselfhtml
        self.localhtmlpath = localhtmlpath
        self.logpath = logpath

    # Parses a rule such as .//div[@class='xxx'] into (tag, key, value) so it
    # can be turned into a soup.select() expression.
    def parseContentRegxRule(self, content_regx_rule):
        result = matchRules(None, None, None)
        rules = [
            matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[\=]', r'[\'](.*?)[\']'),
            matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[,]', r'[\'](.*?)[\']'),
        ]
        for rule in rules:
            tag = re.findall(rule.tag, content_regx_rule)
            key = re.findall(rule.key, content_regx_rule)
            value = re.findall(rule.value, content_regx_rule)
            if self.doraemon.isEmpty(tag) is False and \
               self.doraemon.isEmpty(key) is False and \
               self.doraemon.isEmpty(value) is False:
                result.tag = tag[0]
                result.key = key[0]
                result.value = value[0]
                break
        return result

    def addHighlightTextInner(self, content):
        return '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \
               '<p class="article_paragraph">' + \
               '<br class="article_paragraph_border"/>' + \
               '</p>'

    def addHighlightTextOuter(self, node, content):
        return '{0}<p class="article_paragraph">'.format(node) + \
               '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \
               '</p>' + \
               '<p class="article_paragraph">' + \
               '<br class="article_paragraph_border"/>' + \
               '</p>'

    def addImgNode(self, node, dataSrc, dataRef, width, dataRatio):
        if width is None:
            width = 1000
        return '{0}<p class="article_paragraph_imag">'.format(node) + \
               '<img data-ratio="{0}"'.format(dataRatio) + \
               'data-src="{0}"'.format(dataSrc) + \
               'data-ref="{0}"'.format(dataRef) + \
               'data-type="jpeg"' + \
               'data-w={0} '.format(width) + \
               'class="article_paragraph_img"/>' + \
               '</p>' + \
               '<p class="article_paragraph">' + \
               '<br class="article_paragraph_border"/>' + \
               '</p>'

    def addTextNodeOuter(self, node, content):
        if self.doraemon.isEmpty(content):
            return ''
        return '{0}<p class="article_paragraph">{1}'.format(node, content) + \
               '</p>' + \
               '<p class="article_paragraph">' + \
               '<br class="article_paragraph_border"/>' + \
               '</p>'

    def addParagraphGapNode(self, node):
        return '{0}<p class="article_paragraph">'.format(node) + \
               '<br class="article_paragraph_border"/>' + \
               '</p>'

    def addH1Node(self, node, content):
        return '{0}<p label="h1" class="article_paragraph_h1">'.format(node) + \
               '<span class="article_paragraph_h1_1">' + \
               '<span class="article_paragraph_h1_1_1">' + \
               '<span class="article_paragraph_h1_1_1_1">{0}'.format(content) + \
               '</span>' + \
               '</span>' + \
               '</span>' + \
               '</p>' + \
               '<p class="article_paragraph">' + \
               '<br class="article_paragraph_border"/>' + \
               '</p>'

    def extractImgSize(self, style, mode):
        size = re.findall(r'{0}:(.*?)px;'.format(mode), style)
        if len(size) == 1:
            return size[0].strip()
        return None

    # Recursively looks for the first <img> under the node and collects its
    # source URL plus any width/height hints from attributes or inline style.
    def extractImg(self, url, node):
        result = imgInfo(None, None, None)
        if isinstance(node, NavigableString):
            return None
        if node.name != 'img' and len(node.contents) == 0:
            return result
        if node.name == 'img':
            if node.attrs.has_key('src') and self.doraemon.isEmpty(result.src):
                if 'data:image/' not in node.attrs['src']:
                    result.src = node.attrs['src']
            if node.attrs.has_key('_src') and self.doraemon.isEmpty(result.src):
                if 'data:image/' not in node.attrs['_src']:
                    result.src = node.attrs['_src']
            if node.attrs.has_key('data-original') and self.doraemon.isEmpty(result.src):
                if 'data:image/' not in node.attrs['data-original']:
                    result.src = node.attrs['data-original']
            if node.attrs.has_key('data-src') and self.doraemon.isEmpty(result.src):
                if 'data:image/' not in node.attrs['data-src']:
                    result.src = node.attrs['data-src']
            if node.attrs.has_key('data-lazy-src') and self.doraemon.isEmpty(result.src):
                if 'data:image/' not in node.attrs['data-lazy-src']:
                    result.src = node.attrs['data-lazy-src']
            if node.attrs.has_key('width') and result.width is None:
                result.width = node.attrs['width']
            if node.attrs.has_key('height') and result.height is None:
                result.height = node.attrs['height']
            if node.attrs.has_key('data-w') and result.width is None:
                result.width = node.attrs['data-w']
            if node.attrs.has_key('data-h') and result.height is None:
                result.height = node.attrs['data-h']
            if node.attrs.has_key('data-backh') and result.height is None:
                result.height = node.attrs['data-backh']
            if node.attrs.has_key('data-backw') and result.width is None:
                result.width = node.attrs['data-backw']
            if node.attrs.has_key('data-wscnh') and result.height is None:
                result.height = node.attrs['data-wscnh']
            if node.attrs.has_key('data-wscnw') and result.width is None:
                result.width = node.attrs['data-wscnw']
            if node.attrs.has_key('style') and (result.width is None or result.height is None):
                result.width = self.extractImgSize(node.attrs['style'], 'width')
                result.height = self.extractImgSize(node.attrs['style'], 'height')
            if isinstance(result.width, int) and isinstance(result.height, int):
                result.dataRatio = float(float(result.height) / float(result.width))
            if result.src is not None:
                result.src = urlparse.urljoin(url, result.src).strip()
            return result
        if len(node.contents) > 0:
            for n in node.contents:
                result = self.extractImg(url, n)
                if result is not None:
                    return result
        return result

    # Converts one body node into template markup; downloads and optionally
    # re-hosts any image it contains, falling back to the original image URL.
    def nodeTraversal(self, url, node, newNode, articleId):
        if node.name == 'strong' and \
           node.parent.name == 'div' and \
           self.doraemon.isEmpty(node.string) is False:
            newNode = '{0}{1}'.format(newNode, self.addHighlightTextOuter(newNode, node.string))
        if (node.name == 'h1' or
                node.name == 'h2' or
                node.name == 'h3' or
                node.name == 'h4') and \
           self.doraemon.isEmpty(node.string) is False:
            newNode = '{0}{1}'.format(newNode, self.addH1Node(newNode, node.string))
        if isinstance(node, NavigableString) or \
           node.name == 'a' or \
           node.name == 'p' or \
           node.name == 'span' or \
           node.name == 'section':
            if isinstance(node, NavigableString):
                newNode = self.addTextNodeOuter(newNode, str(node))
            else:
                if self.doraemon.isEmpty(node.text) is False:
                    newNode = self.addTextNodeOuter(newNode, node.text)
        img = self.extractImg(url, node)
        updatedNode = updateNode(False, newNode, None, None)
        if img is not None and img.src is not None:
            updatedNode.isImageNode = True
            updatedNode.imageOriginUrl = img.src
            updatedNode.imageNewUrl = img.src
            try:
                imageType = self.doraemon.getImageTypeFromUrl(updatedNode.imageOriginUrl)
                imageId = '{0}_{1}'.format(articleId, self.image_count)
                newImageName = '{0}.{1}'.format(imageId, imageType)
                if self.doraemon.downloadImage(updatedNode.imageOriginUrl, self.imagepath, newImageName):
                    imageInfo = Image.open('{0}/{1}'.format(self.imagepath, newImageName))
                    if self.doraemon.isEmpty(imageInfo.width) is False:
                        img.width = imageInfo.width
                    if self.doraemon.isEmpty(imageInfo.height) is False:
                        img.height = imageInfo.height
                    if isinstance(img.width, int) and isinstance(img.height, int):
                        img.dataRatio = float(float(img.height) / float(img.width))
                    if self.needselfimage:
                        updatedNode.imageNewUrl = 'https://{0}.{1}/{2}/{3}'.format(
                            self.alidomaindeepinews, self.alidomain,
                            self.alidomaindeepinewsimg, newImageName)
                        imageUpload = AliUpload(
                            '{0}'.format(self.imagepath), newImageName,
                            '{0}'.format(self.alidomaindeepinews),
                            '{0}'.format(self.alidomaindeepinewsimg))
                        if imageUpload.start():
                            updatedNode.node = '{0}{1}'.format(
                                newNode,
                                self.addImgNode(newNode, updatedNode.imageNewUrl,
                                                updatedNode.imageNewUrl,
                                                img.width, img.dataRatio))
                            self.image_count += 1
                    else:
                        updatedNode.node = '{0}{1}'.format(
                            newNode,
                            self.addImgNode(newNode, img.src, img.src, img.width, img.dataRatio))
                else:
                    updatedNode.node = '{0}{1}'.format(
                        newNode,
                        self.addImgNode(newNode, img.src, img.src, img.width, img.dataRatio))
            except Exception as e:
                updatedNode.node = '{0}{1}'.format(
                    newNode,
                    self.addImgNode(newNode, img.src, img.src, img.width, img.dataRatio))
                print 'Exception {0} while downloading image: {1}'.format(e.message, updatedNode.imageOriginUrl)
        return updatedNode

    def updateTemplate(self, template, articleHeadDescription, articleHeadAuthor,
                       articleHeadTitle, articleHeadOriginUrl, articleBodyTitle,
                       articleBodyAuthor, articleBodyPublishTime,
                       articleBodyParagraph, articleBodyOriginUrl):
        template = template.replace('ArticleHeadDescription', articleHeadDescription)
        template = template.replace('ArticleHeadAuthor', articleHeadAuthor)
        template = template.replace('ArticleHeadTitle', articleHeadTitle)
        template = template.replace('ArticleHeadOriginUrl', articleHeadOriginUrl)
        template = template.replace('ArticleBodyTitle', articleBodyTitle)
        template = template.replace('ArticleBodyAuthor', articleBodyAuthor)
        template = template.replace('ArticleBodyPublishTime', articleBodyPublishTime)
        template = template.replace('ArticleBodyParagraph', articleBodyParagraph)
        template = template.replace('ArticleBodyOriginUrl', articleBodyOriginUrl)
        return template

    def hasText(self, nodes):
        for node in nodes:
            if isinstance(node, NavigableString):
                continue
            if node.name == 'img' or \
               node.name == 'a' or \
               node.name == 'p' or \
               node.name == 'span' or \
               node.name == 'section':
                return True
        return False

    # Descends into wrapper tags until a level that contains text, paragraph
    # or image tags is found.
    def goDeepToArticleBody(self, contents):
        if isinstance(contents, NavigableString):
            return contents
        if len(contents) == 0:
            return contents
        if self.hasText(contents):
            return contents
        if len(contents) > 0:
            for n in contents:
                if isinstance(n, NavigableString):
                    continue
                return self.goDeepToArticleBody(n.contents)

    # Entry point: builds the new HTML page for one article and returns the
    # updated data object (or the original data on failure).
    def storeFiles(self, data, page_source, content_regx_rule):
        if self.needselfhtml == False:
            return data
        try:
            self.image_count = 0
            newData = copy.copy(data)
            newArticleId = self.doraemon.getMD5('{0}_{1}'.format(data.author_name, data.id))
            newData.url = '{0}{1}.html'.format(self.articleurl, newArticleId)
            template = self.file.readFromTxt(self.templatepath)
            match = self.parseContentRegxRule(content_regx_rule)
            if match.tag is None or \
               match.key is None or \
               match.value is None:
                print 'No match rule available for html'
                return data
            soup = BeautifulSoup(page_source, 'lxml')
            matchTags = soup.select('{0}[{1}="{2}"]'.format(match.tag, match.key, match.value))
            if len(matchTags) == 0:
                print 'No tag matched for html'
                return data
            nodes = self.goDeepToArticleBody(matchTags[0].contents)
            articleContent = ''
            for node in nodes:
                if isinstance(node, NavigableString):
                    continue
                if self.doraemon.isEmpty(node):
                    continue
                newNode = ''
                updateNodeInfo = self.nodeTraversal(data.url, node, newNode, newArticleId)
                articleContent = '{0}{1}'.format(articleContent, updateNodeInfo.node)
                if updateNodeInfo.isImageNode:
                    # Replace the original image URL with the re-hosted one, or
                    # record the new URL if it was not tracked yet.
                    if updateNodeInfo.imageOriginUrl in newData.images:
                        for i in newData.images:
                            if updateNodeInfo.imageOriginUrl in i or \
                               updateNodeInfo.imageOriginUrl == i:
                                newData.images[newData.images.index(i)] = updateNodeInfo.imageNewUrl
                    else:
                        newData.images.append(updateNodeInfo.imageNewUrl)
            template = self.updateTemplate(template, newData.title, '深度资讯DeepINews',
                                           newData.title, newData.url, newData.title,
                                           newData.source, newData.public_time,
                                           articleContent, data.url)
            if self.doraemon.storeHtml(newArticleId, template, self.htmlpath):
                htmlName = '{0}.html'.format(newArticleId)
                fromFile = '{0}/{1}'.format(self.htmlpath, htmlName)
                toFile = '{0}/{1}'.format(self.localhtmlpath, htmlName)
                if self.doraemon.copyFile(fromFile, toFile):
                    print 'Copy file {0} done.'.format(fromFile)
                    return newData
                else:
                    message1 = 'Failed to copy file {0}.'.format(fromFile)
                    print message1
                    self.file.logger(self.logpath, message1)
                    return data
        except Exception as e:
            message2 = 'Exception {0} when updating: {1}'.format(e.message, data.url)
            print message2
            self.file.logger(self.logpath, message2)
            return data
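# --- Hedged usage sketch (illustrative, not part of the original module) ---
# Shows how StoreFiles might be driven. The paths, the FakeArticle fields and
# the content rule below are made-up assumptions; in the project these values
# come from Settings and from the spider callback's result object.
class FakeArticle(object):
    def __init__(self):
        self.id = '1'
        self.url = 'http://example.com/article/1'
        self.title = 'Sample title'
        self.source = 'example'
        self.public_time = '2019-01-01'
        self.author_name = 'example_author'
        self.images = []

if __name__ == '__main__':
    store = StoreFiles(
        htmlpath='/tmp/html',
        imagepath='/tmp/img',
        templatepath='/tmp/template.html',
        articleurl='https://example.com/articles/',
        needselfimage=False,
        needselfhtml=True,
        localhtmlpath='/tmp/local_html',
        logpath='/tmp/store_files.log')
    sample_page = '<html><body><div class="article-body"><p>Hello</p></div></body></html>'
    new_data = store.storeFiles(FakeArticle(), sample_page, ".//div[@class='article-body']")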
# Crawls listing pages: feeds every URL from the settings' URLS file to a
# headless-browser request pool and stores the links returned by the callback
# into MongoDB, deduplicated by title via bf_urls.
class CamelBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd2 = self.settings.WORK_PATH_PRD2
        self.mongo = self.settings.MONGO_URLS
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_URL
        self.urls = self.settings.URLS
        self.max_concurrency = self.globalSettings.MAX_CONCURRENCY
        self.concurrency_file = self.globalSettings.CONCURRENCY_FILE
        self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH
        self.url_timeout = self.settings.URL_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.url_backup_folder_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        results = self.callBack(current_url, html)
        if len(results) == 0:
            message1 = 'No url for page: {0}'.format(current_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in results:
            is_title_empty = self.doraemon.isEmpty(item.title)
            if (is_title_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_urls, item.title) is False):
                message2 = 'Start to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeMongodb(self.mongo, self.doraemon.createCamelMongoJson(item))
                message3 = 'End to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message3)
                print message3
                self.file.logger(self.log_path, 'Done for {0}'.format(item.url))
            else:
                if is_title_empty is True:
                    message4 = 'Empty title for {0}'.format(item.url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    print 'Duplicated title for {0}'.format(item.url)
        print 'End to parse {0}'.format(current_url)
        del current_url, results, html
        gc.collect()

    def start(self, isdebug=False):
        if self.doraemon.isCamelReadyToRun(self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')
        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])
        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls, self.url_timeout, self.max_pool_size,
                                       self.log_path, None, callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file, self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7
        del new_urls, content, url_list, request
        gc.collect()
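# --- Hedged usage sketch (illustrative, not part of the original module) ---
# CamelBone hands each rendered listing page (as an lxml tree) to the callback,
# which must return objects carrying title and url. The LinkItem class, the
# XPath and the 'some_site' siteinfo key are assumptions for illustration.
class LinkItem(object):
    def __init__(self, title, url):
        self.title = title
        self.url = url

def parse_list_page(current_url, html):
    items = []
    # html is the lxml element tree of the rendered listing page.
    for a in html.xpath('//a[@class="article-link"]'):
        items.append(LinkItem(a.text, a.get('href')))
    return items

if __name__ == '__main__':
    camel = CamelBone(siteinfo='some_site', callback=parse_list_page)
    camel.start(isdebug=True)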
# Fetches the article pages found by the listing crawl, hands each rendered
# page to the site-specific callback and persists the parsed result to MongoDB
# and to text files.
class SpiderBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd1 = self.settings.WORK_PATH_PRD1
        self.finished_txt_path = self.settings.FINISHED_TXT_PATH
        self.finished_html_path = self.settings.FINISHED_HTML_PATH
        self.finished_image_path = self.settings.FINISHED_IMG_PATH
        self.template_path = self.globalSettings.TEMPLATE_PATH
        self.article_url = self.globalSettings.ARTICLE_URL
        self.ali_domain = self.globalSettings.ALI_DOMAIN
        self.ali_domain_deepinews = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS
        self.ali_domain_deepinews_img = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS_IMG
        self.ip_webserver0 = self.globalSettings.IP_WEBSERVER0
        self.port_webserver0 = self.globalSettings.PORT_WEBSERVER0
        self.user_root_webserver0 = self.globalSettings.USER_ROOT_WEBSERVER0
        self.user_root_password_webserver0 = self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0
        self.html_webserver0 = self.globalSettings.HTML_WEBSERVER0
        self.mongo = self.settings.MONGO
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_CONTENT
        self.url_path = self.settings.URL_PATH
        self.is_open_cache = self.settings.IS_OPEN_CACHE
        self.finished_backup_folder_path = self.settings.FINISHED_BACKUP_FOLDER_PATH
        self.max_concurrency_spider = self.globalSettings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = self.globalSettings.CONCURRENCY_FILE_SPIDER
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.local_html_path = self.globalSettings.LOCAL_HTML_PATH
        self.content_timeout = self.settings.CONTENT_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.finished_backup_folder_path)
        self.doraemon.createFilePath(self.monitor_upload_local)
        self.doraemon.createFilePath(self.local_html_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        request_title = response['request_title']
        print 'Start to parse: {0}'.format(current_url)
        page_source = response['response'].page_source
        html = etree.HTML(page_source)
        results = None
        dataToMongo = None
        try:
            results = self.callBack(current_url, html, page_source)
            if results is None:
                message1 = 'No content for: {0}'.format(current_url)
                print message1
                self.file.logger(self.log_path, message1)
                return
            dataToMongo = self.doraemon.createSpiderMongoJson(results)
        except Exception as e:
            message1 = 'Exception {1} when parsing: {0}'.format(current_url, e.message)
            print message1
            self.file.logger(self.log_path, message1)
        print 'End to parse: {0}'.format(current_url)
        # Treat a failed parse (or a failed mongo-json conversion) as "no data"
        # so the request is still marked as finished instead of crashing.
        if results is None or dataToMongo is None:
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
            print 'No data for {0}'.format(request_title)
        else:
            message2 = 'Start to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message2)
            print message2
            self.doraemon.storeMongodb(self.mongo, dataToMongo)
            message3 = 'End to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message3)
            print message3
            self.doraemon.storeTxt(results.id, results.content, self.finished_txt_path, self.name)
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
    def start(self):
        if self.doraemon.isSpiderReadyToRun() is False:
            message4 = 'It is not ready to run spider: {0}'.format(self.name)
            print message4
            return
        message5 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message5)
        print message5
        message6 = 'Start requests: {0} '.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        if len(new_url_titles) == 0:
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider,
                                              self.max_concurrency_spider)
            message7 = 'No new url for {0}'.format(self.name)
            self.file.logger(self.log_path, message7)
            print message7
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles, self.content_timeout,
                                       self.max_pool_size, self.log_path, None,
                                       callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file_spider,
                                          self.max_concurrency_spider)
        message8 = 'End requests for {0}'.format(str(len(content)))
        self.file.logger(self.log_path, message8)
        print message8
        del content, new_url_titles, request
        gc.collect()
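# --- Hedged usage sketch (illustrative, not part of the original module) ---
# SpiderBone passes each rendered article page to the callback, which returns
# either None or an object that at least carries id, url and content
# (createSpiderMongoJson may require more fields). ArticleResult and the
# 'some_site' siteinfo key are illustrative names, not confirmed project APIs.
class ArticleResult(object):
    def __init__(self, article_id, url, content):
        self.id = article_id
        self.url = url
        self.content = content

def parse_article(current_url, html, page_source):
    paragraphs = html.xpath('//div[@class="article-body"]//p/text()')
    if len(paragraphs) == 0:
        return None
    return ArticleResult(current_url, current_url, '\n'.join(paragraphs))

if __name__ == '__main__':
    spider = SpiderBone(siteinfo='some_site', callback=parse_article)
    spider.start()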