Example #1
0
class UploadMongoData():
    """Uploads a locally dumped Mongo data file to a remote web server
    over SSH, deleting the local copy after a successful transfer.

    Relies on project helpers: ``Settings`` (configuration), ``Doraemon``
    (SSH/file utilities) and ``FileIOMiddleware`` (logging).
    """

    def __init__(self):
        # Project-level configuration and helper facades.
        self.settings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.log_path = self.settings.LOG_PATH
        # Make sure the log path exists before anything is logged.
        self.doraemon.createFilePath(self.log_path)

    def startUpload(self):
        """Push LOCAL_MONGO_DATA_PATH to REMOTE_MONGO_DATA_PATH via SSH.

        Loops while the local file still exists; a successful upload
        deletes the local file, which is what terminates the loop.
        """
        fromFile = self.settings.LOCAL_MONGO_DATA_PATH
        toFile = self.settings.REMOTE_MONGO_DATA_PATH
        if not os.path.exists(fromFile):
            print 'no mongo data file to upload'
            return
        # NOTE(review): if sshUpload keeps returning False (or keeps
        # raising), this loop spins forever with no backoff or retry
        # limit -- confirm retry-until-success is intended.
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(
                        self.settings.IP_WEBSERVER0,
                        self.settings.PORT_WEBSERVER0,
                        self.settings.USER_ROOT_WEBSERVER0,
                        self.settings.USER_ROOT_PASSWORD_WEBSERVER0, fromFile,
                        toFile):
                    # Upload succeeded: remove the local file so the
                    # while-condition becomes false, then log.
                    self.doraemon.deleteFile(fromFile)
                    message1 = 'Success to upload mongo data file: {0}'.format(
                        fromFile)
                    print message1
                    self.file.logger(self.log_path, message1)
            except Exception as e:
                # Log and retry on the next loop iteration.
                message2 = 'Exception {0} to upload mongo data file: {1}'.format(
                    e.message, fromFile)
                print message2
                self.file.logger(self.log_path, message2)
Example #2
0
class NoNameBone():
    """Job skeleton that collects author names from a callback and
    appends new, non-empty ones to a per-site authors text file.

    Duplicate detection goes through ``doraemon.bf_authors`` -- presumably
    a bloom filter; verify against the Doraemon implementation.
    """

    def __init__(self, settingName, callback=callable):
        # NOTE(review): the default ``callback=callable`` is the builtin
        # ``callable`` -- almost certainly a placeholder; callers are
        # expected to pass a real parser function.
        self.settingName = settingName
        self.callBack = callback
        self.globalSettings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()

    def getSettings(self):
        """Load per-site settings plus shared log/author-file paths."""
        self.settings = self.globalSettings.CreateSettings(self.settingName)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.author_path = self.settings.AUTHORS_PATH
        self.name = self.settings.NAME

    def store(self):
        """Invoke the callback and persist each new author it returned.

        The callback result is expected to expose ``page_url`` and an
        ``authors`` iterable. Empty or already-seen authors are skipped
        with a log line.
        """
        result = self.callBack()
        if result == None:
            return
        print 'Start to store authors for page: {0}'.format(result.page_url)
        if len(result.authors) == 0:
            message1 = 'No author for page: {0}'.format(result.page_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in result.authors:
            is_title_empty = self.doraemon.isEmpty(item)
            # Store only non-empty authors not already recorded.
            if (is_title_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_authors, item) is False):
                message2 = 'Start to store author: {0} for page: {1}.'.format(
                    item, result.page_url)
                self.file.logger(self.log_path, message2)
                print message2
                # Append the author to the per-site authors file.
                self.doraemon.storeTxtAdd(self.author_path, item,
                                          self.settingName)
                message3 = 'Success to store author: {0} for page: {1}.'.format(
                    item, result.page_url)
                self.file.logger(self.log_path, message3)
                print message3
            else:
                if is_title_empty is True:
                    message4 = 'Empty author for {0}'.format(result.page_url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    message5 = 'Duplicated author for {0}'.format(
                        result.page_url)
                    self.file.logger(self.log_path, message5)
                    print message5
        print 'End to store author for page: {0}.'.format(result.page_url)
        # Drop the (potentially large) parse result and force a GC pass.
        del result
        gc.collect()
Example #3
0
class StoreFiles():
    """Rebuilds a scraped article as a self-hosted HTML page.

    Walks the article's DOM (BeautifulSoup), re-emits headings,
    paragraphs and images with the project's CSS classes, optionally
    downloads images and re-uploads them to Ali OSS, then fills an HTML
    template and stores/copies the resulting file locally.

    Depends on project helpers ``Doraemon``, ``FileIOMiddleware``,
    ``AliUpload`` and the record types ``matchRules``/``imgInfo``/
    ``updateNode`` defined elsewhere in the project.
    """

    def __init__(self,
                 htmlpath=None,
                 imagepath=None,
                 templatepath=None,
                 articleurl=None,
                 alidomain=None,
                 alidomaindeepinews=None,
                 alidomaindeepinewsimg=None,
                 ipwebserver0=None,
                 portwebserver0=None,
                 userrootwebserver0=None,
                 userrootpasswordwebserver0=None,
                 htmlwebserver0=None,
                 needselfimage=None,
                 needselfhtml=None,
                 localhtmlpath=None,
                 logpath=None):
        # All configuration is injected by the caller; ``needselfimage``
        # and ``needselfhtml`` act as feature switches (re-host images /
        # generate own HTML).
        self.doraemon = Doraemon()
        self.file = FileIOMiddleware()
        # Per-article counter used to build unique image file names.
        self.image_count = 0
        self.htmlpath = htmlpath
        self.imagepath = imagepath
        self.templatepath = templatepath
        self.articleurl = articleurl
        self.alidomain = alidomain
        self.alidomaindeepinews = alidomaindeepinews
        self.alidomaindeepinewsimg = alidomaindeepinewsimg
        self.ipwebserver0 = ipwebserver0
        self.portwebserver0 = portwebserver0
        self.userrootwebserver0 = userrootwebserver0
        self.userrootpasswordwebserver0 = userrootpasswordwebserver0
        self.htmlwebserver0 = htmlwebserver0
        self.needselfimage = needselfimage
        self.needselfhtml = needselfhtml
        self.localhtmlpath = localhtmlpath
        self.logpath = logpath

    def parseContentRegxRule(self, content_regx_rule):
        """Extract (tag, key, value) from an XPath-like rule string.

        Tries two regex triples (attribute terminated by ``=`` or ``,``)
        and returns the first rule where all three parts match; fields
        stay None when nothing matches.
        """
        result = matchRules(None, None, None)
        rules = [
            matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[\=]',
                       r'[\'](.*?)[\']'),
            matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[,]',
                       r'[\'](.*?)[\']'),
        ]
        for rule in rules:
            tag = re.findall(rule.tag, content_regx_rule)
            key = re.findall(rule.key, content_regx_rule)
            value = re.findall(rule.value, content_regx_rule)
            if self.doraemon.isEmpty(tag) is False and \
               self.doraemon.isEmpty(key) is False and \
               self.doraemon.isEmpty(value) is False:
                result.tag = tag[0]
                result.key = key[0]
                result.value = value[0]
                break
        return result

    def addHighlightTextInner(self, content):
        """Return ``content`` wrapped in a <strong> plus a gap paragraph."""
        return '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \
               '<p class="article_paragraph">' + \
                    '<br class="article_paragraph_border"/>' + \
               '</p>'

    def addHighlightTextOuter(self, node, content):
        """Append a highlighted (<strong>) paragraph + gap to ``node``."""
        return '{0}<p class="article_paragraph">'.format(node) + \
                      '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                      '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addImgNode(self, node, dataSrc, dataRef, width, dataRatio):
        """Append an image paragraph (lazy-load data-* attrs) to ``node``.

        Falls back to width 1000 when no width was detected.
        """
        if width == None:
            width = 1000
        return '{0}<p class="article_paragraph_imag">'.format(node) + \
                      '<img data-ratio="{0}"'.format(dataRatio) + \
                           'data-src="{0}"'.format(dataSrc) + \
                           'data-ref="{0}"'.format(dataRef) + \
                           'data-type="jpeg"' + \
                           'data-w={0} '.format(width) + \
                           'class="article_paragraph_img"/>' + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                     '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addTextNodeOuter(self, node, content):
        """Append a plain text paragraph + gap to ``node``; '' if empty."""
        if self.doraemon.isEmpty(content):
            return ''
        return '{0}<p class="article_paragraph">{1}'.format(node, content) + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                      '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addParagraphGapNode(self, node):
        """Append an empty spacer paragraph to ``node``."""
        return '{0}<p class="article_paragraph">'.format(node) + \
                     '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addH1Node(self, node, content):
        """Append a styled heading paragraph + gap to ``node``."""
        return '{0}<p label="h1" class="article_paragraph_h1">'.format(node) + \
                    '<span class="article_paragraph_h1_1">' + \
                        '<span class="article_paragraph_h1_1_1">' + \
                            '<span class="article_paragraph_h1_1_1_1">{0}'.format(content) + \
                            '</span>' + \
                        '</span>' + \
                    '</span>' + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                     '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def extractImgSize(self, style, mode):
        """Pull '<mode>:<N>px;' out of an inline style; None if absent.

        ``mode`` is 'width' or 'height' at the call sites below.
        """
        size = re.findall(r'{0}:(.*?)px;'.format(mode), style)
        if len(size) == 1:
            return size[0].strip()
        return None

    def extractImg(self, url, node):
        """Depth-first search for the first <img> under ``node``.

        Returns an ``imgInfo`` with src/width/height (src resolved
        against ``url``), or None for a bare text node. Inline base64
        images ('data:image/') are ignored. Checks a long list of
        lazy-load attribute spellings (_src, data-original, data-src,
        data-lazy-src, data-w/h, data-backw/h, data-wscnw/h).
        """
        result = imgInfo(None, None, None)
        if isinstance(node, NavigableString):
            return None
        if node.name != 'img' and len(node.contents) == 0:
            return result
        if node.name == 'img':
            if node.attrs.has_key('src') and self.doraemon.isEmpty(result.src):
                if 'data:image/' not in node.attrs['src']:
                    result.src = node.attrs['src']
            if node.attrs.has_key('_src') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['_src']:
                    result.src = node.attrs['_src']
            if node.attrs.has_key('data-original') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['data-original']:
                    result.src = node.attrs['data-original']
            if node.attrs.has_key('data-src') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['data-src']:
                    result.src = node.attrs['data-src']
            if node.attrs.has_key('data-lazy-src') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['data-lazy-src']:
                    result.src = node.attrs['data-lazy-src']
            if node.attrs.has_key('width') and result.width == None:
                result.width = node.attrs['width']
            if node.attrs.has_key('height') and result.height == None:
                result.height = node.attrs['height']
            if node.attrs.has_key('data-w') and result.width == None:
                result.width = node.attrs['data-w']
            if node.attrs.has_key('data-h') and result.height == None:
                result.height = node.attrs['data-h']
            if node.attrs.has_key('data-backh') and result.height == None:
                result.height = node.attrs['data-backh']
            if node.attrs.has_key('data-backw') and result.width == None:
                result.width = node.attrs['data-backw']
            if node.attrs.has_key('data-wscnh') and result.height == None:
                result.height = node.attrs['data-wscnh']
            if node.attrs.has_key('data-wscnw') and result.width == None:
                result.width = node.attrs['data-wscnw']
            # NOTE(review): when a style attribute is present this
            # unconditionally overwrites width AND height, possibly
            # clobbering attribute-derived values with None when the
            # style lacks that dimension -- confirm intended.
            if node.attrs.has_key('style') and (result.width == None
                                                or result.height == None):
                result.width = self.extractImgSize(node.attrs['style'],
                                                   'width')
                result.height = self.extractImgSize(node.attrs['style'],
                                                    'height')
            if isinstance(result.width, int) and isinstance(
                    result.height, int):
                result.dataRatio = float(
                    float(result.height) / float(result.width))
            if result.src != None:
                # Resolve relative image URLs against the page URL.
                result.src = urlparse.urljoin(url, result.src).strip()
            return result
        if len(node.contents) > 0:
            # NOTE(review): recursion returns after the first non-string
            # child (its result is never None), so images in later
            # siblings are not reached -- verify this is acceptable.
            for n in node.contents:
                result = self.extractImg(url, n)
                if result != None:
                    return result
        return result

    def nodeTraversal(self, url, node, newNode, articleId):
        """Convert one DOM node into template HTML, handling any image.

        Returns an ``updateNode`` record carrying the accumulated HTML
        plus image bookkeeping (origin URL / re-hosted URL). Images are
        downloaded; when ``needselfimage`` is set they are also uploaded
        to Ali OSS and the new URL substituted.
        """
        # NOTE(review): the '{0}{1}'.format(newNode, helper(newNode, ...))
        # pattern below duplicates the accumulated ``newNode`` prefix,
        # because the helpers already prepend it -- confirm this
        # duplication is intended.
        if node.name == 'strong' and \
           node.parent.name == 'div' and \
           self.doraemon.isEmpty(node.string) is False:
            newNode = '{0}{1}'.format(
                newNode, self.addHighlightTextOuter(newNode, node.string))
        if (node.name == 'h1' or \
           node.name == 'h2' or \
           node.name == 'h3' or \
           node.name == 'h4') and \
           self.doraemon.isEmpty(node.string) is False:
            newNode = '{0}{1}'.format(newNode,
                                      self.addH1Node(newNode, node.string))
        if isinstance(node, NavigableString) or \
           node.name == 'a' or \
           node.name == 'p' or \
           node.name == 'span' or \
           node.name == 'section':
            if isinstance(node, NavigableString):
                newNode = self.addTextNodeOuter(newNode, str(node))
            else:
                if self.doraemon.isEmpty(node.text) == False:
                    newNode = self.addTextNodeOuter(newNode, node.text)
        img = self.extractImg(url, node)
        updatedNode = updateNode(False, newNode, None, None)
        if img != None and img.src != None:
            updatedNode.isImageNode = True
            updatedNode.imageOriginUrl = img.src
            updatedNode.imageNewUrl = img.src
            try:
                imageType = self.doraemon.getImageTypeFromUrl(
                    updatedNode.imageOriginUrl)
                # Image file name: <articleId>_<running counter>.<ext>
                imageId = '{0}_{1}'.format(articleId, self.image_count)
                newImageName = '{0}.{1}'.format(imageId, imageType)
                if self.doraemon.downloadImage(updatedNode.imageOriginUrl,
                                               self.imagepath, newImageName):
                    # Prefer the real dimensions of the downloaded file
                    # over whatever the markup claimed.
                    imageInfo = Image.open('{0}/{1}'.format(
                        self.imagepath, newImageName))
                    if self.doraemon.isEmpty(imageInfo.width) is False:
                        img.width = imageInfo.width
                    if self.doraemon.isEmpty(imageInfo.height) is False:
                        img.height = imageInfo.height
                    if isinstance(img.width, int) and isinstance(
                            img.height, int):
                        img.dataRatio = float(
                            float(img.height) / float(img.width))
                    if self.needselfimage:
                        # Re-host the image on Ali OSS and point the
                        # page at the new URL.
                        updatedNode.imageNewUrl = 'https://{0}.{1}/{2}/{3}'.format(
                            self.alidomaindeepinews, self.alidomain,
                            self.alidomaindeepinewsimg, newImageName)
                        imageUpload = AliUpload(
                            '{0}'.format(self.imagepath), newImageName,
                            '{0}'.format(self.alidomaindeepinews),
                            '{0}'.format(self.alidomaindeepinewsimg))
                        if imageUpload.start():
                            updatedNode.node = '{0}{1}'.format(
                                newNode,
                                self.addImgNode(newNode,
                                                updatedNode.imageNewUrl,
                                                updatedNode.imageNewUrl,
                                                img.width, img.dataRatio))
                            self.image_count += 1
                    else:
                        # Keep the original remote URL.
                        updatedNode.node = '{0}{1}'.format(
                            newNode,
                            self.addImgNode(newNode, img.src, img.src,
                                            img.width, img.dataRatio))
                else:
                    # Download failed: fall back to the origin URL.
                    updatedNode.node = '{0}{1}'.format(
                        newNode,
                        self.addImgNode(newNode, img.src, img.src, img.width,
                                        img.dataRatio))
            except Exception as e:
                # Best-effort: keep the origin URL and log to stdout.
                updatedNode.node = '{0}{1}'.format(
                    newNode,
                    self.addImgNode(newNode, img.src, img.src, img.width,
                                    img.dataRatio))
                print 'Exception {0} to download image: {1}'.format(
                    e.message, updatedNode.imageOriginUrl)

        return updatedNode

    def updateTemplate(self, template, articleHeadDescription,
                       articleHeadAuthor, articleHeadTitle,
                       articleHeadOriginUrl, articleBodyTitle,
                       articleBodyAuthor, articleBodyPublishTime,
                       articleBodyParagraph, articleBodyOriginUrl):
        """Substitute the template's named placeholders with real values."""
        template = template.replace('ArticleHeadDescription',
                                    articleHeadDescription)
        template = template.replace('ArticleHeadAuthor', articleHeadAuthor)
        template = template.replace('ArticleHeadTitle', articleHeadTitle)
        template = template.replace('ArticleHeadOriginUrl',
                                    articleHeadOriginUrl)
        template = template.replace('ArticleBodyTitle', articleBodyTitle)
        template = template.replace('ArticleBodyAuthor', articleBodyAuthor)
        template = template.replace('ArticleBodyPublishTime',
                                    articleBodyPublishTime)
        template = template.replace('ArticleBodyParagraph',
                                    articleBodyParagraph)
        template = template.replace('ArticleBodyOriginUrl',
                                    articleBodyOriginUrl)
        return template

    def hasText(self, nodes):
        """True if ``nodes`` contains any content-bearing tag.

        Despite the name, it checks tag names (img/a/p/span/section),
        skipping bare text nodes.
        """
        for node in nodes:
            if isinstance(node, NavigableString):
                continue
            if node.name == 'img' or \
               node.name == 'a' or \
               node.name == 'p' or \
               node.name == 'span' or \
               node.name == 'section':
                return True
        return False

    def goDeepToArticleBody(self, contents):
        """Descend through wrapper tags until content-bearing children
        are found; returns that child list.

        NOTE(review): implicitly returns None when every child is a text
        node and the loop falls through -- callers should tolerate None.
        """
        if isinstance(contents, NavigableString):
            return contents
        if len(contents) == 0:
            return contents
        if self.hasText(contents):
            return contents
        if len(contents) > 0:
            for n in contents:
                if isinstance(n, NavigableString):
                    continue
                return self.goDeepToArticleBody(n.contents)

    def storeFiles(self, data, page_source, content_regx_rule):
        """Rebuild ``data``'s article as a self-hosted HTML file.

        Returns an updated copy of ``data`` (new URL, re-hosted image
        URLs) on success, or the original ``data`` when self-HTML is
        disabled or any step fails.
        """
        if self.needselfhtml == False:
            return data
        try:
            # Fresh counter per article; shallow copy so the original
            # record survives a failure path.
            self.image_count = 0
            newData = copy.copy(data)
            newArticleId = self.doraemon.getMD5('{0}_{1}'.format(
                data.author_name, data.id))
            newData.url = '{0}{1}.html'.format(self.articleurl, newArticleId)
            template = self.file.readFromTxt(self.templatepath)
            match = self.parseContentRegxRule(content_regx_rule)
            if match.tag is None or \
               match.key is None or \
               match.value is None:
                print 'No match rule available for html'
                return data
            # Locate the article container via a CSS attribute selector
            # built from the parsed rule.
            soup = BeautifulSoup(page_source, 'lxml')
            matchTags = soup.select('{0}[{1}="{2}"]'.format(
                match.tag, match.key, match.value))
            if len(matchTags) == 0:
                print 'No tag matched for html'
                return data
            nodes = self.goDeepToArticleBody(matchTags[0].contents)
            articleContent = ''
            for node in nodes:
                if isinstance(node, NavigableString):
                    continue
                if self.doraemon.isEmpty(node):
                    continue
                newNode = ''
                updateNodeInfo = self.nodeTraversal(data.url, node, newNode,
                                                    newArticleId)
                articleContent = '{0}{1}'.format(articleContent,
                                                 updateNodeInfo.node)
                # Swap re-hosted image URLs into the record's image list.
                if updateNodeInfo.isImageNode:
                    if updateNodeInfo.imageOriginUrl in newData.images:
                        for i in newData.images:
                            if updateNodeInfo.imageOriginUrl in i or \
                               updateNodeInfo.imageOriginUrl == i:
                                newData.images[newData.images.index(
                                    i)] = updateNodeInfo.imageNewUrl
                    else:
                        newData.images.append(updateNodeInfo.imageNewUrl)
            template = self.updateTemplate(template, newData.title,
                                           '深度资讯DeepINews', newData.title,
                                           newData.url, newData.title,
                                           newData.source, newData.public_time,
                                           articleContent, data.url)
            # Store the rendered page, then copy it to the local publish
            # directory; only a successful copy returns the new record.
            if self.doraemon.storeHtml(newArticleId, template, self.htmlpath):
                htmlName = '{0}.html'.format(newArticleId)
                fromFile = '{0}/{1}'.format(self.htmlpath, htmlName)
                toFile = '{0}/{1}'.format(self.localhtmlpath, htmlName)
                if self.doraemon.copyFile(fromFile, toFile):
                    print 'Copy file {0} done.'.format(fromFile)
                    return newData
                else:
                    message1 = 'Copy file {0} fail.'.format(fromFile)
                    print message1
                    self.file.logger(self.logpath, message1)
            return data
        except Exception as e:
            message2 = 'Exception {0} when update : {1}'.format(
                e.message, data.url)
            print message2
            self.file.logger(self.logpath, message2)
            return data
Example #4
0
class CamelBone():
    """URL-harvesting crawler skeleton ("camel").

    Opens a list of seed URLs in a browser pool, hands each rendered
    page to a callback that extracts article links, and stores new
    (non-duplicate) items into MongoDB.
    """

    def __init__(self, siteinfo=None, callback=callable):
        # NOTE(review): default ``callback=callable`` (the builtin) is a
        # placeholder; a real extractor is expected from the caller.
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        """Load global + per-site settings and create work directories."""
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd2 = self.settings.WORK_PATH_PRD2
        self.mongo = self.settings.MONGO_URLS
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_URL
        self.urls = self.settings.URLS
        self.max_concurrency = self.globalSettings.MAX_CONCURRENCY
        self.concurrency_file = self.globalSettings.CONCURRENCY_FILE
        self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH
        self.url_timeout = self.settings.URL_TIMEOUT
        self.createPath()

    def createPath(self):
        """Ensure work, log and URL-backup paths all exist."""
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.url_backup_folder_path)

    def parse(self, response):
        """Browser-pool callback: extract items and store new ones.

        ``response['response']`` is the live browser driver; its
        page_source is parsed with lxml and fed to the site callback.
        Titles already present in the bloom filter (presumably --
        doraemon.bf_urls) are skipped.
        """
        time.sleep(1)
        # NOTE(review): encoding the URL to gbk suggests Chinese-site
        # URLs; confirm downstream consumers expect gbk bytes.
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        results = self.callBack(current_url, html)
        if len(results) == 0:
            message1 = 'No url for page: {0}'.format(current_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in results:
            is_title_empty = self.doraemon.isEmpty(item.title)
            if (is_title_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_urls, item.title) is False):
                message2 = 'Start to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeMongodb(
                    self.mongo, self.doraemon.createCamelMongoJson(item))
                message3 = 'End to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message3)
                print message3
                self.file.logger(self.log_path,
                                 'Done for {0}'.format(item.url))
            else:
                if is_title_empty is True:
                    message4 = 'Empty title for {0}'.format(item.url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    print 'Finished title for {0}'.format(item.url)
        print 'End to parse {0}'.format(current_url)

        # Free large intermediates eagerly between pages.
        del current_url, results, html
        gc.collect()

    def start(self, isdebug=False):
        """Read seed URLs from file and crawl them with a browser pool.

        ``isdebug=True`` bypasses the scheduler's readiness check.
        """
        if self.doraemon.isCamelReadyToRun(
                self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6

        # Seed URLs: one per line; paired with an empty title slot.
        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')

        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])

        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls,
                                       self.url_timeout,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        # Restore the shared concurrency budget after the crawl.
        self.doraemon.recoveryConcurrency(self.concurrency_file,
                                          self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(
            str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7

        del new_urls, content, url_list, request
        gc.collect()
Example #5
0
class SpiderBone():
    """Content-scraping spider skeleton.

    Fetches previously harvested article URLs with a browser pool,
    passes each rendered page to a site-specific callback, and stores
    the parsed article into MongoDB and a per-site text file.
    """

    def __init__(self, siteinfo=None, callback=callable):
        # NOTE(review): default ``callback=callable`` (the builtin) is a
        # placeholder; a real content parser is expected from the caller.
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        """Load global + per-site settings and create work directories."""
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd1 = self.settings.WORK_PATH_PRD1
        self.finished_txt_path = self.settings.FINISHED_TXT_PATH
        self.finished_html_path = self.settings.FINISHED_HTML_PATH
        self.finished_image_path = self.settings.FINISHED_IMG_PATH
        self.template_path = self.globalSettings.TEMPLATE_PATH
        self.article_url = self.globalSettings.ARTICLE_URL
        self.ali_domain = self.globalSettings.ALI_DOMAIN
        self.ali_domain_deepinews = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS
        self.ali_domain_deepinews_img = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS_IMG
        self.ip_webserver0 = self.globalSettings.IP_WEBSERVER0
        self.port_webserver0 = self.globalSettings.PORT_WEBSERVER0
        self.user_root_webserver0 = self.globalSettings.USER_ROOT_WEBSERVER0
        self.user_root_password_webserver0 = self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0
        self.html_webserver0 = self.globalSettings.HTML_WEBSERVER0
        self.mongo = self.settings.MONGO
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_CONTENT
        self.url_path = self.settings.URL_PATH
        self.is_open_cache = self.settings.IS_OPEN_CACHE
        self.finished_backup_folder_path = self.settings.FINISHED_BACKUP_FOLDER_PATH
        self.max_concurrency_spider = self.globalSettings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = self.globalSettings.CONCURRENCY_FILE_SPIDER
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.local_html_path = self.globalSettings.LOCAL_HTML_PATH
        self.content_timeout = self.settings.CONTENT_TIMEOUT
        self.createPath()

    def createPath(self):
        """Ensure all working/output directories exist."""
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.finished_backup_folder_path)
        self.doraemon.createFilePath(self.monitor_upload_local)
        self.doraemon.createFilePath(self.local_html_path)

    def parse(self, response):
        """Browser-pool callback: parse one article page and persist it.

        On success stores the article to Mongo plus a text file and marks
        the title finished in the bloom filter (presumably --
        doraemon.bf_content); on a None result only marks it finished.
        """
        time.sleep(1)
        # NOTE(review): gbk-encoding the URL mirrors CamelBone.parse;
        # confirm downstream consumers expect gbk bytes.
        current_url = response['response'].current_url.encode('gbk')
        request_title = response['request_title']
        print 'Start to parse: {0}'.format(current_url)
        page_source = response['response'].page_source
        html = etree.HTML(page_source)
        results = None
        try:
            results = self.callBack(current_url, html, page_source)
            if results == None:
                message1 = 'No content for: {0}'.format(current_url)
                print message1
                self.file.logger(self.log_path, message1)
                return
            dataToMongo = self.doraemon.createSpiderMongoJson(results)
        except Exception as e:
            # NOTE(review): if createSpiderMongoJson raises, ``results``
            # is non-None but ``dataToMongo`` is unbound, so the store
            # branch below would raise NameError -- confirm.
            message1 = 'Exception when parse: {0} for {1}'.format(current_url, e.message)
            print message1
            self.file.logger(self.log_path, message1)
        print 'End to parse: {0}'.format(current_url)
        if results == None:
            # Parse failed or produced nothing: mark done so the URL is
            # not retried forever.
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
            print 'No data for {0}'.format(request_title)
        else:
            message2 = 'Start to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message2)
            print message2
            self.doraemon.storeMongodb(self.mongo, dataToMongo)
            message3 = 'End to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message3)
            print message3
            # Also persist raw content to the finished-txt folder.
            self.doraemon.storeTxt(results.id, results.content, self.finished_txt_path, self.name)
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)

    def start(self):
        """Fetch all not-yet-finished URLs with the browser pool."""
        if self.doraemon.isSpiderReadyToRun() is False:
            message4 = 'It is not ready to run spider: {0}'.format(self.name)
            print message4
            return
        message5 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message5)
        print message5
        message6 = 'Start requests: {0} '.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        # Only URLs not already in the finished bloom filter are returned.
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        if len(new_url_titles) == 0:
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
            message7 = 'No new url for {0}'.format(self.name)
            self.file.logger(self.log_path, message7)
            print message7
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles, self.content_timeout, self.max_pool_size, self.log_path, None, callback=self.parse)
        # Restore the shared concurrency budget after the crawl.
        self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
        message8 = 'End requests for {0}'.format(str(len(content)))
        self.file.logger(self.log_path, message8)
        print message8
        del content, new_url_titles, request
        gc.collect()