Ejemplos de Doraemon.storeFinished en Python

Lenguaje de programación: Python

Namespace/Package Name: middlewares.doraemonMiddleware

Clase / Tipo: Doraemon

Método / Función: storeFinished

Ejemplos en hotexamples.com: 3

Python Doraemon.storeFinished - 3 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de middlewares.doraemonMiddleware.Doraemon.storeFinished extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

createFilePath(8)

Doraemon(5)

isEmpty(4)

isExceedRestartInterval(4)

storeFinished(3)

readNewImageIds(2)

storeMongodb(2)

getAllHasSet(2)

getDateFromString(2)

hashSet(2)

storeTxt(1)

storeHtml(1)

readNewUrls(1)

isDuplicated(1)

isFileExists(1)

compressImage(1)

getFileSize(1)

downloadImage(1)

delKey(1)

delHashSet(1)

tar(1)

Ejemplo n.º 1

Mostrar archivo

class WeixinSalticidae():
    """Download the images of stored Weixin article pages and rewrite the HTML.

    For every new article id the origin HTML is read, the publish-time
    element is rewritten, every ``<img .../>`` tag with a supported type is
    downloaded (and compressed when large) and re-pointed at the configured
    image URL prefix, and the processed page is stored.

    ``print`` is always called with one parenthesised argument so the output
    is unchanged on Python 2 and the module also parses on Python 3.
    """

    def __init__(self):
        """Load the settings, build the collaborators and create the work folders."""
        self.settings = Settings()
        # getSettings() has to run first: the paths created below come from it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        for path in (self.work_path_prd1, self.settings.LOG_PATH,
                     self.finished_img_path):
            self.doraemon.createFilePath(path)

    def getSettings(self):
        """Copy the 'weixin' settings onto the instance and compile the regexes."""
        cfg = self.settings.CreateSettings('weixin')
        self.source = cfg['SOURCE_NAME']
        self.work_path_prd1 = cfg['WORK_PATH_PRD1']
        self.finished_img_path = cfg['FINISHED_IMG_PATH']
        self.finished_origin_html_path = cfg['FINISHED_ORIGIN_HTML_PATH']
        self.finished_processed_html_path = cfg['FINISHED_PROCESSED_HTML_PATH']
        self.finished_content_path = cfg['FINISHED_CONTENT_PATH']
        self.mongo = cfg['MONGO']
        self.name = cfg['NAME']
        self.max_pool_size = cfg['MAX_POOL_SIZE']
        self.url_deepinews_10002_article = self.settings.URL_DEEPINEWS_10002_ARTICLE
        self.url_deepinews_10002_image = self.settings.URL_DEEPINEWS_10002_IMAGE
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.restart_path = cfg['RESTART_PATH']
        self.restart_interval = cfg['RESTART_INTERVAL']
        # The <img> pattern captures only the attribute text between "<img"
        # and "/>"; the other patterns pick single attributes out of it.
        self.regx_img = re.compile('<img(.*?)/>')
        self.regx_date = re.compile(
            '<em id="publish_time" class="rich_media_meta rich_media_meta_text">(.*?)</em>'
        )
        self.regx_img_type = re.compile('data-type="(.*?)"')
        self.regx_img_data_src = re.compile('data-src="(.*?)"')
        # NOTE: this also matches the tail of a data-src="..." attribute.
        self.regx_img_src = re.compile('src="(.*?)"')
        self.regx_img_class = re.compile('class="(.*?)"')

    def getPostFixOfImage(self, image_type):
        """Return the file extension for an image ``data-type`` value.

        Returns 'jpg' for 'jpeg', 'png' for 'png' and 'gif' for 'gif'.  Any
        other value is reported on stdout and yields None.
        """
        if image_type == 'jpeg':
            return 'jpg'
        elif image_type == 'png':
            return 'png'
        elif image_type == 'gif':
            return 'gif'
        print('Other type: {0}'.format(image_type))
        return None

    def _rewritePublishTimes(self, html_file, date_list):
        """Replace each captured publish time with doraemon.getDateFromString(time)."""
        template = '<em id="publish_time" class="rich_media_meta rich_media_meta_text">{0}</em>'
        for old_time in date_list:
            new_date = self.doraemon.getDateFromString(old_time)
            html_file = html_file.replace(template.format(old_time),
                                          template.format(new_date))
        return html_file

    def _rewriteImageTag(self, img, new_imgurl):
        """Return the attribute text of one <img> tag with class and URLs rewritten.

        Every captured class value becomes 'rich_pages' and every captured
        src value becomes ``new_imgurl``.  Empty captures are skipped because
        ``str.replace('', x)`` would insert ``x`` between every character.
        """
        src_list = re.findall(self.regx_img_src, img)
        img_class_list = re.findall(self.regx_img_class, img)
        for img_class in img_class_list:
            if img_class:
                img = img.replace(img_class, 'rich_pages')
        for src in src_list:
            if src:
                img = img.replace(src, new_imgurl)
        return img

    def start_requests(self):
        """Process every new article id: download its images and store the rewritten page.

        Images whose data-src, src or data-type is empty, or whose type has no
        known extension, are left untouched in the page.  The page that is
        stored is always the current HTML, so an article without any publish
        time or usable image is stored unchanged rather than as an empty file.
        """
        self.file.logger(self.log_path,
                         'Start dowload images for: {0} '.format(self.name))
        print('Start dowload images for: {0} '.format(self.name))
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_image_id, self.finished_content_path)
        if not new_ids:
            self.file.logger(self.log_path,
                             'No new image id for {0}'.format(self.name))
            print('No new image id for {0}'.format(self.name))
            return
        self.doraemon.createFilePath(self.finished_processed_html_path)
        self.doraemon.createFilePath(self.finished_img_path)
        for article_id in new_ids:
            print('Start to remove pictures in: {0}'.format(article_id))
            html_file = self.file.readFromHtml("{0}/{1}.html".format(
                self.finished_origin_html_path, article_id))
            # Both lists are captured from the untouched page before any
            # replacement is made.
            img_list = re.findall(self.regx_img, html_file)
            date_list = re.findall(self.regx_date, html_file)
            html_file = self._rewritePublishTimes(html_file, date_list)
            # Index of the next stored image; only advanced for images that
            # are actually processed.
            number = 0
            for img in img_list:
                image_data_src = ''.join(
                    re.findall(self.regx_img_data_src, img)).strip()
                image_src = re.findall(self.regx_img_src, img)
                image_type = ''.join(re.findall(self.regx_img_type,
                                                img)).strip()
                if (self.doraemon.isEmpty(image_data_src) is True) or \
                   (self.doraemon.isEmpty(image_src) is True) or \
                   (self.doraemon.isEmpty(image_type) is True):
                    continue
                image_post_fix = self.getPostFixOfImage(image_type)
                if image_post_fix is None:
                    # Unsupported type: skip instead of writing "<id>.None".
                    continue
                image_id = "{0}_{1}".format(article_id, number)
                origin_image_path = "{0}/{1}.{2}".format(
                    self.finished_img_path, image_id, image_post_fix)
                print('Start to download image: {0}'.format(image_data_src))
                self.doraemon.downloadImage(image_data_src, origin_image_path)
                image_size = self.doraemon.getFileSize(origin_image_path)
                # NOTE(review): the unit of getFileSize() (and so of the 60
                # threshold) is not visible here - confirm.
                if image_size > 60:
                    print('Start to compress image: {0}'.format(image_data_src))
                    # Compressed in place: source and target path are the same.
                    self.doraemon.compressImage(origin_image_path,
                                                origin_image_path, 2)
                    print('Finished to compress image: {0}'.format(
                        image_data_src))
                print('Finished to download image: {0}'.format(image_data_src))
                print('Start to replace image url: {0}'.format(image_id))
                new_imgurl = "{0}{1}.{2}".format(
                    self.url_deepinews_10002_image, image_id, image_post_fix)
                html_file = html_file.replace(
                    img, self._rewriteImageTag(img, new_imgurl))
                print('Finished to replace image url: {0}'.format(image_id))
                number += 1
            self.doraemon.storeHtml(article_id, html_file,
                                    self.finished_processed_html_path)
            self.doraemon.storeFinished(self.doraemon.bf_finished_image_id,
                                        article_id)

Ejemplo n.º 2

Mostrar archivo

class Huxiu():
    """Parse Huxiu article pages and store their metadata and text.

    ``start_requests`` reads the not-yet-processed urls and hands them to a
    browser request pool; ``parse`` is the per-page callback that extracts
    the fields, stores them through ``doraemon`` and marks the request as
    finished.

    ``print`` is always called with one parenthesised argument so the output
    is unchanged on Python 2 and the module also parses on Python 3.
    """

    def __init__(self):
        """Load the settings, build the collaborators and create the work folders."""
        self.settings = Settings()
        # getSettings() has to run first: work_path_prd1 comes from it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'huxiu' settings and the fixed NLP paths onto the instance."""
        cfg = self.settings.CreateSettings('huxiu')
        self.source = cfg['SOURCE_NAME']
        self.work_path_prd1 = cfg['WORK_PATH_PRD1']
        # Output locations and pool size are hard-coded for this job rather
        # than read from the settings.
        self.finished_txt_path = '/home/dev/Data/rsyncData/huxiu_nlp/text/'
        self.url_path = '/home/dev/Data/rsyncData/huxiu_nlp/huxiu_nlp.csv'
        self.mongo = 'huxiu_nlp'
        self.name = cfg['NAME']
        self.max_pool_size = 4
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.is_open_cache = cfg['IS_OPEN_CACHE']

    @staticmethod
    def _digitsOf(raw):
        """Return only the digit characters of ``raw``, in order.

        On a Python 2 byte string this gives the same result as
        ``str(filter(str.isdigit, raw))``.
        """
        return ''.join(ch for ch in raw if ch.isdigit())

    def parse(self, response):
        """Extract one article page, store it and mark the request as finished.

        ``response`` is a mapping with a 'response' entry exposing
        ``current_url`` and ``page_source`` and a 'request_title' entry.
        The record is stored only when both content and title were found;
        the request is marked as finished in either case.
        """
        page = response['response']
        current_url = page.current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(page.page_source)

        # Defaults used for every field the page does not provide.
        title = ""
        comment_number = ""
        share_number = ""
        image_url = ""
        content = ""
        public_time = ""
        author_url = ""
        author_name = ""
        valid = False

        url = current_url
        # current_url is already a GBK byte string; encoding it again would
        # make Python 2 decode it as ASCII first and raise
        # UnicodeDecodeError for any non-ASCII url.
        article_id = self._digitsOf(current_url)

        title1 = html.xpath(".//*[contains(@class,'t-h1')]/text()")
        comment_number1 = html.xpath(
            ".//*[contains(@class, 'article-pl pull-left')]/text()")
        share_number1 = html.xpath(
            ".//*[contains(@class, 'article-share pull-left')]/text()")
        image_url1 = html.xpath(
            ".//*[contains(@class, 'article-img-box')]/img/@src")
        content1 = html.xpath(
            ".//div[contains(@class, 'article-content-wrap')]//text()")
        time1 = html.xpath(".//*[contains(@class, 'article-time')]/text()")
        author_url1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/@href")
        author_name1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/text()")

        if self.doraemon.isEmpty(title1) is False:
            title = title1[0].strip()
        if self.doraemon.isEmpty(comment_number1) is False:
            comment_number = self._digitsOf(comment_number1[0].encode('gbk'))
        if self.doraemon.isEmpty(share_number1) is False:
            share_number = self._digitsOf(share_number1[0].encode('gbk'))
        if self.doraemon.isEmpty(image_url1) is False:
            image_url = image_url1[0].strip()
        if self.doraemon.isEmpty(content1) is False:
            content = ''.join(content1).strip()
            # A page counts as valid as soon as it has any article text.
            valid = True
        if self.doraemon.isEmpty(time1) is False:
            public_time = self.doraemon.getDateFromString(
                ''.join(time1).strip())
        if self.doraemon.isEmpty(author_url1) is False:
            author_url = urlparse.urljoin(current_url, author_url1[0].strip())
        if self.doraemon.isEmpty(author_name1) is False:
            author_name = ''.join(author_name1[0]).strip()

        data = {
            'title': title,
            'comment_number': comment_number,
            'share_number': share_number,
            'image_url': image_url,
            'url': url,
            'public_time': public_time,
            'author_url': author_url,
            'author_name': author_name,
            'id': article_id,
            'download_time': self.today,
            'is_open_cache': self.is_open_cache,
            'source': self.source
        }
        print('End to parse: {0}'.format(current_url))
        if valid and self.doraemon.isEmpty(title) is False:
            self.file.logger(self.log_path,
                             'Start to store mongo {0}'.format(data['url']))
            print('Start to store mongo {0}'.format(data['url']))
            self.doraemon.storeMongodb(self.mongo, data)
            self.file.logger(self.log_path,
                             'End to store mongo {0}'.format(data['url']))
            print('End to store mongo {0}'.format(data['url']))
            self.doraemon.storeTxt(article_id, content,
                                   self.finished_txt_path, self.name)
        # Stored or not, the request is marked as finished.
        self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp,
                                    response['request_title'])
        # Drop the references to the parsed page before forcing a collection.
        del page, current_url, html, title, comment_number, share_number, \
            image_url, url, content, public_time, author_url, author_name, \
            article_id, data
        gc.collect()

    def start_requests(self):
        """Read the new urls and run them through the browser pool with ``parse``."""
        self.file.logger(self.log_path, 'Start request: {0}'.format(self.name))
        print('Start ' + self.name + ' requests')
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_huxiu_nlp,
                                                   self.url_path)
        if not new_url_titles:
            self.file.logger(self.log_path,
                             'No new url for: {0}'.format(self.name))
            print('No new url for: {0}'.format(self.name))
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        finished_count = str(len(content))
        self.file.logger(self.log_path,
                         'End requests: {0}'.format(finished_count))
        print('End requests: {0}'.format(finished_count))
        del new_url_titles, request, content
        gc.collect()

Ejemplo n.º 3

Mostrar archivo

Archivo: transfer_files_2_production.py Proyecto: hulu7/news

class TransferToProduction():
    """Copy the finished images and processed HTML of new articles to temp folders.

    ``print`` is always called with one parenthesised argument so the output
    is unchanged on Python 2 and the module also parses on Python 3.
    """

    def __init__(self):
        """Load the settings, build the collaborators and create the work folders."""
        self.settings = Settings()
        # getSettings() has to run first: the paths created below come from it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.request = RequestsMiddleware()
        self.doraemon = Doraemon()
        for path in (self.work_path_prd2, self.settings.LOG_PATH,
                     self.temp_folder_html, self.temp_folder_img):
            self.doraemon.createFilePath(path)

    def getSettings(self):
        """Copy the 'weixin' settings and the temp-folder paths onto the instance."""
        cfg = self.settings.CreateSettings('weixin')
        self.source = cfg['SOURCE_NAME']
        self.work_path_prd2 = cfg['WORK_PATH_PRD2']
        self.mongo = cfg['MONGO_URLS']
        self.name = cfg['NAME']
        self.finished_content_path = cfg['FINISHED_CONTENT_PATH']
        self.finished_img_path = cfg['FINISHED_IMG_PATH']
        self.finished_processed_html_path = cfg['FINISHED_PROCESSED_HTML_PATH']
        self.temp_folder_html = self.settings.TEMP_FOLDER_HTML
        self.temp_folder_img = self.settings.TEMP_FOLDER_IMG
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY

    def start_transfer(self):
        """Copy the images and the processed page of every new id.

        An image belongs to an id when its file name starts with that id,
        compared literally.  An id is marked as finished only after its HTML
        file has been copied; when the HTML is missing the id is left
        unmarked.
        """
        print('Start {0} transfer'.format(self.name))
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_temp_weixin, self.finished_content_path)
        for article_id in new_ids:
            self.file.logger(self.log_path,
                             'Start transfer image: {0}'.format(article_id))
            for f in os.listdir(self.finished_img_path):
                # Literal prefix test: the id is data, not a regex pattern,
                # so characters such as '+' or '.' must not be interpreted.
                # NOTE(review): a bare prefix also accepts files of a longer
                # id that starts with this one - confirm the naming scheme.
                if not f.startswith(article_id):
                    print('Invalid image for not match: {0}'.format(f))
                    continue
                from_img_path = "{0}/{1}".format(self.finished_img_path, f)
                to_img_path = "{0}/{1}".format(self.temp_folder_img, f)
                # The file may have disappeared since the directory listing.
                if not os.path.exists(from_img_path):
                    self.file.logger(self.log_path,
                                     'img of {0} not exits.'.format(f))
                    continue
                copyfile(from_img_path, to_img_path)
                print('Finished to transfer image {0}'.format(f))
            self.file.logger(self.log_path,
                             'Start transfer html: {0}'.format(article_id))
            from_path = "{0}/{1}.html".format(
                self.finished_processed_html_path, article_id)
            to_path = "{0}/{1}.html".format(self.temp_folder_html, article_id)
            if not os.path.exists(from_path):
                self.file.logger(self.log_path,
                                 'html of {0} not exits.'.format(article_id))
                continue
            copyfile(from_path, to_path)
            print('Finished to transfer html {0}'.format(article_id))
            self.doraemon.storeFinished(self.doraemon.bf_finished_temp_weixin,
                                        article_id)
            print('Finished to transfer {0}'.format(article_id))