Ejemplo n.º 1
0
class WeixinSalticidae():
    """Download the images of stored weixin articles and rewrite their HTML.

    For every new article id the original HTML is read, the publish-time
    text is passed through ``Doraemon.getDateFromString``, every
    ``<img .../>`` tag that carries ``data-src``, ``src`` and ``data-type``
    is downloaded into ``finished_img_path`` and rewritten to point at
    ``url_deepinews_10002_image``, and the resulting HTML is stored under
    ``finished_processed_html_path``.
    """

    def __init__(self):
        """Load the settings, create the helpers and the working folders."""
        self.settings = Settings()
        # getSettings() must run first: the paths created below come from it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.finished_img_path)

    def getSettings(self):
        """Copy the 'weixin' settings onto the instance and compile the regexes."""
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_origin_html_path = settings_name[
            'FINISHED_ORIGIN_HTML_PATH']
        self.finished_processed_html_path = settings_name[
            'FINISHED_PROCESSED_HTML_PATH']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.mongo = settings_name['MONGO']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.url_deepinews_10002_article = self.settings.URL_DEEPINEWS_10002_ARTICLE
        self.url_deepinews_10002_image = self.settings.URL_DEEPINEWS_10002_IMAGE
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        # Captures the attribute text between '<img' and '/>' (not the whole tag).
        self.regx_img = re.compile('<img(.*?)/>')
        self.regx_date = re.compile(
            '<em id="publish_time" class="rich_media_meta rich_media_meta_text">(.*?)</em>'
        )
        self.regx_img_type = re.compile('data-type="(.*?)"')
        self.regx_img_data_src = re.compile('data-src="(.*?)"')
        # NOTE: 'src="' is also the tail of 'data-src="', so this pattern
        # matches both attributes.
        self.regx_img_src = re.compile('src="(.*?)"')
        self.regx_img_class = re.compile('class="(.*?)"')

    def getPostFixOfImage(self, image_type):
        """Return the file extension for a ``data-type`` value.

        Returns 'jpg', 'png' or 'gif' for the supported types. Any other
        value is reported on stdout and ``None`` is returned.
        """
        post_fix = {'jpeg': 'jpg', 'png': 'png', 'gif': 'gif'}.get(image_type)
        if post_fix is None:
            print('Other type: {0}'.format(image_type))
        return post_fix

    def _normalizePublishTime(self, html_file):
        """Return *html_file* with every publish-time text converted by
        ``Doraemon.getDateFromString``."""
        template = '<em id="publish_time" class="rich_media_meta rich_media_meta_text">{0}</em>'
        for old_time in self.regx_date.findall(html_file):
            new_date = self.doraemon.getDateFromString(old_time)
            html_file = html_file.replace(template.format(old_time),
                                          template.format(new_date))
        return html_file

    def _rewriteImgTag(self, img, new_imgurl):
        """Return the tag text with its class set to 'rich_pages' and every
        (data-)src attribute pointing at *new_imgurl*.

        The attributes are substituted as a whole instead of replacing the
        captured value with ``str.replace``: an empty value (``class=""``)
        would otherwise insert text between every character, and a value
        that also occurs elsewhere in the tag would be replaced there too.
        Callables are used as replacement so that backslashes in the url
        are never interpreted as regex escapes.
        """
        img = self.regx_img_class.sub(lambda _m: 'class="rich_pages"', img)
        return self.regx_img_src.sub(
            lambda _m: 'src="{0}"'.format(new_imgurl), img)

    def _localizeImages(self, article_id, html_file, img_list):
        """Download every usable image of one article and return the HTML
        with the matching tags rewritten to the hosted image url."""
        number = 0
        for img in img_list:
            old_img = img
            image_id = "{0}_{1}".format(article_id, number)
            image_data_src = ''.join(
                self.regx_img_data_src.findall(img)).strip()
            image_src = self.regx_img_src.findall(img)
            image_type = ''.join(self.regx_img_type.findall(img)).strip()
            if (self.doraemon.isEmpty(image_data_src) is True) or \
               (self.doraemon.isEmpty(image_src) is True) or \
               (self.doraemon.isEmpty(image_type) is True):
                continue
            image_post_fix = self.getPostFixOfImage(image_type)
            if image_post_fix is None:
                # Unsupported type: leave the tag untouched instead of
                # saving and linking a file named '<id>_<n>.None'.
                self.file.logger(
                    self.log_path,
                    'Skip image of unsupported type: {0}'.format(
                        image_data_src))
                continue
            origin_image_path = "{0}/{1}.{2}".format(
                self.finished_img_path, image_id, image_post_fix)
            print('Start to download image: {0}'.format(image_data_src))
            self.doraemon.downloadImage(image_data_src, origin_image_path)
            image_size = self.doraemon.getFileSize(origin_image_path)
            # NOTE(review): the unit returned by getFileSize and the meaning
            # of the last compressImage argument are not visible here.
            if image_size > 60:
                print('Start to compress image: {0}'.format(image_data_src))
                self.doraemon.compressImage(origin_image_path,
                                            origin_image_path, 2)
                print('Finished to compress image: {0}'.format(
                    image_data_src))
            print('Finished to download image: {0}'.format(image_data_src))
            print('Start to replace image url: {0}'.format(image_id))
            new_imgurl = "{0}{1}.{2}".format(
                self.url_deepinews_10002_image, image_id, image_post_fix)
            html_file = html_file.replace(
                old_img, self._rewriteImgTag(img, new_imgurl))
            print('Finished to replace image url: {0}'.format(image_id))
            number += 1
        return html_file

    def start_requests(self):
        """Process every article id that has not been handled yet.

        Returns early (after logging) when ``readNewImageIds`` yields no id.
        Each processed article is stored with ``Doraemon.storeHtml`` and then
        marked as finished with ``Doraemon.storeFinished``.
        """
        self.file.logger(self.log_path,
                         'Start dowload images for: {0} '.format(self.name))
        print('Start dowload images for: {0} '.format(self.name))
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_image_id, self.finished_content_path)
        if len(new_ids) == 0:
            self.file.logger(self.log_path,
                             'No new image id for {0}'.format(self.name))
            print('No new image id for {0}'.format(self.name))
            return
        self.doraemon.createFilePath(self.finished_processed_html_path)
        self.doraemon.createFilePath(self.finished_img_path)
        for article_id in new_ids:
            print('Start to remove pictures in: {0}'.format(article_id))
            html_file = self.file.readFromHtml("{0}/{1}.html".format(
                self.finished_origin_html_path, article_id))
            # Tags are collected from the untouched HTML, before the
            # publish time is rewritten.
            img_list = self.regx_img.findall(html_file)
            html_file = self._normalizePublishTime(html_file)
            html_file = self._localizeImages(article_id, html_file, img_list)
            # Always store the working copy: an article without any
            # replaced date or image must keep its original content
            # instead of being saved as an empty document.
            self.doraemon.storeHtml(article_id, html_file,
                                    self.finished_processed_html_path)
            self.doraemon.storeFinished(self.doraemon.bf_finished_image_id,
                                        article_id)
Ejemplo n.º 2
0
class Huxiu():
    """Fetch huxiu article pages in a browser pool and store their data.

    ``start_requests`` reads the not yet processed urls and hands them to
    ``BrowserRequest.start_chrome``; ``parse`` is the per-page callback that
    extracts the article fields, stores the metadata in MongoDB and the body
    text as a txt file.
    """

    def __init__(self):
        """Load the settings, create the helpers and the working folders."""
        self.settings = Settings()
        # getSettings() must run first: the paths created below come from it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'huxiu' settings onto the instance.

        The txt output folder, the url csv, the mongo name and the pool
        size are fixed values rather than settings entries.
        """
        settings_name = self.settings.CreateSettings('huxiu')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_txt_path = '/home/dev/Data/rsyncData/huxiu_nlp/text/'
        self.url_path = '/home/dev/Data/rsyncData/huxiu_nlp/huxiu_nlp.csv'
        self.mongo = 'huxiu_nlp'
        self.name = settings_name['NAME']
        self.max_pool_size = 4
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.is_open_cache = settings_name['IS_OPEN_CACHE']

    @staticmethod
    def _digitsOf(text):
        """Return the ASCII digits of *text*, in order, as a ``str``.

        Works on the text directly, so no ``encode('gbk')`` round trip
        (which fails on characters outside GBK) is needed, and it does not
        rely on ``filter`` returning a string.
        """
        return str(''.join(ch for ch in text if ch in '0123456789'))

    def parse(self, response):
        """Extract one article page and persist it.

        *response* is a mapping with the keys ``'response'`` (an object
        exposing ``current_url`` and ``page_source``) and
        ``'request_title'``. The page is stored only when it has a
        non-empty body and title; the request is marked as finished in
        both cases.
        """
        page = response['response']
        current_url = page.current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(page.page_source)

        title = ""
        comment_number = ""
        share_number = ""
        image_url = ""
        content = ""
        public_time = ""
        author_url = ""
        author_name = ""
        valid = False

        # The digits are taken from the un-encoded url: encoding the
        # already encoded url a second time fails for non-ASCII urls.
        article_id = self._digitsOf(page.current_url)
        title1 = html.xpath(".//*[contains(@class,'t-h1')]/text()")
        comment_number1 = html.xpath(
            ".//*[contains(@class, 'article-pl pull-left')]/text()")
        share_number1 = html.xpath(
            ".//*[contains(@class, 'article-share pull-left')]/text()")
        image_url1 = html.xpath(
            ".//*[contains(@class, 'article-img-box')]/img/@src")
        content1 = html.xpath(
            ".//div[contains(@class, 'article-content-wrap')]//text()")
        time1 = html.xpath(".//*[contains(@class, 'article-time')]/text()")
        author_url1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/@href")
        author_name1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/text()")

        if self.doraemon.isEmpty(title1) is False:
            title = title1[0].strip()
        if self.doraemon.isEmpty(comment_number1) is False:
            comment_number = self._digitsOf(comment_number1[0])
        if self.doraemon.isEmpty(share_number1) is False:
            share_number = self._digitsOf(share_number1[0])
        if self.doraemon.isEmpty(image_url1) is False:
            image_url = image_url1[0].strip()
        if self.doraemon.isEmpty(content1) is False:
            content = ''.join(content1).strip()
            valid = True
        if self.doraemon.isEmpty(time1) is False:
            public_time = self.doraemon.getDateFromString(
                ''.join(time1).strip())
        if self.doraemon.isEmpty(author_url1) is False:
            author_url = urlparse.urljoin(current_url, author_url1[0].strip())
        if self.doraemon.isEmpty(author_name1) is False:
            author_name = ''.join(author_name1[0]).strip()

        data = {
            'title': title,
            'comment_number': comment_number,
            'share_number': share_number,
            'image_url': image_url,
            'url': current_url,
            'public_time': public_time,
            'author_url': author_url,
            'author_name': author_name,
            'id': article_id,
            'download_time': self.today,
            'is_open_cache': self.is_open_cache,
            'source': self.source
        }
        print('End to parse: {0}'.format(current_url))
        if valid and self.doraemon.isEmpty(title) is False:
            self.file.logger(self.log_path,
                             'Start to store mongo {0}'.format(data['url']))
            print('Start to store mongo {0}'.format(data['url']))
            self.doraemon.storeMongodb(self.mongo, data)
            self.file.logger(self.log_path,
                             'End to store mongo {0}'.format(data['url']))
            print('End to store mongo {0}'.format(data['url']))
            self.doraemon.storeTxt(article_id, content,
                                   self.finished_txt_path, self.name)
        # Stored or not, the request is marked as finished.
        self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp,
                                    response['request_title'])
        # Drop the page objects before the explicit collection.
        del page, html, data, content
        gc.collect()

    def start_requests(self):
        """Request every new url through the browser pool.

        Returns early (after logging) when ``readNewUrls`` yields nothing.
        """
        self.file.logger(self.log_path, 'Start request: {0}'.format(self.name))
        print('Start ' + self.name + ' requests')
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_huxiu_nlp,
                                                   self.url_path)
        if len(new_url_titles) == 0:
            self.file.logger(self.log_path,
                             'No new url for: {0}'.format(self.name))
            print('No new url for: {0}'.format(self.name))
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        finished_message = 'End requests: {0}'.format(str(len(content)))
        self.file.logger(self.log_path, finished_message)
        print(finished_message)
        del new_url_titles, request, content
        gc.collect()
Ejemplo n.º 3
0
class TransferToProduction():
    """Copy the processed weixin HTML files and their images to the
    temporary transfer folders."""

    def __init__(self):
        """Load the settings, create the helpers and the working folders."""
        self.settings = Settings()
        # getSettings() must run first: the paths created below come from it.
        self.getSettings()
        self.file = FileIOMiddleware()
        self.request = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.temp_folder_html)
        self.doraemon.createFilePath(self.temp_folder_img)

    def getSettings(self):
        """Copy the 'weixin' settings used by the transfer onto the instance."""
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_processed_html_path = settings_name[
            'FINISHED_PROCESSED_HTML_PATH']
        self.temp_folder_html = self.settings.TEMP_FOLDER_HTML
        self.temp_folder_img = self.settings.TEMP_FOLDER_IMG
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY

    @staticmethod
    def _isImageOfArticle(file_name, article_id):
        """Tell whether *file_name* is named ``<article_id>_<number>.<ext>``.

        The id is escaped and followed by the '_<number>.' separator, so
        that id '12' no longer claims '123_0.jpg' and regex metacharacters
        inside an id are taken literally.
        """
        pattern = re.escape(article_id) + r'_\d+\.'
        return re.match(pattern, file_name) is not None

    def _transferImages(self, article_id):
        """Copy every image that belongs to *article_id* into the temporary
        image folder."""
        for f in os.listdir(self.finished_img_path):
            if not self._isImageOfArticle(f, article_id):
                print('Invalid image for not match: {0}'.format(f))
                continue
            from_img_path = "{0}/{1}".format(self.finished_img_path, f)
            to_img_path = "{0}/{1}".format(self.temp_folder_img, f)
            # The file may have disappeared since the folder was listed.
            if os.path.exists(from_img_path) is False:
                self.file.logger(self.log_path,
                                 'img of {0} not exits.'.format(f))
                continue
            copyfile(from_img_path, to_img_path)
            print('Finished to transfer image {0}'.format(f))

    def start_transfer(self):
        """Transfer the images and the processed HTML of every new article.

        An article is marked as finished only after its HTML file was
        copied; when the HTML file is missing the article is skipped
        (its images have been copied by then) and stays unfinished.
        """
        print('Start {0} transfer'.format(self.name))
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_temp_weixin, self.finished_content_path)
        for article_id in new_ids:
            self.file.logger(self.log_path,
                             'Start transfer image: {0}'.format(article_id))
            self._transferImages(article_id)
            self.file.logger(self.log_path,
                             'Start transfer html: {0}'.format(article_id))
            from_path = "{0}/{1}.html".format(
                self.finished_processed_html_path, article_id)
            to_path = "{0}/{1}.html".format(self.temp_folder_html, article_id)
            if os.path.exists(from_path) is False:
                self.file.logger(self.log_path,
                                 'html of {0} not exits.'.format(article_id))
                continue
            copyfile(from_path, to_path)
            print('Finished to transfer html {0}'.format(article_id))
            self.doraemon.storeFinished(self.doraemon.bf_finished_temp_weixin,
                                        article_id)
            print('Finished to transfer {0}'.format(article_id))