class WeixinSalticidae():
    """Localise the images of stored weixin article HTML.

    For every new article id the original HTML is read, each ``<img .../>``
    tag that carries a ``data-src`` and a supported ``data-type`` is
    downloaded, and the tag is rewritten to point at
    ``URL_DEEPINEWS_10002_IMAGE``. The processed HTML is then stored and the
    id is marked as finished.
    """

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.finished_img_path)

    def getSettings(self):
        """Copy the 'weixin' settings onto the instance and compile the regexes."""
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_origin_html_path = settings_name[
            'FINISHED_ORIGIN_HTML_PATH']
        self.finished_processed_html_path = settings_name[
            'FINISHED_PROCESSED_HTML_PATH']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.mongo = settings_name['MONGO']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.url_deepinews_10002_article = self.settings.URL_DEEPINEWS_10002_ARTICLE
        self.url_deepinews_10002_image = self.settings.URL_DEEPINEWS_10002_IMAGE
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        # Captures the attribute text between '<img' and '/>'.
        self.regx_img = re.compile('<img(.*?)/>')
        self.regx_date = re.compile(
            '<em id="publish_time" class="rich_media_meta rich_media_meta_text">(.*?)</em>'
        )
        self.regx_img_type = re.compile('data-type="(.*?)"')
        self.regx_img_data_src = re.compile('data-src="(.*?)"')
        # NOTE: 'src="' is also the tail of 'data-src="', so this pattern
        # matches both attributes.
        self.regx_img_src = re.compile('src="(.*?)"')
        self.regx_img_class = re.compile('class="(.*?)"')

    def getPostFixOfImage(self, image_type):
        """Map a ``data-type`` value to a file extension.

        Returns 'jpg', 'png' or 'gif'; for any other value a notice is
        printed and None is returned.
        """
        post_fix = {'jpeg': 'jpg', 'png': 'png', 'gif': 'gif'}.get(image_type)
        if post_fix is None:
            print('Other type: {0}'.format(image_type))
        return post_fix

    def start_requests(self):
        """Download and relink the images of every new article."""
        start_message = 'Start dowload images for: {0} '.format(self.name)
        self.file.logger(self.log_path, start_message)
        print(start_message)
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_image_id, self.finished_content_path)
        if len(new_ids) == 0:
            empty_message = 'No new image id for {0}'.format(self.name)
            self.file.logger(self.log_path, empty_message)
            print(empty_message)
            return
        self.doraemon.createFilePath(self.finished_processed_html_path)
        self.doraemon.createFilePath(self.finished_img_path)

        time_template = '<em id="publish_time" class="rich_media_meta rich_media_meta_text">{0}</em>'
        for article_id in new_ids:
            print('Start to remove pictures in: {0}'.format(article_id))
            html_file = self.file.readFromHtml("{0}/{1}.html".format(
                self.finished_origin_html_path, article_id))
            img_list = re.findall(self.regx_img, html_file)
            date_list = re.findall(self.regx_date, html_file)
            number = 0  # index of the next stored image of this article

            for old_time in date_list:
                new_date = self.doraemon.getDateFromString(old_time)
                html_file = html_file.replace(
                    time_template.format(old_time),
                    time_template.format(new_date))

            for img in img_list:
                old_img = img
                image_id = "{0}_{1}".format(article_id, number)
                image_data_src = ''.join(
                    re.findall(self.regx_img_data_src, img)).strip()
                image_src = re.findall(self.regx_img_src, img)
                image_type = ''.join(
                    re.findall(self.regx_img_type, img)).strip()
                image_post_fix = self.getPostFixOfImage(image_type)
                if (self.doraemon.isEmpty(image_data_src) is True) or \
                        (self.doraemon.isEmpty(image_src) is True) or \
                        (self.doraemon.isEmpty(image_type) is True):
                    continue
                if image_post_fix is None:
                    # Unsupported type: leave the tag untouched instead of
                    # storing and linking a '<id>_<n>.None' file.
                    continue

                origin_image_path = "{0}/{1}.{2}".format(
                    self.finished_img_path, image_id, image_post_fix)
                print('Start to download image: {0}'.format(image_data_src))
                self.doraemon.downloadImage(image_data_src, origin_image_path)
                image_size = self.doraemon.getFileSize(origin_image_path)
                # NOTE(review): unit of the 60 threshold and meaning of the
                # trailing 2 are defined by Doraemon - confirm there.
                if image_size > 60:
                    print('Start to compress image: {0}'.format(
                        image_data_src))
                    self.doraemon.compressImage(origin_image_path,
                                                origin_image_path, 2)
                    print('Finished to compress image: {0}'.format(
                        image_data_src))
                print('Finished to download image: {0}'.format(
                    image_data_src))

                print('Start to replace image url: {0}'.format(image_id))
                new_imgurl = "{0}{1}.{2}".format(
                    self.url_deepinews_10002_image, image_id, image_post_fix)
                new_src_attr = 'src="{0}"'.format(new_imgurl)
                # Rewrite whole attributes rather than str.replace() on the
                # bare value: an empty value (class="") would otherwise be
                # inserted between every character, and a value that also
                # occurs elsewhere in the tag would corrupt it.
                img = self.regx_img_class.sub('class="rich_pages"', img)
                # A callable replacement keeps the URL literal (no backslash
                # or group-reference expansion).
                img = self.regx_img_src.sub(lambda _match: new_src_attr, img)
                html_file = html_file.replace(old_img, img)
                print('Finished to replace image url: {0}'.format(image_id))
                number += 1

            # Store the working copy: it equals the original HTML when nothing
            # had to be rewritten, so an article is never stored empty.
            self.doraemon.storeHtml(article_id, html_file,
                                    self.finished_processed_html_path)
            self.doraemon.storeFinished(self.doraemon.bf_finished_image_id,
                                        article_id)
class Huxiu():
    """Crawl huxiu article pages and store their metadata and text.

    ``start_requests`` reads the pending [url, title] pairs and hands them to
    a ``BrowserRequest``; ``parse`` is the per-page callback that extracts
    the fields, stores them and marks the request as finished.
    """

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'huxiu' settings (plus the fixed NLP paths) onto the instance."""
        settings_name = self.settings.CreateSettings('huxiu')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_txt_path = '/home/dev/Data/rsyncData/huxiu_nlp/text/'
        self.url_path = '/home/dev/Data/rsyncData/huxiu_nlp/huxiu_nlp.csv'
        self.mongo = 'huxiu_nlp'
        self.name = settings_name['NAME']
        self.max_pool_size = 4
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.is_open_cache = settings_name['IS_OPEN_CACHE']

    @staticmethod
    def _extractDigits(text):
        """Return only the digit characters of ``text``, in order."""
        return ''.join(char for char in text if char.isdigit())

    def parse(self, response):
        """Extract one article page, store it and mark the request finished.

        ``response['response']`` must expose ``current_url`` and
        ``page_source``; ``response['request_title']`` is the key recorded as
        finished. The record is stored only when both a title and body text
        were found; the request is marked finished either way.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)

        title1 = html.xpath(".//*[contains(@class,'t-h1')]/text()")
        comment_number1 = html.xpath(
            ".//*[contains(@class, 'article-pl pull-left')]/text()")
        share_number1 = html.xpath(
            ".//*[contains(@class, 'article-share pull-left')]/text()")
        image_url1 = html.xpath(
            ".//*[contains(@class, 'article-img-box')]/img/@src")
        content1 = html.xpath(
            ".//div[contains(@class, 'article-content-wrap')]//text()")
        time1 = html.xpath(".//*[contains(@class, 'article-time')]/text()")
        author_url1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/@href")
        author_name1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/text()")

        url = current_url
        # current_url is already GBK-encoded; encoding it a second time made
        # Python 2 decode it as ASCII first and fail on non-ASCII URLs.
        article_id = self._extractDigits(current_url)

        title = ""
        comment_number = ""
        share_number = ""
        image_url = ""
        content = ""
        public_time = ""
        author_url = ""
        author_name = ""
        valid = False

        if self.doraemon.isEmpty(title1) is False:
            title = title1[0].strip()
        if self.doraemon.isEmpty(comment_number1) is False:
            comment_number = self._extractDigits(
                comment_number1[0].encode('gbk'))
        if self.doraemon.isEmpty(share_number1) is False:
            share_number = self._extractDigits(
                share_number1[0].encode('gbk'))
        if self.doraemon.isEmpty(image_url1) is False:
            image_url = image_url1[0].strip()
        if self.doraemon.isEmpty(content1) is False:
            content = ''.join(content1).strip()
            valid = True
        if self.doraemon.isEmpty(time1) is False:
            public_time = self.doraemon.getDateFromString(
                ''.join(time1).strip())
        if self.doraemon.isEmpty(author_url1) is False:
            author_url = urlparse.urljoin(current_url, author_url1[0].strip())
        if self.doraemon.isEmpty(author_name1) is False:
            author_name = ''.join(author_name1[0]).strip()

        data = {
            'title': title,
            'comment_number': comment_number,
            'share_number': share_number,
            'image_url': image_url,
            'url': url,
            'public_time': public_time,
            'author_url': author_url,
            'author_name': author_name,
            'id': article_id,
            'download_time': self.today,
            'is_open_cache': self.is_open_cache,
            'source': self.source
        }
        print('End to parse: {0}'.format(current_url))

        if valid and self.doraemon.isEmpty(title) is False:
            start_message = 'Start to store mongo {0}'.format(data['url'])
            self.file.logger(self.log_path, start_message)
            print(start_message)
            self.doraemon.storeMongodb(self.mongo, data)
            end_message = 'End to store mongo {0}'.format(data['url'])
            self.file.logger(self.log_path, end_message)
            print(end_message)
            self.doraemon.storeTxt(article_id, content,
                                   self.finished_txt_path, self.name)
        # Finished is recorded whether or not the page was storable, so an
        # unparsable page is not requested again.
        self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp,
                                    response['request_title'])

        # Drop the large per-page objects before forcing a collection.
        del html, content, data
        gc.collect()

    def start_requests(self):
        """Request every pending url with ``parse`` as the per-page callback."""
        self.file.logger(self.log_path,
                         'Start request: {0}'.format(self.name))
        print('Start ' + self.name + ' requests')
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_huxiu_nlp,
                                                   self.url_path)
        if len(new_url_titles) == 0:
            empty_message = 'No new url for: {0}'.format(self.name)
            self.file.logger(self.log_path, empty_message)
            print(empty_message)
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        end_message = 'End requests: {0}'.format(str(len(content)))
        self.file.logger(self.log_path, end_message)
        print(end_message)
        del new_url_titles, request, content
        gc.collect()
class TransferToProduction():
    """Copy the processed weixin HTML and its images into the temp folders."""

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.request = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.temp_folder_html)
        self.doraemon.createFilePath(self.temp_folder_img)

    def getSettings(self):
        """Copy the 'weixin' settings and the shared temp folders onto the instance."""
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_processed_html_path = settings_name[
            'FINISHED_PROCESSED_HTML_PATH']
        self.temp_folder_html = self.settings.TEMP_FOLDER_HTML
        self.temp_folder_img = self.settings.TEMP_FOLDER_IMG
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY

    def start_transfer(self):
        """Copy the images and the HTML of every new id to the temp folders.

        An id is marked finished only after its HTML file was copied; when
        the HTML is missing the id is left pending.
        """
        print('Start {0} transfer'.format(self.name))
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_temp_weixin, self.finished_content_path)
        for article_id in new_ids:
            self.file.logger(self.log_path,
                             'Start transfer image: {0}'.format(article_id))
            # Images are stored as '<id>_<n>.<ext>'. Escape the id and require
            # the name to continue with an optional '_<n>' and a dot, so that
            # id '12' does not also pick up '123_0.jpg' and regex
            # metacharacters in an id are taken literally.
            regx_img_file = re.compile(
                r'{0}(?:_\d+)?\.'.format(re.escape(article_id)))
            for file_name in os.listdir(self.finished_img_path):
                if regx_img_file.match(file_name) is None:
                    print('Invalid image for not match: {0}'.format(
                        file_name))
                    continue
                from_img_path = "{0}/{1}".format(self.finished_img_path,
                                                 file_name)
                to_img_path = "{0}/{1}".format(self.temp_folder_img,
                                               file_name)
                if os.path.exists(from_img_path) is False:
                    self.file.logger(
                        self.log_path,
                        'img of {0} not exits.'.format(file_name))
                    continue
                copyfile(from_img_path, to_img_path)
                print('Finished to transfer image {0}'.format(file_name))

            self.file.logger(self.log_path,
                             'Start transfer html: {0}'.format(article_id))
            from_path = "{0}/{1}.html".format(
                self.finished_processed_html_path, article_id)
            to_path = "{0}/{1}.html".format(self.temp_folder_html, article_id)
            if os.path.exists(from_path) is False:
                self.file.logger(
                    self.log_path,
                    'html of {0} not exits.'.format(article_id))
                continue
            copyfile(from_path, to_path)
            print('Finished to transfer html {0}'.format(article_id))
            self.doraemon.storeFinished(self.doraemon.bf_finished_temp_weixin,
                                        article_id)
            print('Finished to transfer {0}'.format(article_id))