class WPImporter:
    """Import data exported from WordPress into the admin site's JSON API.

    Each ``import_*`` method reads a JSON dump located under
    ``<project_dir>/output`` (or ``<project_dir>/deps``) and sends the
    records to ``<admin_url>/api/v1/...`` through an authenticated
    ``requests`` session.  ``login()`` must be called first because it
    creates that session (``self.s``).

    The class relies on names that are expected to be imported at module
    level outside of this class: ``config``, ``Debug``, ``requests``,
    ``BeautifulSoup``, ``HTMLParser``, ``Image``, ``Levenshtein``,
    ``helpers``, ``urllib``, ``shutil``, ``json``, ``os`` and ``re``.

    NOTE: the code targets Python 2 (``urllib.urlretrieve``, ``unicode``,
    ``HTMLParser().unescape``).
    """

    def __init__(self, *args, **kwargs):
        """Read settings for the chosen environment.

        Keyword arguments:
            start_idx: index of the first ``wp_posts_<idx>.json`` file to
                import (default 0).
            end_idx: index of the last file, inclusive (default 0).
            env: key into the module-level ``config`` mapping
                (default ``'dev'``).

        Positional arguments are accepted but ignored.
        """
        self.start_idx = kwargs.get('start_idx', 0)
        self.end_idx = kwargs.get('end_idx', 0)
        self.env = kwargs.get('env', 'dev')
        self.conf = config[self.env]
        self.d = Debug(level=4, color=True)
        self.current_dir = os.path.dirname(os.path.abspath(__file__))
        self.project_dir = os.path.abspath(
            os.path.join(self.current_dir, os.pardir))
        self.admin_url = self.conf['url']['admin']
        # Image id assigned whenever an article has no usable thumbnail.
        self._default_image_id = 2

    def get_csrf_token_from_page(self, url):
        """Fetch *url* and return the value of its ``_csrf`` hidden input.

        TLS verification is disabled for this request (``verify=False``).
        Raises ``AttributeError`` when the page has no ``_csrf`` input.
        """
        r = self.s.get(url, verify=False)
        soup = BeautifulSoup(r.text, "lxml")
        return soup.find('input', {'name': '_csrf'}).get('value')

    def login(self):
        """Create the HTTP session and log in with the configured account."""
        self.s = requests.Session()
        login_url = '{}/auth/login'.format(self.admin_url)
        csrf_token = self.get_csrf_token_from_page(login_url)
        p = {
            'email': self.conf['account']['email'],
            'password': self.conf['account']['password'],
            '_csrf': csrf_token
        }
        self.s.post(login_url, data=p)

    def _post_json(self, path, payload, csrf_token):
        """POST *payload* as JSON to ``admin_url + path``; return the decoded reply."""
        response = self.s.post('{}{}'.format(self.admin_url, path),
                               json=payload,
                               headers=self._create_headers(csrf_token))
        return json.loads(response.text)

    def import_categories(self):
        """Import ``output/categories/categories.json``.

        A category whose ``parent`` id is larger than the largest id seen
        so far is pushed back to the end of the queue and retried later.
        If every remaining category is deferred without the largest seen
        id changing, their parents can never show up; the loop then logs
        the leftover ids and stops instead of spinning forever.
        """
        with open(self.project_dir + '/output/categories/categories.json',
                  'r') as f:
            self.d.info('start importing categories ...')
            csrf_token = self.get_csrf_token_from_page(
                '{}/categories/new'.format(self.admin_url))
            categories = json.load(f)
        categories_count = len(categories)
        counter = 1
        cur_id = 0
        # Consecutive deferrals made while cur_id stayed the same.
        stalled = 0
        while categories:
            category = categories.pop(0)
            if category['id'] > cur_id:
                cur_id = category['id']
                stalled = 0
            if category['parent'] > cur_id:
                categories.append(category)
                stalled += 1
                if stalled >= len(categories):
                    # Every queued category was deferred under the same
                    # cur_id: none of them can ever be imported.
                    self.d.error(
                        'ERROR', 'parent category not found for cat_ids',
                        [c['id'] for c in categories])
                    break
                continue
            stalled = 0
            category['import_id'] = category.pop('id')
            category['parent_id'] = category.pop('parent')
            category['_csrf'] = csrf_token
            result = self._post_json('/api/v1/categories/new', category,
                                     csrf_token)
            progress_info = '({}/{})'.format(counter, categories_count)
            if 'errors' in result['payload']:
                for k in result['payload']['errors']:
                    self.d.error('ERROR', progress_info,
                                 'cat_id={}'.format(category['import_id']),
                                 k, result['payload']['errors'][k])
            else:
                self.d.log('Done.', progress_info,
                           result['payload']['message'])
            counter += 1

    def import_tags(self):
        """Import ``output/tags/tags.json``."""
        with open(self.project_dir + '/output/tags/tags.json', 'r') as f:
            csrf_token = self.get_csrf_token_from_page(
                '{}/tags/new'.format(self.admin_url))
            tags = json.load(f)
        tags_count = len(tags)
        for counter, tag in enumerate(tags, 1):
            tag['import_id'] = tag.pop('id')
            tag['_csrf'] = csrf_token
            # Fixed: the request previously went through an undefined
            # name ``s`` instead of the session ``self.s``.
            result = self._post_json('/api/v1/tags/new', tag, csrf_token)
            if 'errors' in result['payload']:
                # Fixed: this branch referenced ``progress_info`` and
                # ``category``, neither of which exists in this method.
                progress_info = '({}/{})'.format(counter, tags_count)
                for k in result['payload']['errors']:
                    self.d.error('ERROR', progress_info,
                                 'tag_id={}'.format(tag['import_id']), k,
                                 result['payload']['errors'][k])
            else:
                self.d.log('Done.', result['payload']['message'])

    def import_operators(self):
        """Import ``output/wp_users/wp_users.json`` as operators."""
        with open(self.project_dir + '/output/wp_users/wp_users.json',
                  'r') as f:
            self.d.info('start importing operators ...')
            csrf_token = self.get_csrf_token_from_page(
                '{}/operators/new'.format(self.admin_url))
            operators = json.load(f)
        operators_count = len(operators)
        for counter, operator in enumerate(operators, 1):
            operator['import_id'] = operator.pop('id')
            operator['_csrf'] = csrf_token
            result = self._post_json('/api/v1/operators/new', operator,
                                     csrf_token)
            progress_info = '({}/{})'.format(counter, operators_count)
            if 'errors' in result['payload']:
                for k in result['payload']['errors']:
                    self.d.error(
                        'ERROR', progress_info,
                        'operator_id={}'.format(operator['import_id']), k,
                        result['payload']['errors'][k])
            else:
                self.d.log('Done.', progress_info,
                           result['payload']['message'])

    def import_articles_with_images(self):
        """Import articles and upload a thumbnail for each of them."""
        self.import_articles(with_image=True)

    def update_article_image(self):
        """Re-send articles using an article/image relation dump.

        Loads ``deps/article_image_relations.json`` (one JSON object per
        line, keyed here by its ``id``) into ``self.air`` and then runs
        ``import_articles`` in update-only, save-to-local mode.

        The relation file was produced with::

            COPY (SELECT ROW_TO_JSON(t)
                  FROM (SELECT articles.id, image_files.filename
                        FROM articles
                        join images on images.id = articles.image_id
                        join image_files
                          on images.id = image_files.image_id) t)
            TO '/tmp/article_image_relations.json';

            mv /tmp/article_image_relations.json /var/lib/postgresql/data/
            mv $DOCKER_ALIAS/data/article_image_relations.json ~/wordpress-split-data/deps/
        """
        self.air = {}
        with open(self.project_dir + '/deps/article_image_relations.json',
                  'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank/trailing lines in the dump.
                    continue
                j = json.loads(line)
                self.air[j['id']] = j
        self.import_articles(with_image=True,
                             update_only=True,
                             save_to_local=True)

    def import_articles(self,
                        with_image=False,
                        update_only=False,
                        save_to_local=False):
        """Import ``output/wp_posts/wp_posts_<idx>.json`` for each index.

        Indices run from ``self.start_idx`` to ``self.end_idx`` inclusive.

        Args:
            with_image: after an article was accepted, also process its
                thumbnail via ``_upload_media_image``.
            update_only: PUT to the existing article instead of POSTing a
                new one.
            save_to_local: take ``image_id`` from ``self.air`` (filled by
                ``update_article_image``) and keep the resized thumbnail
                on disk instead of uploading it.
        """
        h = HTMLParser()
        with open(self.project_dir + '/deps/upload_images_id.json.c') as f:
            self.uii = json.load(f)
        with open(self.project_dir + '/output/features.json', 'r') as f:
            series = json.load(f)
        with open(self.project_dir + '/output/categories/categories.json',
                  'r') as f:
            categories = json.load(f)

        # Map category id -> feature for features (term_group == 1) that
        # share a slug with a category; the first category per slug wins.
        category_id_by_slug = {}
        for category in categories:
            category_id_by_slug.setdefault(category['slug'], category['id'])
        features_alias = {}
        for s in series:
            if s['term_group'] == 1 and s['slug'] in category_id_by_slug:
                features_alias[category_id_by_slug[s['slug']]] = s

        for idx in range(self.start_idx, self.end_idx + 1):
            input_file = (self.project_dir +
                          '/output/wp_posts/wp_posts_{}.json'.format(idx))
            with open(input_file, 'r') as f:
                self.d.info('start importing articles ...')
                self.d.info('read file ... {}'.format(input_file))
                csrf_token = self.get_csrf_token_from_page(
                    '{}/articles/new'.format(self.admin_url))
                articles = json.load(f)
            articles_count = len(articles)
            operator_ids = set(self._get_operators_ids())
            for counter, article in enumerate(articles, 1):
                # replace cdn url into /media/image/:id
                xds = re.findall(
                    '(https://stg.localhost/640/480/uploads/(.*))"',
                    article['html_content'])
                article['html_content'] = self._replace_image_url_to_media(
                    xds, article['html_content'])
                # replace s3 url into /media/image/:id
                xds = re.findall(
                    '(https://cdn-prd.s3-ap-northeast-1.amazonaws.com/wp-content/uploads/(.*))"',
                    article['html_content'])
                article['html_content'] = self._replace_image_url_to_media(
                    xds, article['html_content'])

                if article['author_id'] not in operator_ids:
                    self.d.debug(
                        'operator id not exists. Set to author_id = 1.')
                    article['author_id'] = 1
                article['import_id'] = article['id']

                # An empty sub title is sent as a single space; otherwise
                # it is cut to 60 characters.
                if article[u'sub_title'] == '':
                    article[u'sub_title'] = ' '
                else:
                    article[u'sub_title'] = article[u'sub_title'][:60]
                if u'published_at' in article:
                    article['status'] = 2
                if article['series_id'] != 0:
                    article['feature_ids'] = [article['series_id']]
                else:
                    article['feature_ids'] = []

                # limit title to 60 characters (cut before unescaping)
                article[u'title'] = h.unescape(article[u'title'][:60])
                # limit meta_description to 100 characters
                article[u'meta_description'] = article[
                    u'meta_description'][:100]
                if article[u'meta_description'] == '':
                    article[u'meta_description'] = article[u'title']

                # content carries the still-escaped html; html_content is
                # unescaped right after.
                article[u'content'] = article['html_content']
                # do not import tag_ids
                article[u'tag_ids'] = []
                article['_csrf'] = csrf_token
                article['html_content'] = h.unescape(
                    article['html_content'])

                # make category <-> features alias
                if (article['category_id'] != 0 and
                        article['category_id'] in features_alias):
                    article['feature_ids'].append(
                        features_alias[article['category_id']]['term_id'])

                article['image_id'] = self._default_image_id
                if save_to_local and article['id'] in self.air:
                    article['image_id'] = self.air[
                        article['id']]['image_id']

                if update_only:
                    article = self._upload_media_image(article,
                                                       upload_flag=False)
                    z = self.s.put(
                        '{}/api/v1/articles/{}'.format(
                            self.admin_url, article['import_id']),
                        json=article,
                        headers=self._create_headers(csrf_token))
                    result = json.loads(z.text)
                else:
                    result = self._post_json('/api/v1/articles/new',
                                             article, csrf_token)

                progress_info = '({}/{})'.format(counter, articles_count)
                if 'errors' in result['payload']:
                    for k in result['payload']['errors']:
                        self.d.error('ERROR', progress_info,
                                     'id={}'.format(article['import_id']),
                                     k, result['payload']['errors'][k],
                                     article['category_id'])
                elif result['status'] == 'Not Found':
                    self.d.error('Not Found.', progress_info,
                                 'id={}'.format(article['import_id']))
                else:
                    self.d.log('Done.', progress_info,
                               'id={}'.format(article['import_id']),
                               result['payload']['message'])
                    # Images are only handled for articles the API
                    # accepted.
                    if with_image:
                        if save_to_local:
                            article = self._upload_media_image(
                                article,
                                upload_flag=False,
                                save_to_local=True)
                        else:
                            article = self._upload_media_image(article)
                            # update image_id
                            self._update_article_image_id(article)

    def import_widgets(self):
        """Import ``output/wp_posts/wp_links.json`` as widgets plus panels.

        For every link group one widget (``type_id`` 10) and one panel
        (``type_id`` 7) referencing that widget are created.
        """
        with open(self.project_dir + '/output/link_short_code.json',
                  'r') as f:
            link_short_code_dict = json.load(f)
        with open(self.project_dir + '/output/wp_posts/wp_links.json',
                  'r') as f:
            csrf_token = self.get_csrf_token_from_page(
                '{}/widgets/new'.format(self.admin_url))
            wplinks = json.load(f)
        for key in wplinks:
            # insert widget
            wplink = wplinks[key]
            # NOTE: the rendered markup is built but not sent at the
            # moment; the widget is created with a blank ``content``.
            content = u'<div class="recommend-links"><h2 class="title">【編集部のオススメ記事】</h2><ul class="links">{}</ul></div>'.format(
                ''.join([
                    u'<li><a href="{}"><i class="icons-right _color-darkgray"></i>{}</a></li>'
                    .format(wplink['data'][k][0], wplink['data'][k][1])
                    for k in wplink['data']
                ]))
            widget_title = ",".join(wplink['data'])[:60]
            content = content.replace(u'https://prd.localhost/archives',
                                      u'/archives')
            content_soup = BeautifulSoup(content, "lxml")
            if content_soup.html is not None:
                content_soup.html.unwrap()
            if content_soup.body is not None:
                content_soup.body.unwrap()
            # Special case kept from the original data migration: the
            # title of link group 124 gets a trailing space.
            if wplink['id'] == 124:
                widget_title += ' '
            widget_data = {
                'import_id': wplink['id'],
                'name': widget_title,
                'content': ' ',
                'type_id': 10,  # recommended article links
                'link_ids':
                [link_short_code_dict[k]['id'] for k in wplink['data']],
                '_csrf': csrf_token,
            }
            result = self._post_json('/api/v1/widgets/new', widget_data,
                                     csrf_token)
            if 'errors' in result['payload']:
                for k in result['payload']['errors']:
                    self.d.error('ERROR', k,
                                 result['payload']['errors'][k])
            else:
                self.d.log('Done.', result['payload']['message'])

            # insert panel
            panel_data = {
                'import_id': wplink['id'],
                'name': widget_title,
                'type_id': 7,
                'widget_ids': [wplink['id']],
            }
            result = self._post_json('/api/v1/panels/new', panel_data,
                                     csrf_token)
            if 'errors' in result['payload']:
                for k in result['payload']['errors']:
                    self.d.error('ERROR', k,
                                 result['payload']['errors'][k])
            else:
                self.d.log('Done.', wplink['id'],
                           result['payload']['message'])

    def import_feature(self):
        """Import ``output/features.json`` as features."""
        with open(self.project_dir + '/output/features.json', 'r') as f:
            csrf_token = self.get_csrf_token_from_page(
                '{}/features/new'.format(self.admin_url))
            series = json.load(f)
        for s in series:
            feature_data = {
                'import_id': s['term_id'],
                'author_id': 1001,
                'title': s['name'],
                'slug': s['slug'],
                'image_id': 50002,  # hard-coded instead of the default id
                'description': ' ',
                '_csrf': csrf_token,
            }
            result = self._post_json('/api/v1/features/new', feature_data,
                                     csrf_token)
            if 'errors' in result['payload']:
                for k in result['payload']['errors']:
                    self.d.error('ERROR', k,
                                 result['payload']['errors'][k],
                                 s['term_id'], s['name'])
            else:
                self.d.log('Done.', result['payload']['message'])

    def import_link_short_code(self):
        """Import ``output/link_short_code.json`` as link short codes."""
        with open(self.project_dir + '/output/link_short_code.json',
                  'r') as f:
            # TODO: change to correct url
            csrf_token = self.get_csrf_token_from_page(
                '{}/features/new'.format(self.admin_url))
            link_short_codes = json.load(f)
        for key in link_short_codes:
            lsc_data = {
                'import_id': link_short_codes[key]['id'],
                'name': key,
                'display_name': link_short_codes[key]['title'],
                'href': link_short_codes[key]['link'],
                '_csrf': csrf_token,
            }
            result = self._post_json('/api/v1/link_short_codes/new',
                                     lsc_data, csrf_token)
            if 'errors' in result['payload']:
                for k in result['payload']['errors']:
                    self.d.error('ERROR', k,
                                 result['payload']['errors'][k])
            else:
                self.d.log('Done.', result['payload']['message'])

    def download_images(self):
        """Download every URL listed in ``output/s3_images/s3_images.json``.

        Files are stored under ``<project_dir>/output`` using the URL path
        that follows ``https://prd.localhost/``.
        """
        input_file = self.project_dir + '/output/s3_images/s3_images.json'
        with open(input_file, 'r') as f:
            self.d.info('read file ... {}'.format(input_file))
            s3_images = json.load(f)
        print(len(s3_images))
        output_root = self.project_dir + '/output'
        for group in s3_images:
            for origin_url in s3_images[group]:
                # create dirs
                image_path = origin_url.split('https://prd.localhost/')[1]
                image_dir = '/'.join(image_path.split('/')[:-1])
                helpers.create_dirs(output_root, [image_dir])
                # Fixed: the target used to be relative to the current
                # working directory while the directories were created
                # under project_dir.
                urllib.urlretrieve(
                    origin_url,
                    os.path.join(output_root, image_dir,
                                 origin_url.split('/')[-1]))

    def _get_operators_ids(self):
        """Return the list of ``id`` values in ``output/wp_users/wp_users.json``."""
        with open(self.project_dir + '/output/wp_users/wp_users.json',
                  'r') as f:
            operators = json.load(f)
        return [operator['id'] for operator in operators]

    def _upload_media_image(self,
                            article,
                            upload_flag=True,
                            save_to_local=False):
        """Prepare the article thumbnail and optionally upload it.

        The thumbnail is ``article['thumb_url']`` or, when that is empty,
        the first entry of ``article['image_urls']``.  The file is looked
        up in ``output/wp-content-dl/uploads`` (also trying the name with
        one of the digits 1-5 appended to the stem) and downloaded when it
        is missing.  It is then cropped into the temp image directory.

        Args:
            article: article dict; its ``image_id`` may be updated.
            upload_flag: when true, register and upload the image and set
                ``article['image_id']`` from the API reply.  When false
                ``article['image_id']`` is left as it is.
            save_to_local: when true, move the cropped image to
                ``/tmp/u/<filename from self.air>`` instead of deleting it.

        Returns:
            The same ``article`` dict.
        """
        thumb_url = article['thumb_url']
        if thumb_url == "":
            if len(article['image_urls']) > 0:
                thumb_url = article['image_urls'][0]
            else:
                article['image_id'] = self._default_image_id
                return article

        fn = thumb_url.split('/')[-1]
        open_fn = fn
        file_exist_flag = os.path.isfile(self._get_image_full_path(fn))
        if not file_exist_flag:
            stem, ext = os.path.splitext(fn)
            for ch in '12345':
                open_fn = '{}{}{}'.format(stem, ch, ext)
                if os.path.isfile(self._get_image_full_path(open_fn)):
                    file_exist_flag = True
                    break
        if not file_exist_flag:
            # fetch image
            image_url = 'https://prd.localhost/wp-content/uploads/{}'.format(
                fn)
            self.d.log('no image file found. start fetch files ...',
                       image_url)
            helpers.create_dirs(self.project_dir + '/output',
                                ['wp-content-dl'])
            # Fixed: download into the directory that is read back below
            # (it used to be a path relative to the working directory)
            # and make sure that directory exists.
            uploads_dir = os.path.dirname(self._get_image_full_path(fn))
            if not os.path.isdir(uploads_dir):
                os.makedirs(uploads_dir)
            urllib.urlretrieve(image_url, self._get_image_full_path(fn))
            open_fn = fn

        image_path = self._get_image_full_path(open_fn)
        is_image = False
        try:
            with Image.open(image_path):
                is_image = True
        except Exception:
            # Anything that cannot be opened as an image is treated as a
            # missing thumbnail below.
            pass
        if not is_image:
            article['image_id'] = self._default_image_id
            self.d.error('thumb image not found. [post_id = {}][{}]'.format(
                article['id'], fn))
            return article

        ## use default image
        if fn == 'logo_thumbnail1.jpg':
            article['image_id'] = self._default_image_id
            return article

        ## resize image before upload
        self._resize_eyecatch_image(image_path, fn)
        tmp_file = self.conf['path']['tmp_image'] + '/' + fn

        if upload_flag:
            self.d.debug('start upload image... {}'.format(fn))
            result = self._create_image_info()
            image_file_id, image_id, upload_url, content_type = (
                self._generate_image_file_id(fn, result['payload']['Id']))
            # Fixed: read the file the resize step wrote (it used to be a
            # hard-coded /tmp path) and read it as bytes.
            with open(tmp_file, 'rb') as f:
                image_data = f.read()
            headers = {
                'content-type': content_type,
                'content-length': str(len(image_data)),
            }
            r = self.s.put(upload_url, data=image_data, headers=headers)
            if r.status_code == 200:
                # doing image validation.
                # ( /api/batch/media/image/validate/:id )
                self.s.get('{}/api/batch/media/image/validate/{}'.format(
                    self.admin_url, image_file_id))
                article['image_id'] = image_id
                self.d.debug(
                    'image uploaded. post_id={}, image_id={}'.format(
                        article['id'], image_file_id))
            else:
                article['image_id'] = self._default_image_id
                self.d.error('error uploading image. ID={}'.format(
                    article['id']))

        relation = self.air.get(article['id']) if save_to_local else None
        if relation is None:
            # remove tmp image after upload finished
            os.remove(tmp_file)
            return article

        # Fixed: the temp image used to be deleted before this move, so
        # the move always failed and the failure was silently ignored.
        dst_dir = '/tmp/u/' + relation['filename'].split('/')[0]
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        dst_file = '/tmp/u/' + relation['filename']
        if os.path.exists(dst_file):
            os.remove(dst_file)
        try:
            shutil.move(tmp_file, dst_file)
        except EnvironmentError as err:
            self.d.error('failed to save image locally.', dst_file, err)
        return article

    def _remove_eyecatch_image_in_content(self, article):
        """Drop the first ``<img>`` in the html that matches the thumbnail.

        Currently not called from within this class.  Returns ``article``.
        """
        # Fixed: ``fn`` was never defined in this method.  It is derived
        # the same way ``_upload_media_image`` derives the thumbnail name.
        thumb_url = article.get('thumb_url') or ''
        if not thumb_url and article.get('image_urls'):
            thumb_url = article['image_urls'][0]
        fn = thumb_url.split('/')[-1]
        if not fn:
            return article

        match_rate = 0.95
        content_soup = BeautifulSoup(article['html_content'], 'lxml')
        for img_tag in content_soup.findAll('img'):
            # remove "?asd=1" parameter
            img_tag['src'] = img_tag['src'].split('?')[0]
            # remove "(1)" something like medium_210421689(1).jpg
            img_tag['src'] = re.sub(r'\(\d\)', '', img_tag['src'])
            img_fn = img_tag['src'].split('/')[-1]
            split_by_dot = img_fn.split('.')
            split_by_hypen = split_by_dot[0].split('-')
            # handle some rare case which the url doesn't have file ext
            if len(split_by_dot) == 1:
                split_by_dot.append('')
            # file name without its last "-..." part (e.g. a size suffix)
            img_fn2 = '-'.join(split_by_hypen[:-1]) + '.' + split_by_dot[1]
            o = os.path.splitext(img_fn)
            # NOTE(review): this pattern is built from img_fn and matched
            # against img_fn, so it succeeds for ordinary file names and
            # makes the similarity checks irrelevant -- presumably it was
            # meant to compare against ``fn``; confirm before re-enabling
            # this method.
            m = re.match(r'{}\d?\.{}'.format(o[0], o[1][1:]), img_fn)
            if (Levenshtein.ratio(unicode(img_fn), unicode(fn)) > match_rate
                    or Levenshtein.ratio(unicode(img_fn2),
                                         unicode(fn)) > match_rate
                    or m is not None):
                img_tag.extract()
                content_soup.html.unwrap()
                content_soup.body.unwrap()
                article['html_content'] = content_soup.prettify(
                    indent_width=2)
                # only for first one
                break
            else:
                self.d.error("img_tag's not matched. {} / {}".format(
                    img_fn, fn))
        return article

    def _create_image_info(self):
        """Create an image record (POST /api/v1/media/image/new).

        Sends an empty ``alt`` and ``caption`` with ``use_type`` 4 and
        returns the decoded JSON reply.
        """
        csrf_token = self.get_csrf_token_from_page('{}/media'.format(
            self.admin_url))
        create_image_params = {
            'alt': '',
            'caption': '',
            'use_type': 4,
        }
        return self._post_json('/api/v1/media/image/new',
                               create_image_params, csrf_token)

    def _generate_image_file_id(self, fn, fid):
        """Request an upload slot for file name *fn* on image *fid*.

        POSTs to /api/v1/media/image/<fid>/upload/new and returns the
        tuple ``(ImageFileId, ImageFileId2, UploadURL, ContentType)``
        taken from the reply payload.
        """
        csrf_token = self.get_csrf_token_from_page('{}/media'.format(
            self.admin_url))
        create_upload_url_params = {
            'request_filename': fn,
        }
        result = self._post_json(
            '/api/v1/media/image/{}/upload/new'.format(fid),
            create_upload_url_params, csrf_token)
        payload = result['payload']
        return (payload['ImageFileId'], payload['ImageFileId2'],
                payload['UploadURL'], payload['ContentType'])

    def _get_image_full_path(self, fn):
        """Return the path of *fn* inside ``output/wp-content-dl/uploads``."""
        return self.project_dir + '/output/wp-content-dl/uploads/' + fn

    def _get_admin_url(self):
        """Return the configured admin base URL."""
        return self.conf['url']['admin']

    def _create_headers(self, csrf_token):
        """Return the JSON request headers carrying *csrf_token*."""
        return {
            'x-csrf-token': csrf_token,
            'content-type': 'application/json;charset=UTF-8',
        }

    def _resize_eyecatch_image(self, image_path, fn):
        """Scale up and centre-crop an image to a 3:2 aspect ratio.

        Images narrower than 600px or lower than 400px are enlarged first.
        The result is written to ``conf['path']['tmp_image'] + '/' + fn``;
        a name without a dot is prefixed with ``'rand.'``.
        """
        basewidth = 600
        baseheight = 400
        img = Image.open(image_path)
        if img.size[0] < basewidth:
            wpercent = basewidth / float(img.size[0])
            hsize = int(float(img.size[1]) * wpercent)
            img = img.resize((basewidth, hsize), Image.ANTIALIAS)
        if img.size[1] < baseheight:
            wpercent = baseheight / float(img.size[1])
            wsize = int(float(img.size[0]) * wpercent)
            img = img.resize((wsize, baseheight), Image.ANTIALIAS)

        ratio = float(img.size[0]) / float(img.size[1])
        # ``//`` keeps the crop box integral on Python 2 and Python 3.
        if ratio > 1.5:
            # too wide: trim left and right
            w = int(float(img.size[1]) * 1.5)
            x1 = (img.size[0] - w) // 2
            img = img.crop((x1, 0, x1 + w, img.size[1]))
        else:
            # too tall: trim top and bottom
            h = int(float(img.size[0]) / 1.5)
            y1 = (img.size[1] - h) // 2
            img = img.crop((0, y1, img.size[0], y1 + h))
        if '.' not in fn:
            fn = 'rand.' + fn
        img.save(self.conf['path']['tmp_image'] + '/' + fn)

    def _update_article_image_id(self, article):
        """PUT *article* back to the API so its ``image_id`` is stored."""
        csrf_token = self.get_csrf_token_from_page(
            '{}/articles/edit/{}'.format(self.admin_url, article['id']))
        r = self.s.put('{}/api/v1/articles/{}'.format(
            self.admin_url, article['id']),
                       json=article,
                       headers=self._create_headers(csrf_token))
        result = json.loads(r.text)
        if r.status_code == 200:
            self.d.log('image_id update Done.', article['id'],
                       result['payload']['message'])
        elif 'errors' in result['payload']:
            for k in result['payload']['errors']:
                self.d.error('ERROR', k, result['payload']['errors'][k])

    def _replace_image_url_to_media(self, xds, article_html_content):
        """Rewrite matched image URLs to admin media URLs.

        Args:
            xds: ``re.findall`` results, i.e. ``(full_url, upload_path)``
                tuples.  Both parts are cut at the first double quote
                because the patterns used by the caller are greedy.
            article_html_content: html to rewrite.

        Returns:
            The html with every URL whose upload path is a key of
            ``self.uii`` replaced by
            ``https://admin.localhost/media/image/<id>``.  Unknown paths
            are logged and left untouched.
        """
        for xd in xds:
            xt = xd[0].split(u'"')[0]
            xr = xd[1].split(u'"')[0]
            try:
                article_html_content = article_html_content.replace(
                    xt, 'https://admin.localhost/media/image/{}'.format(
                        self.uii[xr]))
            except (KeyError, UnicodeError):
                self.d.error('failed replace image url to media ... key =',
                             xr)
        return article_html_content
formatter)) BeautifulSoup.prettify = prettify if __name__ == '__main__': ## init logger d = Debug(level=4, color=True) db_conn = None try: d.info('connect to database ...') db_conn = DBConn(config['dev']['db']) d.info('[OK]\n') except Exception as e: d.error(e) exit() cursor = db_conn.get_cursor() ## init directories current_dir = os.path.dirname(os.path.abspath(__file__)) output_dir = os.path.abspath(os.path.join(current_dir, os.pardir, 'output')) output_dirs = ['wp_users', 'wp_posts', 'categories', 'tags', 's3_images'] helpers.create_dirs(output_dir, output_dirs) ## get shorten url mappings shorten_url_dict = get_shorten_urls(cursor) shorten_url_keys = shorten_url_dict.keys()