def resolve_uid(self, x):
    """Rewrite the URL inside a matched tag to a Plone ``resolveuid`` link.

    ex. url: "http://worpress.com/wp-content/uploads/2010/04/image.jpg"
    becomes: "resolveuid/c82a53270c904cfbbfd1a0d4cef90676"

    The match must expose three groups: the start of the tag, the URL
    and the end of the tag (presumably this method is used as the
    replacement callback of a regex substitution -- confirm at the
    call site).

    :param x: [required] Parsed Regex
    :type x: type Regex Match object
    :returns: the tag with an internal url; the matched text is returned
        unchanged when the URL is external, points to the site root, or
        no object is found at its path
    :rtype: str
    """
    start = x.group(1)  # Start of tag ex.: '<img src="'
    url = x.group(2)  # URL
    end = x.group(3)  # End of tag ex.: '" />'

    url = fix_id(url)
    o = urlparse(url)

    internal_url = o.netloc == self.domain
    is_site_root = o.path in ('', '/')

    # links to external URL or to site root are ignored
    if not internal_url or is_site_root:
        return x.group(0)  # return unchanged

    # traverse from the context using the URL path, made relative
    path = str(o.path).strip(' ').lstrip('/')
    obj = traverse(self.context, path, None)

    if obj is None:  # object not found
        # ``Logger.warn`` is a deprecated alias; use ``warning``
        logger.warning('Could not resolve UUID: %s', url)
        return x.group(0)  # return unchanged

    # Create internal URL
    uuid = obj.UID()
    return '{0}resolveuid/{1}{2}'.format(start, uuid, end)
def add_related_content(self, obj, item):
    """Look into WordPress list of related content and create Plone
    related content list.

    The URLs found under the item's ``_pinged`` key are filtered to the
    ones on ``self.domain``, resolved to objects by traversing from
    ``self.context``, and stored on ``obj.relatedItems`` as relation
    values. ``obj`` is left untouched when nothing can be resolved.

    :param obj: [required] object to add related content
    :type obj: type constructor parameter
    :param item: [required] transmogrify item
    :type item: dict
    """
    import re  # local import: only needed to split the pinged string

    # Get the string with URLs from related content
    pinged = item.get('_pinged', '')
    if not pinged:
        return  # No related content

    # The URLs are stored back to back without any separator. Each URL
    # is taken to run from one "http://" or "https://" marker up to the
    # next one (or the end of the string). Matching the full scheme
    # marker, instead of splitting on the bare text "http", keeps URLs
    # that merely contain "http" somewhere in their path in one piece.
    url_pattern = r'https?://.+?(?=https?://|\Z)'

    # De-duplicate while keeping first-seen order, so the resulting
    # list of related items is deterministic.
    related_urls = []
    seen = set()
    for raw_url in re.findall(url_pattern, pinged, re.DOTALL):
        url = raw_url.rstrip('/')
        if url in seen:
            continue
        seen.add(url)
        related_urls.append(url)

    # Resolve every internal URL to an object
    related_objs = []
    for url in related_urls:
        # Parse URL and check domain
        url = fix_id(url)
        o = urlparse(url)
        if o.netloc != self.domain:
            continue

        path = str(o.path).strip(' ').lstrip('/')
        related_obj = traverse(self.context, path, None)

        if related_obj is None:  # object not found
            # ``Logger.warn`` is a deprecated alias; use ``warning``
            logger.warning('Broken link: %s', url)
            continue

        related_objs.append(related_obj)

    # No related content
    if not related_objs:
        return

    # Look up the utility once, and only when there is something to relate
    intids = getUtility(IIntIds)
    obj.relatedItems = [
        RelationValue(intids.getId(target)) for target in related_objs
    ]
def __iter__(self):
    """Yield the items of the previous section, then one item per row
    of ``wp_posts.csv``.

    The CSV file is read from the ``self.source`` directory. Rows
    rejected by ``_skip``, rows whose id is flagged by ``bad_id`` and
    rows for which ``self.get_path`` raises ``KeyError`` are dropped
    (the last two with a logged warning).

    Each yielded item carries ``title``, ``creation_date``,
    ``effective_date``, ``modification_date``, ``_path``,
    ``description``, ``text``, ``creators`` and ``_pinged``; attachments
    additionally carry ``_mimetype`` and ``_guid``, and published rows
    carry ``_transitions``.

    :returns: generator of transmogrify items
    :rtype: generator of dict
    """
    for item in self.previous:
        yield item

    filename = os.path.join(self.source, 'wp_posts.csv')
    assert os.path.isfile(filename), 'Missing file: ' + filename

    with open(filename) as csvfile:
        csv.field_size_limit(self.field_size_limit)
        reader = csv.DictReader(csvfile, **csv_options)
        for row in reader:

            if _skip(row, self.skip):  # should we process this row?
                continue

            item = {}
            post_type = row['post_type']

            # NOTE(review): any other post_type leaves 'portal_type'
            # unset and the row is then handled like a post -- confirm
            # that such rows are filtered out by _skip.
            if post_type == 'post':
                # posts are imported as portal_type
                item['portal_type'] = self.portal_type
            elif post_type == 'page':
                # pages are imported as Page
                item['portal_type'] = 'Page'
            elif post_type == 'attachment':
                # attachments are imported as Image or File
                is_image = row['post_mime_type'].startswith('image')
                item['portal_type'] = 'Image' if is_image else 'File'
                item['_mimetype'] = row['post_mime_type']
                item['_guid'] = row['guid']  # store for later

            if post_type != 'attachment':
                # for posts and pages the id is the post name
                item_id = row['post_name']
                # Zope ids need to be ASCII
                item_id = fix_id(item_id)
                item['title'] = strip_tags(row['post_title'])
            else:
                # for attachments we need to parse the guid
                # and use the file name as title
                url = urlparse(row['guid'])
                item_id = item['title'] = url.path.split('/')[-1]
                item_id = fix_id(item_id)

            # on Zope ids can't start with "_"
            if bad_id(item_id) is not None:
                # ``Logger.warn`` is a deprecated alias; use ``warning``
                logger.warning(
                    'Invalid object id on row ID: %s', row['ID'])
                continue

            # WordPress stores only publication and modification times
            # we use publication date as creation date
            item['creation_date'] = item['effective_date'] = row['post_date']
            item['modification_date'] = row['post_modified']

            try:
                item['_path'] = self.get_path(
                    row['ID'], item_id, post_type, item)
            except KeyError:
                # files defining taxonomies are probably outdated
                logger.warning(
                    'No taxonomies found for row ID: %s', row['ID'])
                continue

            item['description'] = row['post_excerpt']

            # quotes are escaped; we need to fix that
            item['text'] = row['post_content'].replace('\\"', '"')
            # TODO: validate HTML to avoid post-processing surprises

            # use display_name instead of author_id, if match found
            author_id = row['post_author']
            item['creators'] = self.display_names.get(author_id, author_id)

            if row['post_status'] == 'publish':
                item['_transitions'] = 'publish'

            item['_pinged'] = row['pinged']  # store for later

            yield item