Example 1
    def resolve_uid(self, x):
        """Parse HTML and update with URLs pointing to Plone objects.
        ex. url: "http://worpress.com/wp-content/uploads/2010/04/image.jpg"
        becomes: "resolveuid/c82a53270c904cfbbfd1a0d4cef90676"

        :param x: [required] Parsed Regex
        :type x: type Regex Match object
        :returns: the tag with an internal url
        :rtype: str
        """
        start = x.group(1)  # Start of tag ex.: '<img src="'
        url = x.group(2)  # URL
        end = x.group(3)  # End of tag ex.: '" />'

        url = fix_id(url)
        o = urlparse(url)

        internal_url = o.netloc == self.domain
        is_site_root = o.path == '' or o.path == '/'

        # links to external URL or to site root are ignored
        if not internal_url or is_site_root:
            return x.group(0)  # return unchanged

        path = str(o.path).strip(' ').lstrip('/')
        obj = traverse(self.context, path, None)

        if obj is None:  # object not found
            logger.warn('Could not resolve UUID: {0}'.format(url))
            return x.group(0)  # return unchanged

        # Create internal URL
        uuid = obj.UID()
        return '{0}resolveuid/{1}{2}'.format(start, uuid, end)
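
# Usage sketch (an assumption, not shown in this excerpt): resolve_uid is
# written as a callback for re.sub(), so the caller needs a pattern with
# three groups: tag start, URL, and tag end. The IMG_SRC pattern and the
# `section` blueprint instance below are hypothetical.
import re

IMG_SRC = re.compile(r'(<img[^>]*src=")([^"]*)(")')
html = '<img src="http://example.com/image.jpg" />'
html = IMG_SRC.sub(section.resolve_uid, html)  # rewrite internal image URLs
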
def _skip(row, skip):
    """Test if we will need to skip row processing by dealing with the
    following cases:

    - parsing errors
    - items with revision type
    - items with draft status
    - explicit request

    :param row: [required] row to be analized
    :type row: dictionary
    :param skip: [required] list of item ID to be explicitly skiped
    :type skip: list
    :returns: True if we will skip the row
    :rtype: bool
    """
    if row['ID'] in skip:
        logger.info('Skipping row ID: ' + row['ID'])
        return True
    elif len(row) != 23 and 'publish' in row.values():
        # a well-formed row in the dump has exactly 23 columns
        logger.warn('Parsing error on row ID: ' + row['ID'])
        return True
    elif row['post_type'] not in KNOWN_POST_TYPES:
        logger.warn('Unknown post type on row ID: ' + row['ID'])
        return True
    elif row['post_type'] == 'revision':
        logger.debug('Revision type on row ID: ' + row['ID'])
        return True
    elif row['post_status'] == 'draft':
        logger.debug('Draft status on row ID: ' + row['ID'])
        return True

    return False
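
# Worked example of _skip() (illustrative values; assumes 'post' is listed
# in KNOWN_POST_TYPES and uses only the keys the function inspects):
row = {'ID': '42', 'post_type': 'post', 'post_status': 'draft'}
assert _skip(row, skip=[])  # True: draft status
row['post_status'] = 'publish'
assert _skip(row, skip=['42'])  # True: explicitly skipped
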
    def __iter__(self):
        fetch_errors = []  # record all errors

        for item in self.previous:
            if '_guid' not in item:
                yield item
                continue

            url = item['_guid']
            path = item['_path']  # TODO: read path key from options

            if not path:  # not enough information
                yield item
                continue

            obj = self.context.unrestrictedTraverse(
                path.encode().lstrip('/'), None)

            # if object exists we will try to avoid downloading it again
            if obj is not None:

                if obj.portal_type not in ('File', 'Image'):  # not an attachment
                    yield item
                    continue

                # request only the header to check it
                try:
                    r = requests.head(url)
                except ConnectionError:  # skip on connection error
                    fetch_errors.append(url)
                    yield item
                    continue

                # content-length header could be missing if remote web
                # server is misconfigured for some mime types
                size = int(r.headers.get('content-length', 0))

                if size == obj.size():  # already downloaded it
                    yield item
                    continue

            try:
                r = requests.get(url)
            except RequestException:  # skip on timeouts and other errors
                fetch_errors.append(url)
                yield item
                continue

            if r.status_code != 200:  # log error and skip item
                fetch_errors.append(url)
                msg = u'Error {0} when fetching {1}'.format(r.status_code, url)
                logger.warn(msg)
                yield item
                continue

            item['_data'] = r.content

            yield item
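
# The HEAD-before-GET pattern used above, in isolation (a sketch; the URL
# and local_size values are illustrative): fetch only the headers, compare
# Content-Length against the size of the copy we already have, and download
# the body only on a mismatch.
import requests

url = 'http://example.com/image.jpg'
local_size = 0  # size of the copy already stored, if any
head = requests.head(url)
remote_size = int(head.headers.get('content-length', 0))
if remote_size != local_size:
    data = requests.get(url).content  # re-download only when sizes differ
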
    def add_related_content(self, obj, item):
        """Look into WordPress list of related content and create Plone
        related content list.

        :param obj: [required] object to add related content
        :type obj: type constructor parameter
        :param item: [required] transmogrify item
        :type item: dict
        """
        # Get the string with URLs from related content
        pinged = item.get('_pinged', '')
        if pinged == '':
            return  # No related content

        # The field stores multiple URLs concatenated without a
        # separator.  To break it into a list, we split on 'http'
        # and reconstruct each URL.
        # TODO: handle HTTPS scheme
        related_urls = set('http{0}'.format(url.rstrip('/'))
                           for url in pinged.split('http')[1:])

        # Create a list of related items to update the object's field
        related_items = []
        intids = getUtility(IIntIds)  # intid utility used to build relations
        for url in related_urls:
            # Parse URL and check domain
            url = fix_id(url)
            o = urlparse(url)
            if o.netloc != self.domain:
                continue

            path = str(o.path).strip(' ').lstrip('/')
            related_obj = traverse(self.context, path, None)

            if related_obj is None:  # object not found
                logger.warn('Broken link: {0}'.format(url))
                continue

            # Get the related item's intid and store the relation
            to_id = intids.getId(related_obj)
            related_items.append(RelationValue(to_id))

        # No related content
        if len(related_items) == 0:
            return

        obj.relatedItems = related_items
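
# Worked example of the split-on-'http' trick used above (values are
# illustrative): WordPress concatenates pinged URLs with no separator, so
# splitting on 'http' and re-prefixing each fragment recovers the URLs.
pinged = 'http://example.com/ahttp://example.com/b/'
urls = set('http{0}'.format(u.rstrip('/')) for u in pinged.split('http')[1:])
# urls == set(['http://example.com/a', 'http://example.com/b'])
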
    def __iter__(self):
        for item in self.previous:
            yield item

        filename = os.path.join(self.source, 'wp_posts.csv')
        assert os.path.isfile(filename), 'Missing file: ' + filename

        with open(filename) as csvfile:
            csv.field_size_limit(self.field_size_limit)
            reader = csv.DictReader(csvfile, **csv_options)
            for row in reader:

                if _skip(row, self.skip):  # should we process this row?
                    continue

                item = dict()
                post_type = row['post_type']

                if post_type == 'post':
                    # posts are imported using the configured portal_type
                    item['portal_type'] = self.portal_type
                elif post_type == 'page':
                    # pages are imported as Page
                    item['portal_type'] = 'Page'
                elif post_type == 'attachment':
                    # attachments are imported as Image or File
                    is_image = row['post_mime_type'].startswith('image')
                    item['portal_type'] = 'Image' if is_image else 'File'
                    item['_mimetype'] = row['post_mime_type']
                    item['_guid'] = row['guid']  # store for later

                if post_type != 'attachment':
                    # for posts and pages the id is the post name
                    item_id = row['post_name']
                    # Zope ids need to be ASCII
                    item_id = fix_id(item_id)
                    item['title'] = strip_tags(row['post_title'])
                else:
                    # for attachments we need to parse the guid
                    # and use the file name as title
                    url = urlparse(row['guid'])
                    item_id = item['title'] = url.path.split('/')[-1]
                    item_id = fix_id(item_id)

                # in Zope, ids can't start with "_"
                if bad_id(item_id) is not None:
                    logger.warn('Invalid object id on row ID: ' + row['ID'])
                    continue

                # WordPress stores only publication and modification times;
                # we use the publication date as the creation date
                item['creation_date'] = item['effective_date'] = row['post_date']
                item['modification_date'] = row['post_modified']

                try:
                    item['_path'] = self.get_path(row['ID'], item_id, post_type, item)
                except KeyError:
                    # files defining taxonomies are probably outdated
                    logger.warn('No taxonomies found for row ID: ' + row['ID'])
                    continue

                item['description'] = row['post_excerpt']

                # quotes are escaped; we need to fix that
                item['text'] = row['post_content'].replace('\\"', '"')
                # TODO: validate HTML to avoid post-processing surprises

                # use display_name instead of author_id, if a match is found
                author_id = row['post_author']
                item['creators'] = self.display_names.get(author_id, author_id)

                if row['post_status'] == 'publish':
                    item['_transitions'] = 'publish'

                item['_pinged'] = row['pinged']  # store for later

                yield item
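
# Standalone sketch of the CSV reading above. csv_options is defined
# elsewhere in the module; the delimiter and quotechar values below are
# assumptions about the wp_posts dump format.
import csv
import os

csv_options = dict(delimiter=',', quotechar='"')
with open(os.path.join('.', 'wp_posts.csv')) as csvfile:
    reader = csv.DictReader(csvfile, **csv_options)
    rows = [row for row in reader if not _skip(row, skip=[])]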