Beispiel #1
0
def clean_stars(div: HtmlElement) -> None:
    """Tidy the markup inside ``div`` in place.

    Steps, in order:

    1. Drop every node matched by ``p[strong[em]]`` and then every node
       matched by ``.//hr``.
    2. Insert the story title as an ``H1`` at position 0 of ``div``.
    3. Replace the node matched by ``./p[1]`` with an ``H2`` holding its
       title-cased text content.
    4. Replace the node matched by ``./p[strong[a]]`` with a
       ``breakabove`` paragraph wrapping an ``internal`` link whose text is
       that paragraph's text content and whose ``href`` is copied from the
       node matched by ``./p/strong/a``.
    """
    # Unwanted paragraphs go first, then the horizontal rules.
    for xpath in ("p[strong[em]]", ".//hr"):
        for unwanted in elements(div, xpath):
            unwanted.drop_tree()

    div.insert(0, H1("Unspeakable Desolation Pouring Down From the Stars"))

    subtitle_para = element(div, "./p[1]")
    replace(subtitle_para, H2(subtitle_para.text_content().title()))

    link_para = element(div, "./p[strong[a]]")
    anchor = element(div, "./p/strong/a")
    link = A(
        link_para.text_content(),
        CLASS("internal"),
        href=anchor.attrib["href"],
    )
    replace(link_para, P(CLASS("breakabove"), link))
Beispiel #2
0
    def convert_p(self, p):
        """Convert the paragraph element ``p`` into a new ``P()`` element.

        The method records bookkeeping for ``p`` on ``self`` (``object_map``,
        ``layers``, ``frame_map``, ``anchor_map``, ``link_map``,
        ``link_source_map``, ``toc_anchor``), converts every run that belongs
        directly to ``p`` via ``self.convert_run`` and appends the result to
        the output element, attaches pending bookmark/TOC anchors as ``id``
        attributes, retags the output as ``h1``..``h6`` for "heading N"
        style names, sets ``dir="rtl"`` when ``style.bidi`` is True, wraps
        groups of spans that share a border, and pads empty paragraphs and
        trailing ``<br>`` elements with ``NBSP``.

        Returns the newly created output element.
        """
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.frame_map[p] = style.frame
        self.add_frame(dest, style.frame)

        # Anchor generated for a bookmark / TOC field that has not yet been
        # attached to an output element.
        current_anchor = None
        # Most recently seen <w:hyperlink>; reset to None when a run is found
        # that has no hyperlink ancestor.
        current_hyperlink = None
        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            # Walks up via getparent() and returns the nearest ancestor whose
            # tag ends with '}p'. When the walk runs off the top of the tree
            # (getparent() yields an object without .tag) the loop breaks and
            # the function implicitly returns None.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
            # Skip descendants whose nearest paragraph ancestor is not p
            # itself (i.e. content of a nested paragraph).
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # The pending anchor goes on the paragraph when this is
                    # its first span, otherwise on the span itself.
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        # IndexError here means the run has no hyperlink
                        # ancestor, so we have left the hyperlink.
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = self.namespace.get(x, 'w:name')
                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                    # _GoBack is a special bookmark inserted by Word 2010 for
                    # the return to previous edit feature, we ignore it
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        # so re-point every name that mapped to it at the new
                        # anchor. tuple() snapshots the items because the map
                        # is assigned to while looping.
                        for a, t in tuple(iteritems(self.anchor_map)):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x
            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
                # Field instruction text starting with 'TOC ': register an
                # anchor under a random UUID key and remember it as
                # self.toc_anchor.
                old_anchor = current_anchor
                anchor = unicode_type(uuid.uuid4())
                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
                self.toc_anchor = current_anchor
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        if current_anchor is not None:
            # This paragraph had no <w:r> descendants
            # NOTE(review): this branch is also reached when a bookmark/TOC
            # anchor follows the last run; in that case any id already set
            # on dest by the first run is overwritten -- confirm intended.
            dest.set('id', current_anchor)
            current_anchor = None

        # Style names of the form "heading N" become <hN>, with N clamped to
        # the range 1..6.
        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.bidi is True:
            dest.set('dir', 'rtl')

        # Collect groups of consecutive spans whose run styles report the
        # same border. Note that `style` is rebound to run styles from here
        # on when dest has children.
        # NOTE(review): the span that ends a group is not placed in a new
        # group, and a group still open when the loop finishes is never
        # added to common_borders -- confirm this is intended.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        # For each recorded group: gather the border CSS into bs, clear it
        # from the individual run styles, and wrap the spans in one SPAN
        # carrying a registered 'text_border' class.
        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        # `style` is still the paragraph style whenever has_visible_border()
        # is evaluated here: the preceding `len(dest) == 0` test
        # short-circuits otherwise, and with no children the loops above
        # never rebound it.
        if not dest.text and len(dest) == 0 and not style.has_visible_border():
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest
Beispiel #3
0
    def convert_p(self, p):
        """Convert the paragraph element ``p`` into a new ``P()`` element.

        Records ``p`` in ``self.object_map`` and ``self.layers``, converts
        each ``w:r`` descendant with ``self.convert_run`` and appends the
        result to the output element, attaches pending bookmark anchors as
        ``id`` attributes, registers spans that sit inside a hyperlink in
        ``self.link_map``, retags the output as ``h1``..``h6`` for
        "heading N" style names, sets ``dir="rtl"`` for right-to-left
        paragraphs, and wraps groups of spans that share a border.

        Returns the newly created output element.
        """
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        # Anchor generated for a bookmark that has not yet been attached to
        # an output element.
        current_anchor = None
        # Most recently seen <w:hyperlink>; reset once a run outside any
        # hyperlink is encountered.
        current_hyperlink = None

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # The pending anchor goes on the paragraph when this is
                    # its first span, otherwise on the span itself.
                    target = dest if len(dest) == 0 else span
                    target.set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    hl = ancestor(x, 'w:hyperlink')
                    if hl is not None:
                        self.link_map[hl].append(span)
                    else:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    # .values() instead of .itervalues(): the latter does not
                    # exist on Python 3 dicts, while .values() works on both
                    # Python 2 and 3.
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.values()))
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        # Style names of the form "heading N" become <hN>, with N clamped to
        # the range 1..6.
        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
                     re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        # Collect groups of consecutive spans whose run styles report the
        # same border. A group is recorded (if it has more than one span)
        # when a span with a different border is met; the grouping rules are
        # kept exactly as they were.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            run_style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(run_style):
                border_runs.append((span, run_style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        # Move the shared border CSS from the individual runs onto a single
        # wrapping SPAN with a registered 'text_border' class.
        for border_run in common_borders:
            spans = []
            bs = {}
            for span, run_style in border_run:
                run_style.get_border_css(bs)
                run_style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        return dest
Beispiel #4
0
    def handle_filemaker(self, root, **options):
        """Import card records from a FileMaker XML export rooted at ``root``.

        ``options['username']`` names the user recorded as creator/updater;
        ``options['force_update']`` is read for already-existing documents
        whose metadata did not change, and forces their transcript to be
        rewritten when truthy.

        For every ``fmp:ROW`` the method creates or updates a collection
        document, a card document, its topic assignments, links, metadata
        and transcript. Rows with ``Export == 'No'`` delete an existing
        document (or are skipped), and rows with missing required values or
        ambiguous single-value fields are skipped with a warning. Summary
        counts are written to ``self.stderr`` at the end.

        Raises ``CommandError`` when the user does not exist or when the
        export's field list differs from the expected one.
        """

        # Prefix for import IDs.
        ID_PREFIX = 'inglis:'

        # Load user to create/update documents as.
        try:
            user = User.objects.get(username=options['username'])
        except User.DoesNotExist:
            raise CommandError('unknown user: %s' % options['username'])

        # Verify that we have the fields we expect.
        expected_fields = [
            'CardID', 'Export', 'CardHeading', 'Transcription', 'CardType',
            'CardFormat', 'Language', 'ContributorJoin::Contributors',
            'SubjectJoin::Subject', 'OrganizationJoin::Organizations',
            'Citation', 'CatalogLink', 'ResourceLink', 'AdditionalNotes',
            'DateEntered'
        ]
        fields = root.xpath('./fmp:METADATA/fmp:FIELD/@NAME', namespaces=NS)
        if fields != expected_fields:
            new_fields = [
                name for name in fields if name not in expected_fields]
            missing_fields = [
                name for name in expected_fields if name not in fields]
            message = 'fields have changed:\n'
            message += '\n'.join(ndiff(expected_fields, fields)) + '\n\n'
            # Build the sections separately and join them with a newline so
            # that "Unexpected fields:" does not run into the last missing
            # field name when both sections are present.
            sections = []
            if missing_fields:
                sections.append('Missing fields:\n  ' +
                                '\n  '.join(missing_fields))
            if new_fields:
                sections.append('Unexpected fields:\n  ' +
                                '\n  '.join(new_fields))
            message += '\n'.join(sections)
            raise CommandError(message)

        # Column position of each field within a row, computed once instead
        # of searching the list on every lookup.
        field_index = {
            name: position for position, name in enumerate(expected_fields)}
        br_tag = '{%s}BR' % NS['fmp']

        # Utility functions for accessing XML data.
        def text(e):
            # Text of a data element with each BR child rendered as a
            # newline; any other child element is an error.
            parts = [e.text or '']
            for child in e:
                if child.tag != br_tag:
                    raise CommandError('Unexpected element: %s' % child)
                parts.append('\n%s' % (child.tail or ''))
            return ''.join(parts).strip()

        def values(row, field):
            # Distinct non-empty values of a field (order not significant).
            column = row[field_index[field]]
            return list(set(v for v in (text(e) for e in column) if v))

        def value(row, field):
            # Single value of a field, None when absent; more than one
            # distinct value is an error.
            v = values(row, field)
            if len(v) == 0:
                return None
            elif len(v) == 1:
                return v[0]
            else:
                raise CommandError('multiple values for %s in record %s' %
                                   (field, row.get('RECORDID')))

        def row_to_dict(row):
            # 'Join::' fields are multi-valued lists; all others are scalar.
            d = {}
            for field in expected_fields:
                if 'Join::' in field:
                    d[field] = values(row, field)
                else:
                    d[field] = value(row, field)
            return d

        def assign_topic(document, user, topic_name, topic_type=''):
            # Get or create the topic, (re)set its type, attach it to the
            # document, and report 1 if the topic was newly created.
            topic, topic_created = Topic.objects.get_or_create(
                slug=Topic.make_slug(topic_name),
                defaults={
                    'preferred_name': topic_name,
                    'creator': user,
                    'last_updater': user
                })
            topic.type = topic_type
            topic.save()
            TopicAssignment.objects.create(content_object=document,
                                           topic=topic,
                                           creator=user)
            return 1 if topic_created else 0

        # Field holding topic names -> topic type assigned to those topics.
        topic_sources = (
            ('ContributorJoin::Contributors', 'PER'),
            ('OrganizationJoin::Organizations', 'ORG'),
            ('SubjectJoin::Subject', ''),
        )

        # Statistics.
        created_count = 0
        collections_created_count = 0
        topics_created_count = 0
        changed_count = 0
        unchanged_count = 0
        skipped_count = 0
        deleted_count = 0

        for row in root.xpath('./fmp:RESULTSET/fmp:ROW', namespaces=NS):
            try:
                md = row_to_dict(row)
                for field in [
                        'CardID', 'CardHeading', 'CardType', 'Transcription'
                ]:
                    if md[field] is None:
                        raise CommandError('missing %s value in record %s' %
                                           (field, row.get('RECORDID')))
            except CommandError as e:
                # A bad row is reported and skipped rather than aborting the
                # whole import.
                self.stderr.write(self.style.ERROR('Warning: %s\n' % e))
                skipped_count += 1
                continue

            if md['Export'] == 'No':
                exists = Document.objects.filter(import_id__exact='%s%s' %
                                                 (ID_PREFIX, md['CardID']))
                if exists:
                    exists[0].delete()
                    deleted_count += 1
                else:
                    skipped_count += 1
                continue

            collection_id = ID_PREFIX + (':%s' % md['CardHeading'])
            collection_description = P('%s (Agnes Inglis cards)' %
                                       md['CardHeading'])
            collection, collection_created = Document.objects.get_or_create(
                import_id=collection_id,
                defaults={
                    'description': collection_description,
                    'creator': user,
                    'last_updater': user
                })
            if collection_created:
                collections_created_count += 1

            description = P('%s -- %s (Agnes Inglis card #%s)' %
                            (md['CardHeading'], md['CardType'], md['CardID']))
            document, created = Document.objects.get_or_create(
                import_id=(ID_PREFIX + md['CardID']),
                defaults={
                    'description': description,
                    'creator': user,
                    'last_updater': user
                })
            document.description = description
            document.collection = collection
            document.language = md['Language']
            document.save()

            # Set document topics: drop the old assignments, then recreate
            # them from the row.
            for topic_assignment in document.related_topics.all():
                topic_assignment.delete()
            for field, topic_type in topic_sources:
                for topic_name in md[field]:
                    topics_created_count += assign_topic(
                        document, user, topic_name, topic_type)

            # Set document links.
            for link in document.links.all():
                link.delete()
            for url in [md['CatalogLink'], md['ResourceLink']]:
                if url is not None:
                    document.links.create(url=url, creator=user)

            # Set document metadata.
            changed = document.set_metadata(md, user)

            # Create or update document transcript. Each line is followed by
            # a BR; the final trailing BR is sliced off.
            transcript_lines = md['Transcription'].split('\n')
            transcript_nodes = list(chain.from_iterable(
                (line, BR()) for line in transcript_lines))[:-1]
            transcript_html = P(*transcript_nodes)
            if created:
                Transcript.objects.create(document=document,
                                          content=transcript_html,
                                          creator=user,
                                          last_updater=user)
                created_count += 1
            elif changed or options['force_update']:
                document.transcript.content = transcript_html
                document.transcript.last_updater = user
                document.transcript.save()
                document.last_updater = user
                document.save()
                changed_count += 1
            else:
                unchanged_count += 1

        self.stderr.write('%s records skipped.\n' % skipped_count)
        self.stderr.write('%s records deleted.\n' % deleted_count)
        self.stderr.write('%s new documents created.\n' % created_count)
        self.stderr.write('%s new collections created.\n' %
                          collections_created_count)
        self.stderr.write('%s new topics created.\n' % topics_created_count)
        self.stderr.write('%s documents updated.\n' % changed_count)
        self.stderr.write('%s documents unchanged.\n' % unchanged_count)
Beispiel #5
0
    def convert_p(self, p):
        """Convert the paragraph element ``p`` into a new ``P()`` element.

        Records ``p`` in ``self.object_map`` and ``self.layers``, converts
        each ``w:r`` descendant with ``self.convert_run`` and appends the
        result to the output element, attaches pending bookmark anchors as
        ``id`` attributes, registers spans that sit inside a hyperlink in
        ``self.link_map`` / ``self.link_source_map``, retags the output as
        ``h1``..``h6`` for "heading N" style names, sets ``dir="rtl"`` for
        right-to-left paragraphs, wraps groups of spans that share a border,
        and pads empty paragraphs and trailing ``<br>`` elements with
        ``NBSP``.

        Returns the newly created output element.
        """
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        # Anchor generated for a bookmark that has not yet been attached to
        # an output element.
        current_anchor = None
        # Most recently seen <w:hyperlink>; reset once a run outside any
        # hyperlink is encountered.
        current_hyperlink = None
        hl_xpath = XPath('ancestor::w:hyperlink[1]')

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # The pending anchor goes on the paragraph when this is
                    # its first span, otherwise on the span itself.
                    target = dest if len(dest) == 0 else span
                    target.set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        # IndexError here means the run has no hyperlink
                        # ancestor, so we have left the hyperlink.
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    old_anchor = current_anchor
                    # .values()/.items() instead of .itervalues()/
                    # .iteritems(): the iter* methods do not exist on
                    # Python 3 dicts, while these work on both 2 and 3.
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.values()))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element,
                        # so re-point every name that mapped to it. tuple()
                        # snapshots the items because the map is assigned to
                        # while looping.
                        for a, t in tuple(self.anchor_map.items()):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        # Style names of the form "heading N" become <hN>, with N clamped to
        # the range 1..6.
        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
                     re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        # Collect groups of consecutive spans whose run styles report the
        # same border. A group is recorded (if it has more than one span)
        # when a span with a different border is met; the grouping rules are
        # kept exactly as they were.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            run_style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(run_style):
                border_runs.append((span, run_style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        # Move the shared border CSS from the individual runs onto a single
        # wrapping SPAN with a registered 'text_border' class.
        for border_run in common_borders:
            spans = []
            bs = {}
            for span, run_style in border_run:
                run_style.get_border_css(bs)
                run_style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            last = dest[-1]
            if last.tag == 'br':
                last.tail = NBSP
            elif len(last) > 0 and last[-1].tag == 'br' and not last[-1].tail:
                last[-1].tail = NBSP

        return dest