def clean_stars(div: HtmlElement) -> None:
    """Tidy the markup of the "Stars" story in place.

    Steps, in order:

    1. Drop every node matched by ``p[strong[em]]`` and every ``<hr>``
       below *div*.
    2. Insert the story title as an ``<h1>`` at the start of *div*.
    3. Replace the first ``<p>`` child with an ``<h2>`` holding its
       title-cased text.
    4. Replace the ``<p>`` that wraps a ``<strong><a>`` link with a
       ``breakabove`` paragraph containing a single ``internal`` link that
       keeps the original ``href``.

    NOTE(review): ``element``/``elements``/``replace`` are helpers defined
    outside this block; ``element`` presumably returns the first match.
    """
    # Remove decorative paragraphs first, then horizontal rules.
    for query in ("p[strong[em]]", ".//hr"):
        for node in elements(div, query):
            node.drop_tree()

    div.insert(0, H1("Unspeakable Desolation Pouring Down From the Stars"))

    # The first paragraph becomes the sub-heading.
    first_para = element(div, "./p[1]")
    replace(first_para, H2(first_para.text_content().title()))

    # Rebuild the trailing link paragraph with the project's link classes.
    link_para = element(div, "./p[strong[a]]")
    anchor = element(div, "./p/strong/a")
    internal_link = A(
        link_para.text_content(),
        CLASS("internal"),
        href=anchor.attrib["href"],
    )
    replace(link_para, P(CLASS("breakabove"), internal_link))
def convert_p(self, p):
    """Convert the ``<w:p>`` element *p* into an HTML block and return it.

    Runs, bookmarks, hyperlinks and TOC field instructions that belong
    directly to *p* are processed in document order:

    * each ``w:r`` is converted with ``convert_run`` and appended to the
      result; a pending bookmark anchor becomes its ``id`` (or the block's
      ``id`` when the block is still empty);
    * each ``w:bookmarkStart`` with a new name registers an anchor in
      ``self.anchor_map``;
    * each ``w:hyperlink`` marks following runs as link candidates;
    * a ``w:instrText`` starting with ``'TOC '`` registers a ``toc``
      anchor, also stored in ``self.toc_anchor``.

    Afterwards the block is promoted to ``h1``-``h6`` for "heading N"
    styles, marked ``dir="rtl"`` for bidi paragraphs, runs with a common
    border are wrapped together, and NBSP padding is added so empty
    paragraphs and trailing ``<br>`` elements are rendered.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.frame_map[p] = style.frame
    self.add_frame(dest, style.frame)

    pending_anchor = None
    open_hyperlink = None
    find_hyperlink = self.namespace.XPath('ancestor::w:hyperlink[1]')

    def p_parent(node):
        # Ensure that nested <w:p> tags are handled. These can occur if a
        # textbox is present inside a paragraph.
        while True:
            node = node.getparent()
            try:
                if node.tag.endswith('}p'):
                    return node
            except AttributeError:
                # Ran off the top of the tree (or hit a non-string tag).
                break

    def retarget(stale, fresh):
        # The stale anchor was never applied to any element, so every
        # bookmark name that pointed at it must follow the fresh one.
        for name, target in tuple(iteritems(self.anchor_map)):
            if target == stale:
                self.anchor_map[name] = fresh

    wanted = ('w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText')
    for x in self.namespace.descendants(p, *wanted):
        if p_parent(x) is not p:
            continue
        tag = x.tag

        if tag.endswith('}r'):
            span = self.convert_run(x)
            if pending_anchor is not None:
                holder = span if len(dest) else dest
                holder.set('id', pending_anchor)
                pending_anchor = None
            if open_hyperlink is not None:
                try:
                    hl = find_hyperlink(x)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    x.set('is-link', '1')
                except IndexError:
                    # This run is no longer inside a hyperlink.
                    open_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)

        elif tag.endswith('}bookmarkStart'):
            anchor = self.namespace.get(x, 'w:name')
            # _GoBack is a special bookmark inserted by Word 2010 for
            # the return to previous edit feature, we ignore it
            if not anchor or anchor in self.anchor_map or anchor == '_GoBack':
                continue
            stale = pending_anchor
            pending_anchor = generate_anchor(
                anchor, frozenset(itervalues(self.anchor_map)))
            self.anchor_map[anchor] = pending_anchor
            if stale is not None:
                retarget(stale, pending_anchor)

        elif tag.endswith('}hyperlink'):
            open_hyperlink = x

        elif tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
            stale = pending_anchor
            anchor = unicode_type(uuid.uuid4())
            pending_anchor = generate_anchor(
                'toc', frozenset(itervalues(self.anchor_map)))
            self.anchor_map[anchor] = pending_anchor
            self.toc_anchor = pending_anchor
            if stale is not None:
                retarget(stale, pending_anchor)

    if pending_anchor is not None:
        # This paragraph had no <w:r> descendants
        dest.set('id', pending_anchor)
        pending_anchor = None

    heading = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if heading is not None:
        level = min(6, max(1, int(heading.group(1))))
        dest.tag = 'h%d' % level

    if style.bidi is True:
        dest.set('dir', 'rtl')

    # Group consecutive spans that share the same border.
    # NOTE(review): the span that ends a group is not carried into the next
    # group, and the final group is never flushed - confirm this is intended.
    group = []
    groups = []
    for span in dest:
        run_style = self.styles.resolve_run(self.object_map[span])
        if not group or group[-1][1].same_border(run_style):
            group.append((span, run_style))
        else:
            if len(group) > 1:
                groups.append(group)
            group = []

    for members in groups:
        spans = []
        border_css = {}
        for span, run_style in members:
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            spans.append(span)
        if border_css:
            cls = self.styles.register(border_css, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    if not dest.text and len(dest) == 0 and not style.has_visible_border():
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0:
        last = dest[-1]
        if not last.tail:
            if last.tag == 'br':
                last.tail = NBSP
            elif len(last) > 0 and last[-1].tag == 'br' and not last[-1].tail:
                last[-1].tail = NBSP

    return dest
def convert_p(self, p):
    """Convert the ``<w:p>`` element *p* into an HTML block and return it.

    Every ``w:r`` descendant is converted with ``convert_run`` and appended
    to the result. A preceding ``w:bookmarkStart`` registers an anchor in
    ``self.anchor_map`` that becomes the ``id`` of the next run (or of the
    block itself when it is still empty). Runs inside a ``w:hyperlink`` are
    recorded in ``self.link_map``. Paragraphs styled "heading N" become
    ``h1``-``h6``, right-to-left paragraphs get ``dir="rtl"``, and runs
    sharing a border are wrapped in a single bordered ``<span>``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, style.frame)

    current_anchor = None
    current_hyperlink = None

    for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # Anchor the block itself if this is its first run.
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                hl = ancestor(x, 'w:hyperlink')
                if hl is not None:
                    self.link_map[hl].append(span)
                else:
                    # This run is no longer inside a hyperlink.
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = get(x, 'w:name')
            if anchor and anchor not in self.anchor_map:
                # dict.values() works on Python 2 and 3; itervalues() is
                # Python 2 only and raises AttributeError on Python 3.
                # NOTE(review): an anchor that is not followed by a run in
                # this paragraph is registered but never applied.
                self.anchor_map[anchor] = current_anchor = generate_anchor(
                    anchor, frozenset(self.anchor_map.values()))
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        # Clamp the Word heading level to the HTML range h1..h6.
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n

    if style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Group consecutive spans that share the same border.
    # NOTE(review): the span that ends a group is not carried into the next
    # group, and the final group is never flushed - confirm this is intended.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        elif border_runs:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            # Move the border from the individual runs to the wrapper.
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    return dest
def handle_filemaker(self, root, **options):
    """Import Agnes Inglis card records from a FileMaker XML export.

    For each ``fmp:ROW`` under *root* the matching ``Document`` is created
    or updated together with its collection, topic assignments, links,
    metadata and transcript. Rows with ``Export == 'No'`` cause an existing
    document to be deleted (or are skipped when none exists). Rows missing
    a required value are reported on stderr and skipped. A summary of the
    counters is written to stderr at the end.

    Args:
        root: parsed export; must support ``xpath`` with the ``fmp``
            namespace from ``NS``.
        **options: reads ``options['username']`` (owner of created and
            updated objects) and ``options['force_update']`` (rewrite the
            transcript even when the metadata is unchanged).

    Raises:
        CommandError: if the user does not exist or the exported field
            list differs from the expected one.
    """
    # Prefix to prepend to import IDs.
    ID_PREFIX = 'inglis:'

    # Load user to create/update documents as.
    try:
        user = User.objects.get(username=options['username'])
    except User.DoesNotExist:
        raise CommandError('unknown user: %s' % options['username'])

    # Verify that we have the fields we expect.
    expected_fields = [
        'CardID', 'Export', 'CardHeading', 'Transcription', 'CardType',
        'CardFormat', 'Language', 'ContributorJoin::Contributors',
        'SubjectJoin::Subject', 'OrganizationJoin::Organizations',
        'Citation', 'CatalogLink', 'ResourceLink', 'AdditionalNotes',
        'DateEntered'
    ]
    fields = root.xpath('./fmp:METADATA/fmp:FIELD/@NAME', namespaces=NS)
    if not fields == expected_fields:
        new_fields = [name for name in fields if name not in expected_fields]
        missing_fields = [name for name in expected_fields if name not in fields]
        message = 'fields have changed:\n'
        message += '\n'.join(ndiff(expected_fields, fields)) + '\n\n'
        if missing_fields:
            # Trailing newline keeps this section apart from the next one.
            message += ('Missing fields:\n ' + '\n '.join(missing_fields) + '\n')
        if new_fields:
            message += ('Unexpected fields:\n ' + '\n '.join(new_fields))
        raise CommandError(message)

    # Utility functions for accessing XML data.
    def text(e):
        # Flatten an element to text; only <BR> children are allowed and
        # each one becomes a newline.
        content = e.text or ''
        for child in e:
            if not child.tag == '{%s}BR' % NS['fmp']:
                raise CommandError('Unexpected element: %s' % child)
            content += ('\n%s' % (child.tail or ''))
        return content.strip()

    def values(row, field):
        # Distinct non-empty values of *field*, in document order. A plain
        # set() would make the order vary from run to run.
        seen = set()
        unique = []
        for e in row[expected_fields.index(field)]:
            v = text(e)
            if v and v not in seen:
                seen.add(v)
                unique.append(v)
        return unique

    def value(row, field):
        # Single value of *field*: None when absent, error when ambiguous.
        v = values(row, field)
        if len(v) == 0:
            return None
        elif len(v) == 1:
            return v[0]
        else:
            raise CommandError('multiple values for %s in record %s'
                               % (field, row.get('RECORDID')))

    def row_to_dict(row):
        # 'Join::' fields are multi-valued; every other field is scalar.
        d = {}
        for field in expected_fields:
            if 'Join::' in field:
                d[field] = values(row, field)
            else:
                d[field] = value(row, field)
        return d

    def assign_topic(document, user, topic_name, topic_type=''):
        # Attach the named topic to the document, creating it if needed.
        # Returns 1 when a new topic was created, else 0.
        topic, topic_created = Topic.objects.get_or_create(
            slug=Topic.make_slug(topic_name),
            defaults={
                'preferred_name': topic_name,
                'creator': user,
                'last_updater': user
            })
        topic.type = topic_type
        topic.save()
        TopicAssignment.objects.create(
            content_object=document, topic=topic, creator=user)
        if topic_created:
            return 1
        else:
            return 0

    # Statistics.
    created_count = 0
    collections_created_count = 0
    topics_created_count = 0
    changed_count = 0
    unchanged_count = 0
    skipped_count = 0
    deleted_count = 0

    for row in root.xpath('./fmp:RESULTSET/fmp:ROW', namespaces=NS):
        try:
            md = row_to_dict(row)
            for field in ['CardID', 'CardHeading', 'CardType', 'Transcription']:
                if md[field] is None:
                    raise CommandError('missing %s value in record %s'
                                       % (field, row.get('RECORDID')))
        except CommandError as e:
            # A bad record is reported and skipped, not fatal.
            self.stderr.write(self.style.ERROR('Warning: %s\n' % e))
            skipped_count += 1
            continue

        if md['Export'] == 'No':
            # Not exported: remove a previously imported copy, if any.
            exists = Document.objects.filter(
                import_id__exact='%s%s' % (ID_PREFIX, md['CardID']))
            if exists:
                exists[0].delete()
                deleted_count += 1
            else:
                skipped_count += 1
            continue

        collection_id = ID_PREFIX + (':%s' % md['CardHeading'])
        collection_description = P(
            '%s (Agnes Inglis cards)' % md['CardHeading'])
        collection, collection_created = Document.objects.get_or_create(
            import_id=collection_id,
            defaults={
                'description': collection_description,
                'creator': user,
                'last_updater': user
            })
        if collection_created:
            collections_created_count += 1

        description = P('%s -- %s (Agnes Inglis card #%s)'
                        % (md['CardHeading'], md['CardType'], md['CardID']))
        document, created = Document.objects.get_or_create(
            import_id=(ID_PREFIX + md['CardID']),
            defaults={
                'description': description,
                'creator': user,
                'last_updater': user
            })
        document.description = description
        document.collection = collection
        document.language = md['Language']
        document.save()

        # Set document topics.
        for topic_assignment in document.related_topics.all():
            topic_assignment.delete()
        for topic_name in md['ContributorJoin::Contributors']:
            topics_created_count += assign_topic(
                document, user, topic_name, 'PER')
        for topic_name in md['OrganizationJoin::Organizations']:
            topics_created_count += assign_topic(
                document, user, topic_name, 'ORG')
        for topic_name in md['SubjectJoin::Subject']:
            topics_created_count += assign_topic(document, user, topic_name)

        # Set document links.
        for link in document.links.all():
            link.delete()
        for url in [md['CatalogLink'], md['ResourceLink']]:
            if url is not None:
                document.links.create(url=url, creator=user)

        # Set document metadata.
        changed = document.set_metadata(md, user)

        # Create or update document transcript: one <br> between lines,
        # none after the last line.
        transcript_html = P(*list(
            chain.from_iterable(
                ((line, BR())
                 for line in md['Transcription'].split('\n'))))[:-1])
        if created:
            Transcript.objects.create(
                document=document, content=transcript_html,
                creator=user, last_updater=user)
            created_count += 1
        elif changed or options['force_update']:
            document.transcript.content = transcript_html
            document.transcript.last_updater = user
            document.transcript.save()
            document.last_updater = user
            document.save()
            changed_count += 1
        else:
            unchanged_count += 1

    self.stderr.write('%s records skipped.\n' % skipped_count)
    self.stderr.write('%s records deleted.\n' % deleted_count)
    self.stderr.write('%s new documents created.\n' % created_count)
    self.stderr.write('%s new collections created.\n' % collections_created_count)
    self.stderr.write('%s new topics created.\n' % topics_created_count)
    self.stderr.write('%s documents updated.\n' % changed_count)
    self.stderr.write('%s documents unchanged.\n' % unchanged_count)
def convert_p(self, p):
    """Convert the ``<w:p>`` element *p* into an HTML block and return it.

    Every ``w:r`` descendant is converted with ``convert_run`` and appended
    to the result. A preceding ``w:bookmarkStart`` registers an anchor in
    ``self.anchor_map`` that becomes the ``id`` of the next run (or of the
    block itself when it is still empty). Runs inside a ``w:hyperlink`` are
    recorded in ``self.link_map`` / ``self.link_source_map`` and flagged
    with ``is-link``. Paragraphs styled "heading N" become ``h1``-``h6``,
    right-to-left paragraphs get ``dir="rtl"``, runs sharing a border are
    wrapped in one bordered ``<span>``, and NBSP padding is added so empty
    paragraphs and trailing ``<br>`` elements are rendered.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, style.frame)

    current_anchor = None
    current_hyperlink = None
    hl_xpath = XPath('ancestor::w:hyperlink[1]')

    for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # Anchor the block itself if this is its first run.
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                try:
                    hl = hl_xpath(x)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    x.set('is-link', '1')
                except IndexError:
                    # This run is no longer inside a hyperlink.
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = get(x, 'w:name')
            if anchor and anchor not in self.anchor_map:
                old_anchor = current_anchor
                # dict.values()/items() work on Python 2 and 3; the iter*
                # variants are Python 2 only and fail on Python 3.
                self.anchor_map[anchor] = current_anchor = generate_anchor(
                    anchor, frozenset(self.anchor_map.values()))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(self.anchor_map.items()):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        # Clamp the Word heading level to the HTML range h1..h6.
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n

    if style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Group consecutive spans that share the same border.
    # NOTE(review): the span that ends a group is not carried into the next
    # group, and the final group is never flushed - confirm this is intended.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        elif border_runs:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            # Move the border from the individual runs to the wrapper.
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        if dest[-1].tag == 'br':
            dest[-1].tail = NBSP
        elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
            dest[-1][-1].tail = NBSP

    return dest