def process_document(self, doc, debug=False):
    """Transform one source document and save the result for indexing.

    The document is parsed, run through the stylesheet returned by
    self._get_transform, cleaned up, decorated with the site metadata and
    the exist_from / exist_to fields, and finally written to
    self.output_folder under a name derived from the record's id field.

    Load, stylesheet and save failures are logged and the document is
    skipped.

    @params:
    - doc: indexable pair - doc[0] is the path to a document to try and
      transform, doc[1] its type ('xml' selects the XML parser, anything
      else the HTML parser)
    - debug: false (default): if set to true - transformed doc gets
      written to STDOUT for viewing
    """
    path = doc[0]

    def find_fields(record, name):
        # All <field name="..."> elements of the transformed record(s).
        return record.xpath('/add/doc/field[@name="%s"]' % name)

    # figure out its type
    if doc[1] == 'xml':
        try:
            tree = etree.parse(path)
        except IOError:
            log.error("Couldn't load %s. Skipping it.." % path)
            return
        except etree.XMLSyntaxError:
            log.error("Bad data file: %s. Skipping it.." % path)
            return
    else:
        try:
            tree = html.parse(path)
        except IOError:
            log.error("Couldn't load %s. Skipping it." % path)
            return
        except etree.XMLSyntaxError:
            # an unparsable (e.g. empty) HTML file is skipped just like
            # a broken XML one instead of aborting the caller
            log.error("Bad data file: %s. Skipping it.." % path)
            return

    transform = self._get_transform(tree)
    log.debug("Transforming %s with %s" % (path, transform))
    try:
        xsl = etree.parse(transform)
        xsl.xinclude()
        xsl = etree.XSLT(xsl)
    except IOError:
        log.error("No such transform: %s; skipping document: %s" % (transform, path))
        return
    except (etree.XMLSyntaxError, etree.XSLTParseError):
        # a stylesheet that is not even well-formed XML fails in
        # etree.parse with XMLSyntaxError rather than XSLTParseError
        log.error("Check the stylesheet; I can't parse it! %s" % transform)
        return

    # transform it!
    result = xsl(tree)

    # clean the date entries for solr
    clean_dates(result)

    try:
        # clean the fields with markup
        clean_markup(result)
    except ValueError:
        log.error("I think there's something wrong with the transformed result of: %s" % path)

    # strip empty elements - dates in particular cause
    #  solr to barf horribly...
    elements().strip_empty_elements(result)

    # add in the metadata the indexer uses
    records = result.xpath('/add/doc')
    if not records:
        # the stylesheet produced no record at all - nothing to save
        log.error("No /add/doc element in the transformed result of: %s" % path)
        return
    d = records[0]

    # add the site metadata into the record
    for name in ('site_code', 'site_name', 'site_url'):
        d = self.add_field(d, name, self.metadata[name])
    d = self.add_field(d, 'data_type', 'OHRM')

    # add in the faux start and end date as required
    #  but only if the record has a date from or to defined - for those
    #  records where it's not defined we skip this step so we don't
    #  get dodgy results
    if find_fields(d, 'date_from') or find_fields(d, 'date_to'):
        for date_name, exist_name in (('date_from', 'exist_from'),
                                      ('date_to', 'exist_to')):
            found = find_fields(d, date_name)
            if found:
                d = self.add_field(d, exist_name, found[0].text)

        exist_from = find_fields(d, 'exist_from')
        exist_to = find_fields(d, 'exist_to')
        if exist_from and not exist_to:
            # add the existence to date if from date and no to date
            d = self.add_field(d, 'exist_to', exist_from[0].text)
        elif exist_to and not exist_from:
            # add the existence from date if no from date and a to date
            d = self.add_field(d, 'exist_from', exist_to[0].text)

    # now we want to save the document to self.output_folder
    #
    # To ensure we never get a name clash, use the value of id as the
    #  filename, suitably transformed to something sensible
    uniqueid = d.xpath("/add/doc/field[@name='id']")
    if not uniqueid:
        log.error("Couldn't get unique id for %s so I can't save it" % path)
        return

    add = etree.Element('add')
    add.append(d)

    # when testing against a single document, this is the line that spits
    #  the result to stdout for viewing
    if debug:
        print(etree.tostring(add, pretty_print=True))

    try:
        # drop the scheme from the id and flatten it into a file name
        uniqueid = uniqueid[0].text.split('://')[1]
        output_file = os.path.join(self.output_folder, uniqueid.replace('/', '-'))
        log.debug("Writing output to: %s" % output_file)
        # tostring() hands back an encoded byte string, so write it as-is
        with open(output_file, 'wb') as f:
            f.write(etree.tostring(add, pretty_print=True))
    except Exception as e:
        # best effort: report the failure (and why) and carry on, but let
        # KeyboardInterrupt / SystemExit through
        log.error("Couldn't save the output from: %s (%s)" % (path, e))
def process_item(self, item, series_id, metadata):
    """Transform one item and save the result for indexing.

    The item is run through self.xsl, the identifying and site metadata
    fields are appended to the resulting /add/doc record, the image file
    names are listed (when self.images is set), the record is cleaned up
    and the exist_from / exist_to fields are added before the document is
    written to self.output_folder.

    @params:
    - item: the element to transform; its 'id' attribute is the item id
    - series_id: id of the series the item belongs to
    - metadata: mapping providing 'site_code', 'site_name', 'site_url'
      and, when self.images is set, 'source'
    """
    item_id = item.attrib['id']
    doc = self.xsl(item)

    records = doc.xpath('/add/doc')
    if not records:
        # the stylesheet produced no record at all - nothing to save
        log.error("No /add/doc element in the transformed result of: %s" % item_id)
        return
    d = records[0]

    def append_field(name, text):
        # Append <field name="...">text</field> to the record.
        field = etree.Element('field', name=name)
        field.text = text
        d.append(field)
        return field

    eid = append_field('id', "%s/%s#%s" % (self.source, series_id, item_id))

    # add the site metadata into the record
    append_field('site_code', metadata['site_code'])
    append_field('site_url', metadata['site_url'])
    append_field('site_name', metadata['site_name'])
    append_field('data_type', 'HDMS')
    append_field('series_id', series_id)
    append_field('item_id', item_id)

    # process any item images - if there are any
    if self.images is not None:
        # stash the image path
        append_field('source', metadata['source'])

        # generate the lists of small and large images
        for size in ('small', 'large'):
            try:
                names = sorted(os.listdir(os.path.join(self.images, item_id, size)))
            except OSError:
                # the folder could not be listed - no images of this size
                continue
            for name in names:
                append_field('%s_images' % size, name)

    # clean the date entries for solr
    clean_dates(d)

    try:
        # clean the fields with markup
        clean_markup(d)
    except ValueError:
        log.error("I think there's something wrong with the transformed result of: %s" % item_id)

    # strip empty elements - dates in particular cause
    #  solr to barf horribly...
    elements().strip_empty_elements(d)

    # add in the faux start and end date as required
    #  but only if the record has a date from or to defined - for those
    #  records where it's not defined we skip this step so we don't
    #  get dodgy results
    date_from = d.xpath('/add/doc/field[@name="date_from"]')
    date_to = d.xpath('/add/doc/field[@name="date_to"]')
    if date_from or date_to:
        # only add exist_from when there is a value for it: an empty field
        # appended here would undo the empty element stripping done above
        if date_from:
            append_field('exist_from', date_from[0].text)
        if date_to:
            append_field('exist_to', date_to[0].text)
        else:
            # open ended: fall back to the configured upper bound
            append_field('exist_to', "%sT00:00:00Z" % self.date_upper_bound)

    try:
        # drop the scheme from the id and flatten it into a file name
        uniqueid = eid.text.split('://')[1].replace('#', '-')
        output_file = os.path.join(self.output_folder, uniqueid.replace('/', '-'))
        log.debug("Writing output to: %s" % output_file)
        # tostring() hands back an encoded byte string, so write it as-is
        with open(output_file, 'wb') as f:
            f.write(etree.tostring(doc, pretty_print=True))
    except Exception as e:
        # best effort: report which item failed (and why) and carry on,
        # but let KeyboardInterrupt / SystemExit through
        log.error("Couldn't save the output from: %s (%s)" % (item_id, e))
        # dump the record to stdout so it can still be inspected
        print(etree.tostring(doc, pretty_print=True))
def process_item(self, item, series_id, metadata):
    """Transform one item and save the result for indexing.

    The item is run through self.xsl, the identifying and site metadata
    fields are appended to the resulting /add/doc record, the image file
    names are listed (when self.images is set), the record is cleaned up
    and the exist_from / exist_to fields are added before the document is
    written to self.output_folder.

    @params:
    - item: the element to transform; its 'id' attribute is the item id
    - series_id: id of the series the item belongs to
    - metadata: mapping providing 'site_code', 'site_name', 'site_url'
      and, when self.images is set, 'source'
    """
    item_id = item.attrib['id']
    doc = self.xsl(item)

    records = doc.xpath('/add/doc')
    if not records:
        # the stylesheet produced no record at all - nothing to save
        log.error("No /add/doc element in the transformed result of: %s" % item_id)
        return
    d = records[0]

    def append_field(name, text):
        # Append <field name="...">text</field> to the record.
        field = etree.Element('field', name=name)
        field.text = text
        d.append(field)
        return field

    eid = append_field('id', "%s/%s#%s" % (self.source, series_id, item_id))

    # add the site metadata into the record
    append_field('site_code', metadata['site_code'])
    append_field('site_url', metadata['site_url'])
    append_field('site_name', metadata['site_name'])
    append_field('data_type', 'HDMS')
    append_field('series_id', series_id)
    append_field('item_id', item_id)

    # process any item images - if there are any
    if self.images is not None:
        # stash the image path
        append_field('source', metadata['source'])

        # generate the lists of small and large images
        for size in ('small', 'large'):
            try:
                names = sorted(os.listdir(os.path.join(self.images, item_id, size)))
            except OSError:
                # the folder could not be listed - no images of this size
                continue
            for name in names:
                append_field('%s_images' % size, name)

    # clean the date entries for solr
    clean_dates(d)

    try:
        # clean the fields with markup
        clean_markup(d)
    except ValueError:
        log.error("I think there's something wrong with the transformed result of: %s" % item_id)

    # strip empty elements - dates in particular cause
    #  solr to barf horribly...
    elements().strip_empty_elements(d)

    # add in the faux start and end date as required
    #  but only if the record has a date from or to defined - for those
    #  records where it's not defined we skip this step so we don't
    #  get dodgy results
    date_from = d.xpath('/add/doc/field[@name="date_from"]')
    date_to = d.xpath('/add/doc/field[@name="date_to"]')
    if date_from or date_to:
        # only add exist_from when there is a value for it: an empty field
        # appended here would undo the empty element stripping done above
        if date_from:
            append_field('exist_from', date_from[0].text)
        if date_to:
            append_field('exist_to', date_to[0].text)
        else:
            # open ended: fall back to the configured upper bound
            append_field('exist_to', "%sT00:00:00Z" % self.date_upper_bound)

    try:
        # drop the scheme from the id and flatten it into a file name
        uniqueid = eid.text.split('://')[1].replace('#', '-')
        output_file = os.path.join(self.output_folder, uniqueid.replace('/', '-'))
        log.debug("Writing output to: %s" % output_file)
        # tostring() hands back an encoded byte string, so write it as-is
        with open(output_file, 'wb') as f:
            f.write(etree.tostring(doc, pretty_print=True))
    except Exception as e:
        # best effort: report which item failed (and why) and carry on,
        # but let KeyboardInterrupt / SystemExit through
        log.error("Couldn't save the output from: %s (%s)" % (item_id, e))
        # dump the record to stdout so it can still be inspected
        print(etree.tostring(doc, pretty_print=True))