Beispiel #1
0
    def process_document(self, doc, debug=False):
        """Transform one source document into a Solr add-document and save it.

        The document is parsed, run through the stylesheet chosen by
        ``self._get_transform``, cleaned up, decorated with the site metadata
        and faux existence dates, and written into ``self.output_folder``
        under a filename derived from the record's ``id`` field. Anticipated
        failures are logged and the document is skipped; the method returns
        ``None`` on every path.

        @params:
        - doc: an indexable pair; doc[0] is the path of the document to
            transform and doc[1] its type ('xml' selects the XML parser,
            anything else the HTML parser)
        - debug: false (default): if set to true - transformed
            doc gets written to STDOUT for viewing
        """
        source = doc[0]

        def fields(node, name):
            # Absolute path, so it is evaluated against the whole tree that
            #  `node` currently belongs to, not just below `node`.
            return node.xpath('/add/doc/field[@name="%s"]' % name)

        # figure out its type and parse it with the matching parser
        if doc[1] == 'xml':
            try:
                tree = etree.parse(source)
            except IOError:
                log.error("Couldn't load %s. Skipping it.." % source)
                return
            except etree.XMLSyntaxError:
                log.error("Bad data file: %s. Skipping it.." % source)
                return
        else:
            try:
                tree = html.parse(source)
            except IOError:
                log.error("Couldn't load %s. Skipping it." % source)
                return

        # load and compile the stylesheet for this document
        transform = self._get_transform(tree)
        log.debug("Transforming %s with %s" % (source, transform))
        try:
            xsl = etree.parse(transform)
            xsl.xinclude()
            xsl = etree.XSLT(xsl)
        except IOError:
            log.error("No such transform: %s; skipping document: %s" % (transform, source))
            return
        except etree.XSLTParseError:
            log.error("Check the stylesheet; I can't parse it! %s" % transform)
            return

        # transform it!
        d = xsl(tree)

        # clean the date entries for solr
        clean_dates(d)

        try:
            # clean the fields with markup
            clean_markup(d)
        except ValueError:
            log.error("I think there's something wrong with the transformed result of: %s" % source)

        # strip empty elements - dates in particular cause
        #  solr to barf horribly...
        elements().strip_empty_elements(d)

        # From here on we work on the record element itself. A transform
        #  that produced no /add/doc leaves nothing to index, so skip the
        #  document instead of dying on an IndexError.
        records = d.xpath('/add/doc')
        if not records:
            log.error("No /add/doc in the transformed result of: %s. Skipping it." % source)
            return
        d = records[0]

        # add the site metadata into the record
        d = self.add_field(d, 'site_code', self.metadata['site_code'])
        d = self.add_field(d, 'site_name', self.metadata['site_name'])
        d = self.add_field(d, 'site_url', self.metadata['site_url'])
        d = self.add_field(d, 'data_type', 'OHRM')

        # Add the faux existence dates, but only if the record has a from or
        #  to date defined - for records where neither is defined we skip
        #  this step so we don't get dodgy results.
        date_from = fields(d, 'date_from')
        if date_from or fields(d, 'date_to'):
            if date_from:
                d = self.add_field(d, 'exist_from', date_from[0].text)

            date_to = fields(d, 'date_to')
            if date_to:
                d = self.add_field(d, 'exist_to', date_to[0].text)

            # whichever end of the existence range is still missing is
            #  filled in from the end that is present
            exist_from = fields(d, 'exist_from')
            exist_to = fields(d, 'exist_to')
            if exist_from and not exist_to:
                d = self.add_field(d, 'exist_to', exist_from[0].text)
            elif exist_to and not exist_from:
                d = self.add_field(d, 'exist_from', exist_to[0].text)

        # Now we want to save the document to self.output_folder.
        #
        # To ensure we never get a name clash, use the value of id as the
        #  filename, suitably transformed to something sensible. The lookup
        #  has to happen before the record is moved into its new <add>
        #  wrapper below.
        uniqueid = fields(d, 'id')
        if not uniqueid:
            log.error("Couldn't get unique id for %s so I can't save it" % source)
            return

        add = etree.Element('add')
        add.append(d)

        # when testing against a single document, this is the line that spits
        #  the result to stdout for viewing
        if debug:
            print(etree.tostring(add, pretty_print=True))

        try:
            # drop the scheme from the id and flatten the remaining path
            uniqueid = uniqueid[0].text.split('://')[1]
            output_file = os.path.join(self.output_folder, uniqueid.replace('/', '-'))
            log.debug("Writing output to: %s" % output_file)
            # tostring() returns an encoded byte string, so write it as bytes
            with open(output_file, 'wb') as f:
                f.write(etree.tostring(add, pretty_print=True))
        except Exception:
            # best effort: a record that can't be saved is reported and
            #  skipped, but interpreter-exit signals still propagate
            log.error("Couldn't save the output from: %s" % source)
Beispiel #2
0
    def process_item(self, item, series_id, metadata):
        """Transform one item into a Solr add-document and save it.

        The item is run through ``self.xsl``; the resulting ``/add/doc``
        record is extended with identifying fields, the site metadata, any
        image listings and faux existence dates, cleaned up, and written
        into ``self.output_folder`` under a filename derived from the
        record id.

        @params:
        - item: the element to transform; its 'id' attribute is used as the
            item id
        - series_id: series identifier; becomes part of the record id and is
            stored in the 'series_id' field
        - metadata: mapping providing 'site_code', 'site_name' and
            'site_url' (and 'source' when self.images is set)
        """
        item_id = item.attrib['id']
        doc = self.xsl(item)

        def field(name, text):
            # build a <field name="..."> element holding `text`
            element = etree.Element('field', name=name)
            element.text = text
            return element

        record_id = "%s/%s#%s" % (self.source, series_id, item_id)

        d = doc.xpath('/add/doc')[0]
        d.append(field('id', record_id))

        # add the site metadata into the record
        d.append(field('site_code', metadata['site_code']))
        d.append(field('site_url', metadata['site_url']))
        d.append(field('site_name', metadata['site_name']))
        d.append(field('data_type', 'HDMS'))
        d.append(field('series_id', series_id))
        d.append(field('item_id', item_id))

        # process any item images - if there are any
        if self.images is not None:
            # stash the image path
            d.append(field('source', metadata['source']))

            # list the small images, then the large ones, each in sorted order
            for size in ('small', 'large'):
                try:
                    names = os.listdir(os.path.join(self.images, item_id, size))
                except OSError:
                    # the folder can't be listed - the item simply gets no
                    #  images of this size
                    continue
                for name in sorted(names):
                    d.append(field('%s_images' % size, name))

        # clean the date entries for solr
        clean_dates(d)

        try:
            # clean the fields with markup
            clean_markup(d)
        except ValueError:
            log.error(
                "I think there's something wrong with the transformed result of: %s"
                % item_id)

        # strip empty elements - dates in particular cause
        #  solr to barf horribly...
        elements().strip_empty_elements(d)

        # Add the faux existence dates, but only if the record has a from or
        #  to date defined - for records where neither is defined we skip
        #  this step so we don't get dodgy results.
        date_from = d.xpath('/add/doc/field[@name="date_from"]')
        date_to = d.xpath('/add/doc/field[@name="date_to"]')
        if date_from or date_to:
            if date_from:
                d.append(field('exist_from', date_from[0].text))

            # an open-ended record exists up to the configured upper bound
            if date_to:
                exist_to = date_to[0].text
            else:
                exist_to = "%sT00:00:00Z" % self.date_upper_bound
            d.append(field('exist_to', exist_to))

        try:
            # drop the scheme from the id and flatten it into a filename
            uniqueid = record_id.split('://')[1].replace('#', '-')
            output_file = os.path.join(self.output_folder,
                                       uniqueid.replace('/', '-'))
            log.debug("Writing output to: %s" % output_file)
            # tostring() returns an encoded byte string, so write it as bytes
            with open(output_file, 'wb') as f:
                f.write(etree.tostring(doc, pretty_print=True))
        except Exception:
            # `doc` is the transform result and can't be indexed, so the
            #  failure is reported against the item id; the record is dumped
            #  to stdout so it can be inspected
            log.error("Couldn't save the output from: %s" % item_id)
            print(etree.tostring(doc, pretty_print=True))
Beispiel #3
0
    def process_item(self, item, series_id, metadata):
        """Transform one item into a Solr add-document and save it.

        The item is run through ``self.xsl``; the resulting ``/add/doc``
        record is extended with identifying fields, the site metadata, any
        image listings and faux existence dates, cleaned up, and written
        into ``self.output_folder`` under a filename derived from the
        record id.

        @params:
        - item: the element to transform; its 'id' attribute is used as the
            item id
        - series_id: series identifier; becomes part of the record id and is
            stored in the 'series_id' field
        - metadata: mapping providing 'site_code', 'site_name' and
            'site_url' (and 'source' when self.images is set)
        """
        item_id = item.attrib['id']
        doc = self.xsl(item)

        def field(name, text):
            # build a <field name="..."> element holding `text`
            element = etree.Element('field', name=name)
            element.text = text
            return element

        record_id = "%s/%s#%s" % (self.source, series_id, item_id)

        d = doc.xpath('/add/doc')[0]
        d.append(field('id', record_id))

        # add the site metadata into the record
        d.append(field('site_code', metadata['site_code']))
        d.append(field('site_url', metadata['site_url']))
        d.append(field('site_name', metadata['site_name']))
        d.append(field('data_type', 'HDMS'))
        d.append(field('series_id', series_id))
        d.append(field('item_id', item_id))

        # process any item images - if there are any
        if self.images is not None:
            # stash the image path
            d.append(field('source', metadata['source']))

            # list the small images, then the large ones, each in sorted order
            for size in ('small', 'large'):
                try:
                    names = os.listdir(os.path.join(self.images, item_id, size))
                except OSError:
                    # the folder can't be listed - the item simply gets no
                    #  images of this size
                    continue
                for name in sorted(names):
                    d.append(field('%s_images' % size, name))

        # clean the date entries for solr
        clean_dates(d)

        try:
            # clean the fields with markup
            clean_markup(d)
        except ValueError:
            log.error("I think there's something wrong with the transformed result of: %s" % item_id)

        # strip empty elements - dates in particular cause
        #  solr to barf horribly...
        elements().strip_empty_elements(d)

        # Add the faux existence dates, but only if the record has a from or
        #  to date defined - for records where neither is defined we skip
        #  this step so we don't get dodgy results.
        date_from = d.xpath('/add/doc/field[@name="date_from"]')
        date_to = d.xpath('/add/doc/field[@name="date_to"]')
        if date_from or date_to:
            if date_from:
                d.append(field('exist_from', date_from[0].text))

            # an open-ended record exists up to the configured upper bound
            if date_to:
                exist_to = date_to[0].text
            else:
                exist_to = "%sT00:00:00Z" % self.date_upper_bound
            d.append(field('exist_to', exist_to))

        try:
            # drop the scheme from the id and flatten it into a filename
            uniqueid = record_id.split('://')[1].replace('#', '-')
            output_file = os.path.join(self.output_folder, uniqueid.replace('/', '-'))
            log.debug("Writing output to: %s" % output_file)
            # tostring() returns an encoded byte string, so write it as bytes
            with open(output_file, 'wb') as f:
                f.write(etree.tostring(doc, pretty_print=True))
        except Exception:
            # `doc` is the transform result and can't be indexed, so the
            #  failure is reported against the item id; the record is dumped
            #  to stdout so it can be inspected
            log.error("Couldn't save the output from: %s" % item_id)
            print(etree.tostring(doc, pretty_print=True))