Example #1
0
 def save_to_file(self, filename):
     """Write the content of all document elements to the file 'filename'.

     The elements are visited by starting at the first item of
     self.elements and following get_next() until it returns a false
     value. The content of each element is encoded as ASCII (characters
     that cannot be encoded are replaced), passed through protectNode and
     then written to the file.

     Arguments:
        filename -
           path of the output file; it is opened in "w" mode, so an
           existing file with that name is overwritten

     Return value: None """
     # NOTE: Cannot use a simple loop over the self.elements field
     # because it only has the original sequence of tags and
     # insertions and deletions are not maintained on it. Assumes
     # that the first element of self.elements is always the first
     # element of the XML file, even if the file has been changed.
     # The with-statement makes sure the file is flushed and closed, also
     # when one of the elements raises an error while being written.
     with open(filename, "w") as outfile:
         element = self.elements[0]
         while element:
             content = element.get_content()
             content = content.encode('ascii', 'replace')
             outfile.write(protectNode(content))
             element = element.get_next()
Example #2
0
    def create_fragments(self, tagname, wrapping_tag=None, remove_tags=False):
        """ Fragments are pairs of a file basename and a DocElement that
        contains an opening tag. The file basename points to a fragment file
        in the temporary data directory. The DocElement points to the tag in
        the document in which the fragment is contained and it can be used to
        update the content of that tag. The DocElement instance knows how to
        get the content between opening and closing tags. For each tag, the
        elements between the opening and closing tags are extracted and put
        in a separate fragment file named 'fragment_NNN' (numbered from 001)
        with extension self.CREATION_EXTENSION, located in self.DIR_DATA.
        The list of [basename, tag] pairs is stored in self.fragments.

        Arguments:
           tagname -
              name of the tag that contains the fragments from input file
              that need to be processed
              NOTE(review): this argument is currently not used, the tags
              that are processed are those in self.document.tags[self.tag];
              confirm whether that is intended
           wrapping_tag -
              name of the tag that is used to wrap the content of the
              fragment file
           remove_tags -
              a boolean that indicates whether tags should be removed from
              the content of the fragment, which only makes sense for the
              source file

        Return value: None """

        # This should not be hard-coded like this. We also need a smarter
        # way, one that does not slow down when large amounts of tags need
        # to be removed. One way would be to to change the decompose method
        # in formatConversor.ChunkerMarc2tarsqi
        # The opening and closing markers are built once here, in the order
        # in which they are removed from each piece of text.
        markers = []
        for name in ('JG', 'IN-MW', 'Para'):
            markers.append('<' + name + '>')
            markers.append('</' + name + '>')

        self.fragments = []
        for index, tag in enumerate(self.document.tags[self.tag], 1):
            text_list = tag.collect_content_list()
            base = "fragment_%03d" % index
            self.fragments.append([base, tag])
            file_name = self.DIR_DATA + os.sep + base + '.' + self.CREATION_EXTENSION
            # The with-statement closes the fragment file even when encoding
            # or writing one of the text pieces raises an error.
            with open(file_name, "w") as frag_file:
                if wrapping_tag:
                    frag_file.write("<%s>" % wrapping_tag)
                for text in text_list:
                    text = text.encode('ascii', 'replace')
                    text = protectNode(text)
                    if remove_tags:
                        for marker in markers:
                            text = text.replace(marker, '')
                    frag_file.write(text)
                if wrapping_tag:
                    frag_file.write("</%s>" % wrapping_tag)
Example #3
0
    def create_fragments(self, tagname, wrapping_tag=None, remove_tags=False):
        """ Fragments are pairs of a file basename and a DocElement that
        contains an opening tag. The file basename points to a fragment file
        in the temporary data directory. The DocElement points to the tag in
        the document in which the fragment is contained and it can be used to
        update the content of that tag. The DocElement instance knows how to
        get the content between opening and closing tags. For each tag, the
        elements between the opening and closing tags are extracted and put
        in a separate fragment file named 'fragment_NNN' (numbered from 001)
        with extension self.CREATION_EXTENSION, located in self.DIR_DATA.
        The list of [basename, tag] pairs is stored in self.fragments.

        Arguments:
           tagname -
              name of the tag that contains the fragments from input file
              that need to be processed
              NOTE(review): this argument is currently not used, the tags
              that are processed are those in self.document.tags[self.tag];
              confirm whether that is intended
           wrapping_tag -
              name of the tag that is used to wrap the content of the
              fragment file
           remove_tags -
              a boolean that indicates whether tags should be removed from
              the content of the fragment, which only makes sense for the
              source file

        Return value: None """

        # This should not be hard-coded like this. We also need a smarter
        # way, one that does not slow down when large amounts of tags need
        # to be removed. One way would be to to change the decompose method
        # in formatConversor.ChunkerMarc2tarsqi
        # The opening and closing markers are built once here, in the order
        # in which they are removed from each piece of text.
        markers = []
        for name in ('JG', 'IN-MW', 'Para'):
            markers.append('<' + name + '>')
            markers.append('</' + name + '>')

        self.fragments = []
        for index, tag in enumerate(self.document.tags[self.tag], 1):
            text_list = tag.collect_content_list()
            base = "fragment_%03d" % index
            self.fragments.append([base, tag])
            file_name = self.DIR_DATA + os.sep + base + '.' + self.CREATION_EXTENSION
            # The with-statement closes the fragment file even when encoding
            # or writing one of the text pieces raises an error.
            with open(file_name, "w") as frag_file:
                if wrapping_tag:
                    frag_file.write("<%s>" % wrapping_tag)
                for text in text_list:
                    text = text.encode('ascii', 'replace')
                    text = protectNode(text)
                    if remove_tags:
                        for marker in markers:
                            text = text.replace(marker, '')
                    frag_file.write(text)
                if wrapping_tag:
                    frag_file.write("</%s>" % wrapping_tag)