def save_to_file(self, filename):
    """Write the content of all document elements to the file `filename`.

    The elements are visited by starting at self.elements[0] and following
    get_next() until it returns a false value.

    NOTE: Cannot use a simple loop over the self.elements field because
    it only has the original sequence of tags, and insertions and
    deletions are not maintained on it. Assumes that the first element of
    self.elements is always the first element of the XML file, even if
    the file has been changed.

    The content of each element is encoded to ASCII with the 'replace'
    error handler and passed through protectNode() before it is written.

    Arguments:
       filename - path of the output file, which is opened in "w" mode

    Return value: None
    """
    # The with-statement guarantees that the file is closed (and its
    # buffer flushed) when the loop ends, and also when one of the element
    # methods, protectNode() or write() raises.
    with open(filename, "w") as outfile:
        element = self.elements[0]
        while element:
            string = element.get_content()
            string = string.encode('ascii', 'replace')
            string = protectNode(string)
            outfile.write(string)
            element = element.get_next()
def create_fragments(self, tagname, wrapping_tag=None, remove_tags=False):
    """Write the content of each selected tag to its own fragment file.

    Fragments are pairs of a file basename and a DocElement that contains
    an opening tag. The file basename points to a fragment file in the
    temporary data directory. The DocElement points to the tag in the
    document in which the fragment is contained and it can be used to
    update the content of that tag. The DocElement instance knows how to
    get the content between opening and closing tags.

    For each selected tag, the elements between the opening and closing
    tags are extracted and put in a separate fragment file named
    fragment_NNN.<self.CREATION_EXTENSION> inside self.DIR_DATA, where NNN
    is a counter that starts at 001. The [basename, tag] pairs are stored
    in self.fragments, which is reset on every call.

    NOTE: the tags are looked up in self.document.tags under the key
    self.tag; the tagname argument is not read by this method.

    Arguments:
       tagname - name of the tag that contains the fragments from the
          input file that need to be processed (currently unused, see
          the note above)
       wrapping_tag - name of the tag that is used to wrap the content
          of the fragment file; no wrapper is written when this is false
       remove_tags - a boolean that indicates whether tags should be
          removed from the content of the fragment, which only makes
          sense for the source file

    Return value: None
    """
    # This should not be hard-coded like this. We also need a smarter
    # way, one that does not slow down when large amounts of tags need to
    # be removed. One way would be to change the decompose method in
    # formatConversor.ChunkerMarc2tarsqi
    removable_tags = ('JG', 'IN-MW', 'Para')

    self.fragments = []
    for index, tag in enumerate(self.document.tags[self.tag], 1):
        text_list = tag.collect_content_list()
        base = "fragment_%03d" % index
        self.fragments.append([base, tag])
        file_name = self.DIR_DATA + os.sep + base + '.' + self.CREATION_EXTENSION
        # The with-statement closes the fragment file even when encoding,
        # protectNode() or a write fails halfway through the fragment.
        with open(file_name, "w") as frag_file:
            if wrapping_tag:
                frag_file.write("<%s>" % wrapping_tag)
            for t in text_list:
                t = t.encode('ascii', 'replace')
                t = protectNode(t)
                if remove_tags:
                    for s in removable_tags:
                        t = t.replace('<' + s + '>', '')
                        t = t.replace('</' + s + '>', '')
                frag_file.write(t)
            if wrapping_tag:
                frag_file.write("</%s>" % wrapping_tag)
def create_fragments(self, tagname, wrapping_tag=None, remove_tags=False):
    """Write the content of each selected tag to its own fragment file.

    Fragments are pairs of a file basename and a DocElement that contains
    an opening tag. The file basename points to a fragment file in the
    temporary data directory. The DocElement points to the tag in the
    document in which the fragment is contained and it can be used to
    update the content of that tag. The DocElement instance knows how to
    get the content between opening and closing tags.

    For each selected tag, the elements between the opening and closing
    tags are extracted and put in a separate fragment file named
    fragment_NNN.<self.CREATION_EXTENSION> inside self.DIR_DATA, where NNN
    is a counter that starts at 001. The [basename, tag] pairs are stored
    in self.fragments, which is reset on every call.

    NOTE: the tags are looked up in self.document.tags under the key
    self.tag; the tagname argument is not read by this method.

    Arguments:
       tagname - name of the tag that contains the fragments from the
          input file that need to be processed (currently unused, see
          the note above)
       wrapping_tag - name of the tag that is used to wrap the content
          of the fragment file; no wrapper is written when this is false
       remove_tags - a boolean that indicates whether tags should be
          removed from the content of the fragment, which only makes
          sense for the source file

    Return value: None
    """
    # This should not be hard-coded like this. We also need a smarter
    # way, one that does not slow down when large amounts of tags need to
    # be removed. One way would be to change the decompose method in
    # formatConversor.ChunkerMarc2tarsqi
    removable_tags = ('JG', 'IN-MW', 'Para')

    self.fragments = []
    for index, tag in enumerate(self.document.tags[self.tag], 1):
        text_list = tag.collect_content_list()
        base = "fragment_%03d" % index
        self.fragments.append([base, tag])
        file_name = self.DIR_DATA + os.sep + base + '.' + self.CREATION_EXTENSION
        # The with-statement closes the fragment file even when encoding,
        # protectNode() or a write fails halfway through the fragment.
        with open(file_name, "w") as frag_file:
            if wrapping_tag:
                frag_file.write("<%s>" % wrapping_tag)
            for t in text_list:
                t = t.encode('ascii', 'replace')
                t = protectNode(t)
                if remove_tags:
                    for s in removable_tags:
                        t = t.replace('<' + s + '>', '')
                        t = t.replace('</' + s + '>', '')
                frag_file.write(t)
            if wrapping_tag:
                frag_file.write("</%s>" % wrapping_tag)