Ejemplo n.º 1
0
    def replace_in_text(self, id, element, replace_text, ref_type):
        """Wrap the first occurrence of ``replace_text`` in an ``<xref>`` child.

        ``element.text`` is cut at the first occurrence of ``replace_text``:
        the part before it stays as ``element.text``, the matched text becomes
        the text of a new ``<xref rid=id ref-type=ref_type>`` element, and the
        part after it becomes that element's tail.  The new element is attached
        to ``element`` through ``NlmManipulate.append_safe``.

        Nothing is changed when ``element.text`` is empty or ``None``, when
        ``replace_text`` is empty, or when ``replace_text`` does not occur in
        ``element.text``.

        :param id: value stored (as unicode) in the ``rid`` attribute
        :param element: element whose leading text is searched
        :param replace_text: text to turn into the cross-reference
        :param ref_type: value stored in the ``ref-type`` attribute
        """
        text = element.text

        # An element without leading text reports None, str.split rejects an
        # empty separator, and without a match there is nothing to link:
        # appending an <xref> anyway would inject text the element never had.
        if not text or not replace_text or replace_text not in text:
            return

        before, _, after = text.partition(replace_text)
        element.text = before

        new_element = etree.Element('xref')
        new_element.attrib['rid'] = unicode(id)
        new_element.attrib['ref-type'] = ref_type
        new_element.text = replace_text
        new_element.tail = after

        NlmManipulate.append_safe(element, new_element, self)
Ejemplo n.º 2
0
    def replace_in_text(self, id, element, replace_text, ref_type):
        """Split ``element.text`` around ``replace_text`` and insert an ``<xref>``.

        The text is split once on ``replace_text``.  The leading piece is kept
        as ``element.text``; a new ``<xref>`` element carrying ``rid`` (the
        unicode form of ``id``) and ``ref-type`` (``ref_type``) receives
        ``replace_text`` as its text and whatever followed the split point as
        its tail.  The ``<xref>`` is then handed to
        ``NlmManipulate.append_safe`` together with ``element``.
        """
        pieces = element.text.split(replace_text, 1)
        element.text = pieces[0]

        # Everything after the split point trails the cross-reference.
        remainder = ''.join(pieces[1:])

        xref = etree.Element('xref')
        xref.set('rid', unicode(id))
        xref.set('ref-type', ref_type)
        xref.text = replace_text
        xref.tail = remainder

        NlmManipulate.append_safe(element, xref, self)
Ejemplo n.º 3
0
    def run_graphics_sibling(self):
        """Derive labels and captions for ``<graphic>`` elements from nearby text.

        For every ``<graphic>`` in the loaded tree a caption source is looked
        for in this order: the element following the graphic's parent (when it
        is a ``<p>``), the element preceding it (when it is a ``<p>``), and
        finally the first ``<title>`` of the closest ancestor whose tag ends in
        ``sec``.  A source qualifies when its stripped text matches one of the
        caption patterns compiled below (colon form first, then dot form).

        The text before the first separator becomes the graphic's ``<label>``
        and the remainder becomes a ``<caption><p>``.  The source element is
        removed (a consumed ``<title>`` is replaced by an empty one), the
        graphic receives an ``id`` when it has none, and the collected ids and
        titles are passed to ``self.link`` before the tree is written to
        ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
        """
        # images are hard to handle because Word/OO puts them in different places
        # for instance, the caption can come before or after;
        # <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg" position="float"
        # orientation="portrait" xlink:type="simple"/>

        self.debug.print_debug(
            self,
            u'Attempting to classify captions for graphics objects [sibling]')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        graphics = tree.xpath('//graphic')

        graphic_titles = []
        graphic_ids = []
        graphic_regex_dot = re.compile(r'^.+?\s*\d+\..+')
        graphic_regex_colon = re.compile(r'^.+?\s*\d+\:.+')

        def caption_separator(candidate_text):
            # Return the separator of a caption-like text, or None.
            # The colon form is tried before the dot form.
            if graphic_regex_colon.match(candidate_text):
                return ':'
            if graphic_regex_dot.match(candidate_text):
                return '.'
            return None

        for graphic in graphics:
            source = None
            separator = None
            text = None

            container = graphic.getparent()
            p = container.getnext()
            pprev = container.getprevious()

            # 1. the paragraph after the graphic's container
            if p is not None and p.tag == 'p':
                text = manipulate.get_stripped_text(p)
                separator = caption_separator(text)
                if separator is not None:
                    source = p

            # 2. the paragraph before the graphic's container
            if source is None and pprev is not None and pprev.tag == 'p':
                text = manipulate.get_stripped_text(pprev)
                separator = caption_separator(text)
                if separator is not None:
                    source = pprev

            # 3. only when neither sibling matched: the title of the enclosing
            #    section
            if source is None:
                section = container

                while section is not None and not section.tag.endswith('sec'):
                    section = section.getparent()

                # Resolved after the walk so the lookup also works when the
                # graphic's container is itself the section.
                titles = section.xpath('title') if section is not None else []

                if len(titles) > 0:
                    text = manipulate.get_stripped_text(titles[0])
                    separator = caption_separator(text)
                    if separator is not None:
                        source = titles[0]

            if source is None:
                continue

            # likely this is a figure identifier; split on the first separator
            # only so punctuation inside the caption survives
            title, _, caption = text.partition(separator)
            title = title.strip()
            caption = caption.strip()

            self.debug.print_debug(
                self,
                u'Handling title and caption for "{0}"'.format(title))

            # use an existing label element if one exists
            labels = graphic.xpath('label')
            if labels:
                title_element = labels[0]
            else:
                title_element = etree.Element('label')
                graphic.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            new_p = etree.Element('p')
            new_p.text = caption

            NlmManipulate.append_safe(caption_element, new_p, self)
            NlmManipulate.append_safe(graphic, caption_element, self)

            if source.tag.endswith('title'):
                # leave an empty title in place of the consumed one
                new_title = etree.Element('title')
                new_title.text = ''
                source.addnext(new_title)

            source.getparent().remove(source)

            if graphic.tail:
                graphic.tail = graphic.tail.replace(title + separator, '')
                graphic.tail = graphic.tail.replace(caption + separator, '')
                graphic.tail = graphic.tail.replace(caption, '')

            if 'id' not in graphic.attrib:
                graphic.attrib['id'] = u'ID{0}'.format(
                    unicode(uuid.uuid4()))

            graphic_titles.append(title)
            graphic_ids.append(graphic.attrib['id'])

        paragraphs = tree.xpath('//p')

        self.link(graphic_ids, graphic_titles, paragraphs, 'fig')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Ejemplo n.º 4
0
    def run_tables(self):
        """Derive labels and captions for ``<table-wrap>`` elements.

        For every ``<table-wrap>`` in the loaded tree a caption source is
        looked for in this order: the next sibling, the previous sibling (each
        must be a ``<p>`` without a direct ``<graphic>`` child), and finally
        the first ``<title>`` of the table's parent.  A source qualifies when
        its stripped text matches one of the caption patterns compiled below
        (colon form first, then dot form).

        The text before the first separator becomes the table's ``<label>``
        and the remainder becomes a ``<caption><p>`` inserted at index 1.  The
        source element is removed, the table receives an ``id`` when it has
        none, and when the parent's title was consumed the parent's content is
        merged into a preceding sibling section or, failing that, into its
        parent section.  Finally the collected ids and titles are passed to
        ``self.link`` and the tree is written to ``self.gv.nlm_file_path`` and
        ``self.gv.nlm_temp_file_path``.
        """
        self.debug.print_debug(
            self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        def caption_separator(candidate_text):
            # Return the separator of a caption-like text, or None.
            # The colon form is tried before the dot form.
            if table_regex_colon.match(candidate_text):
                return ':'
            if table_regex_dot.match(candidate_text):
                return '.'
            return None

        def is_caption_candidate(node):
            # A sibling can hold the caption only if it exists, is a <p> and
            # has no direct <graphic> child.
            if node is None or node.tag != 'p':
                return False
            return not any(sub.tag == 'graphic' for sub in node)

        for table in tables:
            source = None
            separator = None
            text = None
            used_title = False
            old_title = None

            p = table.getnext()
            pprev = table.getprevious()

            # 1. the sibling after the table
            if is_caption_candidate(p):
                text = manipulate.get_stripped_text(p)
                separator = caption_separator(text)
                if separator is not None:
                    source = p

            # 2. the sibling before the table (may be missing altogether)
            if source is None and is_caption_candidate(pprev):
                text = manipulate.get_stripped_text(pprev)
                separator = caption_separator(text)
                if separator is not None:
                    source = pprev

            # 3. only when neither sibling matched: the title of the parent
            if source is None:
                titles = table.getparent().xpath('title')

                if len(titles) > 0:
                    text = manipulate.get_stripped_text(titles[0])
                    separator = caption_separator(text)
                    if separator is not None:
                        source = titles[0]
                        used_title = True

            if source is None:
                continue

            # likely this is a table identifier; split on the first separator
            # only so punctuation inside the caption survives
            title, _, caption = text.partition(separator)
            title = title.strip()
            caption = caption.strip()

            # strip all formatting from caption for ease of parsing
            # TODO: preserve formatting (far harder)
            new_p = etree.Element('p')
            new_p.text = caption

            if source.tag.endswith('title'):
                # leave an empty placeholder title; it is stripped again below
                new_title = etree.Element('title')
                new_title.text = ''
                old_title = new_title
                source.addnext(new_title)

            source.getparent().remove(source)

            self.debug.print_debug(
                self,
                u'Handling title and caption for "{0}"'.format(title))

            # use an existing label element if one exists
            labels = table.xpath('label')
            if labels:
                title_element = labels[0]
            else:
                title_element = etree.Element('label')
                table.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            NlmManipulate.append_safe(caption_element, new_p, self)
            table.insert(1, caption_element)

            if 'id' not in table.attrib:
                table.attrib['id'] = u'ID{0}'.format(unicode(uuid.uuid4()))

            table_titles.append(title)
            table_ids.append(table.attrib['id'])

            if used_title:
                # if we took the title out, then we should move the parent into its previous sibling and then
                # strip tags
                old_title.tag = 'REMOVE'

                etree.strip_elements(tree, 'REMOVE')

                section = table.getparent()

                previous = section.getprevious()

                while previous is not None and not previous.tag.endswith(
                        'sec'):
                    previous = previous.getprevious()

                if previous is not None:
                    previous.append(section)
                    section.tag = 'REMOVE'

                    etree.strip_tags(tree, 'REMOVE')

                    self.debug.print_debug(
                        self,
                        u'Moved table and siblings to previous section')
                else:
                    previous = section.getparent()

                    if previous is not None and previous.tag.endswith(
                            'sec'):
                        previous.append(section)
                        section.tag = 'REMOVE'

                        etree.strip_tags(tree, 'REMOVE')

                        self.debug.print_debug(
                            self,
                            u'Moved table and siblings to parent section')

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Ejemplo n.º 5
0
    def run_tables(self):
        """Derive labels and captions for ``<table-wrap>`` elements.

        For every ``<table-wrap>`` in the loaded tree a caption source is
        looked for in this order: the next sibling, the previous sibling (each
        must be a ``<p>`` without a direct ``<graphic>`` child), and finally
        the first ``<title>`` of the table's parent.  A source qualifies when
        its stripped text matches one of the caption patterns compiled below
        (colon form first, then dot form).

        When a source is found, the text before the first separator becomes
        the table's ``<label>`` and the remainder becomes a ``<caption><p>``
        inserted at index 1; the source element is removed, the table receives
        an ``id`` when it has none, and when the parent's title was consumed
        the parent's content is merged into a preceding sibling section or,
        failing that, into its parent section.

        When no source is found, the first row of the inner ``<table>`` is
        promoted to a ``<caption>`` if it has the fewest cells of all rows, the
        second row has more, and its first cell differs more from the second
        row's first cell than the second differs from the third (edit
        distance).

        Finally the collected ids and titles are passed to ``self.link`` and
        the tree is written to ``self.gv.nlm_file_path`` and
        ``self.gv.nlm_temp_file_path``.
        """
        self.debug.print_debug(
            self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        def caption_separator(candidate_text):
            # Return the separator of a caption-like text, or None.
            # The colon form is tried before the dot form.
            if table_regex_colon.match(candidate_text):
                return ':'
            if table_regex_dot.match(candidate_text):
                return '.'
            return None

        def is_caption_candidate(node):
            # A sibling can hold the caption only if it exists, is a <p> and
            # has no direct <graphic> child.
            if node is None or node.tag != 'p':
                return False
            return not any(sub.tag == 'graphic' for sub in node)

        for table in tables:
            caption_element = None
            source = None
            separator = None
            text = None
            used_title = False
            old_title = None

            p = table.getnext()
            pprev = table.getprevious()

            # 1. the sibling after the table
            if is_caption_candidate(p):
                text = manipulate.get_stripped_text(p)
                separator = caption_separator(text)
                if separator is not None:
                    source = p

            # 2. the sibling before the table (may be missing altogether)
            if source is None and is_caption_candidate(pprev):
                text = manipulate.get_stripped_text(pprev)
                separator = caption_separator(text)
                if separator is not None:
                    source = pprev

            # 3. only when neither sibling matched: the title of the parent
            if source is None:
                titles = table.getparent().xpath('title')

                if len(titles) > 0:
                    text = manipulate.get_stripped_text(titles[0])
                    separator = caption_separator(text)
                    if separator is not None:
                        source = titles[0]
                        used_title = True

            if source is not None:
                # likely this is a table identifier; split on the first
                # separator only so punctuation inside the caption survives
                title, _, caption = text.partition(separator)
                title = title.strip()
                caption = caption.strip()

                # strip all formatting from caption for ease of parsing
                # TODO: preserve formatting (far harder)
                new_p = etree.Element('p')
                new_p.text = caption

                if source.tag.endswith('title'):
                    # leave an empty placeholder title; it is stripped again
                    # below
                    new_title = etree.Element('title')
                    new_title.text = ''
                    old_title = new_title
                    source.addnext(new_title)

                source.getparent().remove(source)

                self.debug.print_debug(
                    self,
                    u'Handling title and caption for "{0}"'.format(title))

                # use an existing label element if one exists
                labels = table.xpath('label')
                if labels:
                    title_element = labels[0]
                else:
                    title_element = etree.Element('label')
                    table.insert(0, title_element)

                title_element.text = title

                caption_element = etree.Element('caption')
                NlmManipulate.append_safe(caption_element, new_p, self)
                table.insert(1, caption_element)

                if 'id' not in table.attrib:
                    table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

                table_titles.append(title)
                table_ids.append(table.attrib['id'])

                if used_title:
                    # if we took the title out, then we should move the parent into its previous sibling and then
                    # strip tags
                    old_title.tag = 'REMOVE'

                    etree.strip_elements(tree, 'REMOVE')

                    section = table.getparent()

                    previous = section.getprevious()

                    while previous is not None and not previous.tag.endswith(
                            'sec'):
                        previous = previous.getprevious()

                    if previous is not None:
                        previous.append(section)
                        section.tag = 'REMOVE'

                        etree.strip_tags(tree, 'REMOVE')

                        self.debug.print_debug(
                            self,
                            u'Moved table and siblings to previous section')
                    else:
                        previous = section.getparent()

                        if previous is not None and previous.tag.endswith(
                                'sec'):
                            previous.append(section)
                            section.tag = 'REMOVE'

                            etree.strip_tags(tree, 'REMOVE')

                            self.debug.print_debug(
                                self,
                                u'Moved table and siblings to parent section')

            # If none of that worked, try to find caption in table rows
            if caption_element is None:
                inner_table = table.find("table")

                # A table-wrap without a <table> child has no rows to inspect.
                if inner_table is not None:
                    table_rows = list(inner_table)
                else:
                    table_rows = []

                # Cell count and first-cell text per row; a missing cell or a
                # cell without leading text counts as the empty string so the
                # edit distance below always receives strings.
                columns_count = [len(row) for row in table_rows]
                first_column = [
                    (row[0].text or "") if len(row) > 0 else ""
                    for row in table_rows
                ]

                # Three rows are needed: the candidate plus two to compare.
                if len(table_rows) > 2:
                    # Smallest cell count of any row (a count, not a row
                    # number).
                    fewest_columns = min(columns_count)

                    # Check if first row has fewer columns than others
                    # Therefore not likely to be data or a header
                    first_row_is_narrowest = (
                        columns_count[0] == fewest_columns
                        and columns_count[1] != fewest_columns)

                    # If it has fewest columns, also check Levenshtein distance
                    # To ensure this row is unlike the others
                    if first_row_is_narrowest and editdistance.eval(
                            first_column[0],
                            first_column[1]) > editdistance.eval(
                                first_column[1], first_column[2]):

                        # OK, we have something, move it
                        caption_element = etree.Element('caption')
                        caption_element.text = first_column[0]
                        NlmManipulate.append_safe(table, caption_element, self)
                        inner_table.remove(table_rows[0])

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Ejemplo n.º 6
0
    def run_graphics_sibling(self):
        """Derive labels and captions for ``<graphic>`` elements from nearby text.

        For every ``<graphic>`` in the loaded tree a caption source is looked
        for in this order: the element following the graphic's parent (when it
        is a ``<p>``), the element preceding it (when it is a ``<p>``), and
        finally the first ``<title>`` of the closest ancestor whose tag ends in
        ``sec``.  A source qualifies when its stripped text matches one of the
        caption patterns compiled below (colon form first, then dot form).

        The text before the first separator becomes the graphic's ``<label>``
        and the remainder becomes a ``<caption><p>``.  The source element is
        removed (a consumed ``<title>`` is replaced by an empty one), the
        graphic receives an ``id`` when it has none, and the collected ids and
        titles are passed to ``self.link`` before the tree is written to
        ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
        """
        # images are hard to handle because Word/OO puts them in different places
        # for instance, the caption can come before or after;
        # <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg" position="float"
        # orientation="portrait" xlink:type="simple"/>

        self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [sibling]')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        graphics = tree.xpath('//graphic')

        graphic_titles = []
        graphic_ids = []
        graphic_regex_dot = re.compile(r'^.+?\s*\d+\..+')
        graphic_regex_colon = re.compile(r'^.+?\s*\d+\:.+')

        def caption_separator(candidate_text):
            # Return the separator of a caption-like text, or None.
            # The colon form is tried before the dot form.
            if graphic_regex_colon.match(candidate_text):
                return ':'
            if graphic_regex_dot.match(candidate_text):
                return '.'
            return None

        for graphic in graphics:
            source = None
            separator = None
            text = None

            container = graphic.getparent()
            p = container.getnext()
            pprev = container.getprevious()

            # 1. the paragraph after the graphic's container
            if p is not None and p.tag == 'p':
                text = manipulate.get_stripped_text(p)
                separator = caption_separator(text)
                if separator is not None:
                    source = p

            # 2. the paragraph before the graphic's container
            if source is None and pprev is not None and pprev.tag == 'p':
                text = manipulate.get_stripped_text(pprev)
                separator = caption_separator(text)
                if separator is not None:
                    source = pprev

            # 3. only when neither sibling matched: the title of the enclosing
            #    section
            if source is None:
                section = container

                while section is not None and not section.tag.endswith('sec'):
                    section = section.getparent()

                # Resolved once the walk has finished; an ``else`` attached to
                # the loop would run on every normal exit and always leave the
                # list empty.
                titles = section.xpath('title') if section is not None else []

                if len(titles) > 0:
                    text = manipulate.get_stripped_text(titles[0])
                    separator = caption_separator(text)
                    if separator is not None:
                        source = titles[0]

            if source is None:
                continue

            # likely this is a figure identifier; split on the first separator
            # only so punctuation inside the caption survives
            title, _, caption = text.partition(separator)
            title = title.strip()
            caption = caption.strip()

            self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

            # use an existing label element if one exists
            labels = graphic.xpath('label')
            if labels:
                title_element = labels[0]
            else:
                title_element = etree.Element('label')
                graphic.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            new_p = etree.Element('p')
            new_p.text = caption

            NlmManipulate.append_safe(caption_element, new_p, self)
            NlmManipulate.append_safe(graphic, caption_element, self)

            if source.tag.endswith('title'):
                # leave an empty title in place of the consumed one
                new_title = etree.Element('title')
                new_title.text = ''
                source.addnext(new_title)

            source.getparent().remove(source)

            if graphic.tail:
                graphic.tail = graphic.tail.replace(title + separator, '')
                graphic.tail = graphic.tail.replace(caption + separator, '')
                graphic.tail = graphic.tail.replace(caption, '')

            if 'id' not in graphic.attrib:
                graphic.attrib['id'] = u'ID{0}'.format(unicode(uuid.uuid4()))

            graphic_titles.append(title)
            graphic_ids.append(graphic.attrib['id'])

        paragraphs = tree.xpath('//p')

        self.link(graphic_ids, graphic_titles, paragraphs, 'fig')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Ejemplo n.º 7
0
    def run_tables(self):
        """Derive labels and captions for ``<table-wrap>`` elements.

        For every ``<table-wrap>`` in the loaded tree a caption source is
        looked for in this order: the next sibling, the previous sibling (each
        must be a ``<p>`` without a direct ``<graphic>`` child), and finally
        the first ``<title>`` of the table's parent.  A source qualifies when
        its stripped text matches one of the caption patterns compiled below
        (colon form first, then dot form).

        The text before the first separator becomes the table's ``<label>``
        and the remainder becomes a ``<caption><p>`` inserted at index 1.  The
        source element is removed, the table receives an ``id`` when it has
        none, and when the parent's title was consumed the parent's content is
        merged into a preceding sibling section or, failing that, into its
        parent section.  Finally the collected ids and titles are passed to
        ``self.link`` and the tree is written to ``self.gv.nlm_file_path`` and
        ``self.gv.nlm_temp_file_path``.
        """
        self.debug.print_debug(self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        def caption_separator(candidate_text):
            # Return the separator of a caption-like text, or None.
            # The colon form is tried before the dot form.
            if table_regex_colon.match(candidate_text):
                return ':'
            if table_regex_dot.match(candidate_text):
                return '.'
            return None

        def is_caption_candidate(node):
            # A sibling can hold the caption only if it exists, is a <p> and
            # has no direct <graphic> child.
            if node is None or node.tag != 'p':
                return False
            return not any(sub.tag == 'graphic' for sub in node)

        for table in tables:
            source = None
            separator = None
            text = None
            used_title = False
            old_title = None

            p = table.getnext()
            pprev = table.getprevious()

            # 1. the sibling after the table
            if is_caption_candidate(p):
                text = manipulate.get_stripped_text(p)
                separator = caption_separator(text)
                if separator is not None:
                    source = p

            # 2. the sibling before the table (may be missing altogether)
            if source is None and is_caption_candidate(pprev):
                text = manipulate.get_stripped_text(pprev)
                separator = caption_separator(text)
                if separator is not None:
                    source = pprev

            # 3. only when neither sibling matched: the title of the parent
            if source is None:
                titles = table.getparent().xpath('title')

                if len(titles) > 0:
                    text = manipulate.get_stripped_text(titles[0])
                    separator = caption_separator(text)
                    if separator is not None:
                        source = titles[0]
                        used_title = True

            if source is None:
                continue

            # likely this is a table identifier; split on the first separator
            # only so punctuation inside the caption survives
            title, _, caption = text.partition(separator)
            title = title.strip()
            caption = caption.strip()

            # strip all formatting from caption for ease of parsing
            # TODO: preserve formatting (far harder)
            new_p = etree.Element('p')
            new_p.text = caption

            if source.tag.endswith('title'):
                # leave an empty placeholder title; it is stripped again below
                new_title = etree.Element('title')
                new_title.text = ''
                old_title = new_title
                source.addnext(new_title)

            source.getparent().remove(source)

            self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

            # use an existing label element if one exists
            labels = table.xpath('label')
            if labels:
                title_element = labels[0]
            else:
                title_element = etree.Element('label')
                table.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            NlmManipulate.append_safe(caption_element, new_p, self)
            table.insert(1, caption_element)

            if 'id' not in table.attrib:
                table.attrib['id'] = u'ID{0}'.format(unicode(uuid.uuid4()))

            table_titles.append(title)
            table_ids.append(table.attrib['id'])

            if used_title:
                # if we took the title out, then we should move the parent into its previous sibling and then
                # strip tags
                old_title.tag = 'REMOVE'

                etree.strip_elements(tree, 'REMOVE')

                section = table.getparent()

                previous = section.getprevious()

                while previous is not None and not previous.tag.endswith('sec'):
                    previous = previous.getprevious()

                if previous is not None:
                    previous.append(section)
                    section.tag = 'REMOVE'

                    etree.strip_tags(tree, 'REMOVE')

                    self.debug.print_debug(self, u'Moved table and siblings to previous section')
                else:
                    previous = section.getparent()

                    if previous is not None and previous.tag.endswith('sec'):
                        previous.append(section)
                        section.tag = 'REMOVE'

                        etree.strip_tags(tree, 'REMOVE')

                        self.debug.print_debug(self, u'Moved table and siblings to parent section')

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Ejemplo n.º 8
0
    def run_tables(self):
        """Attach labels and captions to ``table-wrap`` elements and link references.

        For every ``table-wrap`` in the document loaded through ``NlmManipulate``,
        caption text is looked for in this order:

        1. the following sibling ``<p>`` (unless it contains a ``graphic``);
        2. the preceding sibling ``<p>`` (unless it contains a ``graphic``);
        3. the first ``<title>`` of the table's parent element;
        4. the first row of the inner ``<table>``, when it has fewer cells than
           the second row and its text is unlike the rows that follow.

        A candidate in cases 1-3 must look like ``<words> <number>:<text>`` or
        ``<words> <number>.<text>``. The part before the separator becomes the
        ``label``; the remainder becomes a ``<p>`` inside a new ``caption``.

        The tree is finally passed to ``self.link`` together with the collected
        ids and titles, and written to both ``self.gv.nlm_file_path`` and
        ``self.gv.nlm_temp_file_path``.
        """
        self.debug.print_debug(self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        separator = ':'

        for table in tables:
            caption_element = None
            use_next = False
            use_previous = False
            used_title = False
            old_title = None

            p = table.getnext()
            pprev = table.getprevious()

            # 1. caption in the paragraph that follows the table
            if p is not None and p.tag == 'p':
                if not any(sub.tag == 'graphic' for sub in p):
                    text = manipulate.get_stripped_text(p)

                    if table_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                    elif table_regex_dot.match(text):
                        use_next = True
                        separator = '.'

            # 2. caption in the paragraph that precedes the table; the None
            # check must come first because a table can be the first child
            if not use_next and pprev is not None and pprev.tag == 'p':
                if not any(sub.tag == 'graphic' for sub in pprev):
                    text = manipulate.get_stripped_text(pprev)

                    if table_regex_colon.match(text):
                        use_previous = True
                        separator = ':'
                    elif table_regex_dot.match(text):
                        use_previous = True
                        separator = '.'

            # 3. only when neither sibling matched: see if the title in this
            # section potentially contains text we can match
            if not (use_next or use_previous):
                parent = table.getparent()

                titles = parent.xpath('title')

                if len(titles) > 0:
                    p = titles[0]

                    text = manipulate.get_stripped_text(p)

                    if table_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                        used_title = True
                    elif table_regex_dot.match(text):
                        use_next = True
                        separator = '.'
                        used_title = True

            if use_next or use_previous:

                if use_next:
                    text = manipulate.get_stripped_text(p)
                else:
                    text = manipulate.get_stripped_text(pprev)
                    p = pprev

                # likely this is a table identifier; split on the first
                # separator only so that later colons/periods stay in the caption
                split_title = text.split(separator, 1)

                title = split_title[0].strip()
                caption = split_title[1].strip() if len(split_title) > 1 else ''

                # strip all formatting from caption for ease of parsing
                # TODO: preserve formatting (far harder)
                new_p = etree.Element('p')
                new_p.text = caption

                if p.tag.endswith('title'):
                    # leave an empty placeholder title where the consumed one
                    # was; it is stripped again further down
                    new_title = etree.Element('title')
                    new_title.text = ''
                    old_title = new_title
                    p.addnext(new_title)

                p.getparent().remove(p)

                p = new_p

                self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

                # use an existing label element if one exists
                existing_labels = table.xpath('label')

                if existing_labels:
                    title_element = existing_labels[0]
                else:
                    title_element = etree.Element('label')
                    table.insert(0, title_element)

                title_element.text = title

                caption_element = etree.Element('caption')
                NlmManipulate.append_safe(caption_element, p, self)
                table.insert(1, caption_element)

                if 'id' not in table.attrib:
                    table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

                table_titles.append(title)
                table_ids.append(table.attrib['id'])

                if used_title:
                    # if we took the title out, then we should move the parent into its previous sibling and then
                    # strip tags
                    old_title.tag = 'REMOVE'

                    etree.strip_elements(tree, 'REMOVE')

                    section = table.getparent()

                    previous = section.getprevious()

                    while previous is not None and not previous.tag.endswith('sec'):
                        previous = previous.getprevious()

                    if previous is not None:
                        previous.append(section)
                        section.tag = 'REMOVE'

                        etree.strip_tags(tree, 'REMOVE')

                        self.debug.print_debug(self, u'Moved table and siblings to previous section')
                    else:
                        previous = section.getparent()

                        if previous is not None and previous.tag.endswith('sec'):
                            previous.append(section)
                            section.tag = 'REMOVE'

                            etree.strip_tags(tree, 'REMOVE')

                            self.debug.print_debug(self, u'Moved table and siblings to parent section')

            # 4. If none of that worked, try to find caption in table rows
            if caption_element is None:
                inner_table = table.find("table")
                table_rows = list(inner_table) if inner_table is not None else []

                # three rows are needed: the candidate plus two to compare with
                if len(table_rows) > 2:
                    column_counts = [len(row) for row in table_rows]
                    # text of the first cell of each row; '' when the row has
                    # no cells or the cell has no direct text
                    first_cells = [(row[0].text or u'') if len(row) else u''
                                   for row in table_rows]

                    fewest_columns = min(column_counts)

                    # Check if first row has fewer columns than others
                    # Therefore not likely to be data or a header
                    first_row_is_narrowest = (column_counts[0] == fewest_columns and
                                              column_counts[1] != fewest_columns)

                    # never drop a row whose text could not be extracted
                    if first_row_is_narrowest and first_cells[0].strip():
                        # If it has fewest columns, also check Levenshtein distance
                        # To ensure this row is unlike the others
                        distance_to_second = editdistance.eval(first_cells[0], first_cells[1])
                        distance_between_data = editdistance.eval(first_cells[1], first_cells[2])

                        if distance_to_second > distance_between_data:
                            # OK, we have something, move it
                            caption_element = etree.Element('caption')
                            caption_element.text = first_cells[0]
                            NlmManipulate.append_safe(table, caption_element, self)
                            inner_table.remove(table_rows[0])

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Ejemplo n.º 9
0
    def run_graphics(self):
        """Attach labels and captions to ``graphic`` elements that sit inside a ``<p>``.

        Images are hard to handle because Word/OO puts them in different places;
        for instance, the caption can come before or after:

            <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg"
            position="float" orientation="portrait" xlink:type="simple"/>

        This pass handles the case where the caption text is in the same
        paragraph as the graphic. The stripped text of the parent ``<p>`` must
        look like ``<words> <number>:<text>`` or ``<words> <number>.<text>``;
        the part before the separator becomes the ``label`` and the remainder
        becomes a ``<p>`` inside a new ``caption``.

        The tree is then passed to ``self.link`` with the collected ids and
        titles, written to ``self.gv.nlm_file_path`` and
        ``self.gv.nlm_temp_file_path``, and finally
        ``self.run_graphics_sibling()`` is invoked.
        """
        self.debug.print_debug(self, u"Attempting to classify captions for graphics objects [plain]")

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        graphics = tree.xpath("//graphic")

        graphic_titles = []
        graphic_ids = []
        graphic_regex_dot = re.compile(r"^.+?\s*\d+\..+")
        graphic_regex_colon = re.compile(r"^.+?\s*\d+\:.+")

        for graphic in graphics:
            # the caption candidate is the paragraph that contains the graphic
            container = graphic.getparent()

            if container is None or container.tag != "p":
                continue

            text = manipulate.get_stripped_text(container)

            if graphic_regex_colon.match(text):
                separator = ":"
            elif graphic_regex_dot.match(text):
                separator = "."
            else:
                continue

            # likely this is a figure identifier; split on the first separator
            # only so that later colons/periods stay in the caption
            split_title = text.split(separator, 1)

            title = split_title[0].strip()
            caption = split_title[1].strip() if len(split_title) > 1 else ""

            self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

            # use an existing label element if one exists
            existing_labels = graphic.xpath("label")

            if existing_labels:
                title_element = existing_labels[0]
            else:
                title_element = etree.Element("label")
                graphic.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element("caption")
            new_p = etree.Element("p")
            new_p.text = caption

            NlmManipulate.append_safe(caption_element, new_p, self)
            NlmManipulate.append_safe(graphic, caption_element, self)

            if graphic.tail:
                # remove the text that has just been moved into label/caption
                cleaned_tail = graphic.tail.replace(title + separator, "")

                # with an empty caption, "caption + separator" would be the
                # bare separator and strip every colon/period from the tail
                if caption:
                    cleaned_tail = cleaned_tail.replace(caption + separator, "")
                    cleaned_tail = cleaned_tail.replace(caption, "")

                graphic.tail = cleaned_tail

            if "id" not in graphic.attrib:
                graphic.attrib["id"] = u"ID{0}".format(unicode(uuid.uuid4()))

            graphic_titles.append(title)
            graphic_ids.append(graphic.attrib["id"])

        paragraphs = tree.xpath("//p")

        self.link(graphic_ids, graphic_titles, paragraphs, "fig")

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)

        self.run_graphics_sibling()