Esempio n. 1
0
def put_it_together(cur_image, caption, context, extracted_image_data, line_index, \
                    lines):
    """
    Takes the current image(s) and caption(s) and assembles them into
    something useful in the extracted_image_data list.

    @param: cur_image (string || list): the image currently being dealt with, or
        the list of images, in the case of subimages
    @param: caption (string || list): the caption or captions currently in scope
    @param: extracted_image_data ([(string, string), (string, string), ...]):
        a list of tuples of images matched to captions from this document.
    @param: line_index (int): the index where we are in the lines (for
        searchback and searchforward purposes)
    @param: lines ([string, string, ...]): the lines in the TeX

    @return: (cur_image, caption, extracted_image_data): the same arguments it
        was sent, processed appropriately
    """

    if type(cur_image) == list:
        if cur_image[MAIN_CAPTION_OR_IMAGE] == 'ERROR':
            cur_image[MAIN_CAPTION_OR_IMAGE] = ''
        for image in cur_image[SUB_CAPTION_OR_IMAGE]:
            if image == 'ERROR':
                cur_image[SUB_CAPTION_OR_IMAGE].remove(image)

    if cur_image != '' and caption != '':

        if type(cur_image) == list and type(caption) == list:

            if cur_image[MAIN_CAPTION_OR_IMAGE] != '' and\
                    caption[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE],
                     caption[MAIN_CAPTION_OR_IMAGE],
                     context))
            if type(cur_image[MAIN_CAPTION_OR_IMAGE]) == list:
                # why is the main image a list?
                # it's a good idea to attach the main caption to other
                # things, but the main image can only be used once
                cur_image[MAIN_CAPTION_OR_IMAGE] = ''

            if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                    for index in \
                            range(len(cur_image[SUB_CAPTION_OR_IMAGE])):
                        if index < len(caption[SUB_CAPTION_OR_IMAGE]):
                            long_caption = \
                                caption[MAIN_CAPTION_OR_IMAGE] + ' : ' + \
                                caption[SUB_CAPTION_OR_IMAGE][index]
                        else:
                            long_caption = \
                                caption[MAIN_CAPTION_OR_IMAGE] + ' : ' + \
                                'Caption not extracted'
                        extracted_image_data.append(
                            (cur_image[SUB_CAPTION_OR_IMAGE][index],
                             long_caption, context))

                else:
                    long_caption = caption[MAIN_CAPTION_OR_IMAGE] + \
                        ' : ' + caption[SUB_CAPTION_OR_IMAGE]
                    for sub_image in cur_image[SUB_CAPTION_OR_IMAGE]:
                        extracted_image_data.append(
                            (sub_image, long_caption, context))

            else:
                if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                    long_caption = caption[MAIN_CAPTION_OR_IMAGE]
                    for sub_cap in caption[SUB_CAPTION_OR_IMAGE]:
                        long_caption = long_caption + ' : ' + sub_cap
                    extracted_image_data.append(
                       (cur_image[SUB_CAPTION_OR_IMAGE], long_caption, context))
                else:
                    #wtf are they lists for?
                    extracted_image_data.append(
                        (cur_image[SUB_CAPTION_OR_IMAGE],
                         caption[SUB_CAPTION_OR_IMAGE], context))

        elif type(cur_image) == list:
            if cur_image[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], caption, context))
            if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                for image in cur_image[SUB_CAPTION_OR_IMAGE]:
                    extracted_image_data.append((image, caption, context))
            else:
                extracted_image_data.append(
                    (cur_image[SUB_CAPTION_OR_IMAGE], caption, context))

        elif type(caption) == list:
            if caption[MAIN_CAPTION_OR_IMAGE] != '':
                extracted_image_data.append(
                    (cur_image, caption[MAIN_CAPTION_OR_IMAGE], context))
            if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                # multiple caps for one image:
                long_caption = caption[MAIN_CAPTION_OR_IMAGE]
                for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                    if long_caption != '':
                        long_caption += ' : '
                    long_caption += subcap
                extracted_image_data.append((cur_image, long_caption, context))
            else:
                extracted_image_data.append(
                    (cur_image, caption[SUB_CAPTION_OR_IMAGE]. context))

        else:
            extracted_image_data.append((cur_image, caption, context))

    elif cur_image != '' and caption == '':
        # we may have missed the caption somewhere.
        REASONABLE_SEARCHBACK = 25
        REASONABLE_SEARCHFORWARD = 5
        curly_no_tag_preceding = '(?<!\\w){'

        for searchback in range(REASONABLE_SEARCHBACK):
            if line_index - searchback < 0:
                continue

            back_line = lines[line_index - searchback]
            m = re.search(curly_no_tag_preceding, back_line)
            if m != None:
                open_curly = m.start()
                open_curly, open_curly_line, close_curly, \
                close_curly_line = find_open_and_close_braces(\
                line_index - searchback, open_curly, '{', lines)

                cap_begin = open_curly + 1

                caption = assemble_caption(open_curly_line, cap_begin, \
                    close_curly_line, close_curly, lines)

                if type(cur_image) == list:
                    extracted_image_data.append(
                            (cur_image[MAIN_CAPTION_OR_IMAGE], caption, context))
                    for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                        extracted_image_data.append((sub_img, caption, context))
                else:
                    extracted_image_data.append((cur_image, caption, context))
                    break

        if caption == '':
            for searchforward in range(REASONABLE_SEARCHFORWARD):
                if line_index + searchforward >= len(lines):
                    break

                fwd_line = lines[line_index + searchforward]
                m = re.search(curly_no_tag_preceding, fwd_line)

                if m != None:
                    open_curly = m.start()
                    open_curly, open_curly_line, close_curly, \
                    close_curly_line = find_open_and_close_braces(\
                    line_index + searchforward, open_curly, '{', lines)

                    cap_begin = open_curly + 1

                    caption = assemble_caption(open_curly_line, \
                              cap_begin, close_curly_line, close_curly, lines)

                    if type(cur_image) == list:
                        extracted_image_data.append(
                            (cur_image[MAIN_CAPTION_OR_IMAGE], caption, context))
                        for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                            extracted_image_data.append((sub_img, caption, context))
                    else:
                        extracted_image_data.append((cur_image, caption, context))
                    break

        if caption == '':
            if type(cur_image) == list:
                extracted_image_data.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], 'No caption found', context))
                for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                    extracted_image_data.append((sub_img, 'No caption', context))
            else:
                extracted_image_data.append(
	               (cur_image, 'No caption found', context))


    elif caption != '' and cur_image == '':
        if type(caption) == list:
            long_caption = caption[MAIN_CAPTION_OR_IMAGE]
            for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                long_caption = long_caption + ': ' + subcap
        else:
            long_caption = caption
        extracted_image_data.append(('', 'noimg' + long_caption, context))


    # if we're leaving the figure, no sense keeping the data
    cur_image = ''
    caption = ''

    return (cur_image, caption, extracted_image_data)
Esempio n. 2
0
def extract_captions(tex_file, sdir, image_list, primary=True):
    """
    Take the TeX file and the list of images in the tarball (which all,
    presumably, are used in the TeX file) and figure out which captions
    in the text are associated with which images
    @param: lines (list): list of lines of the TeX file

    @param: tex_file (string): the name of the TeX file which mentions
        the images
    @param: sdir (string): path to current sub-directory
    @param: image_list (list): list of images in tarball
    @param: primary (bool): is this the primary call to extract_caption?

    @return: images_and_captions_and_labels ([(string, string, list),
        (string, string, list), ...]):
        a list of tuples representing the names of images and their
        corresponding figure labels from the TeX file
    """
    if os.path.isdir(tex_file) or not os.path.exists(tex_file):
        return []
    fd = open(tex_file)
    lines = fd.readlines()
    fd.close()

    # possible figure lead-ins
    figure_head = '\\begin{figure'  # also matches figure*
    figure_tail = '\\end{figure'  # also matches figure*
    picture_head = '\\begin{picture}'
    displaymath_head = '\\begin{displaymath}'
    subfloat_head = '\\subfloat'
    subfig_head = '\\subfigure'
    includegraphics_head = '\\includegraphics'

    epsfig_head = '\\epsfig'
    input_head = '\\input'
    # possible caption lead-ins
    caption_head = '\\caption'
    figcaption_head = '\\figcaption'

    label_head = '\\label'

    rotate = 'rotate='
    angle = 'angle='

    eps_tail = '.eps'
    ps_tail = '.ps'

    doc_head = '\\begin{document}'
    doc_tail = '\\end{document}'

    extracted_image_data = []
    cur_image = ''
    caption = ''
    labels = []
    active_label = ""

    # cut out shit before the doc head
    if primary:
        for line_index in range(len(lines)):
            if lines[line_index].find(doc_head) < 0:
                lines[line_index] = ''
            else:
                break

    # are we using commas in filenames here?
    commas_okay = False
    for dummy1, dummy2, filenames in \
            os.walk(os.path.split(os.path.split(tex_file)[0])[0]):
        for filename in filenames:
            if filename.find(',') > -1:
                commas_okay = True
                break

    # a comment is a % not preceded by a \
    comment = re.compile("(?<!\\\\)%")

    for line_index in range(len(lines)):
        # get rid of pesky comments by splitting where the comment is
        # and keeping only the part before the %
        line = comment.split(lines[line_index])[0]
        line = line.strip()
        lines[line_index] = line

    in_figure_tag = 0

    for line_index in range(len(lines)):
        line = lines[line_index]

        if line == '':
            continue
        if line.find(doc_tail) > -1:
            return extracted_image_data

        """
        FIGURE -
        structure of a figure:
        \begin{figure}
        \formatting...
        \includegraphics[someoptions]{FILENAME}
        \caption{CAPTION}  %caption and includegraphics may be switched!
        \end{figure}
        """

        index = line.find(figure_head)
        if index > -1:
            in_figure_tag = 1
            # some punks don't like to put things in the figure tag.  so we
            # just want to see if there is anything that is sitting outside
            # of it when we find it
            cur_image, caption, extracted_image_data = \
                    put_it_together(cur_image, caption, active_label, extracted_image_data, \
                                    line_index, lines)

        # here, you jerks, just make it so that it's fecking impossible to
        # figure out your damn inclusion types

        index = max([line.find(eps_tail), line.find(ps_tail), \
                     line.find(epsfig_head)])
        if index > -1:
            if line.find(eps_tail) > -1 or line.find(ps_tail) > -1:
                ext = True
            else:
                ext = False
            filenames = intelligently_find_filenames(line, ext=ext,
                                                     commas_okay=commas_okay)

            # try to look ahead!  sometimes there are better matches after
            if line_index < len(lines) - 1:
                filenames.extend(\
                          intelligently_find_filenames(lines[line_index + 1],
                                                      commas_okay=commas_okay))
            if line_index < len(lines) - 2:
                filenames.extend(\
                          intelligently_find_filenames(lines[line_index + 2],
                                                      commas_okay=commas_okay))

            for filename in filenames:
                filename = str(filename)
                if cur_image == '':
                    cur_image = filename
                elif type(cur_image) == list:
                    if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                        cur_image[SUB_CAPTION_OR_IMAGE].append(filename)
                    else:
                        cur_image[SUB_CAPTION_OR_IMAGE] = [filename]
                else:
                    cur_image = ['', [cur_image, filename]]

        """
        Rotate and angle
        """
        index = max(line.find(rotate), line.find(angle))
        if index > -1:
            # which is the image associated to it?
            filenames = intelligently_find_filenames(line,
                                                     commas_okay=commas_okay)
            # try the line after and the line before
            if line_index + 1 < len(lines):
                filenames.extend(intelligently_find_filenames(lines[line_index + 1],
                                                      commas_okay=commas_okay))
            if line_index > 1:
                filenames.extend(intelligently_find_filenames(lines[line_index - 1],
                                                      commas_okay=commas_okay))

            already_tried = []
            for filename in filenames:
                if filename != 'ERROR' and not filename in already_tried:
                    if rotate_image(filename, line, sdir, image_list):
                        break
                    already_tried.append(filename)

        """
        INCLUDEGRAPHICS -
        structure of includegraphics:
        \includegraphics[someoptions]{FILENAME}
        """
        index = line.find(includegraphics_head)
        if index > -1:
            open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(line_index, index, '{', lines)

            filename = lines[open_curly_line][open_curly + 1:close_curly]

            if cur_image == '':
                cur_image = filename
            elif type(cur_image) == list:
                if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                    cur_image[SUB_CAPTION_OR_IMAGE].append(filename)
                else:
                    cur_image[SUB_CAPTION_OR_IMAGE] = [filename]
            else:
                cur_image = ['', [cur_image, filename]]

        """
        {\input{FILENAME}}
        \caption{CAPTION}

        This input is ambiguous, since input is also used for things like
        inclusion of data from other LaTeX files directly.
        """
        index = line.find(input_head)
        if index > -1:
            new_tex_names = intelligently_find_filenames(line, TeX=True, \
                                                         commas_okay=commas_okay)

            for new_tex_name in new_tex_names:
                if new_tex_name != 'ERROR':
                    new_tex_file = get_tex_location(new_tex_name, tex_file)
                    if new_tex_file != None and primary: #to kill recursion
                        extracted_image_data.extend(extract_captions(\
                                                      new_tex_file, sdir, \
                                                      image_list,
                                                      primary=False))

        """PICTURE"""

        index = line.find(picture_head)
        if index > -1:
            # structure of a picture:
            # \begin{picture}
            # ....not worrying about this now
            #write_message('found picture tag')
            #FIXME
            pass



        """DISPLAYMATH"""

        index = line.find(displaymath_head)
        if index > -1:
            # structure of a displaymath:
            # \begin{displaymath}
            # ....not worrying about this now
            #write_message('found displaymath tag')
            #FIXME
            pass

        """
        CAPTIONS -
        structure of a caption:
        \caption[someoptions]{CAPTION}
        or
        \caption{CAPTION}
        or
        \caption{{options}{CAPTION}}
        """

        index = max([line.find(caption_head), line.find(figcaption_head)])
        if index > -1:
            open_curly, open_curly_line, close_curly, close_curly_line = \
                    find_open_and_close_braces(line_index, index, '{', lines)

            cap_begin = open_curly + 1

            cur_caption = assemble_caption(open_curly_line, cap_begin, \
                        close_curly_line, close_curly, lines)

            if caption == '':
                caption = cur_caption
            elif type(caption) == list:
                if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                    caption[SUB_CAPTION_OR_IMAGE].append(cur_caption)
                else:
                    caption[SUB_CAPTION_OR_IMAGE] = [cur_caption]
            elif caption != cur_caption:
                caption = ['', [caption, cur_caption]]

        """
        SUBFLOATS -
        structure of a subfloat (inside of a figure tag):
        \subfloat[CAPTION]{options{FILENAME}}

        also associated with the overall caption of the enclosing figure
        """

        index = line.find(subfloat_head)
        if index > -1:
            # if we are dealing with subfloats, we need a different
            # sort of structure to keep track of captions and subcaptions
            if type(cur_image) != list:
                cur_image = [cur_image, []]
            if type(caption) != list:
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                    find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, \
                    cap_begin, close_square_line, close_square, lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(close_square_line, \
                    close_square, '{', lines)
            sub_image = lines[open_curly_line][open_curly + 1:close_curly]

            cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        """
        SUBFIGURES -
        structure of a subfigure (inside a figure tag):
        \subfigure[CAPTION]{
        \includegraphics[options]{FILENAME}}

        also associated with the overall caption of the enclosing figure
        """

        index = line.find(subfig_head)
        if index > -1:
            # like with subfloats, we need a different structure for keepin
            # track of this stuff
            if type(cur_image) != list:
                cur_image = [cur_image, []]
            if type(caption) != list:
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                    find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, \
                    cap_begin, close_square_line, close_square, lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            index_cpy = index

            # find the graphics tag to get the filename
            # it is okay if we eat lines here
            index = line.find(includegraphics_head)
            while index == -1 and (line_index + 1) < len(lines):
                line_index = line_index + 1
                line = lines[line_index]
                index = line.find(includegraphics_head)
            if line_index == len(lines):
                # didn't find the image name on line
                line_index = index_cpy

            open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(line_index, \
                    index, '{', lines)
            sub_image = lines[open_curly_line][open_curly + 1:close_curly]

            cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        """
        LABELS -
        structure of a label:
        \label{somelabelnamewhichprobablyincludesacolon}

        Labels are used to tag images and will later be used in ref tags
        to reference them.  This is interesting because in effect the refs
        to a plot are additional caption for it.

        Notes: labels can be used for many more things than just plots.
        We'll have to experiment with how to best associate a label with an
        image.. if it's in the caption, it's easy.  If it's in a figure, it's
        still okay... but the images that aren't in figure tags are numerous.
        """
        index = line.find(label_head)
        if index > -1 and in_figure_tag:
            open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(line_index, \
                    index, '{', lines)
            label = lines[open_curly_line][open_curly + 1:close_curly]
            if label not in labels:
                active_label = label
            labels.append(label)

        """
        FIGURE

        important: we put the check for the end of the figure at the end
        of the loop in case some pathological person puts everything in one
        line
        """

        index = max([line.find(figure_tail), line.find(doc_tail)])
        if index > -1:
            in_figure_tag = 0

            cur_image, caption, extracted_image_data = \
                    put_it_together(cur_image, caption, active_label, extracted_image_data, \
                                    line_index, lines)

        """
        END DOCUMENT

        we shouldn't look at anything after the end document tag is found
        """

        index = line.find(doc_tail)
        if index > -1:
            break

    return extracted_image_data