Exemple #1
0
def process_pdf_internal(fname, page_num='all'):
    """
    Extract the raw character list of a PDF, backed by an on-disk pickle cache.

    Renamed from the original ``process_pdf``.

    :param fname: path of the PDF file to read
    :param page_num: zero-based index of the single page to process, or
        ``'all'`` (default) to process every page
    :return: the list filled in by ``print_layout`` for the selected page(s);
        every ``LTChar`` in it has been passed through ``adjust_element_bbox``
        with the crop box returned by ``get_pdf_page_bbox_abandon``
    :raises PDFTextExtractionNotAllowed: if the document is not extractable
    """
    tmp_path = get_tmp_path(fname)
    cache_path = "%s.chars.%s.pkl" % (tmp_path, str(page_num))

    if os.path.isfile(cache_path):
        try:
            # Pickle data is binary: open in 'rb' and close the handle
            # deterministically instead of leaking it.
            with open(cache_path, 'rb') as cache_file:
                return pickle.load(cache_file)
        except Exception:
            # Best effort: an unreadable cache is simply rebuilt below.
            print("load failed, get again")

    char_list = []
    if debug:
        print(fname)

    # Keep the PDF open only while pdfminer is reading pages from it.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        # The aggregator device collects the layout of each processed page.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            if page_num == 'all' or page_num == i:
                interpreter.process_page(page)
                print_layout(device.get_result(), char_list)

            # A single requested page has been handled; skip the rest.
            if page_num == i:
                break

    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
    for char in char_list:
        if isinstance(char, LTChar):
            adjust_element_bbox(char, crop_bbox)

    with open(cache_path, 'wb') as f:
        pickle.dump(char_list, f)
    return char_list
Exemple #2
0
def export_glyph_ratio(pdf_path):
    """
    Disabled entry point for exporting the glyph ratios of a PDF.

    The function raises unconditionally. The statements after the ``raise``
    are unreachable and are kept only as a reference for the intended
    implementation: run the glyph-adjust JAR on the temporary copy of the
    PDF. TODO: move the JAR to a common place instead of the hard-coded
    absolute path.

    :param pdf_path: path of the PDF whose glyph ratios would be exported
    :return: never returns
    :raises Exception: always, with the message ``"should not call it"``
    """
    raise Exception("should not call it")

    # ---- unreachable reference implementation ----
    import subprocess

    tmp_pdf_path = get_tmp_path(pdf_path)
    print("TODO, move the glyph JAR into common place")
    export_glyph_jar_path = "E:/pdfbox-2.0.8-src/pdfbox-2.0.8/debugger/target/pdfGlyphAdjust-jar-with-dependencies.jar"
    # Pass an argument list rather than a formatted shell string so that
    # paths containing spaces or shell metacharacters are handled correctly.
    subprocess.call(["java", "-jar", export_glyph_jar_path, tmp_pdf_path])
0
def get_glyph_ratio(pdf_path, pid):
    """
    Disabled lookup of the glyph ratios for one page of a PDF.

    The function raises unconditionally, so nothing below the ``raise`` is
    ever executed. The retained body shows the intended behaviour: make sure
    a ``<tmp>.glyphratio.<page>.txt`` file exists for every page (calling
    ``export_glyph_ratio`` otherwise), then parse the tab-separated file of
    page ``pid``.

    :param pdf_path: path of the PDF
    :param pid: page index whose glyph-ratio file would be read
    :return: (unreachable) dict mapping font name -> glyph name ->
        ``(up_ratio, lower_ratio)``
    :raises Exception: always, with the message ``"Should not call it"``
    """
    raise Exception("Should not call it")

    # ---- unreachable reference implementation ----
    import shutil
    from pdfxml.path_util import get_tmp_path
    from pdfxml.pdf_util.pdf_extract import get_page_num

    tmp_pdf_path = get_tmp_path(pdf_path)
    needs_copy = tmp_pdf_path != pdf_path and not os.path.isfile(tmp_pdf_path)
    if needs_copy:
        shutil.copy(pdf_path, tmp_pdf_path)

    ratio_path_format = "{}.glyphratio.{}.txt"

    # Export only when at least one page is still missing its ratio file.
    page_count = get_page_num(pdf_path)
    every_page_exported = all(
        os.path.isfile(ratio_path_format.format(tmp_pdf_path, page_idx))
        for page_idx in range(page_count))
    if not every_page_exported:
        export_glyph_ratio(pdf_path)

    # Each non-blank row: fontname \t glyphname \t up_ratio \t lower_ratio
    fontname2glyphname2adjust = {}
    ratio_path = ratio_path_format.format(tmp_pdf_path, pid)
    for raw_row in open(ratio_path).readlines():
        row = raw_row.strip()
        if not row:
            continue
        fields = row.split("\t")
        fontname, glyphname = fields[0], fields[1]
        ratios = (float(fields[2]), float(fields[3]))
        fontname2glyphname2adjust.setdefault(fontname, {})[glyphname] = ratios

    return fontname2glyphname2adjust
Exemple #4
0
def get_page_num(fpath):
    """ Get the number of pages of a pdf file, cached as JSON next to its tmp path.

    The count is read from the ``Count`` entry of the document catalog's
    ``Pages`` dictionary, see
    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python

    :param fpath: path of the PDF file
    :return: the page count, either loaded from ``<tmp>.page_num.json`` or
        freshly read from the document (and then written to that cache)
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        return load_general(cache_path)['page_num']

    # Close the PDF as soon as the catalog has been read.
    with open(fpath, 'rb') as fp:
        # The parser is bound to the file object; the document object
        # stores the document structure.
        document = PDFDocument(PDFParser(fp))
        pages = resolve1(document.catalog['Pages'])
        # Entries of a resolved dictionary may still be indirect references,
        # so resolve the count as well before it is serialised.
        c = resolve1(pages['Count'])

    dump_general({'page_num': c}, cache_path)
    return c
Exemple #5
0
def get_font_from_pdf(pdf_path, pid):
    """
    Font details for one page of a PDF -- currently disabled.

    The function returns ``None`` immediately, so the statements after the
    first ``return`` are never executed. They show the intended behaviour:
    export the font file next to the tmp path if it is missing (NOTE: the
    export relies on pdfbox), read it, and return the entry for page ``pid``.

    :param pdf_path: path of the PDF
    :param pid: page index used to select the entry of the font mapping
    :return: always ``None``
    """
    return None

    # ---- unreachable reference implementation ----
    font_path = "{}.font".format(get_tmp_path(pdf_path))

    font_file_missing = not os.path.isfile(font_path)
    if font_file_missing:
        export_font(pdf_path, font_path)

    return read_pdfbox_font(font_path)[pid]

    #############
    # using PDFBox to read all char information
    #############
    #def export_font(pdf_path, export_path):
    """
Exemple #6
0
def process_pdf_lines(fname, page_num='all', do_adjust=False):
    """
    Extract the characters of a PDF grouped by horizontal text line.

    The result is cached with ``dump_serialization`` under
    ``<tmp>.pdf_line.<page_num>.pkl`` and loaded from there when present.

    :param fname: file path to the PDF file
    :param page_num: zero-based page index to extract; defaults to ``'all'``
    :param do_adjust: when true, call ``adjust_basedon_glyph_ratio`` on every
        line before the crop-box adjustment
    :return: one list per ``LTTextLineHorizontal``, holding the ``LTChar`` /
        ``LTAnno`` elements collected for it
    :rtype: list(list(LTChar))
    :raises PDFTextExtractionNotAllowed: if the document is not extractable
    """
    from pdfxml.path_util import get_tmp_path
    tmp_pdf_path = get_tmp_path(fname)

    # NOTE(review): the cache name does not include do_adjust, so a result
    # cached with one setting is returned for the other as well -- confirm
    # whether that is intended.
    pdf_lines_cache = "{}.pdf_line.{}.pkl".format(tmp_pdf_path, page_num)
    if os.path.isfile(pdf_lines_cache):
        return load_serialization(pdf_lines_cache)

    line_list = []
    char_list = []  # characters collected since the last completed line

    def _collect_lines(container):
        """ Walk the layout tree and fill line_list with per-line char lists
        """
        for e in container:
            if isinstance(e, LTTextLineHorizontal):
                _collect_lines(e)
                # Snapshot the finished line, then reuse the buffer.
                line_list.append(list(char_list))
                del char_list[:]

            if isinstance(e, LTTextBoxHorizontal):
                _collect_lines(e)

            if isinstance(e, (LTChar, LTAnno)):
                char_list.append(e)

    # Keep the PDF open only while pdfminer is reading pages from it.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        # The aggregator device collects the layout of each processed page.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            if page_num == 'all' or page_num == i:
                interpreter.process_page(page)
                _collect_lines(device.get_result())

            # A single requested page has been handled; skip the rest.
            if page_num == i:
                break

    if do_adjust:
        for line in line_list:
            adjust_basedon_glyph_ratio(line, fname, page_num)

    # adjust based on crop bbox
    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
    for line in line_list:
        for char in line:
            if isinstance(char, LTChar):
                adjust_element_bbox(char, crop_bbox)

    dump_serialization(line_list, pdf_lines_cache)
    return line_list
Exemple #7
0
def pdf_extract_lines(pdf_path, pid=0, force_single=False):
    """
    Return the merged text lines of one page; each line is a list of LTChar.

    The result is cached under ``<tmp>.pdfbox_merge_line.<pid>.pkl``. On a
    cache miss the raw lines are merged with ``merging_lines``: in one pass
    for a single-column page, or separately for the left column, the right
    column and everything else when the page is detected as double column.

    :param pdf_path: path of the PDF
    :param pid: page index
    :param force_single: treat the page as single column without running the
        double-column detection
    :return: list of merged lines
    """
    cache_path = "{}.pdfbox_merge_line.{}.pkl".format(
        get_tmp_path(pdf_path), pid)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    raw_lines = pdf_extract_lines_raw(pdf_path, pid)

    # TODO, do another round of line merging
    # still use our column line detection model to find the region.
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    word_info_list = pdf_extract_words(pdf_path, pid)

    def merge_group(group, word_infos):
        return merging_lines(group, fontname2space, word_infos, pdf_path, pid)

    treat_as_single = force_single or not is_double_column(pdf_path, pid)
    if treat_as_single:
        # filter the word info first, then merge all lines in one go
        word_info_list = word_info_filter(raw_lines, word_info_list)
        merged_lines = merge_group(raw_lines, word_info_list)
    else:
        # Split the lines into three groups around the vertical centre of
        # the page: fully left, fully right, and everything else.
        page_width = get_pdf_page_size(pdf_path, pid)['width']

        spanning_lines, left_lines, right_lines = [], [], []
        from pdfxml.pdf_util.layout_util import get_char_list_bbox
        for one_line in raw_lines:
            bbox = get_char_list_bbox(one_line)
            left_edge, right_edge = bbox.left(), bbox.right()
            if left_edge < right_edge < page_width / 2:
                left_lines.append(one_line)
            elif page_width / 2 < left_edge < right_edge:
                right_lines.append(one_line)
            else:
                spanning_lines.append(one_line)

        # the word info is filtered against all raw lines before merging
        word_info_list = word_info_filter(raw_lines, word_info_list)

        # merged output order: outside the columns, left column, right column
        merged_lines = []
        for group in (spanning_lines, left_lines, right_lines):
            merged_lines.extend(merge_group(group, word_info_list))

    dump_serialization(merged_lines, cache_path)
    return merged_lines
Exemple #8
0
def extract_me(pdf_path):
    """
    Run the whole ME extraction pipeline for one PDF.

    Steps, each announced through ``duration_recorder.begin_timer``:
    copy the PDF to its tmp path if missing, stop early when
    ``extraction_done`` reports the work as finished, export the exact
    positions, run the column-line-word (CLW) stage for every page (one
    process per page when ``ext_settings.ME_ANALYSIS_PARALLEL`` is set,
    otherwise serially with the implementation chosen by
    ``ext_settings.CLW_VERSION``), then run ``extract_ime`` and
    ``extract_eme``.

    :param pdf_path: path of the PDF to process
    :return: None
    :raises Exception: ``"unknown version"`` for an unrecognised CLW version
        in serial mode
    """
    # The value is not used below; the call is kept as in the original.
    pdf_name = get_file_name_prefix(pdf_path)

    # TODO, place it outside
    duration_recorder.begin_timer("Begin ME Extraction")

    # make sure the working copy of the PDF exists
    tmp_pdf_path = get_tmp_path(pdf_path)
    working_copy_exists = os.path.isfile(tmp_pdf_path)
    if not working_copy_exists:
        shutil.copy(pdf_path, tmp_pdf_path)

    if extraction_done(pdf_path):
        print("ME extraction done for {}".format(pdf_path))
        return

    # number of pages drives the per-page CLW stage
    pn = get_page_num(pdf_path)

    if ext_settings.debug:
        # the font is not useful in later stage, nothing to do here
        pass

    duration_recorder.begin_timer("Column-Line-Word")
    # This part should not be parallelized, only execute once
    export_exact_position(pdf_path)

    if not ext_settings.ME_ANALYSIS_PARALLEL:
        print("serialized CLW")
        clw_version = ext_settings.CLW_VERSION
        if clw_version == CLW_OLD:
            from pdfxml.pdf_util.ppc_line_reunion import ppc_line_reunion as handle_page
        elif clw_version == CLW_FEB:
            from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines as handle_page
        else:
            raise Exception("unknown version")
        for pid in range(pn):
            handle_page(pdf_path, pid)
    else:
        print("parallized CLW threading")
        from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines
        from multiprocessing import Process

        # one worker process per page, started as soon as it is created
        workers = []
        for pid in range(pn):
            worker = Process(target=clw_pdf_lines, args=(pdf_path, pid))
            worker.start()
            workers.append(worker)

        # wait for every page to finish before moving on
        for worker in workers:
            worker.join()

    duration_recorder.begin_timer("IME Extraction")
    extract_ime(pdf_path)
    duration_recorder.begin_timer("EME Extraction")
    extract_eme(pdf_path)
    duration_recorder.begin_timer("ME extraction finished")
0
def _scan_line_for_math(line, font, math_words, vocabulary, lemmatizer):
    """
    Scan one layout line for math symbols/words and for plain words.

    Words are delimited by elements for which ``is_space_char`` is true; only
    the text before such an element is checked, so text after the last space
    element of the line is not looked up.

    :param line: sequence of layout elements of one line
    :param font: font information handed to ``check_is_math_LTChar``
    :param math_words: collection of words counted as math
    :param vocabulary: set of known plain words
    :param lemmatizer: object with ``lemmatize(word, pos)``
    :return: tuple ``(with_math_symbol_or_word, with_non_math_word)``
    """
    trailing_punctuation = (',', '.', 'period', 'comma')
    with_math_symbol_or_word = False
    with_non_math_word = False
    beg_idx = 0

    for i, char in enumerate(line):
        if isinstance(char, LTChar) and check_is_math_LTChar(char, font):
            me_extraction_logger.debug("Char {} as Math".format(char))
            with_math_symbol_or_word = True

        if not is_space_char(char):
            continue

        # Assemble the word that ends right before this space element.
        pieces = []
        for j in range(beg_idx, i):
            tmp_text = line[j].get_text()
            # drop punctuation directly in front of the space
            if j == i - 1 and tmp_text in trailing_punctuation:
                continue
            # for word checking, only single characters are kept; anything
            # else is replaced by a blank
            if len(tmp_text) != 1:
                tmp_text = " "
            pieces.append(tmp_text)
        beg_idx = i + 1
        word = "".join(pieces).lower().strip()

        # Lemmatise as noun and verb; on failure the verb form falls back to
        # the word itself if the noun lookup already succeeded.
        s_word, v_word = "", ""
        try:
            s_word = lemmatizer.lemmatize(word, 'n')
            v_word = word
            v_word = lemmatizer.lemmatize(word, 'v')
        except Exception:
            me_extraction_error_logger.error(
                "Error checking the word as noun or verb")

        if word in math_words:
            me_extraction_logger.debug("Math Word {}".format(word))
            with_math_symbol_or_word = True
        elif len(word) > 2 and (word in vocabulary or s_word in vocabulary
                                or v_word in vocabulary):
            me_extraction_logger.debug("Plain Word {}".format(word))
            with_non_math_word = True

    return with_math_symbol_or_word, with_non_math_word


def assess_ime(pdf_path, pid=0, xml_out_path=None, ignore_exist=False):
    """
    # IME [3]

    Label every line of one page: a line is labelled when it contains a math
    symbol or math word and no non-math word.

    :param pdf_path: path of the PDF
    :param pid: page index
    :param xml_out_path: when given, the labelled lines are exported there
        with ``export_xml``; when empty, the labelled lines are printed
    :param ignore_exist: when false, an already existing ``xml_out_path``
        makes the function return immediately
    :return: dict with the timings ``resource_time``, ``layout_time``,
        ``core_time`` and ``io_time``; an empty dict when the existing output
        file was kept
    """
    if xml_out_path and os.path.isfile(xml_out_path) and (not ignore_exist):
        return {}

    ret_info_dict = {}

    from pdfxml.me_extraction.me_consts import math_words
    t = time.time()
    # common resource loader
    wl = set(words.words())
    wl.update(additional_words)
    wnl = WordNetLemmatizer()
    ret_info_dict['resource_time'] = time.time() - t

    t = time.time()
    # layout analysis
    font = get_font_from_pdf(pdf_path, pid)
    # File name without directory and extension; unlike slicing at the last
    # "/" this also works for paths that contain no "/".
    prefix = os.path.splitext(os.path.basename(pdf_path))[0]
    lines = internal_get_llines(prefix, pdf_path, pid)
    ret_info_dict['layout_time'] = time.time() - t

    # IME assessment core
    t = time.time()
    line_labels = [0] * len(lines)
    for li, line in enumerate(lines):
        with_math_symbol_or_word, with_non_math_word = _scan_line_for_math(
            line, font, math_words, wl, wnl)

        # debug for line, with ME or not
        me_extraction_logger.debug(char_list2str(line, ', '))
        me_extraction_logger.debug("with math {}, with word {}".format(
            with_math_symbol_or_word, with_non_math_word))
        if with_math_symbol_or_word and (not with_non_math_word):
            me_extraction_logger.debug("MATHLINE")
            line_labels[li] = 1
    ret_info_dict['core_time'] = time.time() - t

    labelled_lines = [
        line for li, line in enumerate(lines) if line_labels[li]
    ]

    if not xml_out_path:
        for line in labelled_lines:
            tmp_str = ''.join([
                char.get_text() for char in line
                if isinstance(char, LTChar)
            ])
            print(tmp_str.encode("utf-8"))

    # export for evaluation; each labelled line is stored as-is
    page_info = {
        'pid': pid,
        'ilist': labelled_lines,
        'elist': [],
    }

    t = time.time()
    if xml_out_path:
        export_xml(page_info, xml_out_path, pdf_path, pid)
    ret_info_dict['io_time'] = time.time() - t
    return ret_info_dict