Ejemplo n.º 1
0
def save_report_pages(docid, report_num=1):
    """Render every page of a report PDF to its own image file.

    The local PDF for ``docid`` / ``report_num`` is rasterised with
    ``convert_from_path``. If that raises ``PDFPageCountError`` (presumably
    because the local copy is missing or unreadable - confirm), the source
    file is located with ``find_file`` and downloaded again; a ``.tif``
    source is converted to PDF with ``img2pdf`` first. The PDF is then
    rasterised a second time.

    Args:
        docid: Identifier of the report, forwarded to the ``paths`` helpers.
        report_num: File number of the report, passed as ``file_num``.
    """
    report_path = paths.get_report_name(docid,
                                        local_path=True,
                                        file_extension='.pdf',
                                        file_num=report_num)
    try:
        images = convert_from_path(report_path)
    except exceptions.PDFPageCountError:
        fname = textractor.textloading.find_file(docid)
        # Everything before the first 'cr' in the report name is treated as
        # the folder that must exist before downloading.
        rep_folder = paths.get_report_name(
            docid, local_path=True, file_num=report_num).split('cr')[0]
        if rep_folder:
            # makedirs also creates missing parents and tolerates the folder
            # appearing between the check and the creation.
            os.makedirs(rep_folder, exist_ok=True)

        if '.tif' in fname:
            # Swap only the extension. The previous regex '.pdf' had an
            # unescaped dot and rewrote every "<any char>pdf" in the path.
            report_in = os.path.splitext(report_path)[0] + '.tif'
            textloading.download_report(fname, report_in)
            # Convert before opening the target so a failed conversion does
            # not leave a truncated PDF behind; both handles are closed.
            with open(report_in, "rb") as tif_file:
                pdf_bytes = img2pdf.convert(tif_file)
            with open(report_path, "wb") as pdf_file:
                pdf_file.write(pdf_bytes)
        else:
            textloading.download_report(fname, report_path)
        images = convert_from_path(report_path)

    # Page files are numbered from 1.
    for page_num, image in enumerate(images, start=1):
        image.save(paths.get_report_page_path(docid, page_num))
Ejemplo n.º 2
0
def report2json(report, test=False):
    """Serialise ``report`` with jsonpickle and write it to its ``.json`` file.

    The output path comes from ``paths.get_report_name`` using the report's
    ``docid`` and ``filenum``; ``local_path`` is ``'test'`` when ``test`` is
    truthy and ``True`` otherwise.

    Args:
        report: Object exposing ``docid`` and ``filenum``.
        test: Selects the ``'test'`` location instead of the regular one.
    """
    local = 'test' if test else True
    json_path = paths.get_report_name(report.docid,
                                      local_path=local,
                                      file_extension='.json',
                                      file_num=report.filenum)
    with open(json_path, "w") as out:
        # The jsonpickle output is itself passed through json.dump, so a
        # reader has to json.load first and jsonpickle.decode afterwards.
        json.dump(jsonpickle.encode(report), out)
Ejemplo n.º 3
0
def save_report_sections(report):
    """Write the report's sections to a ``_sections.docx`` document.

    Each entry of ``report.section_content`` becomes a level-1 heading
    (``entry['Heading']``) followed by one paragraph holding every line of
    ``entry['Content']``, then a page break. Nothing is written when
    ``report.docinfo`` has no keys.

    Args:
        report: Object exposing ``docinfo``, ``section_content``, ``docid``
            and ``filenum``.
    """
    if not report.docinfo.keys():
        return

    document = docx.Document()
    for entry in report.section_content:
        document.add_heading(entry['Heading'], 1)
        paragraph = document.add_paragraph()
        for text_line in entry['Content']:
            # One run per content line, each terminated by a newline.
            paragraph.add_run(text_line + '\n')
        document.add_page_break()

    target = paths.get_report_name(report.docid,
                                   local_path=True,
                                   file_extension='_sections.docx',
                                   file_num=report.filenum)
    document.save(target)
Ejemplo n.º 4
0
def display_doc(docid):
    """Draw a green box around every detected line and save a boxed PDF.

    Made for the restructpageinfo format: the JSON file returned by
    ``paths.get_restructpageinfo_file`` is iterated as
    ``{page_number: [line, ...]}``, where each line carries a
    ``'BoundingBox'`` whose ``Left``/``Top``/``Width``/``Height`` values are
    multiplied by the page image size.

    The result is written to ``paths.result_path + docid + '_boxed.pdf'``.
    Nothing is written when the JSON file lists no pages.

    Args:
        docid: Identifier of the report, forwarded to the ``paths`` helpers.
    """
    report_path = paths.get_report_name(docid,
                                        local_path=True,
                                        file_extension=True)
    images = convert_from_path(report_path)

    # Close the page-info file as soon as it has been parsed.
    with open(paths.get_restructpageinfo_file(docid), "r") as docfile:
        doc = json.load(docfile)

    drawn_images = []
    for page_num, lines in doc.items():
        # Page keys are 1-based; the rendered image list is 0-based.
        image = images[int(page_num) - 1]
        width, height = image.size
        draw = ImageDraw.Draw(image)
        for line in lines:
            box = line['BoundingBox']
            left = width * box['Left']
            top = height * box['Top']
            draw.rectangle([
                left, top, left + (width * box['Width']), top +
                (height * box['Height'])
            ],
                           outline='green')
        drawn_images.append(image)

    if not drawn_images:
        # No pages to save; indexing drawn_images[0] would raise IndexError.
        return

    save_path = paths.result_path + docid + '_boxed.pdf'
    # Create the parent folder only. The previous code called makedirs on
    # save_path itself, creating a directory where the PDF had to be written.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    drawn_images[0].save(save_path,
                         save_all=True,
                         append_images=drawn_images[1:])
Ejemplo n.º 5
0
def _add_heading_bookmarks(output, headings):
    """Add one bookmark per heading row; level 2 rows nest under the last level 1."""
    section = None
    for _, row in headings.iterrows():
        level = row['Heading']
        if level == 1:
            section = output.addBookmark(row['Text'],
                                         row['PageNum'] - 1,
                                         fit='/FitB')
        elif level == 2:
            if section:
                output.addBookmark(row['Text'],
                                   row['PageNum'] - 1,
                                   parent=section,
                                   fit='/FitB')
            else:
                # No section seen yet: add the subheading at top level.
                output.addBookmark(row['Text'],
                                   row['PageNum'] - 1,
                                   fit='/FitB')


def _add_toc_links(output, report):
    """Link each matched table-of-contents line to the page of its in-text heading."""
    # The first page's media box gives the size used to scale the relative
    # line coordinates.
    refpg = output.getPage(0).mediaBox
    width, height = float(refpg[2]), float(refpg[3])

    toc_headings = pd.concat([report.headings, report.subheadings])
    lines = report.line_dataset
    for _, row in report.headings_intext.iterrows():
        if row.MatchesHeading == 0:
            continue
        toc_h = toc_headings.loc[int(row.MatchesI)]
        toc_bb = lines.loc[(lines.PageNum == report.toc_page)
                           & (lines.LineNum == toc_h.LineNum)].iloc[0]
        left = width * toc_bb['Left']
        # Vertical axis is flipped: presumably 'Top' is a fraction measured
        # from the top of the page while PDF coordinates start at the bottom
        # - confirm against the line_dataset producer.
        top = height * (1 - toc_bb['Top'])
        rectangle = [
            left, top, left + (width * toc_bb['Width']),
            top - (height * toc_bb['Height'])
        ]
        # Creates a link from the toc heading to its section page.
        output.addLink(report.toc_page - 1,
                       row.PageNum - 1,
                       rect=rectangle,
                       fit='/FitB')


def bookmark_report(report, test=False):
    """Write a copy of the report PDF with bookmarks and table-of-contents links.

    Every page of the source PDF is copied. Bookmarks are added for the title
    page, for the table of contents (when ``report.toc_page`` is set) and for
    each row of ``report.headings_intext`` with ``Heading`` 1 or 2. When a
    table of contents exists, its matched lines are linked to their sections.
    The output path comes from ``paths.get_bookmarked_file`` and is printed.

    Nothing is done when ``report.docinfo`` has no keys.

    Args:
        report: Object exposing ``docinfo``, ``docid``, ``filenum``,
            ``toc_page``, ``headings_intext``, ``headings``, ``subheadings``
            and ``line_dataset``.
        test: When truthy, the ``_boxed.pdf`` variant is used as the source
            and ``test`` is forwarded to ``paths.get_bookmarked_file``.
    """
    if len(report.docinfo.keys()) == 0:
        return

    source_extension = '_boxed.pdf' if test else '.pdf'
    report_file = paths.get_report_name(report.docid,
                                        local_path=True,
                                        file_extension=source_extension,
                                        file_num=report.filenum)
    output = PdfFileWriter()

    # The source stream stays open until the output has been written,
    # because the copied pages are still read from it at write time.
    with open(report_file, 'rb') as source:
        reader = PdfFileReader(source)
        for page in reader.pages:
            output.addPage(page)

        output.addBookmark('Title Page', 0, fit='/FitB')
        if report.toc_page:
            output.addBookmark('Table of Contents',
                               report.toc_page - 1,
                               fit='/FitB')
        _add_heading_bookmarks(output, report.headings_intext)

        if report.toc_page:
            _add_toc_links(output, report)

        outfile = paths.get_bookmarked_file(report.docid,
                                            test=test,
                                            filenum=report.filenum)
        print(outfile)
        # dirname is empty for a bare file name; rsplit('/') returned the
        # file name itself in that case and a directory was created there.
        out_dir = os.path.dirname(outfile)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # Closing the handle guarantees the PDF is flushed to disk.
        with open(outfile, 'wb') as target:
            output.write(target)
Ejemplo n.º 6
0
def _is_nonempty(table):
    """Return True when ``table`` is not None and has at least one entry."""
    if table is None:
        return False
    try:
        # len() works for DataFrames, whose plain truth value raises.
        return len(table) > 0
    except TypeError:
        # Unsized flag values such as False or 0.
        return bool(table)


def _draw_box(draw, box, width, height, outline):
    """Outline ``box`` (relative Left/Top/Width/Height) scaled to the page size."""
    left = width * box['Left']
    top = height * box['Top']
    draw.rectangle([
        left, top, left + (width * box['Width']), top +
        (height * box['Height'])
    ],
                   outline=outline)


def _solid_overlay(image, colour):
    """Return a copy of ``image`` completely filled with ``colour``."""
    overlay = image.copy()
    ImageDraw.Draw(overlay, 'RGBA').rectangle(
        [0, 0, image.size[0], image.size[1]], fill=colour)
    return overlay


def draw_report(report):
    """Save a ``_boxed.pdf`` copy of the report with its detected structure drawn.

    For every page in ``report.docinfo``:

    * marginal lines are outlined in orange;
    * the page-number token inside a marginal is outlined in red;
    * the table-of-contents page is tinted green and figure pages purple;
    * section headings are outlined in blue and subsection headings in green.

    Nothing is saved when ``report.docinfo`` has no pages.

    Args:
        report: Object exposing ``docid``, ``filenum``, ``docinfo``,
            ``marginals``, ``page_nums``, ``toc_page``, ``fig_pages``,
            ``section_ptrs`` and ``subsection_ptrs``.
    """
    report_path = paths.get_report_name(report.docid,
                                        local_path=True,
                                        file_extension='.pdf',
                                        file_num=report.filenum)
    images = convert_from_path(report_path)
    drawn_images = []

    for page_key, lines in report.docinfo.items():
        page_num = int(page_key)
        image = images[page_num - 1]  # this has to be of type RGB
        width, height = image.size
        draw = ImageDraw.Draw(image, 'RGBA')

        if page_num in report.marginals['PageNum'].values:
            marginal_lnnums = report.marginals.loc[
                report.marginals['PageNum'] == page_num]['LineNum']
            marginal_line = None
            for lnnum in marginal_lnnums.values:
                # Line numbers are 1-based.
                marginal_line = lines[lnnum - 1]
                _draw_box(draw, marginal_line['BoundingBox'], width, height,
                          'orange')

            # Outline the page number by matching the extracted page value
            # against the tab-separated tokens of the marginal text.
            if (isinstance(report.page_nums, pd.DataFrame)
                    and page_num in report.page_nums['PageNum'].values):
                pg_marginal = report.page_nums.loc[
                    report.page_nums['PageNum'] == page_num]
                split_text = pg_marginal.Text.values[0].split('\t')
                # Escape the value so characters such as '.' or '(' in a page
                # label are matched literally.
                pattern = re.compile(
                    r'(^|\s)' + re.escape(str(pg_marginal['Page'].values[0]))
                    + r'($|\s)')
                pgnum_i = next((idx for idx, token in enumerate(split_text)
                                if pattern.search(token)), None)
                # 'is not None' so a match on the first token (index 0) is
                # not skipped as falsy.
                if pgnum_i is not None and marginal_line is not None:
                    # NOTE(review): as before, the boxes come from the LAST
                    # marginal line on the page, which may not be the line
                    # that holds the page number - confirm.
                    try:
                        pg_box = marginal_line['OriginalBBs'][pgnum_i]
                    except (KeyError, IndexError):
                        # No per-token box for this line: skip the overlay.
                        pg_box = None
                    if pg_box is not None:
                        _draw_box(draw, pg_box, width, height, 'red')

        if page_key == str(report.toc_page):  # change colour of toc page
            # NOTE(review): the argument order gives 70% green / 30% page,
            # the opposite of the figure-page tint below - confirm intended.
            image = Image.blend(_solid_overlay(image, 'green'),
                                image,
                                alpha=0.3)
            # blend() returns a new image; rebind the drawer so later boxes
            # land on the image that is actually saved.
            draw = ImageDraw.Draw(image, 'RGBA')

        if (_is_nonempty(report.fig_pages) and float(page_key)
                in report.fig_pages['PageNum'].values):
            # change colour of fig pages
            image = Image.blend(image,
                                _solid_overlay(image, 'purple'),
                                alpha=0.3)
            draw = ImageDraw.Draw(image, 'RGBA')

        # Section headers in blue, then subsection headers in green.
        for ptrs, colour in ((report.section_ptrs, 'blue'),
                             (report.subsection_ptrs, 'green')):
            heading_lnnums = ptrs.loc[ptrs['PageNum'] == page_num]['LineNum']
            for lnnum in heading_lnnums.values:
                _draw_box(draw, lines[lnnum - 1]['BoundingBox'], width,
                          height, colour)

        drawn_images.append(image)

    if not drawn_images:
        # No pages to save; indexing drawn_images[0] would raise IndexError.
        return

    outfile = paths.get_report_name(report.docid,
                                    local_path=True,
                                    file_extension='_boxed.pdf',
                                    file_num=report.filenum)
    drawn_images[0].save(outfile,
                         save_all=True,
                         append_images=drawn_images[1:])