Ejemplo n.º 1
0
def sanitise_files():
    """Move 'bad' restructpageinfo files out of the training set.

    A report is considered bad when its title contains 'QGMJ' or its type
    contains 'WELCOM' in the QDEX metadata export spreadsheet. Matching
    files are moved into nottraining/restructpageinfo/ and the moved
    (docid, filenum) pairs are printed.
    """
    rtitle = 'QGMJ'
    rtype = 'WELCOM'
    ref = pd.read_excel(
        'C:/Users/andraszeka/OneDrive - ITP (Queensland Government)/gsq-boreholes/investigations/QDEX_metada_export.xlsx',
        dtype={'REPNO': int})
    # Bad if EITHER the title or the type matches.
    bad = ref.loc[ref.RTITLE.str.contains(rtitle)
                  | ref.RTYPE.str.contains(rtype)]
    bad_docids = bad.REPNO.values
    removed = []
    ids = paths.get_files_from_path('restructpageinfo')
    lines_docs = paths.get_files_from_path('restructpageinfo',
                                           get_file_paths=True)
    # `file_id` instead of `id` — avoid shadowing the builtin.
    for file_id, lines_doc in zip(ids, lines_docs):
        docid, filenum = file_id[0], file_id[1]
        if docid in bad_docids:
            # exist_ok avoids the check-then-create race of the original.
            os.makedirs('nottraining/restructpageinfo/', exist_ok=True)
            os.rename(
                lines_doc,
                paths.get_restructpageinfo_file(docid,
                                                local_path=True,
                                                training=False,
                                                file_num=filenum))
            removed.append([docid, filenum])
    print("Removed: ", len(removed), ", ", removed)
def create_individual_dataset(docid):
    """Build the per-document feature DataFrame for `docid`.

    Loads the document's restructpageinfo JSON, populates a DataFrame via
    write_to_dataset, then min-max normalises the 'Centrality' column to
    the [0, 1] range.

    :param docid: report/document identifier
    :return: the populated DataFrame
    """
    pageinfo = paths.get_restructpageinfo_file(docid)
    # Context manager so the JSON file handle is closed (original leaked it).
    with open(pageinfo) as f:
        pi = json.load(f)
    df = pd.DataFrame(columns=columns)
    write_to_dataset(df, pi, docid)
    unnormed = np.array(df['Centrality'])
    spread = unnormed.max() - unnormed.min()
    if spread == 0:
        # Constant (or single-row) column: the original divided by zero
        # here; map everything to 0 instead.
        normalized = np.zeros_like(unnormed, dtype=float)
    else:
        normalized = (unnormed - unnormed.min()) / spread
    df['Centrality'] = normalized
    return df
Ejemplo n.º 3
0
def clean_and_restruct(docid, save=True, training=True, report_num=1):
    """Clean a raw Textract JSON response and restructure it into
    per-page line info.

    :param docid: report/document identifier
    :param save: if True, write the result to the restructpageinfo file
        and return None; otherwise return the restructured data
    :param training: whether the file lives in the training area
    :param report_num: report file number within the document
    """
    json_file = paths.get_full_json_file(docid,
                                         training=training,
                                         file_num=report_num)
    with open(json_file, 'r') as file:
        json_doc = json.load(file)
    json_res = json2res(json_doc)
    pagelineinfo = get_pagelineinfo_map(json_res)  # takes json response
    clean_page = get_clean_page(pagelineinfo, docid)
    restructpageinfo = get_restructpagelines(clean_page)

    if save:
        fp = paths.get_restructpageinfo_file(docid,
                                             training=training,
                                             file_num=report_num)
        # Ensure the parent directory exists before writing.
        parent = fp.rsplit('/', 1)[0]
        os.makedirs(parent, exist_ok=True)
        # Context manager: the original opened the output file and never
        # closed it, leaking the handle.
        with open(fp, "w") as out:
            json.dump(restructpageinfo, out)
    else:
        return restructpageinfo
Ejemplo n.º 4
0
def display_doc(
        docid):  # doc has to be pageinfo type - made for restructpageinfo
    """Render every page of a report with a green box around each detected
    line and save the annotated pages as a single <docid>_boxed.pdf.
    """
    report_path = paths.get_report_name(docid,
                                        local_path=True,
                                        file_extension=True)
    images = convert_from_path(report_path)

    # Context manager so the JSON handle is closed (original leaked it).
    with open(paths.get_restructpageinfo_file(docid), "r") as docfile:
        doc = json.load(docfile)
    drawn_images = []

    # Create image showing bounding box/polygon of the detected lines/text.
    for pagenum, pagelines in doc.items():
        i = int(pagenum) - 1  # page keys are 1-based
        image = images[i]
        width, height = image.size
        draw = ImageDraw.Draw(image)
        for line in pagelines:
            # BoundingBox coords are fractions of the page dimensions.
            box = line['BoundingBox']
            left = width * box['Left']
            top = height * box['Top']
            draw.rectangle(
                [left, top,
                 left + (width * box['Width']),
                 top + (height * box['Height'])],
                outline='green')

        drawn_images.append(image)

    save_path = paths.result_path + docid + '_boxed.pdf'
    # BUGFIX: the original did os.makedirs(save_path), creating a
    # *directory* named <docid>_boxed.pdf which made the subsequent
    # save() fail. Create the parent directory instead.
    save_dir = os.path.dirname(save_path)
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    drawn_images[0].save(save_path,
                         save_all=True,
                         append_images=drawn_images[1:])
Ejemplo n.º 5
0
def display_page(docid, page, line=None, mode=paths.dataset_version):
    """Display a report page, optionally cropped around one highlighted line.

    When `line` is given: blue guide lines are drawn along the page edges,
    the line's bounding box is outlined in green, and the image is cropped
    to about one third of the page height centred on that line before
    being displayed.

    :param docid: report/document identifier
    :param page: 1-based page number
    :param line: optional 1-based line number to highlight
    :param mode: dataset version; NOTE(review): unused in the visible body
    """
    pg_path = paths.get_report_page_path(int(docid), int(page))  # docid, page
    image = Image.open(pg_path)
    width, height = image.size

    if line:
        draw = ImageDraw.Draw(image, 'RGBA')
        draw.line([(1, 1), (1, height - 3)], fill="blue",
                  width=3)  # draw parallel lines down the page
        draw.line([(width - 3, 1), (width - 3, height - 3)],
                  fill="blue",
                  width=3)

        docinfofile = paths.get_restructpageinfo_file(docid)
        docinfo = json.load(open(docinfofile, "r"))
        pageinfo = docinfo[str(page)]
        lineinfo = pageinfo[int(line) - 1]  #-1 because linenum starts from 1

        # BoundingBox coords are fractions of the page dimensions.
        box = lineinfo['BoundingBox']
        ln_left = width * box['Left']
        ln_top = height * box['Top']

        # Crop window: full width, one third of the page height, vertically
        # centred on the highlighted line.
        crop_height = height / 3
        left = 0
        right = width
        # NOTE(review): box['Height'] is a page fraction, not pixels —
        # looks like this should be height * box['Height']; confirm intent.
        top = ln_top - box['Height'] - (
            crop_height / 2)  # bottom > top  bc of coordinate system
        bottom = ln_top + (crop_height / 2)

        if top < 0:  # if top is outside of bounds, add to it to make it 0, and sub from bottom
            change = top
            top = 0
            bottom -= change  # change is negative, so the window shifts down
            # Top edge reached: also draw the guide line along the top.
            draw.line([(1, 1), (width - 3, 1)], fill="blue", width=3)

        elif bottom > height:
            change = bottom - height
            bottom = height
            top -= change  # shift the window up by the overshoot
            # Bottom edge reached: also draw the guide line along the bottom.
            draw.line([(1, height - 3), (width - 3, height - 3)],
                      fill="blue",
                      width=3)

        draw.rectangle([
            ln_left, ln_top, ln_left + (width * box['Width']), ln_top +
            (height * box['Height'])
        ],
                       outline='green',
                       width=2)

        crop_image = image.crop((left, top, right, bottom))
        #crop_ln_top = crop_height * box['Top']

        #draw.rectangle([ln_left, crop_ln_top, ln_left + (width * box['Width']), crop_ln_top + (crop_height * box['Height'])], outline='green')
        image = crop_image

    display.display(image)
    # line option: draw a box around the line
    # get docinfo, query the line number and bounding box
    # crop page to about 1/3 of it to make it more focused on the line

    print(pg_path)
    if line: print("line: ", line)
Ejemplo n.º 6
0
 def get_doc_info(self):
     """Load and return this document's restructpageinfo JSON as a dict."""
     pageinfo = paths.get_restructpageinfo_file(self.docid,
                                                file_num=self.filenum)
     # Context manager so the handle is closed (original leaked it).
     with open(pageinfo, "r") as f:
         return json.load(f)
Ejemplo n.º 7
0
                        except DecompressionBombError as e:
                            print(e)
                            continue
                        textract_end = time.time()
                        textract_time = textract_end - textract_start
                        print("Time to textract: " + str(docid) + "_" +
                              str(num) + " " +
                              "{0:.2f}".format(textract_time) + " seconds")
                    else:
                        print("Report ", docid, "_", str(num),
                              " already textracted")
                        textract_time = 0

                    # check if clean and restruct needs to be run or if restructpageinfo alredy exists
                    if (not os.path.exists(
                            paths.get_restructpageinfo_file(
                                docid, training=training, file_num=num))
                            and (not args.force)):
                        texttransforming.clean_and_restruct(docid,
                                                            save=True,
                                                            training=training,
                                                            report_num=num)
                    else:
                        print("Report ", docid, "_", str(num),
                              " already cleaned and reconstructed")

                    if special_mode == 'welcom':
                        # copy json, tables, kvpairs, to extrafolder
                        jsonsrc = paths.get_full_json_file(docid,
                                                           training=training,
                                                           file_num=num)
                        jsondest = paths.get_full_json_file(
Ejemplo n.º 8
0
            print('Nums: ', nums)
            for num in nums:
                if not (os.path.exists(paths.get_full_json_file(docid, file_num=num))) and (not args.force):
                    textract_start = time.time()
                    try:
                        textract(docid, features=['TABLES'], report_num=num)
                    except FileNotFoundError as e:
                        #print("Report file", docid, "_", str(num), "doesn't exist in S3")
                        print(e)
                        continue
                    except TextBasedFileException as e:
                         print(e)
                         continue
                    textract_end = time.time()
                    textract_time = textract_end - textract_start
                    print("Time to textract: " + str(docid) + "_" + str(num) + " " + "{0:.2f}".format(textract_time) + " seconds")
                else:
                    print("Report ", docid, "_", str(num),  " already textracted")

                # check if clean and restruct needs to be run or if restructpageinfo alredy exists
                if (not os.path.exists(paths.get_restructpageinfo_file(docid, file_num=num)) and (not args.force)):
                    texttransforming.clean_and_restruct(docid, save=True, report_num=num)
                else: print("Report ", docid, "_", str(num), " already cleaned and reconstructed")

        cont = input("Run again?")
        if 'n' in cont:
            not_exit = False
        else:
            new_args = input("Enter new args: ")
            args = parser.parse_args(new_args.split())