Ejemplo n.º 1
0
def get_features(pdf_html_dir):
    """Parse each HTML page in ``pdf_html_dir`` and summarize its OCR areas.

    Every directory entry is parsed with BeautifulSoup. The page's
    ``ocr_carea`` divs are merged and summarized through ``helpers``, and
    document statistics are computed from the resulting area summaries.

    Args:
        pdf_html_dir: Directory whose entries are per-page HTML files. Each
            page ``<title>`` must contain a ``-NNNN`` four-digit page number,
            e.g. ``S1470160X05000063.pdf-0000``.

    Raises:
        ValueError: If a page title carries no four-digit page number.
    """
    for pdf_html_page in os.listdir(pdf_html_dir):
        path = os.path.join(pdf_html_dir, pdf_html_page)
        # NOTE(review): ``pages`` is reset for every file, so ``doc_stats``
        # below only ever covers a single page, and nothing is returned in
        # the visible code -- confirm whether the list should be accumulated
        # across the whole directory instead.
        pages = []
        with open(path, 'r') as rf:
            # BeautifulSoup consumes the file handle here, so the file can be
            # closed before the rest of the processing.
            soup = BeautifulSoup(rf, 'html.parser')

        tstring = soup.title.string
        # Example title: S1470160X05000063.pdf-0000
        page_number_match = re.match(r'.*-(\d{4})', tstring)
        if page_number_match is None:
            raise ValueError(
                'No four-digit page number in title %r of %s' %
                (tstring, path))
        # group(1) is the captured digits; group() would be the whole title.
        page_number = page_number_match.group(1)

        merged_areas = helpers.merge_areas(
            soup.find_all('div', 'ocr_carea'))
        pages.append({
            'page_no': page_number,
            'soup': soup,
            'page': tstring,
            'areas': [helpers.area_summary(area) for area in merged_areas],
            'lines': list(soup.find_all('span', 'ocr_line')),
        })

        page_areas = [page['areas'] for page in pages]
        doc_stats = helpers.summarize_document(
            [area for areas in page_areas for area in areas])
Ejemplo n.º 2
0
# The document to process is named by the first command-line argument.
doc_id = sys.argv[1]

page_paths = glob.glob('./docs/training/' + doc_id + '/tesseract/*.html')

pages = []
for page_no, page in enumerate(page_paths):
    # Load every Tesseract page up front so the document can be analysed
    # as a whole further down.
    with codecs.open(page, "r", "utf-8") as hocr:
        text = hocr.read()

    soup = BeautifulSoup(text, 'html.parser')
    merged_areas = helpers.merge_areas(soup.find_all('div', 'ocr_carea'))

    # File name without the '.html' suffix and the 'page_' prefix.
    page_id = page.split('/')[-1].replace('.html', '').replace('page_', '')
    page_bbox = helpers.extractbbox(
        soup.find_all('div', 'ocr_page')[0].get('title'))
    area_summaries = [helpers.area_summary(area) for area in merged_areas]
    ocr_lines = list(soup.find_all('span', 'ocr_line'))

    pages.append({
        'page_no': page_id,
        'soup': soup,
        'page': page_bbox,
        'areas': area_summaries,
        'lines': ocr_lines
    })

# Per-page lists of area summaries
page_areas = [page['areas'] for page in pages]

# Document-wide summary statistics over every area Tesseract identified
all_areas = [area for areas in page_areas for area in areas]
doc_stats = helpers.summarize_document(all_areas)

# Classify each area against the document stats and its sibling areas
for idx, page in enumerate(pages):
    sibling_areas = page['areas']
    for area in sibling_areas:
        classification = heuristics.classify(area, doc_stats, sibling_areas)
        classification['page_no'] = page['page_no']
def test_area_summary():
    """Smoke-test ``area_summary`` on the ``AREA`` value.

    Only checks that the call completes without raising; the return value is
    not inspected. ``area_summary`` and ``AREA`` are defined outside this
    block.
    """
    area_summary(AREA)
Ejemplo n.º 4
0
def extract_tables(document_path):
    """Classify a document's OCR areas, annotate its pages, extract tables.

    Steps, in order:

    1. Read every ``<document_path>/tesseract/*.html`` page and summarize its
       merged ``ocr_carea`` areas.
    2. Classify each area with ``heuristics`` and the ``clf`` model (both
       defined outside this block), storing ``classification``,
       ``classification_p`` and ``type`` on the area.
    3. Scan the OCR text for mentions such as "table 4" / "fig. 2" and record
       the numbers found per kind in ``doc_stats['found_tables']``.
    4. Save an annotated image for each page under
       ``<document_path>/annotated/``.
    5. Run ``process_page`` on each page and hand every extract to
       ``helpers.extract_table``.

    Args:
        document_path: Directory containing the ``tesseract/``, ``png/`` and
            ``annotated/`` sub-directories of one document.

    Returns:
        None. Results are written by ``helpers.extract_table`` and as
        annotated PNG files.
    """
    # Connect to Postgres
    connection = psycopg2.connect(dbname=Credentials.PG_DATABASE,
                                  user=Credentials.PG_USERNAME,
                                  password=Credentials.PG_PASSWORD,
                                  host=Credentials.PG_HOST,
                                  port=Credentials.PG_PORT)
    cursor = connection.cursor()

    try:
        page_paths = glob.glob(document_path + '/tesseract/*.html')

        pages = []
        text_parts = []

        # Read in each tesseract page with BeautifulSoup so we can look at
        # the document holistically
        for page in page_paths:
            with codecs.open(page, "r", "utf-8") as hocr:
                text = hocr.read()

            soup = BeautifulSoup(text, 'html.parser')
            merged_areas = helpers.merge_areas(
                soup.find_all('div', 'ocr_carea'))
            pages.append({
                'page_no':
                page.split('/')[-1].replace('.html', '').replace('page_', ''),
                'soup':
                soup,
                'page':
                helpers.extractbbox(
                    soup.find_all('div', 'ocr_page')[0].get('title')),
                'areas': [helpers.area_summary(area) for area in merged_areas],
                'lines': list(soup.find_all('span', 'ocr_line'))
            })
            # Record the OCR-identified text
            text_parts.append(soup.getText())

        text_layer = ''.join(text_parts)

        # Per-page lists of area summaries
        page_areas = [page['areas'] for page in pages]

        # Calculate summary stats for the document from all areas identified
        # by Tesseract
        doc_stats = helpers.summarize_document(
            [area for areas in page_areas for area in areas])

        # Classify areas
        for page in pages:
            for area in page['areas']:
                area['classification'] = heuristics.classify(
                    area, doc_stats, page['areas'])
                area['classification']['page_no'] = page['page_no']

                # Build the feature vector once and reuse it for both model
                # calls below.
                features = [
                    heuristics.classify_list(area, doc_stats, page['areas'])
                ]

                # Use the model to assign an area type and the probability
                # of the most likely type.
                probabilities = clf.predict_proba(features)
                area['classification_p'] = max(probabilities[0])
                area['type'] = clf.predict(features)[0]

        # Attempt to identify all charts/tables/etc in the paper by looking
        # at the text layer. Knowing that the text mentions "see table 4" is
        # useful: if the caption for table 4 is distorted in the text layer
        # ("teble 4", for example), we can still guess that it is table 4
        # from its position in the document and the prior knowledge that a
        # table 4 exists.
        text_layer = text_layer.strip().replace('\n', ' ').replace(
            '  ', ' ').lower()
        figures = []
        for result in re.findall(
                r'(table|figure|fig|map|appendix|app|appx|tbl)(\.)? (\d+)(\.)?',
                text_layer,
                flags=re.IGNORECASE):
            # Drop the dots, normalize "figure" to "fig" and collapse the
            # whitespace left by empty optional groups.
            mention = ' '.join(result).replace('.', '').replace(
                'figure', 'fig')
            figures.append(' '.join(mention.split()).lower())

        # Clean up the list of figures/tables/etc
        figure_idx = {}
        for label in sorted(set(figures)):
            parts = label.split(' ')
            # Skip mentions whose "number" is missing or cannot be parsed
            # into an integer.
            try:
                number = int(parts[1])
            except (IndexError, ValueError):
                continue
            figure_idx.setdefault(parts[0], []).append(number)

        # Clean up for reformat
        for key in figure_idx:
            figure_idx[key] = helpers.clean_range(sorted(set(figure_idx[key])))

        # Most documents only contain one page height, but others mix
        # landscape and portrait pages. Figure out which is the most common.
        doc_stats['page_height'] = np.bincount([
            page['page']['y2'] - page['page']['y1'] for page in pages
        ]).argmax()
        doc_stats['page_width'] = np.bincount([
            page['page']['x2'] - page['page']['x1'] for page in pages
        ]).argmax()

        # Find out if a header or footer is present in the document - make
        # sure we don't include them in extracts
        doc_stats['header'], doc_stats['footer'] = helpers.get_header_footer(
            pages, doc_stats['page_height'], doc_stats['page_width'])

        doc_stats['found_tables'] = figure_idx
        print('these tables were found --')

        for ttype in figure_idx:
            print('    ', ttype, figure_idx[ttype])

        # Fill colour used for each predicted area type in the annotations
        colormap = {
            'other': '#26547C',
            'header / footer': '#EF476F',
            'graphic caption': '#FFD166',
            'graphic': '#06D6A0',
            'reference': '#3E92CC',
            'body': '#F4FAFF'
        }
        for page in pages:
            fig = plt.figure()
            png_path = document_path + "/png/page_%s.png" % page['page_no']
            print(png_path)
            img = plt.imread(png_path)
            ax = fig.add_subplot(111, aspect='equal')
            for area in page['areas']:
                left, top = int(area['x1']), int(area['y1'])
                right, bottom = int(area['x2']), int(area['y2'])
                ax.add_patch(
                    patches.Rectangle((left, top),
                                      right - left,
                                      bottom - top,
                                      fill=True,
                                      linewidth=0.5,
                                      facecolor=colormap[area['type']],
                                      label=area['type'],
                                      alpha=0.2))
            # Use this page's own bounds: documents may mix portrait and
            # landscape pages, so the first page's size is not always right.
            plt.ylim(0, page['page']['y2'])
            plt.xlim(0, page['page']['x2'])
            plt.imshow(img, zorder=0)
            # Image coordinates grow downwards, so flip the y axis.
            ax.invert_yaxis()
            patchlist = [
                patches.Patch(color=color, label=label, alpha=0.2)
                for label, color in colormap.items()
            ]
            fig.legend(patchlist,
                       list(colormap),
                       loc='lower center',
                       fontsize='x-small',
                       ncol=len(colormap) // 2,
                       bbox_transform=fig.transFigure)
            plt.axis('off')
            fig.savefig(document_path +
                        "/annotated/page_%s_with_areatypes.png" %
                        page['page_no'],
                        dpi=400,
                        bbox_inches='tight',
                        pad_inches=0)
            fig.clf()
            plt.close(fig)

        for page in pages:
            page_extracts = process_page(doc_stats, page)

            # Mark an extract whose name was already seen on this page by
            # appending '*'.
            found = []
            for e in page_extracts:
                if e['name'] in found:
                    e['name'] = e['name'] + '*'

                found.append(e['name'])

            for table in page_extracts:
                helpers.extract_table(document_path, page['page_no'], table)
    finally:
        # Release the database resources even if processing fails.
        cursor.close()
        connection.close()
Ejemplo n.º 5
0
def extract_tables(document_path):
    """Classify a document's OCR areas and extract its tables.

    Steps, in order:

    1. Read every ``<document_path>/tesseract/*.html`` page and summarize its
       merged ``ocr_carea`` areas.
    2. Classify each area with ``heuristics`` and the ``clf`` model (both
       defined outside this block), storing ``classification``,
       ``classification_p`` and ``type`` on the area.
    3. Scan the OCR text for mentions such as "table 4" / "fig. 2" and record
       the numbers found per kind in ``doc_stats['found_tables']``.
    4. Run ``process_page`` on each page and hand every extract to
       ``helpers.extract_table``.

    Args:
        document_path: Directory containing the ``tesseract/`` sub-directory
            of one document.

    Returns:
        None. Results are written by ``helpers.extract_table``.
    """
    # Connect to Postgres
    connection = psycopg2.connect(dbname=Credentials.PG_DATABASE,
                                  user=Credentials.PG_USERNAME,
                                  password=Credentials.PG_PASSWORD,
                                  host=Credentials.PG_HOST,
                                  port=Credentials.PG_PORT)
    cursor = connection.cursor()

    try:
        page_paths = glob.glob(document_path + '/tesseract/*.html')

        pages = []
        text_parts = []

        # Read in each tesseract page with BeautifulSoup so we can look at
        # the document holistically
        for page in page_paths:
            # NOTE(review): no encoding is given, so the platform default is
            # used to decode the hOCR file -- confirm this is intended.
            with open(page) as hocr:
                text = hocr.read()

            soup = BeautifulSoup(text, 'html.parser')
            merged_areas = helpers.merge_areas(
                soup.find_all('div', 'ocr_carea'))
            pages.append({
                'page_no':
                page.split('/')[-1].replace('.html', '').replace('page_', ''),
                'soup':
                soup,
                'page':
                helpers.extractbbox(
                    soup.find_all('div', 'ocr_page')[0].get('title')),
                'areas': [helpers.area_summary(area) for area in merged_areas],
                'lines': list(soup.find_all('span', 'ocr_line'))
            })
            # Record the OCR-identified text
            text_parts.append(soup.getText())

        text_layer = ''.join(text_parts)

        # Per-page lists of area summaries
        page_areas = [page['areas'] for page in pages]

        # Calculate summary stats for the document from all areas identified
        # by Tesseract
        doc_stats = helpers.summarize_document(
            [area for areas in page_areas for area in areas])

        # Classify areas
        for page in pages:
            for area in page['areas']:
                area['classification'] = heuristics.classify(
                    area, doc_stats, page['areas'])
                area['classification']['page_no'] = page['page_no']

                # Build the feature vector once and reuse it for both model
                # calls below.
                features = [
                    heuristics.classify_list(area, doc_stats, page['areas'])
                ]

                # Use the model to assign an area type and the probability
                # of that area type.
                probabilities = clf.predict_proba(features)
                # One sample was passed in, so row 0 holds its per-class
                # probabilities. sorted() is used because zip() returns an
                # iterator without a .sort() method on Python 3.
                classifications = sorted(zip(clf.classes_, probabilities[0]),
                                         key=lambda x: x[1],
                                         reverse=True)

                # Probability (not label) of the most likely class
                area['classification_p'] = classifications[0][1]

                # predict() returns one entry per sample; keep the single
                # predicted label rather than the enclosing array.
                area['type'] = clf.predict(features)[0]

        # Attempt to identify all charts/tables/etc in the paper by looking
        # at the text layer. Knowing that the text mentions "see table 4" is
        # useful: if the caption for table 4 is distorted in the text layer
        # ("teble 4", for example), we can still guess that it is table 4
        # from its position in the document and the prior knowledge that a
        # table 4 exists.
        text_layer = text_layer.strip().replace('\n', ' ').replace(
            '  ', ' ').lower()
        figures = []
        for result in re.findall(
                r'(table|figure|fig|map|appendix|app|appx|tbl)(\.)? (\d+)(\.)?',
                text_layer,
                flags=re.IGNORECASE):
            # Drop the dots, normalize "figure" to "fig" and collapse the
            # whitespace left by empty optional groups.
            mention = ' '.join(result).replace('.', '').replace(
                'figure', 'fig')
            figures.append(' '.join(mention.split()).lower())

        # Clean up the list of figures/tables/etc
        figure_idx = {}
        for label in sorted(set(figures)):
            parts = label.split(' ')
            # Skip mentions whose "number" is missing or cannot be parsed
            # into an integer.
            try:
                number = int(parts[1])
            except (IndexError, ValueError):
                continue
            figure_idx.setdefault(parts[0], []).append(number)

        # Clean up for reformat
        for key in figure_idx:
            figure_idx[key] = helpers.clean_range(sorted(set(figure_idx[key])))

        # Most documents only contain one page height, but others mix
        # landscape and portrait pages. Figure out which is the most common.
        doc_stats['page_height'] = np.bincount([
            page['page']['y2'] - page['page']['y1'] for page in pages
        ]).argmax()
        doc_stats['page_width'] = np.bincount([
            page['page']['x2'] - page['page']['x1'] for page in pages
        ]).argmax()

        # Find out if a header or footer is present in the document - make
        # sure we don't include them in extracts
        doc_stats['header'], doc_stats['footer'] = helpers.get_header_footer(
            pages, doc_stats['page_height'], doc_stats['page_width'])

        doc_stats['found_tables'] = figure_idx
        print('these tables were found --')

        for ttype in figure_idx:
            print('    ', ttype, figure_idx[ttype])

        for page in pages:
            page_extracts = process_page(doc_stats, page)

            # Mark an extract whose name was already seen on this page by
            # appending '*'.
            found = []
            for e in page_extracts:
                if e['name'] in found:
                    e['name'] = e['name'] + '*'

                found.append(e['name'])

            for table in page_extracts:
                helpers.extract_table(document_path, page['page_no'], table)
    finally:
        # Release the database resources even if processing fails.
        cursor.close()
        connection.close()