def get_features(pdf_html_dir):
    """Parse every hOCR page in a directory and summarize the document.

    Each regular file in ``pdf_html_dir`` is parsed with BeautifulSoup. The
    page number is the four digits captured after a ``-`` in the page title
    (example title: ``S1470160X05000063.pdf-0000`` gives ``'0000'``).
    Entries that are not regular files, have no title, or whose title does
    not carry that suffix are skipped.

    Args:
        pdf_html_dir: Directory holding one HTML file per page.

    Returns:
        A ``(pages, doc_stats)`` tuple. ``pages`` is a list with one dict per
        parsed page (keys ``page_no``, ``soup``, ``page``, ``areas``,
        ``lines``); ``doc_stats`` is whatever ``helpers.summarize_document``
        returns for the areas of all pages together.
    """
    page_number_pattern = re.compile(r'.*-(\d{4})')

    # Accumulate across files: the list must be created once, before the loop,
    # otherwise only the last page survives.
    pages = []
    # Sorted so pages are processed in a reproducible order.
    for pdf_html_page in sorted(os.listdir(pdf_html_dir)):
        path = os.path.join(pdf_html_dir, pdf_html_page)
        if not os.path.isfile(path):
            continue

        with open(path, 'r', encoding='utf-8') as rf:
            soup = BeautifulSoup(rf, 'html.parser')

        tstring = soup.title.string if soup.title is not None else None
        # Example title: S1470160X05000063.pdf-0000
        page_number_match = (
            page_number_pattern.match(tstring) if tstring else None)
        if page_number_match is None:
            # Not a page we know how to number; leave it out.
            continue

        merged_areas = helpers.merge_areas(
            soup.find_all('div', 'ocr_carea'))
        pages.append({
            # group(1) is the captured digits; group() would be the whole
            # matched title prefix.
            'page_no': page_number_match.group(1),
            'soup': soup,
            'page': tstring,
            'areas': [helpers.area_summary(area) for area in merged_areas],
            'lines': list(soup.find_all('span', 'ocr_line')),
        })

    # Summary stats are computed over the areas of every page at once.
    doc_stats = helpers.summarize_document(
        [area for page in pages for area in page['areas']])
    return pages, doc_stats
# The document to process is named by the first command-line argument; its
# Tesseract output is expected under ./docs/training/<doc_id>/tesseract/.
doc_id = sys.argv[1]
page_paths = glob.glob('./docs/training/' + doc_id + '/tesseract/*.html')

pages = []
for page_no, page in enumerate(page_paths):
    # Load the full hOCR page so the document can be examined as a whole.
    with codecs.open(page, "r", "utf-8") as hocr:
        text = hocr.read()
    soup = BeautifulSoup(text, 'html.parser')

    merged_areas = helpers.merge_areas(soup.find_all('div', 'ocr_carea'))

    # "…/page_12.html" -> "12"
    file_name = page.split('/')[-1]
    page_label = file_name.replace('.html', '').replace('page_', '')

    first_ocr_page = soup.find_all('div', 'ocr_page')[0]
    page_entry = {
        'page_no': page_label,
        'soup': soup,
        'page': helpers.extractbbox(first_ocr_page.get('title')),
        'areas': [helpers.area_summary(area) for area in merged_areas],
        'lines': list(soup.find_all('span', 'ocr_line')),
    }
    pages.append(page_entry)

# One list of areas per page.
page_areas = [page['areas'] for page in pages]

# Flatten them and compute document-wide summary stats from every area
# Tesseract identified.
all_areas = []
for areas in page_areas:
    all_areas.extend(areas)
doc_stats = helpers.summarize_document(all_areas)

# Classify each area against the document stats and its page neighbours.
for idx, page in enumerate(pages):
    areas_on_page = page['areas']
    for area in areas_on_page:
        classification = heuristics.classify(area, doc_stats, areas_on_page)
        classification['page_no'] = page['page_no']
def test_area_summary():
    """Smoke test: ``area_summary`` must accept the ``AREA`` fixture without raising."""
    area_summary(AREA)
def extract_tables(document_path):
    """Classify a document's OCR areas, draw them per page, and extract tables.

    Steps, in order:

    1. Parse every hOCR page matching ``<document_path>/tesseract/*.html``.
    2. Summarize all merged areas with ``helpers.summarize_document`` and
       label each area using ``heuristics.classify`` and the module-level
       classifier ``clf`` (stored under the area keys ``classification``,
       ``classification_p`` and ``type``).
    3. Scan the OCR text for references such as "table 4" or "fig. 2" and
       store the numbers found per label in ``doc_stats['found_tables']``.
    4. Save ``<document_path>/annotated/page_<n>_with_areatypes.png`` for
       each page, shading every area by its predicted type on top of
       ``<document_path>/png/page_<n>.png``.
    5. Run ``process_page`` on each page and hand every extract to
       ``helpers.extract_table``.

    A Postgres connection is opened first (as configured in ``Credentials``)
    and is always closed again before returning.

    Args:
        document_path: Directory of a single document containing the
            ``tesseract/``, ``png/`` and ``annotated/`` sub-directories.

    Returns:
        None. If no hOCR pages are found a message is printed and nothing
        else is done.
    """
    # Connect to Postgres
    connection = psycopg2.connect(dbname=Credentials.PG_DATABASE,
                                  user=Credentials.PG_USERNAME,
                                  password=Credentials.PG_PASSWORD,
                                  host=Credentials.PG_HOST,
                                  port=Credentials.PG_PORT)
    cursor = connection.cursor()

    try:
        # Sorted so pages are processed in a reproducible order.
        page_paths = sorted(glob.glob(document_path + '/tesseract/*.html'))

        pages = []
        text_parts = []
        # Read in each tesseract page with BeautifulSoup so we can look at
        # the document holistically
        for page_path in page_paths:
            with codecs.open(page_path, "r", "utf-8") as hocr:
                soup = BeautifulSoup(hocr.read(), 'html.parser')

            merged_areas = helpers.merge_areas(
                soup.find_all('div', 'ocr_carea'))
            pages.append({
                'page_no': page_path.split('/')[-1]
                .replace('.html', '').replace('page_', ''),
                'soup': soup,
                'page': helpers.extractbbox(
                    soup.find_all('div', 'ocr_page')[0].get('title')),
                'areas': [helpers.area_summary(area)
                          for area in merged_areas],
                'lines': list(soup.find_all('span', 'ocr_line')),
            })
            # Record the OCR-identified text
            text_parts.append(soup.getText())

        if not pages:
            # The page-size statistics below cannot be computed for an
            # empty document (argmax of an empty bincount raises).
            print('no tesseract pages found in ' + document_path)
            return

        page_areas = [page['areas'] for page in pages]

        # Calculate summary stats for the document from all areas
        # identified by Tesseract
        doc_stats = helpers.summarize_document(
            [area for areas in page_areas for area in areas])

        # Classify areas
        for page in pages:
            for area in page['areas']:
                area['classification'] = heuristics.classify(
                    area, doc_stats, page['areas'])
                area['classification']['page_no'] = page['page_no']

                # Build the feature vector once and reuse it for both model
                # calls. NOTE(review): assumes heuristics.classify_list does
                # not read the keys assigned just below - confirm.
                features = [heuristics.classify_list(
                    area, doc_stats, page['areas'])]

                # predict_proba yields one row per sample; we pass a single
                # sample, so row 0 holds the per-class probabilities and its
                # maximum is the probability of the most likely type.
                probabilities = clf.predict_proba(features)
                area['classification_p'] = max(probabilities[0])
                area['type'] = clf.predict(features)[0]

        # Attempt to identify all charts/tables/etc in the paper by looking
        # at the text layer, i.e. it is useful to know if the text mentions
        # "see table 4", because if the caption for table 4 is distorted in
        # the text layer ("teble 4", for example), we can still guess that
        # it is table 4 because of its position in the document and our
        # prior knowledge that a table 4 exists.
        #
        # Collapse every run of whitespace to one space so that a reference
        # broken across lines still matches the single-space pattern below.
        text_layer = ' '.join(''.join(text_parts).split()).lower()

        numbers_by_label = {}
        for label, _, number, _ in re.findall(
                r'(table|figure|fig|map|appendix|app|appx|tbl)(\.)? (\d+)(\.)?',
                text_layer,
                flags=re.IGNORECASE):
            try:
                parsed = int(number)
            except ValueError:
                # Kept from the original: skip "numbers" that cannot be
                # parsed into an integer.
                continue
            # "figure" and "fig" are treated as the same label.
            label = label.replace('figure', 'fig')
            numbers_by_label.setdefault(label, set()).add(parsed)

        # Clean up for reformat (labels kept in alphabetical order)
        figure_idx = {
            label: helpers.clean_range(sorted(numbers_by_label[label]))
            for label in sorted(numbers_by_label)
        }

        # Most documents only contain one page height, but others mix
        # landscape and portrait pages. Figure out which is the most common.
        doc_stats['page_height'] = np.bincount(
            [page['page']['y2'] - page['page']['y1']
             for page in pages]).argmax()
        doc_stats['page_width'] = np.bincount(
            [page['page']['x2'] - page['page']['x1']
             for page in pages]).argmax()

        # Find out if a header or footer is present in the document - make
        # sure we don't include them in extracts
        doc_stats['header'], doc_stats['footer'] = helpers.get_header_footer(
            pages, doc_stats['page_height'], doc_stats['page_width'])

        doc_stats['found_tables'] = figure_idx
        print('these tables were found --')
        for ttype in figure_idx:
            print('    ', ttype, figure_idx[ttype])

        colormap = {
            'other': '#26547C',
            'header / footer': '#EF476F',
            'graphic caption': '#FFD166',
            'graphic': '#06D6A0',
            'reference': '#3E92CC',
            'body': '#F4FAFF'
        }

        for page in pages:
            image_path = document_path + "/png/page_%s.png" % page['page_no']
            print(image_path)
            # Read the image before creating the figure so a missing file
            # does not leave an open figure behind.
            img = plt.imread(image_path)

            fig = plt.figure()
            try:
                ax = fig.add_subplot(111, aspect='equal')
                for area in page['areas']:
                    left, top = int(area['x1']), int(area['y1'])
                    right, bottom = int(area['x2']), int(area['y2'])
                    ax.add_patch(
                        patches.Rectangle((left, top),
                                          right - left,
                                          bottom - top,
                                          fill=True,
                                          linewidth=0.5,
                                          facecolor=colormap[area['type']],
                                          label=area['type'],
                                          alpha=0.2))

                # Use this page's own bounding box: documents may mix
                # portrait and landscape pages, so the first page's size is
                # not valid for all of them.
                plt.ylim(0, page['page']['y2'])
                plt.xlim(0, page['page']['x2'])
                plt.axis("off")
                plt.imshow(img, zorder=0)
                # Image coordinates grow downwards.
                plt.gca().invert_yaxis()

                patchlist = [
                    patches.Patch(color=color, label=label, alpha=0.2)
                    for label, color in colormap.items()
                ]
                fig.legend(patchlist,
                           colormap.keys(),
                           loc='lower center',
                           fontsize='x-small',
                           ncol=int(len(colormap) / 2),
                           bbox_transform=fig.transFigure)
                plt.axis('off')
                fig.savefig(document_path +
                            "/annotated/page_%s_with_areatypes.png" %
                            page['page_no'],
                            dpi=400,
                            bbox_inches='tight',
                            pad_inches=0)
            finally:
                # Release the figure even when drawing or saving fails.
                fig.clf()
                plt.close(fig)

        for page in pages:
            seen_names = set()
            for table in process_page(doc_stats, page):
                # Append '*' until the name is unique on this page; a single
                # '*' would collide again on a third duplicate.
                while table['name'] in seen_names:
                    table['name'] = table['name'] + '*'
                seen_names.add(table['name'])
                helpers.extract_table(document_path, page['page_no'], table)
    finally:
        cursor.close()
        connection.close()
def extract_tables(document_path):
    """Classify a document's OCR areas and extract the tables found on each page.

    Steps, in order:

    1. Parse every hOCR page matching ``<document_path>/tesseract/*.html``.
    2. Summarize all merged areas with ``helpers.summarize_document`` and
       label each area using ``heuristics.classify`` and the module-level
       classifier ``clf`` (stored under the area keys ``classification``,
       ``classification_p`` and ``type``).
    3. Scan the OCR text for references such as "table 4" or "fig. 2" and
       store the numbers found per label in ``doc_stats['found_tables']``.
    4. Run ``process_page`` on each page and hand every extract to
       ``helpers.extract_table``.

    A Postgres connection is opened first (as configured in ``Credentials``)
    and is always closed again before returning.

    Args:
        document_path: Directory of a single document containing a
            ``tesseract/`` sub-directory of hOCR pages.

    Returns:
        None. If no hOCR pages are found a message is printed and nothing
        else is done.
    """
    # Connect to Postgres
    connection = psycopg2.connect(dbname=Credentials.PG_DATABASE,
                                  user=Credentials.PG_USERNAME,
                                  password=Credentials.PG_PASSWORD,
                                  host=Credentials.PG_HOST,
                                  port=Credentials.PG_PORT)
    cursor = connection.cursor()

    try:
        # Sorted so pages are processed in a reproducible order.
        page_paths = sorted(glob.glob(document_path + '/tesseract/*.html'))

        pages = []
        text_parts = []
        # Read in each tesseract page with BeautifulSoup so we can look at
        # the document holistically
        for page_path in page_paths:
            # Decode explicitly instead of relying on the locale default.
            with open(page_path, encoding='utf-8') as hocr:
                soup = BeautifulSoup(hocr.read(), 'html.parser')

            merged_areas = helpers.merge_areas(
                soup.find_all('div', 'ocr_carea'))
            pages.append({
                'page_no': page_path.split('/')[-1]
                .replace('.html', '').replace('page_', ''),
                'soup': soup,
                'page': helpers.extractbbox(
                    soup.find_all('div', 'ocr_page')[0].get('title')),
                'areas': [helpers.area_summary(area)
                          for area in merged_areas],
                'lines': list(soup.find_all('span', 'ocr_line')),
            })
            # Record the OCR-identified text
            text_parts.append(soup.getText())

        if not pages:
            # The page-size statistics below cannot be computed for an
            # empty document (argmax of an empty bincount raises).
            print('no tesseract pages found in ' + document_path)
            return

        page_areas = [page['areas'] for page in pages]

        # Calculate summary stats for the document from all areas
        # identified by Tesseract
        doc_stats = helpers.summarize_document(
            [area for areas in page_areas for area in areas])

        # Classify areas
        for page in pages:
            for area in page['areas']:
                area['classification'] = heuristics.classify(
                    area, doc_stats, page['areas'])
                area['classification']['page_no'] = page['page_no']

                # Build the feature vector once and reuse it for both model
                # calls. NOTE(review): assumes heuristics.classify_list does
                # not read the keys assigned just below - confirm.
                features = [heuristics.classify_list(
                    area, doc_stats, page['areas'])]

                # predict_proba yields one row per sample, so the per-class
                # probabilities of our single sample are in row 0. Pair
                # them with the class labels and sort by highest
                # probability; sorted() also works where zip() is lazy.
                probabilities = clf.predict_proba(features)
                classifications = sorted(
                    zip(clf.classes_, probabilities[0]),
                    key=lambda pair: pair[1],
                    reverse=True)
                # Element 1 of the best pair is the probability (element 0
                # is the label).
                area['classification_p'] = classifications[0][1]
                # predict returns one label per sample; keep the label
                # itself rather than the one-element array.
                area['type'] = clf.predict(features)[0]

        # Attempt to identify all charts/tables/etc in the paper by looking
        # at the text layer, i.e. it is useful to know if the text mentions
        # "see table 4", because if the caption for table 4 is distorted in
        # the text layer ("teble 4", for example), we can still guess that
        # it is table 4 because of its position in the document and our
        # prior knowledge that a table 4 exists.
        #
        # Collapse every run of whitespace to one space so that a reference
        # broken across lines still matches the single-space pattern below.
        text_layer = ' '.join(''.join(text_parts).split()).lower()

        numbers_by_label = {}
        for label, _, number, _ in re.findall(
                r'(table|figure|fig|map|appendix|app|appx|tbl)(\.)? (\d+)(\.)?',
                text_layer,
                flags=re.IGNORECASE):
            try:
                parsed = int(number)
            except ValueError:
                # Kept from the original: skip "numbers" that cannot be
                # parsed into an integer.
                continue
            # "figure" and "fig" are treated as the same label.
            label = label.replace('figure', 'fig')
            numbers_by_label.setdefault(label, set()).add(parsed)

        # Clean up for reformat (labels kept in alphabetical order)
        figure_idx = {
            label: helpers.clean_range(sorted(numbers_by_label[label]))
            for label in sorted(numbers_by_label)
        }

        # Most documents only contain one page height, but others mix
        # landscape and portrait pages. Figure out which is the most common.
        doc_stats['page_height'] = np.bincount(
            [page['page']['y2'] - page['page']['y1']
             for page in pages]).argmax()
        doc_stats['page_width'] = np.bincount(
            [page['page']['x2'] - page['page']['x1']
             for page in pages]).argmax()

        # Find out if a header or footer is present in the document - make
        # sure we don't include them in extracts
        doc_stats['header'], doc_stats['footer'] = helpers.get_header_footer(
            pages, doc_stats['page_height'], doc_stats['page_width'])

        doc_stats['found_tables'] = figure_idx
        print('these tables were found --')
        for ttype in figure_idx:
            print('    ', ttype, figure_idx[ttype])

        for page in pages:
            seen_names = set()
            for table in process_page(doc_stats, page):
                # Append '*' until the name is unique on this page; a single
                # '*' would collide again on a third duplicate.
                while table['name'] in seen_names:
                    table['name'] = table['name'] + '*'
                seen_names.add(table['name'])
                helpers.extract_table(document_path, page['page_no'], table)
    finally:
        cursor.close()
        connection.close()