def enter_singlepage_document(file_path, document_slug, collection_slug, page_number, only_enter_new_pages=False):
    this_collection, created = Document_Collection.objects.get_or_create(collection_slug=collection_slug)
    parser = document_parser(file_path, encoding='latin-1')
    this_doc, created = Document.objects.get_or_create(document_slug=document_slug, document_collection=this_collection)
    # todo: populate ein, form, year, month from id (see the parsing sketch below)
    page_count = 0
    for this_page in parser:
        page_count += 1
        # this entry point is for single-page documents only, so bail out if the parser yields more than one page
        assert page_count < 2

        page = get_words_with_lines_from_page(this_page.getvalue())
        enter_page(this_doc, page, page_number, only_enter_new_pages)
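
# A hypothetical helper for the "populate ein, form, year, month from id" todo above.
# This is only a sketch: it assumes document ids follow the EIN_FORM_YYYYMM pattern seen
# in the test file names used elsewhere here (e.g. "58-1723645_990_201204"); the real id
# format may differ, and split_document_id is not an existing function in this repo.
def split_document_id(document_id):
    ein, form, period = document_id.split("_")
    year, month = int(period[:4]), int(period[4:6])
    return ein, form, year, month

# e.g. split_document_id("58-1723645_990_201204") -> ("58-1723645", "990", 2012, 4)
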
def enter_document(file_path, document_id):
    parser = document_parser(file_path, encoding='latin-1')
    this_doc, created = Document.objects.get_or_create(document_id=document_id)
    # todo: populate ein, form, year, month from id
    page_count = 0
    for this_page in parser:
        page_count += 1
        
        # Read the page as a hierarchy of lines and words. Only words are given bounding
        # boxes here. The hOCR spec gives lines bounding boxes too, so they could be added;
        # it's just not clear whether that would help. I assume a line's bbox is just the
        # convex hull of its words, but I don't know that for sure. (A sketch of reading
        # line bboxes straight from the hOCR follows this function.)
        
        page = get_words_with_lines_from_page(this_page.getvalue())
        enter_page(this_doc, page, page_count)
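
# The comment above notes that the hOCR spec gives lines bounding boxes as well. A sketch
# of reading them directly: in hOCR, each line is an element with class "ocr_line" whose
# title attribute looks like "bbox x0 y0 x1 y1; ...". The class/title convention comes
# from the spec; get_line_bboxes itself is a hypothetical helper, not part of hocr_parser.
from lxml import etree

def get_line_bboxes(page_xml):
    root = etree.fromstring(page_xml, etree.XMLParser(encoding='utf-8', recover=True))
    bboxes = []
    for line in root.xpath(".//*[@class='ocr_line']"):
        for field in line.get('title', '').split(';'):
            field = field.strip()
            if field.startswith('bbox '):
                bboxes.append([int(v) for v in field.split()[1:5]])
    return bboxes
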
def handle(self, *args, **options):
    for d, _, files in os.walk(SAMPLE_FILE_DIR):
        for this_file in files:
            # join against the directory os.walk is currently in, so files in
            # subdirectories of SAMPLE_FILE_DIR get the right path too
            file_path = os.path.join(d, this_file)
            # ignore files that aren't .html in case any got mixed in there
            # may want to filter on other criteria here too
            if file_path.endswith(".html"):
                print "Handling %s" % (file_path)

                parser = document_parser(file_path, encoding='latin-1')
                first_page = parser.next_document()
                page = get_words_with_lines_from_page(first_page.getvalue())
                fc = get_feature_collection(page)
                print fc
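
# For reference, a hand-rolled GeoJSON FeatureCollection of word boxes. This is NOT
# claimed to be what get_feature_collection() actually returns; it is just a plain-dict
# illustration of the idea (one Polygon per word bbox, ids assigned by word order, the
# line number kept as a property). The 'bbox', 'text' and 'line_num' keys are assumptions
# about the page structure.
def words_to_feature_collection(words):
    features = []
    for i, word in enumerate(words):
        x0, y0, x1, y1 = word['bbox']
        features.append({
            "type": "Feature",
            "id": i,
            "geometry": {
                "type": "Polygon",
                "coordinates": [[[x0, y0], [x1, y0], [x1, y1], [x0, y1], [x0, y0]]],
            },
            "properties": {"text": word.get('text'), "line_num": word.get('line_num')},
        })
    return {"type": "FeatureCollection", "features": features}
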
def handle(self, *args, **options):
    """ test cmd to just get a page with geos geometries attached """

    for d, _, files in os.walk(SAMPLE_FILE_DIR):
        for this_file in files:
            # join against the directory os.walk is currently in, so files in
            # subdirectories of SAMPLE_FILE_DIR get the right path too
            file_path = os.path.join(d, this_file)
            # ignore files that aren't .html in case any got mixed in there
            # may want to filter on other criteria here too
            if file_path.endswith(".html"):
                print "Handling %s" % (file_path)

                parser = document_parser(file_path, encoding='latin-1')
                first_page = parser.next_document()
                page = get_words_with_lines_from_page(first_page.getvalue())
                page['words'] = get_word_shapes(page['words'])
                print page
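
# A sketch of what attaching a GEOS geometry to a word bbox might look like.
# Polygon.from_bbox() is a real django.contrib.gis.geos helper; everything else here is
# an assumption (the 'bbox' key layout, the 'shape' key), and this is not claimed to be
# how get_word_shapes() is actually implemented.
from django.contrib.gis.geos import Polygon

def attach_word_shape(word):
    x0, y0, x1, y1 = word['bbox']
    word['shape'] = Polygon.from_bbox((x0, y0, x1, y1))
    return word
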
""" A django-independent test that reads a document and returns geojson files for each page. """

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_with_lines_from_page
from geo_utils.geojson_utils import get_feature_collection

# A test file
hocr_file = "./hocr_parser/test_hocr/58-1723645_990_201204.html"

# create a parser for this doc
hocr_parser = document_parser(hocr_file, encoding='latin-1')

for this_page in hocr_parser:
    
    # retrieve a representation of the page that includes line numbers and word numbers
    page = get_words_with_lines_from_page(this_page.getvalue())
    
    print "Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib'])
    # Get geojson that assigns id by word order and preserves line numbers as an attribute
    print get_feature_collection(page)
    print "\n\n\n"
from lxml import etree
from lxml.etree import tostring
from StringIO import StringIO

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_from_page, get_words_with_lines_from_page, get_annotated_bbox


flexible_parser = etree.XMLParser(encoding='utf-8', recover=True)


file_name = "58-1723645_990_201204"

file_path = "parser/test_hocr/" + file_name + ".html"
parser = document_parser(file_path, encoding='latin-1')

page_num = 0
while True:
    this_page = parser.read_page()
    if not this_page:
        break
    page_num += 1
    print "Processing page %s" % page_num
    outfile = "../display/hocr_pages/" + file_name + "p" + str(page_num) + ".html"
    #outh = open(outfile, 'w')
    page_xml = this_page.getvalue()
    #page_xml = page_xml.decode('latin-1', 'ignore').encode('utf-8')
    tree = etree.parse(StringIO(page_xml), flexible_parser)
    tree.write(outfile)
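
# A quick sanity check on the pages written above: pull the word boxes straight back out
# of the last parsed page. Per the hOCR spec, words carry class "ocrx_word" and a title
# like "bbox x0 y0 x1 y1; ...". This reuses the `tree` and `outfile` left over from the
# final loop iteration, so it only runs if at least one page was read.
if page_num:
    words = tree.xpath(".//*[@class='ocrx_word']")
    print "Last page written to %s contains %s words" % (outfile, len(words))
    for word in words[:5]:
        bbox_fields = [f.strip() for f in word.get('title', '').split(';') if f.strip().startswith('bbox')]
        print "%s\t%s" % (''.join(word.itertext()).strip(), bbox_fields[0] if bbox_fields else '')
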