def enter_singlepage_document(file_path, document_slug, collection_slug, page_number, only_enter_new_pages=False):
    """Parse a one-page file and enter it as page `page_number` of a document.

    The collection is fetched (or created) by `collection_slug`, and the
    document is fetched (or created) by `document_slug` within that collection.

    Args:
        file_path: path of the file handed to `document_parser` (read as latin-1).
        document_slug: slug identifying the document inside the collection.
        collection_slug: slug identifying the document collection.
        page_number: page number passed to `enter_page` for the single page.
        only_enter_new_pages: forwarded unchanged to `enter_page`.

    Raises:
        AssertionError: if the parser yields more than one page. The first
            page has already been entered by the time this is raised.
    """
    this_collection, _ = Document_Collection.objects.get_or_create(collection_slug=collection_slug)
    parser = document_parser(file_path, encoding='latin-1')
    this_doc, _ = Document.objects.get_or_create(document_slug=document_slug, document_collection=this_collection)
    # todo: populate ein, form, year, month from id
    for page_count, this_page in enumerate(parser, 1):
        # Raise explicitly rather than with an `assert` statement: asserts are
        # stripped under `python -O`, which would let extra pages be entered
        # under the same page_number without any error.
        if page_count > 1:
            raise AssertionError(
                "expected a single-page document, but %r contains more than one page" % (file_path,)
            )

        page = get_words_with_lines_from_page(this_page.getvalue())
        enter_page(this_doc, page, page_number, only_enter_new_pages)
Esempio n. 2
0
def enter_document(file_path, document_id):
    """Parse the file at `file_path` and enter each page it yields.

    The Document is fetched by `document_id`, or created if it does not exist.
    Pages are numbered from 1 in the order the parser yields them, and each
    one is handed to `enter_page` together with its number.
    """
    pages = document_parser(file_path, encoding='latin-1')
    doc, _ = Document.objects.get_or_create(document_id=document_id)
    # todo: populate ein, form, year, month from id
    for number, raw_page in enumerate(pages, 1):
        # Read the page as a hierarchy of lines and words. Only words are given
        # bounding boxes here; the hOCR spec gives lines bounding boxes too, so
        # they could be added, but it's not clear that would help. The line is
        # assumed to be just the convex hull of its words -- not known for sure.
        parsed_page = get_words_with_lines_from_page(raw_page.getvalue())
        enter_page(doc, parsed_page, number)
Esempio n. 3
0
 def handle(self, *args, **options):
     """Print the feature collection of the first page of every sample .html file.

     Walks SAMPLE_FILE_DIR (including subdirectories); for each file whose
     name ends in ".html", parses it, takes the first page the parser returns
     and prints the result of `get_feature_collection` for that page.
     """
     for dir_path, _, files in os.walk(SAMPLE_FILE_DIR):
         for this_file in files:
             # ignore files that aren't .html in case any got mixed in there
             # may want to filter on other criteria here too
             # Test the extension of the file name itself, so ".html" appearing
             # elsewhere in the path (e.g. "x.html.bak") does not match.
             if not this_file.endswith(".html"):
                 continue

             # Join against the directory currently being walked, so files in
             # subdirectories resolve and SAMPLE_FILE_DIR needs no trailing slash.
             file_path = os.path.join(dir_path, this_file)
             print("Handling %s" % (file_path))

             parser = document_parser(file_path, encoding='latin-1')
             first_page = parser.next_document()
             page = get_words_with_lines_from_page(first_page.getvalue())
             fc = get_feature_collection(page)
             print(fc)
Esempio n. 4
0
 def handle(self, *args, **options):
     """Dump a feature collection for the first page of each sample .html file.

     Every directory under SAMPLE_FILE_DIR is visited via `os.walk`. Files
     whose name does not end in ".html" are skipped; for the rest, the first
     page returned by the parser is converted with `get_feature_collection`
     and printed.
     """
     for current_dir, _, files in os.walk(SAMPLE_FILE_DIR):
         # ignore files that aren't .html in case any got mixed in there
         # may want to filter on other criteria here too
         # (match on the file-name suffix, not on ".html" anywhere in the path)
         html_files = [name for name in files if name.endswith(".html")]
         for this_file in html_files:
             # Build the path from the directory os.walk is currently in;
             # concatenating onto SAMPLE_FILE_DIR breaks for subdirectories
             # and when the constant has no trailing separator.
             file_path = os.path.join(current_dir, this_file)
             print("Handling %s" % (file_path))

             parser = document_parser(file_path, encoding='latin-1')
             first_page = parser.next_document()
             page = get_words_with_lines_from_page(first_page.getvalue())
             fc = get_feature_collection(page)
             print(fc)
Esempio n. 5
0
def enter_document(file_path, document_id):
    """Enter every page of the document stored at `file_path`.

    Looks up the Document with the given `document_id` (creating it when
    absent), then passes each parsed page to `enter_page` with a 1-based
    page number reflecting the parser's iteration order.
    """
    page_source = document_parser(file_path, encoding='latin-1')
    document, _ = Document.objects.get_or_create(document_id=document_id)
    # todo: populate ein, form, year, month from id

    # Each page is read as a hierarchy of lines and words. Only the words get
    # bounding boxes here. The hOCR spec gives lines bounding boxes as well, so
    # they could be added; it's just not clear that would help. The line is
    # assumed to be the convex hull of the words, but that is not known for sure.
    for page_index, page_buffer in enumerate(page_source, 1):
        words_and_lines = get_words_with_lines_from_page(page_buffer.getvalue())
        enter_page(document, words_and_lines, page_index)
 def handle(self, *args, **options):
     """ test cmd to just get a page with geosgeometries attached

     Walks SAMPLE_FILE_DIR (including subdirectories). For each file whose
     name ends in ".html", parses it, takes the first page the parser returns,
     replaces page['words'] with the result of `get_word_shapes` and prints
     the page.
     """

     for dir_path, _, files in os.walk(SAMPLE_FILE_DIR):
         for this_file in files:
             # ignore files that aren't .html in case any got mixed in there
             # may want to filter on other criteria here too
             # Check the file name's suffix so that ".html" elsewhere in the
             # path does not count as a match.
             if not this_file.endswith(".html"):
                 continue

             # Use the directory being walked rather than SAMPLE_FILE_DIR + name,
             # which is wrong for subdirectories and needs a trailing separator.
             file_path = os.path.join(dir_path, this_file)
             print("Handling %s" % (file_path))

             parser = document_parser(file_path, encoding='latin-1')
             first_page = parser.next_document()
             page = get_words_with_lines_from_page(first_page.getvalue())
             page['words'] = get_word_shapes(page['words'])
             print(page)
""" A django-independent test that reads a document and returns geojson files for each page. """

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_with_lines_from_page
from geo_utils.geojson_utils import get_feature_collection

# Sample hOCR document this test reads.
hocr_file = "./hocr_parser/test_hocr/58-1723645_990_201204.html"

# Parser that yields the document one page at a time.
hocr_parser = document_parser(hocr_file, encoding='latin-1')

for this_page in hocr_parser:

    # Representation of the page that includes line numbers and word numbers.
    page = get_words_with_lines_from_page(this_page.getvalue())

    print("Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib']))
    # The geojson assigns ids by word order and keeps line numbers as an attribute.
    print(get_feature_collection(page))
    print("\n\n\n")
Esempio n. 8
0
""" A django-independent test that reads a document and returns geojson files for each page. """

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_with_lines_from_page
from geo_utils.geojson_utils import get_feature_collection

# hOCR sample file to run the test against.
hocr_file = "hocr_parser/test_hocr/58-1723645_990_201204.html"

# Set up a page-by-page parser for the sample file.
hocr_parser = document_parser(hocr_file, encoding='latin-1')

for this_page in hocr_parser:

    # Get the page as words annotated with line numbers and word numbers.
    page = get_words_with_lines_from_page(this_page.getvalue())

    print("Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib']))
    # Emit geojson whose ids follow word order, with line numbers kept as an attribute.
    print(get_feature_collection(page))
    print("\n\n\n")