Exemple #1
0
def document_page_geojson(request, doc_id, page_number):
    """Serve the words of a single document page as a JSON response.

    The page is looked up by the owning document's ``document_id`` and by
    ``page_number``; ``get_object_or_404`` raises ``Http404`` when nothing
    matches.  The ``text``, ``bbox`` and ``line_num`` values of every word
    on that page are handed to ``get_feature_collection`` and its result is
    returned as the response body.
    """
    matched_page = get_object_or_404(
        Page, doc__document_id=doc_id, page_number=page_number)
    word_rows = PageWord.objects.filter(page_pk=matched_page.pk)
    word_rows = word_rows.values('text', 'bbox', 'line_num')
    # get_feature_collection receives a dict carrying the rows under 'words'.
    payload = get_feature_collection({'words': word_rows})
    # todo: add the page bounding box.
    return HttpResponse(payload, content_type="application/json")
Exemple #2
0
def document_page_geojson(request, doc_id, page_number):
    """Return the feature collection built from one page's words.

    ``doc_id`` and ``page_number`` identify the page; a missing page makes
    ``get_object_or_404`` raise ``Http404``.  The response is sent with the
    ``application/json`` content type.
    """
    page_lookup = {'doc__document_id': doc_id, 'page_number': page_number}
    page_obj = get_object_or_404(Page, **page_lookup)

    # Only the columns needed to build the features are selected.
    words = PageWord.objects.filter(page_pk=page_obj.pk)
    words = words.values('text', 'bbox', 'line_num')

    geojson_body = get_feature_collection(dict(words=words))
    # todo: add the page bounding box.
    return HttpResponse(geojson_body, content_type="application/json")
 def handle(self, *args, **options):
     """Print a feature collection for every sample .html file.

     Walks ``SAMPLE_FILE_DIR`` recursively.  For each file whose name ends
     in ``.html`` the first document returned by the parser is converted to
     words-with-lines and the resulting feature collection is printed.
     ``args`` and ``options`` are accepted but not used.
     """
     for dirpath, _, filenames in os.walk(SAMPLE_FILE_DIR):
         for filename in filenames:
             # ignore files that aren't .html in case any got mixed in there
             # may want to filter on other criteria here too
             if not filename.endswith(".html"):
                 continue

             # Join with the directory currently being walked so files in
             # subdirectories resolve correctly, and so SAMPLE_FILE_DIR does
             # not need a trailing path separator.
             file_path = os.path.join(dirpath, filename)
             print("Handling %s" % (file_path))

             parser = document_parser(file_path, encoding='latin-1')
             # Only the first document of each file is processed.
             first_page = parser.next_document()
             page = get_words_with_lines_from_page(first_page.getvalue())
             fc = get_feature_collection(page)
             print(fc)
 def handle(self, *args, **options):
     """Dump the feature collection of each sample .html file to stdout.

     Every directory under ``SAMPLE_FILE_DIR`` is visited.  Files whose
     name does not end in ``.html`` are skipped; for the rest, the first
     document produced by the parser is turned into a feature collection
     and printed.  ``args`` and ``options`` are not used.
     """
     for current_dir, _, names in os.walk(SAMPLE_FILE_DIR):
         # ignore files that aren't .html in case any got mixed in there
         # may want to filter on other criteria here too
         html_names = [name for name in names if name.endswith(".html")]
         for name in html_names:
             # Build the path from the directory os.walk is currently in;
             # plain concatenation with SAMPLE_FILE_DIR pointed at the wrong
             # place for files found in subdirectories.
             file_path = os.path.join(current_dir, name)
             print("Handling %s" % (file_path))

             parser = document_parser(file_path, encoding='latin-1')
             # next_document() is called once, so only the first document
             # of the file is handled.
             first_page = parser.next_document()
             page = get_words_with_lines_from_page(first_page.getvalue())
             fc = get_feature_collection(page)
             print(fc)
Exemple #5
0
def document_page_geojson(request, slug, doc_slug, page_number):
    """Return one page's words as a JSON-encoded feature collection.

    ``slug`` names the collection, ``doc_slug`` the document inside it and
    ``page_number`` the page.  ``get_object_or_404`` raises ``Http404``
    when the document is not found in that collection or the page is not
    found on that document.
    """
    # Validates that the document exists inside the requested collection.
    this_document = get_object_or_404(
        Document,
        document_slug=doc_slug,
        document_collection__collection_slug=slug)
    # Look the page up through the document resolved above, so a document
    # with the same slug in a different collection can never be matched.
    # NOTE(review): assumes Page.doc is a foreign key to Document, as the
    # previous ``doc__document_slug`` lookup implies -- confirm.
    this_page = get_object_or_404(
        Page, doc=this_document, page_number=page_number)
    this_page_words = PageWord.objects.filter(page_pk=this_page.pk).values(
        'text', 'bbox', 'line_num')
    featurecollection = get_feature_collection(this_page_words)

    ## Add additional attributes needed. This may or may not break the geojsonspec.
    # Both values below are placeholders.
    featurecollection['bbox'] = 'blah'
    featurecollection['background_image'] = 'blahblah'

    output = geojson.dumps(featurecollection)
    # todo: add the page bounding box.
    return HttpResponse(output, content_type="application/json")
Exemple #6
0
def document_page_geojson(request, slug, doc_slug, page_number):
    """Serve the words of a page as a JSON-encoded feature collection.

    The document is resolved by ``doc_slug`` within the collection named by
    ``slug``; the page is then resolved by ``page_number`` on that
    document.  Either lookup raises ``Http404`` through
    ``get_object_or_404`` when it finds nothing.
    """
    # Fails with a 404 unless the document belongs to the given collection.
    this_document = get_object_or_404(
        Document,
        document_slug=doc_slug,
        document_collection__collection_slug=slug)
    # Tie the page to the document just validated; matching on the slug
    # alone could pick up a page from a same-slug document elsewhere.
    # NOTE(review): relies on Page.doc being a relation to Document, which
    # the earlier ``doc__document_slug`` lookup suggests -- confirm.
    this_page = get_object_or_404(Page,
                                  doc=this_document,
                                  page_number=page_number)
    this_page_words = PageWord.objects.filter(page_pk=this_page.pk).values(
        'text', 'bbox', 'line_num')
    featurecollection = get_feature_collection(this_page_words)

    ## Add additional attributes needed. This may or may not break the geojsonspec.
    # The two values assigned here are placeholders.
    featurecollection['bbox'] = 'blah'
    featurecollection['background_image'] = 'blahblah'

    output = geojson.dumps(featurecollection)
    # todo: add the page bounding box.
    return HttpResponse(output, content_type="application/json")
""" A django-independent test that reads a document and returns geojson files for each page. """

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_with_lines_from_page
from geo_utils.geojson_utils import get_feature_collection

# Sample document read by this test
hocr_file = "./hocr_parser/test_hocr/58-1723645_990_201204.html"

# Parser that yields the pages of the sample document
hocr_parser = document_parser(hocr_file, encoding='latin-1')

for this_page in hocr_parser:
    # Page representation that includes line numbers and word numbers
    page_markup = this_page.getvalue()
    page = get_words_with_lines_from_page(page_markup)

    banner = "Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib'])
    print(banner)
    # Geojson that assigns id by word order and keeps line numbers as an attribute
    page_geojson = get_feature_collection(page)
    print(page_geojson)
    print("\n\n\n")
Exemple #8
0
""" A django-independent test that reads a document and returns geojson files for each page. """

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_with_lines_from_page
from geo_utils.geojson_utils import get_feature_collection

# Path of the document this test reads
hocr_file = "hocr_parser/test_hocr/58-1723645_990_201204.html"

# Set up the parser for that document
hocr_parser = document_parser(hocr_file, encoding='latin-1')

for this_page in hocr_parser:
    # Build the page representation carrying line numbers and word numbers
    page = get_words_with_lines_from_page(this_page.getvalue())

    print("Processing page %s -- now dumping geojson to stdout\n\n" % (page['attrib']))

    # The geojson assigns id by word order and preserves line numbers as an attribute
    feature_collection = get_feature_collection(page)
    print(feature_collection)

    print("\n\n\n")