def enter_page(doc, page, page_number):
    """Get or create the Page row for one parsed page, then enter its words.

    Args:
        doc: Value used for the ``doc`` lookup on ``Page``.
        page: Mapping with an ``'attrib'`` entry (which holds a ``'title'``
            string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` lookup on ``Page``.

    Raises:
        IndexError: If the title contains no ``';'`` separator.
    """
    title = page['attrib']['title']

    # The bounding box is read from the second ';'-separated field of the
    # title, with the literal "bbox " label stripped out.
    # NOTE(review): presumably an hOCR-style title -- confirm the field order.
    bbox_raw = title.split(';')[1].replace("bbox ", "")
    poly = GEOSGeometry(get_poly_string_from_bbox(bbox_raw))

    # 'defaults' is only applied when the row is created; an existing page
    # keeps whatever page_dimensions it already has.
    this_page, _created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': poly}
    )

    enter_words(this_page.pk, page['words'])
def enter_page(doc, page, page_number, only_enter_new_pages=False):
    """Get or create the Page row for one parsed page and enter its words.

    Args:
        doc: Value used for the ``doc`` lookup on ``Page``.
        page: Mapping with an ``'attrib'`` entry (which holds a ``'title'``
            string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` lookup on ``Page``.
        only_enter_new_pages: When true, words are entered only if this call
            created the page; pages that already existed are left untouched.
            When false (the default), words are always entered.

    Raises:
        IndexError: If the title contains no ``';'`` separator.
    """
    title = page['attrib']['title']

    # The bounding box is read from the second ';'-separated field of the
    # title, with the literal "bbox " label stripped out.
    # NOTE(review): presumably an hOCR-style title -- confirm the field order.
    bbox_raw = title.split(';')[1].replace("bbox ", "")
    poly = GEOSGeometry(get_poly_string_from_bbox(bbox_raw))

    # 'defaults' is only applied when the row is created; an existing page
    # keeps whatever page_dimensions it already has.
    this_page, created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': poly}
    )

    # Enter words when the page is new, or when the caller has not asked to
    # restrict entry to new pages.
    if created or not only_enter_new_pages:
        enter_words(this_page.pk, page['words'])
def enter_page_words_only(doc, page, page_number):
    """Get or create the Page row for one parsed page and enter it as words only.

    Args:
        doc: Value used for the ``doc`` lookup on ``Page``.
        page: Mapping with an ``'attrib'`` entry (which holds a ``'title'``
            string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` lookup on ``Page``;
            also echoed in the progress message.

    Raises:
        AttributeError: If ``bbox_re.search`` returns ``None`` for the title
            (presumably a compiled regex defined elsewhere in the module).
    """
    # Single parenthesised argument: prints the same text on Python 2 and 3.
    print("processing page %s" % page_number)

    title = page['attrib']['title']

    # The first capture group of bbox_re holds the raw bounding-box text.
    bbox_raw = bbox_re.search(title).group(1)
    poly = GEOSGeometry(get_poly_string_from_bbox(bbox_raw))

    # 'defaults' is only applied when the row is created; an existing page
    # keeps whatever page_dimensions it already has.
    this_page, _created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': poly}
    )

    # Read the page as a bunch of words only.
    enter_words_only(this_page.pk, page['words'])
def enter_page_words_only(doc, page, page_number):
    """Get or create the Page row for one parsed page and enter it as words only.

    Args:
        doc: Value used for the ``doc`` lookup on ``Page``.
        page: Mapping with an ``'attrib'`` entry (which holds a ``'title'``
            string) and a ``'words'`` entry.
        page_number: Value used for the ``page_number`` lookup on ``Page``;
            also echoed in the progress message.

    Raises:
        AttributeError: If ``bbox_re.search`` returns ``None`` for the title
            (presumably a compiled regex defined elsewhere in the module).
    """
    # Single parenthesised argument: prints the same text on Python 2 and 3.
    print("processing page %s" % page_number)

    title = page['attrib']['title']

    # The first capture group of bbox_re holds the raw bounding-box text.
    match = bbox_re.search(title)
    bbox_raw = match.group(1)
    poly_string = get_poly_string_from_bbox(bbox_raw)
    poly = GEOSGeometry(poly_string)

    # 'defaults' is only applied when the row is created; an existing page
    # keeps whatever page_dimensions it already has.
    this_page, _created = Page.objects.get_or_create(
        doc=doc,
        page_number=page_number,
        defaults={'page_dimensions': poly}
    )

    # Read the page as a bunch of words only.
    enter_words_only(this_page.pk, page['words'])