def handle(self, *args, **options):
    """Enter every ``.html`` file found under ``SAMPLE_FILE_DIR``.

    Walks ``SAMPLE_FILE_DIR`` with ``os.walk`` and hands each matching file to
    ``enter_document``, using the file name without its ``.html`` suffix as
    the document id.  Only files whose index within their directory listing
    is below ``FILE_ENTRY_CAP`` are entered.  A ``PageCountError`` skips that
    document and processing continues with the next file.

    Prints ``warning_message`` when the walk found no files at all; otherwise
    prints the elapsed wall-clock time.
    """
    html_suffix = ".html"
    start = datetime.now()
    files_found = 0

    for dir_path, _, files in os.walk(SAMPLE_FILE_DIR):
        for i, this_file in enumerate(files):
            # Every file counts as "found", even ones that are skipped below.
            files_found += 1

            # Ignore files that aren't .html in case any got mixed in there;
            # may want to filter on other criteria here too.
            if not this_file.endswith(html_suffix) or i >= FILE_ENTRY_CAP:
                continue

            # Join against the directory currently being walked so files in
            # subdirectories resolve to their real location.
            file_path = os.path.join(dir_path, this_file)
            # Strip only the trailing extension, not every ".html" occurrence.
            doc_id = this_file[:-len(html_suffix)]

            print("Entering file number %s data from: %s" % (i, file_path))
            try:
                enter_document(file_path, doc_id)
            except PageCountError:
                print("Error entering file number %s from %s - entire document skipped. " % (i, file_path))
                # print_exc() writes the traceback itself and returns None,
                # so it must not be passed to print.
                traceback.print_exc()

    if files_found == 0:
        print(warning_message)
    else:
        elapsed = datetime.now() - start
        print("Time elapsed: %s" % (elapsed,))
def handle(self, *args, **options):
    """Enter every ``.html`` file found under ``SAMPLE_FILE_DIR``.

    Each directory yielded by ``os.walk(SAMPLE_FILE_DIR)`` is scanned; files
    ending in ``.html`` whose position in the directory listing is below
    ``FILE_ENTRY_CAP`` are passed to ``enter_document`` together with a
    document id derived from the file name (suffix removed).  When
    ``enter_document`` raises ``PageCountError`` the document is skipped, the
    traceback is printed, and the run continues.

    If no files were seen at all ``warning_message`` is printed; otherwise the
    elapsed wall-clock time is printed.
    """
    html_suffix = ".html"
    start = datetime.now()
    files_found = 0

    for dir_path, _, files in os.walk(SAMPLE_FILE_DIR):
        for i, this_file in enumerate(files):
            # Counted before filtering: the warning below is about an empty
            # sample directory, not about a lack of .html files.
            files_found += 1

            # Ignore files that aren't .html in case any got mixed in there;
            # may want to filter on other criteria here too.
            if this_file.endswith(html_suffix) and i < FILE_ENTRY_CAP:
                # Use the walked directory rather than SAMPLE_FILE_DIR so that
                # files below the top level get a valid path.
                file_path = os.path.join(dir_path, this_file)
                # Remove just the extension at the end of the name.
                doc_id = this_file[:-len(html_suffix)]

                print("Entering file number %s data from: %s" % (i, file_path))
                try:
                    enter_document(file_path, doc_id)
                except PageCountError:
                    print("Error entering file number %s from %s - entire document skipped. " % (i, file_path))
                    # print_exc() already emits the traceback and returns
                    # None; printing its result would output a stray "None".
                    traceback.print_exc()

    if files_found == 0:
        print(warning_message)
    else:
        end = datetime.now()
        elapsed = end - start
        print("Time elapsed: %s" % (elapsed,))
def handle(self, *args, **options):
    """Enter every ``.html`` file found under ``SAMPLE_FILE_DIR`` and time it.

    Walks ``SAMPLE_FILE_DIR`` with ``os.walk``; files ending in ``.html``
    whose index within their directory listing is below ``FILE_ENTRY_CAP``
    are passed to ``enter_document`` with the file name (minus the suffix) as
    the document id.  A ``PageCountError`` causes that document to be skipped
    and its traceback printed; the remaining files are still processed.

    The elapsed wall-clock time is printed once the walk finishes.
    """
    html_suffix = ".html"
    start = datetime.now()

    for dir_path, _, files in os.walk(SAMPLE_FILE_DIR):
        for i, this_file in enumerate(files):
            # Ignore files that aren't .html in case any got mixed in there;
            # may want to filter on other criteria here too.
            if not this_file.endswith(html_suffix) or i >= FILE_ENTRY_CAP:
                continue

            # Join against the directory being walked so nested files get the
            # correct path.
            file_path = os.path.join(dir_path, this_file)
            # Drop only the trailing extension when deriving the id.
            doc_id = this_file[:-len(html_suffix)]

            print("Entering file number %s data from: %s" % (i, file_path))
            try:
                enter_document(file_path, doc_id)
            except PageCountError:
                print("Error entering file number %s from %s - entire document skipped. " % (i, file_path))
                # print_exc() prints the traceback and returns None, so its
                # result is deliberately not printed.
                traceback.print_exc()

    elapsed = datetime.now() - start
    print("Time elapsed: %s" % (elapsed,))
def handle(self, *args, **options):
    """Enter one fixed sample document and print how long that took.

    The sample path is built relative to ``BASE_DIR``; the document id is the
    sample's file name without its ``.html`` extension.
    """
    # Sample file that is expected to be included with the repo.
    sample_path = os.path.join(BASE_DIR, 'hocr_parser/test_hocr/58-1723645_990_201204.html')
    sample_doc_id = "58-1723645_990_201204"

    started_at = datetime.now()
    enter_document(sample_path, sample_doc_id)
    finished_at = datetime.now()

    print("Time elapsed: %s" % (finished_at - started_at))
def handle(self, *args, **options):
    """Enter one fixed sample document and print how long that took.

    The sample path is a hard-coded relative path, so it is resolved against
    the process's current working directory.
    """
    sample_path = "parser/test_hocr/58-1723645_990_201204.html"
    sample_doc_id = "58-1723645_990_201204"

    started_at = datetime.now()
    enter_document(sample_path, sample_doc_id)
    finished_at = datetime.now()

    print("Time elapsed: %s" % (finished_at - started_at))
def handle(self, *args, **options):
    """Enter one fixed sample page into the document collection and time it.

    The page's ``.html`` file is looked up under ``SAMPLE_FILE_DIR`` by its
    document id and entered into ``DOCUMENT_COLLECTION_SLUG`` with
    ``only_enter_new_pages=False``.  The elapsed wall-clock time is printed
    afterwards.
    """
    # Sample file that is hopefully included with the repo.  Another id that
    # has been used here: "1088501-adventuretime-alta-p1".
    sample_doc_id = "14400741771771-p2"
    sample_path = SAMPLE_FILE_DIR + sample_doc_id + ".html"

    started_at = datetime.now()
    enter_document(
        sample_path,
        sample_doc_id,
        DOCUMENT_COLLECTION_SLUG,
        only_enter_new_pages=False,
    )
    finished_at = datetime.now()

    print("Time elapsed: %s" % (finished_at - started_at))