Esempio n. 1
0
    def handle(self, *args, **options):
        """Enter every ".html" file found under SAMPLE_FILE_DIR.

        Walks SAMPLE_FILE_DIR with os.walk and calls enter_document(path,
        doc_id) for each file whose name ends in ".html" and whose index in
        its directory listing is below FILE_ENTRY_CAP.  The doc_id is the
        file name with the ".html" suffix removed.

        A PageCountError raised by enter_document is reported with its
        traceback and that document is skipped; other exceptions propagate.
        If no files at all are found, warning_message is printed; otherwise
        the total elapsed time is printed.

        Note: print is written in single-argument call form, which behaves
        the same as the Python 2 print statement.
        """
        html_suffix = ".html"
        start = datetime.now()
        files_found = 0
        for d, _, files in os.walk(SAMPLE_FILE_DIR):
            for i, this_file in enumerate(files):
                files_found += 1
                # ignore files that aren't .html in case any got mixed in there
                # may want to filter on other criteria here too
                # (i counts every file in this directory, not just .html ones)
                if not this_file.endswith(html_suffix) or i >= FILE_ENTRY_CAP:
                    continue

                # Build the path from the directory os.walk is currently
                # visiting so files in subdirectories resolve correctly.
                file_path = os.path.join(d, this_file)
                # Strip only the trailing suffix, not every ".html" substring.
                doc_id = this_file[:-len(html_suffix)]
                print("Entering file number %s data from: %s" % (i, file_path))

                try:
                    enter_document(file_path, doc_id)
                except PageCountError:
                    print("Error entering file number %s from %s - entire document skipped. " % (
                        i, file_path))
                    # print_exc() writes the traceback itself and returns
                    # None, so it must not be wrapped in print.
                    traceback.print_exc()

        if files_found == 0:
            print(warning_message)
        else:
            end = datetime.now()
            elapsed = end - start
            print("Time elapsed: %s" % (elapsed,))
    def handle(self, *args, **options):
        """Enter every ".html" file found under SAMPLE_FILE_DIR.

        For each directory yielded by os.walk(SAMPLE_FILE_DIR), every file
        whose name ends in ".html" and whose index in that directory's
        listing is below FILE_ENTRY_CAP is passed to
        enter_document(path, doc_id), where doc_id is the file name without
        its ".html" suffix.

        PageCountError from enter_document is caught: the failure and its
        traceback are printed and processing continues with the next file.
        Prints warning_message when no files were found at all, otherwise
        prints the elapsed wall-clock time.

        Note: print is written in single-argument call form, which behaves
        the same as the Python 2 print statement.
        """
        html_suffix = ".html"
        start = datetime.now()
        files_found = 0

        for d, _, files in os.walk(SAMPLE_FILE_DIR):
            for i, this_file in enumerate(files):
                files_found += 1
                # ignore files that aren't .html in case any got mixed in there
                # may want to filter on other criteria here too
                is_html = this_file.endswith(html_suffix)
                if not (is_html and i < FILE_ENTRY_CAP):
                    continue

                # Join with the walked directory (not SAMPLE_FILE_DIR) so
                # files found in nested directories get a valid path.
                file_path = os.path.join(d, this_file)
                # Remove just the suffix; str.replace would also remove any
                # ".html" appearing earlier in the name.
                doc_id = this_file[:-len(html_suffix)]
                print("Entering file number %s data from: %s" % (i, file_path))

                try:
                    enter_document(file_path, doc_id)
                except PageCountError:
                    print("Error entering file number %s from %s - entire document skipped. " % (i, file_path))
                    # print_exc() already emits the traceback and returns
                    # None; printing its result would output "None".
                    traceback.print_exc()

        if files_found == 0:
            print(warning_message)
        else:
            end = datetime.now()
            elapsed = end - start
            print("Time elapsed: %s" % (elapsed,))
    def handle(self, *args, **options):
        """Enter every ".html" file found under SAMPLE_FILE_DIR and time it.

        Walks SAMPLE_FILE_DIR with os.walk.  Each file whose name ends in
        ".html" and whose index in its directory listing is below
        FILE_ENTRY_CAP is passed to enter_document(path, doc_id), with
        doc_id being the file name minus the ".html" suffix.

        PageCountError from enter_document is caught, reported together with
        its traceback, and the document is skipped.  The elapsed time is
        always printed at the end.

        To debug one odd document, call enter_document directly with its
        path and doc_id (e.g. "02-0575282_990EZ_201212"); the previous
        permanently-disabled `if False:` branch that did this was removed.

        Note: print is written in single-argument call form, which behaves
        the same as the Python 2 print statement.
        """
        html_suffix = ".html"
        start = datetime.now()

        for d, _, files in os.walk(SAMPLE_FILE_DIR):
            for i, this_file in enumerate(files):
                # ignore files that aren't .html in case any got mixed in there
                # may want to filter on other criteria here too
                if not this_file.endswith(html_suffix) or i >= FILE_ENTRY_CAP:
                    continue

                # Use the directory currently being walked so that files in
                # subdirectories are opened from where they actually live.
                file_path = os.path.join(d, this_file)
                # Drop only the trailing ".html".
                doc_id = this_file[:-len(html_suffix)]
                print("Entering file number %s data from: %s" % (i, file_path))

                try:
                    enter_document(file_path, doc_id)
                except PageCountError:
                    print("Error entering file number %s from %s - entire document skipped. " % (i, file_path))
                    # print_exc() prints the traceback and returns None, so
                    # it is called on its own rather than inside print.
                    traceback.print_exc()

        end = datetime.now()
        elapsed = end - start
        print("Time elapsed: %s" % (elapsed,))
Esempio n. 4
0
 def handle(self, *args, **options):
     """Run enter_document once on a fixed sample file and print the duration.

     The sample path is resolved relative to BASE_DIR; the file is expected
     to ship with the repository.
     """
     sample_path = os.path.join(BASE_DIR, 'hocr_parser/test_hocr/58-1723645_990_201204.html')
     sample_id = "58-1723645_990_201204"

     # Only the enter_document call itself is timed.
     began = datetime.now()
     enter_document(sample_path, sample_id)
     finished = datetime.now()

     print("Time elapsed: %s" % (finished - began,))
Esempio n. 5
0
    def handle(self, *args, **options):
        """Run enter_document once on a fixed sample file and print the duration.

        The sample path is relative, so it is resolved against the current
        working directory of the process.
        """
        sample_path = "parser/test_hocr/58-1723645_990_201204.html"
        sample_id = "58-1723645_990_201204"

        # Only the enter_document call itself is timed.
        began = datetime.now()
        enter_document(sample_path, sample_id)
        finished = datetime.now()

        print("Time elapsed: %s" % (finished - began,))
Esempio n. 6
0
 def handle(self, *args, **options):
     """Enter one fixed sample page into DOCUMENT_COLLECTION_SLUG and time it.

     The file is looked up as SAMPLE_FILE_DIR + doc_id + ".html" and
     enter_document is called with only_enter_new_pages=False.
     """
     # Another sample id used previously: "1088501-adventuretime-alta-p1"
     # (with a matching 1088501-adventuretime-alta-p1.png).
     sample_id = "14400741771771-p2"
     sample_path = SAMPLE_FILE_DIR + sample_id + ".html"

     # Only the enter_document call itself is timed.
     began = datetime.now()
     enter_document(
         sample_path,
         sample_id,
         DOCUMENT_COLLECTION_SLUG,
         only_enter_new_pages=False,
     )
     finished = datetime.now()

     print("Time elapsed: %s" % (finished - began,))