from os import listdir, mkdir
from os.path import isdir, join as p_join
from shutil import copyfile, rmtree

# Address, pdf_cropper_for_extraction, pdf_to_text and the supervised
# classifiers are project-local helpers used throughout these snippets.


def page_classification(training_folder, testing_folder):
    # Address(1) is assumed to return a newline-separated list of configured
    # paths; the first entry is the base working directory
    path_extracted = Address(1).split("\n")
    base_dir = path_extracted[0]
    # temporary directories for the split pages and their text conversions
    pages_pdf_dir = p_join(base_dir, 'tmp_pdf')
    pages_txt_dir = p_join(base_dir, 'tmp_txt')
    # remove any leftovers from a previous run
    if isdir(pages_pdf_dir):
        rmtree(pages_pdf_dir)
    if isdir(pages_txt_dir):
        rmtree(pages_txt_dir)
    # labels and training directory
    perfect_label = 'perfect'
    bad_label = 'bad'
    perfect_path = [p_join(training_folder, perfect_label), perfect_label]
    bad_path = [p_join(training_folder, bad_label), bad_label]
    SOURCES = [perfect_path, bad_path]
    # crop test pdf into test pages
    pdf_cropper_for_extraction.pdf_cropper_multiple(testing_folder,
                                                    pages_pdf_dir)
    # convert each folder of test pages to text, then classify the pages
    page_classification_result = []
    for pdf_folder in listdir(pages_pdf_dir):
        page_pdf = p_join(pages_pdf_dir, pdf_folder)
        page_text = p_join(pages_txt_dir, pdf_folder)
        pdf_to_text(page_pdf, page_text)
        classifier_result = supervised_classifier(SOURCES, page_text)
        page_classification_result.extend(classifier_result)
    rmtree(pages_pdf_dir)
    rmtree(pages_txt_dir)
    return page_classification_result
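
# page_classification above delegates the actual labelling to a project-local
# supervised_classifier(SOURCES, test_folder) helper that is not shown on this
# page. A minimal sketch of what such a helper could look like, assuming a
# scikit-learn bag-of-words pipeline; the real implementation may differ:
from os import listdir
from os.path import join as p_join
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def supervised_classifier(sources, test_folder):
    # sources is a list of [folder, label] pairs; every file in a folder is
    # one training document carrying that folder's label
    texts, labels = [], []
    for folder, label in sources:
        for name in listdir(folder):
            with open(p_join(folder, name), errors='ignore') as f:
                texts.append(f.read())
            labels.append(label)
    model = Pipeline([('vec', CountVectorizer()), ('clf', MultinomialNB())])
    model.fit(texts, labels)
    # classify every text file in the test folder and return the labels
    test_texts = []
    for name in sorted(listdir(test_folder)):
        with open(p_join(test_folder, name), errors='ignore') as f:
            test_texts.append(f.read())
    return list(model.predict(test_texts))
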
import os
import shutil


def page_classification(subject):
    path_extracted = Address(1).split("\n")
    base_dir = path_extracted[0]
    # pages are labelled perfect, good and bad; the training set sits in one
    # folder per label under small_training_set_text/<subject>
    perfect_label = 'perfect'
    good_label = 'good'
    bad_label = 'bad'
    # pages_path = os.path.join(base_dir, 'All_pages_text', subject)
    pages_path = os.path.join(base_dir, 'small_training_set_text', subject)
    perfect_path = [os.path.join(pages_path, perfect_label), perfect_label]
    good_path = [os.path.join(pages_path, good_label), good_label]
    bad_path = [os.path.join(pages_path, bad_label), bad_label]
    SOURCES = [perfect_path, good_path, bad_path]
    # clean out leftover test folders from a previous run
    pageDir = os.path.join(base_dir, 'Test_pages')
    textDir = os.path.join(base_dir, 'Test_pages_text')
    for folder in os.listdir(pageDir):
        shutil.rmtree(os.path.join(pageDir, folder))
    for folder in os.listdir(textDir):
        shutil.rmtree(os.path.join(textDir, folder))
    # crop each test pdf into single test pages
    pdfDir = os.path.join(base_dir, 'Test_pdf')
    pdf_cropper_for_extraction.pdf_cropper_multiple(pdfDir, pageDir)
    # convert test pages to text and classify them
    for pdf_folder in os.listdir(pageDir):
        page_pdf = os.path.join(pageDir, pdf_folder)
        page_text = os.path.join(textDir, pdf_folder)
        pdf_to_text(page_pdf, page_text)
        page_classifier_result = supervised_classifier_ngram(SOURCES, page_text)
        print(page_classifier_result)
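
# Both variants call pdf_to_text(pdf_folder, text_folder) to turn a folder of
# single-page PDFs into plain-text files, and the three-label variant swaps in
# supervised_classifier_ngram, presumably the same classifier over word
# n-grams. A minimal sketch of the conversion helper, assuming pdfminer.six as
# the extraction backend; the project's own helper may use a different library:
import os
from pdfminer.high_level import extract_text


def pdf_to_text(pdf_folder, text_folder):
    # mirror every page PDF in pdf_folder as a .txt file in text_folder
    os.makedirs(text_folder, exist_ok=True)
    for name in os.listdir(pdf_folder):
        if not name.lower().endswith('.pdf'):
            continue
        text = extract_text(os.path.join(pdf_folder, name))
        txt_path = os.path.join(text_folder, name[:-4] + '.txt')
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(text)
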
# prepare training and testing set folders
for directory in [page_train, para_train, subpara_train, sentence_train,
                  page_test, para_test, subpara_test, sentence_test]:
    if isdir(directory):
        rmtree(directory)
for directory in [page_test, para_test, subpara_test, sentence_test]:
    mkdir(directory)
# pick the page folders that form fold n of the test split
testing_datasheet = [datasheet_list[k*n + i]
                     for i in range(min(k, len(datasheet_list) - k*n))]
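# the comprehension above is equivalent to datasheet_list[k*n : k*(n+1)], i.e.
# the n-th block of k datasheets, with min() clipping the final, possibly
# shorter fold (e.g. k = 10, n = 3 selects indices 30..39)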
# testing_folder = [paragraph_folder_list[k*n + i]
#                   for i in range(min(k, len(paragraph_folder_list) - k*n))]
# copy the datasheets for testing
for datasheet in testing_datasheet:
    copy_src = p_join(page_total, datasheet)
    copy_dir = p_join(page_test, datasheet)
    copyfile(copy_src, copy_dir)
# crop the test datasheets into single pages and record the page names
mkdir(tmp)
pdf_cropper_for_extraction.pdf_cropper_multiple(page_test, tmp)
testing_page = []
for folder in listdir(tmp):
    folder = p_join(tmp, folder)
    testing_page.extend(listdir(folder))
rmtree(tmp)
# copy the paragraphs for testing; cropped page names look like 'doc_0.pdf'
testing_paragraph = []
for page in testing_page:
    page = page.split('.pdf')[0]
    testing_paragraph_folder = p_join(para_total, page)
    if isdir(testing_paragraph_folder):
        testing_paragraph.extend(listdir(testing_paragraph_folder))
testing_subparagraph = []
for paragraph in testing_paragraph:
    paragraph = paragraph.split('.txt')[0]
    page_number = pfr.getNumPages()
    # for i in range(page_number):
    for i in range(5):  # small page set: only the first five pages per pdf
        page_name = pdf_name + '_' + str(i)
        page_txt = page_name + ".txt"
        label = page_labels[page_name]
        copy_pagetxt = os.path.join(pagetxtDir, label, page_txt)
        shutil.copy(copy_pagetxt, os.path.join(trainingDir, label))
# build partial testing set
for pdf_name in testing_set:
    pdf = pdf_name + ".pdf"
    copy_pdf = os.path.join(all_pdf_folder, pdf)
    shutil.copy(copy_pdf, testingDir)
# crop test pdf into test pages
pdf_cropper_for_extraction.pdf_cropper_multiple(testingDir, testpageDir)
# keep only the small page set: the first five pages of each pdf
for folder in os.listdir(testpageDir):
    keep_page = [folder + "_" + str(i) for i in range(5)]
    for page in os.listdir(os.path.join(testpageDir, folder)):
        if page.split(".pdf")[0] not in keep_page:
            os.remove(os.path.join(testpageDir, folder, page))
# convert test pages to text and count how many pages are classified correctly
for pdf_folder in os.listdir(testpageDir):
    page_pdf = os.path.join(testing_set_page_folder, pdf_folder)
    page_text = os.path.join(testing_set_text_folder, pdf_folder)
    pdf_to_text(page_pdf, page_text)
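
# Every step above that splits documents relies on
# pdf_cropper_for_extraction.pdf_cropper_multiple(src_dir, dst_dir). A minimal
# sketch of that helper, assuming the legacy PyPDF2 API (PdfFileReader /
# getNumPages) that the fragment above also uses; the real module may differ:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter


def pdf_cropper_multiple(src_dir, dst_dir):
    # write each page of every PDF in src_dir into its own one-page PDF,
    # grouped in a per-document subfolder of dst_dir and named like 'doc_0.pdf'
    for pdf in os.listdir(src_dir):
        if not pdf.lower().endswith('.pdf'):
            continue
        name = pdf[:-4]
        out_folder = os.path.join(dst_dir, name)
        os.makedirs(out_folder, exist_ok=True)
        reader = PdfFileReader(os.path.join(src_dir, pdf))
        for i in range(reader.getNumPages()):
            writer = PdfFileWriter()
            writer.addPage(reader.getPage(i))
            out_path = os.path.join(out_folder, name + '_' + str(i) + '.pdf')
            with open(out_path, 'wb') as f:
                writer.write(f)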