コード例 #1
0
ファイル: extraction2_0.py プロジェクト: mikpim01/PDF2EXCEL
        try:
            c.value = dict_to_print[key].strip()
        except:
            #Using this area to keep track of encoding errors
            print "This text caused an error"
            print dict_to_print[key]
            c.value = "Raw text contained bad chars, see intepreter."
            #c.value=dict_to_print[key]
        i = i + 1

    wb.save(book_name)
    return book_name


## Have the user pick the document that holds the OCR'ed text.
ocr_text_path = useful.getPath(get_default_directory())

## Hand that path to get_document_list to extract the data.
document_list = get_document_list(ocr_text_path)

## Fetch the utility library for the chosen utility from the utility
## dictionary. The utility is chosen by get_current_utility(); the original
## note says that function currently just returns "Consolidated Edison".
utility_library = get_utility_library(
    get_current_utility(),
    get_utility_library_directory(),
)

## Example use of get_raw_chars (left disabled):
#library_entry_sample=utility_library['extraction_parameters']['G&T Demand1']
#raw_chars=get_raw_chars(document_list[0],library_entry_sample)

## Next step: collect the raw characters. Ideally the flags found by the
## regular expressions would be collected as well.
コード例 #2
0
    filename=useful.getFilenameFromPath(target_pdf)
    filehandle=file(outputFile,'a')
    filehandle.write("TEXT FROM "+str(filename)+" PAGE "+str(i))
    filehandle.write("\n")
    filehandle.write(text_string)
    filehandle.write('\n')
    return outputFile

##----THE ACTUAL PROGRAM----##

## Ask the user for the target PDF.
## Future ideas: process every PDF in a directory, or have the user assemble
## the PDF themselves, because this program still doesn't really know how to
## handle blank pages.

## Imported here so this section does not rely on an `os` import elsewhere.
import os

## The result of getPath is indexed, so only its first entry is the target.
target_pdf = useful.getPath(default_directory)[0]
#print target_pdf
print("---------------------------------------------------")
## os.path.basename replaces target_pdf[target_pdf.rindex('/')+1:], which
## raised ValueError whenever the path contained no '/' at all.
print("THE TARGET PDF FILENAME IS: " + os.path.basename(target_pdf))
#print "---------------------------------------------------"

##  Initializations for Tess
#print "Importing Tess"
api = tesseract.TessBaseAPI()
api.SetOutputName("outputName")
api.Init(".", "eng", tesseract.OEM_DEFAULT)
api.SetPageSegMode(tesseract.PSM_AUTO)

#print "made tess decs"

## Other Initializations 
コード例 #3
0
        c=ws.cell(row=last_occ_row, column=i)
        try:
            c.value=dict_to_print[key].strip()
        except:
            #Using this area to keep track of encoding errors
            print "This text caused an error"
            print dict_to_print[key]
            c.value="Raw text contained bad chars, see intepreter."
            #c.value=dict_to_print[key]
        i=i+1
            
    wb.save(book_name)
    return book_name

## Let the user select the document containing the OCR'ed text.
ocr_text_path = useful.getPath(get_default_directory())

## Extract the data by passing that path to get_document_list.
document_list = get_document_list(ocr_text_path)

## Pull the library for the specified utility out of the utility dictionary.
## The utility comes from get_current_utility(), which the original note
## describes as currently just returning "Consolidated Edison".
utility_library = get_utility_library(
    get_current_utility(),
    get_utility_library_directory(),
)

## Sample call to get_raw_chars (left disabled):
#library_entry_sample=utility_library['extraction_parameters']['G&T Demand1']
#raw_chars=get_raw_chars(document_list[0],library_entry_sample)

## Collect the raw characters; the flags found by the regular expressions
## should ideally be collected too.
match_dict = {}
コード例 #4
0
ファイル: extraction.py プロジェクト: mikpim01/PDF2EXCEL
        refined_results.append(result)
    return refined_results

###--------------------------------------------------------------------------------------------
###--------START OF PROGRAM---------------------------------------------------------------
import os
import useful

## Testing directory.
## Raw string: every backslash here is a literal path separator. Without the
## r prefix, "\U" is read as a unicode escape on Python 3 (a SyntaxError), and
## the other backslash pairs only survive because they are not known escapes.
default_directory = r"C:\Users\James McGlynn\My Programs\Python Programs\pdf2txt\WorkRelated"

## Choose library here
library = refined_ConEd_Lib

## Have the user navigate to the text file
text_file = useful.getPath(default_directory)

## Open the text file and read every line into a list
with open(text_file, 'r') as fh:
    lines = fh.readlines()

## Original note: pull every other line from the list (that is just how the
## document was produced)
document_list = get_document_list(lines)

## Use the library and the text to try to extract the wanted data
refined_results = extract_information(library, document_list)


def print_by_account_number(library,refined_results,text_file):

    ## Openpyxl library imports