Example #1
def convert_SRLDC():
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    print(yesterday.strftime("%d%m%y"))
    ystrday = str(yesterday.strftime("%d-%m-%Y"))
    print(ystrday)
    df = tabula.read_pdf("SRLDC" + ystrday + ".pdf",
                         encoding="cp932",
                         pages="all",
                         multiple_tables=True,
                         lattice=True)
    tabula.convert_into("SRLDC" + ystrday + ".pdf",
                        "SRLDC" + ystrday + ".csv",
                        output_format="csv",
                        multiple_tables=True,
                        lattice=True,
                        pages="all")
    REG_AV = pd.DataFrame(df[0])
    State_Demand = pd.DataFrame(df[1])
    S_D_Energy_Forecast = pd.DataFrame(df[2])
    GEN = pd.DataFrame(df[3])
    GEN_2 = pd.DataFrame(df[4])
    GEN_3 = pd.DataFrame(df[5])
    GEN_4 = pd.DataFrame(df[6])
    IRE_HVDC_Physical_Flows = pd.DataFrame(df[7])
    IRE_SCH_WHELNG_UI_TLCHR = pd.DataFrame(df[8])
    FREQ_Profile = pd.DataFrame(df[9])
    VOL_Critical_Sub_Station = pd.DataFrame(df[10])
    Maj_Res_Particulars = pd.DataFrame(df[11])
    Overdrawls_below_49HZ_Constituents = pd.DataFrame(df[12])
    Overdrawls_below_49HZ_Generators = pd.DataFrame(df[13])
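    # Note: read_pdf(..., multiple_tables=True) already returns a list of
    # pandas DataFrames, so the pd.DataFrame(...) wrappers above only copy
    # the extracted tables into named variables.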
Example #2
def convert_WRLDC():
    today = datetime.datetime.now()
    today = str(today.strftime("%d%m%Y"))
    print(today)
    df = tabula.read_pdf("WRLDC" + today + ".pdf",
                         encoding="cp932",
                         pages="all",
                         multiple_tables=True,
                         lattice=True)
    tabula.convert_into("WRLDC" + today + ".pdf",
                        "WRLDC" + today + ".csv",
                        output_format="csv",
                        multiple_tables=True,
                        lattice=True,
                        pages="all")
    REQ_WR = pd.DataFrame(df[0])
    FREQ = pd.DataFrame(df[1])
    G_D_S_in_CA = pd.DataFrame(df[2])
    S_D_M = pd.DataFrame(df[3])
    REG = pd.DataFrame(df[4])
    REG_IPP = pd.DataFrame(df[5])
    REG_IPP_2 = pd.DataFrame(df[6])
    IRE = pd.DataFrame(df[7])
    VOL_Profile_765kV = pd.DataFrame(df[8])
    VOL_Profile_400kV = pd.DataFrame(df[9])
    STOA_BI_PX = pd.DataFrame(df[10])
    State_Genrtrs = pd.DataFrame(df[11])
    State_Genrtrs_2 = pd.DataFrame(df[12])
    Z_C_UI_S = pd.DataFrame(df[13])
Example #3
def convert_pdf_to_csv(path):
    """Converts the pdf to csv and saves the csv

    Parameters
    ----------
    path : str
        The file location of the pdf

    """

    filename_wo_ext = os.path.splitext(os.path.basename(path))[0]
    tabula.convert_into(path,
                        OUTPUT_NAME1,
                        format='csv',
                        stream=True,
                        pages=1,
                        area=(TOP1, LEFT1, TOP_HEIGHT1, LEFT_WIDTH1))
    tabula.convert_into(path,
                        OUTPUT_NAME2,
                        format='csv',
                        stream=True,
                        pages=1,
                        area=(TOP2, LEFT2, TOP_HEIGHT2, LEFT_WIDTH2))
    df1 = pd.read_csv(OUTPUT_NAME1)
    df2 = pd.read_csv(OUTPUT_NAME2, header=0, names=NAMES)
    pd.concat([df1, df2], axis=1).to_csv(OUTPUT_NAME, index=False)
    os.remove(OUTPUT_NAME1)
    os.remove(OUTPUT_NAME2)
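# Note: tabula's "area" option takes coordinates as (top, left, bottom, right)
# in PDF points, so TOP_HEIGHT1/LEFT_WIDTH1 are presumably the bottom and right
# edges (top + height, left + width) of the region to extract.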
Example #4
def convert_NRLDC():
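    # ystrday is assumed to be defined elsewhere (e.g. yesterday's date
    # formatted as a string, as in convert_SRLDC above); it is not set here.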
    df = tabula.read_pdf("NRLDC" + ystrday + ".pdf",
                         encoding="cp932",
                         pages="all",
                         multiple_tables=True,
                         lattice=True)
    tabula.convert_into("NRLDC" + ystrday + ".pdf",
                        "NRLDC" + ystrday + ".csv",
                        output_format="csv",
                        multiple_tables=True,
                        lattice=True,
                        pages="all")
    R_A_D = pd.DataFrame(df[0])
    S_L_D = pd.DataFrame(df[1])
    S_D_M = pd.DataFrame(df[2])
    REG_ENT = pd.DataFrame(df[3])
    State_ENT = pd.DataFrame(df[4])
    State_ENT_2 = pd.DataFrame(df[5])
    T_Hydro_GEN = pd.DataFrame(df[6])
    T_Ren_GEN = pd.DataFrame(df[7])
    IRE = pd.DataFrame(df[8])
    IR_S_ACT_EX = pd.DataFrame(df[9])
    IR_Analysis = pd.DataFrame(df[10])
    IRE_Nepal = pd.DataFrame(df[11])
    FREQ_Profile = pd.DataFrame(df[12])
    FREQ_Profile_3 = pd.DataFrame(df[13])
    VOL_Profile_400kV = pd.DataFrame(df[14])
    VOL_Profile_765kV = pd.DataFrame(df[15])
    Res_parameters = pd.DataFrame(df[16])
    STOA = pd.DataFrame(df[17])
    STOA_2 = pd.DataFrame(df[18])
    Sys_Rel_Indics = pd.DataFrame(df[19])
    Sys_Rel_Indics_2 = pd.DataFrame(df[20])
    Z_C_violations = pd.DataFrame(df[21])
Example #5
def convert_into_csv(filenames, output_dir, ext='pdf', table=[]):
    '''
    Converts `pdf/xls/xlsx` files to `csv`.
    Also writes a `csv` file from a list
    '''
    if len(table) != 0:
        for filename in filenames:
            filename = os.path.join(output_dir, filename)
            with open(filename, 'w') as f:
                writer = csv.writer(f)
                writer.writerows(table)
        return

    if ext == 'pdf':
        for filename in filenames:
            filename = os.path.join(output_dir, filename)
            tabula.convert_into(
                filename,
                filename.replace(ext, 'csv'),
                lattice=True,
                pages='all'
            )

    elif ext in ['xls', 'xlsx']:
        for filename in filenames:
            filename = os.path.join(output_dir, filename)
            excel_file = pd.read_excel(filename)
            excel_file.to_csv(filename.replace(ext, 'csv'),
                              index=None,
                              header=True)
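# A minimal usage sketch (paths below are hypothetical; the input files are
# expected to live inside output_dir, since the function joins the two):
#     convert_into_csv(['report.pdf'], '/tmp/files')              # -> /tmp/files/report.csv
#     convert_into_csv(['report.xlsx'], '/tmp/files', ext='xlsx')
#     convert_into_csv(['rows.csv'], '/tmp/files', table=[['a', 1], ['b', 2]])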
Example #6
def read_table(file, user):
    tabula.convert_into(file, 'file.csv', output_format='csv', pages='all')
    head = [
        'Txn Date', 'Value Date', 'Description', 'Ref No./Cheque No.', 'Debit',
        'Credit', 'Balance'
    ]
    with open('file.csv') as f:
        data = csv.reader(f)
        data = list(data)
        row0 = data[0]
        m = len(row0)
        i = 0
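        # rows whose last (Balance) column is empty are wrapped continuations
        # of the previous statement line, so their cells are merged into it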
        while i < len(data):
            f = data[i]
            i += 1
            while i < len(data):
                rowi = data[i]
                if rowi[-1] != '':
                    break
                for j in range(m):
                    if rowi[j] != '':
                        f[j] = f[j] + ' ' + rowi[j]
                i += 1
            if f == head:
                print(f)
            else:
                entry = StatEntry(txn_date=f[0],
                                  val_date=f[1],
                                  description=f[2],
                                  ref_no=f[3],
                                  debit=f[4],
                                  credit=f[5],
                                  balance=f[6],
                                  user=user)
                entry.save()
Example #7
def df2csv(filename, page):

    tabula.convert_into(filename,
                        r"E:\scrapy\json\output.csv",
                        output_format="csv",
                        pages=page)
    print("Convert Complete!")
def translatepdf():
    """
    Use tabula to convert pdf file into .csv
    """
    list_of_files = glob.glob('/Users/shaneshimizu/Downloads/*')
    latest_file = max(list_of_files, key=os.path.getctime)
    validateFile = input("name this file: ")
    schoolCode = input("School Code: ")
    os.system(f"open '{latest_file}'")

    if validateFile:
        dateRange = validateFile
        pathToFile = f'/Users/{username}/Desktop/service_billings/tabula_csv/'
        fileName = f'Work_Order_{dateRange}.csv'
        joinPath = os.path.join(pathToFile, fileName)
        #read recent file in download, read only set area, output into a csv file
        try:
            data = read_pdf(latest_file, pages='all')
            tabula.convert_into(latest_file,
                                joinPath,
                                guess=False,
                                stream=True,
                                area=(18.05, 17.9, 568.49, 756.57),
                                output_format="csv",
                                pages='all')
        #most recent file is not an accepted file for conversion
        except:
            print("not a fleet report, check recent download")
            return
        workBookName = joinPath
        time.sleep(1)
        executeAutomation(workBookName, schoolCode, dateRange)
    else:
        exit()
Example #9
def translatepdf():
    """
    use tabula to convert pdf file into .csv
    """
    list_of_files = glob.glob(
        f'/Users/{username}/Downloads/*'
    )  # * matches everything; use e.g. *.csv if a specific format is needed
    latest_file = max(list_of_files, key=os.path.getctime)
    validateFile = input("name this file: ")
    schoolCode = input("School Code: ")
    os.system(f"open '{latest_file}'")

    if validateFile:
        dateRange = validateFile
        #read recent file in download, read only set area, output into a csv file
        try:
            pathToFile = f'/Users/{username}/Desktop/service_billings/tabula_csv/'
            fileName = f'Fuel_{dateRange}.csv'
            joinPath = os.path.join(pathToFile, fileName)
            data = read_pdf(latest_file, pages='all')
            tabula.convert_into(latest_file,
                                joinPath,
                                output_format="csv",
                                pages='all')
        #most recent file is not an accepted file for conversion
        except:
            print("not a fleet report, check recent download")
            return
        workBookName = joinPath
        time.sleep(1)
        executeAutomation(workBookName, schoolCode, dateRange)
    else:
        exit()
Example #10
def getTablepdf():
    # df = read_pdf('C:/Users/wenji/Desktop/subway.pdf', multiple_tables=True)
    tabula.convert_into('C:/Users/wenji/Desktop/subway.pdf',
                        "C:/Users/wenji/Desktop/result/output.csv",
                        output_format="csv",
                        pages="all",
                        multiple_tables=True)
Example #11
def extract_info(tiliote):
    tapahtumat_list = list()
    nested_list = list()
    if not path.exists("sptili.csv"):
        tabula.convert_into("stili.pdf",
                            "sptili.csv",
                            output_format="csv",
                            pages='all')

    with open("sptili.csv", "r") as f:
        read_file = f.read()

        tapahtumat_object = re.compile(
            r'\d*,(\d{4})(.*),,\d+,,"(\d*\.?\d*\.?\d{1,3},\d{2}[+-])"'
        )  #read only line with amount
        matches = tapahtumat_object.finditer(read_file)
        for match in matches:
            print("function", match)
            # tapahtumat_list.append(match)
            for group_index in range(1, len(match.groups()) + 1):
                nested_list.append(match.group(group_index))
            tapahtumat_list.append(nested_list)
            nested_list = list()
        # remove("accounts/bank_statements/{date}_{user}_tili.csv")
    for i in tapahtumat_list:
        print(i[0][:2])
Example #12
def get_tables():
    # tables = camelot.read_pdf(file, pages = "1-end")
    # tables.export("tables/output_tables.csv", f = "csv")
    tabula.convert_into(file,
                        "tables/output.csv",
                        output_format='csv',
                        pages='all')
def with_table(path, out, png_path, page='all'):
    try:
        tabula.convert_into(path, out, output_format="csv", pages=page)
    except:
        return 'This pdf is not parsable!'
    # return tabula.read_pdf(pdf_path, header=-1, pages=page)
    df = pd.read_csv(out, encoding='utf-8', header=None)
    df = df.drop(cleanRows(df), axis=0)
    df = df.reset_index()
    del df['index']
    df = df.drop(longColumns(df), axis=1)
    df = df.drop(longRows(df), axis=0)
    df = move_nans(df)
    df.to_csv(out, encoding='utf-8', header=0, index=False)

    row, column = firstCell(df)
    print(row, column)

    if not (3 > row > 0 and 3 > column > 0):
        return 'No table is detected in the pdf.'

    render_mpl_table(df, header=(row, column), col_width=1.8)
    plt.savefig(png_path)
    # plt.show()
    return df
Example #14
def pdf_scanner(file_path, scanner="pdfplumber", verbose=False):
    """  
    hvis tabula ikke virker til at skanne pdf prøves med pdfpumber
    """
    if scanner == "tabula":
        try:
            tabula.convert_into(file_path,
                                file_path + ".csv",
                                all=True,
                                pages='all')
        except Exception as exc:
            if verbose:
                print(
                    'Exception - Fejl i konvertering af pdf til csv-fil (pdfplumber): %s'
                    % (exc))
            return "Fejl i tabula skriving af csv fil"

    else:
        try:
            manuel_skanning(file_path, verbose)
        except Exception as exc:
            if verbose:
                print(
                    'Exception - Fejl i konvertering af pdf til csv-fil (pdfplumber): %s'
                    % (exc))
            return "Fejl i manuel skriving af csv fil"
Example #15
def it_new_data():
    from tabula import convert_into
    import pandas as pd
    ## Somehow tabula did not manage to extract,
    ## but it can convert into csv directly
    URL = it_link_update()
    convert_into(URL,
                 "./granular_cases_europe/it_tmp.csv",
                 output_format="csv")
    italy_updated = pd.read_csv('./granular_cases_europe/it_tmp.csv',
                                skiprows=2,
                                thousands='.')
    totals = [
        x for x in range(len(italy_updated.iloc[:, 0]))
        if italy_updated.iloc[:, 0][x] == "TOTALE"
    ]
    italy_updated = italy_updated.drop(
        range(totals[0],
              len(italy_updated.iloc[:, 0])))  #.reset_index(drop = True)
    italy = pd.DataFrame({
        "country": "Italy",
        "region": italy_updated.iloc[:, 0],
        "confirmed_infected": italy_updated.iloc[:, 7],
        "dead": italy_updated.iloc[:, 6],
        "recovered": italy_updated.iloc[:, 5]
    })
    return (italy)
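# Note: thousands='.' lets pandas parse Italian-formatted numbers (e.g. 1.234),
# and the drop() above removes everything from the first "TOTALE" row onward so
# that only per-region rows remain.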
Example #16
def PDFcheck(mail, _verbose=None):
    domain = mail.split("@")[1]
    term = "site:" + domain + " filetype:PDF intext:" + '"' + "email" + '"'
    try:
        data = search(term, num_results=5)
        for i in data:
            r = requests.get(i, stream=True)
            with open('data.pdf', 'wb') as f:
                f.write(r.content)
            pdfFileObj = open('data.pdf', 'rb')
            for pageNumber in range(1, 3):
                tabula.convert_into("data.pdf",
                                    "out.txt",
                                    pages=pageNumber,
                                    silent=True)
                file = open("out.txt", "r", encoding="utf-8")
                read = file.read()
                findPDFs = re.findall(r'[\w\.-]+@[a-z0-9\.-]+', read)
                try:
                    if (findPDFs[0] is not None):
                        for pdfs in findPDFs:
                            print(pdfs)
                except:
                    pass
            pdfFileObj.close()
            file.close()
            if os.path.exists("data.pdf"):
                os.remove("data.pdf")
            if os.path.exists("out.txt"):
                os.remove("out.txt")
    except:
        print("PDF Search error!")
Example #17
def downPDF(request):

    #  building a unique name for the schedule file being downloaded
    fname = '{}{}{}'.format("sched/schedule_",
                            str(datetime.now().strftime("%Y%d%m")),
                            randomString().upper())

    # file name definition
    fnameCSV = os.path.join(str(settings.MEDIA_ROOT),
                            '{}{}'.format(fname, '.csv'))
    print('fnameCSV > {}'.format(fnameCSV))
    fnamePDF = os.path.join(str(settings.MEDIA_ROOT),
                            '{}{}'.format(fname, '.pdf'))
    print('fnamePDF > {}'.format(fnamePDF))

    #  URL from where the FILE will be downloaded
    url = "http://localhost:9003/"
    # url = "http://webservices.globalterminalscanada.com/sites/default/files/DPVesselSchedule.pdf"

    # save the PDF file
    urlretrieve(url, fnamePDF)

    # 2) convert PDF into CSV file
    tabula.convert_into(fnamePDF, fnameCSV, output_format="csv", pages='all')

    # 3) Saving file names to Database
    dwnld = SchedFILE(uploaded_by=request.user,
                      fnamePDF=fnamePDF,
                      fnameCSV=fnameCSV)
    dwnld.save()

    return dwnld.fnameCSV
Example #18
def pdf_to_csv(download_path):
    """
    Return the newest PDF file in the folder given by the path string as a DataFrame

    Parameters
    ----------
    download_path : str
        path string

    Returns
    -------
    df : pandas.DataFrame
        DataFrame generated from the PDF file
    """
    # get the newest file from the given folder
    list_of_files = glob.glob(download_path)
    latest_pdf_file = max(list_of_files, key=os.path.getctime)
    # convert to CSV with tabula
    latest_csv_file = latest_pdf_file.replace('.pdf', '.csv')
    tabula.convert_into(latest_pdf_file,
                        latest_csv_file,
                        pages="all",
                        output_format="csv")
    # pandas
    df = pd.read_csv(latest_csv_file)

    return df
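# Note: download_path is passed straight to glob.glob(), so it should already
# contain a wildcard, e.g. '/path/to/Downloads/*.pdf' (hypothetical pattern).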
Example #19
def translatepdf():
    """
    use tabula to convert pdf file into .csv
    """
    list_of_files = glob.glob(
        '/Users/shaneshimizu/Downloads/*'
    )  # * matches everything; use e.g. *.csv if a specific format is needed
    latest_file = max(list_of_files, key=os.path.getctime)
    validatefile = input("name this file: ")
    print("\n")
    #validatefile = input("Is " + latest_file + " the file you want to process?" + "\n type yes or no: ")

    if validatefile:
        dateRange = validatefile
        #dateRange = input("Please enter the date range for this report (use underscore instead of spaces): ")
        try:
            data = read_pdf(latest_file, pages='all')
            tabula.convert_into(latest_file,
                                f'Fleet_Invoice_{dateRange}.csv',
                                guess=False,
                                stream=True,
                                area=(54.85, 15.76, 775.07, 595.18),
                                output_format="csv",
                                pages='all')
        except:
            print("not a fleet report, check recent download")
            return
        workbookname = f'Fleet_Invoice_{dateRange}.csv'
        #time.sleep(1)
        executeAutomation(workbookname)
    else:
        exit()
Example #20
def pdf2csv(input_file, output_file, verbose=False):
    ''' pdf2csv() - Read a PDF file and write tables to a csv file

    Parameters
    ----------
    input_file : str
        name of input PDF file

    output_file : str
        name of output csv file

    verbose : bool, default=False
        turn command-line output on or off

    Returns
    -------
    nothing

    '''
    import tabula

    tabula.convert_into(input_file, output_file, output_format='csv', pages='all')
    if verbose:
        print(f'  Wrote table(s) in {input_file} to {output_file}')
    return
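# A minimal usage sketch (file names are hypothetical):
#     pdf2csv('report.pdf', 'report_tables.csv', verbose=True)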
Example #21
def oldPdfToCsv(inp_file):
    top = 60
    left = 40
    width = 744
    height = 912

    y1 = top
    x1 = left
    y2 = top + height
    x2 = left + width

    out_file = 'out2.csv'

    tabula.convert_into(inp_file,
                        out_file,
                        stream=True,
                        output_format='csv',
                        pages="11")

    df = pd.read_csv('out2.csv', header=None)
    # df.Name.apply(lambda x: pd.Series(str(x).split("_")))

    ndf = df[0].apply(lambda x: pd.Series(num_name(x)))
    ndf[[2, 3, 4, 5, 6]] = df[[1, 2, 3, 5, 6]]
    ndf.to_csv('out2.csv', index=False, header=False)
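# Note: top/left/width/height (and y1, x1, y2, x2) are computed above but never
# passed to convert_into; presumably they were meant to be supplied as an
# area=(y1, x1, y2, x2) argument restricting extraction to that page region.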
Example #22
def pdf2Csv(pdfpath, fileName, output_path):
    pdfName = pdfpath + fileName
    csvName = output_path + fileName + '.csv'
    # Read pdf into DataFrame
    df = tabula.read_pdf(pdfName, pages='all')

    # convert PDF into CSV
    tabula.convert_into(pdfName, csvName, output_format="csv", pages='all')
    data = readCsv(output_path + fileName)
    new_data = []
    for i in range(len(data)):
        row = data[i]
        new_row = []
        for j in range(len(row)):
            column = row[j]
            if j == 1:
                if isDate(column.lower()):
                    new_row[0] = column
                    column = ""
            new_row.append(column)

        new_data.append(new_row)

    with open('b.csv', 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(new_data)
def parse_statement_pdf(creditor, path):
    result = []
    temp_file_path = "test.csv"
    unsanitized = []
    pdf = Pdf(path, creditor)
    pgs = pdf.get_page_range()
    relative_area = pdf.get_relative_area_percentages()
    convert_into(path,
                 temp_file_path,
                 output_format="csv",
                 pages=pgs,
                 guess=False,
                 area=relative_area,
                 relative_area=True)
    with open(temp_file_path, 'r') as f:
        reader = csv.reader(f)
        unsanitized = list(reader)
    for line in unsanitized:
        sanitized_line = []
        filtered_line = list(filter(None, line))  # removes empty elements
        # Check if transaction entry
        date_name_field = filtered_line[0]
        if type(date_name_field) is str:
            date_name_list = date_name_field.split(' ', 1)
            try:
                datetime.datetime.strptime(date_name_list[0], "%m/%d")
                sanitized_line.extend(date_name_list)
                price_field = filtered_line[1]
                sanitized_line.append(eval(price_field.replace(',', '')))
                result.append(sanitized_line)
                print(sanitized_line)
            except ValueError as err:
                continue
    return result
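# Note: with relative_area=True, tabula interprets the area values as
# percentages of the page instead of absolute PDF points.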
Example #24
def extract_info(tiliote, user, date, bank):
    tapahtumat_list = list()
    nested_list = list()
    tabula.convert_into(tiliote,
                        f"accounts/bank_statements/{date}_{user}_tili.csv",
                        output_format="csv",
                        pages='all')

    with open(f"accounts/bank_statements/{date}_{user}_tili.csv", "r") as f:
        read_file = f.read()
        if bank == "nordea":
            tapahtumat_object = re.compile(
                r'(\d{2}\.\d{2}\s\d{2}\.\d{2})(.*?)(,,,,,+)"(\d*\.?\d*\.?\d{1,3},\d{2}[+-])"'
            )  #read only line with amount
        elif bank == "sp":
            tapahtumat_object = re.compile(
                r'\d*,(\d{4})(.*),,\d+,,"(\d*\.?\d*\.?\d{1,3},\d{2}[+-])"')
        matches = tapahtumat_object.finditer(read_file)
        for match in matches:
            print("function", match)
            # tapahtumat_list.append(match)
            for group_index in range(1, len(match.groups()) + 1):
                nested_list.append(match.group(group_index))
            tapahtumat_list.append(nested_list)
            nested_list = list()
        # remove("accounts/bank_statements/{date}_{user}_tili.csv")
    return tapahtumat_list
Example #25
def convert(inputfilepath, inputfilename):
    ''' function to convert pdf file to csv
        input => fully qualified file name
    '''
    logger.debug("going to convert pdf file to csv")
    fileInputPath = inputfilepath + "/" + inputfilename
    validateFile(fileInputPath)
    createFolder(outputPath)

    logger.debug("pdf file validation done")
    milliseconds = int(round(time.time() * 1000))

    fileTempPath = outputPath + "temp_" + str(milliseconds) + ".csv"

    outputFile = inputfilename.split(".")[0] + "_" + str(milliseconds) + ".csv"
    fileOutputPath = outputPath + outputFile

    tabula.convert_into(fileInputPath, fileTempPath, stream=True)
    logger.debug("pdf file to csv done")

    try:
        output = pandas.read_csv(fileTempPath, header=None)
        arr = output[2].str.split(" ", n=1, expand=True)
        output[2] = arr[0]
        output[3] = arr[1]
        output.to_csv(fileOutputPath, index=False, header=False)
        logger.debug("csv data is cleaned")
    except Exception as e:
        logger.debug("error while data cleaning" + str(e))
        raise incorrectPdfFile

    return outputFile
Beispiel #26
0
def ImportMatricula(ruta, usuario):
    # Creating a pdf file object
    try:
        convert_into(ruta, 'media/conversion.csv', pages='all')
        safe = pd.read_csv('media/conversion.csv', encoding='latin-1')
        logging.debug(str(safe.columns[2]))
        if (str(safe.columns[2]) == 'DATOS DE MATRICULA'):
            file = pd.read_csv('media/conversion.csv', encoding='latin-1')
            file.to_excel('media/conversion.xls', index=False, header=False)
            loc = 'media/conversion.xls'
            wb = xlrd.open_workbook(loc)
            sheet = wb.sheet_by_index(0)

            for i in range(sheet.nrows):
                code = str(sheet.cell_value(i, 0))[0:8]
                logging.debug(code)
                asig = Asignatura.objects.filter(sid=code)
                if asig.exists():  ## subject supported by the platform
                    asig = Asignatura.objects.get(sid=code)
                    logging.debug("Asignatura OK")
                    AlAs = AlumAsig.objects.filter(uid=usuario,
                                                   sid=asig,
                                                   passed=False)
                    logging.debug(AlAs.exists())
                    if not AlAs.exists():  ## student already enrolled
                        logging.debug(code)
                        logging.debug('No existe esta relacion aun')
                        new = AlumAsig.objects.create(
                            uid=usuario,
                            sid=asig,
                            amount=0,
                            grade=0,
                            passed=False,
                        )
                        new.save()
                    else:
                        logging.debug(
                            "Alumno ya matriculado de esta asignatura.")

            os.remove(ruta)
            os.remove('media/conversion.csv')
            os.remove('media/conversion.xls')
            usuario.doc = None
            usuario.save()
            return 0
        else:
            logging.debug("Matrícula no válida")
            os.remove(ruta)
            os.remove('media/conversion.csv')
            usuario.doc = None
            usuario.save()
            return 1
    except Exception as e:
        logging.debug("Matrícula no válida (EXCEPCION)")
        logging.exception(e)
        os.remove(ruta)
        usuario.doc = None
        usuario.save()
        return 1
def extrPdfFromZipTo2TXT(nameSourse="cmegroup",
                         workFilePDF="Section61_Energy_Futures_Products.pdf",
                         info="Futures",
                         col=-1):
    # find the project location on the computer
    path = os.getcwd()
    # path of the source directory
    pathDirIn = "\Source" + "\\" + nameSourse
    # path of the output directory
    pathDirOut = "\Data" + "\\" + nameSourse + "\\" + info
    # build the absolute paths
    pathIn = path + pathDirIn
    pathOut = path + pathDirOut
    # list of files in the source directory
    fileNamesIn = os.listdir(pathIn)
    #print(fileNamesIn)
    fileNamesOut = os.listdir(pathOut)
    # loop body
    # counter of files from the directory
    i = 0
    if col == -1:
        maxI = len(fileNamesIn)
    else:
        maxI = col
    # index of the character in the file name where the date starts
    adressDateFromName = 18
    print("Исходный файл:", workFilePDF)
    while i < maxI:
        # build the output file name
        fileNameOut = fileNamesIn[i][adressDateFromName:(adressDateFromName +
                                                         8)] + ".xlsx"
        #print(fileNameOut)
        if fileNamesOut.count(fileNameOut) == 0:
            # source zip archive
            nameFileZip = pathIn + "\\" + fileNamesIn[i]
            #print(workFileZip)
            # check that it exists and really is a zip archive
            if zipfile.is_zipfile(nameFileZip):
                # open the archive for processing
                workFileZip = zipfile.ZipFile(nameFileZip, 'r')
                # specify the file to process
                # extract the files to process
                workFileZip.extract(workFilePDF)
                print("Идет создание ", fileNameOut, "исходный архив №", i)
                convert_pdf_to_txtfitz(workFilePDF, "PDFtoTXTpymupdf.txt")
                #pathFileNameOut = path + pathDirOut+"\Futures\\"+fileNameOut
                tabula.convert_into(workFilePDF,
                                    output_path="PDFtoTXTTabula.txt",
                                    output_format="csv",
                                    pages="all")
                if info == "Futures":
                    txtToXlsxFut(pathOut=(pathOut + "\\" + fileNameOut))
                elif info == "Options":
                    txtToXlsxOpt(pathOut=(pathOut + "\\" + fileNameOut))
        i += 1
    print("Переформатирование завершено")


#extrPdfFromZipTo2TXT(workFilePDF = "Section63_Energy_Options_Products.pdf",col=1,info="Options")
Example #28
def pdf_to_csv(filepath):
    """
    Converts a pdf file to a csv so it can be analysed afterwards
    :param filepath: path to the pdf file
    :return:
    """
    tabula.convert_into(filepath, RUTA_CSV, pages="all", output_format="csv", guess=False, stream=True)
    leer_csv()
    def Convert(self):
        '''This function will handle the converting process of the pdf tables using the convert_into function of the tabula-py'''
        try:
            tabula.convert_into("%s" % usefile, output_path="C:/OutputFolder/Output.csv", output_format="csv", pages ="%s" % page)
            self.statusBar().showMessage('Converted') #show when the file is already converted

        except:  #user hints for error handling                      
            QMessageBox.information(self, "WARNING", "Hint:Close the output file or Select the page.")
Example #30
def pdf_to_text(pdf_path, outfile, columns, output_format='tsv', password=None):
    
    logging.debug("Extracting text from pdf - {}".format(pdf_path))
    options_dict = {}
    if password is not None:
        options_dict["password"] = password

    tabula.convert_into(pdf_path, outfile, pages="all", stream=True, guess=False, output_format=output_format, columns=columns, **options_dict)
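# A minimal usage sketch (file name and x-coordinates are hypothetical); the
# columns option lists the x positions where tabula should split each line
# into columns when guess=False and stream=True:
#     pdf_to_text('statement.pdf', 'statement.tsv', columns=[70, 200, 400])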
def test_convert_into_exception(self):
    pdf_path = 'tests/resources/data.pdf'
    with self.assertRaises(AttributeError):
        tabula.convert_into(pdf_path, 'test.csv', output_format='dataframe')
    with self.assertRaises(AttributeError):
        tabula.convert_into(pdf_path, None)
    with self.assertRaises(AttributeError):
        tabula.convert_into(pdf_path, '')
def test_conver_from(self):
    pdf_path = 'tests/resources/data.pdf'
    expected_csv = 'tests/resources/data_1.csv'
    expected_tsv = 'tests/resources/data_1.tsv'
    expected_json = 'tests/resources/data_1.json'
    temp = tempfile.NamedTemporaryFile()
    tabula.convert_into(pdf_path, temp.name, output_format='csv')
    self.assertTrue(filecmp.cmp(temp.name, expected_csv))
    tabula.convert_into(pdf_path, temp.name, output_format='tsv')
    self.assertTrue(filecmp.cmp(temp.name, expected_tsv))
    tabula.convert_into(pdf_path, temp.name, output_format='json')
    self.assertTrue(filecmp.cmp(temp.name, expected_json))
def test_convert_remote_file(self):
    uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf"
    temp = tempfile.NamedTemporaryFile()
    tabula.convert_into(uri, temp.name, output_format='csv')
    self.assertTrue(os.path.exists(temp.name))