Code Example #1
def tabula_table_generator(file, start_page, pdf_name_data, pdf_name_count,
                           page_break_count):
    '''Generates the list of tables from the pdf pages on which the required heading (like 'Precision') data is presented.

    Parameters:

    file(str): filename or the DOWID of the document

    start_page(number): Page number of the pdf at which the heading (like 'Precision') data starts

    pdf_name_data(list): Set of pdf ids along with some text found in the extracted heading text

    pdf_name_count(number): Count of how many times the pdf name occurred in the extracted heading text

    page_break_count(number): Count of pages having the required heading (like 'Precision') data

    returns:

    tabledata(list): List of tables found on the pdf pages that contain the required heading (like 'Precision') data

    '''

    tabledata = []

    if (pdf_name_data != []) and (pdf_name_count != 0):
        new_pages = str(start_page + 1) + "-" + str(start_page + pdf_name_count + 1)

    elif page_break_count != 0:
        new_pages = str(start_page + 1) + "-" + str(start_page + page_break_count + 1)

    else:
        new_pages = str(start_page + 1)

    tabledata = read_pdf(file,
                         output_format="dataframe",
                         pages=new_pages,
                         lattice=True,
                         multiple_tables=True,
                         encoding='latin-1',
                         position="absolute")

    if tabledata == []:
        tabledata = read_pdf(file,
                             output_format="dataframe",
                             pages=new_pages,
                             lattice=False,
                             multiple_tables=True,
                             encoding='latin-1',
                             position="absolute")

    return tabledata
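
A minimal usage sketch of the page-string logic above (the filename and counts are hypothetical):

# Hypothetical call: heading found on zero-indexed page 4, pdf name seen twice in the heading text.
tables = tabula_table_generator('report.pdf', start_page=4,
                                pdf_name_data=['report'], pdf_name_count=2,
                                page_break_count=0)
# read_pdf then receives pages="5-7" (tabula page numbers are 1-indexed)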
Code Example #2
def multiple(file_path, columns):
    """
    Convert pdf table by pages
    """
    DATA = pd.DataFrame(columns=list(range(0, 7)))
    tables = read_pdf(
        file_path,
        # encoding='windows-1251',
        encoding='utf-8',
        lattice=True,
        pandas_options={'header': None},
        pages='all',
        multiple_tables=True)

    for t in tables:  # Because of parsing mistakes we sometimes need to delete empty columns

        if t.shape[1] > 7:
            t.dropna(axis='columns', how='all', inplace=True)
            # This solution works only if there is at least one full row on the page
            t.rename(columns=col_name(t.keys(), list(range(0, 7))),
                     inplace=True)
            continue

        DATA = DATA.append(t, ignore_index=True)

    DATA.rename(columns=col_name(DATA.keys(), columns), inplace=True)
    DATA.drop(0, inplace=True)

    return DATA
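
col_name is not defined in this snippet; judging from how it is called, it presumably builds a rename mapping from the current keys onto the target names. A guess at its shape:

def col_name(current_keys, new_names):
    # hypothetical helper: map each existing column key onto the desired name
    return dict(zip(current_keys, new_names))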
Code Example #3
def main():
    getCleanPdf()
    # At this point the cleaned pdf has been written to clean.pdf
    df = wrapper.read_pdf("clean.pdf")
    writer = pd.ExcelWriter('out.xlsx', engine='xlsxwriter')
    df.to_excel(writer, sheet_name='Sheet1')
    writer.save()
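
In recent pandas, ExcelWriter.save() is deprecated in favor of the context-manager form; an equivalent sketch:

with pd.ExcelWriter('out.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')  # written automatically on exit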
Code Example #4
def download_extract(paper, extract_figure=False, extract_table=False):
    if paper.pages >= 0 and paper.table >= 0:
        return False
    paper_info = {
        'pdf_url': paper.url,
        'title': paper.title,
    }
    api_paper = arxiv.query(id_list=[paper.arvixID])[0]
    if 'pdf_url' not in api_paper:
        return False
    pdf_url = api_paper['pdf_url']
    # pdf_url = 'https://arxiv.org/pdf/' + paper.url.split('/')[-1] +'.pdf'
    file_path = os.path.join(store_path, paper.paperId + '.pdf')
    # if not os.path.isfile(file_path):
    urllib.request.urlretrieve(pdf_url, file_path)

    table_count = 0  # default so the table update below cannot hit an undefined name
    if extract_table:
        df = wrapper.read_pdf(file_path, multiple_tables=True, pages='all')
        table_count = len(df)
        del df

    if extract_figure:
        figure_count, page_count = get_figure_count(file_path)
        modified = False
        if paper.pages == -1:
            modified = True
            paper.pages = page_count
        else:
            page_count = paper.pages
        if paper.table == -1:
            modified = True
            paper.table = table_count
        if os.path.exists(file_path):
            os.remove(file_path)
        if modified:
            Paper.update(table=table_count, pages=page_count).where(
                Paper.arvixID == paper.arvixID).execute()
            # paper.save()
            return modified
    # api_paper = arxiv.query(id_list=[paper.arvixID])[0]
    # if 'pdf_url' not in api_paper:
    #     return False
    # pdf_url = api_paper['pdf_url']
    texts = extract_text(file_path, pdf_url)
    if texts is None:
        print("PDF either does not exist or failed: ", paper.url)
        return False
    affiliation = []
    for text in texts.split():
        if re.match(r"[^@]+@[^@]+\.[^@]+", text):
            domain_name = text.split('@')[-1]
            affiliation.append(domain_name)
    if len(affiliation) > 0:
        Paper.update(affiliation=affiliation).where(
            Paper.arvixID == paper.arvixID).execute()

    return False
Code Example #5
def pdfProcess(inputFolder, url):
    #When receiving pdf data from a different site, this function has to be rewritten
    #html-parsing step: fetch the serial number of the newly updated pdf file
    resp = requests.get(
        "http://fsc.go.kr/info/trd_list.jsp?menu=7230000&bbsid=BBS0069")
    resp.encoding = 'utf-8'
    html = resp.text
    bs = BeautifulSoup(html, 'html.parser')

    #Code that fetches the latest data on every program run
    #Extract the data
    originalData = bs.select(
        "#contents > div.board > table > tbody > tr > td > a")
    convertedData = str(originalData[2])
    dailyUpdated = convertedData.split('"')[1].split('amp;')[1]

    url = url + "&" + dailyUpdated  # it didn't work before because the & was missing...
    df2 = wrapper.read_pdf(url,
                           multiple_tables=True,
                           pages="all",
                           pandas_options={"header": 0})

    #Remove special characters from the column headers
    for i in range(0, len(df2)):
        colName = df2[i].columns.tolist()
        for j in range(0, len(colName)):
            clean = re.sub('[-=+,#/\?:^$.%@*\"※~&ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]',
                           '', colName[j])
            colName[j] = clean
        df2[i].columns = colName

    #Save as csv files
    #Get the current date to include in the file name
    currentYear = datetime.today().year
    currentMonth = datetime.today().month
    currentDate = datetime.today().day

    currentYear = str(currentYear)[-2:]
    if (currentMonth < 10):
        currentMonth = "0" + str(currentMonth)
    else:
        currentMonth = str(currentMonth)

    if (currentDate < 10):
        currentDate = "0" + str(currentDate)
    else:
        currentDate = str(currentDate)

    today = currentYear + currentMonth + currentDate

    #Save each table (to DB: postgres)
    engine = create_engine('postgresql://*****:*****@192.168.110.111:5432/kopo')
    for i in range(0, len(df2)):
        fileName = inputFolder + "_" + today + "_" + str(i)
        df2[i].to_sql(fileName, engine, if_exists='replace', index=False)
Code Example #6
File: app.py  Project: knail1/hanafidallas
def pdfParser(pdfFile):
    df = wrapper.read_pdf(
        pdfFile,
        spreadsheet=True,
        # area = (top, left, bottom, right)
        #area = (136.43, 58.64, 602.63, 554.93))
        area=(129.52, 59.99, 602.63, 553.13))
    df = pd.DataFrame(data=df)
    return df
Code Example #7
def dataPrep(url):
    #download file
    fileName = 'cb10.pdf'
    pdf = urllib.request.urlretrieve(url, fileName)
    tables = wrapper.read_pdf(fileName, lattice=True, multiple_tables=True)
    #select largest table
    largest = pd.DataFrame()

    for i in tables:
        if i.size >= largest.size:
            largest = i

    #get rid of the top columns names
    table = largest.copy()

    dayOfWeekRow = None
    for index, row in table.iterrows():
        dayChars = [d[:2] for d in calendar.day_name[:]]
        rowString = [
            str(i).strip('\r').title()[:2] for i in list(row) if not pd.isna(i)
        ]
        if len(set(dayChars) & set(rowString)) >= 7:
            dayOfWeekRow = index

    table.columns = table.iloc[dayOfWeekRow]
    table = table[dayOfWeekRow + 1:]

    #drop nan column
    table = table.loc[:, table.columns.notnull()]

    #drop nan with blank string
    table = table.fillna('')

    events = []
    for column in table:
        dayOfWeek = column
        values = table[column].values
        event = {}
        #if the day cell in the first row has a value, init weekNumber to 0, otherwise to 1
        if table[:1].get(column).values != '':
            weekNumber = 0
        else:
            weekNumber = 1
        #go through each cell of the column and check for an integer;
        #if there is an integer start a new day, otherwise just append the text
        for cell in values:
            date = re.search(r"^(\d*)", cell).group(0)
            if (date):
                weekNumber += 1
                event[column] = {'text': cell, 'weekNumber': weekNumber}

                if event not in events:
                    events.append(event)
                    event = {}
    return events
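
The weekday header row is detected by intersecting two-letter prefixes from calendar.day_name with the row's cells; a quick illustration of that trick (the sample row is made up):

import calendar

day_chars = [d[:2] for d in calendar.day_name]  # ['Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa', 'Su']
row = ['Monday', 'tuesday', 'WEDNESDAY', 'Thu', 'Fri 1', 'Sat', 'Sunday']
prefixes = [str(c).title()[:2] for c in row]
print(len(set(day_chars) & set(prefixes)) >= 7)  # True: this row holds the weekday headers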
Code Example #8
def pdfocr():
    #pdf.image(imgpath,x,y,w,h)
    #pdf.output("/home/caratred/text.pdf", "F")
    a = wrapper.read_pdf("/home/caratred/Downloads/mrzlines.pdf",
                         pages="1",
                         index=False,
                         stream=True,
                         squeeze=True,
                         header=None)
    print(a)
    b = a.iloc[:, 0:3]
    print(b)
Code Example #9
def all_one(file_path, columns):
    """
    Convert pdf table to DataFrame wholly
    """
    df = read_pdf(
        file_path,
        # encoding='windows-1251',
        encoding='utf-8',
        lattice=True,
        pandas_options={'header': columns},
        pages='all')

    return df
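
Note that pandas expects row indices in 'header', not names; if the intent is to supply the column names directly, the documented pandas route is the 'names' option. A sketch of that variant (an assumption, not the author's code):

df = read_pdf(file_path,
              encoding='utf-8',
              lattice=True,
              pandas_options={'header': None, 'names': columns},
              pages='all')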
Code Example #10
def vertical_column_names(pdf_name, column_names_area):
    ''' Function to extract vertical column names by rotating the file and using tabula '''
    rotated_pdf_name = rotate_pdf_90(pdf_name)
    list_of_column_names = wrapper.read_pdf(
        rotated_pdf_name,
        stream=True,
        multiple_tables=True,
        pages='all',
        area=(column_names_area[0], column_names_area[1], column_names_area[2],
              column_names_area[3]),
        guess=False)

    return list_of_column_names
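
rotate_pdf_90 is not shown; one plausible implementation with the PyPDF2 API of that era (the output filename is an assumption):

import PyPDF2

def rotate_pdf_90(pdf_name):
    # hypothetical helper: rotate every page 90 degrees clockwise and write a copy
    reader = PyPDF2.PdfFileReader(pdf_name)
    writer = PyPDF2.PdfFileWriter()
    for i in range(reader.getNumPages()):
        page = reader.getPage(i)
        page.rotateClockwise(90)
        writer.addPage(page)
    rotated_name = pdf_name.replace('.pdf', '_rotated.pdf')  # assumed naming scheme
    with open(rotated_name, 'wb') as out:
        writer.write(out)
    return rotated_name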
Code Example #11
def pdf_to_text(pdf_name, columns_vertical_bool, tables_area, offices_area,
                column_names_area):
    '''Function to read text from pdf using tabula library into three separate lists (list of tables, list of offices and list of column headers)'''
    # get all tables (area of row only = (171.743,0.383,790.628,611.618))
    list_of_tables = wrapper.read_pdf(pdf_name,
                                      stream=True,
                                      multiple_tables=True,
                                      pages='all',
                                      area=(tables_area[0], tables_area[1],
                                            tables_area[2], tables_area[3]),
                                      guess=False)

    # get all office names
    list_of_offices = wrapper.read_pdf(pdf_name,
                                       stream=True,
                                       multiple_tables=True,
                                       pages='all',
                                       area=(offices_area[0], offices_area[1],
                                             offices_area[2], offices_area[3]),
                                       guess=False)

    # get all column names
    # if there are vertical columns, run the helper function (vertical_column_names)
    if columns_vertical_bool == "y":
        list_of_column_names = vertical_column_names(pdf_name,
                                                     column_names_area)
    # if there aren't vertical columns, extract normally
    elif columns_vertical_bool == "n":
        list_of_column_names = wrapper.read_pdf(
            pdf_name,
            stream=True,
            multiple_tables=True,
            pages='all',
            area=(column_names_area[0], column_names_area[1],
                  column_names_area[2], column_names_area[3]),
            guess=False)

    return list_of_tables, list_of_offices, list_of_column_names
Code Example #12
    def getPdf(self):
        tables = wrapper.read_pdf("qw.pdf",
                                  multiple_tables=True,
                                  pages='all',
                                  encoding='utf-8',
                                  spreadsheet=True)
        i = 1
        row_array = []
        row_array2 = []
        for table in tables:
            table.columns = table.iloc[0]
            table = table.reindex(table.index.drop(0)).reset_index(drop=True)
            table.columns.name = None
            #To write Excel
            table.to_excel('output' + str(i) + '.xlsx',
                           header=True,
                           index=False)
            workbook = xlrd.open_workbook('output' + str(i) + '.xlsx')
            worksheet = workbook.sheet_by_name('Sheet1')
            num_rows = 5  #worksheet.nrows - 1
            curr_row = 4
            while curr_row < num_rows:
                row = worksheet.col(curr_row)
                #print(row)
                row2 = worksheet.col(curr_row - 1)
                row_array += row
                row_array2 += row2
                curr_row += 1
                i = i + 1

        for i in range(len(row_array)):
            #print(row[i].split("'"))
            #k=string(row[i])
            if row_array[i].value != '':
                if row_array[i].value == 'Grade':
                    self.semesterCount += 1
                    self.gradesArray.append(int(self.semesterCount))
                    self.creditsArray.append('D')
                else:
                    print(row_array[i].value)
                    gradeSingle = str(row_array[i].value)
                    creditSingle = int(row_array2[i].value)
                    self.gradesArray.append(gradeSingle)
                    self.creditsArray.append(creditSingle)

    #gradesArray+=row_array[i].value
    #creditsArray+=row_array2[i].value
        print(self.gradesArray)
        print(self.creditsArray)
Code Example #13
File: cFunction.py  Project: gojiin/python_test
def pdfProcess(inputFolder, url):
    #When receiving pdf data from a different site, this function has to be rewritten
    #html-parsing step: fetch the serial number of the newly updated pdf file
    resp = requests.get(
        "http://fsc.go.kr/info/trd_list.jsp?menu=7230000&bbsid=BBS0069")
    resp.encoding = 'utf-8'
    html = resp.text
    bs = BeautifulSoup(html, 'html.parser')

    #Code that fetches the latest data on every program run
    #Extract the data
    originalData = bs.select(
        "#contents > div.board > table > tbody > tr > td > a")
    convertedData = str(originalData[2])
    dailyUpdated = convertedData.split('"')[1].split('amp;')[1]

    url = url + "&" + dailyUpdated  # it didn't work before because the & was missing...
    df2 = wrapper.read_pdf(url,
                           multiple_tables=True,
                           pages="all",
                           pandas_options={"header": 0})

    #Save as csv files
    #Get the current date to include in the file name
    currentYear = datetime.today().year
    currentMonth = datetime.today().month
    currentDate = datetime.today().day

    currentYear = str(currentYear)[-2:]
    if (currentMonth < 10):
        currentMonth = "0" + str(currentMonth)
    else:
        currentMonth = str(currentMonth)

    if (currentDate < 10):
        currentDate = "0" + str(currentDate)
    else:
        currentDate = str(currentDate)

    today = currentYear + currentMonth + currentDate

    #Save each table to a file
    for i in range(0, len(df2)):
        fileName = inputFolder + "_" + today + "_" + str(i) + '.csv'
        df2[i].to_csv("../../data/outbound/" + inputFolder + "/" + fileName,
                      index=False,
                      encoding="ms949")
Code Example #14
def vat_return_reader(file_name):
    """vat_return_reader(file_name)
    Creates a Series whose values come from the last column (index 2) of the
    VAT return table; the Series is named after the file_name the function was given."""

    df = wrapper.read_pdf(file_name,
                          pandas_options={'header': None},
                          encoding="cp775")
    print("tabula_read executed")
    column_data = df.loc[0:45, 2]
    column_name = df.loc[0:45, 1]
    #replaces ',' with '.' so the values can be treated as numeric
    column_data = column_data.astype(str).str.replace(',', '.')
    #converts str to float
    column_data = pd.to_numeric(column_data, errors='coerce')
    # names the Series after the file name
    column_data = column_data.rename(index=f"{file_name}")
    return column_data
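
A hypothetical usage sketch collecting several returns into one frame (the file names are invented):

frames = [vat_return_reader(f) for f in ['vat_jan.pdf', 'vat_feb.pdf']]
result = pd.concat(frames, axis=1)  # each named Series becomes one column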
Code Example #15
from tabula import wrapper

import pandas as pd
import numpy as np 

from datetime import timedelta, datetime

import tkinter as tk
from tkinter import filedialog

import re, shutil 

tkroot = tk.Tk()
tkroot.withdraw()
file = filedialog.askopenfilename()

df = wrapper.read_pdf(file, pages='all', error_bad_lines=False)

print(df)

writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer,'Sheet1')
writer.save()
Code Example #16
from tabula import wrapper
wrapper.read_pdf("", area=[])
Code Example #17
from tabula import wrapper

import numpy as np

from datetime import timedelta, datetime

import tkinter as tk
from tkinter import filedialog

tkroot = tk.Tk()
tkroot.withdraw()
file = filedialog.askopenfilename()

invoice = file[0:7]

#tkinter etc

df = wrapper.read_pdf(file)

#rename columns
df = df.rename(
    columns={
        'AGCO International GmbH': 'data',
        'Unnamed: 1': 'data1',
        'Unnamed: 2': 'data2',
    })

#get invoice from filename

#df['connection'] =

# new df with relevant rows
Code Example #18
File: utils.py  Project: Aitrg/md_ingestion
    def pdfConverter(self):
        from tabula.wrapper import read_pdf
        return read_pdf(self.path, pages='all', lattice=True)
Code Example #19
from tabula import wrapper
import os
print(os.listdir("."))
filepath = r"Birddogs-BofA-Bank-Statement-April.'17.pdf"

tables = wrapper.read_pdf(filepath,
                          pages='all',
                          silent=True,
                          pandas_options={
                              'header': None,
                              'error_bad_lines': False,
                              'warn_bad_lines': False
                          })
# tables = wrapper.read_pdf(filepath,multiple_tables=True,pages='all')

folder = "results"
if not os.path.isdir(folder):
    os.makedirs(folder)
i = 1

for the_file in os.listdir(folder):
    file_path = os.path.join(folder, the_file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except Exception as e:
        print(e)
for r in (tables.values.tolist()):
    print(r)
# for table in tables:
#     table.to_excel(os.path.join(folder,'Results'+str(i)+'.xlsx'),index=False)
Code Example #20
# Read pdf into DataFrame
# df = tabula.read_pdf("test.pdf", options)

import os
from tabula import wrapper
#df = wrapper.read_pdf('../sample-data/B1705_2000560947.pdf', output_format='dataframe', guess=False)
#df = wrapper.read_pdf('../sample-data/B1705_2000560948.pdf', output_format='dataframe', guess=False)
#df = wrapper.read_pdf('../sample-data/B1705_2000604352 (1).pdf', output_format='dataframe', guess=False)

collection_frame = []

for file in os.listdir("../sample-data/"):
    if file.endswith(".pdf"):
        #print(file)
        filepath = "../sample-data/{}".format(file)
        print(filepath)
        df = wrapper.read_pdf(filepath, output_format='dataframe', guess=False)
        list(df)
        RBC_Value = df['Unnamed: 0'][df['Unnamed: 0'].str.contains(
            "RBC").__eq__(True)]
        Fibrinogen_Value = df['Unnamed: 0'][df['Unnamed: 0'].str.contains(
            "FIBRINOGEN").__eq__(True)]

        #getting rows from start to finish
        test = df['Unnamed: 0'].str.split(' ', expand=True)[1][11:32]
        #print(test)
        test2 = df['Unnamed: 0'].str.split(' ', expand=True)[2][18]

        test3 = df['Unnamed: 0'].str.split(' ', expand=True)[3][30]
        #print(test3)

        platelet_appearance = df['Unnamed: 0'].str.split(' ',
Code Example #21
from tabula import wrapper
import xlrd
import openpyxl
tables = wrapper.read_pdf("qw.pdf",
                          multiple_tables=True,
                          pages='all',
                          encoding='utf-8',
                          spreadsheet=True)

i = 1
row_array = []
row_array2 = []
semesterCount = 0
for table in tables:
    table.columns = table.iloc[0]
    table = table.reindex(table.index.drop(0)).reset_index(drop=True)
    table.columns.name = None
    #To write Excel
    table.to_excel('output' + str(i) + '.xlsx', header=True, index=False)
    workbook = xlrd.open_workbook('output' + str(i) + '.xlsx')
    worksheet = workbook.sheet_by_name('Sheet1')
    num_rows = 5  #worksheet.nrows - 1
    curr_row = 4
    while curr_row < num_rows:
        row = worksheet.col(curr_row)
        #print(row)
        row2 = worksheet.col(curr_row - 1)
        row_array += row
        row_array2 += row2
        curr_row += 1
        i = i + 1
Code Example #22
def check_tables(file_name):
    df = wrapper.read_pdf(str(file_name), pages='all')
    return len(df) != 0
Code Example #23
File: dcltools.py  Project: TI-Russia/datafest2019
def parser(fn, path):

    famheader = {
        0: 'first_name',
        1: 'last_name',
        2: 'place_of_birth',
        3: 'date_of_birth',
        4: 'relationship'
    }

    propheader = {
        0: 'owner',
        1: 'type',
        2: 'purchase_date',
        3: 'purchase_type',
        4: 'price',
        5: 'location_area',
        6: 'owners'
    }

    fam = 'თქვენი ოჯახის წევრების (მეუღლე, არასრულწლოვანი შვილი, (გერი), თქვენთან მუდმივად მცხოვრები პირი) მონაცემები'
    prop = 'თქვენი, თქვენი ოჯახის წევრის საკუთრებაში არსებული უძრავი ქონება'

    rdate = re.compile(r'\s*\d\d\.\d\d\.\d{4}$')
    place_and_date = r'დაბადების ადგილი, დაბადების თარიღი:\s*'

    dcl_data = {}

    FDF = make_fdf()
    PDF = make_pdf()

    dfs = read_pdf(os.path.join(path, fn),
                   pages='all',
                   lattice=True,
                   multiple_tables=True)

    inf_flag = dfs[1].loc[0, 0]

    p_n_d = re.search(place_and_date + '(.+)', inf_flag)

    if p_n_d:

        pnd = p_n_d.group(1)
        place = re.sub(rdate, '', pnd)
        dcl_data['place_of_birth'] = place

        date = re.search(rdate, pnd)

        if date:

            date = date.group().strip()
            dcl_data['date_of_birth'] = date

    for df in dfs[2:]:
        if df.shape != (0, 0):

            if df.loc[0, 0] == fam:
                FDF = FDF.append(get_info(df, famheader))

            if df.loc[0, 0] == prop:
                PDF = PDF.append(get_info(df, propheader))

    if FDF.shape != (0, 0):

        FDF.fillna('', inplace=True)
        FDF.reset_index(drop=True, inplace=True)

        FDF = fullrows(FDF, 'first_name')

        FDF.loc[:, 'first_name_lat'] = FDF.first_name.apply(transli)
        FDF.loc[:, 'last_name_lat'] = FDF.last_name.apply(transli)
        FDF.loc[:, 'place_of_birth_lat'] = FDF.place_of_birth.apply(transli)
        FDF.loc[:, 'relationship_lat'] = FDF.relationship.apply(transli)

        dcl_data['family'] = json.loads(
            FDF.to_json(orient="records", force_ascii=False))

    if PDF.shape != (0, 0):

        PDF.fillna('', inplace=True)
        PDF.reset_index(drop=True, inplace=True)
        PDF = fullrows(PDF, 'owner')

        for i in PDF[PDF['location_area'].str.endswith(
                ',')].index.values[::-1] + 1:
            wow(PDF, i)

        PDF.loc[:, 'owner'] = PDF.owner.str.replace('\r', ' ')
        PDF.loc[:, 'location_area'] = PDF.location_area.str.replace('\r', ' ')

        PDF.loc[:, 'owner_lat'] = PDF.owner.apply(transli)
        PDF.loc[:, 'type_lat'] = PDF.type.apply(transli)
        PDF.loc[:, 'purchase_type_lat'] = PDF.purchase_type.apply(transli)
        PDF.loc[:, 'location_area_lat'] = PDF.location_area.apply(transli)
        PDF.loc[:, 'owners_lat'] = PDF.owners.apply(transli)

        dcl_data['purchase'] = json.loads(
            PDF.to_json(orient="records", force_ascii=False))

    return dcl_data
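
make_fdf and make_pdf are defined elsewhere; judging from famheader and propheader above, they presumably build empty frames with those columns. A guess:

def make_fdf():
    # hypothetical: empty family frame matching famheader
    return pd.DataFrame(columns=['first_name', 'last_name', 'place_of_birth',
                                 'date_of_birth', 'relationship'])

def make_pdf():
    # hypothetical: empty property frame matching propheader
    return pd.DataFrame(columns=['owner', 'type', 'purchase_date', 'purchase_type',
                                 'price', 'location_area', 'owners'])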
Code Example #24
with open(file_name, 'wb') as fd:
    for chunk in r.iter_content(102400):
        fd.write(chunk)
        print('...', end="", flush=True)

print('[DONE]')

# In[5]:


# wrapper.convert_into(file_name, (file_name + '.csv'), output_format='csv', pages='all')


# In[6]:


df = wrapper.read_pdf(file_name, pages='1700-1790')


# In[7]:


df.head()


# In[8]:


# df.columns = ['NR. DOSAR', 'DATA ÎNREGISTRĂRII', 'TERMEN', 'SOLUȚIE']
df.columns = ['NR. DOSAR', 'DATA INREGISTRARII', 'TERMEN', 'SOLUTIE']

Code Example #25
def out_tables(file, text_between, preprocessed_text, header, corpus, pages,
               table_start_list, table_end_list):
    '''This takes table names list as input and returns the tables related to the table names list.

    Parameters:

    file(str): filename or the DOWID of the document

    text_between(str): The text in between the start and end patterns in the text of the pdf

    preprocessed_text(list): The pdf text data which is already pre-processed

    header(str):String pattern of the header, for example 'Precision'

    corpus(object): PyPDF2 object that can be used to read the pdf later

    pages(number): total number of pages in the document

    table_start_list(list): List of tables referenced in the heading (like 'Precision') text

    table_end_list(list): List of table names which are expected as the end patterns for table_start_list

    returns:

    tabledata(list): List of tables found on the pdf pages that contain the required heading (like 'Table I') data.

    '''

    tabledata = []
    remove_text = r"\.\.|following tab|table(|s) be|Metablen|in the table(|s)(|\.)|(T|t)able(|s) (be|of)|[a-z][a-z]table|table according"
    textdata = re.sub(remove_text, "", str(text_between))

    preprocessed_text1 = (re.sub(r'see Appendix|' + remove_text, "", temp)
                          for temp in preprocessed_text)

    if any(t in str(textdata) for t in (" Table ", " table ", " tables ",
                                        " Tables ", " TABLE ", " TABLES ")):
        table_start_list, table_end_list = out_tables_list(text_between)

        if table_end_list != ['', ' ']:

            table_start = re.search(r"Table(|s)\s*\-*\s*\w*", str(text_between))

            if table_start is not None:
                j = 0
                find = ''
                find_list = [
                    'THE IFORMATION HEREIN', 'THE INFORMATION HEREIN',
                    'The information herein', 'Appendix'
                ]
                text_at_end = ''
                for word in range(0, len(find_list)):

                    for page in range(0, len(preprocessed_text)):

                        if find_list[word] in preprocessed_text[page]:
                            tablepage = page
                            text_at_end = preprocessed_text[
                                tablepage:len(preprocessed_text)]
                            find = find_list[word]
                            break

                if find != '' and table_start not in (' ', ''):

                    for table in range(0, len(table_start_list)):
                        table_start = table_start_list[table]
                        table_end = table_end_list[table]

                        if str(table_end) in str(text_at_end):
                            pattern = str(table_start) + "(.+?)" + str(
                                table_end)

                        else:
                            table_end = str(
                                text_at_end)[len(str(text_at_end)) -
                                             8:len(str(text_at_end))]
                            pattern = str(table_start) + "(.+?)" + table_end

                        text_between = str(
                            re.findall(pattern, str(text_at_end)))

                        for k in range(tablepage, len(preprocessed_text)):

                            if table_start in preprocessed_text[k]:
                                new_pages = str(k + 1) + "-" + str(k + 2)
                                if k + 1 == pages:
                                    new_pages = str(pages)
                                tabledata = read_pdf(file,
                                                     output_format="dataframe",
                                                     pages=new_pages,
                                                     lattice=True,
                                                     multiple_tables=True,
                                                     encoding='latin-1',
                                                     position="absolute")
                        tabledata = string_comp_match(
                            text_between=str(text_at_end), tabledata=tabledata)

                        if tabledata != []:
                            break
    return tabledata
Code Example #26
from tabula import wrapper

import pandas as pd
import numpy as np

from datetime import timedelta, datetime

import tkinter as tk
from tkinter import filedialog

import re, shutil, csv

tkroot = tk.Tk()
tkroot.withdraw()
file = filedialog.askopenfilename()

df = wrapper.read_pdf(file, pages='all')

doi = re.compile(r'(\d\d/\d\d/\d\d)')
Dateoi = doi.findall(str(df['Unnamed: 2'].str.extract(r'(\d\d/\d\d/\d\d)')))

df = df.rename(
    columns={
        'AGCO International GmbH': 'data',
        'Unnamed: 1': 'data1',
        'Unnamed: 2': 'data2',
    })

#create empty df for dissection
#write individual lines to new df

invr = re.compile(r'(A0\d{6})')
Code Example #27
# data = wrapper.read_pdf("http://gs.cufe.edu.cn/system/resource/storage/download.jsp?mark=REE2MkE4Q0M2QzAwRjRDRjI0Q0JFRTJFRDAyOTk0MzcvMDMxNDc5RDQvOUFEMQ== ")
# print(data)

import PyPDF2
from tabula import wrapper
from Office.save_file import save_dfto_csv
# import tabula
#
pdfpath = r"C:\Users\xfs9619\Desktop\附件1 中央财经大学2019-2020学年第一学期研究生课程表(公共课部分)\中央财经大学2019-2020学年第一学期研究生课程表(学生专用).pdf"
reader = PyPDF2.PdfFileReader(open(pdfpath, mode='rb'))

n = reader.getNumPages()
df = []
for page in range(18, 264):
    try:
        d = wrapper.read_pdf(pdfpath, pages=page)
        dfs = d.set_index(
            d['时间']).drop(columns=['时间', 'Unnamed: 0']).stack().reset_index()
        dfs.columns = ['节数', '星期', '课程信息']
        dfs = dfs.dropna(how='any')
        dfs['课程名称'] = dfs['课程信息'].apply(lambda x: x.split('\r')[0])
        dfs['时间范围'] = dfs['课程信息'].apply(lambda x: x.split('\r')[1])
        dfs['地点'] = dfs['课程信息'].apply(lambda x: x.split('\r')[2])
        save_dfto_csv('课表', dfs)
        #res = dfs.drop(columns='课程信息')
        # df.append(d)
        # d.to_excel('%d.xlsx')
        print(str(page) + 'success')
    except Exception:
        print(str(page) + 'fail')
Code Example #28
#!/usr/bin/env python
# coding: utf-8

# In[7]:

import pandas as pd

get_ipython().system('pip install tabula-py')
from tabula import wrapper

df = wrapper.read_pdf('C:/Users/ramaruth/Downloads/hdfc.pdf')

# In[164]:

df

# In[165]:

data = df[[
    'Date', 'Narration', 'Withdrawal Amt.', 'Deposit Amt.', 'Closing Balance'
]]

# Here the null values of Narration don't provide any transaction details, hence deleting them

# In[166]:

data[data.Narration.isnull()]

# In[167]:

data = data.dropna(subset=['Narration'], axis=0, how='any')
Code Example #29
def scrap(filename):

    # pdf file object
    # you can find the pdf file with the complete code below
    pdfFileObj = open(filename, 'rb')
    #print(filepath)
    # pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # number of pages in pdf
    print(pdfReader.numPages)
    no_of_pages = pdfReader.numPages

    #This scrapes the PDF

    text = convert_pdf_to_txt(filename, pages=range(no_of_pages))
    arr = text.split('\n')  #We split by new line
    print(arr)
    trans_details = arr
    #We will retrieve the labels by finding the index of  Beginning and Ending balance.
    st_index_labels = arr.index('Beginning Balance')
    end_index_labels = arr.index('Ending Balance')
    labels = arr[st_index_labels:end_index_labels + 1]
    print(labels)

    #find start index
    index = end_index_labels
    st = 'Ending Balance'
    while index < (len(arr) - 1):
        if arr[index] != '':
            if arr[index][0] == '$':
                st = arr[index]

                break
        index = index + 1

    print(index)

    st_index = index
    print(st_index)
    print(len(labels))
    summary = (arr[st_index:st_index + len(labels)])

    summary[len(summary) - 1] = summary[len(summary) - 1][1:]
    #Wrangle Strings to remove dollar sign and commas. Convert them to float to plot them
    summary[0] = summary[0][
        1:]  # This removes dollar sign in the front, so we can use for plotting.
    summary = [s.replace(',', '')
               for s in summary]  # This removes commas from the amount

    print(summary)

    #This scrapes data from

    dates = cleanDates(trans_details)
    print(len(dates))

    months = getMonths(dates)
    labels.insert(0, 'Month')
    summary.insert(0, months[0])
    summary = [float(i) for i in summary]  # converts the whole list into flots
    summary_dataframe = pd.DataFrame({'Labels': labels, 'Amount': summary})
    summary_dataframe.to_csv(
        'C:\\Users\\nidhi\\Desktop\\UCSC_Spring\\WebScraping\\Summary.csv',
        sep=',',
        mode='a',
        index=False)
    #Adjust the page number.
    #Test which is the start page where the transaction details are
    #If you give a page which does not have a table, tabula will throw an exception
    print(summary_dataframe.info())
    print(summary_dataframe.head(10))

    startpage = 0
    for page in range(no_of_pages):
        try:
            text = convert_pdf_to_txt(filename, pages=[page])
            arr = text.split('\n')
            index = arr.index('*start*transaction detail')
            if (index >= 0):
                break
        except ValueError:
            startpage = startpage + 1

    print(startpage)
    print("Nit")

    #from tabula import wrapper

    #for page in range(startpage,)
    #tables = wrapper.read_pdf("C:\\Users\\nidhi\\Desktop\\UCSC_Spring\\WebScraping\\s2.pdf",multiple_tables=True, pages=0)

    #print(len(tables[0][2]))
    #for table in tables:
    # for tab in range(len(table[0][2])):
    #  print(tables[0][2][tab])

    #os.path.abspath("E:/Documents/myPy/")
    #Strange observations: 1) Although pages='all' iterates all the pages and multiple_tables=True
    # should accumulate multiple tables, only one comes back.
    # 2) On the first page where the transactions start (like s4, s6) the amounts are in columns 0,1; otherwise they are in 0,2

    balance = []
    amount = []
    description = []
    #print(len(tables[0][1]))

    #print(tables[0][2][tab])
    #for tab in range(len(tables[0][1])):
    # print(tables[0][1][tab])

    for page in range(startpage, no_of_pages):
        print(page)
        try:
            tables = wrapper.read_pdf(filename,
                                      multiple_tables=True,
                                      pages=page + 1)
            print(tables[0][2])
            if ((page) == startpage):
                try:
                    for tab in range(len(tables[0][1])):
                        amount.append(tables[0][1][tab])

                except:
                    print("Amount Hi")
            else:
                try:
                    for tab in range(len(tables[0][2])):
                        amount.append(tables[0][2][tab])
                except:
                    print("Amount Hi")
            try:
                for tab in range(len(tables[0][0])):
                    description.append(tables[0][0][tab])
            except:
                print(" Description Hi")
            if ((page) == startpage):
                try:

                    for tab in range(len(tables[0][2])):
                        balance.append(tables[0][2][tab])
                except:
                    print("Balance Hi")
            else:
                try:

                    for tab in range(len(tables[0][3])):
                        balance.append(tables[0][3][tab])
                except:
                    print("Balance Hi")

            #balance
            #cleanedList = [x for x in balance if str(x) != 'nan']
            #cleanedlist = [x for x in balance if x != 'nan']
            #cleanedList

            #amount
            print(amount)
            print(balance)
        except:
            print("table error")

    print(amount)
    cleanedAmount = cleanAmountAndBalance(amount)
    cleanedAmount = [float(i) for i in cleanedAmount]

    cleanedBalance = cleanAmountAndBalance(balance)
    cleanedBalance = [float(i) for i in cleanedBalance]
    cleanedBalance = cleanedBalance[1:]
    print((cleanedBalance))
    if len(cleanedBalance) > len(cleanedAmount):
        cleanedBalance = cleanedBalance[:len(cleanedBalance) - 1]

    cleanedDescription = cleanDescription(description)

    bankStatementInfo = pd.DataFrame({
        'Dates': dates,
        'Months': months,
        'Description': cleanedDescription,
        'Amount': cleanedAmount,
        'Balance': cleanedBalance
    })
    print(bankStatementInfo.info())
    print(bankStatementInfo.head(10))
    from pathlib import Path

    my_file = Path("bankStatements.csv")
    if my_file.is_file():
        bankStatementInfo.to_csv('bankStatements.csv',
                                 index=False,
                                 header=False,
                                 sep=',',
                                 mode='a')
    else:
        # file does not exist yet: write including the header row
        bankStatementInfo.to_csv('bankStatements.csv',
                                 index=False,
                                 sep=',',
                                 mode='a')

    ism = re.match(r"^\d{2}/\d{2}", '01/22')
    if ism:
        print("Hi")

    return "Hello Nits!!"
Code Example #30
# encoding: utf-8

from tabula import wrapper

df = wrapper.read_pdf("/Users/my/Documents/program/消费行业周报20190111.pdf")
print(df)
for indexs in df.index:
    # iterate and print the company names
    print(df.loc[indexs].values[1].strip())
Code Example #31
print('Loading file...', end="", flush=True)

with open(file_name, 'wb') as fd:
    for chunk in r.iter_content(102400):
        fd.write(chunk)
        print('...', end="", flush=True)

print('[DONE]')

# In[5]:

# wrapper.convert_into(file_name, (file_name + '.csv'), output_format='csv', pages='all')

# In[6]:

df = wrapper.read_pdf(file_name, pages='1700-1790')

# In[7]:

df.head()

# In[8]:

# df.columns = ['NR. DOSAR', 'DATA ÎNREGISTRĂRII', 'TERMEN', 'SOLUȚIE']
df.columns = ['NR. DOSAR', 'DATA INREGISTRARII', 'TERMEN', 'SOLUTIE']

# In[9]:

# print(df.shape)
# print(type(df.iloc[0,2]))