def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj)

    checks = pd.concat(list(map(parse_page, pdf.pages)))\
        .reset_index(drop=True)

    return checks[checks["state"] != "Totals"]
Example #2
def extract_data(feed):
    data = ""
    with pdfplumber.load(feed) as pdf:
        pages = pdf.pages
        for p in pages:
            data = data + p.extract_text()
    return data  # build more code to return a dataframe
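The comment above invites a follow-up step; here is a minimal sketch of one way to return a DataFrame (the function name and the one-row-per-page shape are assumptions, not part of the original example):

import pandas as pd
import pdfplumber

def extract_data_as_dataframe(feed):
    rows = []
    with pdfplumber.load(feed) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            # one row per page: the page number plus its extracted text
            rows.append({"page": number, "text": page.extract_text() or ""})
    return pd.DataFrame(rows)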
Example #3
def uploaded_pdf_to_text(uploaded_file):
    doc = []
    pdf = pdfplumber.load(uploaded_file)
    for page in pdf.pages:
        doc.append(page.extract_text())
    text_lookup_res = '\n'.join(doc)
    return text_lookup_res
Example #4
def extract_data(feed):
    data = []
    with pdfplumber.load(feed) as pdf:
        pages = pdf.pages
        for p in pages:
            data.append(p.extract_text())
    return data
Example #5
def extractDataFromPDFFile(pdfSource, x_coord, y_coord):
    text = ''
    with pdfplumber.load(pdfSource) as pdf:
        for page in pdf.pages:
            text1 = page.extract_text(x_tolerance=x_coord, y_tolerance=y_coord)
            text = text + text1
    return text
Example #6
def main():
    args = parse_args()
    pdf = pdfplumber.load(args.infile, pages=args.pages)
    if args.format == "csv":
        to_csv(pdf, args.types, args.encoding)
    else:
        to_json(pdf, args.types, args.encoding)
Example #7
def extractDataFromPDFFile(pdfSource):
    text = ''
    with pdfplumber.load(pdfSource) as pdf:
        for page in pdf.pages:
            text1 = page.extract_text()
            text = text + text1
    return text
Example #8
    def _load_file(self):
        self.viewer.clear()
        path = self.paths[self.pathidx]
        filename = os.path.basename(path)
        try:
            if filename.split('.')[-1].lower() in ['jpg', 'png']:
                image = Image.open(path)
                pdf = io.BytesIO(
                    pytesseract.image_to_pdf_or_hocr(image, extension='pdf'))
                self.pdf = pdfplumber.load(pdf)
            else:
                self.pdf = pdfplumber.open(path)
            self.viewer.display_pdf(self.pdf)
            self.doc_label.configure(
                text="{} of {}".format(self.pathidx + 1, len(self.paths)))
            self.logger.clear()
            self.logger.log("Showing invoice '{}'".format(path))
        except WandException:
            result = messagebox.askokcancel(
                "Error",
                "ImageMagick Policy Error! Should InvoiceNet try to fix the error?"
            )
            if result:
                result = self._fix_policy_error()
            if result:
                messagebox.showinfo(
                    "Policy Fixed!",
                    "ImageMagick Policy Error fixed! Restart InvoiceNet.")
            else:
                messagebox.showerror(
                    "ImageMagick Policy Error",
                    "Could not fix ImageMagick policy. Rejecting the current pdf file!"
                )
        except (IndexError, IOError, TypeError):
            pass
def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj)

    # Note: As of Nov. 2019 file, first page is documentation
    checks_gen = map(parse_page, pdf.pages[1:])
    checks = pd.concat(checks_gen).reset_index(drop=True)

    return checks[checks["state"] != "Totals"]
Example #12
    def test_loading_fileobj(self):
        path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
        with open(path, "rb") as f:
            with pdfplumber.open(f) as pdf:
                assert len(pdf.metadata)

        # Will be removed from library soon
        with open(path, "rb") as f:
            with pdfplumber.load(f) as pdf:
                assert len(pdf.metadata)
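As the test above notes, pdfplumber.load() is slated for removal; pdfplumber.open() accepts a file-like object as well as a path, so the BytesIO pattern that recurs in the examples below can be migrated directly. A minimal sketch (the URL is a placeholder, not from any of these examples):

import io
import pdfplumber
import requests

response = requests.get("https://example.com/sample.pdf")  # placeholder URL
with pdfplumber.open(io.BytesIO(response.content)) as pdf:
    # same PDF object as pdfplumber.load() would return
    print(pdf.pages[0].extract_text())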
def main():
    parser = argparse.ArgumentParser(
        description='Extract financial lines from a PDF document')
    parser.add_argument('infile', type=argparse.FileType('rb'))
    args = parser.parse_args()

    pdf = pdfplumber.load(args.infile)
    rows = get_finances(pdf)
    for r in rows:
        print(r)
Example #14
def extract_text(path):
    """Returns a generator object with a list of rows for each page.

    :param path: can be a URL or system path to a pdf file.

    Usage::
        If the
        file is small, you may be able to create one object containing all
        text in the pdf file:
      >>> pdf_pages = pdf_yeah.extract_text('https://www.nostarch.com/download/Automate_the_Boring_Stuff_sample_ch17.pdf')
      >>> pdf_full_text = [page for page in pdf_pages]

        Otherwise it's better to iterate through the generator like so:
      >>> import pdf_yeah
      >>> pdf_pages = pdf_yeah.extract_text('https://abc.xyz/investor/pdf/20160331_alphabet_10Q.pdf')
      >>> pg1 = next(pdf_pages)
      >>> print('\n'.join(pg1[:9]))

      UNITED STATES
      SECURITIES AND EXCHANGE COMMISSION
      Washington, D.C. 20549
      ________________________________________________________________________________________
      FORM 10-Q
      ________________________________________________________________________________________
      (Mark One)
      QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
      For the quarterly period ended March 31, 2016

    """

    if path.startswith('http'):
        r = requests.get(path)
        fp = io.BytesIO(r.content)
        pdf = pdfplumber.load(fp)
        for page in pdf.pages:
            yield page.extract_text().split('\n')

    else:
        with open(path, 'rb') as fp:
            pdf = pdfplumber.load(fp)
            for page in pdf.pages:
                yield page.extract_text().split('\n')
Example #15
    def __init__(self, datasheet_path):
        self.path = Path(datasheet_path)
        self.pdf_file = PyPDF3.PdfFileReader(self.path.open('rb'))
        self.plumber = pdfplumber.load(self.path.open('rb'))
        self.raw_outline = []
        self.tables, self.figures = {}, {}  # type: Dict
        self.table_of_content = DataSheetNode('ROOT', [0])
        self.table_root = DataSheetNode('TABLES', [-1])
        self.table_of_content.append(self.table_root)
        self.fallback_table: DataSheetTableNode = None
        self.flatten_outline()
        self.sort_raw_outline()
        self.collect_tables()
def extract_table_to_dfs(page):
    pdf_page = pdfplumber.load(page)
    tables = pdf_page.pages[0].extract_tables()
    dfs = []
    for table in tables:
        df = pd.DataFrame(table[1:], columns=table[0])
        number_of_not_nans = np.sum(df.count())
        if number_of_not_nans > 0:
            number_of_nans = df.isnull().sum().sum()
            percentage_of_nans = number_of_nans / (number_of_not_nans +
                                                   number_of_nans)
            if percentage_of_nans <= NANS_THRESHOLD:
                dfs.append(df)
    return dfs
Example #17
    def extract_tables(self):

        data = []
        raw_table = []
        with pdfplumber.load(self.pdf) as pdf:
            pages = pdf.pages
            for p in pages:
                tbl = p.extract_tables()
                raw_table.append(tbl)
                #data.append(pd.DataFrame(tbl))
                if len(tbl):
                    column_names = tbl[0].pop(0)
                    df = pd.DataFrame(tbl[0], columns=column_names)
                    data.append(df)
        self.tables = data
        return data
def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj)
    rects = pd.DataFrame(pdf.rects)
    chars = pd.DataFrame(pdf.chars)

    # Find the leftmost side of the rectangles that appear on each page.
    rect_counts = rects["x0"].value_counts()
    edges = rect_counts[
        rect_counts == len(pdf.pages)
    ].sort_index().index
    edges = ((pd.Series(edges) / 2).round() * 2).drop_duplicates()

    # Use these edges to create boundaries, defining fields.
    bounds = list(zip(edges, edges[1:]))

    def parse_line(chars):
        fields = [ "".join(get_between(chars, x0, x1)["text"])
            for x0, x1 in bounds ]

        parsed = list(map(parse_field, fields))
        return parsed

    def parse_page_chars(chars):
        c = chars[
            (chars["top"] >= DATA_START_TOP) &
            (chars["top"] < DATA_END_TOP)
        ]

        month = parse_month("".join(chars[
            (chars["size"] == 14.183) &
            (chars["top"] > 28)
        ]["text"]))

        data = c.groupby((c["doctop"] / 3).round()).apply(parse_line)
        df = pd.DataFrame([ [ month ] + d for d in data ], columns=COLUMNS)
        df.loc[(df["state"] == "llinois"), "state"] = "Illinois"
        try: validate_data(df)
        except: raise Exception("Invalid data for " + month)
        return df

    checks = pd.concat([ parse_page_chars(chars[chars["pageid"] == p.pageid])
        for p in pdf.pages ]).reset_index(drop=True)

    return checks
Example #19
    def parse(self, content):
        """
        Parse PDF content, mainly using the pdfplumber module.
        :param content: PDF data read from memory
        :return: the extracted text as a string
        """
        # Load the PDF file (binary data)
        pdf = pdfplumber.load(content)
        targets = []  # collected results
        # Walk through every page of the PDF
        for page in pdf.pages:
            # Extract all text on the current page
            words = page.extract_text()
            # Clean up the content
            word = words.replace(' ', '').replace('\n', '')
            targets.append(word)
        # Close the PDF resource
        pdf.close()
        return ''.join(targets)
Example #20
    def _load_file(self):
        self.viewer.clear()
        path = self.paths[self.pathidx]
        filename = os.path.basename(path)
        try:
            if filename.split('.')[-1].lower() in ['jpg', 'png']:
                image = Image.open(path)
                pdf = io.BytesIO(
                    pytesseract.image_to_pdf_or_hocr(image, extension='pdf'))
                self.pdf = pdfplumber.load(pdf)
            else:
                self.pdf = pdfplumber.open(path)
            self.viewer.display_pdf(self.pdf)
            self.doc_label.configure(
                text="{} of {}".format(self.pathidx + 1, len(self.paths)))
            self.logger.clear()
            self.logger.log("Showing invoice '{}'".format(path))
        except (IndexError, IOError, TypeError):
            pass
Example #21
def new_get_auditor(url, page):
    '''
    get audit firm name by searching the regex pattern on a page
    '''
    rq = requests.get(url)
    if rq.status_code == 200:
        # logging.info('request success. start extracting text...')
        print('request success, loading pdf...')
    try:
        pdf = pdfplumber.load(BytesIO(rq.content))
        txt = pdf.pages[page].extract_text()
    except:
        logging.warning(f'Not pdf file. check {url}.')
        return None
    txt = re.sub("([^\x00-\x7F])+", "", txt)  # strip non-ASCII (e.g. Chinese) characters
    pattern = r'\n(?!.*?Institute.*?).*?(?P<auditor>.+?)(?:LLP\s*)?\s*((PRC.*?|Chinese.*?)?[Cc]ertified [Pp]ublic|[Cc]hartered) [Aa]ccountants'
    auditor = re.search(pattern, txt,
                        flags=re.MULTILINE).group('auditor').strip()
    return auditor
def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj, pandas=True)
    rects = pdf.rects
    chars = pdf.chars

    # Find the leftmost side of the rectangles that appear on each page.
    rect_counts = rects["x0"].value_counts()
    edges = rect_counts[
        rect_counts == len(pdf.pages)
    ].sort_index().index

    # Use these edges to create boundaries, defining fields.
    bounds = list(zip(edges, edges[1:]))

    def parse_line(chars):
        fields = [ "".join(get_between(chars, x0, x1)["text"])
            for x0, x1 in bounds ]

        parsed = list(map(parse_field, fields))
        return parsed

    def parse_page_chars(chars):
        c = chars[
            (chars["top"] >= DATA_START_TOP) &
            (chars["top"] < DATA_END_TOP)
        ].sort_values([ "doctop", "x0" ])

        month = parse_month("".join(chars[
            (chars["size"] == 14.183) &
            (chars["top"] > 28)
        ]["text"]))

        data = c.groupby("doctop").apply(parse_line)
        return pd.DataFrame([ [ month ] + d for d in data ], columns=COLUMNS)


    checks = pd.concat([ parse_page_chars(chars[chars["pageid"] == p.pageid])
        for p in pdf.pages ]).reset_index(drop=True)

    assert(len(checks) > 0)
    assert((checks.fillna(0).sum(axis=1) != (checks["totals"] * 2)).sum() == 0)
    return checks
Example #23
    def _run_ocr(self):
        if self.pdf is None:
            return

        pdf_pages = list()
        for page in self.pdf.pages:
            image = page.to_image(resolution=100)
            pdf = pytesseract.image_to_pdf_or_hocr(image.original,
                                                   extension='pdf')
            pdf_pages.append(pdf)

        pdf_writer = PyPDF2.PdfFileWriter()
        for page in pdf_pages:
            pdf = PyPDF2.PdfFileReader(io.BytesIO(page))
            pdf_writer.addPage(pdf.getPage(0))

        pdf = io.BytesIO()
        pdf_writer.write(pdf)

        self.pdf = pdfplumber.load(pdf)
        self.viewer.display_pdf(self.pdf)
    def get_text(self, url):
        if url[-4:] == '.pdf':
            try:
                r = self.request('GET',
                                 url,
                                 stream=True,
                                 timeout=self.options['timeout'],
                                 verify=False)
                if 200 == r.status_code and 'application/pdf' == r.headers[
                        'Content-Type']:
                    with io.BytesIO() as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                        pdf = pdfplumber.load(f)
                        text = ''
                        for i in range(0, len(pdf.pages)):
                            page = pdf.pages[i]

                            page_text = page.extract_text()
                            if isinstance(page_text, str):
                                text = text + page_text
                        pdf.close()
                        return text
            except Exception as e:
                self.alert(url + ' ' + str(e))
        else:
            try:
                r = self.request('GET',
                                 url,
                                 stream=True,
                                 timeout=self.options['timeout'],
                                 verify=False)
                if 200 == r.status_code:
                    return r.text
            except Exception as e:
                self.alert(url + ' ' + str(e))
    def _run_ocr(self):
        if self.pdf is None:
            return

        pdf_pages = list()
        for page in self.pdf.pages:
            image = page.to_image(resolution=100)

            # myocr
            # text_strings, text_recs_alls = predict([image.original], language='chn')
            # boxes = [[(recs[0], recs[1]), (recs[4], recs[5])] for recs in text_recs_alls[0]][:-1]
            # imgdraw = ImageDraw.Draw(image.original)
            # for box in boxes:
            #     imgdraw.rectangle(box,width=2,outline='red')
            # Draw the detected text boxes
            # boxes = [box.split(' ') for box in pytesseract.image_to_boxes(image.original).split('\n')][:-1]
            # imgcopy = image.original.copy()
            # imgdraw = ImageDraw.Draw(imgcopy)
            # for box in boxes:
            #     x1,y1,x2,y2 = int(box[1]),int(box[2]),int(box[3]),int(box[4])
            #     imgdraw.rectangle([(x1,y1),(x2,y2)],width=2,outline='red')

            pdf = pytesseract.image_to_pdf_or_hocr(image.original,
                                                   extension='pdf')
            pdf_pages.append(pdf)

        pdf_writer = PyPDF2.PdfFileWriter()
        for page in pdf_pages:
            pdf = PyPDF2.PdfFileReader(io.BytesIO(page))
            pdf_writer.addPage(pdf.getPage(0))

        pdf = io.BytesIO()
        pdf_writer.write(pdf)

        self.pdf = pdfplumber.load(pdf)
        self.viewer.display_pdf(self.pdf)
Example #26
def extract_data_lonseddel(feed):
    data_list = []
    with pdfplumber.load(feed) as pdf:
        page = pdf.pages[0]
        text = page.extract_text()

        for row in text.split('\n'):
            #st.write(row)
            if '1100' in row:
                global timer
                products_dict = {}
                text = row.split()[1]

                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                timer = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]

                data_list.append(products_dict)
            if '1104' in row:
                products_dict = {}
                text_1 = row.split()[1]
                text_2 = row.split()[2]
                text = text_1 + " " + text_2

                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]

                data_list.append(products_dict)

            if '1330' in row:
                products_dict = {}

                text_1 = row.split()[1]
                text_2 = row.split()[2]
                text_3 = row.split()[3]
                text_4 = row.split()[4]
                text = text_1 + " " + text_2 + " " + text_3 + " " + text_4

                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]

                data_list.append(products_dict)

            if '3992' in row:
                products_dict = {}

                text_1 = row.split()[1]
                text_2 = row.split()[2]
                text_3 = row.split()[3]
                text_4 = row.split()[4]
                text_5 = row.split()[5]
                text = text_1 + " " + text_2 + " " + text_3 + " " + text_4 + " " + text_5

                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]

                data_list.append(products_dict)


            if 'Overført til reg./konto' in row:
                global udbetaling
                if '8100' in row:
                    timer = row.split()[-2]

                products_dict = {}

                text_1 = row.split()[0]
                text_2 = row.split()[1]
                text_3 = row.split()[2]
                text = text_1 + " " + text_2 + " " + text_3

                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = timer
                products_dict["Sats"] = " "
                udbetaling = row.split()[-1]
                products_dict["Beløb"] = udbetaling

                data_list.append(products_dict)

            if 'Lønseddel for perioden' in row:
                text_1 = row.split()
                #print(text_1)

                global start_dato, slut_dato,year_dato
                start_dato = str([' '.join(text_1[3:5])])[2:-2]
                slut_dato = str([' '.join(text_1[-3:-1])])[2:-2]
                year_dato = str([''.join(text_1[-1:])])[2:-2]

    return data_list  # build more code to return a dataframe
def parse(pdf):
    chars = pd.DataFrame(pdf.chars)
    data_chars = chars[
        ((chars["fontname"] == "Arial") &
        (chars["size"] == 5.628))
    ].copy()

    data = pdfplumber.utils.extract_columns(data_chars, x_tolerance=1, y_tolerance=1)

    if len(data.columns) == 6:
        data.columns = COLUMNS
    else:
        data.columns = [ "sivigila_code" ] + COLUMNS
    data = data.drop_duplicates().reset_index(drop=True)
    data[INT_COLS] = data[INT_COLS].astype(int)
    data["department"] = data["department"].str.strip().str.upper().apply(lambda x: DEPT_FIXES.get(x, x))
    data["municipality"] = data["municipality"].str.strip().str.upper().apply(lambda x: MUNI_FIXES.get(x, x))

    sums = data[INT_COLS].sum(axis=1)
    equalities = (sums == (data["zika_total"] * 2)).unique().tolist()
    assert(equalities == [ True ])
    return data

if __name__ == "__main__":
    import sys
    if hasattr(sys.stdin, "buffer"):
        buf = sys.stdin.buffer
    else:
        buf = sys.stdin
    pdf = pdfplumber.load(buf)
    data = parse(pdf)
    data.to_csv(sys.stdout, index=False, encoding="utf-8")
    
Example #28
def receiveFile():
    print("Receiving File", flush=True)
    length = int(request.form['length'])
    fileNames = json.loads(request.form['fileNames'])
    returnJson = {}
    absaDocument = {}
    sentimentWordDocument = {}
    corpus = []
    corpusEntity = {}
    corpusRelation = []

    for i in range(length):
        file = request.files[f'file{i}']

        # Get filename
        fileName = fileNames[i]

        # Get file extension
        name, extension = os.path.splitext(fileName)
        print('POST SUCCESSFUL', fileName, flush=True)
        try:
            if extension == '.txt':
                byteString = file.read()
                encoding = chardet.detect(byteString)['encoding']
                text = byteString.decode(encoding)
            elif extension == '.pdf':
                text = ''
                with pdfplumber.load(file) as pdf:
                    for page in pdf.pages:
                        text += page.extract_text()
            text = re.sub('\\\\', '', text)
            tempJson = runAlice(text)
            absaChapter = tempJson['sentiment'][2]['absaChapter'].copy()
            sentimentWordChapter = tempJson['sentiment'][2][
                'sentimentWordChapter'].copy()
            absaDocument = absa_document_combined_c(absaDocument, absaChapter,
                                                    name)
            sentimentWordDocument = entity_sentimentwords_document(
                sentimentWordDocument, sentimentWordChapter)

            returnJson[name] = tempJson

            tempEntity = tempJson['ner']['ents'].copy()
            for entity in tempEntity:
                key = entity['text'] + '_' + entity['type']
                if key in corpusEntity:
                    corpusEntity[key]['value'] += 1
                    corpusEntity[key]['documents'].add(name)
                else:
                    corpusEntity[key] = {
                        'id': entity['text'],
                        'label': entity['text'],
                        'value': 1,
                        'documents': set([name]),
                        'type': entity['type'],
                        'color': nercolors[entity['type']]
                    }
            # corpusPassToRelation.extend(tempJson['ner'].pop('passToRelation'))
            corpus.append(text)
            print(f"Current Corpus Text: {corpus}", flush=True)

            newRelation = tempJson['relation'].copy()
            for relation in newRelation:
                relation['documents'] = [name]
                corpusRelation.append(relation)

        except Exception as err:
            print(err, "occurred in " + fileName)
        except:
            print('Unknown error in ' + fileName)
    if length > 1:
        print(f"Corpus being sent to overview {corpus}", flush=True)
        returnJson['Overview'] = getOverview(corpus, corpusEntity,
                                             corpusRelation, absaDocument,
                                             sentimentWordDocument, fileNames)
    print('RESULT', json.dumps(returnJson))
    returnJson = jsonify(returnJson)
    return returnJson
import pikepdf
import io
import pdfplumber

with io.BytesIO() as f:
    with pikepdf.open(r'C:\Users\BashamF\Documents\c06278453.pdf') as pdf:       
        pdf.save(f)

    with pdfplumber.load(f) as pdf:
        print(pdf.pages[0].extract_text())


#!/usr/bin/env python
import pandas as pd
import pdfplumber
import requests
import datetime
import re
from io import BytesIO

def parse_date(pdf):
    chars = pd.DataFrame(pdf.chars)
    updated_text = "".join(chars[
        (chars["fontname"] == "Times New Roman") &
        (chars["doctop"] < 175)
    ].sort_values(["doctop", "x0"])["text"])
    date_pat = r"UPDATED:\s+As of (.+)$"
    updated_date = re.search(date_pat, updated_text).group(1)
    d = datetime.datetime.strptime(updated_date, "%B %d, %Y")
    return d

if __name__ == "__main__":
    URL = "https://www.fbi.gov/about-us/cjis/nics/reports/active_records_in_the_nics-index.pdf"
    raw = requests.get(URL).content
    pdf = pdfplumber.load(BytesIO(raw))
    d = parse_date(pdf)
    print(d.strftime("%Y-%m"))
    data = pdfplumber.utils.extract_columns(data_chars,
                                            x_tolerance=1,
                                            y_tolerance=1)

    if len(data.columns) == 6:
        data.columns = COLUMNS
    else:
        data.columns = ["sivigila_code"] + COLUMNS
    data = data.drop_duplicates().reset_index(drop=True)
    data[INT_COLS] = data[INT_COLS].astype(int)
    data["department"] = data["department"].str.strip().str.upper().apply(
        lambda x: DEPT_FIXES.get(x, x))
    data["municipality"] = data["municipality"].str.strip().str.upper().apply(
        lambda x: MUNI_FIXES.get(x, x))

    sums = data[INT_COLS].sum(axis=1)
    equalities = (sums == (data["zika_total"] * 2)).unique().tolist()
    assert (equalities == [True])
    return data


if __name__ == "__main__":
    import sys
    if hasattr(sys.stdin, "buffer"):
        buf = sys.stdin.buffer
    else:
        buf = sys.stdin
    pdf = pdfplumber.load(buf)
    data = parse(pdf)
    data.to_csv(sys.stdout, index=False, encoding="utf-8")
Example #32
def save_line_items(invoice_file, forceOcr):

    folder = settings.UPLOAD_PATH
    if not os.path.exists(folder):
        os.makedirs(folder)

    # save file locally first for aws
    folder = folder
    fs = FileSystemStorage(location=folder)
    filename = fs.save(invoice_file.name, invoice_file)
    temp_pdf_path = folder + filename

    invoice_text = ''
    try:
        ocrmypdf.ocr(temp_pdf_path, temp_pdf_path, force_ocr=forceOcr)
        temp_file = open(temp_pdf_path, "r")
        with pdfplumber.load(temp_file.buffer) as pdf:
            page = pdf.pages[0]
            invoice_text = page.extract_text()
    except Exception as err:
        with pdfplumber.load(temp_pdf_path) as pdf:
            page = pdf.pages[0]
            invoice_text = page.extract_text()

    # Save to AWS
    upload_to_AWS(temp_pdf_path, invoice_file.name)

    # delete pdf and img after extraction is complete
    if os.path.isfile(temp_pdf_path):
        os.remove(temp_pdf_path)

    # Regular expressions
    delta_re = re.compile(r'(?i)DELTA')
    johnstone_re = re.compile(r'(?i)(JOHNSTONE)')
    carrier_re = re.compile(r'(?i)(Distributor)')
    capco_re = re.compile(r'(?i)(capco)')
    ferguson_re = re.compile(r'(?i)(ferguson)')

    meta_data = {}
    lines = invoice_text.split("\n")
    for i in range(len(lines)):
        line = lines[i]
        supplier = ""

        if delta_re.search(line):
            meta_data = parse_delta_invoice(invoice_text)
            supplier = "Delta"

        if johnstone_re.search(line):
            meta_data = parse_johnstone_invoice(invoice_text)
            supplier = "Johnstone"

        if carrier_re.search(line):
            meta_data = parse_carrier_invoice(invoice_text)
            supplier = "Carrier"

        if capco_re.search(line):
            meta_data = parse_capco_invoice(invoice_text)
            supplier = "Capco"

        if ferguson_re.search(line):
            meta_data = parse_ferguson_invoice(invoice_text)
            supplier = "Ferguson"

        if supplier:
            meta_data["supplier_id"] = Supplier.objects.filter(
                supplier_name__icontains=supplier)[0].id
            meta_data["invoice_date"] = meta_data["invoice_date"].strip()
            meta_data["invoice_number"] = meta_data["invoice_number"].strip()
            meta_data["new_invoice_name"] = supplier + " " + \
                meta_data["invoice_date"].replace(
                    "/", "-") + " " + meta_data["invoice_number"] + ".pdf"
            # create item and price keys
            for i in range(len(meta_data['line_items'])):
                item, price = meta_data['line_items'][i]
                item_key = "item" + str(i + 1)
                price_key = "price" + str(i + 1)
                meta_data['line_items'][i] = (item_key, item.strip(),
                                              price_key, price.strip())
            break

    return meta_data
Example #33
#!/usr/bin/env python
import pandas as pd
import pdfplumber
import requests
import datetime
import re
from io import BytesIO

def parse_date(pdf):
    text = pdf.pages[0].extract_text(x_tolerance=5)
    date_pat = r"UPDATED:\s+As of (.+)\n"
    updated_date = re.search(date_pat, text).group(1)
    d = datetime.datetime.strptime(updated_date, "%B %d, %Y")
    return d

if __name__ == "__main__":
    URL = "https://www.fbi.gov/about-us/cjis/nics/reports/active_records_in_the_nics-index.pdf"
    raw = requests.get(URL).content
    pdf = pdfplumber.load(BytesIO(raw))
    d = parse_date(pdf)
    print(d.strftime("%Y-%m"))
Example #34
def main(string):
    print("Enter your name, or an empty line to exit.")
    with pdfplumber.load(string) as pdf:
        first_page = pdf.pages[0]
        text = first_page.extract_text()
        print(text)
Example #35
def parse_pdf(x_tolerance, y_tolerance, path=None, savePath=None, data=None):
    '''
    function: process a PDF
    :param: maximum word spacing (x_tolerance), maximum line spacing (y_tolerance), input path, save path
    :return: None
    '''
    # Read the input file
    if path != None:
        pdf = pdfplumber.open(path)
    elif data != None:
        pdf = pdfplumber.load(data)
    # Track the running row count across pages
    pdfRowNumber = 0

    wb = Workbook()
    ws = wb.active

    for page in pdf.pages:
        pageContainer = []  # holds all word dicts for this page
        theMaxColNum = 0  # track the maximum column count
        words = page.extract_words(x_tolerance=x_tolerance,
                                   y_tolerance=y_tolerance,
                                   keep_blank_chars=True)
        pageContainer, theMaxColNum = compileByRowLocation(
            words, x_tolerance, y_tolerance)
        # Sort by position information
        for line in pageContainer:
            line.sort(key=itemgetter('x0'))
        pageContainer.sort(key=lambda line: line[0]['top'])
        # Check the leading rows for a header or for multiple rows merged into one
        if len(pageContainer[0]) < theMaxColNum:
            for i in range(len(pageContainer)):
                if len(pageContainer[i]) == theMaxColNum:
                    repairList, repairNum = align_front_row(
                        pageContainer[0:i + 1], theMaxColNum)
                    for i in range(repairNum):
                        del pageContainer[0]
                    pageContainer.insert(0, repairList)
                    break
        # Check the last row
        if len(pageContainer[-1]) < theMaxColNum:
            pageContainer[-1] = align_last_row(pageContainer[-2:],
                                               theMaxColNum)
        # Write to Excel
        # ftTitle = Font(name='font',size=14)
        # ftText = Font(name = '',size=10)
        for idx, line in enumerate(pageContainer):
            for idy, item in enumerate(line):
                cellIndex = ws.cell(row=idx + 1 + pdfRowNumber, column=idy + 1)
                if item['text'] == '':
                    pass
                elif item['text'] == None:
                    ws.merge_cells(start_row=idx + 1 + pdfRowNumber,
                                   start_column=1,
                                   end_row=idx + 1 + pdfRowNumber,
                                   end_column=len(line))
                    break
                else:
                    # cellIndex.font = ftText
                    if item['text'].isdigit():
                        cellIndex.value = int(item['text'])
                    elif is_float(item['text']):
                        cellIndex.value = float(item['text'])
                    else:
                        cellIndex.value = item['text']
        # Add this page's row count so the Excel rows stay contiguous
        pdfRowNumber += len(pageContainer)
    # Save the Excel file locally
    if savePath != None:
        wb.save(savePath)
    else:
        wb.save(path.replace('.pdf', '.xlsx'))
Example #36

# Read the PDF as text and store it back to the database
def returnPdfcontent(content):
    sqlstr = "insert into proposal (content) VALUES (%s)" % (content)
    return DB.execution(DB.create, sqlstr)


def returnHashtag(hashtags):
    for tag in hashtags:
        print("insert into hashtag (hashtag_name) VALUES ('%s')" % (tag[0]))


for j in pdfdb["data"]:
    rq = requests.get(j["pdfUrl"])
    pdf = pdfplumber.load(BytesIO(rq.content))
    allText = ""
    for i in pdf.pages:
        allText += str(i.extract_text())
    # print(allText)
    text = ""
    table = ""
    for i in pdf.pages:
        content = i.extract_table()
        if content != None:
            table = str(getColName(content))
            break

    # page = pdf.pages[1]
    # text = page.extract_text()
    # print(allText)