# functions.py
# Forked from 21Vipin/Invoice2textdata
import os
import sys
import string
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
def _run_pdfminer(fname, outfp):
    """Feed the PDF at *fname* through pdfminer, writing its text to *outfp*."""
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, outfp)
    try:
        # The context manager closes the PDF even if pdfminer raises.
        with open(fname, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, check_extractable=True)
    finally:
        device.close()


def _extract_pdf_text(fname, outfile, codec='utf-8'):
    """Return the text extracted from *fname*.

    When *outfile* is truthy the text is written to that file (overwriting
    it) and read back; otherwise it is echoed to standard output.
    """
    import io  # local import: the file-level import block does not provide it

    if outfile:
        with open(outfile, 'wt', encoding=codec, errors='ignore') as outfp:
            _run_pdfminer(fname, outfp)
        # Read back with the codec used for writing instead of the locale
        # default, and close the handle deterministically.
        with open(outfile, encoding=codec) as infp:
            return infp.read()

    # No destination file: collect the text in memory so it can still be
    # parsed, then echo it to stdout as the original stdout branch did.
    # Assumes TextConverter writes str, which the text-mode file branch
    # above already relies on.
    buffer = io.StringIO()
    _run_pdfminer(fname, buffer)
    text = buffer.getvalue()
    sys.stdout.write(text)
    return text


def convert(src, des):
    """Walk *src*, extract the text of each PDF and print its invoice fields.

    Every file under ``src`` whose name ends with ``.pdf`` and does not start
    with ``._`` is converted to text with pdfminer. The text is passed to
    ``find_invoice_number``, ``find_date`` and ``find_amount`` and a one-line
    summary is printed per file.

    Args:
        src: Directory to walk recursively.
        des: Path of the text file that receives the extracted text. The file
            is overwritten for each PDF. If falsy, the extracted text is
            written to standard output instead.

    Returns:
        None. A failure while handling one file is reported and the walk
        continues with the next file.
    """
    for root, _dirs, files in os.walk(src):
        for file in files:
            if not file.endswith(".pdf") or file.startswith("._"):
                continue
            fname = os.path.join(root, file)
            try:
                test = _extract_pdf_text(fname, des)
                invoice = find_invoice_number(test)
                date = find_date(test)
                due_amount = find_amount(test)
                print("{ File Name: ", file, "Invoice Number: ", invoice, "Invoice Date: ", date, "Due Amount: Rs ", due_amount,"}")
            except Exception as exc:
                # Best-effort per file, as before, but no longer swallowing
                # KeyboardInterrupt/SystemExit. The stdout message is kept
                # byte-identical; the cause goes to stderr for diagnosis.
                print('An error occured.')
                print(fname, repr(exc), file=sys.stderr)
def find_invoice_number(str):
    """Return the invoice number following an "invoice no"/"invoice number" label.

    The text is scanned case-insensitively for ``invoice``. An occurrence
    counts as a label when ``no`` appears in the next 3 characters or
    ``number`` in the next 7. The 20-character window starting 10 characters
    after the start of ``invoice`` is then inspected:

    * if it contains ``INV`` (case-sensitive), the result is ``'INV'`` plus
      everything after it up to the next letter;
    * otherwise the result starts at the first digit and runs up to the next
      letter.

    Whitespace and punctuation before that next letter are part of the
    result. If no letter follows, the result runs to the end of the window.
    A label whose window holds neither ``INV`` nor a digit is skipped and the
    search continues with the next occurrence.

    Args:
        str: Full text to search (the name shadows the builtin but is kept
            for backward compatibility).

    Returns:
        The extracted invoice number, or None when nothing was found.
    """
    text = str  # alias the parameter; it is never rebound, so offsets stay valid
    text_lower = text.lower()
    index = text_lower.find('invoice')
    while index != -1:
        label_end = index + 7  # len('invoice')
        if ("no" in text_lower[label_end:index + 10]
                or "number" in text_lower[label_end:index + 14]):
            window = text[index + 10:index + 30]
            if "INV" in window:
                start = window.find("INV") + 3
                prefix = 'INV'
            else:
                # None when the window has no digit (previously this left
                # `start` unbound and raised UnboundLocalError).
                start = next(
                    (pos for pos, char in enumerate(window) if char.isdigit()),
                    None,
                )
                prefix = ''
            if start is not None:
                tail = window[start:]
                # Stop at the first letter; fall back to the whole tail when
                # the number runs to the end of the window.
                end = next(
                    (pos for pos, char in enumerate(tail) if char.isalpha()),
                    len(tail),
                )
                return prefix + tail[:end]
        index = text_lower.find('invoice', label_end)
    return None
def find_date(str):
    """Return the first date found after a ``date`` label in the text.

    The text is scanned case-insensitively for ``date``. For each occurrence:

    * If a colon appears within the next 8 characters, the window from just
      after ``date`` up to 25 characters past its start is inspected. When it
      contains an abbreviated month name (``Jan`` .. ``Dec``, tried in
      calendar order), the result starts 3 characters before the month (room
      for a ``DD`` prefix and a separator) and is ``start + 10`` characters
      long. Otherwise the result is the 10 characters starting at the first
      digit, or ``''`` when the window has no digit.
    * If there is no colon, the rest of the text is searched for a full
      month name (``January`` .. ``December``, calendar order). The result is
      that month plus everything after it up to the next letter, or to the
      end of the text when no letter follows.

    Args:
        str: Full text to search (the name shadows the builtin but is kept
            for backward compatibility).

    Returns:
        The extracted date string, ``''`` for a colon-labelled occurrence
        without any digit, or None when no date was found.
    """
    short_months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    long_months = ("January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November",
                   "December")

    # Always slice the untouched text: rebinding the parameter to a slice
    # made later iterations index into the wrong string.
    text = str
    text_lower = text.lower()
    index = text_lower.find('date')
    while index != -1:
        label_end = index + 4  # len('date')
        if ":" in text_lower[label_end:index + 12]:
            window = text[label_end:index + 25]
            month = next((m for m in short_months if m in window), None)
            if month is not None:
                month_pos = window.find(month)
                # Back up over a "DD-" prefix when there is room for one;
                # otherwise start at the month itself so the index cannot go
                # negative and wrap around the window.
                start = month_pos - 3 if month_pos >= 3 else month_pos
                length = start + 10
            else:
                start = next(
                    (pos for pos, char in enumerate(window) if char.isdigit()),
                    None,
                )
                if start is None:
                    return ''
                length = 10
            return window[start:start + length]

        rest = text[label_end:]
        month = next((m for m in long_months if m in rest), None)
        if month is not None:
            tail = rest[rest.find(month) + len(month):]
            # Up to the next letter, or everything when the date ends the text.
            end = next(
                (pos for pos, char in enumerate(tail) if char.isalpha()),
                len(tail),
            )
            return month + tail[:end]
        # Advance by the label length (the old step of 7 was the length of
        # 'invoice' and could skip a nearby occurrence).
        index = text_lower.find('date', label_end)
    return None
def find_amount(str):
    """Return the amount that follows a ``balance`` ... ``due`` label.

    The text is scanned case-insensitively for ``balance``. An occurrence is
    used when ``due`` appears anywhere after it. The window spanning offsets
    12 to 30 from the start of ``balance`` is searched for its first digit
    character. The result starts one character AFTER that digit and runs up
    to the next letter, or to the end of the window when no letter follows.
    Whitespace and punctuation before that letter are part of the result.
    An occurrence whose window holds no digit is skipped.

    Args:
        str: Full text to search (the name shadows the builtin but is kept
            for backward compatibility).

    Returns:
        The extracted amount string, or None when nothing was found.
    """
    text = str  # alias the parameter; it is never rebound
    text_lower = text.lower()
    index = text_lower.find('balance')
    while index != -1:
        label_end = index + 7  # len('balance')
        if "due" in text_lower[label_end:]:
            window = text[index + 12:index + 30]
            # None when the window has no digit (previously this left the
            # result variable unbound and raised UnboundLocalError).
            first_digit = next(
                (pos for pos, char in enumerate(window) if char.isdigit()),
                None,
            )
            if first_digit is not None:
                # NOTE(review): the character at the first digit position is
                # skipped, exactly as before. Presumably it is a currency
                # glyph that str.isdigit() accepts in the sample PDFs, but
                # with plain ASCII input this drops the leading digit.
                # Confirm against real extracted text before changing.
                tail = window[first_digit + 1:]
                # Stop at the first letter; fall back to the whole tail when
                # the amount runs to the end of the window (previously this
                # case returned '').
                end = next(
                    (pos for pos, char in enumerate(tail) if char.isalpha()),
                    len(tail),
                )
                return tail[:end]
        index = text_lower.find('balance', label_end)
    return None