def run(self):
    """Look for the invoice date, first via the supplier mask, then in the OCR lines.

    The header image matching the file type is handed to
    ``search_by_positions`` for the ``'date'`` field. If that yields a value
    accepted by ``self.formatDate``, ``self.date`` is set and the result is
    returned at once. Otherwise each entry of ``self.text`` is passed to
    ``self.process``: first with whitespace between digits removed, then
    unchanged.

    Returns:
        ``[res[0], res[1], page]`` where ``res`` is the ``formatDate`` or
        ``process`` result. On the mask path ``page`` is the third item of
        the mask result (``''`` unless the mask result has exactly three
        items); on the text path it is ``self.nbPages``. ``None`` when
        nothing matched.
    """
    files = self.Files
    header_image = files.tiffName_header if files.isTiff == 'True' else files.jpgName_header

    masked = search_by_positions(
        self.supplier, 'date', self.Config, self.Locale,
        self.Ocr, self.Files, header_image, self.typo,
    )
    if masked and masked[0]:
        formatted = self.formatDate(masked[0], masked[1])
        if formatted:
            self.date = formatted[0]
            self.Log.info('Date found using mask position : ' + str(formatted[0]))
            page = masked[2] if len(masked) == 3 else ''
            return [formatted[0], formatted[1], page]

    for line in self.text:
        # Try the line with digit groups glued together ("12 03 2020" -> "12032020"),
        # and only fall back to the untouched content when that finds nothing.
        glued = re.sub(r'(\d)\s+(\d)', r'\1\2', line.content)
        hit = self.process(glued, line.position) or self.process(line.content, line.position)
        if hit:
            return [hit[0], hit[1], self.nbPages]

    return None
def run(self):
    """Find the invoice number of the current document.

    Three strategies are tried in order:

    1. ``search_by_positions`` for the ``'invoice'`` field on the header image.
    2. ``self.Locale.invoiceRegex`` applied to the upper-cased content of
       every entry of ``self.text``.
    3. When a supplier is known and ``self.customPage`` is falsy, the
       position stored in the ``suppliers`` table, read through
       ``search_custom_positions``.

    Returns:
        The raw ``search_by_positions`` result for strategy 1,
        ``[number, line.position, self.nbPages]`` for strategy 2,
        ``[text, position, page]`` for strategy 3, or ``False`` when nothing
        was found.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName_header
    else:
        target = self.Files.jpgName_header

    invoiceNumber = search_by_positions(
        self.supplier, 'invoice', self.Config, self.Locale,
        self.Ocr, self.Files, target, self.typo,
    )
    if invoiceNumber and invoiceNumber[0]:
        return invoiceNumber

    invoice_regex = self.Locale.invoiceRegex
    # The regex minus its last two characters is used to delete the invoice
    # keyword from the match (presumably the tail is the part that captures
    # the number itself - confirm against the locale file).
    keyword_regex = invoice_regex[:-2]
    min_size = int(self.Locale.invoiceSizeMin)

    for line in self.text:
        for match in re.finditer(invoice_regex, line.content.upper()):
            without_keyword = re.sub(keyword_regex, '', match.group())
            candidate = without_keyword.lstrip().split(' ')[0]
            # NOTE(review): the strict '>' rejects a number whose length equals
            # invoiceSizeMin - confirm this is the intended meaning of "min".
            if len(candidate) > min_size:
                self.Log.info('Invoice number found : ' + candidate)
                return [candidate, line.position, self.nbPages]

    # The former `found` flag was never set to True, so this condition only
    # ever depended on the supplier and the custom-page switch.
    if not self.supplier or self.customPage:
        return False

    self.Log.info('Invoice number not found. Searching invoice number using position in database')
    rows = self.Database.select({
        'select': ['invoice_number_position', 'invoice_number_page'],
        'table': ['suppliers'],
        'where': ['vat_number = ?'],
        'data': [self.supplier[0]],
    })
    # An unknown VAT number yields no row; treat it as "no stored position"
    # instead of failing on rows[0].
    supplier_row = rows[0] if rows else None
    if not supplier_row or not supplier_row['invoice_number_position']:
        return False

    data = {
        'position': supplier_row['invoice_number_position'],
        'regex': None,
        'target': 'full',
        'page': supplier_row['invoice_number_page'],
    }
    text, position = search_custom_positions(
        data, self.Ocr, self.Files, self.Locale, self.file, self.Config)
    if text == '':
        return False

    # str() keeps the log call safe when the OCR helper returns a non-string.
    self.Log.info('Invoice number found with position : ' + str(text))
    return [text, position, data['page']]
def run(self):
    """Find the invoice number of the current document.

    Strategies, in order:

    1. ``search_by_positions`` for the ``'invoice'`` field on the header image.
    2. When a supplier is known and ``self.customPage`` is falsy, the
       position stored in the ``suppliers`` table, read through
       ``search_custom_positions``.
    3. ``self.Locale.invoiceRegex`` on every entry of ``self.text``.
    4. The same regex on every entry of ``self.footer_text``; the position
       is then converted with ``self.Files.return_position_with_ratio``.

    Returns:
        The raw ``search_by_positions`` result, ``[text, position, page]``
        for the stored position, ``[number, position, self.nbPages]`` for
        the regex strategies, or ``None`` when nothing was found.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName_header
    else:
        target = self.Files.jpgName_header

    invoice_number = search_by_positions(
        self.supplier, 'invoice', self.Config, self.Locale,
        self.Ocr, self.Files, target, self.typo,
    )
    if invoice_number and invoice_number[0]:
        return invoice_number

    if self.supplier and not self.customPage:
        rows = self.Database.select({
            'select': ['invoice_number_position', 'invoice_number_page'],
            'table': ['suppliers'],
            'where': ['vat_number = ?'],
            'data': [self.supplier[0]],
        })
        # No row for this VAT number means no stored position; do not fail on rows[0].
        supplier_row = rows[0] if rows else None
        if supplier_row and supplier_row['invoice_number_position'] not in [False, 'NULL', '', None]:
            data = {
                'position': supplier_row['invoice_number_position'],
                'regex': None,
                'target': 'full',
                'page': supplier_row['invoice_number_page'],
            }
            text, position = search_custom_positions(
                data, self.Ocr, self.Files, self.Locale, self.file, self.Config)
            if text != '':
                self.Log.info('Invoice number found with position : ' + str(text))
                return [text, position, data['page']]

    invoice_regex = self.Locale.invoiceRegex
    keyword_regex = invoice_regex[:-2]
    date_regex = self.Locale.dateRegex
    min_size = int(self.Locale.invoiceSizeMin)

    def extract_number(content):
        # Return the first long-enough invoice number found in `content`, else None.
        for match in re.finditer(invoice_regex, content.upper()):
            candidate = match.group()
            # If the regex returned a date, remove it. Dates are stripped
            # cumulatively so that several dates in one match are all removed
            # (previously each pass restarted from the raw match and only the
            # last date was stripped).
            for date_match in re.finditer(date_regex, match.group()):
                if date_match.group():
                    candidate = candidate.replace(date_match.group(), '')
            # Delete the invoice keyword, then keep the first token.
            number = re.sub(keyword_regex, '', candidate).lstrip().split(' ')[0]
            if len(number) >= min_size:
                return number
        return None

    for line in self.text:
        number = extract_number(line.content)
        if number is not None:
            self.Log.info('Invoice number found : ' + number)
            return [number, line.position, self.nbPages]

    for line in self.footer_text:
        number = extract_number(line.content)
        if number is not None:
            self.Log.info('Invoice number found : ' + number)
            position = self.Files.return_position_with_ratio(line, 'footer')
            return [number, position, self.nbPages]

    return None
def run(self):
    """Find the footer amounts: pre-tax amount, total and VAT rate.

    The three values are first looked up through the supplier mask
    (``search_by_positions`` with the fields ``'total_amount'``,
    ``'ht_amount'`` and ``'rate_percentage'``). If ``self.test_amount``
    rejects them, they are searched again with ``self.process`` and the
    locale regexes. After a second ``self.test_amount`` the best candidates
    are taken with ``self.return_max``, one missing value is derived from
    the two others, and the triple is accepted only when
    ``pre-tax + pre-tax * rate / 100`` equals the total (two decimals).

    Returns:
        ``[noRateAmount, allRateAmount, ratePercentage, 1]`` on success,
        otherwise ``False``.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName
    else:
        target = self.Files.jpgName

    def search_mask(field):
        # Look one field up through the supplier mask. Each field keeps its
        # OWN position; the previous code stored the total's position for
        # all three and crashed when the total itself was not found.
        found = search_by_positions(
            self.supplier, field, self.Config, self.Locale,
            self.Ocr, self.Files, target, self.typo,
        )
        if found and found[0]:
            return {
                0: re.sub(r"[^0-9\.]|\.(?!\d)", "", found[0].replace(',', '.')),
                1: found[1],
            }
        return {}

    allRateAmount = search_mask('total_amount')
    noRateAmount = search_mask('ht_amount')
    ratePercentage = search_mask('rate_percentage')

    if not self.test_amount(noRateAmount, allRateAmount, ratePercentage):
        noRateAmount = self.process(self.Locale.noRatesRegex)
        ratePercentage = self.process(self.Locale.vatRateRegex)
        allRateAmount = self.process(self.Locale.allRatesRegex)

    # Test all amounts. If some are false, try to search them with position. If not, pass
    if self.test_amount(noRateAmount, allRateAmount, ratePercentage) is False:
        return False

    # First item is the amount, second is the position.
    # NOTE(review): self.noRateAmount / allRateAmount / ratePercentage are
    # presumably filled in by test_amount - confirm.
    noRateAmount = self.return_max(self.noRateAmount)
    allRateAmount = self.return_max(self.allRateAmount)
    ratePercentage = self.return_max(self.ratePercentage)

    # Placeholder position for values that are computed rather than read.
    no_position = (('', ''), ('', ''))

    # Derive the single missing value from the two others. Values are passed
    # through float() BEFORE any arithmetic so string amounts work too.
    if noRateAmount is False and allRateAmount and ratePercentage:
        noRateAmount = [
            float("%.2f" % (float(allRateAmount[0]) / (1 + float(ratePercentage[0]) / 100))),
            no_position,
        ]
    elif allRateAmount is False and noRateAmount and ratePercentage:
        allRateAmount = [
            float("%.2f" % (float(noRateAmount[0])
                            + (float(noRateAmount[0]) * (float(ratePercentage[0]) / 100)))),
            no_position,
        ]
    elif (ratePercentage is False and noRateAmount and allRateAmount
          and float(noRateAmount[0]) != 0):
        # A zero pre-tax amount cannot yield a rate; leaving ratePercentage
        # False makes the check below return False instead of dividing by zero.
        vatAmount = float("%.2f" % (float(allRateAmount[0]) - float(noRateAmount[0])))
        ratePercentage = [
            float("%.2f" % (float(vatAmount) / float(noRateAmount[0]) * 100)),
            no_position,
        ]

    # Test if the three values are consistent with a simple math operation,
    # rounded to 2 decimals. A value that is still False raises TypeError.
    try:
        total = "%.2f" % (float(noRateAmount[0])
                          + (float(noRateAmount[0]) * float(ratePercentage[0]) / 100))
    except TypeError:
        return False

    if float(total) == float(allRateAmount[0]):
        self.Log.info('Footer informations found : [TOTAL : ' + str(total) + '] - [HT : ' + str(noRateAmount[0]) + '] - [VATRATE : ' + str(ratePercentage[0]) + ']')
        return [noRateAmount, allRateAmount, ratePercentage, 1]
    return False
def run(self):
    """Find the invoice date and, when possible, the due date.

    Order of the searches:

    1. Supplier mask (``search_by_positions``, field ``'date'``). A hit is
       returned immediately, without looking for a due date.
    2. Due date: the position stored in the ``suppliers`` table, then
       ``self.process_due_date`` on every entry of ``self.text``.
    3. Invoice date: the position stored in the ``suppliers`` table, then
       ``self.process`` on the upper-cased lines, then on the lines with
       whitespace between digits removed, then on the untouched lines.

    Returns:
        ``[date, position, page]`` on the mask path (three items, ``page``
        is ``''`` unless the mask result has exactly three items),
        ``[date, position, page, due_date]`` on every other successful path,
        or ``None`` when no invoice date was found. ``due_date`` is
        ``[res[0], res[1]]`` from ``format_date``, the ``process_due_date``
        result, or a falsy value.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName_header
    else:
        target = self.Files.jpgName_header

    date = search_by_positions(
        self.supplier, 'date', self.Config, self.Locale,
        self.Ocr, self.Files, target, self.typo,
    )
    due_date = False
    if date and date[0]:
        res = self.format_date(date[0], date[1])
        if res:
            self.date = res[0]
            self.Log.info('Date found using mask position : ' + str(res[0]))
            # NOTE(review): this path returns three items while the others
            # return four (with due_date) - confirm callers cope with both.
            if len(date) == 3:
                return [res[0], res[1], date[2]]
            return [res[0], res[1], '']

    supplier_row = None
    if self.supplier:
        rows = self.db.select({
            'select': [
                'invoice_date_position', 'invoice_date_page',
                'due_date_position', 'due_date_page',
            ],
            'table': ['suppliers'],
            'where': ['vat_number = ?'],
            'data': [self.supplier[0]],
        })
        # No row for this VAT number means no stored positions; do not fail on rows[0].
        supplier_row = rows[0] if rows else None

        if supplier_row and supplier_row['due_date_position'] not in [False, 'NULL', '', None]:
            data = {
                'position': supplier_row['due_date_position'],
                'regex': None,
                'target': 'full',
                'page': supplier_row['due_date_page'],
            }
            _text, _position = search_custom_positions(
                data, self.Ocr, self.Files, self.Locale, self.file, self.Config)
            if _text != '':
                res = self.format_date(_text, _position, True)
                if res:
                    due_date = [res[0], res[1]]
                    self.Log.info('Due date found using position : ' + str(res[0]))

    if not due_date:
        for line in self.text:
            due_date = self.process_due_date(
                re.sub(r'(\d)\s+(\d)', r'\1\2', line.content.upper()), line.position)
            if due_date:
                break

    # supplier_row is only set when a supplier is known, so no second
    # self.supplier test is needed here.
    if supplier_row and supplier_row['invoice_date_position'] not in [False, 'NULL', '', None]:
        data = {
            'position': supplier_row['invoice_date_position'],
            'regex': None,
            'target': 'full',
            'page': supplier_row['invoice_date_page'],
        }
        # Kept in its own name: the database row and the OCR position used
        # to share the variable `position`.
        text, date_position = search_custom_positions(
            data, self.Ocr, self.Files, self.Locale, self.file, self.Config)
        if text != '':
            res = self.format_date(text, date_position, True)
            if res:
                self.date = res[0]
                self.Log.info('Invoice date found using position : ' + str(res[0]))
                return [self.date, date_position, data['page'], due_date]

    for line in self.text:
        res = self.process(line.content.upper(), line.position)
        if res:
            # str() for consistency with the other log calls and to stay
            # safe if the date is not a plain string.
            self.Log.info('Invoice date found : ' + str(res[0]))
            return [res[0], res[1], self.nbPages, due_date]

    for line in self.text:
        # Digits glued together first, untouched content as a fallback.
        res = (self.process(re.sub(r'(\d)\s+(\d)', r'\1\2', line.content), line.position)
               or self.process(line.content, line.position))
        if res:
            return [res[0], res[1], self.nbPages, due_date]

    return None
def run(self, text_as_string=False):
    """Find the footer amounts: pre-tax amount, total and VAT rate.

    The values are first looked up through the supplier mask
    (``search_by_positions`` with the fields ``'ttc'``, ``'no_taxes'`` and
    ``'rate_percentage'``). If ``self.test_amount`` rejects them, they are
    searched again with ``self.process`` and the locale regexes. A missing
    pre-tax amount is derived from the total and the rate. After a second
    ``self.test_amount`` the best candidates are taken with
    ``self.return_max``, one missing value is derived from the two others
    and the result is checked arithmetically.

    When the second ``self.test_amount`` returns ``False``, the method
    retries once on an improved footer image (``line_box_builder``) and
    once more with ``text_builder`` output, using ``self.rerun`` and
    ``self.rerun_as_text`` as guards.

    Args:
        text_as_string: forwarded to ``self.process``; set to ``True`` by
            the last retry, which stores ``text_builder`` output in
            ``self.text``.

    Returns:
        ``[no_rate_amount, all_rate_amount, rate_percentage, self.nbPage,
        [vat]]`` where ``vat`` is the VAT amount formatted with two
        decimals, or ``False``.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName
    else:
        target = self.Files.jpgName

    def search_mask(field):
        # Look one field up through the supplier mask. Each field keeps its
        # OWN position; the previous code stored the total's position for
        # all three and crashed when the total itself was not found.
        found = search_by_positions(
            self.supplier, field, self.Config, self.Locale,
            self.Ocr, self.Files, target, self.typo,
        )
        if found and found[0]:
            return {
                0: re.sub(r"[^0-9\.]|\.(?!\d)", "", found[0].replace(',', '.')),
                1: found[1],
            }
        return {}

    all_rate_amount = search_mask('ttc')
    no_rate_amount = search_mask('no_taxes')
    rate_percentage = search_mask('rate_percentage')

    # Placeholder position for values that are computed rather than read.
    no_position = (('', ''), ('', ''))

    vat_amount = False
    if not self.test_amount(no_rate_amount, all_rate_amount, rate_percentage):
        no_rate_amount = self.process(self.Locale.noRatesRegex, text_as_string)
        rate_percentage = self.process(self.Locale.vatRateRegex, text_as_string)
        all_rate_amount = self.process(self.Locale.allRatesRegex, text_as_string)

    if all_rate_amount and no_rate_amount:
        vat_amount = float("%.2f" % (float(self.return_max(all_rate_amount)[0])
                                     - float(self.return_max(no_rate_amount)[0])))

    # A former branch "total and vat_amount but no pre-tax amount" was removed:
    # vat_amount is only ever set when the pre-tax amount exists, so it could
    # never run (and it subtracted a number from a formatted string).

    if all_rate_amount and rate_percentage and not no_rate_amount:
        no_rate_amount = [
            float("%.2f" % (float(self.return_max(all_rate_amount)[0])
                            / (1 + float(self.return_max(rate_percentage)[0]) / 100))),
            no_position,
        ]

    # Test all amounts. If some are false, try to search them with position. If not, pass
    if self.test_amount(no_rate_amount, all_rate_amount, rate_percentage) is not False:
        # First item is the amount, second is the position.
        # NOTE(review): self.noRateAmount / allRateAmount / ratePercentage are
        # presumably filled in by test_amount - confirm.
        no_rate_amount = self.return_max(self.noRateAmount)
        all_rate_amount = self.return_max(self.allRateAmount)
        rate_percentage = self.return_max(self.ratePercentage)

        # Derive the single missing value from the two others; the trailing
        # True is the third item the original code appended to derived values.
        # Values go through float() BEFORE any arithmetic so string amounts
        # work in every branch.
        if no_rate_amount is False and all_rate_amount and rate_percentage:
            no_rate_amount = [
                float("%.2f" % (float(all_rate_amount[0])
                                / (1 + float(rate_percentage[0]) / 100))),
                no_position,
                True,
            ]
        elif all_rate_amount is False and no_rate_amount and rate_percentage:
            all_rate_amount = [
                float("%.2f" % (float(no_rate_amount[0])
                                + (float(no_rate_amount[0]) * (float(rate_percentage[0]) / 100)))),
                no_position,
                True,
            ]
        elif (rate_percentage is False and no_rate_amount and all_rate_amount
              and float(no_rate_amount[0]) != 0):
            # A zero pre-tax amount cannot yield a rate; leaving rate_percentage
            # False makes the check below return False instead of dividing by zero.
            vat_amount = float("%.2f" % (float(all_rate_amount[0]) - float(no_rate_amount[0])))
            rate_percentage = [
                float("%.2f" % (float(vat_amount) / float(no_rate_amount[0]) * 100)),
                no_position,
                True,
            ]

        # Test if the three values are consistent with a simple math operation,
        # rounded to 2 decimals. A value that is still False raises TypeError.
        try:
            total = "%.2f" % (float(no_rate_amount[0])
                              + (float(no_rate_amount[0]) * float(rate_percentage[0]) / 100))
        except TypeError:
            return False

        # Accept either "pre-tax + rate == total" or "pre-tax + VAT amount == total".
        if (float(total) == float(all_rate_amount[0])
                or float(all_rate_amount[0]) == float(vat_amount) + float(no_rate_amount[0])):
            self.Log.info('Footer informations found : [TOTAL : ' + str(total) + '] - [HT : ' + str(no_rate_amount[0]) + '] - [VATRATE : ' + str(rate_percentage[0]) + ']')
            return [
                no_rate_amount,
                all_rate_amount,
                rate_percentage,
                self.nbPage,
                ["%.2f" % float(float(no_rate_amount[0]) * (float(rate_percentage[0]) / 100))],
            ]
        return False

    if not self.rerun:
        # First retry: OCR the footer again on an enhanced image.
        self.rerun = True
        if self.Files.isTiff == 'True':
            improved_image = self.Files.improve_image_detection(self.Files.tiffName_footer)
        else:
            improved_image = self.Files.improve_image_detection(self.Files.jpgName_footer)
        self.Files.open_img(improved_image)
        self.text = self.Ocr.line_box_builder(self.Files.img)
        return self.run()

    if self.rerun and not self.rerun_as_text:
        # Second retry: same image, OCR output taken from text_builder.
        self.rerun_as_text = True
        self.text = self.Ocr.text_builder(self.Files.img)
        return self.run(text_as_string=True)

    return False
def run(self, text_as_string=False):
    """Collect the raw footer amounts: pre-tax amount, total, VAT rate and VAT amount.

    The mask fields ``'ttc'``, ``'no_taxes'`` and ``'rate_percentage'`` are
    looked up with ``search_by_positions``. If ``self.test_amount`` rejects
    them, all four values are searched with ``self.process`` and the locale
    regexes. When the second ``self.test_amount`` does not return ``False``
    the best candidates are taken with ``self.return_max``, logged and
    returned.

    Otherwise the method retries once on an improved footer image
    (``line_box_builder``) and once more with ``text_builder`` output,
    guarded by ``self.rerun`` and ``self.rerun_as_text``.

    Args:
        text_as_string: forwarded to ``self.process``; set to ``True`` by
            the last retry.

    Returns:
        ``[no_rate_amount, all_rate_amount, rate_percentage, self.nbPage,
        vat_amount]`` or ``False``.
    """
    files = self.Files
    image = files.tiffName if files.isTiff == 'True' else files.jpgName

    def from_mask(field):
        # One supplier-mask lookup, normalised to {0: amount, 1: position}.
        hit = search_by_positions(
            self.supplier, field, self.Config, self.Locale,
            self.Ocr, self.Files, image, self.typo,
        )
        if not (hit and hit[0]):
            return {}
        cleaned = re.sub(r"[^0-9\.]|\.(?!\d)", "", hit[0].replace(',', '.'))
        return {0: cleaned, 1: hit[1]}

    all_rate_amount = from_mask('ttc')
    no_rate_amount = from_mask('no_taxes')
    rate_percentage = from_mask('rate_percentage')
    vat_amount = {}

    mask_values_ok = self.test_amount(no_rate_amount, all_rate_amount, rate_percentage, vat_amount)
    if not mask_values_ok:
        no_rate_amount = self.process(self.Locale.noRatesRegex, text_as_string)
        rate_percentage = self.process(self.Locale.vatRateRegex, text_as_string)
        all_rate_amount = self.process(self.Locale.allRatesRegex, text_as_string)
        vat_amount = self.process(self.Locale.vatAmountRegex, text_as_string)

    # Test all amounts. If some are false, try to search them with position. If not, pass
    if self.test_amount(no_rate_amount, all_rate_amount, rate_percentage, vat_amount) is False:
        if not self.rerun:
            # First retry: OCR the footer again on an enhanced image.
            self.rerun = True
            if self.Files.isTiff == 'True':
                footer_image = self.Files.improve_image_detection(self.Files.tiffName_footer)
            else:
                footer_image = self.Files.improve_image_detection(self.Files.jpgName_footer)
            self.Files.open_img(footer_image)
            self.text = self.Ocr.line_box_builder(self.Files.img)
            return self.run()

        if self.rerun and not self.rerun_as_text:
            # Second retry: same image, OCR output taken from text_builder.
            self.rerun_as_text = True
            self.text = self.Ocr.text_builder(self.Files.img)
            return self.run(text_as_string=True)

        return False

    no_rate_amount = self.return_max(self.noRateAmount)
    all_rate_amount = self.return_max(self.allRateAmount)
    rate_percentage = self.return_max(self.ratePercentage)
    vat_amount = self.return_max(self.vatAmount)

    summary = ''.join([
        'Raw footer informations found : [TOTAL : ', str(all_rate_amount[0]),
        '] - [HT : ', str(no_rate_amount[0]),
        '] - [VATRATE : ', str(rate_percentage[0]),
        '] - [VAT AMOUNT : ', str(vat_amount[0]),
        ']',
    ])
    self.Log.info(summary)
    return [no_rate_amount, all_rate_amount, rate_percentage, self.nbPage, vat_amount]