def process_with_position(self, select):
    """Read a numeric value from a position stored on the supplier row.

    The supplier row is selected by VAT number (``self.supplier[0]``). When
    the column ``select[0]`` holds a position, ``search_custom_positions`` is
    run on it and the numeric tokens of the returned text are joined and
    converted to a float.

    :param select: Column names read from the ``suppliers`` table;
        ``select[0]`` is the position column, ``select[1]`` the page column.
    :return: ``[value, position, page]`` with ``value`` a float, or ``False``
        when no position is stored, no text is returned, or the text does not
        yield a valid number.
    """
    row = self.Database.select({
        'select': select,
        'table': ['suppliers'],
        'where': ['vat_number = ?'],
        'data': [self.supplier[0]]
    })[0]
    if not row or not row[select[0]]:
        return False

    data = {
        'position': row[select[0]],
        'regex': None,
        'target': 'full',
        'page': row[select[1]]
    }
    text, position = search_custom_positions(data, self.Ocr, self.Files, self.Locale, self.file, self.Config)
    if not text:
        return False

    # Filter the result to get only the digits: every numeric token is kept,
    # stripped of inner whitespace, and the tokens are concatenated.
    tokens = re.finditer(r'[-+]?\d*[.,\s]+\d+|\d+', text)
    result = ''.join(re.sub(r'\s+', '', token.group()) for token in tokens)
    if result == '':
        return False

    try:
        value = float(result.replace(',', '.'))
    except ValueError:
        # Concatenated tokens may not form a number (for example "1.000,50"
        # becomes "1.000.50"); report "not found" instead of crashing.
        return False
    return [value, position, data['page']]
def run(self):
    """Collect the custom fields configured for the current typology.

    The field definitions come from ``retrieve_custom_positions`` using
    ``self.typo`` or, failing that, the supplier typology
    (``self.supplier[2]['typology']``). Each field is first searched with
    ``search_custom_positions``; when that yields nothing and the field's
    ``regex`` entry is not ``False``, ``self.process`` is used as a fallback.
    ``date`` fields are reduced to the last match of ``self.Locale.dateRegex``
    and ``number`` fields to their digits.

    :return: Dict mapping each field index to ``[value, position, column]``;
        empty when no typology or no field definition is available.
    """
    dataToReturn = {}
    list_of_fields = {}
    if self.typo:
        list_of_fields = retrieve_custom_positions(self.typo, self.Config)
    elif self.supplier and self.supplier[2]['typology']:
        list_of_fields = retrieve_custom_positions(self.supplier[2]['typology'], self.Config)

    if not list_of_fields:
        return dataToReturn

    for index in list_of_fields:
        field = list_of_fields[index]
        data, position = search_custom_positions(field, self.Ocr, self.Files, self.Locale, self.file, self.Config)

        if not data and field['regex'] is not False:
            # Nothing found at the stored position: fall back to self.process.
            value = self.process(field)
            if value:
                if field['type'] == 'date':
                    # Keep the last date found in the processed value, if any.
                    for date in re.finditer(r"" + self.Locale.dateRegex, value):
                        value = date.group()
                elif field['type'] == 'number':
                    # The processed value used to be overwritten with the
                    # (empty) positional ``data`` here; keep it and strip it
                    # down to digits like the positional branch does.
                    value = re.sub('[^0-9]', '', str(value))
            dataToReturn[index] = [value, position, field['column']]
        else:
            # Only post-process real text: a falsy non-string result would
            # make the ``re`` calls raise TypeError.
            if data:
                if field['type'] == 'date':
                    for date in re.finditer(r"" + self.Locale.dateRegex, data):
                        data = date.group()
                elif field['type'] == 'number':
                    data = re.sub('[^0-9]', '', data)
            dataToReturn[index] = [data, position, field['column']]
    return dataToReturn
def run(self):
    """Find the invoice number of the current document.

    Search order:

    1. ``search_by_positions`` on the header image (TIFF or JPG name taken
       from ``self.Files``);
    2. ``self.Locale.invoiceRegex`` on every line of ``self.text``;
    3. the ``invoice_number_position`` stored on the supplier row, when a
       supplier is known and ``self.customPage`` is falsy.

    :return: The list returned by ``search_by_positions`` when its first
        element is truthy, otherwise ``[number, position, page]``, or
        ``False`` when nothing is found.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName_header
    else:
        target = self.Files.jpgName_header

    invoiceNumber = search_by_positions(self.supplier, 'invoice', self.Config, self.Locale, self.Ocr,
                                        self.Files, target, self.typo)
    if invoiceNumber and invoiceNumber[0]:
        return invoiceNumber

    for line in self.text:
        for _invoice in re.finditer(r"" + self.Locale.invoiceRegex + "", line.content.upper()):
            # Delete the invoice keyword. The pattern is the invoice regex
            # minus its last two characters (presumably a trailing ".*" --
            # TODO confirm against the locale file).
            tmpInvoiceNumber = re.sub(r"" + self.Locale.invoiceRegex[:-2] + "", '', _invoice.group())
            invoiceNumber = tmpInvoiceNumber.lstrip().split(' ')[0]
            if len(invoiceNumber) > int(self.Locale.invoiceSizeMin):
                self.Log.info('Invoice number found : ' + invoiceNumber)
                return [invoiceNumber, line.position, self.nbPages]

    if not self.supplier or self.customPage:
        return False

    self.Log.info('Invoice number not found. Searching invoice number using position in database')
    row = self.Database.select({
        'select': ['invoice_number_position', 'invoice_number_page'],
        'table': ['suppliers'],
        'where': ['vat_number = ?'],
        'data': [self.supplier[0]]
    })[0]
    if not row or not row['invoice_number_position']:
        return False

    data = {
        'position': row['invoice_number_position'],
        'regex': None,
        'target': 'full',
        'page': row['invoice_number_page']
    }
    text, position = search_custom_positions(data, self.Ocr, self.Files, self.Locale, self.file, self.Config)
    # A falsy result (empty string, or a non-string such as False) means
    # nothing was read; str() keeps the log call safe for non-string values.
    if not text:
        return False
    self.Log.info('Invoice number found with position : ' + str(text))
    return [text, position, data['page']]
def run(self):
    """Find the invoice number of the current document.

    Search order:

    1. ``search_by_positions`` on the header image (TIFF or JPG name taken
       from ``self.Files``);
    2. the ``invoice_number_position`` stored on the supplier row, when a
       supplier is known and ``self.customPage`` is falsy;
    3. ``self.Locale.invoiceRegex`` on every line of ``self.text``;
    4. the same regex on every line of ``self.footer_text``, with the
       position converted by ``self.Files.return_position_with_ratio``.

    :return: The list returned by ``search_by_positions`` when its first
        element is truthy, otherwise ``[number, position, page]``, or
        ``None`` when nothing is found.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName_header
    else:
        target = self.Files.jpgName_header

    invoice_number = search_by_positions(self.supplier, 'invoice', self.Config, self.Locale, self.Ocr,
                                         self.Files, target, self.typo)
    if invoice_number and invoice_number[0]:
        return invoice_number

    if self.supplier and not self.customPage:
        row = self.Database.select({
            'select': ['invoice_number_position', 'invoice_number_page'],
            'table': ['suppliers'],
            'where': ['vat_number = ?'],
            'data': [self.supplier[0]]
        })[0]
        if row and row['invoice_number_position'] not in [False, 'NULL', '', None]:
            data = {
                'position': row['invoice_number_position'],
                'regex': None,
                'target': 'full',
                'page': row['invoice_number_page']
            }
            text, position = search_custom_positions(data, self.Ocr, self.Files, self.Locale, self.file,
                                                     self.Config)
            # Only a non-empty result counts as found; a falsy non-string
            # (e.g. False) must fall through to the regex search below.
            if text:
                self.Log.info('Invoice number found with position : ' + str(text))
                return [text, position, data['page']]

    def _scan(lines, in_footer):
        # Return [number, position, page] for the first line holding an
        # invoice number that is long enough, or None.
        for line in lines:
            for _invoice in re.finditer(r"" + self.Locale.invoiceRegex + "", line.content.upper()):
                invoice_res = _invoice.group()
                # If the regex return a date, remove it. Every date is
                # removed from the running result, not only the last one.
                for _date in re.finditer(r"" + self.Locale.dateRegex + "", _invoice.group()):
                    if _date.group():
                        invoice_res = invoice_res.replace(_date.group(), '')

                # Delete the invoice keyword. The pattern is the invoice
                # regex minus its last two characters (presumably a trailing
                # ".*" -- TODO confirm against the locale file).
                tmp_invoice_number = re.sub(r"" + self.Locale.invoiceRegex[:-2] + "", '', invoice_res)
                number = tmp_invoice_number.lstrip().split(' ')[0]
                if len(number) >= int(self.Locale.invoiceSizeMin):
                    self.Log.info('Invoice number found : ' + number)
                    if in_footer:
                        position = self.Files.return_position_with_ratio(line, 'footer')
                    else:
                        position = line.position
                    return [number, position, self.nbPages]
        return None

    found = _scan(self.text, False)
    if found:
        return found
    return _scan(self.footer_text, True)
def run(self):
    """Find the delivery number of the current document.

    Search order:

    1. the ``delivery_number_1_position`` stored on the supplier row (page
       fixed to ``'1'``), when a supplier is known and ``self.customPage`` is
       falsy. The text read there is matched against
       ``self.Locale.deliveryNumberRegex``; if no number can be extracted the
       raw text is returned;
    2. ``self.Locale.deliveryNumberRegex`` on every line of ``self.text``.

    :return: ``[number, position, page]`` or ``False`` when nothing is found.
    """
    def _extract(match_text):
        # Strip every date and the delivery keyword from a regex match and
        # keep the first space-separated token.
        cleaned = match_text
        # If the regex return a date, remove it. Every date is removed from
        # the running result, not only the last one.
        for _date in re.finditer(r"" + self.Locale.dateRegex + "", match_text):
            if _date.group():
                cleaned = cleaned.replace(_date.group(), '')
        # Delete the delivery number keyword. The pattern is the delivery
        # regex minus its last two characters (presumably a trailing ".*" --
        # TODO confirm against the locale file).
        cleaned = re.sub(r"" + self.Locale.deliveryNumberRegex[:-2] + "", '', cleaned)
        return cleaned.lstrip().split(' ')[0]

    if self.supplier and not self.customPage:
        row = self.Database.select({
            'select': ['delivery_number_1_position'],
            'table': ['suppliers'],
            'where': ['vat_number = ?'],
            'data': [self.supplier[0]]
        })[0]
        if row and row['delivery_number_1_position'] not in [False, 'NULL', '', None]:
            data = {
                'position': row['delivery_number_1_position'],
                'regex': None,
                'target': 'full',
                'page': '1'
            }
            text, position = search_custom_positions(data, self.Ocr, self.Files, self.Locale, self.file,
                                                     self.Config)
            # Skip empty or non-string falsy results; str() is applied before
            # upper() so a non-string value cannot raise AttributeError.
            if text:
                for _delivery in re.finditer(r"" + self.Locale.deliveryNumberRegex + "", str(text).upper()):
                    delivery_number = _extract(_delivery.group())
                    if delivery_number != '':
                        self.Log.info('Delivery number found with position : ' + str(delivery_number))
                        return [delivery_number, position, data['page']]
                # No keyword matched: return the text read at the position.
                self.Log.info('Delivery number found with position : ' + str(text))
                return [text, position, data['page']]

    for line in self.text:
        for _delivery in re.finditer(r"" + self.Locale.deliveryNumberRegex + "", line.content.upper()):
            delivery_number = _extract(_delivery.group())
            # NOTE(review): the minimum length reuses the invoice setting.
            if len(delivery_number) >= int(self.Locale.invoiceSizeMin):
                self.Log.info('Delivery number found : ' + delivery_number)
                position = line.position
                if self.target != 'header':
                    position = self.Files.return_position_with_ratio(line, self.target)
                return [delivery_number, position, self.nbPages]
    return False
def run(self):
    """Find the invoice date and, when possible, the due date.

    Search order for the invoice date:

    1. ``search_by_positions`` on the header image, formatted with
       ``self.format_date``;
    2. the ``invoice_date_position`` stored on the supplier row;
    3. ``self.process`` on every upper-cased line of ``self.text``;
    4. ``self.process`` on every line with whitespace between digits removed,
       then on the untouched line.

    The due date is looked up before steps 2-4: first from the supplier's
    ``due_date_position``, then with ``self.process_due_date`` on every line.
    ``self.date`` is set when the date comes from step 1 or 2.

    :return: ``[date, position, page]`` for step 1 (``page`` is ``''`` when
        ``search_by_positions`` did not return three items),
        ``[date, position, page, due_date]`` for steps 2-4 (``due_date`` is
        ``False`` or the value found), or ``None`` when no date is found.
    """
    if self.Files.isTiff == 'True':
        target = self.Files.tiffName_header
    else:
        target = self.Files.jpgName_header

    date = search_by_positions(self.supplier, 'date', self.Config, self.Locale, self.Ocr, self.Files, target,
                               self.typo)
    due_date = False
    if date and date[0]:
        res = self.format_date(date[0], date[1])
        if res:
            self.date = res[0]
            self.Log.info('Date found using mask position : ' + str(res[0]))
            page = date[2] if len(date) == 3 else ''
            return [res[0], res[1], page]

    position = None
    if self.supplier:
        position = self.db.select({
            'select': ['invoice_date_position', 'invoice_date_page', 'due_date_position', 'due_date_page'],
            'table': ['suppliers'],
            'where': ['vat_number = ?'],
            'data': [self.supplier[0]]
        })[0]
        if position and position['due_date_position'] not in [False, 'NULL', '', None]:
            data = {
                'position': position['due_date_position'],
                'regex': None,
                'target': 'full',
                'page': position['due_date_page']
            }
            _text, _position = search_custom_positions(data, self.Ocr, self.Files, self.Locale, self.file,
                                                       self.Config)
            # Only forward real text to format_date; a falsy result (empty
            # string or a non-string such as False) means nothing was read.
            if _text:
                res = self.format_date(_text, _position, True)
                if res:
                    due_date = [res[0], res[1]]
                    self.Log.info('Due date found using position : ' + str(res[0]))

    if not due_date:
        for line in self.text:
            # Whitespace between two digits is removed before matching.
            due_date = self.process_due_date(re.sub(r'(\d)\s+(\d)', r'\1\2', line.content.upper()),
                                             line.position)
            if due_date:
                break

    if self.supplier:
        if position and position['invoice_date_position'] not in [False, 'NULL', '', None]:
            data = {
                'position': position['invoice_date_position'],
                'regex': None,
                'target': 'full',
                'page': position['invoice_date_page']
            }
            text, position = search_custom_positions(data, self.Ocr, self.Files, self.Locale, self.file,
                                                     self.Config)
            if text:
                res = self.format_date(text, position, True)
                if res:
                    self.date = res[0]
                    self.Log.info('Invoice date found using position : ' + str(res[0]))
                    return [self.date, position, data['page'], due_date]

    for line in self.text:
        res = self.process(line.content.upper(), line.position)
        if res:
            # str() matches the other log calls of this method and keeps the
            # concatenation safe if the date is not a plain string.
            self.Log.info('Invoice date found : ' + str(res[0]))
            return [res[0], res[1], self.nbPages, due_date]

    for line in self.text:
        # Last resort: original case, first with digit groups glued
        # together, then the untouched line.
        res = self.process(re.sub(r'(\d)\s+(\d)', r'\1\2', line.content), line.position)
        if not res:
            res = self.process(line.content, line.position)
        if res:
            return [res[0], res[1], self.nbPages, due_date]
def process_footer_with_position(self, select):
    """Read a footer amount or VAT rate from a position stored on the supplier.

    The supplier row is selected by VAT number (``self.supplier[0]``). When
    the column ``select[0]`` holds a usable position,
    ``search_custom_positions`` is run on it and the returned text is
    normalised to a numeric string. ``self.nbPage`` is updated with the page
    used when a value is returned.

    :param select: Column names read from the ``suppliers`` table;
        ``select[0]`` is the position column, ``select[1]`` the page column.
        ``'vat_1_position'`` as ``select[0]`` switches on VAT-rate handling.
    :return: ``[value, position, page]`` with ``value`` a string, or ``False``
        when no position is stored or no number can be extracted.
    """
    row = self.Database.select({
        'select': select,
        'table': ['suppliers'],
        'where': ['vat_number = ?'],
        'data': [self.supplier[0]]
    })[0]
    if not row or row[select[0]] in ['((,),(,))', 'NULL', None, '', False]:
        return False

    page = row[select[1]]
    if self.target == 'full':
        page = self.nbPage
    data = {
        'position': row[select[0]],
        'regex': None,
        'target': 'full',
        'page': page
    }
    text, position = search_custom_positions(data, self.Ocr, self.Files, self.Locale, self.file, self.Config)
    if not text:
        return False

    try:
        # Try if the return string could be convert to float
        float(text)
        result = text
        if select[0] == 'vat_1_position':
            # Fix if we retrieve 2000.0, or 200.0 instead of 20.0 for example
            # NOTE(review): eval() runs a value read from the locale
            # configuration; ast.literal_eval would be safer if that value is
            # always a plain literal -- confirm before changing.
            tva_amounts = eval(self.Locale.vatRateList)
            _split = result.split('.')
            # A value without a decimal point (e.g. "20") has no second part;
            # indexing it blindly raised an uncaught IndexError.
            if len(_split) > 1 and _split[1] == '0':
                result = _split[0]
            for tva in tva_amounts:
                if str(tva) in str(result.replace(',', '.')):
                    result = str(tva)
                    break
    except (ValueError, SyntaxError, TypeError):
        # If results isn't a float, transform it: keep the numeric tokens only
        result = ''.join(token.group() for token in re.finditer(r'[-+]?\d*[.,]+\d+([.,]+\d+)?|\d+', text))
        if select[0] != 'vat_1_position':
            try:
                normalized = result.replace(' ', '.')
                normalized = normalized.replace('\x0c', '')
                normalized = normalized.replace('\n', '')
                normalized = normalized.replace(',', '.')
                splitted_number = normalized.split('.')
                if len(splitted_number) > 1:
                    last_index = splitted_number[-1]
                    if len(last_index) > 2:
                        # More than two trailing digits: every separator is
                        # treated as a grouping mark and dropped.
                        result = normalized.replace('.', '')
                    else:
                        # Otherwise the last group is the decimal part.
                        result = ''.join(splitted_number[:-1]) + '.' + last_index
                        result = str(float(result))
            except (ValueError, SyntaxError, TypeError):
                # Best effort: keep whatever was assembled so far.
                pass

    if result == '':
        return False
    result = re.sub(r'\s+', '', result).replace(',', '.')
    self.nbPage = data['page']
    return [result, position, data['page']]