def botsunzip(ta_from, endstatus, password=None, pass_non_zip=False, **argv):
    ''' unzip file; editype & messagetype are unchanged.

        For every member of the zip archive (directories skipped) a new
        transaction is created and the member's content is written to a file
        named after the new transaction's idta.

        Parameters:
        - ta_from: transaction pointing to the incoming file.
        - endstatus: status for the new transaction(s).
        - password: optional zip password.
        - pass_non_zip: if True, a non-zip file is passed through unchanged
          instead of raising an error.
        Raises botslib.InMessageError when the file is not a zip-file and
        pass_non_zip is False.
    '''
    try:
        z = zipfile.ZipFile(botslib.abspathdata(filename=ta_from.filename), mode='r')
    except zipfile.BadZipfile:
        botsglobal.logger.debug(_(u'File is not a zip-file.'))
        if pass_non_zip:        #just pass the file
            botsglobal.logger.debug(_(u'"pass_non_zip" is True, just pass the file.'))
            # copyta creates the pass-through transaction as a side effect
            ta_to = ta_from.copyta(status=endstatus, statust=OK)
            return
        raise botslib.InMessageError(_(u'File is not a zip-file.'))
    try:
        if password:
            z.setpassword(password)
        for f in z.infolist():
            if f.filename.endswith('/'):    #check if this is a dir; if so continue
                continue
            ta_to = ta_from.copyta(status=endstatus)
            tofilename = str(ta_to.idta)
            tofile = botslib.opendata(tofilename, 'wb')
            try:
                tofile.write(z.read(f.filename))
            finally:
                # close the member's output file even if read/write fails
                tofile.close()
            ta_to.update(statust=OK, filename=tofilename)  #update outmessage transaction with ta_info;
            botsglobal.logger.debug(_(u'        File written: "%s".'), tofilename)
    finally:
        # original code leaked the ZipFile handle; always release it
        z.close()
def receive_layout(self, ltpage):
    # Convert one pdfminer layout page into csv rows: each visual line of
    # text becomes one row, each horizontal cluster of text one field.
    # NOTE(review): relies on names from an enclosing scope — x_group,
    # y_group (grouping tolerances) and csvout (csv.writer); presumably
    # this is a method of a TextConverter subclass defined inside a
    # function that provides them — confirm against the surrounding code.
    # recursively get every text element and it's coordinates
    def render(item):
        if isinstance(item, LTContainer):
            for child in item:
                render(child)
        elif isinstance(item, LTText):
            (_, _, x, y) = item.bbox
            # group the y values (rows) within group tolerance
            for v in yv:
                if y > v - y_group and y < v + y_group:
                    y = v
            # NOTE(review): y is appended even when it was snapped to an
            # existing value, so yv accumulates duplicates.
            yv.append(y)
            # negate y so that ascending sort order = top-to-bottom page order
            line = lines[int(-y)]
            line[x] = item.get_text().encode('utf-8')
    from collections import defaultdict
    lines = defaultdict(lambda: {})  # row key -> {x coordinate: text}
    yv = []  # collected y values for row grouping
    render(ltpage)
    lineid = 0
    for y in sorted(lines.keys()):
        line = lines[y]
        lineid += 1
        csvdata = [ltpage.pageid, lineid]  # first 2 columns are page and line numbers
        # group the x values (fields) within group tolerance
        p = 0
        field_txt = ''
        for x in sorted(line.keys()):
            gap = x - p
            if p > 0 and gap > x_group:
                # gap too large: start a new csv field
                csvdata.append(field_txt)
                field_txt = ''
            field_txt += line[x]
            p = x
        csvdata.append(field_txt)
        csvout.writerow(csvdata)
    if lineid == 0:
        # nothing extracted at all: page probably contains only images
        raise botslib.InMessageError(_(u'PDF text extraction failed, it may contain just image(s)?'))
def mailbag(ta_from, endstatus, **argv):
    ''' 2 main functions:
        -   recognizes and distuinguishes several edi types: x12 edifact tradacoms xml ('mailbag' in, correct editype out)
        -   split up interchanges (edifact, x12, tradacoms)
        details:
        -   edifact, x12 and tradacoms can be can be mixed,
        -   recognizes xml files; but messagetype 'xml' has a special handling when reading xml-files: xpath identifiers are used.
            this is needed for using xml in mailbag!
        -   when more interchanges in one file: strongly recommended to mailbag/split these.
        -   handle multiple UNA in one file, including different charsets.
        -   handle multiple x12 seperators in one file.
    '''
    edifile = botslib.readdata(filename=ta_from.filename)  #read as binary...
    startpos = 0
    while (1):
        # HEADER is a module-level regex (not visible in this chunk) matching the
        # next envelope header; group numbering below — TODO confirm against its definition.
        found = HEADER.search(edifile[startpos:])
        if found is None:
            if startpos:  #ISA/UNB have been found in file; no new ISA/UNB is found. So all processing is done.
                break
            #guess if this is an xml file.....
            sniffxml = edifile[:25]
            sniffxml = sniffxml.lstrip(' \t\n\r\f\v\xFF\xFE\xEF\xBB\xBF\x00')  #to find first ' real' data; some char are because of BOM, UTF-16 etc
            if sniffxml and sniffxml[0] == '<':
                # xml file: pass the whole file on unchanged, marked editype 'xml'
                ta_to = ta_from.copyta(status=endstatus, statust=OK, filename=ta_from.filename, editype='xml', messagetype='mailbag')  #make transaction for translated message; gets ta_info of ta_frommes
                #~ ta_tomes.update(status=STATUSTMP,statust=OK,filename=ta_set_for_processing.filename,editype='xml')    #update outmessage transaction with ta_info;
                break
            else:
                raise botslib.InMessageError(_(u'Found no content in mailbag.'))
        elif found.group(1):
            # x12 interchange (ISA). ISA is fixed-length: element separator at
            # position 4, segment terminator at position 106 (cr/lf skipped,
            # except at position 105 where it may itself be the terminator).
            editype = 'x12'
            headpos = startpos + found.start(2)
            count = 0
            for char in edifile[headpos:headpos + 120]:  #search first 120 characters to find separators
                if char in '\r\n' and count != 105:
                    continue
                count += 1
                if count == 4:
                    field_sep = char
                elif count == 106:
                    record_sep = char
                    break
            # NOTE(review): if the file is truncated before position 106,
            # record_sep stays unbound and the re.search below raises NameError.
            #~ foundtrailer = re.search(re.escape(record_sep)+'\s*IEA'+re.escape(field_sep)+'.+?'+re.escape(record_sep),edifile[headpos:],re.DOTALL)
            foundtrailer = re.search(re.escape(record_sep) + '\s*I\s*E\s*A\s*' + re.escape(field_sep) + '.+?' + re.escape(record_sep), edifile[headpos:], re.DOTALL)
        elif found.group(3):
            # edifact interchange (optional UNA + UNB)
            editype = 'edifact'
            if found.group(4):
                # UNA present: separators are at fixed offsets within UNA
                field_sep = edifile[startpos + found.start(4) + 4]
                record_sep = edifile[startpos + found.start(4) + 8]
                headpos = startpos + found.start(4)
            else:
                # no UNA: edifact default separators
                field_sep = '+'
                record_sep = "'"
                headpos = startpos + found.start(5)
            foundtrailer = re.search(re.escape(record_sep) + '\s*U\s*N\s*Z\s*' + re.escape(field_sep) + '.+?' + re.escape(record_sep), edifile[headpos:], re.DOTALL)
        elif found.group(8):
            # tradacoms interchange (STX)
            editype = 'tradacoms'
            headpos = startpos + found.start(9)
            field_sep = '='  #the tradacoms 'after-segment-tag-seperator'
            record_sep = "'"
            foundtrailer = re.search(re.escape(record_sep) + '\s*E\s*N\s*D\s*' + re.escape(field_sep) + '.+?' + re.escape(record_sep), edifile[headpos:], re.DOTALL)
        if not foundtrailer:
            raise botslib.InMessageError(_(u'Found no valid envelope trailer in mailbag.'))
        endpos = headpos + foundtrailer.end()
        #so: interchange is from headerpos untill endpos
        #~ if HEADER.search(edifile[headpos+25:endpos]):    #check if there is another header in the interchange
        #~ raise botslib.InMessageError(u'Error in mailbag format: found no valid envelope trailer.')
        # write the single interchange to its own file, named after the new idta
        ta_to = ta_from.copyta(status=endstatus)  #make transaction for translated message; gets ta_info of ta_frommes
        tofilename = str(ta_to.idta)
        tofile = botslib.opendata(tofilename, 'wb')
        tofile.write(edifile[headpos:endpos])
        tofile.close()
        ta_to.update(statust=OK, filename=tofilename, editype=editype, messagetype=editype)  #update outmessage transaction with ta_info;
        startpos = endpos  # continue scanning after this interchange
        botsglobal.logger.debug(_(u'        File written: "%s".'), tofilename)
def extractexcel(ta_from, endstatus, **argv):
    ''' extract excel file. editype & messagetype are unchanged.

        Reads the first sheet of the incoming excel file and writes it out as
        csv (one transaction/file). Optional **argv parameters control the csv
        format: charset, quotechar, field_sep, escape; defaults are as the csv
        defaults (in grammar.py).
        Raises botslib.InMessageError when the file can not be read as excel.
    '''
    #***functions used by extractexcel
    #-------------------------------------------------------------------------------
    def read_xls(infilename):
        # Read excel first sheet into a 2-d array (list of rows).
        book = xlrd.open_workbook(infilename)
        sheet = book.sheet_by_index(0)
        xlsdata = []
        for rownr in range(sheet.nrows):
            # pair each cell's xlrd type code with its raw value and clean it up
            # (original used a python2-only tuple-parameter lambda here)
            xlsdata.append([format_excelval(book, celltype, cellvalue, False)
                            for (celltype, cellvalue) in zip(sheet.row_types(rownr), sheet.row_values(rownr))])
        return xlsdata
    #-------------------------------------------------------------------------------
    def dump_csv(xlsdata, tofilename):
        # Write the 2-d array as csv using the csv-format parameters from **argv.
        stream = botslib.opendata(tofilename, 'wb')
        try:  # ensure the output file is closed even if writing fails
            csvout = csv.writer(stream, quotechar=quotechar, delimiter=field_sep,
                                doublequote=doublequote, escapechar=escape)
            csvout.writerows(map(utf8ize, xlsdata))
        finally:
            stream.close()
    #-------------------------------------------------------------------------------
    def format_excelval(book, celltype, value, wanttupledate):
        # Clean up the incoming excel data for some data types.
        # celltype is an xlrd cell type code: 2=number, 3=date, 5=error.
        if celltype == 2:
            if value == int(value):  # show integral floats without trailing '.0'
                value = int(value)
        elif celltype == 3:
            datetuple = xlrd.xldate_as_tuple(value, book.datemode)
            value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
        elif celltype == 5:
            value = xlrd.error_text_from_code[value]
        return value
    #-------------------------------------------------------------------------------
    def tupledate_to_isodate(tupledate):
        # Turns a gregorian (year, month, day, hour, minute, nearest_second) into a
        # standard YYYY-MM-DDTHH:MM:SS ISO date.
        (y, m, d, hh, mm, ss) = tupledate
        # any() instead of the original filter(): same result on python2, and
        # filter() truthiness would be wrong on python3
        date = "%04d-%02d-%02d" % (y, m, d) if any(n != 0 for n in (y, m, d)) else ''
        time = "T%02d:%02d:%02d" % (hh, mm, ss) if any(n != 0 for n in (hh, mm, ss)) or not date else ''
        return date + time
    #-------------------------------------------------------------------------------
    def utf8ize(l):
        # Make string-like things into utf-8, leave other things alone
        return [unicode(s).encode(charset) if hasattr(s, 'encode') else s for s in l]
    #***end functions used by extractexcel
    import xlrd
    import csv
    #get parameters for csv-format; defaults are as the csv defaults (in grammar.py)
    charset = argv.get('charset', "utf-8")
    quotechar = argv.get('quotechar', "'")
    field_sep = argv.get('field_sep', ':')
    escape = argv.get('escape', '')
    # csv convention: quote-doubling only when no explicit escape char is given
    doublequote = not escape
    try:
        infilename = botslib.abspathdata(ta_from.filename)
        xlsdata = read_xls(infilename)
        ta_to = ta_from.copyta(status=endstatus)
        tofilename = str(ta_to.idta)
        dump_csv(xlsdata, tofilename)
        ta_to.update(statust=OK, filename=tofilename)  #update outmessage transaction with ta_info;
        botsglobal.logger.debug(_(u'        File written: "%s".'), tofilename)
    except:
        # broad except is deliberate here: any failure is reported uniformly as
        # 'probably not an excel file', with the real traceback attached
        txt = botslib.txtexc()
        botsglobal.logger.error(_(u'Excel extraction failed, may not be an Excel file? Error:\n%s'), txt)
        raise botslib.InMessageError(_(u'Excel extraction failed, may not be an Excel file? Error:\n$error'), error=txt)
def extractpdf(ta_from, endstatus, **argv):
    ''' Try to extract text content of a PDF file to a csv.
        You know this is not a great idea, right? But we'll do the best we can anyway!
        Page and line numbers are added to each row.
        Columns and rows are based on the x and y coordinates of each text element within tolerance allowed.
        Multiple text elements may combine to make one field, some PDFs have every character separated!
        You may need to experiment with x_group and y_group values, but defaults seem ok for most files.
        Output csv is UTF-8 encoded - The csv module doesn't directly support reading and writing Unicode
        If the PDF is just an image, all bets are off. Maybe try OCR, good luck with that!
        Mike Griffin 14/12/2011
    '''
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams, LTContainer, LTText, LTTextBox
    import csv

    class CsvConverter(TextConverter):
        # pdfminer converter that writes one csv row per visual text line,
        # using the x_group/y_group/csvout names from the enclosing function.
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def receive_layout(self, ltpage):
            # recursively get every text element and it's coordinates
            def render(item):
                if isinstance(item, LTContainer):
                    for child in item:
                        render(child)
                elif isinstance(item, LTText):
                    (_, _, x, y) = item.bbox
                    # group the y values (rows) within group tolerance
                    for v in yv:
                        if y > v - y_group and y < v + y_group:
                            y = v
                    # NOTE(review): appended even when snapped to an existing
                    # value, so yv accumulates duplicates.
                    yv.append(y)
                    # negate y so ascending sort = top-to-bottom page order
                    line = lines[int(-y)]
                    line[x] = item.get_text().encode('utf-8')
            from collections import defaultdict
            lines = defaultdict(lambda: {})  # row key -> {x coordinate: text}
            yv = []  # collected y values for row grouping
            render(ltpage)
            lineid = 0
            for y in sorted(lines.keys()):
                line = lines[y]
                lineid += 1
                csvdata = [ltpage.pageid, lineid]  # first 2 columns are page and line numbers
                # group the x values (fields) within group tolerance
                p = 0
                field_txt = ''
                for x in sorted(line.keys()):
                    gap = x - p
                    if p > 0 and gap > x_group:
                        # gap too large: start a new csv field
                        csvdata.append(field_txt)
                        field_txt = ''
                    field_txt += line[x]
                    p = x
                csvdata.append(field_txt)
                csvout.writerow(csvdata)
            if lineid == 0:
                # nothing extracted at all: probably an image-only page
                raise botslib.InMessageError(_(u'PDF text extraction failed, it may contain just image(s)?'))
    #get some optional parameters
    x_group = argv.get('x_group', 10)  # group text closer than this as one field
    y_group = argv.get('y_group', 5)  # group lines closer than this as one line
    password = argv.get('password', '')
    quotechar = argv.get('quotechar', '"')
    field_sep = argv.get('field_sep', ',')
    escape = argv.get('escape', '\\')
    charset = argv.get('charset', 'utf-8')
    # quote-doubling only when no explicit escape char is given
    if not escape:
        doublequote = True
    else:
        doublequote = False
    try:
        pdf_stream = botslib.opendata(ta_from.filename, 'rb')
        ta_to = ta_from.copyta(status=endstatus)
        tofilename = str(ta_to.idta)
        csv_stream = botslib.opendata(tofilename, 'wb')
        csvout = csv.writer(csv_stream, quotechar=quotechar, delimiter=field_sep,
                            doublequote=doublequote, escapechar=escape)
        # Process PDF
        rsrcmgr = PDFResourceManager(caching=True)
        device = CsvConverter(rsrcmgr, csv_stream, codec=charset)
        process_pdf(rsrcmgr, device, pdf_stream, pagenos=set(),
                    password=password, caching=True, check_extractable=True)
        # NOTE(review): on an exception above these streams are not closed
        # before re-raising; they leak until garbage collection.
        device.close()
        pdf_stream.close()
        csv_stream.close()
        ta_to.update(statust=OK, filename=tofilename)  #update outmessage transaction with ta_info;
        botsglobal.logger.debug(_(u'        File written: "%s".'), tofilename)
    except:
        # broad except is deliberate: any failure is reported uniformly as
        # 'probably not a PDF file', with the real traceback attached
        txt = botslib.txtexc()
        botsglobal.logger.error(_(u'PDF extraction failed, may not be a PDF file? Error:\n%s'), txt)
        raise botslib.InMessageError(_(u'PDF extraction failed, may not be a PDF file? Error:\n$error'), error=txt)
def mailbag(ta_from,endstatus,frommessagetype,**argv):
    ''' 2 main functions:
        -   recognizes and distuinguishes several edi types: x12 edifact tradacoms ('mailbag' in, correct editype out)
        -   split up interchanges (edifact, x12, tradacoms)
        details:
        -   edifact, x12 and tradacoms can be can be mixed
        -   handle multiple UNA in one file, including different charsets.
        -   handle multiple ISA's with different separators in one file
        in bots > 3.0.0 all mailbag, edifact, x12 and tradacoms go via mailbag.
    '''
    edifile = botslib.readdata(filename=ta_from.filename)
    startpos = 0
    nr_interchanges = 0
    while (1):
        # HEADER is a module-level regex with named groups (x12, edifact, UNA,
        # UNAstring, field_sep, tradacoms, STX) — defined elsewhere in the file.
        found = HEADER.match(edifile[startpos:])
        if found is None:
            # no further interchange header; decide between 'done', 'xml passthrough' and error
            if edifile[startpos:].strip(string.whitespace+'\x1A\x00'):  #there is content...but not valid
                if nr_interchanges:  #found interchanges, but remainder is not valid
                    raise botslib.InMessageError(_(u'[M50]: Found data not in a valid interchange at position %(pos)s.'),{'pos':startpos})
                else:  #no interchanges found, content is not a valid edifact/x12/tradacoms interchange
                    if frommessagetype == 'mailbag':  #if indicated 'mailbag': guess if this is an xml file.....
                        sniffxml = edifile[:25]
                        sniffxml = sniffxml.lstrip(' \t\n\r\f\v\xFF\xFE\xEF\xBB\xBF\x00')  #to find first ' real' data; some char are because of BOM, UTF-16 etc
                        if sniffxml and sniffxml[0] == '<':  #is a xml file; inmessage.py can determine the right xml messagetype via xpath.
                            filesize = len(edifile)
                            ta_to = ta_from.copyta(status=endstatus,statust=OK,filename=ta_from.filename,editype='xml',messagetype='mailbag',filesize=filesize)
                            return
                    raise botslib.InMessageError(_(u'[M51]: Edi file does not start with a valid interchange.'))
            else:  #no parseble content
                if nr_interchanges:  #OK: there are interchanges, but no new interchange is found.
                    return
                else:  #no edifact/x12/tradacoms envelope at all
                    raise botslib.InMessageError(_(u'[M52]: Edi file contains only whitespace.'))
        elif found.group('x12'):
            editype = 'x12'
            headpos = startpos + found.start('x12')
            #determine field_sep and record_sep
            # ISA is fixed-length: element separator at position 4, segment
            # terminator at position 106; cr/lf are skipped except at position
            # 105 where the terminator itself may be cr/lf.
            count = 0
            for char in edifile[headpos:headpos+120]:  #search first 120 characters to determine separators
                if char in '\r\n' and count != 105:
                    continue
                count += 1
                if count == 4:
                    field_sep = char
                elif count in [7,18,21,32,35,51,54,70]:  #extra checks for fixed ISA.
                    # these ISA positions must also contain the element separator
                    if char != field_sep:
                        raise botslib.InMessageError(_(u'[M53]: Non-valid ISA header at position %(pos)s; position %(pos_element)s of ISA is "%(foundchar)s", expect here element separator "%(field_sep)s".'), {'pos':headpos,'pos_element':unicode(count),'foundchar':char,'field_sep':field_sep})
                elif count == 106:
                    record_sep = char
                    break
            # find IEA trailer; allow cr/lf inside the tag and whitespace between segments
            foundtrailer = re.search('''%(record_sep)s
                                        \s*
                                        I[\n\r]*E[\n\r]*A
                                        .+?
                                        %(record_sep)s
                                        '''%{'record_sep':re.escape(record_sep)},
                                        edifile[headpos:],re.DOTALL|re.VERBOSE)
            if not foundtrailer:
                # distinguish 'IEA present but unterminated' from 'no IEA at all'
                foundtrailer2 = re.search('''%(record_sep)s
                                             \s*
                                             I[\n\r]*E[\n\r]*A
                                             '''%{'record_sep':re.escape(record_sep)},
                                             edifile[headpos:],re.DOTALL|re.VERBOSE)
                if foundtrailer2:
                    raise botslib.InMessageError(_(u'[M60]: Found no segment terminator for IEA trailer at position %(pos)s.'),{'pos':foundtrailer2.start()})
                else:
                    raise botslib.InMessageError(_(u'[M54]: Found no valid IEA trailer for the ISA header at position %(pos)s.'),{'pos':headpos})
        elif found.group('edifact'):
            editype = 'edifact'
            headpos = startpos + found.start('edifact')
            #parse UNA. valid UNA:  UNA:+.? '
            if found.group('UNA'):
                # separators are at fixed positions within the UNA string (cr/lf skipped)
                count = 0
                for char in found.group('UNAstring'):
                    if char in '\r\n':
                        continue
                    count += 1
                    if count == 2:
                        field_sep = char
                    elif count == 4:
                        escape = char
                    elif count == 6:
                        record_sep = char
                if count != 6 and len(found.group('UNAstring').rstrip()) != 6:
                    raise botslib.InMessageError(_(u'[M55]: Non-valid UNA-segment at position %(pos)s. UNA-segment should be 6 positions.'),{'pos':headpos})
                if found.group('field_sep') != field_sep:
                    raise botslib.InMessageError(_(u'[M56]: Data element separator used in edifact file differs from value indicated in UNA-segment.'))
            else:  #no UNA, interpret UNB
                if found.group('field_sep') == '+':
                    record_sep = "'"
                    escape = '?'
                elif found.group('field_sep') == '\x1D':  #according to std this was preffered way...probably quite theoretic...but does no harm
                    record_sep = '\x1C'
                    escape = ''
                else:
                    raise botslib.InMessageError(_(u'[M57]: Edifact file with non-standard separators. UNA segment should be used.'))
            #search trailer
            foundtrailer = re.search('''[^%(escape)s\n\r]       #char that is not escape or cr/lf
                                        [\n\r]*?                #maybe some cr/lf's
                                        %(record_sep)s          #segment separator
                                        \s*                     #whitespace between segments
                                        U[\n\r]*N[\n\r]*Z       #UNZ
                                        .+?                     #any chars
                                        [^%(escape)s\n\r]       #char that is not escape or cr/lf
                                        [\n\r]*?                #maybe some cr/lf's
                                        %(record_sep)s          #segment separator
                                        '''%{'escape':escape,'record_sep':re.escape(record_sep)},
                                        edifile[headpos:],re.DOTALL|re.VERBOSE)
            if not foundtrailer:
                raise botslib.InMessageError(_(u'[M58]: Found no valid UNZ trailer for the UNB header at position %(pos)s.'),{'pos':headpos})
        elif found.group('tradacoms'):
            editype = 'tradacoms'
            #~ field_sep = '='    #the tradacoms 'after-segment-tag-separator'
            record_sep = "'"
            escape = '?'
            headpos = startpos + found.start('STX')
            foundtrailer = re.search('''[^%(escape)s\n\r]       #char that is not escape or cr/lf
                                        [\n\r]*?                #maybe some cr/lf's
                                        %(record_sep)s          #segment separator
                                        \s*                     #whitespace between segments
                                        E[\n\r]*N[\n\r]*D
                                        .+?
                                        [^%(escape)s\n\r]       #char that is not escape or cr/lf
                                        [\n\r]*?                #maybe some cr/lf's
                                        %(record_sep)s          #segment separator
                                        '''%{'escape':escape,'record_sep':re.escape(record_sep)},
                                        edifile[headpos:],re.DOTALL|re.VERBOSE)
            if not foundtrailer:
                raise botslib.InMessageError(_(u'[M59]: Found no valid END trailer for the STX header at position %(pos)s.'),{'pos':headpos})
        #so: found an interchange (from headerpos until endpos)
        endpos = headpos + foundtrailer.end()
        # write the single interchange to its own file, named after the new idta
        ta_to = ta_from.copyta(status=endstatus)  #make transaction for translated message; gets ta_info of ta_frommes
        tofilename = unicode(ta_to.idta)
        filesize = len(edifile[headpos:endpos])
        tofile = botslib.opendata(tofilename,'wb')
        tofile.write(edifile[headpos:endpos])
        tofile.close()
        #editype is now either edifact, x12 or tradacoms
        #frommessagetype is the original frommessagetype (from route).
        #frommessagetype would normally be edifact, x12, tradacoms or mailbag, but could also be eg ORDERSD96AUNEAN007.
        #If so, we want to preserve that.
        if frommessagetype != 'mailbag' and frommessagetype != editype:
            messagetype = frommessagetype
        else:
            messagetype = editype
        ta_to.update(statust=OK,filename=tofilename,editype=editype,messagetype=messagetype,filesize=filesize)  #update outmessage transaction with ta_info;
        startpos = endpos  # continue scanning after this interchange
        nr_interchanges += 1
        botsglobal.logger.debug(_(u'        File written: "%(tofilename)s".'),{'tofilename':tofilename})
def splitmailbag(startstatus=MAILBAG,endstatus=TRANSLATE,idroute=''):
    ''' splits 'mailbag'files to seperate files each containging one interchange (ISA-IEA or UNA/UNB-UNZ).
        handles x12 and edifact; these can be mixed.
    '''
    # NOTE(review): in this regex 's*' after (U\s*N\s*B) looks like it was meant
    # to be '\s*' — as written it matches zero or more literal 's' chars; confirm.
    header = re.compile('(\s*(ISA))|(\s*(UNA.{6})?\s*(U\s*N\s*B)s*.{1}(.{4}).{1}(.{1}))',re.DOTALL)
    #           group:   1   2      3   4           5             6          7
    # process every transaction at startstatus for this route
    for row in botslib.query(u'''SELECT idta,filename,charset
                                FROM ta
                                WHERE idta>%(rootidta)s
                                AND status=%(status)s
                                AND statust=%(statust)s
                                AND idroute=%(idroute)s
                                ''',
                                {'status':startstatus,'statust':OK,'idroute':idroute,'rootidta':botslib.get_minta4query()}):
        try:
            ta_org=botslib.OldTransaction(row['idta'])
            ta_intermediate = ta_org.copyta(status=MAILBAGPARSED)
            edifile = botslib.readdata(filename=row['filename'])  #read as binary...
            botsglobal.logmap.debug(u'Start parsing mailbag file "%s".',row['filename'])
            startpos=0
            while (1):
                found = header.search(edifile[startpos:])
                if found is None:
                    if startpos:  #ISA/UNB have been found in file; no new ISA/UNB is found. So all processing is done.
                        break
                    #guess if this is an xml file.....
                    sniffxml = edifile[:25]
                    sniffxml = sniffxml.lstrip(' \t\n\r\f\v\xFF\xFE\xEF\xBB\xBF\x00')  #to find first ' real' data; some char are because of BOM, UTF-16 etc
                    if sniffxml and sniffxml[0]=='<':
                        # xml file: pass original file on unchanged, marked editype 'xml'
                        ta_tomes=ta_intermediate.copyta(status=endstatus)  #make transaction for translated message; gets ta_info of ta_frommes
                        ta_tomes.update(status=STATUSTMP,statust=OK,filename=row['filename'],editype='xml')  #update outmessage transaction with ta_info;
                        break;
                    else:
                        raise botslib.InMessageError(_(u'Found no content in mailbag.'))
                elif found.group(1):
                    # x12 interchange (ISA); ISA is fixed-length: element separator
                    # at position 4, segment terminator at position 106 (cr/lf
                    # skipped except at position 105).
                    editype='x12'
                    headpos=startpos+ found.start(2)
                    count=0
                    for c in edifile[headpos:headpos+120]:  #search first 120 characters to find seperators
                        if c in '\r\n' and count!=105:
                            continue
                        count +=1
                        if count==4:
                            field_sep = c
                        elif count==106:
                            record_sep = c
                            break
                    foundtrailer = re.search(re.escape(record_sep)+'\s*IEA'+re.escape(field_sep)+'.+?'+re.escape(record_sep),edifile[headpos:],re.DOTALL)
                elif found.group(3):
                    # edifact interchange (optional UNA + UNB)
                    editype='edifact'
                    if found.group(4):
                        # UNA present: separators at fixed offsets within UNA
                        field_sep = edifile[startpos + found.start(4) + 4]
                        record_sep = edifile[startpos + found.start(4) + 8]
                        headpos=startpos+ found.start(4)
                    else:
                        # no UNA: edifact default separators
                        field_sep = '+'
                        record_sep = "'"
                        headpos=startpos+ found.start(5)
                    foundtrailer = re.search(re.escape(record_sep)+'\s*U\s*N\s*Z\s*'+re.escape(field_sep)+'.+?'+re.escape(record_sep),edifile[headpos:],re.DOTALL)
                if not foundtrailer:
                    raise botslib.InMessageError(_(u'Found no valid envelope trailer in mailbag.'))
                endpos = headpos+foundtrailer.end()
                #so: interchange is from headerpos untill endpos
                #~ if header.search(edifile[headpos+25:endpos]):    #check if there is another header in the interchange
                #~ raise botslib.InMessageError(u'Error in mailbag format: found no valid envelope trailer.')
                # write the single interchange to its own file, named after the new idta
                ta_tomes=ta_intermediate.copyta(status=endstatus)  #make transaction for translated message; gets ta_info of ta_frommes
                tofilename = str(ta_tomes.idta)
                tofile = botslib.opendata(tofilename,'wb')
                tofile.write(edifile[headpos:endpos])
                tofile.close()
                ta_tomes.update(status=STATUSTMP,statust=OK,filename=tofilename,editype=editype,messagetype=editype)  #update outmessage transaction with ta_info;
                startpos=endpos  # continue scanning after this interchange
        except:
            # any failure: mark the intermediate transaction as failed with the traceback
            txt=botslib.txtexc()
            ta_intermediate.failure()
            ta_intermediate.update(statust=ERROR,errortext=txt)
        else:
            botsglobal.logmap.debug(u'OK Parsing mailbag file "%s".',row['filename'])
            ta_org.update(statust=DONE)
            ta_intermediate.succes(endstatus)
            ta_intermediate.update(statust=DONE)