def _parsefields(self, lex_record, record_definition):
    """ Parse fields from one fixed message-record and check length of the fixed record. """
    record2build = {}  # start with empty dict
    fixedrecord = lex_record[ID][FIXEDLINE]  # shortcut to fixed incoming record
    lenfixed = len(fixedrecord)
    recordlength = record_definition[FIXED_RECORD_LENGTH]
    if recordlength != lenfixed:
        if recordlength > lenfixed and self.ta_info["checkfixedrecordtooshort"]:
            raise InMessageError(
                _('[S52] line %(line)s: Record "%(record)s" too short; is %(pos)s pos, defined is %(defpos)s pos.'),
                line=lex_record[ID][LIN],
                record=lex_record[ID][VALUE],
                pos=lenfixed,
                defpos=recordlength,
            )
        if recordlength < lenfixed and self.ta_info["checkfixedrecordtoolong"]:
            raise InMessageError(
                _('[S53] line %(line)s: Record "%(record)s" too long; is %(pos)s pos, defined is %(defpos)s pos.'),
                line=lex_record[ID][LIN],
                record=lex_record[ID][VALUE],
                pos=lenfixed,
                defpos=recordlength,
            )
    pos = 0
    for field_definition in record_definition[FIELDS]:
        if field_definition[ID] == "BOTSID" and self.ta_info["noBOTSID"]:
            record2build["BOTSID"] = lex_record[ID][VALUE]
            continue
        value = fixedrecord[pos : pos + field_definition[LENGTH]].strip()  # copy string to avoid memory problem
        if value:
            record2build[field_definition[ID]] = value
        pos += field_definition[LENGTH]
    record2build["BOTSIDnr"] = record_definition[BOTSIDNR]
    return record2build
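# Illustrative sketch of the slicing above (hypothetical field lengths, not taken from any shipped grammar):
# with record_definition[FIELDS] defining lengths 3, 10 and 5, the fixed record
#     "HDR20240101  ABC  "
# is sliced as fixedrecord[0:3] -> "HDR", fixedrecord[3:13] -> "20240101", fixedrecord[13:18] -> "ABC";
# only values that are non-empty after strip() end up in record2build.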
def _lex(self):
    """ edi file -> self.lex_records. """
    try:
        # there is a problem with the way python reads line by line: file/line offset is not correctly reported.
        # so the error is caught here to give a correct/reasonable result.
        if self.ta_info["noBOTSID"]:  # if read records contain no BOTSID: add it
            botsid = self.defmessage.structure[0][ID]  # add the recordname as BOTSID
            for linenr, line in enumerate(self.filehandler, start=1):
                if not line.isspace():
                    line = line.rstrip("\r\n")
                    # append record to recordlist
                    self.lex_records.append(
                        [{VALUE: botsid, LIN: linenr, POS: 0, FIXEDLINE: line}]
                    )
        else:
            startrecordid = self.ta_info["startrecordID"]
            endrecordid = self.ta_info["endrecordID"]
            for linenr, line in enumerate(self.filehandler, start=1):
                if not line.isspace():
                    line = line.rstrip("\r\n")
                    # append record to recordlist
                    self.lex_records.append(
                        [
                            {
                                VALUE: line[startrecordid:endrecordid].strip(),
                                LIN: linenr,
                                POS: 0,
                                FIXEDLINE: line,
                            }
                        ]
                    )
    except UnicodeError as msg:
        rep_linenr = locals().get("linenr", 0) + 1
        content = get_relevant_text_for_UnicodeError(msg)
        _exception = InMessageError(
            _('Characterset problem in file. At/after line %(line)s: "%(content)s"'),
            {"line": rep_linenr, "content": content},
        )
        _exception.__cause__ = None
        raise _exception
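# Example of the resulting lex structure (hypothetical two-line fixed file, startrecordID=0, endrecordID=3):
#     "HDRrest of header line"  ->  [{VALUE: "HDR", LIN: 1, POS: 0, FIXEDLINE: "HDRrest of header line"}]
#     "DTLrest of detail line"  ->  [{VALUE: "DTL", LIN: 2, POS: 0, FIXEDLINE: "DTLrest of detail line"}]
# each non-blank line becomes a one-element lex_record; the record tag is the slice line[startrecordID:endrecordID].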
def parse_edi_file(**ta_info):
    """ Read, lex, parse edi-file. Is a dispatch function for InMessage and subclasses.
        Error handling: there are different types of errors. For all errors related to incoming messages: catch these.
        Try to extract the relevant information for the message.
        - str errors: charset is wrong.
    """
    try:
        # get inmessage class to call (subclass of InMessage); indexing raises KeyError for an unknown editype
        classtocall = in_msg_classes[ta_info["editype"]]
        ediobject = classtocall(ta_info)
    except KeyError:
        raise InMessageError(_("Unknown editype for incoming message: %(editype)s"), ta_info)
    # read, lex, parse the incoming edi file
    # ALL errors are caught; these are 'fatal errors': processing has stopped.
    # get information from error/exception; format this into ediobject.errorfatal
    try:
        ediobject.initfromfile()
    except UnicodeError as e:
        # ~ raise botslib.MessageError('') #UNITTEST_CORRECTION
        content = get_relevant_text_for_UnicodeError(e)
        # msg.encoding should contain the encoding, but does not (think this is not OK for UNOA, etc.)
        ediobject.errorlist.append(
            str(
                InMessageError(
                    _('[A59]: incoming file has not allowed characters at/after file-position %(pos)s: "%(content)s".'),
                    {"pos": e.start, "content": content},
                )
            )
        )
    except Exception as e:
        # ~ raise botslib.MessageError('') #UNITTEST_CORRECTION
        txt = txtexc()
        if not config.get(["settings", "debug"]):
            txt = txt.partition(": ")[2]
        ediobject.errorlist.append(txt)
    else:
        ediobject.errorfatal = False
    return ediobject
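# Minimal usage sketch (hedged: the exact set of ta_info keys depends on the caller and channel setup;
# the key names shown besides 'editype' are illustrative):
#     ediobject = parse_edi_file(editype="edifact", messagetype="edifact", filename="infile/some.edi")
#     if ediobject.errorfatal or ediobject.errorlist:
#         ...handle/log the collected errors...
# 'editype' must match a key of in_msg_classes; all keyword arguments are passed through as ta_info.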
def initfromfile(self):
    self.messagegrammarread(typeofgrammarfile="grammars")
    self._readcontent_edifile()
    jsonobject = simplejson.loads(self.rawinput)
    del self.rawinput
    if isinstance(jsonobject, list):
        self.root = node.Node()  # initialise empty node.
        self.root.children = self._dojsonlist(jsonobject, self._getrootid())  # fill root with children
        for child in self.root.children:
            if not child.record:  # sanity test: the children must have content
                raise InMessageError(_("[J51]: No usable content."))
            self.checkmessage(child, self.defmessage)
            self.ta_info.update(child.queries)
    elif isinstance(jsonobject, dict):
        if len(jsonobject) == 1 and isinstance(list(jsonobject.values())[0], dict):
            # best structure: {rootid:{id2:<dict, list>}}
            self.root = self._dojsonobject(list(jsonobject.values())[0], list(jsonobject.keys())[0])
        elif len(jsonobject) == 1 and isinstance(list(jsonobject.values())[0], list):
            # root dict has no name; use value from grammar for rootID; {id2:<dict, list>}
            self.root = node.Node(record={"BOTSID": self._getrootid()})  # initialise empty node.
            self.root.children = self._dojsonlist(list(jsonobject.values())[0], list(jsonobject.keys())[0])
        else:
            self.root = self._dojsonobject(jsonobject, self._getrootid())
        if not self.root:
            raise InMessageError(_("[J52]: No usable content."))
        self.checkmessage(self.root, self.defmessage)
        self.ta_info.update(self.root.queries)
    else:
        # root in JSON is neither dict nor list.
        raise InMessageError(_('[J53]: Content must be a "list" or "object".'))
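# The top-level JSON shapes accepted above (illustrative key names):
#     [ {...}, {...} ]          -> a list: each dict becomes a child of an empty root node
#     { "rootid": { ... } }     -> single-key dict with a dict value: the key is used as BOTSID of the root
#     { "listname": [ ... ] }   -> single-key dict with a list value: root BOTSID comes from the grammar (self._getrootid())
#     any other dict            -> parsed as the root object itself, with BOTSID from the grammar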
def _dojsonlist(self, jsonobject, name):
    lijst = []  # initialise empty list, used to append a list of (converted) json objects
    for i in jsonobject:
        if isinstance(i, dict):  # check list item is dict/object
            newnode = self._dojsonobject(i, name)
            if newnode:
                lijst.append(newnode)
        elif self.ta_info["checkunknownentities"]:
            raise InMessageError(_('[J54]: List content must be an "object".'))
    return lijst
def initfromfile(self):
    """ Initialisation from an edi file. """
    self.messagegrammarread(typeofgrammarfile="grammars")
    # **charset errors, lex errors
    # open file. variants: read with charset, read as binary & handled in sniff, only opened and read in _lex.
    self._readcontent_edifile()
    self._sniff()  # some hard-coded examination of edi file; ta_info can be overruled by syntax-parameters in edi-file
    # start lexing
    self._lex()
    # lex preprocessing via user exit indicated in syntax
    preprocess_lex = self.ta_info.get("preprocess_lex", False)
    if callable(preprocess_lex):
        preprocess_lex(lex=self.lex_records, ta_info=self.ta_info)
    if hasattr(self, "rawinput"):
        del self.rawinput
    # **breaking parser errors
    self.root = node.Node()  # make an empty root node.
    self.iternext_lex_record = iter(self.lex_records)
    leftover = self._parse(structure_level=self.defmessage.structure, inode=self.root)
    if leftover:
        raise InMessageError(
            _("[A50] line %(line)s pos %(pos)s: Found non-valid data at end of edi file; probably a problem with separators or message structure."),
            {"line": leftover[0][LIN], "pos": leftover[0][POS]},
        )  # probably not reached with edifact/x12 because of mailbag processing.
    del self.lex_records
    # self.root is now the root of a tree (of nodes).
    # **non-breaking parser errors
    self.checkenvelope()
    self.checkmessage(self.root, self.defmessage)
    # get queries-dict for parsed message; this is used to update the database
    if self.root.record:
        self.ta_info.update(self.root.queries)
    else:
        for childnode in self.root.children:
            self.ta_info.update(childnode.queries)
            break
def _dojsonobject(self, jsonobject, name):
    thisnode = node.Node(record={"BOTSID": name})  # initialise empty node.
    for key, value in jsonobject.items():
        if value is None:
            continue
        elif isinstance(value, str):  # json field; map to field in node.record
            ## for generating grammars: empty strings should generate a field
            if value and not value.isspace():  # use only if string has a value.
                thisnode.record[key] = value
        elif isinstance(value, dict):
            newnode = self._dojsonobject(value, key)
            if newnode:
                thisnode.append(newnode)
        elif isinstance(value, list):
            thisnode.children.extend(self._dojsonlist(value, key))
        elif isinstance(value, (int, float)):  # json field; map to field in node.record
            thisnode.record[key] = str(value)
        else:
            if self.ta_info["checkunknownentities"]:
                raise InMessageError(
                    _('[J55]: Key "%(key)s" value "%(value)s": is not string, list or dict.'),
                    {"key": key, "value": value},
                )
            thisnode.record[key] = str(value)
    if len(thisnode.record) == 2 and not thisnode.children:
        return None  # node is empty...
    # ~ thisnode.record['BOTSID']=name
    return thisnode
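# Value handling above, illustrated with a hypothetical object:
#     {"qty": 3, "note": "", "sub": {...}, "lines": [...], "skipme": None}
#     - None values and empty/whitespace-only strings are dropped
#     - numbers are stored as str(value), dicts become child nodes, lists are expanded via _dojsonlist
#     - any other type raises [J55] when checkunknownentities is set, otherwise it is stored as str(value)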
def _parse(self, structure_level, inode):
    """ This is the heart of the parsing of incoming messages (but not for xml, json).
        Read the lex_records one by one (self.iternext_lex_record is an iterator)
        - parse the records.
        - identify record (lookup in structure)
        - identify fields in the record (use the record_definition from the grammar).
        - add grammar-info to records: field-tag, mpath.
        Parameters:
        - structure_level: current grammar/segmentgroup of the grammar-structure.
        - inode: parent node; all parsed records are added as children of inode
        2x recursive: SUBTRANSLATION and segmentgroups
    """
    structure_index = 0  # keep track of where we are in the structure_level
    countnrofoccurences = 0  # number of occurrences of current record in structure
    structure_end = len(structure_level)
    # indicate if the next record should be fetched, or if the current_lex_record is still being parsed.
    # it might seem logical to test here 'current_lex_record is None', but
    # this is already used to indicate 'no more records'.
    get_next_lex_record = True
    while True:
        if get_next_lex_record:
            try:
                current_lex_record = next(self.iternext_lex_record)
            except StopIteration:  # catch when no more lex_record.
                current_lex_record = None
            get_next_lex_record = False
        if (
            current_lex_record is None
            or structure_level[structure_index][ID] != current_lex_record[ID][VALUE]
        ):
            # if record is required in structure_level and countnrofoccurences==0: error;
            if structure_level[structure_index][MIN] and not countnrofoccurences:
                # enough check here; message is validated more accurately later
                try:
                    raise InMessageError(
                        self.messagetypetxt
                        + _('[S50]: Line:%(line)s pos:%(pos)s record:"%(record)s": message has an error in its structure; this record is not allowed here. Scanned in message definition until mandatory record: "%(looked)s".'),
                        {
                            "record": current_lex_record[ID][VALUE],
                            "line": current_lex_record[ID][LIN],
                            "pos": current_lex_record[ID][POS],
                            "looked": self.mpathformat(structure_level[structure_index][MPATH]),
                        },
                    )
                except TypeError:  # when no UNZ (edifact)
                    raise InMessageError(
                        self.messagetypetxt
                        + _('[S51]: Missing mandatory record "%(record)s".'),
                        {"record": self.mpathformat(structure_level[structure_index][MPATH])},
                    )
            structure_index += 1
            if structure_index == structure_end:
                # current_lex_record is not in this level. Go level up
                # if on 'first level': give specific error
                if (
                    current_lex_record is not None
                    and structure_level == self.defmessage.structure
                ):
                    raise InMessageError(
                        self.messagetypetxt
                        + _('[S50]: Line:%(line)s pos:%(pos)s record:"%(record)s": message has an error in its structure; this record is not allowed here. Scanned in message definition until mandatory record: "%(looked)s".'),
                        {
                            "record": current_lex_record[ID][VALUE],
                            "line": current_lex_record[ID][LIN],
                            "pos": current_lex_record[ID][POS],
                            "looked": self.mpathformat(structure_level[structure_index - 1][MPATH]),
                        },
                    )
                # return either None (no more lex_records to parse) or the last
                # current_lex_record (the last current_lex_record is not found in this level)
                return current_lex_record
            countnrofoccurences = 0
            continue  # continue while-loop: get_next_lex_record is False as no match with structure is made; go and look at next record of structure
        # record is found in grammar
        countnrofoccurences += 1
        newnode = node.Node(
            record=self._parsefields(current_lex_record, structure_level[structure_index]),
            linpos_info=(current_lex_record[0][LIN], current_lex_record[0][POS]),
        )  # make new node
        inode.append(newnode)  # success! append new node as a child to current (parent) node
        if SUBTRANSLATION in structure_level[structure_index]:
            # start a SUBTRANSLATION; find the right messagetype, etc
            messagetype = newnode.enhancedget(structure_level[structure_index][SUBTRANSLATION])
            if not messagetype:
                raise TranslationNotFoundError(
                    _('Could not find SUBTRANSLATION "%(sub)s" in (sub)message.'),
                    {"sub": structure_level[structure_index][SUBTRANSLATION]},
                )
            messagetype = self._manipulatemessagetype(messagetype, inode)
            try:
                defmessage = grammar.grammarread(
                    self.__class__.__name__,
                    messagetype,
                    typeofgrammarfile="grammars",
                )
            except BotsImportError:
                raisenovalidmapping_error = True
                if hasattr(self.defmessage.module, "getmessagetype"):
                    messagetype2 = runscript(
                        self.defmessage.module,
                        self.defmessage.grammarname,
                        "getmessagetype",
                        editype=self.__class__.__name__,
                        messagetype=messagetype,
                    )
                    if messagetype2:
                        try:
                            defmessage = grammar.grammarread(
                                self.__class__.__name__,
                                messagetype2,
                                typeofgrammarfile="grammars",
                            )
                            raisenovalidmapping_error = False
                        except BotsImportError:
                            pass
                if raisenovalidmapping_error:
                    raise TranslationNotFoundError(
                        _('No (valid) grammar for editype "%(editype)s" messagetype "%(messagetype)s".'),
                        {
                            "editype": self.__class__.__name__,
                            "messagetype": messagetype,
                        },
                    )
            self.messagecount += 1
            self.messagetypetxt = _("Message nr %(count)s, type %(type)s, ") % {
                "count": self.messagecount,
                "type": messagetype,
            }
            current_lex_record = self._parse(structure_level=defmessage.structure[0][LEVEL], inode=newnode)
            # copy messagetype into 1st segment of subtranslation (eg UNH, ST)
            newnode.queries = {"messagetype": messagetype}
            newnode.queries.update(defmessage.syntax)
            # ~ newnode.queries = defmessage.syntax.copy()  # if using this line instead of previous 2: gives errors eg in incoming edifact...do not understand why
            self.checkmessage(newnode, defmessage, subtranslation=True)  # check the results of the subtranslation
            # ~ end SUBTRANSLATION
            self.messagetypetxt = ""
            # get_next_lex_record is still False; we are trying to match the last (not
            # matched) record from the SUBTRANSLATION (named 'current_lex_record').
        else:
            if LEVEL in structure_level[structure_index]:
                # if header, go parse segmentgroup (recursive)
                current_lex_record = self._parse(
                    structure_level=structure_level[structure_index][LEVEL],
                    inode=newnode,
                )
                # get_next_lex_record is still False; the current_lex_record that was not
                # matched in lower segmentgroups is still being parsed.
            else:
                get_next_lex_record = True
            # accommodate for UNS = UNS construction
            if (
                structure_level[structure_index][MIN]
                == structure_level[structure_index][MAX]
                == countnrofoccurences
            ):
                if structure_index + 1 == structure_end:
                    pass
                else:
                    structure_index += 1
                    countnrofoccurences = 0
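# Matching sketch (hypothetical structure_level): with entries [HEA (MIN=1), DTL (MIN=0), TRL (MIN=1)]
# an incoming sequence HEA, DTL, DTL, TRL is consumed in order; a missing mandatory record raises [S50]
# (or [S51] when the file has already ended), and a record that matches nothing in this level is returned
# to the caller, or raises [S50] when this level is the top of the message structure.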
def _lex(self):
    """ lexes file with variable records to list of lex_records, fields and subfields (build self.lex_records). """
    record_sep = self.ta_info["record_sep"]
    mode_inrecord = 0  # 1 indicates: lexing in record, 0 is lexing 'between records'.
    # for tradacoms; field_sep and record_tag_sep have the same function.
    field_sep = self.ta_info["field_sep"] + self.ta_info["record_tag_sep"]
    sfield_sep = self.ta_info["sfield_sep"]
    rep_sep = self.ta_info["reserve"]
    allow_spaces_between_records = self.ta_info.get("allow_spaces_between_records", True)
    sfield = 0  # 1: subfield, 0: not a subfield, 2: repeat
    quote_char = self.ta_info["quote_char"]  # typical for csv. example with quote_char ": ,"1523",TEXT,"123",
    mode_quote = 0  # 0=not in quote, 1=in quote
    # status within mode_quote. 0=just another char within quote, 1=met 2nd
    # quote char; might be end of quote OR escaping of another quote-char.
    mode_2quote = 0
    escape = self.ta_info["escape"]  # char after escape-char is not interpreted as separator
    mode_escape = 0  # 0=not escaping, 1=escaping
    # chars to ignore/skip/discard. eg edifact: if wrapped to 80pos lines and <CR/LF> at end of segment
    skip_char = self.ta_info["skip_char"]
    lex_record = []  # gather the content of a record
    value = ""  # gather the content of (sub)field; the current token
    valueline = 1  # line number of the token
    valuepos = 1  # position of the token in the line
    countline = 1  # count number of lines; start with 1
    countpos = 0  # count position/number of chars within line
    sep = field_sep + sfield_sep + record_sep + escape + rep_sep
    for char in self.rawinput:  # get next char
        if char == "\n":
            # count number of lines/position; no action.
            countline += 1  # count line
            countpos = 0  # position back to 0
        else:
            countpos += 1  # position within line
        if mode_quote:
            # lexing within a quote; note that quote-char works as escape-char within a quote
            if mode_2quote:
                mode_2quote = 0
                if char == quote_char:
                    # after quote-char another quote-char: used to escape quote_char:
                    value += char  # append quote_char
                    continue
                else:  # quote is ended:
                    mode_quote = 0
                    # continue parsing of this char
            elif mode_escape:  # tricky: escaping a quote char
                mode_escape = 0
                value += char
                continue
            elif char == quote_char:
                # either end-quote or escaping quote_char, we do not know yet
                mode_2quote = 1
                continue
            elif char == escape:
                mode_escape = 1
                continue
            else:  # we are in quote, just append char to token
                value += char
                continue
        if char in skip_char:
            # char is skipped. In csv these chars could be in a quote; in eg edifact
            # chars will be skipped, even if after escape sign.
            continue
        if not mode_inrecord:
            # get here after record-separator is found. we are 'between' records.
            # some special handling for whitespace characters; for other chars: go on lexing
            if char.isspace():  # whitespace = ' \t\n\r\v\f'
                if allow_spaces_between_records:
                    # by default True; False for strict handling of x12/edifact
                    # exception for tab-delimited csv files. If first field is not filled:
                    # first TAB is ignored, which is not OK. Patch this:
                    if char in field_sep and self.__class__.__name__ == 'Csv':  # isinstance(self, Csv):
                        pass  # do not ignore TAB -> go on lexing
                    else:
                        continue  # ignore whitespace character; continue for-loop with next character
                else:
                    # for strict checks: no spaces between records
                    raise InMessageError(
                        _("[A67]: Found space characters between segments. Line %(countline)s, position %(countpos)s."),
                        {"countline": countline, "countpos": countpos},
                    )
            mode_inrecord = 1  # not whitespace - a new record has started
        if mode_escape:
            # in escaped mode: char after escape sign is appended to token
            mode_escape = 0
            value += char
            continue
        if not value:
            # if no char in token: this is a new token, get line and pos for (new) token
            valueline = countline
            valuepos = countpos
        if char == quote_char and (not value or value.isspace()):
            # for csv: handle new quote value. New quote value only makes sense for a
            # new field (value is empty) or a field that contains only whitespace
            mode_quote = 1
            continue
        if char not in sep:
            value += char  # just a char: append char to value
            continue
        if char in field_sep:
            # end of (sub)field. Note: first field of composite is marked as 'field'
            lex_record.append({VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos})  # write current value to lex_record
            value = ""
            sfield = 0  # new token is field
            continue
        if char == sfield_sep:
            # end of (sub)field. Note: first field of composite is marked as 'field'
            lex_record.append({VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos})  # write current value to lex_record
            value = ""
            sfield = 1  # new token is sub-field
            continue
        if char in record_sep:  # end of record
            lex_record.append({VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos})  # write current value to lex_record
            self.lex_records.append(lex_record)  # write lex_record to self.lex_records
            lex_record = []
            value = ""
            sfield = 0  # new token is field
            mode_inrecord = 0  # we are not in a record
            continue
        if char == escape:
            mode_escape = 1
            continue
        if char == rep_sep:
            lex_record.append({VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos})  # write current value to lex_record
            value = ""
            sfield = 2  # new token is repeating
            continue
    # end of for-loop. all characters have been processed.
    # in a perfect world, value should always be empty now, but:
    # it appears a csv record is not always closed properly, so force the closing of the last record of a csv file:
    if mode_inrecord and self.ta_info.get("allow_lastrecordnotclosedproperly", False):
        lex_record.append({VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos})  # append element in record
        self.lex_records.append(lex_record)  # write record to recordlist
    else:
        leftover = value.strip("\x00\x1a")
        if leftover:
            raise InMessageError(
                _('[A51]: Found non-valid data at end of edi file; probably a problem with separators or message structure: "%(leftover)s".'),
                {"leftover": leftover},
            )
def initfromfile(self):
    """ initialisation from an excel file.
        file is first converted to csv using python module xlrd
    """
    try:
        self.xlrd = botsbaseimport("xlrd")
    except ImportError:
        raise ImportError(_('Dependency failure: editype "excel" requires python library "xlrd".'))
    import csv as csvlib
    try:
        import StringIO
    except ImportError:
        import io as StringIO
    self.messagegrammarread(typeofgrammarfile="grammars")
    self.ta_info["charset"] = self.defmessage.syntax["charset"]  # always use charset of edi file.
    if self.ta_info["escape"]:
        doublequote = False
    else:
        doublequote = True
    logger.debug('Read edi file "%(filename)s".', self.ta_info)
    # xlrd reads excel file; python's csv module writes this to file-like
    # StringIO (as utf-8); read StringIO as self.rawinput; decode this (utf-8 -> str)
    infilename = abspathdata(self.ta_info["filename"])
    try:
        xlsdata = self.read_xls(infilename)
    except Exception:
        txt = txtexc()
        logger.error(
            _("Excel extraction failed, may not be an Excel file? Error:\n%(txt)s"),
            {"txt": txt},
        )
        raise InMessageError(
            _("Excel extraction failed, may not be an Excel file? Error:\n%(txt)s"),
            {"txt": txt},
        )
    rawinputfile = StringIO.StringIO()
    csvout = csvlib.writer(
        rawinputfile,
        quotechar=self.ta_info["quote_char"],
        delimiter=self.ta_info["field_sep"],
        doublequote=doublequote,
        escapechar=self.ta_info["escape"],
    )
    csvout.writerows(map(self.utf8ize, xlsdata))
    rawinputfile.seek(0)
    self.rawinput = rawinputfile.read()
    rawinputfile.close()
    self.rawinput = self.rawinput.decode("utf-8")
    # start lexing and parsing as csv
    self._lex()
    if hasattr(self, "rawinput"):
        del self.rawinput
    self.root = node.Node()  # make an empty root node.
    self.iternext_lex_record = iter(self.lex_records)
    leftover = self._parse(structure_level=self.defmessage.structure, inode=self.root)
    if leftover:
        raise InMessageError(
            _('[A52]: Found non-valid data at end of excel file: "%(leftover)s".'),
            {"leftover": leftover},
        )
    del self.lex_records
    self.checkmessage(self.root, self.defmessage)
def _sniff(self):
    """ examine the beginning of an edifact file for syntax parameters and charset.
        if the (beginning of the) edifact file is not correct: error.
        edifact file is read as binary.
        edifact has several charsets (UNOA, UNOC, UNOY).
        in processing it is assumed: charset is ascii, utf-8 or some charset where 1 char = 1 byte (eg iso-8859-1)
        (if other charset: would be easy to change. charset is in grammar, read/decode for charset, do parsing)
        Bots assumes: UNA-string contains NO extra CR/LF. (would be absurd; combination of: multiple UNA in file & using 'blocked' edifact.)
    """
    rawinput = self.rawinput[0:99].decode("iso-8859-1")
    # **************find first non-whitespace character
    rawinput = rawinput.lstrip()
    # **************check if UNA
    if rawinput.startswith("UNA"):
        has_una_string = True
        # read UNA; set syntax parameters from UNA
        count = 3
        try:
            for field in [
                "sfield_sep",
                "field_sep",
                "decimaal",
                "escape",
                "reserve",
                "record_sep",
            ]:
                self.ta_info[field] = rawinput[count]
                count += 1
        except IndexError:
            # plus some border cases; not possible if mailbag is used.
            raise InMessageError(_("[A53]: Edifact file contains only whitespace."))
        # option extra check: separators etc are never in [0-9a-zA-Z].
        # UNA-string is done; strip until next non-space char
        rawinput = rawinput[count:].lstrip()
    else:
        has_una_string = False
    # **************expect UNB
    # loop over rawinput to extract segment tag, used separators, etc.
    count2 = 0
    found_tag = ""
    found_charset = ""
    for char in rawinput:
        if char in self.ta_info["skip_char"]:
            continue
        if count2 <= 2:
            found_tag += char
        elif count2 == 3:
            found_field_sep = char
            if found_tag != "UNB":
                # also: UNA too short. not possible if mailbag is used.
                raise InMessageError(_('[A54]: Found no "UNB" at the start of edifact file.'))
        elif count2 <= 7:
            found_charset += char
        elif count2 == 8:
            found_sfield_sep = char
        else:
            self.ta_info["version"] = char
            break
        count2 += 1
    else:
        # if we arrive here: too many <CR/LF>?
        raise InMessageError(_("[A55]: Problems with UNB-segment; encountered too many <CR/LF>."))
    # set and/or verify separators
    if has_una_string:
        if (
            found_field_sep != self.ta_info["field_sep"]
            or found_sfield_sep != self.ta_info["sfield_sep"]
        ):
            raise InMessageError(
                _("[A56]: Separators used in edifact file differ from values indicated in UNA-segment.")
            )
    else:
        if found_field_sep == "+" and found_sfield_sep == ":":
            # assume standard/UNOA separators.
            self.ta_info["sfield_sep"] = ":"
            self.ta_info["field_sep"] = "+"
            self.ta_info["decimaal"] = "."
            self.ta_info["escape"] = "?"
            self.ta_info["reserve"] = "*"
            self.ta_info["record_sep"] = "'"
        elif found_field_sep == "\x1D" and found_sfield_sep == "\x1F":
            # check if UNOB separators are used
            self.ta_info["sfield_sep"] = "\x1F"
            self.ta_info["field_sep"] = "\x1D"
            self.ta_info["decimaal"] = "."
            self.ta_info["escape"] = ""
            self.ta_info["reserve"] = "*"
            self.ta_info["record_sep"] = "\x1C"
        else:
            raise InMessageError(
                _("[A57]: Edifact file with non-standard separators. An UNA segment should be used.")
            )
    # *********** decode the file (to str)
    try:
        startUNB = self.rawinput.find(b"UNB")
        self.rawinput = self.rawinput[startUNB:].decode(found_charset, self.ta_info["checkcharsetin"])
        self.ta_info["charset"] = found_charset
    except LookupError:
        _exception = InMessageError(
            _('[A58]: Edifact file has unknown characterset "%(charset)s".'),
            {"charset": found_charset},
        )
        _exception.__cause__ = None
        raise _exception
    # ~ except UnicodeDecodeError as msg:
    # ~     raise InMessageError(_('[A59]: Edifact file has not allowed characters at/after file-position %(content)s.'),
    # ~         {'content': msg[2]})
    # repetition separator only for version >= 4.
    if self.ta_info["version"] < "4" or self.ta_info["reserve"] == " ":
        # if version >= 4 and the repetition separator is a space: assume this is a mistake; use no repetition separator
        self.ta_info["reserve"] = ""
    self.separatorcheck(
        self.ta_info["sfield_sep"]
        + self.ta_info["field_sep"]
        + self.ta_info["decimaal"]
        + self.ta_info["escape"]
        + self.ta_info["reserve"]
        + self.ta_info["record_sep"]
    )
def initfromfile(self):
    logger.debug('Read edi file "%(filename)s".', self.ta_info)
    filename = abspathdata(self.ta_info["filename"])
    if self.ta_info["messagetype"] == "mailbag":
        # the messagetype is not known.
        # bots reads file usersys/grammars/xml/mailbag.py, and uses 'mailbagsearch' to determine the messagetype
        # mailbagsearch is a list, containing python dicts. Dict consists of 'xpath', 'messagetype' and (optionally) 'content'.
        # 'xpath' is an xpath to use on the xml-file (using elementtree xpath functionality)
        # if found, and 'content' in the dict: if 'content' is equal to the value found by xpath-search, then set messagetype.
        # if found, and no 'content' in the dict: set messagetype.
        try:
            module, grammarname = botsimport("grammars", "xml", "mailbag")
            mailbagsearch = getattr(module, "mailbagsearch")
        except AttributeError:
            logger.error("Missing mailbagsearch in mailbag definitions for xml.")
            raise
        except BotsImportError:
            logger.error("Missing mailbag definitions for xml, should be there.")
            raise
        parser = ET.XMLParser()
        try:
            extra_character_entity = getattr(module, "extra_character_entity")
            for key, value in extra_character_entity.items():
                parser.entity[key] = value
        except AttributeError:
            pass  # there is no extra_character_entity in the mailbag definitions, which is OK.
        etree = ET.ElementTree()  # ElementTree: lexes, parses, makes etree; etree is quite similar to bots-node trees but conversion is needed
        etreeroot = etree.parse(filename, parser)
        for item in mailbagsearch:
            if "xpath" not in item or "messagetype" not in item:
                raise InMessageError(_("Invalid search parameters in xml mailbag."))
            found = etree.find(item["xpath"])
            if found is not None:
                if "content" in item and found.text != item["content"]:
                    continue
                self.ta_info["messagetype"] = item["messagetype"]
                break
        else:
            raise InMessageError(_("Could not find right xml messagetype for mailbag."))
        self.messagegrammarread(typeofgrammarfile="grammars")
    else:
        self.messagegrammarread(typeofgrammarfile="grammars")
        parser = ET.XMLParser()
        for key, value in self.ta_info["extra_character_entity"].items():
            parser.entity[key] = value
        etree = ET.ElementTree()  # ElementTree: lexes, parses, makes etree; etree is quite similar to bots-node trees but conversion is needed
        etreeroot = etree.parse(filename, parser)
    self._handle_empty(etreeroot)
    self.stackinit()
    self.root = self._etree2botstree(etreeroot)  # convert etree to bots-nodes-tree
    self.checkmessage(self.root, self.defmessage)
    self.ta_info.update(self.root.queries)
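# Example mailbagsearch entry in usersys/grammars/xml/mailbag.py (hypothetical xpath and messagetype values):
#     mailbagsearch = [
#         {'xpath': 'header/documenttype', 'content': 'ORDERS', 'messagetype': 'myxmlorders'},
#         {'xpath': 'invoiceheader', 'messagetype': 'myxmlinvoices'},
#     ]
# the first entry whose xpath is found (and whose optional 'content' matches the element text) sets the messagetype.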