Esempio n. 1
0
 def _parsefields(self, lex_record, record_definition):
     """ Parse fields from one fixed message-record and check length of the fixed record.

         Checks the record length against the grammar (too short / too long;
         each check individually switchable via ta_info), then slices the
         record into fields using the field lengths from the grammar.
         Returns a dict {field-ID: value}; raises InMessageError on a length mismatch.
     """
     record2build = {}  # start with empty dict
     fixedrecord = lex_record[ID][FIXEDLINE]  # shortcut to fixed incoming record
     # hoist once; these were computed but unused before, while the lookups were repeated below
     lenfixed = len(fixedrecord)
     recordlength = record_definition[FIXED_RECORD_LENGTH]
     if recordlength != lenfixed:
         if recordlength > lenfixed and self.ta_info["checkfixedrecordtooshort"]:
             raise InMessageError(
                 _(
                     '[S52] line %(line)s: Record "%(record)s" too short; is %(pos)s pos, defined is %(defpos)s pos.'
                 ),
                 line=lex_record[ID][LIN],
                 record=lex_record[ID][VALUE],
                 pos=lenfixed,
                 defpos=recordlength,
             )
         if recordlength < lenfixed and self.ta_info["checkfixedrecordtoolong"]:
             raise InMessageError(
                 _(
                     '[S53] line %(line)s: Record "%(record)s" too long; is %(pos)s pos, defined is %(defpos)s pos.'
                 ),
                 line=lex_record[ID][LIN],
                 record=lex_record[ID][VALUE],
                 pos=lenfixed,
                 defpos=recordlength,
             )
     pos = 0
     for field_definition in record_definition[FIELDS]:
         if field_definition[ID] == "BOTSID" and self.ta_info["noBOTSID"]:
             # file itself carries no BOTSID; take it from the lexed record
             record2build["BOTSID"] = lex_record[ID][VALUE]
             continue
         value = fixedrecord[
                 pos : pos + field_definition[LENGTH]
                 ].strip()  # copy string to avoid memory problem
         if value:  # only keep non-empty field values
             record2build[field_definition[ID]] = value
         pos += field_definition[LENGTH]
     record2build["BOTSIDnr"] = record_definition[BOTSIDNR]
     return record2build
Esempio n. 2
0
 def _lex(self):
     """ edi file->self.lex_records.

         Each non-blank line of the fixed-width file becomes one lex_record
         (a list holding a single token dict {VALUE, LIN, POS, FIXEDLINE}).
         Raises InMessageError on a characterset problem while reading.
     """
     linenr = 0  # keep line number in scope for the except-clause below
     try:
         # there is a problem with the way python reads line by line: file/line offset is not correctly reported.
         # so the error is catched here to give correct/reasonable result.
         if self.ta_info["noBOTSID"]:  # if read records contain no BOTSID: add it
             botsid = self.defmessage.structure[0][
                 ID
             ]  # add the recordname as BOTSID
             for linenr, line in enumerate(self.filehandler, start=1):
                 if not line.isspace():
                     line = line.rstrip("\r\n")
                     # append record to recordlist
                     self.lex_records.append(
                         [{VALUE: botsid, LIN: linenr, POS: 0, FIXEDLINE: line}]
                     )
         else:
             startrecordid = self.ta_info["startrecordID"]
             endrecordid = self.ta_info["endrecordID"]
             for linenr, line in enumerate(self.filehandler, start=1):
                 if not line.isspace():
                     line = line.rstrip("\r\n")
                     # append record to recordlist
                     self.lex_records.append(
                         [
                             {
                                 VALUE: line[startrecordid:endrecordid].strip(),
                                 LIN: linenr,
                                 POS: 0,
                                 FIXEDLINE: line,
                             }
                         ]
                     )
     except UnicodeError as msg:
         content = get_relevant_text_for_UnicodeError(msg)
         # 'from None' suppresses exception chaining: the raw UnicodeError
         # adds nothing for the user (replaces the old manual __cause__ = None hack)
         raise InMessageError(
             _(
                 'Characterset problem in file. At/after line %(line)s: "%(content)s"'
             ),
             {"line": linenr + 1, "content": content},
         ) from None
Esempio n. 3
0
def parse_edi_file(**ta_info):
    """ Read,lex, parse edi-file. Is a dispatch function for InMessage and subclasses.
        Error handling: there are different types of errors.
        For all errors related to incoming messages: catch these.
        Try to extract the relevant information for the message.
        - str errors: charset is wrong.
        Returns the InMessage subclass instance; fatal errors are collected
        in ediobject.errorlist and flagged via ediobject.errorfatal.
    """
    try:
        # get inmessage class to call (subclass of InMessage).
        # NOTE: use indexing, not .get() — .get() returned None for an unknown
        # editype, so the call below raised TypeError instead of the KeyError
        # this handler expects.
        classtocall = in_msg_classes[ta_info["editype"]]
        ediobject = classtocall(ta_info)
    except KeyError:
        raise InMessageError(
            _("Unknown editype for incoming message: %(editype)s"), ta_info)
    # read, lex, parse the incoming edi file
    # ALL errors are caught; these are 'fatal errors': processing has stopped.
    # get information from error/exception; format this into ediobject.errorfatal
    try:
        ediobject.initfromfile()
    except UnicodeError as e:
        # ~ raise botslib.MessageError('')      #UNITTEST_CORRECTION
        content = get_relevant_text_for_UnicodeError(e)
        # msg.encoding should contain encoding, but does not (think this is not TransactionStatus.OK for UNOA, etc)
        ediobject.errorlist.append(
            str(
                InMessageError(
                    _('[A59]: incoming file has not allowed characters at/after file-position %(pos)s: "%(content)s".'
                      ),
                    {
                        "pos": e.start,
                        "content": content
                    },
                )))
    except Exception:
        # ~ raise botslib.MessageError('')      #UNITTEST_CORRECTION
        txt = txtexc()
        if not config.get(["settings", "debug"]):
            # outside debug mode: strip the exception class name, keep the message
            txt = txt.partition(": ")[2]
        ediobject.errorlist.append(txt)
    else:
        ediobject.errorfatal = False
    return ediobject
Esempio n. 4
0
    def initfromfile(self):
        """ Initialisation from a json edi file: read, parse into a node tree, check against grammar.

            Accepts three json top-level shapes: a list of objects, a dict,
            or (rejected) anything else. Raises InMessageError on unusable content.
        """
        self.messagegrammarread(typeofgrammarfile="grammars")
        self._readcontent_edifile()

        jsonobject = simplejson.loads(self.rawinput)
        del self.rawinput  # raw content no longer needed; free memory
        if isinstance(jsonobject, list):
            self.root = node.Node()  # initialise empty node.
            self.root.children = self._dojsonlist(
                jsonobject, self._getrootid())  # fill root with children
            for child in self.root.children:
                if not child.record:  # sanity test: the children must have content
                    raise InMessageError(_("[J51]: No usable content."))
                self.checkmessage(child, self.defmessage)
                self.ta_info.update(child.queries)
        elif isinstance(jsonobject, dict):
            if len(jsonobject) == 1:
                # hoist the single key/value pair once, instead of materialising
                # list(values())/list(keys()) repeatedly as before
                rootname, rootvalue = next(iter(jsonobject.items()))
                if isinstance(rootvalue, dict):
                    # best structure: {rootid:{id2:<dict, list>}}
                    self.root = self._dojsonobject(rootvalue, rootname)
                elif isinstance(rootvalue, list):
                    # root dict has no name; use value from grammar for rootID; {id2:<dict, list>}
                    self.root = node.Node(record={"BOTSID": self._getrootid()
                                                  })  # initialise empty node.
                    self.root.children = self._dojsonlist(rootvalue, rootname)
                else:
                    self.root = self._dojsonobject(jsonobject, self._getrootid())
            else:
                self.root = self._dojsonobject(jsonobject, self._getrootid())
            if not self.root:
                raise InMessageError(_("[J52]: No usable content."))
            self.checkmessage(self.root, self.defmessage)
            self.ta_info.update(self.root.queries)
        else:
            # root in JSON is neither dict or list.
            raise InMessageError(
                _('[J53]: Content must be a "list" or "object".'))
Esempio n. 5
0
 def _dojsonlist(self, jsonobject, name):
     """ Convert a json list to a list of nodes.

         Every list item must be a json object; non-objects either raise
         (when 'checkunknownentities' is set) or are silently skipped.
     """
     nodes = []
     for item in jsonobject:
         if not isinstance(item, dict):  # guard: list item must be dict/object
             if self.ta_info["checkunknownentities"]:
                 raise InMessageError(
                     _('[J54]: List content must be a "object".'))
             continue  # tolerated: skip non-object item
         converted = self._dojsonobject(item, name)
         if converted:
             nodes.append(converted)
     return nodes
Esempio n. 6
0
    def initfromfile(self):
        """ Initialisation from a edi file: read, sniff, lex, parse, check.
        """
        self.messagegrammarread(typeofgrammarfile="grammars")
        # **charset errors, lex errors
        # open file. variants: read with charset, read as binary & handled in sniff, only opened and read in _lex.
        self._readcontent_edifile()
        # some hard-coded examination of edi file; ta_info can be overruled by syntax-parameters in edi-file
        self._sniff()
        # start lexing
        self._lex()
        # lex preprocessing via user exit indicated in syntax
        preprocess_lex = self.ta_info.get("preprocess_lex", False)
        if callable(preprocess_lex):
            preprocess_lex(lex=self.lex_records, ta_info=self.ta_info)
        if hasattr(self, "rawinput"):
            del self.rawinput
        # **breaking parser errors
        self.root = node.Node()  # make root Node None.
        self.iternext_lex_record = iter(self.lex_records)
        leftover = self._parse(
            structure_level=self.defmessage.structure, inode=self.root)
        if leftover:
            # probably not reached with edifact/x12 because of mailbag processing.
            raise InMessageError(
                _("[A50] line %(line)s pos %(pos)s: Found non-valid data at end of edi file; probably a problem with separators or message structure."
                  ),
                {
                    "line": leftover[0][LIN],
                    "pos": leftover[0][POS]
                },
            )
        del self.lex_records
        # self.root is now root of a tree (of nodes).

        # **non-breaking parser errors
        self.checkenvelope()
        self.checkmessage(self.root, self.defmessage)
        # get queries-dict for parsed message; this is used to update in database
        if self.root.record:
            self.ta_info.update(self.root.queries)
        elif self.root.children:
            # no root record: take the queries from the first child only
            self.ta_info.update(self.root.children[0].queries)
Esempio n. 7
0
 def _dojsonobject(self, jsonobject, name):
     """ Convert one json object to a node (with 'name' as BOTSID).

         Recursive: nested objects become child nodes, lists become lists of
         child nodes. Returns None when the resulting node carries no content.
     """
     thisnode = node.Node(record={"BOTSID": name})  # initialise empty node.
     for key, value in jsonobject.items():
         if value is None:
             continue  # json null: nothing to map
         if isinstance(value, str):  # json string; map to field in node.record
             ## for generating grammars: empty strings should generate a field
             if value and not value.isspace():  # use only if string has a value.
                 thisnode.record[key] = value
         elif isinstance(value, dict):  # nested json object: recurse into child node
             childnode = self._dojsonobject(value, key)
             if childnode:
                 thisnode.append(childnode)
         elif isinstance(value, list):  # json array: recurse over items
             thisnode.children.extend(self._dojsonlist(value, key))
         elif isinstance(value, (int, float)):  # json number; map to field in node.record
             thisnode.record[key] = str(value)
         else:
             # unknown entity: either reject or stringify it
             if self.ta_info["checkunknownentities"]:
                 raise InMessageError(
                     _('[J55]: Key "%(key)s" value "%(value)s": is not string, list or dict.'
                       ),
                     {
                         "key": key,
                         "value": value
                     },
                 )
             thisnode.record[key] = str(value)
     if len(thisnode.record) == 2 and not thisnode.children:
         return None  # node is empty...
     return thisnode
Esempio n. 8
0
 def _parse(self, structure_level, inode):
     """ This is the heart of the parsing of incoming messages (but not for xml, json)
         Read the lex_records one by one (self.iternext_lex_record, is an iterator)
         - parse the records.
         - identify record (lookup in structure)
         - identify fields in the record (use the record_definition from the grammar).
         - add grammar-info to records: field-tag,mpath.
         Parameters:
         - structure_level: current grammar/segmentgroup of the grammar-structure.
         - inode: parent node; all parsed records are added as children of inode
         2x recursive: SUBTRANSLATION and segmentgroups
         Returns None (all lex_records consumed) or the first lex_record that
         did not fit in this structure_level (to be matched by the caller).
     """
     structure_index = 0  # keep track of where we are in the structure_level
     countnrofoccurences = 0  # number of occurences of current record in structure
     structure_end = len(structure_level)
     # indicate if the next record should be fetched, or if the current_lex_record is still being parsed.
     get_next_lex_record = True
     # it might seem logical to test here 'current_lex_record is None', but
     # this is already used to indicate 'no more records'.
     while True:
         if get_next_lex_record:
             try:
                 current_lex_record = next(self.iternext_lex_record)
             except StopIteration:  # catch when no more lex_record.
                 current_lex_record = None
             get_next_lex_record = False
         if (current_lex_record is None
                 or structure_level[structure_index][ID] !=
                 current_lex_record[ID][VALUE]):
             # no match with the current structure entry (or end of input)
             # is record is required in structure_level, and countnrofoccurences==0: error;
             if structure_level[structure_index][
                     MIN] and not countnrofoccurences:
                 # enough check here; message is
                 # validated more accurate later
                 try:
                     raise InMessageError(
                         self.messagetypetxt +
                         _('[S50]: Line:%(line)s pos:%(pos)s record:"%(record)s": message has an error in its structure; this record is not allowed here. Scanned in message definition until mandatory record: "%(looked)s".'
                           ),
                         {
                             "record":
                             current_lex_record[ID][VALUE],
                             "line":
                             current_lex_record[ID][LIN],
                             "pos":
                             current_lex_record[ID][POS],
                             "looked":
                             self.mpathformat(
                                 structure_level[structure_index][MPATH]),
                         },
                     )
                 except TypeError:  # when no UNZ (edifact)
                     # current_lex_record is None here: subscripting it raised
                     # TypeError, so report the missing mandatory record instead
                     raise InMessageError(
                         self.messagetypetxt +
                         _('[S51]: Missing mandatory record "%(record)s".'),
                         {
                             "record":
                             self.mpathformat(
                                 structure_level[structure_index][MPATH])
                         },
                     )
             # no match: advance to the next record in this structure_level
             structure_index += 1
             if (structure_index == structure_end
                 ):  # current_lex_record is not in this level. Go level up
                 # if on 'first level': give specific error
                 if (current_lex_record is not None
                         and structure_level == self.defmessage.structure):
                     raise InMessageError(
                         self.messagetypetxt +
                         _('[S50]: Line:%(line)s pos:%(pos)s record:"%(record)s": message has an error in its structure; this record is not allowed here. Scanned in message definition until mandatory record: "%(looked)s".'
                           ),
                         {
                             "record":
                             current_lex_record[ID][VALUE],
                             "line":
                             current_lex_record[ID][LIN],
                             "pos":
                             current_lex_record[ID][POS],
                             "looked":
                             self.mpathformat(
                                 structure_level[structure_index -
                                                 1][MPATH]),
                         },
                     )
                 # return either None (no more lex_records to parse) or the last
                 # current_lex_record (the last current_lex_record is not found in this
                 # level)
                 return current_lex_record
             countnrofoccurences = 0
             continue  # continue while-loop: get_next_lex_record is false as no match with structure is made; go and look at next record of structure
         # record is found in grammar
         countnrofoccurences += 1
         newnode = node.Node(
             record=self._parsefields(current_lex_record,
                                      structure_level[structure_index]),
             linpos_info=(current_lex_record[0][LIN],
                          current_lex_record[0][POS]),
         )  # make new node
         inode.append(
             newnode
         )  # succes! append new node as a child to current (parent)node
         if SUBTRANSLATION in structure_level[structure_index]:
             # start a SUBTRANSLATION; find the right messagetype, etc
             messagetype = newnode.enhancedget(
                 structure_level[structure_index][SUBTRANSLATION])
             if not messagetype:
                 raise TranslationNotFoundError(
                     _('Could not find SUBTRANSLATION "%(sub)s" in (sub)message.'
                       ),
                     {
                         "sub":
                         structure_level[structure_index][SUBTRANSLATION]
                     },
                 )
             messagetype = self._manipulatemessagetype(messagetype, inode)
             try:
                 defmessage = grammar.grammarread(
                     self.__class__.__name__,
                     messagetype,
                     typeofgrammarfile="grammars",
                 )
             except BotsImportError:
                 # grammar not found for messagetype; fall back to the
                 # user-supplied 'getmessagetype' hook, if present
                 raisenovalidmapping_error = True
                 if hasattr(self.defmessage.module, "getmessagetype"):
                     messagetype2 = runscript(
                         self.defmessage.module,
                         self.defmessage.grammarname,
                         "getmessagetype",
                         editype=self.__class__.__name__,
                         messagetype=messagetype,
                     )
                     if messagetype2:
                         try:
                             defmessage = grammar.grammarread(
                                 self.__class__.__name__,
                                 messagetype2,
                                 typeofgrammarfile="grammars",
                             )
                             raisenovalidmapping_error = False
                         except BotsImportError:
                             pass
                 if raisenovalidmapping_error:
                     raise TranslationNotFoundError(
                         _('No (valid) grammar for editype "%(editype)s" messagetype "%(messagetype)s".'
                           ),
                         {
                             "editype": self.__class__.__name__,
                             "messagetype": messagetype,
                         },
                     )
             self.messagecount += 1
             self.messagetypetxt = _(
                 "Message nr %(count)s, type %(type)s, " % {
                     "count": self.messagecount,
                     "type": messagetype
                 })
             # recurse: parse the submessage with the subtranslation's grammar
             current_lex_record = self._parse(
                 structure_level=defmessage.structure[0][LEVEL],
                 inode=newnode)
             # copy messagetype into 1st segment of subtranslation (eg UNH, ST)
             newnode.queries = {"messagetype": messagetype}
             newnode.queries.update(defmessage.syntax)
             # ~ newnode.queries = defmessage.syntax.copy()       #if using this line instead of previous 2: gives errors eg in incoming edifact...do not understand why
             self.checkmessage(newnode, defmessage, subtranslation=True
                               )  # check the results of the subtranslation
             # ~ end SUBTRANSLATION
             self.messagetypetxt = ""
             # get_next_lex_record is still False; we are trying to match the last (not
             # matched) record from the SUBTRANSLATION (named 'current_lex_record').
         else:
             if (LEVEL in structure_level[structure_index]
                 ):  # if header, go parse segmentgroup (recursive)
                 current_lex_record = self._parse(
                     structure_level=structure_level[structure_index]
                     [LEVEL],
                     inode=newnode,
                 )
                 # get_next_lex_record is still False; the current_lex_record that was not
                 # matched in lower segmentgroups is still being parsed.
             else:
                 get_next_lex_record = True
             # accomodate for UNS = UNS construction
             if (structure_level[structure_index][MIN] ==
                     structure_level[structure_index][MAX] ==
                     countnrofoccurences):
                 if structure_index + 1 == structure_end:
                     pass
                 else:
                     structure_index += 1
                     countnrofoccurences = 0
Esempio n. 9
0
    def _lex(self):
        """ lexes file with variable records to list of lex_records, fields and subfields (build self.lex_records).

            Character-by-character state machine over self.rawinput.
            States tracked: in/out of record, in quote (incl. doubled-quote
            lookahead), after escape char. Each finished token is written as
            a dict {VALUE, SFIELD, LIN, POS}; a completed record (list of
            tokens) is appended to self.lex_records.
        """
        record_sep = self.ta_info["record_sep"]
        mode_inrecord = (
            0
        )  # 1 indicates: lexing in record, 0 is lexing 'between records'.
        # for tradacoms; field_sep and record_tag_sep have same function.
        field_sep = self.ta_info["field_sep"] + self.ta_info["record_tag_sep"]
        sfield_sep = self.ta_info["sfield_sep"]
        rep_sep = self.ta_info["reserve"]
        allow_spaces_between_records = self.ta_info.get(
            "allow_spaces_between_records", True
        )
        sfield = 0  # 1: subfield, 0: not a subfield, 2:repeat
        quote_char = self.ta_info[
            "quote_char"
        ]  # typical fo csv. example with quote_char ":  ,"1523",TEXT,"123",
        mode_quote = 0  # 0=not in quote, 1=in quote
        # status within mode_quote. 0=just another char within quote, 1=met 2nd
        # quote char; might be end of quote OR escaping of another quote-char.
        mode_2quote = 0
        escape = self.ta_info[
            "escape"
        ]  # char after escape-char is not interpreted as separator
        mode_escape = 0  # 0=not escaping, 1=escaping
        # chars to ignore/skip/discard. eg edifact: if wrapped to 80pos lines and <CR/LF> at end of segment
        skip_char = self.ta_info["skip_char"]
        lex_record = []  # gather the content of a record
        value = ""  # gather the content of (sub)field; the current token
        valueline = 1  # record line of token
        valuepos = 1  # record position of token in line
        countline = 1  # count number of lines; start with 1
        countpos = 0  # count position/number of chars within line
        # all separator chars combined into one string: single membership test per char
        sep = field_sep + sfield_sep + record_sep + escape + rep_sep

        for char in self.rawinput:  # get next char
            if char == "\n":
                # count number lines/position; no action.
                countline += 1  # count line
                countpos = 0  # position back to 0
            else:
                countpos += 1  # position within line
            if mode_quote:
                # lexing within a quote; note that quote-char works as escape-char within a quote
                if mode_2quote:
                    mode_2quote = 0
                    if (
                            char == quote_char
                    ):  # after quote-char another quote-char: used to escape quote_char:
                        value += char  # append quote_char
                        continue
                    else:  # quote is ended:
                        mode_quote = 0
                        # continue parsing of this char
                elif mode_escape:  # tricky: escaping a quote char
                    mode_escape = 0
                    value += char
                    continue
                elif (
                        char == quote_char
                ):  # either end-quote or escaping quote_char,we do not know yet
                    mode_2quote = 1
                    continue
                elif char == escape:
                    mode_escape = 1
                    continue
                else:  # we are in quote, just append char to token
                    value += char
                    continue
            if char in skip_char:
                # char is skipped. In csv these chars could be in a quote; in eg edifact
                # chars will be skipped, even if after escape sign.
                continue
            if not mode_inrecord:
                # get here after record-separator is found. we are 'between' records.
                # some special handling for whtiespace characters; for other chars: go on lexing
                if char.isspace():  # whitespace = ' \t\n\r\v\f'
                    if (
                            allow_spaces_between_records
                    ):  # by default True; False for strict handling of x12/edifact
                        # exception for tab-delimited csv files. If first field is not filled:
                        # first TAB is ignored, which is not TransactionStatus.OK. Patch this:
                        if char in field_sep and self.__class__.__name__ == 'Csv': #  isinstance(self, Csv):
                            pass  # do not ignore TAB -> go on lexing
                        else:
                            continue  # ignore whitespace character; continue for-loop with next character
                    else:  # for strict checks: no spaces between records
                        raise InMessageError(
                            _(
                                "[A67]: Found space characters between segments. Line %(countline)s, position %(pos)s, position %(countpos)s."
                            ),
                            {"countline": countline, "countpos": countpos},
                        )
                mode_inrecord = 1  # not whitespace - a new record has started
            if mode_escape:
                # in escaped_mode: char after escape sign is appended to token
                mode_escape = 0
                value += char
                continue
            if not value:
                # if no char in token: this is a new token, get line and pos for (new) token
                valueline = countline
                valuepos = countpos
            if char == quote_char and (not value or value.isspace()):
                # for csv: handle new quote value. New quote value only makes sense for
                # new field (value is empty) or field contains only whitespace
                mode_quote = 1
                continue
            if char not in sep:
                value += char  # just a char: append char to value
                continue
            # from here on: char is one of the separators
            if char in field_sep:
                # end of (sub)field. Note: first field of composite is marked as 'field'
                lex_record.append(
                    {VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos}
                )  # write current value to lex_record
                value = ""
                sfield = 0  # new token is field
                continue
            if char == sfield_sep:
                # end of (sub)field. Note: first field of composite is marked as 'field'
                lex_record.append(
                    {VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos}
                )  # write current value to lex_record
                value = ""
                sfield = 1  # new token is sub-field
                continue
            if char in record_sep:  # end of record
                lex_record.append(
                    {VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos}
                )  # write current value to lex_record
                self.lex_records.append(
                    lex_record
                )  # write lex_record to self.lex_records
                lex_record = []
                value = ""
                sfield = 0  # new token is field
                mode_inrecord = 0  # we are not in a record
                continue
            if char == escape:
                mode_escape = 1
                continue
            if char == rep_sep:
                lex_record.append(
                    {VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos}
                )  # write current value to lex_record
                value = ""
                sfield = 2  # new token is repeating
                continue
        # end of for-loop. all characters have been processed.
        # in a perfect world, value should always be empty now, but:
        # it appears a csv record is not always closed properly, so force the closing of the last record of csv file:
        if mode_inrecord and self.ta_info.get(
                "allow_lastrecordnotclosedproperly", False
        ):
            lex_record.append(
                {VALUE: value, SFIELD: sfield, LIN: valueline, POS: valuepos}
            )  # append element in record
            self.lex_records.append(lex_record)  # write record to recordlist
        else:
            # strip NUL and EOF (Ctrl-Z) padding before deciding there is real leftover data
            leftover = value.strip("\x00\x1a")
            if leftover:
                raise InMessageError(
                    _(
                        '[A51]: Found non-valid data at end of edi file; probably a problem with separators or message structure: "%(leftover)s".'
                    ),
                    {"leftover": leftover},
                )
Esempio n. 10
0
    def initfromfile(self):
        """ initialisation from an excel file.
            file is first converted to csv using python module xlrd, then
            lexed and parsed as csv.
            Raises ImportError when xlrd is missing, InMessageError on
            extraction/parse problems.
        """
        try:
            self.xlrd = botsbaseimport("xlrd")
        except ImportError:
            raise ImportError(
                _('Dependency failure: editype "excel" requires python library "xlrd".'
                  ))
        import csv as csvlib

        try:
            import StringIO  # python 2
        except ImportError:
            # was a bare 'except:'; only a missing module should trigger the python 3 fallback
            import io as StringIO

        self.messagegrammarread(typeofgrammarfile="grammars")
        self.ta_info["charset"] = self.defmessage.syntax[
            "charset"]  # always use charset of edi file.
        # csv: 'doublequote' and 'escapechar' are alternative quoting mechanisms
        doublequote = not self.ta_info["escape"]

        logger.debug('Read edi file "%(filename)s".', self.ta_info)
        # xlrd reads excel file; python's csv modules write this to file-like
        # StringIO (as utf-8); read StringIO as self.rawinput; decode this
        # (utf-8->str)
        infilename = abspathdata(self.ta_info["filename"])
        try:
            xlsdata = self.read_xls(infilename)
        except Exception:
            # was a bare 'except:': do not swallow SystemExit/KeyboardInterrupt
            txt = txtexc()
            logger.error(
                _("Excel extraction failed, may not be an Excel file? Error:\n%(txt)s"
                  ),
                {"txt": txt},
            )
            raise InMessageError(
                _("Excel extraction failed, may not be an Excel file? Error:\n%(txt)s"
                  ),
                {"txt": txt},
            )
        rawinputfile = StringIO.StringIO()
        csvout = csvlib.writer(
            rawinputfile,
            quotechar=self.ta_info["quote_char"],
            delimiter=self.ta_info["field_sep"],
            doublequote=doublequote,
            escapechar=self.ta_info["escape"],
        )
        csvout.writerows(map(self.utf8ize, xlsdata))
        rawinputfile.seek(0)
        self.rawinput = rawinputfile.read()
        rawinputfile.close()
        if isinstance(self.rawinput, bytes):
            # only the python 2 StringIO path yields utf-8 bytes; io.StringIO
            # already returns str (str has no .decode on python 3)
            self.rawinput = self.rawinput.decode("utf-8")
        # start lexing and parsing as csv
        self._lex()
        if hasattr(self, "rawinput"):
            del self.rawinput
        self.root = node.Node()  # make root Node None.
        self.iternext_lex_record = iter(self.lex_records)
        leftover = self._parse(structure_level=self.defmessage.structure,
                               inode=self.root)
        if leftover:
            raise InMessageError(
                _('[A52]: Found non-valid data at end of excel file: "%(leftover)s".'
                  ),
                {"leftover": leftover},
            )
        del self.lex_records
        self.checkmessage(self.root, self.defmessage)
# Example 11
    def _sniff(self):
        """ Examine the beginning of an edifact file for syntax parameters and charset.

            Raises InMessageError if the (beginning of the) edifact file is not correct.
            The edifact file is read as binary; edifact has several charsets (UNOA, UNOC, UNOY).
            Processing assumes the charset is ascii, utf-8 or some charset where
            1 char == 1 byte (eg iso-8859-1).
            (if other charset: would be easy to change. charset is in grammar, read/decode for charset, do parsing)
            Bots assumes: UNA-string contains NO extra CR/LF. (would be absurd; combination of: multiple UNA in file & using 'blocked' edifact.)
        """
        # only the first ~99 bytes are needed to sniff UNA/UNB; decode as
        # iso-8859-1 so every byte value maps to exactly one character.
        rawinput = self.rawinput[0:99].decode("iso-8859-1")
        # **************find first non-whitespace character
        rawinput = rawinput.lstrip()
        # **************check if UNA
        if rawinput.startswith("UNA"):
            has_una_string = True
            # read UNA; set syntax parameters from UNA.
            # the six chars directly after 'UNA' define the separators, in this fixed order.
            count = 3
            try:
                for field in [
                        "sfield_sep",
                        "field_sep",
                        "decimaal",
                        "escape",
                        "reserve",
                        "record_sep",
                ]:
                    self.ta_info[field] = rawinput[count]
                    count += 1
            except IndexError:
                # file ended inside the UNA-string;
                # plus some border cases; not possible if mailbag is used.
                raise InMessageError(
                    _("[A53]: Edifact file contains only whitespace."))
            # option extra check: separators etc are never in [0-9-a-zA-Z].
            # UNA-string is done; loop until next not-space char
            rawinput = rawinput[count:].lstrip()
        else:
            has_una_string = False

        # **************expect UNB
        # loop over rawinput to extract segmenttag, used separators, etc.
        # positions (count2): 0-2 = segment tag, 3 = field separator,
        # 4-7 = charset (eg 'UNOA'), 8 = sub-field separator, 9 = syntax version.
        count2 = 0
        found_tag = ""
        found_charset = ""
        for char in rawinput:
            if char in self.ta_info["skip_char"]:
                continue  # skip chars (eg CR/LF) do not advance count2
            if count2 <= 2:
                found_tag += char
            elif count2 == 3:
                found_field_sep = char
                if found_tag != "UNB":
                    # also: UNA too short. not possible if mailbag is used.
                    raise InMessageError(
                        _('[A54]: Found no "UNB" at the start of edifact file.'
                          ))
            elif count2 <= 7:
                found_charset += char
            elif count2 == 8:
                found_sfield_sep = char
            else:
                self.ta_info["version"] = char
                break
            count2 += 1
        else:
            # loop exhausted without 'break': the 99-byte sample never reached
            # the version char, probably because of too many <CR/LF>.
            raise InMessageError(
                _("[A55]: Problems with UNB-segment; encountered too many <CR/LF>."
                  ))

        # set and/or verify separators
        if has_una_string:
            # UNA was present: the separators actually used in UNB must match it.
            if (found_field_sep != self.ta_info["field_sep"]
                    or found_sfield_sep != self.ta_info["sfield_sep"]):
                raise InMessageError(
                    _("[A56]: Separators used in edifact file differ from values indicated in UNA-segment."
                      ))
        else:
            if (found_field_sep == "+" and found_sfield_sep
                    == ":"):  # assume standard/UNOA separators.
                self.ta_info["sfield_sep"] = ":"
                self.ta_info["field_sep"] = "+"
                self.ta_info["decimaal"] = "."
                self.ta_info["escape"] = "?"
                self.ta_info["reserve"] = "*"
                self.ta_info["record_sep"] = "'"
            elif (found_field_sep == "\x1D" and found_sfield_sep
                  == "\x1F"):  # check if UNOB separators are used
                self.ta_info["sfield_sep"] = "\x1F"
                self.ta_info["field_sep"] = "\x1D"
                self.ta_info["decimaal"] = "."
                self.ta_info["escape"] = ""
                self.ta_info["reserve"] = "*"
                self.ta_info["record_sep"] = "\x1C"
            else:
                raise InMessageError(
                    _("[A57]: Edifact file with non-standard separators. An UNA segment should be used."
                      ))

        # *********** decode the file (to str); decode from the UNB onwards so a
        # possible UNA prefix (already consumed above) is dropped.
        try:
            startUNB = self.rawinput.find(b"UNB")
            self.rawinput = self.rawinput[startUNB:].decode(
                found_charset, self.ta_info["checkcharsetin"])
            self.ta_info["charset"] = found_charset
        except LookupError:
            # unknown codec; __cause__ = None suppresses exception chaining so
            # only the bots error is reported.
            _exception = InMessageError(
                _('[A58]: Edifact file has unknown characterset "%(charset)s".'
                  ),
                {"charset": found_charset},
            )
            _exception.__cause__ = None
            raise _exception
        # ~ except UnicodeDecodeError as msg:
        # ~ raise InMessageError(_('[A59]: Edifact file has not allowed characters at/after file-position %(content)s.'),
        # ~ {'content':msg[2]})
        # repetition separator only for version >= 4; if version >= 4 declares a
        # space as repetition separator, assume that is a mistake and disable it.
        if self.ta_info["version"] < "4" or self.ta_info["reserve"] == " ":
            # if version > 4 and repetition separator is
            # space: assume this is a mistake; use
            # repetition separator
            self.ta_info["reserve"] = ""
        self.separatorcheck(self.ta_info["sfield_sep"] +
                            self.ta_info["field_sep"] +
                            self.ta_info["decimaal"] + self.ta_info["escape"] +
                            self.ta_info["reserve"] +
                            self.ta_info["record_sep"])
# Example 12
    def initfromfile(self):
        """ Initialisation from an xml file: parse with ElementTree and convert to a bots node tree.

            For messagetype 'mailbag' the real messagetype is first determined by
            probing the xml with the 'mailbagsearch' rules from the xml mailbag grammar.
        """
        logger.debug('Read edi file "%(filename)s".', self.ta_info)
        edi_filename = abspathdata(self.ta_info["filename"])

        if self.ta_info["messagetype"] != "mailbag":
            # messagetype is known up-front: read grammar, then parse the xml.
            self.messagegrammarread(typeofgrammarfile="grammars")
            xml_parser = ET.XMLParser()
            for entity_name, entity_value in self.ta_info["extra_character_entity"].items():
                xml_parser.entity[entity_name] = entity_value
            tree = (
                ET.ElementTree()
            )  # ElementTree: lexes, parses, makes etree; etree is quite similar to bots-node trees but conversion is needed
            tree_root = tree.parse(edi_filename, xml_parser)
        else:
            # the messagetype is not know.
            # bots reads file usersys/grammars/xml/mailbag.py, and uses 'mailbagsearch' to determine the messagetype
            # mailbagsearch is a list, containing python dicts. Dict consist of 'xpath', 'messagetype' and (optionally) 'content'.
            # 'xpath' is a xpath to use on xml-file (using elementtree xpath functionality)
            # if found, and 'content' in the dict; if 'content' is equal to value found by xpath-search, then set messagetype.
            # if found, and no 'content' in the dict; set messagetype.
            try:
                mailbag_module, grammar_name = botsimport("grammars", "xml", "mailbag")
                mailbagsearch = mailbag_module.mailbagsearch
            except AttributeError:
                logger.error(
                    "Missing mailbagsearch in mailbag definitions for xml.")
                raise
            except BotsImportError:
                logger.error(
                    "Missing mailbag definitions for xml, should be there.")
                raise
            xml_parser = ET.XMLParser()
            try:
                for entity_name, entity_value in mailbag_module.extra_character_entity.items():
                    xml_parser.entity[entity_name] = entity_value
            except AttributeError:
                pass  # there is no extra_character_entity in the mailbag definitions, is TransactionStatus.OK.
            tree = (
                ET.ElementTree()
            )  # ElementTree: lexes, parses, makes etree; etree is quite similar to bots-node trees but conversion is needed
            tree_root = tree.parse(edi_filename, xml_parser)
            messagetype_found = False
            for searchitem in mailbagsearch:
                if "xpath" not in searchitem or "messagetype" not in searchitem:
                    raise InMessageError(
                        _("Invalid search parameters in xml mailbag."))
                match_node = tree.find(searchitem["xpath"])
                if match_node is None:
                    continue
                # an optional 'content' must also match the text of the found node
                if "content" in searchitem and match_node.text != searchitem["content"]:
                    continue
                self.ta_info["messagetype"] = searchitem["messagetype"]
                messagetype_found = True
                break
            if not messagetype_found:
                raise InMessageError(
                    _("Could not find right xml messagetype for mailbag."))

            self.messagegrammarread(typeofgrammarfile="grammars")
        self._handle_empty(tree_root)
        self.stackinit()
        self.root = self._etree2botstree(
            tree_root)  # convert etree to bots-nodes-tree
        self.checkmessage(self.root, self.defmessage)
        self.ta_info.update(self.root.queries)