Example #1
0
def extractSBT(s):
    """
    Takes the SBT response XML and returns the answers of the SBT.

    If the ``<responseData>`` element was truncated (opened but never closed),
    the input is repaired before parsing: everything after the last complete
    ``</responseDatum>`` is dropped and the missing closing tag is appended.

    :param s: the SBT response XML, as ``str`` or ``bytes``; bytes are decoded
        as UTF-8 and undecodable bytes are dropped
    :return: a list of answer strings formatted as ``"<id>-<value>"``
    """
    # Decode first so that every string operation below works on text. The
    # decode used to happen after the searches (``str(s, errors='ignore')``),
    # which raises TypeError for ``str`` input on Python 3.
    if isinstance(s, bytes):
        s = s.decode('utf-8', errors='ignore')

    # Repair a truncated <responseData> element. Membership tests are used so
    # that a document starting directly with the tag (index 0) is repaired too.
    if '<responseData>' in s and '</responseData>' not in s:
        if '</responseDatum>' in s:
            # keep only the complete responseDatum records
            s = s.rsplit('</responseDatum>',
                         1)[0] + '</responseDatum></responseData>'
        else:
            s = s + '</responseData>'

    root = etree.fromstring(s)

    answerlist = []
    for responseDatum in root.iter('responseDatum'):
        responseComponentId = responseDatum.findtext('responseComponentId')
        responseType = responseDatum.findtext('responseType')
        for content in responseDatum.iter('content'):
            # presumably a dict of the <content> key/value pairs -- the helper
            # is defined elsewhere; verify its key and value types there
            ctdict = parseXMLContentDatum(content)
            if responseType == "Selection":
                # report only the first option marked 'true', as a letter
                for key, value in ctdict.items():
                    if value == 'true':
                        # NOTE(review): indexing requires `key` to be an int;
                        # confirm parseXMLContentDatum does not return
                        # string keys here.
                        sel = string.ascii_uppercase[key]
                        answerlist.append("{}-{}".format(
                            responseComponentId, sel))
                        break
            elif responseType == "TextSelection":
                for value in ctdict.values():
                    answerlist.append("{}-{}".format(responseComponentId,
                                                     value))
            elif responseType == "Math":
                # mathml, output last action
                for key, value in ctdict.items():
                    val = MathMLExtraction(value)
                    answerlist.append("{}-{}".format(key, val))
            elif responseType == "Text":
                for key, value in ctdict.items():
                    if value.startswith('![CDATA['):
                        value = value.split('![CDATA[')[1]
                        # Drop exactly one closing "]]". rstrip(']]') strips
                        # a character set and would also eat any "]" that
                        # belongs to the answer itself.
                        if value.endswith(']]'):
                            value = value[:-2]
                    answerlist.append("{}-{}".format(key, value))
            elif responseType == "Record":
                for key, value in ctdict.items():
                    answerlist.append("{}-{}".format(key, value))
            # any other response type contributes no answers
    return answerlist
Example #2
0
def parsePearsonObservableXML(source, unicodeJunkChar="@"):
    """
    Takes a Pearson observable XML string, returns a Pandas data frame.
    The Pearson XML export is one student per file, like the following:
    <?xml version="1.0" encoding="utf-8"?>
    <assessmentResult>
      <context>
        <sessionIdentifier sourceID="Database Version 213" softwareVersion="4.1.1.34969" superVersion="4.1.3" chromeExtension="3.1.3" assessmentYear="2017" schoolCode="666666" sessionNumber="DS0401" />
        <bookletNumber>8888888888</bookletNumber>
        <assignedForm>R888</assignedForm>
      </context>
      <testResult assessmentYear="2017" subjectName="Reading" assessedGroup="Grade 4" datestamp="2017-01-30T16:30:12.012Z">
        <outcomeVariable cardinality="record" interpretation="AdministrationCode">
          <value fieldIdentifier="AdministrationCode" baseType="integer">10</value>
          <value fieldIdentifier="AdministrationCodeDescription" baseType="string">Original session - In session full time</value>
        </outcomeVariable>
        <outcomeVariable cardinality="single" baseType="string" interpretation="TeacherNumber">
          <value>01</value>
        </outcomeVariable>
      </testResult>
      <itemResult accessionNumber="Adjust" itemType="Adjustment" blockCode="ADJUST">
        <outcomeVariable cardinality="single" interpretation="Enter Item">
          <value fieldIdentifier="EventTime" baseType="dateTime">2017-01-30T14:12:13.343Z</value>
        </outcomeVariable>
      </itemResult>
      ...
    Note that one or two of the <itemResult> elements may have ItemType=="SBT", such as
    <itemResult accessionNumber="VH888888" itemType="SBT" blockCode="8888888">
        <observableDatum>
          <sceneId>intro01</sceneId>
          <controlId>api</controlId>
          <eventType>api.itemReadyEvent</eventType>
          <timestamp>2017-12-11T14:29:12.531Z</timestamp>
          <content>
            <pair>
              <key>success</key>
              <value>true</value>
            </pair>
          </content>
        </observableDatum>
    In this case the function will unpack the SBT records as rows in the data
    frame as well. Numeric character references (``&#x...``) in a string
    source are replaced with `@` by default.

    An <itemResult> that is not an SBT and has no <outcomeVariable> child is
    logged and skipped.

    :param source: the XML string, or an XML root node
    :param unicodeJunkChar: the character or string to replace any unicode characters; default to "@"
    :return: a parsed Pandas data frame, or None if error
    """
    # Item types whose <itemResult> embeds its own <observableDatum> records.
    sbtItemTypes = ["SBT", "ReadingNonSBT"]

    if isinstance(source, str):
        try:
            # first, replace all unicode character references
            # NOTE(review): the pattern does not consume the trailing ";" of
            # a reference, so "&#x41;" becomes "@;" -- confirm this is wanted.
            source = re.sub(r"\&\#x[0-9a-fA-F]+", unicodeJunkChar, source)
            # try to parse the xml string
            root = ET.fromstring(source)
        except Exception as e:
            warnings.warn("XML contains incomplete Booklet level information")
            logger.error("XML contains incomplete Booklet level information")
            logger.exception(e)
            return None
    else:
        # Not a string: treat it as an already-parsed node. This is a
        # documented input, so it must not be reported as an error.
        root = source

    observableList = []
    bookletDict = dict()
    # processing top-level xml elements; a missing element makes `.text` or
    # `.get` fail on None, which is reported as incomplete booklet info
    try:
        sessionIdentifier = root.find("context/sessionIdentifier")
        testResult = root.find("testResult")
        bookletDict["BookletNumber"] = root.find("context/bookletNumber").text
        bookletDict["Form"] = root.find("context/assignedForm").text
        bookletDict["SchoolCode"] = sessionIdentifier.get("schoolCode")
        bookletDict["sessionNumber"] = sessionIdentifier.get("sessionNumber")
        bookletDict["Year"] = sessionIdentifier.get("assessmentYear")
        bookletDict["Grade"] = testResult.get("assessedGroup")
        bookletDict["SubjectCode"] = testResult.get("subjectName")
    except Exception as e:
        warnings.warn("XML contains incomplete Booklet level information")
        logger.error("XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    for itemResult in root.iter("itemResult"):
        # every row starts from a copy of the booklet-level fields
        itemDict = bookletDict.copy()
        itemDict["BlockCode"] = itemResult.get("blockCode")
        itemDict["ItemTypeCode"] = itemResult.get("itemType")
        itemDict["AccessionNumber"] = itemResult.get("accessionNumber")

        # Now depending on the itemType we have different processes
        if itemDict["ItemTypeCode"] in sbtItemTypes:
            # SBTs embed their own XML data; each observableDatum is a row
            for observableDatum in itemResult.iter('observableDatum'):
                # a fresh dict per row; the list keeps a reference to it
                obsDict = itemDict.copy()
                controlId = observableDatum.findtext('controlId')
                obsDict["SceneId"] = observableDatum.findtext('sceneId')
                obsDict["ControlId"] = controlId
                # the component id is the controlId without its last
                # dot-separated part; None when <controlId> is missing
                obsDict["ResponseComponentId"] = (
                    controlId.rsplit('.', 1)[0]
                    if controlId is not None else None)
                obsDict["Label"] = observableDatum.findtext('eventType')
                obsDict["EventTime"] = observableDatum.findtext('timestamp')
                obsDict["ExtendedInfo"] = parseXMLContentDatum(
                    observableDatum.find("content"))
                observableList.append(obsDict)
        else:
            # all others: the first outcomeVariable of the item is one row
            outcomeVariable = itemResult.find("outcomeVariable")
            if outcomeVariable is None:
                logger.warning("itemResult %s has no outcomeVariable; skipped",
                               itemDict["AccessionNumber"])
                continue
            itemDict["Label"] = outcomeVariable.get("interpretation")
            # each <value> becomes a column named by its fieldIdentifier
            for value in outcomeVariable.iter("value"):
                itemDict[value.get("fieldIdentifier")] = value.text
            observableList.append(itemDict)

    # error check
    # if no actual data records, exit with a warning and return None
    if not observableList:
        warnings.warn("XML contains no data")
        logger.warning("XML contains no data")
        return None

    # We have data. Now we create a data frame and parse the ExtendedInfo.
    try:
        df = pd.DataFrame.from_dict(observableList)
        df = df.pipe(parseExtendedInfo)
        # Parse extended info for SBT items. The legacy "ReadingNonSbt"
        # spelling is still accepted, and the spelling used when unpacking
        # above is now included so those rows are parsed as well.
        idx = df["ItemTypeCode"].isin(sbtItemTypes + ["ReadingNonSbt"])
        df.loc[idx, "extInfo"] = df.loc[idx, "ExtendedInfo"].pipe(parseJSONObservables)
        df = df.sort_values("EventTime")
    except Exception as e:
        warnings.warn("XML data cannot be converted to a data frame")
        logger.error("XML data cannot be converted to a data frame")
        logger.exception(e)
        return None

    return df
Example #3
0
def parseSbtXML(source, keepResponseData=False):
    """
    Parse the SBT xml string for each block.
    The 2019 SBTs adds a new stateDiff field. This function takes an
    individual XML per student per block, and returns a Pandas data frame.

    Each <observableDatum> becomes one row. ``StateDiff`` holds the text of
    <stateDiff> with newlines removed, or None when the element is absent.
    ``ExtendedInfo`` holds the stringified result of parseXMLContentDatum for
    the datum's <content>, or None when the datum has no <content>.

    :param source: the XML string or a XML node
    :param keepResponseData: false by default, otherwise combines both obs and res data.
    :return: a data frame or None
    """

    # if source is a XML node, parsing fails and the node is used as-is
    try:
        parser = ET.XMLParser()
        root = ET.fromstring(source, parser=parser)
    except Exception:
        # not a (parsable) string
        root = source

    df = None
    # get top level basic info; findtext returns None for missing elements,
    # so this only fails when root is not an XML node at all
    try:
        bookletId = root.findtext('bookletId')
        taskId = root.findtext('taskId')
        blockId = root.findtext('blockId')
        accommodations = root.findtext('accommodations')
        extendedTimeFactor = root.findtext('extendedTimeFactor')
    except Exception as e:
        warnings.warn(
            "ParseSbtXML: XML contains incomplete Booklet level information")
        logger.error(
            "ParseSbtXML: XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    # get observable data
    observableMatrix = []
    for observableDatum in root.iter('observableDatum'):
        # Reset per datum so that a datum without <content> does not inherit
        # the content of the previous one.
        ct = None
        # ct needs to be a string; it is re-parsed later on.
        # If several <content> elements exist, the last one wins.
        for content in observableDatum.iter('content'):
            ct = str(parseXMLContentDatum(content))

        # <stateDiff> only exists in 2019+ data; leave it as None otherwise
        stateDiff = observableDatum.findtext('stateDiff')
        if stateDiff is not None:
            stateDiff = stateDiff.replace("\n", "")

        observableMatrix.append({
            'BookletNumber': bookletId,
            'BlockId': blockId,
            'TaskId': taskId,
            'Accomodations': accommodations,
            'ExtendedTimeFactor': extendedTimeFactor,
            'SceneId': observableDatum.findtext('sceneId'),
            'ControlId': observableDatum.findtext('controlId'),
            'Label': observableDatum.findtext('eventType'),
            'EventTime': observableDatum.findtext('timestamp'),
            'StateDiff': stateDiff,
            'ExtendedInfo': ct
        })

    # turn the data into a data frame
    if not keepResponseData:
        try:
            # observables only
            df = pd.DataFrame.from_dict(observableMatrix)
        except Exception as e:
            warnings.warn(
                "ParseSbtXML: cannot turn Observable XML into a data frame")
            logger.error(
                "ParseSbtXML: cannot turn Observable XML into a data frame")
            logger.exception(e)
            return None

    else:
        # get response data
        responseMatrix = []
        for responseDatum in root.iter('responseDatum'):
            # reset per datum, see the observable loop above
            ct = None
            for content in responseDatum.iter('content'):
                ct = str(parseXMLContentDatum(content))
            responseMatrix.append({
                'SceneId': responseDatum.findtext('sceneId'),
                'ResponseComponentId':
                responseDatum.findtext('responseComponentId'),
                'ResponseType': responseDatum.findtext('responseType'),
                'ResponseContent': ct
            })

        try:
            # first create observable dataframe
            dfObs = pd.DataFrame.from_dict(observableMatrix)
            # then create response dataframe
            dfResp = pd.DataFrame.from_dict(responseMatrix)
            if dfResp.empty:
                df = dfObs
            elif dfObs.empty:
                df = dfResp
            else:
                # GF: Wait, does "outer" really work here?
                # The merge pairs every observable of a scene with every
                # response of that scene.
                df = pd.merge(dfObs, dfResp, how='outer', on='SceneId')
        except Exception as e:
            warnings.warn("ParseSbtXML: cannot turn XML into a data frame")
            logger.error("ParseSbtXML: cannot turn XML into a data frame")
            logger.exception(e)
            return None

    return df
Example #4
0
def parseSbtObservableXML(
    source,
    bl="current",
    bc="current",
    unicodeJunkChar="@"
):  #bl stands for bookletnumber; bc stands for blockcode
    """
    Parse the SBT xml string from SQL Response data table for each block to get Observable data frame.

    This function parses the XML string stored in the SQL Response Data Table for SBT and similar
    black-box component, where it keeps its own observable data and export
    an XML along with the response data. We have to export this data from
    the responseData SQL database as an XML file. This function takes an
    individual XML per student per block, and returns a Pandas data frame.

    :param source: the XML string or a XML node
    :param bl: booklet number, used only to label warning and log messages
    :param bc: block code, used only to label warning and log messages
    :param unicodeJunkChar: the character or string to replace any unicode characters; default to "@"
    :return: a data frame of observables or None if errors
    """
    logger.debug('From parseSbtobservableXML')

    # Prefix for all diagnostics. Formatting (rather than "+") keeps the
    # error handlers from raising TypeError when bl/bc are not strings.
    ident = "BlockCode {} BookletNumber {}".format(bc, bl)

    if isinstance(source, str):
        try:
            # first, replace all unicode character references
            # NOTE(review): the pattern does not consume the trailing ";" of
            # a reference, so "&#x41;" becomes "@;" -- confirm this is wanted.
            source = re.sub(r"\&\#x[0-9a-fA-F]+", unicodeJunkChar, source)
            # try to parse the xml string
            root = ET.fromstring(source)
        except Exception as e:
            warnings.warn(
                ident + " XML contains incomplete Booklet level information")
            logger.error(
                ident + " XML contains incomplete Booklet level information")
            logger.exception(e)
            return None
    else:
        # Not a string: treat it as an already-parsed node. This is a
        # documented input, so it must not be reported as an error.
        root = source

    observableList = []
    bookletDict = dict()
    # processing top-level xml elements; a missing element makes `.text`
    # fail on None, which is reported as incomplete booklet info
    try:
        bookletDict["BookletNumber"] = root.find("bookletId").text
        bookletDict["BlockCode"] = root.find("blockId").text
        bookletDict["ItemTypeCode"] = "SBT"
        bookletDict["AccessionNumber"] = root.find("taskId").text
    except Exception as e:
        warnings.warn(
            ident + " XML contains incomplete Booklet level information")
        logger.error(
            ident + " XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    # SBTs actually embed its own XML data, we need to loop through them and save each as a row
    for observableDatum in root.iter('observableDatum'):
        # a fresh dict per row; the list keeps a reference to it
        obsDict = bookletDict.copy()
        obsDict["SceneId"] = observableDatum.findtext('sceneId')
        obsDict["ControlId"] = observableDatum.findtext('controlId')
        obsDict["Label"] = observableDatum.findtext('eventType')
        obsDict["EventTime"] = observableDatum.findtext('timestamp')
        obsDict["ExtendedInfo"] = parseXMLContentDatum(
            observableDatum.find("content"))
        observableList.append(obsDict)

    # error check
    # if no actual data records, exit with a warning and return None
    if not observableList:
        warnings.warn(ident + " XML contains no data")
        logger.warning(ident + " XML contains no data")
        return None

    # We have data. Now we create a data frame and parse the ExtendedInfo.
    try:
        df = pd.DataFrame.from_dict(observableList)
        # parse extended info for SBT items (every row is "SBT" here, since
        # ItemTypeCode is set to that constant above)
        idx = df["ItemTypeCode"].isin(["SBT", "ReadingNonSbt"])
        df.loc[idx,
               "extInfo"] = df.loc[idx,
                                   "ExtendedInfo"].pipe(parseJSONObservables)
        df = df.sort_values("EventTime")
    except Exception as e:
        warnings.warn("XML data cannot be converted to a data frame")
        logger.error("XML data cannot be converted to a data frame")
        logger.exception(e)
        return None
    return df
Example #5
0
def parseIctXML(source, keepResponseData=False):
    """
    Parse the 2015 ICT observables xml string for each block.

    The 2015 Science ICT follows a precursor of the SBT data format, where most of the
    observable events are saved in the Response data table, as a "response" associated
    with an AccNum. This function takes the XML string of that session, and returns a
    data frame following the standard format of the process data log.

    Note that there are several fields that are unused, e.g., "Label", because they
    are used in the eNAEP-based process data logs (which deal with response-related
    events). We need to merge the two sources of logs to obtain a complete process
    data log.

    Each <observableDatum> becomes one row. ``ExtendedInfo`` holds the
    NFKD-normalized text of the datum's <content>, or None when the datum has
    no <content> or the <content> has no text.

    :param source: the XML string or a XML node
    :param keepResponseData: false by default, otherwise combines both obs and res data.
    :return: a data frame or None
    """

    # if source is a XML node, parsing fails and the node is used as-is
    try:
        parser = ET.XMLParser()
        root = ET.fromstring(source, parser=parser)
    except Exception:
        # not a (parsable) string
        root = source

    df = None
    # get top level basic info; findtext returns None for missing elements,
    # so this only fails when root is not an XML node at all
    try:
        bookletId = root.findtext('bookletId')
        taskId = root.findtext('taskId')
        blockId = root.findtext('blockId')
        accommodations = root.findtext('accommodations')
        extendedTimeFactor = root.findtext('extendedTimeFactor')
    except Exception as e:
        warnings.warn("ParseIctXML: XML contains incomplete Booklet level information")
        logger.error("ParseIctXML: XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    # get observable data
    observableMatrix = []
    for observableDatum in root.iter('observableDatum'):
        # Reset per datum so that a datum without <content> does not inherit
        # the content of the previous one.
        ct = None
        # If several <content> elements exist, the last one wins.
        for content in observableDatum.iter('content'):
            # An empty <content/> has text None; keep it as None instead of
            # turning it into the literal string "None".
            if content.text is None:
                ct = None
            else:
                ct = unicodedata.normalize('NFKD', str(content.text))

        observableMatrix.append({'BookletNumber': bookletId,
                                 'BlockId': blockId,
                                 'TaskId': taskId,
                                 'Accomodations': accommodations,
                                 'ExtendedTimeFactor': extendedTimeFactor,
                                 'SceneId': observableDatum.findtext('sceneId'),
                                 'ControlId': observableDatum.findtext('controlId'),
                                 'Label': observableDatum.findtext('eventType'),
                                 'EventTime': observableDatum.findtext('timestamp'),
                                 'ExtendedInfo': ct})

    # turn the data into a data frame
    if not keepResponseData:
        try:
            # observables only
            df = pd.DataFrame.from_dict(observableMatrix)
        except Exception as e:
            warnings.warn("ParseIctXML: cannot turn Observable XML into a data frame")
            logger.error("ParseIctXML: cannot turn Observable XML into a data frame")
            logger.exception(e)
            return None

    else:
        # get response data
        responseMatrix = []
        for responseDatum in root.iter('responseDatum'):
            # reset per datum, see the observable loop above
            ct = None
            # ct needs to be a string; it is re-parsed later on.
            for content in responseDatum.iter('content'):
                ct = str(parseXMLContentDatum(content))
            responseMatrix.append({'SceneId': responseDatum.findtext('sceneId'),
                                   'ResponseComponentId': responseDatum.findtext('responseComponentId'),
                                   'ResponseType': responseDatum.findtext('responseType'),
                                   'ResponseContent': ct})

        try:
            # first create observable dataframe
            dfObs = pd.DataFrame.from_dict(observableMatrix)
            # then create response dataframe
            dfResp = pd.DataFrame.from_dict(responseMatrix)
            if dfResp.empty:
                df = dfObs
            elif dfObs.empty:
                df = dfResp
            else:
                # GF: Wait, does "outer" really work here?
                # TODO: We don't want to back-fill the response for each scene
                # If we ever do this, we want the ResponseContent to reflect the
                # state of the responses at this point.
                df = pd.merge(dfObs, dfResp, how='outer', on='SceneId')
        except Exception as e:
            warnings.warn("ParseIctXML: cannot turn XML into a data frame")
            logger.error("ParseIctXML: cannot turn XML into a data frame")
            logger.exception(e)
            return None

    return df
Example #6
0
def parsePearsonObservableXML(source):
    """
    Takes a Pearson observable XML string, returns a Pandas data frame.

    The Pearson XML export is one student per file, like the following:

    <?xml version="1.0" encoding="utf-8"?>
    <assessmentResult>
      <context>
        <sessionIdentifier sourceID="Database Version 213" softwareVersion="4.1.1.34969" superVersion="4.1.3" chromeExtension="3.1.3" assessmentYear="2017" schoolCode="666666" sessionNumber="DS0401" />
        <bookletNumber>8888888888</bookletNumber>
        <assignedForm>R888</assignedForm>
      </context>
      <testResult assessmentYear="2017" subjectName="Reading" assessedGroup="Grade 4" datestamp="2017-01-30T16:30:12.012Z">
        <outcomeVariable cardinality="record" interpretation="AdministrationCode">
          <value fieldIdentifier="AdministrationCode" baseType="integer">10</value>
          <value fieldIdentifier="AdministrationCodeDescription" baseType="string">Original session - In session full time</value>
        </outcomeVariable>
        <outcomeVariable cardinality="single" baseType="string" interpretation="TeacherNumber">
          <value>01</value>
        </outcomeVariable>
      </testResult>
      <itemResult accessionNumber="Adjust" itemType="Adjustment" blockCode="ADJUST">
        <outcomeVariable cardinality="single" interpretation="Enter Item">
          <value fieldIdentifier="EventTime" baseType="dateTime">2017-01-30T14:12:13.343Z</value>
        </outcomeVariable>
      </itemResult>
      ...

    An <itemResult> with itemType "SBT" contributes one row per
    <observableDatum> it contains; any other <itemResult> contributes one row
    built from its first <outcomeVariable>. A non-SBT <itemResult> without an
    <outcomeVariable> is logged and skipped.

    :param source: the XML string, or an XML root node
    :return: a parsed Pandas data frame, or None if error
    """

    # if source is a XML node, parsing fails and the node is used as-is
    try:
        parser = ET.XMLParser()
        root = ET.fromstring(source, parser=parser)
    except Exception:
        # not a (parsable) string
        root = source

    df = None
    observableMatrix = []
    # processing top-level xml elements; a missing element or attribute
    # raises and is reported as incomplete booklet info
    try:
        context = root.find("context")
        sessionID = context.find("sessionIdentifier")
        sessionNumber = sessionID.attrib["sessionNumber"]
        bookletNumber = context.findtext("bookletNumber")
        schoolCode = sessionID.attrib["schoolCode"]
        assessmentYear = sessionID.attrib["assessmentYear"]
        assignedForm = context.findtext("assignedForm")
        testResult = root.find("testResult")
        assessedGroup = testResult.attrib["assessedGroup"]
        subjectName = testResult.attrib["subjectName"]
        # logging
        logger.debug("Booklet: %s, %s, %s, %s, %s, %s",
                     subjectName, sessionNumber, schoolCode,
                     assessmentYear, bookletNumber, assignedForm)
    except Exception as e:
        warnings.warn("XML contains incomplete Booklet level information")
        logger.error("XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    for itemResult in root.iter("itemResult"):
        # .get() yields None for a missing attribute instead of raising
        blockCode = itemResult.get("blockCode")
        itemType = itemResult.get("itemType")
        accessionNumber = itemResult.get("accessionNumber")

        # Now depending on the itemType we have different processes
        if itemType == "SBT":
            # SBTs embed their own XML data; each observableDatum is a row.
            # Iterate only THIS item's observables: walking the whole
            # document here would attach every SBT's events to every SBT
            # item and duplicate the rows.
            for observableDatum in itemResult.iter('observableDatum'):
                # Reset per datum so that a datum without <content> does not
                # inherit the content of the previous one.
                ct = None
                # ct needs to be a string; it is re-parsed later on.
                # If several <content> elements exist, the last one wins.
                for content in observableDatum.iter('content'):
                    ct = str(parseXMLContentDatum(content))

                observableMatrix.append({"BookletNumber": bookletNumber,
                                         "AssignedForm": assignedForm,
                                         "SessionNumber": sessionNumber,
                                         "SchoolCode": schoolCode,
                                         "AssessmentYear": assessmentYear,
                                         "Grade": assessedGroup,
                                         "Subject": subjectName,
                                         "BlockCode": blockCode,
                                         "ItemType": itemType,
                                         "AccessionNumber": accessionNumber,
                                         "SceneId": observableDatum.findtext('sceneId'),
                                         "controlId": observableDatum.findtext('controlId'),
                                         "Label": observableDatum.findtext('eventType'),
                                         "EventTime": observableDatum.findtext('timestamp'),
                                         "ExtendedInfo": ct})
        else:
            # all others: the first outcomeVariable of the item is one row
            outcomeVar = itemResult.find("outcomeVariable")
            if outcomeVar is None:
                logger.warning("itemResult %s has no outcomeVariable; skipped",
                               accessionNumber)
                continue
            label = outcomeVar.get("interpretation")
            extendedInfo = ""
            eventTime = ""
            for value in outcomeVar.iter("value"):
                # a <value> without fieldIdentifier is simply ignored
                fieldIdentifier = value.get("fieldIdentifier")
                if fieldIdentifier == "EventTime":
                    eventTime = value.text
                elif fieldIdentifier == "ExtendedInfo":
                    extInf = value.text
                    # for math we remove the MathML because it's too large
                    # we keep only the latex
                    if subjectName == "Mathematics" and label == "Math Keypress":
                        extendedInfo = parseMathML(extInf)
                    else:
                        extendedInfo = extInf

            # AssignedForm is included so that these rows carry the same
            # booklet-level fields as the SBT rows.
            observableMatrix.append({"BookletNumber": bookletNumber,
                                     "AssignedForm": assignedForm,
                                     "SessionNumber": sessionNumber,
                                     "SchoolCode": schoolCode,
                                     "AssessmentYear": assessmentYear,
                                     "Grade": assessedGroup,
                                     "Subject": subjectName,
                                     "BlockCode": blockCode,
                                     "ItemType": itemType,
                                     "AccessionNumber": accessionNumber,
                                     "Label": label,
                                     "EventTime": eventTime,
                                     "ExtendedInfo": extendedInfo})
    # error check
    # if no actual data records, exit with a warning and return None
    if not observableMatrix:
        warnings.warn("XML contains no data")
        logger.warning("XML contains no data")
        return None

    # We have data. Now we create a data frame and parse the ExtendedInfo.
    try:
        df = pd.DataFrame.from_dict(observableMatrix)
        df = df.pipe(parseExtendedInfo)
        df = df.sort_values("EventTime")
    except Exception as e:
        warnings.warn("XML data cannot be converted to a data frame")
        logger.error("XML data cannot be converted to a data frame")
        logger.exception(e)
        return None

    return df