def extractSelectionResponse(respList):
    """
    Extract responses from in-text selection item types in reading assessments.

    Each element of respList describes one option; an option counts as chosen
    when its "val" field is the string "true". Output positions are 1-based.

    :param respList: the json structure with responses
    :return: list of chosen option numbers as strings; on parse errors the
        input is returned unchanged
    """
    try:
        # walk the options, keeping the 1-based position of every chosen one
        selected = []
        for position, option in enumerate(respList, start=1):
            if option["val"] == "true":
                selected.append(str(position))
        return selected
    except Exception as e:
        logger.error("extractSelectionResponse:")
        logger.exception(e)
        exc_buffer = io.StringIO()
        traceback.print_exc(file=exc_buffer)
        logger.error('Uncaught exception in worker process:\n%s',
                     exc_buffer.getvalue())
        return respList
def reconByConfig(df, config):
    """
    Reconstruct responses from an observable data frame using one configuration.

    Groups ``df`` (already pre-filtered to a given set of events) by the
    configured key variables (typically BookletNumber and BlockCode), applies
    the configured "dispatcher" reconstruction function per group, then runs
    the configured "postprocessor" over the combined output to clean it up.

    :param df: the observable data frame, typically containing a single event type
    :param config: configuration dict; must contain "byVars", "dispatcher"
        and "postprocessor"
    :return: the reconstructed data frame, or an empty data frame if errors occur
    """
    assert ("byVars" in config)
    assert ("dispatcher" in config)
    assert ("postprocessor" in config)

    try:
        grouped = df.groupby(config["byVars"])
        reconstructed = grouped.apply(config["dispatcher"], config=config)
        # postprocessor receives the flattened (reset-index) reconstruction
        res = config["postprocessor"](reconstructed.reset_index())
    except Exception as e:
        logger.error("reconByConfig:")
        #logger.error("input df:\n{}".format(df))
        logger.exception(e)
        res = pd.DataFrame()
    return res
def extractMCResponse(respList, maxNumberOfOptions=10):
    """
    Extract MC responses from the XML output as a list of option letters
    (numbers when out of the letter range, per num2alpha).

    Input example::

        [{u'val': u'true', u'key': u'4'}]  -->  ["D"]

    An empty respList yields []. If the structure cannot be parsed, the
    input is returned unchanged.

    :param respList: the json structure with responses
    :param maxNumberOfOptions: optional the max number of options for MC;
        default to 10 to save space
    :return: answer string
    """
    try:
        # map each selected key through num2alpha ("4" -> "D", etc.)
        answers = []
        for response in respList:
            answers.append(num2alpha(response["key"]))
        return answers
    except Exception as e:
        logger.error("extractMCResponse:")
        logger.exception(e)
        exc_buffer = io.StringIO()
        traceback.print_exc(file=exc_buffer)
        logger.error('Uncaught exception in worker process:\n%s',
                     exc_buffer.getvalue())
        return respList
def postProcessSBTTextSelectionResp(dfSBTTextSelectionResp):
    """
    Post-process TextSelection responses for SBTs.

    The input df is the output from reconSBTTextSelection(): each row's
    "ReconstructedAnswer" holds a list like
    ``[{'ResponseComponentId': 'item-SelectExamples', 'ReconstructedAnswer': 'selection-22'}]``.
    We condense these per (BookletNumber, BlockCode, ResponseComponentId) into a
    sorted list such as ``['selection-22', 'selection-23', 'selection-27']``.

    Two groupby passes are used: the first melts the per-row lists into one row
    per selection (keeping BookletNumber/BlockCode from the group keys); the
    second recasts to one row per ResponseComponentId with the selections
    collected into a sorted list.

    :param dfSBTTextSelectionResp: the output from reconSBTTextSelection()
    :return: a df with columns ["BookletNumber", "BlockCode",
        "ResponseComponentId", "ReconstructedAnswer"]; None if the input is
        empty or an error occurs.
    """
    if dfSBTTextSelectionResp is None:
        logger.error(
            "postProcessSBTTextSelectionResp: dfSBTTextSelectionResp is None")
        return None
    if "ResponseComponentId" not in dfSBTTextSelectionResp.columns:
        try:
            # derive the component id from the ControlId when missing
            dfSBTTextSelectionResp[
                "ResponseComponentId"] = dfSBTTextSelectionResp[
                    "ControlId"].apply(truncate)
        except Exception:
            # FIX: narrowed from a bare `except:` so SystemExit /
            # KeyboardInterrupt are no longer silently swallowed here
            return None
    if dfSBTTextSelectionResp.shape[0] > 0:
        # condense this by ResponseComponentId
        try:
            # first, melt the data to multiple rows per ResponseComponentId
            res = dfSBTTextSelectionResp.groupby(["BookletNumber", "BlockCode"]) \
                .apply(lambda df: pd.DataFrame(df["ReconstructedAnswer"].sum())) \
                .reset_index()
            # now recast to one row per ResponseComponentId, with responses in a list and sorted
            res = res.groupby(["BookletNumber", "BlockCode", "ResponseComponentId"]) \
                .apply(lambda df: df["ReconstructedAnswer"].sort_values().tolist()) \
                .rename("ReconstructedAnswer").reset_index()
            return res
        except Exception as e:
            logger.error("postProcessSBTTextSelectionResp:")
            logger.exception(e)
            logger.debug(dfSBTTextSelectionResp)
            return None
    else:
        return None
def extractTextResponse(respList):
    """
    Extract response from Text item types in SBTs.

    Example::

        [{'key': "Explain message of xxxx", 'val': 'XXXXX'}] --> [{'val': 'XXXXX'}]

    :param respList: the json structure with responses
    :return: list of {'val': ...} dicts; on parse errors the input is
        returned unchanged
    """
    try:
        # keep only the 'val' field of each entry, dropping the prompt key
        extracted = []
        for entry in respList:
            extracted.append({'val': entry["val"]})
        return extracted
    except Exception as e:
        logger.error("extractTextResponse:")
        logger.exception(e)
        exc_buffer = io.StringIO()
        traceback.print_exc(file=exc_buffer)
        logger.error('Uncaught exception in worker process:\n%s',
                     exc_buffer.getvalue())
        return respList
def extractSbtResponseXML(itemResult, headerDict):
    """Given a XML node "itemResult", return a list of responses.

    Walks every responseDatum under the itemResult's taskState responseData
    and emits one dict per datum, combining the student-level header fields
    with the block/item attributes and the key/value response pairs.

    :param itemResult: a xml.etree node that is itemResult
    :param headerDict: a dictionary with student-level information such as
        the BookletNumber, etc.
    :return: a list of dicts or None
    """
    rows = []
    try:
        blockCode = itemResult.get("blockCode")
        itemAccessionNumber = itemResult.get("accessionNumber")
        # sometimes response data is stored under a different AccNum than the ItemAccNum
        accessionNumber = itemResult.get("respondedIn") or itemAccessionNumber
        itemType = itemResult.get("itemType")
        responseData = itemResult.find(
            'responseVariable/candidateResponse/value/taskState/responseData')
        for responseDatum in responseData.iter('responseDatum'):
            sceneId = responseDatum.findtext("sceneId")
            responseComponentId = responseDatum.findtext("responseComponentId")
            responseType = responseDatum.findtext("responseType")
            # collect the key/value pairs of the response content
            content = responseDatum.find("content")
            keyValuePairs = []
            if content is not None:
                for pair in content.iter('pair'):
                    keyValuePairs.append({
                        "key": pair.find("key").text,
                        "val": pair.find("value").text
                    })
            rows.append({
                'BookletNumber': headerDict['BookletNumber'],
                'Form': headerDict['Form'],
                'Year': headerDict['Year'],
                'SubjectCode': headerDict['SubjectCode'],
                'Grade': headerDict['Grade'],
                'BlockCode': blockCode,
                'AccessionNumber': accessionNumber,
                'ItemAccessionNumber': itemAccessionNumber,
                'ItemTypeCode': itemType,
                'ChildItemAccessionNumber': sceneId,
                'ChildItemType': responseType,
                'ResponseComponentId': responseComponentId,
                'Response': keyValuePairs
            })
    except Exception as e:
        logger.error(
            "extractSbtResponseXML: Error parsing the SBT XML itemResult")
        logger.exception(e)
        return None
    return rows
def extractTextSelectionResponse(respList):
    """
    Extract response from TextSelection item types in SBTs.

    Example::

        [{u'val': u'3', u'key': u'selectedUnit1'}] --> [selection-3]

    :param respList: the json structure with responses
    :return: list of "selection-<val>" strings; on parse errors the input
        is returned unchanged
    """
    try:
        # string concatenation (not format) is deliberate: a non-string
        # 'val' raises and routes the input through the error path below
        selections = []
        for entry in respList:
            selections.append("selection-" + entry["val"])
        return selections
    except Exception as e:
        logger.error("extractTextSelectionResponse:")
        logger.exception(e)
        exc_buffer = io.StringIO()
        traceback.print_exc(file=exc_buffer)
        logger.error('Uncaught exception in worker process:\n%s',
                     exc_buffer.getvalue())
        return respList
def parseSbtXML(source, keepResponseData=False):
    """ Parse the SBT xml string for each block.

    The 2019 SBTs add a new stateDiff field. This function takes an individual
    XML per student per block, and returns a Pandas data frame. When
    keepResponseData is True, responseDatum records are parsed as well and
    merged with the observables on SceneId.

    :param source: the XML string or a XML node
    :param keepResponseData: false by default, otherwise combines both obs and res data.
    :return: a data frame or None
    """
    # if source is a XML node, skip the parsing
    try:
        parser = ET.XMLParser()
        root = ET.fromstring(source, parser=parser)
    except:
        # not a string; assume it is an already-parsed XML node
        # NOTE(review): bare except — also masks unrelated parse errors
        root = source
    df = None
    # get top level basic info; any missing field aborts with None
    try:
        bookletId = root.findtext('bookletId')
        # stateInfo=root.findtext('stateInfo')
        taskId = root.findtext('taskId')
        blockId = root.findtext('blockId')
        accommodations = root.findtext('accommodations')
        extendedTimeFactor = root.findtext('extendedTimeFactor')
        # print bookletId, taskId, blockId
    except Exception as e:
        warnings.warn(
            "ParseSbtXML: XML contains incomplete Booklet level information")
        logger.error(
            "ParseSbtXML: XML contains incomplete Booklet level information")
        logger.exception(e)
        return None
    # get observable data: one output row per observableDatum element
    observableMatrix = []
    ct = None
    for observableDatum in root.iter('observableDatum'):
        sceneId = observableDatum.findtext('sceneId')
        controlId = observableDatum.findtext('controlId')
        eventType = observableDatum.findtext('eventType')
        timestamp = observableDatum.findtext('timestamp')
        stateDiff = observableDatum.findtext('stateDiff')
        # get content in json format
        for content in observableDatum.iter('content'):
            # ct needs to be a string; but we will later re-parse this as JSON. @@@ Waste.
            # @@ why are we looping here? Shouldn't there be a single Content here?
            # NOTE(review): if a datum has no content, ct keeps the previous
            # datum's value (ct is only reset once, before the loop) — confirm
            # whether that carry-over is intended.
            ct = str(parseXMLContentDatum(content))
        observableMatrix.append({
            'BookletNumber': bookletId,
            'BlockId': blockId,
            'TaskId': taskId,
            'Accomodations': accommodations,
            'ExtendedTimeFactor': extendedTimeFactor,
            'SceneId': sceneId,
            'ControlId': controlId,
            'Label': eventType,
            'EventTime': timestamp,
            # NOTE(review): raises AttributeError if stateDiff is absent
            # (findtext returns None) — pre-2019 data may not have this field
            'StateDiff': stateDiff.replace("\n", ""),
            'ExtendedInfo': ct
        })
    # turn the data into a data frame
    if not keepResponseData:
        try:
            # create dataframe encapsules all the info
            # first create observable dataframe
            df = pd.DataFrame.from_dict(observableMatrix)
        except Exception as e:
            warnings.warn(
                "ParseSbtXML: cannot turn Observable XML into a data frame")
            logger.error(
                "ParseSbtXML: cannot turn Observable XML into a data frame")
            logger.exception(e)
            return None
    else:
        # get response data: one row per responseDatum element
        responseMatrix = []
        ct = None
        for responseDatum in root.iter('responseDatum'):
            sceneId = responseDatum.findtext('sceneId')
            responseComponentId = responseDatum.findtext('responseComponentId')
            responseType = responseDatum.findtext('responseType')
            # get content in json format
            for content in responseDatum.iter('content'):
                # ct needs to be a string; but we will later re-parse this as JSON. @@@ Waste.
                # @@ why are we looping here? Shouldn't there be a single Content here?
                ct = str(parseXMLContentDatum(content))
            responseMatrix.append({
                'SceneId': sceneId,
                'ResponseComponentId': responseComponentId,
                'ResponseType': responseType,
                'ResponseContent': ct
            })
        try:
            # create dataframe encapsules all the info
            # first create observable dataframe
            dfObs = pd.DataFrame.from_dict(observableMatrix)
            # then create response dataframe
            dfResp = pd.DataFrame.from_dict(responseMatrix)
            # if either side is empty, return the other unmerged
            if (dfResp.empty):
                df = dfObs
            elif (dfObs.empty):
                df = dfResp
            else:
                # GF: Wait, does "outer" really work here?
                df = pd.merge(dfObs, dfResp, how='outer', on='SceneId')
        except Exception as e:
            warnings.warn("ParseSbtXML: cannot turn XML into a data frame")
            logger.error("ParseSbtXML: cannot turn XML into a data frame")
            logger.exception(e)
            return None
    return df
def parseSbtObservableXML(
        source, bl="current", bc="current", unicodeJunkChar="@"
):  # bl stands for bookletnumber; bc stands for blockcode
    """
    Parse the SBT xml string from SQL Response data table for each block to
    get the Observable data frame.

    This function parses the XML string stored in the SQL Response Data Table
    for SBT and similar black-box components, where each component keeps its
    own observable data and exports an XML along with the response data. This
    function takes an individual XML per student per block, and returns a
    Pandas data frame.

    FIX: removed a leftover debug ``print`` statement that wrote to stdout on
    every call.

    :param source: the XML string or a XML node
    :param bl: BookletNumber used in warning/log messages; default "current"
    :param bc: BlockCode used in warning/log messages; default "current"
    :param unicodeJunkChar: the character or string to replace any unicode
        characters; default to "@"
    :return: a data frame of observables or None if errors
    """
    # if source is a XML node, skip the parsing
    try:
        # first, replacing all unicode escape sequences with unicodeJunkChar
        source = re.sub(r"\&\#x[0-9a-fA-F]+", unicodeJunkChar, source)
        # try to parse the xml string
        root = ET.fromstring(source)
    except Exception as e:
        warnings.warn("BlockCode " + bc + " BookletNumber " + bl +
                      " XML contains incomplete Booklet level information")
        logger.error("BlockCode " + bc + " BookletNumber " + bl +
                     " XML contains incomplete Booklet level information")
        logger.exception(e)
        # not a string; assume it is an already-parsed XML node
        root = source
    observableList = []
    bookletDict = dict()
    # processing top-level xml elements; missing fields abort with None
    try:
        bookletDict["BookletNumber"] = root.find("bookletId").text
        bookletDict["BlockCode"] = root.find("blockId").text
        bookletDict["ItemTypeCode"] = "SBT"
        bookletDict["AccessionNumber"] = root.find("taskId").text
    except Exception as e:
        warnings.warn("BlockCode " + bc + " BookletNumber " + bl +
                      " XML contains incomplete Booklet level information")
        logger.error("BlockCode " + bc + " BookletNumber " + bl +
                     " XML contains incomplete Booklet level information")
        logger.exception(e)
        return None
    # SBTs actually embed their own XML data; loop through and save each
    # observableDatum as a row (sharing the booklet-level fields)
    for observableDatum in root.iter('observableDatum'):
        # make a copy of the booklet-level dict so rows stay independent
        obsDict = bookletDict.copy()
        obsDict["SceneId"] = observableDatum.findtext('sceneId')
        obsDict["ControlId"] = observableDatum.findtext('controlId')
        obsDict["Label"] = observableDatum.findtext('eventType')
        obsDict["EventTime"] = observableDatum.findtext('timestamp')
        obsDict["ExtendedInfo"] = parseXMLContentDatum(
            observableDatum.find("content"))
        observableList.append(obsDict)
    # if no actual data records, exit with a warning and return None
    if len(observableList) == 0:
        warnings.warn("BlockCode " + bc + " BookletNumber " + bl +
                      " XML contains no data")
        logger.warning("BlockCode " + bc + " BookletNumber " + bl +
                       " XML contains no data")
        return None
    # We have data. Now create a data frame and parse the ExtendedInfo.
    try:
        df = pd.DataFrame.from_dict(observableList)
        # parse extended info for SBT items only
        idx = df["ItemTypeCode"].isin(["SBT", "ReadingNonSbt"])
        df.loc[idx, "extInfo"] = df.loc[idx, "ExtendedInfo"].pipe(parseJSONObservables)
        df = df.sort_values("EventTime")
    except Exception as e:
        warnings.warn("XML data cannot be converted to a data frame")
        logger.error("XML data cannot be converted to a data frame")
        logger.exception(e)
        return None
    return df
def xvalBooklets(dfResp, dfObsResp, configObsList, configRespList):
    """
    Cross-validates records for a booklet using ready-made data frames.

    Returns a data frame containing extracted responses from the response data
    table and the reconstructed responses from the observable data, for
    selected item types that the x-val algorithm currently handles.

    :param dfResp: a data frame of response data, from which we extract the responses for each item
    :param dfObsResp: a data frame of observable data, from which we reconstruct responses for each item
    :param configObsList: list containing configurations for processing observables
    :param configRespList: list containing configurations for processing responses
    :return: a data frame that matches the extracted and reconstructed responses
    """
    # NOTE(review): `&` binds tighter than `>`, so this evaluates as
    # len(configObsList) > (0 & (...)), i.e. just len(configObsList) > 0;
    # the "itemtypeColumn" membership check is effectively a no-op. Also,
    # configObsList is a list of configs, so `in` would test list membership,
    # not dict keys. Confirm the intended check before changing it.
    assert (len(configObsList) > 0 & ("itemtypeColumn" in configObsList))
    assert (isinstance(dfResp, pd.DataFrame))
    assert (isinstance(dfObsResp, pd.DataFrame))
    # make sure there are overlapping subjects
    subjlist = list(
        set(dfResp.BookletNumber.unique()).intersection(
            set(dfObsResp.BookletNumber.unique())))
    assert (len(subjlist) > 0)
    ##################
    # recon answers using the configObsList
    # Join the observable data back again
    try:
        dfObs = pd.concat(
            [reconByConfig(dfObsResp, config=c) for c in configObsList])
        if dfObs.shape[0] > 0:
            # keep only the columns needed for the merge and comparison
            dfObs = dfObs.loc[:, [
                'BlockCode', 'BookletNumber', "AccessionNumber",
                'ResponseComponentId', 'ReconstructedAnswer', 'ResponseHistory'
            ]]
    except Exception as e:
        logger.error("xvalBooklets: Error reconstructing responses")
        logger.exception(e)
        return None
    ##################
    # Merge recorded and reconstructed responses (outer: keep rows that
    # appear on only one side, so mismatched coverage is visible)
    try:
        dfCompare = pd.merge(
            dfResp,
            dfObs,
            how="outer",
            on=["BookletNumber", "BlockCode", "ResponseComponentId"])
    except Exception as e:
        logger.error(
            "xvalBooklets: Error merging response and observable data")
        logger.exception(e)
        return None
    # Need to transform the extracted responses by the `childItemType`, because `ItemTypeCode` is too gross.
    dfCompare.loc[dfCompare.ItemTypeCode.isin(["MCSS", "BQMCSS"]),
                  "ChildItemType"] = "MCSS"
    dfCompare.loc[dfCompare.ItemTypeCode.isin(["MCMS", "BQMCMS"]),
                  "ChildItemType"] = "MCMS"
    # ## Extract and transform responses to prepare for comparisons
    try:
        dfCompare = pd.concat(
            [parseItemResponses(dfCompare, config=c) for c in configRespList])
    except Exception as e:
        logger.error("xvalBooklets: Error extracting responses")
        logger.exception(e)
        return None
    # ## Comparison and discrepancies
    # first, take care of a special case in BQMCMS and BQChoices, where one can add free text as "response"
    idx = dfCompare.ItemTypeCode.isin([
        "BQMCSS", "BQMCMS", "BQChoices"
    ]) & dfCompare["ExtractedAnswer"].notnull()
    # NOTE(review): the filter list repeats 'response' twice — likely one of
    # the entries was meant to be a different marker string; confirm.
    dfCompare.loc[idx, "ExtractedAnswer"] = dfCompare.loc[idx, "ExtractedAnswer"] \
        .apply(lambda l: [i for i in l if i not in ['response', 'response']])
    # discrepancies
    try:
        # we take a shortcut here, converting responses to a set of string-values
        # if the response is None, then the result is not a set, but a None
        setReconAnswer = dfCompare.loc[:, "ReconstructedAnswer"]\
            .apply(lambda respList: set([str(i) for i in respList]) if isinstance(respList,list) else None)
        setExtraAnswer = dfCompare.loc[:, "ExtractedAnswer"]\
            .apply(lambda respList: set([str(i) for i in respList]) if isinstance(respList,list) else None)
        dfCompare.loc[:, "matched"] = None
        # matched==True iff neither is None and the sets (of strings) are equal (recall None!=None)
        idx = setReconAnswer == setExtraAnswer
        dfCompare.loc[idx, "matched"] = True
        # matched==False iff the 2 sets were not equal, or one of them is None, but if both are None, we ignore
        idx = (setReconAnswer != setExtraAnswer)
        dfCompare.loc[idx, "matched"] = False
        dfCompare.loc[setReconAnswer.isnull() & setExtraAnswer.isnull(),
                      "matched"] = None
        # if the response is empty, it is treated as missing; comparison is True
        idx = dfCompare["ReconstructedAnswer"].isnull() & (
            dfCompare["ExtractedAnswer"].apply(lambda l: l == []))
        # dfCompare.loc[idx, "matched"] = None
        dfCompare.loc[idx, "matched"] = True
    except Exception as e:
        logger.error(
            "xvalBooklets: Error comparing extracted and reconstructed responses"
        )
        logger.exception(e)
        return None
    return dfCompare
def reconSBTItemResponses(df, config=None):
    """Parse SBT process data, reconstruct responses using an array of functions.

    Each handler in ``config["handlers"]`` maps an event label to a parser
    function; the inverted map lets each parser run once over all of its
    event types, grouped by the item-identifying column.

    BUGFIX: the default configuration used to be applied only *after*
    ``config["itemtypeColumn"]`` had already been dereferenced, so calling
    with ``config=None`` always raised inside the try block and returned
    None — the default was dead code. The default is now applied first.

    :param df: the input data frame
    :type df: Pandas data frame
    :param config: optional configuration object; default to None
    :type config: object or None
    :returns: df with responses
    :rtype: Pandas data frame
    """
    if config is None:
        config = {
            "itemtypeColumn": "Label",
            "accnumColumn": "ControlId",
            "outputColumn": "ReconAnswer",
            "handlers": {
                "select.drop": reconSBTSelectDrop,
                "text.blur": reconSBTText,
                "select.choose": reconSBTSelectChoice
            }
        }

    try:
        assert (isinstance(df, pd.DataFrame))
        assert (config["itemtypeColumn"] in df.columns)
        assert (config["accnumColumn"] in df.columns)
    except Exception:
        #logger.error("reconSBTItemResponses: Returning None due to errors")
        return None

    # make sure we have relevant events, else return None
    if df.loc[df[config["itemtypeColumn"]].isin(list(
            config["handlers"].keys()))].shape[0] == 0:
        return None

    # invert the config to get `parser: [list of labels]`
    funcMap = {}
    for k, v in config["handlers"].items():
        funcMap[v] = funcMap.get(v, []) + [k]

    # we now loop through all funcMap elements and do the conversion
    # TODO: consider ways to parallelize the process, e.g., using dask
    alldata = []
    for parser, eventList in funcMap.items():
        idx = df.loc[:, config["itemtypeColumn"]].isin(eventList)
        tmp = df.loc[idx, :]\
            .groupby(config["accnumColumn"])\
            .apply(parser,
                   accnum=config["accnumColumn"],
                   itemtype=config["itemtypeColumn"])
        if tmp.shape[0] > 0:
            alldata.append(tmp)

    # concat data; fails (returns None) when no parser produced output
    try:
        res = pd.concat(alldata).reset_index()
        res.columns = [config["accnumColumn"], config["outputColumn"]]
    except Exception as e:
        logger.error("reconSBTItemResponses: Returning None due to errors")
        logger.exception(e)
        return None
    return res
def parseSbtResponseXML(
        source, bl="current", bc="current", unicodeJunkChar="@"
):  #bl stands for bookletnumber; bc stands for blockcode
    """
    Parse the SBT xml string from the SQL Response data table for one block
    and return its Response Data.

    SBT-style black-box components keep their own observable data and export
    an XML along with the response data into the responseData SQL table; this
    function takes that per-student per-block XML and turns its responseDatum
    records into a Pandas data frame.

    :param source: the XML string or a XML node
    :param bl: BookletNumber used in warning/log messages; default "current"
    :param bc: BlockCode used in warning/log messages; default "current"
    :param unicodeJunkChar: the character or string to replace any unicode
        characters; default to "@"
    :return: a data frame of response data or None if errors
    """
    # strip unicode escape sequences, then parse; if parsing fails we assume
    # the caller handed us an already-parsed XML node
    try:
        source = re.sub(r"\&\#x[0-9a-fA-F]+", unicodeJunkChar, source)
        root = ET.fromstring(source)
    except Exception as e:
        warnings.warn("BlockCode " + bc + " BookletNumber " + bl +
                      " XML contains incomplete Booklet level information")
        logger.error("BlockCode " + bc + " BookletNumber " + bl +
                     " XML contains incomplete Booklet level information")
        logger.exception(e)
        root = source

    responseRows = []
    header = dict()
    # booklet-level fields shared by every response row
    try:
        header["BookletNumber"] = root.find("bookletId").text
        header["BlockCode"] = root.find("blockId").text
        header["ItemTypeCode"] = "SBT"
        header["AccessionNumber"] = root.find("taskId").text
    except Exception as e:
        warnings.warn("BlockCode " + bc + " BookletNumber " + bl +
                      " XML contains incomplete Booklet level information")
        logger.error("BlockCode " + bc + " BookletNumber " + bl +
                     " XML contains incomplete Booklet level information")
        logger.exception(e)
        return None

    # one row per embedded responseDatum record
    for responseDatum in root.iter('responseDatum'):
        row = header.copy()
        row["SceneId"] = responseDatum.findtext('sceneId')
        row["responseComponentId"] = responseDatum.findtext(
            'responseComponentId')
        row["responseType"] = responseDatum.findtext('responseType')
        # collect the key/value pairs of the response content
        content = responseDatum.find("content")
        pairs = []
        if content is not None:
            for pair in content.iter('pair'):
                pairs.append({
                    "key": pair.find("key").text,
                    "val": pair.find("value").text
                })
        row["Response"] = pairs
        responseRows.append(row)

    # no data records: warn and bail out
    if len(responseRows) == 0:
        warnings.warn("BlockCode " + bc + " BookletNumber " + bl +
                      " XML contains no data")
        logger.warning("BlockCode " + bc + " BookletNumber " + bl +
                       " XML contains no data")
        return None

    try:
        df = pd.DataFrame.from_dict(responseRows)
    except Exception as e:
        warnings.warn("XML data cannot be converted to a data frame")
        logger.error("XML data cannot be converted to a data frame")
        logger.exception(e)
        return None
    return df
def parseIctXML(source, keepResponseData=False):
    """ Parse the 2015 ICT observables xml string for each block.

    The 2015 Science ICT follows a precursor of the SBT data format, where
    most of the observable events are saved in the Response data table, as a
    "response" associated with an AccNum. This function takes the XML string
    of that session, and returns a data frame following the standard format
    of the process data log.

    Note that there are several fields that are unused, e.g., "Label",
    because they are used in the eNAEP-based process data logs (which deal
    with response-related events). We need to merge the two sources of logs
    to obtain a complete process data log.

    :param source: the XML string or a XML node
    :param keepResponseData: false by default, otherwise combines both obs and res data.
    :return: a data frame or None
    """
    # if source is a XML node, skip the parsing
    try:
        parser = ET.XMLParser()
        root = ET.fromstring(source, parser=parser)
    except:
        # not a string; assume it is an already-parsed XML node
        # NOTE(review): bare except — also masks unrelated parse errors
        root = source
    df = None
    # get top level basic info; any missing field aborts with None
    try:
        bookletId = root.findtext('bookletId')
        # stateInfo=root.findtext('stateInfo')
        taskId = root.findtext('taskId')
        blockId = root.findtext('blockId')
        accommodations = root.findtext('accommodations')
        extendedTimeFactor = root.findtext('extendedTimeFactor')
        # print bookletId, taskId, blockId
    except Exception as e:
        warnings.warn("ParseIctXML: XML contains incomplete Booklet level information")
        logger.error("ParseIctXML: XML contains incomplete Booklet level information")
        logger.exception(e)
        return None
    # get observable data: one output row per observableDatum element
    observableMatrix = []
    ct = None
    for observableDatum in root.iter('observableDatum'):
        sceneId = observableDatum.findtext('sceneId')
        controlId = observableDatum.findtext('controlId')
        eventType = observableDatum.findtext('eventType')
        timestamp = observableDatum.findtext('timestamp')
        # get content in json format
        for content in observableDatum.iter('content'):
            # ct needs to be a string; but we will later re-parse this as JSON. @@@ Waste.
            # @@ why are we looping here? Shouldn't there be a single Content here?
            # NOTE(review): if a datum has no content, ct keeps the previous
            # datum's value (ct is only reset once, before the loop) — confirm
            # whether that carry-over is intended.
            ct = unicodedata.normalize('NFKD', str(content.text))
            #ct = unicodeToAscii(content.text)
        observableMatrix.append({'BookletNumber': bookletId,
                                 'BlockId': blockId,
                                 'TaskId': taskId,
                                 'Accomodations': accommodations,
                                 'ExtendedTimeFactor': extendedTimeFactor,
                                 'SceneId': sceneId,
                                 'ControlId': controlId,
                                 'Label': eventType,
                                 'EventTime': timestamp,
                                 'ExtendedInfo': ct})
    # turn the data into a data frame
    if not keepResponseData:
        try:
            # create dataframe encapsules all the info
            # first create observable dataframe
            df = pd.DataFrame.from_dict(observableMatrix)
        except Exception as e:
            warnings.warn("ParseIctXML: cannot turn Observable XML into a data frame")
            logger.error("ParseIctXML: cannot turn Observable XML into a data frame")
            logger.exception(e)
            return None
    else:
        # get response data: one row per responseDatum element
        responseMatrix = []
        ct = None
        for responseDatum in root.iter('responseDatum'):
            sceneId = responseDatum.findtext('sceneId')
            responseComponentId = responseDatum.findtext('responseComponentId')
            responseType = responseDatum.findtext('responseType')
            # get content in json format
            for content in responseDatum.iter('content'):
                # ct needs to be a string; but we will later re-parse this as JSON. @@@ Waste.
                # @@ why are we looping here? Shouldn't there be a single Content here?
                ct = str(parseXMLContentDatum(content))
            responseMatrix.append({'SceneId': sceneId,
                                   'ResponseComponentId': responseComponentId,
                                   'ResponseType': responseType,
                                   'ResponseContent': ct})
        try:
            # create dataframe encapsules all the info
            # first create observable dataframe
            dfObs = pd.DataFrame.from_dict(observableMatrix)
            # then create response dataframe
            dfResp = pd.DataFrame.from_dict(responseMatrix)
            # if either side is empty, return the other unmerged
            if (dfResp.empty):
                df = dfObs
            elif (dfObs.empty):
                df = dfResp
            else:
                # GF: Wait, does "outer" really work here?
                # TODO: We don't want to back-fill the response for each scene
                # If we ever do this, we want the ResponseContent to relfect the
                # state of the responses at this point.
                df = pd.merge(dfObs, dfResp, how='outer', on='SceneId')
        except Exception as e:
            warnings.warn("ParseIctXML: cannot turn XML into a data frame")
            logger.error("ParseIctXML: cannot turn XML into a data frame")
            logger.exception(e)
            return None
    return df
def reconSBTTextSelection(itemLog, accnum="ControlId", itemtype="Label"):
    """
    Given a Pandas data frame containing the log for one item, return the
    reconstructed response.

    Only the last "select.toggle" event matters: its extInfo["to"] field says
    whether the unit ended up selected. The ControlId encodes both the
    response component and the selected unit, joined by "_Selection" or
    "_String"; when neither delimiter is present the ControlId itself is used.

    Because we process one ControlId at a time, the per-unit answers cannot
    be combined into a list here; a post-processing step later scans all
    items of this type and combines records sharing a ResponseComponentId.

    :param itemLog: a data frame containing the log of a single TextSelection
        select.toggle item
    :param accnum: the column name that identifies items
    :param itemtype: the column name that identifies the item type
    :return: a list of dicts with ResponseComponentId / ReconstructedAnswer
    """
    assert (isinstance(itemLog, pd.DataFrame))
    assert ("extInfo" in itemLog.columns)
    # only a single item
    assert (itemLog[accnum].nunique() == 1)
    # only a single item type
    assert (itemLog["ItemTypeCode"].nunique() == 1)
    try:
        records = []
        toggles = itemLog.loc[itemLog[itemtype] == "select.toggle"]
        finalEvent = toggles.iloc[-1]
        controlId = finalEvent[accnum]
        # only a final state of "selected" produces a response
        if finalEvent.extInfo["to"] == "true":
            for separator in ("_Selection", "_String"):
                if separator not in controlId:
                    continue
                parts = controlId.split(separator)
                if len(parts) == 2:  # we should get 2 parts
                    records.append({
                        "ResponseComponentId": "{}".format(parts[0]),
                        "ReconstructedAnswer": "{}".format(parts[1])
                    })
                else:
                    # error parsing this response: fall back to the raw controlId
                    records.append({
                        "ResponseComponentId": "{}".format(controlId),
                        "ReconstructedAnswer": "{}".format(controlId)
                    })
                break  # quit the loop once processed
            # no delimiter matched: the control itself was toggled on
            if len(records) == 0:
                records = [{
                    "ResponseComponentId": "{}".format(controlId),
                    "ReconstructedAnswer": "{}_Selected".format(controlId)
                }]
        return records
    except Exception as e:
        logger.error("reconSBTTextSelection:")
        logger.exception(e)
        # logger.debug(itemLog)
        return [{
            "ResponseComponentId": "ERROR_reconSBTTextSelection",
            "ReconstructedAnswer": "ERROR_reconSBTTextSelection"
        }]
def parsePearsonResponseXML(source):
    """ Parse Pearson response XMLs, using XPath.

    Naming following the SQL:

    ```
    [ItemResponse].[ItemResponseId], Subject.SubjectCode,
    Assessment.AssessedGroupId as Grade, Student.BookletNumber,
    [Block].BlockCode, Item.AccessionNumber, ItemType.ItemTypeCode,
    [ItemResponse].[Response], [ItemResponse].[IsAnswered]
    ```

    We are adding a few new columns:

    - `ChildItemAccessionNumber`: native for some eNAEP; for SBT type,
      ```
      'ChildItemAccessionNumber': sceneId
      ```
    - `ChildItemType`: native for some eNAEP; for SBT type,
      ```
      'ChildItemType': responseType
      ```
    - `ResponseComponentId`: native for SBT style data; for eNAEP, it is a
      combination of AccNum and childAccNum
      ```
      'ResponseComponentId': "item-{}".format(accessionNumber) \
          if childItemAccessionNumber is None else \
          "item-{}-{}".format(accessionNumber, childItemAccessionNumber),
      ```

    :param source: the XML string or a XML node
    :return: a data frame or None
    """
    # if source is a XML node, skip the parsing
    try:
        root = ET.fromstring(source)
    except:
        # not a string; assume it is an already-parsed XML node
        # NOTE(review): bare except — also masks unrelated parse errors
        root = source
    # get top level basic info, using proper xpath
    try:
        bookletNumber = root.find('./context/bookletNumber').text
        assignedForm = root.find('./context/assignedForm').text
        assessmentYear = root.find('./testResult').get("assessmentYear")
        subjectName = root.find('./testResult').get("subjectName")
        grade = root.find('./testResult').get("assessedGroup")
    except Exception as e:
        logger.error(
            "parseResponseXML: XML contains incomplete Booklet level information"
        )
        logger.exception(e)
        return None
    responseMatrix = []
    # student-level fields shared by every row (and passed to the SBT helper)
    headerDict = {
        'BookletNumber': bookletNumber,
        'Form': assignedForm,
        'Year': assessmentYear,
        'SubjectCode': subjectName,
        'Grade': grade
    }
    for itemResult in root.iter('itemResult'):
        # now the key/value pairs
        if itemResult.get("itemType") in ["SBT", "ReadingNonSBT"]:
            # SBT-style items carry their own nested response structure;
            # delegate to the dedicated extractor (one row per responseDatum)
            try:
                responseMatrix += extractSbtResponseXML(itemResult, headerDict)
            except Exception as e:
                logger.error(
                    "parseResponseXML: Unable to parse SBT responseData")
                logger.exception(e)
                continue
        else:
            # regular eNAEP types
            try:
                blockCode = itemResult.get("blockCode")
                itemAccessionNumber = itemResult.get("accessionNumber")
                # sometimes response data is stored under a different AccNum than the ItemAccNum
                accessionNumber = itemResult.get("respondedIn")
                if not accessionNumber:
                    accessionNumber = itemAccessionNumber
                itemType = itemResult.get("itemType")
                childItemAccessionNumber = itemResult.get(
                    "childItemAccessionNumber")
                childItemType = itemResult.get("childItemType")
                content = itemResult.find(
                    "responseVariable/candidateResponse/value/content")
                # collect the key/value pairs of the response content
                ct = []
                if content is not None:
                    for pair in content.iter('pair'):
                        k = pair.find("key").text
                        v = pair.find("value").text
                        ct.append({"key": k, "val": v})
                responseMatrix.append({
                    'BookletNumber': bookletNumber,
                    'Form': assignedForm,
                    'Year': assessmentYear,
                    'SubjectCode': subjectName,
                    'Grade': grade,
                    'BlockCode': blockCode,
                    'AccessionNumber': accessionNumber,
                    'ItemAccessionNumber': itemAccessionNumber,
                    'ItemTypeCode': itemType,
                    'ChildItemAccessionNumber': childItemAccessionNumber,
                    'ChildItemType': childItemType,
                    'ResponseComponentId': "item-{}".format(accessionNumber) \
                        if childItemAccessionNumber is None else \
                        "item-{}-{}".format(accessionNumber, childItemAccessionNumber),
                    'Response': ct
                })
            except Exception as e:
                logger.error(
                    "parseResponseXML: Unable to parse eNAEP response content")
                logger.exception(e)
                continue
    return pd.DataFrame(responseMatrix)