def printTestDuration(path):
    '''
    Prints each user's total test duration in minutes, plus summary stats
    '''
    allTime = []
    for fn in utils.findFiles(path, filterExt=".csv"):
        # The timestamp is the second-to-last field of the third column
        timeStrList = [rowTuple[2].split(",")[-2] for rowTuple
                       in user_response.loadUserResponse(join(path, fn))]
        timeList = []
        for timeStamp in timeStrList:
            try:
                minutes, seconds = timeStamp.split(':')
            except ValueError:  # Timestamp not in the expected 'mm:ss' form
                continue
            # Convert 'mm:ss' into a duration in minutes
            seconds = int(minutes) * 60 + float(seconds)
            minutes = seconds / 60.0
            timeList.append(minutes)

        totalTime = sum(timeList)
        allTime.append(totalTime)
        print("%s, %f" % (fn, totalTime))

    meanTime = sum(allTime) / len(allTime)
    print("Mean: %f" % meanTime)

    # Population standard deviation of the per-user totals
    timeDeviationList = [(time - meanTime) ** 2 for time in allTime]
    stDev = math.sqrt(sum(timeDeviationList) / len(allTime))
    print("Standard Deviation: %f" % stDev)
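
# Example usage (a minimal sketch; the path is hypothetical and assumed to
# contain one LMEDS output .csv file per participant):
#
#     printTestDuration(join("tests", "myTest", "output", "mySequence"))
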
def removeDuplicates(path, overwrite=False):
    '''
    Removes consecutive duplicate rows from each user's results file

    Cleaned files are written to a 'duplicates_removed' subfolder.
    '''
    outputPath = join(path, "duplicates_removed")
    if overwrite is False:
        os.mkdir(outputPath)  # Raises OSError if the folder already exists
    else:
        utils.makeDir(outputPath)

    anyDuplicatesFound = False
    for fn in utils.findFiles(path, filterExt=".csv"):
        with io.open(join(path, fn), "r", encoding="utf-8") as fd:
            data = fd.read()
        dataList = data.splitlines()

        try:
            outputList = [dataList[0], ]
        except IndexError:
            raise EmptyUserDataFile(fn)

        # Compare rows with the response portion stripped off; only keep a
        # row if it differs from the one before it
        prevString = dataList[0].split(";,")[0].rsplit("]", 1)[0]
        for i in range(1, len(dataList)):
            curString = dataList[i].split(";,")[0].rsplit("]", 1)[0]
            if curString == prevString:
                if anyDuplicatesFound is False:
                    print("Duplicates removed:")
                    anyDuplicatesFound = True
                print("%s, %d, %s" % (fn, i, curString))
            else:
                outputList.append(dataList[i])
            prevString = curString

        # Special case: pop the last item in the sequence if it is 'login'
        # -- this happens when a user tries to log in to an experiment after
        #    already completing it
        if outputList[-1][:6] == "login," and len(outputList) > 1:
            outputList.pop(-1)

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(outputList))

    if anyDuplicatesFound is True:
        print("End of duplicates listing")
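
# Example usage (a sketch; the path is a hypothetical folder of per-user
# results files). With overwrite=False the call fails if the
# 'duplicates_removed' subfolder already exists:
#
#     removeDuplicates(join("tests", "myTest", "output", "mySequence"),
#                      overwrite=True)
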
def extractFromTest(path, keyList, removeItemList=None, onlyKeepList=None):
    '''
    Extracts all matching keys from a user's results in an LMEDS test
    '''
    # Load all user data
    # -- separate the command name on each line from the rest of the line
    testSequenceDataList = []
    for fn in utils.findFiles(path, filterExt=".csv"):
        with io.open(join(path, fn), "r", encoding="utf-8") as fd:
            subjectDataList = [row.rstrip("\n") for row in fd.readlines()]
        subjectDataList = [line.split(",", 1) for line in subjectDataList]
        testSequenceDataList.append((fn, subjectDataList))

    for key in keyList:
        outputDir = join(path, key)
        utils.makeDir(outputDir)

        for fn, subjectDataList in testSequenceDataList:
            subjectDataSubsetList = []
            for line in subjectDataList:
                command = line[0]
                if command == key:
                    partialLine = line[1].split(';,')[0]

                    skipOut = False
                    if removeItemList is not None:
                        for item in removeItemList:
                            if item in partialLine:
                                skipOut = True
                    if onlyKeepList is not None:
                        for item in onlyKeepList:
                            if item not in partialLine:
                                skipOut = True
                    if skipOut is True:
                        continue

                    subjectDataSubsetList.append(",".join(line))

            # Don't output a blank file
            if subjectDataSubsetList == []:
                continue

            with io.open(join(outputDir, fn), "w", encoding="utf-8") as fd:
                fd.write("\n".join(subjectDataSubsetList))
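
# Example usage (a sketch with hypothetical arguments): pull the
# 'prominence' and 'survey' rows out of each user's results file, skipping
# any line that mentions 'practice':
#
#     extractFromTest(join("tests", "myTest", "output", "mySequence"),
#                     ["prominence", "survey"],
#                     removeItemList=["practice", ])
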
def checkSequenceFile(survey):
    outputDir = join(survey.outputDir, survey.testSequence.sequenceTitle)
    if not os.path.exists(outputDir):
        print("FYI: Output folder does not exist: '%s'" % outputDir)

    try:
        if len(utils.findFiles(outputDir, filterExt=".csv")) > 0:
            print("FYI: User data already exists in output folder")
    except OSError:
        pass

    seq = sequence.TestSequence(survey, survey.sequenceFN)

    numErrors = 0
    for pageNum in range(seq.getNumPages()):
        try:
            page = seq.getPage(pageNum)
        except TypeError:
            # Count argument-count problems as errors too, so the final
            # "no errors" message is accurate
            print("Page %d: Problem with the number of arguments" % pageNum)
            numErrors += 1
            continue
        except (utils.FilesDoNotExist,
                loader.TextNotInDictionaryException) as e:
            print("Page %d: %s\n" % (pageNum, str(e)))
            numErrors += 1
            continue

        try:
            page = page.getHTML()
        except BaseException as e:
            errStr = ("Page %d: Problem with at least one of the arguments: "
                      "'%s'\n%s:%s\n")
            print(errStr % (pageNum, str(seq.testItemList[pageNum]),
                            e.__class__.__name__, str(e)))
            numErrors += 1
            continue

    if numErrors == 0:
        print("No errors found in sequence file.")
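
# Example usage (a sketch; assumes 'myLMEDSSurvey' is an already-constructed
# LMEDS survey object with outputDir, sequenceFN, and testSequence set):
#
#     checkSequenceFile(myLMEDSSurvey)
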
def postProcessResults(testName, sequenceFN, removeDuplicatesFlag,
                       removeItemList=None):
    rootPath = join(constants.rootDir, "tests", testName)
    txtPath = join(rootPath, "txt")

    tmpSequence = sequence.TestSequence(None, join(rootPath, sequenceFN))
    fullPath = join(rootPath, "output", tmpSequence.sequenceTitle)

    pathToData = fullPath
    if removeDuplicatesFlag is True:
        removeDuplicates(pathToData, True)
        pathToData = join(pathToData, "duplicates_removed")
    else:
        newPathToData = join(pathToData, "duplicates_not_removed")
        utils.makeDir(newPathToData)
        for fn in utils.findFiles(pathToData, filterExt=".csv"):
            shutil.copy(join(pathToData, fn), join(newPathToData, fn))
        pathToData = newPathToData
    outputPath = pathToData + "_results"

    userResponseList = []
    fnList = utils.findFiles(pathToData, filterExt=".csv")
    for fn in fnList:
        fullPath = join(pathToData, fn)
        userResponseList.append(user_response.loadUserResponse(fullPath))

    # Don't continue if files are of different lengths
    testLen = len(userResponseList[0])
    if not all([len(response) == testLen
                for response in userResponseList]):
        print("ERROR: Not all responses in folder %s are the same length"
              % pathToData)
        countDict = {}
        for fn, response in utils.safeZip([fnList, userResponseList], True):
            countDict.setdefault(len(response), [])
            countDict[len(response)].append(fn)
        keyList = list(countDict.keys())
        keyList.sort()
        for numLines in keyList:
            print("%d lines - %s" % (numLines, str(countDict[numLines])))
        exit(0)

    # Don't continue if pages are different
    # -- group the files into bins of identical page sequences
    pageNameList = [[(pageTuple[0], pageTuple[1]) for pageTuple in response]
                    for response in userResponseList]
    sameList = []
    fnListOfLists = []
    for fn, pageList in utils.safeZip([fnList, pageNameList], True):
        i = 0
        while True:
            if len(sameList) == i:
                sameList.append(pageList)
                fnListOfLists.append([])
            else:
                if sameList[i] == pageList:
                    fnListOfLists[i].append(fn)
                    break
                else:
                    i += 1

    if len(sameList) == 0:
        print("ERROR: There don't appear to be any test data in folder %s"
              % pathToData)
        exit(0)

    if len(sameList) != 1:
        print("ERROR: User data doesn't agree. Filenames printed on "
              "different lines differ in their pages.")
        for subFNList in fnListOfLists:
            print(", ".join(subFNList))
        exit(0)  # Bail out, as in the consistency checks above

    # Extract the different tests users completed
    uniquePageList = []
    for pageTuple in pageNameList[0]:
        pageName = pageTuple[0]
        if pageName not in uniquePageList:
            uniquePageList.append(pageName)
    extractFromTest(pathToData, uniquePageList, removeItemList)

    # Transpose the surveys
    if "survey" in uniquePageList:
        surveyNameList = []
        for pageName, stimuliArgList in pageNameList[0]:
            if pageName == "survey":
                surveyName = stimuliArgList[0]
                surveyNameList.append(join(rootPath, surveyName + '.txt'))
        transpose_survey.transposeSurvey(join(pathToData, "survey"),
                                         surveyNameList, outputPath)

    # Transpose the rpt pages
    prominencePageList = ["prominence", "boundary",
                          "boundary_and_prominence", "syllable_marking"]
    for pageName in prominencePageList:
        if pageName in uniquePageList:
            transpose_rpt.transposeRPT(join(pathToData, pageName),
                                       txtPath, pageName, outputPath)

    choicePageList = ["media_choice", ]
    for pageName in choicePageList:
        if pageName in uniquePageList:
            transpose_choice.transposeChoice(join(pathToData, pageName),
                                             pageName, outputPath)
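
# Example driver (a sketch; the test name and sequence file name are
# hypothetical placeholders for a real LMEDS test):
#
#     postProcessResults("myTest", "sequence.txt",
#                        removeDuplicatesFlag=True,
#                        removeItemList=None)
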
def transposeChoice(path, pageName, outputPath):
    utils.makeDir(outputPath)

    # Load response data
    responseDataList = []
    fnList = utils.findFiles(path, filterExt=".csv")
    for fn in fnList:
        a = user_response.loadUserResponse(join(path, fn))
        responseDataList.append(a)

    # Sort response if sequence order information is available
    parsedTuple = transpose_utils.parseResponse(responseDataList)
    responseDataList, stimuliListsOfLists, orderListOfLists = parsedTuple

    # Convert response to single answer
    tmpUserResponse = []
    for userDataList in responseDataList:
        # Get user response -- the index of the checked item
        userResponse = [str(responseTuple[3].split(',').index('1'))
                        for responseTuple in userDataList]
        tmpUserResponse.append(userResponse)
    responseDataList = tmpUserResponse

    # Verify that all responses have the same list of stimuli
    assert all([stimuliListsOfLists[0] == header
                for header in stimuliListsOfLists])

    # Transpose data
    tResponseDataList = [row for row
                         in utils.safeZip(responseDataList, True)]
    tOrderListOfLists = []
    if len(orderListOfLists) > 0:
        tOrderListOfLists = [row for row
                             in utils.safeZip(orderListOfLists, True)]

    # Add a unique id to each row
    oom = utils.orderOfMagnitude(len(stimuliListsOfLists[0]))
    stimID = "s%%0%dd" % (oom + 1)
    stimuliList = ["%s,%s" % (stimID % i, row)
                   for i, row in enumerate(stimuliListsOfLists[0])]

    addSequenceInfo = len(tOrderListOfLists) > 0
    if addSequenceInfo:
        # Add sequence information to each row
        tResponseDataList = [list(row) + list(sequenceInfo)
                             for row, sequenceInfo
                             in utils.safeZip([tResponseDataList,
                                               tOrderListOfLists], True)]

    # Aggregate the stimuli and the responses in rows
    tResponseDataList = [list(row) for row in tResponseDataList]
    outputList = [[header, ] + list(row)
                  for header, row
                  in utils.safeZip([stimuliList, tResponseDataList], True)]

    # Add the column heading rows
    # First row is unanonymized user names; second row is anonymized
    numArgs = stimuliList[0].count(",")
    rowOne, rowTwo = _buildHeader(fnList, numArgs, pageName, addSequenceInfo)
    outputList = [rowOne, rowTwo, ] + outputList

    outputTxt = u"\n".join([",".join(row) for row in outputList])

    outputFN = join(outputPath, pageName + ".csv")
    with io.open(outputFN, "w", encoding="utf-8") as fd:
        fd.write(outputTxt)

    # Output a template users can fill in to auto score the results
    name = pageName + "_answer_template.csv"
    answersFN = join(outputPath, name)
    if os.path.exists(answersFN):
        print("Response template '%s' already exists. Not overwriting."
              % name)
    else:
        outputTxt = u"\n".join(stimuliList)
        with io.open(answersFN, "w", encoding="utf-8") as fd:
            fd.write(outputTxt)
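
# Example usage (a sketch with hypothetical paths; the 'media_choice'
# folder is assumed to hold the per-user files produced by extractFromTest):
#
#     transposeChoice(join("mySequence", "media_choice"),
#                     "media_choice", "mySequence_results")
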
def transposeRPT(path, txtPath, pageName, outputPath):
    '''
    Transposes RPT data

    Input files: one file per subject
    Output files: one file per stimuli
    '''
    utils.makeDir(outputPath)

    # Load response data
    responseDataList = []
    fnList = utils.findFiles(path, filterExt=".csv")
    for fn in fnList:
        a = user_response.loadUserResponse(join(path, fn))
        responseDataList.append(a)

    # Load the demarcator, if there is one,
    # and load the order info if present
    # (pageName is re-derived here from the data itself)
    demarcator = None
    pageName, pageArgs, _, _ = responseDataList[0][0]
    if pageName == "syllable_marking":
        # The demarcator can either be an arg or a keyword arg.
        # Either way, it should be the last item in the list
        demarcator = pageArgs[-1]
        if "syllableDemarcator" in demarcator:
            demarcator = demarcator.split("=")[1]

    # Sort response if sequence order information is available
    parsedTuple = transpose_utils.parseResponse(responseDataList)
    responseDataList, _, orderListOfLists = parsedTuple
    orderList = []
    if len(orderListOfLists) > 0:
        orderList = [",".join(row) for row
                     in utils.safeZip(orderListOfLists, True)]

    # Load Words
    txtDict = {}
    for fn in utils.findFiles(txtPath, filterExt=".txt"):
        name = os.path.splitext(fn)[0]
        txtList = loader.loadTxtFile(join(txtPath, fn))
        txtList = [tmpTxt.replace(" ", ",") for tmpTxt in txtList]

        # Remove HTML tags
        txtList = [word for word in txtList if "<" not in word]

        txt = ",".join(txtList)
        if demarcator is None:
            txtDict[name] = [word for word in txt.split(",") if word != ""]
        else:
            txtDict[name] = [syllable for word in txt.split(",")
                             if word != ""
                             for syllable in word.split(demarcator)]

    returnDict, idKeyList = _transposeRPT(responseDataList)

    doUserSeqHeader = len(orderListOfLists) > 0
    headerRow, anonHeaderRow = _buildHeader(fnList, pageName,
                                            doUserSeqHeader, idKeyList[0])

    # Format the output rpt scores
    aggrOutputList = [headerRow, anonHeaderRow]
    for i in range(len(idKeyList)):
        stimulusID = idKeyList[i]
        wordList = txtDict[stimulusID.split(",")[2]]
        stimulusIDList = [stimulusID for _ in wordList]
        aspectSumList = [stimulusIDList, wordList, ]
        aspectList = []

        # Initialize the score lists so a missing aspect (KeyError below)
        # doesn't leave stale values from the previous stimulus
        bScoreList, bSumList = [], []
        pScoreList, pSumList = [], []
        try:
            bScoreList, bSumList = _getScores(returnDict[stimulusID], B)
        except KeyError:
            pass
        try:
            pScoreList, pSumList = _getScores(returnDict[stimulusID], P)
        except KeyError:
            pass

        if pageName == "boundary":
            aspectSumList.extend([bSumList, ])
            aspectList.extend([bScoreList, ])
        elif pageName in ["prominence", "syllable_marking"]:
            aspectSumList.extend([pSumList, ])
            aspectList.extend([pScoreList, ])
        elif pageName == "boundary_and_prominence":
            aspectSumList.extend([bSumList, pSumList, ])
            aspectList.extend([bScoreList, pScoreList, ])

        # Extend header with sequence order information
        if doUserSeqHeader:
            orderStr = orderList[i]
            numAnnotators = range(max([len(bSumList), len(pSumList)]))
            tmpOrderList = [orderStr for _ in numAnnotators]
            aspectList.extend([tmpOrderList, ])

        dataList = aspectSumList + aspectList
        combinedList = [_unifyRow(row) for row
                        in utils.safeZip(dataList, enforceLength=True)]
        aggrOutputList.extend([",".join(row) for row in combinedList])

    outputTxt = "\n".join(aggrOutputList)

    outputFN = join(outputPath, pageName + ".csv")
    with io.open(outputFN, "w", encoding="utf-8") as fd:
        fd.write(outputTxt)
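
# Example usage (a sketch; the paths are hypothetical). The 'txt' folder
# holds the stimuli transcripts and 'prominence' holds the per-user files
# for that page type, as produced by extractFromTest:
#
#     transposeRPT(join("mySequence", "prominence"),
#                  join("tests", "myTest", "txt"),
#                  "prominence", "mySequence_results")
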
def transposeSurvey(path, surveyFullPathList, outputPath):
    utils.makeDir(outputPath)

    # Load response data
    surveyDataList = []
    fnList = utils.findFiles(path, filterExt=".csv")
    for fn in fnList:
        surveyDataList.append(user_response.loadUserResponse(join(path, fn)))

    aspectKeyList = []

    # Load the data
    returnDict = {}
    defaultDict = {}
    for surveyFN in surveyFullPathList:
        fn = os.path.split(surveyFN)[1]
        surveyName = os.path.splitext(fn)[0]

        questionTitleDataList = []
        surveyQuestionDataList = []
        surveyItemList = survey.parseSurveyFile(surveyFN)
        for surveyItem in surveyItemList:
            for widgetType, widgetTextList in surveyItem.widgetList:
                if widgetType == "None":
                    continue
                if widgetType in ["Multiline_Textbox", "Sliding_Scale"]:
                    widgetTextList = ["", ]
                blankTxt = ["", ] * (len(widgetTextList) - 1)

                # Removing commas b/c we're using csv files
                surveyQuestion = surveyItem.text.replace(",", "")
                questionTitleDataList.extend([surveyQuestion, ] + blankTxt)
                if len(widgetTextList) == 0:
                    surveyQuestionDataList.extend(["", ])
                else:
                    surveyQuestionDataList.extend(widgetTextList)

        defaultDict.setdefault(surveyName, [])
        defaultDict[surveyName].append(questionTitleDataList)
        defaultDict[surveyName].append(surveyQuestionDataList)

    for fn, userDataList in utils.safeZip([fnList, surveyDataList], True):
        for dataTuple in userDataList:
            # taskName, stimuliArgList, argTxt, dataTxt = dataTuple
            stimuliArgList = dataTuple[1]
            stimuliID = stimuliArgList[0]
            dataTxt = dataTuple[3]

            returnDict.setdefault(stimuliID, defaultDict[stimuliID])

            dataList = dataTxt.split(",")
            returnDict[stimuliID].append(dataList)

    idKeyList = list(returnDict.keys())

    # Transpose the data
    for stimulusID in idKeyList:
        returnDict[stimulusID] = [list(subTuple) for subTuple
                                  in utils.safeZip(returnDict[stimulusID],
                                                   enforceLength=True)]

        # Add a summation column
        newData = []
        for row in returnDict[stimulusID]:
            try:
                total = str(sum([int(val) if val != '' else 0
                                 for val in row[2:]]))
            except ValueError:  # Non-numeric responses can't be summed
                total = '-'
            newData.append(row[:2] + [total, ] + row[2:])
        returnDict[stimulusID] = newData

        mainSurveyData = [",".join(subList)
                          for subList in returnDict[stimulusID]]
        outputTxtList = [",".join(["", "", "Total", ] + fnList), ]
        outputTxtList += mainSurveyData
        outputTxt = "\n".join(outputTxtList)

        outputFN = join(outputPath, stimulusID + ".csv")
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write(outputTxt)

    return returnDict, idKeyList, aspectKeyList
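
# Example usage (a sketch; the paths are hypothetical). The survey
# definition files passed in the second argument must match the surveys the
# users actually took:
#
#     transposeSurvey(join("mySequence", "survey"),
#                     [join("tests", "myTest", "post_survey.txt")],
#                     "mySequence_results")
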