def tidy(source_dir, dest_dir, tested_function_name):
    """
    Go through every .py file in the source_dir directory and call tidy_one on each.
    Write out the results to a file of the same name in the dest_dir directory.

    source_dir: string, path to folder with data to tidy
    dest_dir: string, path to folder to write tidy data to. Does not need
        to already exist.
    tested_function_name: the name of the function which will be tested. Used
        when tidying to remove debugging function calls.

    If STOP_ON_ERROR is set and an error is encountered while tidying, raises
    that error (and returns nothing).

    returns: list of solution ids (filenames without .py) that could not be tidied
    """
    ensure_folder_exists(dest_dir)

    skipped = []
    for filename in os.listdir(source_dir):
        # Skip non-python files
        if not filename.endswith('.py'):
            continue

        sol_id = filename.split('.')[0]
        print "Tidying", sol_id

        source_path = path.join(source_dir, filename)
        dest_path = path.join(dest_dir, filename)
        try:
            tidy_one(source_path, dest_path, tested_function_name)
        except:
            if STOP_ON_ERROR: raise
            skipped.append(sol_id)
    return skipped
def tidy(source_dir, dest_dir, tested_function_name):
    """
    Go through every .py file in the source_dir directory and call tidy_one on each.
    Write out the results to a file of the same name in the dest_dir directory.

    source_dir: string, path to folder with data to tidy
    dest_dir: string, path to folder to write tidy data to. Does not need
        to already exist.
    tested_function_name: the name of the function which will be tested. Used
        when tidying to remove debugging function calls.

    If STOP_ON_ERROR is set and an error is encountered while tidying, raises
    that error (and returns nothing).

    returns: list of solution ids (filenames without .py) that could not be tidied
    """
    ensure_folder_exists(dest_dir)

    skipped = []
    for filename in os.listdir(source_dir):
        # Skip non-python files
        if not filename.endswith('.py'):
            continue

        sol_id = filename.split('.')[0]
        print "Tidying", sol_id

        source_path = path.join(source_dir, filename)
        dest_path = path.join(dest_dir, filename)
        try:
            tidy_one(source_path, dest_path, tested_function_name)
        except:
            if STOP_ON_ERROR: raise
            skipped.append(sol_id)
    return skipped
def augment(source_dir, dest_dir):
    """
    Wrap each student submission with the extra code from affixes.py.

    Every entry of source_dir is read in full and written to dest_dir under
    the same name as: import_prefix + <original contents> + testcase_defs.

    source_dir: string, path to folder with data to augment
    dest_dir: string, path to folder to write augmented data to. Does not need
        to already exist.
    """
    ensure_folder_exists(dest_dir)

    for entry in os.listdir(source_dir):
        in_path = path.join(source_dir, entry)
        out_path = path.join(dest_dir, entry)

        # Finish reading before the destination is opened for writing.
        with open(in_path, 'r') as reader:
            submission = reader.read()

        with open(out_path, 'w') as writer:
            writer.write(import_prefix + submission + testcase_defs)
def augment(source_dir, dest_dir):
    """
    Append or prepend any extra code from affixes.py to student submissions.

    The output file for each submission has the same name as the input and
    contains import_prefix, then the unmodified submission, then
    testcase_defs.

    source_dir: string, path to folder with data to augment
    dest_dir: string, path to folder to write augmented data to. Does not need
        to already exist.
    """
    ensure_folder_exists(dest_dir)

    for submission_name in os.listdir(source_dir):
        with open(path.join(source_dir, submission_name), 'r') as src_file:
            body = src_file.read()

        target = path.join(dest_dir, submission_name)
        with open(target, 'w') as dst_file:
            dst_file.write(import_prefix + body + testcase_defs)
def execute_and_pickle(source_dir, dest_dir, testcases, output_only):
    """
    Wrapper function.

    Run pg_logger on each file in source_dir for each test case and store the
    results in pickle files in dest_dir.

    source_dir: string, path to a directory of source files to run
    dest_dir: string, path to a directory to put pickle files. Does not need to
        already exist
    testcases: list of strings, where each string is a well-formed test case
        for the source files in question

    If STOP_ON_ERROR is set and an error is encountered while logging or
        pickling, raises that error (and returns nothing).

    returns: (skipped_running, skipped_pickling) - two lists of solution ids
        (filename without .py) that encountered errors while executing or
        while pickling, respectively. A failure while pickling means that the
        solution executed correctly but the results could not be stored.
    """
    ensure_folder_exists(dest_dir)
    skipped_running, skipped_pickling = [], []

    for filename in os.listdir(source_dir):
        sol_id = filename.split('.')[0]
        with open(path.join(source_dir, filename), 'r') as f:
            source = f.read()

        # Execute
        print "Running logger on", sol_id
        try:
            all_traces, all_outputs = do_logger_run(source, testcases,
                                                    output_only)
        except:
            if STOP_ON_ERROR: raise
            skipped_running.append(sol_id)
            # We had an error, do not try to pickle and just move on
            continue

        # Pickle results
        try:
            do_pickle(sol_id, all_traces, all_outputs, testcases, dest_dir)
        except pickle.PicklingError:
            if STOP_ON_ERROR: raise
            skipped_pickling.append(sol_id)

    return skipped_running, skipped_pickling
def execute_and_pickle(source_dir, dest_dir, testcases, output_only):
    """
    Wrapper function.

    Run pg_logger on each file in source_dir for each test case and store the
    results in pickle files in dest_dir.

    source_dir: string, path to a directory of source files to run
    dest_dir: string, path to a directory to put pickle files. Does not need to
        already exist
    testcases: list of strings, where each string is a well-formed test case
        for the source files in question

    If STOP_ON_ERROR is set and an error is encountered while logging or
        pickling, raises that error (and returns nothing).

    returns: (skipped_running, skipped_pickling) - two lists of solution ids
        (filename without .py) that encountered errors while executing or
        while pickling, respectively. A failure while pickling means that the
        solution executed correctly but the results could not be stored.
    """
    ensure_folder_exists(dest_dir)
    skipped_running, skipped_pickling = [], []

    for filename in os.listdir(source_dir):
        sol_id = filename.split('.')[0]
        with open(path.join(source_dir, filename), 'r') as f:
            source = f.read()

        # Execute
        print "Running logger on", sol_id
        try:
            all_traces, all_outputs = do_logger_run(source, testcases, output_only)
        except:
            if STOP_ON_ERROR: raise
            skipped_running.append(sol_id)
            # We had an error, do not try to pickle and just move on
            continue

        # Pickle results
        try:
            do_pickle(sol_id, all_traces, all_outputs, testcases, dest_dir)
        except pickle.PicklingError:
            if STOP_ON_ERROR: raise
            skipped_pickling.append(sol_id)

    return skipped_running, skipped_pickling
Esempio n. 7
0
def run(folderOfData, destFolder):
    ensure_folder_exists(destFolder)
    def dumpOutput(data, filename, sort_keys=True, indent=4):
        filepath = path.join(destFolder, filename)
        with open(filepath, 'w') as f:
            json.dump(data, f, sort_keys=sort_keys, indent=indent, cls=ElenaEncoder)

    # Load solutions
    all_solutions = []
    populate_from_pickles(all_solutions, path.join(folderOfData, 'pickleFiles'))
    #dumpOutput(all_solutions,'all_solutions.json')
    # for sol in all_solutions:
    #     pprint.pprint(sol.getDict())
    

    # Collect variables into AbstractVariables
    all_abstracts = []
    skipped_extract_sequences = extract_and_collect_var_seqs(all_solutions, all_abstracts)
    #print all_abstracts
    #for absvar in all_abstracts:
    #    pprint.pprint(absvar.getDict())

    all_solutions = [sol for sol in all_solutions if sol.output==CORRECT]

    # for sol in all_solutions:
    #     pprint.pprint(sol.getDict())


    find_canon_names(all_abstracts)
    # for absvar in all_abstracts:
    #     pprint.pprint(absvar.getDict())

    all_lines = []
    skipped_by_renamer = compute_all_lines(all_solutions,folderOfData,all_lines)

    for line in all_lines:
        pprint.pprint(line.getDict())

    all_templates = []
    for line in all_lines:
        add_to_setlist(line.template,all_templates)

    template_dict = {}
    for template in all_templates:
        #print template
        hand_made_counter = {};
        for sol in all_solutions:
            for line in sol.lines:
                line_object = line[0]
                if line_object.template==template:
                    #print sol.solnum,line
                    in_hand_made_counter = False
                    for line_key, count in hand_made_counter.iteritems():
                        if line_key == line_object:
                            hand_made_counter[line_key]+=1
                            in_hand_made_counter = True
                    if not(in_hand_made_counter):
                        hand_made_counter[line_object] = 1
                    #template_dict[template][line[0]] += 1

        template_dict[template] = hand_made_counter


    # for sol in all_solutions:
    #     pprint.pprint(sol.getDict())
    for i in xrange(10):
        pprint.pprint(all_solutions[i].getDict())
    for sol in all_solutions:
        pprint.pprint(sol.output)

    print 'template_dict'
    #pprint.pprint(template_dict)
    for template,count_dict in template_dict.iteritems():
        print ''
        print 'template'
        print template
        print ':'
        print pprint.pprint(count_dict)
Esempio n. 8
0
def run(folderOfData, destFolder):

    # Constants and initial lists
    getRidOfStars = True
    rewrite_pipeline_toggle = False
    sharedVarThreshold = 1
    # solnum -> trace
    progTraceDictAll = {}
    argAndReturnVarInfo = {}

    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## populate_from_pickles
    ###############################################################################
    ## adds to:
    ##  progTraceDictAll: solnum -> trace
    ##  argAndReturnVarInfo: solnum -> { args, returnVars }
    ##
    ###############################################################################

    def populate_from_pickles(pickleSrc, formattedSrc, formattedExtn='.py.html'):
        print "Loading data"
        for filename in os.listdir(pickleSrc):
            solNum = filename.split('.')[0]
            solNumInt = int(solNum)
            print solNum

            with open(path.join(pickleSrc, filename), 'r') as f:
                unpickled = pickle.load(f)
            progTraceDictAll[solNum] = unpickled['trace']

            argAndReturnVarInfo[solNumInt] = {}
            argAndReturnVarInfo[solNumInt]['args'] = unpickled['args']
            argAndReturnVarInfo[solNumInt]['returnVars'] = unpickled['returnVars']
            # with open(path.join(formattedSrc, solNum + formattedExtn), 'r') as f:
            #     argAndReturnVarInfo[solNumInt]['code'] = f.read()

    populate_from_pickles(path.join(folderOfData, 'pickleFiles'), path.join(folderOfData, 'tidyDataHTML'))


    #from: http://stackoverflow.com/questions/8230315/python-sets-are-not-json-serializable
    #and http://stackoverflow.com/questions/624926/how-to-detect-whether-a-python-variable-is-a-function
    class ElenaEncoder(json.JSONEncoder):
        """JSON encoder that also accepts sets and plain Python functions.

        A set becomes {'type': 'set', 'list': [...]} and a function becomes
        {'type': 'function'}; anything else is left to the base encoder.
        """

        def default(self, obj):
            if isinstance(obj, types.FunctionType):
                encoded = {'type': 'function'}
            elif isinstance(obj, set):
                encoded = {'type': 'set', 'list': list(obj)}
            else:
                # The base implementation raises TypeError for unknown types.
                encoded = json.JSONEncoder.default(self, obj)
            return encoded


    def dumpOutput(data, filename, sort_keys=True, indent=4):
        """Write data as JSON to destFolder/filename using ElenaEncoder.

        sort_keys and indent are passed straight through to json.dump.
        """
        target = path.join(destFolder, filename)
        with open(target, 'w') as outFile:
            json.dump(data, outFile, sort_keys=sort_keys, indent=indent, cls=ElenaEncoder)


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## getVarEquivsAcrossAllSolsLinear
    ###############################################################################
    ## adds to:
    ##  dictOfNamesAndFilesIndexedByVarSeqTempName: tempName -> (local name, solnum)
    ##  dictTempNameToSequence: tempName -> value sequence
    ##
    ## for each trace
    ##      for each local variable
    ##          extract sequence of values
    ##          if we have already seen it,
    ##              add (local name, solnum) to the correct entry in dictOfNames
    ##          else
    ##              add a new entry in dictOfNames: tempName -> (local name, solnum)
    ##              add a new entry in dictTemp: tempName -> sequence
    ###############################################################################
    ensure_folder_exists(destFolder)

    '''Linearly accumulate variables that have the same sequence of values across
    multiple solutions'''
    def extractSequence(column):
        """Return the values of column with gaps dropped and repeats collapsed.

        column: iterable of items whose element [1] is the value.
        Values equal to 'myNaN' or None are ignored; a value is recorded only
        when it differs from the one recorded immediately before it.
        """
        collapsed = []
        for entry in column:
            value = entry[1]
            isRecorded = value != 'myNaN' and value != None
            if not isRecorded:
                continue
            if not collapsed or value != collapsed[-1]:
                collapsed.append(value)
        return collapsed

    # sequence of values is extracted and compared to what we already have
    def isThisVarSeqInOurDict(localVarData):
        """Return the temp name whose stored sequence equals this variable's.

        localVarData: one variable's column of trace data, in the form
            extractSequence accepts.
        returns: the matching key of dictTempNameToSequence, or None when no
            stored sequence equals the extracted one.
        """
        # The extracted sequence is the same for every entry we compare
        # against, so compute it once instead of once per stored sequence.
        candidateSequence = extractSequence(localVarData)
        for tempVarName, tempVarData in dictTempNameToSequence.iteritems():
            if tempVarData == candidateSequence:
                return tempVarName
        return None

    # temporary variable name to (local name, solution number)
    dictOfNamesAndFilesIndexedByVarSeqTempName = {}
    dictTempNameToSequence = {}

    def getVarEquivsAcrossAllSolsLinear():
        """
        Group local variables from all traces by their extracted value sequence.

        Adds to:
            dictTempNameToSequence: temp name (int) -> value sequence
            dictOfNamesAndFilesIndexedByVarSeqTempName:
                temp name (int) -> [(local name, solnum), ...]

        Variables whose name starts with '__' are ignored, as are variables
        whose whole sequence is a single string starting with '__' (a
        function definition).

        returns: list of solution numbers whose trace has no '__return__'
            entry; those solutions are skipped entirely.
        """
        skippedSols = []

        # This will not remove duplicates within one solution
        for solNum, trace in progTraceDictAll.iteritems():
            # solNum is the solution number, trace maps variable names to
            # their recorded values.
            if '__return__' not in trace:
                skippedSols.append(solNum)
                continue

            for localVarName, localVarData in trace.iteritems():
                if localVarName.startswith('__'):
                    continue

                theExtractedSequence = extractSequence(localVarData)
                if (len(theExtractedSequence) == 1
                        and type(theExtractedSequence[0]) is str
                        and theExtractedSequence[0].startswith('__')):
                    # A function definition
                    continue

                # Looked up only after the filter above, so variables that
                # are going to be dropped never cost a scan of the dict.
                tempVarName = isThisVarSeqInOurDict(localVarData)
                if tempVarName is None:
                    # Temp names are handed out as 0, 1, 2, ... and never
                    # removed, so the next free name equals the number of
                    # entries (the same value max(keys) + 1 produced).
                    tempVarName = len(dictOfNamesAndFilesIndexedByVarSeqTempName)
                    dictOfNamesAndFilesIndexedByVarSeqTempName[tempVarName] = []
                    dictTempNameToSequence[tempVarName] = theExtractedSequence
                dictOfNamesAndFilesIndexedByVarSeqTempName[tempVarName].append((localVarName, solNum))
        return skippedSols

    print "Getting variable equivalents"
    numSkippedSols_tooLong = getVarEquivsAcrossAllSolsLinear()


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## Populating dictForJson
    ###############################################################################
    ## adds to:
    ##  dictForJson: solnum -> {
    ##      weirdVars: [ local name, ... ]
    ##  }
    ##
    ## find the number of occurences of each abstract variable
    ## for each number, in descending order
    ##      for each abstract variable with only 1 occurrance
    ##          add entry to dictForJson[solnum][weirdVars]
    ###############################################################################

    '''Begin populating a json file (dictForJson) that is indexed by
    the solution name and includes things like the weird and common variable names'''

    print "Populating dictForJson"
    dictForJson = {}
    for nums in sorted(set(map(len, dictOfNamesAndFilesIndexedByVarSeqTempName.values())),reverse=True):
        for k,v in dictOfNamesAndFilesIndexedByVarSeqTempName.iteritems():
            if nums == len(v) and nums == 1:
                for (localVar, solname) in v:
                    if str(solname) in dictForJson.keys():
                        dictForJson[solname]['weirdVars'].append(localVar)
                    else:
                        dictForJson[solname]={}
                        dictForJson[solname]['weirdVars']= [localVar]


    '''Determine what the common name of a variable should be
    by extracting the most common variable name'''
    def extractVarName(tempName):
        accumulatedVarNames = []
        accumulatedVarNames+= [tup[0] for tup in dictOfNamesAndFilesIndexedByVarSeqTempName[tempName]]
        cnt = Counter()
        for word in accumulatedVarNames:
            try:
                cnt[word] += 1
            except:
                print word, ' failed'
        return cnt.most_common(1)[0][0]


    dictOfSeqAndCommonNameByTempVar = {}
    dictOfNumFilesByCommonName = {}


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## Extract variable names
    ###############################################################################
    ## adds to:
    ##  dictOfNumFilesByCommonName: canon name -> [ number of files, ... ]
    ##  NOTE: multiple elements in the list means that more than one abstract
    ##        variable has the same canon name
    ##  NOTE: the list is sorted in descending order
    ##
    ## for each temp variable
    ##      get the canon name of the variable (the most common local name)
    ##      add to dictOfNumFilesByCommonName
    ###############################################################################

    print "Extracting variable names"
    # Sweep through and accumulate file lengths into a dict
    for tempKey, listOfFileAndVars in dictOfNamesAndFilesIndexedByVarSeqTempName.iteritems():
        numFiles = len(listOfFileAndVars)
        if numFiles > sharedVarThreshold:

            canonName = extractVarName(tempKey)

            #update the max files associated with a common name
            if canonName in dictOfNumFilesByCommonName:
                dictOfNumFilesByCommonName[canonName].append(numFiles)
                dictOfNumFilesByCommonName[canonName] = sorted(dictOfNumFilesByCommonName[canonName],reverse=True)
            else:
                dictOfNumFilesByCommonName[canonName] = [numFiles]

    # Write out dictOfNamesAndFilesIndexedByVarSeqTempName to json so I can visualize it
    print "Writing variables by varSeqTempName"
    dumpOutput(dictOfNamesAndFilesIndexedByVarSeqTempName,
               'variablesByVarSeqTempName.json')


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## Add common names
    ###############################################################################
    ## adds to:
    ##  dictOfSeqAndCommonNameByTempVar: temp name -> {
    ##      howManyFiles
    ##      commonName
    ##      commonNameWithSuffix
    ##      sequence
    ##  }
    ##  dictOfSeqByCommonNamePlusSuffix: canon name with suffix -> sequence of values
    ##  dictForJson: solnum -> {
    ##      sharedVars: [ canon name with suffix, ...]
    ##      localVars: [ local variable name, ...]
    ##      commonNameAppend: [ suffix, ...]
    ##      commonName: [ canon name, ...]
    ##      *** may also have weirdVars from above
    ##  }
    ##
    ## for each temp variable
    ##      count the number of files it appears in
    ##      get canon name and add suffix if necessary
    ##      add sequence of values to dictOfSeqBy... and dictOfSeqAnd...
    ##      for each (local variable, solnum) the temp var occurs in
    ##          add to dictForJson
    ###############################################################################

    '''Add more info to dictForJson, like the common names for each variable, and
    modifiers to that common name, to indicate with sequence it takes on...'''

    print "Adding common names"
    dictOfSeqByCommonNamePlusSuffix = {}
    for tempKey, listOfFileAndVars in dictOfNamesAndFilesIndexedByVarSeqTempName.iteritems():
        numFiles = len(listOfFileAndVars)
        if numFiles > sharedVarThreshold:
            dictOfSeqAndCommonNameByTempVar[tempKey] = {}
            dictOfSeqAndCommonNameByTempVar[tempKey]['howManyFiles'] = numFiles

            canonName = extractVarName(tempKey)

            offSetFromMax = dictOfNumFilesByCommonName[canonName].index(numFiles)
            if offSetFromMax > 0:
                canonAppend = '___' + str(offSetFromMax + 1)
            else:
                canonAppend = ''

            dictOfSeqAndCommonNameByTempVar[tempKey]['commonName'] = canonName
            dictOfSeqAndCommonNameByTempVar[tempKey]['commonNameWithSuffix'] = canonName+canonAppend

            seqStr = dictTempNameToSequence[tempKey]
            dictOfSeqByCommonNamePlusSuffix[canonName+canonAppend] = seqStr

            dictOfSeqAndCommonNameByTempVar[tempKey]['sequence'] = seqStr
            for localVarName, solNum in listOfFileAndVars:
                if solNum in dictForJson:
                    if 'sharedVars' in dictForJson[solNum]:
                        dictForJson[solNum]['sharedVars'].append(canonName+canonAppend)
                        dictForJson[solNum]['localVars'].append(localVarName)
                        dictForJson[solNum]['commonNameAppend'].append(canonAppend)
                        dictForJson[solNum]['commonName'].append(canonName)
                    else:
                        dictForJson[solNum]['sharedVars']= [canonName+canonAppend]
                        dictForJson[solNum]['localVars'] = [localVarName]
                        dictForJson[solNum]['commonNameAppend'] = [canonAppend]
                        dictForJson[solNum]['commonName']= [canonName]
                else:
                    dictForJson[solNum]={}
                    dictForJson[solNum]['sharedVars']= [canonName+canonAppend]
                    dictForJson[solNum]['localVars'] = [localVarName]
                    dictForJson[solNum]['commonNameAppend'] = [canonAppend]
                    dictForJson[solNum]['commonName']= [canonName]

    print "Writing common variable legend and main json file"
    dumpOutput(dictOfSeqByCommonNamePlusSuffix, 'commonVarLegend.json')
    dumpOutput(dictForJson, 'dictForJson.json')


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## getFreeVars
    ###############################################################################
    ## adds to:
    ##  freeVars: solnum -> [ local variable, ...]
    ##  argAndReturnData: solnum -> {
    ##      studentCreatedVars: freeVars[solnum]
    ##      *** also args, returnVals from earlier
    ## }
    ##
    ## for each (solnum, dictForJson entry)
    ##      for each local variable and weird variable
    ##          if the variable is not an argument or return variable
    ##              add it to freeVars
    ###############################################################################

    def getFreeVars(dictOfLocalAndCommonVarNames, argAndReturnData):
        freeVars = {}
        for k,v in dictOfLocalAndCommonVarNames.iteritems():
            freeVars[k] = []
            if 'localVars' in v.keys():
                for localVar in v['localVars']:
                    if localVar not in argAndReturnData[int(k)]['args'] and localVar not in argAndReturnData[int(k)]['returnVars']:
                        freeVars[k].append(localVar)
            if 'weirdVars' in v.keys():
                print 'there are weirds!',k, v['weirdVars']
                for weirdVar in v['weirdVars']:
                    if weirdVar not in argAndReturnData[int(k)]['args'] and weirdVar not in argAndReturnData[int(k)]['returnVars']:
                        freeVars[k].append(weirdVar)
            argAndReturnData[int(k)]['studentCreatedVars'] = freeVars[k]
        return freeVars

    print "Getting free variables"
    freeVars = getFreeVars(dictForJson,argAndReturnVarInfo)

    print "Writing arg and return value info and free vars"
    dumpOutput(argAndReturnVarInfo, 'argAndReturnVarInfo.json')
    dumpOutput(freeVars, 'freeVars.json')


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## produceFooBarBazJson
    ###############################################################################
    ## adds to:
    ##  listOfSolDictsForTable: [ ordered dict {
    ##      solution: solnum
    ##      arguments: sorted list of arguments
    ##      studentCreatedVars: sorted list of studentCreatedVars and return variables
    ## }]
    ##
    ## for each solution in argAndReturnVarInfo
    ##      create an ordered dict and add it to the list
    ###############################################################################

    def produceFooBarBazJson(argsAndCode):
        loweringfunc = lambda s: s[:1].lower() + s[1:] if s else ''
        import collections
        import copy
        listOfSolDictsForTable = []
        for solnum,soldata in argsAndCode.iteritems():
            dictWithName = collections.OrderedDict()
            dictWithName['solution'] = int(solnum)
            if 'args' in soldata.keys():
                dictWithName['arguments'] = sorted([loweringfunc(listelement) for listelement in soldata['args']]) #soldata['args']
            else:
                dictWithName['arguments'] = []
            studCreatedVars = []
            if 'studentCreatedVars' in soldata.keys():
                studCreatedVars += soldata['studentCreatedVars']
            if 'returnVars' in soldata.keys():
                studCreatedVars += soldata['returnVars']
            dictWithName['studentCreatedVariables'] = sorted([loweringfunc(listelement) for listelement in studCreatedVars])

            listOfSolDictsForTable.append(dictWithName)
        return listOfSolDictsForTable

    print "Producing and writing solution dicts for table"
    listOfSolDictsForTable = produceFooBarBazJson(argAndReturnVarInfo)
    dumpOutput(listOfSolDictsForTable, 'listOfSolDictsForTable.json')


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## collect common variables
    ###############################################################################
    ## adds to:
    ##  allCommonVar: set of all shared variables from all solutions (via dictForJson)
    ###############################################################################

    '''Collect all common vars into a single list.'''

    print "Collecting common variables"
    allCommonVar = set()
    for sol,varDict in dictForJson.iteritems():
        if 'sharedVars' in varDict.keys():
            allCommonVar.update(varDict['sharedVars'])



    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## collect common variables
    ###############################################################################
    ## adds to:
    ##  dictForExhibit: {
    ##      items: [{
    ##          *** from dictForJson:
    ##          sharedVars
    ##          localVars
    ##          commonNameAppend
    ##          commonName
    ##          weirdVars (maybe)
    ##
    ##          type: 'solution'
    ##          label: solnum
    ##          fnames: list of every node the AST visitor found
    ##          canonicalPycode: list of strings: non-empty lines of stripped, renamed source
    ##          canonicalPYcodeIndents: list of ints: size of indentation for each non-empty
    ##                                  line of the renamed source
    ##          code: colorful HTML version of the renamed source
    ##      }]
    ##      properties: {}
    ##      types: {
    ##          Answer: {}
    ##          pluralLabel: 'Answers'
    ##      }
    ##  }
    ##  phraseCounter: Counter of lines of whitespace-stripped code
    ##  tabCounter: dict: stripped line of code -> Counter of the size of the indentation
    ##                    before that line
    ##
    ## for each dict of solution info in dictForJson
    ##      if there are multiple instances of the same abstract variable
    ##          find the indices of the shared instances
    ##          if the canon name and local name are not equal
    ##              change the shared name to <canon>_<local>__
    ##      make a temporary copy of the solution info dict
    ##      add keys to the temp dict:
    ##          'type': 'solution'
    ##          'label': solnum
    ##          'fnames': the result of calling fnames on the tidy solution
    ##      for each abstract variable in the solution
    ##          rename the local variable to <canon>_temp
    ##          re-rename the local variable to the canon name
    ##      for each weird variable in the solution
    ##          if the weird variable shares the canon name with another abstract var
    ##              rename the variable to <weird>__
    ##      add key to temp dict:
    ##          'canonicalPYcode': a list of non-empty lines in the renamed source
    ##                             with whitespace removed
    ##      write the renamed source to <folderOfData>/tidyDataCanonicalized/<solnum>.py
    ##      update phraseCounter with the stripped, renamed lines of source
    ##      add key to temp dict:
    ##          'canonicalPYcodeIndents': a list of the size of leading indentation for
    ##                                    each non-empty line in the renamed source
    ##      if no problems were encountered in any of the renamings
    ##          format the source as colorful HTML
    ##          write the pretty code into <folderOfData>/tidyDataCanonicalizedHTML/<solnum>.py.html
    ##          add key to temp dict:
    ##              'code': the pretty code
    ##          append the temp dict to dictForExhibit[items]
    ###############################################################################

    '''Create JSON with format necessary for Exhibit'''
    dictForExihibit = {}

    dictForExihibit['items'] = []
    dictForExihibit['properties'] = {}
    dictForExihibit['types'] = {}

    dictForExihibit['types']['Answer'] = {}
    dictForExihibit['types']['Answer']['pluralLabel'] = 'Answers'

    numSkippedSols_commonVarClash = []
    numSkippedSols_weirdVarClash = []
    ensure_folder_exists(folderOfData+'/tidyDataCanonicalized/')
    ensure_folder_exists(folderOfData+'/tidyDataCanonicalizedHTML/')

    phraseCounter = Counter()
    tabCounter = {}

    src_skipped_by_philip = []
    solnum_skipped_by_philip = []
    numSkippedSols_rewritePipeline = []
    numSkippedSols_NoSharedVars = []

    print "Creating dict for Exhibit"
    for solNum, solDict in dictForJson.iteritems():
        print solNum
        flagged = False
        try:
            if len(solDict['sharedVars'])>len(set(solDict['sharedVars'])):
                print 'this solution has multiple indistinguishable instances of a shared variable; fixing!'
                numSkippedSols_commonVarClash.append(solNum)

                for sv in solDict['sharedVars']:
                    indices = [i for i, x in enumerate(solDict['sharedVars']) if x == sv]
                    if len(indices)>1:
                        for ind in indices:
                            if not solDict['sharedVars'][ind] == solDict['localVars'][ind]: #if it's common and local names are i, don't make it i_i
                                solDict['sharedVars'][ind] = solDict['sharedVars'][ind]+'_'+solDict['localVars'][ind]+'__'
        except:
            print 'no sharedVars in vardict... Why?', sol, varDict
            numSkippedSols_NoSharedVars.append(solNum)

        tempDict = solDict.copy()
        tempDict['type'] = 'Solution'
        tempDict['label'] = solNum
        tidyPath = path.join(folderOfData, 'tidyData', solNum + '.py')
        try:
            tempDict['fnames'] = fnames.main(tidyPath)
        except:
            print 'warning: no fnames for ', solNum

        with open(tidyPath,'U') as f:
            read_data = f.read()

        renamed_src = read_data

        extraToken = '_temp' # a * didn't work so well
        if 'sharedVars' in tempDict.keys():
            for i in range(len(tempDict['sharedVars'])):
                locVarName = tempDict['localVars'][i]
                sharedVarNameWithStar = tempDict['sharedVars'][i]+extraToken 
                try:
                    renamed_src = identifier_renamer.rename_identifier(renamed_src, locVarName,sharedVarNameWithStar)
                except:
                    print 'Could not run Philip renamer; skipping!'
                    src_skipped_by_philip.append(renamed_src)
                    solnum_skipped_by_philip.append(solNum)
                    flagged = True
                if rewrite_pipeline_toggle:
                    try:
                        renamed_src = rewrite_pipeline.reorderVariables(renamed_src,togVar)
                    except:
                        print 'skipping!'
                        numSkippedSols_rewritePipeline.append(solNum)
                        flagged = True

            if getRidOfStars:
                for i in range(len(tempDict['sharedVars'])):

                    sharedVarNameWithStar = tempDict['sharedVars'][i]+extraToken
                    sharedVarNameWithOutStar = tempDict['sharedVars'][i]

                    try:
                        renamed_src = identifier_renamer.rename_identifier(renamed_src, sharedVarNameWithStar,sharedVarNameWithOutStar)
                    except:
                        print 'Could not run Philip renamer; skipping!'
                        src_skipped_by_philip.append(renamed_src)
                        solnum_skipped_by_philip.append(solNum)
                        flagged = True

        if 'weirdVars' in tempDict.keys():
            for weirdInstance in tempDict['weirdVars']:
                if weirdInstance in allCommonVar:
                    numSkippedSols_weirdVarClash.append(solNum)
                    try:
                        renamed_src = identifier_renamer.rename_identifier(renamed_src, weirdInstance,weirdInstance+'__')
                    except:
                        print 'Could not run Philip renamer; skipping!'
                        src_skipped_by_philip.append(renamed_src)
                        solnum_skipped_by_philip.append(solNum)
                        flagged = True

        tempDict['canonicalPYcode'] = [el.strip() for el in renamed_src.split('\n') if not (el == '')]
        with open(folderOfData+'/tidyDataCanonicalized/'+solNum+".py",'w') as g:
            g.write(renamed_src)

        phraseCounter.update([el.strip() for el in renamed_src.split('\n')  if not (el == '')])
        tempDict['canonicalPYcodeIndents'] = []
        for unstrippedLine in renamed_src.split('\n'):
            if not (unstrippedLine == ''):
                strippedLine = unstrippedLine.strip()
                leadingSpace = len(unstrippedLine) - len(strippedLine) #how much was lobbed off?
                tempDict['canonicalPYcodeIndents'].append(leadingSpace)
                if strippedLine in tabCounter.keys():
                    tabCounter[strippedLine].update([leadingSpace])
                else:
                    tabCounter[strippedLine] = Counter()
                    tabCounter[strippedLine].update([leadingSpace])


        if not flagged:
            subprocess.call("pygmentize -O style=colorful,linenos=1 -o "+folderOfData+'/tidyDataCanonicalizedHTML/'+solNum+".py.html"+" "+folderOfData+'/tidyDataCanonicalized/'+solNum+".py", shell=True)

            with open(folderOfData+'/tidyDataCanonicalizedHTML/'+solNum+".py.html",'U') as myfile:
                htmlCode = myfile.read()
                htmlCodeFormatted = htmlCode.replace("\"","'").replace("\n","<br>")
                tempDict['code'] = htmlCodeFormatted
            dictForExihibit['items'].append(tempDict)


    print "Writing dictForExihibit and phraseAndTabCounter"
    dumpOutput(dictForExihibit, 'dictForExihibit.json')

    phraseAndTabCounter = {}
    phraseAndTabCounter['phraseCounter'] = phraseCounter
    phraseAndTabCounter['tabCounter'] = tabCounter

    #this includes phrases and such that never made it into the other json files because of skipping sols
    dumpOutput(phraseAndTabCounter, 'phraseAndTabCounter.json')


    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## making dictOfRepresentatives
    ###############################################################################
    ## adds to:
    ##  dictOfRepresentatives: dict: solnum -> {
    ##      rep: {
    ##          <same as dictForExhibit items above>
    ##      }
    ##      count: int
    ##      members: [ solnum, ...]
    ##  }
    ##
    ## initialize dictOfRepresentatives with the first item in dictForExhibit
    ## for each dict of info in dictForExhibit (beyond the first)
    ##      for each representative
    ##          compare the item in question with the representative by comparing
    ##          sets of canonicalized lines
    ##              if the item matches the representative
    ##                  increase the representative's count
    ##                  add the item's solnum to the representative's members
    ##              otherwise, make it a new representative
    ###############################################################################
    '''Create JSON with format necessary for D3 prototype'''

    theJSON = dictForExihibit.copy()
    dictOfRepresentatives = {}
    dictOfRepresentatives[theJSON['items'][0]['label']]= {}
    dictOfRepresentatives[theJSON['items'][0]['label']]['rep'] = theJSON['items'][0]
    dictOfRepresentatives[theJSON['items'][0]['label']]['count'] = 1
    dictOfRepresentatives[theJSON['items'][0]['label']]['members'] = [theJSON['items'][0]['label']]

    print "Finding representatives"
    for JSONitem in theJSON['items'][1:]:
        for groupID, REPitem in dictOfRepresentatives.iteritems():
            if set(REPitem['rep']['canonicalPYcode']) == set(JSONitem['canonicalPYcode']):
                REPitem['count']+=1
                REPitem['members'].append(JSONitem['label'])
                break
        else:
            dictOfRepresentatives[JSONitem['label']] = {'rep': JSONitem, 'count': 1, 'members' : [JSONitem['label']] }

    print "Writing repDict"
    dumpOutput(dictOfRepresentatives, 'repDict.json')

    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## finding misfits
    ###############################################################################
    ## adds to:
    ##  misfitMembers: list of solnums
    ##  repCounts: list of tuples (solnum, count)
    ##  repDataDict: {
    ##      'misfitMembers': misfitMembers above
    ##      'repCountsSorted': repCounts above, sorted by group size, descending
    ##  }
    ##
    ## for each representative
    ##      if the representative is the only member, append it to misfitMembers
    ## add to repCounts: ('misfits', number of misfits)
    ## for each representative with more than one member
    ##      add its label and count to repCounts
    ## initialize repDataDict
    ###############################################################################

    print "Finding misfits"
    misfitTotal = 0
    misfitMembers = []
    for k, v in dictOfRepresentatives.iteritems():
        if v['count']==1:
            misfitTotal += 1
            misfitMembers.append(k)

    repCounts = []
    repCounts.append(('misfits',misfitTotal))
    for k, v in dictOfRepresentatives.iteritems():
        if v['count'] != 1:
            repCounts.append((k, v['count']))

    newlist = sorted(repCounts, key=lambda k: k[1],reverse = True)

    repDataDict = {}
    repDataDict['misfitMembers']= misfitMembers
    repDataDict['repCountsSorted'] = newlist

    print "Writing sorted rep dict"
    dumpOutput(repDataDict, 'repDictSorted.json')

    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## set up solutions, phrases, variables
    ###############################################################################
    ## adds to:
    ##  solutions: list of dicts: {
    ##      code: colorful html version of a representative's source
    ##      phraseIDs: list of ints
    ##      variableIDs: list of ints
    ##      lines: list of dicts: { indent, phraseID }
    ##      keywords: list of every node the AST visitor found
    ##      number: int: solnum of representative
    ##      members: list of solnums of representative's group
    ##      count: int: size of representative's group
    ##  }
    ##  phrases: list of strings
    ##  variables: list of strings
    ##  sequences: list of sequences of values
    ##
    ## for each group and representative
    ##      initialize a solution dict
    ##      for each line of code in the representative
    ##          append the line to phrases if it's not already there
    ##          the ID of the phrase is the index in phrases + 1
    ##          add the phrase ID to solution[phraseIDs]
    ##          add the indent and phraseID to solution[lines]
    ##      for each abstract variable in the representative that isn't a weird
    ##          add the variable to variables if it is not already there
    ##          add the sequence of that variable to sequences if it is not already there
    ##          the ID of the variable is its index in variables plus 1
    ##          add the variable ID to solution[variableIDs]
    ##      convert phraseIDs and variableIDs from sets to lists
    ###############################################################################

    raw_solutions = dictOfRepresentatives
    solutions = []
    phrases = []
    variables = []
    sequences = []

    print "Setting up solutions, phrases, variables, sequences"
    for groupID, groupDescription in raw_solutions.iteritems():
        solution = {'code': groupDescription['rep']['code']}
        solution['phraseIDs'] = set()
        solution['variableIDs'] = set()
        solution['lines'] = []
        for i in range(len(groupDescription['rep']['canonicalPYcode'])):
            phrase = groupDescription['rep']['canonicalPYcode'][i]
            if phrase not in phrases:
                phrases.append(phrase)
            phraseID = phrases.index(phrase) + 1
            solution['phraseIDs'].add(phraseID)
            lineDict = {}
            lineDict['indent'] = groupDescription['rep']['canonicalPYcodeIndents'][i]
            lineDict['phraseID'] = phraseID
            solution['lines'].append(lineDict)
        if 'sharedVars' in groupDescription['rep'].keys():
            for sharVar in groupDescription['rep']['sharedVars']:
                if not sharVar.endswith('__'):
                    if sharVar not in variables:
                        variables.append(sharVar)
                        sequences.append(dictOfSeqByCommonNamePlusSuffix[sharVar])
                    varID = variables.index(sharVar) + 1
                    solution['variableIDs'].add(varID)
        else:
            solution['variableIDs'] = []
        if 'fnames' in groupDescription['rep'].keys():
            solution['keywords'] = groupDescription['rep']['fnames']
        solution['number'] = int(groupDescription['rep']['label'])
        solution['members'] = list(groupDescription['members'])
        solution['phraseIDs'] = list(solution['phraseIDs'])
        solution['variableIDs'] = list(solution['variableIDs'])
        solution['count'] = int(groupDescription['count'])
        solutions.append(solution)

    print "Writing solutions"
    dumpOutput(solutions, 'solutions.json')

    def generateCodeWithFeatureSpans(phrase):
        """
        Wrap each purely alphanumeric token of a line of code in a feature <span>.

        phrase: string, a single line of code

        The phrase is split on runs of non-word characters, keeping the
        separators. Every token for which isalnum() is true becomes
        <span class='feature feature-TOKEN'>TOKEN</span>. Everything else is
        copied through unchanged -- that includes the separators themselves and
        identifiers containing an underscore (isalnum() is false for those).
        No HTML escaping is applied to any part of the phrase.

        returns: string, the phrase with the spans inserted
        """
        pieces = []
        # The capturing group makes split() return the separators as well,
        # so joining the pieces reproduces the original spacing/punctuation.
        for piece in re.split(r'(\W+)', phrase):
            if piece.isalnum():
                piece = "<span class='feature feature-%s'>%s</span>" % (piece, piece)
            pieces.append(piece)
        return ''.join(pieces)

    ###############################################################################

      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ###############################################################################
    ## last bit of stuff
    ###############################################################################
    ## for each phrase
    ##      change the phrase to a dict: {
    ##          id: index in list + 1
    ##          code: escaped (HTML-safe) line
    ##          indent: most common indent size for that phrase
    ##          codeWithFeatureSpans: line with each word changed to a <span>
    ##      }
    ## for each variable
    ##      change the variable to a dict: {
    ##          id: index in list + 1
    ##          varName: variable name
    ##          varNameAndSeq: <name>:<str(sequence)>
    ##          sequence: sequence
    ##      }
    ###############################################################################

    print "Finding most common indent and writing phrases"
    for i in range(len(phrases)):
        phrase = phrases[i]
        mostCommonIndent = tabCounter[phrase].most_common(1)[0][0]
        phrases[i] = {'id': i+1, 'code': cgi.escape(phrase), 'indent': mostCommonIndent,'codeWithFeatureSpans':generateCodeWithFeatureSpans(phrase)} #, 'aveLineNum':aveLineNum}

    dumpOutput(phrases, 'phrases.json')

    print "Collecting and writing variables"
    for i in range(len(variables)):
        variable = variables[i]
        sequence = sequences[i]
        variables[i] = {'id': i+1, 'varName': variable, 'varNameAndSeq': variable + ':'+str(sequence), 'sequence': sequence } #, 'aveLineNum':aveLineNum}

    dumpOutput(variables, 'variables.json')