def tidy(source_dir, dest_dir, tested_function_name):
    """
    Go through every .py file in the source_dir directory and call tidy_one on each.
    Write out the results to a file of the same name in the dest_dir directory.

    source_dir: string, path to folder with data to tidy
    dest_dir: string, path to folder to write tidy data to. Does not need
        to already exist.
    tested_function_name: the name of the function which will be tested. Used
        when tidying to remove debugging function calls.

    If STOP_ON_ERROR is set and an error is encountered while tidying, raises
    that error (and returns nothing).

    returns: list of solution ids (filenames without .py) that could not be tidied
    """
    # The contract above says dest_dir need not exist yet, so create it on demand.
    if not path.isdir(dest_dir):
        os.makedirs(dest_dir)

    skipped = []
    for filename in os.listdir(source_dir):
        # Skip non-python files
        if not filename.endswith('.py'):
            continue

        # The solution id is everything before the first dot, which matches
        # how the other pipeline stages in this file derive it.
        sol_id = filename.split('.')[0]
        print("Tidying %s" % sol_id)

        source_path = path.join(source_dir, filename)
        dest_path = path.join(dest_dir, filename)
        try:
            tidy_one(source_path, dest_path, tested_function_name)
        except Exception:
            # Either propagate immediately or remember the failure and move on.
            if STOP_ON_ERROR: raise
            skipped.append(sol_id)
    return skipped
def tidy(source_dir, dest_dir, tested_function_name):
    """
    Go through every .py file in the source_dir directory and call tidy_one on each.
    Write out the results to a file of the same name in the dest_dir directory.

    source_dir: string, path to folder with data to tidy
    dest_dir: string, path to folder to write tidy data to. Does not need
        to already exist.
    tested_function_name: the name of the function which will be tested. Used
        when tidying to remove debugging function calls.

    If STOP_ON_ERROR is set and an error is encountered while tidying, raises
    that error (and returns nothing).

    returns: list of solution ids (filenames without .py) that could not be tidied
    """
    # The contract above says dest_dir need not exist yet, so create it on demand.
    if not path.isdir(dest_dir):
        os.makedirs(dest_dir)

    skipped = []
    for filename in os.listdir(source_dir):
        # Skip non-python files
        if not filename.endswith('.py'):
            continue

        # The solution id is everything before the first dot, which matches
        # how the other pipeline stages in this file derive it.
        sol_id = filename.split('.')[0]
        print("Tidying %s" % sol_id)

        source_path = path.join(source_dir, filename)
        dest_path = path.join(dest_dir, filename)
        try:
            tidy_one(source_path, dest_path, tested_function_name)
        except Exception:
            # Either propagate immediately or remember the failure and move on.
            if STOP_ON_ERROR: raise
            skipped.append(sol_id)
    return skipped
def augment(source_dir, dest_dir):
    """
    Append or prepend any extra code to student submissions.

    Every file in source_dir is rewritten into dest_dir (same file name) as
    import_prefix + <original contents> + testcase_defs.

    source_dir: string, path to folder with data to augment
    dest_dir: string, path to folder to write augmented data to. Does not need
        to already exist.
    """
    # The contract above says dest_dir need not exist yet, so create it on demand.
    if not path.isdir(dest_dir):
        os.makedirs(dest_dir)

    for filename in os.listdir(source_dir):
        with open(path.join(source_dir, filename), 'r') as f:
            source = f.read()
        with open(path.join(dest_dir, filename), 'w') as f:
            f.write(import_prefix + source + testcase_defs)
def augment(source_dir, dest_dir):
    """
    Append or prepend any extra code to student submissions.

    Every file in source_dir is rewritten into dest_dir (same file name) as
    import_prefix + <original contents> + testcase_defs.

    source_dir: string, path to folder with data to augment
    dest_dir: string, path to folder to write augmented data to. Does not need
        to already exist.
    """
    # The contract above says dest_dir need not exist yet, so create it on demand.
    if not path.isdir(dest_dir):
        os.makedirs(dest_dir)

    for filename in os.listdir(source_dir):
        with open(path.join(source_dir, filename), 'r') as f:
            source = f.read()
        with open(path.join(dest_dir, filename), 'w') as f:
            f.write(import_prefix + source + testcase_defs)
def execute_and_pickle(source_dir, dest_dir, testcases, output_only):
    """
    Wrapper function.

    Run pg_logger on each file in source_dir for each test case and store the
    results in pickle files in dest_dir.

    source_dir: string, path to a directory of source files to run
    dest_dir: string, path to a directory to put pickle files. Does not need to
        already exist
    testcases: list of strings, where each string is a well-formed test case
        for the source files in question
    output_only: passed through unchanged to do_logger_run

    If STOP_ON_ERROR is set and an error is encountered while logging or
        pickling, raises that error (and returns nothing).

    returns: (skipped_running, skipped_pickling) - two lists of solution ids
        (filename without .py) that encountered errors while executing or
        while pickling, respectively. A failure while pickling means that the
        solution executed correctly but the results could not be stored.
    """
    # The contract above says dest_dir need not exist yet, so create it on demand.
    if not path.isdir(dest_dir):
        os.makedirs(dest_dir)

    skipped_running, skipped_pickling = [], []

    for filename in os.listdir(source_dir):
        sol_id = filename.split('.')[0]
        with open(path.join(source_dir, filename), 'r') as f:
            source = f.read()

        # Execute
        print("Running logger on %s" % sol_id)
        try:
            all_traces, all_outputs = do_logger_run(source, testcases, output_only)
        except Exception:
            if STOP_ON_ERROR: raise
            skipped_running.append(sol_id)
            # We had an error, do not try to pickle and just move on
            continue

        # Pickle results
        try:
            do_pickle(sol_id, all_traces, all_outputs, testcases, dest_dir)
        except pickle.PicklingError:
            if STOP_ON_ERROR: raise
            skipped_pickling.append(sol_id)

    return skipped_running, skipped_pickling
def execute_and_pickle(source_dir, dest_dir, testcases, output_only):
    """
    Wrapper function.

    Run pg_logger on each file in source_dir for each test case and store the
    results in pickle files in dest_dir.

    source_dir: string, path to a directory of source files to run
    dest_dir: string, path to a directory to put pickle files. Does not need to
        already exist
    testcases: list of strings, where each string is a well-formed test case
        for the source files in question
    output_only: passed through unchanged to do_logger_run

    If STOP_ON_ERROR is set and an error is encountered while logging or
        pickling, raises that error (and returns nothing).

    returns: (skipped_running, skipped_pickling) - two lists of solution ids
        (filename without .py) that encountered errors while executing or
        while pickling, respectively. A failure while pickling means that the
        solution executed correctly but the results could not be stored.
    """
    # The contract above says dest_dir need not exist yet, so create it on demand.
    if not path.isdir(dest_dir):
        os.makedirs(dest_dir)

    skipped_running, skipped_pickling = [], []

    for filename in os.listdir(source_dir):
        sol_id = filename.split('.')[0]
        with open(path.join(source_dir, filename), 'r') as f:
            source = f.read()

        # Execute
        print("Running logger on %s" % sol_id)
        try:
            all_traces, all_outputs = do_logger_run(source, testcases, output_only)
        except Exception:
            if STOP_ON_ERROR: raise
            skipped_running.append(sol_id)
            # We had an error, do not try to pickle and just move on
            continue

        # Pickle results
        try:
            do_pickle(sol_id, all_traces, all_outputs, testcases, dest_dir)
        except pickle.PicklingError:
            if STOP_ON_ERROR: raise
            skipped_pickling.append(sol_id)

    return skipped_running, skipped_pickling
# Example 7
def run(folderOfData, destFolder):
    """
    Load pickled solutions, collect their variables and lines, and print for
    each template the distinct line objects that use it.

    folderOfData: string, path to a folder. Solutions are loaded from its
        'pickleFiles' subfolder, and the folder itself is passed on to
        compute_all_lines.
    destFolder: string, path to the folder that dumpOutput writes JSON into.

    NOTE(review): several loop bodies are missing from this copy of the
    function (each is marked below), so it does not parse as written. They
    must be restored from the original source before this can run.
    """
    def dumpOutput(data, filename, sort_keys=True, indent=4):
        # Write data as JSON to destFolder/filename, encoding with ElenaEncoder.
        filepath = path.join(destFolder, filename)
        with open(filepath, 'w') as f:
            json.dump(data, f, sort_keys=sort_keys, indent=indent, cls=ElenaEncoder)

    # Load solutions
    all_solutions = []
    populate_from_pickles(all_solutions, path.join(folderOfData, 'pickleFiles'))
    # for sol in all_solutions:
    #     pprint.pprint(sol.getDict())

    # Collect variables into AbstractVariables
    all_abstracts = []
    skipped_extract_sequences = extract_and_collect_var_seqs(all_solutions, all_abstracts)
    #print all_abstracts
    #for absvar in all_abstracts:
    #    pprint.pprint(absvar.getDict())

    # Keep only the solutions whose output equals CORRECT. This happens after
    # variable collection, so all_abstracts was built from every solution.
    all_solutions = [sol for sol in all_solutions if sol.output==CORRECT]

    # for sol in all_solutions:
    #     pprint.pprint(sol.getDict())

    # for absvar in all_abstracts:
    #     pprint.pprint(absvar.getDict())

    all_lines = []
    skipped_by_renamer = compute_all_lines(all_solutions,folderOfData,all_lines)

    for line in all_lines:
        # NOTE(review): loop body missing in this copy of the source.

    all_templates = []
    for line in all_lines:
        # NOTE(review): loop body missing in this copy of the source;
        # presumably it fills all_templates -- TODO confirm.

    # template -> dict whose keys are the distinct line objects using it
    template_dict = {}
    for template in all_templates:
        #print template
        hand_made_counter = {};
        for sol in all_solutions:
            for line in sol.lines:
                # Each entry of sol.lines is indexable; element 0 is the line object.
                line_object = line[0]
                if line_object.template==template:
                    #print sol.solnum,line
                    # Membership is tested with an explicit == scan over the keys
                    # rather than `in`; presumably line objects define equality
                    # without a matching hash -- TODO confirm.
                    in_hand_made_counter = False
                    for line_key, count in hand_made_counter.iteritems():
                        if line_key == line_object:
                            in_hand_made_counter = True
                    if not(in_hand_made_counter):
                        # The stored value is always 1; nothing increments it.
                        hand_made_counter[line_object] = 1
                    #template_dict[template][line[0]] += 1

        template_dict[template] = hand_made_counter

    # for sol in all_solutions:
    #     pprint.pprint(sol.getDict())
    for i in xrange(10):
        # NOTE(review): loop body missing in this copy of the source.
    for sol in all_solutions:
        # NOTE(review): loop body missing in this copy of the source.

    print 'template_dict'
    for template,count_dict in template_dict.iteritems():
        print ''
        print 'template'
        print template
        print ':'
        # pprint.pprint returns None, so this statement also prints "None"
        # after the pretty-printed dict.
        print pprint.pprint(count_dict)
# Example 8
def run(folderOfData, destFolder):

    # Constants and initial lists
    getRidOfStars = True
    rewrite_pipeline_toggle = False
    sharedVarThreshold = 1
    # solnum -> trace
    progTraceDictAll = {}
    argAndReturnVarInfo = {}


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## populate_from_pickles
    ## adds to:
    ##  progTraceDictAll: solnum -> trace
    ##  argAndReturnVarInfo: solnum -> { args, returnVars }

    def populate_from_pickles(pickleSrc, formattedSrc, formattedExtn='.py.html'):
        """
        Load every pickle file in pickleSrc into the enclosing dictionaries.

        For each file, the solution number is the part of the file name before
        the first dot. The unpickled dict's 'trace' entry is stored in
        progTraceDictAll under the string solution number; its 'args' and
        'returnVars' entries are stored in argAndReturnVarInfo under the
        integer solution number.

        pickleSrc: string, path to the folder of pickle files
        formattedSrc, formattedExtn: currently unused; kept so existing
            callers keep working.

        Raises ValueError if a file name does not start with an integer.
        """
        print("Loading data")
        for filename in os.listdir(pickleSrc):
            solNum = filename.split('.')[0]
            solNumInt = int(solNum)
            print(solNum)

            # Pickle data is binary, so read it in 'rb' mode.
            # NOTE(review): pickle.load can execute arbitrary code; only point
            # this at pickle files produced by this pipeline.
            with open(path.join(pickleSrc, filename), 'rb') as f:
                unpickled = pickle.load(f)
            progTraceDictAll[solNum] = unpickled['trace']

            argAndReturnVarInfo[solNumInt] = {
                'args': unpickled['args'],
                'returnVars': unpickled['returnVars'],
            }

    populate_from_pickles(path.join(folderOfData, 'pickleFiles'), path.join(folderOfData, 'tidyDataHTML'))

    class ElenaEncoder(json.JSONEncoder):
        """JSON encoder that additionally accepts sets and plain functions."""

        def default(self, obj):
            """
            Return a JSON-serializable stand-in for obj.

            A set becomes {'type': 'set', 'list': [...members...]} and a
            function becomes {'type': 'function'}. Anything else is handed to
            the base encoder, which raises TypeError.
            """
            if isinstance(obj, set):
                encoded = {'type':'set', 'list':list(obj)}
            elif isinstance(obj, types.FunctionType):
                encoded = {'type':'function'}
            else:
                encoded = json.JSONEncoder.default(self, obj)
            return encoded

    def dumpOutput(data, filename, sort_keys=True, indent=4):
        """
        Write data as JSON to the file named filename inside destFolder.

        sort_keys and indent are forwarded to json.dump. ElenaEncoder is used
        as the encoder class.
        """
        encoder_options = dict(sort_keys=sort_keys, indent=indent, cls=ElenaEncoder)
        with open(path.join(destFolder, filename), 'w') as out_file:
            json.dump(data, out_file, **encoder_options)


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## getVarEquivsAcrossAllSolsLinear
    ## adds to:
    ##  dictOfNamesAndFilesIndexedByVarSeqTempName: tempName -> (local name, solnum)
    ##  dictTempNameToSequence: tempName -> value sequence
    ## for each trace
    ##      for each local variable
    ##          extract sequence of values
    ##          if we have already seen it,
    ##              add (local name, solnum) to the correct entry in dictOfNames
    ##          else
    ##              add a new entry in dictOfNames: tempName -> (local name, solnum)
    ##              add a new entry in dictTemp: tempName -> sequence

    '''Linearly accumulate variables that have the same sequence of values across
    multiple solutions'''
    def extractSequence(column):
        """
        Return the sequence of values a variable takes, without repeats in a row.

        column: iterable of indexable entries where entry[1] is the value
            (assumes entry[0] is the step it was recorded at -- TODO confirm).

        Entries whose value is 'myNaN' or None are ignored. A value is kept
        only when it differs from the previously kept value, so runs of equal
        values collapse to a single element.

        returns: list of values
        """
        valueSequence = []
        for elem in column:
            val = elem[1]
            if val == 'myNaN' or val is None:
                continue
            # Keep the first value, then only values that differ from the last kept one.
            if not valueSequence or val != valueSequence[-1]:
                valueSequence.append(val)
        return valueSequence

    # sequence of values is extracted and compared to what we already have
    def isThisVarSeqInOurDict(localVarData):
        """
        Look up the temp name already associated with this variable's values.

        localVarData: trace column for one local variable, in the form
            accepted by extractSequence.

        returns: the key of dictTempNameToSequence whose stored sequence equals
            the sequence extracted from localVarData, or None if there is none.
            Keys may be 0, so callers must compare the result against None.
        """
        # The extracted sequence is the same for every comparison; compute it once.
        candidateSequence = extractSequence(localVarData)
        for tempVarName, tempVarData in dictTempNameToSequence.iteritems():
            if tempVarData == candidateSequence:
                return tempVarName
        return None

    # temporary variable name to (local name, solution number)
    dictOfNamesAndFilesIndexedByVarSeqTempName = {}
    dictTempNameToSequence = {}

    def getVarEquivsAcrossAllSolsLinear():
        """
        Group local variables from every trace by their sequence of values.

        For each solution in progTraceDictAll and each of its local variables,
        the value sequence is extracted. If an equal sequence is already known,
        (local name, solution number) is added to that entry of
        dictOfNamesAndFilesIndexedByVarSeqTempName. Otherwise a new integer
        temp name is allocated and registered in both
        dictOfNamesAndFilesIndexedByVarSeqTempName and dictTempNameToSequence.

        returns: list of solution numbers skipped because their trace has no
            '__return__' entry.

        NOTE(review): the skip/continue/else branches were missing from this
        copy of the source. They are reconstructed from the algorithm comment
        above this section and from the names used -- confirm against the
        original.
        """
        numSkippedSols_tooLong = []

        # This will not remove duplicates within one solution
        for k,v in progTraceDictAll.iteritems():
            # k is the solution number, v is the trace of variable names and values

            # A trace without a return value is recorded as skipped and ignored.
            if '__return__' not in v:
                numSkippedSols_tooLong.append(k)
                continue

            for localVarName, localVarData in v.iteritems():
                # Double-underscore names are bookkeeping entries, not student variables.
                if localVarName.startswith('__'):
                    continue
                tempVarName = isThisVarSeqInOurDict(localVarData)
                theExtractedSequence = extractSequence(localVarData)
                if len(theExtractedSequence)==1 and type(theExtractedSequence[0]) is str:
                    if theExtractedSequence[0].startswith('__'):
                        # A function definition
                        continue
                if tempVarName is not None:
                    # we already have the variable sequence in our dict
                    dictOfNamesAndFilesIndexedByVarSeqTempName[tempVarName].append((localVarName,k))
                else:
                    # Temp names are consecutive integers starting at 0.
                    if dictOfNamesAndFilesIndexedByVarSeqTempName:
                        newTempVarName = max(dictOfNamesAndFilesIndexedByVarSeqTempName.keys()) + 1
                    else:
                        newTempVarName = 0
                    dictOfNamesAndFilesIndexedByVarSeqTempName[newTempVarName] = [(localVarName,k)]
                    dictTempNameToSequence[newTempVarName] = theExtractedSequence
        return numSkippedSols_tooLong

    print "Getting variable equivalents"
    numSkippedSols_tooLong = getVarEquivsAcrossAllSolsLinear()


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## Populating dictForJson
    ## adds to:
    ##  dictForJson: solnum -> {
    ##      weirdVars: [ local name, ... ]
    ##  }
    ## find the number of occurrences of each abstract variable
    ## for each number, in descending order
    ##      for each abstract variable with only 1 occurrence
    ##          add entry to dictForJson[solnum][weirdVars]

    '''Begin populating a json file (dictForJson) that is indexed by
    the solution name and includes things like the weird and common variable names'''

    print "Populating dictForJson"
    dictForJson = {}
    for nums in sorted(set(map(len, dictOfNamesAndFilesIndexedByVarSeqTempName.values())),reverse=True):
        for k,v in dictOfNamesAndFilesIndexedByVarSeqTempName.iteritems():
            if nums == len(v) and nums == 1:
                for (localVar, solname) in v:
                    if str(solname) in dictForJson.keys():
                        dictForJson[solname]['weirdVars']= [localVar]

    '''Determine what the common name of a variable should be
    by extracting the most common variable name'''
    def extractVarName(tempName):
        """
        Return the most common local name among the variables grouped under tempName.

        tempName: key into dictOfNamesAndFilesIndexedByVarSeqTempName, whose
            value is a list of (local name, solution number) tuples.

        Raises IndexError if no name could be counted.

        NOTE(review): the try/except around the counter update was missing
        from this copy of the source. It is reconstructed from the surviving
        "failed" message -- confirm against the original.
        """
        accumulatedVarNames = [tup[0] for tup in dictOfNamesAndFilesIndexedByVarSeqTempName[tempName]]
        cnt = Counter()
        for word in accumulatedVarNames:
            try:
                cnt[word] += 1
            except TypeError:
                # An unhashable name cannot be counted; report it and carry on.
                print('%s  failed' % (word,))
        return cnt.most_common(1)[0][0]

    dictOfSeqAndCommonNameByTempVar = {}
    dictOfNumFilesByCommonName = {}


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## Extract variable names
    ## adds to:
    ##  dictOfNumFilesByCommonName: canon name -> [ number of files, ... ]
    ##  NOTE: multiple elements in the list means that more than one abstract
    ##        variable has the same canon name
    ##  NOTE: the list is sorted in descending order
    ## for each temp variable
    ##      get the canon name of the variable (the most common local name)
    ##      add to dictOfNumFilesByCommonName

    print "Extracting variable names"
    # Sweep through and accumulate file lengths into a dict
    for tempKey, listOfFileAndVars in dictOfNamesAndFilesIndexedByVarSeqTempName.iteritems():
        numFiles = len(listOfFileAndVars)
        if numFiles > sharedVarThreshold:

            canonName = extractVarName(tempKey)

            #update the max files associated with a common name
            if canonName in dictOfNumFilesByCommonName:
                dictOfNumFilesByCommonName[canonName] = sorted(dictOfNumFilesByCommonName[canonName],reverse=True)
                dictOfNumFilesByCommonName[canonName] = [numFiles]

    # Write out dictOfNamesAndFilesIndexedByVarSeqTempName to json so I can visualize it
    print "Writing variables by varSeqTempName"


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## Add common names
    ## adds to:
    ##  dictOfSeqAndCommonNameByTempVar: temp name -> {
    ##      howManyFiles
    ##      commonName
    ##      commonNameWithSuffix
    ##      sequence
    ##  }
    ##  dictOfSeqByCommonNamePlusSuffix: canon name with suffix -> sequence of values
    ##  dictForJson: solnum -> {
    ##      sharedVars: [ canon name with suffix, ...]
    ##      localVars: [ local variable name, ...]
    ##      commonNameAppend: [ suffix, ...]
    ##      commonName: [ canon name, ...]
    ##      *** may also have weirdVars from above
    ##  }
    ## for each temp variable
    ##      count the number of files it appears in
    ##      get canon name and add suffix if necessary
    ##      add sequence of values to dictOfSeqBy... and dictOfSeqAnd...
    ##      for each (local variable, solnum) the temp var occurs in
    ##          add to dictForJson

    '''Add more info to dictForJson, like the common names for each variable, and
    modifiers to that common name, to indicate with sequence it takes on...'''

    print "Adding common names"
    dictOfSeqByCommonNamePlusSuffix = {}
    for tempKey, listOfFileAndVars in dictOfNamesAndFilesIndexedByVarSeqTempName.iteritems():
        numFiles = len(listOfFileAndVars)
        if numFiles > sharedVarThreshold:
            dictOfSeqAndCommonNameByTempVar[tempKey] = {}
            dictOfSeqAndCommonNameByTempVar[tempKey]['howManyFiles'] = numFiles

            canonName = extractVarName(tempKey)

            offSetFromMax = dictOfNumFilesByCommonName[canonName].index(numFiles)
            if offSetFromMax > 0:
                canonAppend = '___' + str(offSetFromMax + 1)
                canonAppend = ''

            dictOfSeqAndCommonNameByTempVar[tempKey]['commonName'] = canonName
            dictOfSeqAndCommonNameByTempVar[tempKey]['commonNameWithSuffix'] = canonName+canonAppend

            seqStr = dictTempNameToSequence[tempKey]
            dictOfSeqByCommonNamePlusSuffix[canonName+canonAppend] = seqStr

            dictOfSeqAndCommonNameByTempVar[tempKey]['sequence'] = seqStr
            for localVarName, solNum in listOfFileAndVars:
                if solNum in dictForJson:
                    if 'sharedVars' in dictForJson[solNum]:
                        dictForJson[solNum]['sharedVars']= [canonName+canonAppend]
                        dictForJson[solNum]['localVars'] = [localVarName]
                        dictForJson[solNum]['commonNameAppend'] = [canonAppend]
                        dictForJson[solNum]['commonName']= [canonName]
                    dictForJson[solNum]['sharedVars']= [canonName+canonAppend]
                    dictForJson[solNum]['localVars'] = [localVarName]
                    dictForJson[solNum]['commonNameAppend'] = [canonAppend]
                    dictForJson[solNum]['commonName']= [canonName]

    print "Writing common variable legend and main json file"
    dumpOutput(dictOfSeqByCommonNamePlusSuffix, 'commonVarLegend.json')
    dumpOutput(dictForJson, 'dictForJson.json')


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## getFreeVars
    ## adds to:
    ##  freeVars: solnum -> [ local variable, ...]
    ##  argAndReturnData: solnum -> {
    ##      studentCreatedVars: freeVars[solnum]
    ##      *** also args, returnVals from earlier
    ## }
    ## for each (solnum, dictForJson entry)
    ##      for each local variable and weird variable
    ##          if the variable is not an argument or return variable
    ##              add it to freeVars

    def getFreeVars(dictOfLocalAndCommonVarNames, argAndReturnData):
        """
        Collect, per solution, the variables that are neither arguments nor
        return variables.

        dictOfLocalAndCommonVarNames: dict: solnum -> dict that may contain
            'localVars' and/or 'weirdVars' lists of local variable names.
        argAndReturnData: dict: int(solnum) -> dict with 'args' and
            'returnVars' lists. Each entry is updated in place with a
            'studentCreatedVars' key holding that solution's free variables.

        returns: dict: solnum -> list of free variable names, local variables
            first, then weird variables.

        Raises KeyError if a solution has no entry in argAndReturnData.
        """
        freeVars = {}
        for k,v in dictOfLocalAndCommonVarNames.items():
            solInfo = argAndReturnData[int(k)]
            freeVars[k] = []
            if 'localVars' in v:
                for localVar in v['localVars']:
                    if localVar not in solInfo['args'] and localVar not in solInfo['returnVars']:
                        freeVars[k].append(localVar)
            if 'weirdVars' in v:
                print('there are weirds! %s %s' % (k, v['weirdVars']))
                for weirdVar in v['weirdVars']:
                    if weirdVar not in solInfo['args'] and weirdVar not in solInfo['returnVars']:
                        freeVars[k].append(weirdVar)
            solInfo['studentCreatedVars'] = freeVars[k]
        return freeVars

    print "Getting free variables"
    freeVars = getFreeVars(dictForJson,argAndReturnVarInfo)

    print "Writing arg and return value info and free vars"
    dumpOutput(argAndReturnVarInfo, 'argAndReturnVarInfo.json')
    dumpOutput(freeVars, 'freeVars.json')


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## produceFooBarBazJson
    ## adds to:
    ##  listOfSolDictsForTable: [ ordered dict {
    ##      solution: solnum
    ##      arguments: sorted list of arguments
    ##      studentCreatedVars: sorted list of studentCreatedVars and return variables
    ## }]
    ## for each solution in argAndReturnVarInfo
    ##      create an ordered dict and add it to the list

    def produceFooBarBazJson(argsAndCode):
        """
        Build one table row per solution from its argument and variable info.

        argsAndCode: dict: solnum -> dict that may contain 'args',
            'studentCreatedVars' and 'returnVars' lists of names.

        returns: list of OrderedDicts, one per solution, with keys in this order:
            'solution': int(solnum)
            'arguments': sorted argument names ([] when 'args' is absent)
            'studentCreatedVariables': sorted student-created and return
                variable names
            Every name has its first character lower-cased before sorting.
        """
        import collections

        def loweringfunc(s):
            # Lower-case only the first character; an empty name stays empty.
            return s[:1].lower() + s[1:] if s else ''

        listOfSolDictsForTable = []
        for solnum,soldata in argsAndCode.items():
            dictWithName = collections.OrderedDict()
            dictWithName['solution'] = int(solnum)
            if 'args' in soldata:
                dictWithName['arguments'] = sorted(loweringfunc(name) for name in soldata['args'])
            else:
                dictWithName['arguments'] = []
            studCreatedVars = []
            if 'studentCreatedVars' in soldata:
                studCreatedVars += soldata['studentCreatedVars']
            if 'returnVars' in soldata:
                studCreatedVars += soldata['returnVars']
            dictWithName['studentCreatedVariables'] = sorted(loweringfunc(name) for name in studCreatedVars)
            # Without this append the function would always return an empty list.
            listOfSolDictsForTable.append(dictWithName)

        return listOfSolDictsForTable

    print "Producing and writing solution dicts for table"
    listOfSolDictsForTable = produceFooBarBazJson(argAndReturnVarInfo)
    dumpOutput(listOfSolDictsForTable, 'listOfSolDictsForTable.json')


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## collect common variables
    ## adds to:
    ##  allCommonVar: set of all shared variables from all solutions (via dictForJson)

    '''Collect all common vars into a single list.'''

    print "Collecting common variables"
    allCommonVar = set()
    for sol,varDict in dictForJson.iteritems():
        if 'sharedVars' in varDict.keys():


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## collect common variables
    ## adds to:
    ##  dictForExhibit: {
    ##      items: [{
    ##          *** from dictForJson:
    ##          sharedVars
    ##          localVars
    ##          commonNameAppend
    ##          commonName
    ##          weirdVars (maybe)
    ##          type: 'solution'
    ##          label: solnum
    ##          fnames: list of every node the AST visitor found
    ##          canonicalPycode: list of strings: non-empty lines of stripped, renamed source
    ##          canonicalPYcodeIndents: list of ints: size of indentation for each non-empty
    ##                                  line of the renamed source
    ##          code: colorful HTML version of the renamed source
    ##      }]
    ##      properties: {}
    ##      types: {
    ##          Answer: {}
    ##          pluralLabel: 'Answers'
    ##      }
    ##  }
    ##  phraseCounter: Counter of lines of whitespace-stripped code
    ##  tabCounter: dict: stripped line of code -> Counter of the size of the indentation
    ##                    before that line
    ## for each dict of solution info in dictForJson
    ##      if there are multiple instances of the same abstract variable
    ##          find the indices of the shared instances
    ##          if the canon name and local name are not equal
    ##              change the shared name to <canon>_<local>__
    ##      make a temporary copy of the solution info dict
    ##      add keys to the temp dict:
    ##          'type': 'solution'
    ##          'label': solnum
    ##          'fnames': the result of calling fnames on the tidy solution
    ##      for each abstract variable in the solution
    ##          rename the local variable to <canon>_temp
    ##          re-rename the local variable to the canon name
    ##      for each weird variable in the solution
    ##          if the weird variable shares the canon name with another abstract var
    ##              rename the variable to <weird>__
    ##      add key to temp dict:
    ##          'canonicalPYcode': a list of non-empty lines in the renamed source
    ##                             with whitespace removed
    ##      write the renamed source to <folderOfData>/tidyDataCanonicalized/<solnum>.py
    ##      update phraseCounter with the stripped, renamed lines of source
    ##      add key to temp dict:
    ##          'canonicalPYcodeIndents': a list of the size of leading indentation for
    ##                                    each non-empty line in the renamed source
    ##      if no problems were encountered in any of the renamings
    ##          format the source as colorful HTML
    ##          write the pretty code into <folderOfData>/tidyDataCanonicalizedHTML/<solnum>.py
    ##          add key to temp dict:
    ##              'code': the pretty code
    ##      append the temp dict to dictForExhibit[items]

    '''Create JSON with format necessary for Exhibit'''
    dictForExihibit = {}

    dictForExihibit['items'] = []
    dictForExihibit['properties'] = {}
    dictForExihibit['types'] = {}

    dictForExihibit['types']['Answer'] = {}
    dictForExihibit['types']['Answer']['pluralLabel'] = 'Answers'

    numSkippedSols_commonVarClash = []
    numSkippedSols_weirdVarClash = []

    phraseCounter = Counter()
    tabCounter = {}

    src_skipped_by_philip = []
    solnum_skipped_by_philip = []
    numSkippedSols_rewritePipeline = []
    numSkippedSols_NoSharedVars = []

    print "Creating dict for Exhibit"
    for solNum, solDict in dictForJson.iteritems():
        print solNum
        flagged = False
            if len(solDict['sharedVars'])>len(set(solDict['sharedVars'])):
                print 'this solution has multiple indistinguishable instances of a shared variable; fixing!'

                for sv in solDict['sharedVars']:
                    indices = [i for i, x in enumerate(solDict['sharedVars']) if x == sv]
                    if len(indices)>1:
                        for ind in indices:
                            if not solDict['sharedVars'][ind] == solDict['localVars'][ind]: #if it's common and local names are i, don't make it i_i
                                solDict['sharedVars'][ind] = solDict['sharedVars'][ind]+'_'+solDict['localVars'][ind]+'__'
            print 'no sharedVars in vardict... Why?', sol, varDict

        tempDict = solDict.copy()
        tempDict['type'] = 'Solution'
        tempDict['label'] = solNum
        tidyPath = path.join(folderOfData, 'tidyData', solNum + '.py')
            tempDict['fnames'] = fnames.main(tidyPath)
            print 'warning: no fnames for ', solNum

        with open(tidyPath,'U') as f:
            read_data =

        renamed_src = read_data

        extraToken = '_temp' # a * didn't work so well
        if 'sharedVars' in tempDict.keys():
            for i in range(len(tempDict['sharedVars'])):
                locVarName = tempDict['localVars'][i]
                sharedVarNameWithStar = tempDict['sharedVars'][i]+extraToken 
                    renamed_src = identifier_renamer.rename_identifier(renamed_src, locVarName,sharedVarNameWithStar)
                    print 'Could not run Philip renamer; skipping!'
                    flagged = True
                if rewrite_pipeline_toggle:
                        renamed_src = rewrite_pipeline.reorderVariables(renamed_src,togVar)
                        print 'skipping!'
                        flagged = True

            if getRidOfStars:
                for i in range(len(tempDict['sharedVars'])):

                    sharedVarNameWithStar = tempDict['sharedVars'][i]+extraToken
                    sharedVarNameWithOutStar = tempDict['sharedVars'][i]

                        renamed_src = identifier_renamer.rename_identifier(renamed_src, sharedVarNameWithStar,sharedVarNameWithOutStar)
                        print 'Could not run Philip renamer; skipping!'
                        flagged = True

        if 'weirdVars' in tempDict.keys():
            for weirdInstance in tempDict['weirdVars']:
                if weirdInstance in allCommonVar:
                        renamed_src = identifier_renamer.rename_identifier(renamed_src, weirdInstance,weirdInstance+'__')
                        print 'Could not run Philip renamer; skipping!'
                        flagged = True

        tempDict['canonicalPYcode'] = [el.strip() for el in renamed_src.split('\n') if not (el == '')]
        with open(folderOfData+'/tidyDataCanonicalized/'+solNum+".py",'w') as g:

        phraseCounter.update([el.strip() for el in renamed_src.split('\n')  if not (el == '')])
        tempDict['canonicalPYcodeIndents'] = []
        for unstrippedLine in renamed_src.split('\n'):
            if not (unstrippedLine == ''):
                strippedLine = unstrippedLine.strip()
                leadingSpace = len(unstrippedLine) - len(strippedLine) #how much was lobbed off?
                if strippedLine in tabCounter.keys():
                    tabCounter[strippedLine] = Counter()

        if not flagged:
  "pygmentize -O style=colorful,linenos=1 -o "+folderOfData+'/tidyDataCanonicalizedHTML/'+solNum+".py.html"+" "+folderOfData+'/tidyDataCanonicalized/'+solNum+".py", shell=True)

            with open(folderOfData+'/tidyDataCanonicalizedHTML/'+solNum+".py.html",'U') as myfile:
                htmlCode =
                htmlCodeFormatted = htmlCode.replace("\"","'").replace("\n","<br>")
                tempDict['code'] = htmlCodeFormatted

    print "Writing dictForExihibit and phraseAndTabCounter"
    dumpOutput(dictForExihibit, 'dictForExihibit.json')

    phraseAndTabCounter = {}
    phraseAndTabCounter['phraseCounter'] = phraseCounter
    phraseAndTabCounter['tabCounter'] = tabCounter

    # Note: this includes phrases and tab counts that never made it into the other JSON files because their solutions were skipped
    dumpOutput(phraseAndTabCounter, 'phraseAndTabCounter.json')


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## making dictOfRepresentatives
    ## adds to:
    ##  dictOfRepresentatives: dict: solnum -> {
    ##      rep: {
    ##          <same as dictForExhibit items above>
    ##      }
    ##      count: int
    ##      members: [ solnum, ...]
    ##  }
    ## initialize dictOfRepresentatives with the first item in dictForExhibit
    ## for each dict of info in dictForExhibit (beyond the first)
    ##      for each representative
    ##          compare the item in question with the representative by comparing
    ##          sets of canonicalized lines
    ##              if the item matches the representative
    ##                  increase the representative's count
    ##                  add the item's solnum to the representative's members
    ##              otherwise, make it a new representative
    '''Create JSON with format necessary for D3 prototype'''

    theJSON = dictForExihibit.copy()
    dictOfRepresentatives = {}
    dictOfRepresentatives[theJSON['items'][0]['label']]= {}
    dictOfRepresentatives[theJSON['items'][0]['label']]['rep'] = theJSON['items'][0]
    dictOfRepresentatives[theJSON['items'][0]['label']]['count'] = 1
    dictOfRepresentatives[theJSON['items'][0]['label']]['members'] = [theJSON['items'][0]['label']]

    print "Finding representatives"
    for JSONitem in theJSON['items'][1:]:
        for groupID, REPitem in dictOfRepresentatives.iteritems():
            if set(REPitem['rep']['canonicalPYcode']) == set(JSONitem['canonicalPYcode']):
            dictOfRepresentatives[JSONitem['label']] = {'rep': JSONitem, 'count': 1, 'members' : [JSONitem['label']] }

    print "Writing repDict"
    dumpOutput(dictOfRepresentatives, 'repDict.json')


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## finding misfits
    ## adds to:
    ##  misfitMembers: list of solnums
    ##  repCounts: list of tuples (solnum, count)
    ##  repDataDict: {
    ##      'misfitMembers': misfitMembers above
    ##      'repCountsSorted': repCounts above, sorted by group size, descending
    ##  }
    ## for each representative
    ##      if the representative is the only member, append it to misfitMembers
    ## add to repCounts: ('misfits', number of misfits)
    ## for each representative with more than one member
    ##      add its label and count to repCounts
    ## initialize repDataDict

    print "Finding misfits"
    misfitTotal = 0
    misfitMembers = []
    for k, v in dictOfRepresentatives.iteritems():
        if v['count']==1:
            misfitTotal += 1

    repCounts = []
    for k, v in dictOfRepresentatives.iteritems():
        if v['count'] != 1:
            repCounts.append((k, v['count']))

    newlist = sorted(repCounts, key=lambda k: k[1],reverse = True)

    repDataDict = {}
    repDataDict['misfitMembers']= misfitMembers
    repDataDict['repCountsSorted'] = newlist

    print "Writing sorted rep dict"
    dumpOutput(repDataDict, 'repDictSorted.json')


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## set up solutions, phrases, variables
    ## adds to:
    ##  solutions: list of dicts: {
    ##      code: colorful html version of a representative's source
    ##      phraseIDs: list of ints
    ##      variableIDs: list of ints
    ##      lines: list of dicts: { indent, phraseID }
    ##      keywords: list of every node the AST visitor found
    ##      number: int: solnum of representative
    ##      members: list of solnums of representative's group
    ##      count: int: size of representative's group
    ##  }
    ##  phrases: list of strings
    ##  variables: list of strings
    ##  sequences: list of sequences of values
    ## for each group and representative
    ##      for each line of code in the representative
    ##          initialize a solution dict
    ##          append the line to phrases if it's not already there
    ##          the ID of the phrase is the index in phrases + 1
    ##          add the phrase ID to solution[phraseIDs]
    ##          add the indent and phraseID to solution[lines]
    ##      for each abstract variable in the representative that isn't a "weird" variable (name ending in '__')
    ##          add the variable to variables if it is not already there
    ##          add the sequence of that variable to sequences if it is not already there
    ##          the ID of the variable is its index in variables plus 1
    ##          add the variable ID to solution[variableIDs]
    ##      convert phraseIDs and variableIDs from sets to lists

    raw_solutions = dictOfRepresentatives
    solutions = []
    phrases = []
    variables = []
    sequences = []

    print "Setting up solutions, phrases, variables, sequences"
    for groupID, groupDescription in raw_solutions.iteritems():
        solution = {'code': groupDescription['rep']['code']}
        solution['phraseIDs'] = set()
        solution['variableIDs'] = set()
        solution['lines'] = []
        for i in range(len(groupDescription['rep']['canonicalPYcode'])):
            phrase = groupDescription['rep']['canonicalPYcode'][i]
            if phrase not in phrases:
            phraseID = phrases.index(phrase) + 1
            lineDict = {}
            lineDict['indent'] = groupDescription['rep']['canonicalPYcodeIndents'][i]
            lineDict['phraseID'] = phraseID
        if 'sharedVars' in groupDescription['rep'].keys():
            for sharVar in groupDescription['rep']['sharedVars']:
                if not sharVar.endswith('__'):
                    if sharVar not in variables:
                    varID = variables.index(sharVar) + 1
            solution['variableIDs'] = []
        if 'fnames' in groupDescription['rep'].keys():
            solution['keywords'] = groupDescription['rep']['fnames']
        solution['number'] = int(groupDescription['rep']['label'])
        solution['members'] = list(groupDescription['members'])
        solution['phraseIDs'] = list(solution['phraseIDs'])
        solution['variableIDs'] = list(solution['variableIDs'])
        solution['count'] = int(groupDescription['count'])

    print "Writing solutions"
    dumpOutput(solutions, 'solutions.json')

    def generateCodeWithFeatureSpans(phrase):
        """
        Wrap each alphanumeric token of a line of code in a feature <span>.

        phrase: string, one line of source code

        The line is split on runs of non-word characters; the separators are
        kept because the pattern uses a capturing group. Every piece for which
        str.isalnum() is true becomes
            <span class='feature feature-TOKEN'>TOKEN</span>
        and every other piece (whitespace, operators, punctuation, names that
        contain '_') is copied through unchanged, so the text of the line is
        preserved.

        returns: string, the line with the spans inserted
        """
        pieces = []
        for tok in re.split(r'(\W+)', phrase):
            if tok.isalnum():
                pieces.append("<span class='feature feature-" + tok + "'>" + tok + "</span>")
            else:
                # Separators and non-alphanumeric tokens pass through verbatim;
                # without this branch they were dropped and the alphanumeric
                # tokens were emitted twice.
                pieces.append(tok)

        return ''.join(pieces)


      #   #   ###   #####  #####
      ##  #  #   #    #    #
      # # #  #   #    #    ###
      #  ##  #   #    #    #
      #   #   ###     #    #####

    ## last bit of stuff
    ## for each phrase
    ##      change the phrase to a dict: {
    ##          id: index in list + 1
    ##          code: escaped (HTML-safe) line
    ##          indent: most common indent size for that phrase
    ##          codeWithFeatureSpans: line with each word changed to a <span>
    ##      }
    ## for each variable
    ##      change the variable to a dict: {
    ##          id: index in list + 1
    ##          varName: variable name
    ##          varNameAndSeq: <name>:<str(sequence)>
    ##          sequence: sequence
    ##      }

    print "Finding most common indent and writing phrases"
    for i in range(len(phrases)):
        phrase = phrases[i]
        mostCommonIndent = tabCounter[phrase].most_common(1)[0][0]
        phrases[i] = {'id': i+1, 'code': cgi.escape(phrase), 'indent': mostCommonIndent,'codeWithFeatureSpans':generateCodeWithFeatureSpans(phrase)} #, 'aveLineNum':aveLineNum}

    dumpOutput(phrases, 'phrases.json')

    print "Collecting and writing variables"
    for i in range(len(variables)):
        variable = variables[i]
        sequence = sequences[i]
        variables[i] = {'id': i+1, 'varName': variable, 'varNameAndSeq': variable + ':'+str(sequence), 'sequence': sequence } #, 'aveLineNum':aveLineNum}

    dumpOutput(variables, 'variables.json')