Example #1
def raw_converter(subject=None,
                  grade=None,
                  form=None,
                  ds_raw_scores=None,
                  ds_standards=None,
                  odict_loc_subs=None,
                  std_grade_column='grade',
                  std_form_column='form',
                  std_location_column='location',
                  ds_out=None,
                  ds_report2=None,
                  ds_sumcheck_out=None,
                  ds_sumcheck_report2=None):
    """ Verify, correct, or create raw_score conversion values.

    Extended Summary:
    =================
    converter() - "convert" raw score to scaled score and possibly other values via lookups 
    into conversion tables (CT) embodied in excel worksheets.

    Parameters:
    ===========
    ds_out: Dataset

      - The output is the modified dataset restricted to the raw score variables and 
        conversion output variables
        listed in the semantic standards sheet for the subject and grade. The conversion 
        variable values are the correct values given by the semantic standards sheet 
        and the spreadsheets that it references.

    ds_report2: Dataset (writeable)
        - Any errors found are reported here
        
    odict_loc_subs: OrderedDict
        - An ordered dictionary where each key is a regular expression pattern to
          match a location value substring and the value is the replacement string.
        - Order is important, so if the caller did not use an ordered dictionary,
          it is converted into one.
        - Each substitution is tried in order. See the example below.
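
    Example:
    ========
    A minimal illustrative odict_loc_subs; the patterns, replacement path, and
    workbook file name below are hypothetical, not production values:

        from collections import OrderedDict
        odict_loc_subs = OrderedDict([
            (r"'", ''),                    # first strip stray quote characters
            (r'&ctpath\.', '/data/ct/'),   # then rewrite the location prefix
        ])
        # Applied in order via re.sub(), e.g.
        # "'&ctpath.OGT_Math_CT.xls'"  ->  "/data/ct/OGT_Math_CT.xls"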
    """
    me = "raw_score_converter:"
    print(
        "\n%s: Starting with subject=%s,grade=%s,form=%s,ds_raw_scores=%s\n" %
        (me, subject, grade, form, repr(ds_raw_scores)))
    # NB: may want to add params for 'conversion_column_names' and 'standard_column_names',
    # maybe subjects or other names if such names ever vary, and set default values.
    # For now, this method hardcodes several dataset column names and some other names.

    if subject is None:
        raise ValueError("A subject must be specified")
    subject = subject.lower().replace(' ', '_')
    subjects = ("math", "reading", "science", "writing", "soc_stud")
    if subject not in subjects:
        raise ValueError("subject must be in %s" % str(subjects))

    if grade is None:
        raise ValueError("An integer grade must be specified")
    grade = str(grade).lower()
    igrade = 0
    if (grade != 'all'):
        igrade = int(float(grade))
        if igrade < 8 or igrade > 15:
            raise ValueError("Given grade %s is out of range" % grade)

    if form is None:
        form = "all"
    else:
        form = str(form).lower()

    if ds_raw_scores is None:
        raise ValueError("ds_raw_scores must be given")

    if ds_sumcheck_out is None:
        raise ValueError("ds_sumcheck_out must be given")

    if ds_sumcheck_report2 is None:
        raise ValueError("ds_sumcheck_report2 must be given")

    if ds_standards is None:
        raise ValueError("A ds_standards dataset must be specified.")
    if odict_loc_subs is not None:
        # If not an ordered dict, convert it into one
        if not isinstance(odict_loc_subs, OrderedDict):
            odict_loc_subs = OrderedDict(odict_loc_subs)
    print(
        "\nStarting %s params: subject=%s,grade=%s, form=%s, \n\tds_raw_scores=%s, "
        "\n\tds_standards=%s, \n\tds_out=%s, \n\todict_loc_subs=%s" %
        (me, subject, grade, form, repr(ds_raw_scores), repr(ds_standards),
         repr(ds_out), repr(odict_loc_subs)))

    print("Reading standards for grade='%s', form='%s' and subject='%s'..." %
          (grade, form, subject))

    # Get reader for standards worksheet. It has about 5 or 6 'standards' rows for each of 5 subjects.
    # Each row provides a raw_score_variable name and 4 "output" conversion variable names as found in the
    # input dataset.
    # None of the variable names is (none should be) duplicated among all rows of the standards worksheet.
    # Each row also provides a workbook and sheetname to identify a worksheet of a conversion table that
    # is used to take the raw_score_variable value from the input and map it to each of the 'output conversion'
    # values that correspond to the output conversion variable names in the standards sheet.
    # If a value is found in the input for a raw_score_variable that is not keyed in its conversion table,
    # an error is logged to ds_report2.
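
    # Illustrative standards row as seen by the DictReader (the variable and
    # file names here are hypothetical, not the real worksheet values):
    #   {'grade': '10', 'subjects': 'math', 'form': 'all',
    #    'location': '&ctpath.\\OGT_Math_CT.xls', 'sheetname': 'Math',
    #    'raw_score_variable': 'mathraw',
    #    'scaled_score_variable': 'mathss', 'level_variable': 'mathlevel',
    #    'label_variable': 'mathlabel', 'stderror_variable': 'mathse'}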

    reader_standards = ds_standards.DictReader()
    required_standards_columns = [
        std_grade_column, 'subjects', std_form_column, std_location_column,
        'sheetname', 'raw_score_variable'
    ]
    for rq_col in required_standards_columns:
        if rq_col not in reader_standards.fieldnames:
            ValueError("Required column name '%s' is not in %s" %
                       (rq_col, repr(ds_standards)))

    # Read and set up the standards info for this grade, form, subject.
    # Dict rawscorevar_ctable: (1) key is a rawscore variable name in the
    # data input file and (2) value is conversion table for it.
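    # Shape sketch (the raw score variable name 'mathraw' is hypothetical):
    #   rawscorevar_ctable['mathraw'] -> OrderedDict keyed by raw score value,
    #   whose entries are built below as ctrawval_conversions.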
    rawscorevar_ctable = OrderedDict()

    num_standards = 0
    standard_raw_varnames = []
    standard_total_varname = ""
    for idx, row_std in enumerate(reader_standards):
        # row_std is next row in semantic workbook, sheet standards
        # use only those rows with/for given grade, subject, form
        std_grade = row_std[std_grade_column].lower()
        std_subject = row_std['subjects'].lower()
        std_form = row_std[std_form_column].lower()

        if (((grade != std_grade) and (std_grade != "all"))
                or (form != std_form) or (subject != std_subject)):
            # Skip irrelevant row for these params
            continue

        # This standards(standard form) row, via composite columns location
        # and sheetname, identifies a conversion sheet/table (CT).
        # The CT is used to convert/map input data values for the given
        # subject, grade and form in the named 'raw_score_variable' to
        # output values for any variable named in scaled_score_variable',
        # 'level_variable', 'label_variable', 'stderror_variable')
        std_orig_location = row_std[std_location_column]

        # Revise location using odict_loc_subs
        std_location = std_orig_location
        if odict_loc_subs is not None:
            for key, val in odict_loc_subs.iteritems():
                print(
                    "Calling re.sub(varname_pattern='%s', "
                    "replacement='%s', location_value='%s')" %
                    (key, val, std_location))
                #test_str = locsub(varname=key, replacement=val, origstring=test_str)
                std_location = re.sub(key, val, std_location)
                print("New location string='%s'" % std_location)
        std_sheetname = row_std['sheetname']

        print(
            "\nStandard idx=%d, std_orig_location='%s',\n"
            "and std_location='%s', sheetname='%s'" %
            (idx, std_orig_location, std_location, std_sheetname))

        std_raw_name = row_std['raw_score_variable']

        # For sumcheck outputs: manage standard raw and total variables
        if std_sheetname == "TO":
            standard_total_varname = std_raw_name
        else:
            standard_raw_varnames.append(std_raw_name)
        print("Reading standard at idx=%d, std_raw_name='%s', sheetname='%s'" %
              (idx, std_raw_name, std_sheetname))

        # Set up and read conversion table for this standard (grade,subject,form).
        ds_ct = Dataset(dbms='excel_srcn',
                        workbook_file=std_location,
                        sheet_name=std_sheetname,
                        open_mode='rb')
        print(
            "For standard idx=%d, Reading Conversion Table/Sheet - ds_ct=%s" %
            (idx, repr(ds_ct)))

        ct_reader = ds_ct.DictReader()
        print("ct_reader = %s" % (repr(ct_reader)))

        # Interesting column names for preparing conversion dictionaries
        standard_float_names = [
            'scaled_score_variable', 'level_variable', 'stderror_variable'
        ]
        standard_str_names = ['label_variable']
        std_names = standard_float_names + standard_str_names

        conversion_float_names = [
            'scalescore', 'proficiencylevel', 'unroundedstandarderrorss'
        ]
        conversion_str_names = ['proficiencylabel']
        conv_names = conversion_float_names + conversion_str_names

        required_ct_names = conv_names
        for rq_col in required_ct_names:
            if rq_col not in ct_reader.fieldnames:
                raise ValueError(
                    "Required column '%s' not in conversion table %s" %
                    (rq_col, repr(ds_ct)))

        # Dict ctrawval_conversions:
        # (1) key is a CT raw_score value (eg values 0.0, 0.5...)
        # (2) and value is inpcvar_cval.
        # Ideally, the CT keys values have been populated to cover all
        # possible values encountered in the raw input file for the referring
        # input raw score variable.
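        # Shape sketch (the raw score and output values are hypothetical):
        #   ctrawval_conversions['23.0'] ->
        #       {'mathss': '412.0', 'mathlevel': '3.0',
        #        'mathlabel': 'Proficient', 'mathse': '4.1'}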
        ctrawval_conversions = OrderedDict()
        if rawscorevar_ctable.get(std_raw_name):
            raise ValueError(
                "Standards row idx %d has duplicate raw_score_variable name ='%s'"
                % (idx, std_raw_name))
        #For this key of std_raw_name save the main conversions table
        rawscorevar_ctable[std_raw_name] = ctrawval_conversions
        print(
            "\nFor standard idx=%d, Set rawscorevar_ctable for key( "
            "std_raw_name)='%s' to dict ctrawval_conversions.\n" %
            (idx, std_raw_name))
        print(
            "Populating ctrawval_conversions with keys of rawscorevalues, "
            "each value with dict with 4 conversions:")
        ct_reader = ds_ct.DictReader()
        print("ct_reader = %s" % (repr(ct_reader)))

        for (idx_ct, row_ct) in enumerate(ct_reader):
            # For dict inpcvar_cval key is the data input's outvar name for a
            # converted value and value is the conversion table value.
            inpcvar_cval = OrderedDict()
            ctrawval = str(row_ct['rawscore'])
            #reject duplicate value
            if ctrawval_conversions.get(ctrawval):
                raise ValueError(
                    "Conversions table row idx %d has duplicate rawscore ='%s'"
                    % (idx_ct, ctrawval))

            ctrawval_conversions[ctrawval] = inpcvar_cval
            """print (
              "ctrawval_conversions: idx_ct=%d, set key ctrawval = '%s'" 
              % (idx_ct, ctrawval))"""
            for (std_conv_name, ct_conv_name) in zip(std_names, conv_names):
                # inpvcvar_cval: set key as the ultimate input data column
                # name for an 'output' conversion variable,
                inpcvar = row_std[std_conv_name]
                # and value is the correct string conversion value for it.
                cval = row_ct[ct_conv_name]
                inpcvar_cval[inpcvar] = cval
                """
                print (
                  "inpcvar_cval: using base std_conv_name=%s, set key "
                  "inpcvar=%s, (ct_conv_name=%s), val cval=%s" 
                    % (std_conv_name, inpcvar, ct_conv_name, cval))
                """
        print("Got %d Conversion Sheet rows" % idx_ct)

        print(
            "standards idx %d, grade=%s, subject='%s', form='%s', location='%s'"
            % (idx, std_grade, std_subject, std_form, std_location))
        num_standards += 1
    # end - for idx,row_std in enumerate(reader_standards)
    print("%s: Found %d relevant standards rows" % (me, num_standards))
    if num_standards < 1:
        raise ValueError(
            "With grade='%s',subject='%s',form='%s',\n"
            "standards sheet '%s' has no rows. " %
            (repr(grade), repr(subject), repr(form), repr(ds_standards)))
    if (standard_total_varname == ""):
        raise ValueError(
            "With grade='%s',subject='%s',form='%s',\n"
            "standards sheet '%s',\nis missing a row"
            " having a total 'raw_score_variable' with 'sheetname' = 'TO'.\n" %
            (repr(grade), repr(subject), repr(form), repr(ds_standards)))

    print(
        "Finished ingesting conversion table information. "
        "Got num_standards=%d" % num_standards)

    ###########################################################

    print("Reading and processing input Dataset ds_raw_scores=%s" %
          repr(ds_raw_scores))
    # Print header row for ds_out:
    prefix = ","
    output_header = "obs"
    for rsvar, ctable in rawscorevar_ctable.iteritems():
        output_header += prefix + rsvar
        for rval, cvtable in ctable.iteritems():
            for cvar in cvtable.keys():
                if cvar != "":
                    output_header += prefix + cvar
            # just needed the conversion var names for the first rval key
            break
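    # Illustrative output header (the variable names are hypothetical):
    #   "obs,mathraw,mathss,mathlevel,mathlabel,mathse,readraw,readss,..."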

    # set up two output datasets for sumcheck processing

    sumcheck_output_fieldnames = (['id', 'obs'] + standard_raw_varnames +
                                  [standard_total_varname] + ['correct_total'])
    print "sumcheck_output_fieldnames = %s" % repr(sumcheck_output_fieldnames)
    writer_sumcheck_out = (
        ds_sumcheck_out.DictWriter(sumcheck_output_fieldnames))
    writer_sumcheck_out.writeheader()
    d_sc_out = OrderedDict()

    sumcheck_report2_fieldnames = ['id', 'obs', 'message']
    print("sumcheck_report2_fieldnames = %s" %
          repr(sumcheck_report2_fieldnames))
    writer_sumcheck_report2 = (
        ds_sumcheck_report2.DictWriter(sumcheck_report2_fieldnames))
    writer_sumcheck_report2.writeheader()
    d_sc_r2 = OrderedDict()
    n_sum_error = 0

    # Set up two output datasets for conversion processing
    output_fieldnames = output_header.split(',')
    print "converter_out.csv fieldnames: %s" % repr(output_fieldnames)
    writer_out = ds_out.DictWriter(output_fieldnames)
    writer_out.writeheader()
    # report2
    report2_fieldnames = ['id', 'obs', 'message']
    writer_report2 = ds_report2.DictWriter(report2_fieldnames)
    writer_report2.writeheader()

    # Get reader_input for the main input dataset with raw_scores
    reader_input = ds_raw_scores.DictReader()
    """for (idx,cn) in enumerate(reader_input.fieldnames):
        print "Input data column index %d, column_name='%s'" % (idx,cn)
    """
    n_input = 0
    # Conversion output dicts.
    d_r2 = OrderedDict()
    d_out = OrderedDict()
    n_error = 0
    n_sumcheck = 0
    for (idx, row_input) in enumerate(reader_input):
        in_obs = str(int(float(row_input['id'])))
        d_sc_out['obs'] = in_obs
        d_sc_r2['obs'] = in_obs
        d_r2['obs'] = in_obs
        d_out['obs'] = in_obs

        # Allow a common data error.
        in_grade = str(row_input['grade']).replace("'", "")
        if grade != 'all':
            try:
                in_igrade = int(float(in_grade))
            except:
                message = ("Error: Skipping input row. Could not convert "
                           "in_grade='%s' to float" % (str(in_grade)))
                d_r2['message'] = message
                n_error += 1
                d_r2['id'] = str(n_error)
                writer_report2.writerow(d_r2)
                continue
        if (grade != 'all' and in_igrade != igrade):
            # Normal operation. Skip rows with non-matching grade value.
            continue
        in_ssid = row_input['ssid']

        if (in_ssid is None or in_ssid == ""):
            # Warn for missing ssid, but can continue processing.
            d_r2['message'] = ("Warning: Missing ssid value.")
            n_error += 1
            d_r2['id'] = str(n_error)
            writer_report2.writerow(d_r2)
        n_input += 1

        # First, dispatch with the relatively simple sumcheck processing.
        total_raw = 0.0
        badrow = 0
        # Sum the input row's scores for the content strands (standards)
        for vn in standard_raw_varnames:
            strval = row_input[vn]
            d_sc_out[vn] = strval
            try:
                total_raw += float(strval)
            except:
                badrow = 1
                d_sc_r2['message'] = (
                    "Error: Could not convert input variable='%s' value '%s' "
                    "to float" % (vn, strval))
                n_sum_error += 1
                d_sc_r2['id'] = str(n_sum_error)
                writer_sumcheck_report2.writerow(d_sc_r2)
                # Simply do not sum anything more for this total value
                continue
        if badrow == 1:
            # could not compute 'correct' total from input data, and an error
            # was already written, so skip further sumcheck processing
            pass
        else:
            # got a good total_raw value, so use it for further processing.
            n_sumcheck += 1
            d_sc_out['id'] = str(n_sumcheck)
            d_sc_out['correct_total'] = str(total_raw)

            # standard_total_varname is supposed to have the correct total
            d_sc_out[standard_total_varname] = (str(
                float(row_input[standard_total_varname])))
            # If input has incorrect total, report a sumcheck error.
            if float(row_input[standard_total_varname]) != total_raw:
                n_sum_error += 1
                d_sc_r2['id'] = str(n_sum_error)
                d_sc_r2['message'] = (
                    "Error: Input total varname='%s', total value ='%s' "
                    "but correct total='%s'" %
                    (standard_total_varname, row_input[standard_total_varname],
                     str(total_raw)))
                writer_sumcheck_report2.writerow(d_sc_r2)
        # Write the 'correct' output, regardless of any errors on input
        writer_sumcheck_out.writerow(d_sc_out)
        # Done with sumcheck

        # For registered raw_score variables (from standards sheet), find
        # conversion values and check for mismatches in data.
        # Setup and output first line of var names:
        row_cval_errors = 0
        row_skip_errors = 0
        prefix = ","
        for (rawscorevar,
             ctrawval_conversions) in rawscorevar_ctable.iteritems():
            # Get normalized string for raw_score
            try:
                raw_score = str(float(row_input[rawscorevar].strip()))
            except:
                d_r2['message'] = (
                    "Error: rawscore variable '%s' value '%s' "
                    "is not a float. Skipping." %
                    (str(rawscorevar), str(row_input[rawscorevar])))
                n_error += 1
                d_r2['id'] = str(n_error)
                writer_report2.writerow(d_r2)
                row_skip_errors = 1
                continue
            # Report an error message if rawscore value is not in the
            # conversions table
            if ctrawval_conversions.get(raw_score) is None:
                d_r2['message'] = ("Error: raw_score_var = '%s' value = '%s' "
                                   "has no conversion data." %
                                   (rawscorevar, raw_score))
                n_error += 1
                d_r2['id'] = str(n_error)
                writer_report2.writerow(d_r2)
                row_skip_errors = 1
                continue

            # We have conversion data for this data row's rawscore
            # variable and score
            d_out[rawscorevar] = raw_score
            inpcvar_cval = ctrawval_conversions[raw_score]
            # For each conversion var with a value, calculate the input
            # row's value.
            # Check proper float conversion values
            # for (conv_var_name) in conversion_float_names:
            for (inp_cvar_name, conv_value) in inpcvar_cval.iteritems():
                if not inp_cvar_name:
                    # This is OK.
                    continue
                if conv_value is None:
                    d_r2['message'] = (
                        "WARN:rawvar=%s, no inpcvar_cval entry for '%s'. "
                        "Skipping." % (rawscorevar, inp_cvar_name))
                    n_error += 1
                    d_r2['id'] = str(n_error)
                    writer_report2.writerow(d_r2)
                    row_skip_errors = 1
                    continue

                # "Normalize" the string for this float value
                try:
                    conv_value = str(float(conv_value))
                except ValueError:
                    raise ValueError(
                        "idx %d, column %s value=%s is not a float." %
                        (idx, inp_cvar_name, conv_value))

                d_out[inp_cvar_name] = conv_value
                #"Normalize" the string data value
                data_value = row_input[inp_cvar_name]
                data_value = str(float(data_value))
                #Detect conversion value error in data
                if data_value != conv_value:
                    d_r2['message'] = (
                        "Variable '%s' data value is '%s' but correct "
                        "value is '%s'" %
                        (inp_cvar_name, data_value, conv_value))
                    n_error += 1
                    d_r2['id'] = str(n_error)
                    writer_report2.writerow(d_r2)
                    row_cval_errors += 1
            # end loop for inpcvar_name
        #end loop for rawscorevar

        if (row_skip_errors > 0):
            continue
        writer_out.writerow(d_out)

    print(
        "%s: Done processing %d data input rows from the ds_raw_scores dataset."
        % (me, idx + 1))
    print("See (1) ds_out='%s' \nand (2) ds_report2='%s'" %
          (ds_out, ds_report2))
    return


def rescorecheck(subject=None,
                 grade=None,
                 admin_term="SP13",
                 ds_input=None,
                 ds_bookmaplocs=None,
                 odict_loc_subs=None,
                 ds_out=None,
                 ds_report2=None,
                 bml_form_style=None):
    """ Verify, correct, or create raw_score conversion values.

Extended Summary:
=================
Output scored test data per bookmap_location and bookmap Datasets.
Scored data goes to Dataset ds_out.

Params:
=======
subject: String
    - subject area to which to restrict rescoring (reading, writing, science, social_studies, math)

grade: String
    - grade as a string '10','11','12'... or 'all' to restrict rescoring by grade

admin_term: String 
    - admin term to which to restrict selection of bookmap_location rows.
    - example: SP13, FA14, etc. Should match the admin_term given in the bookmap_locations file in subfield 3 of the column "form".

ds_input: Dataset
    - Dataset of the input data to be rescored. 
    - The dataset must have these column names: ['test_id', 'form_id', 'item_id', 'finalraw_item', 'score_item']
    - See docs on Dataset for more.
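    - Illustrative input row (values hypothetical):
      {'test_id': '1001', 'form_id': '01', 'item_id': '12', 'finalraw_item': '3', 'score_item': '1'}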

ds_bookmaplocs: Dataset
    - Dataset of the bookmaplocations (traditionally stored in an excel workbook, with sheet named "BookMaps").
    - The required column names are long-standing: ['grade', 'subject', 'form', 'bookmap_location']
    - The column bookmap_location identifies an excel sheet with bookmap info for scoring test items. 
    - In the code, required bookmap columns are listed in list "bookmap_required_columns". 
    - It should have some rows of interest that match the parameter "admin_term" in the column "form", in its third "::"-delimited subfield.
    - Rows should exist for 5 subject area values,
      - and in each there normally are 3 rows, one for each form_id, as presented in column "form".
      - For bml_form_style == 0,
         - the suffix of the first subfield, after the prefix "Form", is the form_id.
         - Minor parsing is done here to adjust the form_id value: integer-looking form_id values are prefixed with a 0 so they match the presentation in the subsidiary 'bookmaps files' as named in the bookmap_locations sheet column "bookmap_location".
      - For bml_form_style == 1, the form column simply has the form_id, with no special parsing required.
      - Each bookmaplocations row provides a bookmap (basically an answer key sheet), which is applied to all rows in the input data that match the given subject, form, and specific items, comparing against the observed answers. See the example below.
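      - Example (illustrative values): with bml_form_style == 0, a form value of
        "Form1::OGTM::SP13" yields form_id "01" (zero-prefixed) and admin subfield "sp13";
        with bml_form_style == 1, the form value is simply "01", "02", or "SV".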
        
odict_loc_subs: OrderedDict
    - An ordered dictionary where key is a regular expression pattern to match a location value substring and value is the replacement value. 
    - Order is important, so if caller did not use an ordered dictionary, it is converted into one. 
    - Each substitution is tried in order.

ds_out: Dataset
    - Output dataset of the rescored data, according to the bookmap locations info.
    - columns are same as input, except the score values are computed: ['test_id', 'form_id', 'item_id', 'finalraw_item', 'score_item']

ds_report2: Dataset
    - all input-data related rescorecheck errors go to this dataset
    - columns are: report2_fieldnames=[
      'id','test_id','item_id','test_answer','test_score','correct_answer',
      'correct_score','message']  
      
bml_form_style: integer
    - style 1 works for OGTSP12 data
    - style 0 works for OGTSP13 data
    - may need to add a style per year as conventions change.
    
bkmap_base_path: String
    - String to substitute to translate the conversion sheet location field.
    - By tradition, the prefix "&ctpath.\\" in the location value is replaced
      by the given parameter, roughly:
          rloc = repr(std_location)
          std_location = rloc.replace(
              "&ctpath." '\\' '\\', ctpath).replace("'", "")
   
    """
    me = "rescorecheck:"
    time_start = datetime.datetime.now()

    if subject is None:
        raise ValueError("A subject must be specified")
    # Normalize the subject name for more friendly matching
    subject = subject.lower().replace(' ', '_').strip()
    subjects = ("math", "reading", "science", "writing", "social_studies")
    if subject not in subjects:
        raise ValueError("Got subject='%s', subject must be in %s" %
                         (subject, str(subjects)))

    if grade is None:
        raise ValueError("A grade must be specified")
    grade = str(grade).lower()
    igrade = 0
    if (grade != 'all'):
        igrade = int(float(grade))
        # May want to narrow limits later, but now leave them a bit lax.
        if igrade < 8 or igrade > 15:
            raise ValueError("Given grade %s is out of range" % grade)
    if admin_term is None:
        raise ValueError(
            "An admin_term must be specified (eg sp13, fa14) as in bookmap "
            "locations sheet column 'form', third '::'-delimited subfield")
    admin_term = admin_term.lower().strip()

    # ds_input checks
    if ds_input is None:
        raise ValueError("ds_input must be given")
    reader_input = ds_input.DictReader()
    if reader_input is None:
        raise ValueError("ds_input reader for dataset '%s' cannot open" %
                         repr(ds_input))
    if odict_loc_subs is not None:
        # if not an ordered dict, convert it into one
        if not isinstance(odict_loc_subs, OrderedDict):
            odict_loc_subs = OrderedDict(odict_loc_subs)
    #Check required columns on reader_input
    required_input_columns = [
        'test_id', 'form_id', 'item_id', 'finalraw_item', 'score_item'
    ]
    for req_col in required_input_columns:
        if req_col not in reader_input.fieldnames:
            raise ValueError(
                "Required column name '%s' is missing in ds_input='%s'" %
                (req_col, repr(ds_input)))

    if ds_bookmaplocs is None:
        raise ValueError("A dataset ds_bookmaplocs must be specified.")

    print(
        "\nSTART %s, time=%s, "
        "\n\t params: subject=%s, grade=%s, admin_term='%s'"
        "\n\tds_input=%s,"
        "\n\tds_bookmaplocs='%s'"
        "\n\todict_loc_subs='%s'"
        "\n\tds_out='%s'"
        "\n\tds_report2='%s'" %
        (me, time_start, subject, grade, admin_term, repr(ds_input),
         repr(ds_bookmaplocs), repr(odict_loc_subs), repr(ds_out),
         repr(ds_report2)))

    print("Reading bookmaplocs for grade='%s', and subject='%s'..." %
          (grade, subject))

    # Get reader for bookmap locations (bml) worksheet.
    reader_bml = ds_bookmaplocs.DictReader()
    # Per Datacheck manual section 2.4.2, required columns.
    required_columns = ['grade', 'subject', 'form', 'bookmap_location']
    for rq_col in required_columns:
        if rq_col not in reader_bml.fieldnames:
            ValueError("Required column name '%s' is not in %s" %
                       (rq_col, repr(ds_bookmaplocs)))

    if bml_form_style is None:
        # Default style of the 'form' field found in development test data:
        # "Formxx::OGTX::SSYY" where xx is 1, 2 or SV,
        # and X is in R,M,C,S,W,
        # and SS is in SP, FA
        # and YY is the 20YY value: 13 for 2013, etc.
        bml_form_style = 0
    else:
        # Another style found in some test data circa 20130701:
        # the form field is simply 01, 02 or SV.
        # Keep the caller's value rather than overriding it, so that either
        # style 0 or style 1 can be requested explicitly.
        bml_form_style = int(bml_form_style)

    # Read and set up the bookmaploc info for this grade and subject
    # (already should be constant through input rows) and form_id
    # ( a value in the input row).
    # Dict formid_bookmap: (1) key is a form_id value that occurs
    # in the bookmap locations sheet and also in the data input file
    # and (2) value is the bookmap dict for the formid value.
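    # Shape sketch (the form id, item id, and values are hypothetical):
    #   formid_bookmap['01']['12'] -> {'item_position': '12', 'item_format': 'MC',
    #                                  'numeric_key': '3', 'point_value': '1', ...}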
    formid_bookmap = OrderedDict()

    num_bml = 0
    for idx_bml, row_bml in enumerate(reader_bml):
        bml_grade = row_bml['grade'].lower().strip()
        bml_subject = (row_bml['subject'].lower().replace(" ", "").replace(
            "_", "").strip())
        bml_form = row_bml['form'].lower().strip()

        if (((grade != bml_grade) and (bml_grade != "all") and
             (bml_grade != "g")) or (subject != bml_subject)):
            # Skip irrelevant row for these params
            # print "SKIPPIING row, idx_bml=%d" % idx_bml
            continue

        # Form field
        if bml_form_style == 0:
            bml_form_fields = bml_form.split("::")
            print("idx_bml=%d,formfields='%s'" %
                  (idx_bml, repr(bml_form_fields)))

            if (len(bml_form_fields) != 3):
                raise ValueError(
                    "Dataset='%s', row %d, field form='%s' has %d subfields, not 3."
                    % (repr(ds_bookmaplocs), idx_bml + 2, bml_form,
                       len(bml_form_fields)))
            bml_formid = bml_form_fields[0].replace("form", "")
            # if formid is integer, prefix it with a 0 and keep it a string,
            # to match values presented in the traditional input files.
            try:
                bml_formid = ("0" + bml_formid if
                              (len(bml_formid) == 1) else bml_formid)
            except:
                pass
            bml_admin = bml_form_fields[2]
            if (bml_admin != admin_term):
                #skip irrelevant row
                print(
                    "\nSkipping bookmaplocations row id=%d, admin_term='%s' "
                    "but bml_admin='%s'\n" %
                    (idx_bml + 2, admin_term, bml_admin))
                continue
        elif bml_form_style == 1:
            #
            bml_formid = bml_form
        else:
            raise ValueError("Parameter bml_form_style=%d unknown." %
                             bml_form_style)
        # Read the bookmap at this bookmap location
        num_bml += 1
        print("idx_bml=%d, grade='%s',subject='%s',form='%s'" %
              (idx_bml, bml_grade, bml_subject, bml_form))

        bml_location = row_bml['bookmap_location']
        # Revise location using odict_loc_subs
        if odict_loc_subs is not None:
            for key, val in odict_loc_subs.iteritems():
                print(
                    "Calling re.sub(varname_pattern='%s', "
                    "replacement='%s', location_value='%s')" %
                    (key, val, bml_location))
                #test_str = locsub(varname=key, replacement=val, origstring=test_str)
                bml_location = re.sub(key, val, bml_location)
                print("New location string='%s'" % bml_location)

        print "idx_bml=%d, row='%s'" % (idx_bml, repr(row_bml))

        # For this row_bml's bml_form_id as the key, create a formid entry
        # whose value is a dictionary named itemid_info.
        # The itemid_info dictionary key is an itemid and the info value
        # is an OrderedDict of bookmap column-value pairs.
        itemid_info = OrderedDict()
        # Consider: raise error here if key bml_formid already exists.
        formid_bookmap[bml_formid] = itemid_info

        # Populate the itemid_info dictionary from the bookmap.
        # First, init the dataset and reader for this bookmap.
        ds_bookmap = Dataset(dbms='excel_srcn',
                             workbook_file=bml_location,
                             sheet_name="BookMap",
                             open_mode='rb')
        reader_bookmap = ds_bookmap.DictReader()
        bookmap_required_columns = [
            'item_position', 'book_position', 'its_id', 'grade', 'subject',
            'form', 'session', 'description', 'reporting_subscore', 'role',
            'item_format', 'point_value', 'answer_key', 'numeric_key',
            'weight', 'tagged_for_release', 'ohio_code', 'test', 'graphic',
            'benchmark', 'indicator', 'content_standard', 'grade_level'
        ]
        for rcol in bookmap_required_columns:
            if rcol not in reader_bookmap.fieldnames:
                raise ValueError("Required column '%s' not in dataset '%s'" %
                                 (rcol, repr(ds_bookmap)))
        bm_num_rows = 0
        info = "----- Storing MC item_id_info for map_itemid vals: "
        delim = ""
        for (idx_map, row_map) in enumerate(reader_bookmap):
            if row_map['item_format'] != "MC":
                continue
            map_itemid = (str(int(float(
                row_map['item_position']))).lower().strip())
            # For this itemid, save a copy of the row of bookmap data,
            # because row_map is overwritten each time thru this loop.
            info += ("%s %s" % (delim, map_itemid))
            delim = ","
            itemid_info[str(map_itemid)] = row_map.copy()
            bm_num_rows += 1
        print info
        print("\nBookmap idx=%d, read%d rows from ds_bookmap='%s'" %
              (idx_bml, bm_num_rows, repr(ds_bookmap)))

    # end: for idx_bml, row_bml in enumerate(reader_bml):

    print(
        "Finished ingesting ds_bookmaplocs (%s) information. Using %d rows." %
        (repr(ds_bookmaplocs), num_bml))
    if num_bml < 1:
        print "Finished: No ds_bookmaplocs rows of interest found."
        return

    print("Reading and processing input Dataset ds_input=%s" % repr(ds_input))

    # Prepare normal output for ds_out: Same columns as input row, but with
    # authoritative score found in bookmap rather than raw score in input.
    output_columns = required_input_columns
    print "Rescore output column names: %s" % repr(output_columns)
    writer_out = ds_out.DictWriter(output_columns)
    writer_out.writeheader()
    # dict for normal output
    d_out = OrderedDict()

    # report2 - misc errors, incorrect scores in input.
    report2_fieldnames = [
        'id', 'test_id', 'item_id', 'test_answer', 'test_score',
        'correct_answer', 'correct_score', 'message'
    ]
    writer_report2 = ds_report2.DictWriter(report2_fieldnames)
    print "Rescore report2 column names: %s" % repr(report2_fieldnames)
    writer_report2.writeheader()
    # dict for error report output
    r2 = OrderedDict()

    # Read reader_input for the main input dataset with raw scores
    print("Rescore input data column names: %s" %
          repr(reader_input.fieldnames))
    n_input = 0
    n_errors = 0
    now = datetime.datetime.now()
    print("Time=%s: Reading input item rows . . ." % (now))
    for (idx, row_input) in enumerate(reader_input):
        # Each input row basically represents one test-takers response data
        # for a particular item.
        # NB: caller must already have filtered ds_input by the correct
        # subject, grade.
        n_input += 1
        if n_input % 100000 == 0:
            now = datetime.datetime.now()
            print("Time=%s: Processed %d input item rows so far . . ." %
                  (now, n_input))
        # 'test_id': may be called something else in the database, but a
        # 'test_id' identifies metadata and a complete set of answers by
        # one test-taker for a test that covers multiple subject areas.
        inp_test_id = str(int(float(row_input['test_id'])))
        r2['test_id'] = inp_test_id
        d_out['test_id'] = inp_test_id

        inp_item_id = str(int(float(row_input['item_id'])))
        r2['item_id'] = inp_item_id
        d_out['item_id'] = inp_item_id

        # Form id string: Examples: 01, 02, sv
        inp_form_id = str(row_input['form_id']).lower().strip()
        d_out['form_id'] = inp_form_id

        # finalraw_item is the test-takers answer for this item
        fri = row_input['finalraw_item']
        if fri == "-" or fri == "" or fri == "*":
            # Convention, I think, is to use 99 in these fields as missing.
            inp_finalraw = str(99)
        else:
            try:
                inp_finalraw = str(int(float(row_input['finalraw_item'])))
            except:
                print(
                    "Input row %d: finalraw_item='%s' not a float. Using 99." %
                    (idx + 1, row_input['finalraw_item']))
                inp_finalraw = str(99)
        d_out['finalraw_item'] = str(inp_finalraw)

        # score_item: is the score already given the test-taker in the
        # input data
        inp_score_str = row_input['score_item']
        if (inp_score_str is None or inp_score_str == ""
                or inp_score_str == 'None'):
            inp_score = 99
        else:
            try:
                inp_score = int(float(inp_score_str))
            except:
                r2['message'] = (
                    "Form='%s', score_item='%s' on input row %d is not "
                    "a float. Skipping" %
                    (inp_form_id, inp_score_str, idx + 1))
                print r2['message']
                n_errors += 1
                r2['id'] = str(n_errors)
                writer_report2.writerow(r2)
                continue

        # Got important data input values so now work with them.
        # First, for this input form_id, look up its bookmap (itemid_info).
        if formid_bookmap.get(inp_form_id) is None:
            # Oops - this input form_id is an unknown bookmap formid
            r2['message'] = ("Form='%s' on input row %d is not "
                             "in the bookmap file. Skipping" %
                             (inp_form_id, idx + 1))
            n_errors += 1
            r2['id'] = str(n_errors)
            writer_report2.writerow(r2)
            continue
        itemid_info = formid_bookmap[inp_form_id]

        # Then check that this item_id is an MC item in that form's bookmap.
        if itemid_info.get(inp_item_id) is None:
            # Oops, this input has a test answer for an item not in
            # the bookmap.
            r2['message'] = (
                "Form='%s', item_id='%s' on input row %d is not an MC item "
                "in the bookmap file. Skipping" %
                (inp_form_id, inp_item_id, idx + 1))
            n_errors += 1
            r2['id'] = str(n_errors)
            writer_report2.writerow(r2)
            continue
        # For this form and item_id, get the bookmap info for this item
        iteminfo = itemid_info[inp_item_id]
        # get correct answer - normalized in case of float '.'
        try:
            numeric_key = str(int(float(iteminfo['numeric_key'])))
        except:
            # Effectively set a bad key by setting sentinel value here.
            numeric_key = str(98)

        score_error = 0
        correct_score = 0
        # Compare the correct numeric_key with the actual 'raw' answer and the input data's score
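        # Worked illustration (hypothetical values): if numeric_key is '3',
        # inp_finalraw is '3', and point_value is '1', then correct_score is 1;
        # an input score of 0 would then be flagged below as a score_error.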
        if (int(numeric_key) == int(inp_finalraw)):
            # test-taker got this correct. Check if proper score was credited.
            correct_score = int(float(iteminfo['point_value']))
            if int(inp_score) != int(correct_score):
                score_error = 1
        else:
            if int(inp_score) != 0:
                score_error = 1
        if score_error == 1:
            r2['test_answer'] = inp_finalraw
            r2['correct_answer'] = numeric_key
            r2['test_score'] = inp_score
            r2['correct_score'] = correct_score
            r2['message'] = ("Score = '%s', but input shows '%s'" %
                             (str(correct_score), str(inp_score)))
            n_errors += 1
            r2['id'] = str(n_errors)
            writer_report2.writerow(r2)

        d_out['score_item'] = str(correct_score)
        writer_out.writerow(d_out)
    #end loop for reader_input
    time_end = datetime.datetime.now()

    print(
        "%s: "
        "Processed %d total data input rows from dataset=%s."
        "Done at time=%s.\n\nTime elapsed=%s\n"
        "================================================\n" %
        (me, idx + 1, repr(ds_input), time_end, str(time_end - time_start)))
    return