def genSetsCategoryList(mrsets, allvars, vartypes, resolver, specialvalues, macroname,
                        missing, order, weightvar, categorylabels, specialsorder, other):
    """Tally the values of multiple dichotomy set variables and generate macros.

    mrsets is a sequence of MR set names to process.
    allvars is the resolved list of variables in the sets.
    vartypes is the list of variable types (not used by this function itself).
    resolver is an object holding the MR set information from the SPSS
    dictionary; its getSetType method and mrsets attribute are used here.
    specialvalues, order, categorylabels, specialsorder and other are passed
    through unchanged to ManageValues.
    macroname is a list of macro names, one per entry of mrsets.
    missing is 'include' or 'exclude'; 'exclude' makes the data pass omit
    user missing values.
    weightvar is an optional variable name whose value weights each case in
    the counts.  It must not occur in allvars.  A None weight counts as 0.

    Raises ValueError if a set is not a multiple dichotomy set, if the weight
    variable is one of the set variables, or if macroname is None.

    This function is mainly useful as a helper function for Ctables in
    building CATEGORIES subcommands."""

    for name in mrsets:
        if resolver.getSetType(name) != "Dichotomies":
            raise ValueError(_("The specified set is a multiple category set. \n"
                               "Only multiple dichotomy sets can be used: %s") % name)
    if weightvar in allvars:
        raise ValueError(_("""The weight variable cannot be included in an MR set."""))
    # Fail before reading the data rather than after a full data pass.
    if macroname is None:
        raise ValueError("No macro names were specified")

    if weightvar:
        varnamesAndWeight = allvars + [weightvar]
    else:
        varnamesAndWeight = allvars
    nvar = len(allvars)
    # one dictionary per variable: value -> (possibly weighted) count
    vvalues = [{} for _unused in range(nvar)]

    curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False,
                             omitmissing=missing == 'exclude')
    try:
        for case in curs:
            if weightvar:
                w = case[nvar]
                if w is None:
                    w = 0.0
            else:
                w = 1.0
            for i in range(nvar):
                curval = case[i]
                # omit sysmis values and optionally user missing values
                if curval is not None:
                    vvalues[i][curval] = vvalues[i].get(curval, 0.) + w
    finally:
        # always release the cursor, even if the data pass fails
        curs.CClose()

    # produce value list for variables in a set
    manager = ManageValues(mrsetinfo=resolver.mrsets, allvars=allvars, allvalues=vvalues,
                           specials=specialvalues, order=order, macroname=macroname,
                           categorylabels=categorylabels, specialsorder=specialsorder,
                           other=other)
    for i, s in enumerate(mrsets):
        manager.collapse(s)
        manager.setgen(macroname[i], s)
def doaggr(self, doindex):
    """Create an aggregate dataset and tally values and labels.

    doindex is the index into self.varstolabel at which to start;
    self.varsperpass variables are processed from that position.

    Updates self.conflicts, self.vlabels, self.values, self.labelusage and
    self.duplabels for each variable processed.  The aggregate dataset
    (self.aggrdsname) is activated for the pass and closed afterwards, also
    when the tally fails."""

    vtl = self.varstolabel[doindex:doindex+self.varsperpass]
    vtllen = len(vtl)
    if len(self.labelvars) == 1:
        lbls = self.labelvars
        lastlbl = vtllen + 1
    else:
        lbls = self.labelvars[doindex:doindex+self.varsperpass]
        lastlbl = 3 * vtllen - 1
    brkvarlist = "\n".join(textwrap.wrap(" ".join(vtl), width=100))
    outvars = ["/min_%s=MIN(%s)/max_%s=MAX(%s)" % (mkrandomname(), v, mkrandomname(), v)
               for v in lbls]
    aggrcmd = Mkvls.aggrtemplate % (self.aggrdsname, self.aggrdsname, brkvarlist) \
        + "\n".join(outvars)
    spss.Submit(aggrcmd)
    spss.Submit("DATASET ACTIVATE %s" % self.aggrdsname)

    # for each variable, build label information based on data
    # AGGREGATE dataset structure:
    # var1value, var2value,..., min(text lbl1), max(text lbl1), min(text lbl2), max(text lbl2)...
    # but if only one label set, only one pair of label aggregates is produced
    # (the min(..., lastlbl) clamps below then point every variable at that pair)
    # user missing values are exposed and subject to labelling
    curs = spssdata.Spssdata(names=False, convertUserMissing=False)
    try:
        for case in curs:
            for v, vname in enumerate(vtl):
                value = case[v]
                minlbl = self.truncate(case[min(vtllen + v*2, lastlbl-1)], 120).rstrip()
                maxlbl = self.truncate(case[min(vtllen + v*2 + 1, lastlbl)], 120).rstrip()
                # more than one label for the same value?
                if minlbl != maxlbl and (minlbl != "" and minlbl is not None):
                    self.conflicts[vname].add(value)
                # ignore empty or missing labels
                if maxlbl != "" and maxlbl is not None:
                    # if the value has already been seen but with a different label,
                    # it's a conflict
                    if value in self.values[vname] and not (value, maxlbl) in self.vlabels[vname]:
                        self.conflicts[vname].add(value)
                    else:
                        self.vlabels[vname].add((value, maxlbl))  # first one wins
                        self.values[vname].add(value)
                    # tally instances where the same label is used for a different value
                    previousvalue = self.labelusage[vname].get(maxlbl, None)
                    if previousvalue is not None and value != previousvalue:
                        self.duplabels[vname].add(maxlbl)
                    self.labelusage[vname][maxlbl] = value
    finally:
        # release the cursor and the temporary dataset even if the tally fails
        curs.CClose()
        spss.Submit("DATASET CLOSE %s" % self.aggrdsname)
def getsav(self, filespec, delete=True):
    """Open a sav file and return all of its cases.

    filespec is the file path.
    filespec is deleted after the contents are read unless delete==False.

    The file is opened under the dataset name self.wdsname.  That dataset is
    closed again (followed by NEW FILE) whether or not the read succeeds; the
    file is only deleted after a successful read."""

    item = self.wdsname
    # One command per line: SPSS only recognizes a command terminator at the
    # end of a line.
    spss.Submit(('get file="%s".\n'
                 "DATASET NAME %s.\n"
                 "DATASET ACTIVATE %s.") % (filespec, item, item))
    try:
        contents = spssdata.Spssdata(names=False).fetchall()
    finally:
        spss.Submit(("DATASET CLOSE %s.\n"
                     "NEW FILE.") % item)
    if delete:
        os.remove(filespec)
    return contents
def genSetsCategoryList(mcset, allvars, resolver, setname, varprefix):
    """Collect the distinct values of a multiple category set and generate
    the output set through ManageValues.

    mcset is the mc set to convert
    allvars is the resolved list of variables in the set
    resolver is an object that contains the MR set information from the SPSS
    dictionary; its getSetType and getVarType methods are used here.
    setname is the name for the output set
    varprefix is the prefix for variable names to generate

    Returns the tuple (generatednames, generatedvalues, generatedlabels)
    taken from the ManageValues instance.

    Raises ValueError if mcset is not a multiple category set or if its
    variables contain no non-missing values."""

    if resolver.getSetType(mcset) != "Categories":
        raise ValueError(
            _("""The specified set is not a multiple category set. Only a set of that type can be used in this procedure: %s""")
            % mcset)

    # the variable type is a property of the set, so look it up once
    isstring = resolver.getVarType(mcset) == "String"
    nvar = len(allvars)
    vvalues = set()
    # keep cases w missing, mv's set to None
    curs = spssdata.Spssdata(indexes=allvars, names=False)
    try:
        for case in curs:
            for i in range(nvar):
                val = case[i]
                if val is None:   # omit sysmis and user missing values
                    continue
                if isstring:
                    val = val.rstrip()
                vvalues.add(val)
    finally:
        curs.CClose()

    if not vvalues:
        # format after the translation lookup so the catalog entry can match
        raise ValueError(
            _("""There are no values in the set variables for set: %s""") % mcset)

    # copy values labels from the first variable in the set
    # MC sets are expected to have consistent value labels across variable
    # if any are defined.
    with spss.DataStep():
        valuelabels = spss.Dataset().varlist[allvars[0]].valueLabels.data

    manager = ManageValues(resolver, mcset, vvalues, setname, varprefix, valuelabels)
    manager.genData()
    manager.setgen()
    return (manager.generatednames, manager.generatedvalues, manager.generatedlabels)
def rake(info, variables, marginals,finalweight, visible=False, showweights=True, poptotal=None,
         delta=0, iter=20, conv=.0001,checkempty=True, yvar=None, xvar=None, paneldownvar=None,
         panelacrossvar=None, autoheatmap=None, histogram=True):
    """Calculate a weight variable such that for each controlled dimension, the
    (weighted) count in each category matches a specified total or fraction.

    info is an object with addrow and generate methods that collects messages.
    variables is a list of the variables for which control totals or
    proportions are provided.  It is normalized with _buildvarlist.
    marginals is a list of dictionaries, one per variable, where the key is
    the value of a variable and the value is the target control total or
    fraction.  Fractional marginals should normally add to 1 and counts should
    total the same in each dimension, but this is not enforced.  If a control
    value is not in the appropriate marginals dictionary or is zero, the
    resulting weight will be SYSMIS.
    finalweight is a string naming the new weight variable.
    visible indicates whether or not the procedure output and auxiliary
    dataset are displayed.
    showweights requests the table of raked weights.
    poptotal is a total to which the final weights will be scaled.  By
    default, they are scaled to the sum of the aggregated weight (or count).
    delta, iter, and conv are passed to the GENLOG CRITERIA subcommand.
    delta is forced to 0 when there is only one control variable.
    checkempty adds near-empty cases for cells that do not occur in the data.
    yvar, xvar, paneldownvar, panelacrossvar and autoheatmap are passed to
    doheatmap; histogram requests dohistogram for the final weight.

    If the active dataset does not have a name, one is assigned automatically.
    On completion the active dataset is weighted by finalweight.

    Raises ValueError if the number of variables and marginals differ or if
    GENLOG fails."""

    variables = _buildvarlist(variables)
    if len(variables) == 1:   # delta should always be 0 if this is a 1-d problem
        delta = 0.
    wtvar = spss.GetWeightVar()
    # aggrweight will be the GENLOG CSTRUCTURE variable
    if wtvar:
        aggrweight = "W_" + rname()
    else:
        aggrweight = "N_" + rname()

    activeds = spssaux.GetActiveDatasetName()
    if not activeds:
        activeds = "D_" + rname()
        spss.Submit("DATASET NAME " + activeds)
    nbreakvars = len(variables)
    if nbreakvars != len(marginals):
        raise ValueError(_("The number of control variables does not match number of sets of control totals"))

    # aggregate the data according to the list of control variables
    aggrdsname = "D_" + rname()
    countname = "N_" + rname()
    spss.Submit("WEIGHT OFF.")
    # aggregate to a new dataset and activate it.  Sum original weight variable if any
    if wtvar:
        wtspec = "/" + aggrweight + "= SUM(" + wtvar + ")"
    else:
        wtspec = "/" + aggrweight + "= N"
    # one command/subcommand per line: the terminating period must end a line
    cmd = ("DATASET DECLARE %(aggrdsname)s %(vis)s.\n"
           "AGGREGATE /OUTFILE= %(aggrdsname)s\n"
           "/BREAK=%(breakvars)s\n"
           "%(wtspec)s\n"
           "/%(countname)s=N.\n"
           "dataset activate %(aggrdsname)s.") % \
        {'aggrdsname': aggrdsname,
         'breakvars': " ".join(variables),
         'wtspec': wtspec,
         'countname': countname,
         'vis': " " if visible else " WINDOW=HIDDEN"
         }
    spss.Submit(cmd)

    # if weighting, make a dictionary of the mean weight in each cell for use
    # in final adjustment pass
    if wtvar:
        allaggrdata = spssdata.Spssdata(variables + [aggrweight] + [countname]).fetchall()
        meaninputwts = {}
        for row in allaggrdata:
            try:
                themean = row[nbreakvars] / row[nbreakvars+1]
            except (TypeError, ZeroDivisionError):   # missing sum or empty cell
                themean = None
            meaninputwts[row[:nbreakvars]] = themean

    # get a cursor and add control totals to this dataset
    # The control total is the product of all the variable value control totals or proportions

    # Track the cases to see if there are any empty cells, i.e., some combination of the
    # marginals across all the control variables that does not occur in the aggregated dataset.
    # NOTE(review): the StartProcedure/EndProcedure sequence below is kept exactly
    # as it was; confirm the pairing when checkempty is True.
    spss.StartProcedure("SPSSINC RAKE - I")
    if checkempty:
        cellset = _setprod(_dictlisttotupledsets(marginals))   # build set of tuples of all sets
        curs = spssdata.Spssdata(accessType='r', indexes=variables, names=False)
        for case in curs:
            cellset.discard(tuple(case[:nbreakvars]))  # remove found cells
        curs.CClose()
        spss.EndProcedure()

        # add cases for any empty cells
        if cellset:
            spss.StartProcedure("SPSSINC RAKE - I")
            curs = spssdata.Spssdata(accessType='a', names=False)
            for s in cellset:
                for cv in range(nbreakvars):
                    curs.appendvalue(cv, s[cv])
                # the weight/count columns get a tiny positive value
                for cv in range(2):
                    curs.appendvalue(nbreakvars+cv, 1e-12)
                curs.CommitCase()
            curs.CClose()
            spss.EndProcedure()

    # compute expected count from marginals
    try:
        curs = spssdata.Spssdata(accessType='w')
        ctrlwt = "W_" + rname()   # this will be the SPSS case weight
        curs.append(ctrlwt)
        curs.commitdict()
        novalues = []
        for case in curs:
            w = 1.
            for i, v in enumerate(marginals):
                w = w * v.get(case[i], 0)
                if w == 0:
                    spec = (variables[i], case[i])
                    if not spec in novalues:   # report each variable/value once
                        uspec = spec[0]
                        if not isinstance(uspec, str):
                            uspec = str(uspec, locale.getlocale()[1])
                        info.addrow(_("Variable: %s, value: %s. No control value supplied: weight will be SYSMIS.") % (uspec, spec[1]))
                        novalues.append(spec)
                    break
            curs.casevalues([w])
    finally:
        curs.CClose()
        spss.EndProcedure()
    info.generate()

    # run GENLOG
    expectedname = 'expected_' + rname()
    if not visible:
        omstag = "O_" + rname()
        spss.Submit("OMS /SELECT ALL EXCEPT =WARNINGS /DESTINATION VIEWER=NO /TAG=" + omstag)
    try:
        spss.Submit(("WEIGHT BY %(ctrlwt)s.\n"
                     "GENLOG\n"
                     "%(breakvars)s /CSTRUCTURE = %(aggrweight)s\n"
                     "/MODEL = POISSON\n"
                     "/PRINT = FREQ ESTIM\n"
                     "/plot none\n"
                     "/CRITERIA = CIN(95) ITERATE(%(iter)s) CONVERGE(%(conv)s) DELTA(%(delta)s)\n"
                     "/save= pred(%(expectedname)s)\n"
                     "/DESIGN %(breakvars)s .\n") %
                    {'ctrlwt': ctrlwt,
                     'breakvars': " ".join(variables),
                     'aggrweight': aggrweight,
                     'expectedname': expectedname,
                     'iter': iter,
                     'conv': conv,
                     'delta': delta
                     })
    except Exception:
        spss.Submit("DATASET ACTIVATE " + activeds)
        raise ValueError(_("""Failure in GENLOG procedure. Processing stopped. The error could be either a failure to compute the result or not having a license for the Advanced Statistics option"""))
    finally:
        if not visible:
            spss.Submit("OMSEND TAG=" + omstag)

    # get the expected counts, normalized by the cell N in order to distribute
    expkts = {}
    spss.StartProcedure("SPSSINC RAKE - II")
    try:
        curs = spssdata.Spssdata(accessType='r',
                                 indexes=variables + [expectedname] + [aggrweight] + [countname])
        weightsum = 0.
        wsum = 0.
        for case in curs:
            # case[-3] is the expected count, case[-2] the cell weight, case[-1] the cell N
            weightsum += case[nbreakvars] or 0  # allow for missing values j.i.c
            if case[-3]:
                wsum += case[-2]
                w = case[-3]/case[-1]
            else:
                w = None
            expkts[tuple(case[:nbreakvars])] = w
    finally:
        curs.CClose()
        spss.EndProcedure()

    # normalize weights to user total or sum of sample weights
    poptotal = poptotal or wsum
    for key in expkts:
        if expkts[key]:
            expkts[key] *= poptotal/weightsum

    # return to the first dataset and apply weights.
    spss.Submit("DATASET ACTIVATE " + activeds)
    if not visible:
        spss.Submit("DATASET CLOSE " + aggrdsname)
    spss.StartProcedure("SPSSINC RAKE - III")
    if wtvar:
        indexes = variables + [wtvar]
    else:
        indexes = variables
    curs = spssdata.Spssdata(accessType='w', indexes=indexes)
    try:
        curs.append(spssdata.vdef(finalweight, vlabel=_("Raked Weight")))
        curs.commitdict()
        wirisum = 0.
        wiri2sum = 0.
        wisum = 0.
        actuals = {}   # cell -> [raked weight, (weighted) case count]
        actualsNewwt = {}   # cell -> [sum of adjusted weights, sum of input weights]
        for case in curs:
            index = case[:nbreakvars]
            rwt = expkts.get(index, None)   # raked weight
            if wtvar:
                wt = case[-1]
            else:
                wt = 1.
            # for weighted data, adjust cell weights by input case weight
            # normalized by cell mean weight
            try:
                if wtvar:
                    newwt = rwt * wt / meaninputwts[index]
                else:
                    newwt = rwt
            except (TypeError, ZeroDivisionError, KeyError):
                newwt = None
            curs.casevalues([newwt])
            try:
                kt = actuals.get(index, (0, 0))[1] + wt
                actuals[index] = [rwt, kt]
            except TypeError:   # missing input weight: leave the cell tally alone
                pass
            if wtvar:
                try:
                    cumwt, cumkt = actualsNewwt.get(index, [0, 0])
                    cumwt += newwt
                    cumkt += wt
                    actualsNewwt[index] = [cumwt, cumkt]
                except TypeError:   # no adjusted weight for this case
                    pass
            # a case without a final weight cannot contribute to the balance
            # statistic (newwt is None whenever rwt is None)
            if newwt is not None:
                wisum += wt
                wirisum += wt * newwt
                wiri2sum += wt * newwt * newwt
    except BaseException:
        curs.CClose()
        curs = None
        spss.EndProcedure()
        raise
    finally:
        if not curs is None:
            curs.CClose()

    denom = wisum * wiri2sum
    if denom != 0:
        sampleeff = 100. * wirisum * wirisum / denom
    else:
        sampleeff = None
    cells = [sampleeff]
    if wtvar:
        rowlabels = [_("""Sample Balance Including Final Weight Adjustment""")]
    else:
        rowlabels = [_("""Sample Balance""")]
    tbl = spss.BasePivotTable(_("""Sample Balance Based on Variables: %s""") % ", ".join(variables),
                              "RAKEBALANCE")
    tbl.SimplePivotTable(rowlabels=rowlabels, collabels=[_("""Balance""")], cells=cells)

    # table of weights
    if showweights:
        collabels = [_("""Category Rake Weight""")]
        if wtvar:
            for k in actuals:
                # a cell may have no adjusted-weight tally (e.g. no control
                # value was supplied for it); report a missing mean then
                cumwt, cumkt = actualsNewwt.get(k, (None, None))
                try:
                    meanwt = cumwt / cumkt
                except (TypeError, ZeroDivisionError):
                    meanwt = None
                actuals[k].append(meanwt)
            collabels.append(_("Case Count Weighted by Input Weight"))
            collabels.append(_("Mean Adjusted Raked Weight"))
        else:
            collabels.append(_("Unweighted Case Count"))
        items = sorted(actuals.items())
        rowlabels = [", ".join([str(v) for v in item[0]]) for item in items]
        cells = [item[-1] for item in items]
        tbl2 = spss.BasePivotTable(_("""Raked Weights"""), "RAKEDWEIGHTS")
        tbl2.SimplePivotTable(rowdim=", ".join(variables), rowlabels=rowlabels,
                              collabels=collabels, cells=cells)
    spss.EndProcedure()

    if histogram:
        dohistogram(finalweight)
    doheatmap(variables, yvar, xvar, paneldownvar, panelacrossvar, finalweight, autoheatmap)
    spss.Submit("WEIGHT BY " + finalweight)
def buildspec(dims, dss, catvars, totvars, encoding, finalweight):
    """Create raking specification and return control variable list and totals list.

    dims is a list of dimension specifications, each a sequence of the form
    [variable, category, total, category, total, ...]; empty entries are skipped.
    dss, catvars, and totvars are an alternative way of specifying the same
    information: dss is a list of dataset names, catvars a list of category
    variable names, and totvars a list of the corresponding control total
    variable names.  Entries where all three are empty are skipped.
    encoding is used to decode a non-str variable name for error messages.
    finalweight is the name of the weight variable to be created; it must not
    exist in the active dataset.

    Returns (ctlvars, ctltotals) where ctltotals holds one dictionary per
    control variable mapping category value (float) to total (float).

    If the active dataset is unnamed, a name is assigned.  The originally
    active dataset is reactivated after each control dataset is read.

    Raises ValueError for invalid specifications."""

    vardict = spssaux.VariableDict()
    if finalweight in vardict:
        raise ValueError(_("FINALWEIGHT cannot specify an existing variable name"))
    ctlvars = []
    ctltotals = []
    activedsname = spss.ActiveDataset()
    if activedsname == "*":  # unnamed
        activedsname = "D" + str(random.uniform(.1, 1))
        spss.Submit("DATASET NAME %s" % activedsname)

    for dim in dims:
        if not dim:
            continue
        varname = dim[0]
        if not isinstance(varname, str):
            vvname = str(varname, encoding)
        else:
            vvname = varname
        if not varname in vardict:
            raise ValueError(_("A control total variable does not exist: %s") % vvname)
        if not vardict[varname].VariableType == 0:
            raise ValueError(_("A nonnumeric variable was specified for a control dimension: %s") % vvname)
        # a variable name followed by (category, total) pairs has odd length > 1
        if len(dim) == 1 or not len(dim) % 2 == 1:
            raise ValueError(_("An invalid set of values and totals was found for a control dimension: %s") % " ".join(dim))
        ctlvars.append(varname)
        try:
            # category totals can be numerical expressions
            # convert to a value after insuring that all numbers are floats
            # NOTE(security): eval executes whatever expression the
            # specification contains; only use with trusted input.
            ctltotals.append(dict([(float(cat), float(eval(decimalize(total))))
                                   for cat, total in zip(dim[1::2], dim[2::2])]))
        except Exception:
            raise ValueError(_("""Invalid category or category total for variable: %s""") % vvname)

    for i, ds in enumerate(dss):
        catvar = catvars[i]
        totvar = totvars[i]
        if not any([ds, catvar, totvar]):
            continue
        if ds and (catvar is None or totvar is None):
            raise ValueError(_("""A dataset was specified without the category or totals variable names: %s""") % ds)
        try:
            # error conditions include nonexistant dataset and variables and type problems
            spss.Submit("DATASET ACTIVATE %s" % ds)
            dta = spssdata.Spssdata([catvar, totvar], names=False).fetchall()
            ctlvars.append(catvar)
            # A dataset value might be simply numeric or a string expression
            # NOTE(security): eval executes expressions read from the dataset.
            ctltotals.append(dict([(float(cat), float(eval(decimalize(total))))
                                   for cat, total in dta]))
        finally:
            # always return to the original dataset, success or failure
            spss.Submit("DATASET ACTIVATE %s" % activedsname)

    if not ctlvars:
        raise ValueError(_("""No raking specifications were given"""))

    # check for duplicate control variables (case-insensitively)
    seen = set()
    duplicates = set()
    for name in [v.lower() for v in ctlvars]:
        if name in seen:
            duplicates.add(name)
        else:
            seen.add(name)
    if duplicates:
        raise ValueError(_("""Duplicate control variables were specified: %s""")
                         % ", ".join(sorted(duplicates)))
    return ctlvars, ctltotals
def docorr(variables, withvars=None, clevel=95, method="fisher", include=False,
           exclude=False, listwise=False, pairwise=False):
    """Calculate confidence intervals for correlations based on CORRELATION output.

    variables is the list of variables to correlate.
    withvars is an optional list of variables for the WITH part.
    clevel is the confidence level in percent.
    method "bootstrap" runs BOOTSTRAP followed by CORRELATIONS and returns;
    any other value uses the matrix output of CORRELATIONS.
    include/exclude select the treatment of user missing values and
    listwise/pairwise the deletion method; the defaults are EXCLUDE and PAIRWISE.

    Raises ValueError if the active dataset is unnamed or if contradictory
    options are given.  Returns None if the CORRELATIONS command fails.

    NOTE(review): the visible body collects the per-variable statistics in a
    local list but neither returns nor displays them -- confirm whether the
    remainder of this function is missing."""

    activeds = spss.ActiveDataset()
    if activeds == "*":
        raise ValueError(
            _("""The active dataset must have a dataset name to use this procedure"""))
    if listwise and pairwise:
        raise ValueError(
            _("""Cannot specify both listwise and pairwise deletion"""))
    missing = "LISTWISE" if listwise else "PAIRWISE"
    if include and exclude:
        raise ValueError(
            _("""Cannot specify both include and exclude missing values"""))
    inclusion = "INCLUDE" if include else "EXCLUDE"

    allvars = " ".join(variables)
    if withvars:
        allvars2 = allvars + " " + " ".join(withvars)
        allvarswith = allvars + " WITH " + " ".join(withvars)
    else:
        allvarswith = allvars
        allvars2 = allvars

    if method == "bootstrap":
        # one command per line: the terminating period must end a line
        spss.Submit(("PRESERVE.\n"
                     "SET RNG=MT.\n"
                     "BOOTSTRAP /VARIABLES INPUT = %(allvars2)s\n"
                     "/CRITERIA CILEVEL=%(clevel)s CITYPE=PERCENTILE NSAMPLES=1000.\n"
                     "CORRELATIONS /VARIABLES = %(allvarswith)s\n"
                     "/PRINT=NOSIG /MISSING=%(missing)s %(inclusion)s.\n"
                     "RESTORE.") %
                    {"allvars2": allvars2, "clevel": clevel, "allvarswith": allvarswith,
                     "missing": missing, "inclusion": inclusion})
        return

    # regular CIs
    dsname = "D" + str(random.uniform(.05, 1.))
    omstag = "O" + str(random.uniform(.05, 1.))

    # run CORRELATIONS with MATRIX output.
    # Validation of variable list requirements is handled
    # by CORRELATIONS.
    failed = False
    try:
        spss.Submit(('oms /select all except = warnings/destination viewer=no\n'
                     '/tag = "%(omstag)s".\n'
                     "dataset declare %(dsname)s.\n"
                     "correlations /variables = %(allvars2)s\n"
                     "/missing=%(missing)s %(inclusion)s\n"
                     "/matrix=out(%(dsname)s).\n") %
                    {"omstag": omstag, "dsname": dsname, "allvars2": allvars2,
                     "missing": missing, "inclusion": inclusion})
    except spss.SpssError:
        failed = True
    finally:
        spss.Submit("omsend tag=%s" % omstag)
    if failed:
        return

    spss.Submit("dataset activate %s." % dsname)
    spss.Submit("""select if ROWTYPE_ eq "N" or ROWTYPE_ eq "CORR".""")
    spss.Submit("""sort cases by VARNAME_.""")
    # dictionary of variable names in matrix dataset
    matnames = dict([(spss.GetVariableName(i), i) for i in range(spss.GetVariableCount())])
    rowtypeloc = matnames["ROWTYPE_"]
    stats = []
    uppervariables = set([v.upper() for v in variables])
    curs = spssdata.Spssdata()
    try:
        for case in curs:
            rowtype = case.ROWTYPE_.rstrip()
            if rowtype == "N":
                # values after the ROWTYPE_ and VARNAME_ columns
                N = case[rowtypeloc + 2:]
            # screen out rows for any WITH variables
            if case[rowtypeloc + 1].upper().rstrip() not in uppervariables:
                continue
            if rowtype == "CORR":
                CORR = case[rowtypeloc + 2:]
                dta = cidata(splitvars=case[0:rowtypeloc], variable=case[rowtypeloc + 1],
                             ns=N, corrs=CORR, cis=ci(N, CORR, clevel / 100.))
                stats.append(dta)
    finally:
        # the visible code never released the cursor
        curs.CClose()
def genVarMacro(variables, countvalues, order, macroname, mincount, minpercent, maxcount,
                maxpercent, separator, weightvar, missing):
    """Generate a macro listing the variables in order of the weighted counts.

    variables is the list of candidate variables
    countvalues is a list of the values to be counted
    order is a or d for the variable order in the macro
    macroname is the name of the macro to be generated
    mincount and minpercent specify minimum thresholds for including a variable (<)
    maxcount and maxpercent specify maximum thresholds (>=)
    separator is the variable name separator to use in the macro definition;
    an empty separator is replaced by a blank
    weightvar is the name of the weight variable or None
    missing specifies the missing value treatment ('exclude' omits missing values)

    Raises ValueError if the variables are not all of the same type or if a
    non-numeric count value is given for numeric variables.

    NOTE(review): the visible body stops after tallying the weighted counts;
    order, macroname and the thresholds are not used and nothing is returned
    -- confirm whether the remainder of this function is missing."""

    if weightvar:
        varnamesAndWeight = variables + [weightvar]
    else:
        varnamesAndWeight = variables
    nvar = len(variables)
    if len(separator) == 0:
        separator = " "
    vardict = spssaux.VariableDict(variables)
    # 0 = numeric; every string width collapses to 1
    types = set(min(v.VariableType, 1) for v in vardict)
    if len(types) > 1:
        raise ValueError(_("""Variable must all be of the same type"""))
    vartypes = types.pop()
    if vartypes == 0:
        try:
            countvalues = [float(v) for v in countvalues]
        except (TypeError, ValueError):
            raise ValueError(_("""A non-numeric value to count was specified for a numeric variable"""))
    else:
        countvalues = [v.rstrip() for v in countvalues]
    countvalues = set(countvalues)

    # a dictionary of weighted counts with variable names as keys
    # populated as all zeros so that all variables will
    # appear in the dictionary for later use
    counts = dict([(v, 0) for v in variables])
    w = 1.0
    wsum = 0
    # thresholds are given in percent; guard None the same way for both
    if minpercent is not None:
        minpercent = minpercent / 100.
    if maxpercent is not None:
        maxpercent = maxpercent / 100.

    # calculate weighted count of counted values for each variable
    # string variables must be trimmed to match counted values list
    curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False,
                             convertUserMissing=False, omitmissing=missing == 'exclude')
    try:
        for case in curs:
            if weightvar:
                w = case[nvar]
                if w is None:
                    w = 0.0
            wsum += w   # accumulate weight
            if vartypes == 1:
                case = [val.rstrip() for val in case[:nvar]]  # don't include any weight variable
            for i in range(nvar):
                if case[i] in countvalues:
                    counts[variables[i]] = counts[variables[i]] + w
    finally:
        # the visible code never released the cursor
        curs.CClose()
def genVarsCategoryList(varnames, specialvalues, macroname, missing, order, weightvar,
                        specialsorder, valuelabelsdict, missingvaluesdict, customattr, attrname):
    """Generate sorted list(s) of values with possible insertion of extra values
    and return list of SPSS macros to be created.

    varnames is a sequence of variable names to process.
    specialvalues is a sequence of values, or None, that are removed from the
    counted values and placed before or after them according to specialsorder.
    macroname is a list of macronames of the same length as varnames to
    generate or None.  Names lacking a leading "!" are given one IN PLACE in
    the caller's list.
    missing is 'include' or 'exclude' to determine whether user missing values
    are included or excluded.
    order is 'a' or 'd' to specify the sort direction of the counts.
    weightvar can be specified as a variable name to be used as a weight in
    determining the counts to sort by.  It must not occur in varnames.
    specialsorder is 'after' to append the specials; any other value puts
    them first.
    valuelabelsdict, if not None, maps each variable name to the set of its
    labeled values; labeled values not found in the data are added with a
    count of zero.
    missingvaluesdict is not used by this function.
    customattr indicates whether a custom attribute with the order should be
    generated; attrname is the name of the custom attribute.

    Returns (macrosgenerated, customattrlist): lists of [macro name, value
    text] and [variable name, value text] pairs.

    This function is mainly useful as a helper function for Ctables in
    building CATEGORIES subcommands.  It may be useful to combine it with
    OTHERNM and/or MISSING in the category list.
    """

    if weightvar:
        if weightvar in varnames:
            raise ValueError(_("""The weight variable cannot be included as a variable."""))
        varnamesAndWeight = varnames + [weightvar]
    else:
        varnamesAndWeight = varnames

    # string type check that works on both Python 2 and Python 3
    try:
        string_types = basestring
    except NameError:
        string_types = str

    nvar = len(varnames)
    # one dictionary per variable: value -> (possibly weighted) count
    vvalues = [{} for _unused in range(nvar)]
    curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False,
                             omitmissing=missing == 'exclude')
    try:
        for case in curs:
            if weightvar:
                w = case[nvar]
                if w is None:
                    w = 0.0
            else:
                w = 1.0
            for i in range(nvar):
                curval = case[i]
                # omit sysmis values and optionally user missing values
                if curval is not None:
                    vvalues[i][curval] = vvalues[i].get(curval, 0.) + w
    finally:
        curs.CClose()

    valuelist = []
    macrosgenerated = []
    customattrlist = []
    for i, vname in enumerate(varnames):
        # if labeled values were supplied but did not occur in the data,
        # add them with a count of zero
        if valuelabelsdict is not None:
            for val in valuelabelsdict[vname] - set(vvalues[i]):
                vvalues[i][val] = 0.
        if specialvalues is not None:   # remove special values from count list
            for v in specialvalues:
                vvalues[i].pop(v, None)
        ordered = sorted([(count, val) for (val, count) in vvalues[i].items()],
                         reverse=order == 'd')
        if specialvalues is not None:
            specials = [(None, v) for v in specialvalues]
            if specialsorder == "after":
                ordered = ordered + specials
            else:
                ordered = specials + ordered
        valuelist.append(ordered)

        # string values are quoted; a variable with no values yields empty text
        if ordered and isinstance(ordered[0][1], string_types):
            qchar = '"'
        else:
            qchar = ''
        valuetext = " ".join([qchar + strconv(val).rstrip() + qchar
                              for (count, val) in ordered])
        if macroname is not None:
            if not macroname[i].startswith("!"):
                macroname[i] = "!" + macroname[i]
            macrosgenerated.append([macroname[i], valuetext])
        if customattr:
            customattrlist.append([vname, valuetext])

    if customattr:
        try:   # cannot start datastep if there are pending transformations
            spss.StartDataStep()
        except Exception:
            spss.Submit("EXECUTE.")
            spss.StartDataStep()
        try:
            ds = spss.Dataset()
            for spec in customattrlist:
                ds.varlist[spec[0]].attributes[attrname] = spec[1]
        finally:
            spss.EndDataStep()
    return macrosgenerated, customattrlist