def genSetsCategoryList(mrsets, allvars, vartypes, resolver, specialvalues, macroname,
        missing, order, weightvar, categorylabels, specialsorder, other):
    """Tally the values of multiple dichotomy set variables and generate SPSS macros.

    mrsets is a sequence of MR set names to process.  Every set must be a
        multiple dichotomy set.
    allvars is the resolved list of variables in the sets.
    vartypes is the list of variable types (not used by this function).
    resolver is an object holding the MR set information from the SPSS dictionary.
        Its getSetType method and mrsets attribute are used here.
    specialvalues, order, categorylabels, specialsorder and other are passed
        through unchanged to ManageValues.
    macroname is a list of macro names, indexed in parallel with mrsets.  It
        must not be None.
    missing is 'include' or 'exclude'; 'exclude' asks the data cursor to omit
        missing values.
    weightvar can be specified as a variable name to be used as a weight in
        determining the counts.  It must not occur in allvars.

    Raises ValueError if a set is not a dichotomy set, if the weight variable
    is one of the set variables, or if no macro names were given.

    This function is mainly useful as a helper function for Ctables in building CATEGORIES subcommands."""

    for name in mrsets:
        if resolver.getSetType(name) != "Dichotomies":
            raise ValueError(_("""The specified set is a multiple category set.  Only multiple dichotomy sets can be used: %s""")
                % name)
    if weightvar in allvars:
        raise ValueError(_("""The weight variable cannot be included in an MR set."""))
    # Validate before the data pass so that a bad call fails without reading
    # the whole dataset first.
    if macroname is None:
        raise ValueError("No macro names were specified")

    if weightvar:
        varnamesAndWeight = allvars + [weightvar]
    else:
        varnamesAndWeight = allvars
    nvar = len(allvars)

    # one dictionary per variable: value -> (possibly weighted) count
    vvalues = [{} for i in range(nvar)]
    curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False, omitmissing=missing == 'exclude')
    try:
        for case in curs:
            if weightvar:
                # the weight is the extra value after the set variables
                w = case[nvar]
                if w is None:
                    w = 0.0
            else:
                w = 1.0
            for i in range(nvar):
                curval = case[i]
                if curval is not None:   # omit sysmis values and optionally user missing values
                    tally = vvalues[i]
                    tally[curval] = tally.get(curval, 0.) + w
    finally:
        # release the cursor even if reading the data fails
        curs.CClose()

    # produce value list for variables in a set
    manager = ManageValues(mrsetinfo=resolver.mrsets, allvars=allvars, allvalues=vvalues,
        specials=specialvalues, order=order, macroname=macroname,
        categorylabels=categorylabels, specialsorder=specialsorder, other=other)
    for i, s in enumerate(mrsets):
        manager.collapse(s)
        manager.setgen(macroname[i], s)
Exemple #2
0
    def doaggr(self, doindex):
        """Create an aggregate dataset and tally values and labels from it.

        doindex is the index into varstolabel at which to start; the slice
        of length varsperpass beginning there is processed.

        Results are accumulated in self.values, self.vlabels, self.conflicts,
        self.labelusage and self.duplabels, each indexed by variable name.
        The aggregate dataset is activated for reading and closed afterwards."""

        vtl = self.varstolabel[doindex:doindex+self.varsperpass]
        vtllen = len(vtl)
        if len(self.labelvars) == 1:
            # a single label variable is shared by every variable in the pass
            lbls = self.labelvars
            lastlbl = vtllen + 1
        else:
            lbls = self.labelvars[doindex:doindex+self.varsperpass]
            lastlbl = 3 * vtllen - 1
        brkvarlist = "\n".join(textwrap.wrap(" ".join(vtl), width=100))
        outvars = ["/min_%s=MIN(%s)/max_%s=MAX(%s)" % (mkrandomname(), v, mkrandomname(), v) for v in lbls]
        aggrcmd = Mkvls.aggrtemplate % (self.aggrdsname, self.aggrdsname, brkvarlist) + "\n".join(outvars)
        spss.Submit(aggrcmd)
        spss.Submit("DATASET ACTIVATE %s" % self.aggrdsname)

        # for each variable, build label information based on data
        # AGGREGATE dataset structure:
        # var1value, var2value,..., min(text lbl1), max(text lbl1), min(text lbl2), max(text lbl2)...
        # but if only one label set, only one pair of label aggregates is produced
        # user missing values are exposed and subject to labelling

        curs = spssdata.Spssdata(names=False, convertUserMissing=False)
        try:
            for case in curs:
                for v, vname in enumerate(vtl):
                    value = case[v]
                    # min() clamps the index onto the single min/max pair
                    # when only one label variable was aggregated
                    minlbl = self.truncate(case[min(vtllen + v*2, lastlbl-1)], 120).rstrip()
                    maxlbl = self.truncate(case[min(vtllen + v*2 + 1, lastlbl)], 120).rstrip()
                    # more than one label for the same value?
                    if minlbl != maxlbl and (minlbl != "" and minlbl is not None):
                        self.conflicts[vname].add(value)
                    # ignore empty or missing labels
                    if maxlbl == "" or maxlbl is None:
                        continue
                    # if the value has already been seen but with a different label, it's a conflict
                    if value in self.values[vname] and (value, maxlbl) not in self.vlabels[vname]:
                        self.conflicts[vname].add(value)
                        continue
                    self.vlabels[vname].add((value, maxlbl))  # first one wins
                    self.values[vname].add(value)
                    # tally instances where the same label is used for a different value
                    previousvalue = self.labelusage[vname].get(maxlbl, None)
                    if previousvalue is not None and value != previousvalue:
                        self.duplabels[vname].add(maxlbl)
                    self.labelusage[vname][maxlbl] = value
        finally:
            # release the cursor even if the tally fails part way through
            curs.CClose()
        spss.Submit("DATASET CLOSE %s" % self.aggrdsname)
Exemple #3
0
    def getsav(self, filespec, delete=True):
        """Open sav file and return all contents
        
        filespec is the file path
        filespec is deleted after the contents are read unless delete==False"""
     
        item = self.wdsname
        spss.Submit(r"""get file="%(filespec)s".
DATASET NAME %(item)s.
DATASET ACTIVATE %(item)s.""" % locals())
        contents = spssdata.Spssdata(names=False).fetchall()
        spss.Submit("""DATASET CLOSE %(item)s.
        NEW FILE.""" % locals())
        if delete:
            os.remove(filespec)
        return contents
Exemple #4
0
def genSetsCategoryList(mcset, allvars, resolver, setname, varprefix):
    """Collect the distinct values of a multiple category set and generate the output set.

    mcset is the mc set to convert
    allvars is the resolved list of variables in the set
    resolver is a class that contains the MR set information from the SPSS dictionary.
        Its getSetType and getVarType methods are used here.
    setname is the name for the output set
    varprefix is the prefix for variable names to generate

    Returns a tuple of the manager's generatednames, generatedvalues and
    generatedlabels attributes.

    Raises ValueError if mcset is not a multiple category set or if none of
    its variables has a nonmissing value."""

    if resolver.getSetType(mcset) != "Categories":
        raise ValueError(
            _("""The specified set is not a multiple category set.  Only a set of that type can be used in this procedure: %s"""
              ) % mcset)

    # The set's type cannot change during the data pass, so ask once
    # instead of once per value.
    isstring = resolver.getVarType(mcset) == "String"
    nvar = len(allvars)
    vvalues = set()

    curs = spssdata.Spssdata(
        indexes=allvars, names=False)  # keep cases w missing, mv's set to None
    try:
        for case in curs:
            for i in range(nvar):
                val = case[i]
                if val is None:  # omit sysmis and user missing values
                    continue
                if isstring:
                    val = val.rstrip()  # trailing padding is not significant
                vvalues.add(val)
    finally:
        # release the cursor even if the data pass fails
        curs.CClose()
    if not vvalues:
        # translate the template first, then substitute; formatting inside
        # _() would defeat the message catalog lookup
        raise ValueError(
            _("""There are no values in the set variables for set: %s""") %
            mcset)

    # copy values labels from the first variable in the set
    # MC sets are expected to have consistent value labels across variable
    # if any are defined.
    with spss.DataStep():
        valuelabels = spss.Dataset().varlist[allvars[0]].valueLabels.data

    manager = ManageValues(resolver, mcset, vvalues, setname, varprefix,
                           valuelabels)
    manager.genData()
    manager.setgen()
    return (manager.generatednames, manager.generatedvalues,
            manager.generatedlabels)
Exemple #5
0
def rake(info, variables, marginals,finalweight, visible=False, showweights=True, 
        poptotal=None, delta=0, iter=20, conv=.0001,checkempty=True,
        yvar=None, xvar=None, paneldownvar=None, panelacrossvar=None, autoheatmap=None, histogram=True):
    """Calculate a weight variable such that for each controlled dimension, the (weighted) count in each category matches a specified total or fraction.

    info is a reporting object: its addrow method is called with a message for each control value
    that has no usable marginal, and its generate method is called when that pass is finished.
    variables is a list of the variables for which control totals or proportions are provided.  It can be a sequence or
    a white-space separated string
    marginals is a list of dictionaries where the key is the value of a variable and the value is the target control total or fraction.
    fractional marginals should normally add to 1 and counts should total the same in each dimension, but this is not enforced.
    If there are no cases for a given value, the total or fraction will be less than expected.
    Negative and zero marginals are tolerated but are generally inappropriate.
    If a control value is not in the appropriate marginals dictionary or is zero, the resulting weight will be SYSMIS.

    finalweight is a string naming the new weight variable.  The variable must not already exist in the active dataset.
    visible indicates whether or not the procedure output and auxiliary dataset are displayed.
    showweights indicates whether the "Raked Weights" pivot table is produced.
    poptotal is a total to which the final weights will be scaled.  By default, they are scaled to sum to the existing weight total, if any, or the number of cases.
    delta, iter, and conv are iteration parameters corresponding to the GENLOG parameters and can be used if there are convergence problems.
    checkempty adjusts for empty cells.  If it is known that there are none, this adjustment, which can use considerable memory, can be bypassed.
    yvar, xvar, paneldownvar, panelacrossvar and autoheatmap are passed through to doheatmap.
    histogram indicates whether dohistogram is called for the new weight variable.
    If the active dataset does not have a name, one is assigned automatically.

    Raises ValueError if the number of control variables differs from the number of marginals
    dictionaries or if the GENLOG command fails.
    On completion the active dataset is weighted by finalweight."""

    variables = _buildvarlist(variables)
    if len(variables) == 1:   # delta should always be 0 if this is a 1-d problem
        delta = 0.
    wtvar = spss.GetWeightVar()
    if wtvar:  #aggrweight will be the GENLOG CSTRUCTURE variable
        aggrweight = "W_" + rname()
    else:
        aggrweight = "N_" + rname()
    activeds = spssaux.GetActiveDatasetName()
    if not activeds:
        activeds = "D_" + rname()
        spss.Submit("DATASET NAME " + activeds)
    nbreakvars = len(variables)
    if nbreakvars != len(marginals):
        raise ValueError(_("The number of control variables does not match number of sets of control totals"))

    # aggregate the data according to the list of control variables
    aggrdsname = "D_" + rname()
    countname = "N_" + rname()
    spss.Submit("WEIGHT OFF.")
    # aggregate to a new dataset and activate it.  Sum original weight variable if any
    cmd=\
       """DATASET DECLARE  %(aggrdsname)s %(vis)s.
    AGGREGATE 
     /OUTFILE= %(aggrdsname)s
     /BREAK=%(breakvars)s
    %(wtspec)s 
    /%(countname)s=N.
   dataset activate %(aggrdsname)s.""" % \
                                       {'aggrdsname': aggrdsname,
                                        'breakvars' : " ".join(variables),
                                        'wtspec' : ("/" + aggrweight + (wtvar and ("= SUM(" + wtvar + ")") or "= N")),
                                        'countname' : countname,
                                        'vis' : visible and " " or " WINDOW=HIDDEN"
                                        }
    spss.Submit(cmd)
    # if weighting, make a dictionary of the mean weight in each cell for use in final adjustment pass
    if wtvar:
        allaggrdata = spssdata.Spssdata(variables + [aggrweight] + [countname]).fetchall()
        meaninputwts = {}
        for row in allaggrdata:
            # summed weight divided by case count for the cell; any failure
            # (e.g. a None or zero operand) leaves the mean as None
            try:
                themean = row[nbreakvars] / row[nbreakvars+1]
            except:
                themean = None
            meaninputwts[row[:nbreakvars]] = themean
        

    # get a cursor and add control totals to this dataset
    # The control total is the product of all the variable value control totals or proportions
    # Track the cases to see if there are any empty cells, i.e., some combination of the marginals across
    # all the control variables that does not occur in the aggregated dataset.
    spss.StartProcedure("SPSSINC RAKE - I")
    if checkempty:
        cellset = _setprod(_dictlisttotupledsets(marginals))   # build set of tuples of all sets
        curs = spssdata.Spssdata(accessType='r', indexes=variables, names=False)
        for case in curs:
            cellset.discard(tuple(case[:nbreakvars]))  # remove found cells
        curs.CClose()
        spss.EndProcedure()
        #add cases for any empty cells
        if cellset:
            spss.StartProcedure("SPSSINC RAKE - I")
            curs = spssdata.Spssdata(accessType='a', names=False)
            for s in cellset:
                for cv in range(nbreakvars):
                    curs.appendvalue(cv, s[cv])
                # the two variables after the break variables get a tiny positive value
                # (presumably the aggregated weight and count columns - confirm against AGGREGATE output order)
                for cv in range(2):  #was nbreakvars
                    curs.appendvalue(nbreakvars+cv, 1e-12)   # 1e-8
                curs.CommitCase()
            curs.CClose()
            spss.EndProcedure()

        # compute expected count from marginals
    # NOTE(review): when checkempty is true the procedure started above has already been ended,
    # yet EndProcedure is called again in the finally clause below - confirm this is intended.
    try:
        curs = spssdata.Spssdata(accessType='w')
        ctrlwt = "W_"+ rname()  # this will be the SPSS case weight
        curs.append(ctrlwt)
        curs.commitdict()
        novalues = []
        for case in curs:
            w = 1.
            for i, v in enumerate(marginals):
                # a missing control value counts as 0, which zeroes the whole product
                w = w* v.get(case[i], 0)
                if w == 0:
                    spec = (variables[i], case[i])
                    if not spec in novalues:   # report each (variable, value) pair only once
                        uspec = spec[0]
                        if not isinstance(uspec, str):
                            uspec = str(uspec, locale.getlocale()[1])
                        info.addrow(_("Variable: %s, value: %s. No control value supplied: weight will be SYSMIS.") % (uspec,  spec[1]))
                    novalues.append(spec)
                    break
            curs.casevalues([w])
    finally:
        curs.CClose()
        spss.EndProcedure()
        info.generate()

    # run GENLOG
    expectedname = 'expected_' + rname()
    # NOTE(review): this name is not used before newwt is rebound in the final pass below
    newwt = "W_"+ rname()
    #dbg
    ###spss.Submit("""save outfile="c:/temp/genloginput.sav".""")
    if not visible:
        # suppress Viewer output other than warnings while GENLOG runs
        omstag = "O_" + rname()
        spss.Submit("OMS /SELECT ALL EXCEPT =WARNINGS /DESTINATION VIEWER=NO /TAG=" + omstag)
    #debugcmd =  """WEIGHT BY %(ctrlwt)s.
        #GENLOG
        #%(breakvars)s  /CSTRUCTURE = %(aggrweight)s
        #/MODEL = POISSON
        #/PRINT = FREQ ESTIM
        #/plot none
        #/CRITERIA = CIN(95) ITERATE(%(iter)s) CONVERGE(%(conv)s) DELTA(%(delta)s)
       #/save= pred(%(expectedname)s)
        #/DESIGN %(breakvars)s .  """ %\
                                     #{'ctrlwt' : ctrlwt,
                                      #'breakvars': " ".join(variables),
                                      #'aggrweight' : aggrweight ,
                                      #'expectedname' : expectedname,
                                      #'iter' : iter,
                                      #'conv' : conv,
                                      #'delta' : delta
                                      #}

    try:
        spss.Submit("""WEIGHT BY %(ctrlwt)s.
            GENLOG
            %(breakvars)s  /CSTRUCTURE = %(aggrweight)s
            /MODEL = POISSON
            /PRINT = FREQ ESTIM
            /plot none
            /CRITERIA = CIN(95) ITERATE(%(iter)s) CONVERGE(%(conv)s) DELTA(%(delta)s)
           /save= pred(%(expectedname)s)
            /DESIGN %(breakvars)s .  """ %\
                                         {'ctrlwt' : ctrlwt,
                                          'breakvars': " ".join(variables),
                                          'aggrweight' : aggrweight ,
                                          'expectedname' : expectedname,
                                          'iter' : iter,
                                          'conv' : conv,
                                          'delta' : delta
                                          })
    except:
        # go back to the user's dataset before reporting the failure
        spss.Submit("DATASET ACTIVATE " + activeds)
        raise ValueError(_("""Failure in GENLOG procedure.  Processing stopped.
        The error could be either a failure to compute the result 
        or not having a license for the Advanced Statistics option"""))
    finally:
        if not visible:
            spss.Submit("OMSEND TAG=" + omstag)
    # get the expected counts, normalized by the cell N in order to distribute
    expkts = {}
    spss.StartProcedure("SPSSINC RAKE - II")

    try:
        # the last three values of each case are: expected count, aggregated weight, cell count
        curs = spssdata.Spssdata(accessType='r', indexes = variables + [expectedname] + [aggrweight]+ [countname])
        weightsum = 0.
        wsum = 0.
        for case in curs:
            weightsum += case[nbreakvars] or 0  # allow for missing values j.i.c
            if case[-3]:
                wsum += case[-2]
                w = case[-3]/case[-1]
            else:
                w = None
            expkts[tuple(case[:nbreakvars])] = w
    finally:
        curs.CClose()
        spss.EndProcedure()
    # normalize weights to user total or sum of sample weights
    poptotal = poptotal or wsum
    # NOTE(review): weightsum is not checked for zero before this division
    for key in expkts:
        if expkts[key]:
            expkts[key] *= poptotal/weightsum

    # return to the first dataset and apply weights.
    spss.Submit("DATASET ACTIVATE " + activeds)
    if not visible:
        spss.Submit("DATASET CLOSE " + aggrdsname)
    spss.StartProcedure("SPSSINC RAKE - III")
    if wtvar:
        indexes = variables+ [wtvar]
    else:
        indexes = variables
    curs = spssdata.Spssdata(accessType='w', indexes = indexes)
    try:
        # NOTE(review): failed is never set to True anywhere in this function
        failed = False
        curs.append(spssdata.vdef(finalweight, vlabel=_("Raked Weight")))
        curs.commitdict()
        wirisum = 0.
        wirisumNewwt = 0.
        wiri2sum = 0.
        wiri2sumNewwt = 0.
        wisum = 0.
        actuals = {}   # dictionary for weights and counts actually used
        actualsNewwt = {}

        for case in curs:
            index = case[:nbreakvars]
            rwt = expkts.get(index, None)   # raked weight
            if wtvar:
                wt = case[-1]
            else:
                wt = 1.
            # for weighted data, adjust cell weights by input case weight normalized by cell mean weight
            try:
                if wtvar:
                    newwt = rwt * wt / meaninputwts[index]
                else:
                    newwt = rwt
            except:
                # any failure in the adjustment leaves the case weight as None
                newwt = None
            curs.casevalues([newwt])
            #curs.casevalues([rwt])
            # per cell: [raked weight, accumulated input weight]; failures are skipped silently
            try:
                kt = actuals.get(index, (0,0))[1] + wt
                actuals[index] = [rwt, kt]
            except: 
                pass
            if wtvar:
                # per cell: [sum of final weights, sum of input weights]; failures are skipped silently
                try:
                    cumwt, cumkt = actualsNewwt.get(index, [0,0])
                    cumwt += newwt
                    cumkt += wt
                    #cumkt += 1
                    actualsNewwt[index] = [cumwt, cumkt]
                except:
                    pass

            if not rwt is None:
                # sums feeding the sample balance statistic computed after the pass
                wisum += wt
                wirisum += wt * newwt
                wiri2sum += wt * newwt * newwt

    except:
        # close the cursor and end the procedure before propagating the error;
        # curs is set to None so the finally clause does not close it twice
        curs.CClose()
        curs = None
        spss.EndProcedure()
        raise
    finally:
        if not curs is None:
            curs.CClose()
    # sample balance = 100 * (sum w*r)^2 / (sum w * sum w*r^2), or None if the denominator is zero
    denom = wisum * wiri2sum
    if denom != 0:
        sampleeff = 100. * wirisum * wirisum / denom
    else:
        sampleeff = None
    cells = [sampleeff]
    rowlabels = [_("""Sample Balance""")]
    if wtvar:
        #denom = wisum * wiri2sumNewwt
        #if denom != 0:
            #sampleeffNewwt = 100. * wirisumNewwt * wirisumNewwt / denom
        #else:
            #sampleeffNewwt = None
        #cells.append(sampleeffNewwt)
        rowlabels = [_("""Sample Balance Including Final Weight Adjustment""")]
    tbl = spss.BasePivotTable(_("""Sample Balance Based on Variables: %s""") % ", ".join(variables), 
        "RAKEBALANCE")
    tbl.SimplePivotTable(rowlabels=rowlabels, collabels=[_("""Balance""")], cells=cells)
    
    # table of weights
    if showweights:
        collabels = [_("""Category Rake Weight""")]
        if wtvar:
            # add a third column: the mean final weight of the cell
            # NOTE(review): a cell present in actuals but absent from actualsNewwt would raise KeyError here
            for k in actuals:
                cumwt, cumkt = actualsNewwt[k]
                act = actuals[k]
                act.append(cumwt / cumkt)
                actuals[k] = act
            collabels.append(_("Case Count Weighted by Input Weight"))
            collabels.append(_("Mean Adjusted Raked Weight"))
        else:
            collabels.append(_("Unweighted Case Count"))
        #items = sorted(expkts.items())
        items = sorted(actuals.items())
        rowlabels = [", ".join([str(v) for v in item[0]]) for item in items]
        cells = [item[-1] for item in items]
    
        tbl2 = spss.BasePivotTable(_("""Raked Weights"""), "RAKEDWEIGHTS")
        tbl2.SimplePivotTable(rowdim=", ".join(variables), rowlabels = rowlabels, 
            collabels=collabels, cells=cells)
    spss.EndProcedure()
    if not failed:
        if histogram:
            dohistogram(finalweight)
        doheatmap(variables, yvar, xvar, paneldownvar, panelacrossvar, finalweight, autoheatmap)
        spss.Submit("WEIGHT BY " + finalweight)
Exemple #6
0
def buildspec(dims, dss, catvars, totvars, encoding, finalweight):
    """Create raking specification and return control variable list and totals list.

    dims is a list of dimension specifications.  Each nonempty item is a
        sequence of a variable name followed by alternating category values
        and totals.
    dss, catvars, and totvars are an alternative way of specifying the same
        information: dss is a list of dataset names, catvars a list of
        category variable names, and totvars a list of the corresponding
        control totals variable names.  The three lists are indexed in parallel.
    encoding is used to decode a dimension variable name that is not a str
        when building error messages.
    finalweight is the name of the weight variable to be created.  It must
        not already exist in the active dataset.

    Returns (ctlvars, ctltotals) where ctlvars is the list of control
    variable names and ctltotals is a parallel list of dictionaries mapping
    each category value to its total, both as floats.

    Raises ValueError for an existing finalweight, an unknown or nonnumeric
    control variable, a malformed dimension, an unparsable category or total,
    a dataset given without its variable names, no specification at all, or
    duplicate control variables.  An unnamed active dataset is given a
    generated name, and the original active dataset is reactivated before
    returning or when a dataset specification fails."""

    vardict = spssaux.VariableDict()
    if finalweight in vardict:
        raise ValueError(_("FINALWEIGHT cannot specify an existing variable name"))
    ctlvars = []
    ctltotals = []
    activedsname = spss.ActiveDataset()
    if activedsname == "*":  # unnamed
        activedsname = "D" + str(random.uniform(.1,1))
        spss.Submit("DATASET NAME %s" % activedsname)

    for dim in dims:
        if not dim:
            continue
        varname = dim[0]
        if not isinstance(varname, str):
            vvname = str(varname, encoding)
        else:
            vvname = varname
        if varname not in vardict:
            raise ValueError(_("A control total variable does not exist: %s") % vvname)
        if vardict[varname].VariableType != 0:
            raise ValueError(_("A nonnumeric variable was specified for a control dimension: %s") % vvname)
        # a valid item is the name plus one or more (category, total) pairs,
        # so its length must be odd and greater than one
        if len(dim) == 1 or len(dim) % 2 != 1:
            raise ValueError(_("An invalid set of values and totals was found for a control dimension: %s") % " ".join(dim))
        ctlvars.append(varname)
        try:
            # category totals can be numerical expressions
            # convert to a value after insuring that all numbers are floats
            # SECURITY NOTE(review): eval executes whatever expression the
            # specification contains; acceptable only while the syntax comes
            # from a trusted user.
            ctltotals.append({float(cat): float(eval(decimalize(tot)))
                for cat, tot in zip(dim[1::2], dim[2::2])})
        except Exception:
            # Exception rather than a bare except so that KeyboardInterrupt
            # and SystemExit are not turned into a ValueError
            raise ValueError(_("""Invalid category or category total for variable: %s""") % vvname)

    for i, ds in enumerate(dss):
        catvar = catvars[i]
        totvar = totvars[i]
        if not any([ds, catvar, totvar]):
            continue
        if ds and (catvar is None or totvar is None):
            raise ValueError(_("""A dataset was specified without the category or totals variable names: %s""") % ds)
        try:
            spss.Submit("DATASET ACTIVATE %s" % ds)
            dta = spssdata.Spssdata([catvar, totvar], names=False).fetchall()
            ctlvars.append(catvar)
            # A dataset value might be simply numeric or a string expression
            # SECURITY NOTE(review): eval on dataset contents - see above.
            ctltotals.append({float(cat): float(eval(decimalize(tot))) for cat, tot in dta})
        except:
            # Deliberately bare: whatever went wrong (nonexistent dataset or
            # variables, type problems, an interrupt), reactivate the
            # original dataset and let the error propagate unchanged.
            spss.Submit("DATASET ACTIVATE %s" % activedsname)
            raise
    spss.Submit("DATASET ACTIVATE %s" % activedsname)
    if not ctlvars:
        raise ValueError(_("""No raking specifications were given"""))
    # check for duplicate control variables, ignoring case
    ctllc = [name.lower() for name in ctlvars]
    ctlset = set(ctllc)
    if len(ctllc) != len(ctlset):  # any duplicates?
        # removing one occurrence of each distinct name leaves only the extras
        for name in ctlset:
            ctllc.remove(name)
        # sorted so that the message is the same from run to run
        raise ValueError(_("""Duplicate control variables were specified: %s""") % ", ".join(sorted(set(ctllc))))
    return ctlvars, ctltotals
Exemple #7
0
def docorr(variables,
           withvars=None,
           clevel=95,
           method="fisher",
           include=False,
           exclude=False,
           listwise=False,
           pairwise=False):
    """Calculate confidence intervals for correlations based on CORRELATION output

    variables is the list of variable names to correlate.
    withvars is an optional list of variable names; if given, the bootstrap
        path puts them after WITH and the matrix path adds them to the
        variable list.
    clevel is the confidence level in percent.
    method: "bootstrap" submits BOOTSTRAP followed by CORRELATIONS and
        returns; any other value takes the matrix-output path.
    include and exclude select the user missing value treatment; EXCLUDE is
        used unless include is true.
    listwise and pairwise select the deletion method; PAIRWISE is used
        unless listwise is true.

    Raises ValueError if the active dataset is unnamed, or if both members of
    either option pair are specified.
    Returns None without further processing if the CORRELATIONS command
    raises spss.SpssError.

    NOTE(review): the visible body ends inside the case loop - stats is
    filled but never used or returned, and the cursor is not closed.  The
    remainder of the function appears to be missing from this view."""

    activeds = spss.ActiveDataset()
    if activeds == "*":
        raise ValueError(
            _("""The active dataset must have a dataset name to use this procedure"""
              ))
    if listwise and pairwise:
        raise ValueError(
            _("""Cannot specify both listwise and pairwise deletion"""))
    missing = listwise and "LISTWISE" or "PAIRWISE"
    if include and exclude:
        raise ValueError(
            _("""Cannot specify both include and exclude missing values"""))
    inclusion = include and "INCLUDE" or "EXCLUDE"
    allvars = " ".join(variables)
    if withvars:
        allvars2 = allvars + " " + " ".join(withvars)
        allvarswith = allvars + " WITH " + " ".join(withvars)
    else:
        allvarswith = allvars
        allvars2 = allvars
    # the command templates below are filled in from local variable names via locals()
    if method == "bootstrap":
        spss.Submit(r"""PRESERVE.
SET RNG=MT.
BOOTSTRAP /VARIABLES INPUT = %(allvars2)s
/CRITERIA CILEVEL=%(clevel)s CITYPE=PERCENTILE NSAMPLES=1000.
CORRELATIONS
  /VARIABLES = %(allvarswith)s
  /PRINT=NOSIG
  /MISSING=%(missing)s %(inclusion)s.
RESTORE.""" % locals())
        return
    # regular CIs
    # random names for the matrix dataset and the OMS tag
    dsname = "D" + str(random.uniform(.05, 1.))
    omstag = "O" + str(random.uniform(.05, 1.))

    # run CORRELATIONS with MATRIX output.
    # Validation of variable list requirements is handled
    # by CORRELATIONS.
    try:
        failed = False
        spss.Submit(r"""oms /select all except = warnings/destination viewer=no
    /tag = "%(omstag)s".
    dataset declare %(dsname)s.
    correlations /variables = %(allvars2)s
    /missing=%(missing)s %(inclusion)s
    /matrix=out(%(dsname)s).
    """ % locals())
    except spss.SpssError:
        failed = True
    finally:
        # always end the OMS request, whether or not CORRELATIONS succeeded
        spss.Submit("""omsend tag=%(omstag)s""" % locals())
    if failed:
        return
    spss.Submit("dataset activate %(dsname)s." % locals())
    # keep only the N and CORR rows of the matrix dataset
    spss.Submit("""select if ROWTYPE_ eq "N" or ROWTYPE_ eq "CORR".""")
    spss.Submit("""sort cases by VARNAME_.""")
    #dictionary of variable names in matrix dataset
    matnames = dict([(spss.GetVariableName(i), i)
                     for i in range(spss.GetVariableCount())])
    rowtypeloc = matnames["ROWTYPE_"]
    curs = spssdata.Spssdata()
    stats = []
    uppervariables = [v.upper() for v in variables]

    # Columns before ROWTYPE_ are passed on as split variables, the column
    # after it is read as the row's variable name, and the rest as statistics.
    for i, case in enumerate(curs):
        if case.ROWTYPE_.rstrip() == "N":
            N = case[rowtypeloc + 2:]
        # screen out rows for any WITH variables
        if case[rowtypeloc + 1].upper().rstrip() not in uppervariables:
            continue
        if case.ROWTYPE_.rstrip() == "CORR":
            # NOTE(review): N is whatever the most recent "N" row supplied;
            # it would be unbound if a CORR row were read before any N row.
            CORR = case[rowtypeloc + 2:]
            dta = cidata(splitvars=case[0:rowtypeloc],
                         variable=case[rowtypeloc + 1],
                         ns=N,
                         corrs=CORR,
                         cis=ci(N, CORR, clevel / 100.))
            stats.append(dta)
def genVarMacro(variables, countvalues, order, macroname, mincount,
            minpercent, maxcount, maxpercent, separator, weightvar, missing):
    """Generate a macro listing the variables in order of the weighted counts
    
    variables is the list of candidate variables
    countvalues is a list of the values to be counted
    order is a or d for the variable order in the macro
    macroname is the name of the macro to be generated
    mincount and minpercent specify minimum thresholds for including a variable (<)
    maxcount and maxpercent specific maximum thresholds (>=)
    separator is the variable name separator to use in the macro definition
    weightvar is the name of the weight variable or None
    missing specifies the missing value treatment

    Raises ValueError if the variables are not all numeric or all string, or
    if a value to count cannot be converted to a number for numeric variables.

    NOTE(review): the visible body ends inside the counting loop - the
    thresholds, order and macroname are not used, no macro is generated, and
    the cursor is not closed.  The remainder of the function appears to be
    missing from this view."""
    
    if weightvar:
        varnamesAndWeight = variables + [weightvar]
    else:
        varnamesAndWeight = variables
    nvar = len(variables)
    if len(separator) == 0:
        separator = " "
    vardict = spssaux.VariableDict(variables)
    # collapse variable types to 0 (numeric) or 1 (string of any width)
    types = set(min(v.VariableType, 1) for v in vardict)
    if len(types) > 1:
        raise ValueError(_("""Variable must all be of the same type"""))
    vartypes = types.pop()
    if vartypes == 0:
        try:
            countvalues = [float(v) for v in countvalues]
        except:
            raise ValueError(_("""A non-numeric value to count was specified for a numeric variable"""))
    else:
        countvalues = [v.rstrip() for v in countvalues]
    countvalues = set(countvalues)
    curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False, 
        convertUserMissing=False, omitmissing=missing == 'exclude')

    counts = {}  # a dictionary of weighted counts with variable names as keys
    # populate counts as all zeros so that all variables will
    # appear in the dictionary for later use
    for v in variables:
        counts[v] = 0    
    w = 1.0
    wsum = 0
    # percentages are converted to fractions
    # NOTE(review): minpercent is divided without a None check, unlike maxpercent
    minpercent = minpercent / 100.
    if maxpercent is not None:
        maxpercent = maxpercent / 100.
    # calculate weighted count of counted values for each variable
    # string variables must be trimmed to match counted values list

    for case in curs:
        if weightvar:
            # the weight is the extra value after the candidate variables
            w = case[nvar]
            if w is None:
                w = 0.0
        wsum += w   # accumulate weight
        if vartypes == 1:
            case = [val.rstrip() for val in case[:nvar]] # don't include any weight variable   
        for i in range(nvar):
            if case[i] in countvalues:
                counts[variables[i]] = counts[variables[i]] + w 
def genVarsCategoryList(varnames, specialvalues, macroname, missing, order,
        weightvar, specialsorder, valuelabelsdict, missingvaluesdict,
        customattr, attrname):
    """Generate sorted list(s) of values with possible insertion of extra values
    and return list of SPSS macros to be created.

    varnames is a sequence of variable names to process.
    specialvalues is a sequence of values that are taken out of the sorted
        list and placed as a group before or after it, or None.
    macroname is a list of macronames of the same length as varnames to generate or None.
        Names that do not start with "!" are given that prefix in place.
    missing is 'include' or 'exclude' to determine whether user missing values are included or excluded.
    order is 'a' or 'd' to specify the sort direction.
    weightvar can be specified as a variable name to be used as a weight in determining the counts to sort by.
        It must not occur in varnames.
    specialsorder is 'after' to put the special values after the sorted
        values; any other value puts them before.
    valuelabelsdict is None or a dictionary mapping each variable name to the
        set of its labeled values.  Labeled values that do not occur in the
        data are added with a count of zero.
    missingvaluesdict is not used by this function.
    customattr indicates whether a custom attribute with the order should be generated
    attrname is the name of the custom attribute

    Returns (macrosgenerated, customattrlist).  macrosgenerated is a list of
    [macro name, value list text] items, empty if macroname is None.
    customattrlist is a list of [variable name, value list text] items, empty
    unless customattr is true.

    Raises ValueError if the weight variable is one of varnames.

    This function is mainly useful as a helper function for Ctables in building CATEGORIES subcommands.
    It may be useful to combine it with OTHERNM and/or MISSING in the category list.
    """

    if weightvar:
        if weightvar in varnames:
            raise ValueError(_("""The weight variable cannot be included as a variable."""))
        varnamesAndWeight = varnames + [weightvar]
    else:
        varnamesAndWeight = varnames
    nvar = len(varnames)

    # basestring exists only in Python 2; fall back to str elsewhere
    try:
        stringtypes = basestring
    except NameError:
        stringtypes = str

    # one dictionary per variable: value -> (possibly weighted) count
    vvalues = [{} for i in range(nvar)]
    curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False, omitmissing=missing =='exclude')
    try:
        for case in curs:
            if weightvar:
                # the weight is the extra value after the listed variables
                w = case[nvar]
                if w is None:
                    w = 0.0
            else:
                w = 1.0
            for i in range(nvar):
                curval = case[i]
                if curval is not None:   # omit sysmis values and optionally user missing values
                    tally = vvalues[i]
                    tally[curval] = tally.get(curval, 0.) + w
    finally:
        # release the cursor even if the data pass fails
        curs.CClose()

    valuelist = []
    macrosgenerated = []
    customattrlist = []
    for i, vname in enumerate(varnames):
        tally = vvalues[i]
        # if labeled values were supplied but did not occur in the data,
        # add them with a count of zero
        if valuelabelsdict is not None:
            for val in valuelabelsdict[vname] - set(tally):
                tally[val] = 0.
        if specialvalues is not None:  # remove special values from count list
            for v in specialvalues:
                tally.pop(v, None)
        # sort on (count, value); items() works on both Python 2 and 3
        ordered = sorted([(count, value) for (value, count) in tally.items()], reverse = order == 'd')
        if specialvalues is not None:
            specials = [(None, v) for v in specialvalues]
            if specialsorder == "after":
                ordered.extend(specials)
            else:
                ordered = specials + ordered
        valuelist.append(ordered)

        # Quote the values when the first one is a string.  A variable with
        # no values at all yields an empty list rather than an IndexError.
        if ordered and isinstance(ordered[0][1], stringtypes):
            qchar = '"'
        else:
            qchar = ''
        if macroname is not None or customattr:
            valuetext = " ".join([qchar + strconv(k).rstrip() + qchar for (count, k) in ordered])
        if macroname is not None:
            if not macroname[i].startswith("!"):
                macroname[i] = "!" + macroname[i]
            macrosgenerated.append([macroname[i], valuetext])
        if customattr:
            customattrlist.append([vname, valuetext])

    if customattr:
        try:   # cannot start datastep if there are pending transformations
            spss.StartDataStep()
        except Exception:
            spss.Submit("EXECUTE.")
            spss.StartDataStep()
        try:
            ds = spss.Dataset()
            for spec in customattrlist:
                ds.varlist[spec[0]].attributes[attrname] = spec[1]
        finally:
            # always leave the data step, even if setting an attribute fails
            spss.EndDataStep()

    return macrosgenerated, customattrlist