def test_metadata_same(self):
        u_settings = dict(ioUtf8=True, ioLocale=ioLocale)
        with sav.SavHeaderReader(in_savFileName, **u_settings) as header:
            in_metadata = header.dataDictionary()
            in_encoding = header.fileEncoding
        self.assertEqual("utf_8", in_encoding)

        with sav.SavHeaderReader(out_savFileName, **u_settings) as header:
            out_metadata = header.dataDictionary()
            out_encoding = header.fileEncoding
        self.assertEqual("utf_8", out_encoding)
        self.assertEqual(in_metadata, out_metadata)
        # check if the locale is reset
        self.assertEqual(locale.setlocale(locale.LC_CTYPE), currLocale)
Example #2
def main():

    MAX_ROWS = 5000
    MAX_DECIMALS = 6

    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print('Usage: ' + sys.argv[0] + ' inputfile [outputpath]')
        sys.exit(1)

    if len(sys.argv) == 2:
        sys.argv.append('')  # default output path: current directory

    try:
        with savReaderWriter.SavHeaderReader(sys.argv[1],
                                             ioUtf8=False) as header:
            metadata = header.all()

        res = {
            'alignments': metadata.alignments,
            'columnWidths': metadata.columnWidths,
            'measureLevels': metadata.measureLevels,
            'varFormats': metadata.formats,
            'varTypes': metadata.varTypes,
            'varNames': metadata.varNames,
            'valueLabels': metadata.valueLabels,
            'varLabels': metadata.varLabels,
            # # Other values that come in the header:
            # 'caseWeightVar': metadata.caseWeightVar,
            # 'fileAttributes': metadata.fileAttributes,
            # 'fileLabel': metadata.fileLabel,
            # 'missingValues': metadata.missingValues,
            # 'multRespDefs': metadata.multRespDefs,
            # 'varAttributes': metadata.varAttributes,
            # 'varRoles': metadata.varRoles,
            # 'varSets': metadata.varSets,
        }

        with open(os.path.join(sys.argv[2], 'header.json'), 'w') as h:
            h.write(json.dumps(convert_recursive(res), indent=4))

        with savReaderWriter.SavReader(sys.argv[1], ioUtf8=False) as reader:
            for i, lines in enumerate(chunks(reader, MAX_ROWS), 1):
                with open(
                        os.path.join(sys.argv[2],
                                     'data_' + str(i).zfill(5) + '.json'),
                        'w') as f:
                    encoded = convert_recursive(lines)
                    jsonText = json.dumps(encoded)
                    # jsonText = truncate_decimals(jsonText, MAX_DECIMALS)
                    f.write(jsonText)

        print('Files successfully created.')

        return

    except Exception:
        print("Error: ", sys.exc_info())
        traceback.print_exc()
        sys.exit(1)
def test_raises_SPSSIOWarning(self):
    module = rw if sys.version_info[0] > 2 else rw.error
    SPSSIOWarning = module.SPSSIOWarning
    # The I/O module only emits a warning here (not an exception), so
    # record it with warnings.catch_warnings rather than assertRaises.
    with warnings.catch_warnings(record=True) as w:
        with rw.SavHeaderReader(self.savFileName) as header:
            metadata = str(header)
            self.assertTrue(issubclass(w[-1].category, UserWarning))
            self.assertTrue("SPSS_NO_CASEWGT" in str(w[-1].message))
Example #4
def test_accented_varSet_codepage_mode():
    with srw.SavWriter(**kwargs) as writer:
        for i in range(10):
            writer.writerow([1, 1])
    with srw.SavHeaderReader(kwargs["savFileName"]) as header:
        actual = header.varSets
    desired = {b'\xc3\xbcberhaupt': [b'salbegin', b'salary']}
    remove(kwargs["savFileName"])
    assert actual == desired, actual
Example #5
def test_accented_varSet_unicode_mode():
    kwargs["varSets"] = {u'\xfcberhaupt': varNames}
    kwargs["ioUtf8"] = True
    with srw.SavWriter(**kwargs) as writer:
        for i in range(10):
            writer.writerow([1, 1])
    with srw.SavHeaderReader(kwargs["savFileName"], ioUtf8=True) as header:
        actual = header.varSets
    desired = {u'\xfcberhaupt': [u'salbegin', u'salary']}
    remove(kwargs["savFileName"])
    assert actual == desired, actual
Example #6
def spss_to_csv(input_filepath,
                output_filepath,
                upsample_size=100000,
                w_col='Weight'):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    with srw.SavHeaderReader(input_filepath, ioUtf8=True) as header:
        metadata = header.all()
        value_replacements = metadata.valueLabels

        nan_replacements = dict()
        for k, v in metadata.missingValues.items():
            if v and 'values' in v:
                nan_values = dict()
                if isinstance(v['values'], list):
                    for nan_val in v['values']:
                        nan_values[nan_val] = np.nan
                else:
                    nan_values[v['values']] = np.nan

                nan_replacements[k] = nan_values

        questions = metadata.varLabels

    with srw.SavReader(input_filepath, ioUtf8=True) as reader:
        header = reader.header
        records = reader.all()

    df = pd.DataFrame(records, columns=header)
    df.replace(value_replacements, inplace=True)
    df.replace(nan_replacements, inplace=True)

    df.to_csv(output_filepath)

    questions_file = Path(output_filepath).with_suffix('.json')
    with open(questions_file, 'w') as qf:
        json.dump(questions, qf)

    # upsample the survey for a "representative" sample for analysis
    # if specified
    if upsample_size and upsample_size > 0 and w_col in df.columns:
        rng = np.random.RandomState(12345)

        smpl = rng.choice(df.index,
                          upsample_size,
                          p=df[w_col] / df[w_col].sum())
        df_resampled = df.loc[smpl, :]

        output_path = Path(output_filepath)
        df_resampled.to_csv(
            output_path.with_name(output_path.stem + '_upsampled.csv'))
currLocale = locale.setlocale(locale.LC_CTYPE)

# Demonstrates use of ioUtf8=UNICODE_BMODE, or ioUtf8=2.
# This behaves like regular unicode mode (ioUtf8=UNICODE_UMODE, ioUtf8=1,
# or ioUtf8=True), except that data is returned as bytes.
in_savFileName = "test_data/metadata_copy_test.sav"
is_windows = sys.platform.startswith("win")
ioLocale = "german" if is_windows else "de_DE.cp1252"
b_settings = dict(ioUtf8=sav.UNICODE_BMODE, ioLocale=ioLocale)

# read SPSS file data
with sav.SavReader(in_savFileName, rawMode=True, **b_settings) as data:
    in_records = data.all(False)

# read SPSS file metadata
with sav.SavHeaderReader(in_savFileName, **b_settings) as header:
    metadata = header.dataDictionary()
    #pprint(metadata)

# write (unmodified) data to SPSS file
out_savFileName = os.path.join(tempfile.gettempdir(), 'out.sav')
metadata.update(b_settings)
with sav.SavWriter(out_savFileName, **metadata) as writer:
    writer.writerows(in_records)


# Now test whether input and output are the same
class Test_MetadataRoundTrip(unittest.TestCase):
    def setUp(self):
        self.maxDiff = None
Example #8
def get_metadata(savFilename):
    """Gets variable names (list), variable types and formats (dict)"""
    with savReaderWriter.SavHeaderReader(savFilename) as header:
        varNames, varTypes = header.varNames, header.varTypes
        formats = header.formats
    return varNames, varTypes, formats
def leervaloresVariables():
    # Use a context manager so the underlying file handle is closed.
    with savReaderWriter.SavHeaderReader(archivo,
                                         ioUtf8=True,
                                         ioLocale='Spanish_Spain.1252') as header:
        return header.valueLabels
def leerrotulos():
    with savReaderWriter.SavHeaderReader(
            archivo, ioLocale='Spanish_Spain.1252') as header:
        return header.varLabels
def leercabezotes():
    with savReaderWriter.SavHeaderReader(
            archivo, ioLocale='Spanish_Spain.1252') as header:
        return header.varNames
def leervaloresVariables():
    with savReaderWriter.SavHeaderReader(
            archivo, ioLocale='Spanish_Spain.1252') as header:
        return header.valueLabels.items()
# NOTE: this fragment assumes `data` (an open SavReader) and `records`
# (the rows read from it), which are not shown here.
formats = [
    "S%d" % data.varTypes[v] if data.varTypes[v] else np.float64
    for v in data.varNames
]
dtype = np.dtype({'names': data.varNames, 'formats': formats})
structured_array = np.array([tuple(record) for record in records],
                            dtype=dtype)

allDataArray = np.array(
    records
)  # in the most recent version one can directly read to numpy arrays
print(records)

# reading metadata from SPSS file
with sav.SavHeaderReader(spss_file, ioUtf8=True, ioLocale=ioLocale) as header:
    metadata = header.dataDictionary(
        asNamedtuple=False)  # Why does this take so long?

pprint.pprint(metadata)

# writing unmodified data
with sav.SavWriter(spss_file_out,
                   overwrite=True,
                   ioUtf8=True,
                   ioLocale=ioLocale,
                   mode=b'wb',
                   refSavFileName=None,
                   **metadata) as writer:
    for i, record in enumerate(structured_array):
        writer.writerow(record)
Example #14
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8', ioUtf8=True):
    """ see parse_sav_file doc """
    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale, ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
        #  'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
        #  'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
        #  'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(name=name)
    meta['info']['text'] = 'Converted from SAV file %s.' % (name)
    meta['info']['from_source'] = {'pandas_reader':'sav'}
    meta['sets']['data file']['items'] = [
        'columns@%s' % (varName)
        for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of attributes in metadata are located here:
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}

        if column in metadata.valueLabels:
            # ValueLabels is type = 'single' (possibly a 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                values = {'text': {'main': unicode(text)},
                          'value': unicode(int(value))}
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                f = metadata.formats[column]
                if '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    # print "VAR '{}' NOT IN value_labels".format(column)
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first "not nan" value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND Int because savReaderWriter loads them both as float64
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "float"
                            if (data[column].dropna() % 1).sum() == 0:
                                if (data[column].dropna() % 1).unique() == [0]:
                                    try:
                                        data[column] = data[column].astype('int')
                                    except:
                                        pass
                                    meta['columns'][column]['type'] = "int"

                        elif isinstance(value, unicode) or isinstance(value, str):
                            # Strings
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "string"

        if column in metadata.varTypes:
            pass

        if column in metadata.varSets:
            pass

        if column in metadata.varAttributes:
            pass

        if column in metadata.varRoles:
            pass

        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {'main': metadata.varLabels[column]}

    for mrset in metadata.multRespDefs:
        # meta['masks'][mrset] = {}
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            pass  # multiple category sets are not handled here
#             meta['masks'][mrset]['type'] = "categorical set"
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
#             meta['masks'][mrset]['type'] = "dichotomous set"
#             meta['masks'][mrset]['countedValue'] = metadata.multRespDefs[mrset]['countedValue']
            varNames = metadata.multRespDefs[mrset]['varNames']
#             meta, data[mrset] = delimited_from_dichotomous(meta, data[varNames], mrset)
            data[mrset] = condense_dichotomous_set(data[varNames], values_from_labels=False)
            meta['columns'][mrset] = {
                'type': 'delimited set',
                'text': {'main': metadata.multRespDefs[mrset]['label']},
                'values': [
                    {
                        'text': {'main': metadata.varLabels[varName]},
                        'value': v
                    }
                    for v, varName in enumerate(varNames, start=1)
                ]
            }
            idx = meta['sets']['data file']['items'].index('columns@%s' % (varNames[0]))
            items = meta['sets']['data file']['items']
            meta['sets']['data file']['items'] = items[:idx] + ['columns@%s' % (mrset)] + items[idx+len(varNames):]
            
#         meta['masks'][mrset]['text'] = [metadata.multRespDefs[mrset]['label']]
#         meta['masks'][mrset]['items'] = []
#         for var_name in metadata.multRespDefs[mrset]['varNames']:
#             meta['masks'][mrset]['items'].append({'source':"columns@{0}".format(var_name)})

        # df = make_delimited_from_dichotomous(data[common_vars[var]])

    return meta, data
Example #15
def extract_sav_meta(sav_file,
                     name="",
                     data=None,
                     ioLocale='en_US.UTF-8',
                     ioUtf8=True,
                     dichot=None,
                     dates_as_strings=False,
                     text_key="en-GB",
                     engine='savReaderWriter'):

    if engine == 'readstat':
        df, metadata = pyreadstat.read_sav(sav_file,
                                           encoding=ioLocale.split(".")[-1],
                                           metadataonly=True)
        meta = start_meta(text_key=text_key)

        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName) for varName in metadata.column_names
        ]

        for index, column in enumerate(metadata.column_names):
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.variable_value_labels:
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.variable_value_labels[
                        column].items():
                    values = {
                        'text': {
                            text_key: str(text)
                        },
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
                    # If single-answer data was stored as strings rather than
                    # numbers, convert to floats; non-convertible values
                    # become NaN (errors='coerce').
                    if column in data.columns and data[column].dtype == 'O':
                        data[column] = pd.to_numeric(data[column],
                                                     errors='coerce',
                                                     downcast='float')
            else:
                if column in metadata.original_variable_types:
                    f = metadata.original_variable_types[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's an
                            # as-yet undetermined discrepancy between the
                            # input and output dates if datetime64 is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"

            # add the variable label to the meta
            meta['columns'][column]['text'] = {
                text_key: metadata.column_labels[index]
            }
        return meta, data

    elif engine == 'savReaderWriter':
        # see parse_sav_file doc
        if dichot is None:
            dichot = {'yes': 1, 'no': 0}
        with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                                ioUtf8=ioUtf8) as header:
            # Metadata Attributes
            # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
            #  'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
            #  'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
            #  'fileLabel', 'missingValues']
            metadata = header.dataDictionary(True)

        meta = start_meta(text_key=text_key)
        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName) for varName in metadata.varNames
        ]

        # This should probably be somewhere in the metadata
        # weight_variable_name = metadata.caseWeightVar

        # Descriptions of attributes in metadata are located here:
        # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
        for column in metadata.varNames:
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.valueLabels:
                # ValueLabels is type = 'single' (possibly a 1-1 map)
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.valueLabels[column].items():
                    values = {
                        'text': {
                            text_key: str(text)
                        },
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
            else:
                if column in metadata.formats:
                    f = metadata.formats[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's an
                            # as-yet undetermined discrepancy between the
                            # input and output dates if datetime64 is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"
                else:
                    # Infer meta from data
                    if data is not None:
                        # print "VAR '{}' NOT IN value_labels".format(column)
                        column_values = data[column].dropna()
                        if len(column_values) > 0:
                            # Get the first "not nan" value from the column
                            value = column_values.values[0]
                            # np.float64 (pd.np was removed in newer pandas)
                            if isinstance(value, np.float64):
                                # Float AND Int because savReaderWriter loads them both as float64
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "float"
                                if (data[column].dropna() % 1).sum() == 0:
                                    if (data[column].dropna() %
                                            1).unique() == [0]:
                                        try:
                                            data[column] = data[column].astype(
                                                'int')
                                        except:
                                            pass
                                        meta['columns'][column]['type'] = "int"

                            elif isinstance(value, str):
                                # Strings
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "string"

            if column in metadata.varTypes:
                pass

            if column in metadata.varSets:
                pass

            if column in metadata.varAttributes:
                pass

            if column in metadata.varRoles:
                pass

            if column in metadata.measureLevels:
                pass

            # Some labels are empty strings.
            if column in metadata.varLabels:
                meta['columns'][column]['text'] = {
                    text_key: metadata.varLabels[column]
                }

        for mrset in metadata.multRespDefs:
            # meta['masks'][mrset] = {}
            # 'D' is "multiple dichotomy sets" in SPSS
            # 'C' is "multiple category sets" in SPSS
            varNames = list(metadata.multRespDefs[mrset]['varNames'])
            # Find the index where the delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            if metadata.multRespDefs[mrset]['setType'] == 'C':
                # Raise if the columns' value objects are not all equal
                if not all(meta['columns'][v]['values'] == meta['columns'][
                        varNames[0]]['values'] for v in varNames):
                    msg = 'Columns must have equal values to be combined in a set: {}'
                    raise ValueError(msg.format(varNames))
                # Concatenate columns to set
                df_str = data[varNames].astype('str')
                dls = df_str.apply(
                    lambda x: ';'.join(v.replace('.0', '')
                                       for v in x.tolist()
                                       if v not in ('nan', 'None')),
                    axis=1) + ';'
                dls.replace({';': np.NaN}, inplace=True)
                # Get value object
                values = meta['columns'][varNames[0]]['values']

            elif metadata.multRespDefs[mrset]['setType'] == 'D':
                # Generate the delimited set from the dichotomous set
                dls = condense_dichotomous_set(data[varNames],
                                               values_from_labels=False,
                                               **dichot)
                # Get value object
                values = [{
                    'text': {
                        text_key: metadata.varLabels[varName]
                    },
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            else:
                continue
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {
                    text_key: metadata.multRespDefs[mrset]['label']
                },
                'parent': {},
                'values': values
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(df_items.index('columns@{}'.format(varNames[0])),
                            'columns@{}'.format(mrset))

            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

        return meta, data
Example #16
def extract_sav_meta(sav_file,
                     name="",
                     data=None,
                     ioLocale='en_US.UTF-8',
                     ioUtf8=True,
                     dichot=None,
                     dates_as_strings=False,
                     text_key="main"):

    """ see parse_sav_file doc """
    if dichot is None:
        dichot = {'yes': 1, 'no': 0}
    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                            ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
        #  'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
        #  'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
        #  'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(text_key=text_key)
    meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
    meta['info']['from_source'] = {'pandas_reader': 'sav'}
    meta['sets']['data file']['items'] = [
        'columns@{}'.format(varName) for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of attributes in metadata are located here:
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}
        meta['columns'][column]['name'] = column
        meta['columns'][column]['parent'] = {}
        if column in metadata.valueLabels:
            # ValueLabels is type = 'single' (possibly a 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                values = {
                    'text': {
                        text_key: unicode(text)
                    },
                    'value': int(value)
                }
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                f = metadata.formats[column]
                if 'DATETIME' in f:
                    if dates_as_strings:
                        # DATETIME fields from SPSS are currently
                        # being read in as strings because there's an
                        # as-yet undetermined discrepancy between the
                        # input and output dates if datetime64 is used
                        meta['columns'][column]['type'] = 'string'
                    else:
                        meta['columns'][column]['type'] = 'date'
                        data[column] = pd.to_datetime(data[column])
                elif f.startswith('A'):
                    meta['columns'][column]['type'] = 'string'
                elif '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    # print "VAR '{}' NOT IN value_labels".format(column)
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first "not nan" value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND Int because savReaderWriter loads them both as float64
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "float"
                            if (data[column].dropna() % 1).sum() == 0:
                                if (data[column].dropna() % 1).unique() == [0]:
                                    try:
                                        data[column] = data[column].astype(
                                            'int')
                                    except:
                                        pass
                                    meta['columns'][column]['type'] = "int"

                        elif isinstance(value, unicode) or isinstance(
                                value, str):
                            # Strings
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "string"

        if column in metadata.varTypes:
            pass

        if column in metadata.varSets:
            pass

        if column in metadata.varAttributes:
            pass

        if column in metadata.varRoles:
            pass

        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {
                text_key: metadata.varLabels[column]
            }

    for mrset in metadata.multRespDefs:
        # meta['masks'][mrset] = {}
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            pass  # multiple category sets are not handled here
#             meta['masks'][mrset]['type'] = "categorical set"
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
            varNames = metadata.multRespDefs[mrset]['varNames']
            # Find the index where the delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            # Generate the delimited set from the dichotomous set
            dls = condense_dichotomous_set(data[varNames],
                                           values_from_labels=False,
                                           **dichot)
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {
                    text_key: metadata.multRespDefs[mrset]['label']
                },
                'parent': {},
                'values': [{
                    'text': {
                        text_key: metadata.varLabels[varName]
                    },
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(df_items.index('columns@{}'.format(varNames[0])),
                            'columns@{}'.format(mrset))

            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

    return meta, data
Example #17
#
#--------
dbspss = "C:/Users/yoni/Desktop/IMO Nfi Entry/spss iom/TestSpss.sav"  #
cnx = mysql.connector.connect(user='******',
                              password='',
                              host='127.0.0.1',
                              database='pty_test')  # open MySQL database
cursor = cnx.cursor()  # main cursor
cursorUPD = cnx.cursor()  # second cursor, used for updating records
cursor.execute('SELECT * FROM SPSST')  # run the SQL query

#cursor.execute('SELECT concat("b'",name,"'") FROM spss_val WHERE 1 ORDER BY concat("b'",name,"'") ASC')
remaining_rows = cursor.fetchall()  # fetch every selected row

with savReaderWriter.SavHeaderReader(
        dbspss) as header:  # open the SPSS file to collect metadata
    metadata = header.all()  # get all metadata
# the following array declarations help to create [[], [], ...]
myouter = []  # outer array
myinner = []  # inner array, one per database row
# the following nested loop builds that structure
for r in remaining_rows:  # iterate over every selected database row
    myinner = []  # clear the previous row's values
    for i in r:  # iterate over every column of the row
        if isinstance(i, str):
            # string values are encoded to bytes so they can be written
            # to the SPSS file, then appended to the inner array
            myinner.append(i.encode('UTF-8'))
        elif i is None:
            # replace a NULL value with an empty (byte) string
            myinner.append(b'')
Example #18
writer = csv.writer(open('EMX_attributes.csv', 'w', newline=''), delimiter=',')
writer.writerow(['name', 'label', 'description', 'entity', 'dataType',
                 'refEntity', 'nillable', 'idAttribute'])

#For each file in filePath, check if file exists and ends with the SPSS file extension.
for file in os.listdir(filePath):

    file = filePath + file

    if os.path.isfile(file) and file.endswith('.sav'):

        #Read SPSS file and get header + meta data
        with savReaderWriter.SavReader(file) as reader:
            columns = reader.header

        with savReaderWriter.SavHeaderReader(file) as header:
            metadata = header.all()  # read the header once
            valueLabels = metadata.valueLabels  # only categoricals
            varLabels = metadata.varLabels  # descriptions
            measureLevels = metadata.measureLevels  # all variables

        #Store data for the first two columns of EMX attributes
        for column in columns:
            labelColumn = column.lower().replace('_', ' ')
            column = column.replace(' ', '_')

            if len(column) > 30:
                print(column + " has more than 30 characters!")

            columnList.append(column)
            labelColumnList.append(labelColumn)