# Imports assumed by the converters below (hedged: in the full module
# these may already be present further up the file; helper functions
# such as start_meta, load_json, get_decipher_values, make_delimited_set
# and condense_dichotomous_set are module-local).
import copy
import re
import warnings

import numpy as np
import pandas as pd
import pyreadstat
import savReaderWriter as sr
import xmltodict


def quantipy_from_decipher(decipher_meta, decipher_data, text_key='main'):
    """
    Converts the given Decipher data (which must have been exported in
    tab-delimited format) to Quantipy-ready meta and data.

    Parameters
    ----------
    decipher_meta : str or dict
        Either the path to the Decipher meta document saved as JSON or
        said document read into memory
    decipher_data : str or pandas.DataFrame
        Either the path to the Decipher data saved as tab-delimited
        text or said file read into memory

    Returns
    -------
    meta : dict
        The Quantipy meta document
    data : pandas.DataFrame
        The converted data
    """

    # If they're not already in memory, read in the Decipher meta and
    # data files
    if isinstance(decipher_meta, str):
        dmeta = load_json(decipher_meta)
    if isinstance(decipher_data, str):
        # pd.DataFrame.from_csv was removed from pandas; read_csv with
        # index_col=0 is the equivalent call
        data = pd.read_csv(decipher_data, sep='\t', index_col=0)
        data[data.index.name] = data.index

    meta = start_meta(text_key=text_key)

    quotas = {'vqtable': {}, 'voqtable': {}}

    types_map = {
        'text': 'string',
        'number': 'int',
        'float': 'float',
        'single': 'single',
        'multiple': 'delimited set'
    }

    # Create a list of the compound questions
    compound_questions = [
        question
        for question in dmeta['questions']
        if len(question['variables']) > 1
    ]

    # Get basic variables
    for var in dmeta['variables']:

        # Collect quota variables
        # These will be dealt with later
        is_quota_var = False
        for qtable in ['vqtable', 'voqtable']:
            if qtable in var['vgroup']:
                if var['vgroup'] not in quotas[qtable]:
                    quotas[qtable][var['vgroup']] = []
                quotas[qtable][var['vgroup']].append(var)
                is_quota_var = True
        if is_quota_var:
            # Skip to the next variable; quota columns are constructed
            # at the end by manage_decipher_quota_variables()
            continue

        # Start the column meta for the current variable
        var_name = var['label']
        column = meta['columns'][var_name] = {
            'type': types_map[var['type']],
            'text': {text_key: var['title']}
        }

        # Add meta-mapped path for current column to the 'data file' set
        # object so that the original order of the variables is known
        set_item = 'columns@%s' % (var_name)
        if set_item not in meta['sets']['data file']['items']:
            meta['sets']['data file']['items'].append(set_item)

        if var['type'] == 'single':
            # Get the response values
            column['values'] = get_decipher_values(var['values'], text_key)

    # Manage compound variables (delimited sets, arrays, mixed-type
    # sets)
    for question in compound_questions:

        if question['type'] == 'multiple':
            # Construct delimited set
            meta, data, vgroups, vgroup_variables = make_delimited_set(
                meta, data, question
            )
            # If there's only 1 vgroup then this is a basic multiple-
            # choice question and doesn't require construction as an
            # array or set
            if len(vgroups) == 1:
                continue
        else:
            # vgroups indicate how many groups of discrete variables
            # sit in the question
            # Find the number of variable groups in the set
            vgroups = get_vgroups(question['variables'])
            # For each variable group, get its members
            vgroup_variables = get_vgroup_variables(
                vgroups, question['variables']
            )

        # vgroup_types is used to keep track of the types used in the
        # variable group. This will help us identify mixed-type
        # question groups which are not arrays.
        vgroup_types = get_vgroup_types(vgroups, question['variables'])
        unique_vgroup_types = set(vgroup_types.values())

        # Note if the vgroups use more than one variable type
        mixed_types = len(unique_vgroup_types) > 1

        if mixed_types:
            # A set should be created to bind mixed-type variables
            # together
            vgroup = vgroups[0]
            # Create the set
            mask = meta['sets'][vgroup] = {
                'item type': 'mixed',
                'text': {text_key: question['qtitle']},
                'items': [
                    'columns@%s' % (var['label'])
                    for var in question['variables']
                ]
            }

        if 'multiple' in list(vgroup_types.values()):
            # This is a multiple grid
            # vgroups and vgroup_variables need to be edited to make
            # them useable in the next step
            # This is related to the structure of multiple response
            # variables in Decipher
            multiple_vgroups = [
                vgroup
                for vgroup in vgroups
                if vgroup_types[vgroup] == 'multiple'
            ]
            vgroup_variables = [copy.copy(vgroups)]
            # Strip the trailing row/column suffix (e.g. 'r1', 'c2')
            # from the vgroup name; [cr] rather than [c|r], which would
            # also match a literal '|'
            new_vgroup_match = re.match('(^.+)(?=[cr][0-9]+)', vgroups[0])
            if new_vgroup_match is None:
                continue
            else:
                vgroups = [new_vgroup_match.group(0)]
            vgroup_types[vgroups[0]] = 'multiple'

        # Extract only the vgroups that contain multiple variables
        # so that an array mask can be created for each of them
        array_vgroups = [
            (vgroup, vars)
            for vgroup, vars in zip(vgroups, vgroup_variables)
            if len(vars) > 1
        ]

        # If there are any array-like groups of variables inside the
        # question, add array masks accordingly
        for vgroup, vars in array_vgroups:

            if vgroup in meta['masks']:
                # This was a multiple-choice grid and has already been
                # converted
                continue

            # It's possible the vgroup is in the 'data file' set and
            # needs to be replaced with the names of the group's
            # component vars. This happens with compound questions
            # that are arrays with added open-ends variables
            mapped_vgroup = 'columns@%s' % (vgroup)
            df_items = meta['sets']['data file']['items']
            if mapped_vgroup in df_items:
                mapped_vars = [('columns@%s' % v['label']) for v in vars]
                idx = meta['sets']['data file']['items'].index(mapped_vgroup)
                df_items = df_items[:idx] + mapped_vars + df_items[idx+1:]
                meta['sets']['data file']['items'] = df_items

            # Create the array mask
            mask = meta['masks'][vgroup] = {
                'type': 'array',
                'item type': types_map[vgroup_types[vgroup]],
                'text': {text_key: (
                    '{} - {}'.format(vars[0]['rowTitle'], question['qtitle'])
                    if vgroup_types[vgroup] in ['number', 'float', 'text']
                    else question['qtitle']
                )},
                'items': [
                    {
                        'source': 'columns@{}'.format(var['label']),
                        'text': {text_key: var['rowTitle']}
                    }
                    for var in vars
                ]
            }

            if vgroup_types[vgroup] in ['single', 'multiple']:
                # Create lib values entry
                values_mapper = 'lib@values@%s' % (vgroup)
                meta['masks'][vgroup]['values'] = values_mapper
                if vgroup_types[vgroup] == 'single':
                    values = get_decipher_values(question['values'], text_key)
                elif vgroup_types[vgroup] == 'multiple':
                    values = copy.deepcopy(meta['columns'][vars[0]]['values'])
                meta['lib']['values'][vgroup] = values

                # Use meta-mapped values reference for single or
                # multiple array variables
                for item in mask['items']:
                    col = item['source'].split('@')[-1]
                    if col in meta['columns']:
                        if 'values' in meta['columns'][col]:
                            meta['columns'][col]['values'] = values_mapper

    # Construct quota columns (meta+data)
    meta, data = manage_decipher_quota_variables(meta, data, quotas)

    # Confirm that all meta columns exist in the data
    for col in list(meta['columns'].keys()):
        if col not in data.columns:
            print(
                "Unpaired data warning: {} found in meta['columns']"
                " but not in data.columns. Removing it.".format(col))
            del meta['columns'][col]
            set_item = 'columns@{}'.format(col)
            if set_item in meta['sets']['data file']['items']:
                # list.remove() returns None, so don't assign its result
                meta['sets']['data file']['items'].remove(set_item)

    # Confirm that all data columns exist in the meta
    for col in data.columns:
        if col not in meta['columns']:
            print(
                "Unpaired meta warning: {} found in data.columns"
                " but not in meta['columns']. Removing it.".format(col))
            data.drop(col, axis=1, inplace=True)

    return meta, data
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8',
                     ioUtf8=True, dichot=None, dates_as_strings=False,
                     text_key="main"):
    """ see parse_sav_file doc """

    if dichot is None:
        dichot = {'yes': 1, 'no': 0}

    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                            ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes',
        #  'varRoles', 'measureLevels', 'caseWeightVar', 'varNames',
        #  'varLabels', 'formats', 'multRespDefs', 'columnWidths',
        #  'fileAttributes', 'alignments', 'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(text_key=text_key)
    meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
    meta['info']['from_source'] = {'pandas_reader': 'sav'}
    meta['sets']['data file']['items'] = [
        'columns@{}'.format(varName) for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of the attributes in metadata are located here:
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}
        meta['columns'][column]['name'] = column
        meta['columns'][column]['parent'] = {}
        if column in metadata.valueLabels:
            # valueLabels implies type 'single' (possibly a 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                values = {
                    'text': {text_key: unicode(text)},
                    'value': int(value)
                }
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                f = metadata.formats[column]
                if 'DATETIME' in f:
                    if dates_as_strings:
                        # DATETIME fields from SPSS are currently being
                        # read in as strings because there's an as-yet
                        # undetermined discrepancy between the input and
                        # output dates if datetime64 is used
                        meta['columns'][column]['type'] = 'string'
                    else:
                        meta['columns'][column]['type'] = 'date'
                        data[column] = pd.to_datetime(data[column])
                elif f.startswith('A'):
                    meta['columns'][column]['type'] = 'string'
                elif '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first non-nan value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND int because savReaderWriter
                            # loads them both as float64
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "float"
                            if (data[column].dropna() % 1).sum() == 0:
                                if (data[column].dropna() % 1).unique() == [0]:
                                    try:
                                        data[column] = data[column].astype('int')
                                    except:
                                        pass
                                    meta['columns'][column]['type'] = "int"
                        elif isinstance(value, (unicode, str)):
                            # Strings
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "string"

        if column in metadata.varTypes:
            pass
        if column in metadata.varSets:
            pass
        if column in metadata.varAttributes:
            pass
        if column in metadata.varRoles:
            pass
        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {
                text_key: metadata.varLabels[column]
            }

    for mrset in metadata.multRespDefs:
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            # Multiple category sets are not converted here
            pass
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
            varNames = metadata.multRespDefs[mrset]['varNames']
            # Find the index where the delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            # Generate the delimited set from the dichotomous set
            dls = condense_dichotomous_set(
                data[varNames], values_from_labels=False, **dichot)
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {text_key: metadata.multRespDefs[mrset]['label']},
                'parent': {},
                'values': [{
                    'text': {text_key: metadata.varLabels[varName]},
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(
                df_items.index('columns@{}'.format(varNames[0])),
                'columns@{}'.format(mrset))
            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

    return meta, data
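
# A minimal sketch of how this reader is typically paired with the case
# data. 'example.sav' is a hypothetical path, and the SavReader calls
# are an assumption based on savReaderWriter's documented API.
def _example_extract_sav_meta():
    with sr.SavReader('example.sav', ioUtf8=True) as reader:
        df = pd.DataFrame(reader.all(), columns=reader.varNames)
    # Pass the case data so untyped columns can be inferred from values
    meta, df = extract_sav_meta('example.sav', name='example', data=df)
    return meta, df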
def quantipy_from_decipher(decipher_meta, decipher_data, text_key='main'):
    """
    Converts the given Decipher data (which must have been exported in
    tab-delimited format) to Quantipy-ready meta and data.

    Parameters
    ----------
    decipher_meta : str or dict
        Either the path to the Decipher meta document saved as JSON or
        said document read into memory
    decipher_data : str or pandas.DataFrame
        Either the path to the Decipher data saved as tab-delimited
        text or said file read into memory

    Returns
    -------
    meta : dict
        The Quantipy meta document
    data : pandas.DataFrame
        The converted data
    """

    # If they're not already in memory, read in the Decipher meta and
    # data files
    if isinstance(decipher_meta, (str, unicode)):
        dmeta = load_json(decipher_meta)
    if isinstance(decipher_data, (str, unicode)):
        data = pd.DataFrame.from_csv(decipher_data, sep='\t')

    meta = start_meta(text_key=text_key)

    quotas = {'vqtable': {}, 'voqtable': {}}

    types_map = {
        'text': 'string',
        'number': 'int',
        'float': 'float',
        'single': 'single',
        'multiple': 'delimited set'
    }

    # Get basic variables
    for var in dmeta['variables']:

        # Collect quota variables
        # These will be dealt with later
        is_quota_var = False
        for qtable in ['vqtable', 'voqtable']:
            if qtable in var['vgroup']:
                if var['vgroup'] not in quotas[qtable]:
                    quotas[qtable][var['vgroup']] = []
                quotas[qtable][var['vgroup']].append(var)
                is_quota_var = True
        if is_quota_var:
            # Skip to the next variable; quota columns are constructed
            # at the end by manage_decipher_quota_variables()
            continue

        # Add meta-mapped path for current column to the 'data file' set
        # object so that the original order of the variables is known
        set_item = 'columns@%s' % (var['vgroup'])
        if set_item not in meta['sets']['data file']['items']:
            meta['sets']['data file']['items'].append(set_item)

        # Start the column meta for the current variable
        var_name = var['label']
        column = meta['columns'][var_name] = {
            'type': types_map[var['type']],
            'text': {text_key: var['title']}
        }

        if var['type'] == 'single':
            # Get the response values
            column['values'] = get_decipher_values(var['values'], text_key)

    # Create generator for compound questions
    compound_questions = (
        question
        for question in dmeta['questions']
        if len(question['variables']) > 1
    )

    # Manage compound variables (delimited sets, arrays, mixed-type
    # sets)
    for question in compound_questions:

        if question['type'] == 'multiple':
            # Construct delimited set
            meta, data, vgroups, vgroup_variables = make_delimited_set(
                meta, data, question
            )
            # If there's only 1 vgroup then this is a basic multiple-
            # choice question and doesn't require construction as an
            # array or set
            if len(vgroups) == 1:
                continue
        else:
            # vgroups indicate how many groups of discrete variables
            # sit in the question
            # Find the number of variable groups in the set
            vgroups = get_vgroups(question['variables'])
            # For each variable group, get its members
            vgroup_variables = get_vgroup_variables(
                vgroups, question['variables']
            )

        # vgroup_types is used to keep track of the types used in the
        # variable group. This will help us identify mixed-type
        # question groups which are not arrays.
        vgroup_types = get_vgroup_types(vgroups, question['variables'])
        unique_vgroup_types = set(vgroup_types.values())

        # Note if the vgroups use more than one variable type
        mixed_types = len(unique_vgroup_types) > 1

        if mixed_types:
            # A set should be created to bind mixed-type variables
            # together
            vgroup = vgroups[0]
            # Create the set
            mask = meta['sets'][vgroup] = {
                'item type': 'mixed',
                'text': {text_key: question['qtitle']},
                'items': [
                    'columns@%s' % (var['label'])
                    for var in question['variables']
                ]
            }

        if 'multiple' in vgroup_types.values():
            # This is a multiple grid
            # vgroups and vgroup_variables need to be edited to make
            # them useable in the next step
            # This is related to the structure of multiple response
            # variables in Decipher
            multiple_vgroups = [
                vgroup
                for vgroup in vgroups
                if vgroup_types[vgroup] == 'multiple'
            ]
            vgroup_variables = [copy.copy(vgroups)]
            # Strip the trailing row/column suffix (e.g. 'r1', 'c2')
            # from the vgroup name; [cr] rather than [c|r], which would
            # also match a literal '|'
            new_vgroup_match = re.match('(^.+)(?=[cr][0-9]+)', vgroups[0])
            if new_vgroup_match is None:
                continue
            else:
                vgroups = [new_vgroup_match.group(0)]
            vgroup_types[vgroups[0]] = 'multiple'

        # Extract only the vgroups that contain multiple variables
        # so that an array mask can be created for each of them
        array_vgroups = [
            (vgroup, vars)
            for vgroup, vars in zip(vgroups, vgroup_variables)
            if len(vars) > 1
        ]

        # If there are any array-like groups of variables inside the
        # question, add array masks accordingly
        for vgroup, vars in array_vgroups:

            # It's possible the vgroup is in the 'data file' set and
            # needs to be replaced with the names of the group's
            # component vars. This happens with compound questions
            # that are arrays with added open-ends variables
            mapped_vgroup = 'columns@%s' % (vgroup)
            df_items = meta['sets']['data file']['items']
            if mapped_vgroup in df_items:
                mapped_vars = [('columns@%s' % v['label']) for v in vars]
                idx = meta['sets']['data file']['items'].index(mapped_vgroup)
                df_items = df_items[:idx] + mapped_vars + df_items[idx+1:]
                meta['sets']['data file']['items'] = df_items

            # Create the array mask
            mask = meta['masks'][vgroup] = {
                'type': 'array',
                'item type': types_map[vgroup_types[vgroup]],
                'text': {text_key: (
                    '%s - %s' % (vars[0]['rowTitle'], question['qtitle'])
                    if vgroup_types[vgroup] in ['number', 'float', 'text']
                    else question['qtitle']
                )},
                'items': [
                    'columns@%s' % (
                        var if vgroup_types[vgroup] == 'multiple'
                        else var['label']
                    )
                    for var in vars
                ]
            }

            if vgroup_types[vgroup] in ['single', 'multiple']:
                # Create lib values entry
                values_mapping = 'lib@values@%s' % (vgroup)
                if vgroup_types[vgroup] == 'single':
                    values = get_decipher_values(question['values'], text_key)
                elif vgroup_types[vgroup] == 'multiple':
                    values = copy.deepcopy(meta['columns'][vars[0]]['values'])
                meta['lib']['values'][vgroup] = values

                # Use meta-mapped values reference for single or
                # multiple array variables
                for item in mask['items']:
                    col = item.split('@')[-1]
                    meta['columns'][col]['values'] = values_mapping

    # Construct quota columns (meta+data)
    meta, data = manage_decipher_quota_variables(meta, data, quotas)

    return meta, data
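
# The '(^.+)(?=[cr][0-9]+)' pattern used above strips Decipher's
# trailing row/column suffix from a vgroup name, e.g. 'Q5r1' -> 'Q5'.
# A small self-contained check (the variable names are illustrative):
def _example_vgroup_prefix():
    for label in ['Q5r1', 'Q5r12', 'Q7c3']:
        match = re.match('(^.+)(?=[cr][0-9]+)', label)
        print('{} -> {}'.format(label, match.group(0) if match else None))
    # prints: Q5r1 -> Q5, Q5r12 -> Q5, Q7c3 -> Q7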
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8',
                     ioUtf8=True):
    """ see parse_sav_file doc """

    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                            ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes',
        #  'varRoles', 'measureLevels', 'caseWeightVar', 'varNames',
        #  'varLabels', 'formats', 'multRespDefs', 'columnWidths',
        #  'fileAttributes', 'alignments', 'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(name=name)
    meta['info']['text'] = 'Converted from SAV file %s.' % (name)
    meta['info']['from_source'] = {'pandas_reader': 'sav'}
    meta['sets']['data file']['items'] = [
        'columns@%s' % (varName) for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of the attributes in metadata are located here:
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}
        if column in metadata.valueLabels:
            # valueLabels implies type 'single' (possibly a 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                values = {
                    'text': {'main': unicode(text)},
                    'value': unicode(int(value))
                }
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                f = metadata.formats[column]
                if '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first non-nan value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND int because savReaderWriter
                            # loads them both as float64
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "float"
                            if (data[column].dropna() % 1).sum() == 0:
                                if (data[column].dropna() % 1).unique() == [0]:
                                    try:
                                        data[column] = data[column].astype('int')
                                    except:
                                        pass
                                    meta['columns'][column]['type'] = "int"
                        elif isinstance(value, (unicode, str)):
                            # Strings
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "string"

        if column in metadata.varTypes:
            pass
        if column in metadata.varSets:
            pass
        if column in metadata.varAttributes:
            pass
        if column in metadata.varRoles:
            pass
        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {
                'main': metadata.varLabels[column]
            }

    for mrset in metadata.multRespDefs:
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            # Multiple category sets are not converted here
            pass
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
            varNames = metadata.multRespDefs[mrset]['varNames']
            # Generate the delimited set from the dichotomous set
            data[mrset] = condense_dichotomous_set(
                data[varNames], values_from_labels=False)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'type': 'delimited set',
                'text': {'main': metadata.multRespDefs[mrset]['label']},
                'values': [
                    {
                        'text': {'main': metadata.varLabels[varName]},
                        'value': v
                    }
                    for v, varName in enumerate(varNames, start=1)
                ]
            }
            # Replace the dichotomous columns with the new delimited
            # set in the 'data file' set
            idx = meta['sets']['data file']['items'].index(
                'columns@%s' % (varNames[0]))
            items = meta['sets']['data file']['items']
            meta['sets']['data file']['items'] = (
                items[:idx]
                + ['columns@%s' % (mrset)]
                + items[idx + len(varNames):]
            )

    return meta, data
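
# A minimal sketch of the dichotomous-to-delimited-set transformation
# that condense_dichotomous_set() performs above. Assumptions: 1 marks
# a selected category, and the result uses the ';'-terminated Quantipy
# delimited-set format seen elsewhere in this module.
def _example_condense_dichotomous():
    dichot_df = pd.DataFrame(
        {'Q1_1': [1, 0, 1], 'Q1_2': [0, 0, 1], 'Q1_3': [1, 1, 0]})
    dls = dichot_df.apply(
        lambda row: ';'.join(
            str(v) for v, flag in enumerate(row, start=1) if flag == 1
        ) + ';',
        axis=1)
    print(dls.tolist())  # ['1;3;', '3;', '1;2;']
    return dls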
def quantipy_from_ascribe(path_xml, path_txt, text_key='main'):

    # Read the Ascribe meta (XML) and data (tab-delimited, UTF-16)
    with open(path_xml) as xml_file:
        meta_ascribe = xmltodict.parse(xml_file.read())
    data_ascribe = pd.DataFrame.from_csv(
        path_txt, sep='\t', header=0, encoding='utf-16'
    )

    # Start a Quantipy meta document
    meta = start_meta(text_key=text_key)
    meta['columns']['responseid'] = {
        'type': 'int',
        'text': {text_key: 'responseid'}
    }

    # Container to record the names, in order, of the resulting
    # coded columns
    coded_names = []
    for var in meta_ascribe['CodedQuestions']['MultiForm']:
        name = var['Name']
        coded_names.append(name)
        coded_from = var['FormTexts']['FormText']['Title']
        var_text = var['FormTexts']['FormText']['Text']
        if var_text is None:
            var_text = 'Label not provided'
        var_text = {text_key: var_text}
        columns = []
        values = []
        for val in var['Answers']['Answer']:
            value = int(val['@Precode'])
            if value == 0:
                msg = (
                    "The value 0 has been assigned to a code for the"
                    " variable '%s'."
                ) % (name)
                warnings.warn(msg)
            val_text = val['Texts']['Text']['#text']
            if val_text is None:
                val_text = 'Label not provided'
            val_text = {text_key: val_text}
            values.append({'value': value, 'text': val_text})
            columns.append('%s_%s' % (name, value))

        # Create a single series from the dichotomous set
        data_ascribe[name] = condense_dichotomous_set(
            data_ascribe[columns], sniff_single=True
        )

        # Determine the Quantipy type of the returned series (see
        # 'sniff_single' in condense_dichotomous_set()): if no row has
        # more than one selection, treat it as a single
        if data_ascribe[columns].sum(axis=1).max() == 1:
            col_type = 'single'
        else:
            col_type = 'delimited set'

        # Create the new Quantipy column meta
        column = {
            'type': col_type,
            'text': var_text,
            'values': values
        }

        # Add the newly defined column to the Quantipy meta
        meta['columns'][name] = column

    meta['sets']['data file']['items'] = [
        'columns@%s' % (col_name) for col_name in coded_names
    ]

    # Keep only the slice that has been converted.
    data = data_ascribe[coded_names]

    return meta, data
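
# A minimal usage sketch for the Ascribe converter above; both paths
# are hypothetical. Note the data export is expected to be UTF-16,
# tab-delimited text.
def _example_quantipy_from_ascribe():
    meta, data = quantipy_from_ascribe(
        'project/codes.xml',      # hypothetical Ascribe XML metadata
        'project/verbatims.txt',  # hypothetical UTF-16 tab-delimited data
        text_key='main'
    )
    return meta, data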
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8',
                     ioUtf8=True, dichot=None, dates_as_strings=False,
                     text_key="en-GB", engine='savReaderWriter'):
    """ see parse_sav_file doc """

    if engine == 'readstat':
        df, metadata = pyreadstat.read_sav(
            sav_file,
            encoding=ioLocale.split(".")[-1],
            metadataonly=True)

        meta = start_meta(text_key=text_key)
        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName)
            for varName in metadata.column_names
        ]

        for index, column in enumerate(metadata.column_names):
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.variable_value_labels:
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.variable_value_labels[column].items():
                    values = {
                        'text': {text_key: str(text)},
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
                # If the user has stored single-answer data as strings
                # rather than numbers, convert to floats, coercing
                # anything non-convertible to NaN
                if column in data.columns and data[column].dtype == 'O':
                    data[column] = pd.to_numeric(
                        data[column], errors='coerce', downcast='float')
            else:
                if column in metadata.original_variable_types:
                    f = metadata.original_variable_types[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's
                            # an as-yet undetermined discrepancy between
                            # the input and output dates if datetime64
                            # is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"

            # Add the variable label to the meta
            meta['columns'][column]['text'] = {
                text_key: metadata.column_labels[index]
            }

        return meta, data

    elif engine == 'savReaderWriter':
        if dichot is None:
            dichot = {'yes': 1, 'no': 0}

        with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                                ioUtf8=ioUtf8) as header:
            # Metadata Attributes
            # ['valueLabels', 'varTypes', 'varSets', 'varAttributes',
            #  'varRoles', 'measureLevels', 'caseWeightVar', 'varNames',
            #  'varLabels', 'formats', 'multRespDefs', 'columnWidths',
            #  'fileAttributes', 'alignments', 'fileLabel',
            #  'missingValues']
            metadata = header.dataDictionary(True)

        meta = start_meta(text_key=text_key)
        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName) for varName in metadata.varNames
        ]

        # This should probably be somewhere in the metadata
        # weight_variable_name = metadata.caseWeightVar

        # Descriptions of the attributes in metadata are located here:
        # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
        for column in metadata.varNames:
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.valueLabels:
                # valueLabels implies type 'single' (possibly a 1-1 map)
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.valueLabels[column].items():
                    values = {
                        'text': {text_key: str(text)},
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
            else:
                if column in metadata.formats:
                    f = metadata.formats[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's
                            # an as-yet undetermined discrepancy between
                            # the input and output dates if datetime64
                            # is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"
                else:
                    # Infer meta from data
                    if data is not None:
                        column_values = data[column].dropna()
                        if len(column_values) > 0:
                            # Get the first non-nan value from the column
                            value = column_values.values[0]
                            if isinstance(value, np.float64):
                                # Float AND int because savReaderWriter
                                # loads them both as float64
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "float"
                                if (data[column].dropna() % 1).sum() == 0:
                                    if (data[column].dropna() % 1).unique() == [0]:
                                        try:
                                            data[column] = data[column].astype('int')
                                        except:
                                            pass
                                        meta['columns'][column]['type'] = "int"
                            elif isinstance(value, str):
                                # Strings
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "string"

            if column in metadata.varTypes:
                pass
            if column in metadata.varSets:
                pass
            if column in metadata.varAttributes:
                pass
            if column in metadata.varRoles:
                pass
            if column in metadata.measureLevels:
                pass

            # Some labels are empty strings.
            if column in metadata.varLabels:
                meta['columns'][column]['text'] = {
                    text_key: metadata.varLabels[column]
                }

        for mrset in metadata.multRespDefs:
            # 'D' is "multiple dichotomy sets" in SPSS
            # 'C' is "multiple category sets" in SPSS
            varNames = list(metadata.multRespDefs[mrset]['varNames'])
            # Find the index where the delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            if metadata.multRespDefs[mrset]['setType'] == 'C':
                # Raise if the columns' value objects are not all equal
                if not all(meta['columns'][v]['values'] ==
                           meta['columns'][varNames[0]]['values']
                           for v in varNames):
                    msg = ('Columns must have equal values to be combined'
                           ' in a set: {}')
                    raise ValueError(msg.format(varNames))
                # Concatenate the columns into a delimited set
                df_str = data[varNames].astype('str')
                dls = df_str.apply(lambda x: ';'.join([
                    v.replace('.0', '')
                    for v in x.tolist()
                    if v not in ['nan', 'None']
                ]), axis=1) + ';'
                dls.replace({';': np.nan}, inplace=True)
                # Get the value object
                values = meta['columns'][varNames[0]]['values']
            elif metadata.multRespDefs[mrset]['setType'] == 'D':
                # Generate the delimited set from the dichotomous set
                dls = condense_dichotomous_set(
                    data[varNames], values_from_labels=False, **dichot)
                # Get the value object
                values = [{
                    'text': {text_key: metadata.varLabels[varName]},
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            else:
                continue
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {text_key: metadata.multRespDefs[mrset]['label']},
                'parent': {},
                'values': values
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(
                df_items.index('columns@{}'.format(varNames[0])),
                'columns@{}'.format(mrset))
            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

        return meta, data
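
# A minimal sketch of the engine switch above. 'example.sav' is a
# hypothetical path; 'readstat' extracts the metadata via pyreadstat,
# while 'savReaderWriter' also expects the case data for type inference
# and response-set condensing (the SavReader calls are an assumption
# based on savReaderWriter's documented API).
def _example_extract_sav_meta_engines():
    meta_rs, _ = extract_sav_meta(
        'example.sav', name='example', data=pd.DataFrame(),
        engine='readstat')
    with sr.SavReader('example.sav', ioUtf8=True) as reader:
        df = pd.DataFrame(reader.all(), columns=reader.varNames)
    meta_srw, df = extract_sav_meta(
        'example.sav', name='example', data=df,
        engine='savReaderWriter')
    return meta_rs, meta_srw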