def _add_oe(self, oe, break_by, title, drop_empty, incl_nan, replacements,
            filter_by):
    columns = break_by + oe
    oe_data = self._data.copy()
    if self.filter != 'no_filter':
        ds = qp.DataSet('open_ends')
        ds.from_components(oe_data, self._meta)
        slicer = ds.take(list(self.filter.values())[0])
        oe_data = oe_data.loc[slicer, :]
    if filter_by:
        ds = qp.DataSet('open_ends')
        ds.from_components(oe_data, self._meta)
        slicer = ds.take(filter_by)
        oe_data = oe_data.loc[slicer, :]
    oe_data = oe_data[columns]
    oe_data.replace('__NA__', np.NaN, inplace=True)
    if replacements:
        for target, repl in replacements.items():
            oe_data.replace(target, repl, inplace=True)
    if drop_empty:
        oe_data.dropna(subset=oe, how='all', inplace=True)
    if not incl_nan:
        for col in oe:
            oe_data[col].replace(np.NaN, '', inplace=True)
    self.verbatims[title] = oe_data
    self.verbatim_names.extend(oe)
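# --- hedged usage sketch (not part of the original source) ---
# _add_oe is private; quantipy's public entry point for verbatims is
# Batch.add_open_ends, which forwards to it. The batch and variable names
# below ('verbatims_demo', 'q8a', 'gender') are hypothetical.
def _example_open_ends(dataset):
    batch = dataset.add_batch('verbatims_demo')
    batch.add_open_ends(['q8a'], break_by=['gender'], drop_empty=True,
                        incl_nan=False, replacements={'n/a': ''},
                        title='open ends')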
def test_read_spss_readstat(self):
    dataset_v1 = qp.DataSet('spss')
    dataset_v1.read_spss('tests/Example Data (A) - with multi choice q2.sav')
    dataset = qp.DataSet('spss')
    dataset.read_spss('tests/Example Data (A) - with multi choice q2.sav',
                      engine='readstat')
    # the label of the set is lost as this engine doesn't support delimited sets
    dataset.to_delimited_set('q2', dataset_v1.text('q2'), dataset.find('q2_'))
    self.assertTrue(dataset.meta('q2').shape == (8, 3))
    assert dataset.crosstab('q2').equals(dataset_v1.crosstab('q2'))
    assert dataset.crosstab('q2b').equals(dataset_v1.crosstab('q2b'))
def _load_ds(self, name):
    path_json = '{}/{}.json'.format(self.path, name)
    path_csv = '{}/{}.csv'.format(self.path, name)
    dataset = qp.DataSet(name, self._dimensions_comp)
    dataset.set_verbose_infomsg(False)
    dataset.read_quantipy(path_json, path_csv)
    return dataset
def test_read_from_api():
    dataset_from_api = qp.DataSet("confirmit")
    dataset_from_api.read_confirmit_api(
        projectid="p913481003361",
        public_url="https://ws.euro.confirmit.com/",
        idp_url="https://idp.euro.confirmit.com/",
        client_id="71a15e5d-b52d-4534-b54b-fa6e2a9da8a7",
        client_secret="2a943d4d-58ab-42b8-a276-53d07ad34064",
        schema_vars='status,q39,q21',
        schema_filter="response:status='complete'")
    print(dataset_from_api.meta('q39'))
    assert dataset_from_api.crosstab('q39').shape == (3, 1)
    print(dataset_from_api.meta('q21'))
    assert dataset_from_api.crosstab('q21').shape == (6, 1)
    print(dataset_from_api.crosstab('q39', 'q21'))
    assert dataset_from_api.crosstab('q39', 'q21').shape == (3, 5)
    assert dataset_from_api.meta()['columns']['q39'] == json.loads("""
        {"name": "q39",
         "parent": {},
         "type": "single",
         "values": [
            {"text": {"en": "yes"}, "value": 1},
            {"text": {"en": "no"}, "value": 2}],
         "text": {"en": "Use script to set values"}}""")
def _get_dataset(self):
    path = os.path.dirname(os.path.abspath(__file__)) + '/'
    name = 'Example Data (A)'
    casedata = '{}.csv'.format(name)
    metadata = '{}.json'.format(name)
    dataset = qp.DataSet(name)
    dataset.set_verbose_infomsg(False)
    dataset.read_quantipy(path + metadata, path + casedata)
    return dataset
def test_writer_to_api():
    dataset = qp.DataSet("confirmit")
    response = dataset.write_confirmit_api(
        projectid="p913481003361",
        public_url="https://ws.euro.confirmit.com/",
        idp_url="https://idp.euro.confirmit.com/",
        client_id="71a15e5d-b52d-4534-b54b-fa6e2a9da8a7",
        client_secret="2a943d4d-58ab-42b8-a276-53d07ad34064",
        schema_vars=["q7", "q9", "q11"])
    assert response.status_code == 200
    assert b'insertedRecords' in response.content
    assert b'updatedRecords' in response.content
    print(response.content)
def process_file():
    try:
        # use uid to track multiple instances in separate tabs/windows
        uid = str(uuid.uuid4())
        # reset and initialize session
        session[uid] = {}
        session[uid]['active'] = True
        file = request.files['file']  # .sav file the user selected
        filename = file.filename
        print("name of file: " + filename)
        session[uid]['filename'] = filename
        # create temp directory if it does not exist
        if not os.path.isdir('./temp'):
            print('Creating temp directory')
            os.makedirs('./temp', exist_ok=True)
        file.save('./temp/%s' % filename)
        print("Temporary file saved")
        # read dataset from the saved .sav file
        ds = qp.DataSet('data')
        ds.read_spss('./temp/' + filename, ioLocale=None, detect_dichot=False)
        # get file metadata
        meta_data = ds.meta()['columns'].values()
        meta_data = list(meta_data)  # try to remove this to make upload faster
        if 'uuid' not in ds.meta()['columns']:
            return {'success': 'false',
                    'message': 'This file does not have a uuid variable.'}
        return {'success': 'true',
                'uid': uid,
                'meta_data_array': meta_data,
                'meta_data_obj': ds.meta()['columns']}
    except Exception as e:
        print(e)
        print(sys.exc_info()[0])
        traceback.print_exc()
        return {'success': 'false'}
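# --- hedged usage sketch (not part of the original source) ---
# process_file is a Flask handler expecting a multipart upload in the form
# field 'file'. The route path and host below are hypothetical; use whatever
# route the app actually registers for this handler.
import requests

def _example_upload(sav_path):
    with open(sav_path, 'rb') as f:
        resp = requests.post('http://localhost:5000/process_file',
                             files={'file': f})
    return resp.json()  # {'success': 'true', 'uid': ..., ...} on success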
def to_dataset(self, mode=None, from_set="data file", additions="sort_within",
               manifest_edits="keep", integrate_rc=(["_rc", "_rb"], True),
               misc=["RecordNo", "caseid", "identity"]):
    """
    Create a qp.DataSet instance out of the batch settings.

    Parameters
    ----------
    mode: list of str {'x', 'y', 'v', 'oe', 'w', 'f'}
        Variables to keep.
    from_set: str or list of str, default 'data file'
        Set name or a list of variables to sort against.
    additions: str {'sort_within', 'sort_between', False}
        Add variables from additional batches.
    manifest_edits: str {'keep', 'apply', False}
        Keep meta from edits or apply rules.
    """
    batches = self._meta['sets']['batches']
    adds = batches[self.name]['additions']

    # prepare variable list
    if not mode:
        mode = ['x', 'y', 'v', 'oe', 'w', 'f']
    vlist = self._get_vlist(batches[self.name], mode)
    if additions == "sort_between":
        for add in adds:
            vlist += self._get_vlist(batches[add], mode)
    if not from_set:
        from_set = vlist
    vlist = self.align_order(vlist, from_set, integrate_rc, fix=misc)
    if additions == "sort_within":
        for add in adds:
            add_list = self._get_vlist(batches[add], mode)
            add_list = self.align_order(add_list, from_set, integrate_rc,
                                        fix=misc)
            vlist += add_list
    vlist = self.de_duplicate(vlist)
    vlist = self.roll_up(vlist)

    # handle filters
    merge_f = False
    f = self.filter
    if adds:
        filters = [self.filter] + [batches[add]['filter'] for add in adds]
        filters = [fi for fi in filters if fi]
        if len(filters) == 1:
            f = filters[0]
        elif not self.compare_filter(filters[0], filters[1:]):
            f = "merge_filter"
            merge_f = filters
        else:
            f = filters[0]

    # create ds
    ds = qp.DataSet(self.name, self._dimensions_comp)
    ds.from_components(self._data.copy(), org_copy.deepcopy(self._meta),
                       True, self.language)
    for b in ds.batches():
        if not (b in adds or b == ds.name):
            del ds._meta['sets']['batches'][b]
    if merge_f:
        ds.merge_filter(f, filters)
        if not manifest_edits:
            vlist.append(f)
    if f and manifest_edits:
        ds.filter(self.name, {f: 0}, True)
        if merge_f:
            ds.drop(f)
    ds.create_set(str(self.name), included=vlist, overwrite=True)
    ds.subset(from_set=self.name, inplace=True)
    ds.order(vlist)

    # manifest edits
    if manifest_edits in ['apply', 'keep']:
        b_meta = batches[self.name]['meta_edits']
        for v in ds.variables():
            if ds.is_array(v) and b_meta.get(v):
                ds._meta['masks'][v] = b_meta[v]
                try:
                    ds._meta['lib']['values'][v] = b_meta['lib'][v]
                except KeyError:
                    pass
            elif b_meta.get(v):
                ds._meta['columns'][v] = b_meta[v]
            if manifest_edits == "apply" and not ds._is_array_item(v):
                for axis in ['x', 'y']:
                    if all(rule in ds._get_rules(v, axis)
                           for rule in ['dropx', 'slicex']):
                        drops = ds._get_rules(v, axis)['dropx']['values']
                        slicer = ds._get_rules(v, axis)['slicex']['values']
                    elif 'dropx' in ds._get_rules(v, axis):
                        drops = ds._get_rules(v, axis)['dropx']['values']
                        slicer = ds.codes(v)
                    elif 'slicex' in ds._get_rules(v, axis):
                        drops = []
                        slicer = ds._get_rules(v, axis)['slicex']['values']
                    else:
                        drops = slicer = []
                    if drops or slicer:
                        if not all(isinstance(c, int) for c in drops):
                            item_no = [ds.item_no(d) for d in drops]
                            ds.remove_items(v, item_no)
                        else:
                            codes = ds.codes(v)
                            n_codes = [c for c in slicer if c not in drops]
                            if not len(n_codes) == len(codes):
                                remove = [c for c in codes
                                          if c not in n_codes]
                                ds.remove_values(v, remove)
                            ds.reorder_values(v, n_codes)
                            if ds.is_array(v):
                                ds._meta['masks'][v].pop('rules')
                            else:
                                ds._meta['columns'][v].pop('rules')
    return ds
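# --- hedged usage sketch (not part of the original source) ---
# to_dataset condenses a batch definition back into a standalone qp.DataSet.
# The batch name 'main' is hypothetical; get_batch is quantipy's accessor
# for a batch defined on a DataSet.
def _example_to_dataset(dataset):
    batch = dataset.get_batch('main')
    return batch.to_dataset(mode=['x', 'y', 'w'], additions='sort_within',
                            manifest_edits='apply')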
def test_read_spss(self):
    dataset = qp.DataSet('spss')
    dataset.read_spss('tests/Example Data (A) - with multi choice q2.sav')
    self.assertTrue(dataset.meta('q2').shape == (8, 3))
def dataset():
    _dataset = qp.DataSet(NAME_PROJ, dimensions_comp=False)
    _dataset.read_quantipy(PATH_META, PATH_DATA)
    yield _dataset.split()
    del _dataset
def main(spss_filename):
    """
    Purpose: Run program from terminal
    Parameters: spss_filename - name of .sav file, extension not included
    Returns: None
    """
    ### OPEN SPSS DATA
    # edit main() based on desired weighting scheme
    ds = qp.DataSet(spss_filename)
    # ds.read_spss('./' + spss_filename, ioLocale=None, detect_dichot=False)
    ds.read_spss(
        'C://Users//Jamie Smart//Dropbox (Latitude)//Active Projects//'
        'AETN - Lifetime - KFC (10478)//Fieldwork//data management//'
        + spss_filename,
        ioLocale=None, detect_dichot=False)
    scheme = create_scheme()

    ### ADD TARGETS TO SCHEME
    tv_only_targets = []
    tv_social_targets = []
    # add_target(name, variable, dictionary_of_targets, target_list_to_add_to)
    add_target("age_targets", "S1_RC", {1: 46.6, 2: 53.4}, tv_only_targets)
    add_target("age_targets", "S1_RC", {1: 66.0, 2: 34.0}, tv_social_targets)
    add_target("gender_targets", "S2", {1: 32.4, 2: 67.6}, tv_only_targets)
    add_target("gender_targets", "S2", {1: 17.0, 2: 83.0}, tv_social_targets)
    add_target("ethnicity_targets", "Ethnicity",
               {1: 80.4, 2: 4.7, 3: 8.8, 4: 6.1}, tv_only_targets)
    add_target("ethnicity_targets", "Ethnicity",
               {1: 54.2, 2: 10.5, 3: 17.0, 4: 18.3}, tv_social_targets)
    add_target("usage_targets", "A10r1", {0: 54.1, 1: 45.9}, tv_only_targets)
    add_target("usage_targets", "A10r1", {0: 35.9, 1: 64.1}, tv_social_targets)
    add_target("frequency_targets", "A11_KFC", {0: 73.0, 1: 27.0},
               tv_only_targets)
    add_target("frequency_targets", "A11_KFC", {0: 52.9, 1: 47.1},
               tv_social_targets)
    # apply_targets(scheme, tv_only_targets)
    # apply_targets(scheme, tv_social_targets)

    ds_group = ds[['CellSocial', 'S1_RC', 'S2', 'Ethnicity', 'A10r1',
                   'A11_KFC']]
    add_group(ds_group, scheme, "tv only primary", "CellSocial", "2",
              tv_only_targets)
    add_group(ds_group, scheme, "tv only secondary", "CellSocial", "3",
              tv_only_targets)
    add_group(ds_group, scheme, "tv+social primary", "CellSocial", "5",
              tv_social_targets)
    add_group(ds_group, scheme, "tv+social secondary", "CellSocial", "6",
              tv_social_targets)

    ds.weight(scheme, weight_name="weight", unique_key='uuid')
    ds["weight"].fillna(1, inplace=True)
    check_weights(ds, ['S1_RC', 'S2', 'Ethnicity', 'A10r1', 'A11_KFC'],
                  group='CellSocial')
    save_file(
        ds,
        'C://Users//Jamie Smart//Dropbox (Latitude)//Active Projects//'
        'AETN - Lifetime - KFC (10478)//Fieldwork//data management//'
        + spss_filename + '_weighted.sav')
    save_syntax_file(
        ds,
        'C://Users//Jamie Smart//Dropbox (Latitude)//Active Projects//'
        'AETN - Lifetime - KFC (10478)//Fieldwork//data management//'
        + spss_filename + '_syntax.sps')
def weight_data(variables, mapping, grouping, file_name, weight_name="weight",
                unique_key="uuid"):
    """
    Purpose: Takes variables and groupings and uses them to weight the dataset
    Parameters:
        variables - array of variables to use for weighting
        mapping - dict with weighting targets for each variable in variables
        grouping - None or a metadata dict for a variable to group by
        file_name - name of original data file
        weight_name - name for weight variable
        unique_key - name of the id variable used to merge weights back
    Returns:
        file_location - location to save weighted file to
        syntax_location - location to save syntax file to
        crosstabs - unweighted and weighted crosstabs of the target variables
        report - weighting summary
    """
    # text description of weighting scheme
    weighting_desc = ("* Weighting targets: " + str(mapping) +
                      "\n* Grouping variables: " +
                      (str(grouping['name']) if grouping else "None") + "\n\n")
    ds = qp.DataSet('data')
    ds.read_spss('./temp/' + file_name, ioLocale=None, detect_dichot=False)
    scheme = create_scheme("rake_scheme")

    ### ADD TARGETS TO SCHEME
    all_targets = []
    for i, target_var in enumerate(variables):
        target_dict = mapping[target_var]
        # JSON keys arrive as strings; replace each one with its integer form
        for key in list(target_dict.keys()):
            target_dict[int(key)] = target_dict.pop(key)
        add_target("targets_" + str(i), target_var, target_dict, all_targets)
    apply_targets(scheme, all_targets)

    if grouping:
        grouping_var = grouping['name']
        for v in grouping['values']:
            val = v['value']
            label = list(v['text'].values())[0]
            # create a smaller version of ds to pass to add_group
            ds_minified = ds[variables + [grouping_var]]
            add_group(ds_minified, scheme, label, grouping_var, val,
                      all_targets)
        grouping = grouping_var
    else:
        grouping = None  # set for crosstabs

    ds.weight(scheme, weight_name=weight_name, unique_key=unique_key)
    ds[weight_name].fillna(1, inplace=True)
    ds.meta()['measureLevels'][weight_name] = 'scale'

    ### Relabel weight
    weight_label_key = list(ds.meta()['columns'][weight_name]['text'])[0]
    ds.meta()['columns'][weight_name]['text'][weight_label_key] = weight_name

    ### Name the saved weighted dataset after the time it was created
    dt = str(datetime.now()).replace(':', '').replace('.', '')
    base = file_name.replace(".sav", "") + "_" + dt.replace(" ", "_")
    file_location = './temp/' + base + '_weighted.sav'
    syntax_location = './temp/' + base + '_weight.sps'

    crosstabs = check_weights(ds, variables, group=grouping,
                              weight=weight_name)
    report = generate_report(scheme)
    save_file(ds, file_location)
    save_syntax_file(ds, syntax_location, weight_name, unique_key,
                     weighting_desc)
    return file_location, syntax_location, crosstabs, report
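# --- hedged usage sketch (not part of the original source) ---
# Targets arrive JSON-shaped (string keys) and grouping as a metadata dict;
# every name and number below is illustrative only.
def _example_weight_data():
    mapping = {'gender': {'1': 48.0, '2': 52.0}}
    grouping = {'name': 'region',
                'values': [{'value': 1, 'text': {'en': 'North'}},
                           {'value': 2, 'text': {'en': 'South'}}]}
    return weight_data(['gender'], mapping, grouping, 'survey.sav')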
def confirmit_dataset():
    dataset = qp.DataSet("confirmit")
    dataset.read_confirmit_from_files('tests/confirmit_meta.json',
                                      'tests/confirmit_data.json')
    return dataset
def quantipy_dataset():
    dataset = qp.DataSet("quantipy test data")
    dataset.read_quantipy('tests/Example Data (A).json',
                          'tests/Example Data (A).csv')
    return dataset
weighting_column = 'weighting'
if len(sys.argv) > 2:
    data_dir = sys.argv[1]
    weighting_column = sys.argv[2]

weights_filename = 'weights_def.json'
results_filename = 'results.json'
schema_filename = 'schema.json'
data_filename = 'data.json'

schema_file = os.path.join(data_dir, schema_filename)
data_file = os.path.join(data_dir, data_filename)
weights_file = os.path.join(data_dir, weights_filename)
results_file = os.path.join(data_dir, 'results', results_filename)

# READ DATA AND SCHEMA
dataset = qp.DataSet("test")
dataset.read_confirmit_from_files(schema_file, data_file)

# READ WEIGHTS DEF
with open(weights_file, "r") as weights_json:
    weights_def = json.load(weights_json)

# DEFINE WEIGHTS
scheme = qp.Rim('w')
group_targets = {}
for group_def in weights_def:
    var_targets = []
    for target_def in group_def['targets']:
        targets = dict()
        for var_target_def in target_def['targets']:
            targets.update({var_target_def['code']: var_target_def['target']})
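# --- hedged sketch (not part of the original source, which is truncated
# above). Following quantipy's weighting docs, per-variable targets are
# typically attached to the Rim scheme and the weight is then run; the
# variable name 'gender' and its targets below are illustrative only.
# scheme.set_targets(targets=[{'gender': {1: 45.6, 2: 54.4}}],
#                    group_name='basic weights')
# dataset.weight(scheme, weight_name='w', unique_key='identity')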
def confirmit_dataset_verbose():
    dataset = qp.DataSet("confirmit")
    dataset.read_confirmit_from_files('tests/confirmit_meta.json',
                                      'tests/confirmit_data.json',
                                      verbose=True)
    return dataset
def dataset():
    ds = qp.DataSet(NAME_PROJ, dimensions_comp=False)
    ds.read_quantipy(PATH_META, PATH_DATA)
    meta, data = ds.split()
    return meta, data.head(250)