import csv
import json
from itertools import islice

import requests
import xmltodict
from flask import current_app

# NOTE: 'Logger' and 'Validator' are project-local helpers; the import
# paths below are assumptions, adjust them to this repository's layout.
from utility.logger import Logger
from utility.validator import Validator


def svr_xml2dict(raw_data):
    '''

    This method converts the supplied xml file-object to a python
    dictionary.

    @raw_data, generally a file (or json string) containing the raw
        dataset(s), to be used when computing a corresponding model. If
        this argument is a file, it is closed before returning.

    @list_observation_label, is a list containing dependent variable
        labels.

    '''

    # local variables: 'observation_label' is initialised so an xml
    # ordering where 'predictor' precedes 'criterion' cannot raise a
    # NameError
    feature_count = None
    list_dataset = []
    list_observation_label = []
    observation_label = None
    logger = Logger(__name__, 'error', 'error')

    # convert xml file to python 'dict'
    dataset = xmltodict.parse(raw_data)

    # build 'list_dataset'
    for observation in dataset['dataset']['observation']:
        for key in observation:
            if key == 'criterion':
                observation_label = observation['criterion']
                list_observation_label.append(observation[key])

            elif key == 'predictor':
                for predictor in observation[key]:
                    predictor_label = predictor['label']
                    predictor_value = predictor['value']

                    # validate predictor value
                    validate_value = Validator(predictor_value)
                    validate_value.validate_value()
                    list_error_value = validate_value.get_errors()

                    if list_error_value:
                        logger.log(list_error_value)
                        return None
                    else:
                        list_dataset.append({
                            'dep_variable_label': str(observation_label),
                            'indep_variable_label': str(predictor_label),
                            'indep_variable_value': predictor_value
                        })

        # generalized feature count in an observation
        if not feature_count:
            feature_count = len(observation['predictor'])

    # close file, save observation labels, and return
    raw_data.close()
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
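# A minimal usage sketch for 'svr_xml2dict' (hypothetical, for
# illustration only): each <observation> pairs one <criterion> with a
# list of <predictor> label/value elements. The sample markup and the
# '_example_' helper name are assumptions, not part of the original
# module; it assumes 'Validator' accepts numeric strings.
def _example_svr_xml2dict():
    import io
    sample = io.BytesIO(
        b'<dataset>'
        b'<observation>'
        b'<criterion>1.5</criterion>'
        b'<predictor><label>height</label><value>3.2</value></predictor>'
        b'<predictor><label>weight</label><value>7.9</value></predictor>'
        b'</observation>'
        b'<observation>'
        b'<criterion>2.5</criterion>'
        b'<predictor><label>height</label><value>3.4</value></predictor>'
        b'<predictor><label>weight</label><value>8.1</value></predictor>'
        b'</observation>'
        b'</dataset>'
    )
    # returns {'dataset': [...], 'observation_labels': ['1.5', '2.5'],
    #          'feature_count': 2}
    return svr_xml2dict(sample)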
def csv2dict(raw_data):
    '''

    This method converts the supplied csv file-object to a python
    dictionary.

    @raw_data, generally a file (or json string) containing the raw
        dataset(s), to be used when computing a corresponding model. If
        this argument is a file, it is closed before returning.

    Note: 'raw_data' is opened with 'Universal Newline Support' (the 'U'
        mode flag), so newlines are understood regardless of whether they
        were created on osx, windows, or linux.

    Note: since 'row' is a list, with one comma-delimited string element,
        the following line is required in this method:

            row = row[0].split(',')

    '''

    # local variables
    dataset = []
    validate = Validator()

    # open temporary 'csvfile' reader object
    dataset_reader = csv.reader(
        raw_data,
        delimiter=' ',
        quotechar='|'
    )

    # first row of csvfile: get all columns, except the first
    for row in islice(dataset_reader, 0, 1):
        indep_labels_list = row[0].split(',')[1:]

    # all remaining rows of csvfile
    for dep_index, row in enumerate(islice(dataset_reader, 0, None)):
        row_arr = row[0].split(',')
        features_list = row_arr[1:]

        # merge lists into dict if each independent variable validates
        if all(validate.validate_value(item) for item in features_list):
            features_dict = {
                k: v for k, v in zip(indep_labels_list, features_list)
            }
            error = None
        else:
            # 'features_dict' is emptied so a failed row neither raises a
            # NameError, nor reuses features from a previous iteration
            features_dict = {}
            error = 'csv conversion failed: ' + validate.get_error()

        observation = {
            'dependent-variable': row_arr[0],
            'independent-variables': [features_dict],
            'error': error
        }
        dataset.append(observation)

    # close file, return dataset
    raw_data.close()
    return dataset
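# A minimal usage sketch for 'csv2dict' (hypothetical, for illustration
# only): the first row holds the feature labels, and the first column of
# each later row holds the dependent variable. The sample data and the
# '_example_' helper name are assumptions, not part of the original
# module; it assumes 'Validator' accepts numeric strings.
def _example_csv2dict():
    import io
    sample = io.StringIO(
        'dep,height,weight\n'
        'dog,3.2,7.9\n'
        'cat,1.1,2.4\n'
    )
    # each returned observation resembles:
    #   {'dependent-variable': 'dog',
    #    'independent-variables': [{'height': '3.2', 'weight': '7.9'}],
    #    'error': None}
    return csv2dict(sample)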
def xml2dict(raw_data):
    '''

    This method converts the supplied xml file-object to a python
    dictionary.

    @raw_data, generally a file (or json string) containing the raw
        dataset(s), to be used when computing a corresponding model. If
        this argument is a file, it is closed before returning.

    '''

    # local variables
    dataset = []
    validate = Validator()

    # open temporary 'xmltodict' object
    dataset_reader = xmltodict.parse(raw_data)

    # build dataset
    for observation in dataset_reader['dataset']['observation']:
        error = None
        features_dict = {}
        dependent_variable = observation['dependent-variable']

        # define features set if each independent variable validates: the
        # error is initialised before the loop, so a later feature that
        # validates no longer masks an earlier failure
        for feature in observation['independent-variable']:
            if validate.validate_value(feature['value']):
                features_dict[feature['label']] = feature['value']
            else:
                error = 'xml conversion failed: ' + validate.get_error()

        adjusted = {
            'dependent-variable': dependent_variable,
            'independent-variables': [features_dict],
            'error': error
        }
        dataset.append(adjusted)

    # close file, return dataset
    raw_data.close()
    return dataset
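# A minimal usage sketch for 'xml2dict' (hypothetical, for illustration
# only): each <observation> pairs one <dependent-variable> with a list of
# <independent-variable> label/value elements. The sample markup and the
# '_example_' helper name are assumptions, not part of the original
# module.
def _example_xml2dict():
    import io
    sample = io.BytesIO(
        b'<dataset>'
        b'<observation>'
        b'<dependent-variable>dog</dependent-variable>'
        b'<independent-variable><label>height</label>'
        b'<value>3.2</value></independent-variable>'
        b'<independent-variable><label>weight</label>'
        b'<value>7.9</value></independent-variable>'
        b'</observation>'
        b'<observation>'
        b'<dependent-variable>cat</dependent-variable>'
        b'<independent-variable><label>height</label>'
        b'<value>1.1</value></independent-variable>'
        b'<independent-variable><label>weight</label>'
        b'<value>2.4</value></independent-variable>'
        b'</observation>'
        b'</dataset>'
    )
    # each returned observation resembles:
    #   {'dependent-variable': 'dog',
    #    'independent-variables': [{'height': '3.2', 'weight': '7.9'}],
    #    'error': None}
    return xml2dict(sample)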
def svm_csv2dict(raw_data):
    '''

    This method converts the supplied csv file-object, intended for an
    svm model, to a python dictionary.

    @raw_data, generally a file (or json string) containing the raw
        dataset(s), to be used when computing a corresponding model. If
        this argument is a file, it is closed before returning.

    @list_observation_label, is a list containing dependent variable
        labels.

    Note: 'raw_data' is opened with 'Universal Newline Support' (the 'U'
        mode flag), so newlines are understood regardless of whether they
        were created on osx, windows, or linux.

    Note: since 'row' is a list, with one comma-delimited string element,
        the following line is required in this method:

            row = row[0].split(',')

    '''

    # local variables
    feature_count = None
    list_dataset = []
    list_observation_label = []
    list_feature_label = []
    logger = Logger(__name__, 'error', 'error')

    # open temporary 'csvfile' reader object
    dataset_reader = csv.reader(
        raw_data,
        delimiter=' ',
        quotechar='|'
    )

    # first row of csvfile: get all feature labels, except first column
    for row in islice(dataset_reader, 0, 1):
        row_indep_label = row[0].split(',')
        for value in islice(row_indep_label, 1, None):
            list_feature_label.append(str(value))

    # all remaining rows of csvfile
    for dep_index, row in enumerate(islice(dataset_reader, 0, None)):
        # first column of each row: dependent variable label
        row_dep_label = row[0].split(',')
        for value in row_dep_label[:1]:
            list_observation_label.append(str(value))

        # generalized feature count in an observation
        row_indep_variable = row[0].split(',')
        if not feature_count:
            feature_count = len(row_indep_variable) - 1

        # remaining columns of each row: feature values
        for indep_index, value in enumerate(
            islice(row_indep_variable, 1, None)
        ):
            try:
                validate = Validator(value)
                validate.validate_value()
                list_error = validate.get_errors()
                if list_error:
                    logger.log(list_error)
                    return None
                else:
                    value = float(value)
            except Exception as error:
                # return None (not False), consistent with the validation
                # failure above
                logger.log(error)
                return None

            list_dataset.append({
                'dep_variable_label': list_observation_label[dep_index],
                'indep_variable_label': list_feature_label[indep_index],
                'indep_variable_value': value
            })

    # close file, save observation labels, and return
    raw_data.close()
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
def svm_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json file-object to a python
    dictionary.

    @raw_data, generally a file (or json string) containing the raw
        dataset(s), to be used when computing a corresponding model. If
        this argument is a file, it is closed before returning.

    @is_json, flag indicating 'raw_data' is a json string.

    @observation_labels, is a list containing dependent variable labels.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    # web-interface
    if not is_json:
        dataset = json.load(raw_data)
        for observation_label in dataset:
            observations = dataset[observation_label]

            # dependent variable with single observation
            if isinstance(observations, dict):
                for feature_label, feature_value in observations.items():
                    # validation
                    validate_fvalue = Validator(feature_value)
                    validate_fvalue.validate_value()

                    if validate_fvalue.get_errors():
                        logger.log(validate_fvalue.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label': str(observation_label),
                            'indep_variable_label': str(feature_label),
                            'indep_variable_value': feature_value
                        })

                # generalized feature count in an observation
                if not feature_count:
                    feature_count = len(observations)

            # dependent variable with multiple observations
            elif isinstance(observations, list):
                for observation in observations:
                    for feature_label, feature_value in observation.items():
                        # validation
                        validate_fvalue = Validator(feature_value)
                        validate_fvalue.validate_value()

                        if validate_fvalue.get_errors():
                            logger.log(validate_fvalue.get_errors())
                        else:
                            # restructured data
                            list_dataset.append({
                                'dep_variable_label': str(observation_label),
                                'indep_variable_label': str(feature_label),
                                'indep_variable_value': feature_value
                            })

                    # generalized feature count in an observation
                    if not feature_count:
                        feature_count = len(observation)

            # list of observation label
            observation_labels.append(observation_label)

    # programmatic-interface
    else:
        observation_label = raw_data[0]

        # list of observation label
        observation_labels.append(observation_label)

        # dependent variable with single observation
        if isinstance(raw_data[1], dict):
            for label, feature in raw_data[1].items():
                # validation
                validate_fvalue = Validator(feature)
                validate_fvalue.validate_value()

                if validate_fvalue.get_errors():
                    logger.log(validate_fvalue.get_errors())
                else:
                    # restructured data
                    list_dataset.append({
                        'dep_variable_label': str(observation_label),
                        'indep_variable_label': str(label),
                        'indep_variable_value': feature
                    })

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(raw_data[1])

        # dependent variable with multiple observations
        if isinstance(raw_data[1], list):
            for feature_set in raw_data[1]:
                for feature_label, feature_value in feature_set.items():
                    # validation
                    validate_fvalue = Validator(feature_value)
                    validate_fvalue.validate_value()

                    if validate_fvalue.get_errors():
                        logger.log(validate_fvalue.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label': str(observation_label),
                            'indep_variable_label': str(feature_label),
                            'indep_variable_value': feature_value
                        })

                # generalized feature count in an observation
                if not feature_count:
                    feature_count = len(feature_set)

    # close file
    if not is_json:
        raw_data.close()

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }
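# A minimal usage sketch for 'svm_json2dict' via the web-interface path
# (is_json=False): keys are class labels, and each value is either one
# feature dict, or a list of them. The sample data and the '_example_'
# helper name are assumptions, not part of the original module.
def _example_svm_json2dict():
    import io
    sample = io.StringIO(json.dumps({
        'dog': [
            {'height': 3.2, 'weight': 7.9},
            {'height': 3.4, 'weight': 8.1}
        ],
        'cat': {'height': 1.1, 'weight': 2.4}
    }))
    # returns {'dataset': [...], 'observation_labels': ['dog', 'cat'],
    #          'feature_count': 2}
    return svm_json2dict(sample, False)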
def dataset2dict(model_type, upload):
    '''

    This method converts the supplied csv, json, or xml file upload(s) to
    a uniform dict object, using the necessary converter utility
    functions.

    @upload, uploaded dataset(s).

    '''

    # local variables
    list_error = []
    converted = []
    validate = Validator()
    datasets = upload['dataset']
    settings = upload['properties']
    stream = settings.get('stream', None)
    list_model_type = current_app.config.get('MODEL_TYPE')

    # programmatic-interface
    if stream == 'True':
        session_name = settings['session_name']
        dataset_type = settings['dataset_type']

        # convert dataset(s) into extended list
        for dataset in datasets:
            # scrape url content
            if dataset_type == 'dataset_url':
                r = requests.get(dataset)
                instance = r.json()['dataset']
            else:
                instance = [dataset]

            if instance:
                # 'error' is initialised so an unmatched 'model_type'
                # cannot raise a NameError
                error = None
                if model_type == list_model_type[0]:
                    error = validate.validate_classification(instance)
                elif model_type == list_model_type[1]:
                    error = validate.validate_regression(instance)

                if error:
                    list_error.append({
                        'location': session_name,
                        'message': error
                    })

                converted.extend(instance)

    # web-interface
    else:
        dataset_type = settings['dataset_type']
        if dataset_type == 'file_upload':
            adjusted_datasets = upload['dataset']['file_upload']
        else:
            adjusted_datasets = upload['dataset']['dataset_url']

        # convert dataset(s) into extended list
        for dataset in adjusted_datasets:
            instance = None
            location = dataset['filename']

            # scrape url content
            if dataset_type == 'dataset_url':
                r = requests.get(dataset)
                instance = r.json()['dataset']

            # file content
            else:
                if dataset['filename'].lower().endswith('.csv'):
                    instance = csv2dict(dataset['file'])
                elif dataset['filename'].lower().endswith('.json'):
                    # load dataset instance: fall back to the raw file
                    # object when json decoding fails
                    try:
                        instance = json.load(dataset['file'])['dataset']
                    except Exception:
                        instance = dataset['file']
                elif dataset['filename'].lower().endswith('.xml'):
                    instance = xml2dict(dataset['file'])

            if instance:
                error = None
                if model_type == list_model_type[0]:
                    error = validate.validate_classification(instance)
                elif model_type == list_model_type[1]:
                    error = validate.validate_regression(instance)

                if error:
                    list_error.append({
                        'location': location,
                        'message': error
                    })
                else:
                    converted.extend(instance)
            else:
                list_error.append({
                    'location': location,
                    'message': 'empty dataset, or invalid syntax (try lint)'
                })

    # return results
    if list_error:
        return {
            'dataset': converted,
            'settings': settings,
            'error': {'validation': list_error}
        }
    else:
        return {
            'dataset': converted,
            'settings': settings,
            'error': None
        }
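# A minimal usage sketch for 'dataset2dict' via the web-interface path:
# 'upload' bundles file objects with a 'properties' dict. The Flask app,
# the 'MODEL_TYPE' config values, and the '_example_' helper name are all
# assumptions, not part of the original module.
def _example_dataset2dict():
    import io
    from flask import Flask

    app = Flask(__name__)
    app.config['MODEL_TYPE'] = ['classification', 'regression']

    upload = {
        'properties': {'dataset_type': 'file_upload'},
        'dataset': {
            'file_upload': [{
                'filename': 'sample.csv',
                'file': io.StringIO(
                    'dep,height,weight\n'
                    'dog,3.2,7.9\n'
                    'cat,1.1,2.4\n'
                )
            }]
        }
    }

    # 'current_app' requires an active application context
    with app.app_context():
        return dataset2dict('classification', upload)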
def svr_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json file-object to a python
    dictionary.

    @raw_data, generally a file (or json string) containing the raw
        dataset(s), to be used when computing a corresponding model. If
        this argument is a file, it is closed before returning.

    @is_json, flag indicating 'raw_data' is a json string.

    @observation_labels, is a list containing dependent variable labels.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    # web-interface
    if not is_json:
        dataset = json.load(raw_data)
        for criterion, predictors in dataset.items():
            observation_label = criterion

            # list of observation label
            observation_labels.append(criterion)

            # criterion with single observation
            if isinstance(predictors, dict):
                for label, predictor in predictors.items():
                    # validation
                    validate_predictor = Validator(predictor)
                    validate_predictor.validate_value()

                    if validate_predictor.get_errors():
                        logger.log(validate_predictor.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label': str(observation_label),
                            'indep_variable_label': str(label),
                            'indep_variable_value': predictor
                        })

                # generalized feature count in an observation
                if not feature_count:
                    feature_count = len(predictors)

            # criterion with multiple observations: the loop variable is
            # named 'observation', so it no longer shadows the outer
            # 'criterion'
            if isinstance(predictors, list):
                for observation in predictors:
                    for label, predictor in observation.items():
                        # validation
                        validate_predictor = Validator(predictor)
                        validate_predictor.validate_value()

                        if validate_predictor.get_errors():
                            logger.log(validate_predictor.get_errors())
                        else:
                            # restructured data
                            list_dataset.append({
                                'dep_variable_label': str(observation_label),
                                'indep_variable_label': str(label),
                                'indep_variable_value': predictor
                            })

                    # generalized feature count in an observation
                    if not feature_count:
                        feature_count = len(observation)

    # programmatic-interface
    else:
        dataset = raw_data
        for criterion, predictors in dataset.items():
            # list of observation label
            observation_labels.append(criterion)

            # criterion with single observation
            if isinstance(predictors, dict):
                for label, predictor in predictors.items():
                    # validation
                    validate_predictor = Validator(predictor)
                    validate_predictor.validate_value()

                    if validate_predictor.get_errors():
                        logger.log(validate_predictor.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label': str(criterion),
                            'indep_variable_label': str(label),
                            'indep_variable_value': predictor
                        })

                # generalized feature count in an observation
                if not feature_count:
                    feature_count = len(predictors)

            # criterion with multiple observations
            if isinstance(predictors, list):
                for single_predictors in predictors:
                    for label, predictor in single_predictors.items():
                        # validation
                        validate_predictor = Validator(predictor)
                        validate_predictor.validate_value()

                        if validate_predictor.get_errors():
                            logger.log(validate_predictor.get_errors())
                        else:
                            # restructured data
                            list_dataset.append({
                                'dep_variable_label': str(criterion),
                                'indep_variable_label': str(label),
                                'indep_variable_value': predictor
                            })

                    # generalized feature count in an observation
                    if not feature_count:
                        feature_count = len(single_predictors)

    # close file
    if not is_json:
        raw_data.close()

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }
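# A minimal usage sketch for 'svr_json2dict' via the programmatic path
# (is_json=True): 'raw_data' is already a python dict keyed by criterion
# (the dependent variable), so no file handling occurs. The sample values
# and the '_example_' helper name are assumptions, not part of the
# original module.
def _example_svr_json2dict():
    sample = {
        '1.5': {'height': 3.2, 'weight': 7.9},
        '2.5': [
            {'height': 3.4, 'weight': 8.1},
            {'height': 3.6, 'weight': 8.3}
        ]
    }
    # returns {'dataset': [...], 'observation_labels': ['1.5', '2.5'],
    #          'feature_count': 2}
    return svr_json2dict(sample, True)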