def svr_xml2dict(raw_data):
    '''

    This method converts the supplied xml file-object to a python dictionary.

    @raw_data, generally a file containing the raw dataset(s), to be used when
        computing a corresponding model. It is handed to 'xmltodict.parse',
        and is always closed by this method, whether or not parsing succeeds.

    The parsed document is expected to expose
    dataset['dataset']['observation'], where each observation carries a
    'criterion' (the dependent variable label), and one or more 'predictor'
    elements, each having a 'label', and a 'value'.

    @return, a dictionary with the following keys:

        - 'dataset', list of dictionaries, one per predictor, with keys
          'dep_variable_label', 'indep_variable_label', and
          'indep_variable_value'.
        - 'observation_labels', list of criterion values, in document order.
        - 'feature_count', number of predictors in the first observation
          having at least one predictor (None if there are no observations).

        If the Validator reports errors for any predictor value, the errors
        are logged, and None is returned instead.

    '''

    feature_count = None
    list_dataset = []
    list_observation_label = []
    logger = Logger(__name__, 'error', 'error')

    # convert xml file to python 'dict': the parser consumes the whole file,
    # so it can be closed right away, even if parsing raises.
    try:
        dataset = xmltodict.parse(raw_data)
    finally:
        raw_data.close()

    # xmltodict returns a single mapping (not a list) when an element occurs
    # only once: normalize, so one observation is handled like many.
    observations = dataset['dataset']['observation']
    if isinstance(observations, dict):
        observations = [observations]

    # build 'list_dataset'
    for observation in observations:
        # read the label before the predictors, so the result does not
        # depend on the order of the elements within an observation.
        if 'criterion' in observation:
            observation_label = observation['criterion']
            list_observation_label.append(observation_label)

        # same single-element normalization as above, for predictors
        predictors = observation.get('predictor', [])
        if isinstance(predictors, dict):
            predictors = [predictors]

        for predictor in predictors:
            predictor_label = predictor['label']
            predictor_value = predictor['value']

            validate_value = Validator(predictor_value)
            validate_value.validate_value()
            list_error_value = validate_value.get_errors()

            if list_error_value:
                logger.log(list_error_value)
                return None

            list_dataset.append({
                'dep_variable_label': str(observation_label),
                'indep_variable_label': str(predictor_label),
                'indep_variable_value': predictor_value
            })

        # generalized feature count in an observation
        if not feature_count:
            feature_count = len(predictors)

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
def svm_csv2dict(raw_data):
    '''

    This method converts the supplied csv file-object, intended for an svm
    model, to a python dictionary.

    @raw_data, generally a file containing the raw dataset(s), to be used when
        computing a corresponding model. It is iterated by 'csv.reader', and
        is always closed by this method, including on the early returns.

    Expected layout: the first row is a header, whose first cell is ignored,
    and whose remaining cells are the feature labels. Every following row
    starts with the observation (dependent variable) label, followed by one
    value per feature. Blank rows are skipped.

    @return, a dictionary with the following keys:

        - 'dataset', list of dictionaries, one per cell, with keys
          'dep_variable_label', 'indep_variable_label', and
          'indep_variable_value' (converted to float).
        - 'observation_labels', list of the first cell of each data row.
        - 'feature_count', number of value columns in the first data row
          (None if there are no data rows).

        If the Validator reports errors for a value, the errors are logged,
        and None is returned. If validating, or converting a value to float
        raises, the exception is logged, and False is returned.

    Note: rows are split on commas by the csv reader itself. Previously the
          reader split on spaces, and the first fragment was re-split on
          commas, which silently dropped the remainder of any row containing
          a space. The '|' quote character is kept, so existing files are
          parsed as before.

    '''

    feature_count = None
    list_dataset = []
    list_observation_label = []
    list_feature_label = []
    logger = Logger(__name__, 'error', 'error')

    try:
        # open temporary 'csvfile' reader object
        dataset_reader = csv.reader(
            raw_data,
            delimiter=',',
            quotechar='|'
        )

        # first row of csvfile: feature labels (first column is skipped)
        for header in islice(dataset_reader, 0, 1):
            list_feature_label = [str(label) for label in header[1:]]

        # remaining rows of csvfile: one observation per row
        for row in dataset_reader:
            # the reader yields an empty list for a blank line
            if not row:
                continue

            # first column is the observation label
            observation_label = str(row[0])
            list_observation_label.append(observation_label)

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(row) - 1

            # remaining columns are the feature values
            for indep_index, value in enumerate(row[1:]):
                try:
                    validate = Validator(value)
                    validate.validate_value()
                    list_error = validate.get_errors()

                    if list_error:
                        logger.log(list_error)
                        return None

                    value = float(value)
                except Exception as error:
                    logger.log(error)
                    return False

                list_dataset.append({
                    'dep_variable_label': observation_label,
                    'indep_variable_label': list_feature_label[indep_index],
                    'indep_variable_value': value
                })
    finally:
        # close file on every path: success, early return, or exception
        raw_data.close()

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
def svm_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json dataset, intended for an svm
    model, to a python dictionary.

    @raw_data, the raw dataset(s), to be used when computing a corresponding
        model:

        - when 'is_json' is falsy (web-interface), a file-object, which is
          read with 'json.load', and always closed by this method. The
          decoded document maps each observation label to its observations.
        - when 'is_json' is truthy (programmatic-interface), a sequence,
          where 'raw_data[0]' is the observation label, and 'raw_data[1]'
          holds its observations.

        In both cases the observations are either one dictionary (a single
        observation), or a list of dictionaries (multiple observations),
        each mapping feature labels to feature values. Any other type
        contributes no rows, though its label is still recorded.

    @is_json, flag selecting between the two 'raw_data' forms above.

    @return, a dictionary with the following keys:

        - 'dataset', list of dictionaries, one per feature value, with keys
          'dep_variable_label', 'indep_variable_label', and
          'indep_variable_value'.
        - 'observation_labels', list containing each observation label once.
        - 'feature_count', number of features in the first non-empty
          observation (None, or 0, if there is none).

        Feature values for which the Validator reports errors are logged,
        and left out of 'dataset': they do not abort the conversion.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    if is_json:
        # programmatic-interface: a single (label, observations) pair
        labelled_observations = [(raw_data[0], raw_data[1])]
    else:
        # web-interface: 'json.load' consumes the whole file, so it can be
        # closed right away, even if decoding raises.
        try:
            dataset = json.load(raw_data)
        finally:
            raw_data.close()
        labelled_observations = dataset.items()

    for observation_label, observations in labelled_observations:
        # list of observation label
        observation_labels.append(observation_label)

        # treat a single observation as a list of one, so that both shapes
        # share the same loop
        if isinstance(observations, dict):
            observations = [observations]
        elif not isinstance(observations, list):
            continue

        for observation in observations:
            for feature_label, feature_value in observation.items():
                # validation
                validate_fvalue = Validator(feature_value)
                validate_fvalue.validate_value()
                list_error = validate_fvalue.get_errors()

                if list_error:
                    logger.log(list_error)
                else:
                    # restructured data
                    list_dataset.append({
                        'dep_variable_label': str(observation_label),
                        'indep_variable_label': str(feature_label),
                        'indep_variable_value': feature_value
                    })

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(observation)

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }
def svr_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json dataset, intended for an svr
    model, to a python dictionary.

    @raw_data, the raw dataset(s), to be used when computing a corresponding
        model:

        - when 'is_json' is falsy (web-interface), a file-object, which is
          read with 'json.load', and always closed by this method.
        - when 'is_json' is truthy (programmatic-interface), an already
          decoded dictionary, which is used as is.

        In both cases the dictionary maps each criterion (dependent variable
        label) to its predictors: either one dictionary (a single
        observation), or a list of dictionaries (multiple observations),
        each mapping predictor labels to predictor values. Any other type
        contributes no rows, though its criterion is still recorded.

    @is_json, flag selecting between the two 'raw_data' forms above.

    @return, a dictionary with the following keys:

        - 'dataset', list of dictionaries, one per predictor value, with
          keys 'dep_variable_label', 'indep_variable_label', and
          'indep_variable_value'.
        - 'observation_labels', list containing each criterion once.
        - 'feature_count', number of predictors in the first non-empty
          observation (None, or 0, if there is none).

        Predictor values for which the Validator reports errors are logged,
        and left out of 'dataset': they do not abort the conversion.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    if is_json:
        # programmatic-interface
        dataset = raw_data
    else:
        # web-interface: 'json.load' consumes the whole file, so it can be
        # closed right away, even if decoding raises.
        try:
            dataset = json.load(raw_data)
        finally:
            raw_data.close()

    for criterion, predictors in dataset.items():
        # list of observation label
        observation_labels.append(criterion)

        # treat a single observation as a list of one, so that both shapes
        # share the same loop
        if isinstance(predictors, dict):
            predictors = [predictors]
        elif not isinstance(predictors, list):
            continue

        for single_predictors in predictors:
            for label, predictor in single_predictors.items():
                # validation (part 1)
                validate_predictor = Validator(predictor)
                validate_predictor.validate_value()
                list_error = validate_predictor.get_errors()

                if list_error:
                    logger.log(list_error)
                else:
                    # restructured data
                    list_dataset.append({
                        'dep_variable_label': str(criterion),
                        'indep_variable_label': str(label),
                        'indep_variable_value': predictor
                    })

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(single_predictors)

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }