def __init__(self, train=False, dataset=None):
    """
    SimilarFinder is used to obtain the most similar cases to a given sample.
    It uses scikit-learn's nearest neighbor implementation.

    :param train: when True, fit and persist a new model from ``dataset``;
                  when False, load the previously persisted model binaries.
    :param dataset: list of precedent dicts, each with 'facts_vector',
                    'outcomes_vector' and 'name'. Ignored if train is False.
    :raises ValueError: if train is True but no dataset was provided.
    """
    # NOTE: the default used to be `dataset=[]` — a mutable default argument
    # shared across calls. Use None and normalize instead.
    if dataset is None:
        dataset = []
    if not train:
        # Load the previously persisted model, case numbers and scaler.
        self.model = Load.load_binary("similarity_model.bin")
        self.case_numbers = Load.load_binary("similarity_case_numbers.bin")
        self.scaler = Load.load_binary("similarity_scaler.bin")
    elif len(dataset) > 0:
        # Each sample is the concatenation of facts and outcomes, cast to
        # int64 (was previously done in a separate post-hoc loop).
        sample_set = [
            np.concatenate(
                [vec['facts_vector'], vec['outcomes_vector']]
            ).astype(np.int64)
            for vec in dataset
        ]
        self.scaler = StandardScaler()
        sample_set = self.scaler.fit_transform(sample_set)
        # Pass n_neighbors by keyword: positional use is deprecated and
        # removed in recent scikit-learn releases.
        self.model = NearestNeighbors(n_neighbors=5, metric='euclidean')
        self.model.fit(sample_set)
        self.case_numbers = [vec['name'] for vec in dataset]
        # Persist every component so a later non-train construction works.
        save = Save()
        save.save_binary("similarity_model.bin", self.model)
        save.save_binary("similarity_case_numbers.bin", self.case_numbers)
        save.save_binary("similarity_scaler.bin", self.scaler)
    else:
        raise ValueError('Please train or load the classifier first')
def weights_to_csv(self):
    """
    Writes all the weights to .csv format

    1) get the facts
    2) for every outcome write the weights

    :return: None
    """
    try:
        if self.model is None:
            self.model = Load.load_binary('multi_class_svm_model.bin')
            self.classifier_labels = Load.load_binary('classifier_labels.bin')
    except Exception:
        # Model binaries are unavailable; nothing to export. (Previously
        # caught BaseException, which also swallowed KeyboardInterrupt
        # and SystemExit.)
        return None
    index = TagPrecedents().get_intent_index()
    # First column is blank: it holds the outcome label for each row.
    fact_header = [" "]
    for header in index['facts_vector']:
        fact_header.append(header[1])
    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open('weights.csv', 'w', newline='') as outcsv:
        writer = csv.writer(outcsv)
        writer.writerow(fact_header)
        for i, estimator in enumerate(self.model.estimators_):
            outcome_list = [self.classifier_labels[i]]
            try:
                weights = estimator.coef_[0]
                outcome_list.extend(weights)
                writer.writerow(outcome_list)
            except AttributeError:
                # Some estimators expose no coef_ (e.g. constant
                # predictors); skip their row.
                pass
    Log.write('Weights saved to .csv')
def test_binarize_model(self):
    """
    Round-trips a small dict through Save.save_binary / Load.load_binary
    using the mock precedent directory, then restores global state.
    """
    saved_directory = Path.binary_directory
    Path.binary_directory = Path.test_mock_precedent_directory
    try:
        model = {'key': 'value'}
        Save().save_binary('test_model.bin', model)
        new_model = Load().load_binary('test_model.bin')
        self.assertEqual(new_model['key'], 'value')
    finally:
        # Always clean up — even when the assertion fails — so later tests
        # are not affected by the patched directory or a leftover file.
        try:
            os.remove(Path.binary_directory + 'test_model.bin')
        except OSError:
            pass
        Path.binary_directory = saved_directory
def load(self):
    """
    Loads the regressor's different components: the persisted model file,
    its input scaler, and the cached mean facts vector from model_metrics.
    """
    regressor_name = self.regressor_name
    file_name = '{}_regressor.bin'.format(regressor_name)
    Log.write("Loading " + file_name)
    file_path = os.path.join(Path.binary_directory, file_name)
    regressor = load_model(file_path)
    scaler = Load.load_binary('{}_scaler.bin'.format(regressor_name))
    self.model = AbstractRegressor._create_pipeline(scaler, regressor)
    self.mean_facts_vector = Load.load_binary(
        'model_metrics.bin')['regressor'][regressor_name]['mean_facts_vector']
    # Log success only after every component has actually been loaded
    # (previously this was logged before load_model even ran).
    Log.write(file_name + " is successfully loaded")
def get_ml_statistics():
    """
    Remove unnecessary data for the endpoint.

    :return:
    {
        'data_set':{
            'size': 5000
        },
        'regressor':{
            'regressor name':{
                'std': 4,
                'mean': 5,
                'variance': 42,
                'mean_fact_vector': [3, 1, 5, 6, 2]
            }
        },
        'classifier':{
            'classifier name':{
                'prediction_accuracy': 0.92,
            }
        }
    }
    """
    stat_dict = Load.load_binary('model_metrics.bin')
    if not stat_dict:
        # Metrics binary missing or empty: return an empty report rather
        # than raising TypeError on None.
        return {}
    # The mean facts vector is large and internal-only; strip it from the
    # endpoint payload.
    for regressor_type in stat_dict.get('regressor', {}):
        stat_dict['regressor'][regressor_type].pop('mean_facts_vector', None)
    return stat_dict
def __dictionary_to_list():
    """
    Converts the binarized structured_data_dict to a list format

    precedent_vectors:{
        filename:{
            name: 'AZ-XXXXXXX.txt',
            demands_vector: [...],
            facts_vector: [...],
            outcomes_vector: [...]
        }
    }

    :return: data_list: [{
        name: 'AZ-XXXXXXX.txt',
        demands_vector: [...],
        facts_vector: [...],
        outcomes_vector: [...]
    },
    {
        ...
    }]
    """
    precedent_vector = Load.load_binary("precedent_vectors.bin")
    if precedent_vector is None:
        return []
    Log.write("Formatting data")
    # The keys (filenames) are dropped; only the per-file records remain.
    return list(precedent_vector.values())
def data_metrics(self):
    """
    Collects data metrics for this regressor and merges them into the
    persisted model_metrics structure.

    1) Obtain the fact vectors
    2) Obtain the outcome vectors pertaining to the regressor in question
    3) Collect data metrics
        3.1) mean_facts_vector --> the average of every fact column
        3.2) standard deviation of outcomes
        3.3) variance of outcomes
        3.4) mean of outcomes
    4) persist data into a dictionary which will be binarized

    model_metrics --> {
        'data_set':{
            'size': 5000
        },
        'regressor':{
            'regressor name':{
                'std': 4,
                'mean': 5,
                'variance': 42,
                'mean_fact_vector': [3, 1, 5, 6, 2]
            }
        },
        'classifier':{
            'classifier name':{
                'prediction_accuracy': 0.92,
            }
        }
    }

    :return: model_metrics
    """
    facts = [entry['facts_vector'] for entry in self.dataset]
    outcomes = [
        entry['outcomes_vector'][self.outcome_index] for entry in self.dataset
    ]
    # Merge into the existing metrics binary when present; otherwise start
    # a fresh structure containing only the 'regressor' section.
    metrics = Load.load_binary('model_metrics.bin')
    if metrics is None:
        metrics = {'regressor': {self.regressor_name: {}}}
    elif 'regressor' not in metrics:
        metrics['regressor'] = {}
    # Cached on the instance: also used at prediction time.
    self.mean_facts_vector = np.mean(facts, axis=0)
    metrics['regressor'][self.regressor_name] = {
        'mean_facts_vector': self.mean_facts_vector,
        'std': np.std(outcomes),
        'variance': np.var(outcomes),
        'mean': np.mean(outcomes)
    }
    return metrics
def load_classifier_labels():
    """
    The prediction given by the model gives a matrix with less dimensions
    than the total outcomes, because only boolean outcomes are kept in the
    prediction. The columns therefore have to be relabeled.

    :return: Dict of classifier labels
        dict:{
            "column 1": <int>,
            "column 2": <int>,
            ...
        }
    """
    labels = Load.load_binary('classifier_labels.bin')
    return labels
def predict(self, data):
    """
    1) Predicts an outcome given facts
    2) Predicts probability that prediction is correct
        2.1) Range goes from [0-1] where x < 0.5 is False
        2.2) The model only returns the probability that a fact is 1
        2.3) therefore to predict the probability that a fact is 0
             we do 1 - x when x < 0.5

    :param data: numpy([1, 0, 0, ...])
    :return: (np.array([...]), probabilities)
    """
    if self.model is None:
        self.model = Load.load_binary("multi_class_svm_model.bin")
    data = binarize([data], threshold=0)
    probabilities = self.model.predict_proba(data)[0]
    predictions = self.model.predict(data)
    for i in range(len(probabilities)):
        if predictions[0][i] == 0:
            # predict_proba reports P(outcome == 1); flip it when the
            # prediction is 0.
            probabilities[i] = 1 - probabilities[i]
        # Round to 2 decimals via string formatting; numpy coerces the
        # string back to float on assignment.
        probabilities[i] = format(probabilities[i], '.2f')
    # Reuse the predictions computed above instead of predicting a second
    # time (the original called self.model.predict(data) twice).
    return predictions, probabilities
def get_ordered_weights(self):
    """
    Sort all the facts by importance for every outcome

    1) If the classifier model isn't loaded then load it
    2) Load labels of the outcomes
    3) obtain labels of every fact
    4) for every estimator append all it's fact weights
    5) sort the fact in descending order by weight
    6) do not append facts with weight of 0
    7) threshold facts by using the logarithmic power of the mean
        7.1) any number with greater or equal power of magnitude is important
        7.2) other numbers make a fact unimportant

    ** Custom list for additional_indemnity_money

    :return:
    {
        'additional_indemnity_money': {
            'important_facts': [
                'asker_is_landlord',
                'tenant_rent_not_paid_more_3_weeks',
                'tenant_owes_rent',
                'tenant_left_without_paying',
                'not_violent'
            ],
            'additional_facts': [
                ...
            ]
        }
    }
    """
    # Lazily load the persisted model and its outcome labels.
    if self.model is None:
        self.model = Load.load_binary('multi_class_svm_model.bin')
        self.classifier_labels = Load.load_binary('classifier_labels.bin')
    self.label_column_index = TagPrecedents().get_intent_index()
    weight_dict = {}
    # One estimator per outcome column (OneVsRest-style multi-output model).
    for i in range(len(self.model.estimators_)):
        outcome_list = []
        estimator = self.model.estimators_[i]
        try:
            # coef_ raises AttributeError on estimators without linear
            # coefficients; those outcomes are skipped (see except below).
            weights = estimator.coef_[0]
            for j in range(len(weights)):
                # NOTE(review): strictly positive weights only — negative
                # weights are excluded here even though the sort below
                # uses abs(); confirm this asymmetry is intended.
                if weights[j] > 0:
                    outcome_list.append([self.label_column_index['facts_vector'][j][1], weights[j]])
            outcome_list.sort(key=lambda x: abs(x[1]), reverse=True)
            weights = [abs(x[1]) for x in outcome_list]
            # Threshold on order of magnitude: facts whose log10 weight is
            # at least the log10 of the mean weight count as important.
            mean_power = math.log10(np.mean(np.array(weights)))
            important_facts = [x[0] for x in outcome_list if math.log10(abs(x[1])) >= mean_power]
            additional_facts = [x[0] for x in outcome_list if math.log10(abs(x[1])) < mean_power]
            # Hand-curated override: these two facts are always promoted to
            # important for additional_indemnity_money.
            if self.classifier_labels[i][0] == 'additional_indemnity_money':
                important_facts.append('tenant_monthly_payment')
                important_facts.append('tenant_not_paid_lease_timespan')
                if 'tenant_not_paid_lease_timespan' in additional_facts:
                    additional_facts.remove('tenant_not_paid_lease_timespan')
                if 'tenant_monthly_payment' in additional_facts:
                    additional_facts.remove('tenant_monthly_payment')
            weight_dict[self.classifier_labels[i][0]] = {}
            weight_dict[self.classifier_labels[i][0]]['important_facts'] = important_facts
            weight_dict[self.classifier_labels[i][0]]['additional_facts'] = additional_facts
        except AttributeError:
            print('Problem with {} prediction'.format(self.classifier_labels[i][0]))
    return weight_dict
def __test(self, x_test, y_test):
    """
    1) Tests model
    2) Save the accuracy to the model metrics binary

    model_metrics --> {
        'data_set':{
            'size': 5000
        },
        'regressor':{
            'regressor name':{
                'std': 4,
                'mean': 5,
                'variance': 42,
                'mean_fact_vector': [3, 1, 5, 6, 2]
            }
        },
        'classifier':{
            'classifier name':{
                'prediction_accuracy': 0.92,
            }
        }
    }

    :param x_test: numpy array
    :param y_test: numpy array
    :return: None
    """
    # Merge results into the existing metrics binary when present.
    metrics = Load.load_binary('model_metrics.bin')
    if metrics is None:
        metrics = {'classifier': {}}
    elif 'classifier' not in metrics:
        metrics['classifier'] = {}
    metrics['data_set'] = {'size': len(self.data_set)}
    outcome_index = TagPrecedents().get_intent_index()['outcomes_vector']
    Log.write("Testing Classifier")
    predicted = self.model.predict(x_test)
    Log.write("Classifier results:\n")
    # Score each outcome column independently.
    for column in range(len(predicted[0])):
        predicted_col = predicted[:, [column]]
        expected_col = y_test[:, [column]]
        accuracy = np.sum(predicted_col == expected_col) * 100.0 / len(expected_col)
        column_name = outcome_index[self.mlb.classes_[column]][1]
        precision, recall, f1, _ = precision_recall_fscore_support(
            expected_col, predicted_col)
        Log.write('Column: {}'.format(column_name))
        Log.write('Test accuracy: {}%'.format(accuracy))
        Log.write('Precision: {}'.format(precision))
        Log.write('Recall: {}'.format(recall))
        Log.write('F1: {}\n'.format(f1))
        metrics['classifier'][column_name] = {
            'prediction_accuracy': accuracy,
        }
    Save().save_binary('model_metrics.bin', metrics)
class MlController:
    """
    Facade over the trained ML components: routes incoming fact dicts
    through the classifier, regressor, and similarity model, and formats
    the results for the API layer.
    """

    # All components are loaded once at import time from persisted binaries.
    # NOTE(review): this means importing the module performs disk I/O and
    # requires the binaries to exist — confirm that is acceptable at startup.
    indexes = TagPrecedents().get_intent_index()
    classifier_labels = MultiClassSVM.load_classifier_labels()
    classifier_model = MultiClassSVM()
    regression_model = MultiOutputRegression()
    similar_finder = SimilarFinder()
    precedent_vectors = Load.load_binary("precedent_vectors.bin")

    @staticmethod
    def predict_outcome(input_json):
        """
        Makes a prediction based on the input json

        input_json: Dict containing the facts and demands
                    The input json must be as follows:
                    {
                        "facts" : {
                            "fact1": 1 or 0,
                            "fact2": 1 or 0,
                            "fact3": 1 or 0,
                            etc
                        }
                    }
        It is not necessary to include ALL demands or facts,
        some may be omitted

        returns: a dict containing all the predictions
                 currently, its format is as follows:
                 {
                     "lease_resiliation" : 1 or 0
                 }
        """
        facts_vector = MlController.fact_dict_to_vector(input_json['facts'])
        # Classifier returns boolean outcomes plus per-outcome probabilities.
        outcome_vector, probabilities = MlController.classifier_model.predict(
            facts_vector)
        outcome_vector = outcome_vector[0]
        # Regressor refines the integer/monetary outcomes in place.
        outcome_vector = MlController.regression_model.predict(
            facts_vector, outcome_vector)
        response = MlController.outcome_vector_to_dict(outcome_vector)
        response[
            'probabilities_vector'] = MlController.probability_vector_to_dict(
                probabilities)
        # Attach the most similar past precedents for the same fact pattern.
        similar_dict = {
            'facts_vector': facts_vector,
            'outcomes_vector': outcome_vector
        }
        response[
            'similar_precedents'] = MlController.format_similar_precedents(
                MlController.similar_finder.get_most_similar(similar_dict))
        return response

    @staticmethod
    def fact_dict_to_vector(input_dict):
        """
        Converts a dictionary to vector form, readable by ML

        input_dict: dictionary containing all facts or demands.
                    It is as follows:
                    {
                        "fact 1": <int>,
                        "fact 2": <int>,
                        "fact 3": <int>,
                        ...
                    }

        returns: a vector of integers
        """
        output_vector = np.zeros(len(MlController.indexes['facts_vector']))
        # indexes['facts_vector'] holds (column_index, fact_name, data_type)
        # tuples; unknown facts stay 0.
        for index, val, data_type in MlController.indexes['facts_vector']:
            if val in input_dict:
                output_vector[index] = int(input_dict[val])
        return output_vector

    @staticmethod
    def outcome_vector_to_dict(outcome_vector):
        # Map each outcome column back to its label; values are stringified.
        return_dict = {}
        for outcome_index in MlController.classifier_labels:
            label = MlController.classifier_labels[outcome_index][0]
            return_dict[label] = str(outcome_vector[outcome_index])
        return {'outcomes_vector': return_dict}

    @staticmethod
    def probability_vector_to_dict(probabilities_vector):
        # Same labeling scheme as outcome_vector_to_dict, but for the
        # classifier's probability estimates.
        return_dict = {}
        for outcome_index in MlController.classifier_labels:
            label = MlController.classifier_labels[outcome_index][0]
            return_dict[label] = str(probabilities_vector[outcome_index])
        return return_dict

    @staticmethod
    def fact_vector_to_dict(fact_vector):
        # Inverse of fact_dict_to_vector: rebuild {fact_name: value} from a
        # vector, stringifying every value.
        return_dict = {}
        for fact_tuple in MlController.indexes['facts_vector']:
            label = fact_tuple[1]
            return_dict[label] = str(fact_vector[fact_tuple[0]])
        return {'facts': return_dict}

    @staticmethod
    def format_similar_precedents(similarity_list):
        """
        Formats a list such as ["AZ-111111", 1.5] into a list of dicts of
        the form [{"precedent": "AZ-111111","distance": 1.5}]

        :param similarity_list: List of lists of the form
                                ["PRECEDENT_NAME"(string), DISTANCE(number)]
        :return: A formatted list of precedents
        """
        formatted_precedents = []
        for precedent_array in similarity_list:
            # precedent_array[0] keys into the precedent_vectors dict loaded
            # at class level; [1] is the similarity distance.
            precedent = {
                "precedent":
                    MlController.precedent_vectors[precedent_array[0]]
                    ['file_number'],
                "distance": precedent_array[1],
                "facts":
                    MlController.fact_vector_to_dict(
                        MlController.precedent_vectors[
                            precedent_array[0]]['facts_vector'])['facts'],
                "outcomes":
                    MlController.outcome_vector_to_dict(
                        MlController.precedent_vectors[precedent_array[0]]
                        ['outcomes_vector'])['outcomes_vector'],
            }
            # Convert stringified boolean facts/outcomes back to real bools
            # (values were stringified by the *_to_dict helpers above).
            for fact_tuple in MlController.indexes['facts_vector']:
                if fact_tuple[2] == 'bool':
                    precedent['facts'][fact_tuple[1]] = bool(
                        float(precedent['facts'][fact_tuple[1]]))
            for outcome_tuple in MlController.classifier_labels.values():
                if outcome_tuple[1] == 'bool':
                    precedent['outcomes'][outcome_tuple[0]] = bool(
                        float(precedent['outcomes'][outcome_tuple[0]]))
            formatted_precedents.append(precedent)
        return formatted_precedents

    @staticmethod
    def get_weighted_facts():
        """
        :return:
        {
            'additional_indemnity_money': {
                'important_facts': [
                    ...
                ],
                'additional_facts': [
                    ...
                ]
            }
        }
        """
        return MlController.classifier_model.get_ordered_weights()

    @staticmethod
    def get_anti_facts():
        # Static map of mutually exclusive fact pairs (a fact and its
        # opposite); consumed by callers to keep inputs consistent.
        return {
            'tenant_individual_responsability': 'tenant_group_responsability',
            'tenant_lease_fixed': 'tenant_lease_indeterminate',
            'tenant_rent_not_paid_less_3_weeks': 'tenant_rent_not_paid_more_3_weeks',
            'not_violent': 'violent'
        }

    @staticmethod
    def get_ml_statistics():
        """
        Remove unecessary data for the endpoint.

        :return:
        {
            'data_set':{
                'size': 5000
            },
            'regressor':{
                'regressor name':{
                    'std': 4,
                    'mean': 5,
                    'variance': 42,
                    'mean_fact_vector': [3, 1, 5, 6, 2]
                }
            },
            'classifier':{
                'classifier name':{
                    'prediction_accuracy': 0.92,
                }
            }
        }
        """
        # NOTE(review): this duplicates a free-standing get_ml_statistics
        # defined elsewhere in this file — consider delegating to one copy.
        stat_dict = Load.load_binary('model_metrics.bin')
        for regressor_type in stat_dict['regressor']:
            stat_dict['regressor'][regressor_type].pop('mean_facts_vector',
                                                       None)
        return stat_dict