def test_binarize_model(self):
    """Saves a model to a binary file, loads it back, and checks the round trip."""
    # Redirect the binary directory to the mock precedent directory for the test
    binary_directory = Path.binary_directory
    Path.binary_directory = Path.test_mock_precedent_directory
    save = Save()
    model = {'key': 'value'}
    save.save_binary('test_model.bin', model)
    load = Load()
    new_model = load.load_binary('test_model.bin')
    self.assertEqual(new_model['key'], 'value')
    # Clean up the test artifact and restore the original directory
    os.remove(os.path.join(Path.binary_directory, 'test_model.bin'))
    Path.binary_directory = binary_directory
def create_regex_cluster_bin(min_match_percentage):
    """
    Creates a regex-to-cluster mapping and stores it in a binary file.

    :param min_match_percentage: minimum percentage of a sentence that must match a regex
    :return: None
    """
    unpack_fact_demand_bin()
    rc_fact_dict = cluster_regex_mapper('fact', min_match_percentage)
    rc_demand_dict = cluster_regex_mapper('demand', min_match_percentage)
    cluster_regex_dict = {'fact': rc_fact_dict, 'demand': rc_demand_dict}
    save = Save()
    save.save_binary('cluster_regex_dict.bin', cluster_regex_dict)
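# Usage sketch (hypothetical, not part of the original source): a consumer
# reading the binary written by create_regex_cluster_bin(). The nested dict
# shape mirrors what is saved above: one regex-to-cluster mapping under
# 'fact' and one under 'demand'. Load is the project helper used elsewhere
# in this code.
def load_cluster_regex_dict():
    cluster_regex_dict = Load.load_binary('cluster_regex_dict.bin')
    fact_mapping = cluster_regex_dict['fact']      # regex --> cluster for facts
    demand_mapping = cluster_regex_dict['demand']  # regex --> cluster for demands
    return fact_mapping, demand_mapping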
def save(self):
    """
    Saves the scaler and the regressor. The regressor is saved through its
    own save method rather than joblib, since joblib does not support it.
    """
    regressor_name = self.regressor_name
    file_path = os.path.join(Path.binary_directory, '{}_regressor.bin'.format(regressor_name))
    Log.write("saving " + '{}_regressor.bin'.format(regressor_name) + " to: " + file_path)
    self.model.steps[1][1].model.save(file_path)
    Log.write('{}_regressor.bin'.format(regressor_name) + " saved to: " + file_path)
    Save().save_binary('{}_scaler.bin'.format(regressor_name), self.model.steps[0][1])
    Save().save_binary('model_metrics.bin', self.data_metrics())
    self.dataset = None
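# Load-side sketch (assumed, mirroring save() above): the scaler and metrics
# come back through the project's Load helper. The regressor binary itself was
# written by the wrapped model's own save method, so it needs that library's
# matching loader, which is out of scope for this sketch.
def load_regressor_artifacts(regressor_name):
    scaler = Load.load_binary('{}_scaler.bin'.format(regressor_name))
    model_metrics = Load.load_binary('model_metrics.bin')
    return scaler, model_metrics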
def __init__(self, train=False, dataset=None):
    """
    SimilarFinder is used to obtain the cases most similar to a given sample.
    It uses scikit-learn's nearest neighbors implementation.

    :param train: when True, trains the model; otherwise loads the saved binaries
    :param dataset: collection of precedent vector dicts (as produced by
                    tag_precedents). Ignored if train is False
    """
    if not train:
        self.model = Load.load_binary("similarity_model.bin")
        self.case_numbers = Load.load_binary("similarity_case_numbers.bin")
        self.scaler = Load.load_binary("similarity_scaler.bin")
    elif dataset is not None and len(dataset) > 0:
        sample_set = [
            np.concatenate([vec['facts_vector'], vec['outcomes_vector']])
            for vec in dataset
        ]
        sample_set = [sample.astype(np.int64) for sample in sample_set]
        self.scaler = StandardScaler()
        sample_set = self.scaler.fit_transform(sample_set)
        self.model = NearestNeighbors(n_neighbors=5, metric='euclidean')
        self.model.fit(sample_set)
        self.case_numbers = [vec['name'] for vec in dataset]
        save = Save()
        save.save_binary("similarity_model.bin", self.model)
        save.save_binary("similarity_case_numbers.bin", self.case_numbers)
        save.save_binary("similarity_scaler.bin", self.scaler)
    else:
        raise ValueError('A non-empty dataset is required to train SimilarFinder')
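# Usage sketch (hypothetical): training a SimilarFinder on precedent vectors
# and querying it for the most similar cases. The query vector must go through
# the same StandardScaler the model was fit with before calling kneighbors().
def find_similar_cases(dataset, facts_vector, outcomes_vector):
    finder = SimilarFinder(train=True, dataset=dataset)
    sample = np.concatenate([facts_vector, outcomes_vector]).reshape(1, -1)
    sample = finder.scaler.transform(sample)
    distances, indices = finder.model.kneighbors(sample)
    # Map neighbor indices back to case numbers
    return [finder.case_numbers[i] for i in indices[0]]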
def tag_precedents(self, nb_files=-1):
    """
    1) Displays progress of files vectorized
    2) Vectorizes each file
    3) Displays percentage of files covered
    4) Displays percentage of lines covered
    5) Binarizes the vectors

    :param nb_files: when -1, reads the whole directory
    :return: structured_data_dict: {
        filename: {
            name: 'AZ-XXXXXXX.txt',
            demands_vector: [...],
            facts_vector: [...],
            outcomes_vector: [...]
        }
    }
    """
    # ----------------------- 1 -----------------------------#
    Log.write('Tagging precedents')
    for file in os.listdir(self.precedents_directory_path):
        if nb_files == -1:
            percent = float(self.nb_text / len(
                os.listdir(self.precedents_directory_path))) * 100
        else:
            percent = float(self.nb_text / nb_files) * 100
            if self.nb_text > nb_files:
                break
        stdout.write("\rPrecedents tagged: %f " % percent)
        stdout.flush()

        # ----------------------- 2 -----------------------------#
        self.precedent_vector[file] = self.__tag_file(file)
        self.nb_text += 1

    # ----------------------- 3 -----------------------------#
    Log.write('Precedent coverage: ' + str(self.text_tagged / self.nb_text))

    # ----------------------- 4 -----------------------------#
    Log.write('Line coverage: ' + str(self.statements_tagged / self.nb_lines))

    # ----------------------- 5 -----------------------------#
    save = Save()
    save.save_binary('precedent_vectors.bin', self.precedent_vector)
    return self.precedent_vector
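# Usage sketch (hypothetical): tagging a bounded number of precedents and
# peeking at one entry of the returned dict, whose shape matches the
# docstring above.
def preview_tagged_precedents(nb_files=100):
    precedent_vectors = TagPrecedents().tag_precedents(nb_files)
    for filename, entry in precedent_vectors.items():
        Log.write('{}: {} facts, {} outcomes'.format(
            filename, len(entry['facts_vector']), len(entry['outcomes_vector'])))
        break  # the first entry is enough for a preview
    return precedent_vectors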
def create_regex_bin():
    """
    Driver to save the regex binary file and the regex-cluster dict binary.

    1) Create an empty dictionary
    2) Add keys and values to the dictionary
    3) Binarize the file

    :return: None
    """
    regexes = RegexLib()
    reg_dict = {}
    reg_dict['regex_demands'] = regexes.regex_demands
    reg_dict['regex_facts'] = regexes.regex_facts
    reg_dict['regex_outcomes'] = regexes.regex_outcomes
    reg_dict['MONEY_REGEX'] = regexes.MONEY_REGEX
    save = Save()
    save.save_binary('regexes.bin', reg_dict)
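# Usage sketch (hypothetical): reading the binary written by create_regex_bin()
# back into the four regex collections it stores.
def load_regex_bin():
    reg_dict = Load.load_binary('regexes.bin')
    return (reg_dict['regex_facts'], reg_dict['regex_demands'],
            reg_dict['regex_outcomes'], reg_dict['MONEY_REGEX'])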
def save(self):
    """
    Since the regression and classifier models are separate, a new index
    is assigned to each model.

    1) Save the (label, data type) pair of each classifier column in binary format
    2) Save the prediction model in binary format

    :return: None
    """
    # ------------------- 1 -----------------------------
    linear_labels = {}
    indices = TagPrecedents().get_intent_index()['outcomes_vector']
    for i in range(len(self.mlb.classes_)):
        label = indices[self.mlb.classes_[i]][1]
        data_type = indices[self.mlb.classes_[i]][2]
        linear_labels[i] = label, data_type
    save = Save()
    save.save_binary("classifier_labels.bin", linear_labels)

    # ------------------- 2 -----------------------------
    save.save_binary("multi_class_svm_model.bin", self.model)
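# Load-side sketch (assumed, mirroring save() above): restores the SVM model
# and the per-column (label, data_type) pairs keyed by column index.
def load_classifier():
    model = Load.load_binary('multi_class_svm_model.bin')
    linear_labels = Load.load_binary('classifier_labels.bin')
    return model, linear_labels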
def __test(self, x_test, y_test):
    """
    1) Tests the model
    2) Saves the accuracy to the model metrics binary

    model_metrics --> {
        'data_set': {
            'size': 5000
        },
        'regressor': {
            'regressor name': {
                'std': 4,
                'mean': 5,
                'variance': 42,
                'mean_fact_vector': [3, 1, 5, 6, 2]
            }
        },
        'classifier': {
            'classifier name': {
                'prediction_accuracy': 0.92,
            }
        }
    }

    :param x_test: numpy array
    :param y_test: numpy array
    :return: None
    """
    model_metrics = Load.load_binary('model_metrics.bin')
    if model_metrics is None:
        model_metrics = {
            'classifier': {}
        }
    elif 'classifier' not in model_metrics:
        model_metrics['classifier'] = {}

    model_metrics['data_set'] = {
        'size': len(self.data_set)
    }

    index = TagPrecedents().get_intent_index()['outcomes_vector']
    Log.write("Testing Classifier")
    y_predict = self.model.predict(x_test)
    Log.write("Classifier results:\n")
    for i in range(len(y_predict[0])):
        yp = y_predict[:, [i]]
        yt = y_test[:, [i]]
        accuracy = np.sum(yp == yt) * 100.0 / len(yt)
        column_name = index[self.mlb.classes_[i]][1]
        (precision, recall, f1, _) = precision_recall_fscore_support(yt, yp)
        Log.write('Column: {}'.format(column_name))
        Log.write('Test accuracy: {}%'.format(accuracy))
        Log.write('Precision: {}'.format(precision))
        Log.write('Recall: {}'.format(recall))
        Log.write('F1: {}\n'.format(f1))
        model_metrics['classifier'][column_name] = {
            'prediction_accuracy': accuracy,
        }
    Save().save_binary('model_metrics.bin', model_metrics)
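# Usage sketch (hypothetical): reading the metrics binary written by __test()
# and reporting the stored prediction accuracy of each classifier column.
def report_classifier_metrics():
    model_metrics = Load.load_binary('model_metrics.bin')
    for column_name, metrics in model_metrics['classifier'].items():
        Log.write('{}: {:.2f}%'.format(column_name, metrics['prediction_accuracy']))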