コード例 #1
0
ファイル: file_test.py プロジェクト: sbeleuz/JusticeAI
    def test_binarize_model(self):
        """
        Round-trip a small dict through Save.save_binary and Load.load_binary.

        Path.binary_directory is temporarily pointed at
        Path.test_mock_precedent_directory for the duration of the test.
        The original value is restored, and the temporary file removed, even
        when the load or the assertion fails, so a failure here cannot leak
        the redirected directory into other tests.
        """
        original_directory = Path.binary_directory
        Path.binary_directory = Path.test_mock_precedent_directory
        try:
            Save().save_binary('test_model.bin', {'key': 'value'})
            try:
                new_model = Load().load_binary('test_model.bin')
                self.assertEqual(new_model['key'], 'value')
            finally:
                # Only reached after save_binary returned without raising;
                # presumably the file now exists under the redirected directory.
                os.remove(Path.binary_directory + 'test_model.bin')
        finally:
            Path.binary_directory = original_directory
コード例 #2
0
ファイル: regex_lib_helper.py プロジェクト: sbeleuz/JusticeAI
def create_regex_cluster_bin(min_match_percentage):
    """
    Build the regex-to-cluster mappings for facts and demands and store them
    together in the binary file 'cluster_regex_dict.bin'.

    :param min_match_percentage: min percentage of sentence to be matched to a regex
    :return: None
    """
    unpack_fact_demand_bin()
    # Dict literal values are evaluated in order: 'fact' first, then 'demand'.
    mapping_by_kind = {
        'fact': cluster_regex_mapper('fact', min_match_percentage),
        'demand': cluster_regex_mapper('demand', min_match_percentage),
    }
    Save().save_binary('cluster_regex_dict.bin', mapping_by_kind)
コード例 #3
0
 def save(self):
     """
         Saves the regressor, its scaler and the model metrics.

         The regressor (second pipeline step) is written with its own
         ``model.save`` method; joblib is not used for the regressor as it
         is not supported. The scaler (first pipeline step) and the result
         of ``data_metrics()`` are written through ``Save.save_binary``.
         ``self.dataset`` is reset to None afterwards.
     """
     regressor_name = self.regressor_name
     regressor_file = '{}_regressor.bin'.format(regressor_name)
     file_path = os.path.join(Path.binary_directory, regressor_file)
     Log.write("saving " + regressor_file + " to: " + file_path)
     self.model.steps[1][1].model.save(file_path)
     # Report success only once the write above has actually returned.
     Log.write(regressor_file + " saved to: " + file_path)
     Save().save_binary('{}_scaler.bin'.format(regressor_name), self.model.steps[0][1])
     Save().save_binary('model_metrics.bin', self.data_metrics())
     self.dataset = None
コード例 #4
0
ファイル: similar_finder.py プロジェクト: sbeleuz/JusticeAI
    def __init__(self, train=False, dataset=None):
        """
            SimilarFinder is used to obtain the most similar cases to a given
            sample. It uses scikit-learn's nearest neighbor implementation.

            When ``train`` is False the model, case numbers and scaler are
            loaded from their binary files. When ``train`` is True a scaler
            and a 5-neighbor euclidean model are fitted on the concatenated
            facts/outcomes vectors of ``dataset`` and then saved to binary.

            param: train: will train the model
            param: dataset: sequence of precedent vectors, each providing
                            'facts_vector', 'outcomes_vector' and 'name'.
                            Ignored if train is set to False
            raises: ValueError: if train is True and dataset is missing or empty
        """
        if not train:
            self.model = Load.load_binary("similarity_model.bin")
            self.case_numbers = Load.load_binary("similarity_case_numbers.bin")
            self.scaler = Load.load_binary("similarity_scaler.bin")
            return

        # len() rather than truthiness: dataset may be a numpy array.
        if dataset is None or len(dataset) == 0:
            raise ValueError('Please train or load the classifier first')

        sample_set = [
            np.concatenate(
                [vec['facts_vector'], vec['outcomes_vector']]
            ).astype(np.int64)
            for vec in dataset
        ]

        self.scaler = StandardScaler()
        sample_set = self.scaler.fit_transform(sample_set)
        # n_neighbors is keyword-only in current scikit-learn releases.
        self.model = NearestNeighbors(n_neighbors=5, metric='euclidean')
        self.model.fit(sample_set)
        self.case_numbers = [vec['name'] for vec in dataset]

        save = Save()
        save.save_binary("similarity_model.bin", self.model)
        save.save_binary("similarity_case_numbers.bin", self.case_numbers)
        save.save_binary("similarity_scaler.bin", self.scaler)
コード例 #5
0
    def tag_precedents(self, nb_files=-1):
        """
        1) Displays progress of files vectorized
        2) Vectorize file
        3) Logs the fraction of files covered
        4) Logs the fraction of lines covered
        5) binarize vectors

        The coverage fractions are logged as 0.0 when no file (or no line)
        was processed, instead of raising ZeroDivisionError.

        :param nb_files when -1 then read all directory
        :return:
        structured_data_dict:{
            filename:{
                name: 'AZ-XXXXXXX.txt',
                demands_vector: [...],
                facts_vector: [...],
                outcomes_vector: [...]
            }
        }
        """

        # ----------------------- 1 -----------------------------#
        Log.write('Tagging precedents')
        # List the directory once; the listing is only needed for iteration
        # and for the progress denominator.
        precedent_files = os.listdir(self.precedents_directory_path)
        for file in precedent_files:
            if nb_files == -1:
                percent = float(self.nb_text / len(precedent_files)) * 100
            else:
                percent = float(self.nb_text / nb_files) * 100
                # NOTE(review): if nb_text starts at 0, '>' lets nb_files + 1
                # files through before breaking -- confirm whether '>=' is
                # intended. Left unchanged to preserve current behavior.
                if self.nb_text > nb_files:
                    break
            stdout.write("\rPrecedents tagged: %f " % percent)
            stdout.flush()

            # ----------------------- 2 -----------------------------#
            self.precedent_vector[file] = self.__tag_file(file)
            self.nb_text += 1

        # ----------------------- 3 -----------------------------#
        if self.nb_text:
            precedent_coverage = self.text_tagged / self.nb_text
        else:
            precedent_coverage = 0.0
        Log.write('Precedent coverage: ' + str(precedent_coverage))

        # ----------------------- 4 -----------------------------#
        if self.nb_lines:
            line_coverage = self.statements_tagged / self.nb_lines
        else:
            line_coverage = 0.0
        Log.write('Line Coverage: ' + str(line_coverage))

        # ----------------------- 5 -----------------------------#
        save = Save()
        save.save_binary('precedent_vectors.bin', self.precedent_vector)
        return self.precedent_vector
コード例 #6
0
ファイル: regex_lib_helper.py プロジェクト: sbeleuz/JusticeAI
def create_regex_bin():
    """
    Collect the regex tables exposed by RegexLib into one dictionary and
    store it in the binary file 'regexes.bin'.

    Keys written: 'regex_demands', 'regex_facts', 'regex_outcomes' and
    'MONEY_REGEX', each holding the RegexLib attribute of the same name.

    :return: None
    """
    lib = RegexLib()
    regex_table = {
        'regex_demands': lib.regex_demands,
        'regex_facts': lib.regex_facts,
        'regex_outcomes': lib.regex_outcomes,
        'MONEY_REGEX': lib.MONEY_REGEX,
    }
    Save().save_binary('regexes.bin', regex_table)
コード例 #7
0
ファイル: multi_class_svm.py プロジェクト: sbeleuz/JusticeAI
    def save(self):
        """
        Persist the classifier labels and the classifier model.

        Since the regression and classifier models are separate, each
        classifier column gets a new index: the position of its class in
        ``self.mlb.classes_``.

        1) Map every column position to a (label, data type) pair taken from
           the 'outcomes_vector' intent index and save it to binary
        2) Save the prediction model to binary
        :return:
        """
        # ------------------- 1 -----------------------------
        outcome_index = TagPrecedents().get_intent_index()['outcomes_vector']
        labels_by_position = {}
        for position, class_id in enumerate(self.mlb.classes_):
            entry = outcome_index[class_id]
            labels_by_position[position] = (entry[1], entry[2])
        writer = Save()
        writer.save_binary("classifier_labels.bin", labels_by_position)

        # ------------------- 2 -----------------------------
        writer.save_binary("multi_class_svm_model.bin", self.model)
コード例 #8
0
ファイル: multi_class_svm.py プロジェクト: sbeleuz/JusticeAI
    def __test(self, x_test, y_test):
        """
        Evaluate the classifier column by column and record the accuracies.

        1) Predicts on x_test and, for every output column, logs accuracy,
           precision, recall and F1 against y_test
        2) Stores each column's accuracy in the model metrics binary

        Layout of model_metrics -->
        {
            'data_set':{
                'size': 5000
            },
            'regressor':{
                'regressor name':{
                    'std': 4,
                    'mean': 5,
                    'variance': 42,
                    'mean_fact_vector': [3, 1, 5, 6, 2]
                }
            },
            'classifier':{
                'classifier name':{
                    'prediction_accuracy': 0.92,
                }
            }
        }

        :param x_test: numpy array
        :param y_test: numpy array
        :return: None
        """
        metrics = Load.load_binary('model_metrics.bin')
        if metrics is None:
            metrics = {}
        # Make sure the 'classifier' section exists without clobbering it.
        metrics.setdefault('classifier', {})
        metrics['data_set'] = {'size': len(self.data_set)}

        outcome_index = TagPrecedents().get_intent_index()['outcomes_vector']
        Log.write("Testing Classifier")
        predictions = self.model.predict(x_test)
        Log.write("Classifier results:\n")

        column_count = len(predictions[0])
        for column in range(column_count):
            # Slice with a list to keep each column two-dimensional.
            predicted = predictions[:, [column]]
            expected = y_test[:, [column]]
            accuracy = np.sum(predicted == expected) * 100.0 / len(expected)
            column_name = outcome_index[self.mlb.classes_[column]][1]
            scores = precision_recall_fscore_support(expected, predicted)
            precision, recall, f1, _ = scores

            Log.write('Column: {}'.format(column_name))
            Log.write('Test accuracy: {}%'.format(accuracy))
            Log.write('Precision: {}'.format(precision))
            Log.write('Recall: {}'.format(recall))
            Log.write('F1: {}\n'.format(f1))

            metrics['classifier'][column_name] = {
                'prediction_accuracy': accuracy,
            }
        Save().save_binary('model_metrics.bin', metrics)