def run(self):
        '''Runs the pipeline step.

        Rebuilds the classifier from the vectorizer, model and label
        dictionary referenced by ``self.input``, predicts a label for every
        row of the predictions CSV and prints each row whose predicted label
        differs from the expected one.
        '''
        label_lookup = persistence.json_to_obj(self.input['label_dict'])
        vectorizer = persistence.bin_to_obj(
            self.input['TrainClassifier_vectorizer'])
        model = persistence.bin_to_obj(self.input['TrainClassifier_model'])
        classifier = Classifier(vectorizer, model, label_lookup)

        def expected_label(row):
            # The 'expected' column is converted to str before the lookup.
            return label_lookup[str(row['expected'])]

        def predicted_label(row):
            return classifier.predict(row['name'])['label']['name']

        frame = pd.read_csv(self.input['predictions'])
        frame['expected'] = frame.apply(expected_label, axis=1)
        frame['actual'] = frame.apply(predicted_label, axis=1)

        def report_mismatch(row):
            # Only rows where prediction and expectation disagree are shown.
            if row['actual'] == row['expected']:
                return
            self.print(
                '\'{name}\' [ expected: {expected}, actual: {actual} ]',
                name=row['name'],
                expected=row['expected'],
                actual=row['actual'])

        frame.apply(report_mismatch, axis=1)
Esempio n. 2
0
def predict(filename, model_id='latest'):
    ''' Makes a prediction using the classification model.

    Args:
        filename (string): The filename to evaluate.
        model_id (string): the id of the model to use.

    Returns:
        tuple: the ``label`` and ``probability`` entries of the
        classifier's result, in that order.
    '''
    model_dir = __get_model_path(model_id)
    vectorizer = persistence.bin_to_obj(model_dir + 'classifier_vec.pickle')
    model = persistence.bin_to_obj(model_dir + 'classifier_mdl.pickle')
    label_lookup = persistence.json_to_obj(
        'data/processed/label_dictionary.json')

    result = Classifier(vectorizer, model, label_lookup).predict(filename)
    return result['label'], result['probability']
Esempio n. 3
0
def predict(filename, model_id='latest'):
    ''' Makes a prediction using the named entity recognition model.

    Args:
        filename (string): The filename to evaluate.
        model_id (string): the id of the model to use.

    Returns:
        Whatever ``EntityRecognizer.predict`` returns for the filename.
    '''
    model_dir = __get_model_path(model_id)
    nlp, _ = ner.get_model()
    # Restore the stored pipeline state into the freshly created model.
    nlp.from_bytes(persistence.bin_to_obj(model_dir + 'ner_mdl.pickle'))
    return EntityRecognizer(nlp).predict(filename)
    def run(self):
        '''Runs the pipeline step.

        Loads the training data referenced by ``self.input['train_data']``,
        registers every entity label found in it with the NER pipe, trains
        for a fixed number of iterations with all other pipes disabled and
        writes the serialized model to the path in ``self.output['model']``.

        Side effects:
            Shuffles the loaded training data in place, creates the output
            directory when it is missing and rewrites
            ``self.output['model']`` with the resolved model path.
        '''
        iterations = 10
        train_data = persistence.bin_to_obj(self.input['train_data'])
        nlp, ner_pipe = ner.get_model()

        # Register every entity label used by the annotations. An example
        # without an "entities" entry simply contributes no labels instead
        # of aborting the whole step.
        for _, annotations in train_data:
            for ent in annotations.get("entities") or ():
                ner_pipe.add_label(ent[2])

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        with nlp.disable_pipes(*other_pipes):  # only train NER
            nlp.begin_training()
            for _ in range(iterations):
                random.shuffle(train_data)
                # Handed to nlp.update on every batch; not reported here.
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(train_data,
                                    size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, drop=0.5, losses=losses)

        output_dir = self.output['output_dir'].format(
            timestamp=self.__get_timestamp())
        # makedirs also creates missing parent directories and does not
        # race with another process creating the same directory.
        os.makedirs(output_dir, exist_ok=True)

        self.output['model'] = self.output['model'].format(
            output_dir=output_dir)

        persistence.obj_to_bin(nlp.to_bytes(), self.output['model'])
    def run(self):
        '''Runs the pipeline step.

        Restores the NER model referenced by ``self.input``, runs the
        recognizer on every row of the predictions CSV and prints the rows
        whose recognized entities differ from the expected ones.
        '''
        nlp, _ = ner.get_model()
        model_bytes = persistence.bin_to_obj(self.input['TrainNerModel_model'])
        nlp.from_bytes(model_bytes)
        recognizer = EntityRecognizer(nlp)

        frame = pd.read_csv(self.input['predictions'])
        frame['actual'] = frame.apply(
            lambda row: recognizer.predict(row['name']), axis=1)

        def report_mismatches(row):
            found = list(row['actual'])
            # The expected entities are stored in the CSV as a Python literal.
            wanted = list(ast.literal_eval(row['expected']))
            template = '\'{name}\' [ expected: {expected}, actual: {actual} ]'

            # A differing entity count is reported once for the whole row.
            if len(found) != len(wanted):
                self.print(
                    template,
                    name=row['name'],
                    expected=row['expected'],
                    actual=row['actual'])
                return

            # Same count: compare pairwise on the first two fields only.
            for got, want in zip(found, wanted):
                if got[0] != want[0] or got[1] != want[1]:
                    self.print(
                        template,
                        name=row['name'],
                        expected=want,
                        actual=got)

        frame.apply(report_mismatches, axis=1)
Esempio n. 6
0
def test_bin_to_obj_null_path_throws_exception():
    '''bin_to_obj raises TypeError when the path is None.'''
    missing_path = None
    with pytest.raises(TypeError):
        persistence.bin_to_obj(missing_path)
Esempio n. 7
0
def test_bin_to_obj_not_null():
    '''bin_to_obj yields an object for the stored classifier model.'''
    loaded = persistence.bin_to_obj('models/classifier_mdl.pickle')
    assert loaded is not None
Esempio n. 8
0
def test_bin_to_obj_empty_path_throws_exception():
    '''bin_to_obj raises FileNotFoundError when the path is empty.'''
    empty_path = ''
    with pytest.raises(FileNotFoundError):
        persistence.bin_to_obj(empty_path)
def fixture_model():
    '''Builds an EntityRecognizer from the stored NER model bytes.'''
    nlp, _ = ner.get_model()
    model_bytes = persistence.bin_to_obj('models/ner_mdl.pickle')
    nlp.from_bytes(model_bytes)
    recognizer = EntityRecognizer(nlp)
    return recognizer
def fixture_model():
    '''Builds a Classifier from the stored vectorizer, model and labels.'''
    vectorizer = persistence.bin_to_obj('models/classifier_vec.pickle')
    model = persistence.bin_to_obj('models/classifier_mdl.pickle')
    label_lookup = persistence.json_to_obj('models/label_dictionary.json')
    return Classifier(vectorizer, model, label_lookup)