def run(self):
    '''Runs the pipeline step.

    Builds a Classifier from the label dictionary, vectorizer and model
    named in ``self.input``, classifies the ``name`` column of the
    predictions CSV and prints every row whose predicted label name
    differs from the expected one.
    '''
    label_lookup = persistence.json_to_obj(self.input['label_dict'])
    vectorizer = persistence.bin_to_obj(
        self.input['TrainClassifier_vectorizer'])
    model = persistence.bin_to_obj(self.input['TrainClassifier_model'])
    classifier = Classifier(vectorizer, model, label_lookup)

    def expected_name(record):
        # The label dictionary is indexed by the string form of the value.
        return label_lookup[str(record['expected'])]

    def predicted_name(record):
        return classifier.predict(record['name'])['label']['name']

    def report_mismatch(record):
        # Nothing to report when the prediction matches.
        if record['actual'] == record['expected']:
            return
        self.print(
            '\'{name}\' [ expected: {expected}, actual: {actual} ]',
            name=record['name'],
            expected=record['expected'],
            actual=record['actual'])

    frame = pd.read_csv(self.input['predictions'])
    frame['expected'] = frame.apply(expected_name, axis=1)
    frame['actual'] = frame.apply(predicted_name, axis=1)
    frame.apply(report_mismatch, axis=1)
def predict(filename, model_id='latest'):
    '''
    Makes a prediction using the classification model.

    Args:
        filename (string): The filename to evaluate.
        model_id (string): the id of the model to use.

    Returns:
        tuple: the ``label`` and ``probability`` entries of the
        classifier's result, in that order.
    '''
    base = __get_model_path(model_id)

    # The pickles are located by plain string concatenation onto the
    # model path.
    vectorizer = persistence.bin_to_obj(base + 'classifier_vec.pickle')
    model = persistence.bin_to_obj(base + 'classifier_mdl.pickle')
    label_lookup = persistence.json_to_obj(
        'data/processed/label_dictionary.json')

    result = Classifier(vectorizer, model, label_lookup).predict(filename)
    return result['label'], result['probability']
def predict(filename, model_id='latest'):
    '''
    Makes a prediction using the named entity recognition model.

    Args:
        filename (string): The filename to evaluate.
        model_id (string): the id of the model to use.

    Returns:
        The result of ``EntityRecognizer.predict`` for the filename.
    '''
    base = __get_model_path(model_id)

    # Start from the model that ner.get_model() provides, then load the
    # stored bytes into it.
    pipeline, _ = ner.get_model()
    serialized = persistence.bin_to_obj(base + 'ner_mdl.pickle')
    pipeline.from_bytes(serialized)

    return EntityRecognizer(pipeline).predict(filename)
def run(self):
    '''Runs the pipeline step.

    Loads the training data named by ``self.input['train_data']``, trains
    the NER pipe for a fixed number of iterations and writes the
    serialized model to the path built from ``self.output``.
    ``self.output['model']`` is overwritten with the resolved path.
    '''
    iterations = 10

    train_data = persistence.bin_to_obj(self.input['train_data'])
    nlp, ner_pipe = ner.get_model()

    # Add labels. Examples that carry no "entities" entry contribute no
    # labels instead of raising TypeError when iterating None.
    for _, annotations in train_data:
        for ent in annotations.get("entities") or ():
            ner_pipe.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        nlp.begin_training()
        for _ in range(iterations):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(
                train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.5, losses=losses)

    output_dir = self.output['output_dir'].format(
        timestamp=self.__get_timestamp())
    # makedirs with exist_ok avoids the check-then-create race and also
    # creates missing parent directories.
    os.makedirs(output_dir, exist_ok=True)

    self.output['model'] = self.output['model'].format(
        output_dir=output_dir)
    persistence.obj_to_bin(nlp.to_bytes(), self.output['model'])
def run(self):
    '''Runs the pipeline step.

    Restores the NER model named by ``self.input['TrainNerModel_model']``,
    runs the recognizer over the ``name`` column of the predictions CSV
    and prints every difference from the ``expected`` column.
    '''
    nlp, _ = ner.get_model()
    nlp.from_bytes(persistence.bin_to_obj(self.input['TrainNerModel_model']))
    recognizer = EntityRecognizer(nlp)

    template = '\'{name}\' [ expected: {expected}, actual: {actual} ]'

    def recognize(record):
        return recognizer.predict(record['name'])

    def report_mismatch(record):
        found = list(record['actual'])
        # The expected column holds a Python literal as text.
        wanted = list(ast.literal_eval(record['expected']))

        # A length mismatch is reported once, for the whole row.
        if len(found) != len(wanted):
            self.print(
                template,
                name=record['name'],
                expected=record['expected'],
                actual=record['actual'])
            return

        # Same length: compare pairwise on the first two fields and
        # report each differing pair.
        for got, want in zip(found, wanted):
            if got[0] != want[0] or got[1] != want[1]:
                self.print(
                    template,
                    name=record['name'],
                    expected=want,
                    actual=got)

    frame = pd.read_csv(self.input['predictions'])
    frame['actual'] = frame.apply(recognize, axis=1)
    frame.apply(report_mismatch, axis=1)
def test_bin_to_obj_null_path_throws_exception():
    '''bin_to_obj raises TypeError when the path is None.'''
    missing_path = None
    with pytest.raises(TypeError):
        persistence.bin_to_obj(missing_path)
def test_bin_to_obj_not_null():
    '''bin_to_obj returns something other than None for the classifier model file.'''
    loaded = persistence.bin_to_obj('models/classifier_mdl.pickle')
    assert loaded is not None
def test_bin_to_obj_empty_path_throws_exception():
    '''bin_to_obj raises FileNotFoundError when the path is an empty string.'''
    empty_path = ''
    with pytest.raises(FileNotFoundError):
        persistence.bin_to_obj(empty_path)
def fixture_model():
    '''Returns an EntityRecognizer wrapping the model restored from models/ner_mdl.pickle.'''
    pipeline, _ = ner.get_model()
    serialized = persistence.bin_to_obj('models/ner_mdl.pickle')
    pipeline.from_bytes(serialized)
    return EntityRecognizer(pipeline)
def fixture_model():
    '''Returns a Classifier built from the vectorizer, model and label dictionary under models/.'''
    vectorizer = persistence.bin_to_obj('models/classifier_vec.pickle')
    model = persistence.bin_to_obj('models/classifier_mdl.pickle')
    label_lookup = persistence.json_to_obj('models/label_dictionary.json')
    return Classifier(vectorizer, model, label_lookup)