class Fasttext_clf(BaseEstimator, ClassifierMixin):
    """Scikit-learn compatible wrapper around the pretrained fastText
    address-detector model bundled with this package."""

    # Path to the bundled quantized fastText model file.
    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')

    def __init__(self, path=data_path):
        # NOTE(review): assumes FastText(path) loads a supervised model —
        # confirm against the pyfasttext version in use.
        self.model = FastText(path)
        # Fallback label returned when the model yields no prediction.
        self.default = '0'

    def fit(self, X, y):
        """No-op: the underlying model is pretrained. Returns self."""
        return self

    def predict(self, X):
        """Predict labels for a single string or a list of strings.

        Returns a list of predicted labels (possibly self.default for an
        empty single-string prediction); empty list for unsupported types.
        """
        results = []
        if isinstance(X, str):
            # BUG FIX: `res` was referenced while its assignment was
            # commented out, raising NameError for string input.
            res = self.model.predict_single(X)
            results.append(self.default if not res else res)
        elif isinstance(X, list):
            # BUG FIX: predict() was previously invoked twice for list
            # input, with the first result discarded.
            results.extend(self.model.predict(X))
        return results

    def predict_proba(self, X):
        """Return (label, probability) predictions for one or many inputs.

        Returns an empty list for unsupported input types.
        """
        results = []
        if isinstance(X, str):
            # BUG FIX: the single-string branch body was commented out,
            # leaving this branch empty.
            results.append(self.model.predict_proba_single(X))
        elif isinstance(X, list):
            results.extend(self.model.predict_proba(X))
        return results
def fasttext(text):
    """Return the probability of the second-ranked prediction for *text*.

    PERF FIX: the model file was re-read from disk on every call; the
    loaded model is now cached on the function object after the first call.
    """
    test_data = text.replace('\n', ' ')
    model = getattr(fasttext, '_model', None)
    if model is None:
        model = FastText('./model_audit.bin')
        fasttext._model = model
    # Newline appended as in the original code — presumably required by
    # the pyfasttext single-prediction API; confirm before removing.
    pred = model.predict_proba_single(test_data + '\n', k=2)
    # pred is a list of (label, probability) pairs; return the top score.
    return pred[0][1]
def classifier(sentence):
    """Return the top predicted label for *sentence*.

    PERF FIX: the model file was re-read from disk on every call; the
    loaded model is now cached on the function object after the first call.
    """
    model = getattr(classifier, '_model', None)
    if model is None:
        model = FastText('train.bin')
        classifier._model = model
    # predict_proba_single returns (label, probability) pairs; take the
    # label of the highest-ranked prediction.
    return model.predict_proba_single(sentence, k=2)[0][0]
class model(object):
    """Wrapper around a fastText model on disk: load, train, and predict."""

    name = ""
    version = 0
    supervised = True
    ft = None           # loaded FastText instance (set by load())
    loaded = False      # True once load() succeeds
    quantized = False
    config = None       # training configuration object (set externally)
    filepath = ""

    def __init__(self, name, version, supervised, quantized):
        self.name = name
        self.version = version
        self.supervised = supervised
        self.quantized = quantized
        # Quantized models use the .ftz extension, full models use .bin.
        ext = "ftz" if quantized else "bin"
        self.filepath = f"{MODELDIR}{self.name}/{self.version!s}/model.{ext}"

    def quantize(self):
        # Not implemented yet.
        logger.error("TODO")

    def load(self):
        """Load the model file from self.filepath.

        Returns "success" on success, or an error string on failure.
        """
        try:
            self.ft = FastText(self.filepath)
        except Exception:
            # BUG FIX: a bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; narrow to Exception.
            return "Failed to Load FT file"
        logger.info(f"loaded file {self.filepath}")
        self.loaded = True
        return "success"

    def train(self, trainingfile):
        """Starts model building"""
        logger.info(
            f'Training started with : learningRate:{self.config.learningRate!s}, epoch:{self.config.epoch!s}, ngrams :{self.config.ngrams!s}'
        )
        model = FastText()
        if self.supervised:
            # NOTE(review): this branch reads `config.epochs` while the
            # cbow/skipgram branches read `config.epoch` — one of the two
            # is likely a typo; confirm which attribute the config defines.
            model.supervised(input=trainingfile,
                             output=self.filepath,
                             epoch=self.config.epochs,
                             lr=self.config.learningRate,
                             wordNgrams=self.config.ngrams,
                             verbose=2,
                             minCount=1)
        elif self.config.method == "cbow":
            model.cbow(input=trainingfile,
                       output='model',
                       epoch=self.config.epoch,
                       lr=self.config.learningRate)
        else:
            model.skipgram(input=trainingfile,
                           output='model',
                           epoch=self.config.epoch,
                           lr=self.config.learningRate)

    def predict(self, text, nbpredictions=1):
        """Return up to *nbpredictions* {"category", "confidence"} dicts.

        Returns ['error', <message>] if the model has not been loaded yet.
        """
        if not self.loaded:
            return ['error', "please load model first"]
        logger.info(f"making prediction for {text}")
        predictions = self.ft.predict_proba_single(text, k=nbpredictions)
        logger.info(predictions)
        results = []
        for prediction in predictions:
            # Only well-formed (label, probability) pairs are kept.
            if len(prediction) == 2:
                category, confidence = prediction
                results.append({
                    "category": category,
                    "confidence": confidence
                })
                logger.info(f"{category} {confidence!s}")
        return results
class FasttextClassifier(TextClassificationModel):
    """Text classification model backed by a pyfasttext classifier file."""

    FT_MODEL_KEY = "ft_classifier.model"
    REQUIREMENTS = ["pyfasttext"]

    def __init__(self, ft_classifier_path, transform_func=None, init_func=None,
                 **kwargs):
        """Wrap an on-disk fastText classifier.

        ft_classifier_path -- path to the fastText model file (must exist).
        transform_func -- optional function applied to input data before
            prediction; called as transform_func(item, model=self, **kwargs).
        init_func -- optional hook called as init_func(model=self) after
            the model is loaded.

        Raises FileNotFoundError if ft_classifier_path does not exist.
        """
        super(FasttextClassifier, self).__init__(**kwargs)
        if not os.path.isfile(ft_classifier_path):
            raise FileNotFoundError('File does not exist: %s' % ft_classifier_path)
        if self.name is None:
            # use filename without extension as name
            self.name = simplify(get_file_name(ft_classifier_path))
        self.init_func = init_func
        self.transform_func = transform_func
        self.add_requirements(FasttextClassifier.REQUIREMENTS)
        self.add_file(FasttextClassifier.FT_MODEL_KEY, ft_classifier_path)
        self._init_model()

    @overrides
    def _init_model(self):
        # Load the fastText model from the file registered in __init__.
        self.model_instance = FastText(
            self.get_file(FasttextClassifier.FT_MODEL_KEY))
        if self.init_func:
            self.init_func(model=self)

    @overrides
    def _save_model(self, output_path):
        # The raw model file is already tracked via add_file(); only drop
        # the in-memory instance here so it is not serialized as well.
        del self.model_instance

    @overrides
    def _predict(self, data, limit=None, **kwargs):
        """Predict up to *limit* labels for a single item.

        Returns a DataFrame with ITEM_COLUMN/SCORE_COLUMN rows, padded with
        "NO_PREDICTION"/0.0 placeholders when the model returns nothing.
        """
        # Cap the requested number of predictions at the label count.
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels
        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                data = self.transform_func(data, model=self, **kwargs)
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")
        prediction = self.model_instance.predict_proba_single(data, k=limit)
        if not prediction:
            # Add placeholder predictions if the model failed to predict
            # anything.
            # TODO is this the best solution
            new_labels = ["NO_PREDICTION"] * limit
            new_probabilities = [0.0] * limit
            prediction = list(zip(new_labels, new_probabilities))
        return pd.DataFrame(prediction, columns=[ITEM_COLUMN, SCORE_COLUMN])

    @overrides
    def predict_batch(self, data, limit=None, **kwargs):
        """Predict up to *limit* labels for each item in *data*.

        Returns a DataFrame with one row per item: ITEM_COLUMN holds a list
        of labels, SCORE_COLUMN a list of probabilities.
        """
        # Todo predict batch better function
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels
        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                # transform is only on a single item
                data = [
                    self.transform_func(item, model=self, **kwargs)
                    for item in data
                ]
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")
        prediction = self.model_instance.predict_proba(data, k=limit)
        labels = []
        probabilities = []
        # Unused enumerate() index removed; `not entry` replaces `entry == []`.
        for entry in prediction:
            if not entry:
                new_labels = ["NO_PREDICTION"] * limit
                new_probabilities = [0.0] * limit
            else:
                new_labels, new_probabilities = list(zip(*entry))
            labels.append(list(new_labels))
            probabilities.append(list(new_probabilities))
        df = pd.DataFrame()
        df[ITEM_COLUMN] = labels
        df[SCORE_COLUMN] = probabilities
        return df