Example #1
0
class Fasttext_clf(BaseEstimator, ClassifierMixin):
    """Scikit-learn compatible wrapper around a pre-trained fastText classifier.

    The model ships with the package and is loaded once in ``__init__``;
    ``fit`` is therefore a no-op.
    """

    # Path of the bundled quantized fastText model.
    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')

    def __init__(self, path=data_path):
        # Load the fastText model (defaults to the bundled .ftz file).
        self.model = FastText(path)
        # Label returned when the model yields no prediction for a string.
        self.default = '0'

    def fit(self, X, y):
        """No-op: the wrapped model is already trained."""
        return self

    def predict(self, X):
        """Predict labels for a single string or a list of strings.

        For a single string, falls back to ``self.default`` when the model
        returns an empty prediction.
        """
        results = []
        if isinstance(X, str):
            res = self.model.predict_single(X)
            results.append(res if res else self.default)
        elif isinstance(X, list):
            # Bug fix: the model was previously queried twice (once into an
            # unused variable, then again inline) — predict only once.
            results.extend(self.model.predict(X))
        return results

    def predict_proba(self, X):
        """Return (label, probability) predictions for a string or list of strings."""
        results = []
        if isinstance(X, str):
            results.append(self.model.predict_proba_single(X))
        elif isinstance(X, list):
            results.extend(self.model.predict_proba(X))
        return results
 def fasttext(text):
     """Score *text* with the audit fastText model.

     Newlines are flattened to spaces, a trailing newline is appended
     (fastText expects line-terminated input), and the probability of the
     top-ranked label is returned.
     """
     clf = FastText('./model_audit.bin')
     flattened = text.replace('\n', ' ')
     query = flattened + '\n'
     top_predictions = clf.predict_proba_single(query, k=2)
     return top_predictions[0][1]
Example #3
0
def classifier(sentence):
    """Return the top predicted label for *sentence* using the 'train.bin' model."""
    ft_model = FastText('train.bin')
    predictions = ft_model.predict_proba_single(sentence, k=2)
    best_label = predictions[0][0]
    return best_label
Example #4
0
class model(object):
    """Wrapper around one named/versioned fastText model.

    Handles file-path resolution, loading, training and prediction.
    ``load``/``predict`` report errors via returned strings/lists rather than
    exceptions — callers appear to check the returned text.
    """

    # Class-level defaults; real values are assigned per instance.
    name = ""
    version = 0
    supervised = True
    ft = None          # loaded FastText instance (set by load())
    loaded = False
    quantized = False
    config = None      # training config (learningRate, epoch, ngrams, method)
    filepath = ""

    def __init__(self, name, version, supervised, quantized):
        self.name = name
        self.version = version
        self.supervised = supervised
        self.quantized = quantized

        # Quantized models use the .ftz extension, full models .bin.
        extension = "ftz" if quantized else "bin"
        self.filepath = f"{MODELDIR}{self.name}/{self.version!s}/model.{extension}"

    def quantize(self):
        # Not implemented yet.
        logger.error("TODO")

    def load(self):
        """Load the fastText model from ``self.filepath``.

        Returns "success" on success, or an error-message string on failure.
        """
        try:
            self.ft = FastText(self.filepath)
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate.
        except Exception:
            return "Failed to Load FT file"
        logger.info(f"loaded file {self.filepath}")
        self.loaded = True
        return "success"

    def train(self, trainingfile):
        """Starts model building"""

        logger.info(
            f'Training started with : learningRate:{self.config.learningRate!s}, epoch:{self.config.epoch!s}, ngrams :{self.config.ngrams!s}'
        )
        model = FastText()
        if self.supervised:
            # Bug fix: this branch read ``self.config.epochs`` while the log
            # line above and the cbow/skipgram branches read
            # ``self.config.epoch`` — made consistent.
            model.supervised(input=trainingfile,
                             output=self.filepath,
                             epoch=self.config.epoch,
                             lr=self.config.learningRate,
                             wordNgrams=self.config.ngrams,
                             verbose=2,
                             minCount=1)
        elif self.config.method == "cbow":
            model.cbow(input=trainingfile,
                       output='model',
                       epoch=self.config.epoch,
                       lr=self.config.learningRate)
        else:
            model.skipgram(input=trainingfile,
                           output='model',
                           epoch=self.config.epoch,
                           lr=self.config.learningRate)

    def predict(self, text, nbpredictions=1):
        """Return up to ``nbpredictions`` {"category", "confidence"} dicts for *text*.

        Returns ['error', message] when the model has not been loaded yet.
        """
        if not self.loaded:
            return ['error', "please load model first"]

        logger.info(f"making prediction for {text}")
        predictions = self.ft.predict_proba_single(text, k=nbpredictions)
        logger.info(predictions)
        results = []
        for prediction in predictions:
            # Keep only well-formed (label, probability) pairs.
            if len(prediction) == 2:
                result = {
                    "category": prediction[0],
                    "confidence": prediction[1]
                }
                results.append(result)
                logger.info(f"{prediction[0]} {prediction[1]!s}")

        return results
Example #5
0
class FasttextClassifier(TextClassificationModel):
    """Text-classification model backed by a pyfasttext classifier file."""

    # Key under which the fastText model file is registered with the base class.
    FT_MODEL_KEY = "ft_classifier.model"

    REQUIREMENTS = ["pyfasttext"]

    def __init__(self,
                 ft_classifier_path,
                 transform_func=None,
                 init_func=None,
                 **kwargs):
        """Register the model file and initialize the fastText instance.

        Args:
            ft_classifier_path: path to an existing fastText classifier file.
            transform_func: optional function applied to input data before
                prediction (must be a plain function, not any other callable).
            init_func: optional hook called with ``model=self`` after the
                fastText instance is created.

        Raises:
            FileNotFoundError: if ``ft_classifier_path`` does not exist.
        """
        super(FasttextClassifier, self).__init__(**kwargs)

        if not os.path.isfile(ft_classifier_path):
            raise FileNotFoundError('File does not exist: %s' %
                                    ft_classifier_path)

        if self.name is None:
            # use filename without extension as name
            self.name = simplify(get_file_name(ft_classifier_path))

        self.init_func = init_func
        self.transform_func = transform_func

        self.add_requirements(FasttextClassifier.REQUIREMENTS)

        self.add_file(FasttextClassifier.FT_MODEL_KEY, ft_classifier_path)
        self._init_model()

    @overrides
    def _init_model(self):
        # Load the registered fastText model file and run the optional hook.
        self.model_instance = FastText(
            self.get_file(FasttextClassifier.FT_MODEL_KEY))
        if self.init_func:
            self.init_func(model=self)

    @overrides
    def _save_model(self, output_path):
        # NOTE(review): only drops the in-memory instance; presumably the
        # model file itself is persisted by the base class — confirm.
        del self.model_instance

    @overrides
    def _predict(self, data, limit=None, **kwargs):
        """Predict labels for a single item; returns a two-column DataFrame."""
        # Clamp the requested number of predictions to the label count.
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels

        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                data = self.transform_func(data, model=self, **kwargs)
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")

        prediction = self.model_instance.predict_proba_single(data, k=limit)
        if not prediction:  # idiom fix: truthiness instead of == []
            # add no predictions of failed to predict something.
            # TODO is this the best solution
            new_labels = ["NO_PREDICTION"] * limit
            new_probabilities = [0.0] * limit
            prediction = list(zip(new_labels, new_probabilities))

        return pd.DataFrame(prediction, columns=[ITEM_COLUMN, SCORE_COLUMN])

    @overrides
    def predict_batch(self, data, limit=None, **kwargs):
        """Predict labels for a list of items; returns a two-column DataFrame
        with one row of label/probability lists per input item."""
        # Todo predict batch better function
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels

        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                # transform is only on a single item
                data = [
                    self.transform_func(item, model=self, **kwargs)
                    for item in data
                ]
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")

        prediction = self.model_instance.predict_proba(data, k=limit)

        labels = []
        probabilities = []

        for entry in prediction:  # index was unused — dropped enumerate
            if not entry:  # idiom fix: truthiness instead of == []
                new_labels = ["NO_PREDICTION"] * limit
                new_probabilities = [0.0] * limit
            else:
                new_labels, new_probabilities = list(zip(*entry))
            labels.append(list(new_labels))
            probabilities.append(list(new_probabilities))

        df = pd.DataFrame()
        df[ITEM_COLUMN] = labels
        df[SCORE_COLUMN] = probabilities

        return df