Example #1
def test_MLPClassifier(irisDataSet, irisClassificationTestCase, testResources):
    featureNames = irisDataSet.getInputOutputData().inputs.columns
    dftNorm = DFTNormalisation([DFTNormalisation.Rule(re.escape(f)) for f in featureNames], defaultTransformerFactory=sklearn.preprocessing.StandardScaler)
    model = sensai.torch.models.MultiLayerPerceptronVectorClassificationModel(hiddenDims=(50, 25, 8), cuda=False, epochs=100, optimiser="adam",
            batchSize=200, normalisationMode=NormalisationMode.NONE, hidActivationFunction=torch.tanh) \
        .withName("torchMLPClassifier") \
        .withInputTransformers([dftNorm]) \
        .withFeatureGenerator(FeatureGeneratorTakeColumns())
    irisClassificationTestCase.testMinAccuracy(model, 0.8)
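Note: since `DFTNormalisation.Rule` interprets its first argument as a regular expression, `re.escape(f)` ensures each feature name is matched literally; `normalisationMode=NormalisationMode.NONE` disables the model's built-in normalisation so that the explicit `DFTNormalisation` input transformer is solely responsible for scaling.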
Example #2
def test_multiColumnSingleRule(self):
    arr = np.array([1, 5, 10])
    df = pd.DataFrame({"foo": arr, "bar": arr * 100})
    dft = DFTNormalisation([
        DFTNormalisation.Rule(
            r"foo|bar",
            transformer=sklearn.preprocessing.MaxAbsScaler(),
            independentColumns=False)
    ])
    df2 = dft.fitApply(df)
    assert np.all(df2.foo == arr / 1000) and np.all(df2.bar == arr / 10)
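With `independentColumns=False`, a single transformer is fitted on the pooled values of all matched columns, so the overall maximum absolute value of 1000 (from `bar`) becomes the common scale for both columns. A minimal standalone sketch of the equivalent computation (illustrative only, not part of the test suite):

import numpy as np
import sklearn.preprocessing

arr = np.array([1, 5, 10])
pooled = np.concatenate([arr, arr * 100]).reshape(-1, 1)  # values of foo and bar combined
scaler = sklearn.preprocessing.MaxAbsScaler().fit(pooled)
assert scaler.scale_[0] == 1000  # the shared scale applied to both columns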
Example #3
def test_arrayValued(self):
    arr = np.array([1, 5, 10])
    df = pd.DataFrame({"foo": [arr, 2 * arr, 10 * arr]})
    dft = DFTNormalisation([
        DFTNormalisation.Rule(
            r"foo|bar",
            transformer=sklearn.preprocessing.MaxAbsScaler(),
            arrayValued=True)
    ])
    df2 = dft.fitApply(df)
    assert np.all(df2.foo.iloc[0] == arr / 100) and np.all(df2.foo.iloc[-1] == arr / 10)
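With `arrayValued=True`, the rule applies to a column whose cells each contain an array, and the transformer is fitted on all the arrays' values combined. The maximum absolute value is therefore 100 (from `10 * arr`), which yields `arr / 100` for the first row and `10 * arr / 100 = arr / 10` for the last, as the assertion checks.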
Example #4
def sentenceEmbeddingFeatureGeneratorFactory(cachePath, persistCache=True):
    columnGen = ColumnGeneratorSentenceEncodings("reviewText",
                                                 encodingProvider,
                                                 cachePath,
                                                 persistCache=persistCache)
    return FeatureGeneratorFromColumnGenerator(
        columnGen,
        normalisationRuleTemplate=DFTNormalisation.RuleTemplate(
            unsupported=True))
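Note: `RuleTemplate(unsupported=True)` declares that normalisation is not supported for the generated encoding columns; as far as we can tell, this makes `DFTNormalisation` raise an error if a matching column would otherwise be normalised, so unintended scaling of the embeddings fails loudly rather than silently.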
Example #5
def test_FeatureGeneratorNAMarker(irisClassificationTestCase):
    """
    Integration test for the handling of N/A values via marker features (using FeatureGeneratorNAMarker) in the context of models
    that do not support N/A values, where the N/A values themselves are replaced with a fixed value (using DFTFillNA)
    """
    iodata = irisClassificationTestCase.data

    # create some random N/A values in the data set
    inputs = iodata.inputs.copy()
    rand = random.Random(42)
    fullIndices = list(range(len(inputs)))
    for col in inputs.columns:
        indices = rand.sample(fullIndices, 20)
        inputs.iloc[indices, inputs.columns.get_loc(col)] = np.nan  # positional assignment avoids chained indexing
    iodata = InputOutputData(inputs, iodata.outputs)

    for useFGNA in (True, False):
        fgs = [
            FeatureGeneratorTakeColumns(
                normalisationRuleTemplate=DFTNormalisation.RuleTemplate(
                    independentColumns=True))
        ]
        if useFGNA:
            fgs.append(FeatureGeneratorNAMarker(inputs.columns))
        fCollector = FeatureCollector(*fgs)
        model = SkLearnMLPVectorClassificationModel() \
            .withFeatureCollector(fCollector) \
            .withInputTransformers(
                DFTNormalisation(fCollector.getNormalisationRules(), defaultTransformerFactory=SkLearnTransformerFactoryFactory.StandardScaler()),
                DFTFillNA(-3))
        # NOTE: using -3 instead of 0 to fill N/A values in order to force the model to learn the purpose of the N/A markers,
        # because 0 values are actually a reasonable fallback (which happens to work) when using StandardScaler
        # NOTE: it is important to apply DFTNormalisation before DFTFillNA, because DFTNormalisation would learn using the filled values otherwise

        ev = VectorClassificationModelEvaluator(iodata, testFraction=0.2)
        ev.fitModel(model)
        result = ev.evalModel(model)
        accuracy = result.getEvalStats().getAccuracy()
        log.info(f"Accuracy (for useFGNA={useFGNA}) = {accuracy}")
        if useFGNA:
            assert accuracy > 0.85
        else:
            assert accuracy < 0.85
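The paired assertions encode the intent of the test: with the N/A marker features, the model can distinguish genuine values from the filled-in `-3` placeholders and is expected to exceed 85% accuracy, whereas without the markers the fill value distorts the features enough to keep accuracy below that threshold.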
Example #6
    flattenedPandasDf: pd.DataFrame = ...  # Load/insert the flattened dataframe from a previous step
    CACHE_PATH: str = ...

    # replace with a lightweight model when running on lambda
    reviewClassifier = models.MultiLayerPerceptronVectorClassificationModel(
        hiddenDims=[50, 50, 20], cuda=False, epochs=300)

    # add the feature generator that was previously used to fill the cache to the model
    # encodingProvider = TextStatEncodingProvider() # for lambda
    encodingProvider = BertBaseMeanEncodingProvider()
    reviewEncodingFeatureGen = sentenceEmbeddingFeatureGeneratorFactory(
        CACHE_PATH, persistCache=False)
    encodedReviewColName = reviewEncodingFeatureGen.columnGen.generatedColumnName
    flattenedSentenceEncodingsFeatureGen = flattenedFeatureGenerator(
        reviewEncodingFeatureGen,
        normalisationRuleTemplate=DFTNormalisation.RuleTemplate(skip=True))
    reviewFeatureCollector = FeatureCollector(
        flattenedSentenceEncodingsFeatureGen)
    reviewClassifier = reviewClassifier.withFeatureCollector(
        reviewFeatureCollector)

    # split off the targets and train
    targetDf = pd.DataFrame(flattenedPandasDf.pop("overall"))
    inputOutputData = InputOutputData(flattenedPandasDf, targetDf)
    evalModelViaEvaluator(reviewClassifier,
                          inputOutputData,
                          testFraction=0.01,
                          plotTargetDistribution=True)

    # save model, load it and try predict as integration test
    with open("reviewClassifier-v1.pickle", 'wb') as f: