Example #1
0
 def __init__(self, random_state: int = None, known_latent_traits: Dict[str, Dict[str, float]] = None):
     """Assemble the latent-trait calibrator and the text-based estimator, then initialise the parent.

     The calibrator is a KnownParametersCalibrator when ``known_latent_traits`` is supplied and an
     IRTCalibrator built from DIFFICULTY_RANGE and DISCRIMINATION_RANGE otherwise.
     The estimator holds one pipeline per latent trait; each pipeline feeds TF-IDF based
     IRFeaturesComponent features into a RandomForestRegressor.

     :param random_state: seed forwarded to both RandomForestRegressor instances.
     :param known_latent_traits: optional dictionary whose keys must be exactly DIFFICULTY and
         DISCRIMINATION; when given, it is handed to KnownParametersCalibrator.
     :raises ValueError: if ``known_latent_traits`` is given with any other set of keys.
     """
     if known_latent_traits is not None:
         # Validate before building the calibrator, so malformed input is rejected with the
         # intended ValueError and never reaches KnownParametersCalibrator.
         if set(known_latent_traits) != {DIFFICULTY, DISCRIMINATION}:
             raise ValueError("wrong keys in known_latent_traits dictionary")
         latent_traits_calibrator = KnownParametersCalibrator(latent_traits=known_latent_traits)
     else:
         latent_traits_calibrator = IRTCalibrator(DIFFICULTY_RANGE, DISCRIMINATION_RANGE)

     # Difficulty pipeline: TF-IDF capped at 1000 features, forest of 250 trees with max depth 50.
     vec_diff = TfidfVectorizer(stop_words='english', preprocessor=vectorizer_text_preprocessor, max_features=1000)
     feat_eng_regression_pipeline_difficulty = FeatureEngAndRegressionPipeline(
         FeatureEngineeringModule([IRFeaturesComponent(vec_diff, concatenate_correct=True, concatenate_wrong=True)]),
         RegressionModule([
             SklearnRegressionComponent(
                 RandomForestRegressor(n_estimators=250, max_depth=50, random_state=random_state),
                 latent_trait_range=DIFFICULTY_RANGE
             )
         ])
     )

     # Discrimination pipeline: TF-IDF capped at 800 features, forest of 100 trees with max depth 25.
     vec_disc = TfidfVectorizer(stop_words='english', preprocessor=vectorizer_text_preprocessor, max_features=800)
     feat_eng_regression_pipeline_discrimination = FeatureEngAndRegressionPipeline(
         FeatureEngineeringModule([IRFeaturesComponent(vec_disc, concatenate_correct=True, concatenate_wrong=True)]),
         RegressionModule([
             SklearnRegressionComponent(
                 RandomForestRegressor(n_estimators=100, max_depth=25, random_state=random_state),
                 latent_trait_range=DISCRIMINATION_RANGE
             )
         ])
     )

     # One pipeline per latent trait, keyed by the trait constant.
     estimator_from_text = FeatureEngAndRegressionEstimatorFromText(
         {
             DIFFICULTY: feat_eng_regression_pipeline_difficulty,
             DISCRIMINATION: feat_eng_regression_pipeline_discrimination
         }
     )
     super().__init__(latent_traits_calibrator, estimator_from_text)
Example #2
0
    FeatureEngineeringModule([ReadabilityFeaturesComponent()]),
    RegressionModule([
        SklearnRegressionComponent(RandomForestRegressor(random_state=SEED),
                                   latent_trait_range=B_RANGE)
    ]))
# Discrimination pipeline: readability features feeding a random forest regressor.
readability_features_a = FeatureEngineeringModule([ReadabilityFeaturesComponent()])
forest_a = RandomForestRegressor(random_state=SEED)
regression_a = RegressionModule(
    [SklearnRegressionComponent(forest_a, latent_trait_range=A_RANGE)])
pipe_a = FeatureEngAndRegressionPipeline(readability_features_a, regression_a)

# Create the estimator from text from the two pipelines, one per latent trait.
pipelines_by_trait = {DIFFICULTY: pipe_b, DISCRIMINATION: pipe_a}
estimator_from_text = FeatureEngAndRegressionEstimatorFromText(pipelines_by_trait)

model = Text2PropsModel(latent_traits_calibrator, estimator_from_text)
# NOTE(review): None is passed as the calibration input; presumably the calibrator
# already holds what it needs -- confirm against latent_traits_calibrator's definition.
model.calibrate_latent_traits(None)

# define parameters for randomized CV
dict_params = {
    DIFFICULTY: [{
        'regressor__n_estimators': randint(20, 200),
        'regressor__max_depth': randint(2, 50)
    }],
    DISCRIMINATION: [{
        'regressor__n_estimators': randint(20, 200),
        'regressor__max_depth': randint(2, 50)
    }],
Example #3
0
                            concatenate_correct=True,
                            concatenate_wrong=True),
        LinguisticFeaturesComponent(),
        ReadabilityFeaturesComponent(),
    ]),
    RegressionModule([
        SklearnRegressionComponent(RFRegressor(n_estimators=100,
                                               max_depth=20,
                                               random_state=SEED),
                                   latent_trait_range=DISCRIMINATION_RANGE)
    ]))
# Build the model from a calibrator over the known latent traits and an
# estimator holding one pipeline per latent trait, then fit it on the training set.
known_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)
text_estimator = FeatureEngAndRegressionEstimatorFromText(
    {DIFFICULTY: pipeline_difficulty, DISCRIMINATION: pipeline_discrimination}
)
model = Text2PropsModel(known_traits_calibrator, text_estimator)
model.train(df_train)
print('[INFO] model trained')

# Estimate the latent traits for the test set.
dict_predictions_test_set = model.predict(df_test)
# model.predict returns a dict of lists (one list per latent trait), so the
# predictions are re-keyed below into one dictionary per latent trait.
dict_predicted_latent_traits = {DIFFICULTY: dict(), DISCRIMINATION: dict()}
for idx, q_id in enumerate(df_test[Q_ID].values):
    dict_predicted_latent_traits[DIFFICULTY][q_id] = dict_predictions_test_set[
        DIFFICULTY][idx]