def __init__(self, random_state: int = None, known_latent_traits: Dict[str, Dict[str, float]] = None):
    """Assemble the calibrator and the text-based estimator, then delegate to the parent.

    Two independent TF-IDF + random-forest pipelines are built, one per
    latent trait (difficulty and discrimination), and wrapped in a
    ``FeatureEngAndRegressionEstimatorFromText``.

    :param random_state: seed forwarded to both ``RandomForestRegressor``
        instances; ``None`` leaves them unseeded.
    :param known_latent_traits: optional dictionary whose keys must be exactly
        ``DIFFICULTY`` and ``DISCRIMINATION``. When given, a
        ``KnownParametersCalibrator`` is used; otherwise an ``IRTCalibrator``
        over ``DIFFICULTY_RANGE`` / ``DISCRIMINATION_RANGE`` is used.
        (Inner dictionaries presumably map item ids to trait values - confirm
        against ``KnownParametersCalibrator``.)
    :raises ValueError: if ``known_latent_traits`` is given with any other key set.
    """
    if known_latent_traits is not None:
        # Validate before handing the dictionary to the calibrator, so malformed
        # input always surfaces as this ValueError rather than as whatever the
        # calibrator constructor does with unexpected keys.
        if set(known_latent_traits.keys()) != {DIFFICULTY, DISCRIMINATION}:
            raise ValueError("wrong keys in known_latent_traits dictionary")
        latent_traits_calibrator = KnownParametersCalibrator(latent_traits=known_latent_traits)
    else:
        latent_traits_calibrator = IRTCalibrator(DIFFICULTY_RANGE, DISCRIMINATION_RANGE)

    # Difficulty pipeline: TF-IDF capped at 1000 features, 250 trees of depth <= 50.
    vec_diff = TfidfVectorizer(
        stop_words='english',
        preprocessor=vectorizer_text_preprocessor,
        max_features=1000,
    )
    feat_eng_regression_pipeline_difficulty = FeatureEngAndRegressionPipeline(
        FeatureEngineeringModule([
            IRFeaturesComponent(vec_diff, concatenate_correct=True, concatenate_wrong=True),
        ]),
        RegressionModule([
            SklearnRegressionComponent(
                RandomForestRegressor(n_estimators=250, max_depth=50, random_state=random_state),
                latent_trait_range=DIFFICULTY_RANGE,
            ),
        ]),
    )

    # Discrimination pipeline: TF-IDF capped at 800 features, 100 trees of depth <= 25.
    vec_disc = TfidfVectorizer(
        stop_words='english',
        preprocessor=vectorizer_text_preprocessor,
        max_features=800,
    )
    feat_eng_regression_pipeline_discrimination = FeatureEngAndRegressionPipeline(
        FeatureEngineeringModule([
            IRFeaturesComponent(vec_disc, concatenate_correct=True, concatenate_wrong=True),
        ]),
        RegressionModule([
            SklearnRegressionComponent(
                RandomForestRegressor(n_estimators=100, max_depth=25, random_state=random_state),
                latent_trait_range=DISCRIMINATION_RANGE,
            ),
        ]),
    )

    estimator_from_text = FeatureEngAndRegressionEstimatorFromText({
        DIFFICULTY: feat_eng_regression_pipeline_difficulty,
        DISCRIMINATION: feat_eng_regression_pipeline_discrimination,
    })
    super().__init__(latent_traits_calibrator, estimator_from_text)
# Drop the first 100 test rows: not to use the validation data used in 5.1
# for model selection.
df_test = df_test.drop(df_test.head(100).index)

# Load the known latent traits. The context manager guarantees the pickle
# file handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only acceptable if known_latent_traits.p is a trusted project artifact - confirm.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# NOTE(review): this output handle is not closed within this section; it is
# presumably written to and closed by the code that follows - confirm.
file = open("outputs/5_3_read.txt", 'w')

# pipeline difficulty: readability features only, random forest regressor
pipe_b = FeatureEngAndRegressionPipeline(
    FeatureEngineeringModule([ReadabilityFeaturesComponent()]),
    RegressionModule([
        SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=B_RANGE),
    ]),
)

# pipeline discrimination: same structure, constrained to the discrimination range
pipe_a = FeatureEngAndRegressionPipeline(
    FeatureEngineeringModule([ReadabilityFeaturesComponent()]),
    RegressionModule([
        SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=A_RANGE),
    ]),
)

# create estimator from text from the previous pipelines
estimator_from_text = FeatureEngAndRegressionEstimatorFromText({
    DIFFICULTY: pipe_b,
    DISCRIMINATION: pipe_a,
})
# model, as obtained in the scripts 5_1_* pipeline_difficulty = FeatureEngAndRegressionPipeline( FeatureEngineeringModule([ IRFeaturesComponent(TfidfVectorizer( stop_words='english', preprocessor=vectorizer_text_preprocessor, min_df=0.02, max_df=0.92), concatenate_correct=True, concatenate_wrong=True), LinguisticFeaturesComponent(), ReadabilityFeaturesComponent(), ]), RegressionModule([ SklearnRegressionComponent(RFRegressor(n_estimators=100, max_depth=20, random_state=SEED), latent_trait_range=DIFFICULTY_RANGE) ])) pipeline_discrimination = FeatureEngAndRegressionPipeline( FeatureEngineeringModule([ IRFeaturesComponent(TfidfVectorizer( stop_words='english', preprocessor=vectorizer_text_preprocessor, min_df=0.02, max_df=0.96), concatenate_correct=True, concatenate_wrong=True), LinguisticFeaturesComponent(), ReadabilityFeaturesComponent(), ]), RegressionModule([
# pipeline difficulty vec_b = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df) pipe_b = FeatureEngAndRegressionPipeline( FeatureEngineeringModule([ IRFeaturesComponent(vec_b, concatenate_correct=True, concatenate_wrong=True), LinguisticFeaturesComponent(), ReadabilityFeaturesComponent(), ]), RegressionModule([ SklearnRegressionComponent(SVR(), latent_trait_range=B_RANGE) ])) # pipeline discrimination vec_a = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df) pipe_a = FeatureEngAndRegressionPipeline( FeatureEngineeringModule([ IRFeaturesComponent(vec_a, concatenate_correct=True, concatenate_wrong=True), LinguisticFeaturesComponent(), ReadabilityFeaturesComponent(), ]), RegressionModule([
# Grid search over the TF-IDF document-frequency thresholds.
# np.arange excludes its stop value, so the stops 0.11 and 1.01 are chosen
# to keep 0.10 and 1.00 inside the grid (step 0.02 in both dimensions).
for min_df in np.arange(0.00, 0.11, 0.02):
    for max_df in np.arange(0.90, 1.01, 0.02):
        # One output file per (min_df, max_df) combination; the pair is
        # written first as a header.
        # NOTE(review): the handle is not closed in the visible part of the
        # loop body - confirm it is closed further down.
        file = open("outputs/5_1_model_selection_DT_mindf_%.2f_maxdf_%.2f.txt" % (min_df, max_df), 'w')
        file.write("MIN_DF = %.2f - MAX DF = %.2f" % (min_df, max_df))

        # pipeline difficulty: IR (TF-IDF) + linguistic + readability features
        # feeding a decision tree constrained to B_RANGE
        vec_b = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_b = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_b, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
                SklearnRegressionComponent(DecisionTreeRegressor(random_state=SEED), latent_trait_range=B_RANGE)
            ])
        )

        # pipeline discrimination: same feature set with its own vectorizer
        # instance, decision tree constrained to A_RANGE
        vec_a = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_a = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_a, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
                SklearnRegressionComponent(DecisionTreeRegressor(random_state=SEED), latent_trait_range=A_RANGE)
            ])
        )

        # create estimator from text from the previous pipelines