Esempio n. 1
0
 def __init__(self, random_state: int = None, known_latent_traits: Dict[str, Dict[str, float]] = None):
     """Assemble the latent-traits calibrator and the text-based estimator.

     Two independent pipelines are built, one per latent trait. Each one
     feeds TF-IDF based IR features into a random forest regressor:

     * difficulty: at most 1000 TF-IDF features, 250 trees, max depth 50;
     * discrimination: at most 800 TF-IDF features, 100 trees, max depth 25.

     :param random_state: seed forwarded to both ``RandomForestRegressor``
         instances.
     :param known_latent_traits: optional dictionary whose keys must be
         exactly ``DIFFICULTY`` and ``DISCRIMINATION``. When given, a
         ``KnownParametersCalibrator`` is used; otherwise an
         ``IRTCalibrator`` is created from the two trait ranges.
     :raises ValueError: if ``known_latent_traits`` is given with any other
         set of keys.
     """
     if known_latent_traits is not None:
         # Validate before handing the dictionary to the calibrator, so that
         # malformed input always surfaces as this ValueError instead of
         # whatever the calibrator does with unexpected keys.
         if set(known_latent_traits) != {DIFFICULTY, DISCRIMINATION}:
             raise ValueError("wrong keys in known_latent_traits dictionary")
         latent_traits_calibrator = KnownParametersCalibrator(latent_traits=known_latent_traits)
     else:
         latent_traits_calibrator = IRTCalibrator(DIFFICULTY_RANGE, DISCRIMINATION_RANGE)

     # Difficulty pipeline: each trait gets its own vectorizer instance.
     vec_diff = TfidfVectorizer(stop_words='english', preprocessor=vectorizer_text_preprocessor, max_features=1000)
     feat_eng_regression_pipeline_difficulty = FeatureEngAndRegressionPipeline(
         FeatureEngineeringModule([IRFeaturesComponent(vec_diff, concatenate_correct=True, concatenate_wrong=True)]),
         RegressionModule([
             SklearnRegressionComponent(
                 RandomForestRegressor(n_estimators=250, max_depth=50, random_state=random_state),
                 latent_trait_range=DIFFICULTY_RANGE
             )
         ])
     )

     # Discrimination pipeline: smaller vocabulary and a shallower forest.
     vec_disc = TfidfVectorizer(stop_words='english', preprocessor=vectorizer_text_preprocessor, max_features=800)
     feat_eng_regression_pipeline_discrimination = FeatureEngAndRegressionPipeline(
         FeatureEngineeringModule([IRFeaturesComponent(vec_disc, concatenate_correct=True, concatenate_wrong=True)]),
         RegressionModule([
             SklearnRegressionComponent(
                 RandomForestRegressor(n_estimators=100, max_depth=25, random_state=random_state),
                 latent_trait_range=DISCRIMINATION_RANGE
             )
         ])
     )

     estimator_from_text = FeatureEngAndRegressionEstimatorFromText(
         {
             DIFFICULTY: feat_eng_regression_pipeline_difficulty,
             DISCRIMINATION: feat_eng_regression_pipeline_discrimination
         }
     )
     super().__init__(latent_traits_calibrator, estimator_from_text)
Esempio n. 2
0
# Load the train/test question tables from DATA_PATH.
df_train = pd.read_csv(os.path.join(DATA_PATH, 'q_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
df_test = df_test.drop(
    df_test.head(100).index
)  # Not to use the validation data used in 5.1 for model selection

# Read the pickled latent traits through a context manager so the file handle
# is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load; only use trusted files here.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# Output file for this experiment. It is left open on purpose because it is
# used after this section.
# NOTE(review): confirm it is closed by later code.
file = open("outputs/5_3_read.txt", 'w')

def _readability_rf_pipeline(trait_range):
    """Return a readability-features pipeline ending in a seeded random forest.

    New component instances are created on every call, so each returned
    pipeline owns its own feature component and regressor.

    :param trait_range: value passed as ``latent_trait_range`` to the
        regression component.
    """
    feature_module = FeatureEngineeringModule([ReadabilityFeaturesComponent()])
    regression_component = SklearnRegressionComponent(
        RandomForestRegressor(random_state=SEED),
        latent_trait_range=trait_range,
    )
    return FeatureEngAndRegressionPipeline(
        feature_module,
        RegressionModule([regression_component]),
    )


# One pipeline per latent trait: difficulty (b) first, then discrimination (a).
pipe_b = _readability_rf_pipeline(B_RANGE)
pipe_a = _readability_rf_pipeline(A_RANGE)
# create estimator from text from the previous pipelines
estimator_from_text = FeatureEngAndRegressionEstimatorFromText({
    DIFFICULTY:
    pipe_b,
Esempio n. 3
0
# Load the train/test question tables from DATA_PATH.
df_train = pd.read_csv(os.path.join(DATA_PATH, 'q_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
df_test = df_test.drop(
    df_test.head(100).index
)  # Not to use the validation data used in 5.1 for model selection

# Read the pickled latent traits through a context manager so the file handle
# is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load; only use trusted files here.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# Output file for this experiment. It is left open on purpose because it is
# used after this section.
# NOTE(review): confirm it is closed by later code.
file = open("outputs/5_3_ling.txt", 'w')

def _linguistic_rf_pipeline(trait_range):
    """Return a linguistic-features pipeline ending in a seeded random forest.

    New component instances are created on every call, so each returned
    pipeline owns its own feature component and regressor.

    :param trait_range: value passed as ``latent_trait_range`` to the
        regression component.
    """
    feature_module = FeatureEngineeringModule([LinguisticFeaturesComponent()])
    regression_component = SklearnRegressionComponent(
        RandomForestRegressor(random_state=SEED),
        latent_trait_range=trait_range,
    )
    return FeatureEngAndRegressionPipeline(
        feature_module,
        RegressionModule([regression_component]),
    )


# One pipeline per latent trait: difficulty (b) first, then discrimination (a).
pipe_b = _linguistic_rf_pipeline(B_RANGE)
pipe_a = _linguistic_rf_pipeline(A_RANGE)
# create estimator from text from the previous pipelines
estimator_from_text = FeatureEngAndRegressionEstimatorFromText({
    DIFFICULTY:
    pipe_b,
Esempio n. 4
0
# Grid search over the TF-IDF document-frequency cut-offs. The stop values
# (0.11 and 1.01) sit just past the last grid point, so min_df covers
# 0.00..0.10 and max_df covers 0.90..1.00, both in steps of 0.02.
for min_df in np.arange(0.00, 0.11, 0.02):
    for max_df in np.arange(0.90, 1.01, 0.02):

        # One output file per (min_df, max_df) pair; the header is written
        # without a trailing newline.
        # NOTE(review): the handle is not closed in the visible part of the
        # loop -- confirm it is closed further down.
        file = open(
            "outputs/5_3_ir_mindf_%.2f_maxdf_%.2f.txt" % (min_df, max_df), 'w')
        file.write("MIN_DF = %.2f - MAX DF = %.2f" % (min_df, max_df))

        # Difficulty pipeline: IR features from its own TF-IDF vectorizer,
        # regressed with a seeded random forest over B_RANGE.
        vec_b = TfidfVectorizer(stop_words='english',
                                preprocessor=preproc,
                                min_df=min_df,
                                max_df=max_df)
        pipe_b = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_b,
                                    concatenate_correct=True,
                                    concatenate_wrong=True)
            ]),
            RegressionModule([
                SklearnRegressionComponent(
                    RandomForestRegressor(random_state=SEED),
                    latent_trait_range=B_RANGE)
            ]))
        # pipeline discrimination
        vec_a = TfidfVectorizer(stop_words='english',
                                preprocessor=preproc,
                                min_df=min_df,
                                max_df=max_df)
        pipe_a = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_a,
Esempio n. 5
0
# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# Grid search over the TF-IDF document-frequency cut-offs. The stop values
# (0.11 and 1.01) sit just past the last grid point, so min_df covers
# 0.00..0.10 and max_df covers 0.90..1.00, both in steps of 0.02.
for min_df in np.arange(0.00, 0.11, 0.02):
    for max_df in np.arange(0.90, 1.01, 0.02):

        # One output file per (min_df, max_df) pair; the header is written
        # without a trailing newline.
        # NOTE(review): the handle is not closed in the visible part of the
        # loop -- confirm it is closed further down.
        file = open("outputs/5_1_model_selection_RF_mindf_%.2f_maxdf_%.2f.txt" % (min_df, max_df), 'w')
        file.write("MIN_DF = %.2f - MAX DF = %.2f" % (min_df, max_df))

        # Difficulty pipeline: IR (TF-IDF), linguistic and readability
        # features combined, regressed with a seeded random forest over B_RANGE.
        vec_b = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_b = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_b, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
                SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=B_RANGE)
            ])
        )
        # pipeline discrimination
        vec_a = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_a = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_a, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
Esempio n. 6
0
# Load the input tables from DATA_PATH.
df_gte = pd.read_csv(os.path.join(DATA_PATH, 'a_gte.csv'))
df_sap = pd.read_csv(os.path.join(DATA_PATH, 'a_sap.csv'))
df_train = pd.read_csv(os.path.join(DATA_PATH, 'q_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))

# Read the pickled latent traits through a context manager so the file handle
# is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load; only use trusted files here.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# Training section. The hyperparameters below are assumed to be those of the
#   best performing model, as obtained in the scripts 5_1_*
_difficulty_tfidf = TfidfVectorizer(
    stop_words='english',
    preprocessor=vectorizer_text_preprocessor,
    min_df=0.02,
    max_df=0.92,
)
# Feature side: IR (TF-IDF), linguistic and readability components, in that order.
_difficulty_features = FeatureEngineeringModule([
    IRFeaturesComponent(
        _difficulty_tfidf,
        concatenate_correct=True,
        concatenate_wrong=True,
    ),
    LinguisticFeaturesComponent(),
    ReadabilityFeaturesComponent(),
])
# Regression side: a single seeded random forest constrained to DIFFICULTY_RANGE.
_difficulty_regression = RegressionModule([
    SklearnRegressionComponent(
        RFRegressor(n_estimators=100, max_depth=20, random_state=SEED),
        latent_trait_range=DIFFICULTY_RANGE,
    )
])
pipeline_difficulty = FeatureEngAndRegressionPipeline(
    _difficulty_features,
    _difficulty_regression,
)
pipeline_discrimination = FeatureEngAndRegressionPipeline(
    FeatureEngineeringModule([
        IRFeaturesComponent(TfidfVectorizer(
            stop_words='english',