# Script excerpt: builds random-forest estimators of question difficulty and
# discrimination from readability + linguistic text features, using known
# latent traits as the calibration source.

# load data - TODO: in order to run this script you have to provide your own data
df_gte = pd.read_csv(os.path.join(DATA_PATH, 'a_gte.csv'))
df_train = pd.read_csv(os.path.join(DATA_PATH, 'q_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
# Not to use the validation data used in 5.1 for model selection
df_test = df_test.drop(df_test.head(100).index)

# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# a 'known_latent_traits.p' that comes from a trusted source.
# The context manager closes the handle that a bare pickle.load(open(...)) leaks.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# NOTE(review): no matching close() is visible in this excerpt - confirm the
# report file is closed (or wrapped in `with`) further down the script.
file = open("outputs/5_3_read_ling.txt", 'w')

# pipeline difficulty
pipe_b = FeatureEngAndRegressionPipeline(
    FeatureEngineeringModule([ReadabilityFeaturesComponent(), LinguisticFeaturesComponent()]),
    RegressionModule([
        SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=B_RANGE),
    ]),
)

# pipeline discrimination
pipe_a = FeatureEngAndRegressionPipeline(
    FeatureEngineeringModule([ReadabilityFeaturesComponent(), LinguisticFeaturesComponent()]),
    RegressionModule([
        SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=A_RANGE),
    ]),
)

# create estimator from text from the previous pipelines
estimator_from_text = FeatureEngAndRegressionEstimatorFromText({DIFFICULTY: pipe_b, DISCRIMINATION: pipe_a})
model = Text2PropsModel(latent_traits_calibrator, estimator_from_text)
# presumably no response data is needed because the calibrator wraps
# already-known parameters - verify against KnownParametersCalibrator
model.calibrate_latent_traits(None)

# define parameters for randomized CV
# NOTE(review): the statement below is cut off in the visible source, so it is
# kept verbatim as a comment instead of being completed by guesswork.
# dict_params = { DIFFICULTY: [{'regressor__n_estimators': randint(20, 200), 'regressor__max_depth': randint(2, 50)}],
# Script excerpt: builds random-forest estimators of question difficulty and
# discrimination from linguistic text features only, using known latent
# traits as the calibration source.

# load the question data sets
df_train = pd.read_csv(os.path.join(DATA_PATH, 'q_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
# Not to use the validation data used in 5.1 for model selection
df_test = df_test.drop(df_test.head(100).index)

# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# a 'known_latent_traits.p' that comes from a trusted source.
# The context manager closes the handle that a bare pickle.load(open(...)) leaks.
with open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb") as latent_traits_file:
    dict_latent_traits = pickle.load(latent_traits_file)

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# NOTE(review): no matching close() is visible in this excerpt - confirm the
# report file is closed (or wrapped in `with`) further down the script.
file = open("outputs/5_3_ling.txt", 'w')

# pipeline difficulty
pipe_b = FeatureEngAndRegressionPipeline(
    FeatureEngineeringModule([LinguisticFeaturesComponent()]),
    RegressionModule([
        SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=B_RANGE),
    ]),
)

# pipeline discrimination
pipe_a = FeatureEngAndRegressionPipeline(
    FeatureEngineeringModule([LinguisticFeaturesComponent()]),
    RegressionModule([
        SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=A_RANGE),
    ]),
)

# create estimator from text from the previous pipelines
# NOTE(review): the statement below is cut off in the visible source, so it is
# kept verbatim as a comment instead of being completed by guesswork.
# estimator_from_text = FeatureEngAndRegressionEstimatorFromText({ DIFFICULTY: pipe_b,
# Script excerpt: grid over TfidfVectorizer document-frequency cut-offs
# (min_df, max_df); for every combination a separate report file is opened and
# difficulty / discrimination random-forest pipelines are assembled from
# IR (TF-IDF), linguistic and readability features.

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

# NOTE(review): np.arange with a non-integer step can be off by rounding error
# at the endpoints (see the NumPy docs) - confirm max_df never ends up above 1.0.
for min_df in np.arange(0.00, 0.11, 0.02):
    for max_df in np.arange(0.90, 1.01, 0.02):
        # One report per (min_df, max_df) pair.
        # NOTE(review): no matching close() is visible in this excerpt - confirm
        # each per-iteration handle is closed before the next one is opened.
        file = open("outputs/5_1_model_selection_RF_mindf_%.2f_maxdf_%.2f.txt" % (min_df, max_df), 'w')
        file.write("MIN_DF = %.2f - MAX DF = %.2f" % (min_df, max_df))

        # pipeline difficulty
        vec_b = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_b = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_b, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
                SklearnRegressionComponent(RandomForestRegressor(random_state=SEED), latent_trait_range=B_RANGE)
            ])
        )

        # pipeline discrimination
        # A second vectorizer with the same settings is created, so the two
        # pipelines do not share one vectorizer instance.
        vec_a = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        # NOTE(review): the statement below is cut off in the visible source, so
        # it is kept verbatim as a comment instead of being completed by guesswork.
        # pipe_a = FeatureEngAndRegressionPipeline( FeatureEngineeringModule([ IRFeaturesComponent(vec_a, concatenate_correct=True, concatenate_wrong=True), LinguisticFeaturesComponent(), ReadabilityFeaturesComponent(), ]), RegressionModule([