# Greedy forward feature selection: on each pass, try adding every remaining
# feature to the chosen set and record validation/test accuracy.
max_val_score = 0

while len(chosen_features) < 1:
    best_val = 0.
    best_feature = None

    for feature in feature_list:
        if feature in chosen_features:
            continue

        sent_encoder.feature_names = chosen_features + [feature]
        model = SimplePQModel(sent_encoder=sent_encoder,
                              clf_type=AdaBoostClassifier,
                              clf_args={'n_estimators': 100,
                                        'base_estimator': DecisionTreeClassifier(max_depth=1,
                                                                                 class_weight="balanced")})
        model.fit(train_articles)

        val_accuracy = E.evaluate(model=model, articles=val_articles, verbose=0)
        test_accuracy = E.evaluate(model=model, articles=test_articles, verbose=0)

        res_str = "{}\t{:.1f}\t{:.1f}".format(', '.join(chosen_features + [feature]),
                                              100 * val_accuracy,
                                              100 * test_accuracy)
    # ... (remaining entries: emoji and other special characters) ...
]

# Grid search over character sets and quantile counts for the character-position
# encoder, scoring each configuration with a balanced logistic-regression classifier.
for char_sets in char_sets_options:
    for quantiles in [1, 2, 5, 10, 15]:
        sent_encoder = TrueCharPositionEncoder(char_sets=char_sets, quantiles=quantiles)
        model = SimplePQModel(sent_encoder=sent_encoder,
                              clf_type=LogisticRegression,
                              clf_args={'class_weight': 'balanced', 'max_iter': 1000, 'solver': 'lbfgs'})
        # model = SimpleNNModel(sent_encoder=sent_encoder, layer_sizes=layer_sizes, layer_dropouts=layer_dropouts)
        model.fit(train_articles)

        # coefs = model.model.coef_[0]
        # coef_str = "\t".join([str(round(v, 3)) for v in coefs])
        # val_accuracy = E.evaluate(model=model, articles=val_articles, verbose=0)
        test_accuracy = E.evaluate(model=model, articles=test_articles, verbose=0)

        res_str = "{}\t{}\t{}\t{:.2f}".format(sent_encoder.name, char_sets, quantiles, 100 * test_accuracy)
        print(res_str)
        results_file.write(res_str + "\n")
        results_file.flush()

'''
for quantiles in [2, 5, 10, 20]:
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    from models.sentence_encoders import HandcraftedEncoder

    # sent_encoder = HandcraftedEncoder()
    sent_encoder = HandcraftedEncoder(precomputed_embeddings=settings.PRECOMPUTED_HANDCRAFTED_EMBEDDINGS_FNAME)

    feature_list = ["Quote_count", "Sent_position", "R_difficult",
                    "POS_PRP", "POS_VB", "A_concreteness"]  # HandcraftedEncoder._all_features + "best"
    # feature = "best"

    # Train one AdaBoost (decision-stump) model per handcrafted feature and
    # collect the samples it generates on the test articles.
    for feature in feature_list:
        print(feature)
        sent_encoder.set_features(feature)
        model = SimplePQModel(sent_encoder=sent_encoder,
                              clf_type=AdaBoostClassifier,
                              clf_args={'n_estimators': 100,
                                        'base_estimator': DecisionTreeClassifier(max_depth=1,
                                                                                 class_weight="balanced")})
        print("training {}...".format(feature))
        model.fit(train_articles)
        print("generating...")
        combined_samples[feature] = generate_samples(model, test_articles)

elif model_name == "ngrams":
    from models.sentence_encoders import NGramEncoder

    for mode, n in [('char', 2), ('word', 1)]:
        print(mode, n)
        sent_encoder = NGramEncoder(mode=mode, n=n, store_results=False, vocab_size=1000)
        print("preparing encoder...")