Example #1
from functools import partial
import pickle

import numpy as np  # used by the commented-out loguniform alternative below
from hyperopt import Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope

# SIZE and fit_and_log are assumed to be defined elsewhere in this project.
def main(args, max_evals):
    # Hyperparameter search space
    model_path = f"./checkpoint/{args.experiment_id}_ckpt.pth"
    trials_path = f"./results/{args.experiment_id}_trials.p"
    display_step = args.iterations//100 if args.iterations > 1000 else 10  # log progress roughly every 1% of iterations
    space = {#------------------------------------- Architecture -------------------------------------#
             'experiment_id': hp.choice(label='experiment_id', options=[args.experiment_id]),
             'input_size': hp.choice(label='input_size', options=[SIZE]),
             'n_classes': hp.choice(label='n_classes', options=[4_000]),
             #------------------------------ Optimization Regularization -----------------------------#
             'iterations': hp.choice(label='iterations', options=[args.iterations]),
             'display_step': scope.int(hp.choice(label='display_step', options=[display_step])),
             'batch_size': scope.int(hp.choice(label='batch_size', options=[512])),
             #'initial_lr': hp.loguniform(label='lr', low=np.log(5e-3), high=np.log(0.1)),
             'initial_lr': scope.float(hp.choice(label='initial_lr', options=[0.1])),
             'lr_decay': scope.float(hp.choice(label='lr_decay', options=[0.5])),
             'adjust_lr_step': hp.choice(label='adjust_lr_step', options=[300_000//3]),
             'weight_decay': hp.choice(label='weight_decay', options=[5e-4]),
             'with_center_loss': hp.choice(label='with_center_loss', options=[bool(args.with_center_loss)]),
             'initial_clr': hp.choice(label='initial_clr', options=[0.01, 0.05, 0.1, 0.5]),
             'alpha': hp.choice(label='alpha', options=[0.1, 0.01]),
             #'display_step': scope.int(hp.choice(label='eval_epochs', options=[3_000])),
             #--------------------------------------   Others   --------------------------------------#
             'path': hp.choice(label='path', options=[model_path]),
             'trials_path': hp.choice(label='trials_path', options=[trials_path]),
             'random_seed': scope.int(hp.quniform('random_seed', 1, 10, 1))}

    # Hyperparameter search
    trials = Trials()
    fmin_objective = partial(fit_and_log, trials=trials, verbose=True)
    best_model = fmin(fmin_objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

    # Save output
    with open(trials_path, "wb") as f:
        pickle.dump(trials, f)
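
Note that for every hp.choice parameter, the dictionary returned by fmin contains the index of the winning option rather than the value itself. A minimal sketch of mapping the result back to concrete values, reusing space and best_model from above:

from hyperopt import space_eval

# best_model holds option *indices* for each hp.choice label;
# space_eval resolves them back into the actual parameter values.
best_hyperparams = space_eval(space, best_model)
print(best_hyperparams)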
Example #2
	def train(self,training_file,rare_thresh=100,as_text=True,tune_mode=None,size=None,clf_params=None,chosen_clf=None):
		"""
		Train the EnsembleSentencer. Note that the underlying estimators are assumed to be pretrained already.

		:param training_file: File in DISRPT shared task .conll format
		:param rare_thresh: Rank of rarest word to include (rarer items are replaced with POS)
		:param as_text: Boolean, whether the input is a string, rather than a file name to read
		:param tune_mode: None for no tuning, "hyperopt" for TPE search, "paramwise" to tune each hyperparameter separately, or "full" for a complete grid search
		:param size: Sample size to tune on
		:param clf_params: Dictionary of classifier parameters (defaults are used if None)
		:param chosen_clf: Classifier instance to train (DEFAULTCLF if None)
		:return:
		"""

		if tune_mode is not None and size is None:
			size = 5000
			sys.stderr.write("o No sample size set - setting size to 5000\n")
		if clf_params is None:
			# Default classifier parameters
			clf_params = {"n_estimators":100,"min_samples_leaf":3,"random_state":42}
		if chosen_clf is None:
			chosen_clf = DEFAULTCLF

		data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words = self.read_data(training_file,rare_thresh=rare_thresh,as_text=as_text)

		sys.stderr.write("o Learning...\n")

		if tune_mode == "hyperopt":
			from hyperopt import hp
			from hyperopt.pyll.base import scope
			dev_file = training_file.replace("_train","_dev")
			_, val_x, val_y, _, _, _, _, _, _ = self.read_data(dev_file,rare_thresh=rare_thresh,as_text=False,no_cache=True)
			space = {
				'average': hp.choice('average',["micro","weighted"]),
				'n_estimators': scope.int(hp.quniform('n_estimators', 50, 150, 10)),
				'max_depth': scope.int(hp.quniform('max_depth', 3, 35, 1)),
				'eta': scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
				'gamma': scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
				'colsample_bytree': hp.choice('colsample_bytree', [0.4,0.5,0.6,0.7,0.8,1.0]),
				'subsample': hp.choice('subsample', [0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
				'clf': hp.choice('clf', ["xgb"])
			}
			best_clf, best_params = hyper_optimize(data_x,data_y,val_x=val_x,val_y=val_y,space=space)
			return best_clf, best_params
		elif tune_mode is not None:
			best_params = {}
			best_params_by_clf = defaultdict(dict)
			# Tune individual params separately for speed, or do complete grid search if building final model
			params_list = [{"n_estimators":[75,100,125]},
						   {'max_depth': [7,10,15,None]},
						   {"min_samples_split": [2, 5, 10]},
						   {"min_samples_leaf":[1,2,3]}]
			if tune_mode == "full":
				# Flatten dictionary if doing full CV
				params_list = [{k: v for d in params_list for k, v in d.items()}]
			best_score = -10000
			for clf in [RandomForestClassifier(),ExtraTreesClassifier(),GradientBoostingClassifier()]:
				for params in params_list:
					base_params = copy.deepcopy(clf_params)  # Copy default params
					if clf.__class__.__name__ != "GradientBoostingClassifier":
						base_params.update({"n_jobs":4, "oob_score":True, "bootstrap":True})
					for p in params:
						if p in base_params:  # Ensure base_params don't conflict with grid search params
							base_params.pop(p)
					clf.set_params(**base_params)
					grid = GridSearchCV(clf,params,cv=3,n_jobs=3,error_score="raise",refit=False,scoring="f1")
					grid.fit(data_x,data_y)
					if tune_mode == "full":
						if grid.best_score_ > best_score:
							best_score = grid.best_score_
							best_clf = clf
							for param in params:
								best_params[param] = grid.best_params_[param]
					else:
						if grid.best_score_ > best_score:
							best_score = grid.best_score_  # track the running best so the strongest classifier is kept, not the last one
							best_clf = clf
						for param in params:
							best_params_by_clf[clf.__class__.__name__][param] = grid.best_params_[param]
			if tune_mode == "paramwise":
				best_params = best_params_by_clf[best_clf.__class__.__name__]
			else:
				best_params["best_score"] = best_score

			clf_name = best_clf.__class__.__name__
			with io.open(segmenters_dir + os.sep + "params" + os.sep + "EnsembleConnective"+self.auto+"_best_params.tab",'a',encoding="utf8") as bp:
				corpus = os.path.basename(training_file).split("_")[0]
				for k, v in best_params.items():
					bp.write("\t".join([corpus, clf_name, k, str(v)])+"\n")
			self.clf = best_clf
			return best_clf, best_params
		else:
			clf = chosen_clf
			clf.set_params(**clf_params)
			if clf.__class__.__name__ != "GradientBoostingClassifier":
				clf.set_params(**{"n_jobs":3,"oob_score":True,"bootstrap":True})
			clf.set_params(**{"random_state":42})
			clf.fit(data_x,data_y)
			self.clf = clf

		feature_names = cat_labels + num_labels

		zipped = zip(feature_names, clf.feature_importances_)
		sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
		sys.stderr.write("o Feature importances:\n\n")
		for name, importance in sorted_zip:
			sys.stderr.write(name + "=" + str(importance) + "\n")

		#sys.stderr.write("\no OOB score: " + str(clf.oob_score_)+"\n")

		sys.stderr.write("\no Serializing model...\n")

		joblib.dump((clf, num_labels, cat_labels, multicol_dict, top_n_words, firsts, lasts), self.model, compress=3)
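
The hyper_optimize helper called in the "hyperopt" branch above is not part of this excerpt. As a rough sketch of what such a function might look like, assuming the 'clf' choice is always "xgb" and that a validation set is supplied (the scoring and structure here are illustrative, not the project's actual implementation):

from hyperopt import Trials, fmin, space_eval, tpe

def hyper_optimize(train_x, train_y, val_x=None, val_y=None, space=None, max_evals=20):
    from sklearn.metrics import f1_score
    from xgboost import XGBClassifier

    def build(params):
        # Keys like 'clf' and 'average' steer the search but are not XGBoost parameters
        model_params = {k: v for k, v in params.items() if k not in ("clf", "average")}
        return XGBClassifier(random_state=42, **model_params)

    def objective(params):
        clf = build(params)
        clf.fit(train_x, train_y)
        score = f1_score(val_y, clf.predict(val_x), average=params.get("average", "micro"))
        return 1.0 - score  # fmin minimizes, so invert the score

    trials = Trials()
    best = fmin(objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    best_params = space_eval(space, best)  # resolve hp.choice indices to concrete values
    return build(best_params), best_params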
Example #3
from functools import partial

import numpy as np
from hyperopt import Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope

# MODEL and optimize are assumed to be defined at module level.
def BayesSearch(X, y):
    """Search the hyperparameter space for the globally selected MODEL."""
    global MODEL
    if MODEL == "log_reg":
        param_space = {
            "solver": hp.choice("solver", ["newton-cg", "saga", "lbfgs"]),
            "max_iter": scope.int(hp.uniform("max_iter", 100, 1500)),
            "C": scope.float(hp.lognormal("C", 0.0001, 3)),
        }
    elif MODEL == "sgd":
        param_space = {
            "loss": hp.choice("loss", ["log", "modified_huber"]),
            "penalty": hp.choice("penalty", ["l2", "l1", "elasticnet"]),
            "alpha": scope.float(hp.uniform("alpha", 0.001, 1)),
            "max_iter": scope.int(hp.uniform("max_iter", 100, 1500)),
        }
    elif MODEL == "rftree":
        param_space = {
            "max_depth": scope.int(hp.quniform("max_depth", 6, 15, 1)),
            "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1000, 1)),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            "max_features": hp.choice("max_features", ["auto", "log2"]),
            "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 6, 100, 1)),
            "min_samples_split": scope.int(hp.quniform("min_samples_split", 6, 100, 1)),
            #'bootstrap': hp.choice('bootstrap', [True, False]),
        }
    elif MODEL == "extree":
        param_space = {
            "max_depth": scope.int(hp.quniform("max_depth", 5, 25, 1)),
            "n_estimators": scope.int(hp.quniform("n_estimators", 100, 2000, 1)),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            "max_features": hp.choice("max_features", ["auto", "log2"]),
            "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 3, 100, 1)),
            "min_samples_split": scope.int(hp.quniform("min_samples_split", 3, 100, 1)),
            #'bootstrap': hp.choice('bootstrap', [True, False]),
        }
    elif MODEL == "lgbm":
        param_space = {
            "num_leaves": scope.int(hp.uniform("num_leaves", 10, 1000)),
            "max_depth": scope.int(hp.uniform("max_depth", 6, 100)),
            "cat_smooth": scope.int(hp.uniform("cat_smooth", 1, 100)),
            "subsample": scope.float(hp.uniform("subsample", 0.4, 1)),
            "colsample_bytree": scope.float(hp.uniform("colsample_bytree", 0.4, 1)),
            # "subsample_freq":scope.int(hp.uniform("subsample_freq", 1, 20)),
            "min_child_samples": scope.int(hp.uniform("min_child_samples", 2, 100)),
            "min_split_gain": scope.float(
                hp.loguniform("min_split_gain", np.log(0.001), np.log(10))
            ),
            "reg_alpha": scope.float(
                hp.loguniform("reg_alpha", np.log(0.001), np.log(10))
            ),
            "reg_lambda": scope.float(
                hp.loguniform("reg_lambda", np.log(0.001), np.log(10))
            ),
        }
    elif MODEL == "xgbm":
        param_space = {
            "max_depth": scope.int(hp.quniform("max_depth", 6, 10, 1)),
            "subsample": scope.float(hp.uniform("subsample", 0.4, 1)),
            "colsample_bytree": scope.float(hp.uniform("colsample_bytree", 0.4, 1)),
            "gamma": scope.int(hp.quniform("gamma", 0, 20, 1)),
            "reg_alpha": scope.float(hp.uniform("reg_alpha", 0.01, 1)),
            "reg_lambda": scope.float(hp.uniform("reg_lambda", 0.01, 1)),
            # "scale_pos_weight":scope.float(hp.uniform("scale_pos_weight", 0.001, 1)),
        }
    else:
        raise ValueError("Unsupported MODEL: " + str(MODEL))

    # Run the hyperparameter search
    trials = Trials()
    optimization_function = partial(optimize, X=X, y=y)
    result = fmin(
        fn=optimization_function,
        space=param_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=trials,
        verbose=1,
    )

    print("Best Result is:", "_" * 10, result)
    return result, trials
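
The optimize callable that BayesSearch binds with partial is likewise external to this snippet. One plausible shape for it, assuming MODEL == "rftree" and cross-validated accuracy as the score (both are assumptions):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def optimize(params, X, y):
    # Hypothetical objective: fmin minimizes, so return the negated mean CV accuracy
    model = RandomForestClassifier(**params)
    scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    return -1.0 * np.mean(scores)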
Example #4
	def train(self, train_file, lexicon_file=None, freq_file=None, test_prop=0.1, output_importances=False, dump_model=False,
			  cross_val_test=False, output_errors=False, ablations=None, dump_transformed_data=False, do_shuffle=True, conf=None):
		"""

		:param train_file: File with segmentations to train on in one of the two formats described in make_prev_next()
		:param lexicon_file: Tab delimited lexicon file with full forms in first column and POS tag in second column (multiple rows per form possible)
		:param freq_file: Tab delimited file with segment forms and their frequencies as integers in two columns
		:param conf: configuration file for training (by default: <MODELNAME>.conf)
		:param test_prop: (0.0 -- 0.99) Proportion of shuffled data to test on
		:param output_importances: Whether to print feature importances (only if test proportion > 0.0)
		:param dump_model: Whether to dump trained model to disk via joblib
		:param cross_val_test: Whether to perform cross-validation for hyperparameter optimization
		:param output_errors: Whether to output prediction errors to a file 'errs.txt'
		:param ablations: Comma separated string of feature names to ablate, e.g. "freq_ratio,prev_grp_pos,next_grp_pos"
		:param dump_transformed_data: If true, transform data to a pandas dataframe and write to disk, then quit
				(useful to train other approaches on the same features, e.g. a DNN classifier)
		:param do_shuffle: Whether training data is shuffled after context extraction but before test partition is created
				(this has no effect if training on whole training corpus)
		:return: None
		"""
		import timing

		self.read_conf_file(file_name=conf)
		pos_lookup = read_lex(self.short_pos,lexicon_file)
		self.pos_lookup = pos_lookup
		conf_file_parser = self.conf_file_parser
		letter_config = LetterConfig(self.letters, self.conf["vowels"], self.pos_lookup)

		np.random.seed(42)

		if lexicon_file is None:
			print("i WARN: No lexicon file provided, learning purely from examples")

		seg_table = io.open(train_file,encoding="utf8").read()
		seg_table = seg_table.replace("\r","").strip()
		for c in self.conf["diacritics"]:  # TODO: configurable diacritic removal
			pass
			#seg_table = seg_table.replace(c,"")
		seg_table = seg_table.split("\n")

		sys.stderr.write("o Encoding Training data\n")

		# Validate training data
		non_tab_lines = 0
		non_tab_row = 0
		for r, line in enumerate(seg_table):
			if line.count("\t") < 1:
				non_tab_lines += 1
				non_tab_row = r
		if non_tab_lines > 0:
			sys.stderr.write("FATAL: found " + str(non_tab_lines) + " rows in training data not containing tab\n")
			sys.stderr.write("       Last occurrence at line: " + str(non_tab_row) + "\n")
			sys.exit()

		# Make into four cols: prev \t next \t current \t segmented (unless already receiving such a table, for shuffled datasets)
		if seg_table[0].count("\t") == 1:
			seg_table = make_prev_next(seg_table)

		# Ensure OOV symbol is in data
		seg_table = ["_\t_\t_\t_"] + seg_table

		data_y = []
		words = []
		all_encoded_groups = []

		encoding_cache = {}
		non_ident_segs = 0

		shuffle_mapping = list(range(len(seg_table)))
		zipped = list(zip(seg_table, shuffle_mapping))

		# Shuffle table to sample across entire dataset if desired
		if do_shuffle:
			random.Random(24).shuffle(zipped)

		seg_table, shuffle_mapping = zip(*zipped)

		headers = bg2array("_________",prev_group="_",next_group="_",print_headers=True,is_test=1,grp_id=1,config=letter_config)

		word_idx = -1
		bug_rows = []

		freqs = defaultdict(float)
		total_segs = 0.0
		flines = io.open(freq_file,encoding="utf8").read().replace("\r","").split("\n") if freq_file is not None else []
		for l in flines:
			if l.count("\t")==1:
				w, f = l.split("\t")
				freqs[w] += float(f)
				total_segs += float(f)

		for u in freqs:
			freqs[u] = freqs[u]/total_segs

		# Don't use freqs if they're empty
		if len(freqs) == 0:
			sys.stderr.write("o No segment frequencies provided, adding 'freq_ratio' to ablated features\n")
			if ablations is None:
				ablations = "freq_ratio"
			else:
				if "freq_ratio" not in ablations:
					ablations += ",freq_ratio"

		step = int(1/test_prop) if test_prop > 0 else 0  # e.g. test_prop=0.1 -> step=10
		test_indices = list(range(len(seg_table)))[0::step] if step > 0 else []  # every step-th row is held out
		test_rows = []

		for row_idx, row in enumerate(seg_table):
			is_test = 1 if row_idx in test_indices else 0

			prev_group, next_group, bound_group, segmentation = row.split("\t")
			if bound_group != "|":
				if len(bound_group) != len(segmentation.replace("|","")):  # Ignore segmentations that also normalize
					non_ident_segs += 1
					bug_rows.append((row_idx,bound_group,segmentation.replace("|","")))
					continue

			###
			if dump_transformed_data:
				if is_test:
					test_rows.append(bound_group + "\t" + segmentation)
			###

			word_idx += 1
			words.append(bound_group)
			group_type = "_".join([x for x in [prev_group, next_group, bound_group] if x != ""])
			if group_type in encoding_cache:  # No need to encode, an identical featured group has already been seen
				encoded_group = encoding_cache[group_type]
				for c in encoded_group:
					c[headers.index("is_test")] = is_test  # Make sure that this group's test index is correctly assigned
			else:
				encoded_group = bg2array(bound_group,prev_group=prev_group,next_group=next_group,is_test=is_test,grp_id=word_idx,config=letter_config,train=True,freqs=freqs)
				encoding_cache[group_type] = encoded_group
			all_encoded_groups += encoded_group
			data_y += segs2array(segmentation)

		sys.stderr.write("o Finished encoding " + str(len(data_y)) + " chars (" + str(len(seg_table)) + " groups, " + str(len(encoding_cache)) + " group types)\n")

		if non_ident_segs > 0:
			with io.open("bug_rows.txt",'w',encoding="utf8") as f:
				f.write(("\n".join([str(r) + ": " + g + "<>" + s for r, g, s in sorted([[shuffle_mapping[x], g, s] for x, g, s in bug_rows])]) + "\n"))

			sys.stderr.write("i WARN: found " + str(non_ident_segs) + " rows in training data where left column characters not identical to right column characters\n")
			sys.stderr.write("        Row numbers dumped to: bug_rows.txt\n")
			sys.stderr.write("        " + str(non_ident_segs) + " rows were ignored in training\n\n")

		data_y = np.array(data_y)


		# NOTE: cat_labels and num_labels (the categorical and numerical feature name lists)
		# are defined in the full class; their construction is not part of this excerpt.
		# Remove features switched off in .conf file
		for label in self.conf["unused"]:
			if label in cat_labels:
				cat_labels.remove(label)
			if label in num_labels:
				num_labels.remove(label)

		# Handle temporary ablations if specified in option -a
		if ablations is not None:
			sys.stderr.write("o Applying ablations\n")
			if len(ablations) > 0 and ablations != "none":
				abl_feats = ablations.split(",")
				sys.stderr.write("o Ablating features:\n")
				for feat in abl_feats:
					found = False
					if feat in cat_labels:
						cat_labels.remove(feat)
						found = True
					elif feat in num_labels:
						num_labels.remove(feat)
						found = True
					if found:
						sys.stderr.write("\t"+feat+"\n")
					else:
						sys.stderr.write("\tERR: can't find ablation feature " + feat + "\n")
						sys.exit()

		sys.stderr.write("o Creating dataframe\n")
		data_x = pd.DataFrame(all_encoded_groups, columns=headers)

		###
		if dump_transformed_data:
			data_x["resp"] = data_y
			import csv
			to_remove = ["is_test","grp_id"]  # Columns to remove from transformed data dump
			out_cols = [col for col in headers if col not in to_remove] + ["resp"]  # Add the response column as 'resp'
			data_x.iloc[data_x.index[data_x["is_test"] == 0]].to_csv("rftokenizer_train_featurized.tab",sep="\t",quotechar="",quoting=csv.QUOTE_NONE,encoding="utf8",index=False,columns=out_cols)
			data_x.iloc[data_x.index[data_x["is_test"] == 1]].to_csv("rftokenizer_test_featurized.tab",sep="\t",quotechar="",quoting=csv.QUOTE_NONE,encoding="utf8",index=False,columns=out_cols)
			# Dump raw test rows to compare gold solution
			with io.open("rftokenizer_test_gold.tab","w",encoding="utf8") as gold:
				gold.write("\n".join(test_rows) + "\n")
			sys.stderr.write("o Wrote featurized train/test set and gold test to rftokenizer_*.tab\n")
			sys.exit()
		###

		data_x_enc, multicol_dict = multicol_fit_transform(data_x, pd.Index(cat_labels))

		if test_prop > 0:
			sys.stderr.write("o Generating train/test split with test proportion "+str(test_prop)+"\n")

		data_x_enc["boundary"] = data_y
		strat_train_set = data_x_enc.iloc[data_x_enc.index[data_x_enc["is_test"] == 0]]
		strat_test_set = data_x_enc.iloc[data_x_enc.index[data_x_enc["is_test"] == 1]]

		sys.stderr.write("o Transforming data to numerical array\n")
		train_x = strat_train_set[cat_labels+num_labels].values

		train_y = strat_train_set["boundary"]
		train_y_bin = np.where(strat_train_set['boundary'] == 0, 0, 1)

		if test_prop > 0:
			test_x = strat_test_set[cat_labels+num_labels].values
			test_y_bin = np.where(strat_test_set['boundary'] == 0, 0, 1)
			bound_grp_idx = np.array(strat_test_set['grp_id'])

			from sklearn.dummy import DummyClassifier
			d = DummyClassifier(strategy="most_frequent")
			d.fit(train_x,train_y_bin)
			pred = d.predict(test_x)
			print("o Majority baseline:")
			print("\t" + str(accuracy_score(test_y_bin, pred)))

		# Classifier used in 2018 paper:
		#clf = ExtraTreesClassifier(n_estimators=250, max_features=None, n_jobs=3, random_state=42)

		# Use xgboost for slightly better accuracy than paper
		from xgboost import XGBClassifier

		clf = XGBClassifier(n_estimators=230,n_jobs=3,random_state=42,max_depth=17,subsample=1.0,colsample_bytree=0.6,eta=.07,gamma=.09)

		if cross_val_test:
			# Modify code to tune hyperparameters

			from hyperopt import hp
			from hyperopt.pyll import scope
			space = {
				'n_estimators': scope.int(hp.quniform('n_estimators', 100, 250, 10)),
				'max_depth': scope.int(hp.quniform('max_depth', 8, 35, 1)),
				'eta': scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
				'gamma': scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
				'colsample_bytree': hp.choice('colsample_bytree', [0.6,0.7,0.8,1.0]),
				'subsample': hp.choice('subsample', [0.6,0.7,0.8,0.9,1.0]),
				'clf': hp.choice('clf', ["xgb"])
			}
			if test_prop > 0:
				best_clf, best_params = hyper_optimize(train_x,train_y_bin,val_x=test_x,val_y=test_y_bin,space=space,max_evals=20)
			else:
				best_clf, best_params = hyper_optimize(train_x,train_y_bin,val_x=None,val_y=None,space=space,max_evals=100)
			clf = best_clf

			print("\nBest parameters:\n" + 30 * "=")
			print(best_params)
			sys.exit()

		sys.stderr.write("o Learning...\n")
		clf.fit(train_x, train_y_bin)

		if test_prop > 0:
			pred = clf.predict(test_x)
			j = -1
			for i, row in strat_test_set.iterrows():
				j += 1
				if row["idx"] + 1 == row["len_bound_group"]:  # no boundary is possible after a group's final character
					pred[j] = 0

			print("o Binary clf accuracy:")
			print("\t" + str(accuracy_score(test_y_bin, pred)))

			group_results = defaultdict(lambda: 1)  # each group starts correct (1); zeroed on any mispredicted char
			for i in range(len(pred)):
				grp = bound_grp_idx[i]
				if test_y_bin[i] != pred[i]:
					group_results[grp] = 0

			correct = 0
			total = 0
			for grp in set(bound_grp_idx):
				if group_results[grp] == 1:
					correct += 1
				total += 1
			print("o Perfect bound group accuracy:")
			print("\t" + str(float(correct)/total))

			errs = defaultdict(int)
			for i, word in enumerate(words):
				if i in group_results:
					if group_results[i] == 0:
						errs[word] += 1

			if output_errors:
				print("o Writing prediction errors to errs.txt")
				with io.open("errs.txt",'w',encoding="utf8") as f:
					for err in errs:
						f.write(err + "\t" + str(errs[err])+"\n")
		else:
			print("o Test proportion is 0%, skipping evaluation")

		if output_importances:
			feature_names = cat_labels + num_labels

			zipped = zip(feature_names, clf.feature_importances_)
			sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
			print("o Feature importances:\n")
			for name, importance in sorted_zip:
				print(name, "=", importance)


		if dump_model:
			plain_dict_pos_lookup = {}
			plain_dict_pos_lookup.update(pos_lookup)
			joblib.dump((clf, num_labels, cat_labels, multicol_dict, plain_dict_pos_lookup, freqs, conf_file_parser), self.lang + ".sm" + str(sys.version_info[0]), compress=3)
			print("o Dumped trained model to " + self.lang + ".sm" + str(sys.version_info[0]))
Example #5
    def train(self,
              training_file,
              rare_thresh=200,
              clf_params=None,
              chosen_feats=None,
              tune_mode=None,
              size=None,
              as_text=True,
              multitrain=False,
              chosen_clf=DEFAULTCLF):
        """
		:param training_file:
		:param rare_thresh:
		:param clf_params:
		:param chosen_feats: List of feature names to force a subset of selected features to be used
		:param tune_mode: None for no grid search, "paramwise" to tune each hyperparameter separately, or "full" for complete grid (best but slowest)
		:param size: Sample size to optimize variable importance with
		:return:
		"""

        if tune_mode is not None and size is None:
            size = 5000
            sys.stderr.write("o No sample size set - setting size to 5000\n")
        if clf_params is None:
            # Default classifier parameters
            clf_params = {
                "n_estimators": 150,
                "min_samples_leaf": 3,
                "random_state": 42
            }
            if DEFAULTCLF.__class__.__name__ not in [
                    "GradientBoostingClassifier", "CatBoostClassifier",
                    "XGBClassifier"
            ]:
                clf_params.update({
                    "n_jobs": 4,
                    "oob_score": True,
                    "bootstrap": True
                })

        data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words = self.read_data(
            training_file,
            size,
            as_text=as_text,
            rare_thresh=rare_thresh,
            chosen_feats=chosen_feats)
        sys.stderr.write("o Learning...\n")

        if tune_mode is not None:
            # Shuffle the data, train on the first |size| samples, and hold out up to |size| more for validation
            data_x = data_encoded[cat_labels + num_labels + ["label"]].sample(
                frac=1, random_state=42)
            data_y = np.where(data_x['label'] == "_", 0, 1)
            data_x = data_x[cat_labels + num_labels]
            if len(data_y) > 2 * size:
                val_x = data_x[size:2 * size]
                val_y = data_y[size:2 * size]
            else:
                val_x = data_x[size:]
                val_y = data_y[size:]
            data_x = data_x[:size]
            data_y = data_y[:size]

        if tune_mode == "importances":
            sys.stderr.write(
                "o Measuring correlation of categorical variables\n")
            theil_implications = report_theils_u(val_x, cat_labels)
            for (var1, var2) in theil_implications:
                if var1 in cat_labels and var2 in cat_labels and var2 != "word":
                    drop_var = var2
                    u = theil_implications[(var1, var2)]
                    sys.stderr.write("o Removed feature " + drop_var +
                                     " due to Theil's U " + str(u)[:6] +
                                     " of " + var1 + "->" + var2 + "\n")
                    cat_labels.remove(drop_var)
            sys.stderr.write(
                "o Measuring correlation of numerical variables\n")
            cor_mat = report_correlations(val_x[num_labels], thresh=0.95)
            for (var1, var2) in cor_mat:
                if var1 in num_labels and var2 in num_labels:
                    drop_var = var2  # if imp[var1] > imp[var2] else var1
                    if drop_var == "word":
                        continue
                    corr_level = cor_mat[(var1, var2)]
                    sys.stderr.write("o Removed feature " + drop_var +
                                     " due to correlation " + str(corr_level) +
                                     " of " + var1 + ":" + var2 + "\n")
                    num_labels.remove(drop_var)

            return cat_labels, num_labels

        if tune_mode in ["paramwise", "full"]:  # Grid Search
            best_clf, best_params = grid_search(data_x, data_y, tune_mode,
                                                clf_params)
            clf_name = best_clf.__class__.__name__
            self.clf = best_clf
            return best_clf, best_params
        elif tune_mode == "hyperopt":  # TPE guided random search
            from hyperopt import hp
            from hyperopt.pyll.base import scope
            val_x, val_y = None, None
            if self.corpus_dir is not None:
                dev_file = self.corpus_dir + os.sep + self.corpus + "_dev.conll"
                _, val_x, val_y, _, _, _, _, _, _ = self.read_data(
                    dev_file,
                    size,
                    as_text=False,
                    rare_thresh=rare_thresh,
                    chosen_feats=chosen_feats)
            space = {
                'n_estimators':
                scope.int(hp.quniform('n_estimators', 100, 250, 10)),
                'max_depth':
                scope.int(hp.quniform('max_depth', 3, 30, 1)),
                'eta':
                scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
                'gamma':
                scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
                'colsample_bytree':
                hp.choice('colsample_bytree', [0.4, 0.5, 0.6, 0.7, 1.0]),
                'subsample':
                hp.choice('subsample', [0.5, 0.6, 0.7, 0.8, 1.0]),
                'clf':
                hp.choice('clf', ["xgb"])
            }
            best_clf, best_params = hyper_optimize(data_x.values,
                                                   data_y,
                                                   val_x=val_x,
                                                   val_y=val_y,
                                                   space=space,
                                                   max_evals=20)
            return best_clf, best_params
        else:  # No hyperparameter optimization
            clf = chosen_clf if chosen_clf is not None else DEFAULTCLF
            sys.stderr.write("o Setting params " + str(clf_params) + "\n")
            clf.set_params(**clf_params)
            if clf.__class__.__name__ not in [
                    "GradientBoostingClassifier", "CatBoostClassifier",
                    "XGBClassifier"
            ]:
                clf.set_params(**{
                    "n_jobs": 3,
                    "oob_score": True,
                    "bootstrap": True
                })
            if clf.__class__.__name__ in ["XGBClassifier"]:
                clf.set_params(**{"n_jobs": 3})
            clf.set_params(**{"random_state": 42})
            if multitrain:
                multitrain_preds = get_multitrain_preds(
                    clf, data_x, data_y, self.multifolds)
                multitrain_preds = "\n".join(multitrain_preds.strip().split(
                    "\n")[1:-1])  # Remove OOV tokens at start and end
                with io.open(script_dir + os.sep + "multitrain" + os.sep +
                             self.name + self.auto + '_' + self.corpus,
                             'w',
                             newline="\n") as f:
                    sys.stderr.write(
                        "o Serializing multitraining predictions\n")
                    f.write(multitrain_preds)
            if clf.__class__.__name__ == "CatBoostClassifier":
                clf.fit(data_x,
                        data_y,
                        cat_features=list(range(len(cat_labels))))
            else:
                clf.fit(data_x, data_y)
            self.clf = clf

        feature_names = cat_labels + num_labels
        sys.stderr.write("o Using " + str(len(feature_names)) + " features\n")

        zipped = zip(feature_names, clf.feature_importances_)
        sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
        sys.stderr.write("o Feature Gini importances:\n\n")
        for name, importance in sorted_zip:
            sys.stderr.write(name + "=" + str(importance) + "\n")

        if self.clf.__class__.__name__ not in [
                "GradientBoostingClassifier", "CatBoostClassifier",
                "XGBClassifier"
        ]:
            sys.stderr.write("\no OOB score: " + str(clf.oob_score_) + "\n\n")

        if tune_mode == "permutation":
            # Filter features based on permutation importance score threshold
            imp = permutation_importances(clf, val_x, val_y)
            for var, score in imp.items():
                if score < 0 and var != "word":
                    sys.stderr.write("o Dropping feature " + var +
                                     " due to low permutation importance of " +
                                     str(score) + "\n")
                    if var in cat_labels:
                        cat_labels.remove(var)
                    elif var in num_labels:
                        num_labels.remove(var)
            sys.stderr.write(
                "o Measuring correlation of numerical variables\n")
            cor_mat = report_correlations(val_x[num_labels])
            for (var1, var2) in cor_mat:
                if var1 in num_labels and var2 in num_labels:
                    drop_var = var2 if imp[var1] > imp[var2] else var1
                    if drop_var == "word":
                        continue
                    corr_level = cor_mat[(var1, var2)]
                    sys.stderr.write("o Removed feature " + drop_var +
                                     " due to correlation " + str(corr_level) +
                                     " of " + var1 + ":" + var2 + "\n")
                    num_labels.remove(drop_var)

            return cat_labels, num_labels

        sys.stderr.write("\no Serializing model...\n")

        joblib.dump((clf, num_labels, cat_labels, multicol_dict, top_n_words,
                     firsts, lasts),
                    self.model,
                    compress=3)
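
The permutation_importances helper used in the "permutation" branch is also external to this excerpt. A minimal stand-in built on scikit-learn's permutation_importance (an assumption about the real helper's behavior, not its actual code):

from sklearn.inspection import permutation_importance

def permutation_importances(clf, val_x, val_y):
    # Mean score drop per feature when that feature's values are shuffled;
    # assumes val_x is a pandas DataFrame, as in the code above.
    result = permutation_importance(clf, val_x, val_y, n_repeats=5, random_state=42)
    return dict(zip(val_x.columns, result.importances_mean))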