Code example #1
	def read_data(self,training_file,rare_thresh=100,as_text=True, no_cache=False):
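		"""
		Assemble the feature matrix and gold labels for training the ensemble connective detector

		:param training_file: File in DISRPT shared task *.conll format, or its contents if as_text is True
		:param rare_thresh: Rank of rarest word to include (rarer items are replaced with POS)
		:param as_text: Boolean, whether the input is a string, rather than a file name to read
		:param no_cache: If True, never use cached multitrain predictions
		:return: encoded DataFrame, X matrix, y labels (0=no connective, 1=B-Conn, 2=I-Conn), categorical/numeric feature names, encoder dictionary, first/last character inventories, and the most frequent words
		"""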

		if as_text:
			train = training_file
		else:
			train = io.open(training_file,encoding="utf8").read().strip().replace("\r","") + "\n"

		cat_labels = ["word","genre","deprel","s_type","morph"]#,"depchunk"]#,"first","last"]#,"pos"]#,"first","last"]
		num_labels = ["tok_len","tok_id","quote","bracket","sent_doc_percentile","s_len"]

		train_feats, vocab, toks, firsts, lasts = read_conll(train,genre_pat=self.genre_pat,mode="seg",as_text=True,char_bytes=self.lang=="zho")
		gold_feats, _, _, _, _ = read_conll_conn(train,mode="seg",as_text=True)
		gold_feats = [{"wid":0,"label":"_"}] + gold_feats + [{"wid":0,"label":"_"}]  # Add dummies to gold

		# Ensure that "_" is in the possible values of first/last for OOV chars at test time
		oov_item = train_feats[-1]
		oov_item["first"] = "_"
		oov_item["last"] = "_"
		oov_item["lemma"] = "_"
		oov_item["word"] = "_"
		oov_item["deprel"] = "_"
		oov_item["pos"] = "_"
		oov_item["cpos"] = "_"
		oov_item["genre"] = "_"
		oov_item["depchunk"] = "_"
		train_feats.append(oov_item)
		train_feats = [oov_item] + train_feats
		toks.append("_")
		toks = ["_"] + toks

		vocab = Counter(vocab)
		top_n_words = vocab.most_common(rare_thresh)
		top_n_words, _ = zip(*top_n_words)

		headers = sorted(list(train_feats[0].keys()))
		data = []

		preds = {}

		for e in self.estimators:
			if self.multitrain and e.name in ["RNNSegmenter"] and not no_cache:
				pred = e.predict_cached(train)
			else:
				pred = e.predict(train)
			preds[e.name + "_B_prob"] = []
			preds[e.name + "_I_prob"] = []
			if "Freq" in e.name:
				preds[e.name + "_freq"] = []
			for tup in pred:
				# RNN estimators yield (label, proba); other estimators yield 4-tuples with label, proba, freq in positions 1-3
				if "RNN" in e.name:
					label = tup[0]
					probas = tup[1]
					freqs = None
				else:
					label = tup[1]
					probas = float(tup[2])
					freqs = float(tup[3])
				if "B-Con" in label:
					preds[e.name + "_B_prob"].append(probas)
					preds[e.name + "_I_prob"].append(0.0)
				elif "I-Con" in label:
					preds[e.name + "_B_prob"].append(0.0)
					preds[e.name + "_I_prob"].append(probas)
				else:
					preds[e.name + "_B_prob"].append(0.0)
					preds[e.name + "_I_prob"].append(0.0)
				if freqs is not None:
					preds[e.name + "_freq"].append(freqs)

				# _, preds[e.name + "_prob"], ratio, freq = [list(x) for x in zip(*pred)]
			preds[e.name + "_B_prob"] = [0.0] + preds[e.name + "_B_prob"] + [0.0]  # Add dummy wrap for items -1 and +1
			preds[e.name + "_I_prob"] = [0.0] + preds[e.name + "_I_prob"] + [0.0]  # Add dummy wrap for items -1 and +1
			if e.name == "FreqConnDetector":
				preds[e.name + "_freq"] = [0.0] + preds[e.name + "_freq"] + [0.0]  # Add dummy wrap for items -1 and +1
			headers.append(e.name + "_B_prob")
			headers.append(e.name + "_I_prob")
			num_labels.append(e.name + "_B_prob")
			num_labels.append(e.name + "_I_prob")
			if "Freq" in e.name:
				headers.append(e.name + "_freq")
				num_labels.append(e.name + "_freq")

		for i, item in enumerate(train_feats):
			if item["word"] not in top_n_words:
				item["word"] = item["pos"]
			for e in self.estimators:
				item[e.name + "_B_prob"] = preds[e.name + "_B_prob"][i]
				item[e.name + "_I_prob"] = preds[e.name + "_I_prob"][i]
				if e.name == "FreqConnDetector":
					item[e.name + "_freq"] = preds[e.name + "_freq"][i]

			feats = []
			for k in headers:
				feats.append(item[k])

			data.append(feats)

		data, headers, cat_labels, num_labels = self.n_gram(data, headers, cat_labels, num_labels)

		# No need for n_gram feats for the following:
		if "FreqConnDetector_B_prob_min1" in num_labels:
			num_labels.remove("FreqConnDetector_B_prob_min1")
			num_labels.remove("FreqConnDetector_B_prob_pls1")
		if "FreqConnDetector_I_prob_min1" in num_labels:
			num_labels.remove("FreqConnDetector_I_prob_min1")
			num_labels.remove("FreqConnDetector_I_prob_pls1")
		if "FreqConnDetector_freq_min1" in num_labels:
			num_labels.remove("FreqConnDetector_freq_min1")
			num_labels.remove("FreqConnDetector_freq_pls1")
		if "RNNSegmenter_B_prob_min1" in num_labels:
			num_labels.remove("RNNSegmenter_B_prob_min1")
			num_labels.remove("RNNSegmenter_B_prob_pls1")
		if "RNNSegmenter_I_prob_min1" in num_labels:
			num_labels.remove("RNNSegmenter_I_prob_min1")
			num_labels.remove("RNNSegmenter_I_prob_pls1")
		if "tok_id_min1" in num_labels:
			num_labels.remove("tok_id_min1")
			num_labels.remove("tok_id_pls1")
		if "genre_min1" in cat_labels:
			cat_labels.remove("genre_min1")
			cat_labels.remove("genre_pls1")
		if "s_type_min1" in cat_labels:
			cat_labels.remove("s_type_min1")
			cat_labels.remove("s_type_pls1")
		if "morph_min1" in cat_labels:
			cat_labels.remove("morph_min1")
			cat_labels.remove("morph_pls1")
		if "s_len_min1" in num_labels:
			num_labels.remove("s_len_min1")
			num_labels.remove("s_len_pls1")
		if "sent_doc_percentile_min1" in num_labels:
			num_labels.remove("sent_doc_percentile_min1")
			num_labels.remove("sent_doc_percentile_pls1")

		data = pd.DataFrame(data, columns=headers)
		data_encoded, multicol_dict = self.multicol_fit_transform(data, pd.Index(cat_labels))

		data_x = data_encoded[cat_labels+num_labels].values
		data_y = []
		for t in gold_feats:
			if "B-Conn" in t["label"]:
				data_y.append(1)
			elif "I-Conn" in t["label"]:
				data_y.append(2)
			else:
				data_y.append(0)

		return data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words
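
For orientation, here is a minimal sketch of how this read_data method might be driven during training. The EnsembleConnective class name, its constructor arguments, and the file name are illustrative assumptions, not part of the snippet; the return tuple and label encoding follow the code above.

# Hypothetical driver; class name, constructor and path are assumptions
import io

detector = EnsembleConnective(lang="eng", corpus="eng.pdtb.pdtb")  # assumed constructor
conll_str = io.open("eng.pdtb.pdtb_train.conll", encoding="utf8").read()

# Words outside the top 100 by frequency are replaced with their POS tag inside read_data;
# data_y encodes 0 = no connective, 1 = B-Conn, 2 = I-Conn
(data_encoded, data_x, data_y, cat_labels, num_labels,
 multicol_dict, firsts, lasts, top_n_words) = detector.read_data(conll_str, rare_thresh=100, as_text=True)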
Code example #2
	def predict(self, conllu, eval_gold=False, as_text=True, serialize=False):
		"""
		Predict connective tags (Seg=B-Conn / Seg=I-Conn) using an existing model

		:param conllu: File in DISRPT shared task *.conll format
		:param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
		:param as_text: Boolean, whether the input is a string, rather than a file name to read
		:param serialize: Whether to serialize prediction as a .conll file
		:return: tokenwise prediction vector if eval_gold is False; otherwise prints evaluation metrics, writes diff.tab, and returns the F-score
		"""

		clf, num_labels, cat_labels, multicol_dict, vocab, firsts, lasts = joblib.load(self.model)

		self.clf = clf
		if not as_text:
			conllu = io.open(conllu,encoding="utf8").read()

		train_feats, _, toks, _, _ = read_conll(conllu,genre_pat=self.genre_pat,mode="seg",as_text=True)
		headers = sorted(list(train_feats[0].keys()))

		data = []

		preds = {}
		for e in self.estimators:
			pred = e.predict(conllu)
			# _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
			preds[e.name + "_B_prob"] = []
			preds[e.name + "_I_prob"] = []
			if "Freq" in e.name:
				preds[e.name + "_freq"] = []
				headers.append(e.name + "_freq")
			for tup in pred:
				if "RNN" in e.name:
					label = tup[0]
					probas = tup[1]
					freqs = None
				else:
					label = tup[1]
					probas = float(tup[2])
					freqs = float(tup[3])
				if "B-Conn" in label:
					preds[e.name + "_B_prob"].append(probas)
					preds[e.name + "_I_prob"].append(0.0)
				elif "I-Conn" in label:
					preds[e.name + "_B_prob"].append(0.0)
					preds[e.name + "_I_prob"].append(probas)
				else:
					preds[e.name + "_B_prob"].append(0.0)
					preds[e.name + "_I_prob"].append(0.0)
				if "Freq" in e.name:
					preds[e.name + "_freq"].append(freqs)

			headers.append(e.name + "_B_prob")
			headers.append(e.name + "_I_prob")

		temp = []
		headers_with_oov = ["deprel","pos","cpos","morph","s_type","depchunk"]
		for pref in ["min1","pls1"]:
			temp += [pref + "_" + h for h in headers_with_oov]
		headers_with_oov += temp

		genre_warning = False
		for header in headers:
			if header in headers_with_oov and header in cat_labels:
				for item in train_feats:
					if item[header] not in multicol_dict["encoder_dict"][header].classes_:
						item[header] = "_"
		for i, item in enumerate(train_feats):
			item["first"] = item["word"][0] if item["word"][0] in firsts else "_"
			item["last"] = item["word"][-1] if item["word"][-1] in lasts else "_"
			if "genre" in cat_labels:
				if item["genre"] not in multicol_dict["encoder_dict"]["genre"].classes_:  # New genre not in training data
					if not genre_warning:
						sys.stderr.write("! WARN: Genre not in training data: " + item["genre"] + "; suppressing further warnings\n")
						genre_warning = True
					item["genre"] = "_"
			if item["word"] not in vocab:
				if item["pos"] in multicol_dict["encoder_dict"]["word"].classes_:
					item["word"] = item["pos"]
				else:
					item["word"] = "_"
			for e in self.estimators:
				item[e.name + "_B_prob"] = preds[e.name + "_B_prob"][i]
				item[e.name + "_I_prob"] = preds[e.name + "_I_prob"][i]
				if e.name == "FreqConnDetector":
					item[e.name + "_freq"] = preds[e.name + "_freq"][i]

			feats = []
			for k in headers:
				feats.append(item[k])

			data.append(feats)

		data, headers, _, _ = self.n_gram(data,headers,[],[])

		data = pd.DataFrame(data, columns=headers)
		data_encoded = self.multicol_transform(data,columns=multicol_dict["columns"],all_encoders_=multicol_dict["all_encoders_"])

		data_x = data_encoded[cat_labels+num_labels].values
		preds = clf.predict(data_x)

		if eval_gold:
			gold_feats, _,_,_,_ = read_conll(conllu,genre_pat=self.genre_pat,mode="seg",as_text=True)

			# Array to keep labels for diff
			gold = []
			for t in gold_feats:
				if "B-Conn" in t['label']:
					gold.append("Seg=B-Conn")
				elif "I-Conn" in t['label']:
					gold.append("Seg=I-Conn")
				else:
					gold.append("_")
			gold = np.asarray(gold)

			# Generate response conllu
			lines = conllu.split("\n")
			processed = []
			pred_labs = []
			i = 0
			for line in lines:
				if "\t" in line:
					fields = line.split('\t')
					if "-" in fields[0]:
						processed.append(line)
						continue
					else:
						if preds[i] == 0:
							pred = "_"
						elif preds[i] == 1:
							pred = "Seg=B-Conn"
						else:
							pred = "Seg=I-Conn"
						pred_labs.append(pred)
						fields[-1] = pred
						processed.append("\t".join(fields))
						i += 1
				else:
					processed.append(line)
			processed = "\n".join(processed) + "\n"

			score_dict = get_scores(conllu,processed,string_input=True)

			print("o Total tokens: " + str(score_dict["tok_count"]))
			print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
			print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
			print("o Precision: " + str(score_dict["prec"]))
			print("o Recall: " + str(score_dict["rec"]))
			print("o F-Score: " + str(score_dict["f_score"]))

			if serialize:
				self.serialize(conllu,pred_labs)

			with io.open("diff.tab",'w',encoding="utf8") as f:
				for i in range(len(pred_labs)):
					f.write("\t".join([toks[i],str(gold[i]),str(pred_labs[i])])+"\n")
			return score_dict["f_score"]
		else:
			return preds
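
A usage sketch for this predict method, reusing the hypothetical detector instance from the sketch above; the return behavior follows the code.

# eval_gold=True prints token counts and P/R/F, writes diff.tab, and returns the F-score
conll_str = io.open("eng.pdtb.pdtb_dev.conll", encoding="utf8").read()
f_score = detector.predict(conll_str, eval_gold=True, as_text=True)

# eval_gold=False returns the raw tokenwise vector: 0 = "_", 1 = B-Conn, 2 = I-Conn
labels = detector.predict(conll_str, as_text=True)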
Code example #3
    def predict(self, infile, model_path=None, eval_gold=False, as_text=False):
        """
		Predict sentence splits using an existing model

		:param infile: File in DISRPT shared task *.tok or *.conll format (sentence breaks will be ignored in .conll)
		:param model_path: Pickled model file, default: models/<corpus>_ensemble_sent.pkl
		:param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
		:param as_text: Boolean, whether the input is a string, rather than a file name to read
		:return: tokenwise binary prediction vector if eval_gold is False, otherwise prints evaluation metrics and diff to gold
		"""

        if model_path is None:  # Try default model location
            model_path = script_dir + os.sep + "models" + os.sep + self.corpus + "_ensemble_sent.pkl"

        clf, num_labels, cat_labels, multicol_dict, vocab, firsts, lasts = joblib.load(
            model_path)

        if as_text:
            conllu = infile
        else:
            conllu = io.open(infile, encoding="utf8").read()

        #tagged = udpipe_tag(conllu,self.udpipe_model)
        tagged = tt_tag(conllu, self.lang)

        train_feats, _, toks, _, _ = read_conll(tagged,
                                                genre_pat=self.genre_pat,
                                                mode="sent",
                                                as_text=True,
                                                char_bytes=self.lang == "zho")
        headers = sorted(list(train_feats[0].keys()))

        data = []

        preds = {}
        for e in self.estimators:
            pred = e.predict(tagged)
            _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
            headers.append(e.name + "_prob")

        genre_warning = False
        for i, item in enumerate(train_feats):
            item["first"] = item["word"][0] if item["word"][0] in firsts else "_"
            item["last"] = item["word"][-1] if item["word"][-1] in lasts else "_"
            if "genre" in cat_labels:
                if item["genre"] not in multicol_dict["encoder_dict"]["genre"].classes_:  # New genre not in training data
                    if not genre_warning:
                        sys.stderr.write("! WARN: Genre not in training data: " + item["genre"] + "; suppressing further warnings\n")
                        genre_warning = True
                    item["genre"] = "_"
            if "pos" in cat_labels:
                if item["pos"] not in multicol_dict["encoder_dict"]["pos"].classes_:
                    item["pos"] = "_"
            if "cpos" in cat_labels:
                if item["cpos"] not in multicol_dict["encoder_dict"]["cpos"].classes_:
                    item["cpos"] = "_"
            if item["word"] not in vocab and "word" in multicol_dict["encoder_dict"]:
                if item["pos"] in multicol_dict["encoder_dict"]["word"].classes_:
                    item["word"] = item["pos"]
                else:
                    item["word"] = "_"
            for e in self.estimators:
                item[e.name + "_prob"] = preds[e.name + "_prob"][i]

            feats = []
            for k in headers:
                feats.append(item[k])

            data.append(feats)

        data, headers, _, _ = self.n_gram(data, headers, [], [])

        data = pd.DataFrame(data, columns=headers)
        data_encoded = self.multicol_transform(
            data,
            columns=multicol_dict["columns"],
            all_encoders_=multicol_dict["all_encoders_"])

        data_x = data_encoded[cat_labels + num_labels].values
        pred = clf.predict(data_x)

        # Ensure first token in document is always a sentence break
        for i, x in enumerate(data_encoded["tok_id"].values):
            if x == 1:
                pred[i] = 1

        if eval_gold:
            gold_feats, _, _, _, _ = read_conll(conllu,
                                                genre_pat=self.genre_pat,
                                                mode="sent",
                                                as_text=True)
            gold = [int(t['wid'] == 1) for t in gold_feats]
            conf_mat = confusion_matrix(gold, pred)
            sys.stderr.write(str(conf_mat) + "\n")
            true_positive = conf_mat[1][1]
            false_positive = conf_mat[0][1]
            false_negative = conf_mat[1][0]
            prec = true_positive / (true_positive + false_positive)
            rec = true_positive / (true_positive + false_negative)
            f1 = 2 * prec * rec / (prec + rec)
            sys.stderr.write("P: " + str(prec) + "\n")
            sys.stderr.write("R: " + str(rec) + "\n")
            sys.stderr.write("F1: " + str(f1) + "\n")
            with io.open("diff.tab", 'w', encoding="utf8") as f:
                for i in range(len(gold)):
                    f.write("\t".join([toks[i],
                                       str(gold[i]),
                                       str(pred[i])]) + "\n")
            return conf_mat, prec, rec, f1
        else:
            return pred
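
A sketch of calling this sentence-split predict; the EnsembleSentencer constructor is an assumption, while the default model location and return values follow the code above.

# Hypothetical instance; with model_path=None the model is loaded from
# models/<corpus>_ensemble_sent.pkl next to the script
sentencer = EnsembleSentencer(lang="eng", corpus="eng.rst.gum")  # assumed constructor

# With gold input: returns confusion matrix and P/R/F1, and writes diff.tab
conf_mat, prec, rec, f1 = sentencer.predict("eng.rst.gum_dev.conll", eval_gold=True)

# Plain prediction on an in-memory string: binary vector with 1 at sentence-initial tokens
breaks = sentencer.predict(conll_str, as_text=True)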
Code example #4
    def train(self,
              training_file,
              rare_thresh=100,
              clf_params=None,
              model_path=None,
              chosen_feats=None,
              tune_mode=None,
              size=None,
              as_text=False,
              multitrain=True,
              chosen_clf=None):
        """
		Train the EnsembleSentencer. Note that the underlying estimators are assumed to be pretrained already.

		:param training_file: File in DISRPT shared task .conll format
		:param model_path: Path to dump pickled model to
		:param rare_thresh: Rank of rarest word to include (rarer items are replace with POS)
		:param genre_pat: Regex pattern with capturing group to extract genre from document names
		:param as_text: Boolean, whether the input is a string, rather than a file name to read
		:return:
		"""

        if tune_mode is not None and size is None and tune_mode != "hyperopt":
            size = 5000
            sys.stderr.write("o No sample size set - setting size to 5000\n")

        if not as_text:
            train = io.open(
                training_file, encoding="utf8").read().strip().replace(
                    "\r", "") + "\n"
        else:
            train = training_file

        if size is not None:
            train = shuffle_cut_conllu(train, size)
        #tagged = udpipe_tag(train,self.udpipe_model)
        tagged = tt_tag(train, self.lang, preserve_sent=True)

        if model_path is None:  # Try default model location
            model_path = script_dir + os.sep + "models" + os.sep + self.corpus + "_ensemble_sent.pkl"

        if clf_params is None:
            # Default classifier parameters
            #clf_params = {"n_estimators":125,"min_samples_leaf":1, "max_depth":15, "max_features":None, "n_jobs":4, "random_state":42, "oob_score":True, "bootstrap":True}
            clf_params = {
                "n_estimators": 100,
                "min_samples_leaf": 1,
                "min_samples_split": 5,
                "max_depth": 10,
                "max_features": None,
                "n_jobs": 4,
                "random_state": 42,
                "oob_score": True,
                "bootstrap": True
            }

        if chosen_clf is None:
            chosen_clf = RandomForestClassifier(n_jobs=4,
                                                oob_score=True,
                                                bootstrap=True)
            chosen_clf.set_params(**clf_params)

        cat_labels = ["word", "first", "last", "genre", "pos", "cpos"]
        num_labels = ["tok_len", "tok_id"]

        train_feats, vocab, toks, firsts, lasts = read_conll(
            tagged,
            genre_pat=self.genre_pat,
            mode="sent",
            as_text=True,
            char_bytes=self.lang == "zho")
        gold_feats, _, _, _, _ = read_conll(train, mode="sent", as_text=True)
        gold_feats = [{"wid": 0}] + gold_feats + [{"wid": 0}]  # Add dummies to gold

        # Ensure that "_" is in the possible values of first/last for OOV chars at test time
        oov_item = train_feats[-1]
        oov_item["first"] = "_"
        oov_item["last"] = "_"
        oov_item["lemma"] = "_"
        oov_item["word"] = "_"
        oov_item["pos"] = "_"
        oov_item["cpos"] = "_"
        oov_item["genre"] = "_"
        train_feats.append(oov_item)
        train_feats = [oov_item] + train_feats
        toks.append("_")
        toks = ["_"] + toks

        vocab = Counter(vocab)
        top_n_words = vocab.most_common(rare_thresh)
        top_n_words, _ = zip(*top_n_words)

        headers = sorted(list(train_feats[0].keys()))
        data = []

        preds = {}

        for e in self.estimators:
            if multitrain and e.name in ["LRSentencer", "DNNSentencer"]:
                pred = e.predict_cached(tagged)
            else:
                pred = e.predict(tagged)
            _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
            preds[e.name + "_prob"] = [0.0] + preds[e.name + "_prob"] + [
                0.0
            ]  # Add dummy wrap for items -1 and +1
            headers.append(e.name + "_prob")
            num_labels.append(e.name + "_prob")

        for i, item in enumerate(train_feats):
            if item["word"] not in top_n_words:
                item["word"] = item["pos"]
            for e in self.estimators:
                item[e.name + "_prob"] = preds[e.name + "_prob"][i]

            feats = []
            for k in headers:
                feats.append(item[k])

            data.append(feats)

        data, headers, cat_labels, num_labels = self.n_gram(
            data, headers, cat_labels, num_labels)
        # No need for n_gram feats for the following:
        if "NLTKSentencer_prob_min1" in num_labels:
            num_labels.remove("NLTKSentencer_prob_min1")
            num_labels.remove("NLTKSentencer_prob_pls1")
        if "UDPipeSentencer_prob_min1" in num_labels:
            num_labels.remove("UDPipeSentencer_prob_min1")
            num_labels.remove("UDPipeSentencer_prob_pls1")
        if "LRSentencer_prob_min1" in num_labels:
            num_labels.remove("LRSentencer_prob_min1")
            num_labels.remove("LRSentencer_prob_pls1")
        if "RuleBasedSplitter_prob_min1" in num_labels:
            num_labels.remove("RuleBasedSplitter_prob_min1")
            num_labels.remove("RuleBasedSplitter_prob_pls1")
        if "DNNSentencer_prob_min1" in num_labels:
            num_labels.remove("DNNSentencer_prob_min1")
            num_labels.remove("DNNSentencer_prob_pls1")
        if "tok_id_min1" in num_labels:
            num_labels.remove("tok_id_min1")
            num_labels.remove("tok_id_pls1")
        if "genre_min1" in cat_labels:
            cat_labels.remove("genre_min1")
            cat_labels.remove("genre_pls1")

        # Use specific feature subset
        if chosen_feats is not None:
            new_cat = []
            new_num = []
            for feat in chosen_feats:
                if feat in cat_labels:
                    new_cat.append(feat)
                elif feat in num_labels:
                    new_num.append(feat)
            cat_labels = new_cat
            num_labels = new_num

        data = pd.DataFrame(data, columns=headers)
        data_encoded, multicol_dict = self.multicol_fit_transform(
            data, pd.Index(cat_labels))

        data_x = data_encoded[cat_labels + num_labels].values
        data_y = [int(t['wid'] == 1) for t in gold_feats]

        sys.stderr.write("o Learning...\n")

        if tune_mode is not None:
            # Randomize samples for training
            data_x = data_encoded[cat_labels + num_labels + ["label"]].sample(
                frac=1, random_state=42)
            data_y = np.where(data_x['label'] == "_", 0, 1)
            data_x = data_x[cat_labels + num_labels]

            # Reserve 10% for validation
            cut = int(len(data_y) * 0.9)
            val_x = data_x[cut:]
            val_y = data_y[cut:]
            data_x = data_x[:cut]
            data_y = data_y[:cut]

        if tune_mode == "importances":
            sys.stderr.write(
                "o Measuring correlation of categorical variables\n")
            theil_implications = report_theils_u(val_x, cat_labels)
            for (var1, var2) in theil_implications:
                if var1 in cat_labels and var2 in cat_labels:
                    drop_var = var2
                    u = theil_implications[(var1, var2)]
                    sys.stderr.write("o Removed feature " + drop_var +
                                     " due to Theil's U " + str(u)[:6] +
                                     " of " + var1 + "->" + var2 + "\n")
                    cat_labels.remove(drop_var)

            sys.stderr.write(
                "o Measuring correlation of numerical variables\n")
            cor_mat = report_correlations(val_x[num_labels], thresh=0.95)
            for (var1, var2) in cor_mat:
                if var1 in num_labels and var2 in num_labels:
                    drop_var = var2
                    corr_level = cor_mat[(var1, var2)]
                    sys.stderr.write("o Removed feature " + drop_var +
                                     " due to correlation " + str(corr_level) +
                                     " of " + var1 + ":" + var2 + "\n")
                    num_labels.remove(drop_var)

            return cat_labels, num_labels

        if tune_mode in ["paramwise", "full"]:
            best_params = {}
            # Tune individual params separately for speed, or do complete grid search if building final model
            params_list = [{"n_estimators": [100, 125, 150]},
                           {'max_depth': [10, 15, 20, None]},
                           {"min_samples_split": [5, 10, 15]},
                           {"min_samples_leaf": [1, 2, 3]},
                           {"max_features": [None, "sqrt", "log2"]}]
            if tune_mode == "full":
                # Flatten the dictionaries into one grid if doing full CV
                params_list = [{k: v for d in params_list for k, v in d.items()}]
            for params in params_list:
                base_params = copy.deepcopy(clf_params)  # Copy default params
                for p in params:
                    if p in base_params:  # Ensure base_params don't conflict with grid search params
                        base_params.pop(p)
                grid = GridSearchCV(RandomForestClassifier(**base_params),
                                    params,
                                    cv=3,
                                    n_jobs=4,
                                    error_score="raise",
                                    refit=False)
                grid.fit(data_x, data_y)
                for param in params:
                    best_params[param] = grid.best_params_[param]
            with io.open("best_params.tab", 'a', encoding="utf8") as bp:
                corpus = os.path.basename(training_file).split("_")[0]
                best_clf = RandomForestClassifier(**best_params)
                clf_name = best_clf.__class__.__name__
                for k, v in best_params.items():
                    bp.write("\t".join([corpus, clf_name, k, str(v)]))
                bp.write("\n")
            return best_clf, best_params
        elif tune_mode == "hyperopt":
            from hyperopt import hp
            from hyperopt.pyll.base import scope
            space = {
                'n_estimators': scope.int(hp.quniform('n_estimators', 50, 150, 10)),
                'max_depth': scope.int(hp.quniform('max_depth', 5, 30, 1)),
                'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
                'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
                'max_features': hp.choice('max_features', ["sqrt", None, 0.5, 0.7, 0.9]),
                'clf': hp.choice('clf', ["rf", "et", "gbm"])
            }
            #space = {
            #	'n_estimators': scope.int(hp.quniform('n_estimators', 50, 150, 10)),
            #	'max_depth': scope.int(hp.quniform('max_depth', 3, 30, 1)),
            #	'eta': scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
            #	'gamma': scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
            #	'colsample_bytree': hp.choice('colsample_bytree', [0.4,0.5,0.6,0.7,1.0]),
            #	'subsample': hp.choice('subsample', [0.5,0.6,0.7,0.8,1.0]),
            #	'clf': hp.choice('clf', ["xgb"])
            #}

            best_clf, best_params = hyper_optimize(data_x,
                                                   data_y,
                                                   cat_labels=cat_labels,
                                                   space=space,
                                                   max_evals=50)
            return best_clf, best_params
        else:
            clf = chosen_clf
            clf.set_params(**clf_params)
            if clf.__class__.__name__ in [
                    "RandomForestClassifier", "ExtraTreesClassifier",
                    "XGBClassifier"
            ]:
                clf.set_params(
                    **{
                        "n_jobs": 3,
                        "random_state": 42,
                        "oob_score": True,
                        "bootstrap": True
                    })
            else:
                clf.set_params(**{"random_state": 42})
            clf.fit(data_x, data_y)

        feature_names = cat_labels + num_labels

        zipped = zip(feature_names, clf.feature_importances_)
        sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
        sys.stderr.write("o Feature importances:\n\n")
        for name, importance in sorted_zip:
            sys.stderr.write(name + "=" + str(importance) + "\n")

        if hasattr(clf, "oob_score_"):
            sys.stderr.write("\no OOB score: " + str(clf.oob_score_) + "\n")

        sys.stderr.write("\no Serializing model...\n")

        joblib.dump((clf, num_labels, cat_labels, multicol_dict, top_n_words,
                     firsts, lasts),
                    model_path,
                    compress=3)
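
Two representative train invocations for the same hypothetical sentencer; parameter semantics follow the code above, and the file names are placeholders.

# Standard training run, reusing cached multitrain predictions of the estimators
sentencer.train("eng.rst.gum_train.conll", rare_thresh=100, multitrain=True)

# Feature pruning pass: with tune_mode="importances" and no size given, 5000 tokens
# are sampled and the surviving feature name lists are returned instead of a model
cat_labels, num_labels = sentencer.train("eng.rst.gum_train.conll", tune_mode="importances")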
Code example #5
    def predict(self, conllu, eval_gold=False, as_text=True, serialize=False):
        """
		Predict sentence splits using an existing model

		:param infile: File in DISRPT shared task *.conll format
		:param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
		:param as_text: Boolean, whether the input is a string, rather than a file name to read
		:return: tokenwise binary prediction vector if eval_gold is False, otherwise prints evaluation metrics and diff to gold
		"""

        clf, num_labels, cat_labels, multicol_dict, vocab, firsts, lasts = joblib.load(
            self.model)
        self.clf = clf

        if not as_text:
            conllu = io.open(conllu, encoding="utf8").read()

        train_feats, _, toks, _, _ = read_conll(conllu,
                                                genre_pat=self.genre_pat,
                                                mode="seg",
                                                as_text=True)
        headers = sorted(list(train_feats[0].keys()))

        data = []

        preds = {}
        for e in self.estimators:
            pred = e.predict(conllu)
            _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
            headers.append(e.name + "_prob")

        temp = []
        headers_with_oov = [
            "deprel", "pos", "cpos", "morph", "s_type", "depchunk"
        ]
        for pref in ["min1", "pls1"]:
            temp += [pref + "_" + h for h in headers_with_oov]
        headers_with_oov += temp

        genre_warning = False
        for header in headers:
            if header in headers_with_oov and header in cat_labels:
                for item in train_feats:
                    if item[header] not in multicol_dict["encoder_dict"][header].classes_:
                        item[header] = "_"
        for i, item in enumerate(train_feats):
            item["first"] = item["word"][0] if item["word"][0] in firsts else "_"
            item["last"] = item["word"][-1] if item["word"][-1] in lasts else "_"
            if "genre" in cat_labels:
                if item["genre"] not in multicol_dict["encoder_dict"]["genre"].classes_:  # New genre not in training data
                    if not genre_warning:
                        sys.stderr.write("! WARN: Genre not in training data: " + item["genre"] + "; suppressing further warnings\n")
                        genre_warning = True
                    item["genre"] = "_"
            if item["word"] not in vocab:
                if item["pos"] in multicol_dict["encoder_dict"]["word"].classes_:
                    item["word"] = item["pos"]
                else:
                    item["word"] = "_"
            for e in self.estimators:
                item[e.name + "_prob"] = preds[e.name + "_prob"][i]

            feats = []
            for k in headers:
                feats.append(item[k])

            data.append(feats)

        data, headers, _, _ = self.n_gram(data, headers, [], [])

        data = pd.DataFrame(data, columns=headers)
        data_encoded = self.multicol_transform(
            data,
            columns=multicol_dict["columns"],
            all_encoders_=multicol_dict["all_encoders_"])

        data_x = data_encoded[cat_labels + num_labels].values
        pred = clf.predict(data_x)

        if serialize:
            self.serialize(conllu, pred)
        if eval_gold:
            gold_feats, _, _, _, _ = read_conll(conllu,
                                                genre_pat=self.genre_pat,
                                                mode="seg",
                                                as_text=True)
            gold = [int(t['label'] != "_") for t in gold_feats]
            conf_mat = confusion_matrix(gold, pred)
            sys.stderr.write(str(conf_mat) + "\n")
            true_positive = conf_mat[1][1]
            false_positive = conf_mat[0][1]
            false_negative = conf_mat[1][0]
            prec = true_positive / (true_positive + false_positive)
            rec = true_positive / (true_positive + false_negative)
            f1 = 2 * prec * rec / (prec + rec)
            sys.stderr.write("P: " + str(prec) + "\n")
            sys.stderr.write("R: " + str(rec) + "\n")
            sys.stderr.write("F1: " + str(f1) + "\n")
            with io.open("diff.tab", 'w', encoding="utf8") as f:
                for i in range(len(gold)):
                    f.write("\t".join([toks[i],
                                       str(gold[i]),
                                       str(pred[i])]) + "\n")
            return conf_mat, prec, rec, f1
        else:
            return pred
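
Last, a sketch for the segmenter variant of predict; the class name and pickle path are assumptions, while the binary return convention comes from the code above.

# Hypothetical segmenter; self.model should point at the pickled ensemble
segmenter = EnsembleSegmenter(lang="eng", model="eng.rst.gum_ensemble_seg.pkl")  # assumed

# With gold input: confusion matrix and P/R/F1, plus diff.tab on disk
conf_mat, prec, rec, f1 = segmenter.predict(conll_str, eval_gold=True, as_text=True)

# Without gold: tokenwise binary vector, 1 = segment-initial token
pred = segmenter.predict(conll_str, as_text=True)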