Example #1
	def train(self,trainfile, devfile, multifolds=1,as_text=False):

		p = StdOutFilter()
		p.start()

		train_feats, _, _, _, _ = read_conll(trainfile,genre_pat=self.genre_pat,as_text=as_text)
		train_for_rnn, scalers = feats2rnn(train_feats)

		dev_feats, _, _, _, _ = read_conll(devfile,genre_pat=self.genre_pat,as_text=as_text)
		dev_for_rnn, scalers = feats2rnn(dev_feats)

		with io.open(self.corpus_dir + os.sep + "dev."+self.ext,'w',encoding="utf8",newline="\n") as f:
			f.write(dev_for_rnn)

		# NCRFpp expects a test file, we reuse dev
		with io.open(self.corpus_dir + os.sep + "test."+self.ext,'w',encoding="utf8",newline="\n") as f:
			f.write(dev_for_rnn)

		if multifolds > 1:
			train_chunks, test_chunks = self.split_dataset(train_for_rnn,multifolds=multifolds)
			all_preds = []
			all_labs = []
			for i in range(multifolds):
				with io.open(self.corpus_dir + os.sep + "train."+self.ext,'w',encoding="utf8",newline="\n") as f:
					f.write(train_chunks[i])

				with io.open(self.corpus_dir + os.sep + "raw."+self.ext,'w',encoding="utf8",newline="\n") as f:
					f.write(test_chunks[i])

				# TRAIN ON FOLD
				sys.stderr.write("\no Training on fold " + str(i+1)+"/" + str(multifolds) + "\n")
				config = ncrf_dir + os.sep + self.segtype + ".train"+self.auto+".config"
				ncrf(config)

				# PREDICT
				config = ncrf_dir + os.sep + self.segtype + ".decode"+self.auto+".config"
				ncrf(config,status="decode")

				labs, scores = self.read_preds()
				all_labs += labs
				all_preds += scores

			# SERIALIZE MULTITRAIN PREDS
			with io.open(self.corpus_dir + os.sep + corpus + self.auto + "_multitrain.tab",'w',newline='\n') as f:
				for j, pred in enumerate(all_preds):
					if self.conn:
						lab = all_labs[j]
					else:
						lab = 1 if pred >= 0.15 else 0  # Arbitrary threshold, we will only use prob as feature for metalearner
					f.write(str(lab) + "\t" + str(pred) + "\n")

		# TRAIN MODEL ON ALL TRAIN
		sys.stderr.write("\no Training on full train set\n")
		with io.open(self.corpus_dir + os.sep + "train."+self.ext,'w',encoding="utf8",newline="\n") as f:
			f.write(train_for_rnn)

		config = ncrf_dir + os.sep + self.segtype + ".train"+self.auto+".config"
		ncrf(config)

		p.end()
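
The multitrain predictions serialized above form a plain two-column TSV (label, probability), one token per line. A minimal sketch of reading that file back, e.g. as a feature for a metalearner (the helper name is hypothetical, not part of the original code):

# Hypothetical helper: load the serialized multitrain predictions written above
import io

def read_multitrain_preds(path):
	labs, probs = [], []
	with io.open(path, encoding="utf8") as f:
		for line in f:
			if line.strip():
				lab, prob = line.strip().split("\t")
				labs.append(int(lab))
				probs.append(float(prob))
	return labs, probs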
Example #2
	def predict(self,testfile,as_text=True):

		test_feats, _, _, _, _ = read_conll(testfile,genre_pat=self.genre_pat,as_text=as_text)
		test_for_rnn, scalers = feats2rnn(test_feats)

		with io.open(self.corpus_dir + os.sep + "raw."+self.ext,'w',encoding="utf8",newline="\n") as f:
			f.write(test_for_rnn)

		p = StdOutFilter()
		p.start()

		config = ncrf_dir + os.sep + self.segtype+".decode"+self.auto+".config"
		ncrf(config,status="decode")

		p.end()

		labs, probas = self.read_preds()
		return zip(labs,probas)
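
For context, a minimal usage sketch of the two methods above (the class name RNNSegmenter is inferred from the params file used in the tuning example further down; the file paths are assumptions for illustration only):

# Hypothetical usage; class name and paths are assumptions
seg = RNNSegmenter()
seg.train("corpus_train.conll", "corpus_dev.conll", multifolds=5, as_text=False)
for lab, proba in seg.predict("corpus_test.conll", as_text=False):
	print(lab, proba)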
Example #3
        udpipe = NLTKSentencer(lang=corpus[:3])
        conllu = io.open(topred_file, encoding="utf8").read()
        filtered = []
        for line in conllu.split("\n"):
            if "\t" in line:
                fields = line.split("\t")
                if "-" in fields[0]:
                    continue
            filtered.append(line)
        conllu = "\n".join(filtered) + "\n"
        pred = [x[0] for x in udpipe.predict(conllu)]
        # pred = udpipe.predict(conllu)

        gold_feats, _, toks, _, _ = read_conll(gold_file,
                                               genre_pat="^(..)",
                                               mode="sent",
                                               as_text=False)
        gold = [int(t['wid'] == 1) for t in gold_feats]
        conf_mat = confusion_matrix(gold, pred)

        sys.stderr.write("Evaluating on " + corpus + "\n")
        sys.stderr.write(str(conf_mat) + "\n")
        true_positive = conf_mat[1][1]
        false_positive = conf_mat[0][1]
        false_negative = conf_mat[1][0]
        prec = true_positive / (true_positive + false_positive)
        rec = true_positive / (true_positive + false_negative)
        f1 = 2 * prec * rec / (prec + rec)

        sys.stderr.write("P: " + str(prec) + "\n")
        sys.stderr.write("R: " + str(rec) + "\n")
        sys.stderr.write("F1: " + str(f1) + "\n")
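
The same scores can be cross-checked with scikit-learn directly; a short sketch assuming the same binary gold and pred vectors as above:

# Sketch: equivalent P/R/F1 via scikit-learn on the same gold/pred vectors
import sys
from sklearn.metrics import precision_recall_fscore_support
prec, rec, f1, _ = precision_recall_fscore_support(gold, pred, average="binary")
sys.stderr.write("P: %.4f R: %.4f F1: %.4f\n" % (prec, rec, f1))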
Example #4
    for corpus in corpora:

        corpus_start_time = time.time()

        train = glob(opts.data_dir + os.sep + corpus + os.sep + corpus +
                     "_train.conll")[0]
        dev = glob(opts.data_dir + os.sep + corpus + os.sep + corpus +
                   "_dev.conll")[0]
        test = glob(opts.data_dir + os.sep + corpus + os.sep + corpus +
                    "_test.conll")[0]

        feats = DepFeatures()

        from conll_reader import read_conll
        outputs = read_conll(dev)
        train = outputs[0]

        # Extract features from file
        sys.stderr.write("\no Extracting features from training corpus " +
                         corpus + "\n")
        train_feats = feats.extract_depfeatures(train)

        elapsed = time.time() - corpus_start_time
        sys.stderr.write("\nTime training on corpus:\n")
        sys.stderr.write(str(timedelta(seconds=elapsed)) + "\n\n")

    sys.stderr.write("\nTotal time:\n")
    elapsed = time.time() - start_time
    sys.stderr.write(str(timedelta(seconds=elapsed)) + "\n\n")
Example #5
    if opts.mode == "train":
        # Get Connectives: Raw frequency and ratio of the connectives found in the training data
        conn = connective.train(train_path)

    if opts.eval_test:
        dev_path = test_path

    with open(dev_path, 'r', encoding='utf-8') as f:
        lines = f.read()

    # Prediction on the devset
    pred_labels_probs = connective.predict(lines)
    resps = [
        tok[1] if float(tok[2]) > 0.5 else "_" for tok in pred_labels_probs
    ]
    train_feats, _, _, _, _ = read_conll(dev_path, as_text=False)
    gold = io.open(dev_path, encoding="utf8").read()
    lines = gold.split("\n")
    processed = []
    i = 0
    for line in lines:
        if "\t" in line:
            fields = line.split('\t')
            if "-" in fields[0]:
                processed.append(line)
                continue
            else:
                fields[-1] = resps[i]
                processed.append("\t".join(fields))
                i += 1
        else:
            processed.append(line)  # assumed continuation: non-token lines (comments/blanks) pass through unchanged
Example #6
	def read_conll_sentbreak(self, infile, neighborwindowsize=5,as_text=True,cut=True,do_tag=True,multitrain=False):
		global TRAIN_LIMIT
		# read data from conll_reader

		vowels = "AEIOUaeiouéèàáíìúùòóаэыуояеёюи"


		numeric_entries = []
		nonnumeric_entries = []
		goldseg_entries = []
		if as_text:
			conllu_in = infile
		else:
			conllu_in = io.open(infile, 'r', encoding='utf8').read()

		# # Reduce data if too large
		# if not cut and conllu_in.count("\n") > 100000 and multitrain:
		# 	sys.stderr.write("o Data too large; forcing cut and turning off multitraining\n")
		# 	cut = True
		# 	TRAIN_LIMIT = 100000
		# if cut:
		# 	conllu_in = shuffle_cut_conllu(conllu_in,limit=TRAIN_LIMIT)

		train_feats,_,_,_,_ = read_conll(conllu_in,mode="seg",genre_pat=None,as_text=True,cap=None,char_bytes=False)


		featurekeys = ["label", "word", "pos", "cpos", "head", "head_dist", "deprel", "case", "tok_len", "depchunk", "conj", "s_len", "s_type", "sent_doc_percentile", "parentclauses"]

		for lnum, line in enumerate(train_feats):
			lfeatures = [line[x] for x in featurekeys]
			lfeatures[0] = int(lfeatures[0]!="_")


			firstletter = str(lfeatures[1][0].encode("utf8")[0]) if self.lang == "zho" else lfeatures[1][0]
			firstisupper = int(firstletter.upper() == firstletter)
			firstisconsonant = len(re.findall('[^'+vowels+']', firstletter))
			firstisvowel = len(re.findall('['+vowels+']', firstletter))
			firstisdigit = len(re.findall('[0-9]', firstletter))
			firstisspecial = len(re.findall('[^A-Za-z0-9]', firstletter))

			lastletter = str(lfeatures[1][-1].encode("utf8")[-1]) if self.lang == "zho" else lfeatures[1][-1]
			lastisupper = int(lastletter.upper() == lastletter)
			lastisconsonant = len(re.findall('[^'+vowels+']', lastletter))
			lastisvowel = len(re.findall('['+vowels+']', lastletter))
			lastisdigit = len(re.findall('[0-9]', lastletter))
			lastisspecial = len(re.findall('[^A-Za-z0-9]', lastletter))

			numconsonants = len(re.findall('[^'+vowels+']', lfeatures[1]))
			numvowels = len(re.findall('['+vowels+']', lfeatures[1]))
			numdigits = len(re.findall('[0-9]', lfeatures[1]))
			numspecials = len(re.findall('[^A-Za-z0-9]', lfeatures[1]))


			numeric_entries.append(
				[
				 numconsonants, numvowels, numdigits, numspecials,
				  firstisupper, firstisconsonant, firstisvowel, firstisdigit, firstisspecial,
				  lastisupper, lastisconsonant, lastisspecial,
					lfeatures[4], lfeatures[5], lfeatures[8], lfeatures[11], lfeatures[13]
				])

			nonnumeric_entries.append([lfeatures[2],lfeatures[3], firstletter,lastletter,
									   lfeatures[6], lfeatures[7], lfeatures[9], lfeatures[10], lfeatures[12], re.sub(r'^([^\|]*\|[^\|]*)\|.*', r'\1', lfeatures[14])
									   ])

			goldseg_entries.append(lfeatures[0])


		# featurekeys index reference: 0 label, 1 word, 2 pos, 3 cpos, 4 head, 5 head_dist, 6 deprel, 7 case, 8 tok_len, 9 depchunk, 10 conj, 11 s_len, 12 s_type, 13 sent_doc_percentile, 14 parentclauses


		numeric_colnames = ['numconsonants', 'numvowels', 'numdigits', 'numspecials',
								    'firstisupper', 'firstisconsonant', 'firstisvowel', 'firstisdigit','firstisspecial',
								    'lastisupper', 'lastisconsonant','lastisspecial',
						   featurekeys[4], featurekeys[5], featurekeys[8], featurekeys[11], featurekeys[13]
							]
		nonnumeric_colnames = ['gold_pos', 'gold_cpos','firstletter', 'lastletter',
							  featurekeys[6], featurekeys[7], featurekeys[9], featurekeys[10], featurekeys[12], featurekeys[14]
							   ]
		numeric_entries = np.array(numeric_entries, dtype=np.float32)
		nonnumeric_entries = np.array(nonnumeric_entries)


		# Convert categorical columns to dummy (one-hot) vectors and combine them with the numeric columns via cattodummy
		unigram_entries, unigram_colnames = self.cattodummy(nonnumeric_entries, nonnumeric_colnames, numeric_entries, numeric_colnames)


		sys.stderr.write("o unigram dataframe ready\n")


		if neighborwindowsize >= 3:
			sys.stderr.write("o duplicating to %d-gram...\n" %neighborwindowsize)

			prev_entries, prev_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'prev')
			next_entries, next_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'next')
			ngram_entries, ngram_colnames = self.numpyconcat2df(prev_entries, prev_colnames, unigram_entries, unigram_colnames)
			ngram_entries, ngram_colnames = self.numpyconcat2df(ngram_entries,ngram_colnames, next_entries, next_colnames)
			del prev_entries, prev_colnames, next_colnames, next_entries

			if neighborwindowsize >=5:
				prevprev_entries, prevprev_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'prevprev')
				nextnext_entries, nextnext_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'nextnext')
				ngram_entries, ngram_colnames = self.numpyconcat2df(prevprev_entries, prevprev_colnames, ngram_entries,
																   ngram_colnames)
				ngram_entries, ngram_colnames  = self.numpyconcat2df(ngram_entries, ngram_colnames, nextnext_entries,
																   nextnext_colnames)
				del prevprev_entries, prevprev_colnames, nextnext_colnames, nextnext_entries

				if neighborwindowsize == 7:
					prevprevprev_entries, prevprevprev_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'prevprevprev')
					nextnextnext_entries, nextnextnext_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'nextnextnext')
					ngram_entries, ngram_colnames = self.numpyconcat2df(prevprevprev_entries, prevprevprev_colnames, ngram_entries,
																   ngram_colnames)
					ngram_entries, ngram_colnames = self.numpyconcat2df(ngram_entries, ngram_colnames, nextnextnext_entries,
																   nextnextnext_colnames)
					del prevprevprev_entries, prevprevprev_colnames, nextnextnext_colnames, nextnextnext_entries

		else:
			ngram_entries, ngram_colnames = unigram_entries, unigram_colnames

		del unigram_entries, numeric_colnames, numeric_entries

		return ngram_entries, ngram_colnames, goldseg_entries, unigram_colnames
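
copyforprevnext and numpyconcat2df are defined elsewhere; as a rough sketch of the underlying idea (an assumption, not the actual implementation), neighbor features can be built by shifting the unigram feature matrix one row up or down and renaming the columns:

# Sketch only: build "prev"/"next" copies of a feature matrix by row shifting
import numpy as np

def shift_features(entries, colnames, direction="prev"):
	shifted = np.roll(entries, 1 if direction == "prev" else -1, axis=0)
	# The first (or last) row has no real neighbor; zero it out as padding
	if direction == "prev":
		shifted[0, :] = 0
	else:
		shifted[-1, :] = 0
	return shifted, [direction + "_" + c for c in colnames]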
Example #7
		# Run test
		segmenter = LRSegmenter(lang=lang,model=corpusname,windowsize=args.windowsize)
		if args.verbose:
			segmenter.verbose = True

		if segmenter.verbose:
			sys.stderr.write("o Processing corpus "+corpusname+"\n")


		if args.mode == "train":
			# When running from CLI, we always train (predict mode is done on imported class)
			segmenter.train(data_folder + os.sep+ corpusname + os.sep + corpusname + "_train.conll",as_text=False,standardization=args.standardization,multitrain=args.multitrain)

		# Now evaluate model
		predictions, probas = zip(*segmenter.predict(data_folder + os.sep+ corpusname + os.sep +corpusname + "_dev.conll",
													 as_text=False,standardization=args.standardization,do_tag=True))

		# Get gold labels for comparison
		conllu_in = io.open(data_folder + os.sep+ corpusname + os.sep +corpusname + "_dev.conll", 'r', encoding='utf8').read()
		devfeats,_,_,_,_ = read_conll(conllu_in, mode="seg", genre_pat=None, as_text=True, cap=None, char_bytes=False)
		labels = [int(x["label"]!='_') for x in devfeats]

		# give dev F1 score
		from sklearn.metrics import classification_report, confusion_matrix
		print(classification_report(labels, predictions, digits=6))
		print(confusion_matrix(labels, predictions))

		elapsed = time.time() - start_time
		sys.stderr.write(str(timedelta(seconds=elapsed)) + "\n\n")

Example #8
data_dir = os.path.abspath(lib + os.sep + ".." + os.sep + ".." + os.sep +
                           "data_parsed")

corpora = os.listdir(data_dir)
corpora = [c for c in corpora if os.path.isdir(os.path.join(data_dir, c))]

log = io.open("baseline_parsed.log", 'w', encoding="utf8")
table_out = []

all_f = []

for corpus in corpora:
    if "pdtb" in corpus:
        continue
    dev = glob(data_dir + os.sep + corpus + os.sep + "*_test.conll")[0]
    train_feats, vocab, toks, firsts, lasts = read_conll(dev)
    labels = [int(t["label"] != "_") for t in train_feats]
    baseline_preds = [int(t["wid"] == 1) for t in train_feats]
    conf_mat = confusion_matrix(labels, baseline_preds)
    true_positive = conf_mat[1][1]
    false_positive = conf_mat[0][1]
    false_negative = conf_mat[1][0]
    prec = true_positive / (true_positive + false_positive)
    rec = true_positive / (true_positive + false_negative)
    f1 = 2 * prec * rec / (prec + rec)
    log.write("corpus: " + corpus + "\n")
    log.write("=" * 10 + "\n")
    log.write(str(confusion_matrix(labels, baseline_preds)) + "\n")
    log.write("P: " + str(prec) + "\n")
    log.write("R: " + str(rec) + "\n")
    log.write("F1: " + str(f1) + "\n\n")
Example #9
    def read_data(self, infile, size, as_text, rare_thresh, chosen_feats=None):

        cap = 3 * size if size is not None else None
        train_feats, vocab, toks, firsts, lasts = read_conll(
            infile,
            genre_pat=self.genre_pat,
            mode="seg",
            cap=cap,
            char_bytes=self.lang == "zho",
            as_text=as_text)
        vocab = Counter(vocab)
        top_n_words = vocab.most_common(rare_thresh)
        top_n_words, _ = zip(*top_n_words)
        for tok in train_feats:
            if tok["word"] not in top_n_words:
                tok["word"] = tok["pos"]

        tokens_by_abs_id = self.traverse_trees(train_feats)

        data, headers = self.n_gram(train_feats, tokens_by_abs_id)

        # Features to use for all n-gram tokens
        num_labels = [
            "head_dist", "left_span", "right_span", "samepar_left", "tok_len"
        ]
        cat_labels = [
            "case", "closest_left", "closest_right", "deprel", "farthest_left",
            "farthest_right", "pos", "word", "morph", "cpos", "depchunk"
        ]

        pref_cat = []
        pref_num = []
        for pref in ["mn2", "mn1", "par", "par_par", "pl1", "pl2"]:
            pref_cat += [pref + "_" + h for h in cat_labels]
            pref_num += [pref + "_" + h for h in num_labels]

        # Features only needed for node token
        cat_labels += [
            "genre"
        ] + pref_cat  #+ ["heading_first","heading_last"]#+ ["s_type"]
        num_labels += [
            "dist2end", "sent_doc_percentile", "tok_id", "wid", "quote", "rank"
        ] + pref_num  # + ["bracket"]
        num_labels += ["par_quote",
                       "par_par_quote"]  #,"par_bracket","par_par_bracket"]

        # Use specific feature subset
        if chosen_feats is not None:
            new_cat = []
            new_num = []
            for feat in chosen_feats:
                if feat in cat_labels:
                    new_cat.append(feat)
                elif feat in num_labels:
                    new_num.append(feat)
            cat_labels = new_cat
            num_labels = new_num

        data = pd.DataFrame(data, columns=headers)
        data_encoded, multicol_dict = self.multicol_fit_transform(
            data, pd.Index(cat_labels))

        data_x = data_encoded[cat_labels + num_labels].values
        data_y = np.where(data_encoded['label'] == "_", 0, 1)

        return data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words
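
multicol_fit_transform is implemented elsewhere in the class; a minimal sketch of what such a per-column label-encoding step could look like (an assumption about its behavior, inferred from how multicol_dict["encoder_dict"] is used in the prediction example below):

# Sketch of per-column label encoding; not the actual multicol_fit_transform
from sklearn.preprocessing import LabelEncoder

def fit_transform_columns(df, columns):
    encoders = {}
    out = df.copy()
    for col in columns:
        enc = LabelEncoder()
        out[col] = enc.fit_transform(out[col].astype(str))
        encoders[col] = enc
    return out, {"columns": list(columns), "encoder_dict": encoders}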
Example #10
    def predict(self, infile, eval_gold=False, as_text=True):
        """
        Predict sentence splits using an existing model

        :param infile: File in DISRPT shared task *.tok or *.conll format (sentence breaks will be ignored in .conll)
        :param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
        :param as_text: Boolean, whether the input is a string rather than a file name to read
        :return: zipped (prediction, probability) pairs per token if eval_gold is False; otherwise prints evaluation metrics, writes a diff to gold, and returns (conf_mat, prec, rec, f1)
        """

        if self.model is None:  # Try default model location
            model_path = ".." + os.sep + ".." + os.sep + "models" + os.sep + "subtreeseg.pkl"
        else:
            model_path = self.model

        clf, num_labels, cat_labels, multicol_dict, top_n_words, firsts, lasts = joblib.load(
            model_path)

        feats, _, toks, _, _ = read_conll(infile,
                                          genre_pat=self.genre_pat,
                                          mode="seg",
                                          as_text=as_text,
                                          char_bytes=self.lang == "zho")
        tokens_by_abs_id = self.traverse_trees(feats)
        feats, headers = self.n_gram(feats, tokens_by_abs_id, dummies=False)

        temp = []
        headers_with_oov = [
            "first", "last", "deprel", "closest_left", "closest_right",
            "farthest_left", "farthest_right", "pos", "cpos", "morph",
            "s_type", "depchunk"
        ]
        for pref in ["mn2", "mn1", "par", "par_par", "pl1", "pl2"]:
            temp += [pref + "_" + h for h in headers_with_oov]
        headers_with_oov += temp

        genre_warning = False
        for i, header in enumerate(headers):
            if header in headers_with_oov and header in cat_labels:
                for item in feats:
                    if item[i] not in multicol_dict["encoder_dict"][
                            header].classes_:
                        item[i] = "_"
            elif header == "genre" and "genre" in cat_labels:
                for item in feats:
                    if item[i] not in multicol_dict["encoder_dict"][
                            "genre"].classes_:  # New genre not in training data
                        if not genre_warning:
                            sys.stderr.write(
                                "! WARN: Genre not in training data: " +
                                item[i] + "; suppressing further warnings\n")
                            genre_warning = True
                        item[i] = "_"
            elif header.endswith("word") and header in cat_labels:
                for item in feats:
                    # Replace rare words and words never seen before in this position with POS
                    if item[i] not in top_n_words or item[
                            i] not in multicol_dict["encoder_dict"][
                                header].classes_:
                        pos_col = headers.index(header.replace("word", "pos"))
                        if item[pos_col] in multicol_dict["encoder_dict"][
                                header].classes_:
                            item[i] = item[pos_col]
                        else:
                            item[i] = "_"
        data = feats
        data = pd.DataFrame(data, columns=headers)
        data_encoded = self.multicol_transform(
            data,
            columns=multicol_dict["columns"],
            all_encoders_=multicol_dict["all_encoders_"])

        data_x = data_encoded[cat_labels + num_labels].values

        probas = clf.predict_proba(data_x)
        probas = [p[1] for p in probas]
        preds = [int(p > 0.5) for p in probas]

        for i, p in enumerate(preds):
            if data["tok_id"].values[
                    i] == 1:  # Ensure tok_id 1 is always a segment start
                preds[i] = 1

        if eval_gold:
            gold = np.where(data_encoded['label'] == "_", 0, 1)
            conf_mat = confusion_matrix(gold, preds)
            sys.stderr.write(str(conf_mat) + "\n")
            true_positive = conf_mat[1][1]
            false_positive = conf_mat[0][1]
            false_negative = conf_mat[1][0]
            prec = true_positive / (true_positive + false_positive)
            rec = true_positive / (true_positive + false_negative)
            f1 = 2 * prec * rec / (prec + rec)
            sys.stderr.write("P: " + str(prec) + "\n")
            sys.stderr.write("R: " + str(rec) + "\n")
            sys.stderr.write("F1: " + str(f1) + "\n")
            with io.open("diff.tab", 'w', encoding="utf8") as f:
                for i in range(len(gold)):
                    f.write("\t".join([toks[i],
                                       str(gold[i]),
                                       str(preds[i])]) + "\n")

            return conf_mat, prec, rec, f1
        else:
            return zip(preds, probas)
Example #11
	def tune(self, trainfile, devfile, max_evals=10, as_text=False):

		train_feats, _, _, _, _ = read_conll(trainfile,genre_pat=self.genre_pat,as_text=as_text)
		train_for_rnn, scalers = feats2rnn(train_feats)

		dev_feats, _, _, _, _ = read_conll(devfile,genre_pat=self.genre_pat,as_text=as_text)
		dev_for_rnn, scalers = feats2rnn(dev_feats)

		with io.open(self.corpus_dir + os.sep + "dev."+self.ext,'w',encoding="utf8",newline="\n") as f:
			f.write(dev_for_rnn)

		# NCRFpp expects a test file, we reuse dev
		with io.open(self.corpus_dir + os.sep + "test."+self.ext,'w',encoding="utf8",newline="\n") as f:
			f.write(dev_for_rnn)

		# TRAIN MODEL ON ALL TRAIN
		with io.open(self.corpus_dir + os.sep + "train."+self.ext,'w',encoding="utf8",newline="\n") as f:
			f.write(train_for_rnn)

		config = ncrf_dir + os.sep + self.segtype + ".train"+self.auto+".config"

		def objective(params):

			# sys.stderr.write(str(params))

			data = Data()
			data.read_config(config)

			data.HP_batch_size = int(params['batch_size'])
			data.HP_lr = float(params['lr'])
			# data.word_emb_dim=int(params['word_emb_dim'])
			data.char_emb_dim=int(params['char_emb_dim'])
			data.word_feature_extractor=params['word_seq_feature']
			data.char_feature_extractor=params['char_seq_feature']
			#data.optimizer=params['optimizer']
			data.HP_cnn_layer=int(params['cnn_layer'])
			#data.HP_char_hidden_dim=int(params['char_hidden_dim'])
			data.HP_hidden_dim=int(params['hidden_dim'])
			data.HP_dropout=float(params['dropout'])
			data.HP_lstm_layer=int(params['lstm_layer'])
			data.average_batch_loss=str2bool(params['ave_batch_loss'])

			p = StdOutFilter4Tune()
			p.start()

			ret, best_dev = ncrf(config=None, data=data)

			p.end()
			sys.stdout.write("F1 {:.3f} params {}\n".format(best_dev, params))


			if ret == 1:
				return {'loss': -best_dev, 'status': STATUS_OK }
			else:
				return {'status': STATUS_FAIL }

		space = {
			'batch_size': scope.int(hp.quniform('batch_size', 10, 100, 10)),
			'lr': hp.quniform('lr', 0.003, 0.18, 0.001),
			# 'word_emb_dim': scope.int(hp.quniform('word_emb_dim', 100, 300, 10)),
			'char_emb_dim': scope.int(hp.quniform('char_emb_dim', 30, 70, 10)),
			'word_seq_feature': hp.choice('word_seq_feature', ["LSTM","CNN"]),
			'char_seq_feature': hp.choice('char_seq_feature', ["LSTM","CNN"]),
			#'optimizer': hp.choice('optimizer', ["SGD","AdaGrad","AdaDelta","RMSProp","Adam"]),
			'optimizer': hp.choice('optimizer', ["AdaGrad"]),
			'cnn_layer': scope.int(hp.quniform('cnn_layer', 1, 8, 1)),
			'char_hidden_dim': scope.int(hp.quniform('char_hidden_dim', 50, 200, 10)),
			'hidden_dim': scope.int(hp.quniform('hidden_dim', 100, 300, 20)),
			'dropout': hp.quniform('dropout', 0.2, 0.8, 0.1),
			'lstm_layer': scope.int(hp.quniform('lstm_layer', 1, 5, 1)),
			'ave_batch_loss': hp.choice('ave_batch_loss', ["True","False"])
		}

		best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals)

		best_params = space_eval(space,best_params)

		with io.open(script_dir + os.sep + "params" + os.sep + "RNNSegmenter"+self.auto+"_best_params.tab",'a',encoding="utf8") as bp:
			corpus = os.path.basename(trainfile).split("_")[0]
			for k, v in best_params.items():
				bp.write("\t".join([corpus, 'RNNClassifier', k, str(v)])+"\n")
		return best_params
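
A minimal call sketch for the tuning routine above (the instantiation and file names are assumptions for illustration):

# Hypothetical: run a short hyperparameter search and print the winning settings
seg = RNNSegmenter()
best = seg.tune("corpus_train.conll", "corpus_dev.conll", max_evals=20, as_text=False)
print(best)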