def train_xgboost_lr(data_path, vectorizer_path=None, xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None, feature_encoder_path=None,
                     feature_type='tfidf_char', col_sep='\t'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path, feature_encoder_path)
    # fit
    model.train_model(X_train, y_train)
    # evaluate
    label_pred = model.predict(X_val)
    simple_evaluate(y_val, label_pred)
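# A minimal usage sketch for the trainer above (the file names below are
# hypothetical placeholders, not artifacts shipped with the project); the data
# file is expected to hold "label<TAB>segmented text" rows.
if __name__ == '__main__':
    train_xgboost_lr('data/train_seg_sample.txt',
                     vectorizer_path='output/vectorizer.pkl',
                     xgblr_xgb_model_path='output/xgblr_xgb.pkl',
                     xgblr_lr_model_path='output/xgblr_lr.pkl',
                     feature_encoder_path='output/xgblr_feature_encoder.pkl',
                     feature_type='tfidf_char',
                     col_sep='\t')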
def infer_classic(model_save_path, test_data_path, thresholds=0.5, pred_save_path=None,
                  vectorizer_path=None, col_sep=',', num_classes=2, feature_type='tf'):
    # load model
    model = load_pkl(model_save_path)
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=vectorizer_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    if num_classes == 2:
        # binary classification
        label_pred_probas = model.predict_proba(data_feature)[:, 1]
        label_pred = label_pred_probas > thresholds
    else:
        label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
def infer_classic(model_type='xgboost_lr', model_save_path='', label_vocab_path='',
                  test_data_path='', pred_save_path='', feature_vec_path='',
                  col_sep='\t', feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save_predict_result(pred_output, ture_labels=None,
                        pred_save_path=pred_save_path, data_set=data_set)
    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("error. no true labels")
    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict, id_label)
        print(pred_labels[:5])
def __find_candidates_to_anaphora_in_chapter(chapter, total_words, candidates,
                                             stop_word_check):
    """ Finds candidates to anaphora in the specified chapter """
    word_count = 0
    i = 0
    while i < len(chapter.sentences):
        if len(chapter.sentences[i]) > 1:
            candidate = Feature('anaphora')
            candidate.add_word(total_words + word_count)
            candidate.add_context(total_words + word_count,
                                  total_words + word_count + len(chapter.sentences[i]) - 1)
            word_count += len(chapter.sentences[i])
            first_anaphora_word = chapter.sentences[i][0].lower()
            i += 1
            if stop_word_check(first_anaphora_word):
                continue
            context_length = 1
            while i < len(chapter.sentences) and \
                    chapter.sentences[i][0].lower() == first_anaphora_word:
                candidate.extend_context(total_words + word_count +
                                         len(chapter.sentences[i]) - 1)
                candidate.add_word(total_words + word_count)
                word_count += len(chapter.sentences[i])
                i += 1
                context_length += 1
            if context_length > 1:
                if candidate not in candidates:
                    candidates.append(candidate)
        else:
            if len(chapter.sentences[i]) > 0 and chapter.sentences[i][0]:
                word_count += len(chapter.sentences[i])
            i += 1
    return word_count
def infer_deep_model(model_type='cnn', data_path='', model_save_path='',
                     label_vocab_path='', max_len=300, batch_size=128, col_sep='\t',
                     pred_save_path=None):
    from keras.models import load_model
    # load data content
    data_set, true_labels = data_reader(data_path, col_sep)
    # init feature
    # the han model needs a [doc, sentence] shaped feature (rank 3);
    # other models use a [sentence] shaped feature (rank 2)
    if model_type == 'han':
        feature_type = 'doc_vectorize'
    else:
        feature_type = 'vectorize'
    feature = Feature(data_set, feature_type=feature_type, is_infer=True, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    model = load_model(model_save_path)
    # predict; in keras, predict_proba is the same as predict
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [prob.argmax() for prob in pred_label_probs]
    pred_labels = [id_label[i] for i in pred_labels]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output, ture_labels=None,
                        pred_save_path=pred_save_path, data_set=data_set)
    if true_labels:
        # evaluate
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s' %
                         (label, id_label[prob.argmax()], prob.max()))
        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
def _get_feature(self, word_vocab):
    # extract features
    print("feature_type : %s" % self.feature_type)
    print("seg_contents:")
    print(self.seg_contents[:2])
    feature = Feature(data=self.seg_contents, feature_type=self.feature_type,
                      feature_vec_path=self.feature_vec_path, word_vocab=word_vocab)
    # get data feature
    return feature.get_feature()
def test_returns_none_if_database_and_enum_are_consistent(self, app):
    # Given
    find_all_features = MagicMock()
    features = []
    for feature_toggle in FeatureToggle:
        feature = Feature()
        feature.populate_from_dict({'name': feature_toggle})
        features.append(feature)
    find_all_features.return_value = features

    # When / Then
    assert check_feature_consistency(find_all_features) is None
def install_features():
    Feature.query.delete()
    features = []
    for toggle in FeatureToggle:
        feature = Feature()
        feature.populate_from_dict({
            'description': toggle.value,
            'name': toggle,
            'is_active': True
        })
        features.append(feature)
    PcObject.save(*features)
def context_intersection(first: Feature, second: Feature) -> list:
    """ Returns context intersection between two features

    :param first: first feature
    :param second: second feature
    :return: context intersection
    """
    first_beg, first_end = first.context_begin(), first.context_end()
    second_beg, second_end = second.context_begin(), second.context_end()
    if first_beg > second_end or second_beg > first_end:
        return []
    return [max(first_beg, second_beg), min(first_end, second_end)]
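# Worked example for context_intersection, assuming the positional constructor
# Feature(type, words, context) used elsewhere in this project: word spans
# [2, 10] and [7, 15] overlap on [7, 10]; disjoint spans yield an empty list.
if __name__ == '__main__':
    a = Feature('anaphora', [2, 3], [2, 10])
    b = Feature('epiphora', [12, 15], [7, 15])
    assert context_intersection(a, b) == [7, 10]
    assert context_intersection(Feature('x', [0], [0, 4]),
                                Feature('y', [9], [9, 12])) == []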
def find(anaphoras: list, epiphoras: list) -> list:
    """ Finds symploce between previously found anaphoras and epiphoras

    :param anaphoras: list with anaphoras
    :param epiphoras: list with epiphoras
    :return: list with symploces (Feature objects)
    """
    res = list()
    for anaphora in anaphoras:
        for epiphora in epiphoras:
            intersection = context_intersection(anaphora, epiphora)
            if intersection:
                inter_start, inter_end = intersection
                anaphora_words = [word for word in anaphora.words()
                                  if inter_start <= word <= inter_end]
                epiphora_words = [word for word in epiphora.words()
                                  if inter_start <= word <= inter_end]
                if anaphora_words and epiphora_words:
                    if anaphora_words[-1] > epiphora_words[0]:
                        res.append(
                            Feature("symploce",
                                    words=sorted(anaphora_words + epiphora_words),
                                    context=intersection))
    return res
def __parse_conjunctive_adverbs_polysyndeton(sent: list, start_count: int,
                                             language: str) -> list:
    """
    :param sent: sentence as list of words
    :param start_count: position of first word in sentence in document
    :return: list of Features
    """
    res = []
    sentence = [w.lower() for w in sent]
    for conj_adv in conjunctive_adverbs(language=language):
        # list of start words of candidates of repeating conjunctive adverbs
        candidates_start = []
        for i in range(len(sentence) - len(conj_adv) + 1):
            if tuple(sentence[i:i + len(conj_adv)]) == conj_adv:
                candidates_start.append(i)
        if len(candidates_start) >= MIN_N_OF_REPETITIONS:
            res.append(
                Feature(feature_type="polysyndeton",
                        words=[start_count + c_pos + con_len
                               for c_pos in candidates_start
                               for con_len in range(len(conj_adv))],
                        context=[start_count, start_count + len(sentence) - 1]))
    return res
def __parse_pair_conjunctions_polysyndeton(sent: list, start_count: int,
                                           language: str) -> list:
    """
    :param sent: sentence as list of words
    :param start_count: position of first word in sentence in document
    :return: list of Features
    """
    res = []
    sentence = [w.lower() for w in sent]  # list of words
    for conj_word_1, conj_word_2 in pair_conjunctions(language=language):
        positions = set()  # polysyndeton words positions
        first_word_met, first_word_pos = False, -1
        for i, word in enumerate(sentence):
            if word == conj_word_1:
                first_word_met, first_word_pos = True, i
            elif word == conj_word_2 and first_word_met:
                positions.update({first_word_pos, i})
                first_word_met, first_word_pos = False, -1
        if len(positions) // 2 >= MIN_N_OF_REPETITIONS:
            res.append(
                Feature(feature_type="polysyndeton",
                        words=[start_count + i for i in positions],
                        context=[start_count, start_count + len(sentence) - 1]))
    return res
def __find_anadiplosis_inside_chapter(chapter: list, start_count: int,
                                      stop_word_check) -> list:
    """
    :param chapter: chapter as list of sentences (as list of words)
    :param start_count: index of first word in chapter
    :return: list of Features
    """
    threshold = 1  # minimal metric
    res = []
    word_count = start_count
    for i in range(len(chapter) - 1):
        candidate = __test_sentences_for_anadiplosis(chapter[i], chapter[i + 1],
                                                     stop_word_check)
        if candidate['metric'] >= threshold:
            res.append(
                Feature(feature_type="anadiplosis",
                        words=[n + word_count + len(chapter[i])
                               for n in range(-len(candidate['words']),
                                              len(candidate['words']))],
                        context=[word_count,
                                 word_count + len(chapter[i]) + len(chapter[i + 1]) - 1]))
        word_count += len(chapter[i])
    return res
def get_features(auth: AuthData):
    """ This handler returns the features visible to a service, or all features. """
    if auth.user:
        return Feature.all().serialize()
    features = _set_visible_only(["id", "version"], auth.service.features)
    return features.serialize()
def _find_feature(id):
    """ Find a feature by id """
    feature = Feature.find(id)
    if not feature:
        msg = "Feature not found."
        raise exceptions.NotFound({"message": msg})
    return feature
def __find_between_sentences(self, first_sentence, second_sentence):
    if first_sentence[0] == second_sentence[-1] and \
            first_sentence[-1] == second_sentence[0]:
        self._features.append(
            Feature('chiasmus',
                    [self.word_counter,
                     self.word_counter + len(first_sentence) - 1,
                     self.word_counter + len(first_sentence),
                     self.word_counter + len(first_sentence) + len(second_sentence) - 1],
                    [self.word_counter,
                     self.word_counter + len(first_sentence) + len(second_sentence) - 1]))
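# Illustrative example of the mirror check above (hypothetical word lists, not
# taken from the project's test data):
#   first_sentence  = ['fair', 'is', 'foul']
#   second_sentence = ['foul', 'is', 'fair']
# first_sentence[0] == second_sentence[-1] ('fair') and
# first_sentence[-1] == second_sentence[0] ('foul'), so a 'chiasmus' Feature
# spanning both sentences is appended to self._features.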
def create_feature(featureData: FeatureCreate):
    """ Create a new feature """
    _check_id_format(featureData.id)
    _check_version_format(featureData.version)
    if Feature.find(featureData.id):
        msg = "The feature id isn't available. Please try another."
        return http.JSONResponse({"message": msg}, status_code=409)
    _check_services_exists(featureData.services)
    feature = Feature.create(**featureData)
    msg = "Feature created successfully."
    log.info(f"{msg} - ID: {feature.id}")
    feature.update_services(featureData.services)
    headers = {"Content-Location": f"/features/{feature.id}"}
    return http.JSONResponse({"message": msg}, status_code=201, headers=headers)
def load(self):
    """ Loads Document from specified file """
    with open(self.file_name, "r", encoding='utf8') as file:
        json_doc = json.loads(file.read())
        self.language = json_doc["metadata"]["language"]
        features = [
            Feature(feature['type'], feature['words'], feature['context'],
                    self.__letters_to_int(feature['letters']),
                    feature['transcription'])
            for feature in json_doc["features"]
        ]
        chapters = self.__load_chapters(json_doc["text"])
        stop_words = json_doc["stop_words"]
        return Document(chapters, self.language, features, stop_words)
def infer_xgboost_lr(test_data_path, vectorizer_path=None, xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None, feature_encoder_path=None, col_sep='\t',
                     pred_save_path=None, feature_type='tfidf_char'):
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=vectorizer_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path, feature_encoder_path)
    # predict
    label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
def train_classic(model_type, data_path=None, pr_figure_path=None, model_save_path=None,
                  vectorizer_path=None, col_sep=',', thresholds=0.5, num_classes=2,
                  feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, thresholds=thresholds, num_classes=num_classes,
         model_type=model_type, pr_figure_path=pr_figure_path)
def __parse_simple_conjunctions_polysyndeton(sent: list, start_count: int,
                                             language: str) -> list:
    """
    :param sent: sentence as list of words
    :param start_count: position of first word in sentence in document
    :return: list of Features
    """
    res = []
    sentence = [w.lower() for w in sent]
    for conj in conjunctions(language=language):
        if sentence.count(conj) >= MIN_N_OF_REPETITIONS:
            res.append(
                Feature(feature_type="polysyndeton",
                        words=[start_count + i for i, w in enumerate(sentence)
                               if w == conj],
                        context=[start_count, start_count + len(sentence) - 1]))
    return res
def __parse_diacope_inside_sentence(sent: list, start_count: int,
                                    excluded_words: set) -> list:
    """ Parses diacope from the specified sentence

    :param sent: sentence to find diacope in (as list of words)
    :param start_count: index of first word in sentence
    :param excluded_words: set of words which can't be part of diacope
    :return: list with diacope in sentence
    """
    min_diacope_power = 2
    res = []
    for word in set(sent) - excluded_words:
        word_positions = [i for i, w in enumerate(sent) if w == word]
        if len(word_positions) >= min_diacope_power:
            diacope_positions = __diacope_words(word_positions)
            if len(diacope_positions) >= min_diacope_power:
                res.append(Feature("diacope",
                                   [start_count + pos for pos in diacope_positions],
                                   [start_count, start_count + len(sent) - 1]))
    res = __merge_diacope_in_nearby_words_inside_sentence(res)
    return res
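# Worked example (hypothetical input, and assuming __diacope_words keeps both
# occurrences here): for sent = ['bond', 'james', 'bond'] with start_count=0 and
# 'bond' not in excluded_words, 'bond' occurs at positions [0, 2], which meets
# min_diacope_power, so a Feature('diacope', [0, 2], [0, 2]) is produced before
# the final merge step.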
def parse_features_collection_json():
    # path_to_file = '/Users/akratovich/projects/python/python_stud/parser/real_json_sample.json'
    path_to_file = '/Users/akratovich/projects/python/sf-city-lots-json/citylots.json'
    parsed_features = []
    features_collection = Reader.read_json(path_to_file)
    for feature_item in features_collection['features']:
        tmp_coord_list = __get_points(feature_item)
        tmp_figure_type = __get__geometry_type(feature_item)
        tmp_geometry = Geometry(tmp_figure_type, tmp_coord_list)
        tmp_feature_type = __get_feature_type(feature_item)
        tmp_prop_list = __get_prop_feature_prop_list(feature_item)
        feature = Feature(tmp_feature_type, tmp_prop_list, tmp_geometry)
        # print(feature.__str__())
        parsed_features.append(feature)
    print('Parsed {0} features'.format(len(parsed_features)))
    return parsed_features
def __find_epizeuxis_inside_sentence(sentence: Sentence, start_count: int,
                                     stop_word_check) -> list:
    """Finds epizeuxis in the given sentence

    :param sentence: sentence to find epizeuxis in
    :param start_count: index of first word in sentence
    :param stop_word_check: function checking if word is stop word
    :return: epizeuxis in given sentence (as list of Features)"""
    res = []
    i = 0
    sentence = [word.lower() for word in sentence.words_list]
    while i < len(sentence) - 1:
        repeat_length, n_of_repeats = 0, 0
        for length in range(1, len(sentence) // 2 + 1):
            if sentence[i:i + length] == sentence[i + length:i + length * 2] and \
                    True not in [stop_word_check(word) for word in sentence[i:i + length]]:
                repeat_length, n_of_repeats = length, 2
                break
        if repeat_length:
            for repeats in range(3, len(sentence[i + repeat_length * 2:]) // repeat_length):
                if sentence[i:i + repeat_length] != \
                        sentence[i + repeat_length * (repeats - 1):i + repeat_length * repeats]:
                    break
                n_of_repeats += 1
            res.append(
                Feature("epizeuxis",
                        words=[start_count + i + j
                               for j in range(repeat_length * n_of_repeats)],
                        context=[start_count, start_count + len(sentence) - 1]))
            i += repeat_length * n_of_repeats
        else:
            i += 1
    return res
def __find_epizeuxis_between_sentences(chapter: Chapter, start_count: int,
                                       stop_word_check) -> list:
    """Finds epizeuxis between sentences in the given chapter

    :param chapter: chapter to find epizeuxis in
    :param start_count: index of first word in chapter
    :param stop_word_check: function checking if word is stop word
    :return: epizeuxis in given chapter (as list of Features)"""
    res = []
    current_feature = None
    chapter = [[word.lower() for word in sentence.words_list]
               for sentence in chapter.sentences]
    for i in range(len(chapter) - 1):
        if chapter[i] == chapter[i + 1] and True not in [
                stop_word_check(word) for word in chapter[i + 1]]:
            if current_feature:
                current_feature.extend_context(start_count + len(chapter[i + 1]))
                for j in range(len(chapter[i + 1])):
                    current_feature.add_word(start_count + len(chapter[i]) + j)
            else:
                current_feature = Feature(
                    "epizeuxis",
                    words=[start_count + j
                           for j in range(len(chapter[i]) + len(chapter[i + 1]))],
                    context=[start_count,
                             start_count + len(chapter[i]) + len(chapter[i + 1])])
                res.append(current_feature)
        else:
            current_feature = None
        start_count += len(chapter[i])
    return res
def __find_epiphora_inside_chapter(chapter: Chapter, start_count: int,
                                   stop_word_check) -> list:
    """ Parses epiphora from chapter

    :param chapter: chapter to find epiphora from (list of sentences as list of words)
    :param start_count: index of first word in chapter
    :param stop_word_check: function checking if word is stop word
    :return: list with epiphora (Feature objects)
    """
    res = []
    word_count = start_count
    current_feature = None
    for i in range(len(chapter.sentences) - 1):
        if __test_sentences_for_epiphora(chapter[i], chapter[i + 1], stop_word_check):
            if current_feature is None:
                current_feature = Feature(
                    "epiphora",
                    words=[word_count + len(chapter.sentences[i]) - 1,
                           word_count + len(chapter.sentences[i]) +
                           len(chapter.sentences[i + 1]) - 1],
                    context=[word_count,
                             word_count + len(chapter.sentences[i]) +
                             len(chapter.sentences[i + 1]) - 1])
                res.append(current_feature)
            else:
                current_feature.add_word(word_count + len(chapter[i]) +
                                         len(chapter[i + 1]) - 1)
                current_feature.extend_context(word_count + len(chapter[i]) +
                                               len(chapter[i + 1]) - 1)
        else:
            current_feature = None
        word_count += len(chapter[i])
    return res
def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type error. use tfidf_word replace.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(), weight),
                                    key=lambda k: k[1], reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1, max_len=300,
                     batch_size=128, nb_epoch=10, embedding_dim=128, hidden_dim=128,
                     col_sep='\t', num_filters=512, filter_sizes='3,4,5', dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes: %d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s' % str(data_label.shape))
    # init feature
    # the han model needs a [doc, sentence] shaped feature (rank 3);
    # other models use a [sentence] shaped feature (rank 2)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label,
                                                      test_size=0.1, random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len, vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim, num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, num_filters=num_filters,
                          filter_sizes=filter_sizes, num_classses=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len, vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                          num_classes=num_classes)
    cp = ModelCheckpoint(model_save_path, monitor='val_acc', verbose=1,
                         save_best_only=True)
    # fit and save model
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch,
                        validation_data=(X_val, y_val), callbacks=[cp])
    logger.info('save model: %s' % model_save_path)
    plt_history(history, model_name=model_type)
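# A minimal usage sketch for the deep trainer above (paths are hypothetical
# placeholders; the data file is expected to hold "label<TAB>segmented text" rows).
if __name__ == '__main__':
    train_deep_model(model_type='cnn',
                     data_path='data/train_seg_sample.txt',
                     model_save_path='output/cnn_model.h5',
                     word_vocab_path='output/word_vocab.txt',
                     label_vocab_path='output/label_vocab.txt',
                     max_len=300, batch_size=128, nb_epoch=10)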
color="g", label="Cross-validation score") plt.legend(loc="best") plt.savefig(figure_path) return plt if __name__ == "__main__": sys.path.append("../") from models.feature import Feature from models.reader import data_reader data_content, data_lbl = data_reader('../data/train_words.txt', '\t') # init feature feature = Feature(feature_type='tfidf_word', feature_vec_path='../output/temp') # get data feature data_feature = feature.get_feature(data_content) # label data_label = feature.label_encoder(data_lbl) X_train, X_val, y_train, y_val = train_test_split(data_feature, data_label, test_size=0.2) search_cv(X_train, y_train, X_val, y_val, model=SVC()) # test plot_learning_curve title = "Learning Curves (Random Forest, n_estimators = 30)" estimator = SVC() plot_learning_curve(estimator, title, X_train,
def infer_classic(model_type='xgboost_lr', model_save_path='', label_vocab_path='',
                  test_data_path='', pred_save_path='', feature_vec_path='',
                  col_sep='\t', feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)
    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save(pred_output, ture_labels=None, pred_save_path=pred_save_path, data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
            logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))