def predict(
        query,
        model_path,
        stopwords_path,
        person_name_path,
        place_name_path,
        common_char_path,
        segment_sep,
        domain_sample_path,
        ngram,
        pmi_path,
        entropy_path,
):
    logger.info('model predict')
    # get feature
    feat = Feature(stopwords_path=stopwords_path,
                   person_name_path=person_name_path,
                   place_name_path=place_name_path,
                   common_char_path=common_char_path,
                   segment_sep=segment_sep,
                   domain_sample_path=domain_sample_path,
                   ngram=ngram,
                   pmi_path=pmi_path,
                   entropy_path=entropy_path)
    features, terms = feat.get_feature(query, is_word_segmented=False)
    # predict with the classification model
    model = load_pkl(model_path)
    label_pred = model.predict(features)
    logger.info("words: %s" % terms)
    logger.info("predict label: %s" % label_pred)
    return label_pred
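# Usage sketch for predict(): all paths below are hypothetical placeholders,
# to be pointed at the resource files used by train() further down and the
# model file it saves. The query is raw text; it is segmented internally
# because predict() calls get_feature() with is_word_segmented=False.
def _demo_predict():
    return predict('哪里下载电视剧',
                   model_path='output/query_term_weight.pkl',
                   stopwords_path='data/stopwords.txt',
                   person_name_path='data/person_name.txt',
                   place_name_path='data/place_name.txt',
                   common_char_path='data/common_char.txt',
                   segment_sep=' ',
                   domain_sample_path='data/domain_sample.txt',
                   ngram=4,
                   pmi_path='data/pmi_score.json',
                   entropy_path='data/entropy_score.json')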
def rank_query(self, query):
    self.check_inited()
    if len(query) == 1:
        # single-character query: nothing to rank
        return zip([query], [0])
    # get feature
    data_feature, terms = self.get_feature(query, is_word_segmented=False)
    # predict model
    label_pred = self.model.predict(data_feature)
    logger.debug("predict label: %s" % label_pred)
    return zip(terms, label_pred)
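# Usage sketch: rank_query() returns a lazy zip of (term, predicted label)
# pairs, so materialize it with list() before iterating more than once. The
# ranker object here is a hypothetical, already-constructed instance of the
# class above.
def _demo_rank_query(ranker):
    term_labels = list(ranker.rank_query('哪里下载电视剧'))
    for term, label in term_labels:
        logger.info('%s -> %s' % (term, label))
    return term_labels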
def get_feature(self, query, is_word_segmented=False):
    """
    Get text feature
    :param query: input query
    :param is_word_segmented: bool, whether the query is already word segmented
    :return: list, AttrDict: term features, sentence features
    """
    term_features = []
    if is_word_segmented:
        word_seq = query.split(self.segment_sep)
    else:
        word_seq = word_segment(query, cut_type='word', pos=False)
    logger.debug('%s' % word_seq)
    # sentence-level features
    sentence_features = AttrDict(
        query_length=len(query),
        term_size=len(word_seq),
    )
    # term-level features
    idx = 0  # character offset of the term in the query
    offset = 0  # term position in the word sequence
    for word in word_seq:
        emb = self.vec.encode(word)
        # leave-one-out: delete the term and score the remaining query
        word_list = deepcopy(word_seq)
        if word in word_list:
            word_list.remove(word)
        del_word_query = ''.join(word_list)
        del_term_sim_score = self.sim.get_score(query, del_word_query)
        term_features.append(
            AttrDict(
                term=word,
                term_length=len(word),
                idx=idx,
                offset=offset,
                is_number=is_number_string(word),
                is_chinese=is_chinese_string(word),
                is_alphabet=is_alphabet_string(word),
                is_stopword=self.is_stopword(word),
                is_name=self.is_name(word),
                # is_entity=self.is_entity(pos),
                is_common_char=self.is_common_char_string(word),
                embedding_sum=np.sum(emb),
                del_term_score=del_term_sim_score,
            ))
        idx += len(word)
        offset += 1
    return term_features, sentence_features
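# Minimal sketch of the leave-one-out scoring used above, assuming score_fn
# behaves like self.sim.get_score(): remove each term once and measure how
# similar the shortened query stays to the original. A large drop suggests
# the removed term carries more weight.
def _demo_del_term_scores(word_seq, score_fn):
    query = ''.join(word_seq)
    scores = {}
    for word in word_seq:
        rest = list(word_seq)
        rest.remove(word)  # drop the first occurrence, as get_feature() does
        scores[word] = score_fn(query, ''.join(rest))
    return scores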
def get_feature(self, query, is_word_segmented=False):
    """
    Get feature from query
    :param query: input query
    :param is_word_segmented: bool, is word segmented or not
    :return: features, terms
    """
    features = []
    terms = []
    self.check_feature_inited()

    text_terms, text_sent = self.text_feature.get_feature(query, is_word_segmented=is_word_segmented)
    stat_terms, stat_sent = self.statistics_feature.get_feature(query, is_word_segmented=is_word_segmented)
    lang_terms, lang_sent = self.language_feature.get_feature(query, is_word_segmented=is_word_segmented)

    # sentence feature
    text_sent.update(stat_sent)
    text_sent.update(lang_sent)
    logger.debug('sentence features: %s' % text_sent)
    sent_feature = [text_sent.query_length, text_sent.term_size, text_sent.ppl]

    # term feature
    for text, stat, lang in zip(text_terms, stat_terms, lang_terms):
        text.update(stat)
        text.update(lang)
        # logger.debug('term features: %s' % text)
        term_feature = [
            text.term_length, text.idx, text.offset,
            float(text.is_number), float(text.is_chinese), float(text.is_alphabet),
            float(text.is_stopword), float(text.is_name), float(text.is_common_char),
            text.embedding_sum, text.del_term_score,
            text.idf, text.text_rank_score, text.tfidf_score,
            text.pmi_score, text.left_entropy_score, text.right_entropy_score,
            text.del_term_ppl, text.term_ngram_score,
            text.left_term_score, text.right_term_score,
        ]
        feature = sent_feature + term_feature
        features.append(feature)
        terms.append(text.term)
    logger.debug("[query] feature size: %s, term size: %s" % (len(features), len(terms)))
    return features, terms
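# Usage sketch: every row pairs the 3 sentence-level values (query_length,
# term_size, ppl) with 21 term-level values, giving a fixed 24-dimensional
# vector per term. The feat object here is a hypothetical, fully initialized
# Feature instance.
def _demo_feature_shape(feat):
    features, terms = feat.get_feature('哪里 下载 电视剧', is_word_segmented=True)
    assert all(len(row) == 24 for row in features)
    assert len(features) == len(terms)
    return features, terms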
def tfidf_word_feature(data_set, is_infer=False, feature_vec_path='', word_vocab=None):
    """
    Get TFIDF ngram feature by word
    """
    if is_infer:
        # inference: reuse the vectorizer fitted at train time
        vectorizer = load_pkl(feature_vec_path)
        data_feature = vectorizer.transform(data_set)
    else:
        # train: fit a new vectorizer and persist it for inference
        vectorizer = TfidfVectorizer(analyzer='word', vocabulary=word_vocab, sublinear_tf=True)
        data_feature = vectorizer.fit_transform(data_set)
        save_pkl(vectorizer, feature_vec_path, overwrite=True)
    vocab = vectorizer.vocabulary_
    logger.debug('vocab size: %d' % len(vocab))
    logger.debug(data_feature.shape)
    return data_feature
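# Usage sketch: fit the TFIDF vectorizer once on training text and reload the
# same pickle at inference time, so both phases share one vocabulary. The path
# is a hypothetical placeholder.
def _demo_tfidf_round_trip(train_texts, test_texts):
    vec_path = 'output/tfidf_vectorizer.pkl'
    train_feature = tfidf_word_feature(train_texts, is_infer=False, feature_vec_path=vec_path)
    test_feature = tfidf_word_feature(test_texts, is_infer=True, feature_vec_path=vec_path)
    return train_feature, test_feature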
def check_inited(self):
    if not self.inited:
        self.model = load_pkl(self.model_path)
        logger.debug('Loaded model: {}'.format(self.model_path))
        self.inited = True
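# Design note: check_inited() lazily loads the pickled model, so constructing
# the ranker is cheap and the first rank_query() call pays the load cost once.
# A minimal timing sketch with a hypothetical ranker instance:
def _demo_lazy_init(ranker):
    import time
    t0 = time.time()
    ranker.check_inited()  # first call loads the pickle from disk
    t1 = time.time()
    ranker.check_inited()  # later calls are no-ops
    return t1 - t0, time.time() - t1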
def train(
        train_file,
        col_sep,
        stopwords_path,
        person_name_path,
        place_name_path,
        common_char_path,
        segment_sep,
        domain_sample_path,
        ngram,
        pmi_path,
        entropy_path,
        model_path,
):
    # 1. read train data
    contents, labels = data_reader(train_file, col_sep)
    logger.info('contents size: %s, labels size: %s' % (len(contents), len(labels)))

    # 2. get feature
    feat = Feature(stopwords_path=stopwords_path,
                   person_name_path=person_name_path,
                   place_name_path=place_name_path,
                   common_char_path=common_char_path,
                   segment_sep=segment_sep,
                   domain_sample_path=domain_sample_path,
                   ngram=ngram,
                   pmi_path=pmi_path,
                   entropy_path=entropy_path)
    features = []
    tags = []
    for content, label in zip(contents, labels):
        label_split = [int(i) for i in label.split(segment_sep)]
        content_split = content.split(segment_sep)
        if len(label_split) != len(content_split):
            logger.warning('skip sample, term count does not match label count: %s %s' % (content, label))
            continue
        tags += label_split
        content_features, terms = feat.get_feature(content, is_word_segmented=True)
        features += content_features
    logger.info("[train] features size: %s, tags size: %s" % (len(features), len(tags)))
    assert len(features) == len(tags), "features size must equal tags size"
    X_train, X_val, y_train, y_val = train_test_split(features, tags, test_size=0.2, random_state=0)
    logger.debug("train size: %s, val size: %s" % (len(y_train), len(y_val)))

    # 3. train classification model and save the model file
    model = RandomForestClassifier(n_estimators=300)
    logger.debug("start train model ...")
    model.fit(X_train, y_train)
    save_pkl(model, model_path, overwrite=True)
    logger.info("model saved: %s" % model_path)

    # 4. validate and evaluate
    logger.debug("evaluate model with validation data")
    evaluate(model, X_val, y_val)
    return model
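# Usage sketch for train(): file paths are hypothetical placeholders. Each line
# of train_file is expected to hold a segmented text and its per-term integer
# labels, the two columns joined by col_sep and the terms/labels by segment_sep.
def _demo_train():
    return train(train_file='data/train.txt',
                 col_sep='\t',
                 stopwords_path='data/stopwords.txt',
                 person_name_path='data/person_name.txt',
                 place_name_path='data/place_name.txt',
                 common_char_path='data/common_char.txt',
                 segment_sep=' ',
                 domain_sample_path='data/domain_sample.txt',
                 ngram=4,
                 pmi_path='data/pmi_score.json',
                 entropy_path='data/entropy_score.json',
                 model_path='output/query_term_weight.pkl')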