Example #1
def predict(
    query,
    model_path,
    stopwords_path,
    person_name_path,
    place_name_path,
    common_char_path,
    segment_sep,
    domain_sample_path,
    ngram,
    pmi_path,
    entropy_path,
):
    logger.info('model predict')
    # get feature
    feat = Feature(stopwords_path=stopwords_path,
                   person_name_path=person_name_path,
                   place_name_path=place_name_path,
                   common_char_path=common_char_path,
                   segment_sep=segment_sep,
                   domain_sample_path=domain_sample_path,
                   ngram=ngram,
                   pmi_path=pmi_path,
                   entropy_path=entropy_path)
    features, terms = feat.get_feature(query, is_word_segmented=False)
    # load the trained classification model
    model = load_pkl(model_path)
    logger.debug("model loaded: %s" % model_path)
    label_pred = model.predict(features)
    logger.info("words: %s" % terms)
    logger.info("predict label: %s" % label_pred)
    print("predict label: %s" % label_pred)
    return label_pred
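A hypothetical invocation of this function; every path below is a placeholder assumption, not a file that ships with the project:

# Hypothetical call; all paths and parameter values are placeholder assumptions.
label = predict('什么是深度学习',
                model_path='model.pkl',
                stopwords_path='data/stopwords.txt',
                person_name_path='data/person_name.txt',
                place_name_path='data/place_name.txt',
                common_char_path='data/common_char.txt',
                segment_sep=' ',
                domain_sample_path='data/domain_sample.txt',
                ngram=4,
                pmi_path='data/pmi.txt',
                entropy_path='data/entropy.txt')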
Example #2
    def rank_query(self, query):
        self.check_inited()
        # single-character query: nothing to rank, weight defaults to 0
        if len(query) == 1:
            return zip([query], [0])
        # get feature
        data_feature, terms = self.get_feature(query, is_word_segmented=False)
        # predict term weights with the model
        label_pred = self.model.predict(data_feature)
        logger.debug("predict label: %s" % label_pred)
        return zip(terms, label_pred)
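Note that zip returns a lazy iterator in Python 3, so callers that need to traverse the (term, weight) pairs more than once should wrap the result in list().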
Example #3
    def get_feature(self, query, is_word_segmented=False):
        """
        Get text feature
        :param query:
        :param is_word_segmented:
        :return: list, list: term features, sentence features
        """
        term_features = []
        if is_word_segmented:
            word_seq = query.split(self.segment_sep)
        else:
            word_seq = word_segment(query, cut_type='word', pos=False)
        logger.debug('%s' % word_seq)

        # sentence
        sentence_features = AttrDict(
            query_length=len(query),
            term_size=len(word_seq),
        )

        # term
        idx = 0  # character offset of the current term within the query
        offset = 0  # index of the current term in the word sequence
        for word in word_seq:
            emb = self.vec.encode(word)
            # score the query against a copy with this term deleted
            word_list = deepcopy(word_seq)
            if word in word_list:
                word_list.remove(word)
            del_word_query = ''.join(word_list)
            del_term_sim_score = self.sim.get_score(query, del_word_query)
            term_features.append(
                AttrDict(
                    term=word,
                    term_length=len(word),
                    idx=idx,
                    offset=offset,
                    is_number=is_number_string(word),
                    is_chinese=is_chinese_string(word),
                    is_alphabet=is_alphabet_string(word),
                    is_stopword=self.is_stopword(word),
                    is_name=self.is_name(word),
                    # is_entity=self.is_entity(pos),
                    is_common_char=self.is_common_char_string(word),
                    embedding_sum=np.sum(emb),
                    del_term_score=del_term_sim_score,
                ))
            idx += len(word)
            offset += 1

        return term_features, sentence_features
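AttrDict is not defined in these snippets; below is a minimal sketch of the behavior the feature code relies on (dict entries readable and writable as attributes), assuming no extra functionality:

class AttrDict(dict):
    """Minimal sketch: a dict whose entries are also accessible as attributes."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value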
Example #4
    def get_feature(self, query, is_word_segmented=False):
        """
        Get feature from query
        :param query: input query
        :param is_word_segmented: bool, is word segmented or not
        :return: features, terms
        """
        features = []
        terms = []

        self.check_feature_inited()
        text_terms, text_sent = self.text_feature.get_feature(
            query, is_word_segmented=is_word_segmented)
        stat_terms, stat_sent = self.statistics_feature.get_feature(
            query, is_word_segmented=is_word_segmented)
        lang_terms, lang_sent = self.language_feature.get_feature(
            query, is_word_segmented=is_word_segmented)
        # sentence feature
        text_sent.update(stat_sent)
        text_sent.update(lang_sent)
        logger.debug('sentence features: %s' % text_sent)
        sent_feature = [
            text_sent.query_length, text_sent.term_size, text_sent.ppl
        ]
        # term feature
        for text, stat, lang in zip(text_terms, stat_terms, lang_terms):
            text.update(stat)
            text.update(lang)
            # logger.debug('term features: %s' % text)
            term_feature = [
                text.term_length, text.idx, text.offset,
                float(text.is_number),
                float(text.is_chinese),
                float(text.is_alphabet),
                float(text.is_stopword),
                float(text.is_name),
                float(text.is_common_char), text.embedding_sum,
                text.del_term_score, text.idf, text.text_rank_score,
                text.tfidf_score, text.pmi_score, text.left_entropy_score,
                text.right_entropy_score, text.del_term_ppl,
                text.term_ngram_score, text.left_term_score,
                text.right_term_score
            ]
            feature = sent_feature + term_feature
            features.append(feature)
            terms.append(text.term)
        logger.debug("[query]feature size: %s, term size: %s" %
                     (len(features), len(terms)))

        return features, terms
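Each row in features pairs with the term at the same index in terms: the three sentence-level values (query_length, term_size, ppl) are prepended to the 21 term-level values, so every term gets a 24-dimensional vector whose sentence prefix is shared across the whole query.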
Example #5
def tfidf_word_feature(data_set,
                       is_infer=False,
                       feature_vec_path='',
                       word_vocab=None):
    """
    Get TFIDF ngram feature by word
    """
    if is_infer:
        vectorizer = load_pkl(feature_vec_path)
        data_feature = vectorizer.transform(data_set)
    else:
        vectorizer = TfidfVectorizer(analyzer='word',
                                     vocabulary=word_vocab,
                                     sublinear_tf=True)
        data_feature = vectorizer.fit_transform(data_set)
    vocab = vectorizer.vocabulary_
    logger.debug('vocab size: %d' % len(vocab))
    logger.debug(data_feature.shape)
    # save the fitted vectorizer only at training time
    if not is_infer:
        save_pkl(vectorizer, feature_vec_path, overwrite=True)
    return data_feature
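A hypothetical train/infer round trip; the texts and the pickle path are placeholder assumptions:

# Hypothetical usage; texts and path are placeholder assumptions.
texts = ['深度 学习 入门', '机器 学习 实战']
train_feature = tfidf_word_feature(texts, is_infer=False,
                                   feature_vec_path='tfidf_vec.pkl')
# at inference time, reuse the vectorizer fitted above
infer_feature = tfidf_word_feature(['深度 学习'], is_infer=True,
                                   feature_vec_path='tfidf_vec.pkl')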
Example #6
    def check_inited(self):
        if not self.inited:
            self.model = load_pkl(self.model_path)
            logger.debug('Loaded model: {}'.format(self.model_path))
            self.inited = True
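This is a lazy-initialization guard: the pickled model is read from disk the first time it is needed, and subsequent calls are no-ops.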
Example #7
def train(
    train_file,
    col_sep,
    stopwords_path,
    person_name_path,
    place_name_path,
    common_char_path,
    segment_sep,
    domain_sample_path,
    ngram,
    pmi_path,
    entropy_path,
    model_path,
):
    # 1.read train data
    contents, labels = data_reader(train_file, col_sep)
    logger.info('contents size:%s, labels size:%s' %
                (len(contents), len(labels)))

    # 2.get feature
    feat = Feature(stopwords_path=stopwords_path,
                   person_name_path=person_name_path,
                   place_name_path=place_name_path,
                   common_char_path=common_char_path,
                   segment_sep=segment_sep,
                   domain_sample_path=domain_sample_path,
                   ngram=ngram,
                   pmi_path=pmi_path,
                   entropy_path=entropy_path)

    features = []
    tags = []
    for content, label in zip(contents, labels):
        label_split = [int(i) for i in label.split(segment_sep)]
        content_split = content.split(segment_sep)
        if len(label_split) != len(content_split):
            logger.warning('skip sample, content size != label size: %s %s' %
                           (content, label))
            continue
        tags += label_split
        content_features, terms = feat.get_feature(content,
                                                   is_word_segmented=True)
        features += content_features
    logger.info("[train]features size: %s, tags size: %s" %
                (len(features), len(tags)))
    assert len(features) == len(tags), "features size must equal tags size"
    X_train, X_val, y_train, y_val = train_test_split(features,
                                                      tags,
                                                      test_size=0.2,
                                                      random_state=0)
    logger.debug("train size:%s, val size:%s" % (len(y_train), len(y_val)))
    # 3.train classification model, save model file
    model = RandomForestClassifier(n_estimators=300)
    # fit
    logger.debug("start train model ...")
    model.fit(X_train, y_train)
    # save model
    save_pkl(model, model_path, overwrite=True)
    logger.info("model saved: %s" % model_path)

    # 4.validation and evaluate
    logger.debug("evaluate model with validation data")
    evaluate(model, X_val, y_val)
    return model
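evaluate is referenced above but not shown in these examples; below is a minimal sketch using scikit-learn's classification_report, which may differ from the project's actual implementation:

from sklearn.metrics import classification_report

def evaluate(model, X_val, y_val):
    """Minimal sketch: print precision/recall/F1 on the validation split."""
    y_pred = model.predict(X_val)
    print(classification_report(y_val, y_pred))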