Example #1
def load_save_model(w2v_bin_path, vocab_path, save_txt_path):
    # load the model
    # Note: different save formats require different ways of loading the model
    # skip_gram_model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    skip_gram_model = Word2Vec.load(w2v_bin_path)
    print(skip_gram_model.most_similar("车子"))

    # build a lookup table of word -> word vector
    word_dict = {}

    # load word vectors from the model
    # Caveat: loading every vector from the model at once (building the vocab from the
    # vectors) can exhaust memory when the model is large
    # for word in skip_gram_model.wv.vocab:
    #     word_dict[word] = skip_gram_model[word]     # word_dict stores word: its word vector

    # build the embedding_matrix
    vocab = Vocab(vocab_path, VOCAB_SIZE)
    for word, index in vocab.word2id.items():
        # Note: to use the Tencent pre-trained vectors, point w2v_bin_path at the Tencent
        # vector file when loading skip_gram_model, but vocab_path above must still point
        # to your own vocab.txt file
        if word in skip_gram_model.wv.vocab:    # build the embedding layer
            word_dict[index] = skip_gram_model[word]        # this is the embedding_matrix used later on
        else:
            # random initialization in [-0.025, 0.025], vector dimension EMBEDDING_DIM (256)
            word_dict[index] = np.random.uniform(-0.025, 0.025, (EMBEDDING_DIM))

    # save the data loaded from the model as a binary pickle file to save space
    dump_pkl(word_dict, save_txt_path, overwrite=True)
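The index-to-vector mapping above is what later backs the embedding layer. Below is a minimal sketch (not part of the original project) of stacking it into a dense matrix, reusing load_pkl, VOCAB_SIZE and EMBEDDING_DIM from this snippet:

import numpy as np

def build_embedding_matrix(word_dict, vocab_size, embedding_dim):
    # one row per vocab index; indices missing from word_dict stay zero
    matrix = np.zeros((vocab_size, embedding_dim), dtype='float32')
    for index, vector in word_dict.items():
        matrix[index] = vector
    return matrix

# e.g. embedding_matrix = build_embedding_matrix(load_pkl(save_txt_path), VOCAB_SIZE, EMBEDDING_DIM)
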
Example #2
    def tf_word_feature(self, data_set):
        """
        Get TF feature by word
        :param data_set:
        :return:
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = CountVectorizer(analyzer='word',
                                              encoding='utf-8',
                                              lowercase=True,
                                              vocabulary=self.word_vocab)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        logger.debug('Vocab list:')
        count = 0
        for k, v in self.vectorizer.vocabulary_.items():
            if count < 10:
                logger.debug("%s	%s" % (k, v))
                count += 1
        feature_names = self.vectorizer.get_feature_names()
        logger.info('feature_names:%s' % feature_names[:20])

        logger.info(data_feature.shape)
        if not self.is_infer:
            dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
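For context, a minimal standalone sketch (toy English strings, plain pickle in place of dump_pkl/load_pkl) of the fit-then-persist versus load-then-transform pattern used above:

import pickle
from sklearn.feature_extraction.text import CountVectorizer

train_docs = ["engine stalls when cold", "engine overheats on hills"]   # toy segmented text
vectorizer = CountVectorizer(analyzer='word', lowercase=True)
train_feature = vectorizer.fit_transform(train_docs)                    # training: fit + transform
with open("count_vec.pkl", "wb") as f:                                  # plays the role of dump_pkl
    pickle.dump(vectorizer, f)

with open("count_vec.pkl", "rb") as f:                                  # plays the role of load_pkl
    vectorizer = pickle.load(f)
infer_feature = vectorizer.transform(["engine stalls again"])           # inference: transform only
print(train_feature.shape, infer_feature.shape)                         # same number of columns
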
Example #3
    def tfidf_word_feature(self, data_set):
        """
        Get TFIDF ngram feature by word
        :param data_set:
        :return:
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = TfidfVectorizer(analyzer='word',
                                              ngram_range=(1, 2),
                                              sublinear_tf=True)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        print('Vocab size:', len(vocab))
        print('Vocab list:')
        count = 0
        for k, v in self.vectorizer.vocabulary_.items():
            if count < 10:
                print(k, v)
                count += 1

        print('\nTFIDF term-frequency matrix:')
        print('data_feature shape:', data_feature.shape)
        print(data_feature.toarray())
        dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
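A small toy illustration (made-up documents) of what ngram_range=(1, 2) contributes: the vocabulary holds both unigrams and bigrams, so the feature matrix widens accordingly.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the engine stalls", "the engine overheats"]     # toy data
vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), sublinear_tf=True)
X = vec.fit_transform(docs)
print(sorted(vec.vocabulary_))    # unigrams plus bigrams such as 'the engine'
print(X.shape)                    # (2, n_unigrams + n_bigrams)
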
Example #4
    def tfidf_char_feature(self, data_set):
        """
        Get TFIDF feature by char
        :param data_set:
        :return:
        """
        data_set = get_char_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), sublinear_tf=True)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        logger.debug('Vocab list:')
        count = 0
        for k, v in self.vectorizer.vocabulary_.items():
            if count < 10:
                logger.debug("%s	%s" % (k, v))
                count += 1

        logger.info(data_feature.shape)
        if not self.is_infer:
            dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
Example #5
def build(path1,
          path2,
          path3,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    sentences = extract_sentence(path1, path2, path3)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')

    # train model
    w2v = Word2Vec(sg=1,
                   sentences=LineSentence(sentence_path),
                   size=256,
                   negative=5,
                   workers=8,
                   iter=40,
                   window=3,
                   min_count=min_count)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('宝马', '车主')
    print('宝马 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
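A hedged usage sketch of build() above; every path is a placeholder, and load_pkl is the reader counterpart of dump_pkl used throughout these examples:

# Hypothetical call; all paths are placeholders.
build('data/train_seg.txt', 'data/dev_seg.txt', 'data/test_seg.txt',
      out_path='data/word2vec.pkl',
      sentence_path='data/sentences.txt',
      w2v_bin_path='data/w2v.bin',
      min_count=5)

word_dict = load_pkl('data/word2vec.pkl')      # {word: 256-dim numpy vector}
print(len(word_dict), word_dict['宝马'].shape)  # assuming '宝马' survived min_count
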
Example #6
def build(train_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1,
          col_sep='\t'):
    sentences = extract_sentence(train_seg_path,
                                 test_seg_path,
                                 col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1,
                   sentences=LineSentence(sentence_path),
                   size=256,
                   window=5,
                   min_count=min_count,
                   iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
Example #7
def build(train_seg_x_path,
          train_seg_target_path,
          test_seg_x_path,
          w2v_output,
          sentence_path,
          w2v_bin_path="model.bin",
          embedding_size=256,
          min_count=5,
          col_sep='\t'):
    # sentences = extract_sentence(train_seg_x_path, train_seg_target_path, test_seg_x_path, col_sep=col_sep)
    # save_sentence(sentences, sentence_path)
    #
    # print('train w2v model...')
    # # train model
    # model = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
    #                size=embedding_size, window=5, min_count=min_count, iter=40)
    # model.wv.save_word2vec_format(w2v_bin_path, binary=True)
    # print("save %s ok." % w2v_bin_path)
    # # test
    # sim = model.wv.similarity('奔驰', '宝马')
    # print('奔驰 vs 宝马 similarity score:', sim)

    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word2vec_dict = {}
    for word in model.vocab:
        word2vec_dict[word] = model.word_vec(word)
    dump_pkl(word2vec_dict, w2v_output, True)
Example #8
    def _show_all_labels(self):
        # split labeled data and unlabeled data
        output = []
        contents = []
        seg_contents = []
        features = []
        labels = []
        for i in self.samples:
            label = i.human_label if i.human_label else i.machine_label
            output.append(label + self.col_sep + str(i.prob))
            seg_contents.append(i.seg_text_word)
            contents.append(i.original_text)
            labels.append(label)
            features.append(i.feature.toarray().tolist()[0])
        # get data feature
        X_train, X_val, y_train, y_val = train_test_split(
            csr_matrix(np.array(features)), labels)

        # fit
        self.model.fit(X_train, y_train)

        # save model
        dump_pkl(self.model, self.model_save_path, overwrite=True)
        eval(self.model, X_val, y_val)
        save(output,
             ture_labels=None,
             pred_save_path=self.pred_save_path,
             data_set=contents)
Example #9
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    # sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    # save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    your code
    w2v = (one line)
    """
    # sentences=[line.strip().split() for line in sentences]
    # sentences=LineSentence(sentence_path)    #sentences为二维list
    # w2v=Word2Vec(sentences=sentences,size=256,sg=1,window=5)

    # w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    # print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('技师', '车主')
    # print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    print(model["说"])
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
Example #10
def build(train_x_seg_path,
          train_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    sentences = extract_sentence(train_x_seg_path, train_y_seg_path,
                                 test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    Train word2vec with gensim: input in the sentences format, use skip-gram, embedding dimension 256.
    """
    w2v = Word2Vec(sentences=LineSentence(sentence_path),
                   size=256,
                   min_count=min_count,
                   sg=1,
                   workers=8,
                   iter=50)
    # w2v.wv.save_word2vec_format('{}/datasets/self_word2vec.txt'.format(BASE_DIR),binary=False)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    #test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)

    #load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
Example #11
def build(train_x_seg_path,
          train_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="../data/w2v.bin",
          min_count=1):
    sentences = extract_sentence(train_x_seg_path, train_y_seg_path,
                                 test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    your code
    w2v = (one line)
    """
    w2v = Word2Vec(sg=1,
                   sentences=LineSentence(sentence_path),
                   size=256,
                   window=5,
                   min_count=min_count,
                   iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
Example #12
    def _train(self, labeled_sample_list, unlabeled_sample_list, batch_id):
        machine_samples_list = []
        # get data feature
        labeled_data_label = [
            i.human_label if i.human_label else i.machine_label
            for i in labeled_sample_list
        ]
        labeled_data_feature = [
            i.feature.toarray().tolist()[0] for i in labeled_sample_list
        ]
        X_train, X_val, y_train, y_val = train_test_split(
            csr_matrix(np.array(labeled_data_feature)), labeled_data_label)
        # fit
        self.model.fit(X_train, y_train)

        # save model
        dump_pkl(self.model, self.model_save_path, overwrite=True)
        eval(self.model, X_val, y_val)

        # predict on the unlabeled data set
        if not unlabeled_sample_list:
            return machine_samples_list
        unlabeled_data_feature = [
            i.feature.toarray().tolist()[0] for i in unlabeled_sample_list
        ]
        pred_result = self.model.predict_proba(
            csr_matrix(np.array(unlabeled_data_feature)))

        pred_label_proba = [(self.id_label[prob.argmax()], prob.max())
                            for prob in pred_result]

        # save middle result
        pred_output = [
            self.id_label[prob.argmax()] + self.col_sep + str(prob.max())
            for prob in pred_result
        ]
        pred_save_path = self.pred_save_path[:-4] + '_batch_' + str(
            batch_id) + '.txt'
        logger.debug("save infer label and prob result to: %s" %
                     pred_save_path)
        unlabeled_data_text = [i.original_text for i in unlabeled_sample_list]
        save(pred_output,
             ture_labels=None,
             pred_save_path=pred_save_path,
             data_set=unlabeled_data_text)

        assert len(unlabeled_sample_list) == len(pred_label_proba)
        for unlabeled_sample, label_prob in zip(unlabeled_sample_list,
                                                pred_label_proba):
            idx = unlabeled_sample.id
            self.samples[idx].machine_label = label_prob[0]
            self.samples[idx].prob = label_prob[1]
            machine_samples_list.append(unlabeled_sample)
        return machine_samples_list
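A minimal standalone sketch (toy features, a hypothetical id_label mapping) of the predict_proba step above, turning each probability row into a (label, confidence) pair:

import numpy as np
from sklearn.linear_model import LogisticRegression

id_label = {0: 'negative', 1: 'positive'}                 # hypothetical id -> label map
X = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.2, 0.8]])
y = [1, 0, 0, 1]
clf = LogisticRegression().fit(X, y)

pred_result = clf.predict_proba(np.array([[0.1, 0.9], [0.8, 0.2]]))
pred_label_proba = [(id_label[prob.argmax()], prob.max()) for prob in pred_result]
print(pred_label_proba)    # e.g. [('positive', 0.7...), ('negative', 0.6...)]
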
Example #13
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    # test
    model_test(model, '技师', '车主')
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
Example #14
def build_pos_embedding(path,
                        overwrite=False,
                        pos_vocab_path=None,
                        pos_vocab_start=1,
                        pos_dim=64):
    if os.path.exists(path) and not overwrite:
        print("already has $s and use it." % path)
        return load_pkl(path)
    pos_vocab = load_vocab(pos_vocab_path)
    pos_vocab_count = len(pos_vocab) + pos_vocab_start
    pos_emb = np.random.normal(size=(
        pos_vocab_count,
        pos_dim,
    )).astype('float32')
    for i in range(pos_vocab_start):
        pos_emb[i, :] = 0.
    # save
    dump_pkl(pos_emb, path, overwrite=True)
    return pos_emb
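A small standalone check (toy sizes, skipping load_vocab) of the convention above: rows with index below pos_vocab_start are reserved, for example for padding, and are zeroed out.

import numpy as np

pos_vocab_start, n_tags, pos_dim = 1, 4, 8                 # toy sizes
pos_emb = np.random.normal(size=(n_tags + pos_vocab_start, pos_dim)).astype('float32')
pos_emb[:pos_vocab_start, :] = 0.                          # reserved rows (e.g. padding id 0)
assert not pos_emb[0].any()                                # the padding row is all zeros
print(pos_emb.shape)                                       # (5, 8)
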
Example #15
def save_w2v(bin_path, pkl_out_path, min_count=100):
    sentences = extract_sentence(QA_TRAIN_CLEAN_X_PATH, QA_TRAIN_CLEAN_Y_PATH,
                                 QA_TEST_CLEAN_X_PATH)
    save_sentence(sentences, QA_SENTENCE_PATH)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1,
                   sentences=LineSentence(QA_SENTENCE_PATH),
                   size=256,
                   window=5,
                   min_count=min_count,
                   iter=5)
    w2v.wv.save_word2vec_format(bin_path, binary=True)
    print("save w2v model %s ok." % bin_path)

    model = KeyedVectors.load_word2vec_format(bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, pkl_out_path, overwrite=True)
Example #16
def train_classic(model_type,
                  data_path=None,
                  pr_figure_path=None,
                  model_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  thresholds=0.5,
                  num_classes=2,
                  feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model,
         X_val,
         y_val,
         thresholds=thresholds,
         num_classes=num_classes,
         model_type=model_type,
         pr_figure_path=pr_figure_path)
Example #17
def build_word_embedding(path,
                         overwrite=False,
                         sentence_w2v_path=None,
                         word_vocab_path=None,
                         word_vocab_start=2,
                         w2v_dim=256):
    if os.path.exists(path) and not overwrite:
        print("already has $s and use it." % path)
        return load_pkl(path)
    word_vocab = load_vocab(word_vocab_path)
    w2v_dict_full = load_pkl(sentence_w2v_path)
    word_vocab_count = len(word_vocab) + word_vocab_start  # rows must cover every index in word_vocab
    word_emb = np.zeros((word_vocab_count, w2v_dim), dtype='float32')
    for word in word_vocab:
        index = word_vocab[word]
        if word in w2v_dict_full:
            word_emb[index, :] = w2v_dict_full[word]
        else:
            random_vec = np.random.uniform(-0.25, 0.25,
                                           size=(w2v_dim, )).astype('float32')
            word_emb[index, :] = random_vec
    # save
    dump_pkl(word_emb, path, overwrite=True)
    return word_emb
Example #18
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type not supported here; falling back to tfidf_word.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1],
                                    reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)

    # evaluate
    eval(model,
         X_val,
         y_val,
         num_classes=num_classes,
         pr_figure_path=pr_figure_path)
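For reference, a minimal standalone sketch (toy texts and labels) of the coefficient inspection above: pair the vectorizer's feature names with the logistic-regression weights of each category and sort by weight.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

texts = ["good cheap car", "great quiet engine", "bad noisy engine", "terrible rude service"]
labels = [1, 1, 0, 0]                                      # toy binary labels
vec = TfidfVectorizer(analyzer='word')
X = vec.fit_transform(texts)
clf = LogisticRegression().fit(X, labels)

for idx, weight in enumerate(clf.coef_):                   # binary LR has a single weight row
    feature_sorted = sorted(zip(vec.get_feature_names(), weight),
                            key=lambda k: k[1], reverse=True)
    print("category_%d top features:" % idx, feature_sorted[:5])
# newer scikit-learn renames get_feature_names() to get_feature_names_out()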