Example #1
def init_tfidf(project_info):

    logger.log.info('start initial tfidf model ... ')

    name, dict_config, _, _, _, _ = get_project_config(project_info)

    tfidf_model = gensim_tfidf.GensimTfidf(name=name)

    doc_dir = './data/gensim/{0}/original_corpus'.format(name)
    # doc_dir = os.path.join(base_dir, name)

    if not os.path.exists(doc_dir):
        logger.log.error('doc dir %s does not exist, exit!' % doc_dir)
        sys.exit(-1)

    pattern = './data/gensim/{0}/original_corpus/{0}-train-*-*.txt'.format(
        name)
    doc_path = glob(pattern)

    # Document iterator, keeps memory usage low
    my_doc = gensim_tfidf.MyDoc(doc_path)
    # Build the dictionary and save it
    tfidf_model.add_document_from_file(my_doc, is_save=True, **dict_config)
    # Initialize the tfidf model and save it
    tfidf_model.init_tfidf_from_file(my_doc, is_save=True)
    # Compute tfidf (used to initialize the lsi model) and save it
    tfidf_model.compute_tfidf(my_doc, is_save=True, corpus_name=name)

    return tfidf_model
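
Every example on this page unpacks the same six values from get_project_config(project_info): name, dict_config, topic_method, topic_config, train_method and train_config. The snippets never show how project_info is built, so the sketch below is only an illustration of the assumed contract; the keys inside dict_config and params are hypothetical, while num_topics, is_test, is_unbalanced and params are the keys the examples actually read.

# Illustrative only: a flat dict carrying the six entries the examples unpack.
# The real project may store and resolve this configuration differently.
example_project_info = {
    'name': 'demo',
    'dict_config': {'no_below': 5, 'no_above': 0.5},   # hypothetical dictionary filters
    'topic_method': 'lsi',                             # 'lsi' or 'lda'
    'topic_config': {'num_topics': 100},
    'train_method': 'logistic',                        # 'svm', 'xgb', 'logistic' or 'lsvm'
    'train_config': {
        'is_test': True,                               # hold out a test split in clf()
        'is_unbalanced': False,                        # switch on class_weight='balanced'
        'params': {'C': 1.0},                          # hypothetical estimator kwargs
    },
}

def get_project_config(project_info):
    # Return the six values in the order every example unpacks them.
    return (project_info['name'], project_info['dict_config'],
            project_info['topic_method'], project_info['topic_config'],
            project_info['train_method'], project_info['train_config'])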
Example #2
def recover_clf(project_info):

    name, _, topic_method, _, train_method, _ = get_project_config(project_info)

    base_dir = './data/gensim/{}'.format(name)

    base_dir = os.path.join(base_dir, 'model')

    clf_name = '{0}-{1}-{2}.pkl'.format(name, train_method, topic_method)
    clf_path = os.path.join(base_dir, clf_name)

    le_name = '{0}-{1}-{2}-label_encoder.pkl'.format(
        name, train_method, topic_method)
    le_path = os.path.join(base_dir, le_name)

    if not os.path.exists(clf_path) or not os.path.exists(le_path):
        logger.log.error('clf %s or label_encoder %s does not exist' %
                         (clf_path, le_path))
        return None

    with open(clf_path, 'rb') as f:
        clf = pickle.load(f)

    with open(le_path, 'rb') as f:
        le = pickle.load(f)

    return clf, le
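
A minimal sketch of how the recovered pair might be used for prediction. It assumes the classifier was trained on topic vectors with num_topics = 100 and that le is a LabelEncoder (the non-xgb path in Example #7); the random vector is a stand-in for a real LSI/LDA document vector.

import numpy as np

recovered = recover_clf(project_info)
if recovered is not None:
    clf, le = recovered
    x = np.random.rand(1, 100)              # stand-in for one 100-topic document vector
    pred = clf.predict(x)                   # encoded class id
    label = le.inverse_transform(pred)      # back to the original tag string
    print(label)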
Example #3
def init_lsi_lda(project_info, recover_tfidf=True):

    logger.log.info('start init lsi model ... ')

    name, dict_config, topic_method, topic_config, _, _ = get_project_config(
        project_info)

    assert topic_method in ['lsi', 'lda']

    if topic_method == 'lsi':
        if recover_tfidf:
            model_dir = './data/gensim/{}/model'.format(name)
            model_name = '{}-tfidf.model'.format(name)
            tfidf_model_path = os.path.join(model_dir, model_name)
            if not os.path.exists(tfidf_model_path):
                logger.log.error('tfidf model %s does not exist, exit' %
                                 tfidf_model_path)
                sys.exit(-1)
            else:
                tfidf_model = gensim_tfidf.GensimTfidf(name=name)
                tfidf_model.load_dictionary()
                tfidf_model.load_tfidf_model()
        else:
            tfidf_model = init_tfidf(project_info)

        corpus_tfidf = tfidf_model.load_tfidf_corpus(name)

        tfidf_model.init_lsi_model(corpus_tfidf, is_save=True, **topic_config)

    elif topic_method == 'lda':

        doc_dir = './data/gensim/{0}/original_corpus'.format(name)
        if not os.path.exists(doc_dir):
            logger.log.error('doc dir %s does not exist, exit!' % doc_dir)
            sys.exit(-1)

        pattern = './data/gensim/{0}/original_corpus/{0}-train-*-*.txt'.format(
            name)
        doc_path = glob(pattern)

        # Document iterator, keeps memory usage low
        my_doc = gensim_tfidf.MyDoc(doc_path)

        tfidf_model = gensim_tfidf.GensimTfidf(name=name)
        if recover_tfidf:
            tfidf_model.load_dictionary()
        else:
            tfidf_model.add_document_from_file(my_doc,
                                               is_save=True,
                                               **dict_config)

        my_corpus = gensim_tfidf.MyCorpus(gensim_tfidf.MyDoc(doc_path),
                                          tfidf_model.dictionary)

        tfidf_model.init_lda_model(my_corpus, is_save=True, **topic_config)

    return tfidf_model
Example #4
def clf(project_info, data, target, le, is_save=True):

    name, _, topic_method, _, train_method, train_config = get_project_config(project_info)

    is_test = train_config['is_test']
    is_unbalanced = train_config['is_unbalanced']

    class_weight = None
    if is_unbalanced:
        class_weight = 'balanced'

    test_score = None

    params = train_config['params']
    clf = LogisticRegression(class_weight=class_weight, **params)

    if is_test:
        # split data to train and test
        X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                            test_size=0.33,
                                                            shuffle=True,
                                                            random_state=42)

        clf.fit(X_train, y_train)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

    else:
        clf.fit(data, target)
        train_score = clf.score(data, target)

    if is_save:
        base_dir = './data/gensim/{}'.format(name)
        base_dir = os.path.join(base_dir, 'model')

        if not os.path.exists(base_dir):
            os.mkdir(base_dir)

        clf_name = '{0}-{1}-{2}.pkl'.format(name, train_method, topic_method)
        clf_path = os.path.join(base_dir, clf_name)

        with open(clf_path, 'wb') as f:
            pickle.dump(clf, f, protocol=2)

        le_name = '{0}-{1}-{2}-label_encoder.pkl'.format(
            name, train_method, topic_method)
        le_path = os.path.join(base_dir, le_name)

        with open(le_path, 'wb') as f:
            pickle.dump(le, f, protocol=2)

    return train_score, test_score
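
A toy call to clf(), sketched under the assumption that project_info follows the example_project_info shape shown after Example #1, that data is a dense matrix of topic vectors, and that target has already been encoded by a fitted LabelEncoder; is_save=False avoids writing model files to disk.

import numpy as np
from sklearn.preprocessing import LabelEncoder

X = np.random.rand(200, 100)                 # 200 fake documents x 100 topics
tags = ['spam'] * 100 + ['ham'] * 100
le = LabelEncoder().fit(tags)
y = le.transform(tags)

train_score, test_score = clf(example_project_info, X, y, le, is_save=False)
print(train_score, test_score)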
Example #5
def recover_model(project_info):
    logger.log.info('start recovering lsi model ... ')

    name, _, topic_method, _, _, _ = get_project_config(project_info)

    assert topic_method in ['lsi', 'lda']

    tfidf_model = gensim_tfidf.GensimTfidf(name=name)
    tfidf_model.load_dictionary()
    tfidf_model.load_tfidf_model()

    if topic_method == 'lsi':
        tfidf_model.load_lsi_model()
    elif topic_method == 'lda':
        tfidf_model.load_lda_model()

    return tfidf_model
Example #6
def compute_lsi_lda(project_info,
                    recover_m=True,
                    recover_d=True,
                    recover_tfidf=False):

    name, dict_config, topic_method, topic_config, _, _ = get_project_config(
        project_info)

    pattern = './data/gensim/{0}/original_corpus/{0}-train-*-*.txt'.format(
        name)
    doc_path = glob(pattern)

    data = []

    num_topics = topic_config['num_topics']

    if recover_d:
        tfidf_model = gensim_tfidf.GensimTfidf(name=name)

        doc_path = [x.split('/')[-1] for x in doc_path]

        for doc in doc_path:
            _, _, tag, _ = doc[:-4].split('-')

            if topic_method == 'lsi':
                corpus_lsi = tfidf_model.load_lsi_corpus(doc[:-4], is_npy=True)
                assert corpus_lsi.shape[1] == num_topics
                data.append((corpus_lsi, tag))

            elif topic_method == 'lda':
                corpus_lda = tfidf_model.load_lda_corpus(doc[:-4], is_npy=True)
                assert corpus_lda.shape[1] == num_topics
                data.append((corpus_lda, tag))

        return data

    if recover_m:
        tfidf_model = recover_model(project_info)
        if topic_method == 'lsi':
            assert tfidf_model.lsi_model.num_topics == num_topics
        elif topic_method == 'lda':
            assert tfidf_model.lda_model.num_topics == num_topics
    else:
        tfidf_model = init_lsi_lda(project_info, recover_tfidf)

    for doc in doc_path:
        _, _, tag, _ = doc[:-4].split('-')
        corpus_name = doc.split('/')[-1][:-4]
        my_doc = gensim_tfidf.MyDoc(doc)

        if topic_method == 'lsi':
            corpus_tfidf = tfidf_model.compute_tfidf(my_doc)
            corpus_lsi = tfidf_model.compute_lsi(corpus_tfidf,
                                                 num_topics=num_topics,
                                                 is_dense=True,
                                                 is_npy_save=True,
                                                 corpus_name=corpus_name)
            data.append((corpus_lsi, tag))

        elif topic_method == 'lda':
            my_corpus = gensim_tfidf.MyCorpus(gensim_tfidf.MyDoc(doc),
                                              tfidf_model.dictionary)
            corpus_lda = tfidf_model.compute_lda(my_corpus,
                                                 num_topics=num_topics,
                                                 is_dense=True,
                                                 is_npy_save=True,
                                                 corpus_name=corpus_name)
            data.append((corpus_lda, tag))

    return data
Example #7
def train(project_info):
    start_time = datetime.now()

    name, dict_config, topic_method, topic_config, train_method, train_config = get_project_config(
        project_info)

    data = lsi.compute_lsi_lda(project_info,
                               recover_d=True,
                               recover_m=True,
                               recover_tfidf=True)

    targets = [x[1] for x in data]

    if train_method == 'xgb':
        le = LabelBinarizer()
    else:
        le = LabelEncoder()

    le.fit(targets)

    X_data = []
    Y_data = []
    for i in range(len(data)):
        x = data[i][0]
        assert np.shape(x)[1] == topic_config['num_topics']

        y = [targets[i]] * len(x)

        y = le.transform(y)

        X_data.append(x)
        Y_data.append(y)

    data = np.concatenate(X_data, axis=0)
    target = np.concatenate(Y_data, axis=0)

    train_score = None
    test_score = None

    if train_method == 'svm':
        train_score, test_score = svm.clf(project_info,
                                          data,
                                          target,
                                          le,
                                          is_save=True)

    # TODO
    elif train_method == 'xgb':
        pass

    elif train_method == 'logistic':
        train_score, test_score = logistic.clf(project_info,
                                               data,
                                               target,
                                               le,
                                               is_save=True)
        # train_score = logistic.logistic_clf(data, target, le, is_save=True)
        # cross_scores = logistic.logistic_cross(data, target)
    elif train_method == 'lsvm':
        train_score, test_score = linear_svm.clf(project_info,
                                                 data,
                                                 target,
                                                 le,
                                                 is_save=True)

    base_dir = './data/gensim/{}'.format(name)
    base_dir = os.path.join(base_dir, 'train')

    if not os.path.exists(base_dir):
        os.mkdir(base_dir)

    end_time = datetime.now()
    interval = str(end_time - start_time)

    train_date = start_time.strftime('%Y-%m-%d')
    res_name = '{0}-{1}-{2}-{3}-train.txt'.format(name, train_method,
                                                  topic_method, train_date)

    res_path = os.path.join(base_dir, res_name)

    with open(res_path, 'a') as f:
        f.write('time:\n')
        f.write('\tstart => %s\n' % start_time)
        f.write('\tend => %s\n' % end_time)
        f.write('\tinterval => %s\n\n' % interval)

        project_info = [
            '\t' + str(x) + ' => ' + str(y) for x, y in project_info.items()
            if isinstance(y, str)
        ]
        project_info = '\n'.join(project_info)
        f.write('project-info:\n')
        f.write(project_info)
        f.write('\n\n')

        dict_config = [
            '\t' + str(x) + ' => ' + str(y) for x, y in dict_config.items()
        ]
        dict_config = '\n'.join(dict_config)
        f.write('dictionary-config:\n')
        f.write(dict_config)
        f.write('\n\n')

        topic_config = [
            '\t' + str(x) + ' => ' + str(y) for x, y in topic_config.items()
        ]
        topic_config = '\n'.join(topic_config)
        f.write('topic-config:\n')
        f.write(topic_config)
        f.write('\n\n')

        train_config = [
            '\t' + str(x) + ' => ' + str(y) for x, y in train_config.items()
        ]
        train_config = '\n'.join(train_config)
        f.write('train-config:\n')
        f.write(train_config)
        f.write('\n\n')

        if train_score:
            f.write('score:\n')
            f.write('\ttrain_score => %s\n' % str(train_score))
            if test_score:
                f.write('\ttest_score => %s\n' % str(test_score))
            f.write('\n\n')

        f.write('*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#* \n')
        f.write('\n')