Example no. 1
def make_doc(train_info, config_info):
    """ 从数据库导入原始文本,再分词或提取关键词,最后根据标签名称以txt保存在本地

    Arguments:
        train_info {dict/yaml} -- 有关训练和分词的参数配置
        config_info {dict/yaml} --  有关数据库等的参数配置
    """

    name = train_info['train_info']['name']

    logger.log.info('start making doc, %s ... ' % name)

    db_info = config_info['yff_mysql']

    make_info = train_info['make_doc']

    len_threshold = make_info['len_threshold']
    is_keyword = make_info['is_keyword']

    tmp_dir = make_info.get('tmp_dir')  # jieba cache directory, avoids permission issues
    user_dict_path = make_info.get('user_dict_path')  # user dictionary, for tokenization and keyword extraction
    stop_words_path = make_info.get(
        'stop_words_path') if is_keyword else None  # stop words, used only for keyword extraction
    idf_file_path = make_info.get(
        'idf_file_path') if is_keyword else None  # IDF dictionary, used only for keyword extraction

    tokenizer.jieba_init(tmp_dir=tmp_dir,
                         user_dict_path=user_dict_path,
                         stop_words_path=stop_words_path,
                         idf_file_path=idf_file_path)

    # load stop words to filter the tokenization results
    stop_words = tokenizer.get_stop_words(make_info.get('stop_words_path'))

    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    select_sql = 'SELECT DISTINCT tag_name FROM t_news_tag '\
                 'WHERE tag_id LIKE "06%";'
    ret, targets = recommend_db.TB_select(select_sql)

    targets = [x[0] for x in targets]

    doc_type = 'train'
    part_id = 1
    for tag in targets:
        news = load_news(recommend_db, tag)
        if len(news) > 0:
            text_list = [remove_tags(item[-1]) for item in news]
            _make_doc(name,
                      text_list,
                      doc_type,
                      tag,
                      part_id,
                      stop_words=stop_words,
                      is_keyword=is_keyword,
                      len_threshold=len_threshold)

    recommend_db.close()
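
A minimal sketch of how this make_doc could be driven. The yaml file names are taken from Examples 2 and 3; the key layout (a top-level train_info block with name, plus the make_doc and yff_mysql blocks) is inferred from the lookups above.

import yaml

def run_make_doc():
    with open('./conf/train-business.yaml', 'r') as f:
        train_info = yaml.safe_load(f)   # expected to carry train_info.name and the make_doc block
    with open('./conf/config.yaml', 'r') as f:
        config_info = yaml.safe_load(f)  # expected to carry the yff_mysql block
    make_doc(train_info, config_info)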
Example no. 2
def main():

    log_file_path = './logs/clean_news'
    logger.logger_init(log_file_path)

    cfg_file_path = './conf/config.yaml'
    with open(cfg_file_path, 'r') as f:
        cfg_info = yaml.safe_load(f)  # yaml.load without an explicit Loader is unsafe and rejected by PyYAML >= 6

    db_info = cfg_info['yff_mysql']
    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    news_list = load_sogou_news(recommend_db)
    # news_list = load_yff_news(recommend_db)

    # ret = insert_news(recommend_db, news_list)

    recommend_db.close()
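
None of these snippets show MysqlCtrl itself; below is a minimal sketch of the contract they rely on (connect() returning a bool, TB_select returning a (ret, rows) pair, close()). Writing it against pymysql is purely an assumption; the real internals are not visible here.

import pymysql

class MysqlCtrlSketch:
    """Stand-in for MysqlCtrl, reconstructed from call sites only."""

    def __init__(self, db_info):
        self.db_info = db_info  # assumed to match pymysql.connect keyword arguments
        self.conn = None

    def connect(self):
        try:
            self.conn = pymysql.connect(**self.db_info)
            return True
        except pymysql.MySQLError:
            return False

    def TB_select(self, sql):
        with self.conn.cursor() as cur:
            cur.execute(sql)
            return True, cur.fetchall()

    def close(self):
        if self.conn:
            self.conn.close()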
Example no. 3
def main():

    log_file_path = './logs/etl'
    logger.logger_init(log_file_path, stdout_level='info')

    train_yaml_path = './conf/train-business.yaml'
    config_yaml_path = './conf/config.yaml'

    with open(train_yaml_path, 'r') as f:
        project_info = yaml.safe_load(f)

    with open(config_yaml_path, 'r') as f:
        config_info = yaml.safe_load(f)

    db_info = config_info['recommend_mysql_r']
    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    # train-sogou.yaml
    # clean_news_sogou2.load_sogou_news(recommend_db)
    make_business_doc.make_doc(project_info, config_info)

    recommend_db.close()
Example no. 4
def predict_test(config_info, train_info):
    recommend_db = MysqlCtrl(config_info['recommend_mysql_r'])
    recommend_db.connect()

    select_sql = 'SELECT news_id, content FROM t_news_corpus_latest LIMIT 10;'

    ret, news = recommend_db.TB_select(select_sql)

    tfidf_model, clf, le = predict.init_model(train_info)

    for news_id, content in news:
        pred = predict.predict(train_info, tfidf_model, clf, le, content)
        pred = le.inverse_transform(pred)

        print(pred)

    recommend_db.close()
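
Example 4 decodes class indices with le.inverse_transform. If le is a scikit-learn LabelEncoder (an assumption; only its usage is visible), the round trip works like this:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(['finance', 'sports', 'tech'])       # classes are sorted: finance=0, sports=1, tech=2
print(le.transform(['tech', 'finance']))    # -> [2 0]
print(le.inverse_transform([2, 0]))         # -> ['tech' 'finance']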
Example no. 5
def predict_test(config_info):
    recommend_db = MysqlCtrl(config_info['recommend_mysql_r'])
    recommend_db.connect()

    select_sql = 'SELECT news_id, info_title, content FROM t_news_corpus_latest;'

    ret, news = recommend_db.TB_select(select_sql)

    res = []  # only consumed by the insert below, which is currently disabled
    for news_id, title, content in news:
        titleContentKeywordMatch(news_id, title, content)

    # insert_sql = 'INSERT INTO t_result '\
    #              '(news_id, title, pred, content) '\
    #              'VALUES (%s, %s, %s, %s);'

    # recommend_db.TB_insert(insert_sql, res)

    recommend_db.close()
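
If the disabled insert were re-enabled, res would need one 4-tuple per row to match the (news_id, title, pred, content) placeholders. A hypothetical row shape (TB_insert internals are not shown; with pymysql this would map onto cursor.executemany):

rows = [
    (1001, 'some title', 'some pred', 'article body ...'),  # hypothetical values
]
insert_sql = ('INSERT INTO t_result (news_id, title, pred, content) '
              'VALUES (%s, %s, %s, %s);')
# cursor.executemany(insert_sql, rows); conn.commit()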
Example no. 6
def predict_test(config_info, train_info):
    recommend_db = MysqlCtrl(config_info['recommend_mysql_r'])
    recommend_db.connect()

    select_sql = 'SELECT news_id, summary FROM t_baidu_search '\
                 'WHERE keyword_type = "business";'

    ret, news = recommend_db.TB_select(select_sql)

    tfidf_model, clf, le = predict.init_model(train_info)

    for news_id, content in news:
        pred = predict.predict(train_info, tfidf_model, clf, le, content)
        # pred = le.inverse_transform(pred)
        x = np.argmax(pred[0])  # index of the highest-scoring class
        if x == 0:
            print(news_id)

    recommend_db.close()
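
Example 6 treats pred as a batch of per-class scores of shape (1, n_classes) and keeps the rows whose top-scoring class is 0:

import numpy as np

pred = np.array([[0.7, 0.2, 0.1]])  # hypothetical score vector for one document
print(np.argmax(pred[0]))           # -> 0, so this news_id would be printed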
Example no. 7
def make_doc(project_info, config_info):
    """ 从数据库导入原始文本,再分词或提取关键词,最后根据标签名称以txt保存在本地

    Arguments:
        train_info {dict/yaml} -- 有关训练和分词的参数配置
        config_info {dict/yaml} --  有关数据库等的参数配置
    """

    name = project_info['name']

    logger.log.info('start making doc, %s ... ' % name)

    db_info = config_info['yff_mysql']

    make_info = project_info['make_doc']

    len_threshold = make_info['len_threshold']
    is_keyword = make_info['is_keyword']

    tmp_dir = make_info.get('tmp_dir')  # jieba cache directory, avoids permission issues
    user_dict_path = make_info.get('user_dict_path')  # user dictionary, for tokenization and keyword extraction
    stop_words_path = make_info.get(
        'stop_words_path') if is_keyword else None  # stop words, used only for keyword extraction
    idf_file_path = make_info.get(
        'idf_file_path') if is_keyword else None  # IDF dictionary, used only for keyword extraction

    tokenizer.jieba_init(tmp_dir=tmp_dir,
                         user_dict_path=user_dict_path,
                         stop_words_path=stop_words_path,
                         idf_file_path=idf_file_path)

    # load stop words to filter the tokenization results
    stop_words = tokenizer.get_stop_words(make_info.get('stop_words_path'))

    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    count = 0
    date_end = datetime(2018, 7, 26, 0, 0, 0)

    while count < 20:
        count += 1

        doc_type = 'train'
        part_id = 1

        date_start = date_end - timedelta(days=1)

        tag = date_end.strftime('%Y_%m_%d')

        news = load_news(recommend_db, date_start, date_end)

        date_end = date_start  # step the window back one day

        if len(news) > 0:
            text_list = [item[-1] for item in news]
            _make_doc(name, text_list, doc_type, tag, part_id,
                      stop_words=stop_words, is_keyword=is_keyword,
                      len_threshold=len_threshold)

    recommend_db.close()
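
The loop above slides a one-day window backwards from a fixed end date; the same windows can be listed up front (dates match the code above):

from datetime import datetime, timedelta

date_end = datetime(2018, 7, 26)
for _ in range(20):
    date_start = date_end - timedelta(days=1)
    print(date_start.strftime('%Y_%m_%d'), '->', date_end.strftime('%Y_%m_%d'))
    date_end = date_start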
Example no. 8
def make_doc(project_info, config_info):
    """ 从数据库导入原始文本,再分词或提取关键词,最后根据标签名称以txt保存在本地

    Arguments:
        train_info {dict/yaml} -- 有关训练和分词的参数配置
        config_info {dict/yaml} --  有关数据库等的参数配置
    """

    name = project_info['name']

    logger.log.info('start making doc, %s ... ' % name)

    db_info = config_info['yff_mysql']

    make_info = project_info['make_doc']

    len_threshold = make_info['len_threshold']
    is_keyword = make_info['is_keyword']

    tmp_dir = make_info.get('tmp_dir')  # jieba cache directory, avoids permission issues
    user_dict_path = make_info.get('user_dict_path')  # user dictionary, for tokenization and keyword extraction
    stop_words_path = make_info.get(
        'stop_words_path') if is_keyword else None  # stop words, used only for keyword extraction
    idf_file_path = make_info.get(
        'idf_file_path') if is_keyword else None  # IDF dictionary, used only for keyword extraction

    tokenizer.jieba_init(tmp_dir=tmp_dir,
                         user_dict_path=user_dict_path,
                         stop_words_path=stop_words_path,
                         idf_file_path=idf_file_path)

    # load stop words to filter the tokenization results
    stop_words = tokenizer.get_stop_words(make_info.get('stop_words_path'))

    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    select_sql = 'SELECT DISTINCT target FROM t_sogou_news;'

    ret, targets = recommend_db.TB_select(select_sql)

    targets = [x[0] for x in targets]

    doc_type = 'train'
    file_size = 10000

    # for tag in targets:
    tag = '非财经'  # the 'non-finance' bucket
    news = load_news(recommend_db, '新闻')  # sample from the generic 'news' target
    random.seed(10)
    news = random.sample(news, 50000)

    if len(news) > 0:
        part_id = 6
        file_num = len(news) // file_size
        if file_num * file_size < len(news):
            file_num += 1

        for i in range(file_num):  # file_num already rounds up; range(file_num + 1) would add an empty slice

            sub_news = news[i * file_size:(i + 1) * file_size]

            text_list = [remove_tags(item[-1]) for item in sub_news]
            _make_doc(name,
                      text_list,
                      doc_type,
                      tag,
                      part_id,
                      stop_words=stop_words,
                      is_keyword=is_keyword,
                      len_threshold=len_threshold)

            part_id += 1

    recommend_db.close()
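
The chunking in Example 8 is ceiling division plus slicing; a compact equivalent that never yields an empty trailing chunk:

def chunks(items, size):
    for i in range(0, len(items), size):
        yield items[i:i + size]

news = list(range(25000))
for part_id, sub_news in enumerate(chunks(news, 10000), start=6):  # part_id starts at 6, as above
    print(part_id, len(sub_news))  # -> 6 10000 / 7 10000 / 8 5000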