def make_doc(train_info, config_info):
    """Load raw text from the database, tokenize it or extract keywords,
    then save the result locally as txt files named after each tag.

    Arguments:
        train_info {dict/yaml} -- training and tokenization configuration
        config_info {dict/yaml} -- database and other configuration
    """
    name = train_info['train_info']['name']
    logger.log.info('start making doc, %s ... ' % name)
    db_info = config_info['yff_mysql']
    make_info = train_info['make_doc']
    len_threshold = make_info['len_threshold']
    is_keyword = make_info['is_keyword']
    tmp_dir = make_info.get('tmp_dir')  # jieba cache directory, avoids permission issues
    user_dict_path = make_info.get('user_dict_path')  # user dictionary, used for tokenization and keyword extraction
    stop_words_path = make_info.get(
        'stop_words_path') if is_keyword else None  # stop words, only used for keyword extraction
    idf_file_path = make_info.get(
        'idf_file_path') if is_keyword else None  # IDF dictionary, only used for keyword extraction
    tokenizer.jieba_init(tmp_dir=tmp_dir,
                         user_dict_path=user_dict_path,
                         stop_words_path=stop_words_path,
                         idf_file_path=idf_file_path)

    # load stop words to filter the tokenization results
    stop_words = tokenizer.get_stop_words(make_info.get('stop_words_path'))

    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    select_sql = 'SELECT DISTINCT tag_name FROM t_news_tag '\
                 'WHERE tag_id LIKE "06%";'
    ret, targets = recommend_db.TB_select(select_sql)
    targets = [x[0] for x in targets]

    doc_type = 'train'
    part_id = 1
    for tag in targets:
        news = load_news(recommend_db, tag)
        if len(news) > 0:
            text_list = [remove_tags(item[-1]) for item in news]
            _make_doc(name, text_list, doc_type, tag, part_id,
                      stop_words=stop_words,
                      is_keyword=is_keyword,
                      len_threshold=len_threshold)
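# --- Example (not from the project): minimal config structure expected by make_doc.
# The key names below are taken from the accesses in make_doc above; the concrete
# values, paths, and the fields inside 'yff_mysql' are illustrative assumptions only
# (the real layout of the MySQL block depends on MysqlCtrl).
example_train_info = {
    'train_info': {'name': 'business'},               # assumed project name
    'make_doc': {
        'len_threshold': 10,                          # assumed minimum text length to keep
        'is_keyword': False,                          # False: plain tokenization; True: keyword extraction
        'tmp_dir': './tmp/jieba',                     # assumed jieba cache dir
        'user_dict_path': './dict/user_dict.txt',     # assumed path
        'stop_words_path': './dict/stop_words.txt',   # assumed path
        'idf_file_path': './dict/idf.txt',            # assumed path
    },
}
example_config_info = {
    'yff_mysql': {                                    # field names are assumptions
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'reader',
        'password': '***',
        'db': 'news',
    },
}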
def main():
    log_file_path = './logs/clean_news'
    logger.logger_init(log_file_path)

    cfg_file_path = './conf/config.yaml'
    with open(cfg_file_path, 'r') as f:
        cfg_info = yaml.load(f.read())

    db_info = cfg_info['yff_mysql']
    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    news_list = load_sogou_news(recommend_db)
    # news_list = load_yff_news(recommend_db)
    # ret = insert_news(recommend_db, news_list)
    recommend_db.close()
def main():
    log_file_path = './logs/etl'
    logger.logger_init(log_file_path, stdout_level='info')

    train_yaml_path = './conf/train-business.yaml'
    config_yaml_path = './conf/config.yaml'
    with open(train_yaml_path, 'r') as f:
        project_info = yaml.load(f.read())
    with open(config_yaml_path, 'r') as f:
        config_info = yaml.load(f.read())

    db_info = config_info['recommend_mysql_r']
    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    # train-sogou.yaml
    # clean_news_sogou2.load_sogou_news(recommend_db)
    make_business_doc.make_doc(project_info, config_info)
def predict_test(config_info, train_info):
    recommend_db = MysqlCtrl(config_info['recommend_mysql_r'])
    recommend_db.connect()
    select_sql = 'SELECT news_id, content FROM t_news_corpus_latest LIMIT 10;'
    ret, news = recommend_db.TB_select(select_sql)

    tfidf_model, clf, le = predict.init_model(train_info)
    for news_id, content in news:
        pred = predict.predict(train_info, tfidf_model, clf, le, content)
        pred = le.inverse_transform(pred)
        # print(pred)
    recommend_db.close()
def predict_test(config_info):
    recommend_db = MysqlCtrl(config_info['recommend_mysql_r'])
    recommend_db.connect()
    select_sql = 'SELECT news_id, info_title, content FROM t_news_corpus_latest;'
    ret, news = recommend_db.TB_select(select_sql)

    res = []
    for news_id, title, content in news:
        titleContentKeywordMatch(news_id, title, content)

    # insert_sql = 'INSERT INTO t_result '\
    #              '(news_id, title, pred, content) '\
    #              'VALUES (%s, %s, %s, %s);'
    # recommend_db.TB_insert(insert_sql, res)
    recommend_db.close()
def predict_test(config_info, train_info):
    recommend_db = MysqlCtrl(config_info['recommend_mysql_r'])
    recommend_db.connect()
    select_sql = 'SELECT news_id, summary FROM t_baidu_search '\
                 'WHERE keyword_type = "business";'
    ret, news = recommend_db.TB_select(select_sql)

    tfidf_model, clf, le = predict.init_model(train_info)
    for news_id, content in news:
        pred = predict.predict(train_info, tfidf_model, clf, le, content)
        # pred = le.inverse_transform(pred)
        x = np.argmax(pred[0])  # must stay uncommented: x is used below
        if x == 0:
            print(news_id)
    recommend_db.close()
def make_doc(project_info, config_info):
    """Load raw text from the database, tokenize it or extract keywords,
    then save the result locally as txt files named after each tag.

    Arguments:
        project_info {dict/yaml} -- training and tokenization configuration
        config_info {dict/yaml} -- database and other configuration
    """
    name = project_info['name']
    logger.log.info('start making doc, %s ... ' % name)
    db_info = config_info['yff_mysql']
    make_info = project_info['make_doc']
    len_threshold = make_info['len_threshold']
    is_keyword = make_info['is_keyword']
    tmp_dir = make_info.get('tmp_dir')  # jieba cache directory, avoids permission issues
    user_dict_path = make_info.get('user_dict_path')  # user dictionary, used for tokenization and keyword extraction
    stop_words_path = make_info.get(
        'stop_words_path') if is_keyword else None  # stop words, only used for keyword extraction
    idf_file_path = make_info.get(
        'idf_file_path') if is_keyword else None  # IDF dictionary, only used for keyword extraction
    tokenizer.jieba_init(tmp_dir=tmp_dir,
                         user_dict_path=user_dict_path,
                         stop_words_path=stop_words_path,
                         idf_file_path=idf_file_path)

    # load stop words to filter the tokenization results
    stop_words = tokenizer.get_stop_words(make_info.get('stop_words_path'))

    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    count = 0
    date_end = datetime(2018, 7, 26, 0, 0, 0)
    while count < 20:
        count += 1
        doc_type = 'train'
        part_id = 1
        date_start = date_end - timedelta(days=1)
        tag = date_end.strftime('%Y_%m_%d')  # one document set per one-day window
        news = load_news(recommend_db, date_start, date_end)
        date_end = date_end - timedelta(days=1)
        if len(news) > 0:
            text_list = [item[-1] for item in news]
            _make_doc(name, text_list, doc_type, tag, part_id,
                      stop_words=stop_words,
                      is_keyword=is_keyword,
                      len_threshold=len_threshold)
def make_doc(project_info, config_info):
    """Load raw text from the database, tokenize it or extract keywords,
    then save the result locally as txt files named after each tag.

    Arguments:
        project_info {dict/yaml} -- training and tokenization configuration
        config_info {dict/yaml} -- database and other configuration
    """
    name = project_info['name']
    logger.log.info('start making doc, %s ... ' % name)
    db_info = config_info['yff_mysql']
    make_info = project_info['make_doc']
    len_threshold = make_info['len_threshold']
    is_keyword = make_info['is_keyword']
    tmp_dir = make_info.get('tmp_dir')  # jieba cache directory, avoids permission issues
    user_dict_path = make_info.get('user_dict_path')  # user dictionary, used for tokenization and keyword extraction
    stop_words_path = make_info.get(
        'stop_words_path') if is_keyword else None  # stop words, only used for keyword extraction
    idf_file_path = make_info.get(
        'idf_file_path') if is_keyword else None  # IDF dictionary, only used for keyword extraction
    tokenizer.jieba_init(tmp_dir=tmp_dir,
                         user_dict_path=user_dict_path,
                         stop_words_path=stop_words_path,
                         idf_file_path=idf_file_path)

    # load stop words to filter the tokenization results
    stop_words = tokenizer.get_stop_words(make_info.get('stop_words_path'))

    recommend_db = MysqlCtrl(db_info=db_info)
    ret = recommend_db.connect()
    if not ret:
        logger.log.error('connect to database error, exit')
        sys.exit(-1)

    select_sql = 'SELECT DISTINCT target FROM t_sogou_news;'
    ret, targets = recommend_db.TB_select(select_sql)
    targets = [x[0] for x in targets]

    doc_type = 'train'
    file_size = 10000
    # for tag in targets:
    tag = '非财经'  # "non-finance"
    news = load_news(recommend_db, '新闻')  # Sogou "news" category
    random.seed(10)
    news = random.sample(news, 50000)
    if len(news) > 0:
        part_id = 6
        file_num = len(news) // file_size
        if file_num * file_size < len(news):
            file_num += 1
        # was range(file_num + 1), which produced one empty trailing chunk
        for i in range(file_num):
            sub_news = news[i * file_size:(i + 1) * file_size]
            text_list = [remove_tags(item[-1]) for item in sub_news]
            _make_doc(name, text_list, doc_type, tag, part_id,
                      stop_words=stop_words,
                      is_keyword=is_keyword,
                      len_threshold=len_threshold)
            part_id += 1
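# --- Sanity check (not from the project) for the chunking arithmetic above:
# with 50,000 sampled articles and file_size = 10,000, the floor-division-plus-bump
# logic yields 5 chunks (part_id 6 through 10). chunk_count is an illustrative
# helper equivalent to that two-step computation, written as a ceiling division.
def chunk_count(n_items, chunk_size):
    # same as n_items // chunk_size, plus one more chunk if there is a remainder
    return (n_items + chunk_size - 1) // chunk_size

assert chunk_count(50000, 10000) == 5
assert chunk_count(50001, 10000) == 6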