def _feature_windows(words, feature_words, window=6):
    """Return the word windows that start at each feature-bearing word.

    Args:
        words: List of segmented words for one sentence.
        feature_words: Feature strings to look for; a word matches when any
            feature string is a substring of it.
        window: Maximum window length, counting the matching word itself.
            The default of 6 is the matching word plus up to 5 that follow.

    Returns:
        A list of lists. Each inner list starts at a matching word and is
        truncated at the end of the sentence. At most one window is produced
        per word, however many features it matches.
    """
    return [
        words[start:start + window]
        for start, word in enumerate(words)
        if any(feature_word in word for feature_word in feature_words)
    ]


def feature_about():
    """Collect the words that follow each feature occurrence in the demo news.

    Reads the first sheet of the workbook at RAW_NEWS_DEMO_PATH, splits the
    text in column 2 of every row into sentences, segments each sentence, and
    for every word containing one of the feature values records that word
    together with up to 5 following words. All windows are then written to
    FEATURE_ABOUT_PATH through CommonUtil.write_csv.

    The sentence, its segmented words and each window are printed as they are
    processed.
    """
    # Feature values to search for; the dict is not modified below, so the
    # values can be materialized once instead of once per word.
    feature_words = list(NewsUtil.get_feature().values())

    # NOTE(review): this message is identical to the one logged by
    # prepare_raw_news; left unchanged in case log consumers rely on it.
    logger.info("In Prepare Raw News...")
    raw_news_table = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH).sheet_by_index(0)

    segmentor = Segmentor()
    # The second argument is the path of the external lexicon file.
    segmentor.load_with_lexicon(cws_model_path, CFETSFX_LEXICON_PATH)

    feature_about_list = []
    try:
        for row in range(raw_news_table.nrows):
            news_content = raw_news_table.cell_value(row, 2)
            for sentence in SentenceSplitter.split(news_content):
                print(sentence)
                # Materialize the segmentation result once so it can be
                # printed and sliced as an ordinary list.
                words = list(segmentor.segment(sentence))
                print(words)
                for about_list in _feature_windows(words, feature_words):
                    feature_about_list.append(about_list)
                    print(about_list)
    finally:
        # Release the loaded model even when a row fails part-way through.
        segmentor.release()

    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
def prepare_feature():
    """Fill the module-level ``featureDict`` from the feature workbook.

    Opens the first sheet of the workbook at FEATURE_PATH and, skipping row 0,
    maps the value in column 0 of each row to the value in column 1. Existing
    entries with the same key are overwritten.
    """
    logger.info("In Prepare Feature...")

    sheet = CommonUtil.read_excel(FEATURE_PATH).sheet_by_index(0)
    total_rows = sheet.nrows

    # Row 0 is skipped; data rows start at index 1.
    for row in range(1, total_rows):
        name, label = sheet.cell_value(row, 0), sheet.cell_value(row, 1)
        featureDict[name] = label

    logger.info("Prepare Feature...Done!")
def get_feature():
    """Build and return a feature dictionary from the feature workbook.

    Opens the first sheet of the workbook at FEATURE_PATH and, skipping row 0,
    maps the value in column 0 of each row to the value in column 1
    (for example ``AAAA -> 黄金``).

    Returns:
        A new dict; when a key appears in several rows the last row wins.
    """
    sheet = CommonUtil.read_excel(FEATURE_PATH).sheet_by_index(0)
    total_rows = sheet.nrows

    # Row 0 is skipped; data rows start at index 1.
    return {
        sheet.cell_value(row, 0): sheet.cell_value(row, 1)
        for row in range(1, total_rows)
    }
def prepare_raw_news():
    """Append every row of the raw-news workbook to the module-level ``newsList``.

    Opens the first sheet of the workbook at RAW_NEWS_PATH and, for each row
    starting at row 0, appends a three-element list:
    ``[int(column 0), CommonUtil.get_datetime_from_cell(column 1), column 2]``.
    """
    logger.info("In Prepare Raw News...")

    sheet = CommonUtil.read_excel(RAW_NEWS_PATH).sheet_by_index(0)
    total_rows = sheet.nrows

    row = 0
    while row < total_rows:
        # Elements are evaluated left to right: index, timestamp, content.
        record = [
            int(sheet.cell_value(row, 0)),
            CommonUtil.get_datetime_from_cell(sheet.cell_value(row, 1)),
            sheet.cell_value(row, 2),
        ]
        newsList.append(record)
        row += 1

    logger.info("Prepare Raw News...Done!")