def __init__(self):
    """Prepare the dictionaries, stop words, stock table and tokenizer.

    The custom segmentation dictionaries are initialised first; the
    tokenizer is built last from the data-processing helper and the
    stop words created here.
    """
    # Load the custom word-segmentation dictionaries.
    dicts.init()

    self.data_process = DataPressing()

    # Stop words handed to the tokenizer below.
    self.stop_words = load_stop_words()

    # load_stock_data() yields a pair; only its second element is kept.
    # Per the original notes this is the stock-name / stock-code table
    # (with some normalisation applied to the codes) -- TODO confirm.
    _ignored, stocks_table = dicts.load_stock_data()
    self.stocks_df = stocks_table

    self.tokenizer = Tokenizer(self.data_process, self.stop_words)
def cut_process(text):
    """Tokenize ``text`` and extract the stock entities found in it.

    Relies on the module-level ``tokenizer``, ``data_process`` and
    ``stocks_df`` objects.

    :param text: raw text to process
    :return: the result of ``data_process.find_stocks`` for the tokens
        (per the original notes, the mentioned stocks as stock codes)
    """
    # The dictionaries are (re-)initialised on every call, as before.
    dicts.init()
    tokens = tokenizer.token(text)
    # Look the tokens up against the stock table.
    return data_process.find_stocks(tokens, stocks_df)
def load_data_test():
    """Tokenize every news title and body and dump the tokens to ``text.txt``.

    For each row returned by ``get_data()`` the title and then the content
    are cleaned with ``no_remove``; texts rejected by ``useless_filter`` are
    dropped and the rest are tokenized. Each token list is written as one
    comma-separated line.

    :return: None
    """
    dp = data_process.DataPressing()
    # Load the custom dictionaries before any tokenizing happens.
    dicts.init()
    stop_words = load_stop_words()
    tk = tokenization.Tokenizer(dp, stop_words)

    # NOTE: the original author warns that the get_data() interface has
    # changed; check it before relying on this function.
    df_result = get_data()

    res_lists = []
    for _, row in df_result.iterrows():
        # Title first, then content, so the output order is unchanged.
        for field in ("title", "content"):
            text = row[field]
            if not text:
                continue
            text = dp.no_remove(text)
            if not dp.useless_filter(text, dicts.stock_dict):
                res_lists.append(tk.token(text))

    # Encode once at the file boundary: concatenating the result of
    # ``str.encode`` with "\n" raises TypeError on Python 3. The context
    # manager closes the file even if a write fails.
    with open("text.txt", "w", encoding="utf-8") as file_out:
        for tokens in res_lists:
            file_out.write(",".join(tokens) + "\n")
def d_test():
    """Smoke-test the cleaning helpers and the tokenizer on sample text.

    Prints the output of ``no_remove`` and ``useless_contain`` for two
    sample strings, then tokenizes a headline and prints the result type
    followed by every token.
    """
    cleaner = data_process.DataPressing()
    dicts.init()
    tk = Tokenizer(cleaner, load_stop_words())

    # Strip the noise fragments from a sample post.
    noisy_text = "【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的, 关注同花顺财经(ths58), 获取更多机会。"
    print(cleaner.no_remove(noisy_text))

    # Check whether the text contains certain special words.
    flagged_text = "[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的"
    print(cleaner.useless_contain(flagged_text))

    # Run the full tokenizer on a headline (per the original notes this
    # covers stop-word removal, noise removal and segmentation).
    result = tk.token("加多宝重推红罐 是否能再与王老吉争锋")
    print('Type of result: {}。'.format(type(result)))
    for token in result:
        print(token)
def _cut_sentence(self, sentence):
    """Tokenize ``sentence`` and store the tokens on ``self.word_list``.

    :param sentence: text to tokenize
    :return: None
    """
    # NOTE: the original author flagged that this per-call setup needs to
    # be revisited when the method is used with multiprocessing.
    processor = data_process.DataPressing()
    dicts.init()
    stop_words = tokenization.load_stop_words()
    self.word_list = tokenization.Tokenizer(processor, stop_words).token(sentence)
def multi_token_test():
    """Compare serial and multi-process timing of ``paralize_test``.

    Runs ``paralize_test`` 10000 times in a plain loop, then submits the
    same 10000 calls to a process pool, and prints the elapsed time of
    each phase.

    :return: None
    :raises Exception: whatever a worker task raised, re-raised when the
        pooled results are collected
    """
    import multiprocessing as mp
    import time
    from multiprocessing import Pool

    s = ('程序员(英文Programmer)是从事程序开发、维护的专业人员。'
         '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,'
         '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。')
    dataprocess = data_process.DataPressing()
    dicts.init()
    stop_words = load_stop_words()
    rounds = 10000

    # Serial run.
    t0 = time.time()
    for _ in range(rounds):
        paralize_test(s, dataprocess, stop_words)
    print("串行处理花费时间{t}s".format(t=time.time() - t0))

    # Parallel run.
    t1 = time.time()
    # Use roughly 80% of the cores, but never fewer than one worker:
    # Pool(processes=0) raises ValueError on a single-core host.
    workers = max(1, int(mp.cpu_count() * 0.8))
    pool = Pool(processes=workers)
    # Pass the same three positional arguments as the serial call. Wrapping
    # them in an extra tuple handed the worker a single argument, so every
    # task failed without anyone noticing.
    pending = [
        pool.apply_async(paralize_test, (s, dataprocess, stop_words))
        for _ in range(rounds)
    ]
    pool.close()
    pool.join()
    # Collect the results so that a worker-side exception surfaces here
    # instead of being dropped, which would make the timing meaningless.
    for task in pending:
        task.get()
    print("并行处理花费时间{t}s".format(t=time.time() - t1))
def data_save():
    """Tokenize the news from the database and write two corpus files.

    Rows that have both a title and a content are tokenized (title and
    content concatenated); rows rejected by ``useless_filter`` are dropped.
    Two tab-separated files are then written:

    * ``conf.corpus_news``: news id, timestamp, space-joined tokens
    * ``conf.corpus_news_title``: news id, timestamp, title

    Per the original notes the output feeds single-pass clustering of
    historical events, word-vector training and keyword extraction.

    :return: None
    """
    dp = data_process.DataPressing()
    # Load the custom dictionaries before any tokenizing happens.
    dicts.init()
    stop_words = load_stop_words()
    # Every other caller in this code base builds the tokenizer from
    # (data processor, stop words); the extra ``dict_init`` argument that
    # used to be passed here did not match that signature.
    tk = tokenization.Tokenizer(dp, stop_words)

    df_result = get_data()

    # One (news_id, title, tokens, unix_time) tuple per kept article.
    res_lists = []
    for i in tqdm(range(len(df_result))):
        row = df_result.iloc[i]  # fetch the row once instead of per column
        title, content = row['title'], row['content']
        if not (content and title):
            continue
        news_id = row['id'].strip()
        title = title.strip()
        string_list = tk.token(title + content.strip())
        if not dp.useless_filter(string_list, dicts.stock_dict):
            res_lists.append((news_id, title, string_list, row['unix_time']))

    logging.logger.info("提取的文章的个数: %s" % len(res_lists))

    # Encode at the file boundary: concatenating the result of
    # ``str.encode`` with "\n" raises TypeError on Python 3. The context
    # managers close the files even if a write fails.
    with open(conf.corpus_news, "w", encoding="utf-8") as file_out:
        for news_id, _, tokens, unix_time in res_lists:
            file_out.write(
                str(news_id) + "\t" + str(unix_time) + "\t" +
                " ".join(tokens) + "\n")

    with open(conf.corpus_news_title, "w", encoding="utf-8") as file_out:
        for news_id, title, _, unix_time in res_lists:
            file_out.write(
                str(news_id) + "\t" + str(unix_time) + "\t" + title + "\n")
def d_test():
    """Exercise the ``TextRank`` keyword extractor on a sample article.

    The article is tokenized, ``TextRank(top_k=15)`` is run on the tokens,
    the number of results is logged and the results are printed: words and
    weights on separate lines when ``withWeight`` is set, otherwise the
    items themselves.

    :return: None
    """
    s = ('中兴通讯(000063)在经历七个一字跌停板后,于今天打开跌停板。债转股开盘大涨,天津普林(002134)、信达地产(600657)'
         '、海德股份(000567)集体涨停,长航凤凰(000520)、浙江东方(600120)、陕国投A(000563)大涨,消息面上,'
         '央行宣布定向降准0.5个百分点,将重点支持债转股。中兴通讯机构最低估值12.02元/股在复牌之前,'
         '多家基金公司对中兴通讯估值大多调整至20.54元/股。连续7个跌停板后,中兴通讯A股股价早就已经跌穿这一价格。'
         '据《中国经营报》记者不完全统计,6月20日~22日,多家基金公司再做出调整中兴通讯A股估值的公告,下调公司包括工银瑞信基金、'
         '华泰柏瑞基金、东方基金、大摩华鑫基金、融通基金、大成基金等22家基金公司。值得注意的是,此次基金公司估值下调幅度并不一致,'
         '调整估值在每股12.02~16.64元之间。其中,大摩华鑫基金、融通基金和安信基金给出的估值最高,为每股16.64元,而工银瑞信基金、'
         '富国基金和泰达宏利基金给出的估值最低,为每股12.02元。关注同花顺财经(ths518),获取更多机会')

    dp = data_process.DataPressing()
    dicts.init()
    stop_words = tokenization.load_stop_words()
    tokens = tokenization.Tokenizer(dp, stop_words).token(s)

    # The keyword count is fixed at 15 rather than derived from the
    # length of the token list.
    text_rank = TextRank(top_k=15)
    res = text_rank.run(tokens)
    logging.logger.info("提取的%s个关键词: " % len(res))

    if not text_rank.withWeight:
        print(",".join(str(item) for item in res))
        return

    # Weighted results are indexed as (word, weight): print each column.
    print(",".join(item[0] for item in res))
    print(",".join(str(item[1]) for item in res))
def multi_extract_test():
    """Time ``parallel_test`` serially and through a process pool.

    A sample sentence is tokenized once; ``parallel_test`` is then called
    10000 times in a loop and 10000 times via ``Pool.apply_async``. Both
    elapsed times are written to the log.

    :return: None
    """
    import multiprocessing as mp
    import time
    from multiprocessing import Pool

    s = ('程序员(英文Programmer)是从事程序开发、维护的专业人员。'
         '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,'
         '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。')
    rounds = 10000

    dp = data_process.DataPressing()
    dicts.init()
    stop_words = tokenization.load_stop_words()
    # Tokenize once; both timing phases reuse the same token list.
    s_list = tokenization.Tokenizer(dp, stop_words).token(s)

    serial_start = time.time()
    for _ in range(rounds):
        parallel_test(s_list)
    logging.logger.info(
        "串行处理花费时间{t}".format(t=time.time() - serial_start))

    # The pool is created before the clock starts, as in the original.
    pool = Pool(processes=int(mp.cpu_count()))
    parallel_start = time.time()
    # The async results are held but never fetched.
    pending = [
        pool.apply_async(parallel_test, (s_list, )) for _ in range(rounds)
    ]
    pool.close()
    pool.join()
    logging.logger.info(
        "并行处理花费时间{t}s".format(t=time.time() - parallel_start))
stock_dict.append(stock.strip("\n")) stocks_df = pd.read_csv(st_new_path, encoding='utf-8') # stock_df.append(stocks_df.set_index('SESNAME')) for index, row in stocks_df.iterrows(): stock_dict.append(row.SESNAME) stock_dict.append(row.SYMBOL) return stock_dict, stocks_df _, stocks_df = load_stock_data() # 识别评论中的股票实体。 # 对讨论进行分词,然后提取评论中的股票实体。 data_process = DataPressing() dict_init = dicts.init() stop_words = load_stop_words() tokenizer = Tokenizer(data_process, stop_words) # 整理股票代码 stocks_df = stocks_df.set_index('SESNAME') # print('stocks_df %s' % stocks_df) def cut_process(text): """ 数据处理模块, 分词、提取股票实体词 :param text: :return: """
# Fetch the news, ordered, starting from the latest known event time.
today_timestamp = int(latest_event_time)
today = time_util.timestamp_to_time(today_timestamp)
logging.logger.info('读取新闻的起始时间: {}'.format(today))
ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp)

# Load the TF-IDF vector-space model; both paths come from the
# configuration module.
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path
tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)

# Set up the data-processing helper, the dictionaries, the stop words and
# the tokenizer.
dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(
), tokenization.load_stop_words()
tk = tokenization.Tokenizer(dp, stop_words)

# Convert the DataFrame rows into the news list used downstream.
ordered_news_lists = data_reader.trans_df_data(ordered_df, tfidf_feature,
                                               tfidf_transformer, dp, tk)

# If there is no new news, log it and exit: the event units do not need
# to be updated.
if len(ordered_news_lists) <= 0:
    logging.logger.info('[事件库未更新]: 今天没有新新闻,事件单元不更新')
    sys.exit()