def load_data_test():
    dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), load_stop_words()
    # Tokenizer.
    tk = tokenization.Tokenizer(dp, stop_words)
    # Fetch all news from the three tables.
    df_result = get_data()  # NOTE: the interface has changed, take care when calling it.
    res_lists = []
    for index, row in df_result.iterrows():
        title, content = row["title"], row["content"]
        if title:
            title = dp.no_remove(title)
            if not dp.useless_filter(title, dicts.stock_dict):
                title_list = tk.token(title)
                res_lists.append(title_list)
        if content:
            content = dp.no_remove(content)
            if not dp.useless_filter(content, dicts.stock_dict):
                content_list = tk.token(content)
                res_lists.append(content_list)

    file_out = open("text.txt", "w")
    for token_list in res_lists:
        item = ",".join(token_list)
        file_out.write(item.encode("utf8") + "\n")
    file_out.close()
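# A minimal stand-in for get_data(), useful for exercising load_data_test()
# without a database connection. This is only a sketch under assumptions: the
# real get_data() reads three news tables and its exact columns may differ;
# here we only assume it returns a pandas DataFrame with "title" and "content"
# columns, which is what load_data_test() actually consumes. The stub name is
# hypothetical and not part of the original module.
import pandas as pd


def get_data_stub():
    """Hypothetical replacement for get_data(); returns a tiny DataFrame."""
    rows = [
        {"title": u"中兴通讯复牌", "content": u"中兴通讯今日打开跌停板。"},
        {"title": u"央行定向降准", "content": u"央行宣布定向降准0.5个百分点。"},
    ]
    return pd.DataFrame(rows, columns=["title", "content"])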
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help="Input raw text file.")
    parser.add_argument("--output_file", type=str, required=True,
                        help="Output MindRecord file.")
    parser.add_argument("--num_splits", type=int, default=1,
                        help="Number of partitions the MindRecord file will be split into.")
    parser.add_argument("--max_length", type=int, required=True,
                        help="Maximum sequence length.")
    parser.add_argument("--vocab_file", type=str, required=True,
                        help="URL of gpt2-vocab.json.")
    parser.add_argument("--merge_file", type=str, required=True,
                        help="URL of gpt2-merges.txt.")
    parser.add_argument("--mode", type=str, required=True, default="cnn_dailymail",
                        help="Mode of dataset creation.")
    args = parser.parse_args()

    tokenizer = tokenization.Tokenizer(vocab_file=args.vocab_file,
                                       merge_file=args.merge_file,
                                       mode=args.mode)

    input_file = args.input_file
    logging.info("***** Reading from input files *****")
    logging.info("Input File: %s", input_file)

    output_file = args.output_file
    logging.info("***** Writing to output files *****")
    logging.info("Output File: %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
                   "input_mask": {"type": "int64", "shape": [-1]},
                   "label_ids": {"type": "int64", "shape": [-1]}}
    writer.add_schema(data_schema, "wikitext2-schema")

    total_written = 0
    total_read = 0

    logging.info("***** Reading from %s *****", input_file)
    with open(input_file, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            total_read += 1
            if total_read % 500 == 0:
                logging.info("%d ...", total_read)

            output = create_instance(tokenizer, line, args.max_length)
            features = write_instance_to_file(writer, instance=output)
            total_written += 1

            if total_written <= 20:
                logging.info("***** Example *****")
                logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
                logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))
                for feature_name in features.keys():
                    feature = features[feature_name]
                    logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
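# main() relies on create_instance() and write_instance_to_file(), which are not
# shown in this section. The sketches below illustrate one plausible shape for
# them, matching the schema registered above (input_ids, input_mask, label_ids).
# The tokenizer.encode() call, the pad id of 0 and the "_sketch" names are
# assumptions for illustration, not the project's actual implementation;
# FileWriter.write_raw_data() is the standard MindRecord call for writing rows.
import numpy as np


def create_instance_sketch(tokenizer, text, max_length):
    """Tokenize one line and pad/truncate it to max_length."""
    ids = tokenizer.encode(text.strip())[:max_length]        # assumed encode() API
    mask = [1] * len(ids) + [0] * (max_length - len(ids))    # 1 marks real tokens
    ids = ids + [0] * (max_length - len(ids))                # assumed pad id 0
    return {"input_ids": ids, "input_mask": mask}


def write_instance_to_file_sketch(writer, instance):
    """Pack one instance into the registered schema and write it."""
    features = {"input_ids": np.asarray(instance["input_ids"], dtype=np.int64),
                "input_mask": np.asarray(instance["input_mask"], dtype=np.int64),
                # For language modeling the labels are typically the inputs themselves,
                # shifted by one position at training time (as the example logging above shows).
                "label_ids": np.asarray(instance["input_ids"], dtype=np.int64)}
    writer.write_raw_data([features])  # write_raw_data takes a list of rows
    return features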
def _cut_sentence(self, sentence):
    """
    Tokenize the sentence into a word list.
    :return:
    """
    # This needs to be adapted when running with multiprocessing.
    dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words()
    tk = tokenization.Tokenizer(dp, stop_words)
    self.word_list = tk.token(sentence)
def data_save():
    """
    Read the news from the database, preprocess the text and save it locally.
    The output is mainly used by singlepass for clustering historical events,
    for word-vector training, keyword extraction, etc.
    :return:
    """
    dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), load_stop_words()
    tk = tokenization.Tokenizer(dp, stop_words)  # tokenizer
    df_result = get_data()
    # df_result.ix[:, ["content"]].apply(tk.token)

    # Extract the title and content columns of the DataFrame and preprocess them.
    # Option 1: save the title and content as a single news item; both the title
    # and the content must be present.
    res_lists = []
    for i in tqdm(range(len(df_result))):
        news_id = df_result.iloc[i]["id"]
        title = df_result.iloc[i]["title"]
        content = df_result.iloc[i]["content"]
        unix_time = df_result.iloc[i]["unix_time"]
        if content and title:
            news_id = news_id.strip()
            title = title.strip()
            string = title + content.strip()
            string_list = tk.token(string)
            if not dp.useless_filter(string_list, dicts.stock_dict):
                # string_list = keywords_extractor.parallel_test(string_list)  # extract keywords
                res_lists.append((news_id, title, string_list, unix_time))  # tuple in the format described above
                # res_lists.append((string, unix_time))

    logging.logger.info("Number of extracted articles: %s" % len(res_lists))

    # Save the news id, publish time and tokenized content: [news_id, timestamp, contents]
    # file_out = open("./data/text_full_index.txt", "w")
    file_out = open(conf.corpus_news, "w")
    for index, content in enumerate(res_lists):
        item = " ".join(content[2])
        file_out.write(str(content[0]) + "\t" + str(content[3]) + "\t" + item.encode("utf8") + "\n")
    file_out.close()

    # Save the news id, publish time and news title: [news_id, timestamp, title]
    # file_out = open("./data/text_title_index.txt", "w")
    file_out = open(conf.corpus_news_title, "w")
    for index, content in enumerate(res_lists):
        file_out.write(str(content[0]) + "\t" + str(content[3]) + "\t" + content[1] + "\n")
    file_out.close()
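# A small reader for the corpus file that data_save() writes, showing the
# on-disk contract: one news item per line, "news_id \t unix_time \t
# space-joined tokens". This helper is not part of the original module; the
# function name and the io.open/utf-8 handling are assumptions for illustration.
import io


def load_corpus_news(path):
    """Yield (news_id, unix_time, token_list) tuples from a data_save() output file."""
    with io.open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            if len(parts) != 3:
                continue  # skip malformed lines
            news_id, unix_time, tokens = parts
            yield news_id, unix_time, tokens.split(" ")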
def d_test():
    """
    Test of the class interface.
    :return:
    """
    # s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \
    #     '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \
    #     '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。'
    # s = '【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,' \
    #     '还是注册制的, 关注同花顺财经(ths58), 获取更多机会。'
    s = '中兴通讯(000063)在经历七个一字跌停板后,于今天打开跌停板。债转股开盘大涨,天津普林(002134)、信达地产(600657)' \
        '、海德股份(000567)集体涨停,长航凤凰(000520)、浙江东方(600120)、陕国投A(000563)大涨,消息面上,' \
        '央行宣布定向降准0.5个百分点,将重点支持债转股。中兴通讯机构最低估值12.02元/股在复牌之前,' \
        '多家基金公司对中兴通讯估值大多调整至20.54元/股。连续7个跌停板后,中兴通讯A股股价早就已经跌穿这一价格。' \
        '据《中国经营报》记者不完全统计,6月20日~22日,多家基金公司再做出调整中兴通讯A股估值的公告,下调公司包括工银瑞信基金、' \
        '华泰柏瑞基金、东方基金、大摩华鑫基金、融通基金、大成基金等22家基金公司。值得注意的是,此次基金公司估值下调幅度并不一致,' \
        '调整估值在每股12.02~16.64元之间。其中,大摩华鑫基金、融通基金和安信基金给出的估值最高,为每股16.64元,而工银瑞信基金、' \
        '富国基金和泰达宏利基金给出的估值最低,为每股12.02元。关注同花顺财经(ths518),获取更多机会'
    # s = u"水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
    #     u"根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,\n" \
    #     u"有部分省超过红线的指标。对一些超过红线的地方,\n陈明忠表示,对一些取用水项目进行区域的限批," \
    #     u"严格地进行水资源论证和取水许可的批准。"

    dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words()
    tk = tokenization.Tokenizer(dp, stop_words)
    s_list = tk.token(s)

    # The number of keywords could also be derived dynamically from the sentence length:
    # top_k = int(len(s_list) * 0.1)
    # text_rank = TextRank(s_list, top_k=15, with_weight=True)
    text_rank = TextRank(top_k=15)
    res = text_rank.run(s_list)
    logging.logger.info("Extracted %s keywords: " % len(res))
    if text_rank.withWeight:
        print(",".join(item[0] for item in res))
        print(",".join(str(item[1]) for item in res))
    else:
        print(",".join(str(item) for item in res))
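# The TextRank class used above is the project's own; the function below is a
# stand-alone sketch of the underlying idea only (PageRank over a word
# co-occurrence graph), not the project's implementation. The window size,
# damping factor and iteration count are conventional defaults assumed here.
from collections import defaultdict


def textrank_keywords_sketch(tokens, top_k=15, window=5, d=0.85, iters=30):
    """Rank words in a token list with a simple TextRank-style score."""
    # Build an undirected co-occurrence graph over a sliding window.
    graph = defaultdict(set)
    for i in range(len(tokens)):
        for j in range(i + 1, min(i + window, len(tokens))):
            if tokens[i] != tokens[j]:
                graph[tokens[i]].add(tokens[j])
                graph[tokens[j]].add(tokens[i])

    # PageRank-style update: score(v) = (1 - d) + d * sum(score(u) / deg(u)).
    scores = {w: 1.0 for w in graph}
    for _ in range(iters):
        new_scores = {}
        for w in graph:
            rank = sum(scores[u] / len(graph[u]) for u in graph[w])
            new_scores[w] = (1 - d) + d * rank
        scores = new_scores

    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]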
def multi_extract_test():
    """
    Multiprocessing test.
    :return:
    """
    import time
    from multiprocessing import Pool
    import multiprocessing as mp

    s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \
        '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \
        '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。'
    dp = data_process.DataPressing()
    dict_init = dicts.init()
    stop_words = tokenization.load_stop_words()
    tk = tokenization.Tokenizer(dp, stop_words)  # tokenizer
    s_list = tk.token(s)

    # Serial baseline.
    t0 = time.time()
    for i in range(10000):
        parallel_test(s_list)
    logging.logger.info("Serial processing took {t}s".format(t=time.time() - t0))

    # Parallel version using a process pool.
    pool = Pool(processes=int(mp.cpu_count()))
    res_l = []
    t1 = time.time()
    for i in range(10000):
        res = pool.apply_async(parallel_test, (s_list,))
        res_l.append(res)
    # pool.map(parallel_test, s_list)
    # for i in res_l:
    #     print(i.get())
    pool.close()
    pool.join()
    logging.logger.info("Parallel processing took {t}s".format(t=time.time() - t1))
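# parallel_test() is not defined in this section. Judging from the commented-out
# call in data_save() ("keywords_extractor.parallel_test(string_list)  # extract
# keywords"), it appears to run keyword extraction over a token list. The body
# below is only a hypothetical stand-in that makes the timing loop runnable;
# the real worker's logic and return value may differ.
from collections import Counter


def parallel_test_stub(token_list, top_k=15):
    """Hypothetical worker: return the top_k most frequent tokens."""
    # The worker must live at module level so multiprocessing can pickle it.
    return Counter(token_list).most_common(top_k)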
logging.logger.info('Start time for reading news: {}'.format(today))
ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp)

# Load the TF-IDF VSM.
# tfidf_feature_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/feature_1.pkl'
# tfidftransformer_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/tfidftransformer_1.pkl'
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path
tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)

# Load the dictionaries, stop words, data-processing interface and tokenizer.
dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(), tokenization.load_stop_words()
tk = tokenization.Tokenizer(dp, stop_words)

# Extract the content of the DataFrame.
ordered_news_lists = data_reader.trans_df_data(ordered_df, tfidf_feature, tfidf_transformer, dp, tk)

# If there is no news update today, exit directly; the event units do not need updating.
# Note that articles may also be re-published repeatedly.
if len(ordered_news_lists) <= 0:
    # print('No new news today; event units are not updated')
    logging.logger.info('[Event library not updated]: no new news today, event units are not updated')
    sys.exit()
# for tmp in ordered_news_lists:
#     print(tmp[0], tmp[1])
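# According to the data_save() docstring, these news items feed into single-pass
# clustering of events. The sketch below illustrates that idea only; the
# project's event classes, the structure of ordered_news_lists and the
# similarity threshold are assumptions made for this example.
import numpy as np


def single_pass_sketch(doc_vectors, threshold=0.5):
    """Greedy single-pass clustering over dense TF-IDF vectors (cosine similarity)."""
    centroids, clusters = [], []
    for idx, vec in enumerate(doc_vectors):
        vec = np.asarray(vec, dtype=float)
        best_sim, best_cluster = -1.0, -1
        for c, centroid in enumerate(centroids):
            denom = np.linalg.norm(vec) * np.linalg.norm(centroid)
            sim = float(np.dot(vec, centroid) / denom) if denom else 0.0
            if sim > best_sim:
                best_sim, best_cluster = sim, c
        if best_sim >= threshold:
            # Join the most similar event and update its centroid as a running mean.
            clusters[best_cluster].append(idx)
            members = clusters[best_cluster]
            centroids[best_cluster] = (centroids[best_cluster] * (len(members) - 1) + vec) / len(members)
        else:
            # Otherwise start a new event with this document as its centroid.
            clusters.append([idx])
            centroids.append(vec)
    return clusters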