Exemple #1
0
 def __init__(self):
     """Set up the dictionaries, stop words, stock table and tokenizer."""
     # Load the custom dictionaries used for word segmentation.
     dicts.init()
     self.data_process = DataPressing()
     # Stop words.
     self.stop_words = load_stop_words()
     # Stock name / stock code pairs; only the second value returned by
     # dicts.load_stock_data() is kept.
     # NOTE(review): the original comment also mentioned transforming the
     # stock codes but was left unfinished -- confirm what was intended.
     _, self.stocks_df = dicts.load_stock_data()
     # The tokenizer is built from the data processor and the stop words.
     self.tokenizer = Tokenizer(self.data_process, self.stop_words)
def cut_process(text):
    """
    Tokenize ``text`` and look up the stock entities it mentions.

    :param text: text to process
    :return: whatever ``data_process.find_stocks`` returns for the tokens
    """
    dicts.init()
    tokens = tokenizer.token(text)
    # Map the tokens onto stock entities using the module-level stock table.
    return data_process.find_stocks(tokens, stocks_df)
Exemple #3
0
def load_data_test():
    """
    Tokenize every usable news title and body and dump them to ``text.txt``.

    Rows come from ``get_data()``. Each non-empty title / content is cleaned
    with ``dp.no_remove``, skipped when ``dp.useless_filter`` flags it, and
    otherwise tokenized. One comma-joined token list is written per line,
    UTF-8 encoded.

    :return: None
    """
    import io

    dp = data_process.DataPressing()
    dicts.init()
    stop_words = load_stop_words()
    # Tokenizer.
    tk = tokenization.Tokenizer(dp, stop_words)
    # Fetch all news from the three tables.
    # NOTE: the get_data() interface has changed; take care when calling it.
    df_result = get_data()
    res_lists = []
    for _, row in df_result.iterrows():
        # Title first, then content, so the output order is unchanged.
        for text in (row["title"], row["content"]):
            if not text:
                continue
            text = dp.no_remove(text)
            if not dp.useless_filter(text, dicts.stock_dict):
                res_lists.append(tk.token(text))
    # Let the file object do the encoding: concatenating the encoded bytes
    # with a str newline raises TypeError on Python 3. The context manager
    # also closes the file if a write fails.
    with io.open("text.txt", "w", encoding="utf8") as file_out:
        for tokens in res_lists:
            file_out.write(",".join(tokens) + "\n")
Exemple #4
0
def d_test():
    """Smoke-test the cleaning and tokenizing interfaces by printing results."""
    noisy_text = "【今日题材】[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的, 关注同花顺财经(ths58), 获取更多机会。"
    special_text = "[AI决策]大智慧的股票真烂,中美贸易战打得好,中美贸易摩擦擦出爱情火花!科创板也上市了,还是注册制的"
    headline = "加多宝重推红罐 是否能再与王老吉争锋"

    processor = data_process.DataPressing()
    dicts.init()
    word_splitter = Tokenizer(processor, load_stop_words())

    # Strip noise words from the text.
    cleaned = processor.no_remove(noisy_text)
    print(cleaned)

    # Check whether the text contains certain special words.
    flagged = processor.useless_contain(special_text)
    print(flagged)

    # Tokenize the headline and print each token on its own line.
    tokens = word_splitter.token(headline)
    print('Type of result: {}。'.format(type(tokens)))
    for token in tokens:
        print(token)
Exemple #5
0
 def _cut_sentence(self, sentence):
     """
     Tokenize ``sentence`` and store the tokens on ``self.word_list``.

     :param sentence: sentence to tokenize
     :return: None
     """
     # NOTE: this setup needs to be revisited when multiprocessing is used.
     processor = data_process.DataPressing()
     dicts.init()
     stop_word_set = tokenization.load_stop_words()
     word_splitter = tokenization.Tokenizer(processor, stop_word_set)
     self.word_list = word_splitter.token(sentence)
Exemple #6
0
def multi_token_test():
    """
    Compare serial and multi-process timing of ``paralize_test``.

    Calls ``paralize_test`` 10000 times in a plain loop, then submits the
    same 10000 calls to a process pool, printing the elapsed time of each
    phase.

    :return: None
    :raises Exception: any exception raised inside a pool worker is
        re-raised here when its result is fetched.
    """
    import time
    from multiprocessing import Pool
    import multiprocessing as mp

    s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \
        '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \
        '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。'

    dataprocess = data_process.DataPressing()
    dicts.init()
    stop_words = load_stop_words()

    # Serial run.
    t0 = time.time()
    res1_l = [paralize_test(s, dataprocess, stop_words) for _ in range(10000)]
    print("串行处理花费时间{t}s".format(t=time.time() - t0))

    # Parallel run.
    t1 = time.time()
    # int(cpu_count * 0.8) is 0 on a single-core host and Pool rejects a
    # process count below 1, so keep at least one worker.
    pool = Pool(processes=max(1, int(mp.cpu_count() * 0.8)))
    # apply_async unpacks its args tuple into positional arguments, so pass
    # the three arguments flat (matching the serial call above) instead of
    # wrapping them in one extra tuple.
    res2_l = [
        pool.apply_async(paralize_test, (s, dataprocess, stop_words))
        for _ in range(10000)
    ]
    pool.close()
    pool.join()
    # Fetch every result so a failure inside a worker is raised here rather
    # than being silently dropped with the AsyncResult.
    for pending in res2_l:
        pending.get()
    print("并行处理花费时间{t}s".format(t=time.time() - t1))
Exemple #7
0
def data_save():
    """
    Read news from the database, preprocess it and save it locally.

    The saved corpus is mainly intended for single-pass clustering of
    historical events, word-vector training, keyword extraction and similar
    tasks. Only rows that have both a title and a content are kept; title
    and content are concatenated, tokenized, and dropped when
    ``dp.useless_filter`` flags the token list.

    Two UTF-8, tab-separated files are written:

    * ``conf.corpus_news``: news_id, unix_time, space-joined tokens
    * ``conf.corpus_news_title``: news_id, unix_time, title

    :return: None
    """
    import io

    dp = data_process.DataPressing()
    dicts.init()
    stop_words = load_stop_words()
    # Every other call site builds the tokenizer from (data processor,
    # stop words); the extra dicts.init() return value is not passed.
    tk = tokenization.Tokenizer(dp, stop_words)
    df_result = get_data()

    # Title and body are stored as one news item; both must be present.
    res_lists = []
    for _, row in tqdm(df_result.iterrows(), total=len(df_result)):
        title, content = row['title'], row['content']
        if not (content and title):
            continue
        news_id = row['id'].strip()
        title = title.strip()
        string_list = tk.token(title + content.strip())
        if not dp.useless_filter(string_list, dicts.stock_dict):
            res_lists.append((news_id, title, string_list, row['unix_time']))
    logging.logger.info("提取的文章的个数: %s" % len(res_lists))

    # The file objects handle the UTF-8 encoding: concatenating encoded
    # bytes with str pieces raises TypeError on Python 3. The context
    # managers also close the files if a write fails.

    # [news_id, timestamp, tokenized contents]
    with io.open(conf.corpus_news, "w", encoding="utf8") as file_out:
        for news_id, _, tokens, unix_time in res_lists:
            fields = (str(news_id), str(unix_time), " ".join(tokens))
            file_out.write("\t".join(fields) + "\n")

    # [news_id, timestamp, title]
    with io.open(conf.corpus_news_title, "w", encoding="utf8") as file_out:
        for news_id, title, _, unix_time in res_lists:
            fields = (str(news_id), str(unix_time), title)
            file_out.write("\t".join(fields) + "\n")
Exemple #8
0
def d_test():
    """
    Class-interface test: tokenize a sample news text, run TextRank on the
    tokens and print the extracted keywords.

    :return:
    """
    sample_text = (
        '中兴通讯(000063)在经历七个一字跌停板后,于今天打开跌停板。债转股开盘大涨,天津普林(002134)、信达地产(600657)'
        '、海德股份(000567)集体涨停,长航凤凰(000520)、浙江东方(600120)、陕国投A(000563)大涨,消息面上,'
        '央行宣布定向降准0.5个百分点,将重点支持债转股。中兴通讯机构最低估值12.02元/股在复牌之前,'
        '多家基金公司对中兴通讯估值大多调整至20.54元/股。连续7个跌停板后,中兴通讯A股股价早就已经跌穿这一价格。'
        '据《中国经营报》记者不完全统计,6月20日~22日,多家基金公司再做出调整中兴通讯A股估值的公告,下调公司包括工银瑞信基金、'
        '华泰柏瑞基金、东方基金、大摩华鑫基金、融通基金、大成基金等22家基金公司。值得注意的是,此次基金公司估值下调幅度并不一致,'
        '调整估值在每股12.02~16.64元之间。其中,大摩华鑫基金、融通基金和安信基金给出的估值最高,为每股16.64元,而工银瑞信基金、'
        '富国基金和泰达宏利基金给出的估值最低,为每股12.02元。关注同花顺财经(ths518),获取更多机会'
    )

    processor = data_process.DataPressing()
    dicts.init()
    stop_word_set = tokenization.load_stop_words()
    word_splitter = tokenization.Tokenizer(processor, stop_word_set)
    tokens = word_splitter.token(sample_text)

    ranker = TextRank(top_k=15)
    keywords = ranker.run(tokens)
    logging.logger.info("提取的%s个关键词: " % len(keywords))

    if not ranker.withWeight:
        # Plain keywords: print them on a single line.
        print(",".join(str(keyword) for keyword in keywords))
        return
    # Weighted results: first the words, then their weights.
    print(",".join(pair[0] for pair in keywords))
    print(",".join(str(pair[1]) for pair in keywords))
Exemple #9
0
def multi_extract_test():
    """
    Multi-process test: time ``parallel_test`` run 10000 times serially and
    10000 times through a process pool, logging the elapsed time of each.

    :return:
    """
    import time
    from multiprocessing import Pool
    import multiprocessing as mp

    sample_text = (
        '程序员(英文Programmer)是从事程序开发、维护的专业人员。'
        '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,'
        '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。'
    )

    processor = data_process.DataPressing()
    dicts.init()
    stop_word_set = tokenization.load_stop_words()
    # Tokenize once; both timing phases reuse the same token list.
    word_splitter = tokenization.Tokenizer(processor, stop_word_set)
    tokens = word_splitter.token(sample_text)

    serial_start = time.time()
    for _ in range(10000):
        parallel_test(tokens)
    logging.logger.info("串行处理花费时间{t}".format(t=time.time() - serial_start))

    workers = Pool(processes=int(mp.cpu_count()))
    parallel_start = time.time()
    pending = [
        workers.apply_async(parallel_test, (tokens, )) for _ in range(10000)
    ]
    # Stop accepting work and wait for every submitted task to finish.
    workers.close()
    workers.join()
    logging.logger.info("并行处理花费时间{t}s".format(t=time.time() - parallel_start))
        stock_dict.append(stock.strip("\n"))

    stocks_df = pd.read_csv(st_new_path, encoding='utf-8')
    # stock_df.append(stocks_df.set_index('SESNAME'))
    for index, row in stocks_df.iterrows():
        stock_dict.append(row.SESNAME)
        stock_dict.append(row.SYMBOL)
    return stock_dict, stocks_df


# Keep only the second value returned by load_stock_data().
_, stocks_df = load_stock_data()

# Recognize the stock entities in comments:
# tokenize the discussion text, then extract the stock entities from it.
data_process = DataPressing()
dict_init = dicts.init()
stop_words = load_stop_words()
tokenizer = Tokenizer(data_process, stop_words)


# Tidy up the stock table: re-index it by the 'SESNAME' column.
stocks_df = stocks_df.set_index('SESNAME')
# print('stocks_df %s' % stocks_df)


def cut_process(text):
    """
    Data-processing step: tokenize and extract stock entity words.

    NOTE(review): the statements below only convert ``latest_event_time``
    to an int and pass it to ``time_util.timestamp_to_time``; ``text`` is
    never used and nothing is returned. The body looks like it was spliced
    in from a different snippet -- confirm against the original file.

    :param text:
    :return:
    """
    today_timestamp = int(latest_event_time)
    today = time_util.timestamp_to_time(today_timestamp)

# NOTE(review): `today` and `today_timestamp` are only assigned inside the
# function body visible just above, not at module level -- confirm where
# these statements originally lived.
logging.logger.info('读取新闻的起始时间: {}'.format(today))
ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp)

# Load the TF-IDF vector space model from the paths configured in `conf`.
# tfidf_feature_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/feature_1.pkl'
# tfidftransformer_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/tfidftransformer_1.pkl'
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path
tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)

# Load the dictionaries and stop words, and create the data-processing and
# tokenizer interfaces.
dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(
), tokenization.load_stop_words()
tk = tokenization.Tokenizer(dp, stop_words)

# Extract the contents of the DataFrame.
ordered_news_lists = data_reader.trans_df_data(ordered_df, tfidf_feature,
                                               tfidf_transformer, dp, tk)

# If no news was returned, log it and exit the program right away: the
# event units do not need to be updated.
# NOTE(review): the original comment also began a remark about articles
# being updated repeatedly but left it unfinished.
if len(ordered_news_lists) <= 0:
    logging.logger.info('[事件库未更新]: 今天没有新新闻,事件单元不更新')
    sys.exit()

# for tmp in ordered_news_lists:
#     print tmp[0], tmp[1]