def count_times(self, n_word):
    """Count words that are ``n_word`` characters long and store the result.

    The input file is read in chunks of ``LINE_NUM`` rows.  Each chunk is
    handed to ``self.__work`` and its result is passed to ``self.__add``.
    After the last chunk the accumulated table is written to
    ``self.__storepath[n_word]`` and the in-memory table and text buffer
    are reset.

    :param n_word: word length, in characters
    :return: None
    """
    output_log('正在统计' + str(n_word) + '词词频,请稍后')
    reader = pd.read_csv(
        self.__input_file,
        names=[u'text_segment'],
        header=None,
        sep='\n',
        dtype=str,
        encoding='utf8',
        chunksize=LINE_NUM,
    )
    carry = ''
    for piece in reader:
        # Give the worker the text carried over from the previous chunk,
        # then remember whatever it leaves behind for the next one.
        self.__wBuffer = carry
        self.__n = n_word
        self.__add(self.__work(piece))
        carry = self.__wBuffer
    # Write the final frequency table to disk and reset the working state.
    table = WordList()
    table.set_data(self.__df)
    self.__df = pd.DataFrame()
    self.__wBuffer = ""
    table.write_data(self.__storepath[n_word])
def c_stopword_result(stopword_source=STOP_SOURCE, result_path=STOP_RESULT):
    """Count word frequencies for every stop-word source file.

    Each file returned by ``get_allfile(stopword_source)`` is wrapped in a
    ``Segment`` created with ``result_path`` and counted with
    ``Segment.count_all()``.

    :param stopword_source: location of the texts that stop words are
        derived from (defaults to ``STOP_SOURCE``)
    :param result_path: result location handed to each ``Segment``
        (defaults to ``STOP_RESULT``)
    :return: None
    """
    # Kept as a function-level import like the original (presumably to
    # avoid a circular import -- TODO confirm), but executed once instead
    # of on every loop iteration.
    from count_times import Segment

    for source_file in get_allfile(stopword_source):
        segment = Segment(source_file, result_path)
        output_log('开始统计' + get_name(source_file) + '的词频')
        segment.count_all()
Exemple #3
0
def input_pic_name():
    """Read an output image path from stdin and validate it.

    A path is accepted when its parent directory exists and it ends with
    ``.png`` or ``.jpg``.  On an invalid path the user may enter ``1`` to
    try again; any other answer terminates the program.

    :return: the validated image path
    :raises SystemExit: when the user declines to retry
    """
    while True:
        path = raw_input()
        # NOTE(review): a bare file name such as "a.png" has an empty
        # dirname and is therefore rejected -- confirm this is intended.
        if os.path.isdir(os.path.dirname(path)) and path.endswith(('.png', '.jpg')):
            return path
        output_log('输入文件路径错误或者该路径不存在')
        print('按1重新输入或其他任意键结束')
        # The prompt promises that any other key ends the program, so a
        # non-numeric answer must exit cleanly instead of raising
        # ValueError from int().
        try:
            retry = int(raw_input()) == 1
        except ValueError:
            retry = False
        if not retry:
            sys.exit()
Exemple #4
0
def input_directory():
    """Prompt for an existing directory path and return it.

    On an invalid path the user may enter ``1`` to be prompted again; any
    other answer terminates the program.

    :return: the entered path, for which ``os.path.isdir`` is true
    :raises SystemExit: when the user declines to retry
    """
    while True:
        print('请输入文件目录,以换行符结束:')
        path = raw_input()
        if os.path.isdir(path):
            # Single-argument form prints the same text under the
            # Python 2 print statement as the original "label, path".
            print('输入目录为:' + ' ' + path)
            return path
        output_log('输入文件目录路径错误或者该目录不存在')
        print('按1重新输入或其他任意键结束')
        # "Any other key" includes non-numeric input, which used to crash
        # with ValueError from int(); treat it as "do not retry".
        try:
            retry = int(raw_input()) == 1
        except ValueError:
            retry = False
        if not retry:
            sys.exit()
Exemple #5
0
def draw_line_user():
    """Interactively draw a line chart for user-chosen words.

    Asks for the source file path and a comma-separated list of words,
    decodes each word from UTF-8 and hands the list to
    ``Chapter.main_work`` together with the literal name ``'test'``.

    :return: None
    """
    from chapters_line import Chapter

    output_log('输入待处理的源文件路径:')
    chapter = Chapter(input_file_path())

    output_log('输入待绘制的词:(以逗号隔开)')
    # raw_input() yields a byte string under Python 2; decode every word.
    words = [token.decode('utf8') for token in raw_input().split(',')]
    chapter.main_work(words, 'test')
Exemple #6
0
def input_file_path():
    """Prompt for the path of an existing file and return it.

    A path is accepted when it names an existing file ending in ``.txt``
    or ``.png``.  On an invalid path the user may enter ``1`` to be
    prompted again; any other answer terminates the program.

    :return: the validated file path
    :raises SystemExit: when the user declines to retry
    """
    while True:
        path = raw_input('请输入文件路径,以换行符结束:')
        if os.path.isfile(path) and path.endswith(('.txt', '.png')):
            # Single-argument form prints the same text under the
            # Python 2 print statement as the original "label, path".
            print('输入路径为:' + ' ' + path)
            return path
        output_log('输入文件路径错误或者该文件不存在')
        print('按1重新输入或其他任意键结束')
        # "Any other key" includes non-numeric input, which used to crash
        # with ValueError from int(); treat it as "do not retry".
        try:
            retry = int(raw_input()) == 1
        except ValueError:
            retry = False
        if not retry:
            sys.exit()
Exemple #7
0
def c_times_user():
    """Interactively count word frequencies for a text file.

    Asks for the source text file and a result directory, runs
    ``count_times`` on them and records the result directory in the
    module-level ``TARGET_RESULT``.

    :return: the result directory entered by the user
    """
    global TARGET_RESULT

    output_log('输入要统计词频的文本文件')
    source_path = input_file_path()
    output_log('输入结果文件存放位置')
    result_dir = input_directory()

    output_log('开始统计词频')
    count_times(source_path, result_dir)
    output_log('词频统计完毕!')

    # Remember where the results live so later steps can pick them up.
    TARGET_RESULT = result_dir
    return result_dir
Exemple #8
0
def d_stopwords_user():
    """Interactively generate stop words and strip them from the results.

    Intended to run after the target file's word frequencies have been
    counted: it reads the module-level ``TARGET_RESULT`` and stores the
    final output directory in the module-level ``STOPWORD_PATH``.

    :return: directory holding the results with stop words removed
    """
    global STOPWORD_PATH
    from c_stopwords import c_stopword_result, c_stopwords_all
    from del_stopwords import del_stopwords

    # Step 1: word frequencies of the stop-word source texts.
    output_log('请输入产生停用词的源文件路径')
    source_dir = input_directory()
    output_log('请输入停用词源文件词频统计的结果存放路径')
    freq_dir = input_directory()
    output_log('开始统计停用词源文件词频信息')
    c_stopword_result(source_dir, freq_dir)
    output_log('停用词源文件词频统计完毕')

    # Step 2: derive the stop words from those frequencies.
    output_log('请输入停用词存放路径')
    stopwords_dir = input_directory()
    c_stopwords_all(freq_dir, stopwords_dir)
    output_log('停用词统计完毕')

    # Step 3: remove the stop words from the target results.
    output_log('输入删除停用词后的结果存储路径')
    cleaned_dir = input_directory()
    del_stopwords(TARGET_RESULT, stopwords_dir, cleaned_dir)

    STOPWORD_PATH = cleaned_dir
    return cleaned_dir
Exemple #9
0
def select_word_user():
    """Interactively filter words and render the result as a word cloud.

    Asks for the required directories, runs ``count_co_all`` and
    ``count_fr_all`` on the stop-word-free frequency tables, filters the
    words with the thresholds from ``input_stand`` and finally draws the
    word cloud with ``plt_n_word``.

    :return: None
    """
    # Cohesion (co).
    from count_co import count_co_all
    output_log('请输入已经删除停用词的词频表目录')
    freq_dir = input_directory()
    output_log('请输入凝固度co的计算结果存储目录')
    co_dir = input_directory()
    output_log('开始计算凝固度')
    count_co_all(freq_dir, co_dir)
    output_log('凝固度计算完毕')

    # Freedom (fr).
    output_log('请输入自由度fr的计算结果存储目录')
    fr_dir = input_directory()
    from count_fr import count_fr_all
    count_fr_all(freq_dir, fr_dir)
    output_log('自由度计算完毕')

    # Word selection using the user-supplied or default thresholds.
    from select_word import select_word_all
    output_log('请输入筛选结果的存储路径')
    selected_dir = input_directory()
    co, fr, score = input_stand()
    thresholds = dict(co=co, fr=fr, score=score)
    select_word_all(co_dir, fr_dir, selected_dir, **thresholds)

    # Word cloud.
    from draw_wordcloud import plt_n_word
    output_log('请输入绘制的词云存储路径')
    picture_path = input_pic_name()
    plt_n_word(selected_dir, picture_path)
Exemple #10
0
def input_stand():
    """Ask whether to enter custom word-selection thresholds.

    Choice ``1`` reads ``co``, ``fr`` and ``score`` from stdin as floats.
    Any other choice, including non-numeric input, yields the defaults.

    :return: tuple ``(co, fr, score)`` of floats; ``(10.0, 0.3, 100.0)``
        when the defaults are used
    :raises ValueError: when a threshold typed for choice 1 is not a number
    """
    defaults = (10.0, 0.3, 100.0)
    # The prompt used to advertise co=2.0, fr=1.0, which did not match the
    # values actually returned; it is now built from the real defaults.
    # NOTE(review): confirm 10.0 / 0.3 / 100.0 are the intended defaults.
    output_log('是否输入选择词的参数?\n1.是\n2.否(采用默认值co=%s,fr=%s,score=%s)\n' % defaults)

    # A non-numeric answer is handled like any other unrecognised choice
    # (fall back to the defaults) instead of crashing in int().
    try:
        choose = int(raw_input('输入1或2:'))
    except ValueError:
        choose = None

    if choose == 1:
        output_log('输入参数co:')
        co = float(raw_input())
        output_log('输入的co值为:' + str(co))
        output_log('输入参数fr:')
        fr = float(raw_input())
        output_log('输入的fr值为:' + str(fr))
        output_log('输入参数score:')
        score = float(raw_input())
        # Previously labelled "co" by a copy-paste slip.
        output_log('输入的score值为:' + str(score))
        output_log('开始计算')
        return co, fr, score

    if choose == 2:
        output_log('开始计算')
    return defaults