Ejemplo n.º 1
0
def main():
    """Mark up the new words of every key word's sentence files as HTML.

    For each key returned by ``util.get_key_list()`` the i-th pickled word
    set is paired with the sentence file that shares its base name; the
    output of ``remark`` is saved under the base name of the i-th ``.txt``
    file with an ``.html`` extension.
    """
    # Input and output directories.
    result_dir = r'D:\semantic analysis\2016-10-05结果\html标记分句2//'
    txt_dir = r"D:\semantic analysis\2016-10-05结果\新词分句//"
    set_dir = r"D:\semantic analysis\2016-10-05结果\新词//"

    for key in util.get_key_list():
        print(key)
        # Sentence files and pickled word sets, both sorted by name.
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        set_list = sorted(util.get_file_list(set_dir + key, ".pkl"))

        out_dir = result_dir + "新词//" + key + "//"
        util.create_directory(out_dir)

        for idx, txt_name in enumerate(file_list):
            set_name = set_list[idx]
            # The sentence file is located through the word set's base name.
            sentences = util.get_list_from_file(
                txt_dir + key + "//" + set_name[0:-4] + ".txt")
            new_word_list = util.get_nw(set_dir + key + "//" + set_name)
            # Drop duplicate sentences so each one is only processed once.
            sentences = list(set(sentences))
            w_list = remark(sentences, new_word_list, key)
            util.save_file(out_dir + txt_name[:-4] + '.html', w_list)
Ejemplo n.º 2
0
def loop_compare(keyword_list, pkl_dir1, txt_dir1, result_dir, mode=1, lap=1):
    """Compare each key word's text dictionaries with its core graphs.

    Args:
        keyword_list: key words to process.
        pkl_dir1: format string; ``pkl_dir1.format(key)`` is the directory
            holding the key's ``.pkl`` files. File names are appended
            directly, so it has to end with a path separator.
        txt_dir1: format string for the directory of the matching ``.txt``
            files (same base names as the pickles).
        result_dir: prefix of the output; one ``<key>.txt`` file is written
            per key word.
        mode: when 0, a per-key sub-directory is created under result_dir.
        lap: window size; ``lap`` consecutive networks are merged for every
            comparison and the index steps back by ``lap`` each time.
    """
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")

        pkl_dir = pkl_dir1.format(key)
        txt_dir = txt_dir1.format(key)

        # Base names (text before the first dot) of the pickles, ascending.
        d_list = sorted(
            d.split(".")[0] for d in util.get_file_list(pkl_dir, '.pkl'))

        result_list = []
        ii = len(d_list) - 1

        while ii - lap >= 0:
            g1 = get_core_graph(pkl_dir + d_list[ii] + ".pkl")
            d1 = get_txt_dict(txt_dir + d_list[ii] + ".txt")

            # Merge the preceding lap-1 networks into g1. d_list only holds
            # bare base names, so directory and extension must be restored
            # before loading (the original passed the bare name).
            for k in range(1, lap):
                g1 = nx.compose(
                    g1, util.get_nw(pkl_dir + d_list[ii - k] + ".pkl"))
            result_list.append(compare_function(d1, g1))
            ii -= lap
        util.save_file(result_dir + key + ".txt", result_list)
Ejemplo n.º 3
0
def loop_compare(com_function,
                 keyword_list,
                 pkl_dir1,
                 result_dir,
                 mode=1,
                 lap=1,
                 type="pkl"):
    """Compare pairs of pickled networks for every key word.

    The sorted ``.pkl`` files of ``pkl_dir1.format(key)`` are walked from the
    newest to the oldest. Each step compares the network at the current index
    (``g2``) with the one ``lap`` positions earlier (``g1``) and then moves
    back by ``2 * lap`` positions in total.

    Args:
        com_function: comparison callback, called with deep copies of
            ``g1`` and ``g2`` (plus ``type`` in mode 2).
        keyword_list: key words to process; the module-level ``keyword`` is
            set to the one currently being handled.
        pkl_dir1: format string producing the pickle directory of a key.
        result_dir: output prefix.
        mode: 1 - callback returns two values, one output line each;
            0 - callback returns a list that is saved to one file per pair;
            2 - callback returns a single value, one output line per pair.
        lap: distance between the two compared files.
        type: forwarded to ``com_function`` in mode 2 only.
    """
    global keyword
    for key in keyword_list:
        keyword = key
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        names = sorted(util.get_file_list(pkl_dir, '.pkl'))
        # The names are relative, so load them from inside the directory.
        os.chdir(pkl_dir)
        result_list = []

        newer = len(names) - 1
        while newer - 2 * lap >= 0:
            older = newer - lap
            g2 = util.get_nw(names[newer])
            g1 = util.get_nw(names[older])
            newer_label = names[newer][0:-4]

            if mode == 1:
                # Mutual ratio: one line for each side of the pair.
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(newer_label + "\t" + str(r1))
                result_list.append(names[older][0:-4] + "\t" + str(r2))
            elif mode == 0:
                # One-to-one: the callback's list goes to its own file.
                result_list = com_function(copy.deepcopy(g1),
                                           copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + newer_label + ".txt",
                    result_list)
            elif mode == 2:
                # N-to-one: a single value per pair.
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), type)
                result_list.append(newer_label + "\t" + str(r1))

            newer = older - lap

        if mode != 0:
            # Lines were collected newest first; store them oldest first.
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
Ejemplo n.º 4
0
def main1():
    """Mark up the sentence files of common words with their word sets.

    For every key word the ``.pkl`` word sets are loaded, ``cal_index2``
    supplies one upper file index per segment, and the files below that
    index are marked up with the segment's word set and saved as HTML.
    """
    # Alternative (disabled) configuration kept for reference:
    # date_list = ["2012-08-05","2011-04-05","2011-03-28","2011-10-20","2012-12-30","2011-07-30","2011-06-09","2012-02-05","2012-12-16","2011-08-01","2011-05-19","2013-09-01","2012-08-01","2013-12-01"]
    # key_list = ["吐槽","纠结","淡定","自拍","正能量","山寨","达人","腹黑","接地气","扯淡","闷骚","不明觉厉","完爆","人艰不拆"]
    key_list = [
        '努力',
        '感觉',
        '简单',
        '无聊',
        '希望',
        '美好',
        '气质',
        '害怕',
        '喜欢',
        '不约而同',
        '喜闻乐见',
    ]
    # One date per key word; currently the same date for all of them.
    date_list = ["2013-12-31"] * 11

    # Input and output directories.
    result_dir = r'D:\semantic analysis\2016-10-09结果\html标记结果//'
    txt_dir = r"D:\semantic analysis\纯文本\常用词分句//"
    set_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//"

    for key, split_date in zip(key_list, date_list):
        print(key)
        # Sentence files of this key word, sorted by name.
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # Load every pickled word set of this key word.
        set_list = []
        for pkl_name in util.get_file_list(set_dir + key, ".pkl"):
            set_list.append(util.get_nw(set_dir + key + "//" + pkl_name))
            print(pkl_name)

        key_out_dir = result_dir + key + "//"
        util.create_directory(key_out_dir)
        rr = cal_index2(split_date, txt_dir + key)

        # One pass per segment. Every pass restarts at the first file, so a
        # file covered by several segments is rewritten by the later ones.
        for j in range(len(rr)):
            k = 0
            while k < rr[j]:
                txt_name = file_list[k]
                print(txt_name[:-4])
                print(rr[j])
                txt_list = util.get_list_from_file(
                    txt_dir + key + "//" + txt_name)
                w_list = remark(txt_list, set_list[j], key)
                util.save_file(key_out_dir + txt_name[:-4] + '.html', w_list)
                k += 1
def loop_compare(com_function,
                 keyword_list,
                 pkl_dir1,
                 result_dir,
                 mode=1,
                 lap=1,
                 type="pkl"):
    """Compare the word dictionaries of successive ``.txt`` files per key.

    The sorted ``.txt`` files of ``pkl_dir1.format(key)`` are walked from the
    newest to the oldest. Each step turns the file at the current index into
    a dictionary (``g2``), does the same for the file ``lap`` positions
    earlier (``g1``) and additionally loads the co-occurrence network pickle
    with the earlier file's base name (``d1``).

    Args:
        com_function: comparison callback, called with deep copies of ``g1``
            and ``g2`` (plus ``d1`` in mode 2).
        keyword_list: key words to process.
        pkl_dir1: format string producing the input directory of a key.
        result_dir: output prefix.
        mode: 1 - callback returns two values, one output line each;
            0 - callback returns a list that is saved to one file per pair;
            2 - callback returns a single value, one output line per pair.
        lap: distance between the two compared files.
        type: accepted for signature compatibility; not used here.
    """
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.txt')
        # File names below are relative, so work from inside the directory.
        os.chdir(pkl_dir)
        result_list = []
        # Ascending order by file name.
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1

        while ii - 2 * lap >= 0:
            g2 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            # NOTE(review): for lap > 1 this composes the dictionary with
            # whatever util.get_nw returns for a .txt name - confirm that
            # this path is still meant to work.
            k = 1
            while k < lap:
                g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
                k += 1

            ii -= lap
            g1 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            # Raw string: the old literal relied on invalid escape sequences
            # (backslash followed by "s" etc.); the value is unchanged.
            d1 = util.get_nw(
                r"D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//".format(key) +
                nw_list[ii].split(".")[0] + ".pkl")
            # NOTE(review): same lap > 1 caveat as for g2 above.
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1

            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append((nw_list[ii][0:-4] + "\t" + str(r2)))
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1),
                                           copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), d1)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))

            ii -= lap
        if mode != 0:
            # Lines were collected newest first; store them oldest first.
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
Ejemplo n.º 6
0
def main():
    """Build and pickle one co-occurrence network per sentence file.

    For every centre word the ``.txt`` files in its directory are read,
    de-duplicated, passed to ``create_matrix`` and the resulting network
    (self-loops removed) is saved under ``<result_dir><key>//p//``. Each
    processed file name is appended to ``record.txt`` in the key's folder.
    """
    # Input and output directories.
    result_dir = r'D:\semantic analysis\新结果\去虚词去单字共现网络//'
    txt_dir = r"D:\semantic analysis\新纯文本\1常用词//"
    # Alternative key lists (disabled):
    # k_list = util.get_key_list()
    # k_list = ['不约而同', '喜闻乐见', '努力', '感觉', '简单', '无聊', '希望', '美好']
    # Centre words.
    k_list = ['希望', '气质', '害怕', '喜欢']

    pynlpir.open()
    try:
        # Register the centre words as user words with the segmenter.
        for key in k_list:
            pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))

        for key in k_list:
            print(key)
            # Sentence files of this centre word.
            file_list = util.get_file_list(txt_dir+key, ".txt")
            # Output directories.
            util.create_directory(result_dir + key)
            util.create_directory(result_dir+key+'//p')

            for n_file in file_list:
                s_list = util.get_list_from_file(txt_dir+key+"//"+n_file)
                # Drop duplicate sentences; sizes are printed before/after.
                print(len(s_list))
                s_list = list(set(s_list))
                print(len(s_list))

                # Build the network over all sentences.
                pps_list,pmn = create_matrix(s_list,key)

                pkl_name = n_file[:-4] + '.pkl'

                for w_list in pps_list:
                    pmn.add_gram_edges(w_list)
                g = pmn.get_network()
                g.remove_edges_from(g.selfloop_edges())
                util.save_nw(g, result_dir+key+'//p//' + pkl_name)

                print(n_file)
                print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

                # Progress log: one processed file name per line.
                with open(result_dir+key+'//record.txt','a',encoding='utf-8') as rf:
                    rf.write(n_file+'\n')
    finally:
        # Release the segmenter even when a file fails to process.
        pynlpir.close()
Ejemplo n.º 7
0
def cal_node_mcs(pkl_dir, mcs_dir, key_word, lap=2):
    """Fold windows of ``lap`` networks with ``mcs`` and record their sizes.

    Walking the sorted ``.pkl`` files of ``pkl_dir`` from the newest
    backwards in steps of ``lap``, the current network is combined with the
    ``lap - 1`` preceding ones through ``mcs``. The result is saved and its
    node and edge counts are collected into two summary files in ``mcs_dir``.

    Args:
        pkl_dir: directory with the input ``.pkl`` networks.
        mcs_dir: prefix for the two summary files.
        key_word: key word; used in the output paths.
        lap: number of networks folded together per window.
    """
    names = sorted(util.get_file_list(pkl_dir, '.pkl'))
    # The names are relative, so load them from inside the directory.
    os.chdir(pkl_dir)

    # Fixed destination of the folded networks.
    out_dir = r"D:\semantic analysis\公共子图节点数\新词\30公共子图//" + key_word + "//"
    num_list = []
    enum_list = []

    ii = len(names) - 1
    while ii - lap + 1 >= 0:
        g1 = util.get_nw(names[ii])
        # Fold in the lap-1 preceding networks.
        for k in range(1, lap):
            g1 = mcs(g1, util.get_nw(names[ii - k]))

        stem = names[ii][0:-4]

        # NOTE(review): the network is saved under the base name without an
        # extension - confirm that this is what util.save_nw expects.
        util.create_directory(out_dir)
        util.save_nw(g1, out_dir + stem)

        num_list.append(stem + '\t' + str(g1.number_of_nodes()))
        enum_list.append(stem + '\t' + str(g1.number_of_edges()))

        ii -= lap

    util.save_file(mcs_dir + 'n' + key_word + 'mcs.txt', num_list)
    util.save_file(mcs_dir + 'e' + key_word + 'mcs.txt', enum_list)
Ejemplo n.º 8
0
def loop_key2(pkl_dir, result_dir, key_word, lap=1):
    """Extract, file by file, the attributes of nodes new to the union so far.

    Starting from the first (oldest by name) network, every ``lap``-th
    following network is passed to ``extract_new_nodes_attributes`` together
    with the accumulated network, the result is saved, and the network is
    then merged into the accumulated one.

    Args:
        pkl_dir: format string; ``pkl_dir.format(key_word)`` is the input
            directory.
        result_dir: output prefix; files go to ``<result_dir><key_word>//``.
        key_word: key word being processed.
        lap: step between processed files.
    """
    src_dir = pkl_dir.format(key_word)
    names = sorted(util.get_file_list(src_dir, '.pkl'))
    # The names are relative, so load them from inside the directory.
    os.chdir(src_dir)

    # Accumulated network, seeded with the first file in ascending order.
    g1 = util.get_nw(names[0])
    util.create_directory(result_dir + key_word)

    total = len(names)
    pos = lap
    while pos < total:
        g2 = util.get_nw(names[pos])
        out_name = names[pos][0:-4] + '.txt'

        result_list = extract_new_nodes_attributes(g1, g2)
        util.save_file(result_dir + key_word + "//" + out_name, result_list)
        # Everything seen so far becomes the baseline for the next file.
        g1 = nx.compose(g1, g2)
        pos += lap
Ejemplo n.º 9
0
def loop_key(pkl_dir, result_dir, key_word, lap=1):
    """Compare each network with the one ``lap`` files earlier.

    The sorted ``.pkl`` files are walked from the newest backwards. For each
    pair (earlier ``g1``, later ``g2``) the list returned by
    ``cal_connect_real_probability`` is saved as
    ``<earlier>-<later>.txt`` under ``<result_dir><key_word>//``.

    Args:
        pkl_dir: format string; ``pkl_dir.format(key_word)`` is the input
            directory.
        result_dir: output prefix.
        key_word: key word being processed.
        lap: distance between the two compared files.
    """
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    # The names are relative, so load them from inside the directory.
    os.chdir(pkl_dir)
    # Ascending order by file name.
    nw_list = sorted(f_list)
    ii = len(nw_list) - 1
    # g2 is the later network of a pair, g1 the earlier one; start with the
    # last file.
    g2 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)

    # Stop while the earlier index is still valid. The previous "ii > 0"
    # bound let ii go negative for lap > 1, which silently wrapped around to
    # the end of the list.
    while ii - lap >= 0:
        jj = ii
        ii -= lap
        g1 = util.get_nw(nw_list[ii])

        # Output name: "<earlier>-<later>.txt".
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.txt'

        result_list = cal_connect_real_probability(g1, g2, key_word)
        util.save_file(result_dir + key_word + "//" + filename, result_list)

        # The earlier network is the later one of the next pair.
        g2 = g1
Ejemplo n.º 10
0
import tool.util as util
import os

key_list = util.get_key_list2()

# For every key word, save the intersection of each pickled set with the
# next one (in file-name order) under the earlier file's name.
for keyword in key_list:
    print(keyword)
    # Raw string: the old literal relied on invalid escape sequences
    # (backslash followed by "s" etc.); the resulting path is unchanged.
    dirr = r'D:\semantic analysis\常用词的分词集合' + '\\' + keyword
    os.chdir(dirr)
    pkl_list = sorted(util.get_file_list(dirr, '.pkl'))
    out_dir = r"D:\semantic analysis\常用词的分词集合1//" + keyword
    util.create_directory(out_dir)
    s = util.get_nw(pkl_list[0])
    for name, next_name in zip(pkl_list, pkl_list[1:]):
        s1 = util.get_nw(next_name)
        util.save_nw(s & s1, out_dir + "//" + name)
        s = s1
    "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31",
    "2013-12-31"
]
key_list = util.get_key_list2()

# For every key word, load all pickled sets, let cal_difference split them
# at fixed indices and print the sizes and size ratios of the results.
# Nothing is saved; only the result directory is created.
k = 0
for key in key_list:
    print(key)
    # Raw string (the old literal relied on invalid escape sequences) and a
    # name that no longer shadows the builtin dir().
    set_root = r"D:\semantic analysis\常用词的分词集合//"
    # Fixed split positions; a date-based computation is disabled.
    index_list = [100, 125, 150]
    print(index_list)
    # Position of the current key, kept for the disabled date lookup.
    k += 1
    file_list = util.get_file_list(set_root + key, ".pkl")
    # Load every set in the directory; the names are relative.
    os.chdir(set_root + key)
    set_list = [util.get_nw(pkl_name) for pkl_name in file_list]
    rd_list, r_list = cal_difference(index_list, set_list)
    r_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//"
    util.create_directory(r_dir + key)
    for i, rd in enumerate(rd_list):
        print(len(rd))
        print(len(r_list[i]))
        print(len(rd) / len(r_list[i]))
import tool.util as util
# Root of the source dictionaries; one sub-directory per abbreviation below.
root_path = r"D:\semantic analysis\用户信息\dict//"
# Root that receives the dictionaries written by combine().
save_root_path = r"D:\semantic analysis\用户信息\s_dict//"


def combine(src_path, save_path):
    """Merge place counts that share the same first space-separated token.

    Every dictionary loaded from the ``.txt`` files in ``src_path`` maps a
    place string to a number. Entries are grouped by the text before the
    first space and their numbers are summed; the merged dictionary is saved
    to ``save_path`` under the original file name.
    """
    dicts_by_file, _names = util.get_objdict_list(src_path, ".txt")
    for file_name, counts in dicts_by_file.items():
        merged = {}
        for place, num in counts.items():
            head = place.partition(" ")[0]
            if head in merged:
                merged[head] += num
            else:
                merged[head] = 0 + num
        util.save_dict_list(merged, save_path + file_name)

# Abbreviations naming the sub-directories to merge.
py_list = ["tc", "zp", "dd", "sz", "dr", "ms", "fh", "znl"]
for py in py_list:
    sub_dir = py + "//"
    # Make the destination first, then merge the source into it.
    util.create_directory(save_root_path + sub_dir)
    combine(root_path + sub_dir, save_root_path + sub_dir)
import tool.util as util
import os

key_list = util.get_key_list2()

# For every key word, walk the pickled sets in file-name order and save, per
# file, the items that were not in the union of the files before it.
for keyword in key_list:
    print(keyword)
    # Raw string: the old literal relied on invalid escape sequences
    # (backslash followed by "s" etc.); the resulting path is unchanged.
    dirr = r'D:\semantic analysis\常用词的分词集合' + '\\' + keyword
    os.chdir(dirr)
    pkl_list = sorted(util.get_file_list(dirr, '.pkl'))
    out_dir = r"D:\semantic analysis\2016-10-05结果//" + keyword
    util.create_directory(out_dir)
    # The running union starts as the first file, so the first saved
    # difference is always empty.
    s = util.get_nw(pkl_list[0])
    for pkl_name in pkl_list:
        s1 = util.get_nw(pkl_name) | s
        s2 = s1 - s
        print(len(s2))
        util.save_nw(s2, out_dir + "//" + pkl_name)
        s = s1
Ejemplo n.º 14
0
import tool.util as util

# 从每个dict——txt文件里面统计词频率
# Turn the per-file word counts into relative frequencies.
dict_path = r"D:\semantic analysis\结果\去重频数//"
result_path = r"D:\semantic analysis\结果\去重频率//"

keyword_list = util.get_key_list2() + util.get_key_list()

for key in keyword_list:
    print(key)
    r_dict, file_name_list = util.get_objdict_list(dict_path + key, ".txt")
    for (k, word_dict) in r_dict.items():
        # The key word itself is left out of the distribution.
        word_dict.pop(key, None)
        # Uses the builtin sum(); the old code rebound the name "sum" at
        # module level and so hid the builtin from everything after it.
        total = sum(int(value) for value in word_dict.values())
        r_f_dict = {word: value / total for word, value in word_dict.items()}
        util.create_directory(result_path + key + "//")
        util.save_dict_list(r_f_dict, result_path + key + "//" + k)
Ejemplo n.º 15
0
# Register every key word as a user word with the segmenter.
for key in key_word:
    print(key)
    pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))

result_dir = r"D:\semantic analysis\新结果\去重去虚词去单字词频数//"
fold_list_dir = r"D:\semantic analysis\新纯文本\1常用词分句//"
for key in key_word:
    print(key)
    key_src_dir = fold_list_dir + key
    file_list = sorted(util.get_file_list(key_src_dir, ".txt"))
    # One word-count file per sentence file.
    for txt_file in file_list:
        print(txt_file)
        # A set removes duplicate sentences.
        s_list = set(util.get_list_from_file(key_src_dir + "//" + txt_file))
        # Word counts for the de-duplicated sentences.
        rr = count_word(s_list, key)

        # Order the words with sort_by_value and render the output lines.
        kk = sort_by_value(rr)
        w_list = create_dict_list(kk, rr)

        # Make sure the output directory exists, then write the result.
        util.create_directory(result_dir + key)
        util.save_file(result_dir + key + "//" + txt_file, w_list, False)

# Shut the segmenter down.
pynlpir.close()
Ejemplo n.º 16
0
"2011-10-16",
"2012-02-16",
"2012-07-09",
"2012-11-19"]
             ]
# NOTE: this list is replaced by the reassignment a few lines below.
py_list = ["zp","dd","sz","dr","ms","fh"]

date_list_list = [["2010-06-01",
"2011-01-21",
"2011-07-09",
"2011-11-21"]]
py_list = ["tc"]
root_path = r"D:\semantic analysis\用户信息\dict//"

# For every abbreviation and each of its dates, count how often each place
# occurs among the extracted users and save the counts to "<date>.txt".
for i, py in enumerate(py_list):
    date_list = date_list_list[i]
    for dd in date_list:
        user_id_list = extract_user_id(py,dd)
        place_list = []
        place_dict = dict()
        if user_id_list:
            for user_id in user_id_list:
                place = get_place(user_id)
                if place:
                    place_list.append(place)
            # Single counting pass; the old list.count() per element was
            # quadratic. Key order (first occurrence) is unchanged.
            for place in place_list:
                place_dict[place] = place_dict.get(place, 0) + 1
            util.create_directory(root_path+py)
            util.save_dict_list(place_dict, root_path+py+"//"+dd+".txt")
Ejemplo n.º 17
0
import re
import tool.util as util

key_list = util.get_key_list()

dict_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
for key in key_list:
    print(key)
    set_dict, file_name = util.get_objdict_list(dict_dir + key, ".txt")
    date_list = util.get_file_list(dict_dir + key, ".txt")
    # Date prefixes to aggregate. A per-month list derived from the file
    # names used to be computed here but was immediately overwritten by this
    # fixed value, so that dead computation has been removed.
    month_array = ["2010"]

    util.create_directory(r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" +
                          key)

    # Merge the frequency dictionaries of all files matching each prefix.
    for month in month_array:
        # Raw string for the regex tail: "\d" in a normal literal is an
        # invalid escape sequence. The compiled pattern is unchanged.
        pattern = re.compile(r"(" + month + r"-\d*-\d*)")
        date_array = pattern.findall(" ".join(date_list))
        print(date_array)
        r_dict = dict()
        for file_date in date_array:
            r_dict = util.union_dicts(set_dict[file_date + ".txt"], r_dict)
        util.save_dict_list(
            r_dict,
            r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key + ".txt")
Ejemplo n.º 18
0
                           db=db_name,
                           charset='UTF8')
    cur = conn.cursor()

    sql_str = "select content from {0} where date = '{1}';".format(
        py_keyword, date_str)
    print(sql_str)
    cur.execute(sql_str)
    with open(date_str + '.txt', 'w', encoding='utf8') as w_file:
        for c in cur:
            w_file.write(c[0] + '\n' + '\n')

    cur.close()  #关闭游标
    conn.close()  #关闭到数据库的连接,释放数据库资源


# key_list = ['完爆', '扯淡', '接地气', '正能量', '腹黑', '达人', '闷骚']
import time

key_list = ["喜欢"]
os.chdir(dirr)
# One directory per key word (presumably relative to the directory entered
# above - depends on util.create_directory).
for key_word in key_list:
    util.create_directory(key_word)

# Export one text file per date listed in the key word's date file.
for key_word in key_list:
    date_file_path = r"D:\semantic analysis\新纯文本\1新词/date/" + key_word
    with open(date_file_path, "r") as file:
        date_list = file.readlines()
    for date in date_list:
        stripped_date = date.strip()
        create_txt(key_word, stripped_date)