Example #1
def createContent(fileName, rows):
    result = []
    count = 0
    f = open(u.utf8_2_gbk(fileName), 'rb')
    f.next()  # skip the header line
    for line in f:
        count += 1
        result.append(u.create_content(line, rows).lower() + ',' + '\n')
    f.close()
    return [result, count]
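
For readers without the project's `util` module, here is a minimal standalone sketch of the same step (the plain-CSV layout is an assumption; no `utf8_2_gbk` / `create_content` helpers): skip the header row, lowercase one column, and count the data rows.

def create_content_plain(file_name, column):
    # Standalone sketch: read a CSV, skip the header, lowercase one column.
    result = []
    count = 0
    with open(file_name) as f:
        next(f)  # skip the header line, like f.next() above
        for line in f:
            count += 1
            fields = line.rstrip('\n').split(',')
            result.append(fields[column].lower() + ',' + '\n')
    return result, count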
Example #2
def factory(body_list):
    count = 0
    # res, remove_list, save_list, COLUMN and NUMBER are module-level globals.
    for line in body_list:
        count += 1
        content = u.create_content(line, COLUMN)
        res[content] = str(res.get(content, 0)) + "," + str(count)

    for key, value in res.items():
        rowNum_list = value[2:].split(",")  # drop the leading "0," seed
        if len(rowNum_list) >= NUMBER:
            for num in rowNum_list:
                remove_list.append(num)
        else:
            for num in rowNum_list:
                save_list.append(num)
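
The `value[2:]` slice relies on every value starting with the seed `"0,"` produced by `res.get(content, 0)`; stripping those two characters leaves only the comma-separated row numbers. A standalone sketch of the same grouping with a list-valued dict, assuming rows whose key appears at least `threshold` times are collected for removal and the rest are kept (`column` and `threshold` stand in for the snippet's `COLUMN` and `NUMBER`):

from collections import defaultdict

def factory_plain(body_list, column, threshold):
    # Group 1-based row numbers by the value found in `column`.
    groups = defaultdict(list)
    for count, line in enumerate(body_list, start=1):
        key = line.rstrip('\n').split(',')[column]
        groups[key].append(count)
    remove_list, save_list = [], []
    for row_nums in groups.values():
        # Keys that occur `threshold` times or more are marked for removal.
        target = remove_list if len(row_nums) >= threshold else save_list
        target.extend(row_nums)
    return remove_list, save_list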
Example #3
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)  # "source file: <SOURCE_FILE>"
        source_file_body = u.create_file_body(SOURCE_PATH)
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')  # "number of labels: <labelNum>"
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')  # "processing label <count>"
            words = value.strip().split('|')
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if p.match(content):
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1
            if len(words) == 2:
                c = createPattern(words[0], ACCURATE)
                f = createPattern(words[1], ACCURATE)
                cp = re.compile(c)
                fp = re.compile(f)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if cp.match(content) and not fp.match(content):
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1

        u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead, source_file_body)
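
A label value such as 'foo|bar' splits into an include word ('foo') and an optional exclude word ('bar'), and a source row is tagged only when the include pattern matches and the exclude pattern does not. A standalone sketch of that include/exclude test (the `.*word.*` form mirrors what `createPattern` appears to build; its exact output, and the added `re.escape`, are assumptions here):

import re

def tag_rows(rows, labels):
    # labels maps tag -> 'include' or 'include|exclude' keyword strings.
    tagged = list(rows)
    for tag, value in labels.items():
        words = value.strip().split('|')
        include = re.compile('.*' + re.escape(words[0]) + '.*')
        exclude = re.compile('.*' + re.escape(words[1]) + '.*') if len(words) == 2 else None
        for num, row in enumerate(tagged):
            if include.match(row) and not (exclude and exclude.match(row)):
                tagged[num] = row.rstrip('\n') + tag + '|' + '\n'
    return tagged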
Example #4
#!/usr/bin/env python
# coding=utf-8
import re
import sys
import os
import util as u
'''
Description: group the rows by one column, count the rows in each group,
and write the result to an output file.
'''

############## Parameters ###############
SOURCE_FILE = 'hebing.csv'  # input file
RESULT_FILE = 'result.csv'  # output file
COLUMN = 1  # column to group by and count
####################################
count_dict = {}

if __name__ == '__main__':
    source_file_head = u.create_file_head(SOURCE_FILE, 'right', ['次数'])  # '次数' = "count"
    source_file_body = u.create_file_body(SOURCE_FILE)
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        count_dict[content] = count_dict.get(content, 0) + 1
    result_file = open(RESULT_FILE, 'w+')
    result_file.write(source_file_head)
    for key, value in count_dict.items():
        result_file.write(key + ',' + str(value) + '\n')
    result_file.close()
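
The same group-by-and-count step in standalone form, using `collections.Counter` and the `csv` module (the assumption here is a plain comma-separated file whose first line is a header; the project's `create_file_head`/`create_file_body` helpers are not needed):

import csv
from collections import Counter

def count_by_column(source_file, result_file, column):
    with open(source_file) as f:
        reader = csv.reader(f)
        header = next(reader)                      # read (and remember) the header row
        counts = Counter(row[column] for row in reader)
    with open(result_file, 'w') as out:
        out.write(header[column] + ',count\n')     # grouped column plus a count column
        for key, value in counts.items():
            out.write(key + ',' + str(value) + '\n')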
Example #5
def rm_repeat(file_list):
    rm_set = set()
    for line in file_list:
        content = u.create_content(line, 1)
        rm_set.add(content.replace('"', '') + '\n')
    return list(rm_set)
Example #6
def rm_repeat(file_list):
    rm_set = set()
    for line in file_list:
        content = u.create_content(line, 1)
        rm_set.add(content.replace('"', '') + '\n')
    return list(rm_set)


if __name__ == '__main__':

    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        file_dict[f.encode('gbk')] = rm_repeat(
            u.create_file_body(f.encode('utf-8')))

    total_file = open('total.csv', 'w+')
    for key, value in file_dict.items():
        total_file.writelines(value)
    total_file.close()

    count_file = open('total.csv', 'rb')
    for line in count_file:
        content = u.create_content(line, 1)
        count_dict[content] = count_dict.get(content, 0) + 1

    result_file = open(RESULT_FILE, 'w+')
    for key, value in count_dict.items():
        result_file.write(key + '\t' + str(value) + '\n')
    result_file.close()

    os.remove('total.csv')
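
In other words: duplicates are removed within each input file first, so the final count says in how many files each value of the key column appears. A standalone sketch of that two-step idea without the intermediate `total.csv` (the 0-based column index and the in-memory `file_bodies` structure are assumptions):

from collections import Counter

def count_across_files(file_bodies, column=1):
    # file_bodies: one list of CSV lines per input file; column is a 0-based index.
    counter = Counter()
    for body in file_bodies:
        # Dedup within a single file first, then count across files.
        values = {line.rstrip('\n').split(',')[column].replace('"', '') for line in body}
        counter.update(values)
    return counter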
Example #7
def createPattern(fileName):
    content = ''
    f = open(u.utf8_2_gbk(fileName))
    for line in f:
        if line[:-1].strip():
            content += '|' + '.*' + line.strip() + '.*'
    f.close()
    return content[1:].lower()


if __name__ == '__main__':
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)
    m = createPattern(MATCH_FILE)
    print m + '===>>' + u.utf8_2_gbk('若乱码,匹配词文件请使用gbk编码')  # "if garbled, save the keyword file in GBK encoding"
    p = re.compile(m)
    print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body))  # "source file line count:"
    for line in source_file_body:
        content = u.create_content(line, COLUMN).lower()
        if p.match(content):
            result_list.append(line)
        else:
            remove_list.append(line)
    print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list))  # "lines without keywords:"
    print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list))  # "lines with keywords:"
    u.create_result_file(u.changeFileName(SOURCE_FILE, '-含关键词.csv'),  # "-with keywords.csv"
                         source_file_head, result_list)
    u.create_result_file(u.changeFileName(SOURCE_FILE, '-不含关键词.csv'),  # "-without keywords.csv"
                         source_file_head, remove_list)
    raw_input('Press Enter to exit...')
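
For a quick sense of what `createPattern` produces: a keyword file containing, say, `apple` and `banana` yields the pattern `.*apple.*|.*banana.*`, and because `re.match` anchors at the start of the string, the leading `.*` is what lets a keyword match anywhere in the column. A small self-contained illustration (the keyword list is hypothetical):

import re

keywords = ['apple', 'banana']                            # stand-in for MATCH_FILE's contents
pattern = '|'.join('.*' + w + '.*' for w in keywords)     # '.*apple.*|.*banana.*'
p = re.compile(pattern)

print(p.match('fresh banana bread') is not None)          # True  -> result_list
print(p.match('cherry pie') is not None)                  # False -> remove_list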
Example #8
__author__ = "liangzhicheng"

SOURCENAME, SOURCEPATH = u.getFirstFile('csv')
cluster = {}
result_file_body = []
pattern = re.compile(r"\w|[/.,/#@$%^& ]")  # ASCII word chars and punctuation, stripped before keyword extraction
count_file_dict = {}

if __name__ == '__main__':

    try:
        source_file_head = u.create_file_head(SOURCEPATH, 'left', ['类型'])  # '类型' = "type"
        source_file_body = u.create_file_body(SOURCEPATH)
        print u.utf8_2_gbk('开始执行聚类')  # "start clustering"
        for num, line in enumerate(source_file_body):
            content = re.sub(pattern, '', u.create_content(line, COLUMN))
            if len(content) <= 20:
                keywords = jieba.analyse.extract_tags(content, topK=2)
            else:
                keywords = jieba.analyse.extract_tags(content, topK=TOPKET)
            keywords.sort()
            key = ','.join(keywords)
            cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
        print u.utf8_2_gbk('聚类完成,生成输出文件')  # "clustering done, writing the output file"
        for num, value in enumerate(cluster.itervalues()):
            cluster_list = value[2:].split(',')  # drop the leading "0," seed
            count_file_dict[num] = len(cluster_list)
            for n in cluster_list:
                result_file_body.append(
                    str(num) + ',' + source_file_body[int(n) - 1])
        u.create_result_file(u.changeFileName(SOURCENAME, '-聚类.csv'),  # "-clustered.csv"