Beispiel #1
0
        # NOTE(review): the enclosing function header, the opening `try:` and the
        # initialisation of count, keyWordCount and source_file_body are outside
        # this excerpt.
        # createMoreMatch returns a mapping (iterated with .items() below) and a
        # number that is printed below as the label count.
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        # Header for the result file; the label file's name up to its first '.'
        # is handed over in a one-element list (presumably the new column title
        # -- confirm against u.create_file_head).
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            # Progress counter, printed once per label.
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            # The value is split on '|'. Only values with exactly one or exactly
            # two parts are handled; any other number of parts matches neither
            # branch below and the label is silently skipped.
            words = value.strip().split('|')
            if len(words) == 1:
                # Single pattern: tag every line whose extracted content matches.
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    # match() only succeeds when the pattern matches at the start
                    # of content.
                    if p.match(content):
                        # Append "<key>|" to the stored line and count the hit.
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1
            if len(words) == 2:
                # Two patterns: the first must match and the second must not.
                c = createPattern(words[0], ACCURATE)
                f = createPattern(words[1], ACCURATE)
                cp = re.compile(c)
                fp = re.compile(f)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if cp.match(content) and not fp.match(content):
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1

        # Write the tagged lines, then the per-label hit counts.
        u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead, source_file_body)
        u.writeDictFile(u.changeFileName(combinefileName(SOURCE_FILE, LABEL_FILE), '统计.txt'), keyWordCount, 1)
    # Bare except: every exception raised in the try body is caught and its
    # traceback is written to error.txt (opened with 'w+', so earlier content is
    # overwritten); nothing is re-raised.
    # NOTE(review): the file object passed to print_exc is never closed
    # explicitly.
    except:
        traceback.print_exc(file=open('error.txt', 'w+'))
Beispiel #2
0
    # NOTE(review): the enclosing function header and the initialisation of
    # labelType, labelNum, count, keyWordCount and source_file_body are outside
    # this excerpt.
    # Two-column header row: the literal '内容' and the label file's name up to
    # its first '.', converted with u.utf8_2_gbk.
    matchHead = u.utf8_2_gbk('内容' + ',' + LABEL_FILE.split('.')[0] + '\n')

    print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')

    for key, value in labelType.items():
        # Progress counter, printed once per label.
        count += 1
        print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
        # The value is split on '|'. Only values with exactly one or exactly two
        # parts are handled; any other number of parts matches neither branch.
        words = value.strip().split('|')
        if len(words) == 1:
            # Single pattern, tested against the whole line (match() anchors at
            # the start of the line).
            c = createPattern(words[0])
            p = re.compile(c)
            for num, line in enumerate(source_file_body):
                if p.match(line):
                    # Append "<key>|" to the stored line and count the hit.
                    source_file_body[num] = source_file_body[num].strip(
                    ) + key + '|' + '\n'
                    keyWordCount[key] = keyWordCount.get(key, 0) + 1
        if len(words) == 2:
            # Two patterns: the first must match and the second must not.
            c = createPattern(words[0])
            f = createPattern(words[1])
            cp = re.compile(c)
            fp = re.compile(f)
            for num, line in enumerate(source_file_body):
                if cp.match(line) and not fp.match(line):
                    source_file_body[num] = source_file_body[num].strip(
                    ) + key + '|' + '\n'
                    keyWordCount[key] = keyWordCount.get(key, 0) + 1

    # Write the tagged lines, then the per-label hit counts.
    u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead,
                         source_file_body)
    u.writeDictFile(u.changeFileName(LABEL_FILE, '统计.csv'), keyWordCount, 1)
Beispiel #3
0
        # NOTE(review): the enclosing function header, the opening `try:` and the
        # initialisation of file_dict and count_dict are outside this excerpt.
        print u.utf8_2_gbk('开始执行')
        file_list = u.GetFileList(FILE_PATH, [])
        # Key: the GBK-encoded file name; value: whatever rm_repeat returns for
        # that file's body (presumably the lines with duplicates removed --
        # confirm against rm_repeat).
        for f in file_list:
            file_dict[f.encode('gbk')] = rm_repeat(u.create_file_body(f.encode('utf-8')))

        # Concatenate every file's lines into the intermediate file total.csv.
        # file() is the Python 2 built-in equivalent of open().
        total_file = file('total.csv', 'w+')
        for key, value in file_dict.items():
            total_file.writelines(value)
        total_file.close()

        # Re-read the intermediate file and count how often each extracted
        # content value occurs (create_content is called with column argument 1).
        count_file = open('total.csv', 'rb')
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1
        count_file.close()

        u.writeDictFile(RESULT_FILE, count_dict, 1)
        print u.utf8_2_gbk('执行完毕')
        # The reported path is sys.path[0] joined to RESULT_FILE with a backslash.
        print u.utf8_2_gbk('输出文件路径:') + sys.path[0] + u.utf8_2_gbk('\\' + RESULT_FILE)
    # Bare except: every exception is caught, its traceback is printed together
    # with a hint block, and the script waits for Enter; nothing is re-raised.
    # NOTE(review): if the failure happens while total_file or count_file is
    # open, that handle is not closed by this code.
    except:
        traceback.print_exc()
        print '=============================================================='
        print u.utf8_2_gbk('运行出错')
        print u.utf8_2_gbk('常见错误')
        print u.utf8_2_gbk('IndexError: list index out of range')
        print u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空')
        print '=============================================================='
        raw_input('Press Enter to exit...')
    # Runs after the try/except on both the success and the error path.
    # NOTE(review): if the try body failed before total.csv was created, this
    # call raises because the file does not exist.
    os.remove('total.csv')
    
Beispiel #4
0
    # NOTE(review): the enclosing function header and the initialisation of
    # labelWordp, i, keyWordCount, columnName and the upper-case constants are
    # outside this excerpt.
    # Read the first line of the source file and count its comma-separated
    # fields; that count is the reference column count used below.
    head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip()
    TOTALCOLUNM = len(head.split(','))
    print u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp)

    source_file_body = u.create_file_body(SOURCE_FILE)
    for key, value in labelWordp.items():
        # Progress counter, printed once per label.
        i += 1
        print u.utf8_2_gbk('当前执行到{0}个'.format(i))
        for num, line in enumerate(source_file_body):
            data = line.strip().split(',')
            # A line that already has one field more than the header is skipped.
            # Since a match below appends ',' + key, this presumably means the
            # line was tagged by an earlier label, so each line keeps only its
            # first matching label -- verify for rows that contain extra commas.
            if len(data) == TOTALCOLUNM + 1:
                continue
            # COLUNM is used as a 1-based column index.
            content = data[COLUNM - 1]
            # NOTE(review): value does not change inside this inner loop, so the
            # pattern is compiled once per line; it could be compiled once per
            # label instead.
            p = re.compile(value)
            # match() only succeeds when the pattern matches at the start of
            # content.
            if p.match(content):
                source_file_body[num] = source_file_body[num].strip() + ',' + key + '\n'
                keyWordCount[key] = keyWordCount.get(key, 0) + 1

    # Pad the format: every line that did not receive a label gets a trailing
    # empty field so it has the same number of fields as the tagged lines.
    for num, line in enumerate(source_file_body):
        data = line.strip().split(',')
        if len(data) == TOTALCOLUNM + 1:
            continue
        source_file_body[num] = source_file_body[num].strip() + ',' + '' + '\n'

    # Build the result header from the source file plus columnName, then write
    # the result file.
    result_file_head = u.create_file_head(SOURCE_FILE, 'right', [u.gbk_2_utf8(columnName)])
    u.create_result_file(RESULT_FILE, result_file_head, source_file_body)

    # The statistics file is named after LABELWORD's text before its first '.'.
    KEYWORD_FILE = LABELWORD.split('.')[0] + '统计.txt'
    u.writeDictFile(KEYWORD_FILE, keyWordCount)  # write out the statistics
Beispiel #5
0
        # NOTE(review): the enclosing function header, the opening `try:` and the
        # initialisation of source_file_body, pattern, cluster, count_file_dict,
        # result_file_body and source_file_head are outside this excerpt.
        print u.utf8_2_gbk('开始执行聚类')
        for num, line in enumerate(source_file_body):
            # Extract the content of column COLUMN and delete everything that
            # `pattern` matches.
            content = re.sub(pattern, '', u.create_content(line, COLUMN))
            # Content of length <= 20 gets 2 keywords, longer content gets TOPKET.
            if len(content) <= 20:
                keywords = jieba.analyse.extract_tags(content, topK=2)
            else:
                keywords = jieba.analyse.extract_tags(content, topK=TOPKET)
            # Sorting makes the joined key independent of the order in which the
            # keywords were returned.
            keywords.sort()
            key = ','.join(keywords)
            # Each cluster is stored as a string: the default 0 for a new key
            # gives it the prefix "0", and every member appends "," plus its
            # 1-based line number, e.g. "0,3,17".
            cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
        print u.utf8_2_gbk('聚类完成,生成输出文件')
        for num, value in enumerate(cluster.itervalues()):
            # value[2:] drops the leading "0," placeholder, leaving only the
            # member line numbers.
            cluster_list = value[2:].split(',')
            count_file_dict[num] = len(cluster_list)
            for n in cluster_list:
                # Output row: cluster index, a comma, then the original line
                # (n is 1-based, hence the - 1).
                result_file_body.append(
                    str(num) + ',' + source_file_body[int(n) - 1])
        # Write the clustered rows, then the size of each cluster.
        u.create_result_file(u.changeFileName(SOURCENAME, '-聚类.csv'),
                             source_file_head, result_file_body)
        u.writeDictFile(u.changeFileName(SOURCENAME, '-聚类统计.txt'),
                        count_file_dict, 1)
    # Bare except: every exception is caught, its traceback is printed together
    # with a hint block, and the script waits for Enter; nothing is re-raised.
    except:
        traceback.print_exc()
        print '=============================================================='
        print u.utf8_2_gbk('运行出错')
        print u.utf8_2_gbk('常见错误')
        print u.utf8_2_gbk('IndexError: list index out of range')
        print u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空')
        print '=============================================================='
        raw_input('Press Enter to exit...')