def tencent_classify_rawtext_files(files_root_path, result_path, pass_num=-1):
    """Classify every raw-text file in a directory and store it per class.

    Each entry of ``files_root_path`` is read as UTF-8, stripped of newlines
    and spaces, cleaned with ``wenzhi_utils.remove_illegal_characters`` and
    passed to ``wenzhi_utils.wenzhi_analysis``.  For every returned class
    whose confidence is above 0.5 the cleaned text is written to
    ``<result_path>/<class>/<entry name>.txt``.

    :param files_root_path: directory whose entries are classified.
    :param result_path: root directory of the per-class output folders;
        missing folders are created on demand.
    :param pass_num: resume point.  Entries are numbered from 1 and those
        whose number is below ``pass_num`` are skipped; the default of -1
        processes everything.
    :return: None
    """
    for index, fname in enumerate(os.listdir(files_root_path)):
        print('%s:%s' % (index, fname))
        # The printed counter is 0-based, the resume check is 1-based.
        if index + 1 < pass_num:
            continue

        source_path = os.path.join(files_root_path, fname)
        # Context manager so the input handle is released on every iteration.
        with codecs.open(source_path, 'r', encoding='utf8') as fin:
            ftext = fin.read()

        try:
            ftext = ftext.replace('\n', '')
            ftext = ftext.replace(' ', '')
            refined_text = wenzhi_utils.remove_illegal_characters(ftext)
            result = wenzhi_utils.wenzhi_analysis(refined_text)
        except Exception as e:
            # Best effort: any failure while cleaning or analysing the text
            # is reported and the file is skipped.
            print(e)
            continue

        if result['code'] != 0:
            continue

        for class_type in result['classes']:
            if not class_type['conf'] > 0.5:
                continue
            try:
                label = class_type['class']
            except KeyError as ke:
                print(ke)
                continue

            # Create the class folder up front instead of relying on a failed
            # open(); makedirs also covers a missing result_path.
            class_dir = os.path.join(result_path, label)
            if not os.path.isdir(class_dir):
                os.makedirs(class_dir)

            # Write explicit UTF-8 bytes: the input was decoded from UTF-8,
            # and an implicit ASCII encode fails on non-ASCII text.  A value
            # that is already a byte string is written unchanged.
            if isinstance(refined_text, bytes):
                payload = refined_text
            else:
                payload = refined_text.encode('utf8')
            with open(os.path.join(class_dir, fname + '.txt'), 'wb') as fout:
                fout.write(payload)
Esempio n. 2
0
def analyzse_article():
    """
    Article-analysis endpoint.

    Parses the request body as JSON, passes its ``article_content`` value to
    ``wenzhi_utils.wenzhi_analysis`` and reports the returned classes.

    NOTE(review): ``request`` is not defined in this block; presumably it is
    the web framework's request object -- confirm against the module imports.

    :return: JSON string of the form
        ``{"code": 0, "tag_result": [{"tag": ..., "prob": ...}, ...]}``.
        ``tag_result`` is empty when the analysis result's ``code`` is not 0;
        the returned ``code`` is 0 in either case.
    """
    req_data = json.loads(request.data)
    article_content = req_data.get('article_content')
    result = wenzhi_utils.wenzhi_analysis(article_content)

    tag_result = []
    if result['code'] == 0:
        tag_result = [
            {'tag': class_item['class'], 'prob': class_item['conf']}
            for class_item in result['classes']
        ]
    # ensure_ascii=False keeps non-ASCII tag names unescaped in the output.
    return json.dumps({'code': 0, 'tag_result': tag_result}, ensure_ascii=False)