Code example #1
def chs_to_cht(sentence):  # the argument is a list of strings
    """
    Convert Simplified Chinese to Traditional Chinese.
    :param sentence: list of sentences to convert
    :return: list of converted sentences
    """
    sentence = ",".join(sentence)
    sentence = Converter('zh-hant').convert(sentence)
    return sentence.split(",")
Code example #2
def cat_to_chs(sentence):  # the argument is a list of strings
    """
    Convert Traditional Chinese to Simplified Chinese.
    :param sentence: list of sentences to convert
    :return: list of converted sentences
    """
    sentence = ",".join(sentence)
    sentence = Converter('zh-hans').convert(sentence)
    return sentence.split(",")
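A minimal usage sketch for the two list-based helpers above. This is an illustration, not part of the original listing: it assumes langconv's Converter is imported where chs_to_cht and cat_to_chs are defined, and that no input sentence itself contains the "," used as the join/split delimiter.

# hypothetical usage; the commented results are indicative of langconv's default mapping
simplified = ["汉字", "转换测试"]
traditional = chs_to_cht(simplified)   # e.g. ['漢字', '轉換測試']
restored = cat_to_chs(traditional)     # maps back toward the simplified forms
print(traditional, restored)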
Code example #3
    def post(self):
        json_from_request = json.loads(
            Converter('zh-hant').convert(
                request.stream.read().decode('utf-8')))
        LOG.debug('In doTravelInsuranceCalculate, data received from TE: %s' %
                  json.dumps(json_from_request, ensure_ascii=False, indent=4))
        payload = {
            "DAY": 10,
            "HD_FACE_AMT": 1000000,
            "MR_FACE_AMT": 100000,
            "OHS_FACE_AMT": 100000
        }
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-Requested-With': 'XMLHttpRequest'
        }
        url = constants.CATHAYLIFE_URL + 'SYL0_M030/doTravelInsuranceCalculate'
        LOG.debug('request cathaylife travel insurance calculate API: %s' % url)
        LOG.debug('payload: %s' %
                  json.dumps(payload, ensure_ascii=False, indent=4))
        r = requests.post(url,
                          json=payload,
                          timeout=float(constants.REQUEST_TIMEOUT),
                          headers=headers)
        r_obj = r.json()
        LOG.debug('response: %s' %
                  json.dumps(r_obj, ensure_ascii=False, indent=4))
        # 'content' arrives as a string encoding a dict; eval() parses it here
        r_obj = eval(r_obj['msg_response']['update']['content'])
        res = r_obj['text'].replace("<br>", "")
        update_kv_map = {"response": res}
        ret = encapsule_rtn_format(update_kv_map, None)
        return Response(json.dumps(ret), status=200)
Code example #4
def filter_chinese(sentence: str) -> str:
    """
    Basic preprocessing for Chinese text.
    :param sentence: the input sentence or document
    :return: the cleaned sentence
    """
    # strip URLs from the text
    # sentence = re.sub(r"http\S+", "", sentence)
    # drop all digits
    # decimal_regex = re.compile(r"[^a-zA-Z]\d+")
    # sentence = decimal_regex.sub(r"", sentence)
    # drop English characters
    # eng_regex = re.compile(r'[a-zA-Z]')
    # sentence = eng_regex.sub(r"", sentence)
    # keep only Chinese characters and a few punctuation marks
    words = [
        word for word in sentence if word >= u'\u4e00' and word <= u'\u9fa5'
        or word in [',', '。', '?', '!']
    ]
    sentence = ''.join(words)
    # remove whitespace
    space_regex = re.compile(r"\s+")
    sentence = space_regex.sub(r"", sentence)
    # convert Traditional Chinese characters to Simplified
    sentence = Converter('zh-hans').convert(sentence)
    return sentence.strip().lower()
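A hedged usage sketch for filter_chinese, again an illustration rather than part of the original listing; it assumes re and langconv's Converter are imported in the module that defines the function.

# hypothetical input: the URL/digit/English filters above are commented out, so only
# the CJK range U+4E00..U+9FA5 and the punctuation ,。?! survive, whitespace is removed,
# and traditional characters are mapped to simplified
text = "維基百科 Wikipedia 2024,這是一個測試!"
print(filter_chinese(text))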
Code example #5
    def post(self):
        json_from_request = json.loads(
            Converter('zh-hant').convert(
                request.stream.read().decode('utf-8')))
        LOG.debug('In Give_Fake_Data, data received from TE: %s' %
                  json.dumps(json_from_request, ensure_ascii=False, indent=4))
        df = pd.read_excel("ASR_test.xlsx")
        ID = int(json_from_request['task_info']['ID'])
        # find
        update_kv_map = {}
        if 'counter' not in json_from_request['task_info']:
            update_kv_map['counter'] = 0
            ans = df.at[df[df['ID'] == ID].index[update_kv_map['counter']],
                        "answer"]
        else:
            update_kv_map[
                'counter'] = json_from_request['task_info']['counter'] + 1
            ans = df.at[df[df['ID'] == ID].index[update_kv_map['counter']],
                        "answer"]

        if update_kv_map['counter'] == (len(df[df['ID'] == ID]) - 1):
            update_kv_map['last_ans'] = "true"
        update_kv_map['ans'] = ans

        ret = encapsule_rtn_format(update_kv_map, None)
        return Response(json.dumps(ret), status=200)
Code example #6
def store_code_info(lang, code_info):
    global DB_CURSOR

    name_zh = Converter('zh-hant').convert(code_info['name'])

    if lang == 'en' and code_info['division_id'] != 0:
        pass
    elif lang == 'zh' and code_info['division_id'] != 0:
        sql = """INSERT INTO api_country_divisions (division_id, locale, name, adm_code)
	VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE name=%s"""
        params = (code_info['division_id'], 'zh_CN', code_info['name'],
                  code_info['adm_code'], code_info['name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)

        sql = """INSERT INTO api_country_division_localized_names (division_id, locale, localized_name)
	VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE localized_name=%s"""
        params = (code_info['division_id'], 'zh_CN', code_info['name'],
                  code_info['name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)

        sql = """INSERT INTO api_country_division_localized_names (division_id, locale, localized_name)
	VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE localized_name=%s"""
        params = (code_info['division_id'], 'zh', name_zh, name_zh)
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)
Code example #7
    def post(self):
        json_from_request = json.loads(
            Converter('zh-hans').convert(
                request.stream.read().decode('utf-8')))
        app_id = json_from_request['app_id']
        log.info('app_id: ' + app_id)
        try:
            conn = pymysql.Connect(host=const.DB_HOST,
                                   user=const.DB_ACCOUNT,
                                   passwd=const.DB_PASSWORD,
                                   charset='utf8')
            data = dao.Database(conn).query_bot_work_list(app_id)
            log.info(data)
            if len(data) == 1:
                result = {
                    'bot_id': data[0][0],
                    'work': data[0][1],
                    'return_flag': data[0][2],
                    'return_finish': data[0][3]
                }
                log.info(result)
                update_kv_map = {
                    "bot_response":
                    work_num_to_str(data[0][1]) +
                    return_flag_to_str(data[0][2])
                }
                ret = encapsule_rtn_format(update_kv_map, None)
                return Response(json.dumps(ret), status=200)
        except Exception as e:
            log.info("query_bot_work_list encountered an error: " +
                     utils.except_raise(e))
        finally:
            conn.close()
Code example #8
File: app.py Project: chamingkk/cn2tw
def handle_message(event):
    msg = event.message.text

    msgtw = Converter('zh-hant').convert(msg)
    if msgtw != msg:
        line_bot_api.reply_message(event.reply_token,
                                   TextSendMessage(text=msgtw))
Code example #9
File: utils.py Project: tcandzq/NLPToolKit
def traditional2simplified(sentence):
    """
    Convert the Traditional Chinese characters in sentence to Simplified Chinese.
    :param sentence: the sentence to convert
    :return: the sentence with Traditional characters converted to Simplified
    """
    sentence = Converter('zh-hans').convert(sentence)
    return sentence
Code example #10
def Traditional2Simplified(sentence):
    '''
    Convert the Traditional Chinese characters in sentence to Simplified Chinese.
    :param sentence: the sentence to convert
    :return: the sentence with Traditional characters converted to Simplified
    '''
    sentence = Converter('zh-hans').convert(sentence)
    return sentence
Code example #11
def Simplified2Traditional(sentence):
    '''
    Convert the Simplified Chinese characters in sentence to Traditional Chinese.
    :param sentence: the sentence to convert
    :return: the sentence with Simplified characters converted to Traditional
    '''
    sentence = Converter('zh-hant').convert(sentence)
    return sentence
Code example #12
def convert_cht_name(data):
    for node in data:
        if 'name' in node:
            edit = Converter('zh-hans').convert(node['name'])
            if node['name'] != edit:
                print('id:\'{}\':convert name from \'{}\' to \'{}\''.format(
                    node['id'], node['name'], edit))
                node['name'] = edit
Code example #13
def trd2smp(sentence):
    '''
    Convert the Traditional Chinese characters in sentence to Simplified Chinese.
    :param sentence: the sentence to convert
    :return: the sentence with Traditional characters converted to Simplified
    '''
    sentence = Converter('zh-hans').convert(sentence)
    return sentence
Code example #14
def smp2trd(sentence):
    '''
    Convert the Simplified Chinese characters in sentence to Traditional Chinese.
    :param sentence: the sentence to convert
    :return: the sentence with Simplified characters converted to Traditional
    '''
    sentence = Converter('zh-hant').convert(sentence)
    return sentence
Code example #15
File: utils.py Project: tcandzq/NLPToolKit
def simplified2traditional(sentence):
    """
    Convert the Simplified Chinese characters in sentence to Traditional Chinese.
    :param sentence: the sentence to convert
    :return: the sentence with Simplified characters converted to Traditional
    """
    sentence = Converter('zh-hant').convert(sentence)
    return sentence
Code example #16
    def post(self):
        json_from_request = json.loads(
            Converter('zh-hans').convert(
                request.stream.read().decode('utf-8')))
        user_execute_work = json_from_request['task_info']['bot_execute']
        log.info(json.dumps(json_from_request))
        app_id = json_from_request['app_id']
        log.info('app_id: ' + app_id)
        work_stats = 0
        update_kv_map = {}
        try:
            conn = pymysql.Connect(host=const.DB_HOST,
                                   user=const.DB_ACCOUNT,
                                   passwd=const.DB_PASSWORD,
                                   charset='utf8')
            data = dao.Database(conn).query_bot_work_list(app_id)
            log.debug(data)
            if len(data) == 1:
                work_stats = data[0][1]
        except Exception as e:
            log.info("query_bot_work_list encountered an error: " +
                     utils.except_raise(e))
        finally:
            conn.close()

        if work_stats == 0:
            try:
                data = {}
                data['bot_id'] = json_from_request['app_id']
                data['work'] = user_execute_work
                conn = pymysql.Connect(host=const.DB_HOST,
                                       user=const.DB_ACCOUNT,
                                       passwd=const.DB_PASSWORD,
                                       charset='utf8')
                update_row = dao.Database(conn).insert_work_to_bot_work_list(
                    data)
            except Exception as e:
                log.info("insert_work_to_bot_work_list encountered an error: " +
                         utils.except_raise(e))
            finally:
                conn.close()
            log.info(user_execute_work)
            update_kv_map = {
                "bot_response":
                work_num_to_str(int(user_execute_work)) + '派車,成功'
            }
        else:
            update_kv_map = {
                "bot_response":
                '正在執行' + work_num_to_str(work_stats) + '任務,請稍後在下命令'
            }

        ret = encapsule_rtn_format(update_kv_map, None)
        return Response(json.dumps(ret), status=200)
Code example #17
File: test.py Project: ubuntu733/SentencePairs
def preprocess_sentence(sentence):
    '''
    sentence = re.sub(r'\*+', '', sentence)
    sentence = re.sub(
        u"[’!\"#$%&'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+", "", sentence
    )
    '''

    sentence = Converter('zh-hans').convert(sentence)
    sentence = change_sentence(sentence)
    return sentence
Code example #18
def Traditional2Simplified(sentences):
    '''
    Convert the Traditional Chinese characters in each sentence to Simplified Chinese.
    :param sentences: the list of sentences to convert
    :return: the list of converted sentences
    '''

    ans = []
    for s in tqdm(sentences):
        ans.append(Converter('zh-hans').convert(s))
    return ans
Code example #19
File: api.py Project: pimcms/fetch1688
def item_get(iid):
    '''
    Call the item_get endpoint of the Onebound API and return the item details.
    '''

    url = "{url}?key={apiKey}&secret={apiSecret}&api_name=item_get&num_iid={iid}".format(
        url=cfg.api['url'],
        apiKey=cfg.api['key'],
        apiSecret=cfg.api['secret'],
        iid=iid)

    downloaded = False
    try_times = 0

    try:
        # retry until the item is downloaded or the configured try limit is reached
        while not downloaded and try_times < cfg.api['max_try_times']:
            try_times += 1
            try:
                r = requests.get(url, headers=cfg.headers, timeout=30).json()
                r_t = Converter('zh-hant').convert(
                    json.dumps(r, ensure_ascii=False))
                json_obj = json.loads(r_t)
            except Exception as e:
                print(bcolors.FAIL + '''
    API 回傳了無效的資料格式,請聯繫 API 供應商取得協助。
    錯誤信息:
    {}
    收到的資料:
    {}
    '''.format(e, json.dumps(r)) + bcolors.ENDC)
                return None

            # handle API error responses
            if 'item' not in json_obj:
                print(bcolors.FAIL + '''API 服務發生錯誤,請聯繫 API 供應商或程式開發者。''' +
                      bcolors.ENDC)
                if 'error' in json_obj:
                    print('錯誤信息:')
                    print(json_obj['error'])
                    print(bcolors.ENDC)
                return None

            item = json_obj['item']

            # only exit the loop once the downloaded data is non-empty
            if item['title'] != '':
                downloaded = True

        return item
    except:
        return None
Code example #20
def speech_to_text(file_name):  # function for audio recognition
    wav_file_name = convert_audio_extension(file_name)
    # file type: .wav
    r = sr.Recognizer()

    harvard = sr.AudioFile(wav_file_name)

    try:
        with harvard as source:
            audio = r.record(source)
            recognize_text = r.recognize_google(audio, language='zh-tw')
            recognize_text = Converter('zh-hant').convert(recognize_text)
    except:
        recognize_text = "無法辨識內容"
    return recognize_text
Code example #21
def train_wiki_word2vector():
    # word-vector training; this part targets the Chinese Wikipedia corpus
    # for the Wiki_Zh_Corpus: convert Traditional to Simplified; the wiki file handling starts here
    f_read = open(os.path.join(sys.path[0], 'extracted_data.txt'),
                  'r',
                  encoding='utf-8')
    f_write = open(os.path.join(sys.path[0], 'processed_extracted_data.txt'),
                   'w',
                   encoding='utf-8')
    for line in f_read:
        sentence = Converter('zh-hans').convert(line)
        f_write.write(sentence)
    f_read.close()
    f_write.close()

    path_main = sys.path[0] + '/Data Source'
    data = codecs.open('processed_extracted_wiki_data.txt',
                       'r',
                       encoding='utf-8')
    processed_data = process_raw_articles(path_main, data)

    i = 0
    space = ' '
    current_file = codecs.open('clear_wiki_zh_data.txt', 'w', encoding='utf-8')
    for text in processed_data:
        current_file.write(space.join(text) + "\n")
        i = i + 1
        if i % 100 == 0:
            print("Saved " + str(i) + " articles")
    current_file.close()

    sentences = word2vec.Text8Corpus("clear_wiki_zh_data.txt")

    model = Word2Vec(sentences,
                     size=200,
                     window=5,
                     min_count=10,
                     workers=multiprocessing.cpu_count(),
                     sample=0.001,
                     sorted_vocab=True)
    model.wv.save_word2vec_format('word2vector_wiki_zh_info.bin2',
                                  binary=False)
Code example #22
    def client_translate(self):
        global dic
        global root

        #T.delete(0,END)
        print("Start Translate\n")
        #pyperclip.copy("""今天天氣如何""")
        msg = pyperclip.paste()
        if msg == "":
            msg = """今天天氣如何"""

        if self._lastTranslated == msg:
            if self._isAuto == True:
                # keep doing auto translation
                self._timer = threading.Timer(2.0, self.client_translate)
                self._timer.start()
            return

        T.delete(1.0, END)
        translatedMsg = dic.translateFinal(msg)
        #line = Converter('zh-hans').convert(line.decode('utf-8'))
        #line = line.encode('utf-8')

        chtMsg = Converter('zh-hant').convert(translatedMsg)
        #print(translatedMsg)
        print(chtMsg)

        self._lastTranslated = msg
        #T.insert(END,translatedMsg)
        if self._isBilingual == True:
            T.insert(END, msg + "\n")

        T.insert(END, chtMsg)

        if self._isAuto == True:
            # keep doing auto translation
            self._timer = threading.Timer(2.0, self.client_translate)
            self._timer.start()
Code example #23
# -*- coding: utf-8 -*-
__author__ = 'v-tedl'

import os
from langconv import Converter
path = r'%s/redirectData/' % os.getcwd()
KBItem2RedirectItem = dict()
for file in os.listdir(path):
    print "process %s..." % file
    fp = open(path + file, 'r')
    for line in fp:
        KBItem, RedirectItem = line.strip('\n').split('\t')
        if KBItem == "KBItem":
            continue
        SimRedirectItem = Converter('zh-hans').convert(RedirectItem.decode('utf8')).encode('utf8')
        if KBItem in KBItem2RedirectItem.iterkeys():
            KBItem2RedirectItem[KBItem].append(RedirectItem)
            if SimRedirectItem != RedirectItem:
                KBItem2RedirectItem[KBItem].append(SimRedirectItem)
        else:
            KBItem2RedirectItem[KBItem] = [RedirectItem]
            if SimRedirectItem != RedirectItem:
                KBItem2RedirectItem[KBItem].append(SimRedirectItem)
    fp.close()
lines = []
for key in KBItem2RedirectItem.iterkeys():
    simKey = Converter('zh-hans').convert(key.decode('utf8')).encode('utf8')
    if key == simKey:
        line = '\t'.join([key] + KBItem2RedirectItem[key])
    else:
        line = '\t'.join([key, simKey] + KBItem2RedirectItem[key])
Code example #24
def Traditional2Simplified(sentence):
    sentence = Converter('zh-hans').convert(sentence)
    return sentence
Code example #25
    return (1, 1)


if __name__ == '__main__':
    (conn, cursor) = cardsdb.init()
    filename = '大表'
    wb = openpyxl.load_workbook(filename + '.xlsx')
    sheet = wb[wb.sheetnames[0]]
    (base_row, base_col) = find_base_cell(sheet)
    for r in range(base_row, sheet.max_row + 1):
        cell = sheet.cell(r, base_col).value
        if cell == 'Card name':
            for i in range(4):
                sheet.cell(r, base_col + i).value = translate_local(
                    cursor,
                    sheet.cell(r, base_col + i).value)
        else:
            sheet.cell(r, base_col).value = translate_local(cursor, cell)
            for i in range(1, 4):
                sheet.cell(r, base_col + i).number_format = '0.0000'
    sheet.column_dimensions[get_column_letter(base_col)].width = 13
    sheet.column_dimensions[get_column_letter(base_col + 1)].width = 8
    sheet.column_dimensions[get_column_letter(base_col + 2)].width = 11
    sheet.column_dimensions[get_column_letter(base_col + 3)].width = 11

    for r in range(base_row, sheet.max_row + 1):
        cell = sheet.cell(r, base_col).value
        if cell != None:
            sheet.cell(r, base_col).value = Converter('zh-hans').convert(cell)
    wb.save(filename + '翻译版.xlsx')
    conn.close()
Code example #26
File: userTags.py Project: devyumao/user-tags
import jieba as jb
import logging
from stopWords import removeStopWordsJbg
import MySQLdb as mdb
from langconv import Converter

jb.enable_parallel()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

weiboAppKey = '82966982'
tagsDict = {}
a = 1
b = 2
L = 2
cvt = Converter('zh-hans')

def setEleInDict(d, e, n):
    if e not in d:
        d[e] = n
    else:
        d[e] += n

def setTagInDict(d, i, t, n):
    if i not in d:
        d[i] = [t, n]
    else:
        d[i][1] += n

def getWordsFromWeibo(text):
    text = text.lower()
Code example #27
def tradition2simple(word):
    return Converter('zh-hans').convert(word)
Code example #28
def simple2tradition(word):
    return Converter('zh-hant').convert(word)
Code example #29
def convert_track(tagger, metadata, release, track):
    for key in metadata:
        if not isinstance(metadata[key], (str, unicode)):  # skip non-string values
            continue
        metadata[key] = Converter(TO_LOCAL).convert(metadata[key])
Code example #30
def store_lang_info(lang, lang_info):
    global DB_CURSOR

    if lang_info['iso_639_2'] != '':
        iso_639_2 = lang_info['iso_639_2'].split('/')
        if len(iso_639_2) == 2:
            lang_info['iso_639_2'] = iso_639_2[0]
            lang_info['iso_639_2_t'] = iso_639_2[1]
        else:
            lang_info['iso_639_2_t'] = lang_info['iso_639_2']
    else:
        lang_info['iso_639_2_t'] = ''

    if not re.match(r'^[a-z]{3}$', lang_info['iso_639_2']):
        lang_info['iso_639_2'] = None
    if not re.match(r'^[a-z]{3}$', lang_info['iso_639_2_t']):
        lang_info['iso_639_2_t'] = None

    if len(lang_info['iso_639_3']) > 3:
        lang_info['iso_639_3'] = lang_info['iso_639_3'][0:3]
    if not re.match(r'^[a-z]{3}$', lang_info['iso_639_3']):
        lang_info['iso_639_3'] = None

    if lang == 'zh' and lang_info['iso_639_1'] != '':
        print(
            "INFO (%s) > Got and stored language: %s, %s, %s, %s (%s, %s, %s, %s)"
            % (myToolbox.get_time(), lang_info['iso_639_1'],
               lang_info['iso_639_2'], lang_info['iso_639_2_t'],
               lang_info['iso_639_3'], lang_info['self_name'], lang_info['en'],
               lang_info['zh_CN'], lang_info['zh_TW']))

        sql = """INSERT INTO api_languages (iso_639_1_code, iso_639_2_b_code, iso_639_2_t_code, iso_639_3_code, self_name)
	VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE self_name=%s"""
        params = (lang_info['iso_639_1'], lang_info['iso_639_2'],
                  lang_info['iso_639_2_t'], lang_info['iso_639_3'],
                  lang_info['self_name'], lang_info['self_name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)

        sql = """INSERT INTO api_language_localized_names (iso_639_1_code, locale, localized_name)
	VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE localized_name=%s"""

        params = (lang_info['iso_639_1'], lang_info['iso_639_1'],
                  lang_info['self_name'], lang_info['self_name'])
        if DB_CURSOR is None:
            myToolbox.print_sql(sql, params)
        else:
            DB_CURSOR.execute(sql, params)

        if lang_info['en'] != '':
            params = (lang_info['iso_639_1'], 'en', lang_info['en'],
                      lang_info['en'])
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)

        if lang_info['zh_CN'] != '':
            params = (lang_info['iso_639_1'], 'zh_CN', lang_info['zh_CN'],
                      lang_info['zh_CN'])
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)

            name_zh = Converter('zh-hant').convert(lang_info['zh_CN'])
            params = (lang_info['iso_639_1'], 'zh', name_zh, name_zh)
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)

        if lang_info['zh_TW'] != '':
            params = (lang_info['iso_639_1'], 'zh_TW', lang_info['zh_TW'],
                      lang_info['zh_TW'])
            if DB_CURSOR is None:
                myToolbox.print_sql(sql, params)
            else:
                DB_CURSOR.execute(sql, params)
    else:
        pass
Code example #31
File: data.py Project: cjx3721/QA
class Database(list):

    _re_split = re.compile("(?<=</doc>).*?(?=<doc)", flags=re.DOTALL)
    _re_ignore = re.compile("<br>")
    _xml_escape = [(" & ", " &amp; ")]
    _converter = Converter("zh-hans")

    @staticmethod
    def cond_length(length):
        return lambda x: len(x[0]) > length

    @staticmethod
    def cond_title(condition):
        return lambda x: condition(x[1]["title"])

    def __init__(self, database=None, conditions=[]):
        if isinstance(database, list):
            super(Database, self).__init__(database)

        elif isinstance(database, str):
            self.load_data(database, conditions=conditions)

    def load_data(self, path, conditions=[]):
        walk = os.walk(path)
        total = reduce(int.__add__, map(lambda t: len(t[2]), walk))
        count = 0
        success = 0
        filtered = 0
        fail = 0
        print "loading data from %s" % path

        self.entities = set()
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                fullname = os.path.join(dirpath, filename)
                xml_raw = open(fullname, "r").read()
                for xml in self._re_split.split(xml_raw):
                    xml = self._re_ignore.sub("", xml)
                    for old, new in self._xml_escape:
                        xml = xml.replace(old, new)

                    try:
                        e = et.fromstring(xml)
                        if not isinstance(e.text, unicode):
                            e.text = e.text.decode()
                        if not isinstance(e.attrib["title"], unicode):
                            e.attrib["title"] = e.attrib["title"].decode()
                        e.text = self._converter.convert(e.text)
                        for key in e.attrib:
                            e.attrib[key] = self._converter.convert(
                                e.attrib[key])
                        data = (e.text, e.attrib)
                        for condition in conditions:
                            if not condition(data):
                                filtered += 1
                                break
                        else:
                            success += 1
                            self.append(data)
                            self.entities.add(e.attrib["title"])
                    except et.ParseError:
                        fail += 1

                count += 1
                #if count >= 10 :
                #	break
                print "\rfiles: %d / %d" % (count, total),
                sys.stdout.flush()

        print "... %d loaded, %d filtered, %d fails" % (success, filtered,
                                                        fail)

    @property
    def sentences(self):
        # return tokenized sentences
        for text, attrib in self:
            yield list(jieba.cut(text))

    @property
    def dictionary(self):
        # return gensim Dictionary
        if not hasattr(self, "_dictionary"):
            self._dictionary = Dictionary(self.sentences, prune_at=None)
        return self._dictionary

    @property
    def corpus(self):
        # return sparse vectors for gensim models
        if not hasattr(self, "_corpus"):
            self._corpus = [
                self.dictionary.doc2bow(sentence)
                for sentence in self.sentences
            ]
        return self._corpus

    def save(self, filename):
        pickle.dump(self, open(filename, "w"))

    @staticmethod
    def load(filename):
        return pickle.load(open(filename))