Example #1
0
def get_special_population(text):
    """Classify *text* into a special-population label.

    Delegates to qa_pro_special_population(); any failure is logged and
    swallowed so callers always get a label.

    :param text: question text (any str type; coerced to unicode).
    :return: population label string; 'common_population' on any error.
    """
    text = ensure_unicode(text)
    try:
        return qa_pro_special_population(text)
    except Exception as e:  # `except E, e` is Py2-only; `as` works on 2.6+/3.x
        print('get_special_population Exception %s' % e)
        return 'common_population'
Example #2
0
def get_sex(text):
    """Detect the sex mentioned in *text*.

    :return: 2 for male (男), 1 for female (女), 0 when neither appears.
             Male wins when both characters are present.
    """
    text = ensure_unicode(text)
    for marker, code in ((u"男", 2), (u"女", 1)):
        if marker in text:
            return code
    return 0
def load_simple_lines(filename):
    """Load a file into a set of unique, stripped unicode lines.

    :param filename: path to a text file, one entry per line.
    :return: set of unicode lines (blank lines become u'' and are kept,
             matching the original behavior).
    """
    res = set()
    with open(filename, 'r') as f:
        for line in f:
            # strip() already removes '\n'; the original's extra
            # strip('\n') was redundant.
            res.add(ensure_unicode(line.strip()))
    # Single-argument print is valid in both Python 2 and 3.
    print("load from file %s" % filename)
    return res
def get_vecs_weighted(words, weights):
    """Look up word vectors, keeping weights aligned with found vectors.

    Words without a vector (get_vec returns None) are dropped together
    with their weight so the two returned lists stay parallel.
    NOTE: marked "no longer used" in the original source.

    :param words: sequence of words (coerced to unicode).
    :param weights: sequence of weights parallel to *words*.
    :return: (vecs, new_weights) parallel lists.
    """
    vecs = []
    new_weights = []
    # zip() replaces the original index-based range(len(words)) loop.
    for word, weight in zip(words, weights):
        vec = get_vec(ensure_unicode(word))
        if vec is None:
            continue
        vecs.append(vec)
        new_weights.append(weight)
    return vecs, new_weights
Example #5
0
def population_pros2(population_type):
    """Return the first-push recommended product category codes for a
    population type.

    :param population_type: population label (coerced to unicode).
    :return: list of category code strings; [] when nothing to recommend.
    """
    population_type = ensure_unicode(population_type)
    if population_type in (u"children", u"pregnant_woman", u"lactating_women"):
        return [u"MYBJ", u"MR", u"YY"]
    if population_type == u"elder":
        return [u"PY", u"YY", u"AZ", u"TNB"]
    # NOTE(review): the original had another `== u"children"` branch here,
    # but it was unreachable ("children" already matches the tuple above);
    # removed as dead code -- behavior unchanged.
    if population_type == u"for_pregnant":
        return [u"MYBJ", u"YY"]
    return []
def load_from_file(filename, limit=None, to_unicode=True):
    """Read non-empty stripped lines from *filename*, in order.

    :param filename: path to a text file.
    :param limit: maximum number of lines to read (None or 0 = no limit).
                  BUG FIX: the original broke at `i >= limit - 1` and
                  therefore read one line fewer than requested (limit=1
                  returned nothing); it now reads exactly *limit* lines.
    :param to_unicode: coerce each line via ensure_unicode when True.
    :return: list of non-empty lines.
    """
    res = []
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            if limit and i >= limit:
                break
            line = line.strip()
            if to_unicode:
                line = ensure_unicode(line)
            if line:
                res.append(line)

    return res
def get_vecs_weighted3(words):
    """Batch variant of get_vecs_weighted2 using the word2vec
    get_vec_list_redis interface.

    The returned dict's keys are guaranteed unicode; vectors are already
    normalized and words missing from word2vec are already dropped, but
    the values are plain lists, not ndarrays -- hence np.array() below.

    :return: (vecs, keep_indices) -- ndarrays plus the positions in
             *words* that had a vector.
    """
    vec_dict = get_vec_list_redis(list(words))

    keep_indices = []
    vecs = []
    for idx, raw_word in enumerate(words):
        key = ensure_unicode(raw_word)
        if key not in vec_dict:
            continue
        keep_indices.append(idx)
        vecs.append(np.array(vec_dict[key]))
    return vecs, keep_indices
def get_vecs_weighted2(words):
    """Look up a vector per word, returning the vectors plus the indices
    of the words that actually had one (so callers can align weights).

    NOTE: marked "no longer used" in the original source.

    :param words: sequence of words (coerced to unicode).
    :return: (vecs, keep_indices) parallel lists.
    """
    keep_indices = []
    vecs = []
    # enumerate() replaces the original index-based range(len(words)) loop.
    for i, word in enumerate(words):
        vec = get_vec(ensure_unicode(word))
        if vec is None:
            continue
        vecs.append(vec)
        keep_indices.append(i)

    return vecs, keep_indices
Example #9
0
def find_bdpart(tags, max_len=4):
    """Collect every substring of the given tags that is a known body part.

    All substrings starting at each position are tested until the
    `size > max_len` cutoff fires -- note this means windows of up to
    max_len + 1 characters are tested (preserved from the original).

    :param tags: iterable of tag strings (coerced to unicode).
    :param max_len: window-size cutoff (see note above).
    :return: set of substrings recognized as body parts.
    """
    cutups = set()
    for tag in (ensure_unicode(t) for t in tags):
        tag_len = len(tag)
        for start in range(tag_len):
            for size in range(tag_len - start):
                if size > max_len:
                    break
                candidate = tag[start:start + size + 1]
                if get_db_data_local_handler().is_in_bodypart(candidate):
                    cutups.add(candidate)
    return cutups
Example #10
0
def is_baby_text(text):
    '''
    Rule-based check for whether *text* is about babies.

    :param text: input text (coerced to unicode).
    :return: True when any BABY_WORDS keyword occurs as a substring or
             any compiled BABY_SEARCH pattern matches; else False.
    '''
    text = ensure_unicode(text)
    if any(word in text for word in BABY_WORDS):
        return True
    return any(pattern.search(text) for pattern in BABY_SEARCH)
def get_bodypart_word(text):
    # md4 simple_medical_entity
    # Query the simple-medical-entity Solr core for entities of type
    # "bodypart" whose name matches *text*, then keep only the names
    # that literally occur inside *text*.
    # NOTE(review): *text* is interpolated into the Solr query string
    # unescaped -- Solr special characters in user input could break or
    # alter the query; consider escaping.
    text = ensure_unicode(text)
    solr_query = SolrQuery()
    query = 'name:%s' % text
    solr_query.set('q', query)
    solr_query.set('fl', ['name'])  # only the name field is needed
    solr_query.set('rows', 3)  # top-3 hits are enough for substring filtering
    solr_query.set('fq', 'type:bodypart')  # restrict to body-part entities
    bp_words = []
    for item in solr_sme.search(**solr_query.get_query_dict()):
        names = item['name']  # presumably a multi-valued field (list) -- TODO confirm
        for name in names:
            if name in text:
                bp_words.append(name)
    return bp_words
Example #12
0
def qa_ask_info(text):
    """Split a question into (body, sex, age) by parsing its tail.

    Inspects the last 1-8 characters: a 男/女 character found there is
    the sex; whatever remains of the tail after removing sex characters,
    parentheses and commas is taken as the age.

    :param text: raw question text (coerced to unicode, stripped).
    :return: (text_without_tail, sex, age); sex and age are '' when absent.
    """
    sex = ''
    age = ''
    text = ensure_unicode(text).strip()
    ask_tail_pattern = re.compile(u"(.{1,8})$")
    # BUG FIX: the original class was [男|女] -- inside a character class
    # '|' is a literal, so a '|' in the tail was wrongly matched as sex.
    sex_pattern = re.compile(u"[男女]")

    tail = ask_tail_pattern.search(text)

    if tail:
        text = ask_tail_pattern.sub("", text)
        tail = tail.group(0)
        sex_match = sex_pattern.search(tail)
        if sex_match:
            sex = sex_match.group(0)
        age = sex_pattern.sub(u"", tail.replace(u"(", u"").replace(u")", "")).replace(u",", u"").strip()
    return text, sex, age
Example #13
0
def population_cons2(population_type):
    """Return the product category codes to EXCLUDE for a population type.

    :param population_type: population label or None.
    :return: list of category codes; [] for None or pregnant women.
    """
    if population_type is None:
        return []
    population_type = ensure_unicode(population_type)
    explicit = {
        u"children": [u"MALE", u"NX", u"LXXZ"],
        u"lactating_women": [u"MALE"],
        u"common_population_men": [u"NX", u"MYBJ", u"LXXZ"],
        u"common_population_women": [u"MALE", u"MYBJ", u"LXXZ"],
        u"common_population": [u"MYBJ", u"LXXZ"],
    }
    if population_type in explicit:
        return explicit[population_type]
    # Every remaining type except pregnant women gets the default exclusion.
    if population_type == u"pregnant_woman":
        return []
    return [u"MYBJ"]
Example #14
0
def find_systag_keywords_extend(text, max_len=7):
    """Find extended systag keywords occurring in *text*.

    Tests every substring of length 1..max_len (per start position)
    against the extended-keyword table.

    :param text: input text (coerced to unicode).
    :param max_len: maximum substring length to test.
    :return: (systag_id_dict, cnt_dict) where systag_id_dict maps a
             matched word to its related systag-id list and cnt_dict
             accumulates the match weight per word.
    """
    cnt_dict = defaultdict(int)
    systag_id_dict = defaultdict(list)
    text = ensure_unicode(text)
    l = len(text)
    for begin_index in range(l):
        cnt = 0
        for end_index in range(begin_index + 1, l + 1):
            if cnt >= max_len:
                # cnt only grows, so every remaining end_index would be
                # skipped too; the original used `continue`, which spun
                # the rest of the loop uselessly -- `break` is equivalent.
                break
            w = text[begin_index:end_index]
            relation_systag_id, weight = get_db_data_local_handler(
            ).get_extend_keyword_relation_systag_id(w)
            if relation_systag_id:
                cnt_dict[w] += weight
                systag_id_dict[w] = relation_systag_id
            cnt += 1
    return systag_id_dict, cnt_dict
def get_vec_list_redis(word_list):
    # Redis-backed version of get_vec_list; vectors are normalized and
    # are plain lists, not ndarrays.
    # Returns {unicode word -> normalized vector}; words with no vector
    # in word2vec are omitted from the result.
    # Step 1: try fetching from redis; NOT_IN_REDIS_SIGN marks words
    # that are not cached there.
    word_list = [ensure_unicode(x) for x in word_list]
    vec_dict = {}
    not_in_redis_indices = []
    redis_res = Word2VecCache.get_vec_list(word_list)  #

    # During a data swap, assume no word is in redis and refetch all from
    # the api, i.e. redis_res = [NOT_IN_REDIS_SIGN]*len(word_list)
    # redis_res = [NOT_IN_REDIS_SIGN]*len(word_list)
    for i, vec in enumerate(redis_res):
        # vec is already normalized
        if vec == NOT_IN_REDIS_SIGN:
            not_in_redis_indices.append(i)
        else:
            if vec not in BAD_VEC_SIGN:  # keep only words that have a real vector
                vec_dict[word_list[i]] = vec
    # Step 2: words missing from redis are fetched from the api and then
    # written back into redis.
    not_in_redis_word_list = [word_list[i] for i in not_in_redis_indices]

    # print 'not_in_redis_word_list', '|||'.join(not_in_redis_word_list)
    # for x in vec_dict:
    #     print 'already in redis good word', x, len(vec_dict[x])

    vec_dict_from_api = get_vec_list(not_in_redis_word_list)

    # for x in vec_dict_from_api:
    #     print 'vec_dict_from_api', x, len(vec_dict_from_api[x])

    for word in vec_dict_from_api:
        vec = vec_dict_from_api[word]
        vec = norm_list(vec)  # normalize

        if vec != NOT_IN_WORD2VEC_SIGN_API:  # keep only words the word2vec api had a vector for
            vec_dict[word] = vec
        Word2VecCache.set_vec(word, vec)  # cached unconditionally -- note the no-vector sign is stored too

    # vec_dict.update(vec_dict_from_api)

    # for x in vec_dict:
    #     vec = vec_dict[x]
    #     print 'final word', x, type(x),len(vec), type(vec), math.sqrt(sum([y * y for y in vec]))

    return vec_dict
Example #16
0
def is_for_pregnant(text):
    """Rule-based check: is *text* from someone trying to conceive?

    (rule attributed to Liu Huizhu in the original comment)

    :param text: input text (coerced to unicode).
    :return: True when any pattern occurs as a substring.
    """
    text = ensure_unicode(text)
    # NOTE: some entries are substrings of others (e.g. 能怀孕 already
    # covers 能不能怀孕) -- kept for readability. An exact duplicate
    # entry (如何怀上宝宝, listed twice) was removed; behavior unchanged.
    patterns = (
        u"想怀孕",
        u"尝试怀孕",
        u"怎么才能怀孕",
        u"能不能怀孕",
        u"能怀孕",
        u"可不可以怀孕",
        u"可以怀孕",
        u"备孕",
        u"影响怀孕",
        u"想要宝宝",
        u"要宝宝",
        u"要孩子",
        u"要小孩",
        u"想生宝宝",
        u"想生小孩",
        u"想生孩子",
        u"如何怀孩子",
        u"如何生孩子",
        u"怎样怀孩子",
        u"怎样生孩子",
        u"如何怀宝宝",
        u"如何生宝宝",
        u"怎样怀宝宝",
        u"怎样生宝宝",
        u"如何怀上孩子",
        u"怎样怀上孩子",
        u"如何怀上宝宝",
    )
    return any(p in text for p in patterns)
Example #17
0
 def get_keyword_relation_systag_id(cls, word):
     # Return the systag-id list registered for *word* ([] when absent).
     key = ensure_unicode(word)
     return cls.systag_data['keyword'].get(key, [])
def get_vecs2(words):
    """Return the vectors for *words*, skipping words without one.

    Weights are not considered. FIX: the original called get_vec() twice
    per word (once to filter, once for the value); it is now called once.

    :param words: sequence of words (coerced to unicode).
    :return: list of vectors, in word order, Nones dropped.
    """
    vecs = (get_vec(ensure_unicode(word)) for word in words)
    return [vec for vec in vecs if vec is not None]
def get_news_from_bigsearch(query):
    """Run *query* through the big-search news endpoint and return the
    matching news ids as ints."""
    query = ensure_unicode(query)
    response = json.loads(more_news(query))
    return [int(news_id) for news_id in response[0]["ids"]]
Example #20
0
def get_systag_data():
    """Build and persist the hot-sale systag dataset.

    Reads systag config from ner_systagsolrgenerateconf and assembles:
      - data['systag']: systag_id -> {'tag_name': ..., 'plan': [...]}
      - data['keyword']: keyword -> [systag_id, ...]
      - data['keyword_extend']: keyword (incl. similar words) ->
        [systag_id_list, similarity]
      - data['clinic_no']: clinic number -> [systag_id, ...]
    Side effects: prints each plan row, writes a human-readable dump to
    SYSTAG_DATA_CHECK_FILE, and pickles the result to SYSTAG_DATA_FILE.
    """
    # Fetch hot-sale tag data: keywords, target_param, name, etc.
    sql = "select sysTag_id, keywords ,clinic_no,second_clinic_no from ner_systagsolrgenerateconf;"
    data = dict()
    data["systag"] = {}
    # 9:{'tag_name':'gastroscope_colonoscope','plan':[{'url':url1,'name':name1},{'url':url2,'name':name2}]}
    data['keyword'] = defaultdict(list)  # e.g. u'感冒': [systag_id1, systag_id2...]
    data['keyword_extend'] = {}
    data['clinic_no'] = defaultdict(list)  # e.g. u'1': [systag_id1]
    all_plan_name = []
    o = get_diagnose_handler().dbhandler.do_one(sql)

    for item in o:
        systag_id = item[0]
        keywords = item[1].strip()
        clinic_no = item[2].strip()
        second_clinic_no = item[3].strip()

        # Map clinic numbers to this systag_id; first- and second-level
        # clinic numbers go into the same dict, undistinguished.
        if clinic_no:
            clinic_nos = clinic_no.split()
            for x in clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)
        if second_clinic_no:
            second_clinic_nos = second_clinic_no.split()
            for x in second_clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)

        # data['systag']
        tag_name = get_diagnose_handler().get_systag_en_name(systag_id)
        # NOTE(review): tag_name is interpolated into SQL unescaped;
        # it appears to come from internal config, but a parameterized
        # query would be safer -- confirm.
        sql1 = 'select id,name,target_param from api_userhomehotsalegallery where tag="%s" and is_online=1;' % tag_name
        o1 = get_medicaldb_handler().do_one(sql1)

        data['systag'][systag_id] = {'tag_name': tag_name, 'plan': []}

        if not o1:
            continue

        for item1 in o1:
            plan_id = item1[0]
            name = item1[1]
            url = item1[2].replace('\r\n', '')
            print systag_id, tag_name, name, url
            data['systag'][systag_id]['plan'].append({
                'url': url,
                'name': name,
                'plan_id': plan_id
            })

            all_plan_name.append([systag_id, name])

        # A '*' keyword row means "match everything"; no keyword entries.
        if keywords == u"*":
            continue
            # data['keyword']
        keywords = keywords.lower().split()
        for k in keywords:
            if systag_id not in data['keyword'][k]:
                data['keyword'][k].append(systag_id)

    # Extend the keywords with similar words.
    num = 20
    master_slave = {}
    high_freq_words = get_high_freq_words()

    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        # data['keyword_extend'][k] = [systag_id_list, 1.0]
        master_slave[k] = [systag_id_list, []]
        for w, s in get_similar_redis(k, num):
            w = ensure_unicode(w)
            if len(w) < 2:
                # drop similar words of length 1
                continue
            if s < 0.41:
                # score too low -- stop (presumably results are sorted
                # by score descending, hence break; confirm)
                break
            if w in high_freq_words:
                # drop well-known high-frequency words
                continue

            data['keyword_extend'][w] = [systag_id_list, s]
            master_slave[k][1].append([w, s])

    # Original keywords themselves get similarity 1.0 (may overwrite a
    # similar-word entry of the same spelling -- intended precedence).
    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        data['keyword_extend'][k] = [systag_id_list, 1.0]

    # Dump the keyword_extend info to a file for manual inspection.
    with open(SYSTAG_DATA_CHECK_FILE, 'w') as fc:
        for k in master_slave:
            systag_id_list, ws_list = master_slave[k]
            fc.write('###' + k + '|||' + json.dumps(systag_id_list) +
                     '=' * 10 + '\n')
            for w, s in ws_list:
                fc.write(w + '|||' + str(s) + '\n')
        for systag_id, plan_name in all_plan_name:
            fc.write(str(systag_id) + '---' + plan_name + '\n')

    pickle_to_file(data, SYSTAG_DATA_FILE)
Example #21
0
 def is_in_bodypart(cls, word):
     # True when *word* is a known body part. u"血" (blood) is explicitly
     # excluded regardless of the data.
     word = ensure_unicode(word)
     return word != u"血" and word in cls.bodypart_data
Example #22
0
 def get_entity_cate(cls, word):
     # Category of a medical entity ('' when unknown); lookup is
     # case-insensitive via lower().
     key = ensure_unicode(word.lower())
     return cls.medical_entity_cate.get(key, '')
Example #23
0
 def get_extend_keyword_relation_systag_id(cls, word):
     # Lookup against the similar-word-extended hot-sale tag keywords.
     # Returns [systag_id_list, similarity]; [[], 0.0] when absent.
     key = ensure_unicode(word)
     return cls.systag_data['keyword_extend'].get(key, [[], 0.0])
Example #24
0
 def is_e_stop_word(cls, word):
     # Membership test against the e-stop-word set.
     return ensure_unicode(word) in cls.e_data
Example #25
0
 def get_relation_drug(cls, word, num=100):
     # Up to *num* drugs related to *word* ([] when unknown).
     key = ensure_unicode(word)
     return cls.medical_relation_drug.get(key, [])[:num]
Example #26
0
 def is_entity(cls, word):
     # True when *word* is a known medical entity.
     return ensure_unicode(word) in cls.medical_entity_cate