Ejemplo n.º 1
0
def domain_classfiy_by_text(user_weibo):  # classify each user's domain from their weibo text
    '''
    Assign a domain label to every user based on the user's weibo text.

    Input: dict
    {uid: weibo string (several weibos joined with commas), ...}
    Output: tuple of two dicts
    ({uid: label, ...}, {uid: rank_data, ...}), where label and rank_data
    are the two values rank_dict returns for that user's domain scores.

    Side effect: prints the time spent per domain and per user.
    '''
    domain_dict, domain_count = load_train()
    # Size of each domain's training entry, and the sum over all domains.
    len_dict = {name: len(entries) for name, entries in domain_dict.items()}
    total = sum(len_dict.values())

    sw = load_scws()
    black = load_black_words()
    result_data = {}
    p_data = {}
    for uid, text in user_weibo.items():
        start = time.time()
        words = sw.participle(text)
        domain_p = start_p()

        # Frequency table of the tokens that survive filtering.
        word_list = {}
        for word in words:
            # Per the original note: keep nouns, verbs and adjectives from
            # the segmentation result and drop single words.
            if word[1] not in cx_dict:
                continue
            token = word[0]
            if not 3 < len(token) < 30:
                continue
            if token in black or token in single_word_whitelist:
                continue
            word_list[token] = word_list.get(token, 0) + 1

        for d_k in domain_p.keys():
            start_time = time.time()
            # Score of this document for domain d_k, as computed by com_p.
            domain_p[d_k] = com_p(
                word_list,
                domain_dict[d_k],
                domain_count[d_k],
                len_dict[d_k],
                total,
            )
            elapsed = time.time() - start_time
            print('%s domain takes %s second...' % (d_k, elapsed))

        label, rank_data = rank_dict(domain_p)
        result_data[uid] = label
        p_data[uid] = rank_data
        print('%s takes %s second...' % (uid, time.time() - start))

    return result_data, p_data
Ejemplo n.º 2
0
# Module-level value returned by getMediaWords(), computed once at import
# time (getMediaWords is defined outside this excerpt).
mediaw = getMediaWords()


def getBusinessWords(path='./domain_dict/businessw.txt'):
    '''
    Load the business-person vocabulary, one entry per line.

    path: file to read; defaults to the original hard-coded location.
    Returns a list holding every line of the file with surrounding
    whitespace stripped, in file order (blank lines yield empty strings).
    Raises IOError/OSError if the file cannot be opened.
    '''
    # The context manager closes the handle even if reading fails; the
    # previous version never closed the file.
    with open(path, 'r') as f:
        return [line.strip() for line in f]  # business-person vocabulary


# Business-person vocabulary, read from disk once at import time.
businessw = getBusinessWords()

# Result of load_scws(), created once at import time (load_scws is defined
# outside this excerpt; presumably a word segmenter -- TODO confirm).
s = load_scws()


def user_domain_classifier_v2(user):
    r = user
    label = labels[11]

    verified_type = r['verified_type']
    location = r['location'].encode('utf-8')
    province = location.split(' ')[0]

    followers_count = r['followers_count']
    statuses_count = r['statuses_count']

    name = r['name']
    description = r['description']
Ejemplo n.º 3
0
    return mediaw

# Module-level value returned by getMediaWords(), computed once at import time.
mediaw = getMediaWords()

def getBusinessWords(path='./domain_dict/businessw.txt'):
    '''
    Load the business-person vocabulary, one entry per line.

    path: file to read; defaults to the original hard-coded location.
    Returns a list holding every line of the file with surrounding
    whitespace stripped, in file order (blank lines yield empty strings).
    Raises IOError/OSError if the file cannot be opened.
    '''
    # The context manager closes the handle even if reading fails; the
    # previous version never closed the file.
    with open(path, 'r') as f:
        return [line.strip() for line in f]  # business-person vocabulary

# Business-person vocabulary, read from disk once at import time.
businessw = getBusinessWords()

# Result of load_scws(), created once at import time (load_scws is defined
# outside this excerpt; presumably a word segmenter -- TODO confirm).
s = load_scws()

def user_domain_classifier_v2(user):
    r = user
    label = labels[11]

    verified_type = r['verified_type']
    location = r['location'].encode('utf-8')
    province = location.split(' ')[0]

    followers_count = r['followers_count']
    statuses_count = r['statuses_count']

    name = r['name']
    description = r['description']