# Example 1
def separater(user_weibos):
    """Segment weibo texts and count word occurrences.

    Each item of ``user_weibos`` is read as ``item['_source']['text']``. The
    text is passed through ``cut_filter`` and ``re_cut`` and then segmented
    by ``cut`` using the segmenter returned by ``load_scws()``. The raw text
    and every segmented word are printed along the way.

    Returns:
        A dict mapping word -> number of occurrences. The dict is rebuilt for
        every weibo, so the returned value covers only the last item of
        ``user_weibos``; an empty input returns an empty dict.
    """
    s = load_scws()
    # Bound before the loop so an empty ``user_weibos`` returns {} instead of
    # raising NameError at the final ``return``.
    words_dict = {}
    for user_weibo in user_weibos:
        content = user_weibo['_source']['text']
        # Single-argument call form prints the same on Python 2 and 3.
        print(str(content))
        content = cut_filter(content)
        content = re_cut(content)
        separated_words = cut(s, content)
        # NOTE(review): counts are reset for every weibo, so only the last
        # one is reflected in the result. If a total over all weibos is
        # wanted, drop this reset -- confirm against the callers.
        words_dict = {}
        for word in separated_words:
            print(str(word))
            # Increment an existing count, or start a new word at 1.
            words_dict[word] = words_dict.get(word, 0) + 1

    return words_dict
# Example 2
def triple_classifier(tweet):
    """Return a sentiment flag (0 or 1) for a tweet.

    ``tweet['content168']`` is expected to be UTF-8 encoded text.

    Processing steps:
      1. Keep only the part of the text before the first ``'//@'`` marker;
         if nothing is left, fall back to ``remove_at`` applied to the full
         content.
      2. If ``emoticon(pe_set, ne_set, text)`` returns 1 or 2, the result is
         1 and the text is not scored.
      3. Otherwise the text is segmented, converted to a bag of words with
         ``dictionary_1`` and two running products are built from
         ``step1_score``; the result is 1 when the first product does not
         exceed the second, else 0.
    """
    sentiment = 0
    text = tweet['content168']

    if '//@' in text:
        text = text[:text.index('//@')]

    if not text:
        text = remove_at(tweet['content168'])

    emoticon_sentiment = emoticon(pe_set, ne_set, text)
    if emoticon_sentiment in [1, 2]:
        sentiment = 1
        # Emptying the text skips the word-based scoring below.
        text = ''

    if text != '':
        entries = cut(sw, text)
        entry = [e.decode('utf-8') for e in entries]
        bow = dictionary_1.doc2bow(entry)
        s = [1, 1]
        for word_id, count in bow:
            # Each word contributes its score once per occurrence.
            s[0] = s[0] * (step1_score[word_id][0] ** count)
            s[1] = s[1] * (step1_score[word_id][1] ** count)
        if s[0] <= s[1]:
            sentiment = 1
        else:
            sentiment = 0

    return sentiment
# Example 3
def cut_words_noun(text):
    '''Segment text and keep the noun terms that are not blacklisted.

    text: the string to segment (the original note asks for utf-8).

    Returns the list of terms whose part-of-speech tag, as reported by
    ``cut(..., cx=True)``, is in ``cx_dict_noun`` and which do not appear
    in ``black_words``.

    Raises ValueError when ``text`` is not a ``str``.
    '''
    if not isinstance(text, str):
        raise ValueError("cut words input text must be string")

    cx_terms = cut(s, text, cx=True)

    return [
        term
        for term, cx in cx_terms
        if cx in cx_dict_noun and term not in black_words
    ]
# Example 4
def prepare_svm_input(texts, y=None, dictionary=dictionary):
    """Build the ``(y, x)`` pair of labels and feature rows for ``texts``.

    Args:
        texts: iterable of texts; each one is segmented with ``cut(sw, text)``
            and converted with ``dictionary.doc2bow``.
        y: optional labels. When missing or empty, a label of 1.0 is
            generated for every text.
        dictionary: object providing ``doc2bow``; defaults to the
            module-level ``dictionary``.

    Returns:
        A tuple ``(y, x)`` where ``x`` holds one feature row per text.
    """
    x = []

    if not y:
        y = [1.0 for _ in range(len(texts))]

    for text in texts:
        words = cut(sw, text)
        feature = dictionary.doc2bow(words)
        # The computed feature was previously discarded, leaving ``x`` empty.
        # NOTE(review): stored as a {word_id: count} mapping, the sparse row
        # form accepted by libsvm's Python interface -- confirm that this is
        # what the consumer of ``x`` expects.
        x.append(dict(feature))

    return y, x
# Example 5
def prepare_svm_input_file(texts, dictionary=dictionary):
    """Write ``texts`` to a per-process SVM input file and return its path.

    Every text is segmented with ``cut(sw, text)`` and converted with
    ``dictionary.doc2bow``. One line is written per text in the form
    ``1 <id>:<count> <id>:<count> ...`` where the leading ``1`` is a fixed
    label and ``<id>`` is the bag-of-words id plus one.

    Args:
        texts: iterable of texts to convert.
        dictionary: object providing ``doc2bow``; defaults to the
            module-level ``dictionary``.

    Returns:
        Path of the written file, ``<AB_PATH>/./svm_test/<pid>.txt``.
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, './svm_test/%s.txt' % pid)

    # The context manager guarantees the file is flushed and closed before
    # the path is handed back, even if a text fails to convert.
    with open(svm_input_path, 'w') as fw:
        for text in texts:
            words = cut(sw, text)
            feature = dictionary.doc2bow(words)
            line = '1 ' + ' '.join([
                str(wordid + 1) + ':' + str(wordcount)
                for wordid, wordcount in feature
            ])
            fw.write('%s\n' % line)

    return svm_input_path
# Example 7
def user_domain_classifier_v2(user):
    """Pick a domain label from ``labels`` for a user record.

    The decision is driven by ``user['verified_type']``; depending on the
    branch it also uses the first space-separated token of
    ``user['user_location']``, the follower/status counts and the keywords
    found in the nickname plus description.

    Args:
        user: mapping with the keys ``verified_type``, ``user_location``,
            ``fansnum``, ``statusnum``, ``nick_name`` and ``description``.

    Returns:
        One entry of ``labels``; ``labels[11]`` when no rule applies.
    """
    label = labels[11]

    verified_type = user['verified_type']
    location = user['user_location']
    province = location.split(' ')[0]

    followers_count = user['fansnum']
    statuses_count = user['statusnum']

    name = user['nick_name']
    description = user['description']

    if verified_type == 4:
        label = labels[0]  # university weibo

    elif verified_type == 1:
        label = labels[7]  # government agencies and officials

    elif verified_type in (8, 7, 2):
        if province not in outlist:
            label = labels[1]  # domestic institution
        else:
            label = labels[2]  # overseas institution

    elif verified_type == 3:
        # NOTE(review): this branch tests the full ``location`` string while
        # the institution branch above tests ``province`` -- confirm which
        # one ``outlist`` is meant to be matched against.
        if location not in outlist:
            label = labels[3]  # domestic media
        else:
            label = labels[4]  # overseas media

    elif verified_type in (5, 6):
        label = labels[5]  # civil organization

    elif verified_type == 0:
        kwdlist = cut(s, name + description)

        def keyword_hits(vocabulary):
            # Number of segmented keywords found in the given vocabulary.
            return sum(1 for keyword in kwdlist if keyword in vocabulary)

        lawyer_weight = keyword_hits(lawyerw)      # lawyers
        adminw_weight = keyword_hits(adminw)       # government officials
        mediaw_weight = keyword_hits(mediaw)       # media professionals
        businessw_weight = keyword_hits(businessw) # business people

        # Evaluated in this order with a strict comparison, so on a tie the
        # earlier category wins.
        ranked = (
            (lawyer_weight, 6),
            (businessw_weight, 12),
            (adminw_weight, 7),
            (mediaw_weight, 8),
        )
        max_weight = 0
        for weight, label_index in ranked:
            if max_weight < weight:
                max_weight = weight
                label = labels[label_index]

        if max_weight == 0:
            label = labels[9]

        # Any lawyer keyword overrides whatever category scored highest.
        if lawyer_weight != 0:
            label = labels[6]

    elif verified_type in (220, 200):
        label = labels[9]

    elif verified_type == 400:
        label = labels[11]

    else:
        # NOTE(review): this ``else:`` is restored on the assumption that it
        # was lost like the other ``else:`` lines of this function; without
        # it the rules below only ran for verified_type 400 and the
        # assignment above was a no-op. Confirm against the upstream source.
        if followers_count >= FOLLOWER_THRE and statuses_count >= STATUS_THRE:
            label = labels[10]  # grassroots

        kwdlist = cut(s, name + description)
        lawyer_weight = sum(1 for keyword in kwdlist if keyword in lawyerw)

        if lawyer_weight != 0:
            label = labels[6]

    return label
# Example 8
