Example #1
0
def category(info, content):
    """Score a text against every category in ``info`` and pick the best one.

    The text is split into words with ``split_article_into_words.split_text``
    and each category's word table is matched against those words.

    Args:
        info: mapping of category key -> mapping of word -> weight.
        content: the text to classify.  Byte strings are decoded as UTF-8,
            falling back to GBK; if neither codec fits they are handed to the
            splitter undecoded (best effort).

    Returns:
        A tuple ``(category_text, result)``.  ``category_text`` is the key
        with the highest positive score, or ``''`` when no category scores
        above zero.  ``result`` holds one ``[key, score, unmatched_ratio]``
        list per category, where ``unmatched_ratio`` is the number of article
        words missing from that category divided by the length reported by
        the splitter.
    """
    if isinstance(content, bytes):
        # Try the expected encodings in order and keep the first that works.
        for encoding in ('utf8', 'gbk'):
            try:
                content = content.decode(encoding)
            except UnicodeDecodeError:
                continue
            break

    article_info, length = split_article_into_words.split_text(content)

    if not length:
        # Empty text: every division below would be by zero, so report
        # "no match" with zero scores instead of raising.
        return '', [[key, 0, 0.0] for key in info]

    category_text = ''
    max_value = 0
    result = []

    for key, value in info.items():
        score = 0
        unmatched = 0
        for word in article_info:
            if word in value:
                # NOTE(review): under Python 2 this is floor division when all
                # operands are ints -- confirm that is intended.
                score += value[word] * article_info[word] / length
            else:
                unmatched += 1
        # Strict comparison: the first category seen wins ties, and a category
        # must score above zero to be selected at all.
        if score > max_value:
            category_text = key
            max_value = score
        result.append([key, score, float(unmatched) / length])

    return category_text, result
Example #2
0
def group_key_word(info_filename, n=20):
    """Collect the most frequent words for every group listed in a file.

    Each non-blank line of ``info_filename`` (UTF-8) holds a group name and a
    directory name separated by whitespace.  Every file found under a group's
    directories is loaded with ``read_json``, split with
    ``split_article_into_words.split_text`` and merged into that group's word
    table via ``merge_dict``.

    Args:
        info_filename: path of the group/directory listing.
        n: how many of the highest-valued words to keep per group; ``-1``
            keeps the complete word table.

    Returns:
        A dict mapping each group name to a dict of word -> value.
    """
    group_dirs = {}
    # Read bytes and decode per line so the behaviour is the same on
    # Python 2 and 3; the ``with`` block guarantees the handle is closed.
    with open(info_filename, 'rb') as info_file:
        for raw_line in info_file:
            line = raw_line.strip().decode('utf8')
            if not line:
                continue

            fields = re.split(r'\s+', line)
            if len(fields) < 2:
                # A group without a directory cannot be processed; report it
                # the same way missing folders are reported and move on.
                print('Error, malformed line %r' % line)
                continue
            group_dirs.setdefault(fields[0], []).append(fields[1])

    # Build a separate result instead of rebinding entries of the dict that
    # is being iterated.
    group_words = {}
    for group_name, dir_names in group_dirs.items():
        same_group_key_words = {}
        for dir_name in dir_names:
            if not os.path.isdir(dir_name):
                print('Error, folder not exists %s' % dir_name)
                continue

            for root, dirs, files in os.walk(dir_name):
                print(root)
                for filename in files:
                    article = read_json(os.path.join(root, filename))
                    words = split_article_into_words.split_text(article)[0]
                    # presumably merges ``words`` into the first argument in
                    # place -- verify against merge_dict.
                    merge_dict(same_group_key_words, words)

        if n == -1:
            group_words[group_name] = same_group_key_words
        else:
            words_sorted = sorted(same_group_key_words.items(),
                                  key=operator.itemgetter(1), reverse=True)
            group_words[group_name] = dict(words_sorted[:n])

    return group_words