def category (info,content):
    """Score *content* against per-category word-weight dictionaries.

    Parameters:
        info: mapping of category name -> {word: weight} dict.
        content: article text.  A byte string is decoded as UTF-8 with a
            GBK fallback before being split into words.

    Returns:
        (best_category, result) where best_category is the key with the
        highest accumulated score ('' when no category scores > 0) and
        result is a list of [category, score, miss_ratio] entries,
        miss_ratio being the fraction of article words absent from that
        category's dictionary.
    """
    # Decode byte strings: try UTF-8 first, then GBK.  If both fail the
    # raw value is passed through unchanged -- the original wrote `pasw`
    # here (a typo for `pass`) which raised NameError on that path.
    if isinstance (content, str):
        try:
            content = content.decode ('utf8')
        except UnicodeError:
            try:
                content = content.decode ('gbk')
            except UnicodeError:
                pass
    article_info, length = split_article_into_words.split_text (content)
    max_value = 0
    category_text = ''
    result = []
    for key, value in info.items():
        score = 0    # accumulated weight of the words found in this category
        misses = 0   # article words with no entry in this category's dict
        for word in article_info:
            if word in value:
                score += value[word] * article_info[word] / length
            else:
                misses += 1
        if score > max_value:
            category_text = key
            max_value = score
        result.append ([key, score, float (misses) / length])
    return category_text, result
def group_key_word (info_filename, n=20):
    """Build a {word: weight} dictionary for every group of folders.

    Each non-empty line of *info_filename* holds two whitespace-separated
    fields: a group name and a directory path (lines with fewer than two
    fields are skipped).  Every file found recursively under a group's
    directories is read with read_json(), split into word weights by
    split_article_into_words.split_text(), and merged into one
    dictionary per group.

    Parameters:
        info_filename: path to the UTF-8 "<group> <directory>" mapping file.
        n: keep only the top *n* highest-weighted words per group;
           -1 keeps every word.

    Returns:
        dict mapping group name -> {word: weight}.
    """
    group_dirs = {}
    # The original opened this file with the `file()` builtin and never
    # closed it; `with open(...)` guarantees the handle is released.
    with open (info_filename) as mapping:
        for raw_line in mapping:
            line = raw_line.strip().decode ('utf8')
            if not line:
                continue
            fields = re.split (r'\s+', line)
            if len (fields) < 2:
                continue  # malformed line: missing the directory field
            group_dirs.setdefault (fields[0], []).append (fields[1])
    for group_name, dir_names in group_dirs.items():
        group_words = {}
        for dir_name in dir_names:
            if not os.path.isdir (dir_name):
                print ('Error, folder not exists %s' % dir_name)
                continue
            for root, dirs, files in os.walk (dir_name):
                print (root)
                for filename in files:
                    # split_text returns (word_weights, length); the length
                    # was accumulated but never used in the original.
                    word_counts = split_article_into_words.split_text (
                        read_json (os.path.join (root, filename)))[0]
                    merge_dict (group_words, word_counts)
        if n == -1:
            group_dirs[group_name] = group_words
        else:
            # Keep only the n highest-weighted words for this group.
            top = sorted (group_words.items(),
                          key=operator.itemgetter (1), reverse=True)
            group_dirs[group_name] = dict (top[:n])
    return group_dirs