def combine(category_id, category_list, category_wrapper_dict):
    """Attach longer "wrapper" categories to known categories and dump tags.

    For every pair of entries in ``category_list`` where the first entry is
    strictly shorter than the second, the longer entry is appended to
    ``category_wrapper_dict[shorter]`` when the shorter entry is already a key
    of ``category_wrapper_dict`` and either

    * ``text_process.isSubset(shorter, longer)`` is true, or
    * the number of distinct elements shared by both entries equals
      ``len(shorter)``.

    Two files are then written:

    * ``combine/<category_id>.txt`` - one ``category@wrapper1,wrapper2`` line
      per key of ``category_wrapper_dict``.
    * ``final/<category_id>.txt`` - one tag per line: every key, every
      wrapper, and every tag returned by ``getCategoryEditorTag``.

    Args:
        category_id: Identifier used to build both output file names.
        category_list: Sequence of categories compared pairwise.
        category_wrapper_dict: Maps a category to its list of wrappers.
            It is modified in place.

    Returns:
        None.
    """
    print('combing')
    editor_tag_set = getCategoryEditorTag(category_id)

    def _as_bytes(text):
        # The files are opened in binary mode so the explicit '\r\n' is
        # written untouched; text that is not already bytes is encoded
        # explicitly instead of relying on the interpreter default encoding.
        return text if isinstance(text, bytes) else text.encode('utf-8')

    combine_path = 'combine/' + str(category_id) + '.txt'
    final_path = 'final/' + str(category_id) + '.txt'

    # ``with`` guarantees both files are flushed and closed, even on error.
    with open(combine_path, 'wb') as outfile, \
            open(final_path, 'wb') as outfile2:
        for category_i in category_list:
            # Only categories that are already keys can collect wrappers;
            # this does not depend on category_j, so test it once per i.
            if category_i not in category_wrapper_dict:
                continue
            elements_i = set(category_i)
            for category_j in category_list:
                # The strict length check also rules out comparing an entry
                # with itself, so no separate index comparison is needed.
                if len(category_i) >= len(category_j):
                    continue
                if (text_process.isSubset(category_i, category_j)
                        or len(elements_i & set(category_j))
                        == len(category_i)):
                    category_wrapper_dict[category_i].append(category_j)

        tag_set = set()
        for category in category_wrapper_dict:
            wrappers = category_wrapper_dict[category]
            tag_set.add(category)
            tag_set.update(wrappers)
            outfile.write(
                _as_bytes(category + '@' + ','.join(wrappers) + '\r\n'))

        for tag in tag_set | editor_tag_set:
            outfile2.write(_as_bytes(tag + '\r\n'))
def inclusionRelation(category_id, category_set):
    """Write a normalized "inclusion" score for every category to a CSV file.

    The raw score of a category is the number of strictly longer categories
    for which ``text_process.isSubset(category, longer)`` is true.
    Categories of length 1 always keep a raw score of 0. Scores are divided
    by the largest raw score and written to ``internal/<category_id>.csv`` as
    ``category,score`` rows terminated by ``\\r\\n``, highest score first.

    Args:
        category_id: Identifier used to build the output file name.
        category_set: Iterable of categories; duplicates are collapsed.

    Returns:
        None.
    """
    print('extracting inclusion relation feature')
    category_feature_dict = {category: 0 for category in category_set}
    # Materialize the keys once instead of rebuilding the key list for every
    # pair of indices.
    words = list(category_feature_dict)

    for word_i in words:
        # Single-element words are never credited, whatever word_j is.
        if len(word_i) == 1:
            continue
        for word_j in words:
            # The strict length check also excludes comparing a word with
            # itself.
            if (len(word_i) < len(word_j)
                    and text_process.isSubset(word_i, word_j)):
                category_feature_dict[word_i] += 1

    # max() raises ValueError on an empty sequence; treat "no categories" as
    # a maximum of 0 so an empty file is produced instead.
    if category_feature_dict:
        inclusion_max = max(category_feature_dict.values())
    else:
        inclusion_max = 0

    print('sorting')
    sorted_list = sorted(category_feature_dict.items(),
                         key=lambda p: p[1], reverse=True)

    print('writing')
    # ``with`` guarantees the file is flushed and closed, even on error.
    with open('internal/' + str(category_id) + '.csv', 'wb') as outfile:
        for word, count in sorted_list:
            # When nothing is included in anything the maximum is 0; every
            # score is then 0.0 rather than a division by zero.
            if inclusion_max:
                inclusion_normalize = 1.0 * count / inclusion_max
            else:
                inclusion_normalize = 0.0
            line = word + ',' + str(inclusion_normalize) + '\r\n'
            # Binary mode keeps the explicit '\r\n' untouched; text that is
            # not already bytes is encoded explicitly.
            if not isinstance(line, bytes):
                line = line.encode('utf-8')
            outfile.write(line)
def getMainCategoryKeywords(main_category_list, category_info_dict):
    """Collect the categories that strictly extend a main category.

    Every key of ``category_info_dict`` is compared with every entry of
    ``main_category_list``. A key is emitted once for each main category
    that is shorter than the key and for which
    ``text_process.isSubset(main_category, key)`` is true, so a key that
    matches several main categories appears several times.

    Args:
        main_category_list: Iterable of main categories.
        category_info_dict: Mapping whose keys are the candidate categories.

    Returns:
        A list of matching keys, ordered by key iteration order and then by
        the order of ``main_category_list``.
    """
    return [
        candidate
        for candidate in category_info_dict
        for main_category in main_category_list
        if len(candidate) > len(main_category)
        and text_process.isSubset(main_category, candidate)
    ]