def main(category_path): reload(sys) sys.setdefaultencoding("utf-8") category_path_list = category_path.split("_") category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode("utf-8") main_category_list = [query_category] file_utils.createDirs(["baidu_baike_search_hierarchy"]) file_path_list = file_utils.getFilePathList("../../scrapy/baidu_baike_search/clean_data/") category_info_dict = readCategoryInfo(file_path_list) g = generateCategoryNetwork(main_category_list, category_info_dict) hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list) hierarchy_max_dict = {} category_hierarchy_score_dict = {} for query_category in category_info_dict.keys(): calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict, category_hierarchy_score_dict, query_category) # break outfile = open("baidu_baike_search_hierarchy/" + str(category_path) + ".csv", "wb") for category in category_hierarchy_score_dict.keys(): outlist = [] for level in category_hierarchy_score_dict[category].keys(): score_nomalize = 0 if hierarchy_max_dict[level] != 0: score_nomalize = 1.0 * category_hierarchy_score_dict[category][level] / hierarchy_max_dict[level] outlist.append(score_nomalize) best_level = -1 if max(outlist) != 0: best_level = outlist.index(max(outlist)) + 1 outfile.write(category + "," + ",".join([str(val) for val in outlist]) + "," + str(best_level) + "\r\n")
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    file_utils.createDirs(['result_pointwise'])
    category_feature_dict = getFeature(category_path)
    X_train, Y = readTrainData(category_feature_dict, category_path)
    X_test, X_test_name = readTestData(category_feature_dict)
    clf = model(X_train, Y)
    test(clf, X_test, X_test_name, category_path)
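# The main() above relies on helpers (getFeature, readTrainData, readTestData,
# model, test) defined elsewhere in the repo. A minimal sketch of what the
# model()/test() pair could look like, assuming a scikit-learn-style binary
# pointwise classifier and a "<name>,<score>" output format; both assumptions,
# not the project's actual implementation.
from sklearn.linear_model import LogisticRegression

def model(X_train, Y):
    # Fit a simple pointwise classifier on the training feature matrix.
    clf = LogisticRegression()
    clf.fit(X_train, Y)
    return clf

def test(clf, X_test, X_test_name, category_path):
    # Score each candidate category and write one "<name>,<probability>" row.
    scores = clf.predict_proba(X_test)[:, 1]
    with open('result_pointwise/' + category_path + '.csv', 'w') as outfile:
        for name, score in zip(X_test_name, scores):
            outfile.write(name + ',' + str(score) + '\n')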
def main(category_id):
    reload(sys)
    sys.setdefaultencoding('utf-8')
    jieba.load_userdict("../../../data/jieba_userdict.txt")

    file_utils.createDirs(['wiki_search'])
    category_set = common.getCandidateCategory(category_id)
    category_crawl_dict = readJosn(category_id)
    clean(category_id, category_crawl_dict, category_set)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode('utf-8') main_category_list = [query_category] file_utils.createDirs(['combine_feature']) category_set = common.getCandidateCategory(category_id) combineFeature(category_id, category_path, category_set)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') jieba.load_userdict(data_path+"jieba_userdict.txt") file_utils.createDirs(['wikipedia']) category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode('utf-8') main_category_list = [query_category] category_set = common.getCandidateCategory(category_id) extractFeatureFromWikiCategory(category_id,category_path,main_category_list,category_set)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') jieba.load_userdict(data_path + "jieba_userdict.txt") file_utils.createDirs(['wikipedia']) category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode('utf-8') main_category_list = [query_category] category_set = common.getCandidateCategory(category_id) extractFeatureFromWikiCategory(category_id, category_path, main_category_list, category_set)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" query_category = category_path_list[-1].decode('utf-8') category_relevant_set = getMainCategoryRevelantWord(query_category) main_category_list = list(category_relevant_set) file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike_search/clean_data/17/') category_info_dict = readCategoryInfo(file_path_list) file_utils.createDirs(['baidu_baike_search']) extractFeature(category_id,category_path,main_category_list,category_info_dict)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') print '-loading preparation file' jieba.load_userdict(data_path+"jieba_userdict.txt") app_tag_dict = pickle.load(open(data_path+'app_tag.dict','rb')) app_category_dict = pickle.load(open(data_path+'app_category.dict','rb')) file_utils.createDirs(['tag_tf','title_tf']) category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode('utf-8') category_set = common.getCandidateCategory(category_id) tf(category_id,category_path,query_category,category_set,app_category_dict,app_tag_dict)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') print '-loading preparation file' jieba.load_userdict(data_path + "jieba_userdict.txt") app_tag_dict = pickle.load(open(data_path + 'app_tag.dict', 'rb')) app_category_dict = pickle.load(open(data_path + 'app_category.dict', 'rb')) file_utils.createDirs(['tag_tf', 'title_tf']) category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode('utf-8') category_set = common.getCandidateCategory(category_id) tf(category_id, category_path, query_category, category_set, app_category_dict, app_tag_dict)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" query_category = category_path_list[-1].decode('utf-8') category_relevant_set = getMainCategoryRevelantWord(query_category) main_category_list = list(category_relevant_set) file_path_list = file_utils.getFilePathList( '../../scrapy/baidu_baike_search/clean_data/17/') category_info_dict = readCategoryInfo(file_path_list) file_utils.createDirs(['baidu_baike_search']) extractFeature(category_id, category_path, main_category_list, category_info_dict)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode('utf-8') main_category_list = [query_category] file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike/crawl_data/'+str(category_id)+'/clean/') # file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike_search/clean_data/') category_info_dict = readCategoryInfo(file_path_list) file_utils.createDirs(['baidu_baike']) # file_utils.createDirs(['baidu_baike_search']) sub_category_list = category_info_dict.keys() extractFeature(category_id,category_path,main_category_list,sub_category_list,category_info_dict)
def main(category_path): reload(sys) sys.setdefaultencoding('utf-8') category_path_list = category_path.split('_') category_id = int(category_path_list[0]) query_category = "" if len(category_path_list) >= 2: query_category = category_path_list[-1].decode('utf-8') main_category_list = [query_category] file_utils.createDirs(['baidu_baike_hierarchy']) file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike/crawl_data/'+str(category_id)+'/clean/') category_info_dict = readCategoryInfo(file_path_list) g = generateCategoryNetwork(main_category_list,category_info_dict) hierarchy_node_dict = getHierarchy(g,category_info_dict,main_category_list) hierarchy_max_dict = {} category_hierarchy_score_dict = {} for query_category in category_info_dict.keys(): calculateRelation(g,hierarchy_node_dict,hierarchy_max_dict,category_hierarchy_score_dict,query_category) # break outfile = open('baidu_baike_hierarchy/'+str(category_path)+'.csv','wb') for category in category_hierarchy_score_dict.keys(): outlist = [] for level in category_hierarchy_score_dict[category].keys(): score_nomalize = 0 if hierarchy_max_dict[level] != 0: score_nomalize = 1.0*category_hierarchy_score_dict[category][level]/hierarchy_max_dict[level] outlist.append(score_nomalize) best_level = -1 if max(outlist) != 0: best_level = outlist.index(max(outlist))+1 outfile.write(category+','+','.join([str(val) for val in outlist])+','+str(best_level)+'\r\n')