def main(category_path):
    reload(sys)
    sys.setdefaultencoding("utf-8")

    category_path_list = category_path.split("_")
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode("utf-8")
    main_category_list = [query_category]

    file_utils.createDirs(["baidu_baike_search_hierarchy"])
    file_path_list = file_utils.getFilePathList("../../scrapy/baidu_baike_search/clean_data/")
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list)

    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict, category_hierarchy_score_dict, query_category)

    outfile = open("baidu_baike_search_hierarchy/" + str(category_path) + ".csv", "wb")
    for category in category_hierarchy_score_dict.keys():
        outlist = []
        for level in category_hierarchy_score_dict[category].keys():
            score_normalize = 0
            if hierarchy_max_dict[level] != 0:
                score_normalize = 1.0 * category_hierarchy_score_dict[category][level] / hierarchy_max_dict[level]
            outlist.append(score_normalize)
        best_level = -1
        if max(outlist) != 0:
            best_level = outlist.index(max(outlist)) + 1
        outfile.write(category + "," + ",".join([str(val) for val in outlist]) + "," + str(best_level) + "\r\n")
    outfile.close()
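
Note that every snippet on this page is Python 2 code: reload(sys), sys.setdefaultencoding, print statements, and str.decode do not exist in Python 3. Each snippet also relies on module-level setup that is not shown. A minimal sketch of that assumed preamble, where file_utils and common are project-local helpers and the data_path value is inferred from the paths used in the snippets:

# -*- coding: utf-8 -*-
# Assumed module setup (not shown in the snippets themselves):
# imports at the top, CLI entry point at the bottom of the module.
import sys
import pickle

import jieba

import common      # project-local helper: getCandidateCategory(...)
import file_utils  # project-local helper: createDirs(...), getFilePathList(...)

data_path = "../../../data/"  # inferred from the jieba_userdict.txt path below

# def main(category_path): ...  (one of the snippets on this page)

if __name__ == "__main__":
    # Assumed invocation: a category path such as "17_<category name>",
    # i.e. a numeric category id and an optional name joined by "_".
    main(sys.argv[1])
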
Code example #2
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')
    file_utils.createDirs(['result_pointwise'])

    category_feature_dict = getFeature(category_path)
    X_train, Y = readTrainData(category_feature_dict, category_path)
    X_test, X_test_name = readTestData(category_feature_dict)
    clf = model(X_train, Y)
    test(clf, X_test, X_test_name, category_path)
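
Here getFeature, readTrainData, readTestData, model, and test are project helpers that are not shown. As a rough, hypothetical sketch of the missing model/test pair (the estimator choice is a placeholder, not the project's actual classifier):

from sklearn.linear_model import LogisticRegression

def model(X_train, Y):
    # Hypothetical: fit a pointwise classifier on the extracted features.
    clf = LogisticRegression()
    clf.fit(X_train, Y)
    return clf

def test(clf, X_test, X_test_name, category_path):
    # Hypothetical: score each candidate category and write "name,score" rows
    # into the result_pointwise directory created by main() above.
    scores = clf.predict_proba(X_test)[:, 1]
    with open('result_pointwise/' + category_path + '.csv', 'w') as outfile:
        for name, score in zip(X_test_name, scores):
            outfile.write(name + ',' + str(score) + '\n')
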
Code example #3
def main(category_id):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    jieba.load_userdict("../../../data/jieba_userdict.txt")

    file_utils.createDirs(['wiki_search'])
    category_set = common.getCandidateCategory(category_id)
    category_crawl_dict = readJosn(category_id)
    clean(category_id, category_crawl_dict, category_set)
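
This snippet (and several later ones) loads a custom word list through jieba.load_userdict before segmenting Chinese text. jieba expects a plain-text dictionary with one entry per line in the form word [freq] [pos_tag], where frequency and POS tag are optional and an omitted frequency is auto-computed so the word can be segmented out. The project's dictionary file is not shown; the two entries below are hypothetical:

# -*- coding: utf-8 -*-
# Hypothetical user dictionary; the project's real jieba_userdict.txt is not shown.
import io
import jieba

with io.open("jieba_userdict.txt", "w", encoding="utf-8") as f:
    f.write(u"手机游戏 n\n")  # hypothetical entry: "mobile game", tagged as a noun
    f.write(u"应用商店 n\n")  # hypothetical entry: "app store", tagged as a noun

jieba.load_userdict("jieba_userdict.txt")
print("/".join(jieba.cut(u"热门手机游戏排行")))  # 手机游戏 now stays one token
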
Code example #4
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]

    file_utils.createDirs(['combine_feature'])
    category_set = common.getCandidateCategory(category_id)
    combineFeature(category_id, category_path, category_set)
Code example #5
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    jieba.load_userdict(data_path + "jieba_userdict.txt")
    file_utils.createDirs(['wikipedia'])

    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]

    category_set = common.getCandidateCategory(category_id)
    extractFeatureFromWikiCategory(category_id, category_path,
                                   main_category_list, category_set)
Code example #6
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    print '-loading preparation file'
    jieba.load_userdict(data_path + "jieba_userdict.txt")
    app_tag_dict = pickle.load(open(data_path + 'app_tag.dict', 'rb'))
    app_category_dict = pickle.load(open(data_path + 'app_category.dict',
                                         'rb'))

    file_utils.createDirs(['tag_tf', 'title_tf'])
    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    category_set = common.getCandidateCategory(category_id)
    tf(category_id, category_path, query_category, category_set,
       app_category_dict, app_tag_dict)
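
The tf() helper that does the counting is defined elsewhere. A minimal hypothetical sketch of a tag term-frequency pass, assuming app_category_dict maps app → category name and app_tag_dict maps app → list of tags (both assumptions, since only the pickle loads are visible here):

from collections import defaultdict

def tag_term_frequency(category_set, app_category_dict, app_tag_dict):
    # Count how often each tag appears among apps in the candidate categories.
    tag_tf = defaultdict(int)
    for app, category in app_category_dict.items():
        if category not in category_set:
            continue
        for tag in app_tag_dict.get(app, []):
            tag_tf[tag] += 1
    return tag_tf
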
Code example #7
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""

    query_category = category_path_list[-1].decode('utf-8')
    category_relevant_set = getMainCategoryRevelantWord(query_category)
    main_category_list = list(category_relevant_set)

    file_path_list = file_utils.getFilePathList(
        '../../scrapy/baidu_baike_search/clean_data/17/')
    category_info_dict = readCategoryInfo(file_path_list)
    file_utils.createDirs(['baidu_baike_search'])

    extractFeature(category_id, category_path, main_category_list,
                   category_info_dict)
Code example #8
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]

    file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike/crawl_data/' + str(category_id) + '/clean/')
    # file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike_search/clean_data/')
    category_info_dict = readCategoryInfo(file_path_list)
    file_utils.createDirs(['baidu_baike'])
    # file_utils.createDirs(['baidu_baike_search'])

    sub_category_list = category_info_dict.keys()
    extractFeature(category_id, category_path, main_category_list, sub_category_list, category_info_dict)
Code example #9
def main(category_path):
    reload(sys)
    sys.setdefaultencoding('utf-8')

    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]

    file_utils.createDirs(['baidu_baike_hierarchy'])
    file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike/crawl_data/' + str(category_id) + '/clean/')
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list)

    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict, category_hierarchy_score_dict, query_category)

    outfile = open('baidu_baike_hierarchy/' + str(category_path) + '.csv', 'wb')
    for category in category_hierarchy_score_dict.keys():
        outlist = []
        for level in category_hierarchy_score_dict[category].keys():
            score_normalize = 0
            if hierarchy_max_dict[level] != 0:
                score_normalize = 1.0 * category_hierarchy_score_dict[category][level] / hierarchy_max_dict[level]
            outlist.append(score_normalize)
        best_level = -1
        if max(outlist) != 0:
            best_level = outlist.index(max(outlist)) + 1
        outfile.write(category + ',' + ','.join([str(val) for val in outlist]) + ',' + str(best_level) + '\r\n')
    outfile.close()
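
For reference, the output loop above normalizes each category's per-level score by the maximum score observed at that level, then reports the 1-based level with the highest normalized score (or -1 when every score is zero). A standalone illustration with made-up numbers:

# Made-up numbers illustrating the normalization and best-level pick above.
hierarchy_max_dict = {1: 10.0, 2: 4.0, 3: 0}
level_scores = {1: 5.0, 2: 4.0, 3: 0}

outlist = []
for level in sorted(level_scores):
    score_normalize = 0
    if hierarchy_max_dict[level] != 0:
        score_normalize = 1.0 * level_scores[level] / hierarchy_max_dict[level]
    outlist.append(score_normalize)

best_level = -1
if max(outlist) != 0:
    best_level = outlist.index(max(outlist)) + 1

# outlist == [0.5, 1.0, 0] and best_level == 2
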