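# The snippets below are Python 2 and assume the following module-level imports,
# which the original listing does not show:
#
#     import json
#     import jieba
#     from gensim import corpora
#     import text_process
#     import rule_base
#
# text_process and rule_base are project-local modules whose source is not
# included here. A minimal sketch of the two text_process helpers that every
# snippet relies on, written against assumptions inferred from how they are
# called (a hypothetical implementation, not the project's own):
import codecs

def getStopword(stopword_path):
    # Assumption: one stopword per line, UTF-8 encoded.
    with codecs.open(stopword_path, 'r', 'utf-8') as stopword_file:
        return set(line.strip() for line in stopword_file if line.strip())

def isChinese(word):
    # Assumption: "Chinese" means every character is a CJK Unified Ideograph.
    return len(word) > 0 and all(u'\u4e00' <= ch <= u'\u9fff' for ch in word)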
def readSegFile():
    jieba.load_userdict("../../data/jieba_userdict.txt")
    infile = open('../../data/all_cn_seg_nwi_clean.txt', 'rb')
    outfile = open('../../data/all_word.txt', 'wb')
    stopword_set = text_process.getStopword('../../data/stopword.txt')
    word_set = set([])
    word_fre_dict = {}
    row_counter = 0
    for row in infile:
        row_counter += 1
        print row_counter
        row = row.strip().decode('utf-8')
        items = row.split('<@>')
        app_name = items[1]
        brief_seg = items[2].split()
        title_seg = jieba.cut(app_name)
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                word_set.add(title)
                word_fre_dict.setdefault(title, 0)
                word_fre_dict[title] += 1
        for brief in brief_seg:
            if text_process.isChinese(brief) and brief not in stopword_set:
                word_set.add(brief)
                word_fre_dict.setdefault(brief, 0)
                word_fre_dict[brief] += 1

    sorted_list = sorted(word_fre_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for val in sorted_list:
        if val[1] >= 10:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')
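# Assumed input format of all_cn_seg_nwi_clean.txt, inferred from the parsing
# above: one app per line, three '<@>'-separated fields, where items[1] is the
# app name and items[2] is a whitespace-separated, pre-segmented brief
# (items[0] is not used here).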
def generateCandidateCategory(category_path):
    print 'loading file'
    jieba.load_userdict(data_path + "jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    print 'reading file'
    word_title_dict = {}
    word_brief_dict = {}
    word_all_dict = {}
    infile = open('../data/' + category_path + '.json', 'rb')
    outfile = open('candidate_category/' + str(category_path) + '.txt', 'wb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]

        seg_title_list = jieba.cut(app_name)
        seg_brief_list = jieba.cut(app_brief)

        for seg_title in seg_title_list:
            if text_process.isChinese(
                    seg_title) and seg_title not in stopword_set:
                word_title_dict.setdefault(seg_title, 0)
                word_title_dict[seg_title] += 1

        for seg_brief in seg_brief_list:
            if text_process.isChinese(
                    seg_brief) and seg_brief not in stopword_set:
                word_brief_dict.setdefault(seg_brief, 0)
                word_brief_dict[seg_brief] += 1

    print 'sorting'
    sorted_list = sorted(word_title_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for item in sorted_list:
        if item[1] >= 10:
            word_all_dict.setdefault(item[0], 0)
            word_all_dict[item[0]] += item[1]
            # outfile.write(item[0]+','+str(item[1])+'\r\n')

    sorted_list = sorted(word_brief_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for item in sorted_list:
        if item[1] >= 50:
            word_all_dict.setdefault(item[0], 0)
            word_all_dict[item[0]] += item[1]
            # outfile.write(item[0]+','+str(item[1])+'\r\n')

    sorted_list = sorted(word_all_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for item in sorted_list:
        outfile.write(item[0] + ',' + str(item[1]) + '\r\n')
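# data_path is assumed to be a module-level constant pointing at the shared data
# directory; the surrounding snippets hard-code the same location, so a plausible
# value (an assumption, not taken from the project) is:
data_path = '../../../data/'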
Example #4
def mineKeywordCombination(category_id, query_keyword):

    # name of the main category
    main_category = idToName(category_id)

    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    combination_fre_dict = {}

    outfile = open('keyword_combination.txt', 'wb')
    # iterate over the apps under the main category
    infile = open('../data/' + str(category_id) + '.json', 'rb')
    for row in infile:

        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_name_seg = [
            word for word in jieba.cut(app_name)
            if word not in stopword_set and text_process.isChinese(word)
        ]
        app_brief_seg = [
            word for word in jieba.cut(app_brief)
            if word not in stopword_set and text_process.isChinese(word)
        ]
        app_name_brief = app_name + " " + app_brief

        app_name_combination_dict = combineNeighborWord(
            app_name_seg, query_keyword)
        for word in app_name_combination_dict.keys():
            combination_fre_dict.setdefault(word, 0)
            combination_fre_dict[word] += app_name_combination_dict[word]

        app_brief_combination_dict = combineNeighborWord(
            app_brief_seg, query_keyword)
        for word in app_brief_combination_dict.keys():
            combination_fre_dict.setdefault(word, 0)
            combination_fre_dict[word] += app_brief_combination_dict[word]

    sorted_list = sorted(combination_fre_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for val in sorted_list:
        if val[1] >= 2:
            print val[0] + ',' + str(val[1])
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')
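# idToName and combineNeighborWord are project-local helpers that are not shown.
# Judging from the calls above, combineNeighborWord takes a segmented word list
# plus the query keyword and returns a dict mapping combined phrases to counts.
# A hypothetical sketch that pairs the keyword with its immediate neighbours
# (an assumption, not the original implementation):
def combineNeighborWord(seg_list, query_keyword):
    combination_dict = {}
    for i, word in enumerate(seg_list):
        if word != query_keyword:
            continue
        # Combine the keyword with the word on either side of it.
        if i > 0:
            left_phrase = seg_list[i - 1] + word
            combination_dict[left_phrase] = combination_dict.get(left_phrase, 0) + 1
        if i + 1 < len(seg_list):
            right_phrase = word + seg_list[i + 1]
            combination_dict[right_phrase] = combination_dict.get(right_phrase, 0) + 1
    return combination_dict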
def readSegFile():
	infile = open('../../data/all_cn_seg_nwi_clean.txt','rb')
	outfile = open('../../data/candidate_title_word.txt','wb')
	stopword_set = text_process.getStopword('../../data/stopword.txt')
	word_set = set([])
	word_fre_dict = {}
	row_counter = 0
	for row in infile:
		row_counter += 1
		print row_counter
		row = row.strip().decode('utf-8')
		items = row.split('<@>')
		app_name = items[1]
		brief_seg = items[2].split()
		title_seg = jieba.cut(app_name)
		for title in title_seg:
			if text_process.isChinese(title) and title not in stopword_set:
				word_set.add(title)
				word_fre_dict.setdefault(title,0)
				word_fre_dict[title] += 1
		# for brief in brief_seg:
		# 	word_set.add(brief)
	for word in word_fre_dict.keys():
		if word_fre_dict[word] >= 10:
			outfile.write(word+'\r\n')
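# Unlike the first readSegFile above, this variant only counts words from the app
# titles and writes the candidate words themselves, without counts, to
# candidate_title_word.txt.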
def formatText(text):
	format_text = ""
	for val in text:
		if text_process.isChinese(val) or val == '[' or val == ']':
			continue
		format_text += val
	format_text = format_text.strip()
	return format_text
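# Minimal usage sketch (hypothetical input): formatText drops Chinese characters
# and square brackets, keeps everything else, and strips surrounding whitespace,
# e.g. formatText(u"QQ[腾讯]2013 ") -> u"QQ2013".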
def calculateCoverage(category_stat_dict, synonyms_set_list):
    print 'loading file'
    jieba.load_userdict(data_path + "jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    print 'reading file'
    infile = open('../data/' + category_path + '.json', 'rb')
    all_app_counter = 0
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["soft_id"])
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        app_download = int(json_obj["download_times"])

        if app_download < 100:
            continue

        all_app_counter += 1

        seg_title_list = jieba.cut(app_name)
        seg_brief_list = jieba.cut(app_brief)

        for seg_title in seg_title_list:
            if text_process.isChinese(
                    seg_title) and seg_title not in stopword_set:
                for main_category in category_stat_dict.keys():
                    if seg_title in category_stat_dict[main_category][0]:
                        category_stat_dict[main_category][1].add(app_id)

        for seg_brief in seg_brief_list:
            if text_process.isChinese(
                    seg_brief) and seg_brief not in stopword_set:
                for main_category in category_stat_dict.keys():
                    if seg_brief in category_stat_dict[main_category][0]:
                        category_stat_dict[main_category][1].add(app_id)

    top_coverage_category_info_dict = {}
    for iter_num in range(20):
        stat(top_coverage_category_info_dict, category_stat_dict,
             all_app_counter, synonyms_set_list)
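# Assumptions behind calculateCoverage: data_path and category_path are
# module-level globals, stat() is a project-local helper that is not shown, and
# category_stat_dict maps each main category to a pair of sets, roughly
# (an illustrative literal, not project data):
#     {u"视频": [set([u"电影", u"直播"]), set()]}
# where index 0 holds the category's keywords and index 1 collects the ids of
# apps whose title or brief matched one of them.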
Example #11
def recommendTag(category_name, category_parent_dict, category_child_dict,
                 category_synonyms_dict, indicator_set, comment_category_set,
                 ambiguation_dict):
    # name of the main category
    main_category = category_name

    # apps that could not be matched
    others_app = {}
    outfile_json = open('tag_recommend_result.json', 'wb')
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)

    candidate_tag_set, candidate_delegate_tag_set = rule_base.getCandidateTag(
        main_category, node_children_dict, category_synonyms_dict)
    level_category_dict = rule_base.createLevelCategoryDict(
        main_category, candidate_tag_set, category_parent_dict,
        category_child_dict, category_synonyms_dict)
    # level_category_dict[0] = set([main_category])
    for level in level_category_dict.keys():
        print level
        print ' '.join(level_category_dict[level])

    match_counter = 0
    all_app_counter = 0

    # iterate over the apps under the main category
    infile = open('../data/' + category_name + '.json', 'rb')
    outfile_match = open('../data/' + category_name + '_match.json', 'wb')
    outfile_unmatch = open('../data/' + category_name + '_unmatch.json', 'wb')

    for row in infile:
        all_app_counter += 1

        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [
            word for word in jieba.cut(app_brief)
            if word not in stopword_set and text_process.isChinese(word)
        ]
        app_name_brief = app_name + " " + app_brief
        app_name_brief += " " + rule_base.grabEnglish(app_name_brief)

        output_dict = {}
        output_dict["id"] = app_id
        output_dict["content"] = {}
        tag_recommend_set = set([])

        # sentiment-word matching; synonym relations between sentiment words are not handled yet
        for comment_word in [
                comment_word for comment_word in comment_category_set
                if comment_word in app_name_brief
        ]:
            output_dict.setdefault("character", []).append(comment_word)

        # bottom-up matching
        for depth in reversed(range(0, max(level_category_dict.keys()) + 1)):
            if depth not in level_category_dict.keys():
                continue
            current_level_category_set = level_category_dict[depth]
            for current_level_category in current_level_category_set:
                if current_level_category in app_name_brief and not rule_base.isAmbiguous(
                        current_level_category, ambiguation_dict,
                        app_name_brief):
                    category_delegate = category_synonyms_dict[
                        current_level_category][0]
                    tag_recommend_set.add(category_delegate)
                    # strong rule: also recommend ancestors reachable along a strong path
                    strong_parent_set = rule_base.getNodeListOnStrongPath(
                        category_parent_dict[category_delegate],
                        category_parent_dict, set([]))
                    tag_recommend_set = tag_recommend_set | (
                        strong_parent_set & candidate_tag_set)

            current_level_unmatch_category_set = current_level_category_set - tag_recommend_set
            for unmatch_category in current_level_unmatch_category_set:
                if unmatch_category in indicator_set:
                    continue
                unmatch_category = category_synonyms_dict[unmatch_category][0]
                unmatch_category_children = node_children_dict[
                    unmatch_category]
                match_children = unmatch_category_children & tag_recommend_set
                if len(match_children) >= 3:
                    tag_recommend_set.add(unmatch_category)

        # hidden nodes
        for tag in tag_recommend_set:
            if u'(' in tag and u')' in tag:
                hidden_node_next_level = rule_base.getNextLevelCategorySet(
                    category_synonyms_dict, category_child_dict, tag)
                for hidden_node_next_level_item in hidden_node_next_level:
                    hidden_node_next_level_item = category_synonyms_dict[
                        hidden_node_next_level_item][0]
                    if hidden_node_next_level_item in tag_recommend_set:
                        output_dict.setdefault(
                            tag, []).append(hidden_node_next_level_item)
        # remove indicator words
        tag_recommend_set = tag_recommend_set - indicator_set

        # build the output dictionary
        content = outputJson(main_category, category_parent_dict,
                             category_child_dict, category_synonyms_dict,
                             tag_recommend_set)
        output_dict['content'] = content

        if len(content.keys()) != 0:
            outfile_match.write(row)
            match_counter += 1
            if app_download >= 10000000:
                continue
            outfile_json.write(
                json.dumps(output_dict, ensure_ascii=False) + '\r\n')
        else:
            outfile_unmatch.write(row)
            if app_download <= 500:
                continue
            others_app.setdefault(app_name,
                                  [app_download, ' '.join(app_brief_seg)])
    print "coverage: " + str(1.0 * match_counter / all_app_counter)

    # sort the remaining unmatched apps by download count and write them out
    other_title_fre = {}
    sorted_list = sorted(others_app.items(),
                         key=lambda p: p[1][0],
                         reverse=True)
    outfile_others = open('others.txt', 'wb')
    for val in sorted_list:
        title_seg = jieba.cut(val[0])
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                other_title_fre.setdefault(title, 0)
                other_title_fre[title] += 1
        outfile_others.write(val[0] + '<@>' + val[1][1] + '\r\n')

    sorted_list = sorted(other_title_fre.items(),
                         key=lambda p: p[1],
                         reverse=True)
    outfile_others_title = open('others_title.txt', 'wb')
    for val in sorted_list:
        outfile_others_title.write(val[0] + '<@>' + str(val[1]) + '\r\n')
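# rule_base and outputJson are project-local and not shown. From the calls above,
# recommendTag appears to assume roughly these interfaces (inferred, not verified):
#     rule_base.createNodeChildrenDict(child_dict)       -> {tag: set of its child tags}
#     rule_base.getCandidateTag(root, children, syn)     -> (candidate_tag_set, candidate_delegate_tag_set)
#     rule_base.createLevelCategoryDict(...)             -> {depth: set of tags at that depth}
#     rule_base.isAmbiguous(tag, ambiguation_dict, text) -> True if a textual match should be discarded
#     rule_base.getNodeListOnStrongPath(parents, ...)    -> ancestors reachable along "strong" edges
#     rule_base.grabEnglish(text)                        -> the English tokens contained in text
#     rule_base.getNextLevelCategorySet(...)             -> the children of a hidden node
#     outputJson(...)                                    -> the per-app "content" dict written to tag_recommend_result.json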
Example #12
def getCorpus(category_name):

    app_label_dict = {
        10743: 1,
        1002128: 1,
        47: 1,
        498: 1,
        550: -1,
        48: -1,
        490: -1,
        761: -1,
        101108: -1,
        101916: -1
    }

    x_train = []
    y_train = []
    x_test = []

    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    doc_app_id = []
    docs = []
    id_name_dict = {}
    infile = open('corpus/' + category_name + '.json', 'rb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [
            word for word in jieba.cut(app_name + " " + app_brief)
            if word not in stopword_set and text_process.isChinese(word)
        ]

        if len(app_brief_seg) <= 10 and app_download <= 100:
            continue

        doc_app_id.append(app_id)
        id_name_dict[app_id] = app_name
        docs.append(app_brief_seg)

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(text) for text in docs]

    for i in range(len(corpus)):
        doc = corpus[i]
        x = [0 for n in range(len(dictionary))]
        for val in doc:
            x[val[0]] = val[1]

        app_id = doc_app_id[i]
        if app_id in app_label_dict.keys():
            x_train.append(x)
            if app_label_dict[app_id] == 1:
                y_train.append(1)
            else:
                y_train.append(-1)
        else:
            x_test.append(x)

    return x_train, x_test, y_train, doc_app_id, id_name_dict
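# A minimal sketch of how the returned bag-of-words vectors could feed a
# scikit-learn classifier (an assumption: the original training code is not shown,
# and 'some_category' is a hypothetical file name under corpus/):
from sklearn.svm import LinearSVC

x_train, x_test, y_train, doc_app_id, id_name_dict = getCorpus('some_category')
classifier = LinearSVC()
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)
# Count how many of the unlabelled apps were assigned the positive class.
print(sum(1 for label in predictions if label == 1))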
Example #14
def getTrainTest(category_name,category_parent_dict,category_child_dict,category_synonyms_dict,indicator_set,comment_category_set,ambiguation_dict):
	# name of the main category
	main_category = u"软件"

	jieba.load_userdict('../../../data/jieba_userdict.txt')
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)
	candidate_tag_set,candidate_delegate_tag_set = rule_base.getCandidateTag(main_category,node_children_dict,category_synonyms_dict)
	level_category_dict = rule_base.createLevelCategoryDict(main_category,candidate_tag_set,category_parent_dict,category_child_dict,category_synonyms_dict)
	# for level in level_category_dict.keys():
	# 	print level
	# 	print ' '.join(level_category_dict[level])

	dictionary = corpora.Dictionary([list(candidate_delegate_tag_set)])
	vocabulary_size = len(dictionary)

	# iterate over the apps under the main category
	infile = open('../data/'+category_name+'.json','rb')
	X_train = []
	X_test = []
	X_test_info = []
	all_counter = 0
	train_counter = 0
	for row in infile:

		all_counter += 1
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["id"])
		app_name = json_obj["title"]
		app_brief = json_obj["brief"]
		app_tag = json_obj["tags"]
		app_download = int(json_obj["download_times"])
		app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
		app_name_brief = app_name+" "+app_brief
		app_name_brief += " "+rule_base.grabEnglish(app_name_brief)

		tag_recommend_set = set([])

		for tag in candidate_tag_set:
			if tag in app_name_brief:
				tag_recommend_set.add(category_synonyms_dict[tag][0])

		doc = dictionary.doc2bow(list(tag_recommend_set))
		x = [0 for i in range(vocabulary_size)]
		for val in doc:
			index = val[0]
			x[index] = val[1]
		if (u"视频" in app_tag or u"音乐" in app_tag) and app_download >= 1000:  # require the download threshold for either tag
			train_counter += 1
			X_train.append(x)
		else:
			X_test.append(x)
			X_test_info.append([app_name,' '.join(app_brief_seg)])

	print 1.0*train_counter/all_counter
	return X_train,X_test,X_test_info
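# Note: getTrainTest returns no labels; X_train only contains apps already tagged
# 视频 or 音乐 (with enough downloads), so any downstream model would have to treat
# this as one-class or semi-supervised training. The training step itself is not
# part of this snippet.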
Example #15
def classify(category_name,category_parent_dict,category_child_dict,category_synonyms_dict,indicator_set,comment_category_set,ambiguation_dict):
	# name of the main category
	main_category = u"软件"

	jieba.load_userdict('../../../data/jieba_userdict.txt')
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)
	candidate_tag_set,candidate_delegate_tag_set = rule_base.getCandidateTag(main_category,node_children_dict,category_synonyms_dict)
	level_category_dict = rule_base.createLevelCategoryDict(main_category,candidate_tag_set,category_parent_dict,category_child_dict,category_synonyms_dict)
	for level in level_category_dict.keys():
		print level
		print ' '.join(level_category_dict[level])

	# iterate over the apps under the main category
	infile = open('../data/'+category_name+'.json','rb')
	outfile_classification = open('../data/'+ category_name+'_classification.json','wb')

	for row in infile:
		
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["id"])
		app_name = json_obj["title"]
		app_brief = json_obj["brief"]
		app_download = int(json_obj["download_times"])
		app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
		app_name_brief = app_name+" "+app_brief
		app_name_brief += " "+rule_base.grabEnglish(app_name_brief)

		tag_recommend_set = set([])

		for tag in candidate_tag_set:
			if tag in app_name_brief:
				tag_recommend_set.add(category_synonyms_dict[tag][0])
	
		if len(level_category_dict[1] & tag_recommend_set) != 0:
			candidate_main_level_set = level_category_dict[1] & tag_recommend_set
			candidate_main_level_score_dict = {}
			for candidate_main_level in candidate_main_level_set:
				score = len(node_children_dict[candidate_main_level] & tag_recommend_set)
				candidate_main_level_score_dict.setdefault(score,set([])).add(candidate_main_level)
			max_score = max(candidate_main_level_score_dict.keys())
			if max_score >= 3:
				final_category_list = list(candidate_main_level_score_dict[max_score])
				if final_category_list[0] != category_name:
					outfile_classification.write(str(app_id)+"->"+final_category_list[0]+"->"+app_name+"<@>"+" ".join(app_brief_seg)+'\r\n')
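# Each line written to <category_name>_classification.json has the form
#     app_id->suggested_top_level_category->app_name<@>segmented brief
# and is only emitted when a first-level category with three or more matching
# child tags differs from the category the app currently sits in.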