# -*- coding: utf-8 -*-
import codecs
import glob
import json
import os
import pickle
import string
from collections import Counter

import jieba

# Helpers such as lower_letters, query_parser, tdxidf_weighting,
# calc_vector_space, words_filter and get_domi_color, as well as the
# global t_inverted_index, are defined elsewhere in this module.


def dosearch(query):
    """Return the document names matching `query`, best matches first."""
    raw_query = lower_letters(query)
    query = query_parser(query) + [raw_query]
    id_list = []
    res_name = []
    weight = {}
    # Accumulate a TF-IDF weight per candidate document and collect the
    # set of documents containing at least one query term.
    for term in query:
        if term in t_inverted_index:
            for key in t_inverted_index[term]:
                weight[key] = weight.get(key, 0) + tdxidf_weighting(term, key)
                if key not in id_list:
                    id_list.append(key)
    # Rank the candidates in the vector-space model.
    rank_list = calc_vector_space(query, id_list)
    # Re-rank: documents whose "title singer album" string contains more
    # of the query's segments are promoted to the front.
    q = list(set(jieba.cut_for_search(raw_query)))
    if u" " in q:
        q.remove(u" ")
    hits = []
    for key in reversed(list(rank_list)):
        info_term = list(set(jieba.cut_for_search(id_info_list[key])))
        if u" " in info_term:
            info_term.remove(u" ")
        for term in q:
            if term in info_term:
                hits.append(key)
    rank_fin = [key for key, count in Counter(hits).most_common()]
    for item in rank_list:
        if item not in rank_fin:
            rank_fin.append(item)
    # Map document ids back to file names, dropping the "./data/" prefix
    # (7 characters) and the file extension.
    if id_list:
        for ids in rank_fin:
            res = os.path.splitext(doc_id_list[ids])[0]
            res_name.append(res[7:])
    return res_name
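
# --- Hypothetical sketch, not this project's implementation ---
# dosearch() relies on tdxidf_weighting(term, doc_id), which is defined
# elsewhere. Assuming t_inverted_index maps term -> {doc_id: frequency}
# and doc_id_list covers the whole collection, a standard log-TF x IDF
# weight could be computed like this:
import math

def tfidf_weight_sketch(term, doc_id):
    tf = t_inverted_index[term][doc_id]   # term frequency in this document
    df = len(t_inverted_index[term])      # number of documents with the term
    n_docs = len(doc_id_list)             # collection size
    return (1.0 + math.log(tf)) * math.log(float(n_docs) / df)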
def build_dict_for_spell_check(t_inverted_index):
    """Pickle a term -> total-frequency dictionary for the spell checker."""
    data = {}
    for term in t_inverted_index:
        # Sum the term's frequency over every document it appears in,
        # using the original key before any normalisation.
        total = sum(t_inverted_index[term].itervalues())
        if not isinstance(term, unicode):
            term = term.decode("utf-8")
        term = lower_letters(term)
        # Accumulate, so that case variants of the same term are merged.
        data[term] = data.get(term, 0) + total
    with open('./data/spell.dat', 'wb') as f:
        pickle.dump(data, f)
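
# --- Hypothetical sketch of how ./data/spell.dat might be consumed ---
# The pickled term -> total-frequency dict gives a spell checker the
# statistics it needs to rank candidate corrections. A minimal
# Norvig-style corrector over single-edit candidates (for ASCII terms;
# Chinese terms would need a different candidate generator):
def load_spell_dict(path='./data/spell.dat'):
    with open(path, 'rb') as f:
        return pickle.load(f)

def edits1(word):
    # All strings one edit (delete, transpose, replace, insert) away.
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits if b for c in letters]
    inserts = [a + c + b for a, b in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def correct(word, freq):
    # Keep the word if it is known, else take the most frequent
    # single-edit candidate; fall back to the word itself.
    if word in freq:
        return word
    candidates = [w for w in edits1(word) if w in freq]
    return max(candidates, key=freq.get) if candidates else word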
# Build the per-document metadata tables by walking every JSON file in
# ./data. doc_id is a running integer key shared by all of the tables.
id_info_list = {}
play_and_share = {}
color = {}
doc_id_list = {}
auto_complete_list = []
doc_id = 0
exclude = set(string.punctuation)
for file_name in glob.glob(ur'./data/*.json'):
    with codecs.open(file_name, 'r', 'utf-8') as f:
        j = json.load(f)
    # Index the title, singer, album, lyrics and tags as one text blob.
    content = j["title"] + j["singer"] + j["album"] + j["lrc"]
    for tag in j["tag"]:
        content += tag
    color[j["title"]] = get_domi_color(j["title"])
    play_and_share[doc_id] = [j["play_count_num"], j["share"]]
    doc_id_list[doc_id] = file_name.encode("utf-8")
    # Segment the blob, filter stop words, and keep the whole title,
    # singer, album and tags as additional searchable terms.
    seg_list = list(jieba.cut_for_search(content))
    seg_list = words_filter(seg_list)
    seg_list.append(lower_letters(j["title"]))
    seg_list.append(lower_letters(j["singer"]))
    seg_list.append(lower_letters(j["album"]))
    for tag in j["tag"]:
        seg_list.append(tag)
    auto_complete_list.append(j["title"])
    auto_complete_list.append(j["singer"])
    auto_complete_list.append(j["album"])
    term = list(set(seg_list))
    # id_info_list holds a punctuation-free "title singer album" string,
    # used by dosearch() for re-ranking.
    info_str = lower_letters(j["title"] + " " + j["singer"] + " " + j["album"])
    id_info_list[doc_id] = ''.join(ch for ch in info_str if ch not in exclude)
    if u" " in term:
        term.remove(u" ")
    doc_id += 1
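
# --- Hypothetical sketch, not this project's implementation ---
# The construction of t_inverted_index is elided here; the seg_list/term
# data gathered above is presumably what feeds it. Assuming the layout
# dosearch() expects, term -> {doc_id: frequency}, each document could be
# added like this (called inside the loop above with seg_list, so that
# duplicate tokens yield real frequencies; the de-duplicated `term` list
# would yield boolean 0/1 counts instead):
def add_to_inverted_index(inverted_index, doc_id, tokens):
    for t in tokens:
        postings = inverted_index.setdefault(t, {})
        postings[doc_id] = postings.get(doc_id, 0) + 1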