def load_attribute_mapping(filename='/Users/bishop/百度云同步盘/baike_attribute.xls'):
    """Load the FD-attribute -> expression-variants mapping from an Excel file.

    The parsed mapping is memoized on the function object, keyed by
    filename, so each spreadsheet is read at most once per process.

    Args:
        filename: path of the attribute spreadsheet.  Defaults to the
            previously hard-coded location so existing no-arg callers
            keep working; new callers can point at another copy.

    Returns:
        dict mapping each row's 'FD属性' cell to its '多种属性表达' cell.
    """
    # Cache is a dict keyed by filename.  (The original cached a single
    # mapping regardless of path; keying by path keeps the cache correct
    # now that the path is a parameter.)
    cache = getattr(load_attribute_mapping, '_attr', None)
    if cache is None:
        cache = {}
        setattr(load_attribute_mapping, '_attr', cache)
    if filename not in cache:
        attribute_mapping = {}
        items = readExcel(['实体', '属性标准名', 'FD属性', '多种属性表达'],
                          filename, 1)
        # Only the sheet named 'Sheet1' is consumed -- assumes the
        # workbook keeps that default sheet name.
        for i in items['Sheet1']:
            attribute_mapping[i['FD属性']] = i['多种属性表达']
        cache[filename] = attribute_mapping
    return cache[filename]
def read_longquan18w(): # test data = libfile.readExcel(["count", "question"], LONGQUAN_18W_FILENAME) result = set() for sheet in data: for item in data[sheet]: q = clean_longquan_question(item["question"]) if q and len(q) > 2: result.add(q) print "Number of longquan question ", len(result) libfile.lines2file(result, LONGQUAN_18W_FILENAME_QUESTION)
def index_edit(self, option="question"):
    """Index labeled QA rows from 'label0627' Excel files into the
    "zhidao_{option}" dataset.

    Rows are de-duplicated by an ES id derived from question+answers,
    filtered to type == 1, and screened for skip-words before upload.
    Per-step counts are accumulated in the global ``gcounter``.

    Args:
        option: infix used both for the dataset index name and the
            input filename glob (default "question").
    """
    dataset_index = "zhidao_{}".format(option)
    gcounter[option] = 1
    dirname = getLocalFile("label0627/*{}*xls*".format(option))
    print dirname
    filenames = glob.glob(dirname)
    ids = set()  # ES ids already uploaded, for de-duplication
    for filename in filenames:
        print filename
        gcounter["files"] += 1
        ret = libfile.readExcel(
            ["category", "question", "answers", "type"],
            filename, start_row=1)
        if ret:
            for items in ret.values():
                for item in items:
                    gcounter["items"] += 1
                    # id is derived from question+answers, so the same
                    # question with a different answer is kept.
                    qa = u"{}{}".format(item["question"], item["answers"])
                    item["id"] = es_api.gen_es_id(qa)
                    if item["id"] in ids:
                        gcounter["items_skip_dup"] += 1
                        continue
                    # keep only rows labeled type 1 (int or string form)
                    if not item["type"] in [1, "1"]:
                        gcounter["items_skip_drop"] += 1
                        continue
                    # drop rows containing blocked words ("minganci")
                    skip_words = self.api_nlp.detect_skip_words(
                        qa, check_list=["skip_words_all", "skip_words_zhidao"])
                    if skip_words:
                        print "SKIP", u"/".join(
                            skip_words), "\t---\t", item[
                                "question"], "\t---\t", item["answers"]
                        gcounter["items_skip_minganci"] += 1
                        continue
                    ids.add(item["id"])
                    # upload only the whitelisted fields
                    item_new = {}
                    for p in ["question", "answers", "id"]:
                        item_new[p] = item[p]
                    self.upload(dataset_index, item_new)
    # final no-item call flushes whatever the uploader has buffered
    self.upload(dataset_index)
    gcounter["esdata"] = len(ids)
def _index_qa(self, filenames, dataset_index, filter_option=0): ids = set() for filename in filenames: print filename gcounter["files"] += 1 ret = libfile.readExcel(["category", "question", "answers"], filename, start_row=1) if ret: for items in ret.values(): for item in items: gcounter["items"] += 1 item["id"] = es_api.gen_es_id(item["question"]) if item["id"] in ids: continue label = self.filter_qa_by_label( "{}".format(item["category"]), item["question"], item["answers"], filter_option=filter_option) if label: print "SKIP", label, "\t---\t", item[ "question"], "\t---\t", item["answers"] gcounter["esdata_label_{}".format(label)] += 1 if filter_option in [1]: continue ids.add(item["id"]) item_new = {} for p in ["question", "answers", "id"]: item_new[p] = item[p] self.upload(dataset_index, item_new) self.upload(dataset_index) gcounter["esdata"] = len(ids)
def load_file(self, file):
    """Parse *file* as an Excel workbook using ``self.header`` as the
    column list and keep the parsed result on ``self.file``.

    NOTE(review): the parameter name shadows the ``file`` builtin; left
    unchanged because callers may pass it by keyword.
    """
    self.file = readExcel(self.header, file)
def init_zhidao_qa(self):
    """Clean and de-duplicate raw zhidao QA dumps into temp Excel files.

    For each dataset in the hard-coded list, reads every file under
    raw/<dataset>/, keeps rows with type == 1, cleans question and
    answer text, drops near-empty answers and skip-word matches, then
    writes the surviving rows (sorted by question) to
    temp/<dataset>.xls.  Counts go into the global ``gcounter``.
    """
    #clean rewrite
    dataset_index_list = [
        "qa0708query",
        "qa0708question",
    ]
    for dataset_index in dataset_index_list:
        dirname = getLocalFile("raw/{}/*".format(dataset_index))
        # keyed by cleaned question -- later duplicates of the same
        # question overwrite earlier ones
        map_items = {}
        for filename in glob.glob(dirname):
            gcounter["files"] += 1
            ret = libfile.readExcel(
                ["category", "question", "answers", "type"],
                filename, start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1
                        # dedup id derived from raw question+answers,
                        # computed before cleaning
                        qa = u"{}{}".format(item["question"],
                                            item["answers"])
                        item["id"] = es_api.gen_es_id(qa)
                        if item["id"] in map_items:
                            # NOTE(review): membership is tested against
                            # map_items, which is keyed by cleaned
                            # question, not by id -- confirm this is the
                            # intended dedup condition.
                            gcounter["items_skip_dup"] += 1
                            continue
                        # keep only rows labeled type 1 (int or string)
                        if not item["type"] in [1, "1"]:
                            gcounter["items_skip_drop"] += 1
                            continue
                        item["answers"] = clean_answer(u"{}".format(
                            item["answers"]))
                        item["question"] = clean_question(u"{}".format(
                            item["question"]))
                        if len(item["answers"]) < 2:
                            gcounter["items_skip_empty_answer"] += 1
                            continue
                        # drop rows whose answer contains blocked words
                        skip_words = self.api_nlp.detect_skip_words(
                            item["answers"],
                            check_list=[
                                "skip_words_all", "skip_words_zhidao"
                            ])
                        if skip_words:
                            print "SKIP", u"/".join(
                                skip_words), "\t---\t", item[
                                    "question"], "\t---\t", item["answers"]
                            gcounter["items_skip_minganci"] += 1
                            continue
                        # keep only the whitelisted fields plus source
                        item_new = {"source": dataset_index}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]
                        map_items[item_new["question"]] = item_new
        gcounter["init_from_{}".format(dataset_index)] = len(map_items)
        print len(map_items)
        filename = getLocalFile("temp/{}.xls".format(dataset_index))
        items = sorted(map_items.values(), key=lambda x: x["question"])
        # "label" column is written empty for later manual annotation
        libfile.writeExcel(items,
                           ["label", "question", "answers", "source"],
                           filename)
def init_xianer7w_rewrite(self): dataset_index = "xianer7w_rewrite" gcounter[dataset_index] = 1 ids = set() filename = getLocalFile("raw/rewrite/xianer7w_rewrite_map.xlsx") print filename gcounter["files"] += 1 ret = libfile.readExcel(["question", "old_answers", "answers"], filename, start_row=0) #collect answer mapping map_answers = {} for item in ret.values()[0]: if item.get("old_answers"): a_old = item["old_answers"].strip() if item.get("answers"): a = item["answers"].strip() if a and a_old: map_answers[a_old] = a print len(map_answers) filename = getLocalFile("raw/rewrite/xianer7w_rewrite.xlsx") print filename gcounter["files"] += 1 ret = libfile.readExcel(["question", "old_answers", "answers"], filename, start_row=0) #use mapping items = [] for item in ret.values()[0]: gcounter["items"] += 1 q = item["question"] if item["old_answers"]: a = map_answers.get(item["old_answers"]) else: a = "" if not a: #print "SKIP no mapping", q, item["old_answers"] gcounter["items_no_mapping"] += 1 continue qa = q + a item["id"] = es_api.gen_es_id(q) if item["id"] in ids: gcounter["items_skip_dup"] += 1 continue skip_words = self.api_nlp.detect_skip_words( qa, check_list=["skip_words_all"]) if skip_words: print "SKIP", u"/".join( skip_words), "\t---\t", item["question"], "\t---\t", a gcounter["items_skip_minganci"] += 1 continue ids.add(item["id"]) item_new = { "question": q, "answers": a, "id": item["id"], } items.append(item_new) gcounter["qa0708rewrite"] = len(ids) filename = getLocalFile("temp/qa0708rewrite.xls") libfile.writeExcel(items, ["label", "question", "answers"], filename)
def _merge_chat(self, filenames, option):
    """Merge chat QA Excel files into labeled, de-duplicated edit sheets.

    Loads optional todo/skip question lists from input/, merges all
    *filenames* while de-duplicating on an id derived from
    question+answers, attaches a filter label to each row, then writes:
    the full sorted result, a 1000-row sample, the todo questions never
    seen, and 2000-row pages for editing.  Counts go into ``gcounter``.

    Args:
        filenames: iterable of Excel file paths to merge.
        option: infix used in every input/output filename.
    """
    # optional list of questions we expect to cover
    filename_todo = getLocalFile("input/{}_todo.txt".format(option))
    print "filename_todo", filename_todo
    q_todo = set()
    if os.path.exists(filename_todo):
        q_todo = libfile.file2set(filename_todo)
        gcounter["q_todo"] = len(q_todo)
        print "filename_todo", filename_todo, len(q_todo)
    # optional list of questions to exclude outright
    filename_skip = getLocalFile("input/{}_skip.txt".format(option))
    print "filename_skip", filename_skip
    q_skip = set()
    if os.path.exists(filename_skip):
        q_skip = libfile.file2set(filename_skip)
        gcounter["q_skip"] = len(q_skip)
        print "filename_skip", filename_skip, len(q_skip)
    data = {}    # id -> merged row
    q_all = set()  # every question encountered, pre-filtering
    for filename in filenames:
        #print filename
        gcounter["files"] += 1
        ret = libfile.readExcel(["category", "question", "answers"],
                                filename, start_row=1)
        if ret:
            for items in ret.values():
                for item in items:
                    gcounter["items"] += 1
                    q_all.add(item["question"])
                    if q_skip and item["question"] in q_skip:
                        gcounter["items_skip"] += 1
                        continue
                    # dedup on question+answers, so the same question
                    # with a different answer is kept
                    item["id"] = es_api.gen_es_id(item["question"] +
                                                  item["answers"])
                    if item["id"] in data:
                        continue
                    # count provenance by source-dataset filename infix
                    for dataset_index in ["chat8cmu6w", "chat8xianer12w"]:
                        if dataset_index in filename:
                            gcounter["from_" + dataset_index] += 1
                    label = self.filter_qa_by_label(
                        item["category"], item["question"],
                        item["answers"])
                    if label:
                        # labeled rows are kept (not skipped), just tagged
                        item["label"] = label
                        #print "SKIP", label, "\t---\t", item["question"], "\t---\t", item["answers"]
                        #gcounter["_esdata_label_{}".format(label)]+=1
                    #elif not self.api_nlp.is_question_baike(item["question"]):
                    #    item["label"] = u"百科"
                    else:
                        item["label"] = u""
                    # strip any ":detail" suffix for the counter bucket
                    xlabel = re.sub(":.*$", "", item["label"])
                    gcounter["data_with_label_{}".format(xlabel)] += 1
                    data[item["id"]] = item
                    # NOTE(review): item_new is built but never used in
                    # this function -- looks like dead code left over
                    # from an upload path.
                    item_new = {}
                    for p in ["question", "answers", "id"]:
                        item_new[p] = item[p]
    gcounter["data"] = len(data)
    results = sorted(data.values(), key=lambda x: x["question"])
    print len(data), len(results)
    # full merged output
    filename_output = getLocalFile("output/edit_{}.xls".format(option))
    libfile.writeExcel(results, ["label", "question", "answers"],
                       filename_output)
    # random sample of up to 1000 rows for spot-checking
    filename_output = getLocalFile(
        "edit0623/sample1000_edit_{}.xls".format(option))
    libfile.writeExcel(libdata.items2sample(data.values(), limit=1000),
                       ["label", "question", "answers"], filename_output)
    if q_todo:
        # todo questions that never appeared in any input file
        q_todo.difference_update(q_all)
        filename_output = getLocalFile(
            "edit0623/question_miss_{}.xls".format(option))
        libfile.lines2file(sorted(list(q_todo)), filename_output)
        gcounter["q_all"] = len(q_all)
        gcounter["q_miss"] = len(q_todo)
    # split the full result into 2000-row pages for editors.
    # NOTE(review): Python-2 integer division; when len(results) is an
    # exact multiple of page_size this writes one trailing empty page.
    page_size = 2000
    max_page = len(results) / page_size + 1
    for i in range(max_page):
        filename_output = getLocalFile("edit0623/edit_{}_{}.xls".format(
            option, i))
        #print filename_output
        idx_start = i * page_size
        idx_end = min(len(results), (i + 1) * page_size)
        libfile.writeExcel(results[idx_start:idx_end],
                           ["label", "question", "answers"],
                           filename_output)