def index_edit(self, option="question"):
    # Index labeled zhidao QA pairs from label0627 Excel files into the
    # "zhidao_{option}" dataset, deduplicating on a hash of question+answers.
    dataset_index = "zhidao_{}".format(option)
    gcounter[option] = 1
    dirname = getLocalFile("label0627/*{}*xls*".format(option))
    print dirname
    filenames = glob.glob(dirname)
    ids = set()
    for filename in filenames:
        print filename
        gcounter["files"] += 1
        ret = libfile.readExcel(
            ["category", "question", "answers", "type"], filename, start_row=1)
        if ret:
            for items in ret.values():
                for item in items:
                    gcounter["items"] += 1
                    qa = u"{}{}".format(item["question"], item["answers"])
                    item["id"] = es_api.gen_es_id(qa)
                    if item["id"] in ids:
                        gcounter["items_skip_dup"] += 1
                        continue
                    if not item["type"] in [1, "1"]:
                        gcounter["items_skip_drop"] += 1
                        continue
                    skip_words = self.api_nlp.detect_skip_words(
                        qa, check_list=["skip_words_all", "skip_words_zhidao"])
                    if skip_words:
                        print "SKIP", u"/".join(skip_words), "\t---\t", item["question"], "\t---\t", item["answers"]
                        gcounter["items_skip_minganci"] += 1
                        continue
                    ids.add(item["id"])
                    item_new = {}
                    for p in ["question", "answers", "id"]:
                        item_new[p] = item[p]
                    self.upload(dataset_index, item_new)
    # trailing call without an item presumably flushes buffered uploads
    self.upload(dataset_index)
    gcounter["esdata"] = len(ids)
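# detect_skip_words is provided by api_nlp elsewhere in the repo; a minimal
# sketch of the contract assumed above (illustrative, NOT the real
# implementation): return the blocked words found in the text, so a truthy
# result means the QA pair should be skipped.
def detect_skip_words_sketch(text, skip_words=(u"广告", u"代理")):
    # hypothetical word list; the real lists are looked up via check_list names
    return [w for w in skip_words if w in text]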
def index_xianer12w_test(self):
    dataset_index = "xianer12w_test"
    filename = getLocalFile("input/chat8xianer12w.txt")
    visited = set()
    for line in libfile.file2list(filename):
        if line in visited:
            continue
        visited.add(line)
        gcounter["lines"] += 1
        item = {
            "question": line,
            "answers": u"无语",
            "id": es_api.gen_es_id(line)
        }
        self.upload(dataset_index, item)
    self.upload(dataset_index)
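# es_api.gen_es_id is assumed throughout to be a deterministic content hash,
# so identical text always maps to the same ES document id and duplicates can
# be detected locally; a sketch under that assumption (not the real function):
import hashlib

def gen_es_id_sketch(text):
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()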
def fudan_ea_to_json(entity, attribute, attribute_name, extra_tag, values,
                     category=None, searchscore=None, alias=None):
    """
    :param entity: type(entity) is unicode
    """
    alias = alias or []  # avoid a mutable default argument
    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    tags.extend(alias)  # aliases join the exact-match tags, as in ea_to_json
    entity_name = entity
    m = regdropbrackets.match(entity)
    if m:
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())
    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))
    # entity (index: yes) is used for full-text retrieval;
    # tags (not_analyzed) are used for exact match
    ret = {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        'value': values[0] if len(values) > 0 else '',
        'values': values,
        'tags': list(set(tags)),
    }
    if category:
        ret['category'] = category
    if searchscore:
        ret['searchscore'] = searchscore  # only set when provided
    return ret
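# Illustrative use of fudan_ea_to_json (the entity/attribute values here are
# hypothetical, and this assumes regdropbrackets strips a trailing bracketed
# qualifier; the id is the content hash of "entity__attribute"):
#
#   fudan_ea_to_json(u"苹果(水果)", "color", u"颜色", "fudan", [u"红色"])
#   => {'entity': u'苹果(水果)', 'entity_name': u'苹果',
#       'attribute': 'color', 'attribute_name': u'颜色',
#       'value': u'红色', 'values': [u'红色'],
#       'tags': [u'苹果(水果)', u'苹果', 'fudan', ...], 'id': '...'}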
def _index_qa(self, filenames, dataset_index, filter_option=0):
    ids = set()
    for filename in filenames:
        print filename
        gcounter["files"] += 1
        ret = libfile.readExcel(["category", "question", "answers"],
                                filename, start_row=1)
        if ret:
            for items in ret.values():
                for item in items:
                    gcounter["items"] += 1
                    item["id"] = es_api.gen_es_id(item["question"])
                    if item["id"] in ids:
                        continue
                    label = self.filter_qa_by_label(
                        "{}".format(item["category"]),
                        item["question"],
                        item["answers"],
                        filter_option=filter_option)
                    if label:
                        print "SKIP", label, "\t---\t", item["question"], "\t---\t", item["answers"]
                        gcounter["esdata_label_{}".format(label)] += 1
                        if filter_option in [1]:
                            continue
                    ids.add(item["id"])
                    item_new = {}
                    for p in ["question", "answers", "id"]:
                        item_new[p] = item[p]
                    self.upload(dataset_index, item_new)
    self.upload(dataset_index)
    gcounter["esdata"] = len(ids)
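# Hypothetical call site for _index_qa (the dataset name is illustrative, not
# from the repo): index every labeled chat spreadsheet into one dataset,
# skipping items whose label marks them for removal (filter_option=1).
#
#   filenames = glob.glob(getLocalFile("label0627/*chat*xls*"))
#   self._index_qa(filenames, "chat_all", filter_option=1)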
def ea_to_json(entity, attribute, attribute_name, extra_tag, values):
    """
    :param entity: type(entity) is unicode
    """
    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    alias = get_all_aliases(entity)
    if alias:
        tags.extend(list(alias))
    entity_name = entity
    m = regdropbrackets.match(entity)
    if m:
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())
    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))
    # entity (index: yes) is used for full-text retrieval;
    # tags (not_analyzed) are used for exact match
    return {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        'value': values[0] if values else '',  # guard against an empty values list
        'values': values,
        'tags': list(set(tags))
    }
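# regdropbrackets is defined elsewhere in the module; a minimal sketch of what
# it is assumed to match (illustrative, not the actual definition): an entity
# whose trailing bracketed qualifier should be dropped to get entity_name,
# handling both full-width and ASCII brackets, e.g. u"苹果(水果)" -> u"苹果".
import re

regdropbrackets_sketch = re.compile(ur'^(.+?)\s*[((][^))]*[))]\s*$')

m = regdropbrackets_sketch.match(u"苹果(水果)")
if m:
    print m.group(1)  # 苹果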
def init_zhidao_qa(self):
    # clean and rewrite raw zhidao QA dumps into per-dataset Excel files
    dataset_index_list = [
        "qa0708query",
        "qa0708question",
    ]
    for dataset_index in dataset_index_list:
        dirname = getLocalFile("raw/{}/*".format(dataset_index))
        map_items = {}
        ids = set()  # track content-hash ids; map_items is keyed by question
        for filename in glob.glob(dirname):
            gcounter["files"] += 1
            ret = libfile.readExcel(
                ["category", "question", "answers", "type"], filename, start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1
                        qa = u"{}{}".format(item["question"], item["answers"])
                        item["id"] = es_api.gen_es_id(qa)
                        if item["id"] in ids:
                            gcounter["items_skip_dup"] += 1
                            continue
                        if not item["type"] in [1, "1"]:
                            gcounter["items_skip_drop"] += 1
                            continue
                        item["answers"] = clean_answer(u"{}".format(item["answers"]))
                        item["question"] = clean_question(u"{}".format(item["question"]))
                        if len(item["answers"]) < 2:
                            gcounter["items_skip_empty_answer"] += 1
                            continue
                        skip_words = self.api_nlp.detect_skip_words(
                            item["answers"],
                            check_list=["skip_words_all", "skip_words_zhidao"])
                        if skip_words:
                            print "SKIP", u"/".join(skip_words), "\t---\t", item["question"], "\t---\t", item["answers"]
                            gcounter["items_skip_minganci"] += 1
                            continue
                        ids.add(item["id"])
                        item_new = {"source": dataset_index}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]
                        map_items[item_new["question"]] = item_new
        gcounter["init_from_{}".format(dataset_index)] = len(map_items)
        print len(map_items)
        filename = getLocalFile("temp/{}.xls".format(dataset_index))
        items = sorted(map_items.values(), key=lambda x: x["question"])
        libfile.writeExcel(items, ["label", "question", "answers", "source"],
                           filename)
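# clean_answer/clean_question live elsewhere in the repo; a minimal sketch of
# the kind of normalization assumed above (illustrative, NOT the actual
# implementations): collapse whitespace and drop bare URLs before length
# and skip-word checks.
import re

def clean_answer_sketch(text):
    text = re.sub(ur'https?://\S+', u'', text)  # drop bare URLs
    text = re.sub(ur'\s+', u' ', text)          # collapse whitespace
    return text.strip()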
def init_xianer7w_rewrite(self):
    dataset_index = "xianer7w_rewrite"
    gcounter[dataset_index] = 1
    ids = set()
    filename = getLocalFile("raw/rewrite/xianer7w_rewrite_map.xlsx")
    print filename
    gcounter["files"] += 1
    ret = libfile.readExcel(["question", "old_answers", "answers"],
                            filename, start_row=0)
    # collect answer mapping
    map_answers = {}
    for item in ret.values()[0]:
        if item.get("old_answers"):
            a_old = item["old_answers"].strip()
            if item.get("answers"):
                a = item["answers"].strip()
                if a and a_old:
                    map_answers[a_old] = a
    print len(map_answers)
    filename = getLocalFile("raw/rewrite/xianer7w_rewrite.xlsx")
    print filename
    gcounter["files"] += 1
    ret = libfile.readExcel(["question", "old_answers", "answers"],
                            filename, start_row=0)
    # use mapping
    items = []
    for item in ret.values()[0]:
        gcounter["items"] += 1
        q = item["question"]
        if item["old_answers"]:
            a = map_answers.get(item["old_answers"])
        else:
            a = ""
        if not a:
            # print "SKIP no mapping", q, item["old_answers"]
            gcounter["items_no_mapping"] += 1
            continue
        qa = q + a
        item["id"] = es_api.gen_es_id(q)
        if item["id"] in ids:
            gcounter["items_skip_dup"] += 1
            continue
        skip_words = self.api_nlp.detect_skip_words(
            qa, check_list=["skip_words_all"])
        if skip_words:
            print "SKIP", u"/".join(skip_words), "\t---\t", item["question"], "\t---\t", a
            gcounter["items_skip_minganci"] += 1
            continue
        ids.add(item["id"])
        item_new = {
            "question": q,
            "answers": a,
            "id": item["id"],
        }
        items.append(item_new)
    gcounter["qa0708rewrite"] = len(ids)
    filename = getLocalFile("temp/qa0708rewrite.xls")
    libfile.writeExcel(items, ["label", "question", "answers"], filename)
def _merge_chat(self, filenames, option):
    filename_todo = getLocalFile("input/{}_todo.txt".format(option))
    print "filename_todo", filename_todo
    q_todo = set()
    if os.path.exists(filename_todo):
        q_todo = libfile.file2set(filename_todo)
        gcounter["q_todo"] = len(q_todo)
        print "filename_todo", filename_todo, len(q_todo)
    filename_skip = getLocalFile("input/{}_skip.txt".format(option))
    print "filename_skip", filename_skip
    q_skip = set()
    if os.path.exists(filename_skip):
        q_skip = libfile.file2set(filename_skip)
        gcounter["q_skip"] = len(q_skip)
        print "filename_skip", filename_skip, len(q_skip)
    data = {}
    q_all = set()
    for filename in filenames:
        # print filename
        gcounter["files"] += 1
        ret = libfile.readExcel(["category", "question", "answers"],
                                filename, start_row=1)
        if ret:
            for items in ret.values():
                for item in items:
                    gcounter["items"] += 1
                    q_all.add(item["question"])
                    if q_skip and item["question"] in q_skip:
                        gcounter["items_skip"] += 1
                        continue
                    item["id"] = es_api.gen_es_id(item["question"] + item["answers"])
                    if item["id"] in data:
                        continue
                    for dataset_index in ["chat8cmu6w", "chat8xianer12w"]:
                        if dataset_index in filename:
                            gcounter["from_" + dataset_index] += 1
                    label = self.filter_qa_by_label(
                        item["category"], item["question"], item["answers"])
                    if label:
                        item["label"] = label
                        # print "SKIP", label, "\t---\t", item["question"], "\t---\t", item["answers"]
                        # gcounter["_esdata_label_{}".format(label)] += 1
                    # elif not self.api_nlp.is_question_baike(item["question"]):
                    #     item["label"] = u"百科"
                    else:
                        item["label"] = u""
                    xlabel = re.sub(":.*$", "", item["label"])
                    gcounter["data_with_label_{}".format(xlabel)] += 1
                    data[item["id"]] = item
    gcounter["data"] = len(data)
    results = sorted(data.values(), key=lambda x: x["question"])
    print len(data), len(results)
    filename_output = getLocalFile("output/edit_{}.xls".format(option))
    libfile.writeExcel(results, ["label", "question", "answers"], filename_output)
    filename_output = getLocalFile("edit0623/sample1000_edit_{}.xls".format(option))
    libfile.writeExcel(libdata.items2sample(data.values(), limit=1000),
                       ["label", "question", "answers"], filename_output)
    if q_todo:
        q_todo.difference_update(q_all)
        filename_output = getLocalFile("edit0623/question_miss_{}.xls".format(option))
        libfile.lines2file(sorted(list(q_todo)), filename_output)
        gcounter["q_all"] = len(q_all)
        gcounter["q_miss"] = len(q_todo)
    # write results in pages of 2000 rows; ceil division avoids an empty
    # trailing page when len(results) is an exact multiple of page_size
    page_size = 2000
    max_page = (len(results) + page_size - 1) / page_size
    for i in range(max_page):
        filename_output = getLocalFile("edit0623/edit_{}_{}.xls".format(option, i))
        # print filename_output
        idx_start = i * page_size
        idx_end = min(len(results), (i + 1) * page_size)
        libfile.writeExcel(results[idx_start:idx_end],
                           ["label", "question", "answers"], filename_output)
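# Sketch of the page-count arithmetic used above (hypothetical helper, not
# part of the original module): integer ceil division in Python 2.
def num_pages(total, page_size=2000):
    return (total + page_size - 1) / page_size

assert num_pages(4000) == 2  # exact multiple: no empty third page
assert num_pages(4001) == 3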
def test(self, dataset_index="chat8xianer12w", option="query"):
    filename_todo = getLocalFile("input/{}_todo.txt".format(option))
    print "filename_todo", filename_todo
    q_todo = set()
    if os.path.exists(filename_todo):
        q_todo = libfile.file2set(filename_todo)
        gcounter["q_todo"] = len(q_todo)
        print "filename_todo", filename_todo, len(q_todo)
    filename_skip = getLocalFile("input/{}_skip.txt".format(option))
    print "filename_skip", filename_skip
    q_skip = set()
    if os.path.exists(filename_skip):
        q_skip = libfile.file2set(filename_skip)
        gcounter["q_skip"] = len(q_skip)
        print "filename_skip", filename_skip, len(q_skip)
    data = {}
    q_all = set()
    dirname = getLocalFile("output0623/{}*worker*json.txt".format(dataset_index))
    for filename in glob.glob(dirname):
        print filename
        gcounter["files"] += 1
        for line in libfile.file2list(filename):
            entry = json.loads(line)
            query = entry["query"]
            # print entry.keys()
            if "items_all" not in entry:
                gcounter["selected_no_data"] += 1
                continue
            elif len(entry["items_all"]) == 0:
                gcounter["selected_no_item"] += 1
                continue
            if q_skip and query in q_skip:
                gcounter["items_skip"] += 1
                q_all.add(query)
                continue
            if self.api_nlp.detect_skip_words(query):
                gcounter["selected_query_skipwords"] += 1
                q_all.add(query)
                continue
            items_select = self.api_nlp.select_qapair_0624(query, entry["items_all"])
            if items_select:
                gcounter["selected_yes"] += 1
                q_all.add(query)
            else:
                gcounter["selected_no"] += 1
            for item in items_select:
                item["id"] = es_api.gen_es_id(item["question"] + item["answers"])
                if item["id"] in data:
                    continue
                label = self.filter_qa_by_label("", item["question"], item["answers"])
                if label:
                    item["label"] = label
                else:
                    item["label"] = u""
                xlabel = re.sub(":.*$", "", item["label"])
                gcounter["data_with_label_{}".format(xlabel)] += 1
                gcounter["items"] += 1
                data[item["id"]] = item
    if q_todo:
        q_todo.difference_update(q_all)
        filename_output = getLocalFile("edit0623/query_miss_{}.xls".format(option))
        libfile.lines2file(sorted(list(q_todo)), filename_output)
        gcounter["q_all"] = len(q_all)
        gcounter["q_miss"] = len(q_todo)