Esempio n. 1
0
def load_attribute_mapping(filename='/Users/bishop/百度云同步盘/baike_attribute.xls'):
    """Load the FD-attribute -> attribute-expressions mapping from an Excel file.

    The result is memoized on the function object, so the spreadsheet is
    read only once per process; later calls return the cached dict even if
    a different *filename* is passed.

    :param filename: path to the source .xls file. The default keeps the
        original hard-coded path for backward compatibility.
    :return: dict mapping the 'FD属性' column value to the '多种属性表达'
        column value, taken from the 'Sheet1' sheet only.
    """
    if not hasattr(load_attribute_mapping, '_attr'):
        attribute_mapping = {}
        # readExcel returns {sheet_name: [row_dict, ...]}; only 'Sheet1' is used
        items = readExcel(['实体', '属性标准名', 'FD属性', '多种属性表达'],
                          filename, 1)
        for row in items['Sheet1']:
            attribute_mapping[row['FD属性']] = row['多种属性表达']
        setattr(load_attribute_mapping, '_attr', attribute_mapping)
    return load_attribute_mapping._attr
Esempio n. 2
0
def read_longquan18w():

    # test

    data = libfile.readExcel(["count", "question"], LONGQUAN_18W_FILENAME)
    result = set()
    for sheet in data:
        for item in data[sheet]:
            q = clean_longquan_question(item["question"])
            if q and len(q) > 2:
                result.add(q)
    print "Number of longquan question ", len(result)
    libfile.lines2file(result, LONGQUAN_18W_FILENAME_QUESTION)
Esempio n. 3
0
    def index_edit(self, option="question"):
        """Index labeled QA rows from label0627 Excel files into ES.

        Reads every workbook matching label0627/*{option}*xls*, keeps only
        rows whose "type" column is 1, de-duplicates on an id hashed from
        question+answers, drops rows containing skip words, and uploads the
        survivors to the "zhidao_{option}" dataset via self.upload.
        Progress/skip counts accumulate in the module-level ``gcounter``.
        """
        dataset_index = "zhidao_{}".format(option)
        gcounter[option] = 1
        dirname = getLocalFile("label0627/*{}*xls*".format(option))
        print dirname
        filenames = glob.glob(dirname)

        # ids of question+answer pairs already uploaded (duplicate guard)
        ids = set()

        for filename in filenames:
            print filename

            gcounter["files"] += 1
            ret = libfile.readExcel(
                ["category", "question", "answers", "type"],
                filename,
                start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1

                        # id is derived from the concatenated question+answers
                        qa = u"{}{}".format(item["question"], item["answers"])
                        item["id"] = es_api.gen_es_id(qa)
                        if item["id"] in ids:
                            gcounter["items_skip_dup"] += 1
                            continue

                        # only rows explicitly labeled type 1 are kept
                        if not item["type"] in [1, "1"]:
                            gcounter["items_skip_drop"] += 1
                            continue

                        # drop rows containing blocked ("minganci") words
                        skip_words = self.api_nlp.detect_skip_words(
                            qa,
                            check_list=["skip_words_all", "skip_words_zhidao"])
                        if skip_words:
                            print "SKIP", u"/".join(
                                skip_words), "\t---\t", item[
                                    "question"], "\t---\t", item["answers"]
                            gcounter["items_skip_minganci"] += 1
                            continue

                        ids.add(item["id"])
                        # upload only the fields the index needs
                        item_new = {}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]
                        self.upload(dataset_index, item_new)
        # final no-item call presumably flushes buffered uploads -- TODO confirm
        self.upload(dataset_index)
        gcounter["esdata"] = len(ids)
Esempio n. 4
0
    def _index_qa(self, filenames, dataset_index, filter_option=0):
        ids = set()

        for filename in filenames:
            print filename

            gcounter["files"] += 1
            ret = libfile.readExcel(["category", "question", "answers"],
                                    filename,
                                    start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1

                        item["id"] = es_api.gen_es_id(item["question"])
                        if item["id"] in ids:
                            continue

                        label = self.filter_qa_by_label(
                            "{}".format(item["category"]),
                            item["question"],
                            item["answers"],
                            filter_option=filter_option)
                        if label:
                            print "SKIP", label, "\t---\t", item[
                                "question"], "\t---\t", item["answers"]
                            gcounter["esdata_label_{}".format(label)] += 1
                            if filter_option in [1]:
                                continue

                        ids.add(item["id"])
                        item_new = {}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]
                        self.upload(dataset_index, item_new)
        self.upload(dataset_index)
        gcounter["esdata"] = len(ids)
Esempio n. 5
0
 def load_file(self, file):
     """Parse the spreadsheet *file* with self.header and cache it on self.file."""
     parsed = readExcel(self.header, file)
     self.file = parsed
Esempio n. 6
0
    def init_zhidao_qa(self):
        #clean rewrite

        dataset_index_list = [
            "qa0708query",
            "qa0708question",
        ]
        for dataset_index in dataset_index_list:
            dirname = getLocalFile("raw/{}/*".format(dataset_index))
            map_items = {}
            for filename in glob.glob(dirname):

                gcounter["files"] += 1
                ret = libfile.readExcel(
                    ["category", "question", "answers", "type"],
                    filename,
                    start_row=1)
                if ret:
                    for items in ret.values():
                        for item in items:
                            gcounter["items"] += 1

                            qa = u"{}{}".format(item["question"],
                                                item["answers"])
                            item["id"] = es_api.gen_es_id(qa)
                            if item["id"] in map_items:
                                gcounter["items_skip_dup"] += 1
                                continue

                            if not item["type"] in [1, "1"]:
                                gcounter["items_skip_drop"] += 1
                                continue

                            item["answers"] = clean_answer(u"{}".format(
                                item["answers"]))
                            item["question"] = clean_question(u"{}".format(
                                item["question"]))
                            if len(item["answers"]) < 2:
                                gcounter["items_skip_empty_answer"] += 1
                                continue

                            skip_words = self.api_nlp.detect_skip_words(
                                item["answers"],
                                check_list=[
                                    "skip_words_all", "skip_words_zhidao"
                                ])
                            if skip_words:
                                print "SKIP", u"/".join(
                                    skip_words), "\t---\t", item[
                                        "question"], "\t---\t", item["answers"]
                                gcounter["items_skip_minganci"] += 1
                                continue

                            item_new = {"source": dataset_index}
                            for p in ["question", "answers", "id"]:
                                item_new[p] = item[p]
                            map_items[item_new["question"]] = item_new

            gcounter["init_from_{}".format(dataset_index)] = len(map_items)
            print len(map_items)

            filename = getLocalFile("temp/{}.xls".format(dataset_index))
            items = sorted(map_items.values(), key=lambda x: x["question"])
            libfile.writeExcel(items,
                               ["label", "question", "answers", "source"],
                               filename)
Esempio n. 7
0
    def init_xianer7w_rewrite(self):
        dataset_index = "xianer7w_rewrite"
        gcounter[dataset_index] = 1

        ids = set()

        filename = getLocalFile("raw/rewrite/xianer7w_rewrite_map.xlsx")
        print filename

        gcounter["files"] += 1
        ret = libfile.readExcel(["question", "old_answers", "answers"],
                                filename,
                                start_row=0)

        #collect answer mapping
        map_answers = {}
        for item in ret.values()[0]:
            if item.get("old_answers"):
                a_old = item["old_answers"].strip()
            if item.get("answers"):
                a = item["answers"].strip()

            if a and a_old:
                map_answers[a_old] = a

        print len(map_answers)

        filename = getLocalFile("raw/rewrite/xianer7w_rewrite.xlsx")
        print filename

        gcounter["files"] += 1
        ret = libfile.readExcel(["question", "old_answers", "answers"],
                                filename,
                                start_row=0)

        #use mapping
        items = []
        for item in ret.values()[0]:
            gcounter["items"] += 1
            q = item["question"]
            if item["old_answers"]:
                a = map_answers.get(item["old_answers"])
            else:
                a = ""

            if not a:
                #print "SKIP no mapping", q, item["old_answers"]
                gcounter["items_no_mapping"] += 1
                continue

            qa = q + a
            item["id"] = es_api.gen_es_id(q)
            if item["id"] in ids:
                gcounter["items_skip_dup"] += 1
                continue

            skip_words = self.api_nlp.detect_skip_words(
                qa, check_list=["skip_words_all"])
            if skip_words:
                print "SKIP", u"/".join(
                    skip_words), "\t---\t", item["question"], "\t---\t", a
                gcounter["items_skip_minganci"] += 1
                continue

            ids.add(item["id"])
            item_new = {
                "question": q,
                "answers": a,
                "id": item["id"],
            }
            items.append(item_new)

        gcounter["qa0708rewrite"] = len(ids)

        filename = getLocalFile("temp/qa0708rewrite.xls")
        libfile.writeExcel(items, ["label", "question", "answers"], filename)
Esempio n. 8
0
    def _merge_chat(self, filenames, option):
        filename_todo = getLocalFile("input/{}_todo.txt".format(option))
        print "filename_todo", filename_todo
        q_todo = set()
        if os.path.exists(filename_todo):
            q_todo = libfile.file2set(filename_todo)
            gcounter["q_todo"] = len(q_todo)
            print "filename_todo", filename_todo, len(q_todo)

        filename_skip = getLocalFile("input/{}_skip.txt".format(option))
        print "filename_skip", filename_skip
        q_skip = set()
        if os.path.exists(filename_skip):
            q_skip = libfile.file2set(filename_skip)
            gcounter["q_skip"] = len(q_skip)
            print "filename_skip", filename_skip, len(q_skip)

        data = {}
        q_all = set()
        for filename in filenames:
            #print filename
            gcounter["files"] += 1
            ret = libfile.readExcel(["category", "question", "answers"],
                                    filename,
                                    start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1

                        q_all.add(item["question"])

                        if q_skip and item["question"] in q_skip:
                            gcounter["items_skip"] += 1
                            continue

                        item["id"] = es_api.gen_es_id(item["question"] +
                                                      item["answers"])
                        if item["id"] in data:
                            continue

                        for dataset_index in ["chat8cmu6w", "chat8xianer12w"]:
                            if dataset_index in filename:
                                gcounter["from_" + dataset_index] += 1

                        label = self.filter_qa_by_label(
                            item["category"], item["question"],
                            item["answers"])
                        if label:
                            item["label"] = label
                            #print "SKIP", label, "\t---\t", item["question"], "\t---\t", item["answers"]
                            #gcounter["_esdata_label_{}".format(label)]+=1
                        #elif not self.api_nlp.is_question_baike(item["question"]):
                        #    item["label"] = u"百科"
                        else:
                            item["label"] = u""
                        xlabel = re.sub(":.*$", "", item["label"])
                        gcounter["data_with_label_{}".format(xlabel)] += 1

                        data[item["id"]] = item
                        item_new = {}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]

        gcounter["data"] = len(data)
        results = sorted(data.values(), key=lambda x: x["question"])
        print len(data), len(results)
        filename_output = getLocalFile("output/edit_{}.xls".format(option))
        libfile.writeExcel(results, ["label", "question", "answers"],
                           filename_output)

        filename_output = getLocalFile(
            "edit0623/sample1000_edit_{}.xls".format(option))
        libfile.writeExcel(libdata.items2sample(data.values(), limit=1000),
                           ["label", "question", "answers"], filename_output)

        if q_todo:
            q_todo.difference_update(q_all)
            filename_output = getLocalFile(
                "edit0623/question_miss_{}.xls".format(option))
            libfile.lines2file(sorted(list(q_todo)), filename_output)
            gcounter["q_all"] = len(q_all)
            gcounter["q_miss"] = len(q_todo)

        page_size = 2000
        max_page = len(results) / page_size + 1
        for i in range(max_page):
            filename_output = getLocalFile("edit0623/edit_{}_{}.xls".format(
                option, i))
            #print filename_output
            idx_start = i * page_size
            idx_end = min(len(results), (i + 1) * page_size)
            libfile.writeExcel(results[idx_start:idx_end],
                               ["label", "question", "answers"],
                               filename_output)