Code Example #1
File: task_run.py  Project: Justinyj/ruyiwebcrawl
def read_kidsfaq2w(limit=10):
    # One-time preprocessing kept for reference: build the query file from
    # the raw JSON dump, dropping questions that contain "@".
    # filename = getLocalFile(KIDS_2W_FILENAME)
    # list_json = libfile.file2list(filename)
    # list_query = []
    # for item in list_json:
    #     item = json.loads(item)
    #     q = item["_source"]["question"]
    #     if "@" not in q:
    #         list_query.append(q)
    # libfile.lines2file(list_query, getLocalFile(KIDS_2W_QUERY_FILENAME))
    list_query = libfile.file2list(getLocalFile(KIDS_2W_QUERY_FILENAME))
    print "Length of kidsfaq2w ", len(list_query)

    random.shuffle(list_query)
    # Slicing clamps automatically, so no explicit length check is needed.
    return list_query[:limit]
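Every example on this page goes through the project's libfile helpers, whose implementations are not shown. The sketch below is a guess at what file2list and lines2file do, inferred only from how they are called in these examples; the actual ruyiwebcrawl code may differ.

# Hypothetical sketch of the libfile helpers (assumed behavior, not the
# project's real code): file2list reads stripped non-empty lines,
# lines2file writes one item per line.
import codecs

def file2list(filename, encoding="utf-8"):
    # Return the stripped, non-empty lines of a text file.
    with codecs.open(filename, "r", encoding=encoding) as f:
        return [line.strip() for line in f if line.strip()]

def lines2file(lines, filename, encoding="utf-8"):
    # Write an iterable of strings to a file, one per line.
    with codecs.open(filename, "w", encoding=encoding) as f:
        for line in lines:
            f.write(u"{}\n".format(line))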
Code Example #2
File: task_batch.py  Project: Justinyj/ruyiwebcrawl
    def run_gen_url_search_realtime(self, filename):
        # Collect the unique Zhidao search URLs generated for each query
        # line, then write them sorted to a *_urls.txt file.
        lines = libfile.file2list(filename)
        visited = set()
        for line in sorted(lines):
            for query_parser in [0]:
                query_url, qword = zhidao_fetch.get_search_url_qword(
                    line, query_parser=query_parser)

                if query_url in visited:
                    continue
                visited.add(query_url)
                print qword, query_url

        print len(visited)
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "_urls.txt")))
        libfile.lines2file(sorted(list(visited)), filename_output)
Code Example #3
    def init_from_json(self):
        map_items = {}
        dirname = getLocalFile("raw/chat0708/*")
        for filename in glob.glob(dirname):
            src = os.path.basename(filename).replace(".txt", "")
            for line in libfile.file2list(filename):
                gcounter["total_" + src] += 1
                item = json.loads(line)
                item["source"] = src

                item["answers"] = clean_answer(item["answers"])
                item["question"] = clean_question(item["question"])
                if len(item["answers"]) < 2:
                    gcounter["items_skip_empty_answer"] += 1
                    continue

                label = ""
                if not label:
                    label = self.api_nlp.detect_skip_words(item["answers"])
                    if label:
                        label = u"敏感词:{}".format(u",".join(label))
                        print label, item["answers"]
                        gcounter["minganci_answer"] += 1
                    else:
                        label = ""
                if not label:
                    if re.search("^[0-9\-]$", item["answers"]):
                        label = "number"
                item["label"] = label

                q = item["question"]
                if q not in map_items:
                    map_items[q] = item
                    gcounter["from_" + src] += 1
                else:
                    map_items[q] = item
                    gcounter["overwrite_" + src] += 1
                    print "overwrite", q, src, map_items[q]["answers"], item[
                        "answers"]

        gcounter["init_from_json"] = len(map_items)

        filename = getLocalFile("temp/qa0708chat.xls")
        items = sorted(map_items.values(), key=lambda x: x["question"])
        libfile.writeExcel(items, ["label", "question", "answers", "source"],
                           filename)
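Both this example and Code Example #8 call self.api_nlp.detect_skip_words to flag text containing blacklisted ("敏感词", sensitive) words. The helper itself is not shown; judging from the u",".join(label) call above, it returns a list of matched words. A minimal stand-in under that assumption:

# Hypothetical stand-in for api_nlp.detect_skip_words: return the
# blacklist words found in the text (empty list when clean). The word
# list here is a placeholder; the real blacklist is project data.
SKIP_WORDS = [u"placeholder_word"]

def detect_skip_words(text):
    if not isinstance(text, unicode):
        text = text.decode("utf-8", "ignore")
    return [w for w in SKIP_WORDS if w in text]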
Code Example #4
File: task_batch.py  Project: Justinyj/ruyiwebcrawl
    def run_get_best_search_realtime(self, filename):
        results = []
        counter = collections.Counter()

        lines = libfile.file2list(filename)
        for query_parser in [0]:
            for line in sorted(lines):
                cnt_label = "query_{}".format(query_parser)
                # Log progress every 10 queries.
                if counter[cnt_label] % 10 == 0:
                    print datetime.datetime.now().isoformat(), counter[cnt_label], line
                counter[cnt_label] += 1

                ret_one = search_zhidao_best(line,
                                             query_filter=0,
                                             query_parser=query_parser)
                if ret_one:
                    item = ret_one["best_qapair"]

                    print "=====>", line
                    print "------", item["match_score"], item["question"]
                    print item["answers"], "*******", item["answers_raw"][
                        len(item["answers"]):]

                    for p in ["query"]:
                        item[p] = ret_one[p]
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter
Code Example #5
    def index_xianer12w_test(self):
        dataset_index = "xianer12w_test"
        filename = getLocalFile("input/chat8xianer12w.txt")
        visited = set()
        for line in libfile.file2list(filename):

            if line in visited:
                continue

            visited.add(line)
            gcounter["lines"] += 1
            item = {
                "question": line,
                "answers": u"无语",   # placeholder answer ("speechless")
                "id": es_api.gen_es_id(line)
            }

            self.upload(dataset_index, item)
        # A final call without an item presumably flushes any buffered docs.
        self.upload(dataset_index)
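es_api.gen_es_id derives the Elasticsearch document id from the question text, so re-running the indexer overwrites existing documents instead of duplicating them. Its implementation is not shown on this page; a plausible minimal version is a content hash (an assumption, not the project's code):

# Hypothetical stand-in for es_api.gen_es_id: a deterministic id from
# the text content, so identical questions map to the same document.
import hashlib

def gen_es_id(text):
    if isinstance(text, unicode):
        text = text.encode("utf-8")
    return hashlib.md5(text).hexdigest()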
Code Example #6
File: task_run.py  Project: Justinyj/ruyiwebcrawl
def clean_cmu():
    dirname = getLocalFile("raw/cmu/*.txt")
    #print dirname
    lines = set()
    seq = []
    counter = collections.Counter()
    for filename in glob.glob(dirname):
        counter["files"] += 1
        for line in libfile.file2list(filename):
            zhstr = libdata.extract_zh(line)
            counter["lines"] += 1
            # Keep only lines that yield at least two Chinese characters.
            if zhstr and len(zhstr) > 1:
                counter["occurs"] += 1
                #print zhstr
                seq.append(zhstr)
                lines.add(zhstr)

    print len(lines)
    filename_output = getLocalFile("output/cmu6w.txt")
    libfile.lines2file(sorted(list(lines)), filename_output)

    print len(seq)
    filename_output = getLocalFile("output/cmu6w_seq.txt")
    libfile.lines2file(seq, filename_output)
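clean_cmu relies on libdata.extract_zh to pull the Chinese text out of each line. A minimal sketch using the CJK Unified Ideographs range could look like the following; this is an assumption about the helper, not the project's implementation.

# Hypothetical sketch of libdata.extract_zh: keep only the Chinese
# characters of a line (CJK Unified Ideographs block only).
import re

ZH_PATTERN = re.compile(u"[\u4e00-\u9fff]+")

def extract_zh(line):
    if not isinstance(line, unicode):
        line = line.decode("utf-8", "ignore")
    return u"".join(ZH_PATTERN.findall(line))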
Code Example #7
File: task_run.py  Project: Justinyj/ruyiwebcrawl
def fetch_detail(worker_id=None, worker_num=None, limit=None, config_index="prod", filename_input=None, fetch_option="top_n", fetch_limit=100):
    flag_batch = (worker_id is not None and worker_num is not None and worker_num>1)
    flag_prod = (config_index == "prod")
    flag_slack = (flag_prod and worker_id == 0)

    job_name = os.path.basename(filename_input).replace(".txt","")
    output_dir = "output0623"
    if flag_batch:
        filename_output_xls = getLocalFile("{}/{}.{}_worker.xls".format(output_dir, job_name, worker_id))
        filename_output_xls2 = getLocalFile("{}/{}.{}_worker_query.xls".format(output_dir, job_name, worker_id))
        filename_output_json = getLocalFile("{}/{}.{}_worker.json.txt".format(output_dir, job_name, worker_id))
    else:
        filename_output_xls = getLocalFile("{}/{}.batch_{}.all.xls".format(output_dir, job_name, config_index))
        filename_output_xls2 = getLocalFile("{}/{}.batch_{}.all_query.xls".format(output_dir, job_name, config_index))
        filename_output_json = getLocalFile("{}/{}.batch_{}.all.json.txt".format(output_dir, job_name, config_index))

    CONFIG = {
        "local": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 3,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",  # "Zhidao search, chit-chat"
            "debug": True,
            # "cache_server": "http://52.196.166.54:8000"  # internal IP
            "cache_server": "http://192.168.1.179:8000"
        },
        "prod": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 5,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",  # "Zhidao search, chit-chat"
            "debug": False,
        }
    }
    if not filename_input:
        print "FATAL: filename_input is required"
        return
    print filename_input
    list_query = libfile.file2list(filename_input)
    print "Length of input queries", len(list_query)

    config = CONFIG[config_index]
    #config = {}
    api = ZhidaoFetch(config)

    ts_start = time.time()
    ts_lap_start = time.time()
    counter = collections.Counter()
    if limit and limit < len(list_query):
        # Sample `limit` evenly spaced queries. The original computed the
        # step even when limit >= len(list_query), which made step 0 and
        # duplicated the first query `limit` times.
        step = len(list_query) // limit
        list_query = [list_query[i * step] for i in range(limit)]
    print len(list_query)

    if flag_slack:
        slack_msg(u"AWS {}/{}. run {} batch_id: {}, urls: {} debug: {}".format(
            worker_id,
            worker_num,
            config["note"],
            config["batch_id"],
            len(list_query),
            config.get("debug", False)))

    results = []
    # encoding added: the JSON lines contain non-ASCII text (ensure_ascii=False)
    with codecs.open(filename_output_json, 'w', encoding='utf-8') as fjson:
        for query in list_query:

            if counter["visited"] % 1000 == 0:
                print datetime.datetime.now().isoformat(), counter
            counter["visited"] += 1
            if flag_batch:
                # Round-robin sharding: this worker only processes lines
                # where visited % worker_num == worker_id.
                if (counter["visited"] % worker_num) != worker_id:
                    counter["skipped_peer"] += 1
                    continue


            counter["processed"]+=1
            if counter["processed"] % 1000 == 0:
                if flag_slack:
                    slack_msg( "AWS {}/{}. working {}. lap {} seconds. {}".format(
                            worker_id,
                            worker_num,
                            config["batch_id"],
                            int( time.time() - ts_lap_start ),
                            json.dumps(counter) ))
                    ts_lap_start = time.time()

            if "search_all" == fetch_option:
                ret = api.search_all(query, limit = fetch_limit)
            else:
                ret = fn_fetch(query )

            if ret:
                ret["query"] = query
                fjson.write(u"{}\n".format(json.dumps(ret, ensure_ascii=False)))

            if ret and ret.get("items"):
                counter["has_result"] += 1
                counter["total_qa"] += len(ret["items"])
                if config.get("debug"):
                    print len(ret["items"]), json.dumps(ret, ensure_ascii=False)
                for item in ret["items"]:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    item["query"] = query
                    for p in ["source"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
            else:
                counter["missing_data"] += 1

    #libfile.writeExcel(results, [ "id", "source", "result_index", "cnt_like",  "cnt_answer", "query", "question_id", "question", "answers"], filename_output_xls)
    #libfile.writeExcel(results, [ "id","is_good", "match_score", "result_index", "cnt_like",  "cnt_answer", "query", "question", "answers"], filename_output_xls, page_size=5000)
    #print filename_output_xls
#    libfile.writeExcel(results, [ "label","query", "answers", "match_score", "question"], filename_output_xls)
    libfile.writeExcel(results, [ "label","question", "answers"], filename_output_xls)
    libfile.writeExcel(results, [ "label","query", "answers", "match_score", "question"], filename_output_xls2)


    duration_sec = int(time.time() - ts_start)
    # Guard against a zero count when the input list was empty.
    avg_sec = duration_sec / counter["visited"] if counter["visited"] else 0
    print "all done, seconds", duration_sec, avg_sec, counter

    if flag_slack:
        slack_msg("AWS {}/{}. done {}. total {} seconds".format(
            worker_id,
            worker_num,
            config["batch_id"],
            duration_sec))
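fetch_detail spreads work across processes with a simple modulo rule: every worker reads the full query list but only processes the lines where the running count modulo worker_num equals its worker_id. A small illustration of the same round-robin sharding (illustrative only, not project code):

# Round-robin sharding as used in fetch_detail: worker k of n handles
# every n-th query.
def shard(queries, worker_id, worker_num):
    for i, query in enumerate(queries):
        if i % worker_num == worker_id:
            yield query

print list(shard(range(10), 1, 3))   # -> [1, 4, 7]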
Code Example #8
    def test(self, dataset_index="chat8xianer12w", option="query"):
        filename_todo = getLocalFile("input/{}_todo.txt".format(option))
        print "filename_todo", filename_todo
        q_todo = set()
        if os.path.exists(filename_todo):
            q_todo = libfile.file2set(filename_todo)
            gcounter["q_todo"] = len(q_todo)
            print "filename_todo", filename_todo, len(q_todo)

        filename_skip = getLocalFile("input/{}_skip.txt".format(option))
        print "filename_skip", filename_skip
        q_skip = set()
        if os.path.exists(filename_skip):
            q_skip = libfile.file2set(filename_skip)
            gcounter["q_skip"] = len(q_skip)
            print "filename_skip", filename_skip, len(q_skip)

        data = {}
        q_all = set()
        dirname = getLocalFile(
            "output0623/{}*worker*json.txt".format(dataset_index))

        for filename in glob.glob(dirname):
            print filename
            gcounter["files"] += 1

            for line in libfile.file2list(filename):
                entry = json.loads(line)
                query = entry["query"]

                #print entry.keys()
                if "items_all" not in entry:
                    gcounter["selected_no_data"] += 1
                    continue
                elif len(entry["items_all"]) == 0:
                    gcounter["selected_no_item"] += 1
                    continue

                if q_skip and query in q_skip:
                    gcounter["items_skip"] += 1
                    q_all.add(query)
                    continue

                if self.api_nlp.detect_skip_words(query):
                    gcounter["selected_query_skipwords"] += 1
                    q_all.add(query)
                    continue

                items_select = self.api_nlp.select_qapair_0624(
                    query, entry["items_all"])
                if items_select:
                    gcounter["selected_yes"] += 1
                    q_all.add(query)
                else:
                    gcounter["selected_no"] += 1
                    # Skip here: iterating below would crash if
                    # select_qapair_0624 returned None.
                    continue

                for item in items_select:
                    item["id"] = es_api.gen_es_id(item["question"] +
                                                  item["answers"])
                    if item["id"] in data:
                        continue

                    label = self.filter_qa_by_label("", item["question"],
                                                    item["answers"])
                    if label:
                        item["label"] = label
                    else:
                        item["label"] = u""
                    # Count labels by their prefix before ":".
                    xlabel = re.sub(":.*$", "", item["label"])
                    gcounter["data_with_label_{}".format(xlabel)] += 1
                    gcounter["items"] += 1

                    data[item["id"]] = item
                #ret = libfile.readExcel(["category","question","answers"], filename, start_row=1)

        if q_todo:
            q_todo.difference_update(q_all)
            filename_output = getLocalFile(
                "edit0623/query_miss_{}.xls".format(option))
            # Note: despite the .xls extension, this writes plain text lines.
            libfile.lines2file(sorted(list(q_todo)), filename_output)
            gcounter["q_all"] = len(q_all)
            gcounter["q_miss"] = len(q_todo)