Example #1
0
def process(url, parameter, manager, *args, **kwargs):
    """Fetch a zhidao question page, cache its JSON, and enqueue answer URLs.

    :param url: question page URL, e.g. http://zhidao.baidu.com/question/<qid>.html
    :param parameter: colon-separated "method:gap:js:data" crawl config string.
    :param manager: queue manager exposing put_urls_enqueue(batch_id, urls).
    :returns: cache-post flag (truthy) on success, False on any failure.
    """
    method, gap, js, data = parameter.split(':')
    gap = int(gap)
    batch_id = BATCH_ID['question']

    # Up to two download attempts.  Sleep only *between* attempts -- the
    # original slept after the final failure too, wasting `gap` seconds
    # before giving up.
    content = u''
    for attempt in range(2):
        if attempt:
            time.sleep(gap)
        content = get_zhidao_content(url, method, gap, HEADER, batch_id)
        if content != u'':
            break
    else:
        return False

    answer_ids = []
    question_content = generate_question_json(content, answer_ids)
    if question_content is None:
        return False

    # Post parsed JSON to the cache, retrying once on failure.
    m = Cache(BATCH_ID['json'])
    flag = m.post(url, question_content)
    if not flag:
        flag = m.post(url, question_content)
    if not flag:
        return flag

    # Raw string with escaped dots: the original pattern let '.' match any
    # character in the host/extension.
    qid = re.search(
        r'http://zhidao\.baidu\.com/question/(\d+)\.html', url).group(1)
    # Enqueue the top-3 answers for the answer crawler.
    answer_urls = [get_answer_url(qid, answer_id)
                   for answer_id in answer_ids[:3]]
    manager.put_urls_enqueue(BATCH_ID['answer'], answer_urls)

    return flag
Example #2
0
 def __init__(self, config):
     """Store config and build the counter, JSON cache, and downloader.

     ``config`` must provide ``batch_ids["json"]``, ``cache_server`` and
     ``http_headers`` entries.
     """
     print config, "-----"
     self.config = config
     # Running crawl statistics.
     self.counter = collections.Counter()
     # Cache for parsed JSON results, keyed by the "json" batch id.
     self.cache = Cache(self.config["batch_ids"]["json"],
                        self.config["cache_server"])
     self.downloader = DownloadWrapper(self.config["cache_server"],
                                       self.config["http_headers"])
Example #3
0
def process(url, batch_id, parameter, *args, **kwargs):
    """Download one page through a lazily built, function-cached downloader.

    :param url: page URL to fetch.
    :param batch_id: crawl batch id; its "<prefix>-json" variant names the cache batch.
    :param parameter: colon-separated "method:gap:js:timeout:data" config string.
    :returns: True if the download succeeded, False otherwise.
    """
    # Build the downloader/cache once and memoize them on the function
    # object so later calls reuse the same instances.
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)
        # e.g. {'Host': 'zhidao.baidu.com'}
        headers = {"Host": domain_name}
        setattr(process, '_downloader', DownloadWrapper(CACHE_SERVER, headers))
    if not hasattr(process, '_cache'):
        setattr(process, '_cache',
                Cache(batch_id.split('-', 1)[0] + '-json', CACHE_SERVER))

    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)

    content = process._downloader.downloader_wrapper(url,
                                                     batch_id,
                                                     gap,
                                                     timeout=timeout)

    # Bug fix: check for failure BEFORE the debug print -- the original
    # called len(content) first, which raises TypeError when content is False.
    if content is False:
        return False

    # kwargs is always a dict here, so `kwargs and` was redundant.
    if kwargs.get("debug"):
        print(len(content), "\n", content[:1000])

    return True
Example #4
0
def process(url, parameter, *args, **kwargs):
    """Fetch a zhidao answer page, parse it, and post the JSON to the cache.

    :param url: answer URL to fetch.
    :param parameter: colon-separated "method:gap:js:data" crawl config string.
    :returns: cache-post flag on success, False on download/parse failure.
    """
    method, gap, js, data = parameter.split(':')
    gap = int(gap)
    # Bug fix: the original used `content is u''`, an identity test that
    # only works because CPython interns the empty string; compare by
    # equality instead.
    content = get_zhidao_content(url, method, gap, HEADER, BATCH_ID['answer'])
    if content == u'':
        time.sleep(gap)
        content = get_zhidao_content(
            url, method, gap, HEADER, BATCH_ID['answer'])
    if content == u'':
        return False
    ans_content = generate_answer_json(content)
    if ans_content is None:
        return False
    # Post to the JSON cache, retrying once on failure.
    m = Cache(BATCH_ID['json'])
    flag = m.post(url, ans_content)
    if not flag:
        flag = m.post(url, ans_content)
    return flag
Example #5
0
class ZhidaoPrefetch(object):
    """Prefetch zhidao.baidu.com search results, questions and answers.

    Driven by a ``config`` dict supplying ``batch_ids`` (question/answer/
    search/json), ``cache_server``, ``http_headers`` and ``crawler``
    (gap/timeout/encoding) settings.  Python 2 code (print statements,
    ``unicode``).
    """

    def __init__(self, config):
        print config, "-----"
        self.config = config
        # Running crawl statistics (queries, pages, good/total questions).
        self.counter = collections.Counter()
        # Cache for parsed JSON results.
        self.cache = Cache(self.config["batch_ids"]["json"],
                           self.config["cache_server"])
        self.downloader = DownloadWrapper(self.config["cache_server"],
                                          self.config["http_headers"])

    def is_debug(self):
        # Debug flag defaults to False when absent from the config.
        return self.config.get("debug", False)

    def zhidao_results(self, qids):
        """Fetch each question in ``qids`` plus its first three answers.

        Failed questions are skipped entirely; failed answers are skipped
        within their question.  Returns a list of question dicts, each
        with a ``list_answers`` list attached.
        """
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid)
            if q_json is False:
                continue
            q_json["list_answers"] = []

            for rid in q_json["answer_ids"][:3]:
                a_json = self.zhidao_answer(qid, rid)
                if a_json is False:
                    continue
                q_json["list_answers"].append(a_json)

            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid):
        """Download and parse one question page; return dict or False."""
        question_url = "http://zhidao.baidu.com/question/{}.html".format(qid)
        if self.is_debug():
            print question_url
        ret = self.downloader.downloader_wrapper(
            question_url,
            self.config["batch_ids"]["question"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None or q_json == {}:
            return False
        # NOTE(review): cache-post result is ignored; a failed post is silent.
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid):
        """Download and parse one answer via the mini API; dict or False."""
        answer_url = ("http://zhidao.baidu.com/question/api/mini?qid={}"
                      "&rid={}&tag=timeliness".format(qid, rid))

        #print self.config["crawler"]
        if self.is_debug():
            print answer_url
        ret = self.downloader.downloader_wrapper(
            answer_url,
            self.config["batch_ids"]["answer"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
        except:
            return False

        # NOTE(review): cache-post result is ignored; a failed post is silent.
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, query, page_number=None, start_result_index=0):
        """Run one search-result page for ``query``.

        Returns the parsed item list (offset by ``start_result_index``)
        or False on download failure.
        """
        if isinstance(query, unicode):
            query = query.encode("utf-8")

        # Page 0 uses the bare search URL; later pages paginate via pn=N*10.
        if page_number is None or page_number == 0:
            query_url = "http://zhidao.baidu.com/search/?word={}".format(
                urllib.quote(query))
        else:
            query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format(
                page_number * 10, urllib.quote(query))
        if self.is_debug():
            print query_url
        # query_url = "http://zhidao.baidu.com/search?word={}".format(quote_word)

        #print query
        #print query_url
        ret = self.downloader.downloader_wrapper(
            query_url,
            self.config["batch_ids"]["search"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"],
            refresh=False)
        # resp.headers: "content-type": "text/html;charset=UTF-8",
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        else:
            return parse_search_json_v0615(
                ret, start_result_index=start_result_index)

    def run_query(self, query, max_page):
        """Search ``query`` across ``max_page`` pages, collecting all items.

        Tallies counters, records "good" question ids (recommended or
        >= 3 likes), and returns every parsed item.
        """
        self.counter["query"] += 1
        qids_select = set()
        result_all = []
        for page_number in range(max_page):
            print "==== page ", page_number, query
            self.counter["page"] += 1

            result_local = self.zhidao_search(query, page_number,
                                              len(result_all))
            #print json.dumps( result_local, ensure_ascii=False, indent=4, sort_keys=True)
            # NOTE(review): zhidao_search may return False on failure;
            # extend(False) would raise TypeError -- assumes success here.
            result_all.extend(result_local)
            self.counter["q_total"] += len(result_local)

            for item in result_local:
                item["query"] = query
                if type(query) != unicode:
                    item["query"] = query.decode("utf-8")
                #print item
                if item["source"] == "recommend" or (item["cnt_like"] >= 3):
                    self.counter["q_good"] += 1
                    qids_select.add(item["question_id"])
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question'], "<----", item[
                            'answers']
                else:
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question']
            print datetime.datetime.now().isoformat(), self.counter
        return result_all
        #qajson = self.zhidao_results(qids_select)
        #print json.dumps(qajson, ensure_ascii=False, indent=4)

    def run_query_entity(self):
        """Run a 10-page query for each entity in seed_entity.human.txt."""
        filename = getTheFile("seed_entity.human.txt")
        with codecs.open(filename) as f:
            for line in f:
                # Skip comment lines and blanks.
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue

                self.run_query(line, 10)

    def run_query_batch(self, filename, limit):
        """Run a ``limit``-page query for each line of ``filename``."""
        with codecs.open(filename) as f:
            for line in f:
                # Skip comment lines and blanks.
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                self.run_query(line, limit)

    def run_gen_url_search_realtime(self, filename):
        """Generate deduplicated search URLs for each input line and save them."""
        lines = libfile.file2list(filename)
        visited = set()
        for line in sorted(lines):
            for query_parser in [0]:
                query_url, qword = zhidao_fetch.get_search_url_qword(
                    line, query_parser=query_parser)

                if query_url in visited:
                    continue
                visited.add(query_url)
                print qword, query_url

        print len(visited)
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "_urls.txt")))
        libfile.lines2file(sorted(list(visited)), filename_output)

    def run_test_search_realtime(self, filename, limit):
        """Run queries from ``filename`` and dump all items to an Excel file."""
        results = []
        counter = collections.Counter()

        with codecs.open(filename) as f:
            for line in f:
                # Skip comment lines and blanks.
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                ret = self.run_query(line, limit)
                counter["query"] += 1
                for item in ret:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    # Normalize text fields to unicode for the Excel writer.
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter

    def run_get_best_search_realtime(self, filename):
        """For each input line, fetch the single best Q/A pair and dump to Excel."""
        results = []
        counter = collections.Counter()

        lines = libfile.file2list(filename)
        for query_parser in [0]:
            for line in sorted(lines):
                cnt_label = "query_{}".format(query_parser)
                # Progress heartbeat every 10 queries.
                if counter[cnt_label] % 10 == 0:
                    print datetime.datetime.now().isoformat(
                    ), counter[cnt_label], line
                counter[cnt_label] += 1

                ret_one = search_zhidao_best(line,
                                             query_filter=0,
                                             query_parser=query_parser)
                if ret_one:
                    item = ret_one["best_qapair"]

                    print "=====>", line
                    print "------", item["match_score"], item["question"]
                    print item["answers"], "*******", item["answers_raw"][
                        len(item["answers"]):]

                    for p in ["query"]:
                        item[p] = ret_one[p]
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    # Normalize text fields to unicode for the Excel writer.
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter
Example #6
0
 def __init__(self, cacheserver):
     """Build the JSON cache and a downloader pinned to the zhidao host."""
     self.cache = Cache(BATCH_ID['json'], cacheserver)
     # Host header is fixed since this scheduler only crawls zhidao.baidu.com.
     self.downloader = DownloadWrapper(cacheserver,
                                       {'Host': 'zhidao.baidu.com'})
Example #7
0
class Scheduler(object):
    """Crawl zhidao.baidu.com searches, questions and answers through a
    cache-backed downloader.

    All download results are posted to ``Cache(BATCH_ID['json'], ...)``;
    methods return parsed dicts/lists on success and False on failure.
    """

    def __init__(self, cacheserver):
        """Build the JSON cache and a downloader pinned to the zhidao host."""
        self.cache = Cache(BATCH_ID['json'], cacheserver)
        self.downloader = DownloadWrapper(cacheserver,
                                          {'Host': 'zhidao.baidu.com'})

    @classmethod
    def instance(cls, *args):
        """Return a lazily created, process-wide singleton instance."""
        if not hasattr(cls, '_instance'):
            setattr(cls, '_instance', cls(*args))
        return cls._instance

    def zhidao_results(self, qids, gap, timeout=10):
        """Fetch each question in ``qids`` plus its first three answers.

        Failed questions are skipped; failed answers are skipped within
        their question.  Returns a list of question dicts with a
        ``list_answers`` list attached.
        """
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid, gap, timeout)
            if q_json is False:
                continue
            q_json['list_answers'] = []

            for rid in q_json['answer_ids'][:3]:
                a_json = self.zhidao_answer(qid, rid, gap, timeout)
                if a_json is False:
                    continue
                q_json['list_answers'].append(a_json)

            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid, gap, timeout):
        """Download and parse one question page; return dict or False."""
        question_url = 'http://zhidao.baidu.com/question/{}.html'.format(qid)
        ret = self.downloader.downloader_wrapper(question_url,
                                                 BATCH_ID['question'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 error_check=True)
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None:
            return False
        # Best-effort cache write; a failed post does not invalidate the result.
        self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid, gap, timeout):
        """Download and parse one answer via the mini API; dict or False."""
        answer_url = ('http://zhidao.baidu.com/question/api/mini?qid={}'
                      '&rid={}&tag=timeliness'.format(qid, rid))

        ret = self.downloader.downloader_wrapper(answer_url,
                                                 BATCH_ID['answer'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030')
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            return False

        # Best-effort cache write; a failed post does not invalidate the result.
        self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, qword, batch_id, gap=3, timeout=10, refresh=True):
        """Search for ``qword``; return parsed question links or False."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)

        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        return zhidao_search_questions(ret)

    def zhidao_search_list_json(self,
                                qword,
                                batch_id,
                                gap=3,
                                timeout=10,
                                refresh=False):
        """Search for ``qword``; return parsed result items (with a
        ``query`` field attached) or False on download failure."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)

        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False

        search_result_json = parse_search_json_v0615(ret)
        for item in search_result_json:
            item["query"] = qword
            if type(qword) != unicode:
                item["query"] = qword.decode("utf-8")

        return search_result_json

    def zhidao_search_select_best(self, qword, gap=3, timeout=2):
        """Return the first recommended search item, or False if none."""
        search_result_json = self.zhidao_search_list_json(
            qword, BATCH_ID['search'], gap, timeout)
        # Bug fix: a failed search returns False; iterating it would raise
        # TypeError, so bail out explicitly.
        if search_result_json is False:
            return False

        # Pick the first item flagged as recommended.
        for item in search_result_json:
            if item["is_recommend"] == 1:
                return item

        return False

    def zhidao_search_select_best_qids(self, qword, gap=3, timeout=2):
        """Return a (possibly empty) list holding the best question id."""
        ret = self.zhidao_search_select_best(qword, gap, timeout)
        if ret:
            return [ret["question_id"]]
        return []

    def run(self, qword, gap=3, timeout=10):
        """Search ``qword`` and return full Q/A JSON for the best question."""
        # qids = self.zhidao_search(qword, BATCH_ID['search'], gap, timeout)
        qids = self.zhidao_search_select_best_qids(qword, gap, timeout)
        return self.zhidao_results(qids, gap, timeout)
Example #8
0
# -*- coding: utf-8 -*-
from downloader.cache import Cache
import sys
import re
import requests
reload(sys)
sys.setdefaultencoding('utf-8')
BATCH_ID = 'dongfang-201606test'
m = Cache(BATCH_ID, 'http://192.168.1.179:8000/')
url = 'http://data.eastmoney.com/Notice/Noticelist.aspx?type=0&market=all&date=&page=33333'
content = m.get(url)
#print requests.get(url).text
print '更多公告' in content
OK = 0
NO = 0
for index in range(1, 2000):
    url = 'http://data.eastmoney.com/Notice/Noticelist.aspx?type=0&market=all&date=&page={}'.format(
        index)
    content = m.get(url)
    if content:
        if '更多公告' in content:
            OK += 1
        else:
            NO += 1
print NO, OK
#检测指定区间内正常网页和显示不完全网页的对比
#所谓的显示不完全表现为网页爬取正常,但是列表框内没有那50个公告列表以及连接