# --- Example 1 ---
def download_url(url):
    """Fetch ``url`` through the caching download proxy.

    Returns the page content decoded as gb2312 (the downloader may
    return False on failure -- per DownloadWrapper's use elsewhere in
    this file; TODO confirm against its implementation).
    """
    cacheserver = 'http://192.168.1.179:8000/'
    batch_id = 'dongfangcaifu-201606'
    wrapper = DownloadWrapper(cacheserver)
    # 0.5s gap between requests; eastmoney pages are gb2312-encoded
    return wrapper.downloader_wrapper(url, batch_id, 0.5, encoding='gb2312')
# --- Example 2 ---
 def process_request(self, request,  spider):
     """Scrapy downloader-middleware hook.

     Fetches ``request.url`` through the caching DownloadWrapper; on
     success short-circuits Scrapy with a ready-made HtmlResponse,
     otherwise returns None so normal downloading proceeds.
     """
     target = request.url
     wrapper = DownloadWrapper(SERVER)
     body = wrapper.downloader_wrapper(target, BATCH_ID, 2)
     if not body:
         return None
     return scrapy.http.response.html.HtmlResponse(
         target, encoding='utf-8', body=body)
# --- Example 3 ---
class ZhidaoPrefetch(object):
    """Prefetches Baidu Zhidao (zhidao.baidu.com) search pages, questions
    and answers through a caching download service, and exports selected
    question/answer items to Excel files.

    ``config`` is expected to carry (per the usages below):
      - "batch_ids": dict with "json", "question", "answer", "search" keys
      - "cache_server": cache server URL
      - "http_headers": HTTP headers passed to the downloader
      - "crawler": dict with "gap", "timeout", "encoding"
      - "debug" (optional): when true, URLs are printed before fetching
    """

    def __init__(self, config):
        print config, "-----"
        self.config = config
        # running totals: queries, pages, results, "good" results
        self.counter = collections.Counter()
        self.cache = Cache(self.config["batch_ids"]["json"],
                           self.config["cache_server"])
        self.downloader = DownloadWrapper(self.config["cache_server"],
                                          self.config["http_headers"])

    def is_debug(self):
        # Debug mode only toggles URL printing in the fetch methods.
        return self.config.get("debug", False)

    def zhidao_results(self, qids):
        """Fetch each question in ``qids`` plus up to three of its answers.

        Returns a list of question JSON dicts, each with a "list_answers"
        list attached; questions/answers that fail to download or parse
        are skipped silently.
        """
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid)
            if q_json is False:
                continue
            q_json["list_answers"] = []

            # keep at most the first three answers of each question
            for rid in q_json["answer_ids"][:3]:
                a_json = self.zhidao_answer(qid, rid)
                if a_json is False:
                    continue
                q_json["list_answers"].append(a_json)

            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid):
        """Download and parse one question page.

        Returns the question JSON dict, or False on download/parse failure.
        """
        question_url = "http://zhidao.baidu.com/question/{}.html".format(qid)
        if self.is_debug():
            print question_url
        ret = self.downloader.downloader_wrapper(
            question_url,
            self.config["batch_ids"]["question"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None or q_json == {}:
            return False
        # best-effort cache write; the result is ignored
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid):
        """Download and parse one answer via the mini API.

        Returns the answer JSON dict, or False on download/parse failure.
        """
        answer_url = ("http://zhidao.baidu.com/question/api/mini?qid={}"
                      "&rid={}&tag=timeliness".format(qid, rid))

        #print self.config["crawler"]
        if self.is_debug():
            print answer_url
        ret = self.downloader.downloader_wrapper(
            answer_url,
            self.config["batch_ids"]["answer"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:  # NOTE(review): bare except silently hides parse errors
            return False

        # best-effort cache write; the result is ignored
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, query, page_number=None, start_result_index=0):
        """Fetch one page of Zhidao search results for ``query``.

        ``page_number`` is 0-based (None is treated as page 0);
        ``start_result_index`` offsets the result indices assigned by the
        parser. Returns the parsed result list, or False on failure.
        """
        if isinstance(query, unicode):
            query = query.encode("utf-8")

        if page_number is None or page_number == 0:
            query_url = "http://zhidao.baidu.com/search/?word={}".format(
                urllib.quote(query))
        else:
            # pn is the result offset: 10 results per page
            query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format(
                page_number * 10, urllib.quote(query))
        if self.is_debug():
            print query_url
        # query_url = "http://zhidao.baidu.com/search?word={}".format(quote_word)

        #print query
        #print query_url
        ret = self.downloader.downloader_wrapper(
            query_url,
            self.config["batch_ids"]["search"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"],
            refresh=False)
        # resp.headers: "content-type": "text/html;charset=UTF-8",
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        else:
            return parse_search_json_v0615(
                ret, start_result_index=start_result_index)

    def run_query(self, query, max_page):
        """Search ``query`` across ``max_page`` result pages.

        Tags each item with the (unicode) query, counts "good" items
        (recommended source or >= 3 likes), and returns the accumulated
        result list.
        """
        self.counter["query"] += 1
        qids_select = set()
        result_all = []
        for page_number in range(max_page):
            print "==== page ", page_number, query
            self.counter["page"] += 1

            result_local = self.zhidao_search(query, page_number,
                                              len(result_all))
            # NOTE(review): zhidao_search may return False; extend() would
            # then raise TypeError -- confirm failure handling upstream.
            #print json.dumps( result_local, ensure_ascii=False, indent=4, sort_keys=True)
            result_all.extend(result_local)
            self.counter["q_total"] += len(result_local)

            for item in result_local:
                item["query"] = query
                if type(query) != unicode:
                    item["query"] = query.decode("utf-8")
                #print item
                if item["source"] == "recommend" or (item["cnt_like"] >= 3):
                    self.counter["q_good"] += 1
                    qids_select.add(item["question_id"])
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question'], "<----", item[
                            'answers']
                else:
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question']
            print datetime.datetime.now().isoformat(), self.counter
        return result_all
        #qajson = self.zhidao_results(qids_select)
        #print json.dumps(qajson, ensure_ascii=False, indent=4)

    def run_query_entity(self):
        """Run a 10-page query for every entity in seed_entity.human.txt
        ('#'-prefixed lines and blanks are skipped)."""
        filename = getTheFile("seed_entity.human.txt")
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue

                self.run_query(line, 10)

    def run_query_batch(self, filename, limit):
        """Run a ``limit``-page query for every line of ``filename``
        ('#'-prefixed lines and blanks are skipped)."""
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                self.run_query(line, limit)

    def run_gen_url_search_realtime(self, filename):
        """Generate the deduplicated set of search URLs for the queries in
        ``filename`` and write them, sorted, to a *_urls.txt sibling."""
        lines = libfile.file2list(filename)
        visited = set()
        for line in sorted(lines):
            # single parser variant for now; loop kept for future parsers
            for query_parser in [0]:
                query_url, qword = zhidao_fetch.get_search_url_qword(
                    line, query_parser=query_parser)

                if query_url in visited:
                    continue
                visited.add(query_url)
                print qword, query_url

        print len(visited)
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "_urls.txt")))
        libfile.lines2file(sorted(list(visited)), filename_output)

    def run_test_search_realtime(self, filename, limit):
        """Run every query in ``filename`` (up to ``limit`` pages each) and
        export all collected items to an .xls spreadsheet."""
        results = []
        counter = collections.Counter()

        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                ret = self.run_query(line, limit)
                counter["query"] += 1
                for item in ret:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    # normalize text fields to unicode for Excel output
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter

    def run_get_best_search_realtime(self, filename):
        """For every query in ``filename``, fetch only the best Q/A pair
        (via search_zhidao_best) and export the pairs to an .xls file."""
        results = []
        counter = collections.Counter()

        lines = libfile.file2list(filename)
        for query_parser in [0]:
            for line in sorted(lines):
                cnt_label = "query_{}".format(query_parser)
                # progress heartbeat every 10 queries
                if counter[cnt_label] % 10 == 0:
                    print datetime.datetime.now().isoformat(
                    ), counter[cnt_label], line
                counter[cnt_label] += 1

                ret_one = search_zhidao_best(line,
                                             query_filter=0,
                                             query_parser=query_parser)
                if ret_one:
                    item = ret_one["best_qapair"]

                    print "=====>", line
                    print "------", item["match_score"], item["question"]
                    print item["answers"], "*******", item["answers_raw"][
                        len(item["answers"]):]

                    # copy query metadata onto the exported item
                    for p in ["query"]:
                        item[p] = ret_one[p]
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    # normalize text fields to unicode for Excel output
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")

        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer", "query",
            "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like",  "cnt_answer", "question", "answers"], filename_output)
        print counter
# --- Example 4 ---
class Scheduler(object):
    """Fetches Baidu Zhidao questions and answers through a caching
    download service, selecting the best (recommended) search hit for a
    query word. Parsed JSON is posted to a cache as a side effect."""

    def __init__(self, cacheserver):
        self.cache = Cache(BATCH_ID['json'], cacheserver)
        self.downloader = DownloadWrapper(cacheserver,
                                          {'Host': 'zhidao.baidu.com'})

    @classmethod
    def instance(cls, *args):
        """Lazily create and return a per-class singleton.

        NOTE(review): no locking -- concurrent first calls could race.
        """
        if not hasattr(cls, '_instance'):
            setattr(cls, '_instance', cls(*args))
        return cls._instance

    def zhidao_results(self, qids, gap, timeout=10):
        """Fetch each question in ``qids`` plus up to three of its answers.

        Questions/answers that fail to download or parse are skipped
        silently; returns the list of question JSON dicts.
        """
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid, gap, timeout)
            if q_json is False:
                continue
            q_json['list_answers'] = []

            # keep at most the first three answers of each question
            for rid in q_json['answer_ids'][:3]:
                a_json = self.zhidao_answer(qid, rid, gap, timeout)
                if a_json is False:
                    continue
                q_json['list_answers'].append(a_json)

            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid, gap, timeout):
        """Download and parse one question page.

        Returns the question JSON dict, or False on download/parse failure.
        """
        question_url = 'http://zhidao.baidu.com/question/{}.html'.format(qid)
        ret = self.downloader.downloader_wrapper(question_url,
                                                 BATCH_ID['question'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 error_check=True)
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None:
            return False
        # best-effort cache write; the result is ignored
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid, gap, timeout):
        """Download and parse one answer via the mini API.

        Returns the answer JSON dict, or False on download/parse failure.
        """
        answer_url = ('http://zhidao.baidu.com/question/api/mini?qid={}'
                      '&rid={}&tag=timeliness'.format(qid, rid))

        ret = self.downloader.downloader_wrapper(answer_url,
                                                 BATCH_ID['answer'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030')
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:  # NOTE(review): bare except silently hides parse errors
            return False

        # best-effort cache write; the result is ignored
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, qword, batch_id, gap=3, timeout=10, refresh=True):
        """Search Zhidao for ``qword`` and return the question list parsed
        from the result page, or False on failure."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)

        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        return zhidao_search_questions(ret)

    def zhidao_search_list_json(self,
                                qword,
                                batch_id,
                                gap=3,
                                timeout=10,
                                refresh=False):
        """Search Zhidao for ``qword`` and return the parsed result items,
        each tagged with a unicode "query" field; False on failure."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)

        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False

        search_result_json = parse_search_json_v0615(ret)
        for item in search_result_json:
            item["query"] = qword
            if type(qword) != unicode:
                item["query"] = qword.decode("utf-8")

        return search_result_json

    def zhidao_search_select_best(self, qword, gap=3, timeout=2):
        """Return the first recommended search result for ``qword``, or
        False when nothing is recommended.

        NOTE(review): zhidao_search_list_json may return False, which
        would make the loop below raise TypeError -- confirm upstream.
        """
        search_result_json = self.zhidao_search_list_json(
            qword, BATCH_ID['search'], gap, timeout)

        # get the best answer
        for item in search_result_json:
            if item["is_recommend"] == 1:
                return item

        return False

    def zhidao_search_select_best_qids(self, qword, gap=3, timeout=2):
        """Return a (possibly empty) list holding the best hit's question id."""
        ret = self.zhidao_search_select_best(qword, gap, timeout)
        if ret:
            return [ret["question_id"]]
        return []

    def run(self, qword, gap=3, timeout=10):
        """End-to-end: pick the best question for ``qword`` and fetch its
        JSON together with up to three answers."""
        # qids = self.zhidao_search(qword, BATCH_ID['search'], gap, timeout)
        qids = self.zhidao_search_select_best_qids(qword, gap, timeout)
        return self.zhidao_results(qids, gap, timeout)
# --- Example 5 ---
class FudanAttr(object):
    """Queries the Fudan CN-DBpedia API for entities and attribute/value
    pairs through a caching downloader, tallies attribute frequencies,
    and exports entity/attribute/value rows to Excel."""

    # batch ids used to tag cached requests per API endpoint
    batch_id = {
        'entity': 'fudankg-entity-20160623',
        'avp': 'fudankg-avp-20160623',
    }

    def __init__(self):
        self.downloader = DownloadWrapper('http://192.168.1.179:8000')
        # attribute name -> occurrence count, filled by fudan_attr_count
        self.attr_counter = Counter()

    def fudan_entities(self, word):
        """Return the list of candidate entities for the mention ``word``."""
        if isinstance(word, unicode):
            word = word.encode('utf-8')

        entities_api = 'http://kw.fudan.edu.cn/cndbpedia/api/entity?mention={}'
        content = self.downloader.downloader_wrapper(
                entities_api.format(urllib.quote(word)),
                self.batch_id['entity'],
                gap=0,
                encoding='utf-8',
                )
        return json.loads(content)[u'entity']

    def fudan_attrvalue(self, entity):
        """Return the attribute/value payload for ``entity``.

        The API response is a single-key JSON object; its sole value is
        returned.
        """
        if isinstance(entity, unicode):
            entity = entity.encode('utf-8')

        avpair_api = 'http://kw.fudan.edu.cn/cndbpedia/api/entityAVP?entity={}'
        content = self.downloader.downloader_wrapper(
                avpair_api.format(urllib.quote(entity)),
                self.batch_id['avp'],
                gap=0,
                encoding='utf-8',
                )
        # list(...) keeps this working whether .values() returns a list
        # (Python 2) or a view (Python 3)
        return list(json.loads(content).values())[0]

    def fudan_attr_count(self, result):
        """Tally attribute frequencies over ``result`` and print them.

        ``result`` is an iterable of (word, entity, [(attr, value), ...]).
        """
        for word, entity, avps in result:
            for attr, value in avps:
                self.attr_counter[attr] += 1
        count = sorted(self.attr_counter.items(),
                       key=itemgetter(1), reverse=True)
        print( json.dumps(count, ensure_ascii=False, indent=4) )

    def fudan_gen_excel(self, result):
        """Flatten ``result`` rows into dicts and write fudan_eav.xlsx."""
        items = []
        keys = ['word', 'entity', 'attribute', 'value']
        filename = 'fudan_eav.xlsx'

        for word, entity, avps in result:
            for attr, value in avps:
                items.append({'word': word.decode('utf-8'),
                              'entity': entity,
                              'attribute': attr,
                              'value': value})

        writeExcel(items, keys, filename)

    def prepare_entities(self, entities_fname='entities_0623.txt'):
        """Read entity words from ``entities_fname`` (blank lines skipped),
        sample 1000 of them, and save the sample to disk."""
        words = []
        with open(entities_fname) as fd:
            for line in fd:
                line = line.strip()
                if line == '':
                    continue
                words.append(line)

        picked_words = self.pick_some_words(words, 1000)
        self.save_picked_words(picked_words)

    def pick_some_words(self, words, num=1000):
        """Return ``num`` words sampled with replacement from ``words``.

        Bug fix: the original used words[random.randint(1, len(words))],
        which could never pick index 0 and raised IndexError whenever the
        inclusive upper bound len(words) was drawn. random.choice covers
        the full index range safely.
        """
        return [random.choice(words) for _ in range(num)]

    def save_picked_words(self, words):
        """Append a few fixed probe words, dedupe, and write one per line
        to picked_thousand_words.txt."""
        words.extend(['长江', '熊二', '杨绛先生'])
        with open('picked_thousand_words.txt', 'w') as fd:
            fd.write('\n'.join( list(set(words)) ))