Ejemplo n.º 1
0
 def anhuiTester(self):
     """Run self.testFromSSDB on the 'jyyc_anhui' SSDB table with an AnhuiJyycHandler."""
     province = "anhui"
     table_name = 'jyyc_%s' % province
     ssdb_conn = {"host": "spider5", "port": 57888}
     store = DBManager.getInstance("ssdb", table_name, **ssdb_conn)
     parser = AnhuiJyycHandler(province)
     self.testFromSSDB(store, parser)
Ejemplo n.º 2
0
 def heilongjiangTester(self):
     """Run self.testFromSSDB on the 'jyyc_heilongjiang' SSDB table with a HeilongjiangJyycHandler."""
     province = "heilongjiang"
     table_name = 'jyyc_%s' % province
     ssdb_conn = {"host": "spider5", "port": 57888}
     store = DBManager.getInstance("ssdb", table_name, **ssdb_conn)
     parser = HeilongjiangJyycHandler(province)
     self.testFromSSDB(store, parser)
Ejemplo n.º 3
0
 def jiangsuTester(self):
     """Run self.testFromSSDB on the 'jyyc_jiangsu' SSDB table with a JiangsuJyycHandler."""
     province = "jiangsu"
     table_name = 'jyyc_%s' % province
     ssdb_conn = {"host": "spider5", "port": 57888}
     store = DBManager.getInstance("ssdb", table_name, **ssdb_conn)
     parser = JiangsuJyycHandler(province)
     self.testFromSSDB(store, parser)
Ejemplo n.º 4
0
 def beijingNbTest(self):
     """Run self.testFromSSDB on one fixed row of the 'beijing_nbxx' SSDB table."""
     province = "beijing"
     # Fixed sample row; its fields are joined with "|_|".
     sample_key = "473dff8aacd4ab651b932bc8a3bbfda3|_|北京崇尚兴业商贸有限公司|_|110108010048185|_|2016-06-23|_|beijing|_|2015"
     table_name = "%s_nbxx" % province
     store = DBManager.getInstance("ssdb", table_name, host="spider5", port=57888)
     parser = BeijingNbHandler(province)
     self.testFromSSDB(store, sample_key, parser)
Ejemplo n.º 5
0
 def shanghaiNbTest(self):
     """Run self.testFromSSDB on one fixed row of the 'shanghai_nbxx' SSDB table."""
     province = "shanghai"
     # Fixed sample row; its fields are joined with "|_|".
     sample_key = "70198bb285bc3e74898ed926a54aa5fa|_|上海佳吉快运有限公司|_|913101186074971991|_|2016-06-12|_|shanghai|_|2015"
     table_name = "%s_nbxx" % province
     store = DBManager.getInstance("ssdb", table_name, host="spider5", port=57888)
     parser = ShanghaiNbHandler(province)
     self.testFromSSDB(store, sample_key, parser)
Ejemplo n.º 6
0
 def guizhouNbTest(self):
     """Run self.testFromSSDB on one fixed row of the 'new_guizhou_nbxx' SSDB table."""
     province = "guizhou"
     # Fixed sample row; its fields are joined with "|_|".
     sample_key = "0ad1548ffe8ba00864126cc2c2a22619|_|锦屏县锦顺出租汽车有限公司|_|522628000053658|_|2016-06-10|_|guizhou|_|2015"
     table_name = "new_%s_nbxx" % province
     store = DBManager.getInstance("ssdb", table_name, host="spider5", port=57888)
     parser = GuizhouNbHandler(province)
     self.testFromSSDB(store, sample_key, parser)
Ejemplo n.º 7
0
def testBySeed(crawler, pinyin, seed):
    """
    Configure CrawlerTester for a seed-driven run and crawl the seed's page.

    Sets CrawlerTester.pinyin, CrawlerTester.seed_dict and CrawlerTester.db_inst
    (SSDB table "jyyc_<pinyin>") as class attributes, then crawls seed['page'].

    :param crawler: object exposing crawl(); its result is returned unchanged.
    :param pinyin: value stored on CrawlerTester and used in the table name.
    :param seed: mapping that must contain the key 'page'.
    :return: whatever crawler.crawl() returns.
    """
    tester = CrawlerTester
    tester.pinyin = pinyin
    tester.seed_dict = seed
    table_name = "jyyc_" + tester.pinyin
    tester.db_inst = DBManager.getInstance("ssdb", table_name, host="spider5", port=57888)
    page = tester.seed_dict['page']
    return crawler.crawl(page)
Ejemplo n.º 8
0
def testByKeyword(crawler, pinyin, keyword):
    """
    Configure CrawlerTester for a keyword-driven run and crawl that keyword.

    Sets CrawlerTester.pinyin, CrawlerTester.seed_dict ({"name": keyword}) and
    CrawlerTester.db_inst (SSDB table "new_<pinyin>") as class attributes.

    :param crawler: object exposing crawl(); its result is returned unchanged.
    :param pinyin: value stored on CrawlerTester and used in the table name.
    :param keyword: search keyword handed to crawler.crawl().
    :return: whatever crawler.crawl() returns.
    """
    tester = CrawlerTester
    tester.pinyin = pinyin
    tester.seed_dict = {"name": keyword}
    table_name = "new_" + tester.pinyin
    tester.db_inst = DBManager.getInstance("ssdb", table_name, host="spider5", port=57888)
    name = tester.seed_dict['name']
    return crawler.crawl(name)
Ejemplo n.º 9
0
 def __init__(self, queue_name):
     """
     Open the seed DB for queue_name and set up a logger named after it.

     The DB type, port and host are read via f(seed_cf, 'seed_db', <option>).

     :param queue_name: name handed to DBManager.getInstance and to Logging.
     """
     db_type = f(seed_cf, 'seed_db', 'type')
     db_port = f(seed_cf, 'seed_db', 'port')
     db_host = f(seed_cf, 'seed_db', 'host')
     self.__db = DBManager.getInstance(db_type,
                                       queue_name,
                                       port=db_port,
                                       host=db_host)
     self.queue_name = queue_name
     # Imported here, exactly where the logger is needed, as in the rest of this file.
     from CommonLib.Logging import Logging
     self.log = Logging(name=queue_name)
     self.__data_dic = {}
Ejemplo n.º 10
0
 def jiangxiTest(self):
     """Run self.testFromSSDB on one fixed row of the 'new_jiangxi' SSDB table."""
     pinyin = "jiangxi"
     db_inst = DBManager.getInstance("ssdb",
                                     "new_" + pinyin,
                                     host="spider5",
                                     port=57888)
     # Fixed sample row; its fields are joined with "|_|".
     row_key = "5994e3d1afbf82a9e526efae797d02db|_|乐平市新睦水稻种植专业合作社|_|jiangxi|_|2016-05-25"
     handler = JiangxiHandler(pinyin)
     self.testFromSSDB(db_inst, row_key, handler)
Ejemplo n.º 11
0
 def guangdongNbTest(self):
     """Run self.testFromSSDB on one fixed row of the 'guangdong_nbxx' SSDB table."""
     pinyin = "guangdong"
     db_inst = DBManager.getInstance("ssdb",
                                     "%s_nbxx" % pinyin,
                                     host="spider5",
                                     port=57888)
     # Alternate sample row, kept for manual switching:
     # row_key = "23b441b3d17a26ef2e06ba79a4ed676a|_|广州广之旅国际旅行社股份有限公司|_|914401011904322413|_|2016-06-13|_|guangdong|_|2015"
     row_key = "b64ffc239d74dc4ad0a59cd4f6218e27|_|佛山市南湖国际旅行社股份有限公司|_|91440604776910212C|_|2016-06-13|_|guangdong|_|2014"
     handler = GuangdongNbHandler(pinyin)
     self.testFromSSDB(db_inst, row_key, handler)
Ejemplo n.º 12
0
 def chongqingTest(self):
     """Run self.testFromSSDB on one fixed row of the 'new_chongqing' SSDB table."""
     pinyin = "chongqing"
     db_inst = DBManager.getInstance("ssdb",
                                     "new_" + pinyin,
                                     host="spider5",
                                     port=57888)
     # Fixed sample row; its fields are joined with "|_|".
     row_key = "fc46237ff1f403b39ad199502cd338a3|_|武隆县仁武酒业有限公司|_|chongqing|_|2016-05-30"
     handler = ChongqingHandler(pinyin)
     self.testFromSSDB(db_inst, row_key, handler)
Ejemplo n.º 13
0
 def guangdongTest(self):
     """Run self.testFromSSDB on one fixed row of the 'new_guangdong' SSDB table."""
     pinyin = "guangdong"
     db_inst = DBManager.getInstance("ssdb",
                                     "new_%s" % pinyin,
                                     host="spider5",
                                     port=57888)
     # Alternate sample rows, kept for manual switching:
     # row_key = "8effa8ebede5e87faa8157661c8d6555|_|广州顺丰速运有限公司|_|914401017248329968|_|2016-06-13|_|guangdong"
     # row_key = "23b441b3d17a26ef2e06ba79a4ed676a|_|广州广之旅国际旅行社股份有限公司|_|914401011904322413|_|2016-06-13|_|guangdong"
     row_key = "abb92dbbfaf77adafd1e98ddb100d076|_|佛山市南湖国际旅行社股份有限公司|_|91440604776910212C|_|2016-06-14|_|guangdong"
     handler = GuangdongHandler(pinyin)
     self.testFromSSDB(db_inst, row_key, handler)
Ejemplo n.º 14
0
 def beijingTest(self):
     """Run self.testFromSSDB on one fixed row of the 'beijing' SSDB table."""
     pinyin = "beijing"
     db_inst = DBManager.getInstance("ssdb",
                                     pinyin,
                                     host="spider5",
                                     port=57888)
     # Alternate sample rows, kept for manual switching:
     # row_key = "6fbb174d364fdf67fdb96cab6048db11|_|北京艺海佳景广告有限公司|_|beijing|_|2016-05-21"
     # row_key = "8d32a8b1d67d1f1d6a165c5577ac3efb|_|北京盛德东兴投资管理公司|_|beijing|_|2016-05-25"
     # row_key = "dc3094f66fa56ffed50955b3b149cfa7|_|中国光大银行股份有限公司|_|beijing|_|2016-05-25"
     row_key = "6203925f878305c2f1f5be5a80434e0d|_|北京伊美尔长岛医学美容门诊部有限公司|_|91110108797596955A|_|2016-06-16|_|beijing"
     handler = BeijingHandler(pinyin)
     self.testFromSSDB(db_inst, row_key, handler)
Ejemplo n.º 15
0
 def jilinTest(self):
     """Run self.testFromSSDB on one fixed row of the 'new_jilin' SSDB table."""
     pinyin = "jilin"
     db_inst = DBManager.getInstance("ssdb",
                                     "new_" + pinyin,
                                     host="spider5",
                                     port=57888)
     # Alternate sample rows, kept for manual switching:
     # row_key = "66be9a1cbec45fd17e281324fca7f2fc|_|延边爱丽思鞋业有限公司|_|jilin|_|2016-05-25"
     # row_key = "4129cb0108048b80faf34503cad6ecc9|_|延边华侨旅游侨汇服务公司|_|jilin|_|2016-05-25"
     row_key = "f1c17231377bb8c56591ba774c2aca56|_|中国旅游服务公司吉林省公司|_|jilin|_|2016-05-25"
     handler = JilinHandler(pinyin)
     self.testFromSSDB(db_inst, row_key, handler)
Ejemplo n.º 16
0
def rename():
    """
    Read one hash entry from the SSDB table "new_beijing", print it and
    write it to the file ttt.txt (overwriting any previous content).

    NOTE(review): despite the function name, nothing is renamed or copied
    between tables here; only a single fixed row key is read.
    :return: None
    """
    row = u"4ab61b0438638de25f6a68ba9b2834a5|_|北京梅牡易贷科技服务有限公司|_|beijing|_|2016-05-25"
    store = DBManager.getInstance("ssdb", "new_beijing", host="spider5", port=57888)
    stored_value = store.hget(row)
    print(stored_value)
    with open("ttt.txt", "w") as dump_file:
        dump_file.write(stored_value)
Ejemplo n.º 17
0
class QYXX(Resource):
    """
    Web resource that answers a GET request with a proxy address for an area.

    The lookup runs in a worker thread (deferToThread); the response body is
    written once it completes, which is why render_GET returns NOT_DONE_YET.
    """

    # Class-level DB handle, created once when the class body is executed.
    __db = DBManager.getInstance(_type, 'name', host=_host, port=_port)
    # Fallback proxy source, started once when the class body is executed.
    threading_proxy = th()
    threading_proxy.start()

    def getChild(self, path, request):
        """Return this resource for the empty path segment, NotFount() otherwise."""
        if path == "":
            return self
        # NOTE(review): `NotFount` is not defined in the visible source; confirm
        # it is a real name in this module and not a typo.
        return NotFount()

    def _render_write(self, request, res):
        """Write str(res) as the response body and finish the request."""
        request.write(str(res))
        request.finish()

    def getip(self, request):
        """
        Pick a proxy for the requested area and write it to the request.

        Query arguments (both optional):
          area -- proxy pool to use, defaults to "common"
          last -- proxy the caller used previously, defaults to None

        :param request: the incoming request; its `args` mapping is read.
        """
        #url:http://spider7:9876/qyxx?area=jiangsu&last=127.0.0.1:8080
        #url:http://127.0.0.1:9876/qyxx?area=jiangsu&last=127.0.0.1:8080
        argss = request.args
        area = argss["area"][0] if "area" in argss else "common"
        # Test for "last" itself: testing "area" here raised KeyError when only
        # `area` was sent and dropped `last` when only `last` was sent.
        last = argss["last"][0] if "last" in argss else None
        last = networkSegment(area, last)

        def _get_proxy():
            """Return a proxy: bbd white list first, then bought list, then the thread pool."""
            if last:
                self.__db.keyDel(area + '_' + last)
            self.__db.changeTable('%s_bbd_white_proxy' % area)
            bbd_proxy = self.__db.get()
            if bbd_proxy:
                # Record the hand-out under "<area>_<segment>" with a 300 s ttl.
                name = area + '_' + networkSegment(area, bbd_proxy)
                self.__db.keySetx(name, 300, ttl=300)
                self.__db.hincrHash(name, 'now_num')
                self.__db.multi_hsetHash(name, uptime=int(float(time.time())))
                return bbd_proxy
            self.__db.changeTable('%s_white_proxy' % area)
            buy_proxy = self.__db.get()
            if buy_proxy:
                return buy_proxy
            return self.threading_proxy.get_proxy()

        deferToThread(_get_proxy).addCallback(
            lambda x: self._render_write(request, x))

    def render_GET(self, request):
        """Start the asynchronous proxy lookup; the response is completed later."""
        self.getip(request)
        return NOT_DONE_YET
Ejemplo n.º 18
0
 def __init__(self, queue_name, sub_type, get_db_dict=None, save_db_dict=None):
     """
     Open the source DB for queue_name.

     :param queue_name: table/queue name handed to DBManager.getInstance.
     :param sub_type: accepted but not used in this constructor.
     :param get_db_dict: mapping with the keys "type", "port" and "host";
         it is indexed unconditionally, so the None default cannot be used.
     :param save_db_dict: accepted but not used in this constructor.
     """
     self.logger = Logging(__name__)
     source_type = get_db_dict["type"]
     source_port = get_db_dict["port"]
     source_host = get_db_dict["host"]
     self.__get_db = DBManager.getInstance(source_type,
                                           queue_name,
                                           port=source_port,
                                           host=source_host)
     self.queue_name = queue_name
     self.__data_dic = {}
Ejemplo n.º 19
0
 def __init__(self):
     """Bind self.__db to the 'buy_tba_proxy_white_proxy' table via DBManager."""
     table_name = 'buy_tba_proxy_white_proxy'
     self.__db = DBManager.getInstance(_type, table_name, host=_host, port=_port)
Ejemplo n.º 20
0
 def __init__(self):
     """Bind self.__ssdb to the SSDB table 'test' using the module-level port and host."""
     connection = dict(port=port, host=host)
     self.__ssdb = DBManager.getInstance('ssdb', 'test', **connection)
Ejemplo n.º 21
0
                        n += 1
                        text = text.replace(u':', ':')
                        texts = text.split(':')
                        if texts[0] == text:
                            if n == 1:
                                dict_[u'top_企业名称'] = text.strip()
                        else:
                            if len(texts) >= 2:
                                dict_[u'top_' + texts[0].strip()] = texts[1].strip()
        except:
                self.log.info(u"获取top信息异常")
        return dict_

def testFromSSDB(db_inst, row_key):
    html_dict_str = db_inst.hget(row_key)
    if not html_dict_str:
        print(u"从SSDB获取数据失败!")
        return
    handler = HeilongjiangHandler("qinghai")
    html_dict = json.loads(html_dict_str)
    handler.parse(html_dict)


if __name__ == "__main__":
    # Manual check: parse one fixed row from the "heilongjiang" SSDB table.
    row_key = "d833488218278803eadb28d469cd6257|_|黑龙江柏杉林木业有限公司|_|91230184690719556X|_|2016-06-21|_|heilongjiang"
    db_inst = DBManager.getInstance("ssdb",
                                    "heilongjiang",
                                    host="spider5",
                                    port=57888)
    testFromSSDB(db_inst, row_key)


Ejemplo n.º 22
0
                titles = h2_text.split()
                if len(titles) >= 2:
                    dict_[u'top_公司名称'] = titles[0]
                    top2s = titles[1].split(u":")
                    if len(top2s) == 2:
                        dict_[u'top_' + top2s[0].strip()] = top2s[1].strip()
        except:
            self.log.info(u"获取top信息异常")
            dict_ = dict()
        return dict_


def testFromSSDB(db_inst, row_key):
    html_dict_str = db_inst.hget(row_key)
    if not html_dict_str:
        print(u"从SSDB获取数据失败!")
        return
    handler = QinghaiHandler("qinghai")
    html_dict = json.loads(html_dict_str)
    handler.parse(html_dict)


if __name__ == "__main__":
    # Manual check: parse one fixed row from the "qinghai" SSDB table.
    row_key = "62f434b12b8f3287f948e048da718882|_|西部矿业集团有限公司|_|9163000071040638XJ|_|2016-06-21|_|qinghai"
    db_inst = DBManager.getInstance("ssdb", "qinghai", host="spider5", port=57888)
    testFromSSDB(db_inst, row_key)
Ejemplo n.º 23
0
def work(bbd_type, value_list=None):
    """
    Crawl company pages for one crawler type and store the raw results.

    With value_list, every item is a keyword list that is crawled directly.
    Without it, seeds are pulled from Seed(bbd_type) in an endless loop; a seed
    whose url_status is set is crawled by URL first and falls back to its
    keyword list when that fails.

    :param bbd_type: crawler type; selects the "Crawler<Type>" module and,
        lower-cased, is passed on as pinyin and used as the table name.
    :param value_list: optional list of keyword lists.
    :raises Exception: wraps any error raised while crawling; the current
        seed, if there is one, is saved first.
    """
    conf_file = "DBConfig.ini"
    db_conf_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    # Bound up front so the closures below and the final except handler can
    # test for "not created yet" instead of failing with a NameError.
    seed = None
    log = None

    def getNbxxDict(src_dict):
        """Pop every "qynb_*" entry out of src_dict; return them as one-item dicts."""
        nbxx_key_list = [key for key in src_dict.keys()
                         if key.startswith("qynb_")]
        return [{key: src_dict.pop(key)} for key in nbxx_key_list]

    def getYear(nb_dict):
        """Return the second "_"-separated part of the first key in nb_dict."""
        key = list(nb_dict.keys())[0]
        year = key.split("_")[1]
        return year

    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler; stores the crawled data in ssdb.
        :param src_dict: crawl result; must contain "status".
        :param company_dict: not used here.
        :return: None
        """
        # Stays None when the failure happens before a rowkey is known, so the
        # except handler can still log.
        rowkey = None
        try:
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if src_dict.has_key("rowkey"):
                    rowkey = src_dict["rowkey"]

                    nbxx_list = getNbxxDict(src_dict)
                    nb_year_list = []  # years reported to the solr interface
                    for nb_item in nbxx_list:
                        # Split each annual report into its own entry with its
                        # own rowkey and store it in the hash.
                        year = getYear(nb_item)
                        nb_year_list.append(year)
                        nbxx_dict = UniField.cloneNeedColumns(src_dict)
                        nbxx_dict.update({"bbd_seed": bbd_seed_dict})
                        nbxx_dict.update(nb_item)
                        db_inst.changeTable(bbd_type + "_nbxx")
                        nb_rk = rowkey + "|_|" + year
                        nbxx_dict["rowkey"] = nb_rk
                        nbxx_dict["year"] = year
                        db_inst.hset(nb_rk, nbxx_dict)
                        log.info(u"存储 %s 年年报 成功,rowkey 为 [ %s ]", year, nb_rk)
                    zch = src_dict["rowkey_dict"]["company_zch"]
                    company_name = src_dict["rowkey_dict"]["company_name"]

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING,
                                        bbd_seed_dict)
                    log.info(log_info)
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                    log.info(u" ,rowkey 为 [ %s ]", rowkey)
                    NbxxApiControler().nbUpdate(company_name=company_name,
                                                pinyin=bbd_type,
                                                zch=zch,
                                                years_list=nb_year_list)

                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

            log.info(u"存储抓取网页原文 失败,rowkey 为 [ %s ]", rowkey)

    def crawlerKeyWordList(keyword_list):
        """
        Crawl the keywords one after another: if the first one fails try the
        next; if the last one fails too, record the seed information in ssdb.
        :param keyword_list: keywords to try, in order.
        :return: None
        """
        try:
            keyword_num = len(keyword_list)
            for keyword in keyword_list:
                keyword_num -= 1  # keywords left to try after this one
                seed_status = inst.crawl(keyword)
                if seed_status.access_type == SeedAccessType.OK:  # success
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取成功:)")
                    break
                elif keyword_num > 0:
                    log.info(u"种子抓取失败,关键字 [%s]", keyword)
                    continue
                elif seed is None:
                    # value_list mode: there is no queue seed to update or save.
                    log.info(u"种子抓取失败,关键字 [%s]", keyword)
                else:
                    seed.update(status=seed_status.access_type)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取失败,存储到队列,种子状态为 %s", str(seed_status))
                    seed.save()
        except Exception as e:
            log.info(str(e))
            raise Exception(u"种子抓取过程中遇到异常")

    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name,
                                         package_name="qyxx_all",
                                         pinyin=bbd_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"],
                                        bbd_type,
                                        host=db_conf_dict["host"],
                                        port=db_conf_dict["port"])
        bbd_seed_dict = {}
        if value_list:
            for keywd_list in value_list:
                crawlerKeyWordList(keywd_list)
        else:
            seed = Seed(bbd_type)

            while True:
                seed.get()
                bbd_seed_dict = seed.getDict()
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                log.info("starting a new seed %s", log_info)
                if seed.url_status:
                    seed_status = inst.crawlUrl(seed.url, seed.name)
                    if seed_status.access_type == SeedAccessType.OK:  # success
                        log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                            bbd_seed_dict)
                        log.info(log_info)
                    else:  # the URL crawl failed, fall back to the keyword list
                        log.info(" Url get company info failed  [%s]",
                                 bbd_type)
                        keyword_list = seed.values
                        crawlerKeyWordList(keyword_list)
                else:
                    keyword_list = seed.values
                    crawlerKeyWordList(keyword_list)
    except Exception as e:
        if log is not None:
            log.info(str(e))
        if seed is not None:
            seed.save()
        raise Exception(e)
Ejemplo n.º 24
0
 def __init__(self):
     """Start with an empty dict_ and bind self.__db to the table 'name' via DBManager."""
     self.dict_ = dict()
     table_name = 'name'
     self.__db = DBManager.getInstance(_type, table_name, host=_host, port=_port)
Ejemplo n.º 25
0
                    n += 1
                    text = text.replace(u':', ':')
                    texts = text.split(':')
                    if texts[0] == text:
                        if n == 1:
                            dict_[u'top_企业名称'] = text.strip()
                    else:
                        if len(texts) == 2:
                            dict_[u'top_' + texts[0].strip()] = texts[1].strip()
        except:
                self.log.info(u"获取top信息异常")
        return dict_

def testFromSSDB(db_inst, row_key):
    html_dict_str = db_inst.hget(row_key)
    if not html_dict_str:
        print(u"从SSDB获取数据失败!")
        return
    handler = ShanghaiHandler("shanghai")
    html_dict = json.loads(html_dict_str)
    handler.parse(html_dict)


if __name__ == "__main__":
    # Manual check: parse one fixed row from the "new_shanghai_data" SSDB table.
    row_key = "c97ec20e493f366be44508f44001a583|_|上海乾辉工贸有限公司分公司|_|shanghai|_|2016-05-22"
    db_inst = DBManager.getInstance("ssdb",
                                    "new_shanghai_data",
                                    host="spider5",
                                    port=57888)
    testFromSSDB(db_inst, row_key)


Ejemplo n.º 26
0
 def __init__(self):
     """Bind self.__db using the 'type', 'table' and 'server' values returned by f()."""
     db_type = f('type')
     db_table = f('table')
     db_server = f('server')
     self.__db = DBManager.getInstance(db_type, db_table, server=db_server)
Ejemplo n.º 27
0
def work(pro_type, seed=None):
    def storeResult(src_dict, company_dict=None):
        # if company_dict.has_key(u"名称"):
        #     src_dict.update({"company_name": company_dict[u"名称"]})
        #     src_dict.update({"values":company_dict})

        src_dict = UniField.unifyRequestResult(src_dict, pro_type)
        if src_dict.has_key("rowkey"):

            rowkey = src_dict["rowkey"]
            print "统一字段后 rowkey=", rowkey
            src_dict.update({"BBD_SEED": seed.getDict()})
            if src_dict["status"] == 0:
                db_inst.changeTable("new_" + pro_type)
                db_inst.hset(rowkey, src_dict)
                db_inst.save(src_dict)

            else:
                db_inst.changeTable("new_" + pro_type + "_error")
                db_inst.hset(rowkey, src_dict)
                db_inst.save(src_dict)
            print "rowkey=", rowkey
        else:
            print "No Rowkey ,抓取后的结果为:", src_dict

    def crawlerKeyWordList(keyword_list):
        """
        一次抓取关键词,如果第一个抓不到,尝试第二个,如果最后一个还是没成功,记录种子信息到ssdb
        :param keyword_list:
        :return:
        """
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            seed_status = inst.crawl(keyword)
            if seed_status.access_type == SeedAccessType.OK:  # 状态成功,打印消息
                # log.info("End seed with keyword %s", keyword)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                    seed.getDict())
                log.info(log_info)
                break
            elif seed_status.access_type != SeedAccessType.OK and keyword_num > 0:
                keyword_num -= 1
                # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, seed.getDict())
                log.info("Use Key word [%s] get company failed", keyword)
                continue
            else:
                seed.update(status=seed_status.access_type)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO,
                                    seed.getDict())
                log.info(log_info)
                seed.save()

    try:
        from CommonLib.Logging import Logging
        log = Logging(name=pro_type)
        log.info("Process begin for %s", pro_type)

        module_name = "Crawler" + pro_type.capitalize()
        pro_type = pro_type.lower()
        inst = ClassFactory.getClassInst(module_name,
                                         package_name="qyxx_all",
                                         pinyin=pro_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance("ssdb",
                                        "new_" + pro_type,
                                        host="spider5",
                                        port=57888)
        if seed is None:
            seed = Seed(pro_type)
            seed.get()
        else:
            seed = seed

            # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, seed.getDict())
            # log.info("start to a new seed %s",log_info)
            # if seed.url_status:
            #     seed_status = inst.crawlUrl(seed.url, seed.name)
            #     if seed_status.access_type == SeedAccessType.OK:  # 状态成功,打印消息
            #         log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, seed.getDict())
            #         log.info(log_info)
            #     else:# 用url没有抓成功, 用keywordlist 抓
            #         log.info(" Url get company info failed  [%s]", pro_type)
            #         keyword_list = seed.values
            #         crawlerKeyWordList(keyword_list)
            # else:
            keyword_list = seed.values
            crawlerKeyWordList(keyword_list)

    except Exception as e:
        print str(e)
Ejemplo n.º 28
0
def work(bbd_type):
    """
    Parse stored "nb" records for one crawler type in an endless loop.

    Records are fetched from the "<bbd_type>_nbxx" queue of the html DB and
    parsed with the "<Type>NbHandler" class. A successful parse is saved to
    "qyxx_data_nb" and its source record to "qyxx_html_nb" in the data DB;
    a failed parse is saved to "<bbd_type>_data_nb_error" in the html DB.

    :param bbd_type: crawler type; lower-cased for queue and table names.
    :raises Exception: wraps any error raised inside the loop, after the
        record being processed (if any) was saved to the error table.
    """
    conf_file = "DBConfig.ini"
    src_db_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    des_db_dict = {
        'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'data_db', 'port')),
    }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type

    nb_module_name = bbd_type.capitalize() + "Nb" + "Handler"
    nb_handler = ClassFactory.getClassInst(nb_module_name,
                                           package_name="Parser",
                                           pinyin=bbd_type.lower())

    bbd_table = "qyxx_data_nb"
    bbd_src_table = "qyxx_html_nb"
    normal_table = bbd_type + "_data" + "_nb"
    err_table = normal_table + "_error"

    # Stores the parsed data.
    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    fetch = Fetcher(queue_name + "_nbxx",
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        # Reset per iteration so the except handler never sees unbound names
        # or the record left over from the previous iteration.
        source_dict = None
        res_dict = None
        try:
            source_dict = fetch.hget()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                if res_dict.has_key("year"):
                    res_dict["_id"] = UniField.updateId(
                        res_dict['_id'], res_dict['year'])

                res_dict = nb_handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            # Only a record that was actually fetched can go to the error table.
            if source_dict:
                source_dict["data"] = res_dict
                err_db_inst.save(source_dict)
            raise Exception(e)
Ejemplo n.º 29
0
                    "").replace(u'\xa0', '').replace(u':', u':').split(key)
                if len(infos) == 2:
                    dict_[u'top_企业名称'] = infos[0].strip()
                    if u':' in infos[1]:
                        temp = (key + infos[1]).split(u':')
                        dict_['top_' + temp[0].strip()] = temp[1].strip()
        except:
            self.log.info(u"获取top信息异常")
            dict_ = dict()
        return dict_


def testFromSSDB(db_inst, row_key):
    html_dict_str = db_inst.hget(row_key)
    if not html_dict_str:
        print(u"从SSDB获取数据失败!")
        return
    handler = NeimengguHandler("neimenggu")
    html_dict = json.loads(html_dict_str)
    handler.parse(html_dict)


if __name__ == "__main__":
    # Manual check: parse one fixed row from the "neimenggu" SSDB table.
    row_key = "1a7630b3a30addefcae6c3d092630a11|_|内蒙古蒙牛乳业包头有限责任公司|_|91150200701240234X|_|2016-06-22|_|neimenggu"
    db_inst = DBManager.getInstance("ssdb", "neimenggu", host="spider5", port=57888)
    testFromSSDB(db_inst, row_key)
Ejemplo n.º 30
0
 def __init__(self):
     """Bind self.__db to the 'buy_proxy_test_results' table via DBManager."""
     table_name = 'buy_proxy_test_results'
     self.__db = DBManager.getInstance(_type, table_name, host=_host, port=_port)