예제 #1
0
 def start(self, target, category=0):
     """Log the start of a crawl round and run ``self.__go`` on a new thread.

     :param target: forwarded unchanged to ``self.__go`` as its first argument.
     :param category: forwarded to ``self.__go`` as its second argument; also
         written into the info log message. Defaults to 0.
     """
     info_logger = LogRec.get_logger(Config.INFLOGGER)
     info_logger.info(u"类型为" + str(category) + u",开始下一轮检查并捕获信息")

     # Positional arguments for the worker, in (target, category) order.
     worker_args = [target, category]
     worker = threading.Thread(target=self.__go, args=worker_args)
     worker.start()
예제 #2
0
 def delete_con(db_no):
     """Close and drop the cached connection stored under ``db_no``.

     Does nothing when ``ConnectDB.instance`` holds no entry for ``db_no``.
     Any exception raised while closing or removing the entry is written to
     the error logger and not propagated.

     :param db_no: key of the connection inside ``ConnectDB.instance``.
     """
     try:
         ConnectDB.thr.acquire()
         try:
             # The membership test runs while holding the lock so the entry
             # cannot disappear between the check and the close/delete.
             if db_no in ConnectDB.instance:
                 ConnectDB.instance[db_no].close()
                 del ConnectDB.instance[db_no]
         finally:
             # Released exactly once, and only after a successful acquire.
             ConnectDB.thr.release()
     except Exception:
         # format_exc() returns the traceback text; print_exc() returns None.
         LogRec.get_logger(Config.ERRLOGGER).error(traceback.format_exc())
예제 #3
0
 def get_con(db_no):
     """Return the cached MySQL connection for ``db_no``, creating it if absent.

     The connection settings are read from ``Config.MYSQL[db_no]`` (keys
     ``host``, ``uname``, ``pwd``, ``db_name``, ``port``, ``encoding``) and the
     new connection is stored in ``ConnectDB.instance[db_no]``.

     :param db_no: key into ``Config.MYSQL`` and ``ConnectDB.instance``.
     :returns: the cached connection, or ``None`` when an exception occurred
         (the traceback is written to the error logger).
     """
     try:
         if db_no not in ConnectDB.instance:  # no cached connection yet
             db_config = Config.MYSQL[db_no]  # connection settings from the config
             ConnectDB.thr.acquire()  # lock
             try:
                 # Check again: another thread may have created the
                 # connection while this one was waiting for the lock.
                 if db_no not in ConnectDB.instance:
                     conn = MySQLdb.connect(
                         host=db_config["host"],
                         user=db_config["uname"],
                         passwd=db_config["pwd"],
                         db=db_config["db_name"],
                         port=db_config["port"],
                         charset=db_config["encoding"],
                     )
                     # autocommit is a method on MySQLdb connections; assigning
                     # to it would only shadow the method. With autocommit off,
                     # every statement before commit() behaves as one
                     # transaction.
                     conn.autocommit(True)
                     # Publish the connection only once it is fully configured.
                     ConnectDB.instance[db_no] = conn
             finally:
                 # Release on every path, including when the second check
                 # finds the connection already created.
                 ConnectDB.thr.release()
         return ConnectDB.instance[db_no]
     except Exception:
         # format_exc() returns the traceback text; print_exc() returns None.
         LogRec.get_logger(Config.ERRLOGGER).error(traceback.format_exc())
         return None
예제 #4
0
    def __write_to_db(self, sqlUtil, info_map, category):
        """Build a row from ``info_map`` and insert it into ``self.table_name``.

        The row carries ``target``, ``info`` and ``title`` from ``info_map``,
        ``type`` set to ``category`` and ``db_insert_time`` set to the current
        Unix time. ``publish_time`` is parsed from ``info_map["publish_time"]``
        only for categories 0 and 1; other categories get no such key.

        :param sqlUtil: object whose ``insert_data(row, table_name)`` is called.
        :param info_map: mapping with ``target``, ``info``, ``title`` and (for
            categories 0 and 1) ``publish_time``.
        :param category: selects the ``publish_time`` format and is stored as
            the row's ``type``.

        Any exception is written to the error logger and not propagated.
        """
        try:
            record = {
                "target": info_map["target"],
                "type": category,
                "info": info_map["info"],
                "title": info_map["title"],
                "db_insert_time": int(time.time()),
            }

            if category == 1:
                # This format carries no year, so the current year is prefixed.
                stamp = str(datetime.date.today().year) + info_map["publish_time"]
                record["publish_time"] = datetime.datetime.strptime(stamp, "%Y%m月%d日 %H:%M")
            elif category == 0:
                stamp = info_map["publish_time"]
                record["publish_time"] = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")

            inserted = sqlUtil.insert_data(record, self.table_name)
            if inserted:
                print(u"插入完毕")

        except Exception:
            exc_info = sys.exc_info()
            err_logger = LogRec.get_logger(Config.ERRLOGGER)

            # One log entry per traceback frame, then the exception itself.
            for filename, lineno, func_name, _text in traceback.extract_tb(exc_info[2]):
                frame_entry = (filename, "line:", lineno, "in", func_name)
                err_logger.error(frame_entry)
            err_logger.error("** %s: %s" % exc_info[:2])
예제 #5
0
    def do_main(self, req):
        """Download ``req['target']`` and extract its content according to ``req['type']``.

        ``req['type']`` selects the response encoding and the filter method:
        0 -> gbk / ``filt_finance_content``, 1 -> encoding left untouched /
        ``filt_licai_content``, 2 -> utf-8 / ``filt_blog_content``,
        3 -> gbk / ``filt_guba_content``.

        :param req: mapping with keys ``target``, ``type``, ``title`` and
            ``publish_time``.
        :returns: dict with keys ``target``, ``info``, ``title`` and
            ``publish_time``, or ``None`` when the download or extraction
            failed or the type is not one of 0-3 (the error is logged).
        """
        # Bound up front so the final return cannot hit an unbound local when
        # the try body fails.
        page_map = None
        try:
            resp = requests.get(req['target'], headers=self.headers)

            req_type = req['type']
            if req_type == 0:
                resp.encoding = 'gbk'
                pageinfo = self.filt_finance_content(resp.text)
            elif req_type == 1:
                pageinfo = self.filt_licai_content(resp.text)
            elif req_type == 2:
                resp.encoding = 'utf-8'
                pageinfo = self.filt_blog_content(resp.text)
            elif req_type == 3:
                resp.encoding = 'gbk'
                pageinfo = self.filt_guba_content(resp.text)
            else:
                # Fail with a clear message instead of an unbound 'pageinfo'.
                raise ValueError("unsupported request type: %r" % (req_type,))

            page_map = {
                'target': req['target'],
                'info': pageinfo,
                'title': req['title'],
                'publish_time': req['publish_time'],
            }

        except Exception:
            exc_info = sys.exc_info()
            err_logger = LogRec.get_logger(Config.ERRLOGGER)
            # One log entry per traceback frame, then the exception itself.
            for filename, lineno, func_name, _text in traceback.extract_tb(exc_info[2]):
                err_str = filename, "line:", lineno, "in", func_name
                err_logger.error(err_str)
            err_str = "** %s: %s" % exc_info[:2]
            err_logger.error(err_str)

        return page_map
예제 #6
0
 def __dispose_except(self, sql):
     """Write the exception currently being handled to the error logger.

     Logs one entry per traceback frame (file, line number, function name),
     followed by a final ``** <type>: <value>`` entry.

     :param sql: accepted but not used by this method.
     """
     exc_type, exc_value, exc_tb = sys.exc_info()
     err_logger = LogRec.get_logger(Config.ERRLOGGER)

     for filename, lineno, func_name, _text in traceback.extract_tb(exc_tb):
         frame_entry = (filename, "line:", lineno, "in", func_name)
         err_logger.error(frame_entry)

     err_logger.error("** %s: %s" % (exc_type, exc_value))
    def get_response(self):
        """Request ``self.__target`` with ``self.headers`` and mark the response as gbk.

        :returns: the ``requests`` response with ``encoding`` set to ``'gbk'``,
            or ``None`` when the request raised (the error is logged).
        """
        # Bound up front so the final return cannot hit an unbound local when
        # the request fails.
        resp = None
        try:
            resp = requests.get(self.__target, headers=self.headers)
            resp.encoding = 'gbk'
        except Exception:
            exc_info = sys.exc_info()
            err_logger = LogRec.get_logger(Config.ERRLOGGER)
            # One log entry per traceback frame, then the exception itself.
            for filename, lineno, func_name, _text in traceback.extract_tb(exc_info[2]):
                err_str = filename, "line:", lineno, "in", func_name
                err_logger.error(err_str)
            err_str = "** %s: %s" % exc_info[:2]
            err_logger.error(err_str)

        return resp
예제 #8
0
 def get_pool_conn(db_no):
     """Open and return a new MySQL connection for ``db_no``.

     The connection settings are read from ``Config.MYSQL[db_no]`` (keys
     ``host``, ``uname``, ``pwd``, ``db_name``, ``port``, ``encoding``). The
     connect call runs while holding ``ConnectDB.thr``.

     :param db_no: key into ``Config.MYSQL``.
     :returns: the new connection, or ``None`` when an exception occurred
         (the error is written to the error logger).
     """
     try:
         db_config = Config.MYSQL[db_no]  # connection settings from the config
         ConnectDB.thr.acquire()  # lock
         try:
             conn = MySQLdb.connect(
                 host=db_config["host"],
                 user=db_config["uname"],
                 passwd=db_config["pwd"],
                 db=db_config["db_name"],
                 port=db_config["port"],
                 charset=db_config["encoding"],
             )
             # autocommit is a method on MySQLdb connections; assigning to it
             # would only shadow the method. With autocommit off, every
             # statement before commit() behaves as one transaction.
             conn.autocommit(True)
         finally:
             # Always release, so a failed connect cannot leave the lock held.
             ConnectDB.thr.release()
         return conn
     except Exception:
         exc_info = sys.exc_info()
         err_logger = LogRec.get_logger(Config.ERRLOGGER)
         # One log entry per traceback frame, then the exception itself.
         for filename, lineno, func_name, _text in traceback.extract_tb(exc_info[2]):
             err_str = filename, "line:", lineno, "in", func_name
             err_logger.error(err_str)
         err_str = "** %s: %s" % exc_info[:2]
         err_logger.error(err_str)
         return None
    def filt_link(self, content):
        """Extract the link entries from the ``listBlk`` section of a page.

        :param content: page HTML as text.
        :returns: list of ``(href, link_text, span_text)`` tuples, one per
            matching ``<li>`` entry; an empty list when the section is missing
            or any other error occurs (the error is logged).
        """
        matches = []
        try:
            # First isolate the div section, then match inside that smaller chunk.
            div_pattern = r'''<div class="listBlk">([\s\S]*?)<div class="MainBtm">'''
            div_content = re.compile(div_pattern).findall(content)[0]

            info_pattern = r'''<li><a href="([\s\S]*?)" target="_blank">([\s\S]*?)</a><span>\(([\S\s]*?)\)</span></li>'''
            matches = re.compile(info_pattern).findall(div_content)

        except Exception:
            exc_info = sys.exc_info()
            err_logger = LogRec.get_logger(Config.ERRLOGGER)
            # One log entry per traceback frame, then the exception itself.
            for filename, lineno, func_name, _text in traceback.extract_tb(exc_info[2]):
                err_str = filename, "line:", lineno, "in", func_name
                err_logger.error(err_str)
            err_str = "** %s: %s" % exc_info[:2]
            err_logger.error(err_str)

        # The extracted entries were previously computed and then discarded.
        return matches
예제 #10
0
    def __go(self, target, category=0):
        """Run one crawl round: collect links, fetch new articles, store them.

        Steps, as performed below:
        1. Acquire ``self.type_0_lock`` (category 0) or ``self.type_1_lock``
           (category 1); other categories take no lock.
        2. Ask a ``WebCrawler`` for the links found at ``target``.
        3. Keep only the rows whose link is not yet stored in
           ``self.table_name``.
        4. Push the remaining rows to a ``Fetcher`` with a ``type`` chosen from
           the URL prefix (finance=0, licaishi=1, blog=2, guba=3).
        5. Pop each fetched result and write it with ``self.__write_to_db``.

        :param target: passed to ``WebCrawler.set_target``.
        :param category: selects the lock to acquire and is forwarded to
            ``self.__write_to_db`` for every result. Defaults to 0.

        Any exception is written to the error logger and not propagated.

        NOTE(review): no release of the acquired lock is visible in this
        block - confirm it is released elsewhere.
        """

        if category == 0:
            self.type_0_lock.acquire()
        elif category == 1:
            self.type_1_lock.acquire()

        try:

            sqlUtil = MySqlDAL()

            LogRec.get_logger(Config.INFLOGGER).info(u"开始抓取文章链接。。。")
            crawler = WebCrawler()

            # Get the links
            crawler.set_target(target)
            tmp_link_list = crawler.get_page_link()

            # Filter out entries that are already in the database
            target_tuple = list()

            # Each row is indexed as row[0] = link, row[1] = title,
            # row[2] = publish time (see the pushes further down).
            for row in tmp_link_list:
                tmp_link = row[0].strip()
                # Blog links: drop everything from the last "?" onwards
                if tmp_link.startswith(r"http://blog.sina.com.cn"):
                    tmp_index = tmp_link.rfind("?")
                    if tmp_index != -1:
                        tmp_link = tmp_link[:tmp_index]

                # NOTE(review): the query is built by string concatenation with
                # a link taken from the crawled page - consider a parameterized
                # query if the DAL supports one.
                tmp_sql = "SELECT * FROM " + self.table_name + " WHERE target='" + tmp_link + "'"
                if not sqlUtil.get_dimensions_one_row(tmp_sql):
                    target_tuple.append(row)
                # else:
                #     print "already"

            # for row in target_tuple:
            #     for item in row:
            #         print item,
            #     print ""

            LogRec.get_logger(Config.INFLOGGER).info(u"获取最新链接列表,准备开始捕获文章内容")

            # Fetch the content with multiple threads
            f = Fetcher(threads=10)

            # The prefix tests use the unstripped row[0]; the pushed target is
            # stripped.
            for row in target_tuple:
                if row[0].startswith(r"http://finance.sina.com.cn"):
                    f.push(
                        {"target": row[0].strip(), "type": 0, "title": row[1].strip(), "publish_time": row[2].strip()}
                    )
                elif row[0].startswith(r"http://licaishi.sina.com.cn"):
                    f.push(
                        {"target": row[0].strip(), "type": 1, "title": row[1].strip(), "publish_time": row[2].strip()}
                    )
                elif row[0].startswith(r"http://blog.sina.com.cn"):
                    # Same query-string removal as in the filtering loop above.
                    tmp_index = row[0].rfind("?")
                    if tmp_index != -1:
                        tmp_target = row[0][:tmp_index]
                        f.push(
                            {
                                "target": tmp_target.strip(),
                                "type": 2,
                                "title": row[1].strip(),
                                "publish_time": row[2].strip(),
                            }
                        )
                    else:
                        f.push(
                            {
                                "target": row[0].strip(),
                                "type": 2,
                                "title": row[1].strip(),
                                "publish_time": row[2].strip(),
                            }
                        )
                elif row[0].startswith(r"http://guba.sina.com.cn"):
                    f.push(
                        {"target": row[0].strip(), "type": 3, "title": row[1].strip(), "publish_time": row[2].strip()}
                    )

            LogRec.get_logger(Config.INFLOGGER).info(u"准备写入数据库")

            while f.task_left():
                res_map = f.pop()

                # print res_map['info']
                # NOTE(review): the round's category is passed here, not the
                # per-link "type" pushed above - confirm this is intended.
                self.__write_to_db(sqlUtil, res_map, category)

                # Pause half a second between writes.
                time.sleep(0.5)

            sqlUtil.destory()

            LogRec.get_logger(Config.INFLOGGER).info(u"数据库写入完毕")

        except Exception, e:
            info = sys.exc_info()
            err_logger = LogRec.get_logger(Config.ERRLOGGER)

            # One log entry per traceback frame, then the exception itself.
            for file, lineno, function, text in traceback.extract_tb(info[2]):
                err_str = file, "line:", lineno, "in", function
                err_logger.error(err_str)
            err_str = "** %s: %s" % info[:2]
            err_logger.error(err_str)