Ejemplo n.º 1
0
    def __init__(self, urls=None, project_name="sample", **kwargs):
        '''Prepare request state: target urls, HTTP method and proxy settings.
        params:
            urls:         Urls to request. Defaults to a fresh empty list.
            project_name: Name handed to LogBase together with the "proxy" tag.
            kwargs:
                data_list:    When present, stored in self._datas and the
                              method switches from GET to POST.
                need_cookies: When present (its value is not inspected),
                              self._need_cookies becomes True.
        '''
        ProxiesHeaders.__init__(self)
        LogBase.__init__(self, project_name, "proxy")
        # A literal `[]` default is created once and shared by every instance
        # that omits `urls`; build a new list per instance instead.
        self._urls = [] if urls is None else urls
        self._method = 'GET'
        self._need_cookies = False

        if "data_list" in kwargs:
            self._datas = kwargs['data_list']
            self._method = 'POST'
        if "need_cookies" in kwargs:
            self._need_cookies = True
            self._resp_cookies = None

        # auth_with_time is read once and unpacked as (auth, timestamp).
        # NOTE(review): presumably provided by a base class - confirm.
        self.__auth_with_time = self.auth_with_time
        self.__proxy_auth = self.__auth_with_time[0]
        self.__timestamp = self.__auth_with_time[1]
        # NOTE(review): self._conf is not set in this method; presumably it is
        # populated by one of the base-class initialisers above - confirm.
        self._proxy = {
            "http": "http://%s" % self._conf["ip_port"],
            "https": "https://%s" % self._conf["ip_port"]
        }
        self._headers = {"Proxy-Authorization": self.__proxy_auth}
        self._cookies = None
        self._single_content = None
        self._content = list()
        self._content_dict = dict()
Ejemplo n.º 2
0
 def __init__(self, project_name, list_res_iter, crawler_conf, rds, rds_key):
     '''Register the "ParserList" logger and keep every argument on self.
     params:
         project_name:  Name handed to LogBase.
         list_res_iter: Stored as-is on the instance.
         crawler_conf:  Stored as-is on the instance.
         rds:           Stored as-is on the instance.
         rds_key:       Stored as-is on the instance.
     '''
     LogBase.__init__(self, project_name, "ParserList")

     # Plain attribute copies; nothing is validated or transformed here.
     self.project_name = project_name
     self.list_res_iter = list_res_iter
     self.crawler_conf = crawler_conf
     self.rds = rds
     self.rds_key = rds_key
Ejemplo n.º 3
0
    def __init__(self, db, project_name="sample"):
        '''Open a pooled redis connection for the given database.
        params:
            db:           Database value forwarded to ConnectionPool.
            project_name: Name handed to LogBase together with the "redis" tag.
        '''
        LogBase.__init__(self, project_name, "redis")

        # Server address comes from the module-level REDIS_CFG mapping.
        redis_host = REDIS_CFG['host']
        redis_port = REDIS_CFG['port']
        self.db = db

        # Build the pool first, then bind a client to it.
        self.__pool__ = ConnectionPool(
            host=redis_host, port=redis_port, db=self.db)
        self._redis_conn = Redis(connection_pool=self.__pool__)

        self.info(
            "Connect to redis-server SUCCEED.",
            host=redis_host,
            port=redis_port,
            db=self.db,
        )
Ejemplo n.º 4
0
    def __init__(self, etl_name):
        '''Wire up logging, config, redis iterator and database for one ETL job.
        params:
            etl_name: ETL name; the project name is derived as "etl_<etl_name>".
        '''
        name = "etl_%s" % etl_name
        self.project_name = name
        LogBase.__init__(self, name, "main")

        self.etl_name = etl_name
        self.etl_conf = ETLConfigReader.etl_config(etl_name)

        # The redis database to scan is taken from the ETL's system config.
        redis_db = self.etl_conf["sys_conf"]["redis_db"]
        self.rds_data_iter = RedisScanner.rds_data_iter(redis_db, name)

        # Target table name and its column names.
        self.community_tbname = "community_info"
        self.community_tbkeys = [
            'community_id',
            'source_from',
            'source_name',
            'community_name',
            'lat',
            'lng',
            'cw_district',
            'cw_busi',
            'cw_detail',
            'bd_province',
            'bd_city',
            'bd_district',
            'bd_busi',
            'bd_street',
            'bd_detail',
            'bd_adcode',
        ]
        self.community_dict = {}
        self.db = DBOpter(name)
Ejemplo n.º 5
0
    def __init__(self, project_name="sample_project"):
        '''Register the "database" logger and open a MySQL connection.
        params:
            project_name: Name handed to LogBase.
        raises:
            Whatever pymysql.connect raises when the connection cannot be
            established; the failure is logged first and then re-raised.
        '''
        # Register Log service.
        logger_name = "database"
        LogBase.__init__(self, project_name, logger_name)

        # Try to connect to database.
        # NOTE(review): **MYSQL_CFG forwards every config entry, including
        # "passwd", to the logger in both log calls below - confirm that the
        # log sink is allowed to receive it.
        try:
            self._conn = pymysql.connect(
                host=MYSQL_CFG["host"], port=int(MYSQL_CFG["port"]),
                user=MYSQL_CFG["user"], passwd=MYSQL_CFG["passwd"],
                db=MYSQL_CFG["db"],     charset='utf8'
            )
        except Exception:
            self.err("Connect to database FAILED.", **MYSQL_CFG)
            # Without re-raising, execution would continue to
            # self._conn.cursor() and die with an unrelated AttributeError
            # that hides the real connection error.
            raise

        self.cur = self._conn.cursor(cursor=pymysql.cursors.DictCursor)
        self.IntegrityError = IntegrityError

        self.info("Connect to database SUCCEED.", **MYSQL_CFG)
Ejemplo n.º 6
0
    def __init__(self, crawler_name):
        '''Initialise the crawler entry object.
        Sets empty containers for the request order and the crawler
        config, registers the "main" logger and triggers loading.
        params:
            crawler_name: Crawler's name; the project name is derived from
            it as "cw_<crawler_name>".
        '''
        # Logging identity first, so the name is available to LogBase.
        self.project_name = "cw_%s"%crawler_name

        # Instance state, empty until loading fills it.
        self.crawler_name = crawler_name
        self.req_order = []
        self.crawler_conf = {}

        LogBase.__init__(self, self.project_name, "main")

        # Bare attribute access, not a call.
        # NOTE(review): presumably __load__ is a property whose getter does
        # the actual loading - confirm against the class definition.
        self.__load__
Ejemplo n.º 7
0
    def __init__(self, msg="", subject="", recvers=None):
        '''Build the mail message and prepare the SMTP session.
        params:
            msg:     HTML content of the mail body.
            subject: Subject line.
            recvers: Receivers as a list or a comma-separated string;
                     None falls back to EMAIL_CFG['recver'].
        '''
        # Normalise the receivers into a list.
        if recvers is None:
            recvers = EMAIL_CFG['recver'].split(',')
        elif not isinstance(recvers, list):
            recvers = recvers.split(',')
        self.recvers = recvers

        self.logger = LogBase('email', "email_sender")
        # Bare attribute access, not a call.
        # NOTE(review): presumably a property that opens the SMTP session -
        # confirm against the class definition.
        self.__smtp_predo__

        # Multipart container with the HTML body as its first part.
        body = MIMEText(msg, 'html', 'utf-8')
        self.msg = MIMEMultipart()
        self.msg.attach(body)

        sender_pair = [EMAIL_CFG['sender_name'], EMAIL_CFG['sender']]
        self.msg['From'] = formataddr(sender_pair)
        self.msg['To'] = formataddr(["", ",".join(self.recvers)])
        self.msg['Subject'] = Header(subject, 'utf-8')
Ejemplo n.º 8
0
def ziroom_extra(project_name, rid, rtn_data):
    '''ziroom_extra
    Ziroom extra post-processing, applied to rtn_data in place.

    Three independent best-effort steps; a failing step is logged at
    debug level and skipped, never raised:
      1. house code: when the house has more than one room, write a
         "house_id" entry to redis for each room id derived from rid.
      2. price: replace rtn_data["price"] with the OCR result.
      3. payment: replace every non-"period" value in
         rtn_data["paymentlist"] with the OCR result.

    params:
        project_name: Name used for logging and handed to the helpers.
        rid:          Numeric id of the current room.
        rtn_data:     Dict with the parsed room data.
    returns:
        The same rtn_data object.
    '''
    logger = LogBase(project_name, "ziroom_extra")
    logger.debug("Before Extra =>", data=rtn_data)

    # Extra func for house code.
    try:
        # Number after the first "_" in the house code.
        end = int(rtn_data['house_code'].split('_')[1])
        room_num = int(findall(r"([0-9])室[0-9]厅", rtn_data['house_type'])[0])
    except Exception as e:
        logger.debug("Skip house code extra.", err=e)
    else:
        if room_num > 1:
            rds = RedisController(
                int(conf_kv_func("ziroom.sys_config", all=True)['redis_db']),
                project_name)
            for idx in range(1, room_num + 1):
                rds.__update_dict_to_redis__(
                    rid - end + idx, {"house_id": str(rid - end + idx)})

    # Extra func for price.
    # price_dict is handed to every OCR call and returned by it, so it is
    # shared between the price step and the payment step.
    price_dict = dict()
    try:
        price, price_dict = get_price_from_png(rtn_data["price"], price_dict,
                                               project_name)
        rtn_data["price"] = price
    except Exception as e:
        logger.debug("Skip price extra.", err=e)

    # Extra func for payment.
    try:
        payment_rtn_list = list()
        for payment in rtn_data["paymentlist"]:
            payment_rtn = dict()
            for k, v in payment.items():
                if k == "period":
                    payment_rtn["period"] = v
                else:
                    payment_rtn[k], price_dict = get_price_from_png(
                        v, price_dict, project_name)

            payment_rtn_list.append(payment_rtn)

        # Swap in the converted list only once all entries are done, so a
        # failure midway leaves the original list untouched rather than a
        # partially converted one.
        rtn_data["paymentlist"] = payment_rtn_list
    except Exception as e:
        logger.debug("Skip payment extra.", err=e)

    logger.debug("After Extra =>", data=rtn_data)

    return rtn_data
Ejemplo n.º 9
0
def get_price_from_png(price_object, price_dict, project_name):
    '''get_price_from_png
    Get price info from png files by using tesseract OCR.

    The png is downloaded, saved under "_output/", composited onto a
    white background, pasted onto a white canvas 1.2x its size and passed
    to OCR. The characters at the positions listed in price_object[2] are
    then picked out of the OCR text to form the price.

    params:
        price_object: Sequence; [0] is the image url without the scheme
                      ("http:" is prepended), [2] is an iterable of indexes
                      into the OCR text.
        price_dict:   Cache mapping image url -> OCR text. Updated in place
                      on success.
        project_name: Name used for logging and for the download request.
    returns:
        (price, price_dict). price is "" when any step fails; the failure
        is logged as a warning and never raised.
    '''
    logger = LogBase(project_name, "ziroom_ocr")

    try:
        url = "http:{}".format(price_object[0])

        if url in price_dict:
            t = price_dict[url]
        else:
            # NOTE(review): the class "A-z" also matches the punctuation
            # between "Z" and "a", and "." matches any character - confirm
            # whether "A-Z" and an escaped dot were intended.
            img_path = "_output/{}".format(
                findall(r"/([0-9a-zA-z]+.png)", url)[0])
            req = ProxiesRequests([url], project_name)
            ctn = req.req_content_list[0][0]

            with open(img_path, "wb") as fh:
                fh.write(ctn)

            # Close the source image once it has been composited; it was
            # previously left open.
            with Image.open(img_path) as img:
                bg = Image.new("RGBA", img.size, "white")
                merged_pic = Image.new("RGBA",
                                       tuple(int(s * 1.2) for s in img.size),
                                       "white")
                mg = Image.alpha_composite(bg, img)

            merged_pic.paste(mg)
            t = image_to_string(merged_pic)

        price = "".join(t[idx] for idx in price_object[2])

        # Cache only after the price was assembled successfully.
        price_dict[url] = t

        logger.debug("OCR price =>",
                     price=price,
                     price_dict=price_dict)

        return price, price_dict

    except Exception as e:
        logger.warn("OCR failed.", err=e)
        return "", price_dict
Ejemplo n.º 10
0
 def __init__(self, rds, crawler_conf, project_name="sample"):
     '''Register the "ReqDetail" logger and keep the arguments on self.
     params:
         rds:          Stored as-is on the instance.
         crawler_conf: Mapping; its 'sys_conf' entry is also kept as self.sys.
         project_name: Name handed to LogBase.
     '''
     LogBase.__init__(self, project_name, "ReqDetail")

     self.project_name = project_name
     self.rds = rds
     self.crawler_conf = crawler_conf
     # Shortcut to the system section of the crawler config.
     self.sys = crawler_conf['sys_conf']
Ejemplo n.º 11
0
 def __init__(self, crawler_conf, project_name="sample"):
     '''Register the "ReqList" logger and split the crawler config.
     params:
         crawler_conf: Mapping that provides the 'list_crawler',
                       'sys_conf' and 'compiles' entries.
         project_name: Name handed to LogBase.
     '''
     LogBase.__init__(self, project_name, "ReqList")

     self.project_name = project_name
     # Keep each config section under its own attribute.
     self.crawler = crawler_conf['list_crawler']
     self.sys = crawler_conf['sys_conf']
     self.compiles = crawler_conf['compiles']
Ejemplo n.º 12
0
class Sender():
    '''Compose an HTML mail, optionally with attachments, and send it
    through the SMTP server described by EMAIL_CFG.

    The SMTP session is opened and logged in during construction and
    closed at the end of send().
    '''

    def __init__(self, msg="", subject="", recvers=None):
        '''Sender
        params:
            msg:     Content (HTML)
            subject: Subject
            recvers: Receivers as a list or a comma-separated string;
                     None falls back to EMAIL_CFG['recver'].
        '''
        if recvers is None:
            self.recvers = EMAIL_CFG['recver'].split(',')
        else:
            if isinstance(recvers, list):
                self.recvers = recvers
            else:
                self.recvers = recvers.split(',')
        self.logger = LogBase('email', "email_sender")
        # Property access: connects and logs in as a side effect.
        self.__smtp_predo__

        self.msg = MIMEMultipart()
        self.msg.attach(MIMEText(msg, 'html', 'utf-8'))

        self.msg['From'] = formataddr([EMAIL_CFG['sender_name'], EMAIL_CFG['sender']])
        self.msg['To'] = formataddr(["", ",".join(self.recvers)])
        self.msg['Subject'] = Header(subject, 'utf-8')

    def add_attachment(self, filename, filepath):
        '''add_attachment
        params:
            filename: display name
            filepath: attachment path
        '''
        # Read through a context manager so the file handle is closed
        # right away instead of being left to the garbage collector.
        with open(filepath, "rb") as fh:
            content = fh.read()
        attach = MIMEText(content, 'base64', 'utf-8')
        attach["Content-Type"] = 'application/octet-stream'
        attach["Content-Disposition"] = 'attachment; filename="{}"'.format(filename)
        self.msg.attach(attach)

    def send(self):
        '''Send the composed message to all receivers, then close the
        SMTP session. A failed send is logged, not raised.
        '''
        try:
            self.__smtp.sendmail(EMAIL_CFG['sender'], self.recvers, self.msg.as_string())
        except Exception:
            self.logger.error("Send mail FAILED!")
        else:
            self.logger.info("Send mail success!", recver=self.recvers)
        finally:
            # Property access: quits the SMTP session.
            self.__smtp_aftdo__

    @property
    def __smtp_predo__(self):
        '''Connect to the SMTP server over SSL and log in as the sender.
        Both steps log their outcome; failures are not raised.
        '''
        # Connect to smtp server.
        try:
            self.__smtp = smtplib.SMTP_SSL()
            self.__smtp.connect(EMAIL_CFG["smtp_server"], EMAIL_CFG["smtp_port"])
            self.__smtp.ehlo()
        except Exception:
            self.logger.error('Connect to SMTP server FAILED.', server=EMAIL_CFG["smtp_server"])
        else:
            self.logger.info('Connect to SMTP server succeed.', server=EMAIL_CFG["smtp_server"])

        # Login sender email.
        try:
            self.__smtp.login(user=EMAIL_CFG['sender'], password=EMAIL_CFG['sender_pwd'])
        except Exception:
            self.logger.error('Login FAILED.', user=EMAIL_CFG['sender'])
        else:
            # `else`, not `finally`: success must only be reported when
            # login() did not raise.
            self.logger.info('Login succeed.', user=EMAIL_CFG['sender'])

    @property
    def __smtp_aftdo__(self):
        '''Close the SMTP session.'''
        self.__smtp.quit()