def __init__(self, urls=None, project_name="sample", **kwargs):
    '''Prepare a proxied request job.

    params:
        urls: URLs to request, stored as ``self._urls``. When omitted, a
            fresh empty list is created for this instance (a shared
            ``[]`` default would leak state between instances).
        project_name: Passed to ``LogBase.__init__`` together with the
            logger name "proxy".
        data_list (keyword, optional): Stored as ``self._datas``; its
            presence switches the method from 'GET' to 'POST'.
        need_cookies (keyword, optional): Truthy value enables
            ``self._need_cookies``; a falsy value leaves it disabled.
    '''
    ProxiesHeaders.__init__(self)
    LogBase.__init__(self, project_name, "proxy")

    # A new list per instance; never share a default list between objects.
    self._urls = [] if urls is None else urls

    self._method = 'GET'
    if "data_list" in kwargs:
        self._datas = kwargs['data_list']
        self._method = 'POST'

    # Honour the value that was passed, not merely the key's presence, so
    # that need_cookies=False really disables cookie handling.
    self._need_cookies = bool(kwargs.get("need_cookies", False))
    self._resp_cookies = None

    # ``auth_with_time`` is read once and indexed as (auth, timestamp).
    # NOTE(review): presumably a property provided by ProxiesHeaders -
    # confirm against that class.
    self.__auth_with_time = self.auth_with_time
    self.__proxy_auth = self.__auth_with_time[0]
    self.__timestamp = self.__auth_with_time[1]

    # NOTE(review): ``self._conf`` is not set in this method; presumably
    # ProxiesHeaders.__init__ provides it - confirm.
    self._proxy = {
        "http": "http://%s" % self._conf["ip_port"],
        "https": "https://%s" % self._conf["ip_port"]
    }
    self._headers = {"Proxy-Authorization": self.__proxy_auth}

    self._cookies = None
    self._single_content = None
    self._content = list()
    self._content_dict = dict()
def __init__(self, project_name, list_res_iter, crawler_conf, rds, rds_key):
    '''Store the list-parser inputs and register the "ParserList" logger.

    params:
        project_name: Passed to LogBase.__init__ and kept on the instance.
        list_res_iter: Kept as ``self.list_res_iter``.
        crawler_conf: Kept as ``self.crawler_conf``.
        rds: Kept as ``self.rds``.
        rds_key: Kept as ``self.rds_key``.
    '''
    LogBase.__init__(self, project_name, "ParserList")

    # Plain attribute hand-off; nothing is validated or copied here.
    self.project_name = project_name
    self.list_res_iter, self.crawler_conf = list_res_iter, crawler_conf
    self.rds, self.rds_key = rds, rds_key
def __init__(self, db, project_name="sample"):
    '''Build a Redis client backed by a connection pool for database ``db``.

    params:
        db: Redis database selector, kept as ``self.db`` and handed to the
            connection pool.
        project_name: Passed to LogBase.__init__ with logger name "redis".
    '''
    LogBase.__init__(self, project_name, "redis")

    # Server address comes from the module-level REDIS_CFG mapping.
    redis_host = REDIS_CFG['host']
    redis_port = REDIS_CFG['port']
    self.db = db

    self.__pool__ = ConnectionPool(host=redis_host, port=redis_port, db=self.db)
    self._redis_conn = Redis(connection_pool=self.__pool__)

    # NOTE(review): no command is issued in this method, so this message may
    # not prove the server is reachable - confirm against the redis client.
    self.info("Connect to redis-server SUCCEED.",
              host=redis_host, port=redis_port, db=self.db)
def __init__(self, etl_name):
    '''Set up an ETL run: logging, config, Redis data iterator and database.

    params:
        etl_name: Name of the ETL job. It is used to build the project
            name ("etl_<etl_name>") and to load the ETL configuration.
    '''
    project_name = "etl_%s" % etl_name
    self.project_name = project_name
    LogBase.__init__(self, project_name, "main")

    self.etl_name = etl_name
    self.etl_conf = ETLConfigReader.etl_config(etl_name)

    # The Redis database to scan is taken from the ETL's own sys_conf.
    redis_db = self.etl_conf["sys_conf"]["redis_db"]
    self.rds_data_iter = RedisScanner.rds_data_iter(redis_db, project_name)

    # Table name and column order for community rows, assembled in groups.
    self.community_tbname = "community_info"
    identity_keys = ['community_id', 'source_from', 'source_name',
                     'community_name']
    coordinate_keys = ['lat', 'lng']
    cw_keys = ['cw_district', 'cw_busi', 'cw_detail']
    bd_keys = ['bd_province', 'bd_city', 'bd_district', 'bd_busi',
               'bd_street', 'bd_detail', 'bd_adcode']
    self.community_tbkeys = identity_keys + coordinate_keys + cw_keys + bd_keys

    self.community_dict = {}
    self.db = DBOpter(project_name)
def __init__(self, project_name="sample_project"):
    '''Open the MySQL connection described by MYSQL_CFG and create a cursor.

    params:
        project_name: Passed to LogBase.__init__ with logger name
            "database".

    raises:
        Whatever ``pymysql.connect`` raises. The failure is logged first and
        then re-raised, so the caller sees the real connection error
        instead of an AttributeError on the missing ``self._conn``.
    '''
    # Register Log service.
    logger_name = "database"
    LogBase.__init__(self, project_name, logger_name)

    # Connection details for log records, without the password so that the
    # credential never reaches the log output.
    log_cfg = {key: MYSQL_CFG[key] for key in MYSQL_CFG.keys()
               if key != "passwd"}

    # try to connect to database.
    try:
        self._conn = pymysql.connect(
            host=MYSQL_CFG["host"],
            port=int(MYSQL_CFG["port"]),
            user=MYSQL_CFG["user"],
            passwd=MYSQL_CFG["passwd"],
            db=MYSQL_CFG["db"],
            charset='utf8'
        )
    except Exception:
        self.err("Connect to database FAILED.", **log_cfg)
        # Without a connection nothing below can work; surface the cause.
        raise

    # Rows are returned as dicts keyed by column name.
    self.cur = self._conn.cursor(cursor=pymysql.cursors.DictCursor)
    # Exposed on the instance so callers can catch it via the object.
    self.IntegrityError = IntegrityError
    self.info("Connect to database SUCCEED.", **log_cfg)
def __init__(self, crawler_name):
    '''Initialise the crawler entry object for ``crawler_name``.

    Starts with an empty request order and an empty crawler config,
    registers the "main" logger under the project name "cw_<crawler_name>",
    then triggers ``__load__``.

    params:
        crawler_name: Crawler's name; per the original note it selects the
            config files in the ./config folder.
    '''
    # Logging identity is derived from the crawler name.
    self.project_name = "cw_%s" % crawler_name
    self.crawler_name = crawler_name

    # Containers start empty.
    self.crawler_conf = {}
    self.req_order = []

    LogBase.__init__(self, self.project_name, "main")

    # Bare attribute access, no call.
    # NOTE(review): presumably ``__load__`` is a property whose getter does
    # the loading - confirm against its definition.
    self.__load__
def __init__(self, msg="", subject="", recvers=None):
    '''Build an HTML mail message and prepare the SMTP session.

    params:
        msg: Content, attached as an HTML part encoded as utf-8.
        subject: Subject line.
        recvers: Recipients, either a list or a comma-separated string.
            When None, the comma-separated EMAIL_CFG['recver'] value is
            used.
    '''
    # Normalise recipients to a list.
    if recvers is None:
        self.recvers = EMAIL_CFG['recver'].split(',')
    elif isinstance(recvers, list):
        self.recvers = recvers
    else:
        self.recvers = recvers.split(',')

    self.logger = LogBase('email', "email_sender")
    # Bare attribute access: ``__smtp_predo__`` is evaluated for its effect.
    self.__smtp_predo__

    # Assemble the message: body first, then the headers.
    body_part = MIMEText(msg, 'html', 'utf-8')
    self.msg = MIMEMultipart()
    self.msg.attach(body_part)

    from_header = formataddr([EMAIL_CFG['sender_name'], EMAIL_CFG['sender']])
    self.msg['From'] = from_header
    to_header = formataddr(["", ",".join(self.recvers)])
    self.msg['To'] = to_header
    self.msg['Subject'] = Header(subject, 'utf-8')
def ziroom_extra(project_name, rid, rtn_data):
    '''ziroom_extra

    Ziroom Extra func. Post-processes one parsed record in three
    independent, best-effort steps; a failing step is logged at debug level
    and leaves its part of ``rtn_data`` untouched.

    1. House code: when 'house_type' reports more than one room, a
       {"house_id": ...} entry is written to Redis for each room id
       derived from ``rid`` and the numeric suffix of 'house_code'.
    2. Price: 'price' is replaced by the value read from its image.
    3. Payment: every non-"period" field of each 'paymentlist' entry is
       replaced by the value read from its image.

    params:
        project_name: Used for logging and passed on to the helpers.
        rid: Numeric id of the current record.
        rtn_data: Dict of parsed fields; it is updated in place.

    returns:
        The same ``rtn_data`` dict.
    '''
    logger = LogBase(project_name, "ziroom_extra")
    logger.debug("Before Extra =>", data=rtn_data)

    # Extra func for house code.
    try:
        end = int(rtn_data['house_code'].split('_')[1])
        room_num = int(findall(r"([0-9])室[0-9]厅", rtn_data['house_type'])[0])
    except Exception as e:
        # Missing or unparsable fields: nothing to register.
        logger.debug("House code extra skipped.", err=e)
    else:
        if room_num > 1:
            rds = RedisController(
                int(conf_kv_func("ziroom.sys_config", all=True)['redis_db']),
                project_name)
            for idx in range(1, room_num + 1):
                house_id = rid - end + idx
                rds.__update_dict_to_redis__(
                    house_id, {"house_id": str(house_id)})

    # OCR results are shared between the price and payment steps, so the
    # cache is created up front and survives a failure of the price step.
    price_dict = dict()

    # Extra func for price.
    try:
        price, price_dict = get_price_from_png(rtn_data["price"],
                                               price_dict, project_name)
        rtn_data["price"] = price
    except Exception as e:
        logger.debug("Price extra skipped.", err=e)

    # Extra func for payment.
    try:
        payment_rtn_list = list()
        for payment in rtn_data["paymentlist"]:
            payment_rtn = dict()
            for k, v in payment.items():
                if k == "period":
                    payment_rtn["period"] = v
                else:
                    payment_rtn[k], price_dict = get_price_from_png(
                        v, price_dict, project_name)
            payment_rtn_list.append(payment_rtn)
        # Only replace the list once every entry has been converted.
        rtn_data["paymentlist"] = payment_rtn_list
    except Exception as e:
        logger.debug("Payment extra skipped.", err=e)

    logger.debug("After Extra =>", data=rtn_data)
    return rtn_data
def get_price_from_png(price_object, price_dict, project_name):
    '''get_price_from_png

    Get price info from png files by using tesseract OCR.

    params:
        price_object: Indexable; item 0 is the image address without the
            "http:" prefix, item 2 is an iterable of positions into the
            OCR text that spell out the price.
        price_dict: Cache mapping image URL to its OCR text; it is updated
            in place.
        project_name: Used for logging and for the image download.

    returns:
        (price, price_dict), where price is the assembled string, or ""
        when anything fails.
    '''
    logger = LogBase(project_name, "ziroom_ocr")
    try:
        url = "http:{}".format(price_object[0])
        if url in price_dict:
            # This image has been recognised before; reuse its text.
            ocr_text = price_dict[url]
        else:
            # NOTE(review): in this pattern "A-z" also spans the ASCII
            # punctuation between 'Z' and 'a', and the "." is unescaped.
            # Left as is - confirm whether that is intended.
            file_name = findall(r"/([0-9a-zA-z]+.png)", url)[0]
            img_path = "_output/{}".format(file_name)

            # Download the image and store it on disk.
            req = ProxiesRequests([url], project_name)
            ctn = req.req_content_list[0][0]
            with open(img_path, "wb") as out_file:
                out_file.write(ctn)

            # Put the image on a white background inside a canvas 1.2x
            # its size, then run OCR on the result.
            picture = Image.open(img_path)
            white_bg = Image.new("RGBA", picture.size, "white")
            canvas_size = tuple(int(s * 1.2) for s in picture.size)
            merged_pic = Image.new("RGBA", canvas_size, "white")
            merged_pic.paste(Image.alpha_composite(white_bg, picture))
            ocr_text = image_to_string(merged_pic)

        # Pick the characters at the requested positions, in order.
        price = "".join(ocr_text[pos] for pos in price_object[2])
        price_dict[url] = ocr_text
        logger.debug("OCR price =>", price=price, price_dict=price_dict)
        return price, price_dict
    except Exception as exc:
        logger.warn("OCR failed.", err=exc)
        return "", price_dict
def __init__(self, rds, crawler_conf, project_name="sample"):
    '''Store the detail-request inputs and register the "ReqDetail" logger.

    params:
        rds: Kept as ``self.rds``.
        crawler_conf: Crawler configuration mapping; its 'sys_conf' entry
            is also exposed as ``self.sys``.
        project_name: Passed to LogBase.__init__ and kept on the instance.
    '''
    LogBase.__init__(self, project_name, "ReqDetail")

    self.project_name, self.rds = project_name, rds
    self.crawler_conf = crawler_conf
    # Shortcut to the system section of the configuration.
    self.sys = self.crawler_conf['sys_conf']
def __init__(self, crawler_conf, project_name="sample"):
    '''Split the crawler configuration and register the "ReqList" logger.

    params:
        crawler_conf: Mapping that must provide the keys 'list_crawler',
            'sys_conf' and 'compiles'.
        project_name: Passed to LogBase.__init__ and kept on the instance.
    '''
    LogBase.__init__(self, project_name, "ReqList")
    self.project_name = project_name

    # Pull the three sections out first, then publish them as attributes.
    list_section = crawler_conf['list_crawler']
    sys_section = crawler_conf['sys_conf']
    compiles_section = crawler_conf['compiles']

    self.crawler = list_section
    self.sys = sys_section
    self.compiles = compiles_section
class Sender():
    '''Compose an HTML e-mail, optionally add attachments, and send it.

    The SMTP session is opened and logged in while the object is built and
    is closed again at the end of ``send``. Server, account and default
    recipients come from the module-level EMAIL_CFG mapping.
    '''

    def __init__(self, msg="", subject="", recvers=None):
        '''Sender

        params:
            msg: Content, attached as an HTML part encoded as utf-8.
            subject: Subject
            recvers: Recipients, either a list or a comma-separated
                string. When None, EMAIL_CFG['recver'] (comma-separated)
                is used.
        '''
        if recvers is None:
            self.recvers = EMAIL_CFG['recver'].split(',')
        elif isinstance(recvers, list):
            self.recvers = recvers
        else:
            self.recvers = recvers.split(',')

        self.logger = LogBase('email', "email_sender")
        # Property access used for its side effect: connect and log in.
        self.__smtp_predo__

        self.msg = MIMEMultipart()
        self.msg.attach(MIMEText(msg, 'html', 'utf-8'))
        self.msg['From'] = formataddr([EMAIL_CFG['sender_name'],
                                       EMAIL_CFG['sender']])
        self.msg['To'] = formataddr(["", ",".join(self.recvers)])
        self.msg['Subject'] = Header(subject, 'utf-8')

    def add_attachment(self, filename, filepath):
        '''add_attachment

        params:
            filename: display name
            filepath: attachment path
        '''
        # Read through a context manager so the file handle is always
        # closed, even when reading fails.
        with open(filepath, "rb") as attachment_file:
            payload = attachment_file.read()

        attach = MIMEText(payload, 'base64', 'utf-8')
        attach["Content-Type"] = 'application/octet-stream'
        attach["Content-Disposition"] = 'attachment; filename="{}"'.format(filename)
        self.msg.attach(attach)

    def send(self):
        '''Send the message to ``self.recvers`` and close the SMTP session.

        Failures are logged, not raised.
        '''
        # NOTE(review): this class calls ``logger.error`` while other code
        # in this project calls ``err``/``warn`` on LogBase - confirm that
        # LogBase provides ``error``.
        try:
            self.__smtp.sendmail(EMAIL_CFG['sender'], self.recvers,
                                 self.msg.as_string())
        except Exception:
            self.logger.error("Send mail FAILED!")
        else:
            self.logger.info("Send mail success!", recver=self.recvers)
        finally:
            self.__smtp_aftdo__

    @property
    def __smtp_predo__(self):
        '''Connect to the SMTP server over SSL and log in as the sender.'''
        # Connect to smtp server.
        try:
            self.__smtp = smtplib.SMTP_SSL()
            self.__smtp.connect(EMAIL_CFG["smtp_server"], EMAIL_CFG["smtp_port"])
            self.__smtp.ehlo()
        except Exception:
            self.logger.error('Connect to SMTP server FAILED.',
                              server=EMAIL_CFG["smtp_server"])
        else:
            self.logger.info('Connect to SMTP server succeed.',
                             server=EMAIL_CFG["smtp_server"])

        # Login sender email.
        try:
            self.__smtp.login(user=EMAIL_CFG['sender'],
                              password=EMAIL_CFG['sender_pwd'])
        except Exception:
            self.logger.error('Login FAILED.', user=EMAIL_CFG['sender'])
        else:
            # Success is reported only when login() did not raise; a
            # ``finally`` here would also log it after a failed login.
            self.logger.info('Login succeed.', user=EMAIL_CFG['sender'])

    @property
    def __smtp_aftdo__(self):
        '''Close the SMTP session if there is one.'''
        try:
            self.__smtp.quit()
        except (AttributeError, smtplib.SMTPException):
            # The client was never created, never connected, or the server
            # already dropped the link: there is nothing left to close, and
            # cleanup must not raise out of ``send``.
            self.logger.info('SMTP session already closed.')