def init_db():
    """Build the MongoDB manager from the global config and register it.

    Reads the ``"db"`` section of the object stored in ``global_obj`` under
    ``"config"``, constructs a ``mongo.CMongodbManager`` with ``"jiucai"``
    as its first argument (presumably the database name -- confirm against
    the manager class) followed by address, port, user and password, stores
    the manager in ``global_obj`` under ``"dbobj"`` and finally calls
    ``init_db_index()``.
    """
    db_conf = global_obj.get("config")["db"]
    manager = mongo.CMongodbManager(
        "jiucai",
        db_conf["addr"],
        db_conf["port"],
        db_conf["user"],
        db_conf["password"],
    )
    global_obj.set("dbobj", manager)
    # The index helper reads "dbobj" back from global_obj, so it must run
    # only after the manager has been registered.
    init_db_index()
def init_task():
    """Create the task timer and register it under ``"task_timer"``.

    The optional ``"abort"`` entry of the global config is handed to
    ``CTaskTimer``; ``None`` is passed when the entry is absent.
    """
    conf = global_obj.get("config")
    abort_path = conf["abort"] if "abort" in conf else None
    global_obj.set("task_timer", CTaskTimer(abort_path))
def save_excel(data_list, collect=True):
    """Write crawled community data to an Excel file via ``excel_tool``.

    For every entry of ``data_list`` one sheet named ``<name>_<id>`` is
    built, holding one row per value of the entry's ``"house_data"``
    mapping. Missing house fields become empty strings; the first column
    is always filled with the community name of the entry.

    Args:
        data_list: iterable of dicts providing ``"name"``, ``"id"`` and
            ``"house_data"``.
        collect: when true, an additional summary sheet containing the
            rows of all communities is inserted as the first sheet.

    The output path is ``DATA_PATH`` concatenated with the ``"output"``
    value of the ``"beike"`` config section (default ``"结果"``).
    """
    import excel_tool

    columns = [
        "小区", "id", "价格", "均价", "建筑面积", "套内面积", "配备电梯",
        "挂牌时间", "房屋户型", "所在楼层", "户型结构", "建筑类型",
        "房屋朝向", "建筑结构", "装修情况", "梯户比例", "交易权属",
        "上次交易", "房屋用途", "房屋年限", "产权所属", "抵押信息",
        "房本备件"
    ]
    beike_conf = global_obj.get("config")["beike"]
    out_path = DATA_PATH + beike_conf.get("output", "结果")

    sheets = []
    summary_rows = []
    for community in data_list:
        rows = []
        for house in community["house_data"].values():
            row = []
            for column in columns:
                if column == "小区":
                    # The community name comes from the entry itself, not
                    # from the per-house record.
                    row.append(community["name"])
                else:
                    row.append(house.get(column, ""))
            rows.append(row)
            if collect:
                # Same list object is shared with the per-community sheet.
                summary_rows.append(row)
        sheet_name = "%s_%s" % (community["name"], community["id"])
        sheets.append(excel_tool.CSheetObject(sheet_name, columns, rows))

    if collect:
        summary_sheet = excel_tool.CSheetObject("汇总", columns, summary_rows)
        sheets.insert(0, summary_sheet)
    excel_tool.save_excel(out_path, sheets)
def init_mail():
    """Create the mail box from the ``"mail"`` config section and register it.

    The configured user is used both as the login passed to
    ``mail.CMailBox`` and as the sender; every entry of ``"to"`` is added
    through ``SetReceive``. The resulting object is stored in
    ``global_obj`` under ``"mail"``.
    """
    mail_conf = global_obj.get("config")["mail"]
    sender = mail_conf["user"]
    mailbox = mail.CMailBox(sender, mail_conf["password"], mail_conf["host"])
    mailbox.SetSender(sender)
    for recipient in mail_conf["to"]:
        mailbox.SetReceive(recipient)
    global_obj.set("mail", mailbox)
def main_task(config_file):
    """Initialise all subsystems, schedule the crawl and run the timer loop.

    Args:
        config_file: passed unchanged to ``main.init_base``.

    The initialisation order is significant: the later steps read objects
    that the earlier ones store in ``global_obj``.
    """
    # Base setup first; the following initialisers read the global config.
    main.init_base(config_file)
    beike_db.init_db()
    init_mail()
    init_task()
    spider_beike.init()
    log.Sys("初始化完成")

    # Tasks can only be added once init_task() has registered the timer.
    add_task()
    global_obj.get("task_timer").RunForever()
def add_task():
    """Register the ``spider_beike`` crawl task with the global task timer.

    The task is built with a ``CTimeTrigger.TDay`` trigger at
    ``"21:00:00"`` and ``run_type=CTask.TForever`` (presumably: repeat
    every day at 21:00 -- confirm against the timer implementation).
    """
    def start_task(tobj):
        # The callback receives one argument from the timer, which the
        # crawl itself does not use.
        spider_beike.beike_task()

    timer = global_obj.get("task_timer")
    daily_trigger = CTimeTrigger(CTimeTrigger.TDay, "21:00:00")
    crawl_task = CTask(
        "spider_beike",
        daily_trigger,
        start_task,
        run_type=CTask.TForever,
    )
    timer.AddTask(crawl_task)
def start_community():
    """Crawl the configured communities and their houses using threads.

    Works in two passes, both dispatched through
    ``thread_tool.start_thread`` (third argument 5 and 10 respectively,
    presumably the worker count -- confirm):

    1. For every (city, community, filter) combination derived from the
       ``"spider_list"`` config, ``get_community_info`` is called. Its
       result values are collected, and every URL in their
       ``"house_url_list"`` becomes a job for the second pass. The
       ``"house_url_list"`` key is removed from each result afterwards.
    2. ``get_house_info`` is called for every collected
       ``(url, house_data)`` pair.

    The module-level ``g_count`` is reset to 0 between the two passes.

    Returns:
        list: the values of all ``get_community_info`` results.
    """
    global g_count

    beike_conf = global_obj.get("config")["beike"]

    # Expand the configuration into one job tuple per community.
    community_jobs = []
    for entry in beike_conf["spider_list"]:
        city = entry["city"]
        if "all" in entry:
            names = get_all_community(city)
        else:
            names = entry["community"]
        keyword = entry["filter"] if "filter" in entry else None
        community_jobs.extend([(city, name, keyword) for name in names])

    house_jobs = []
    communities = []

    def _get_community_info(threadobj, city, name, keyword):
        found = get_community_info(city, name, keyword)
        communities.extend(found.values())
        for info in found.values():
            for url in info["house_url_list"]:
                # house_data is shared by reference with the returned
                # community dict and handed to get_house_info per URL.
                house_jobs.append((url, info["house_data"]))
            del info["house_url_list"]

    thread_tool.start_thread(_get_community_info, community_jobs, 5)
    log.Info("爬取小区信息完毕", len(community_jobs), len(house_jobs))

    g_count = 0

    def _get_house_info(tobj, url, house_data):
        get_house_info(url, house_data)

    log.Info("开始爬取所有信息", len(house_jobs))
    thread_tool.start_thread(_get_house_info, house_jobs, 10)
    log.Info("爬取所有信息完成")
    return communities
def send_diff_mail(diff_list):
    """Render the detected listing changes as HTML and mail them.

    Args:
        diff_list: sequence of dicts with the keys ``"name"``, ``"new"``,
            ``"del"`` and ``"diff"``. ``"diff"`` holds indexable pairs
            whose first item is the new house record and whose second
            item is the old one.

    Nothing is sent when ``diff_list`` is empty; that case is only logged.
    """
    if len(diff_list) == 0:
        log.Info("no beike diff")
        return

    page = html.CHtml("房奴调研:")

    def set_dff_house(new, old):
        # Two-row table: new values on top (changed cells in red), old
        # values below, with the new record's keys as the header.
        columns = new.keys()
        new_row = []
        old_row = []
        for key in columns:
            new_text = str(new.get(key, "NULL"))
            old_text = str(old.get(key, "NULL"))
            if new_text != old_text:
                new_row.append(page.Font(new_text, "red"))
            else:
                new_row.append(new_text)
            old_row.append(old_text)
        page.AddTable([new_row, old_row], columns)

    # Sections that are rendered directly from a dict, in output order.
    dict_sections = (
        ("new", "新增房源:"),
        ("del", "有房源被删除:"),
    )

    for entry in diff_list:
        page.AddLine("=" * 30)
        page.AddLine("小区<%s>信息发生变化" % (entry["name"]))
        for key, title in dict_sections:
            if len(entry[key]) > 0:
                page.AddLine(title)
                page.AddDict2Table(entry[key])
        if len(entry["diff"]) > 0:
            page.AddLine("房源信息发生变化:")
            for pair in entry["diff"]:
                new_house = pair[0]
                old_house = pair[1]
                page.AddLine("-" * 30)
                set_dff_house(new_house, old_house)
                page.AddLine("+" * 30)
        page.AddLine("*" * 30)

    body = page.GetHtml()
    message = global_obj.get("mail").HtmlMailMessage()
    if message.SendMessage("房奴调研", body):
        log.Info("send beike mail done")
def log_obj():
    """Return the object registered in ``global_obj`` under ``"logger"``."""
    logger = global_obj.get("logger")
    return logger
def save_xiaoqu(cid, data):
    """Store ``data`` for the community ``cid``, inserting it if missing.

    Args:
        cid: value matched against the ``"id"`` field of stored documents.
        data: document passed as the update argument.
    """
    selector = {"id": cid}
    collection = global_obj.get("dbobj").Collection(colname)
    # NOTE(review): if this is a PyMongo collection, ``update`` is the
    # legacy API (removed in PyMongo 4); ``replace_one`` would be the
    # successor -- confirm the driver version before changing it.
    collection.update(selector, data, upsert=True)
def load_xiaoqu(cid):
    """Return the stored document whose ``"id"`` equals ``cid``.

    The ``_id`` field is excluded through the projection. The result of
    ``find_one`` is returned as-is.
    """
    selector = {"id": cid}
    projection = {"_id": 0}
    collection = global_obj.get("dbobj").Collection(colname)
    return collection.find_one(selector, projection)
def init_db_index():
    """Ask the registered DB manager to create an index on ``"id"``.

    The index is requested for the collection named by the module-level
    ``colname``.
    """
    global_obj.get("dbobj").CreateIndex(colname, "id")