def manageWwwInfo(self, data, collection):
    """Fetch the website title of a domain record and persist title/status.

    The page is parsed with ``wwwInfo.start_parse``. Depending on the result:

    * no ``'title'`` key at all -> only ``status = 'C'`` is written to MongoDB;
    * a non-empty title -> the title and the status returned by
      ``self.checktitle(title)`` are written, and ``data['wwwtitle']`` is set;
    * a ``'title'`` key holding an empty value -> nothing is written.

    ``data['status']`` is always updated in place. When
    ``self.getContactFlag`` is set and the parse result carries a non-empty
    ``'contacttool'`` entry, it is forwarded to ``manageContacttoolInfo``.

    Args:
        data: the domain document; ``'_id'`` and ``'domain_name'`` are read.
        collection: name of the MongoDB collection holding ``data``.
    """
    domain_name = data['domain_name']
    htmlInfo = wwwInfo.start_parse(domain_name, self.contacttool_info,
                                   self.getContactFlag)

    has_title_key = 'title' in htmlInfo
    title = htmlInfo['title'] if has_title_key else ''
    status = 'C'

    # Decide what (if anything) has to be written before opening a connection.
    fields = None
    if title:
        status = self.checktitle(title)
        fields = {'wwwtitle': title, 'status': status}
    elif not has_title_key:
        # The parse produced no title entry at all: only record the status.
        fields = {'status': status}

    if fields is not None:
        mongodb = MONGODB()
        mongodb.connect()
        try:
            mongodb.getdb('mxmanage')
            mongodb.getcollection(collection)
            mongodb.updateOne({"_id": data['_id']}, {'$set': fields})
        finally:
            # Previously the "no title" branch never closed its connection.
            mongodb.close()

    if title:
        print(self.name + data['domain_name'] + ' add title' + ' status ' +
              status)
        data['wwwtitle'] = title
    data['status'] = status

    if not self.getContactFlag:
        return
    brandinfo = htmlInfo['contacttool'] if 'contacttool' in htmlInfo else {}
    if brandinfo:
        self.manageContacttoolInfo(data, collection, brandinfo)
def manageMxInfo(self, data, collection):
    """Resolve the MX information of a domain and record additions/changes.

    Steps, in order:

    1. Parse the MX data with ``MxManage.startParseMx``; give up when the
       result lacks ``'mxsuffix'`` or ``'mxrecord'``.
    2. If the document has no ``'mxrecord'`` yet, store the current one;
       otherwise stop when every stored record is still present in the
       current record set.
    3. Stop when the suffix is in ``self.mx_blacklist_suffix``.
    4. Look the suffix up in ``self.mxSuffix`` to get the brand; unknown
       suffixes are counted in the ``sm_mx_suffix_notclassified`` SQL table.
    5. Write the ``'mx'`` sub-document: added when the document has none,
       replaced (old value pushed to ``'mxlist'``) when the MX suffix
       changed, or refreshed when a previously unbranded MX now has a brand.
       When ``self.addMailCusFlag`` is set, additions and changes are also
       forwarded to ``addCrmData.addMailCustomer``.

    Args:
        data: the domain document; ``'_id'`` and ``'domain_name'`` are read,
            ``'mxrecord'`` and ``'mx'`` are read when present.
        collection: name of the MongoDB collection holding ``data``.
    """
    domain_name = data['domain_name']
    mx_info = MxManage.startParseMx(domain_name)
    if 'mxsuffix' not in mx_info or 'mxrecord' not in mx_info:
        return

    mongodbWhere = {"_id": data['_id']}
    mongodb = MONGODB()
    mongodb.connect()
    try:
        mongodb.getdb('mxmanage')
        mongodb.getcollection(collection)

        # Keep the first MX record set seen so later runs can detect changes.
        if 'mxrecord' not in data:
            perdata = {'$set': {'mxrecord': mx_info['mxrecord']}}
            mongodb.updateOne(mongodbWhere, perdata)
            print(self.name + data['domain_name'] + ' append mxrecord')
        elif set(data['mxrecord']).issubset(set(mx_info['mxrecord'])):
            # Every stored record is still served: treated as "no change".
            return

        suffix = mx_info['mxsuffix']
        # Blacklisted suffixes are skipped (the original comment says some
        # of them change daily).
        if suffix in self.mx_blacklist_suffix:
            print(self.name + data['domain_name'] + ' mx suffix in blacklist')
            return

        mx_brand_info = {}
        brand_id = 0
        brand_name = ''
        if suffix in self.mxSuffix:
            mx_brand_info = self.mxSuffix[suffix]
            brand_id = mx_brand_info['brand_id']
            brand_name = mx_brand_info['brand_name']
        else:
            # Unknown suffix: count it so it can be classified later.
            self._recordUnclassifiedMxSuffix(suffix)

        def build_mx_set():
            # Built lazily so the early-return paths never touch
            # mx_info['priority'].
            now = int(time.time())
            return {
                'mx': {
                    'mx': mx_info['mx'],
                    'priority': mx_info['priority'],
                    'brand_id': brand_id,
                    'brand_name': brand_name,
                    'addtime': now,
                },
                'mx_changetime': now,
            }

        if 'mx' not in data:
            # First MX information for this document.
            perdata = {'$set': build_mx_set()}
            print(self.name + data['domain_name'] + ' add MX')
            mongodb.updateOne(mongodbWhere, perdata)
            if self.addMailCusFlag:
                addCrmData.addMailCustomer(data, mx_info, mx_brand_info,
                                           collection, 'add')
            return

        # Stored MX sub-document: mx host, priority, brand id and brand name.
        pre_mx = data['mx']
        now_mx = mx_info['mx']
        if brand_id != 0 and pre_mx['brand_id'] == brand_id:
            # Both brands are known and identical: nothing to do.
            return

        if MxManage.subMxSuffix(pre_mx['mx']) != MxManage.subMxSuffix(now_mx):
            perdata = {
                '$set': build_mx_set(),
                # History of previous MX values (the current one excluded).
                '$push': {'mxlist': pre_mx},
            }
            mongodb.updateOne(mongodbWhere, perdata)
            print(self.name + data['domain_name'] + ' change MX')
            if self.addMailCusFlag:
                addCrmData.addMailCustomer(data, mx_info, mx_brand_info,
                                           collection, 'update')
        elif brand_id and pre_mx['brand_id'] == 0:
            # Same MX suffix, but it was unbranded before and now has a brand.
            perdata = {'$set': build_mx_set()}
            print(self.name + data['domain_name'] + ' update brand info')
            mongodb.updateOne(mongodbWhere, perdata)
    finally:
        # The connection used to be leaked on every path through this method.
        mongodb.close()


def _recordUnclassifiedMxSuffix(self, suffix):
    """Insert or increment the ``sm_mx_suffix_notclassified`` row for *suffix*."""
    # NOTE(review): the statements below are built by string concatenation
    # from a DNS-derived value. Switch to parameterized queries if the DB
    # wrapper supports them - its API is not visible from this file.
    where = " where host = '" + suffix + "'"
    try:
        db = DB()
        db.connect()
        try:
            stepCursor = db.query(
                "select * from sm_mx_suffix_notclassified " + where)
            try:
                mx_notclassified = stepCursor.fetchone()
                if mx_notclassified:
                    db.update(
                        "update sm_mx_suffix_notclassified set count=" +
                        str(mx_notclassified['count'] + 1) + where)
                else:
                    db.update(
                        "insert into sm_mx_suffix_notclassified"
                        "(`host`, `count`, `addtime`) VALUE ('" + suffix +
                        "','1','" + str(int(time.time())) + "') ")
            finally:
                stepCursor.close()
        finally:
            db.close()
    except OperationalError as ex:
        # Best effort: a failed counter update must not abort MX processing,
        # but it is reported instead of being silently dropped.
        print(self.name + suffix + ' record unclassified mx suffix failed: ' +
              str(ex))
def manageMailInfo(self, data, collection):
    """Parse the domain's mail index page and persist what was found.

    ``wwwInfo.startParseMailIndex`` is called with the domain name and
    ``self.mailSelfBuildInfo``. From its result:

    * a non-empty ``'title'`` is stored as ``mailtitle`` together with the
      status returned by ``self.checktitle(title, True)``;
    * a non-empty ``'brandInfo'`` is stored as ``mailselfbuild``.

    A MongoDB connection is opened only when there is something to write,
    and it is always closed afterwards.

    Args:
        data: the domain document; ``'_id'`` and ``'domain_name'`` are read.
        collection: name of the MongoDB collection holding ``data``.
    """
    mongodbWhere = {"_id": data['_id']}
    domainName = data['domain_name']
    brandInfo = wwwInfo.startParseMailIndex(domainName, self.mailSelfBuildInfo)

    # Collect the pending writes first; they stay separate updates.
    updates = []
    if 'title' in brandInfo and brandInfo['title'] != '':
        title = brandInfo['title']
        status = self.checktitle(title, True)
        print(self.name + domainName + ' get mailtitle ' + ' status:' + status)
        updates.append({'$set': {'mailtitle': title, 'status': status}})
    if 'brandInfo' in brandInfo and brandInfo['brandInfo']:
        updates.append({'$set': {'mailselfbuild': brandInfo['brandInfo']}})

    if not updates:
        return

    mongodb = MONGODB()
    mongodb.connect()
    try:
        mongodb.getdb('mxmanage')
        mongodb.getcollection(collection)
        for perdata in updates:
            mongodb.updateOne(mongodbWhere, perdata)
    finally:
        # The connection used to be left open after every call.
        mongodb.close()
def manageContacttoolInfo(self, data, collection, brandinfo):
    """Store the contact-tool brand detected for a domain.

    * No ``'contacttool'`` on the document yet: ``brandinfo`` is stored.
    * Stored brand id differs from ``brandinfo['brand_id']``: ``brandinfo``
      replaces it and the previous value is pushed onto ``contacttoollist``.
    * Same brand id: nothing is written and no connection is opened.

    Every write also sets ``contacttool_changetime`` to the current Unix
    time.

    Args:
        data: the domain document; ``'_id'`` is read, and ``'contacttool'``
            when present.
        collection: name of the MongoDB collection holding ``data``.
        brandinfo: detected contact-tool info; ``'brand_id'`` is compared
            when the document already has a contact tool.
    """
    mongodbWhere = {"_id": data['_id']}
    perdata = {
        '$set': {
            'contacttool': brandinfo,
            'contacttool_changetime': int(time.time())
        }
    }
    if 'contacttool' in data:
        contacttool_info = data['contacttool']
        if contacttool_info['brand_id'] == brandinfo['brand_id']:
            # Unchanged brand: skip the database round trip entirely.
            return
        # Keep the previous contact tool in the history list.
        perdata['$push'] = {'contacttoollist': contacttool_info}

    mongodb = MONGODB()
    mongodb.connect()
    try:
        mongodb.getdb('mxmanage')
        mongodb.getcollection(collection)
        mongodb.updateOne(mongodbWhere, perdata)
    finally:
        # Close even when the update raises.
        mongodb.close()
def run(self):
    """Thread body: keep the shared queue ``self.q`` supplied with documents.

    Loops forever. While holding ``self.queueLock`` it checks whether the
    queue is empty and, if so, produces one batch (see ``_produceBatch``).
    The lock is released in a ``finally`` clause, so neither an early exit
    nor an exception can leave it held. When nothing was produced (queue
    not empty, or no checkpoint document found) the thread sleeps for one
    second before polling again.

    The MongoDB connection is opened once and closed only when the loop is
    left through an exception.
    """
    num_coll = 'mxmanage_stopnum'
    mongodb = MONGODB()
    mongodb.connect()
    try:
        mongodb.getdb('mxmanage')
        while True:
            produced = False
            self.queueLock.acquire()
            try:
                if self.q.qsize() == 0:
                    print("producer are producing data")
                    produced = self._produceBatch(mongodb, num_coll)
            finally:
                # Previously the "checkpoint not found" path jumped back to
                # acquire() without releasing, blocking the thread forever.
                self.queueLock.release()
            if not produced:
                time.sleep(1)
    finally:
        # Previously close() ran after every batch although the connection
        # was only opened once, before the loop.
        mongodb.close()


def _produceBatch(self, mongodb, num_coll):
    """Advance the checkpoint by one step; the caller must hold the lock.

    Reads the checkpoint document ``{"flag": self.flag}`` from *num_coll*
    (fields ``start``, ``stop``, ``collection``) and then either rotates to
    the next collection or loads ``self.qCount`` documents into ``self.q``.

    Returns:
        False when no checkpoint document exists, True otherwise.
    """
    current_coll = self.coll[0]
    mongodb.getcollection(num_coll)
    flagWhere = {"flag": self.flag}
    stop_info = mongodb.findOne(flagWhere)
    if not stop_info:
        return False

    start = stop_info['start']
    stop = stop_info['stop']
    collection = stop_info['collection']

    if start >= stop and start != 0:
        # Current collection exhausted: move on to the next one, refilling
        # the work list from the permanent list once it runs empty.
        del self.coll[0]
        if not self.coll:
            self.coll.extend(self.permanent_coll)
        mongodb.getcollection(num_coll)
        mongodb.updateOne(
            flagWhere,
            {"$set": {"stop": 0, "start": 0, "collection": self.coll[0]}})
        return True

    if collection != current_coll:
        # Resume from the stored checkpoint: drop leading collections until
        # the checkpoint's collection is at the front.
        # NOTE(review): if that collection is not in self.coll the list ends
        # up empty and the next cycle's self.coll[0] raises IndexError.
        while self.coll and self.coll[0] != collection:
            del self.coll[0]

    if stop == 0:
        # Total not known yet: count the collection and store it as "stop".
        mongodb.getcollection(collection)
        stopnum = mongodb.count()
        mongodb.getcollection(num_coll)
        mongodb.updateOne(flagWhere, {"$set": {"stop": int(stopnum)}})

    # Load the next batch starting at the checkpoint. As the original
    # comment warns, duplicates are possible if the data changes meanwhile.
    mongodb.getcollection(collection)
    mongodb.findMany(self.q, start, self.qCount)

    # Batch fetched: advance the stored start position.
    mongodb.getcollection(num_coll)
    mongodb.updateOne(flagWhere, {"$set": {"start": int(start + self.qCount)}})
    return True